vcftoolbox

Tools for manipulating and parsing vcf files


Keywords
vcf, parsing
License
MIT
Install
pip install vcftoolbox==1.5.1

Documentation

Build Status

Set of tools for working with vcf files and variants.

Thanks to pyvcf for code base in header parser.

Utilities

def get_variant_dict(variant_line, header_line):
    """Parse a variant line
        
        Split a variant line and map the fields on the header columns
        
        Args:
            variant_line (str): A vcf variant line
            header_line (list): A list with the header columns
        Returns:
            variant_dict (dict): A variant dictionary
    """
def get_info_dict(info_line):
    """Parse a info field of a variant
        
        Make a dictionary from the info field of a vcf variant.
        Keys are the info keys and values are the raw strings from the vcf
        If the field only have a key (no value), value of infodict is True.
        
        Args:
            info_line (str): The info field of a vcf variant
        Returns:
            info_dict (dict): A INFO dictionary
    """
def get_variant_id(variant_dict):
    """Build a variant id
    
        The variant id is a string made of CHROM_POS_REF_ALT
        
        Args:
            variant_dict (dict): A variant dictionary
        
        Returns:
            variant_id (str)
    """
def get_vep_info(vep_string, vep_header):
    """Make the vep annotations into a dictionaries
    
        A vep dictionary will have the vep column names as keys and 
        the vep annotations as values.
        The dictionaries are stored in a list

        Args:
            vep_list (string): A string with the CSQ annotation
            vep_header (list): A list with the vep header
            allele (str): The allele that is annotated
        
        Return:
            vep_annotations (list): A list of vep dicts
    
    """
def replace_vcf_info(keyword, annotation, variant_line=None, variant_dict=None):
    """Replace the information of a info field of a vcf variant line or a 
        variant dict.
    
    Arguments:
        variant_line (str): A vcf formatted variant line
        variant_dict (dict): A variant dictionary
        keyword (str): The info field key
        annotation (str): If the annotation is a key, value pair
                          this is the string that represents the value
    
    Returns:
        variant_line (str): A annotated variant line
def add_vcf_info(keyword, variant_line=None, variant_dict=None, annotation=None):
    """
    Add information to the info field of a vcf variant line.
    
    Arguments:
        keyword (str): The info field key
        variant_line (str): A vcf formatted variant line
        annotation (str): If the annotation is a key, value pair
                          this is the string that represents the value
    
    Returns:
        fixed_variant : str if variant line, or dict if variant_dict
    """

Classes

Genotype

class Genotype(object):
    """Holds information about a vcf genotype
    
        These objects try to collect the relevant information about genotype 
        calls in the vcf format.
        
        Attributes:
            allele_1 = '.'
            allele_2 = '.'
            genotype = './.'
            heterozygote = False
            allele_depth = False
            homo_alt = False
            homo_ref = False
            has_variant = False
            genotyped = False
            phased = False
            depth_of_coverage = 0
            quality_depth = 0
            genotype_quality = 0
            ref_depth = 0
            alt_depth = 0
            quality_depth = ref_depth + alt_depth
            phred_likelihoods = []
        
    
    """

HeaderParser

class HeaderParser(object):
    """Class for holding and manipulating vcf headers
        
        Attributes:
            info_dict=OrderedDict()
            extra_info = {}
        
            filter_lines=[]
            filter_dict=OrderedDict()
        
            contig_lines=[]
            contig_dict=OrderedDict()
        
            format_lines=[]
            format_dict=OrderedDict()
        
            alt_lines=[]
            alt_dict=OrderedDict()
        
            other_lines=[]
            other_dict=OrderedDict()
        
            header=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO']
            header_keys={'info' : ['ID', 'Number', 'Type', 'Description'], 
                           'form' : ['ID', 'Number', 'Type', 'Description'], 
                           'filt' : ['ID', 'Description'],
                           'alt' : ['ID', 'Description'],
                           'contig' : ['ID', 'length']}
            fileformat = None
            filedate = None
            reference = None
            phasing = None
            source = None
            line_counter = 0
            individuals = []
            vep_columns = []
            info_pattern
            filter_pattern
            contig_pattern
            format_pattern
            alt_pattern
            meta_pattern
        
        Methods:
            parse_meta_data(line)
            parse_header_line(line)
            add_fileformat(fileformat)
            print_header(): Return a list with the header lines
            add_meta_linekey, value)
            add_info(info_id, number, entry_type, description)
            add_filter(filter_id, description)
            add_format(format_id, number, entry_type, description)            
            add_alt(alt_id, description)            
            add_contig(contig_id, length)
            add_version_tracking(info_id, version, date, command_line='')            
            
    """