Parsing NCBI GeneInfo in Python

Problem:

You need to parse files in the NCBI GeneInfo format, like those that can be downloaded from the NCBI FTP GENE_INFO directory, in Python. You want to avoid any dependencies.

Solution

Here’s a parser using only the Python standard library for you. It has been tested with the 2013/11/30 All_Data.gene_info.gz and parses each line into immutable GeneInfo objects.

Fields such as the synonym list or the database XRefs are converted to lists or dictionaries respectively. Missing values that are denoted by “-“ in the NCBI representation are replaced by None to allow easier filtering.

The code includes an example and does not depend on any software or library beyond Python 2.6. It is recommended you modify the parser to accomodate different but similar formats or to fit your needs. The parser is generator-based and requires constant memory if used properly.

Note that PyPy was about 60% faster than CPython 2.7.6 in a quick test on All_Data.gene_info.gz

parse_ncbi_geneinfo.py

#!/usr/bin/env python3
"""
A simple parser for the NCBI gene info format.

Version 1.1: Python3 ready
"""
from __future__ import with_statement
from collections import namedtuple
import gzip
from datetime import datetime

__author__  = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"

#Initialized GeneInfo named tuple. Note: namedtuple is immutable
geneInfoFields = ["tax_id", "gene_id", "symbol", "locus_tag", "synonyms", "db_xrefs", "chromosome", "map_location", "description", "type_of_gene", "symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority", "nomenclature_status", "other_designations", "modification_date"]
GeneInfo = namedtuple("GeneInfo", geneInfoFields)

def parseDBXrefs(xrefs):
    """Parse a DB xref string like HGNC:5|MIM:138670 to a dictionary"""
    #Split by |, split results by :. Create a dict (python 2.6 compatible way).
    if xrefs == "-": return {}
    return dict([(xrefParts[0], xrefParts[2])
                for xrefParts in (xref.partition(":")
                  for xref in xrefs.split("|"))])

def parseNCBIGeneInfo(filename):
    """
    A NCBI gene info format parser.
    Yields objects that contain info about a single gene.

    Supports transparent gzip decompression
    """
    #Parse with transparent decompression
    openFunc = gzip.open if filename.endswith(".gz") else open
    with openFunc(filename) as infile:
        for line in infile:
            if line.startswith("#"): continue
            parts = line.strip().split("\t")
            #If this fails, the format is not standard-compatible
            assert len(parts) == len(geneInfoFields)
            #Normalize data
            normalizedInfo = {
                "tax_id": int(parts[0]),
                "gene_id": int(parts[1]),
                "symbol": parts[2],
                "locus_tag": None if parts[3] == "-" else parts[3],
                "synonyms": [] if parts[4] == "-" else parts[4].split("|"),
                "db_xrefs": parseDBXrefs(parts[5]),
                "chromosome": parts[6],
                "map_location": parts[7],
                "description": parts[8],
                "type_of_gene": parts[9],
                "symbol_from_nomenclature_authority": None if parts[10] == "-" else parts[10],
                "full_name_from_nomenclature_authority": None if parts[11] == "-" else parts[11],
                "nomenclature_status": None if parts[12] == "-" else parts[12],
                "other_designations": None if parts[13] == "-" else parts[13],
                "modification_date": datetime.strptime(parts[14], "%Y%m%d")
            }
            #Alternatively, you can emit the dictionary here, if you need mutability:
            #    yield normalizedInfo
            yield GeneInfo(**normalizedInfo)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("geneinfo_file", help="The NCBI GeneInfo input file (.gz allowed)")
    parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
    args = parser.parse_args()
    #Execute the parser
    recordCount = 0
    for geneInfo in parseNCBIGeneInfo(args.geneinfo_file):
        if args.print_records:
            print (geneInfo)
        #Access records like this: my_gene_id = geneInfo.gene_id
        recordCount += 1
    print ("Total records: %d" % recordCount)

#!/usr/bin/env python3
"""
A simple parser for the NCBI gene info format.

Version 1.1: Python3 ready
"""
from __future__ import with_statement
from collections import namedtuple
import gzip
from datetime import datetime

__author__  = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"

#Initialized GeneInfo named tuple. Note: namedtuple is immutable
geneInfoFields = ["tax_id", "gene_id", "symbol", "locus_tag", "synonyms", "db_xrefs", "chromosome", "map_location", "description", "type_of_gene", "symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority", "nomenclature_status", "other_designations", "modification_date"]
GeneInfo = namedtuple("GeneInfo", geneInfoFields)

def parseDBXrefs(xrefs):
    """Parse a DB xref string like HGNC:5|MIM:138670 to a dictionary"""
    #Split by |, split results by :. Create a dict (python 2.6 compatible way).
    if xrefs == "-": return {}
    return dict([(xrefParts[0], xrefParts[2])
                for xrefParts in (xref.partition(":")
                  for xref in xrefs.split("|"))])

def parseNCBIGeneInfo(filename):
    """
    A NCBI gene info format parser.
    Yields objects that contain info about a single gene.

    Supports transparent gzip decompression
    """
    #Parse with transparent decompression
    openFunc = gzip.open if filename.endswith(".gz") else open
    with openFunc(filename) as infile:
        for line in infile:
            if line.startswith("#"): continue
            parts = line.strip().split("\t")
            #If this fails, the format is not standard-compatible
            assert len(parts) == len(geneInfoFields)
            #Normalize data
            normalizedInfo = {
                "tax_id": int(parts[0]),
                "gene_id": int(parts[1]),
                "symbol": parts[2],
                "locus_tag": None if parts[3] == "-" else parts[3],
                "synonyms": [] if parts[4] == "-" else parts[4].split("|"),
                "db_xrefs": parseDBXrefs(parts[5]),
                "chromosome": parts[6],
                "map_location": parts[7],
                "description": parts[8],
                "type_of_gene": parts[9],
                "symbol_from_nomenclature_authority": None if parts[10] == "-" else parts[10],
                "full_name_from_nomenclature_authority": None if parts[11] == "-" else parts[11],
                "nomenclature_status": None if parts[12] == "-" else parts[12],
                "other_designations": None if parts[13] == "-" else parts[13],
                "modification_date": datetime.strptime(parts[14], "%Y%m%d")
            }
            #Alternatively, you can emit the dictionary here, if you need mutability:
            #    yield normalizedInfo
            yield GeneInfo(**normalizedInfo)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("geneinfo_file", help="The NCBI GeneInfo input file (.gz allowed)")
    parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
    args = parser.parse_args()
    #Execute the parser
    recordCount = 0
    for geneInfo in parseNCBIGeneInfo(args.geneinfo_file):
        if args.print_records:
            print (geneInfo)
        #Access records like this: my_gene_id = geneInfo.gene_id
        recordCount += 1
    print ("Total records: %d" % recordCount)

Check out similar posts by category: Bioinformatics, Python

If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow

Buy me a coffee