Parsing NCBI GeneInfo in Python
Problem:
You need to parse files in the NCBI GeneInfo format, like those that can be downloaded from the NCBI FTP GENE_INFO directory, in Python. You want to avoid any dependencies.
Solution
Here’s a parser using only the Python standard library for you. It has been tested with the 2013/11/30 All_Data.gene_info.gz and parses each line into immutable GeneInfo objects.
Fields such as the synonym list or the database XRefs are converted to lists or dictionaries respectively. Missing values that are denoted by “-“ in the NCBI representation are replaced by None to allow easier filtering.
The code includes an example and does not depend on any software or library beyond Python 2.6. It is recommended you modify the parser to accomodate different but similar formats or to fit your needs. The parser is generator-based and requires constant memory if used properly.
Note that PyPy was about 60% faster than CPython 2.7.6 in a quick test on All_Data.gene_info.gz
#!/usr/bin/env python3
"""
A simple parser for the NCBI gene info format.
Version 1.1: Python3 ready
"""
from __future__ import with_statement
from collections import namedtuple
import gzip
from datetime import datetime
__author__ = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"
#Initialized GeneInfo named tuple. Note: namedtuple is immutable
geneInfoFields = ["tax_id", "gene_id", "symbol", "locus_tag", "synonyms", "db_xrefs", "chromosome", "map_location", "description", "type_of_gene", "symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority", "nomenclature_status", "other_designations", "modification_date"]
GeneInfo = namedtuple("GeneInfo", geneInfoFields)
def parseDBXrefs(xrefs):
"""Parse a DB xref string like HGNC:5|MIM:138670 to a dictionary"""
#Split by |, split results by :. Create a dict (python 2.6 compatible way).
if xrefs == "-": return {}
return dict([(xrefParts[0], xrefParts[2])
for xrefParts in (xref.partition(":")
for xref in xrefs.split("|"))])
def parseNCBIGeneInfo(filename):
"""
A NCBI gene info format parser.
Yields objects that contain info about a single gene.
Supports transparent gzip decompression
"""
#Parse with transparent decompression
openFunc = gzip.open if filename.endswith(".gz") else open
with openFunc(filename) as infile:
for line in infile:
if line.startswith("#"): continue
parts = line.strip().split("\t")
#If this fails, the format is not standard-compatible
assert len(parts) == len(geneInfoFields)
#Normalize data
normalizedInfo = {
"tax_id": int(parts[0]),
"gene_id": int(parts[1]),
"symbol": parts[2],
"locus_tag": None if parts[3] == "-" else parts[3],
"synonyms": [] if parts[4] == "-" else parts[4].split("|"),
"db_xrefs": parseDBXrefs(parts[5]),
"chromosome": parts[6],
"map_location": parts[7],
"description": parts[8],
"type_of_gene": parts[9],
"symbol_from_nomenclature_authority": None if parts[10] == "-" else parts[10],
"full_name_from_nomenclature_authority": None if parts[11] == "-" else parts[11],
"nomenclature_status": None if parts[12] == "-" else parts[12],
"other_designations": None if parts[13] == "-" else parts[13],
"modification_date": datetime.strptime(parts[14], "%Y%m%d")
}
#Alternatively, you can emit the dictionary here, if you need mutability:
# yield normalizedInfo
yield GeneInfo(**normalizedInfo)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("geneinfo_file", help="The NCBI GeneInfo input file (.gz allowed)")
parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
args = parser.parse_args()
#Execute the parser
recordCount = 0
for geneInfo in parseNCBIGeneInfo(args.geneinfo_file):
if args.print_records:
print (geneInfo)
#Access records like this: my_gene_id = geneInfo.gene_id
recordCount += 1
print ("Total records: %d" % recordCount)