A simple GFF3 parser in Python
Problem:
You need to parse a GFF3 file containing information about sequence features. You prefer to use a minimal, depedency-free solution instead of importing the GFF3 data into a database right away. However, you need to have a standard-compatible parser
Solution
The following parser is fully compatible with the format described at the SequenceOntology GFF3 page and has been tested with the transcript.gff3 example file by the Broad Institute.
It contains a fully standard-compatible attribute parser and properly handles.
Almost any software comes with its slightly different GFF format specification. We advise you run the parser over a sample GFF dataset generated by your software and then change the implementation to match your requirements.
In contrast to software like BioPython this building-block approach does not require you to either use a standard-compatible format or write the entire parser yourself.
#!/usr/bin/env python3
"""
A simple parser for the GFF3 format.
Test with transcripts.gff3 from
http://www.broadinstitute.org/annotation/gebo/help/gff3.html.
Format specification source:
http://www.sequenceontology.org/gff3.shtml
Version 1.1: Python3 ready
"""
from collections import namedtuple
import gzip
import urllib.request, urllib.parse, urllib.error
__author__ = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"
#Initialized GeneInfo named tuple. Note: namedtuple is immutable
gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
GFFRecord = namedtuple("GFFRecord", gffInfoFields)
def parseGFFAttributes(attributeString):
"""Parse the GFF3 attribute column and return a dict"""#
if attributeString == ".": return {}
ret = {}
for attribute in attributeString.split(";"):
key, value = attribute.split("=")
ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
return ret
def parseGFF3(filename):
"""
A minimalistic GFF3 format parser.
Yields objects that contain info about a single GFF3 feature.
Supports transparent gzip decompression.
"""
#Parse with transparent decompression
openFunc = gzip.open if filename.endswith(".gz") else open
with openFunc(filename) as infile:
for line in infile:
if line.startswith("#"): continue
parts = line.strip().split("\t")
#If this fails, the file format is not standard-compatible
assert len(parts) == len(gffInfoFields)
#Normalize data
normalizedInfo = {
"seqid": None if parts[0] == "." else urllib.parse.unquote(parts[0]),
"source": None if parts[1] == "." else urllib.parse.unquote(parts[1]),
"type": None if parts[2] == "." else urllib.parse.unquote(parts[2]),
"start": None if parts[3] == "." else int(parts[3]),
"end": None if parts[4] == "." else int(parts[4]),
"score": None if parts[5] == "." else float(parts[5]),
"strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]),
"phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]),
"attributes": parseGFFAttributes(parts[8])
}
#Alternatively, you can emit the dictionary here, if you need mutability:
# yield normalizedInfo
yield GFFRecord(**normalizedInfo)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("file", help="The GFF3 input file (.gz allowed)")
parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
parser.add_argument("--filter-type", help="Ignore records not having the given type")
args = parser.parse_args()
#Execute the parser
recordCount = 0
for record in parseGFF3(args.file):
#Apply filter, if any
if args.filter_type and record.type != args.filter_type:
continue
#Print record if specified by the user
if args.print_records: print(record)
#Access attributes like this: my_strand = record.strand
recordCount += 1
print("Total records: %d" % recordCount)