GFF3-Parser in Python | TechOverflow

Problem:

Du musst eine GFF3-Datei parsen, die Informationen über Sequenz-Features enthält. Du bevorzugst eine minimale, abhängigkeitsfreie Lösung, anstatt die GFF3-Daten sofort in eine Datenbank zu importieren. Du benötigst jedoch einen standardkompatiblen Parser.

Lösung

Der folgende Parser ist vollständig kompatibel mit dem auf der GFF3-Seite der Sequence Ontology beschriebenen Format und wurde mit der Beispieldatei transcripts.gff3 des Broad Institute getestet.

Er enthält einen vollständig standardkompatiblen Attribut-Parser und verarbeitet alles korrekt.

Fast jede Software bringt ihre eigene, leicht abweichende Variante des GFF-Formats mit. Wir empfehlen, den Parser zunächst auf einen Beispieldatensatz anzuwenden, der von deiner Software erzeugt wurde, und die Implementierung anschließend an deine Anforderungen anzupassen.

Im Gegensatz zu Software wie BioPython erfordert dieser Baustein-Ansatz nicht, entweder ein standardkompatibles Format zu verwenden oder den gesamten Parser selbst zu schreiben.

gff3_parser.py

#!/usr/bin/env python3
"""
Ein einfacher Parser für das GFF3-Format.

Getestet mit transcripts.gff3 von
http://www.broadinstitute.org/annotation/gebo/help/gff3.html.

Quelle der Formatspezifikation:
http://www.sequenceontology.org/gff3.shtml

Version 1.1: Python3-ready
"""
from collections import namedtuple
import gzip
import urllib.request, urllib.parse, urllib.error

__author__  = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"

# Names of the nine GFF3 columns, in the order they appear in a feature line.
gffInfoFields = [
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]
# Record type with one field per column. Note: a namedtuple is immutable.
GFFRecord = namedtuple("GFFRecord", gffInfoFields)

def parseGFFAttributes(attributeString):
    """Parse the GFF3 attribute column (column 9) into a dict.

    The column holds ``key=value`` pairs separated by ``;``. Keys and values
    are URL-unquoted. Each value is returned as a single string;
    comma-separated multi-value lists are not split.

    Args:
        attributeString: Raw text of the attribute column, or "." when the
            record carries no attributes.

    Returns:
        A dict mapping each unquoted key to its unquoted value. Empty for
        "." or an empty string. If a key occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: If a non-empty field contains no "=" at all.
    """
    if attributeString == ".":
        return {}
    ret = {}
    for attribute in attributeString.split(";"):
        # Trailing or doubled semicolons are common in real-world files and
        # produce empty fields; skip them instead of failing on the unpack.
        if not attribute.strip():
            continue
        # Split on the first "=" only, so an unescaped "=" inside the value
        # stays part of the value.
        key, value = attribute.split("=", 1)
        ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return ret

def parseGFF3(filename):
    """
    Minimal GFF3 parser that yields one GFFRecord per feature line.

    Files whose name ends in ".gz" are decompressed transparently. Blank
    lines and lines starting with "#" (comments and directives) are skipped.
    Parsing stops at the FASTA section, i.e. at a "##FASTA" directive or at
    the first line starting with ">".

    In every column except the last, "." is mapped to None. seqid, source,
    type, strand and phase are URL-unquoted strings, start and end are ints,
    score is a float, and attributes is the result of parseGFFAttributes.

    Args:
        filename: Path of the GFF3 file to read.

    Raises:
        ValueError: If a feature line does not consist of exactly nine
            tab-separated columns, or if a numeric column cannot be
            converted.
    """
    openFunc = gzip.open if filename.endswith(".gz") else open
    # gzip.open defaults to binary mode; "rt" makes both openers yield str
    # lines so the string comparisons below work for compressed input too.
    with openFunc(filename, "rt") as infile:
        for lineNumber, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                continue
            # From the FASTA section onward the file holds sequence data,
            # not feature lines.
            if line.startswith(("##FASTA", ">")):
                break
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            # A different column count means the file is not standard-compliant.
            # Raised explicitly because assert statements vanish under -O.
            if len(parts) != len(gffInfoFields):
                raise ValueError(
                    "%s, line %d: expected %d tab-separated columns, found %d"
                    % (filename, lineNumber, len(gffInfoFields), len(parts))
                )
            # Normalize the raw column strings
            normalizedInfo = {
                "seqid": None if parts[0] == "." else urllib.parse.unquote(parts[0]),
                "source": None if parts[1] == "." else urllib.parse.unquote(parts[1]),
                "type": None if parts[2] == "." else urllib.parse.unquote(parts[2]),
                "start": None if parts[3] == "." else int(parts[3]),
                "end": None if parts[4] == "." else int(parts[4]),
                "score": None if parts[5] == "." else float(parts[5]),
                "strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]),
                "phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]),
                "attributes": parseGFFAttributes(parts[8])
            }
            # Alternatively, yield the dictionary here if mutability is needed:
            #    yield normalizedInfo
            yield GFFRecord(**normalizedInfo)


# Command-line entry point: parse the given GFF3 file, optionally filter by
# feature type and print each record, then print the number of records kept.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="Die GFF3-Eingabedatei (.gz erlaubt)")
    # The help text was cut off mid-sentence and named a non-existent
    # "GeneInfo" type; it now states what is printed besides the records.
    parser.add_argument("--print-records", action="store_true", help="Alle GFFRecord-Objekte ausgeben, nicht nur die Gesamtanzahl")
    parser.add_argument("--filter-type", help="Datensätze ohne den angegebenen Typ ignorieren")
    args = parser.parse_args()
    # Run the parser
    recordCount = 0
    for record in parseGFF3(args.file):
        # Apply the type filter, if one was given
        if args.filter_type and record.type != args.filter_type:
            continue
        # Print the record if the user asked for it
        if args.print_records:
            print(record)
        # Access fields like this: my_strand = record.strand
        recordCount += 1
    print("Total records: %d" % recordCount)

#!/usr/bin/env python3
"""
Ein einfacher Parser für das GFF3-Format.

Getestet mit transcripts.gff3 von
http://www.broadinstitute.org/annotation/gebo/help/gff3.html.

Quelle der Formatspezifikation:
http://www.sequenceontology.org/gff3.shtml

Version 1.1: Python3-ready
"""
from collections import namedtuple
import gzip
import urllib.request, urllib.parse, urllib.error

__author__  = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"

# Names of the nine GFF3 columns, in the order they appear in a feature line.
gffInfoFields = [
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]
# Record type with one field per column. Note: a namedtuple is immutable.
GFFRecord = namedtuple("GFFRecord", gffInfoFields)

def parseGFFAttributes(attributeString):
    """Parse the GFF3 attribute column (column 9) into a dict.

    The column holds ``key=value`` pairs separated by ``;``. Keys and values
    are URL-unquoted. Each value is returned as a single string;
    comma-separated multi-value lists are not split.

    Args:
        attributeString: Raw text of the attribute column, or "." when the
            record carries no attributes.

    Returns:
        A dict mapping each unquoted key to its unquoted value. Empty for
        "." or an empty string. If a key occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: If a non-empty field contains no "=" at all.
    """
    if attributeString == ".":
        return {}
    ret = {}
    for attribute in attributeString.split(";"):
        # Trailing or doubled semicolons are common in real-world files and
        # produce empty fields; skip them instead of failing on the unpack.
        if not attribute.strip():
            continue
        # Split on the first "=" only, so an unescaped "=" inside the value
        # stays part of the value.
        key, value = attribute.split("=", 1)
        ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return ret

def parseGFF3(filename):
    """
    Minimal GFF3 parser that yields one GFFRecord per feature line.

    Files whose name ends in ".gz" are decompressed transparently. Blank
    lines and lines starting with "#" (comments and directives) are skipped.
    Parsing stops at the FASTA section, i.e. at a "##FASTA" directive or at
    the first line starting with ">".

    In every column except the last, "." is mapped to None. seqid, source,
    type, strand and phase are URL-unquoted strings, start and end are ints,
    score is a float, and attributes is the result of parseGFFAttributes.

    Args:
        filename: Path of the GFF3 file to read.

    Raises:
        ValueError: If a feature line does not consist of exactly nine
            tab-separated columns, or if a numeric column cannot be
            converted.
    """
    openFunc = gzip.open if filename.endswith(".gz") else open
    # gzip.open defaults to binary mode; "rt" makes both openers yield str
    # lines so the string comparisons below work for compressed input too.
    with openFunc(filename, "rt") as infile:
        for lineNumber, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                continue
            # From the FASTA section onward the file holds sequence data,
            # not feature lines.
            if line.startswith(("##FASTA", ">")):
                break
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            # A different column count means the file is not standard-compliant.
            # Raised explicitly because assert statements vanish under -O.
            if len(parts) != len(gffInfoFields):
                raise ValueError(
                    "%s, line %d: expected %d tab-separated columns, found %d"
                    % (filename, lineNumber, len(gffInfoFields), len(parts))
                )
            # Normalize the raw column strings
            normalizedInfo = {
                "seqid": None if parts[0] == "." else urllib.parse.unquote(parts[0]),
                "source": None if parts[1] == "." else urllib.parse.unquote(parts[1]),
                "type": None if parts[2] == "." else urllib.parse.unquote(parts[2]),
                "start": None if parts[3] == "." else int(parts[3]),
                "end": None if parts[4] == "." else int(parts[4]),
                "score": None if parts[5] == "." else float(parts[5]),
                "strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]),
                "phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]),
                "attributes": parseGFFAttributes(parts[8])
            }
            # Alternatively, yield the dictionary here if mutability is needed:
            #    yield normalizedInfo
            yield GFFRecord(**normalizedInfo)


# Command-line entry point: parse the given GFF3 file, optionally filter by
# feature type and print each record, then print the number of records kept.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="Die GFF3-Eingabedatei (.gz erlaubt)")
    # The help text was cut off mid-sentence and named a non-existent
    # "GeneInfo" type; it now states what is printed besides the records.
    parser.add_argument("--print-records", action="store_true", help="Alle GFFRecord-Objekte ausgeben, nicht nur die Gesamtanzahl")
    parser.add_argument("--filter-type", help="Datensätze ohne den angegebenen Typ ignorieren")
    args = parser.parse_args()
    # Run the parser
    recordCount = 0
    for record in parseGFF3(args.file):
        # Apply the type filter, if one was given
        if args.filter_type and record.type != args.filter_type:
            continue
        # Print the record if the user asked for it
        if args.print_records:
            print(record)
        # Access fields like this: my_strand = record.strand
        recordCount += 1
    print("Total records: %d" % recordCount)

Check out similar posts by category: Bioinformatics, Python

If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow

Buy me a coffee