gtf2gff.py: A replacement for gtf2gff.pl

Recently we had to work with the gtf2gff.pl tool to convert CONTRAST and TwinScan GTF output to the GFF format, which can be read by many annotation tools.

Working with that script was really hard: it did not report errors at all, and it was not programmatically reusable. There are different versions of the Perl script on the internet, but what we needed was a standardized, short, readable version that does proper command-line parsing using a standard tool like argparse and provides a conversion function that is usable from other scripts.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
gtf2gff.py -- A script to convert GTF to GFF files.
... and a better replacement for gtf2gff.pl

Version 1.1: Python3 ready, various small improvements
"""
# Python 2.x support
from __future__ import with_statement, print_function
import argparse
import sys
import os.path

__author__    = "Uli Köhler & Anton Smirnov"
__copyright__ = "Copyright 2013 Uli Köhler & Anton Smirnov"
__license__   = "Apache v2.0"
__version__   = "1.1"

class GTFException(Exception):
    """Signals a malformed GTF input line (raised by gtf2gff on a wrong column count)."""

def gtf2gff(infilepath, outfilepath, startindex, endindex, program):
    """
    Convert a coordinate window of a GTF file into GFF records.

    Every non-empty line of ``infilepath`` is split on tabs and must have
    exactly 9 columns. Records whose start coordinate (column 4) lies in the
    inclusive range [startindex, endindex] are written to ``outfilepath``
    with the following changes:

    * column 1 (sequence name) gets the suffix ``_<gene counter>``
    * column 2 (source) is replaced by ``program``
    * columns 4 and 5 (start, end) are shifted by ``-startindex``

    The gene counter is incremented for every feature containing
    "start_codon" on the "+" strand and every feature containing
    "stop_codon" on the "-" strand.

    Processing stops at the first record whose start coordinate exceeds
    ``endindex``; later records are not read.

    :param infilepath: Path of the GTF file to read.
    :param outfilepath: Path of the GFF file to write (overwritten).
    :param startindex: Lower bound of the window; also the offset that is
                       subtracted from start/end coordinates.
    :param endindex: Upper bound (inclusive) for the start coordinate.
    :param program: Value written into the source column.
    :raises GTFException: If a non-empty line does not have 9 columns.
    :raises ValueError: If a start/end coordinate is not an integer.
    """
    with open(infilepath, "r") as infile, open(outfilepath, "w") as outfile:
        gen_id = 0
        for line in infile:
            line = line.strip()
            if not line:
                continue
            words = line.split("\t")
            if len(words) != 9:
                raise GTFException("Encountered %d columns instead of the expected 9 in line: '%s'" % (len(words), line))
            feature, strand = words[2], words[6]
            if "start_codon" in feature and strand == "+":
                gen_id += 1
            if "stop_codon" in feature and strand == "-":
                gen_id += 1
            # Parse the original start once. It must be used for both range
            # tests: comparing the already-shifted value against endindex
            # would end the loop too early when startindex is negative.
            start = int(words[3])
            if start > endindex:
                break
            if start >= startindex:
                words[0] += "_%d" % gen_id
                words[1] = program
                words[3] = str(start - startindex)
                words[4] = str(int(words[4]) - startindex)
                outfile.write("\t".join(words) + "\n")

def main(argv=None):
    """
    Command line entry point: parse arguments, validate them, run gtf2gff.

    At least two of --startindex, --length and --endindex must be given; the
    missing one is derived via ``endindex = startindex + length``. If no
    output file is given, the input path with its extension replaced by
    ".gff" is used.

    :param argv: List of argument strings. ``None`` lets argparse read the
                 process arguments.

    On inconsistent arguments the help text and a message are printed and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--startindex', help="Start index of the part to extract. Entry Indices will be adjusted to this value, meaning, here you should be precise. Take the value: Sbjct_Index - Query_Index", type=int, nargs="?")
    # The original help text here was a copy of the --startindex help.
    parser.add_argument('-l', '--length', help="Length of the part to extract. Combined with --startindex or --endindex to derive the missing boundary (endindex = startindex + length)", type=int, nargs="?")
    parser.add_argument('-e', '--endindex', help="End index. Only Entries smaller than this value are included", type=int, nargs="?")
    parser.add_argument('-p', '--program', help='The name of the program which generated the GTF file, e.g. twinscan or CONTRAST', required=True)
    parser.add_argument('infile', help="The GTF input file.")
    parser.add_argument('outfile', help="The GFF output file.", nargs="?")
    args = parser.parse_args(argv)

    def fail(message):
        # Same reporting as before: help text, message, exit status 1.
        parser.print_help()
        print(message)
        sys.exit(1)

    # Check argument consistency
    given = sum(1 for value in (args.startindex, args.endindex, args.length) if value is not None)
    if given < 2:
        fail("You need to specify at least two of --startindex, --length and --endindex")
    if args.startindex is not None and args.endindex is not None and args.startindex > args.endindex:
        fail('Check your start and end indices!')
    if args.length is not None and args.length < 1:
        fail('Length too short')
    if given == 3 and (args.endindex - args.startindex) != args.length:
        fail('Length does not match start/end index.')
    # Derive whichever boundary was not given from the other one and the length
    if args.startindex is None:
        args.startindex = args.endindex - args.length
    if args.endindex is None:
        args.endindex = args.startindex + args.length
    # Default output name: input path with its extension replaced by ".gff"
    outfilename = args.outfile
    if outfilename is None:
        outfilename = "{}.gff".format(os.path.splitext(args.infile)[0])
    # Execute the converter
    print(args.infile, outfilename)
    gtf2gff(args.infile, outfilename, args.startindex, args.endindex, args.program)


if __name__ == "__main__":
    main()