gtf2gff.py: A replacement for gtf2gff.pl
Recently we had to work with the gtf2gff.pl tool to convert CONTRAST and TwinScan GTF output to the GFF format which can be read by many annotation tools.
Working with that script was really hard: it did not report errors at all, and it was not programmatically reusable. There are different versions of the Perl script on the internet, but what we needed was a standardized, short, readable version that does proper command-line parsing using a standard tool like argparse and provides a conversion function that is usable from other scripts.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
gtf2gff.py -- A script to convert GTF to GFF files.
... and a better replacement for gtf2gff.pl
Version 1.1: Python3 ready, various small improvements
"""
# Python 2.x support
from __future__ import with_statement, print_function
import argparse
import sys
import os.path
__author__ = "Uli Köhler & Anton Smirnov"
__copyright__ = "Copyright 2013 Uli Köhler & Anton Smirnov"
__license__ = "Apache v2.0"
__version__ = "1.1"
class GTFException(Exception):
    """Signals a GTF input line that does not have the expected layout."""
def gtf2gff(infilepath, outfilepath, startindex, endindex, program):
    """Convert the GTF file at *infilepath* into a GFF file at *outfilepath*.

    Blank lines are skipped. Every other line must consist of exactly nine
    tab-separated columns. Only entries whose start coordinate (4th column)
    lies within ``[startindex, endindex]`` are written. For each written
    entry:

    * the 1st column gets a ``_<gene number>`` suffix,
    * the 2nd column is replaced by *program*,
    * the 4th and 5th columns (start / end) are shifted by ``-startindex``.

    The gene number starts at 0 and is incremented for every ``start_codon``
    feature on the ``+`` strand and every ``stop_codon`` feature on the ``-``
    strand, whether or not that entry is inside the requested range.

    Reading stops at the first entry whose start coordinate is greater than
    *endindex*; later lines are never examined (this presumes the input is
    ordered by start coordinate).

    Raises:
        GTFException: if a non-blank line does not have exactly 9 columns.
        ValueError: if a start/end column cannot be parsed as an integer.
    """
    with open(infilepath, "r") as infile, open(outfilepath, "w") as outfile:
        gene_id = 0
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                continue
            words = line.split("\t")
            if len(words) != 9:
                raise GTFException("Encountered %d columns instead of the expected 9 in line: '%s'" % (len(words), line))
            feature = words[2]
            strand = words[6]
            # A new gene begins at its 5' end: the start codon on the forward
            # strand, the stop codon on the reverse strand.
            if "start_codon" in feature and strand == "+":
                gene_id += 1
            if "stop_codon" in feature and strand == "-":
                gene_id += 1
            # Parse the start coordinate once and keep the ORIGINAL value for
            # the range checks. Comparing the already shifted coordinate
            # against endindex would end the loop too early whenever
            # startindex is negative.
            start = int(words[3])
            if start > endindex:
                break
            if start >= startindex:
                words[0] += "_%d" % gene_id
                words[1] = program
                words[3] = str(start - startindex)
                words[4] = str(int(words[4]) - startindex)
                outfile.write("\t".join(words) + "\n")
def _exit_with_usage(parser, message):
    """Print the parser's help text followed by *message*, then exit with status 1."""
    parser.print_help()
    print(message)
    sys.exit(1)


def main(argv=None):
    """Parse the command line, validate the requested range and run gtf2gff.

    Args:
        argv: list of argument strings; ``None`` lets argparse read
            ``sys.argv[1:]``.

    At least two of --startindex, --length and --endindex must be given; the
    missing one is derived from ``endindex - startindex == length``. When no
    output file is named, the input path with its extension replaced by
    ``.gff`` is used.

    Exits with status 1, after printing the help text and a short message,
    when the range arguments are missing or inconsistent, or when the output
    path would be the same file as the input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--startindex', help="Start index of the part to extract. Entry Indices will be adjusted to this value, meaning, here you should be precise. Take the value: Sbjct_Index - Query_Index", type=int, nargs="?")
    parser.add_argument('-l', '--length', help="Length of the part to extract (endindex - startindex). Together with one of --startindex / --endindex it determines the other.", type=int, nargs="?")
    parser.add_argument('-e', '--endindex', help="End index. Only Entries smaller than this value are included", type=int, nargs="?")
    parser.add_argument('-p', '--program', help='The name of the program which generated the GTF file, e.g. twinscan or CONTRAST', required=True)
    parser.add_argument('infile', help="The GTF input file.")
    parser.add_argument('outfile', help="The GFF output file.", nargs="?")
    args = parser.parse_args(argv)

    # Check argument consistency
    given = [value is not None for value in (args.startindex, args.endindex, args.length)]
    if sum(given) < 2:
        _exit_with_usage(parser, "You need to specify at least two of --startindex, --length and --endindex")
    have_start = args.startindex is not None
    have_end = args.endindex is not None
    have_length = args.length is not None
    if have_start and have_end and args.startindex > args.endindex:
        _exit_with_usage(parser, 'Check your start and end indices!')
    if have_length and args.length < 1:
        _exit_with_usage(parser, 'Length too short')
    if have_length and have_start and have_end and (args.endindex - args.startindex) != args.length:
        _exit_with_usage(parser, 'Length does not match start/end index.')

    # Derive whichever bound was not given; the check above guarantees the
    # other two values are present.
    if args.startindex is None:
        args.startindex = args.endindex - args.length
    if args.endindex is None:
        args.endindex = args.startindex + args.length

    # Derive the output filename from the input filename if none was given
    outfilename = args.outfile
    if outfilename is None:
        outfilename = "{}.gff".format(os.path.splitext(args.infile)[0])

    # Opening the output for writing truncates it, so writing onto the input
    # (e.g. an input that already ends in .gff with no explicit outfile)
    # would destroy the data before it is read.
    if os.path.abspath(outfilename) == os.path.abspath(args.infile):
        _exit_with_usage(parser, 'Output file must differ from the input file.')

    # Execute the converter
    print(args.infile, outfilename)
    gtf2gff(args.infile, outfilename, args.startindex, args.endindex, args.program)


if __name__ == "__main__":
    main()