A simple tool for FASTA statistics
The issue
It is surprisingly difficult to compute simple statistics of FASTA files using existing software. I recently needed to compute the nucleotide count and relative GC frequency of a single sequence in FASTA format, but unless you install dependency-heavy native software like FASTX, or develop it yourself using BioPython or similar, there doesn't seem to be a simple, dependency-free solution for this simple set of problems.
The solution: fasta-stats.py
I developed a simple Python script, using only the standard Python libraries, which is able to generate these statistics for one or multiple FASTA files (plain or gzipped), with one or multiple sequences each. The absolute and relative frequency of each character (where a character is usually an amino acid code or a nucleotide) is calculated and printed for each sequence of the input files. If a file contains a nucleotide sequence (i.e. it contains ATGC characters only), the A+T and G+C counts are automatically printed as well.
Per default, the characters are counted in a case-insensitive manner.
Usage examples:
#Simple statistics for multiple files, auto-detects .gz input
fasta-stats.py sequence1.fa sequence.2.fa.gz
#Count characters case-sensitive
fasta-stats.py --case-sensitive sequence1.fa sequence.2.fa.gz
#Only count ATGC characters, ignore all other characters
fasta-stats.py --only ATGC sequence1.fa sequence.2.fa.gz
Script source:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fasta-stats.py: Utility script to count number of nucleotides/Aminoacids in a FASTA files.
Changelog:
1.1: Python3-ready
"""
from __future__ import with_statement
import sys
import argparse
import gzip
#Counter is used for per-character statistics
from collections import Counter
__author__ = "Uli Koehler & Anton Smirnov"
__copyright__ = "Copyright 2013 Uli Koehler & Anton Smirnov"
__license__ = "Apache v2.0"
__version__ = "1.1"
def printSequenceStats(fileName, sequenceName, charOccurrenceMap, totalCharCount):
    """
    Print details about a sequence to stdout.
    Called from inside parseFile().
    Keyword arguments:
        fileName: The name of the FASTA file the sequence was read from
        sequenceName: The name of the sequence to print
        charOccurrenceMap: A dict-like object that contains a char -->
                           number of occurrences mapping
        totalCharCount: The overall character count of the sequence
    """
    print ("Sequence '{}' from FASTA file '{}' contains {} sequence characters:".format(
        sequenceName, fileName, totalCharCount))
    for char in sorted(charOccurrenceMap.keys()):
        charCount = charOccurrenceMap[char]
        relativeFrequency = charCount * 100.0 / totalCharCount
        print ("\t{} : {} = {}%".format(char, charCount, relativeFrequency))
    #For nucleotide sequences (ATGC only), also print A+T vs G+C count
    if sorted(charOccurrenceMap.keys()) == ["A","C","G","T"]:
        #Print A+T count
        atCount = charOccurrenceMap["A"] + charOccurrenceMap["T"]
        atRelFrequency = atCount * 100.0 / totalCharCount
        print ("\tA+T : {} = {}%".format(atCount, atRelFrequency))
        #Print G+C count
        #Fixed: "%%"" is not an escape in str.format() and printed a literal
        #double percent sign; also use a correctly-named variable for G+C
        gcCount = charOccurrenceMap["G"] + charOccurrenceMap["C"]
        gcRelFrequency = gcCount * 100.0 / totalCharCount
        print ("\tG+C : {} = {}%".format(gcCount, gcRelFrequency))
def parseFile(filename, caseSensitive=False, charWhitelist=None):
    """
    Parse a FASTA file and print per-sequence statistics via printSequenceStats().
    Keyword arguments:
        filename: Path to a plain or gzipped FASTA file (.gz is auto-detected)
        caseSensitive: If False (default), characters are uppercased before counting
        charWhitelist: If not None, only characters in this set are counted
    """
    #Set to the header line, with ">" removed from the beginning
    sequenceName = None
    #Key: character, value: Number of occurrences
    charOccurrenceMap = Counter()
    #The number of characters in the current sequence, not including \n
    charCount = 0
    #Keep track of consecutive comments, because they are appended
    previousLineWasComment = False
    #Open the file; "rt" is required for gzip on Python 3, which otherwise
    #yields bytes and breaks the str-based startswith() checks below
    if filename.endswith(".gz"):
        infile = gzip.open(filename, "rt")
    else:
        infile = open(filename, "r")
    with infile:
        for line in infile:
            line = line.strip()
            #Be super-compatible with the original specification:
            #both ">" and ";" introduce header/comment lines
            if line.startswith(">") or line.startswith(";"):
                if previousLineWasComment:
                    #Consecutive comment lines belong to the same record:
                    #append with one space to normalize whitespace
                    sequenceName += " " + line[1:].strip()
                else:
                    #A new record starts: flush the previous one, if any.
                    #(Flushing only here avoids printing a spurious empty
                    #record between consecutive comment lines.)
                    if sequenceName is not None:
                        printSequenceStats(filename, sequenceName, charOccurrenceMap, charCount)
                        charOccurrenceMap = Counter()
                        charCount = 0
                    #Take the entire comment line as (new) sequence ID (with ">" stripped)
                    sequenceName = line[1:].strip()
                previousLineWasComment = True
            else: #Line belongs to the sequence
                previousLineWasComment = False
                #Line has been stripped before, so we can count directly
                for char in line:
                    #Skip any character not in the whitelist, if whitelist (--only) is enabled
                    if charWhitelist is not None and char not in charWhitelist:
                        continue
                    #We can only count after the whitelist filter
                    charCount += 1
                    #In case-insensitive mode (default) count uppercased chars only
                    char = char if caseSensitive else char.upper()
                    charOccurrenceMap[char] += 1
    #The last line has been read, print the last record, if any
    if sequenceName is not None:
        printSequenceStats(filename, sequenceName, charOccurrenceMap, charCount)
if __name__ == "__main__":
    #Command-line interface: one or more FASTA files may be given at once
    argParser = argparse.ArgumentParser(description='Compute simple statistics for FASTA files.')
    argParser.add_argument('infiles', nargs='+', help='FASTA files (.fa, .fa.gz) to generate statistics for')
    argParser.add_argument('--case-sensitive', action='store_true', help='Count characters in a case-sensitive way. Disabled per default.')
    argParser.add_argument('-o','--only', help='If this option is supplied (e.g. set to \'ATGC\'), characters not in the set will be ignored for all statistics')
    parsedArgs = argParser.parse_args()
    #Generate and print statistics for every input file, in order
    for fastaFile in parsedArgs.infiles:
        parseFile(fastaFile, caseSensitive=parsedArgs.case_sensitive, charWhitelist=parsedArgs.only)