Center star approximation: Identifying the center string in Python
Problem:
You need to calculate the center star approximation for a given set of sequences. Instead of calculating the sequence distances and center string by hand, you want the computer to do the hard work.
Solution:
The script presented here is not intended to be used for high-volume MSA calculation, but to provide a simple reference for the reader on how to calculate it — and to reduce the work needing to be done by hand.
General algorithm:
- For each pair of sequences, calculate their edit distance matrix
- In the sequence distance matrix, calculate the sum of each row
- The sequence with the smallest row sum is called the center string
#!/usr/bin/env python3
"""
Simple functions to calculate the center string of a list of sequences.
Also calculates related matrices.
Version 1.1: Python3 ready
"""
import numpy
import pandas
__author__ = "Uli Köhler"
__license__ = "Apache license v2.0"
__version__ = "1.1"
def substitutionCost(a, b):
"""Return the cost of substituting character a with character b"""
return 0 if a == b else 3
def calculateCostMatrix(a, b, gapCost=2):
"""For two sequences a, b, calculate their edit distance matrix"""
#Initialize n+1 x m+1 matrix. x axis <--> a, y axis <--> b
xdim = len(b) + 1
ydim = len(a) + 1
matrix = numpy.zeros((xdim, ydim))
#Initialize first row and first col with n*gap cost
matrix[0,:] = numpy.arange(0, gapCost*ydim, step=gapCost)
matrix[:,0] = numpy.arange(0, gapCost*xdim, step=gapCost)
#Fill array
for y in range(1, ydim):
for x in range(1, xdim):
substCost = matrix[x-1, y-1] + substitutionCost(b[x-1], a[y-1])
matrix[x,y] = min(substCost, matrix[x-1, y] + gapCost, matrix[x, y-1] + gapCost)
return matrix
def getEditDistance(a, b, gapCost=2):
"""Get the edit distance of two sequences a and b"""
return calculateCostMatrix(a, b, gapCost)[-1,-1]
def calculateSequenceDistanceMatrix(sequences, gapCost=2):
"""
Calculate a matrix of edit distance between a list of sequences.
Return the matrix.
"""
#Note: For the sake of simplicity, this calculates ANY pair of sequences.
# If performance matters, one would only calculate half of that
dim = len(sequences)
matrix = numpy.zeros((dim, dim))
for i in range(len(sequences)):
for j in range(len(sequences)):
matrix[i,j] = getEditDistance(sequences[i], sequences[j], gapCost)
return matrix
def calculateCenterString(distanceMatrix):
"""
Calculate the center string (represented by its index in the sequence array)
for a given sequence edit distance matrix.
Return the 1-based index of the center string
"""
#Calculate the sums for any row
rowSums = [sum(distanceMatrix[:,i]) for i in range(distanceMatrix.shape[1])]
#If the same row score occurs multiple times, use the first occurrence
return rowSums.index(min(rowSums)) + 1 #+ 1: One-based indices
def getPandasCostMatrix(a, b, gapCost=2):
"""
Calculate a cost matrix and convert it into a pandas data frame.
Advantage over numpy ndarray: Labelled rows and columns
"""
return pandas.DataFrame(calculateCostMatrix(a, b, gapCost), index=list(" " + b), columns=list(" " + a))
def getPandasDistanceMatrix(sequences, gapCost=2):
"""
Calculate the sequence distance matrix of a list of sequences
and convert the result to a labelled pandas DataFrame
"""
labels = ["s%d" % i for i in range(1, len(sequences) + 1)]
return pandas.DataFrame(calculateSequenceDistanceMatrix(sequences, gapCost), index=labels, columns=labels)
if __name__ == "__main__":
#Example call
sequences = ["ACGTGC","ACCTG", "AGGCTT", "AGCC"]
#Print ONE (arbitrary) edit distance matrix
print("s2 vs s3: \n", getPandasCostMatrix(sequences[1], sequences[3]),"\n")
#Print the sequence distance matrix
print("Sequence distances: \n", getPandasDistanceMatrix(sequences),"\n")
#Print the center string
print("Center string: s%d" % calculateCenterString(calculateSequenceDistanceMatrix(sequences, gapCost=2)))