Center star approximation: Identifying the center string in Python
Problem:
You need to calculate the center star approximation for a given set of sequences. Instead of calculating the sequence distances and center string by hand, you want the computer to do the hard work.
Solution
The script presented here is not intended to be used for high-volume MSA calculation, but to provide a simple reference for the reader on how to calculate it — and to reduce the work needing to be done by hand.
General algorithm:
- For each pair of sequences, calculate their edit distance matrix
- In the sequence distance matrix, calculate the sum of each row
- The sequence with the smallest row sum is called the center string
#!/usr/bin/env python3
"""
Simple functions to calculate the center string of a list of sequences.
Also calculates related matrices.
Version 1.1: Python3 ready
"""
import numpy
import pandas
__author__ = "Uli Köhler"
__license__ = "Apache license v2.0"
__version__ = "1.1"
def substitutionCost(a, b):
"""Return the cost of substituting character a with character b"""
return 0 if a == b else 3
def calculateCostMatrix(a, b, gapCost=2):
"""For two sequences a, b, calculate their edit distance matrix"""
#Initialize n+1 x m+1 matrix. x axis <--> a, y axis <--> b
xdim = len(b) + 1
ydim = len(a) + 1
matrix = numpy.zeros((xdim, ydim))
#Initialize first row and first col with n*gap cost
matrix[0,:] = numpy.arange(0, gapCost*ydim, step=gapCost)
matrix[:,0] = numpy.arange(0, gapCost*xdim, step=gapCost)
#Fill array
for y in range(1, ydim):
for x in range(1, xdim):
substCost = matrix[x-1, y-1] + substitutionCost(b[x-1], a[y-1])
matrix[x,y] = min(substCost, matrix[x-1, y] + gapCost, matrix[x, y-1] + gapCost)
return matrix
def getEditDistance(a, b, gapCost=2):
"""Get the edit distance of two sequences a and b"""
return calculateCostMatrix(a, b, gapCost)[-1,-1]
def calculateSequenceDistanceMatrix(sequences, gapCost=2):
"""
Calculate a matrix of edit distance between a list of sequences.
Return the matrix.
"""
#Note: For the sake of simplicity, this calculates ANY pair of sequences.
# If performance matters, one would only calculate half of that
dim = len(sequences)
matrix = numpy.zeros((dim, dim))
for i in range(len(sequences)):
for j in range(len(sequences)):
matrix[i,j] = getEditDistance(sequences[i], sequences[j], gapCost)
return matrix
def calculateCenterString(distanceMatrix):
"""
Calculate the center string (represented by its index in the sequence array)
for a given sequence edit distance matrix.
Return the 1-based index of the center string
"""
#Calculate the sums for any row
rowSums = [sum(distanceMatrix[:,i]) for i in range(distanceMatrix.shape[1])]
#If the same row score occurs multiple times, use the first occurrence
return rowSums.index(min(rowSums)) + 1 #+ 1: One-based indices
def getPandasCostMatrix(a, b, gapCost=2):
"""
Calculate a cost matrix and convert it into a pandas data frame.
Advantage over numpy ndarray: Labelled rows and columns
"""
return pandas.DataFrame(calculateCostMatrix(a, b, gapCost), index=list(" " + b), columns=list(" " + a))
def getPandasDistanceMatrix(sequences, gapCost=2):
"""
Calculate the sequence distance matrix of a list of sequences
and convert the result to a labelled pandas DataFrame
"""
labels = ["s%d" % i for i in range(1, len(sequences) + 1)]
return pandas.DataFrame(calculateSequenceDistanceMatrix(sequences, gapCost), index=labels, columns=labels)
if __name__ == "__main__":
#Example call
sequences = ["ACGTGC","ACCTG", "AGGCTT", "AGCC"]
#Print ONE (arbitrary) edit distance matrix
print("s2 vs s3: \n", getPandasCostMatrix(sequences[1], sequences[3]),"\n")
#Print the sequence distance matrix
print("Sequence distances: \n", getPandasDistanceMatrix(sequences),"\n")
#Print the center string
print("Center string: s%d" % calculateCenterString(calculateSequenceDistanceMatrix(sequences, gapCost=2)))