Problem:
You need to calculate the center star approximation for a given set of sequences. Instead of calculating the sequence distances and center string by hand, you want the computer to do the hard work.
Solution:
The script presented here is not intended to be used for high-volume MSA calculation, but to provide a simple reference for the reader on how to calculate it — and to reduce the work needing to be done by hand.
General algorithm:
- For each pair of sequences, calculate their edit distance matrix
- In the sequence distance matrix, calculate the sum of each row
- The sequence with the smallest row sum is called the center string
#!/usr/bin/env python3 """ Simple functions to calculate the center string of a list of sequences. Also calculates related matrices. Version 1.1: Python3 ready """ import numpy import pandas __author__ = "Uli Köhler" __license__ = "Apache license v2.0" __version__ = "1.1" def substitutionCost(a, b): """Return the cost of substituting character a with character b""" return 0 if a == b else 3 def calculateCostMatrix(a, b, gapCost=2): """For two sequences a, b, calculate their edit distance matrix""" #Initialize n+1 x m+1 matrix. x axis <--> a, y axis <--> b xdim = len(b) + 1 ydim = len(a) + 1 matrix = numpy.zeros((xdim, ydim)) #Initialize first row and first col with n*gap cost matrix[0,:] = numpy.arange(0, gapCost*ydim, step=gapCost) matrix[:,0] = numpy.arange(0, gapCost*xdim, step=gapCost) #Fill array for y in range(1, ydim): for x in range(1, xdim): substCost = matrix[x-1, y-1] + substitutionCost(b[x-1], a[y-1]) matrix[x,y] = min(substCost, matrix[x-1, y] + gapCost, matrix[x, y-1] + gapCost) return matrix def getEditDistance(a, b, gapCost=2): """Get the edit distance of two sequences a and b""" return calculateCostMatrix(a, b, gapCost)[-1,-1] def calculateSequenceDistanceMatrix(sequences, gapCost=2): """ Calculate a matrix of edit distance between a list of sequences. Return the matrix. """ #Note: For the sake of simplicity, this calculates ANY pair of sequences. # If performance matters, one would only calculate half of that dim = len(sequences) matrix = numpy.zeros((dim, dim)) for i in range(len(sequences)): for j in range(len(sequences)): matrix[i,j] = getEditDistance(sequences[i], sequences[j], gapCost) return matrix def calculateCenterString(distanceMatrix): """ Calculate the center string (represented by its index in the sequence array) for a given sequence edit distance matrix. Return the 1-based index of the center string """ #Calculate the sums for any row rowSums = [sum(distanceMatrix[:,i]) for i in range(distanceMatrix.shape[1])] #If the same row score occurs multiple times, use the first occurrence return rowSums.index(min(rowSums)) + 1 #+ 1: One-based indices def getPandasCostMatrix(a, b, gapCost=2): """ Calculate a cost matrix and convert it into a pandas data frame. Advantage over numpy ndarray: Labelled rows and columns """ return pandas.DataFrame(calculateCostMatrix(a, b, gapCost), index=list(" " + b), columns=list(" " + a)) def getPandasDistanceMatrix(sequences, gapCost=2): """ Calculate the sequence distance matrix of a list of sequences and convert the result to a labelled pandas DataFrame """ labels = ["s%d" % i for i in range(1, len(sequences) + 1)] return pandas.DataFrame(calculateSequenceDistanceMatrix(sequences, gapCost), index=labels, columns=labels) if __name__ == "__main__": #Example call sequences = ["ACGTGC","ACCTG", "AGGCTT", "AGCC"] #Print ONE (arbitrary) edit distance matrix print("s2 vs s3: \n", getPandasCostMatrix(sequences[1], sequences[3]),"\n") #Print the sequence distance matrix print("Sequence distances: \n", getPandasDistanceMatrix(sequences),"\n") #Print the center string print("Center string: s%d" % calculateCenterString(calculateSequenceDistanceMatrix(sequences, gapCost=2)))