You need to calculate the center star approximation for a given set of sequences. Instead of calculating the sequence distances and center string by hand, you want the computer to do the hard work.
The script presented here is not intended to be used for high-volume MSA calculation, but to provide a simple reference for the reader on how to calculate it — and to reduce the work needing to be done by hand.
- For each pair of sequences, calculate their edit distance matrix
- In the sequence distance matrix, calculate the sum of each row
- The sequence with the smallest row sum is called the center string
#!/usr/bin/env python3 """ Simple functions to calculate the center string of a list of sequences. Also calculates related matrices. Version 1.1: Python3 ready """ import numpy import pandas __author__ = "Uli Köhler" __license__ = "Apache license v2.0" __version__ = "1.1" def substitutionCost(a, b): """Return the cost of substituting character a with character b""" return 0 if a == b else 3 def calculateCostMatrix(a, b, gapCost=2): """For two sequences a, b, calculate their edit distance matrix""" #Initialize n+1 x m+1 matrix. x axis <--> a, y axis <--> b xdim = len(b) + 1 ydim = len(a) + 1 matrix = numpy.zeros((xdim, ydim)) #Initialize first row and first col with n*gap cost matrix[0,:] = numpy.arange(0, gapCost*ydim, step=gapCost) matrix[:,0] = numpy.arange(0, gapCost*xdim, step=gapCost) #Fill array for y in range(1, ydim): for x in range(1, xdim): substCost = matrix[x-1, y-1] + substitutionCost(b[x-1], a[y-1]) matrix[x,y] = min(substCost, matrix[x-1, y] + gapCost, matrix[x, y-1] + gapCost) return matrix def getEditDistance(a, b, gapCost=2): """Get the edit distance of two sequences a and b""" return calculateCostMatrix(a, b, gapCost)[-1,-1] def calculateSequenceDistanceMatrix(sequences, gapCost=2): """ Calculate a matrix of edit distance between a list of sequences. Return the matrix. """ #Note: For the sake of simplicity, this calculates ANY pair of sequences. # If performance matters, one would only calculate half of that dim = len(sequences) matrix = numpy.zeros((dim, dim)) for i in range(len(sequences)): for j in range(len(sequences)): matrix[i,j] = getEditDistance(sequences[i], sequences[j], gapCost) return matrix def calculateCenterString(distanceMatrix): """ Calculate the center string (represented by its index in the sequence array) for a given sequence edit distance matrix. Return the 1-based index of the center string """ #Calculate the sums for any row rowSums = [sum(distanceMatrix[:,i]) for i in range(distanceMatrix.shape)] #If the same row score occurs multiple times, use the first occurrence return rowSums.index(min(rowSums)) + 1 #+ 1: One-based indices def getPandasCostMatrix(a, b, gapCost=2): """ Calculate a cost matrix and convert it into a pandas data frame. Advantage over numpy ndarray: Labelled rows and columns """ return pandas.DataFrame(calculateCostMatrix(a, b, gapCost), index=list(" " + b), columns=list(" " + a)) def getPandasDistanceMatrix(sequences, gapCost=2): """ Calculate the sequence distance matrix of a list of sequences and convert the result to a labelled pandas DataFrame """ labels = ["s%d" % i for i in range(1, len(sequences) + 1)] return pandas.DataFrame(calculateSequenceDistanceMatrix(sequences, gapCost), index=labels, columns=labels) if __name__ == "__main__": #Example call sequences = ["ACGTGC","ACCTG", "AGGCTT", "AGCC"] #Print ONE (arbitrary) edit distance matrix print("s2 vs s3: \n", getPandasCostMatrix(sequences, sequences),"\n") #Print the sequence distance matrix print("Sequence distances: \n", getPandasDistanceMatrix(sequences),"\n") #Print the center string print("Center string: s%d" % calculateCenterString(calculateSequenceDistanceMatrix(sequences, gapCost=2)))