Center star approximation: Identifying the center string in Python

Problem:

You need to calculate the center star approximation for a given set of sequences. Instead of calculating the sequence distances and center string by hand, you want the computer to do the hard work.

Solution:

The script presented here is not intended to be used for high-volume MSA calculation, but to provide a simple reference for the reader on how to calculate it — and to reduce the work needing to be done by hand.

General algorithm:

  • For each pair of sequences, calculate their edit distance matrix
  • In the sequence distance matrix, calculate the sum of each row
  • The sequence with the smallest row sum is called the center string
#!/usr/bin/env python3
"""
Simple functions to calculate the center string of a list of sequences.
Also calculates related matrices.

Version 1.1: Python3 ready
"""
import numpy
import pandas

__author__  = "Uli Köhler"
__license__ = "Apache license v2.0"
__version__ = "1.1"

def substitutionCost(a, b): 
    """Return the cost of substituting character a with character b"""
    return 0 if a == b else 3

def calculateCostMatrix(a, b, gapCost=2):
    """For two sequences a, b, calculate their edit distance matrix"""
    #Initialize n+1 x m+1 matrix. x axis <--> a, y axis <--> b
    xdim = len(b) + 1
    ydim = len(a) + 1
    matrix = numpy.zeros((xdim, ydim))
    #Initialize first row and first col with n*gap cost
    matrix[0,:] = numpy.arange(0, gapCost*ydim, step=gapCost)
    matrix[:,0] = numpy.arange(0, gapCost*xdim, step=gapCost)
    #Fill array
    for y in range(1, ydim):
        for x in range(1, xdim):
            substCost = matrix[x-1, y-1] + substitutionCost(b[x-1], a[y-1])
            matrix[x,y] = min(substCost, matrix[x-1, y] + gapCost, matrix[x, y-1] + gapCost)
    return matrix

def getEditDistance(a, b, gapCost=2):
    """Get the edit distance of two sequences a and b"""
    return calculateCostMatrix(a, b, gapCost)[-1,-1]

def calculateSequenceDistanceMatrix(sequences, gapCost=2):
    """
    Calculate a matrix of edit distance between a list of sequences.
    Return the matrix.
    """
    #Note: For the sake of simplicity, this calculates ANY pair of sequences.
    # If performance matters, one would only calculate half of that
    dim = len(sequences)
    matrix = numpy.zeros((dim, dim))
    for i in range(len(sequences)):
        for j in range(len(sequences)):
            matrix[i,j] = getEditDistance(sequences[i], sequences[j], gapCost)
    return matrix

def calculateCenterString(distanceMatrix):
    """
    Calculate the center string (represented by its index in the sequence array)
    for a given sequence edit distance matrix.
    Return the 1-based index of the center string
    """
    #Calculate the sums for any row
    rowSums = [sum(distanceMatrix[:,i]) for i in range(distanceMatrix.shape[1])]
    #If the same row score occurs multiple times, use the first occurrence
    return rowSums.index(min(rowSums)) + 1 #+ 1: One-based indices

def getPandasCostMatrix(a, b, gapCost=2):
    """
    Calculate a cost matrix and convert it into a pandas data frame.
    Advantage over numpy ndarray: Labelled rows and columns
    """
    return pandas.DataFrame(calculateCostMatrix(a, b, gapCost), index=list(" " + b), columns=list(" " + a))

def getPandasDistanceMatrix(sequences, gapCost=2):
    """
    Calculate the sequence distance matrix of a list of sequences
    and convert the result to a labelled pandas DataFrame
    """
    labels = ["s%d" % i for i in range(1, len(sequences) + 1)]
    return pandas.DataFrame(calculateSequenceDistanceMatrix(sequences, gapCost), index=labels, columns=labels)

if __name__ == "__main__":
    #Example call
    sequences = ["ACGTGC","ACCTG", "AGGCTT", "AGCC"]
    #Print ONE (arbitrary) edit distance matrix
    print("s2 vs s3: \n", getPandasCostMatrix(sequences[1], sequences[3]),"\n")
    #Print the sequence distance matrix
    print("Sequence distances: \n", getPandasDistanceMatrix(sequences),"\n")
    #Print the center string
    print("Center string: s%d" % calculateCenterString(calculateSequenceDistanceMatrix(sequences, gapCost=2)))