Center star approximation: Identifying the center string in Python

Problem:

You need to calculate the center star approximation for a given set of sequences. Instead of calculating the sequence distances and center string by hand, you want the computer to do the hard work.

Solution

The script presented here is not intended to be used for high-volume MSA calculation, but to provide a simple reference for the reader on how to calculate it — and to reduce the work needing to be done by hand.

General algorithm:

#!/usr/bin/env python3
"""
Simple functions to calculate the center string of a list of sequences.
Also calculates related matrices.

Version 1.1: Python3 ready
"""
import numpy
import pandas

__author__  = "Uli Köhler"
__license__ = "Apache license v2.0"
__version__ = "1.1"

def substitutionCost(a, b): 
    """Return the cost of substituting character a with character b"""
    return 0 if a == b else 3

def calculateCostMatrix(a, b, gapCost=2):
    """For two sequences a, b, calculate their edit distance matrix"""
    #Initialize n+1 x m+1 matrix. x axis <--> a, y axis <--> b
    xdim = len(b) + 1
    ydim = len(a) + 1
    matrix = numpy.zeros((xdim, ydim))
    #Initialize first row and first col with n*gap cost
    matrix[0,:] = numpy.arange(0, gapCost*ydim, step=gapCost)
    matrix[:,0] = numpy.arange(0, gapCost*xdim, step=gapCost)
    #Fill array
    for y in range(1, ydim):
        for x in range(1, xdim):
            substCost = matrix[x-1, y-1] + substitutionCost(b[x-1], a[y-1])
            matrix[x,y] = min(substCost, matrix[x-1, y] + gapCost, matrix[x, y-1] + gapCost)
    return matrix

def getEditDistance(a, b, gapCost=2):
    """Get the edit distance of two sequences a and b"""
    return calculateCostMatrix(a, b, gapCost)[-1,-1]

def calculateSequenceDistanceMatrix(sequences, gapCost=2):
    """
    Calculate a matrix of edit distance between a list of sequences.
    Return the matrix.
    """
    #Note: For the sake of simplicity, this calculates ANY pair of sequences.
    # If performance matters, one would only calculate half of that
    dim = len(sequences)
    matrix = numpy.zeros((dim, dim))
    for i in range(len(sequences)):
        for j in range(len(sequences)):
            matrix[i,j] = getEditDistance(sequences[i], sequences[j], gapCost)
    return matrix

def calculateCenterString(distanceMatrix):
    """
    Calculate the center string (represented by its index in the sequence array)
    for a given sequence edit distance matrix.
    Return the 1-based index of the center string
    """
    #Calculate the sums for any row
    rowSums = [sum(distanceMatrix[:,i]) for i in range(distanceMatrix.shape[1])]
    #If the same row score occurs multiple times, use the first occurrence
    return rowSums.index(min(rowSums)) + 1 #+ 1: One-based indices

def getPandasCostMatrix(a, b, gapCost=2):
    """
    Calculate a cost matrix and convert it into a pandas data frame.
    Advantage over numpy ndarray: Labelled rows and columns
    """
    return pandas.DataFrame(calculateCostMatrix(a, b, gapCost), index=list(" " + b), columns=list(" " + a))

def getPandasDistanceMatrix(sequences, gapCost=2):
    """
    Calculate the sequence distance matrix of a list of sequences
    and convert the result to a labelled pandas DataFrame
    """
    labels = ["s%d" % i for i in range(1, len(sequences) + 1)]
    return pandas.DataFrame(calculateSequenceDistanceMatrix(sequences, gapCost), index=labels, columns=labels)

if __name__ == "__main__":
    #Example call
    sequences = ["ACGTGC","ACCTG", "AGGCTT", "AGCC"]
    #Print ONE (arbitrary) edit distance matrix
    print("s2 vs s3: \n", getPandasCostMatrix(sequences[1], sequences[3]),"\n")
    #Print the sequence distance matrix
    print("Sequence distances: \n", getPandasDistanceMatrix(sequences),"\n")
    #Print the center string
    print("Center string: s%d" % calculateCenterString(calculateSequenceDistanceMatrix(sequences, gapCost=2)))