Reading QUASAR scoring matrices in C++

Problem:

You want to read alignment matrices like BLOSUM62 in the QUASAR format. The solution needs to be integrable into C++ code easily.

Solution

Unfortunately the official toolkit only supports Java. In order to read QUASAR matrices efficiently, I wrote a small program that uses an abstract AlignmentMatrix class that stores the matrix in a single array. Currently only quadratic matrices with the same row/column labels are supported (that’s the only option that usually makes sense in real-world bioinformatics applications).

The main function currently only tests if a matrix file in QUASAR format can be read without errors.

/**
 * QUASARReader.cpp
 * An efficient reader for the QUASAR substitution matrix format.
 * See http://www.bio.ifi.lmu.de/QUASAR/
 * 
 * Requires Boost String algorithms (header-only)
 * 
 * Compile like this: clang++ -o quasar quasar.cpp
 * 
 * Version 1.0.3
 * 
 * Copyright (c) 2013 Uli Koehler
 * This file is released as public domain.
 * Please mention the original author if used in your software.
 */
#include <fstream>
#include <iostream>
#include <cassert>
#include <string>
#include <cstring>
#include <map>
#include <sstream>
#include <algorithm>
#include <vector>
#include <boost/algorithm/string.hpp>

/**
 * A quadratic substitution matrix.
 * This class only supports symmetric substitution
 * matrices that map equal character sets.
 */
class SubstitutionMatrix {
public:
    /**
     * Construct a new substitution matrix by character index and default score.
     */
    SubstitutionMatrix(const std::string& index, double defaultScore = 0) : index(index) {
        size = index.size();
        matrix = new double[size * size];
        for(size_t i = 0; i < size * size; i++) {
            matrix[i] = defaultScore;
        }
    }
    ~SubstitutionMatrix() {
        if(matrix != nullptr) {
            delete[] matrix;
        }
    }
    /**
     * For a given character, determine the index
     * in the matrix.
     * @return The index, or string::npos if not found
     */
    int getIndexForChar(char c) {
        return index.find(c);
    }
    /**
     * Set the score of replacing character a with character b.
     */
    void setScore(char a, char b, double score) {
        setScore(getIndexForChar(a), getIndexForChar(b), score);
    }
    /**
     * Set a score by index.
     * @param a The index of character a
     * @param b The index of character b
     * @param score The score of replacing a by b (or b by a)
     */
    void setScore(int a, int b, double score) {
        assert(a >= 0);
        assert(b >= 0);
        assert(a < size);
        assert(b < size);
        //Symmetric matrix - a->b == b-> a
        matrix[a + size*b] = score;
        matrix[b + size*a] = score;
    }
    /**
     * Get the score for replacing
     * the character a with the character b.
     * @return The score
     */
    double getScore(char a, char b) {
        //Replacing a and b does not matter (symmetric matrix)
        return getScore(getIndexForChar(a), getIndexForChar(b));
    }
    /**
     * Get the score for replacing
     * the character at position a with
     * the character at position b
     * @return The score
     */
    double getScore(int a, int b) {
        assert(a >= 0);
        assert(b >= 0);
        assert(a < size);
        assert(b < size);
        //Replacing a and b does not matter (symmetric matrix)
        return matrix[b + size * a];
    }
    /**
     * Get the matrix size.
     * The matrix has getMatrixSize()**2 entries.
     */
    size_t getMatrixSize() {
        return size * size;
    }
    /**
     * Get a string of all characters in the matrix alphabet
     */
    std::string getAlphabet() {
        return index;
    }
    /**
     * Get the raw matrix buffer.
     * The valid length is sizeof(double) * size * size
     */
    double* getMatrix() {
        return matrix;
    }
private:
    /**
     * The two dimensional matrix that contains the scores.
     * Row-first or Column-first indexing does not matter here
     * because the matrix is symmetric.
     */
    double* matrix;
    size_t size;
    /**
     * The list of alphabet characters,
     * assigned to the array indices of the matrix.
     * The first character is assigned to index 0
     */
    std::string index;
};

SubstitutionMatrix* readSubstitutionMatrix(const char* filename) {
    static const char rowIndexPrefix[] = "ROWINDEX";
    static const char colIndexPrefix[] = "COLINDEX";
    static const char matrixPrefix[] = "MATRIX";
    std::ifstream infile(filename);
    std::string line = "", rowIndex = "", colIndex = "";
    int matrixRowIndex = 0; //The number of MATRIX rows already processed
    SubstitutionMatrix* matrix = nullptr;
    while(std::getline(infile, line)) {
        if(line.compare(0, sizeof(rowIndexPrefix) - 1, rowIndexPrefix) == 0) {
            rowIndex = line.substr(sizeof(rowIndexPrefix));
            boost::algorithm::trim(rowIndex);
        } else if(line.compare(0, sizeof(colIndexPrefix) - 1, colIndexPrefix) == 0) {
            colIndex = line.substr(sizeof(colIndexPrefix));
            boost::algorithm::trim(colIndex);
        } else if(line.compare(0, sizeof(matrixPrefix) - 1, matrixPrefix) == 0) {
            //Initialize matrix if not done yet
            if(matrix == nullptr) {
                //Matrices with different row and column indices are currently unsupported
                assert(!rowIndex.empty());
                assert(!colIndex.empty());
                assert(rowIndex == colIndex);
                matrix = new SubstitutionMatrix(rowIndex);
            }
            //Split elements
            int matrixColIndex = 0; //The number of columns in the current row already processed
            std::vector<std::string> elements;
            std::string lineContent = boost::trim_copy(line.substr(sizeof(matrixPrefix)));
            boost::split(elements, lineContent, boost::is_any_of("\t "), boost::token_compress_on);
            for(std::string elem : elements) {
                boost::trim(elem);
                double score = atof(elem.c_str());
                //Row and column index are interchangable
                matrix->setScore(matrixRowIndex, matrixColIndex, score);
                matrixColIndex++;
            }
            matrixRowIndex++;
        }
    }
    infile.close();
    return matrix;
}

using std::cout;
using std::cerr;
using std::endl;

/**
 * The main function just tests if the matrix reader does not fail for a given matrix.
 */
int main(int argc, char** argv) {
    if(argc < 2) {
        cerr << "Usage: " << argv[0] << " <QUASAR input file>" << endl;
        return 1;
    }
    SubstitutionMatrix* matrix = readSubstitutionMatrix(argv[1]);
    delete matrix;
    return 0;
}