Problem:
You want to read alignment matrices like BLOSUM62 in the QUASAR format. The solution needs to be integrable into C++ code easily.
Solution:
Unfortunately the official toolkit only supports Java. In order to read QUASAR matrices efficiently, I wrote a small program that uses an abstract AlignmentMatrix class that stores the matrix in a single array. Currently only quadratic matrices with the same row/column labels are supported (that’s the only option that usually makes sense in real-world bioinformatics applications).
The main function currently only tests if a matrix file in QUASAR format can be read without errors.
/** * QUASARReader.cpp * An efficient reader for the QUASAR substitution matrix format. * See http://www.bio.ifi.lmu.de/QUASAR/ * * Requires Boost String algorithms (header-only) * * Compile like this: clang++ -o quasar quasar.cpp * * Version 1.0.3 * * Copyright (c) 2013 Uli Koehler * This file is released as public domain. * Please mention the original author if used in your software. */ #include <fstream> #include <iostream> #include <cassert> #include <string> #include <cstring> #include <map> #include <sstream> #include <algorithm> #include <vector> #include <boost/algorithm/string.hpp> /** * A quadratic substitution matrix. * This class only supports symmetric substitution * matrices that map equal character sets. */ class SubstitutionMatrix { public: /** * Construct a new substitution matrix by character index and default score. */ SubstitutionMatrix(const std::string& index, double defaultScore = 0) : index(index) { size = index.size(); matrix = new double[size * size]; for(size_t i = 0; i < size * size; i++) { matrix[i] = defaultScore; } } ~SubstitutionMatrix() { if(matrix != nullptr) { delete[] matrix; } } /** * For a given character, determine the index * in the matrix. * @return The index, or string::npos if not found */ int getIndexForChar(char c) { return index.find(c); } /** * Set the score of replacing character a with character b. */ void setScore(char a, char b, double score) { setScore(getIndexForChar(a), getIndexForChar(b), score); } /** * Set a score by index. * @param a The index of character a * @param b The index of character b * @param score The score of replacing a by b (or b by a) */ void setScore(int a, int b, double score) { assert(a >= 0); assert(b >= 0); assert(a < size); assert(b < size); //Symmetric matrix - a->b == b-> a matrix[a + size*b] = score; matrix[b + size*a] = score; } /** * Get the score for replacing * the character a with the character b. * @return The score */ double getScore(char a, char b) { //Replacing a and b does not matter (symmetric matrix) return getScore(getIndexForChar(a), getIndexForChar(b)); } /** * Get the score for replacing * the character at position a with * the character at position b * @return The score */ double getScore(int a, int b) { assert(a >= 0); assert(b >= 0); assert(a < size); assert(b < size); //Replacing a and b does not matter (symmetric matrix) return matrix[b + size * a]; } /** * Get the matrix size. * The matrix has getMatrixSize()**2 entries. */ size_t getMatrixSize() { return size * size; } /** * Get a string of all characters in the matrix alphabet */ std::string getAlphabet() { return index; } /** * Get the raw matrix buffer. * The valid length is sizeof(double) * size * size */ double* getMatrix() { return matrix; } private: /** * The two dimensional matrix that contains the scores. * Row-first or Column-first indexing does not matter here * because the matrix is symmetric. */ double* matrix; size_t size; /** * The list of alphabet characters, * assigned to the array indices of the matrix. * The first character is assigned to index 0 */ std::string index; }; SubstitutionMatrix* readSubstitutionMatrix(const char* filename) { static const char rowIndexPrefix[] = "ROWINDEX"; static const char colIndexPrefix[] = "COLINDEX"; static const char matrixPrefix[] = "MATRIX"; std::ifstream infile(filename); std::string line = "", rowIndex = "", colIndex = ""; int matrixRowIndex = 0; //The number of MATRIX rows already processed SubstitutionMatrix* matrix = nullptr; while(std::getline(infile, line)) { if(line.compare(0, sizeof(rowIndexPrefix) - 1, rowIndexPrefix) == 0) { rowIndex = line.substr(sizeof(rowIndexPrefix)); boost::algorithm::trim(rowIndex); } else if(line.compare(0, sizeof(colIndexPrefix) - 1, colIndexPrefix) == 0) { colIndex = line.substr(sizeof(colIndexPrefix)); boost::algorithm::trim(colIndex); } else if(line.compare(0, sizeof(matrixPrefix) - 1, matrixPrefix) == 0) { //Initialize matrix if not done yet if(matrix == nullptr) { //Matrices with different row and column indices are currently unsupported assert(!rowIndex.empty()); assert(!colIndex.empty()); assert(rowIndex == colIndex); matrix = new SubstitutionMatrix(rowIndex); } //Split elements int matrixColIndex = 0; //The number of columns in the current row already processed std::vector<std::string> elements; std::string lineContent = boost::trim_copy(line.substr(sizeof(matrixPrefix))); boost::split(elements, lineContent, boost::is_any_of("\t "), boost::token_compress_on); for(std::string elem : elements) { boost::trim(elem); double score = atof(elem.c_str()); //Row and column index are interchangable matrix->setScore(matrixRowIndex, matrixColIndex, score); matrixColIndex++; } matrixRowIndex++; } } infile.close(); return matrix; } using std::cout; using std::cerr; using std::endl; /** * The main function just tests if the matrix reader does not fail for a given matrix. */ int main(int argc, char** argv) { if(argc < 2) { cerr << "Usage: " << argv[0] << " <QUASAR input file>" << endl; return 1; } SubstitutionMatrix* matrix = readSubstitutionMatrix(argv[1]); delete matrix; return 0; }