Reading QUASAR scoring matrices in C++
Problem:
You want to read alignment matrices like BLOSUM62 in the QUASAR format. The solution needs to be integrable into C++ code easily.
Solution
Unfortunately the official toolkit only supports Java. In order to read QUASAR matrices efficiently, I wrote a small program that uses an abstract AlignmentMatrix class that stores the matrix in a single array. Currently only quadratic matrices with the same row/column labels are supported (that’s the only option that usually makes sense in real-world bioinformatics applications).
The main function currently only tests if a matrix file in QUASAR format can be read without errors.
/**
* QUASARReader.cpp
* An efficient reader for the QUASAR substitution matrix format.
* See http://www.bio.ifi.lmu.de/QUASAR/
*
* Requires Boost String algorithms (header-only)
*
* Compile like this: clang++ -o quasar quasar.cpp
*
* Version 1.0.3
*
* Copyright (c) 2013 Uli Koehler
* This file is released as public domain.
* Please mention the original author if used in your software.
*/
#include <fstream>
#include <iostream>
#include <cassert>
#include <string>
#include <cstring>
#include <map>
#include <sstream>
#include <algorithm>
#include <vector>
#include <boost/algorithm/string.hpp>
/**
* A quadratic substitution matrix.
* This class only supports symmetric substitution
* matrices that map equal character sets.
*/
class SubstitutionMatrix {
public:
/**
* Construct a new substitution matrix by character index and default score.
*/
SubstitutionMatrix(const std::string& index, double defaultScore = 0) : index(index) {
size = index.size();
matrix = new double[size * size];
for(size_t i = 0; i < size * size; i++) {
matrix[i] = defaultScore;
}
}
~SubstitutionMatrix() {
if(matrix != nullptr) {
delete[] matrix;
}
}
/**
* For a given character, determine the index
* in the matrix.
* @return The index, or string::npos if not found
*/
int getIndexForChar(char c) {
return index.find(c);
}
/**
* Set the score of replacing character a with character b.
*/
void setScore(char a, char b, double score) {
setScore(getIndexForChar(a), getIndexForChar(b), score);
}
/**
* Set a score by index.
* @param a The index of character a
* @param b The index of character b
* @param score The score of replacing a by b (or b by a)
*/
void setScore(int a, int b, double score) {
assert(a >= 0);
assert(b >= 0);
assert(a < size);
assert(b < size);
//Symmetric matrix - a->b == b-> a
matrix[a + size*b] = score;
matrix[b + size*a] = score;
}
/**
* Get the score for replacing
* the character a with the character b.
* @return The score
*/
double getScore(char a, char b) {
//Replacing a and b does not matter (symmetric matrix)
return getScore(getIndexForChar(a), getIndexForChar(b));
}
/**
* Get the score for replacing
* the character at position a with
* the character at position b
* @return The score
*/
double getScore(int a, int b) {
assert(a >= 0);
assert(b >= 0);
assert(a < size);
assert(b < size);
//Replacing a and b does not matter (symmetric matrix)
return matrix[b + size * a];
}
/**
* Get the matrix size.
* The matrix has getMatrixSize()**2 entries.
*/
size_t getMatrixSize() {
return size * size;
}
/**
* Get a string of all characters in the matrix alphabet
*/
std::string getAlphabet() {
return index;
}
/**
* Get the raw matrix buffer.
* The valid length is sizeof(double) * size * size
*/
double* getMatrix() {
return matrix;
}
private:
/**
* The two dimensional matrix that contains the scores.
* Row-first or Column-first indexing does not matter here
* because the matrix is symmetric.
*/
double* matrix;
size_t size;
/**
* The list of alphabet characters,
* assigned to the array indices of the matrix.
* The first character is assigned to index 0
*/
std::string index;
};
SubstitutionMatrix* readSubstitutionMatrix(const char* filename) {
static const char rowIndexPrefix[] = "ROWINDEX";
static const char colIndexPrefix[] = "COLINDEX";
static const char matrixPrefix[] = "MATRIX";
std::ifstream infile(filename);
std::string line = "", rowIndex = "", colIndex = "";
int matrixRowIndex = 0; //The number of MATRIX rows already processed
SubstitutionMatrix* matrix = nullptr;
while(std::getline(infile, line)) {
if(line.compare(0, sizeof(rowIndexPrefix) - 1, rowIndexPrefix) == 0) {
rowIndex = line.substr(sizeof(rowIndexPrefix));
boost::algorithm::trim(rowIndex);
} else if(line.compare(0, sizeof(colIndexPrefix) - 1, colIndexPrefix) == 0) {
colIndex = line.substr(sizeof(colIndexPrefix));
boost::algorithm::trim(colIndex);
} else if(line.compare(0, sizeof(matrixPrefix) - 1, matrixPrefix) == 0) {
//Initialize matrix if not done yet
if(matrix == nullptr) {
//Matrices with different row and column indices are currently unsupported
assert(!rowIndex.empty());
assert(!colIndex.empty());
assert(rowIndex == colIndex);
matrix = new SubstitutionMatrix(rowIndex);
}
//Split elements
int matrixColIndex = 0; //The number of columns in the current row already processed
std::vector<std::string> elements;
std::string lineContent = boost::trim_copy(line.substr(sizeof(matrixPrefix)));
boost::split(elements, lineContent, boost::is_any_of("\t "), boost::token_compress_on);
for(std::string elem : elements) {
boost::trim(elem);
double score = atof(elem.c_str());
//Row and column index are interchangable
matrix->setScore(matrixRowIndex, matrixColIndex, score);
matrixColIndex++;
}
matrixRowIndex++;
}
}
infile.close();
return matrix;
}
using std::cout;
using std::cerr;
using std::endl;
/**
* The main function just tests if the matrix reader does not fail for a given matrix.
*/
int main(int argc, char** argv) {
if(argc < 2) {
cerr << "Usage: " << argv[0] << " <QUASAR input file>" << endl;
return 1;
}
SubstitutionMatrix* matrix = readSubstitutionMatrix(argv[1]);
delete matrix;
return 0;
}