How to parse Simulink Coder system hierarchy from generated source code

In source code generated by Simulink Coder, the main .h file contains a comment that lists the system hierarchy. You need this comment to interpret other comments in the generated code, such as Referenced by: '<S291>/index4', because it maps each system identifier (e.g. <S291>) to the full path of the corresponding subsystem in the model.

The system hierarchy comment looks like this:

/**
 * [...]
 * 
 *  Here is the system hierarchy for this model
 *  [...]
 *  '<S27>'  : 'MySimulinkModel/MyImportantSubsystem'
 *  [...]
 */

with one line per subsystem.

In our previous post, "How to parse C++ comments from source code using clang", we showed how to parse C++ comments using clang. We can use the same approach to parse the system hierarchy comment.

Parsing the system hierarchy comment

The following code uses the clang-based comment extraction from the previous post (see above) together with a regex-based approach to extract the system hierarchy from the comment.

#include <clang-c/Index.h>
#include <iostream>
#include <vector>
#include <string>
#include <cstring>
#include <functional>
#include <regex>
#include <unordered_map>
#include <sstream>
#include <fstream>
#include <rapidjson/document.h>
#include <rapidjson/writer.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/filewritestream.h>
#include <rapidjson/prettywriter.h>

/**
 * Tokenizes the whole translation unit and forwards every comment token
 * to the supplied callback.
 *
 * @param unit      Translation unit whose tokens are scanned.
 * @param processor Callback invoked once per comment token with the
 *                  comment's spelling and the line and column of the
 *                  token location reported by libclang.
 */
void extractCommentBlocks(CXTranslationUnit unit, std::function<void(const std::string&, unsigned, unsigned)> processor) {
    CXToken *tokenArray;
    unsigned tokenCount;

    // Tokenize the extent of the translation unit's root cursor.
    clang_tokenize(unit,
                   clang_getCursorExtent(clang_getTranslationUnitCursor(unit)),
                   &tokenArray,
                   &tokenCount);

    for (unsigned idx = 0; idx != tokenCount; ++idx) {
        // Only comment tokens are of interest; skip everything else.
        if (clang_getTokenKind(tokenArray[idx]) != CXToken_Comment) {
            continue;
        }

        CXString spelling = clang_getTokenSpelling(unit, tokenArray[idx]);

        // Only line and column are needed, so the file and offset
        // out-parameters are left null.
        unsigned row, col;
        clang_getFileLocation(clang_getTokenLocation(unit, tokenArray[idx]),
                              nullptr, &row, &col, nullptr);

        // Copy the text into a std::string before releasing the CXString.
        const std::string text = clang_getCString(spelling);
        processor(text, row, col);

        clang_disposeString(spelling);
    }

    clang_disposeTokens(unit, tokenArray, tokenCount);
}

/**
 * Entry point: extracts the Simulink Coder system hierarchy from a generated
 * header file and writes it to a JSON file.
 *
 * Usage: <program> <source_file> <output_json_file>
 *
 * The source file is parsed with libclang. The comment containing
 * "Here is the system hierarchy for this model" is located and every line
 * of the form  '<Snnn>' : 'Model/Path'  in it is stored as Snnn -> Model/Path
 * (the key does not include the angle brackets). The entries are printed to
 * stdout and written as the "hierarchy" object of the output JSON document.
 *
 * @return 0 on success; 1 on wrong usage, if the translation unit cannot be
 *         parsed, or if the output file cannot be opened or written.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <source_file> <output_json_file>" << std::endl;
        return 1;
    }

    CXIndex index = clang_createIndex(0, 0);
    const char *filename = argv[1];
    const char *outputFile = argv[2];

    CXTranslationUnit unit = clang_parseTranslationUnit(index, filename, nullptr, 0, nullptr, 0, CXTranslationUnit_None);
    if (unit == nullptr) {
        std::cerr << "Failed to parse translation unit: " << filename << std::endl;
        clang_disposeIndex(index);
        return 1;
    }

    std::unordered_map<std::string, std::string> hierarchyMap;
    // Simulink Coder pads the ID column so that the colons line up, e.g.
    //   '<S1>'   : 'Model/A'
    //   '<S27>'  : 'Model/B'
    // hence any amount of whitespace is accepted around the colon.
    std::regex pattern(R"('<(S\d+)>'\s*:\s*'([^']+)')");

    extractCommentBlocks(unit, [&hierarchyMap, &pattern](const std::string& comment, unsigned line, unsigned column) {
        if (comment.find("Here is the system hierarchy for this model") == std::string::npos) {
            return;
        }
        std::cout << "Found system hierarchy comment." << '\n';
        std::cout << "Comment at line " << line << ", column " << column << ": "
                  << comment << '\n';

        // Parse each line in the comment using regex
        std::istringstream stream(comment);
        std::string line_text;
        while (std::getline(stream, line_text)) {
            std::smatch matches;
            if (std::regex_search(line_text, matches, pattern)) {
                hierarchyMap[matches[1].str()] = matches[2].str();
            }
        }
    });

    // The map owns copies of all strings, so the libclang handles can be
    // released here; later error paths then need no libclang cleanup.
    clang_disposeTranslationUnit(unit);
    clang_disposeIndex(index);

    // Print all entries in the map
    std::cout << "\nExtracted hierarchy entries:" << '\n';
    for (const auto& [id, path] : hierarchyMap) {
        std::cout << id << " -> " << path << '\n';
    }

    // Generate JSON output using RapidJSON
    rapidjson::Document document;
    document.SetObject();
    rapidjson::Document::AllocatorType& allocator = document.GetAllocator();

    rapidjson::Value hierarchyObject(rapidjson::kObjectType);
    for (const auto& [id, path] : hierarchyMap) {
        // Passing the allocator makes RapidJSON copy the strings.
        rapidjson::Value key(id.c_str(), allocator);
        rapidjson::Value value(path.c_str(), allocator);
        hierarchyObject.AddMember(key, value, allocator);
    }

    document.AddMember("hierarchy", hierarchyObject, allocator);

    // Write to file
    FILE* fp = fopen(outputFile, "wb");
    if (!fp) {
        std::cerr << "Failed to open output file: " << outputFile << std::endl;
        return 1;
    }

    char writeBuffer[65536];
    rapidjson::FileWriteStream os(fp, writeBuffer, sizeof(writeBuffer));
    rapidjson::PrettyWriter<rapidjson::FileWriteStream> writer(os);
    document.Accept(writer);

    // Push any buffered JSON to the FILE, then check both the stream state
    // and the close result so that a failed write is not reported as success.
    os.Flush();
    const bool streamError = ferror(fp) != 0;
    const bool closeError = fclose(fp) != 0;
    if (streamError || closeError) {
        std::cerr << "Failed to write output file: " << outputFile << std::endl;
        return 1;
    }

    std::cout << "JSON output written to: " << outputFile << std::endl;
    return 0;
}

Compile using

g++ test.cpp -o parse_hierarchy -std=c++17 -I/usr/lib/llvm-19/include -L/usr/lib/llvm-19/lib -lclang

Run the program with the source file and output JSON file as arguments:

./parse_hierarchy my_model.h output.json

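After that, the output.json file will contain the system hierarchy in JSON format. Note that the keys are the system identifiers without the surrounding angle brackets, since the regex captures only the part between < and >:

{
    "hierarchy": {
        "S27": "MySimulinkModel/MyImportantSubsystem",
        ...
    }
}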