How to extract C/C++ comments & source lines from source code using clang

Based on "How to extract C/C++ function extent (from..to source lines) from source code using clang", this program uses the clang library (libclang) to locate a function in a C/C++ source file and then extracts the comments inside it together with their source line numbers.

It captures both block comments (/* ... */) and line comments (// ...), and prints each one together with the line number(s) on which it appears.

Only the comments that are within the extent of the specified C/C++ function are extracted.

#include <clang-c/Index.h>
#include <fstream>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main(int argc, char** argv) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <source-file> <function-name>" << std::endl;
        return 1;
    }
    const char* filename = argv[1];
    const char* funcname = argv[2];
    CXIndex index = clang_createIndex(0, 0);
    CXTranslationUnit unit = clang_parseTranslationUnit(
        index, filename, nullptr, 0, nullptr, 0, CXTranslationUnit_None);
    if (unit == nullptr) {
        std::cerr << "Unable to parse translation unit." << std::endl;
        return 1;
    }
    struct VisitorData {
        std::string target;
        unsigned start_line = 0;
        unsigned end_line = 0;
    } data;
    data.target = funcname;
    auto visitor = [](CXCursor c, CXCursor parent, CXClientData client_data) {
        VisitorData* data = static_cast<VisitorData*>(client_data);
        if (clang_getCursorKind(c) == CXCursor_FunctionDecl && clang_isCursorDefinition(c)) {
            CXString name = clang_getCursorSpelling(c);
            std::string func_name = clang_getCString(name);
            clang_disposeString(name);
            if (func_name == data->target) {
                CXSourceRange range = clang_getCursorExtent(c);
                CXSourceLocation start = clang_getRangeStart(range);
                CXSourceLocation end = clang_getRangeEnd(range);
                unsigned start_line, start_col, end_line, end_col;
                clang_getSpellingLocation(start, nullptr, &start_line, &start_col, nullptr);
                clang_getSpellingLocation(end, nullptr, &end_line, &end_col, nullptr);

                // Get file name from CXSourceLocation
                CXFile file;
                unsigned tmp_line, tmp_col, tmp_offset;
                clang_getSpellingLocation(start, &file, &tmp_line, &tmp_col, &tmp_offset);
                CXString cx_file_name = clang_getFileName(file);
                std::string file_path = clang_getCString(cx_file_name);
                clang_disposeString(cx_file_name);
                std::ifstream src(file_path);
                if (!src) {
                    std::cerr << "Could not open source file: " << file_path << std::endl;
                    return CXChildVisit_Break;
                }
                std::vector<std::string> lines;
                std::string line;
                unsigned line_num = 1;
                while (std::getline(src, line)) {
                    if (line_num >= start_line && line_num <= end_line) {
                        lines.push_back(line);
                    }
                    if (line_num > end_line) break;
                    ++line_num;
                }
                // Join lines for block comment search
                std::string joined;
                for (const auto& l : lines) joined += l + "\n";
                // Find block comments
                std::regex block_re("/\\*([\\s\\S]*?)\\*/");
                auto blocks_begin = std::sregex_iterator(joined.begin(), joined.end(), block_re);
                auto blocks_end = std::sregex_iterator();
                for (auto it = blocks_begin; it != blocks_end; ++it) {
                    std::string comment = it->str();
                    size_t start_pos = joined.substr(0, it->position()).find_last_of('\n');
                    size_t end_pos = joined.substr(0, it->position() + it->length()).find_last_of('\n');
                    unsigned comment_start_line = start_line + std::count(joined.begin(), joined.begin() + it->position(), '\n');
                    unsigned comment_end_line = comment_start_line + std::count(comment.begin(), comment.end(), '\n');
                    // Strip /* */
                    if (comment.substr(0,2) == "/*" && comment.size() >= 4 && comment.substr(comment.size()-2) == "*/")
                        comment = comment.substr(2, comment.size()-4);
                    // Remove leading/trailing whitespace
                    size_t first = comment.find_first_not_of(" \t\n\r");
                    size_t last = comment.find_last_not_of(" \t\n\r");
                    if (first != std::string::npos && last != std::string::npos)
                        comment = comment.substr(first, last - first + 1);
                    std::cout << "Comment (lines " << comment_start_line << "-" << comment_end_line << "): " << comment << "\n";
                }
                // Find line comments
                std::regex line_re("//.*");
                line_num = start_line;
                for (const auto& l : lines) {
                    std::smatch m;
                    if (std::regex_search(l, m, line_re)) {
                        std::string comment = m.str();
                        if (comment.substr(0,2) == "//") comment = comment.substr(2);
                        size_t first = comment.find_first_not_of(" \t\n\r");
                        size_t last = comment.find_last_not_of(" \t\n\r");
                        if (first != std::string::npos && last != std::string::npos)
                            comment = comment.substr(first, last - first + 1);
                        std::cout << "Comment (line " << line_num << "):\n" << comment << "\n";
                    }
                    ++line_num;
                }
                return CXChildVisit_Break;
            }
        }
        return CXChildVisit_Recurse;
    };
    clang_visitChildren(clang_getTranslationUnitCursor(unit), visitor, &data);
    clang_disposeTranslationUnit(unit);
    clang_disposeIndex(index);
    return 0;
}

See "How to extract C/C++ function extent (from..to source lines) from source code using clang" for details on how to compile and run this code.