How to extract C/C++ comments & source lines from source code using clang
Building on the post "How to extract C/C++ function extent (from..to source lines) from source code using clang", this program extracts the comments inside a given function, together with their source line numbers, from a C/C++ source file using the clang library (libclang).
It captures both block comments (/* ... */) and line comments (// ...), along with the line numbers on which they appear.
Only the comments that are within the extent of the specified C/C++ function are extracted.
#include <clang-c/Index.h>
#include <fstream>
#include <iostream>
#include <regex>
#include <string>
#include <vector>
int main(int argc, char** argv) {
if (argc < 3) {
std::cerr << "Usage: " << argv[0] << " <source-file> <function-name>" << std::endl;
return 1;
}
const char* filename = argv[1];
const char* funcname = argv[2];
CXIndex index = clang_createIndex(0, 0);
CXTranslationUnit unit = clang_parseTranslationUnit(
index, filename, nullptr, 0, nullptr, 0, CXTranslationUnit_None);
if (unit == nullptr) {
std::cerr << "Unable to parse translation unit." << std::endl;
return 1;
}
struct VisitorData {
std::string target;
unsigned start_line = 0;
unsigned end_line = 0;
} data;
data.target = funcname;
auto visitor = [](CXCursor c, CXCursor parent, CXClientData client_data) {
VisitorData* data = static_cast<VisitorData*>(client_data);
if (clang_getCursorKind(c) == CXCursor_FunctionDecl && clang_isCursorDefinition(c)) {
CXString name = clang_getCursorSpelling(c);
std::string func_name = clang_getCString(name);
clang_disposeString(name);
if (func_name == data->target) {
CXSourceRange range = clang_getCursorExtent(c);
CXSourceLocation start = clang_getRangeStart(range);
CXSourceLocation end = clang_getRangeEnd(range);
unsigned start_line, start_col, end_line, end_col;
clang_getSpellingLocation(start, nullptr, &start_line, &start_col, nullptr);
clang_getSpellingLocation(end, nullptr, &end_line, &end_col, nullptr);
// Get file name from CXSourceLocation
CXFile file;
unsigned tmp_line, tmp_col, tmp_offset;
clang_getSpellingLocation(start, &file, &tmp_line, &tmp_col, &tmp_offset);
CXString cx_file_name = clang_getFileName(file);
std::string file_path = clang_getCString(cx_file_name);
clang_disposeString(cx_file_name);
std::ifstream src(file_path);
if (!src) {
std::cerr << "Could not open source file: " << file_path << std::endl;
return CXChildVisit_Break;
}
std::vector<std::string> lines;
std::string line;
unsigned line_num = 1;
while (std::getline(src, line)) {
if (line_num >= start_line && line_num <= end_line) {
lines.push_back(line);
}
if (line_num > end_line) break;
++line_num;
}
// Join lines for block comment search
std::string joined;
for (const auto& l : lines) joined += l + "\n";
// Find block comments
std::regex block_re("/\\*([\\s\\S]*?)\\*/");
auto blocks_begin = std::sregex_iterator(joined.begin(), joined.end(), block_re);
auto blocks_end = std::sregex_iterator();
for (auto it = blocks_begin; it != blocks_end; ++it) {
std::string comment = it->str();
size_t start_pos = joined.substr(0, it->position()).find_last_of('\n');
size_t end_pos = joined.substr(0, it->position() + it->length()).find_last_of('\n');
unsigned comment_start_line = start_line + std::count(joined.begin(), joined.begin() + it->position(), '\n');
unsigned comment_end_line = comment_start_line + std::count(comment.begin(), comment.end(), '\n');
// Strip /* */
if (comment.substr(0,2) == "/*" && comment.size() >= 4 && comment.substr(comment.size()-2) == "*/")
comment = comment.substr(2, comment.size()-4);
// Remove leading/trailing whitespace
size_t first = comment.find_first_not_of(" \t\n\r");
size_t last = comment.find_last_not_of(" \t\n\r");
if (first != std::string::npos && last != std::string::npos)
comment = comment.substr(first, last - first + 1);
std::cout << "Comment (lines " << comment_start_line << "-" << comment_end_line << "): " << comment << "\n";
}
// Find line comments
std::regex line_re("//.*");
line_num = start_line;
for (const auto& l : lines) {
std::smatch m;
if (std::regex_search(l, m, line_re)) {
std::string comment = m.str();
if (comment.substr(0,2) == "//") comment = comment.substr(2);
size_t first = comment.find_first_not_of(" \t\n\r");
size_t last = comment.find_last_not_of(" \t\n\r");
if (first != std::string::npos && last != std::string::npos)
comment = comment.substr(first, last - first + 1);
std::cout << "Comment (line " << line_num << "):\n" << comment << "\n";
}
++line_num;
}
return CXChildVisit_Break;
}
}
return CXChildVisit_Recurse;
};
clang_visitChildren(clang_getTranslationUnitCursor(unit), visitor, &data);
clang_disposeTranslationUnit(unit);
clang_disposeIndex(index);
return 0;
}
See How to extract C/C++ function extent (from..to source lines) from source code using clang for details on how to compile and run this code.
If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow