#include #include #include #include #include #include #include #include #include #include #include #include #include #include extern "C" TSLanguage *tree_sitter_cpp(); // Global address regex pattern const std::regex ADDRESS_REGEX(R"(//\s*([0-9a-fA-F]{8}))"); // Helper function to check if a comment contains an address bool hasAddressPattern(const std::string &comment) { return std::regex_search(comment, ADDRESS_REGEX); } // Helper function to extract text from a TSNode std::string extractNodeText(TSNode node, const char *source_code) { uint32_t start = ts_node_start_byte(node); uint32_t end = ts_node_end_byte(node); return std::string(source_code + start, end - start); } // Helper function to find first identifier in a node std::string findIdentifierInNode(TSNode node, const char *source_code) { uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { TSNode child = ts_node_child(node, i); if (strcmp(ts_node_type(child), "identifier") == 0) { return extractNodeText(child, source_code); } } return ""; } struct FunctionInfo { std::string name; std::string address; std::string filepath; bool is_import; }; struct GlobalInfo { std::string name; std::string address; std::string filepath; }; class PreparedStatements { private: sqlite3 *db; sqlite3_stmt *delete_functions_stmt; sqlite3_stmt *delete_imports_stmt; sqlite3_stmt *insert_functions_stmt; sqlite3_stmt *insert_imports_stmt; sqlite3_stmt *delete_globals_stmt; sqlite3_stmt *insert_globals_stmt; void prepareStatement(const char *sql, sqlite3_stmt **stmt, const std::string &error_msg) { if (sqlite3_prepare_v2(db, sql, -1, stmt, nullptr) != SQLITE_OK) { throw std::runtime_error(error_msg + ": " + sqlite3_errmsg(db)); } } public: PreparedStatements(sqlite3 *database) : db(database) { prepareStatement("DELETE FROM Functions WHERE filepath = ?", &delete_functions_stmt, "Failed to prepare delete functions statement"); prepareStatement("DELETE FROM Imports WHERE filepath = ?", &delete_imports_stmt, "Failed to prepare delete imports statement"); prepareStatement("INSERT OR REPLACE INTO Functions (filepath, name, " "address) VALUES (?, ?, ?)", &insert_functions_stmt, "Failed to prepare insert functions statement"); prepareStatement("INSERT OR REPLACE INTO Imports (filepath, name, address) " "VALUES (?, ?, ?)", &insert_imports_stmt, "Failed to prepare insert imports statement"); prepareStatement("DELETE FROM Globals WHERE filepath = ?", &delete_globals_stmt, "Failed to prepare delete globals statement"); prepareStatement("INSERT OR REPLACE INTO Globals (filepath, name, address) " "VALUES (?, ?, ?)", &insert_globals_stmt, "Failed to prepare insert globals statement"); } ~PreparedStatements() { sqlite3_finalize(delete_functions_stmt); sqlite3_finalize(delete_imports_stmt); sqlite3_finalize(insert_functions_stmt); sqlite3_finalize(insert_imports_stmt); sqlite3_finalize(delete_globals_stmt); sqlite3_finalize(insert_globals_stmt); } void clearEntriesForFile(const std::string &filepath) { for (auto stmt : {delete_functions_stmt, delete_imports_stmt}) { sqlite3_reset(stmt); sqlite3_bind_text(stmt, 1, filepath.c_str(), -1, SQLITE_STATIC); sqlite3_step(stmt); } } void clearGlobalsForFile(const std::string &filepath) { sqlite3_reset(delete_globals_stmt); sqlite3_bind_text(delete_globals_stmt, 1, filepath.c_str(), -1, SQLITE_STATIC); sqlite3_step(delete_globals_stmt); } void insertFunction(const FunctionInfo &func) { sqlite3_stmt *stmt = func.is_import ? insert_imports_stmt : insert_functions_stmt; sqlite3_reset(stmt); sqlite3_bind_text(stmt, 1, func.filepath.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 2, func.name.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 3, func.address.c_str(), -1, SQLITE_STATIC); sqlite3_step(stmt); } void insertGlobal(const GlobalInfo &global) { sqlite3_reset(insert_globals_stmt); sqlite3_bind_text(insert_globals_stmt, 1, global.filepath.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(insert_globals_stmt, 2, global.name.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(insert_globals_stmt, 3, global.address.c_str(), -1, SQLITE_STATIC); sqlite3_step(insert_globals_stmt); } }; class DatabaseManager { private: sqlite3 *db; std::unique_ptr prepared_stmts; public: DatabaseManager(const std::string &db_path) : db(nullptr) { if (sqlite3_open(db_path.c_str(), &db) != SQLITE_OK) { spdlog::error("Can't open database: {}", sqlite3_errmsg(db)); sqlite3_close(db); throw std::runtime_error("Failed to open database"); } const char *create_tables = R"( CREATE TABLE IF NOT EXISTS Functions (filepath TEXT, name TEXT, address TEXT, PRIMARY KEY (name, filepath)); CREATE TABLE IF NOT EXISTS Imports (filepath TEXT, name TEXT, address TEXT, PRIMARY KEY (name, filepath)); CREATE TABLE IF NOT EXISTS Globals (filepath TEXT, name TEXT, address TEXT); )"; sqlite3_exec(db, create_tables, nullptr, nullptr, nullptr); prepared_stmts = std::make_unique(db); } ~DatabaseManager() { if (db) sqlite3_close(db); } void clearEntriesForFile(const std::string &filepath) { prepared_stmts->clearEntriesForFile(filepath); } void insertFunction(const FunctionInfo &func) { prepared_stmts->insertFunction(func); } void clearGlobalsForFile(const std::string &filepath) { prepared_stmts->clearGlobalsForFile(filepath); } void insertGlobal(const GlobalInfo &global) { prepared_stmts->insertGlobal(global); } void beginTransaction() { sqlite3_exec(db, "BEGIN TRANSACTION", nullptr, nullptr, nullptr); } void commitTransaction() { sqlite3_exec(db, "COMMIT", nullptr, nullptr, nullptr); } void rollbackTransaction() { sqlite3_exec(db, "ROLLBACK", nullptr, nullptr, nullptr); } // New methods for duplicate checking bool checkDuplicateAddresses() { const char *sql = R"( WITH all_addresses AS ( SELECT 'Functions' as table_name, name, address, filepath FROM Functions WHERE address != '' UNION ALL SELECT 'Globals' as table_name, name, address, filepath FROM Globals WHERE address != '' ) SELECT address, COUNT(*) as count, GROUP_CONCAT(table_name || ':' || name || ' (' || filepath || ')', '; ') as entries FROM all_addresses GROUP BY address HAVING COUNT(*) > 1 ORDER BY address; )"; sqlite3_stmt *stmt; if (sqlite3_prepare_v2(db, sql, -1, &stmt, nullptr) != SQLITE_OK) { spdlog::error("Failed to prepare duplicate address query: {}", sqlite3_errmsg(db)); return false; } bool found_duplicates = false; while (sqlite3_step(stmt) == SQLITE_ROW) { found_duplicates = true; const char *address = (const char *)sqlite3_column_text(stmt, 0); int count = sqlite3_column_int(stmt, 1); const char *entries = (const char *)sqlite3_column_text(stmt, 2); spdlog::error("DUPLICATE ADDRESS: {} appears {} times in: {}", address, count, entries); } sqlite3_finalize(stmt); return found_duplicates; } bool checkDuplicateNames() { bool found_duplicates = false; // Check Functions table const char *functions_sql = R"( SELECT name, COUNT(*) as count, GROUP_CONCAT(filepath, '; ') as filepaths FROM Functions GROUP BY name HAVING COUNT(*) > 1 ORDER BY name; )"; sqlite3_stmt *stmt; if (sqlite3_prepare_v2(db, functions_sql, -1, &stmt, nullptr) == SQLITE_OK) { while (sqlite3_step(stmt) == SQLITE_ROW) { found_duplicates = true; const char *name = (const char *)sqlite3_column_text(stmt, 0); int count = sqlite3_column_int(stmt, 1); const char *filepaths = (const char *)sqlite3_column_text(stmt, 2); spdlog::error( "DUPLICATE FUNCTION NAME: '{}' appears {} times in files: {}", name, count, filepaths); } sqlite3_finalize(stmt); } // Check Globals table const char *globals_sql = R"( SELECT name, COUNT(*) as count, GROUP_CONCAT(filepath, '; ') as filepaths FROM Globals GROUP BY name HAVING COUNT(*) > 1 ORDER BY name; )"; if (sqlite3_prepare_v2(db, globals_sql, -1, &stmt, nullptr) == SQLITE_OK) { while (sqlite3_step(stmt) == SQLITE_ROW) { found_duplicates = true; const char *name = (const char *)sqlite3_column_text(stmt, 0); int count = sqlite3_column_int(stmt, 1); const char *filepaths = (const char *)sqlite3_column_text(stmt, 2); spdlog::error( "DUPLICATE GLOBAL NAME: '{}' appears {} times in files: {}", name, count, filepaths); } sqlite3_finalize(stmt); } return found_duplicates; } }; std::string extractAddress(const std::string &comment) { std::smatch match; return std::regex_search(comment, match, ADDRESS_REGEX) ? match[1].str() : ""; } std::string getFunctionName(TSNode node, const char *source_code) { uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { TSNode child = ts_node_child(node, i); const char *type = ts_node_type(child); if (strcmp(type, "function_declarator") == 0) { std::string name = findIdentifierInNode(child, source_code); if (!name.empty()) return name; } else if (strcmp(type, "identifier") == 0) { return extractNodeText(child, source_code); } else if (strcmp(type, "pointer_declarator") == 0) { std::string name = getFunctionName(child, source_code); if (!name.empty()) return name; } } return ""; } std::string getComment(TSNode node, const char *source_code, uint32_t source_length, bool search_before) { TSNode current = node; if (search_before) { // Look for comments before the current node while (!ts_node_is_null(current)) { TSNode prev_sibling = ts_node_prev_sibling(current); while (!ts_node_is_null(prev_sibling)) { const char *type = ts_node_type(prev_sibling); if (strcmp(type, "comment") == 0) { std::string comment_text = extractNodeText(prev_sibling, source_code); // Check if it contains an address pattern if (hasAddressPattern(comment_text)) { return comment_text; } } // Skip whitespace and continue looking else if (strcmp(type, "ERROR") != 0) { // If we hit non-comment, non-whitespace content, stop searching break; } prev_sibling = ts_node_prev_sibling(prev_sibling); } // Move up to parent and continue searching current = ts_node_parent(current); } } else { // Look for comments after the current node TSNode next_sibling = ts_node_next_sibling(node); while (!ts_node_is_null(next_sibling)) { const char *type = ts_node_type(next_sibling); if (strcmp(type, "comment") == 0) { std::string comment_text = extractNodeText(next_sibling, source_code); // Check if it contains an address pattern if (hasAddressPattern(comment_text)) { return comment_text; } } // Skip whitespace and continue looking else if (strcmp(type, "ERROR") != 0) { // If we hit non-comment, non-whitespace content, stop searching break; } next_sibling = ts_node_next_sibling(next_sibling); } } return ""; } bool hasFunctionBody(TSNode node) { if (strcmp(ts_node_type(node), "function_definition") != 0) return false; uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { if (strcmp(ts_node_type(ts_node_child(node, i)), "compound_statement") == 0) { return true; } } return false; } void findFunctions(TSNode node, const char *source_code, uint32_t source_length, std::vector &functions) { const char *type = ts_node_type(node); if (strcmp(type, "function_definition") == 0 || strcmp(type, "declaration") == 0) { std::string func_name = getFunctionName(node, source_code); if (!func_name.empty()) { std::string address = extractAddress(getComment(node, source_code, source_length, false)); if (address.empty() && strcmp(type, "function_definition") == 0) { address = extractAddress(getComment(node, source_code, source_length, true)); } if (!address.empty()) { FunctionInfo func{func_name, address, "", strcmp(type, "function_definition") == 0 ? !hasFunctionBody(node) : true}; functions.push_back(func); } // We'll never nest function declarations return; } else { spdlog::error("Failed to get function name for {}", extractNodeText(node, source_code)); } } uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { findFunctions(ts_node_child(node, i), source_code, source_length, functions); } } std::vector readFileList(const std::string &list_file) { std::vector files; std::ifstream file(list_file); if (!file.is_open()) { spdlog::error("Could not open list file {}", list_file); return files; } std::string line; while (std::getline(file, line)) { if (line.empty() || line[0] == '#') continue; if (line.find('*') != std::string::npos) { spdlog::info("Skipping wildcard pattern: {}", line); continue; } if (std::filesystem::exists(line)) { files.push_back(line); } else { spdlog::warn("File not found: {}", line); } } return files; } bool processFile(const std::string &filepath, DatabaseManager &db) { std::ifstream file(filepath); if (!file.is_open()) { spdlog::error("Could not open file {}", filepath); return false; } std::string file_content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); TSParser *parser = ts_parser_new(); ts_parser_set_language(parser, tree_sitter_cpp()); TSTree *tree = ts_parser_parse_string(parser, nullptr, file_content.c_str(), file_content.length()); TSNode root_node = ts_tree_root_node(tree); if (ts_node_is_null(root_node)) { spdlog::error("Failed to parse file {}", filepath); ts_tree_delete(tree); ts_parser_delete(parser); return false; } db.clearEntriesForFile(filepath); std::vector functions; findFunctions(root_node, file_content.c_str(), file_content.length(), functions); for (auto &func : functions) { func.filepath = filepath; db.insertFunction(func); spdlog::debug("{}: {} @ {} in {}", func.is_import ? "Import" : "Function", func.name, func.address, filepath); } spdlog::info("Processed {} functions/imports from {}", functions.size(), filepath); ts_tree_delete(tree); ts_parser_delete(parser); return true; } // Helper function to recursively find identifier in any declarator std::string findIdentifierInDeclarator(TSNode node, const char *source_code) { const char *type = ts_node_type(node); // If this is an identifier, return it if (strcmp(type, "identifier") == 0) { return extractNodeText(node, source_code); } // Recursively search all children uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { TSNode child = ts_node_child(node, i); std::string result = findIdentifierInDeclarator(child, source_code); if (!result.empty()) { return result; } } return ""; } std::string getGlobalName(TSNode node, const char *source_code) { uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { TSNode child = ts_node_child(node, i); const char *type = ts_node_type(child); // Look for any kind of declarator and recursively search for identifier if (strcmp(type, "init_declarator") == 0 || strcmp(type, "declarator") == 0 || strcmp(type, "reference_declarator") == 0 || strcmp(type, "pointer_declarator") == 0 || strcmp(type, "parenthesized_declarator") == 0 || strcmp(type, "array_declarator") == 0) { std::string name = findIdentifierInDeclarator(child, source_code); if (!name.empty()) { return name; } } // Direct identifier child else if (strcmp(type, "identifier") == 0) { return extractNodeText(child, source_code); } } return ""; } void findGlobals(TSNode node, const char *source_code, uint32_t source_length, std::vector &globals) { const char *type = ts_node_type(node); // Look for extern declarations if (strcmp(type, "declaration") == 0) { // Check if this is an extern declaration uint32_t child_count = ts_node_child_count(node); bool is_extern = false; for (uint32_t i = 0; i < child_count; i++) { TSNode child = ts_node_child(node, i); if (strcmp(ts_node_type(child), "storage_class_specifier") == 0) { std::string storage_class = extractNodeText(child, source_code); if (storage_class == "extern") { is_extern = true; break; } } } if (is_extern) { std::string global_name = getGlobalName(node, source_code); if (!global_name.empty()) { // Look for address comment after the declaration std::string address = extractAddress(getComment(node, source_code, source_length, false)); if (!address.empty()) { GlobalInfo global{global_name, address, ""}; globals.push_back(global); } } else { std::string src = extractNodeText(node, source_code); SPDLOG_ERROR("Failed to get global name for {}", src); } return; } } // Recursively search child nodes uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { findGlobals(ts_node_child(node, i), source_code, source_length, globals); } } bool processGlobalsFile(const std::string &filepath, DatabaseManager &db) { std::ifstream file(filepath); if (!file.is_open()) { spdlog::error("Could not open file {}", filepath); return false; } std::string file_content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); TSParser *parser = ts_parser_new(); ts_parser_set_language(parser, tree_sitter_cpp()); TSTree *tree = ts_parser_parse_string(parser, nullptr, file_content.c_str(), file_content.length()); TSNode root_node = ts_tree_root_node(tree); if (ts_node_is_null(root_node)) { spdlog::error("Failed to parse file {}", filepath); ts_tree_delete(tree); ts_parser_delete(parser); return false; } db.clearGlobalsForFile(filepath); std::vector globals; findGlobals(root_node, file_content.c_str(), file_content.length(), globals); for (auto &global : globals) { global.filepath = filepath; db.insertGlobal(global); spdlog::debug("Global: {} @ {} in {}", global.name, global.address, filepath); } spdlog::info("Processed {} globals from {}", globals.size(), filepath); ts_tree_delete(tree); ts_parser_delete(parser); return true; } // Helper function to dump Tree-sitter AST void dumpTreeSitterAST(TSNode node, const char *source_code, int depth = 0) { std::string indent(depth * 2, ' '); const char *type = ts_node_type(node); uint32_t start = ts_node_start_byte(node); uint32_t end = ts_node_end_byte(node); // Get the text content for leaf nodes or small nodes std::string content; if (end - start < 100) { // Only show content for small nodes content = extractNodeText(node, source_code); // Replace newlines with \n for better readability std::regex newline_regex("\n"); content = std::regex_replace(content, newline_regex, "\\n"); // Truncate if still too long if (content.length() > 50) { content = content.substr(0, 47) + "..."; } } if (!content.empty()) { spdlog::info("{}{}[{}:{}] \"{}\"", indent, type, start, end, content); } else { spdlog::info("{}{}[{}:{}]", indent, type, start, end); } // Recursively dump children uint32_t child_count = ts_node_child_count(node); for (uint32_t i = 0; i < child_count; i++) { TSNode child = ts_node_child(node, i); dumpTreeSitterAST(child, source_code, depth + 1); } } bool dumpTreeFile(const std::string &filepath) { std::ifstream file(filepath); if (!file.is_open()) { spdlog::error("Could not open file {}", filepath); return false; } std::string file_content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); TSParser *parser = ts_parser_new(); ts_parser_set_language(parser, tree_sitter_cpp()); TSTree *tree = ts_parser_parse_string(parser, nullptr, file_content.c_str(), file_content.length()); TSNode root_node = ts_tree_root_node(tree); if (ts_node_is_null(root_node)) { spdlog::error("Failed to parse file {}", filepath); ts_tree_delete(tree); ts_parser_delete(parser); return false; } spdlog::info("=== Tree-sitter AST for {} ===", filepath); dumpTreeSitterAST(root_node, file_content.c_str()); spdlog::info("=== End of AST dump ==="); ts_tree_delete(tree); ts_parser_delete(parser); return true; } bool processDuplicates(DatabaseManager &db) { spdlog::info("=== Checking for duplicate addresses ==="); bool found_address_duplicates = db.checkDuplicateAddresses(); if (found_address_duplicates) { spdlog::error("Found duplicate addresses in the database!"); } else { spdlog::info("No duplicate addresses found in the database."); } spdlog::info("=== Checking for duplicate names ==="); bool found_name_duplicates = db.checkDuplicateNames(); if (found_name_duplicates) { spdlog::error("Found duplicate names in the database!"); } else { spdlog::info("No duplicate names found in the database."); } return !found_address_duplicates && !found_name_duplicates; } int main(int argc, char *argv[]) { // Initialize spdlog auto console = spdlog::stdout_color_mt("console"); spdlog::set_default_logger(console); spdlog::set_level(spdlog::level::info); // Default to info level spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v"); CLI::App app{"C++ Function/Global Parser - Extracts function addresses or " "global variable addresses from C++ files"}; std::vector input_files; std::string list_file; std::string db_path = "gh.db"; std::string mode = "functions"; std::string log_file = ""; bool verbose = false; app.add_option("files", input_files, "Input C++ files to parse (supports @listfile.txt syntax)"); app.add_option("-l,--list", list_file, "File containing list of files to process"); app.add_option("-d,--database", db_path, "SQLite database path") ->default_val("gh.db"); app.add_option("-m,--mode", mode, "Processing mode: 'functions', 'globals', 'duplicates', or " "'dump-tree'") ->default_val("functions") ->check( CLI::IsMember({"functions", "globals", "duplicates", "dump-tree"})); app.add_flag("-v,--verbose", verbose, "Enable verbose logging (debug level)"); app.add_flag("--log-file", log_file, "Enable logging to file"); CLI11_PARSE(app, argc, argv); // Set log level based on verbose flag if (verbose) { spdlog::set_level(spdlog::level::debug); } spdlog::set_pattern(std::string("[%^%l%$] %v")); if (!log_file.empty()) { auto log_sink = std::make_shared(log_file, true); spdlog::get("console")->sinks().push_back(log_sink); } std::vector files_to_process; bool needFiles = mode != "duplicates"; if (needFiles) { if (!list_file.empty()) { auto list_files = readFileList(list_file); files_to_process.insert(files_to_process.end(), list_files.begin(), list_files.end()); } for (const auto &input : input_files) { if (input.starts_with("@")) { auto list_files = readFileList(input.substr(1)); files_to_process.insert(files_to_process.end(), list_files.begin(), list_files.end()); } else if (std::filesystem::exists(input)) { files_to_process.push_back(input); } else { spdlog::warn("File not found: {}", input); } } if (files_to_process.empty()) { spdlog::error("No files to process. Use --help for usage information."); return 1; } } try { int processed_count = 0; // For dump-tree mode, we don't need database operations if (mode == "dump-tree") { for (const auto &filepath : files_to_process) { spdlog::info("=== Processing: {} ===", filepath); if (dumpTreeFile(filepath)) { processed_count++; } } } else if (mode == "duplicates") { DatabaseManager db(db_path); // For duplicates mode, we only check the database, no file processing spdlog::info("=== Checking database for duplicates ==="); bool has_duplicates = !processDuplicates(db); spdlog::info("=== Summary ==="); spdlog::info("Mode: {}", mode); spdlog::info("Database: {}", db_path); return has_duplicates ? 1 : 0; // Return 1 if duplicates found, 0 if none } else { DatabaseManager db(db_path); const size_t batch_size = 50; size_t current_batch = 0; db.beginTransaction(); for (const auto &filepath : files_to_process) { spdlog::info("=== Processing: {} ===", filepath); bool success = false; if (mode == "functions") { success = processFile(filepath, db); } else if (mode == "globals") { success = processGlobalsFile(filepath, db); } if (success) processed_count++; if (++current_batch >= batch_size) { db.commitTransaction(); spdlog::info("Committed batch of {} files to database", current_batch); db.beginTransaction(); current_batch = 0; } } if (current_batch > 0) { db.commitTransaction(); spdlog::info("Committed final batch of {} files to database", current_batch); } } spdlog::info("=== Summary ==="); spdlog::info("Processed {} files successfully", processed_count); spdlog::info("Mode: {}", mode); if (mode != "dump-tree") { spdlog::info("Database saved to: {}", db_path); } } catch (const std::exception &e) { spdlog::error("Database error: {}", e.what()); return 1; } return 0; }