WIP Duplicate detection mode

This commit is contained in:
Guus Waals 2025-05-28 01:29:10 +08:00
parent ec5d40be0c
commit a958b0268e
2 changed files with 183 additions and 20 deletions

View File

@ -18,3 +18,4 @@ done
$tool @$file_list -v --log-file=log-functions.txt
$tool tmps/gh_global.h -mglobals -v --log-file=log-globals.txt
$tool -mduplicates -v --log-file=log-duplicates.txt

View File

@ -192,6 +192,126 @@ public:
void rollbackTransaction() {
sqlite3_exec(db, "ROLLBACK", nullptr, nullptr, nullptr);
}
// New methods for duplicate checking
bool checkDuplicateAddresses() {
const char *sql = R"(
WITH all_addresses AS (
SELECT 'Functions' as table_name, name, address, filepath FROM Functions WHERE address != ''
UNION ALL
SELECT 'Imports' as table_name, name, address, filepath FROM Imports WHERE address != ''
UNION ALL
SELECT 'Globals' as table_name, name, address, filepath FROM Globals WHERE address != ''
)
SELECT address, COUNT(*) as count,
GROUP_CONCAT(table_name || ':' || name || ' (' || filepath || ')', '; ') as entries
FROM all_addresses
GROUP BY address
HAVING COUNT(*) > 1
ORDER BY address;
)";
sqlite3_stmt *stmt;
if (sqlite3_prepare_v2(db, sql, -1, &stmt, nullptr) != SQLITE_OK) {
spdlog::error("Failed to prepare duplicate address query: {}",
sqlite3_errmsg(db));
return false;
}
bool found_duplicates = false;
while (sqlite3_step(stmt) == SQLITE_ROW) {
found_duplicates = true;
const char *address = (const char *)sqlite3_column_text(stmt, 0);
int count = sqlite3_column_int(stmt, 1);
const char *entries = (const char *)sqlite3_column_text(stmt, 2);
spdlog::error("DUPLICATE ADDRESS: {} appears {} times in: {}", address,
count, entries);
}
sqlite3_finalize(stmt);
return found_duplicates;
}
bool checkDuplicateNames() {
bool found_duplicates = false;
// Check Functions table
const char *functions_sql = R"(
SELECT name, COUNT(*) as count,
GROUP_CONCAT(filepath, '; ') as filepaths
FROM Functions
GROUP BY name
HAVING COUNT(*) > 1
ORDER BY name;
)";
sqlite3_stmt *stmt;
if (sqlite3_prepare_v2(db, functions_sql, -1, &stmt, nullptr) ==
SQLITE_OK) {
while (sqlite3_step(stmt) == SQLITE_ROW) {
found_duplicates = true;
const char *name = (const char *)sqlite3_column_text(stmt, 0);
int count = sqlite3_column_int(stmt, 1);
const char *filepaths = (const char *)sqlite3_column_text(stmt, 2);
spdlog::error(
"DUPLICATE FUNCTION NAME: '{}' appears {} times in files: {}", name,
count, filepaths);
}
sqlite3_finalize(stmt);
}
// Check Imports table
const char *imports_sql = R"(
SELECT name, COUNT(*) as count,
GROUP_CONCAT(filepath, '; ') as filepaths
FROM Imports
GROUP BY name
HAVING COUNT(*) > 1
ORDER BY name;
)";
if (sqlite3_prepare_v2(db, imports_sql, -1, &stmt, nullptr) == SQLITE_OK) {
while (sqlite3_step(stmt) == SQLITE_ROW) {
found_duplicates = true;
const char *name = (const char *)sqlite3_column_text(stmt, 0);
int count = sqlite3_column_int(stmt, 1);
const char *filepaths = (const char *)sqlite3_column_text(stmt, 2);
spdlog::error(
"DUPLICATE IMPORT NAME: '{}' appears {} times in files: {}", name,
count, filepaths);
}
sqlite3_finalize(stmt);
}
// Check Globals table
const char *globals_sql = R"(
SELECT name, COUNT(*) as count,
GROUP_CONCAT(filepath, '; ') as filepaths
FROM Globals
GROUP BY name
HAVING COUNT(*) > 1
ORDER BY name;
)";
if (sqlite3_prepare_v2(db, globals_sql, -1, &stmt, nullptr) == SQLITE_OK) {
while (sqlite3_step(stmt) == SQLITE_ROW) {
found_duplicates = true;
const char *name = (const char *)sqlite3_column_text(stmt, 0);
int count = sqlite3_column_int(stmt, 1);
const char *filepaths = (const char *)sqlite3_column_text(stmt, 2);
spdlog::error(
"DUPLICATE GLOBAL NAME: '{}' appears {} times in files: {}", name,
count, filepaths);
}
sqlite3_finalize(stmt);
}
return found_duplicates;
}
};
std::string extractAddress(const std::string &comment) {
@ -611,6 +731,29 @@ bool dumpTreeFile(const std::string &filepath) {
return true;
}
/// Runs both database-wide duplicate scans (addresses, then names) and
/// reports the outcome. Details of each collision are logged by the
/// DatabaseManager methods themselves.
/// @param db  open database to inspect; not modified.
/// @return true when the database is clean, false when any duplicate
///         address or name was detected.
bool processDuplicates(DatabaseManager &db) {
  spdlog::info("=== Checking for duplicate addresses ===");
  const bool dup_addresses = db.checkDuplicateAddresses();

  spdlog::info("=== Checking for duplicate names ===");
  const bool dup_names = db.checkDuplicateNames();

  if (dup_addresses) {
    spdlog::error("Found duplicate addresses in the database!");
  }
  if (dup_names) {
    spdlog::error("Found duplicate names in the database!");
  }
  if (dup_addresses || dup_names) {
    return false; // duplicates found — signal an error to the caller
  }

  spdlog::info("No duplicates found in the database.");
  return true;
}
int main(int argc, char *argv[]) {
// Initialize spdlog
auto console = spdlog::stdout_color_mt("console");
@ -634,9 +777,12 @@ int main(int argc, char *argv[]) {
"File containing list of files to process");
app.add_option("-d,--database", db_path, "SQLite database path")
->default_val("gh.db");
app.add_option("-m,--mode", mode, "Processing mode: 'functions', 'globals', or 'dump-tree'")
app.add_option("-m,--mode", mode,
"Processing mode: 'functions', 'globals', 'duplicates', or "
"'dump-tree'")
->default_val("functions")
->check(CLI::IsMember({"functions", "globals", "dump-tree"}));
->check(
CLI::IsMember({"functions", "globals", "duplicates", "dump-tree"}));
app.add_flag("-v,--verbose", verbose, "Enable verbose logging (debug level)");
app.add_flag("--log-file", log_file, "Enable logging to file");
@ -647,6 +793,8 @@ int main(int argc, char *argv[]) {
spdlog::set_level(spdlog::level::debug);
}
spdlog::set_pattern(std::string("[%^%l%$] %v"));
if (!log_file.empty()) {
auto log_sink =
std::make_shared<spdlog::sinks::basic_file_sink_mt>(log_file, true);
@ -654,28 +802,31 @@ int main(int argc, char *argv[]) {
}
std::vector<std::string> files_to_process;
bool needFiles = mode != "duplicates";
if (!list_file.empty()) {
auto list_files = readFileList(list_file);
files_to_process.insert(files_to_process.end(), list_files.begin(),
list_files.end());
}
for (const auto &input : input_files) {
if (input.starts_with("@")) {
auto list_files = readFileList(input.substr(1));
if (needFiles) {
if (!list_file.empty()) {
auto list_files = readFileList(list_file);
files_to_process.insert(files_to_process.end(), list_files.begin(),
list_files.end());
} else if (std::filesystem::exists(input)) {
files_to_process.push_back(input);
} else {
spdlog::warn("File not found: {}", input);
}
}
if (files_to_process.empty()) {
spdlog::error("No files to process. Use --help for usage information.");
return 1;
for (const auto &input : input_files) {
if (input.starts_with("@")) {
auto list_files = readFileList(input.substr(1));
files_to_process.insert(files_to_process.end(), list_files.begin(),
list_files.end());
} else if (std::filesystem::exists(input)) {
files_to_process.push_back(input);
} else {
spdlog::warn("File not found: {}", input);
}
}
if (files_to_process.empty()) {
spdlog::error("No files to process. Use --help for usage information.");
return 1;
}
}
try {
@ -689,8 +840,18 @@ int main(int argc, char *argv[]) {
processed_count++;
}
}
} else if (mode == "duplicates") {
DatabaseManager db(db_path);
// For duplicates mode, we only check the database, no file processing
spdlog::info("=== Checking database for duplicates ===");
bool has_duplicates = !processDuplicates(db);
spdlog::info("=== Summary ===");
spdlog::info("Mode: {}", mode);
spdlog::info("Database: {}", db_path);
return has_duplicates ? 1 : 0; // Return 1 if duplicates found, 0 if none
} else {
DatabaseManager db(db_path);
const size_t batch_size = 50;
size_t current_batch = 0;
@ -710,7 +871,8 @@ int main(int argc, char *argv[]) {
if (++current_batch >= batch_size) {
db.commitTransaction();
spdlog::info("Committed batch of {} files to database", current_batch);
spdlog::info("Committed batch of {} files to database",
current_batch);
db.beginTransaction();
current_batch = 0;
}