From 2da111a348079ab9f42a3058aa61f52fe386d301 Mon Sep 17 00:00:00 2001 From: Guus Waals <_@guusw.nl> Date: Sun, 6 Oct 2024 19:44:32 +0800 Subject: [PATCH] WIP C parser --- scripts/DumpCurrentFunctionN.java | 116 ++++----- scripts/RebuildFunctionDatabase.java | 143 ++++++----- scripts/re3lib/CParser.java | 194 +++++++++++++++ scripts/re3lib/CTokenizer.java | 346 +++++++++++++++++++++++++++ 4 files changed, 684 insertions(+), 115 deletions(-) create mode 100644 scripts/re3lib/CParser.java create mode 100644 scripts/re3lib/CTokenizer.java diff --git a/scripts/DumpCurrentFunctionN.java b/scripts/DumpCurrentFunctionN.java index 36de4516..bb45bef7 100644 --- a/scripts/DumpCurrentFunctionN.java +++ b/scripts/DumpCurrentFunctionN.java @@ -18,28 +18,28 @@ import re3lib.TypeDumper; public class DumpCurrentFunctionN extends GhidraScript { final int NumFunctions = 8; - class Entry { - Function function; - } - class QueueEntry { - Function function; - List callees; - } + // class Entry { + // Function function; + // } + // class QueueEntry { + // Function function; + // List callees; + // } - HashSet
<Address> visited = new HashSet<>();
+  // HashSet<Address>
visited = new HashSet<>(); - QueueEntry enter(Function function) { - if (visited.contains(function.getEntryPoint())) - return null; + // QueueEntry enter(Function function) { + // if (visited.contains(function.getEntryPoint())) + // return null; - visited.add(function.getEntryPoint()); + // visited.add(function.getEntryPoint()); - QueueEntry entry = new QueueEntry(); - entry.function = function; + // QueueEntry entry = new QueueEntry(); + // entry.function = function; - function.getCalledFunctions(monitor); + // function.getCalledFunctions(monitor); - } + // } @Override public void run() throws Exception { @@ -51,57 +51,57 @@ public class DumpCurrentFunctionN extends GhidraScript { FunctionDumper functionDumper = new FunctionDumper(this, globalDumper); - // PCallTracer tracer = new PCallTracer(); - // tracer.setBlacklist(functionDumper.functionAddrBlackList); - // tracer.traceCalls(getFunctionContaining(currentAddress)); + PCallTracer tracer = new PCallTracer(); + tracer.setBlacklist(functionDumper.functionAddrBlackList); + tracer.traceCalls(getFunctionContaining(currentAddress)); List
queue = new ArrayList<>(); - List functionsToDump = new ArrayList<>(); - List functionsToDumpNew = new ArrayList<>(); - for (Function func : tracer.out) { - if (FunctionDumper.isDumpedFix(func)) - continue; + // List functionsToDump = new ArrayList<>(); + // List functionsToDumpNew = new ArrayList<>(); + // for (Function func : tracer.out) { + // if (FunctionDumper.isDumpedFix(func)) + // continue; - println("Dump: " + func.getName()); - functionsToDump.add(func); + // println("Dump: " + func.getName()); + // functionsToDump.add(func); - if (!FunctionDumper.isDumpedAuto(func)) - functionsToDumpNew.add(func); - } + // if (!FunctionDumper.isDumpedAuto(func)) + // functionsToDumpNew.add(func); + // } - if (!functionsToDump.isEmpty()) { - String newOpt = "Only new (" + functionsToDumpNew.size() + ")"; - String okOpt = "Yes (" + functionsToDump.size() + ")"; - String choice = askChoice("Confirmation", "About to generate " + functionsToDump.size() + " functions (" - + functionsToDumpNew.size() + " new), continue?", - new ArrayList() { - { - add(okOpt); - add(newOpt); - add("No"); - } - }, okOpt); - if (choice == okOpt) { - } else if (choice == newOpt) { - functionsToDump = functionsToDumpNew; - } else { - return; - } + // if (!functionsToDump.isEmpty()) { + // String newOpt = "Only new (" + functionsToDumpNew.size() + ")"; + // String okOpt = "Yes (" + functionsToDump.size() + ")"; + // String choice = askChoice("Confirmation", "About to generate " + functionsToDump.size() + " functions (" + // + functionsToDumpNew.size() + " new), continue?", + // new ArrayList() { + // { + // add(okOpt); + // add(newOpt); + // add("No"); + // } + // }, okOpt); + // if (choice == okOpt) { + // } else if (choice == newOpt) { + // functionsToDump = functionsToDumpNew; + // } else { + // return; + // } - for (Function func : functionsToDump) { - functionDumper.dump(func); - } + // for (Function func : functionsToDump) { + // functionDumper.dump(func); + // } - if (functionDumper.createdFile) - RecompileConfig.INSTANCE.touchCMakeTimestamp(); + // if (functionDumper.createdFile) + // RecompileConfig.INSTANCE.touchCMakeTimestamp(); - globalDumper.dumpGlobals(); - globalDumper.saveGlobalManifest(); - } + // globalDumper.dumpGlobals(); + // globalDumper.saveGlobalManifest(); + // } - // Dump types - TypeDumper dumper = new TypeDumper(this); - dumper.run(); + // // Dump types + // TypeDumper dumper = new TypeDumper(this); + // dumper.run(); } } diff --git a/scripts/RebuildFunctionDatabase.java b/scripts/RebuildFunctionDatabase.java index a9abd637..a12aaae4 100644 --- a/scripts/RebuildFunctionDatabase.java +++ b/scripts/RebuildFunctionDatabase.java @@ -3,14 +3,21 @@ import ghidra.app.script.GhidraScript; import ghidra.program.model.address.Address; +import ghidra.program.model.data.DataType; +import ghidra.program.model.data.StandAloneDataTypeManager; import re3lib.FunctionDatabase; import re3lib.RecompileConfig; +import re3lib.CParser; +import re3lib.CTokenizer; import java.io.File; import java.io.BufferedReader; import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -25,9 +32,10 @@ public class RebuildFunctionDatabase extends GhidraScript { functionDB = new FunctionDatabase(this); - scanDirectory(RecompileConfig.INSTANCE.dirDecompAuto, FunctionDatabase.Type.Auto); - scanDirectory(RecompileConfig.INSTANCE.dirDecompFix, FunctionDatabase.Type.Fix); - 
scanDirectory(RecompileConfig.INSTANCE.dirDecompStub, FunctionDatabase.Type.Stub); + scanFile(new File(RecompileConfig.INSTANCE.outputDir, "gh_auto/r3_engineLoop.cxx"), FunctionDatabase.Type.Auto); + // scanDirectory(RecompileConfig.INSTANCE.dirDecompAuto, FunctionDatabase.Type.Auto); + // scanDirectory(RecompileConfig.INSTANCE.dirDecompFix, FunctionDatabase.Type.Fix); + // scanDirectory(RecompileConfig.INSTANCE.dirDecompStub, FunctionDatabase.Type.Stub); println("Applying default filters..."); functionDB.applyDefaultFilters(rebuildAllGlobals); @@ -38,10 +46,10 @@ public class RebuildFunctionDatabase extends GhidraScript { println("Function database rebuilt successfully."); // for (FunctionDatabase.Entry entry : functionDB.entries) { - // println(entry.address + " " + entry.name + " " + entry.file.getName()); - // for (FunctionDatabase.Dependency dependency : entry.dependencies) { - // println(" " + dependency.address + " " + dependency.name); - // } + // println(entry.address + " " + entry.name + " " + entry.file.getName()); + // for (FunctionDatabase.Dependency dependency : entry.dependencies) { + // println(" " + dependency.address + " " + dependency.name); + // } // } } @@ -55,64 +63,85 @@ public class RebuildFunctionDatabase extends GhidraScript { } } - private void scanFile(File file, FunctionDatabase.Type type) throws Exception { - println("Scanning " + file); - try (BufferedReader reader = new BufferedReader(new FileReader(file))) { - String line; - Pattern dependencyPattern = Pattern.compile("(\\w+)\\s+(\\w+)\\(.*\\);\\s*//\\s*([0-9A-Fa-f]{8})\\s*//\\s*(.*)"); - Pattern addressPattern = Pattern.compile("//\\s*([0-9A-Fa-f]{8})"); - Pattern functionNamePattern = Pattern.compile("(\\S+)\\s+(\\S+)\\s*\\("); + private void parseOld(BufferedReader reader, File file, FunctionDatabase.Type type) throws Exception { + String line; + Pattern dependencyPattern = Pattern.compile("(\\w+)\\s+(\\w+)\\(.*\\);\\s*//\\s*([0-9A-Fa-f]{8})\\s*//\\s*(.*)"); + Pattern addressPattern = Pattern.compile("//\\s*([0-9A-Fa-f]{8})"); + Pattern functionNamePattern = Pattern.compile("(\\S+)\\s+(\\S+)\\s*\\("); - List dependencies = new ArrayList<>(); - String address = null; - String functionName = null; + List dependencies = new ArrayList<>(); + String address = null; + String functionName = null; - while ((line = reader.readLine()) != null) { - Matcher dependencyMatcher = dependencyPattern.matcher(line); - if (dependencyMatcher.find()) { - // println("Found dependency: " + dependencyMatcher.group(3)); - Address depAddress = currentProgram.getAddressFactory().getAddress(dependencyMatcher.group(3)); - String name = dependencyMatcher.group(2); - FunctionDatabase.Dependency dependency = functionDB.new Dependency(depAddress, name); - dependencies.add(dependency); - continue; - } + while ((line = reader.readLine()) != null) { + Matcher dependencyMatcher = dependencyPattern.matcher(line); + if (dependencyMatcher.find()) { + // println("Found dependency: " + dependencyMatcher.group(3)); + Address depAddress = currentProgram.getAddressFactory().getAddress(dependencyMatcher.group(3)); + String name = dependencyMatcher.group(2); + FunctionDatabase.Dependency dependency = functionDB.new Dependency(depAddress, name); + dependencies.add(dependency); + continue; + } - Matcher addressMatcher = addressPattern.matcher(line); - if (addressMatcher.find()) { - // println("Found address: " + addressMatcher.group(1)); - address = addressMatcher.group(1); - // Skip any comments or newlines between address and function definition - 
while ((line = reader.readLine()) != null) { - line = line.trim(); - // println("Line: " + line); - if (!line.isEmpty()) { - Matcher functionNameMatcher = functionNamePattern.matcher(line); - if (functionNameMatcher.find()) { - functionName = functionNameMatcher.group(2).trim(); - break; - } + Matcher addressMatcher = addressPattern.matcher(line); + if (addressMatcher.find()) { + // println("Found address: " + addressMatcher.group(1)); + address = addressMatcher.group(1); + // Skip any comments or newlines between address and function definition + while ((line = reader.readLine()) != null) { + line = line.trim(); + // println("Line: " + line); + if (!line.isEmpty()) { + Matcher functionNameMatcher = functionNamePattern.matcher(line); + if (functionNameMatcher.find()) { + functionName = functionNameMatcher.group(2).trim(); + break; } } - if (functionName != null) { - break; - } + } + if (functionName != null) { + break; } } + } - if (address != null && functionName != null) { - Address functionAddress = currentProgram.getAddressFactory().getAddress(address); - FunctionDatabase.Entry entry = functionDB.new Entry(); - entry.address = functionAddress; - entry.name = functionName; - entry.file = file; - entry.type = type; - entry.dependencies = dependencies; - functionDB.entries.add(entry); - } else { - // throw new Exception("Failed to parse function at " + file.getName()); - println("Failed to parse function at " + file.getName()); - } + if (address != null && functionName != null) { + Address functionAddress = currentProgram.getAddressFactory().getAddress(address); + FunctionDatabase.Entry entry = functionDB.new Entry(); + entry.address = functionAddress; + entry.name = functionName; + entry.file = file; + entry.type = type; + entry.dependencies = dependencies; + functionDB.entries.add(entry); + } else { + // throw new Exception("Failed to parse function at " + file.getName()); + println("Failed to parse function at " + file.getName()); + } + } + + private void scanFile(File file, FunctionDatabase.Type type) throws Exception { + println("Scanning " + file); + + String text = new String(Files.readAllBytes(file.toPath())); + CTokenizer.TokenSet tokens = new CTokenizer(text).parse(); + CParser parser = new CParser(tokens); + parser.parse(); + + // for (CTokenizer.Token token : tokens.getTokens()) { + // int line = tokens.getLine(token.ofs); + // println("Line " + line + ": " + token.ofs + " " + token.len + " " + token.type + " - " + // + tokens.getTextNoNewlines(token)); + // } + for (CParser.Function function : parser.getFunctions()) { + println("Function: " + function.name + " " + function.startOffset + " " + function.endOffset); + } + for (CParser.FunctionCall functionCall : parser.getFunctionCalls()) { + println("FunctionCall: " + functionCall.name + " " + functionCall.startOffset + " " + functionCall.endOffset); + } + for (CParser.Variable variable : parser.getVariables()) { + println("Variable: " + variable.name + " " + variable.startOffset + " " + variable.endOffset); } } } diff --git a/scripts/re3lib/CParser.java b/scripts/re3lib/CParser.java new file mode 100644 index 00000000..8ae54451 --- /dev/null +++ b/scripts/re3lib/CParser.java @@ -0,0 +1,194 @@ +package re3lib; + +import java.util.*; + +import re3lib.CTokenizer.Token; + +public class CParser { + private CTokenizer.TokenSet tokenSet; + private List variables; + private List functions; + private List functionCalls; + + public CParser(CTokenizer.TokenSet tokenSet) { + this.tokenSet = tokenSet; + this.variables = new ArrayList<>(); 
+ this.functions = new ArrayList<>(); + this.functionCalls = new ArrayList<>(); + } + + int index = 0; + + public void parse() { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + for (index = 0; index < tokens.length; index++) { + CTokenizer.Token token = tokens[index]; + if (token.type == CTokenizer.TokenType.BLOCK_COMMENT || token.type == CTokenizer.TokenType.COMMENT) { + continue; + } else if (token.type == CTokenizer.TokenType.HASH) { + index = parsePreprocessorExpression(); + } else if (tokens[index].type == CTokenizer.TokenType.IDENTIFIER) { + if (index + 1 < tokens.length && tokens[index + 1].type == CTokenizer.TokenType.L_PAREN) { + // Function call or declaration/definition + if (index > 0 && (tokens[index - 1].type == CTokenizer.TokenType.IDENTIFIER || + tokens[index - 1].type == CTokenizer.TokenType.OTHER)) { + // Function declaration or definition + index = parseFunctionDeclaration(); + } else { + // Function call + index = parseFunctionCall(); + } + } else { + // Variable reference + index = parseVariableReference(); + } + } + } + } + + // Try to parse prep expression + private int parsePreprocessorExpression() { + int index = this.index; + if (tokenSet.tokens[index].type == CTokenizer.TokenType.HASH) { + int startLine = tokenSet.getLine(index); + while (index < tokenSet.tokens.length) { + if (tokenSet.getLine(index) > startLine) { + break; + } + index++; + } + // Find first next line token + index--; + } + return index; + } + + // Try to parse function declaration and return the ending token index + private int parseFunctionDeclaration() { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + String name = tokenSet.getTextNoNewlines(tokens[index]); + int endIndex = findClosingParenthesis(index + 1); + + if (endIndex == -1) + return index; + + boolean isDefinition = false; + if (endIndex + 1 < tokens.length && tokens[endIndex + 1].type == CTokenizer.TokenType.L_BRACE) { + isDefinition = true; + endIndex = findClosingBrace(endIndex + 1); + } + + if (endIndex == -1) + return index; + + Function function = new Function(name, tokens[index].ofs, tokens[endIndex].ofs + tokens[endIndex].len, + isDefinition); + functions.add(function); + return endIndex - 1; + } + + // Try to parse function call and return the ending token index + private int parseFunctionCall() { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + String name = tokenSet.getTextNoNewlines(tokens[index]); + int endIndex = findClosingParenthesis(index + 1); + if (endIndex == -1) + return index; + + FunctionCall functionCall = new FunctionCall(name, tokens[index].ofs, + tokens[endIndex].ofs + tokens[endIndex].len); + functionCalls.add(functionCall); + return endIndex - 1; + } + + // Try to parse variable reference and add it to the list + private int parseVariableReference() { + CTokenizer.Token token = tokenSet.getTokens()[index]; + String name = tokenSet.getTextNoNewlines(token); + Variable variable = new Variable(name, token.ofs, token.ofs + token.len); + variables.add(variable); + return index + 1; + } + + private int findClosingParenthesis(int startIndex) { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + int parenCount = 1; + for (int i = startIndex + 1; i < tokens.length; i++) { + if (tokens[i].type == CTokenizer.TokenType.L_PAREN) { + parenCount++; + } else if (tokens[i].type == CTokenizer.TokenType.R_PAREN) { + parenCount--; + if (parenCount == 0) { + return i; + } + } + } + return -1; + } + + private int findClosingBrace(int startIndex) { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + int 
braceCount = 1; + for (int i = startIndex + 1; i < tokens.length; i++) { + if (tokens[i].type == CTokenizer.TokenType.L_BRACE) { + braceCount++; + } else if (tokens[i].type == CTokenizer.TokenType.R_BRACE) { + braceCount--; + if (braceCount == 0) { + return i; + } + } + } + return -1; + } + + public List getVariables() { + return variables; + } + + public List getFunctions() { + return functions; + } + + public List getFunctionCalls() { + return functionCalls; + } + + public static class Variable { + public final String name; + public final int startOffset; + public final int endOffset; + + public Variable(String name, int startOffset, int endOffset) { + this.name = name; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + } + + public static class Function { + public final String name; + public final int startOffset; + public final int endOffset; + public final boolean isDefinition; + + public Function(String name, int startOffset, int endOffset, boolean isDefinition) { + this.name = name; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.isDefinition = isDefinition; + } + } + + public static class FunctionCall { + public final String name; + public final int startOffset; + public final int endOffset; + + public FunctionCall(String name, int startOffset, int endOffset) { + this.name = name; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + } +} diff --git a/scripts/re3lib/CTokenizer.java b/scripts/re3lib/CTokenizer.java new file mode 100644 index 00000000..16e25ed7 --- /dev/null +++ b/scripts/re3lib/CTokenizer.java @@ -0,0 +1,346 @@ +package re3lib; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import ghidra.app.script.GhidraScript; + +public class CTokenizer { + public enum TokenType { + UNDEFINED, + HASH, + L_PAREN, + R_PAREN, + L_BRACE, + R_BRACE, + SEMICOLON, + COMMA, + COMMENT, + BLOCK_COMMENT, + IDENTIFIER, + STRING_LITERAL, + NUMERIC_LITERAL, + NUMERIC_LITERAL_HEX, + OTHER, + KEYWORD, + } + + public class Token { + public int ofs; + public int len; + public TokenType type; + } + + public class TokenSet { + public final Token[] tokens; + public final String text; + private final TreeMap lineNumberTable; + + TokenSet(Token[] tokens, String text, TreeMap lineNumberTable) { + this.tokens = tokens; + this.text = text; + this.lineNumberTable = lineNumberTable; + } + + public Token[] getTokens() { + return this.tokens; + } + + public int getLine(int offset) { + Map.Entry entry = lineNumberTable.floorEntry(offset); + return entry != null ? entry.getValue() : -1; + } + + public String getTextNoNewlines(Token token) { + String text = getText(token); + return text.replace("\n", ""); + } + }; + + private final String text; + private TreeMap lineNumberTable; + public GhidraScript log; + + public CTokenizer(String text) { + this.text = text; + } + + public CTokenizer(String text, GhidraScript script) { + this.text = text; + this.log = log; + } + + String getText(Token token) { + return getText(token.ofs, token.len); + } + + String getText(int ofs, int len) { + return text.substring(ofs, ofs + len); // Fixed recursion issue + } + + TokenType lastTokenType = TokenType.UNDEFINED; + + /** + * Inserts a new token into the tokens list. + * + * @param tokens The list of tokens. + * @param tokenStart The starting index of the token. + * @param tokenEnd The current index in the text. + * @param currentType The type of the current token. 
+ */ + private void insertToken(List tokens, int tokenStart, int tokenEnd, TokenType currentType) { + if (currentType != TokenType.UNDEFINED && tokenStart < tokenEnd) { + // Strip whitespace + for (int i = tokenStart; i < tokenEnd; i++) { + if (Character.isWhitespace(text.charAt(i))) { + tokenStart = i + 1; + } else { + break; + } + } + // Strip whitespace from end + for (int i = tokenEnd - 1; i >= tokenStart; i--) { + if (Character.isWhitespace(text.charAt(i))) { + tokenEnd = i; + } else { + break; + } + } + + if (tokenEnd - tokenStart > 0) { + Token token = new Token(); + token.ofs = tokenStart; + token.len = tokenEnd - tokenStart; + token.type = currentType; + if (currentType == TokenType.IDENTIFIER && isKeyword(getText(token))) { + token.type = TokenType.KEYWORD; + } + tokens.add(token); + } + + // Consume the token + currentType = TokenType.UNDEFINED; + } + } + + /** + * Handles the insertion of the last token after parsing is complete. + * + * @param tokens The list of tokens. + * @param tokenStart The starting index of the last token. + * @param currentType The type of the last token. + */ + private void handleLastToken(List tokens, int tokenStart, TokenType currentType) { + insertToken(tokens, tokenStart, text.length(), currentType); + } + + void buildLineNumberTable() { + this.lineNumberTable = new TreeMap<>(); + int lineNumber = 1; + lineNumberTable.put(0, 1); + for (int i = 0; i < text.length(); i++) { + if (text.charAt(i) == '\n') { + lineNumber++; + lineNumberTable.put(i + 1, lineNumber); + } + } + } + + List tokens = new ArrayList<>(); + + // Initialize tokenization state + int tokenStart = 0; + TokenType currentType = TokenType.UNDEFINED; + boolean inComment = false; + boolean inBlockComment = false; + boolean inString = false; + + class ScanRange { + int start; + int end; + TokenType type; + + ScanRange(int start, int end, TokenType type) { + this.start = start; + this.end = end; + this.type = type; + } + + // Invalid constructor + ScanRange() { + this.type = TokenType.UNDEFINED; + } + + boolean isValid() { + return this.type != TokenType.UNDEFINED; + } + }; + + // Add the following method to handle hexadecimal literals + private ScanRange tryParseHexadecimal(int currentIndex) { + if (text.charAt(currentIndex) == '0' && currentIndex + 1 < text.length()) { + char nextChar = text.charAt(currentIndex + 1); + if (nextChar == 'x' || nextChar == 'X') { + int tempIndex = currentIndex + 2; + while (tempIndex < text.length()) { + char c = text.charAt(tempIndex); + if (Character.digit(c, 16) == -1) { + break; + } + tempIndex++; + } + if (tempIndex > currentIndex + 2) { + return new ScanRange(currentIndex, tempIndex, TokenType.NUMERIC_LITERAL_HEX); + } + } + } + return new ScanRange(); + } + + // Identifier that starts with a letter or underscore, and can contain letters, + // digits, and underscores + private ScanRange tryParseIdentifier(int currentIndex) { + if (Character.isLetter(text.charAt(currentIndex)) || text.charAt(currentIndex) == '_') { + int tempIndex = currentIndex + 1; + while (tempIndex < text.length()) { + char c = text.charAt(tempIndex); + if (!(Character.isLetter(c) || Character.isDigit(c) || c == '_')) { + break; + } + tempIndex++; + } + return new ScanRange(currentIndex, tempIndex, TokenType.IDENTIFIER); + } + return new ScanRange(); + } + + private ScanRange tryParseWithLookahead(int currentIndex) { + ScanRange sr = tryParseHexadecimal(currentIndex); + if (!sr.isValid()) { + sr = tryParseIdentifier(currentIndex); + } + return sr; + } + + public boolean 
isKeyword(String text) { + return text.equals("while") || text.equals("for") || text.equals("if") || text.equals("else") || + text.equals("return") || text.equals("struct") || text.equals("typedef") || + text.equals("enum") || text.equals("union") || text.equals("const") || text.equals("static"); + } + + public TokenSet parse() { + this.buildLineNumberTable(); + + int index = 0; + while (index < text.length()) { + char currentChar = text.charAt(index); + TokenType newType = TokenType.OTHER; + + // Handle comments + if (inBlockComment) { + newType = TokenType.BLOCK_COMMENT; + if (currentChar == '*') { + if (index + 1 < text.length() && text.charAt(index + 1) == '/') { + inBlockComment = false; + index++; + } + } + } else if (inComment) { + newType = TokenType.COMMENT; + if (currentChar == '\n') { + inComment = false; + } + } + // Handle string literals + else if (inString) { + if (currentChar == '"') { + inString = false; + newType = TokenType.STRING_LITERAL; + } else { + newType = TokenType.STRING_LITERAL; + } + } + // Detect start of comments + else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '*') { + inBlockComment = true; + newType = TokenType.BLOCK_COMMENT; + } else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '/') { + inComment = true; + newType = TokenType.COMMENT; + } + // Detect start of string literals + else if (currentChar == '"') { + inString = true; + newType = TokenType.STRING_LITERAL; + } else { + ScanRange range = tryParseWithLookahead(index); + if (range.isValid()) { + // Insert the current token first + // script.println("Inserting current token: " + currentType + ", start: " + + // tokenStart + ", end: " + range.start); + insertToken(tokens, tokenStart, range.start, currentType); + + // Insert a ranged token + // script.println("Inserting ranged token: " + range.type + " start: " + + // range.start + ", end: " + range.end); + insertToken(tokens, range.start, range.end, range.type); + + // New start + currentType = TokenType.UNDEFINED; + tokenStart = range.end; + index = range.end; + } + // Detect numeric literals + else if (Character.isDigit(currentChar)) { + newType = TokenType.NUMERIC_LITERAL; + } + // Detect identifiers + else if (Character.isLetter(currentChar) || currentChar == '_') { + newType = TokenType.IDENTIFIER; + } + // Detect parentheses + else if (currentChar == '(') { + newType = TokenType.L_PAREN; + } else if (currentChar == ')') { + newType = TokenType.R_PAREN; + } + // Detect braces + else if (currentChar == '{') { + newType = TokenType.L_BRACE; + } else if (currentChar == '}') { + newType = TokenType.R_BRACE; + } + // Detect semicolon + else if (currentChar == ';') { + newType = TokenType.SEMICOLON; + } + // Detect comma + else if (currentChar == ',') { + newType = TokenType.COMMA; + } else if (currentChar == '#') { + newType = TokenType.HASH; + } + // Handle other characters + else { + newType = TokenType.OTHER; + } + } + + // Insert a new token if the type changes + if (newType != currentType) { + insertToken(tokens, tokenStart, index, currentType); + tokenStart = index; + currentType = newType; + } + + index++; + } + + // Handle the last token + handleLastToken(tokens, tokenStart, currentType); + + return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable); + } +}
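
Usage sketch (illustrative only, not part of the diff): the driving pattern is the one scanFile() now uses in RebuildFunctionDatabase.java — tokenize the whole source text, hand the resulting TokenSet to CParser, then read back the recovered functions, calls, and variable references. A standalone equivalent might look like the following; the class name CParserSmoke and the input path taken from args[0] are placeholders, everything else is the API introduced by this patch.

    import java.nio.file.Files;
    import java.nio.file.Paths;

    import re3lib.CParser;
    import re3lib.CTokenizer;

    // Hypothetical driver, not part of this patch.
    public class CParserSmoke {
      public static void main(String[] args) throws Exception {
        // Read a decompiled source file into memory (path supplied by the caller).
        String text = new String(Files.readAllBytes(Paths.get(args[0])));

        // Pass 1: lexical scan into a flat token set with line-number lookup.
        CTokenizer.TokenSet tokens = new CTokenizer(text).parse();

        // Pass 2: single forward walk that records functions, calls, and variable refs.
        CParser parser = new CParser(tokens);
        parser.parse();

        // Offsets are character positions into the original text, end-exclusive.
        for (CParser.Function f : parser.getFunctions())
          System.out.println("Function " + f.name + " [" + f.startOffset + ", " + f.endOffset + ")");
        for (CParser.FunctionCall c : parser.getFunctionCalls())
          System.out.println("Call     " + c.name + " [" + c.startOffset + ", " + c.endOffset + ")");
        for (CParser.Variable v : parser.getVariables())
          System.out.println("Variable " + v.name + " [" + v.startOffset + ", " + v.endOffset + ")");
      }
    }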