From 2da111a348079ab9f42a3058aa61f52fe386d301 Mon Sep 17 00:00:00 2001 From: Guus Waals <_@guusw.nl> Date: Sun, 6 Oct 2024 19:44:32 +0800 Subject: [PATCH] WIP C parser --- scripts/DumpCurrentFunctionN.java | 116 ++++----- scripts/RebuildFunctionDatabase.java | 143 ++++++----- scripts/re3lib/CParser.java | 194 +++++++++++++++ scripts/re3lib/CTokenizer.java | 346 +++++++++++++++++++++++++++ 4 files changed, 684 insertions(+), 115 deletions(-) create mode 100644 scripts/re3lib/CParser.java create mode 100644 scripts/re3lib/CTokenizer.java diff --git a/scripts/DumpCurrentFunctionN.java b/scripts/DumpCurrentFunctionN.java index 36de4516..bb45bef7 100644 --- a/scripts/DumpCurrentFunctionN.java +++ b/scripts/DumpCurrentFunctionN.java @@ -18,28 +18,28 @@ import re3lib.TypeDumper; public class DumpCurrentFunctionN extends GhidraScript { final int NumFunctions = 8; - class Entry { - Function function; - } - class QueueEntry { - Function function; - List callees; - } + // class Entry { + // Function function; + // } + // class QueueEntry { + // Function function; + // List callees; + // } - HashSet
<Address> visited = new HashSet<>();
+  // HashSet<Address>
visited = new HashSet<>(); - QueueEntry enter(Function function) { - if (visited.contains(function.getEntryPoint())) - return null; + // QueueEntry enter(Function function) { + // if (visited.contains(function.getEntryPoint())) + // return null; - visited.add(function.getEntryPoint()); + // visited.add(function.getEntryPoint()); - QueueEntry entry = new QueueEntry(); - entry.function = function; + // QueueEntry entry = new QueueEntry(); + // entry.function = function; - function.getCalledFunctions(monitor); + // function.getCalledFunctions(monitor); - } + // } @Override public void run() throws Exception { @@ -51,57 +51,57 @@ public class DumpCurrentFunctionN extends GhidraScript { FunctionDumper functionDumper = new FunctionDumper(this, globalDumper); - // PCallTracer tracer = new PCallTracer(); - // tracer.setBlacklist(functionDumper.functionAddrBlackList); - // tracer.traceCalls(getFunctionContaining(currentAddress)); + PCallTracer tracer = new PCallTracer(); + tracer.setBlacklist(functionDumper.functionAddrBlackList); + tracer.traceCalls(getFunctionContaining(currentAddress)); List
queue = new ArrayList<>(); - List functionsToDump = new ArrayList<>(); - List functionsToDumpNew = new ArrayList<>(); - for (Function func : tracer.out) { - if (FunctionDumper.isDumpedFix(func)) - continue; + // List functionsToDump = new ArrayList<>(); + // List functionsToDumpNew = new ArrayList<>(); + // for (Function func : tracer.out) { + // if (FunctionDumper.isDumpedFix(func)) + // continue; - println("Dump: " + func.getName()); - functionsToDump.add(func); + // println("Dump: " + func.getName()); + // functionsToDump.add(func); - if (!FunctionDumper.isDumpedAuto(func)) - functionsToDumpNew.add(func); - } + // if (!FunctionDumper.isDumpedAuto(func)) + // functionsToDumpNew.add(func); + // } - if (!functionsToDump.isEmpty()) { - String newOpt = "Only new (" + functionsToDumpNew.size() + ")"; - String okOpt = "Yes (" + functionsToDump.size() + ")"; - String choice = askChoice("Confirmation", "About to generate " + functionsToDump.size() + " functions (" - + functionsToDumpNew.size() + " new), continue?", - new ArrayList() { - { - add(okOpt); - add(newOpt); - add("No"); - } - }, okOpt); - if (choice == okOpt) { - } else if (choice == newOpt) { - functionsToDump = functionsToDumpNew; - } else { - return; - } + // if (!functionsToDump.isEmpty()) { + // String newOpt = "Only new (" + functionsToDumpNew.size() + ")"; + // String okOpt = "Yes (" + functionsToDump.size() + ")"; + // String choice = askChoice("Confirmation", "About to generate " + functionsToDump.size() + " functions (" + // + functionsToDumpNew.size() + " new), continue?", + // new ArrayList() { + // { + // add(okOpt); + // add(newOpt); + // add("No"); + // } + // }, okOpt); + // if (choice == okOpt) { + // } else if (choice == newOpt) { + // functionsToDump = functionsToDumpNew; + // } else { + // return; + // } - for (Function func : functionsToDump) { - functionDumper.dump(func); - } + // for (Function func : functionsToDump) { + // functionDumper.dump(func); + // } - if (functionDumper.createdFile) - RecompileConfig.INSTANCE.touchCMakeTimestamp(); + // if (functionDumper.createdFile) + // RecompileConfig.INSTANCE.touchCMakeTimestamp(); - globalDumper.dumpGlobals(); - globalDumper.saveGlobalManifest(); - } + // globalDumper.dumpGlobals(); + // globalDumper.saveGlobalManifest(); + // } - // Dump types - TypeDumper dumper = new TypeDumper(this); - dumper.run(); + // // Dump types + // TypeDumper dumper = new TypeDumper(this); + // dumper.run(); } } diff --git a/scripts/RebuildFunctionDatabase.java b/scripts/RebuildFunctionDatabase.java index a9abd637..a12aaae4 100644 --- a/scripts/RebuildFunctionDatabase.java +++ b/scripts/RebuildFunctionDatabase.java @@ -3,14 +3,21 @@ import ghidra.app.script.GhidraScript; import ghidra.program.model.address.Address; +import ghidra.program.model.data.DataType; +import ghidra.program.model.data.StandAloneDataTypeManager; import re3lib.FunctionDatabase; import re3lib.RecompileConfig; +import re3lib.CParser; +import re3lib.CTokenizer; import java.io.File; import java.io.BufferedReader; import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -25,9 +32,10 @@ public class RebuildFunctionDatabase extends GhidraScript { functionDB = new FunctionDatabase(this); - scanDirectory(RecompileConfig.INSTANCE.dirDecompAuto, FunctionDatabase.Type.Auto); - scanDirectory(RecompileConfig.INSTANCE.dirDecompFix, FunctionDatabase.Type.Fix); - 
scanDirectory(RecompileConfig.INSTANCE.dirDecompStub, FunctionDatabase.Type.Stub); + scanFile(new File(RecompileConfig.INSTANCE.outputDir, "gh_auto/r3_engineLoop.cxx"), FunctionDatabase.Type.Auto); + // scanDirectory(RecompileConfig.INSTANCE.dirDecompAuto, FunctionDatabase.Type.Auto); + // scanDirectory(RecompileConfig.INSTANCE.dirDecompFix, FunctionDatabase.Type.Fix); + // scanDirectory(RecompileConfig.INSTANCE.dirDecompStub, FunctionDatabase.Type.Stub); println("Applying default filters..."); functionDB.applyDefaultFilters(rebuildAllGlobals); @@ -38,10 +46,10 @@ public class RebuildFunctionDatabase extends GhidraScript { println("Function database rebuilt successfully."); // for (FunctionDatabase.Entry entry : functionDB.entries) { - // println(entry.address + " " + entry.name + " " + entry.file.getName()); - // for (FunctionDatabase.Dependency dependency : entry.dependencies) { - // println(" " + dependency.address + " " + dependency.name); - // } + // println(entry.address + " " + entry.name + " " + entry.file.getName()); + // for (FunctionDatabase.Dependency dependency : entry.dependencies) { + // println(" " + dependency.address + " " + dependency.name); + // } // } } @@ -55,64 +63,85 @@ public class RebuildFunctionDatabase extends GhidraScript { } } - private void scanFile(File file, FunctionDatabase.Type type) throws Exception { - println("Scanning " + file); - try (BufferedReader reader = new BufferedReader(new FileReader(file))) { - String line; - Pattern dependencyPattern = Pattern.compile("(\\w+)\\s+(\\w+)\\(.*\\);\\s*//\\s*([0-9A-Fa-f]{8})\\s*//\\s*(.*)"); - Pattern addressPattern = Pattern.compile("//\\s*([0-9A-Fa-f]{8})"); - Pattern functionNamePattern = Pattern.compile("(\\S+)\\s+(\\S+)\\s*\\("); + private void parseOld(BufferedReader reader, File file, FunctionDatabase.Type type) throws Exception { + String line; + Pattern dependencyPattern = Pattern.compile("(\\w+)\\s+(\\w+)\\(.*\\);\\s*//\\s*([0-9A-Fa-f]{8})\\s*//\\s*(.*)"); + Pattern addressPattern = Pattern.compile("//\\s*([0-9A-Fa-f]{8})"); + Pattern functionNamePattern = Pattern.compile("(\\S+)\\s+(\\S+)\\s*\\("); - List dependencies = new ArrayList<>(); - String address = null; - String functionName = null; + List dependencies = new ArrayList<>(); + String address = null; + String functionName = null; - while ((line = reader.readLine()) != null) { - Matcher dependencyMatcher = dependencyPattern.matcher(line); - if (dependencyMatcher.find()) { - // println("Found dependency: " + dependencyMatcher.group(3)); - Address depAddress = currentProgram.getAddressFactory().getAddress(dependencyMatcher.group(3)); - String name = dependencyMatcher.group(2); - FunctionDatabase.Dependency dependency = functionDB.new Dependency(depAddress, name); - dependencies.add(dependency); - continue; - } + while ((line = reader.readLine()) != null) { + Matcher dependencyMatcher = dependencyPattern.matcher(line); + if (dependencyMatcher.find()) { + // println("Found dependency: " + dependencyMatcher.group(3)); + Address depAddress = currentProgram.getAddressFactory().getAddress(dependencyMatcher.group(3)); + String name = dependencyMatcher.group(2); + FunctionDatabase.Dependency dependency = functionDB.new Dependency(depAddress, name); + dependencies.add(dependency); + continue; + } - Matcher addressMatcher = addressPattern.matcher(line); - if (addressMatcher.find()) { - // println("Found address: " + addressMatcher.group(1)); - address = addressMatcher.group(1); - // Skip any comments or newlines between address and function definition - 
while ((line = reader.readLine()) != null) { - line = line.trim(); - // println("Line: " + line); - if (!line.isEmpty()) { - Matcher functionNameMatcher = functionNamePattern.matcher(line); - if (functionNameMatcher.find()) { - functionName = functionNameMatcher.group(2).trim(); - break; - } + Matcher addressMatcher = addressPattern.matcher(line); + if (addressMatcher.find()) { + // println("Found address: " + addressMatcher.group(1)); + address = addressMatcher.group(1); + // Skip any comments or newlines between address and function definition + while ((line = reader.readLine()) != null) { + line = line.trim(); + // println("Line: " + line); + if (!line.isEmpty()) { + Matcher functionNameMatcher = functionNamePattern.matcher(line); + if (functionNameMatcher.find()) { + functionName = functionNameMatcher.group(2).trim(); + break; } } - if (functionName != null) { - break; - } + } + if (functionName != null) { + break; } } + } - if (address != null && functionName != null) { - Address functionAddress = currentProgram.getAddressFactory().getAddress(address); - FunctionDatabase.Entry entry = functionDB.new Entry(); - entry.address = functionAddress; - entry.name = functionName; - entry.file = file; - entry.type = type; - entry.dependencies = dependencies; - functionDB.entries.add(entry); - } else { - // throw new Exception("Failed to parse function at " + file.getName()); - println("Failed to parse function at " + file.getName()); - } + if (address != null && functionName != null) { + Address functionAddress = currentProgram.getAddressFactory().getAddress(address); + FunctionDatabase.Entry entry = functionDB.new Entry(); + entry.address = functionAddress; + entry.name = functionName; + entry.file = file; + entry.type = type; + entry.dependencies = dependencies; + functionDB.entries.add(entry); + } else { + // throw new Exception("Failed to parse function at " + file.getName()); + println("Failed to parse function at " + file.getName()); + } + } + + private void scanFile(File file, FunctionDatabase.Type type) throws Exception { + println("Scanning " + file); + + String text = new String(Files.readAllBytes(file.toPath())); + CTokenizer.TokenSet tokens = new CTokenizer(text).parse(); + CParser parser = new CParser(tokens); + parser.parse(); + + // for (CTokenizer.Token token : tokens.getTokens()) { + // int line = tokens.getLine(token.ofs); + // println("Line " + line + ": " + token.ofs + " " + token.len + " " + token.type + " - " + // + tokens.getTextNoNewlines(token)); + // } + for (CParser.Function function : parser.getFunctions()) { + println("Function: " + function.name + " " + function.startOffset + " " + function.endOffset); + } + for (CParser.FunctionCall functionCall : parser.getFunctionCalls()) { + println("FunctionCall: " + functionCall.name + " " + functionCall.startOffset + " " + functionCall.endOffset); + } + for (CParser.Variable variable : parser.getVariables()) { + println("Variable: " + variable.name + " " + variable.startOffset + " " + variable.endOffset); } } } diff --git a/scripts/re3lib/CParser.java b/scripts/re3lib/CParser.java new file mode 100644 index 00000000..8ae54451 --- /dev/null +++ b/scripts/re3lib/CParser.java @@ -0,0 +1,194 @@ +package re3lib; + +import java.util.*; + +import re3lib.CTokenizer.Token; + +public class CParser { + private CTokenizer.TokenSet tokenSet; + private List variables; + private List functions; + private List functionCalls; + + public CParser(CTokenizer.TokenSet tokenSet) { + this.tokenSet = tokenSet; + this.variables = new ArrayList<>(); 
+ this.functions = new ArrayList<>(); + this.functionCalls = new ArrayList<>(); + } + + int index = 0; + + public void parse() { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + for (index = 0; index < tokens.length; index++) { + CTokenizer.Token token = tokens[index]; + if (token.type == CTokenizer.TokenType.BLOCK_COMMENT || token.type == CTokenizer.TokenType.COMMENT) { + continue; + } else if (token.type == CTokenizer.TokenType.HASH) { + index = parsePreprocessorExpression(); + } else if (tokens[index].type == CTokenizer.TokenType.IDENTIFIER) { + if (index + 1 < tokens.length && tokens[index + 1].type == CTokenizer.TokenType.L_PAREN) { + // Function call or declaration/definition + if (index > 0 && (tokens[index - 1].type == CTokenizer.TokenType.IDENTIFIER || + tokens[index - 1].type == CTokenizer.TokenType.OTHER)) { + // Function declaration or definition + index = parseFunctionDeclaration(); + } else { + // Function call + index = parseFunctionCall(); + } + } else { + // Variable reference + index = parseVariableReference(); + } + } + } + } + + // Try to parse prep expression + private int parsePreprocessorExpression() { + int index = this.index; + if (tokenSet.tokens[index].type == CTokenizer.TokenType.HASH) { + int startLine = tokenSet.getLine(index); + while (index < tokenSet.tokens.length) { + if (tokenSet.getLine(index) > startLine) { + break; + } + index++; + } + // Find first next line token + index--; + } + return index; + } + + // Try to parse function declaration and return the ending token index + private int parseFunctionDeclaration() { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + String name = tokenSet.getTextNoNewlines(tokens[index]); + int endIndex = findClosingParenthesis(index + 1); + + if (endIndex == -1) + return index; + + boolean isDefinition = false; + if (endIndex + 1 < tokens.length && tokens[endIndex + 1].type == CTokenizer.TokenType.L_BRACE) { + isDefinition = true; + endIndex = findClosingBrace(endIndex + 1); + } + + if (endIndex == -1) + return index; + + Function function = new Function(name, tokens[index].ofs, tokens[endIndex].ofs + tokens[endIndex].len, + isDefinition); + functions.add(function); + return endIndex - 1; + } + + // Try to parse function call and return the ending token index + private int parseFunctionCall() { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + String name = tokenSet.getTextNoNewlines(tokens[index]); + int endIndex = findClosingParenthesis(index + 1); + if (endIndex == -1) + return index; + + FunctionCall functionCall = new FunctionCall(name, tokens[index].ofs, + tokens[endIndex].ofs + tokens[endIndex].len); + functionCalls.add(functionCall); + return endIndex - 1; + } + + // Try to parse variable reference and add it to the list + private int parseVariableReference() { + CTokenizer.Token token = tokenSet.getTokens()[index]; + String name = tokenSet.getTextNoNewlines(token); + Variable variable = new Variable(name, token.ofs, token.ofs + token.len); + variables.add(variable); + return index + 1; + } + + private int findClosingParenthesis(int startIndex) { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + int parenCount = 1; + for (int i = startIndex + 1; i < tokens.length; i++) { + if (tokens[i].type == CTokenizer.TokenType.L_PAREN) { + parenCount++; + } else if (tokens[i].type == CTokenizer.TokenType.R_PAREN) { + parenCount--; + if (parenCount == 0) { + return i; + } + } + } + return -1; + } + + private int findClosingBrace(int startIndex) { + CTokenizer.Token[] tokens = tokenSet.getTokens(); + int 
braceCount = 1; + for (int i = startIndex + 1; i < tokens.length; i++) { + if (tokens[i].type == CTokenizer.TokenType.L_BRACE) { + braceCount++; + } else if (tokens[i].type == CTokenizer.TokenType.R_BRACE) { + braceCount--; + if (braceCount == 0) { + return i; + } + } + } + return -1; + } + + public List getVariables() { + return variables; + } + + public List getFunctions() { + return functions; + } + + public List getFunctionCalls() { + return functionCalls; + } + + public static class Variable { + public final String name; + public final int startOffset; + public final int endOffset; + + public Variable(String name, int startOffset, int endOffset) { + this.name = name; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + } + + public static class Function { + public final String name; + public final int startOffset; + public final int endOffset; + public final boolean isDefinition; + + public Function(String name, int startOffset, int endOffset, boolean isDefinition) { + this.name = name; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.isDefinition = isDefinition; + } + } + + public static class FunctionCall { + public final String name; + public final int startOffset; + public final int endOffset; + + public FunctionCall(String name, int startOffset, int endOffset) { + this.name = name; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + } +} diff --git a/scripts/re3lib/CTokenizer.java b/scripts/re3lib/CTokenizer.java new file mode 100644 index 00000000..16e25ed7 --- /dev/null +++ b/scripts/re3lib/CTokenizer.java @@ -0,0 +1,346 @@ +package re3lib; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import ghidra.app.script.GhidraScript; + +public class CTokenizer { + public enum TokenType { + UNDEFINED, + HASH, + L_PAREN, + R_PAREN, + L_BRACE, + R_BRACE, + SEMICOLON, + COMMA, + COMMENT, + BLOCK_COMMENT, + IDENTIFIER, + STRING_LITERAL, + NUMERIC_LITERAL, + NUMERIC_LITERAL_HEX, + OTHER, + KEYWORD, + } + + public class Token { + public int ofs; + public int len; + public TokenType type; + } + + public class TokenSet { + public final Token[] tokens; + public final String text; + private final TreeMap lineNumberTable; + + TokenSet(Token[] tokens, String text, TreeMap lineNumberTable) { + this.tokens = tokens; + this.text = text; + this.lineNumberTable = lineNumberTable; + } + + public Token[] getTokens() { + return this.tokens; + } + + public int getLine(int offset) { + Map.Entry entry = lineNumberTable.floorEntry(offset); + return entry != null ? entry.getValue() : -1; + } + + public String getTextNoNewlines(Token token) { + String text = getText(token); + return text.replace("\n", ""); + } + }; + + private final String text; + private TreeMap lineNumberTable; + public GhidraScript log; + + public CTokenizer(String text) { + this.text = text; + } + + public CTokenizer(String text, GhidraScript script) { + this.text = text; + this.log = log; + } + + String getText(Token token) { + return getText(token.ofs, token.len); + } + + String getText(int ofs, int len) { + return text.substring(ofs, ofs + len); // Fixed recursion issue + } + + TokenType lastTokenType = TokenType.UNDEFINED; + + /** + * Inserts a new token into the tokens list. + * + * @param tokens The list of tokens. + * @param tokenStart The starting index of the token. + * @param tokenEnd The current index in the text. + * @param currentType The type of the current token. 
+ */ + private void insertToken(List tokens, int tokenStart, int tokenEnd, TokenType currentType) { + if (currentType != TokenType.UNDEFINED && tokenStart < tokenEnd) { + // Strip whitespace + for (int i = tokenStart; i < tokenEnd; i++) { + if (Character.isWhitespace(text.charAt(i))) { + tokenStart = i + 1; + } else { + break; + } + } + // Strip whitespace from end + for (int i = tokenEnd - 1; i >= tokenStart; i--) { + if (Character.isWhitespace(text.charAt(i))) { + tokenEnd = i; + } else { + break; + } + } + + if (tokenEnd - tokenStart > 0) { + Token token = new Token(); + token.ofs = tokenStart; + token.len = tokenEnd - tokenStart; + token.type = currentType; + if (currentType == TokenType.IDENTIFIER && isKeyword(getText(token))) { + token.type = TokenType.KEYWORD; + } + tokens.add(token); + } + + // Consume the token + currentType = TokenType.UNDEFINED; + } + } + + /** + * Handles the insertion of the last token after parsing is complete. + * + * @param tokens The list of tokens. + * @param tokenStart The starting index of the last token. + * @param currentType The type of the last token. + */ + private void handleLastToken(List tokens, int tokenStart, TokenType currentType) { + insertToken(tokens, tokenStart, text.length(), currentType); + } + + void buildLineNumberTable() { + this.lineNumberTable = new TreeMap<>(); + int lineNumber = 1; + lineNumberTable.put(0, 1); + for (int i = 0; i < text.length(); i++) { + if (text.charAt(i) == '\n') { + lineNumber++; + lineNumberTable.put(i + 1, lineNumber); + } + } + } + + List tokens = new ArrayList<>(); + + // Initialize tokenization state + int tokenStart = 0; + TokenType currentType = TokenType.UNDEFINED; + boolean inComment = false; + boolean inBlockComment = false; + boolean inString = false; + + class ScanRange { + int start; + int end; + TokenType type; + + ScanRange(int start, int end, TokenType type) { + this.start = start; + this.end = end; + this.type = type; + } + + // Invalid constructor + ScanRange() { + this.type = TokenType.UNDEFINED; + } + + boolean isValid() { + return this.type != TokenType.UNDEFINED; + } + }; + + // Add the following method to handle hexadecimal literals + private ScanRange tryParseHexadecimal(int currentIndex) { + if (text.charAt(currentIndex) == '0' && currentIndex + 1 < text.length()) { + char nextChar = text.charAt(currentIndex + 1); + if (nextChar == 'x' || nextChar == 'X') { + int tempIndex = currentIndex + 2; + while (tempIndex < text.length()) { + char c = text.charAt(tempIndex); + if (Character.digit(c, 16) == -1) { + break; + } + tempIndex++; + } + if (tempIndex > currentIndex + 2) { + return new ScanRange(currentIndex, tempIndex, TokenType.NUMERIC_LITERAL_HEX); + } + } + } + return new ScanRange(); + } + + // Identifier that starts with a letter or underscore, and can contain letters, + // digits, and underscores + private ScanRange tryParseIdentifier(int currentIndex) { + if (Character.isLetter(text.charAt(currentIndex)) || text.charAt(currentIndex) == '_') { + int tempIndex = currentIndex + 1; + while (tempIndex < text.length()) { + char c = text.charAt(tempIndex); + if (!(Character.isLetter(c) || Character.isDigit(c) || c == '_')) { + break; + } + tempIndex++; + } + return new ScanRange(currentIndex, tempIndex, TokenType.IDENTIFIER); + } + return new ScanRange(); + } + + private ScanRange tryParseWithLookahead(int currentIndex) { + ScanRange sr = tryParseHexadecimal(currentIndex); + if (!sr.isValid()) { + sr = tryParseIdentifier(currentIndex); + } + return sr; + } + + public boolean 
isKeyword(String text) { + return text.equals("while") || text.equals("for") || text.equals("if") || text.equals("else") || + text.equals("return") || text.equals("struct") || text.equals("typedef") || + text.equals("enum") || text.equals("union") || text.equals("const") || text.equals("static"); + } + + public TokenSet parse() { + this.buildLineNumberTable(); + + int index = 0; + while (index < text.length()) { + char currentChar = text.charAt(index); + TokenType newType = TokenType.OTHER; + + // Handle comments + if (inBlockComment) { + newType = TokenType.BLOCK_COMMENT; + if (currentChar == '*') { + if (index + 1 < text.length() && text.charAt(index + 1) == '/') { + inBlockComment = false; + index++; + } + } + } else if (inComment) { + newType = TokenType.COMMENT; + if (currentChar == '\n') { + inComment = false; + } + } + // Handle string literals + else if (inString) { + if (currentChar == '"') { + inString = false; + newType = TokenType.STRING_LITERAL; + } else { + newType = TokenType.STRING_LITERAL; + } + } + // Detect start of comments + else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '*') { + inBlockComment = true; + newType = TokenType.BLOCK_COMMENT; + } else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '/') { + inComment = true; + newType = TokenType.COMMENT; + } + // Detect start of string literals + else if (currentChar == '"') { + inString = true; + newType = TokenType.STRING_LITERAL; + } else { + ScanRange range = tryParseWithLookahead(index); + if (range.isValid()) { + // Insert the current token first + // script.println("Inserting current token: " + currentType + ", start: " + + // tokenStart + ", end: " + range.start); + insertToken(tokens, tokenStart, range.start, currentType); + + // Insert a ranged token + // script.println("Inserting ranged token: " + range.type + " start: " + + // range.start + ", end: " + range.end); + insertToken(tokens, range.start, range.end, range.type); + + // New start + currentType = TokenType.UNDEFINED; + tokenStart = range.end; + index = range.end; + } + // Detect numeric literals + else if (Character.isDigit(currentChar)) { + newType = TokenType.NUMERIC_LITERAL; + } + // Detect identifiers + else if (Character.isLetter(currentChar) || currentChar == '_') { + newType = TokenType.IDENTIFIER; + } + // Detect parentheses + else if (currentChar == '(') { + newType = TokenType.L_PAREN; + } else if (currentChar == ')') { + newType = TokenType.R_PAREN; + } + // Detect braces + else if (currentChar == '{') { + newType = TokenType.L_BRACE; + } else if (currentChar == '}') { + newType = TokenType.R_BRACE; + } + // Detect semicolon + else if (currentChar == ';') { + newType = TokenType.SEMICOLON; + } + // Detect comma + else if (currentChar == ',') { + newType = TokenType.COMMA; + } else if (currentChar == '#') { + newType = TokenType.HASH; + } + // Handle other characters + else { + newType = TokenType.OTHER; + } + } + + // Insert a new token if the type changes + if (newType != currentType) { + insertToken(tokens, tokenStart, index, currentType); + tokenStart = index; + currentType = newType; + } + + index++; + } + + // Handle the last token + handleLastToken(tokens, tokenStart, currentType); + + return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable); + } +}
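
Usage sketch (illustrative only, not part of the diff): the driving pattern is the one scanFile() now uses in RebuildFunctionDatabase.java — tokenize the whole source text, hand the resulting TokenSet to CParser, then read back the recovered functions, calls, and variable references. A standalone equivalent might look like the following; the class name CParserSmoke and the input path taken from args[0] are placeholders, everything else is the API introduced by this patch.

    import java.nio.file.Files;
    import java.nio.file.Paths;

    import re3lib.CParser;
    import re3lib.CTokenizer;

    // Hypothetical driver, not part of this patch.
    public class CParserSmoke {
      public static void main(String[] args) throws Exception {
        // Read a decompiled source file into memory (path supplied by the caller).
        String text = new String(Files.readAllBytes(Paths.get(args[0])));

        // Pass 1: lexical scan into a flat token set with line-number lookup.
        CTokenizer.TokenSet tokens = new CTokenizer(text).parse();

        // Pass 2: single forward walk that records functions, calls, and variable refs.
        CParser parser = new CParser(tokens);
        parser.parse();

        // Offsets are character positions into the original text, end-exclusive.
        for (CParser.Function f : parser.getFunctions())
          System.out.println("Function " + f.name + " [" + f.startOffset + ", " + f.endOffset + ")");
        for (CParser.FunctionCall c : parser.getFunctionCalls())
          System.out.println("Call     " + c.name + " [" + c.startOffset + ", " + c.endOffset + ")");
        for (CParser.Variable v : parser.getVariables())
          System.out.println("Variable " + v.name + " [" + v.startOffset + ", " + v.endOffset + ")");
      }
    }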