package re3lib;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import ghidra.app.script.GhidraScript;

/**
 * Splits C-like source text into a flat sequence of typed tokens.
 *
 * <p>The scanner walks the text one character at a time. Comments and string
 * literals are tracked with state flags; identifiers and hexadecimal literals
 * are recognised with a lookahead scan; everything else is classified per
 * character, and consecutive characters of the same class are merged into one
 * token (for example a run of decimal digits). Whitespace is classified as
 * {@link TokenType#OTHER} and trimmed away when a token is emitted.
 *
 * <p>Known limitations visible in this implementation: character literals are
 * not recognised (a double quote inside single quotes starts a string), and
 * runs of {@code #} are emitted as a single {@link TokenType#HASH} token.
 */
public class CTokenizer {

  /** Classification assigned to each emitted token. */
  public enum TokenType {
    UNDEFINED,
    HASH,
    L_PAREN,
    R_PAREN,
    L_BRACE,
    R_BRACE,
    SEMICOLON,
    COMMA,
    COMMENT,
    BLOCK_COMMENT,
    IDENTIFIER,
    STRING_LITERAL,
    NUMERIC_LITERAL,
    NUMERIC_LITERAL_HEX,
    OTHER,
    KEYWORD,
  }

  /** A token described as an offset/length slice of the tokenizer's text. */
  public class Token {
    /** Offset of the first character of the token in the source text. */
    public int ofs;
    /** Number of characters covered by the token. */
    public int len;
    /** Classification of the token. */
    public TokenType type;
  }

  /** Result of {@link #parse()}: the tokens plus an offset-to-line lookup. */
  public class TokenSet {
    public final Token[] tokens;
    public final String text;
    /** Maps the offset of the first character of each line to its 1-based line number. */
    private final TreeMap<Integer, Integer> lineNumberTable;

    TokenSet(Token[] tokens, String text, TreeMap<Integer, Integer> lineNumberTable) {
      this.tokens = tokens;
      this.text = text;
      this.lineNumberTable = lineNumberTable;
    }

    /** Returns the token array (the same array instance as {@link #tokens}). */
    public Token[] getTokens() {
      return this.tokens;
    }

    /**
     * Returns the 1-based line number containing the given character offset.
     *
     * @param offset character offset into the text
     * @return the line number, or -1 when no table entry lies at or before {@code offset}
     */
    public int getLine(int offset) {
      Map.Entry<Integer, Integer> entry = lineNumberTable.floorEntry(offset);
      return entry != null ? entry.getValue() : -1;
    }

    /**
     * Returns the text of a token with every {@code '\n'} removed.
     *
     * @param token the token to render
     * @return the token text without newline characters
     */
    public String getTextNoNewlines(Token token) {
      return getText(token).replace("\n", "");
    }
  }

  /** Candidate token found by a lookahead scan; {@code end} is exclusive. */
  class ScanRange {
    int start;
    int end;
    TokenType type;

    ScanRange(int start, int end, TokenType type) {
      this.start = start;
      this.end = end;
      this.type = type;
    }

    /** Creates the "no match" result. */
    ScanRange() {
      this.type = TokenType.UNDEFINED;
    }

    /** Returns true when the scan actually matched something. */
    boolean isValid() {
      return this.type != TokenType.UNDEFINED;
    }
  }

  private final String text;
  private TreeMap<Integer, Integer> lineNumberTable;
  public GhidraScript log;

  // Not read or written anywhere in this file; kept so existing users of the
  // field keep compiling.
  TokenType lastTokenType = TokenType.UNDEFINED;

  // Scanner state. Reset at the start of every parse() call.
  List<Token> tokens = new ArrayList<>();
  int tokenStart = 0;
  TokenType currentType = TokenType.UNDEFINED;
  boolean inComment = false;
  boolean inBlockComment = false;
  boolean inString = false;

  /**
   * Creates a tokenizer without a script attached.
   *
   * @param text the source text to tokenize
   */
  public CTokenizer(String text) {
    this.text = text;
  }

  /**
   * Creates a tokenizer and remembers the given script in {@link #log}.
   *
   * @param text the source text to tokenize
   * @param script the script to store in {@link #log}
   */
  public CTokenizer(String text, GhidraScript script) {
    this.text = text;
    // Previously `this.log = log`, a self-assignment that discarded `script`.
    this.log = script;
  }

  /** Returns the source text covered by {@code token}. */
  String getText(Token token) {
    return getText(token.ofs, token.len);
  }

  /** Returns {@code len} characters of the source text starting at {@code ofs}. */
  String getText(int ofs, int len) {
    return text.substring(ofs, ofs + len);
  }

  /**
   * Appends a token covering {@code [tokenStart, tokenEnd)} to {@code tokens}.
   *
   * <p>Leading and trailing whitespace is trimmed from the range first. Nothing
   * is appended when the type is {@link TokenType#UNDEFINED} or the trimmed
   * range is empty. An {@link TokenType#IDENTIFIER} whose text satisfies
   * {@link #isKeyword(String)} is stored as {@link TokenType#KEYWORD}.
   *
   * @param tokens the list to append to
   * @param tokenStart inclusive start offset of the candidate token
   * @param tokenEnd exclusive end offset of the candidate token
   * @param currentType the type to assign to the token
   */
  private void insertToken(List<Token> tokens, int tokenStart, int tokenEnd,
      TokenType currentType) {
    if (currentType == TokenType.UNDEFINED || tokenStart >= tokenEnd) {
      return;
    }

    while (tokenStart < tokenEnd && Character.isWhitespace(text.charAt(tokenStart))) {
      tokenStart++;
    }
    while (tokenEnd > tokenStart && Character.isWhitespace(text.charAt(tokenEnd - 1))) {
      tokenEnd--;
    }
    if (tokenEnd <= tokenStart) {
      return;
    }

    Token token = new Token();
    token.ofs = tokenStart;
    token.len = tokenEnd - tokenStart;
    token.type = currentType;
    if (currentType == TokenType.IDENTIFIER && isKeyword(getText(token))) {
      token.type = TokenType.KEYWORD;
    }
    tokens.add(token);
  }

  /**
   * Emits whatever token is still pending once the end of the text is reached.
   *
   * @param tokens the list to append to
   * @param tokenStart inclusive start offset of the pending token
   * @param currentType the type of the pending token
   */
  private void handleLastToken(List<Token> tokens, int tokenStart, TokenType currentType) {
    insertToken(tokens, tokenStart, text.length(), currentType);
  }

  /**
   * Rebuilds the table mapping the start offset of every line to its 1-based
   * line number. Lines are separated by {@code '\n'}.
   */
  void buildLineNumberTable() {
    TreeMap<Integer, Integer> table = new TreeMap<>();
    int lineNumber = 1;
    table.put(0, lineNumber);
    for (int i = 0; i < text.length(); i++) {
      if (text.charAt(i) == '\n') {
        lineNumber++;
        table.put(i + 1, lineNumber);
      }
    }
    this.lineNumberTable = table;
  }

  /**
   * Tries to match a hexadecimal literal ({@code 0x} or {@code 0X} followed by
   * at least one hex digit) starting at {@code currentIndex}.
   *
   * @param currentIndex offset at which to start matching
   * @return the matched range, or an invalid range when there is no match
   */
  private ScanRange tryParseHexadecimal(int currentIndex) {
    if (text.charAt(currentIndex) != '0' || currentIndex + 1 >= text.length()) {
      return new ScanRange();
    }
    char marker = text.charAt(currentIndex + 1);
    if (marker != 'x' && marker != 'X') {
      return new ScanRange();
    }

    int firstDigit = currentIndex + 2;
    int end = firstDigit;
    while (end < text.length() && Character.digit(text.charAt(end), 16) != -1) {
      end++;
    }
    // "0x" with no digits after it is not a hex literal.
    if (end == firstDigit) {
      return new ScanRange();
    }
    return new ScanRange(currentIndex, end, TokenType.NUMERIC_LITERAL_HEX);
  }

  /**
   * Tries to match an identifier starting at {@code currentIndex}: a letter or
   * underscore followed by any number of letters, digits, and underscores.
   *
   * @param currentIndex offset at which to start matching
   * @return the matched range, or an invalid range when there is no match
   */
  private ScanRange tryParseIdentifier(int currentIndex) {
    char first = text.charAt(currentIndex);
    if (!Character.isLetter(first) && first != '_') {
      return new ScanRange();
    }

    int end = currentIndex + 1;
    while (end < text.length()) {
      char c = text.charAt(end);
      if (!Character.isLetter(c) && !Character.isDigit(c) && c != '_') {
        break;
      }
      end++;
    }
    return new ScanRange(currentIndex, end, TokenType.IDENTIFIER);
  }

  /** Tries the hexadecimal matcher first, then the identifier matcher. */
  private ScanRange tryParseWithLookahead(int currentIndex) {
    ScanRange range = tryParseHexadecimal(currentIndex);
    return range.isValid() ? range : tryParseIdentifier(currentIndex);
  }

  /**
   * Reports whether {@code text} is one of the keywords this tokenizer
   * distinguishes from ordinary identifiers.
   *
   * @param text the candidate word
   * @return true for while, for, if, else, return, struct, typedef, enum,
   *     union, const, and static
   */
  public boolean isKeyword(String text) {
    switch (text) {
      case "while":
      case "for":
      case "if":
      case "else":
      case "return":
      case "struct":
      case "typedef":
      case "enum":
      case "union":
      case "const":
      case "static":
        return true;
      default:
        return false;
    }
  }

  /** Maps a character that is not part of a lookahead token to its token type. */
  private static TokenType classifyChar(char c) {
    if (Character.isDigit(c)) {
      return TokenType.NUMERIC_LITERAL;
    }
    switch (c) {
      case '(':
        return TokenType.L_PAREN;
      case ')':
        return TokenType.R_PAREN;
      case '{':
        return TokenType.L_BRACE;
      case '}':
        return TokenType.R_BRACE;
      case ';':
        return TokenType.SEMICOLON;
      case ',':
        return TokenType.COMMA;
      case '#':
        return TokenType.HASH;
      default:
        return TokenType.OTHER;
    }
  }

  /** Returns true for punctuation types where every character is its own token. */
  private static boolean isSingleCharToken(TokenType type) {
    switch (type) {
      case L_PAREN:
      case R_PAREN:
      case L_BRACE:
      case R_BRACE:
      case SEMICOLON:
      case COMMA:
        return true;
      default:
        return false;
    }
  }

  /** Clears all scanner state so that parse() can be called more than once. */
  private void resetState() {
    tokens.clear();
    tokenStart = 0;
    currentType = TokenType.UNDEFINED;
    inComment = false;
    inBlockComment = false;
    inString = false;
  }

  /**
   * Tokenizes the text given to the constructor.
   *
   * <p>Each call starts from a clean state, so repeated calls return
   * equivalent results instead of accumulating tokens.
   *
   * @return the tokens together with the text and the line-number table
   */
  public TokenSet parse() {
    resetState();
    buildLineNumberTable();

    int index = 0;
    while (index < text.length()) {
      char currentChar = text.charAt(index);
      boolean hasNext = index + 1 < text.length();
      char nextChar = hasNext ? text.charAt(index + 1) : '\0';

      TokenType newType = TokenType.OTHER;
      // Forces a token boundary even when the type does not change, so that
      // e.g. "((" or two adjacent string literals are not merged.
      boolean startsNewToken = false;
      // Number of characters consumed by this iteration.
      int advance = 1;

      if (inBlockComment) {
        newType = TokenType.BLOCK_COMMENT;
        if (currentChar == '*' && hasNext && nextChar == '/') {
          inBlockComment = false;
          advance = 2; // consume the '/' of the closing "*/" as well
        }
      } else if (inComment) {
        newType = TokenType.COMMENT;
        if (currentChar == '\n') {
          inComment = false;
        }
      } else if (inString) {
        newType = TokenType.STRING_LITERAL;
        if (currentChar == '\\' && hasNext) {
          advance = 2; // skip the escaped character so \" does not end the string
        } else if (currentChar == '"') {
          inString = false;
        }
      } else if (currentChar == '/' && hasNext && nextChar == '*') {
        inBlockComment = true;
        newType = TokenType.BLOCK_COMMENT;
        startsNewToken = true;
        advance = 2; // consume the '*' so "/*/" is not read as a complete comment
      } else if (currentChar == '/' && hasNext && nextChar == '/') {
        inComment = true;
        newType = TokenType.COMMENT;
        startsNewToken = true;
      } else if (currentChar == '"') {
        inString = true;
        newType = TokenType.STRING_LITERAL;
        startsNewToken = true;
      } else {
        ScanRange range = tryParseWithLookahead(index);
        if (range.isValid()) {
          // Flush whatever was pending, then emit the lookahead token whole.
          insertToken(tokens, tokenStart, range.start, currentType);
          insertToken(tokens, range.start, range.end, range.type);
          currentType = TokenType.UNDEFINED;
          tokenStart = range.end;
          index = range.end;
          // Re-enter the loop at range.end so the character following the
          // token is classified normally instead of being skipped.
          continue;
        }
        newType = classifyChar(currentChar);
        startsNewToken = isSingleCharToken(newType);
      }

      if (startsNewToken || newType != currentType) {
        insertToken(tokens, tokenStart, index, currentType);
        tokenStart = index;
        currentType = newType;
      }
      index += advance;
    }

    handleLastToken(tokens, tokenStart, currentType);
    return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable);
  }
}