347 lines
9.7 KiB
Java
347 lines
9.7 KiB
Java
package re3lib;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.TreeMap;
|
|
|
|
import ghidra.app.script.GhidraScript;
|
|
|
|
public class CTokenizer {
|
|
/**
 * Classification attached to each token the tokenizer emits.
 *
 * <p>The declaration order is kept stable so that ordinals do not change.
 */
public enum TokenType {
    /** Marker for "no token in progress"; insertToken never emits a token of this type. */
    UNDEFINED,
    /** The {@code #} character. */
    HASH,
    /** The {@code (} character. */
    L_PAREN,
    /** The {@code )} character. */
    R_PAREN,
    /** The <code>{</code> character. */
    L_BRACE,
    /** The <code>}</code> character. */
    R_BRACE,
    /** The {@code ;} character. */
    SEMICOLON,
    /** The {@code ,} character. */
    COMMA,
    /** A {@code //} comment running to the end of the line. */
    COMMENT,
    /** A block comment opened with a slash-star pair and closed with a star-slash pair. */
    BLOCK_COMMENT,
    /** A letter or underscore followed by letters, digits and underscores. */
    IDENTIFIER,
    /** Text enclosed in double quotes. */
    STRING_LITERAL,
    /** A run of decimal digits. */
    NUMERIC_LITERAL,
    /** {@code 0x} or {@code 0X} followed by at least one hexadecimal digit. */
    NUMERIC_LITERAL_HEX,
    /** Any character that matches none of the other categories. */
    OTHER,
    /** An identifier whose text is accepted by {@code isKeyword}. */
    KEYWORD
}
|
|
|
|
public class Token {
|
|
public int ofs;
|
|
public int len;
|
|
public TokenType type;
|
|
}
|
|
|
|
public class TokenSet {
|
|
public final Token[] tokens;
|
|
public final String text;
|
|
private final TreeMap<Integer, Integer> lineNumberTable;
|
|
|
|
TokenSet(Token[] tokens, String text, TreeMap<Integer, Integer> lineNumberTable) {
|
|
this.tokens = tokens;
|
|
this.text = text;
|
|
this.lineNumberTable = lineNumberTable;
|
|
}
|
|
|
|
public Token[] getTokens() {
|
|
return this.tokens;
|
|
}
|
|
|
|
public int getLine(int offset) {
|
|
Map.Entry<Integer, Integer> entry = lineNumberTable.floorEntry(offset);
|
|
return entry != null ? entry.getValue() : -1;
|
|
}
|
|
|
|
public String getTextNoNewlines(Token token) {
|
|
String text = getText(token);
|
|
return text.replace("\n", "");
|
|
}
|
|
};
|
|
|
|
private final String text;
|
|
private TreeMap<Integer, Integer> lineNumberTable;
|
|
public GhidraScript log;
|
|
|
|
/**
 * Creates a tokenizer over {@code text} without an attached script.
 *
 * @param text the source text to tokenize
 */
public CTokenizer(String text) {
    this(text, null);
}

/**
 * Creates a tokenizer over {@code text} and remembers the given script.
 *
 * @param text   the source text to tokenize
 * @param script script handle stored in {@link #log}; may be {@code null}
 */
public CTokenizer(String text, GhidraScript script) {
    this.text = text;
    // Previously "this.log = log", a self-assignment that silently discarded
    // the script argument and always left the field null.
    this.log = script;
}
|
|
|
|
String getText(Token token) {
|
|
return getText(token.ofs, token.len);
|
|
}
|
|
|
|
String getText(int ofs, int len) {
|
|
return text.substring(ofs, ofs + len); // Fixed recursion issue
|
|
}
|
|
|
|
/**
 * Starts out as {@code UNDEFINED}. Nothing in this class reads or updates it.
 * NOTE(review): possibly used by other classes in the package - confirm before removing.
 */
TokenType lastTokenType = TokenType.UNDEFINED;
|
|
|
|
/**
|
|
* Inserts a new token into the tokens list.
|
|
*
|
|
* @param tokens The list of tokens.
|
|
* @param tokenStart The starting index of the token.
|
|
* @param tokenEnd The current index in the text.
|
|
* @param currentType The type of the current token.
|
|
*/
|
|
private void insertToken(List<Token> tokens, int tokenStart, int tokenEnd, TokenType currentType) {
|
|
if (currentType != TokenType.UNDEFINED && tokenStart < tokenEnd) {
|
|
// Strip whitespace
|
|
for (int i = tokenStart; i < tokenEnd; i++) {
|
|
if (Character.isWhitespace(text.charAt(i))) {
|
|
tokenStart = i + 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
// Strip whitespace from end
|
|
for (int i = tokenEnd - 1; i >= tokenStart; i--) {
|
|
if (Character.isWhitespace(text.charAt(i))) {
|
|
tokenEnd = i;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (tokenEnd - tokenStart > 0) {
|
|
Token token = new Token();
|
|
token.ofs = tokenStart;
|
|
token.len = tokenEnd - tokenStart;
|
|
token.type = currentType;
|
|
if (currentType == TokenType.IDENTIFIER && isKeyword(getText(token))) {
|
|
token.type = TokenType.KEYWORD;
|
|
}
|
|
tokens.add(token);
|
|
}
|
|
|
|
// Consume the token
|
|
currentType = TokenType.UNDEFINED;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Handles the insertion of the last token after parsing is complete.
|
|
*
|
|
* @param tokens The list of tokens.
|
|
* @param tokenStart The starting index of the last token.
|
|
* @param currentType The type of the last token.
|
|
*/
|
|
private void handleLastToken(List<Token> tokens, int tokenStart, TokenType currentType) {
|
|
insertToken(tokens, tokenStart, text.length(), currentType);
|
|
}
|
|
|
|
void buildLineNumberTable() {
|
|
this.lineNumberTable = new TreeMap<>();
|
|
int lineNumber = 1;
|
|
lineNumberTable.put(0, 1);
|
|
for (int i = 0; i < text.length(); i++) {
|
|
if (text.charAt(i) == '\n') {
|
|
lineNumber++;
|
|
lineNumberTable.put(i + 1, lineNumber);
|
|
}
|
|
}
|
|
}
|
|
|
|
List<Token> tokens = new ArrayList<>();
|
|
|
|
// Initialize tokenization state
|
|
int tokenStart = 0;
|
|
TokenType currentType = TokenType.UNDEFINED;
|
|
boolean inComment = false;
|
|
boolean inBlockComment = false;
|
|
boolean inString = false;
|
|
|
|
class ScanRange {
|
|
int start;
|
|
int end;
|
|
TokenType type;
|
|
|
|
ScanRange(int start, int end, TokenType type) {
|
|
this.start = start;
|
|
this.end = end;
|
|
this.type = type;
|
|
}
|
|
|
|
// Invalid constructor
|
|
ScanRange() {
|
|
this.type = TokenType.UNDEFINED;
|
|
}
|
|
|
|
boolean isValid() {
|
|
return this.type != TokenType.UNDEFINED;
|
|
}
|
|
};
|
|
|
|
// Add the following method to handle hexadecimal literals
|
|
private ScanRange tryParseHexadecimal(int currentIndex) {
|
|
if (text.charAt(currentIndex) == '0' && currentIndex + 1 < text.length()) {
|
|
char nextChar = text.charAt(currentIndex + 1);
|
|
if (nextChar == 'x' || nextChar == 'X') {
|
|
int tempIndex = currentIndex + 2;
|
|
while (tempIndex < text.length()) {
|
|
char c = text.charAt(tempIndex);
|
|
if (Character.digit(c, 16) == -1) {
|
|
break;
|
|
}
|
|
tempIndex++;
|
|
}
|
|
if (tempIndex > currentIndex + 2) {
|
|
return new ScanRange(currentIndex, tempIndex, TokenType.NUMERIC_LITERAL_HEX);
|
|
}
|
|
}
|
|
}
|
|
return new ScanRange();
|
|
}
|
|
|
|
// Identifier that starts with a letter or underscore, and can contain letters,
|
|
// digits, and underscores
|
|
private ScanRange tryParseIdentifier(int currentIndex) {
|
|
if (Character.isLetter(text.charAt(currentIndex)) || text.charAt(currentIndex) == '_') {
|
|
int tempIndex = currentIndex + 1;
|
|
while (tempIndex < text.length()) {
|
|
char c = text.charAt(tempIndex);
|
|
if (!(Character.isLetter(c) || Character.isDigit(c) || c == '_')) {
|
|
break;
|
|
}
|
|
tempIndex++;
|
|
}
|
|
return new ScanRange(currentIndex, tempIndex, TokenType.IDENTIFIER);
|
|
}
|
|
return new ScanRange();
|
|
}
|
|
|
|
private ScanRange tryParseWithLookahead(int currentIndex) {
|
|
ScanRange sr = tryParseHexadecimal(currentIndex);
|
|
if (!sr.isValid()) {
|
|
sr = tryParseIdentifier(currentIndex);
|
|
}
|
|
return sr;
|
|
}
|
|
|
|
public boolean isKeyword(String text) {
|
|
return text.equals("while") || text.equals("for") || text.equals("if") || text.equals("else") ||
|
|
text.equals("return") || text.equals("struct") || text.equals("typedef") ||
|
|
text.equals("enum") || text.equals("union") || text.equals("const") || text.equals("static");
|
|
}
|
|
|
|
public TokenSet parse() {
|
|
this.buildLineNumberTable();
|
|
|
|
int index = 0;
|
|
while (index < text.length()) {
|
|
char currentChar = text.charAt(index);
|
|
TokenType newType = TokenType.OTHER;
|
|
|
|
// Handle comments
|
|
if (inBlockComment) {
|
|
newType = TokenType.BLOCK_COMMENT;
|
|
if (currentChar == '*') {
|
|
if (index + 1 < text.length() && text.charAt(index + 1) == '/') {
|
|
inBlockComment = false;
|
|
index++;
|
|
}
|
|
}
|
|
} else if (inComment) {
|
|
newType = TokenType.COMMENT;
|
|
if (currentChar == '\n') {
|
|
inComment = false;
|
|
}
|
|
}
|
|
// Handle string literals
|
|
else if (inString) {
|
|
if (currentChar == '"') {
|
|
inString = false;
|
|
newType = TokenType.STRING_LITERAL;
|
|
} else {
|
|
newType = TokenType.STRING_LITERAL;
|
|
}
|
|
}
|
|
// Detect start of comments
|
|
else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '*') {
|
|
inBlockComment = true;
|
|
newType = TokenType.BLOCK_COMMENT;
|
|
} else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '/') {
|
|
inComment = true;
|
|
newType = TokenType.COMMENT;
|
|
}
|
|
// Detect start of string literals
|
|
else if (currentChar == '"') {
|
|
inString = true;
|
|
newType = TokenType.STRING_LITERAL;
|
|
} else {
|
|
ScanRange range = tryParseWithLookahead(index);
|
|
if (range.isValid()) {
|
|
// Insert the current token first
|
|
// script.println("Inserting current token: " + currentType + ", start: " +
|
|
// tokenStart + ", end: " + range.start);
|
|
insertToken(tokens, tokenStart, range.start, currentType);
|
|
|
|
// Insert a ranged token
|
|
// script.println("Inserting ranged token: " + range.type + " start: " +
|
|
// range.start + ", end: " + range.end);
|
|
insertToken(tokens, range.start, range.end, range.type);
|
|
|
|
// New start
|
|
currentType = TokenType.UNDEFINED;
|
|
tokenStart = range.end;
|
|
index = range.end;
|
|
}
|
|
// Detect numeric literals
|
|
else if (Character.isDigit(currentChar)) {
|
|
newType = TokenType.NUMERIC_LITERAL;
|
|
}
|
|
// Detect identifiers
|
|
else if (Character.isLetter(currentChar) || currentChar == '_') {
|
|
newType = TokenType.IDENTIFIER;
|
|
}
|
|
// Detect parentheses
|
|
else if (currentChar == '(') {
|
|
newType = TokenType.L_PAREN;
|
|
} else if (currentChar == ')') {
|
|
newType = TokenType.R_PAREN;
|
|
}
|
|
// Detect braces
|
|
else if (currentChar == '{') {
|
|
newType = TokenType.L_BRACE;
|
|
} else if (currentChar == '}') {
|
|
newType = TokenType.R_BRACE;
|
|
}
|
|
// Detect semicolon
|
|
else if (currentChar == ';') {
|
|
newType = TokenType.SEMICOLON;
|
|
}
|
|
// Detect comma
|
|
else if (currentChar == ',') {
|
|
newType = TokenType.COMMA;
|
|
} else if (currentChar == '#') {
|
|
newType = TokenType.HASH;
|
|
}
|
|
// Handle other characters
|
|
else {
|
|
newType = TokenType.OTHER;
|
|
}
|
|
}
|
|
|
|
// Insert a new token if the type changes
|
|
if (newType != currentType) {
|
|
insertToken(tokens, tokenStart, index, currentType);
|
|
tokenStart = index;
|
|
currentType = newType;
|
|
}
|
|
|
|
index++;
|
|
}
|
|
|
|
// Handle the last token
|
|
handleLastToken(tokens, tokenStart, currentType);
|
|
|
|
return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable);
|
|
}
|
|
}
|