// reman3/scripts/cparser/Tokenizer.java (351 lines, 9.8 KiB, Java)

package cparser;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class Tokenizer {
/**
 * Categories assigned to tokens produced by the tokenizer.
 */
public enum TokenType {
    /** No type assigned; also used to mark an invalid scan range. */
    UNDEFINED,

    // Punctuation: '#', '(', ')', '{', '}', ';', ','
    HASH, L_PAREN, R_PAREN, L_BRACE, R_BRACE, SEMICOLON, COMMA,

    // Line comments ("//" up to the newline) and block comments ("/*" ... "*/")
    COMMENT, BLOCK_COMMENT,

    // Names starting with a letter or underscore
    IDENTIFIER,

    // Double-quoted text
    STRING_LITERAL,

    // Digit runs, and "0x"/"0X"-prefixed hexadecimal digit runs
    NUMERIC_LITERAL, NUMERIC_LITERAL_HEX,

    // Anything that matches none of the categories above
    OTHER,

    // An identifier whose text is one of the recognized keywords
    KEYWORD
}
/**
 * A single token, stored as a span (offset + length) into the tokenizer's
 * text together with its category. The text itself is not copied; it is
 * recovered on demand with {@code getText(token)}.
 */
public class Token {
// Offset of the token's first character within the tokenizer's text.
public int ofs;
// Number of characters the token covers.
public int len;
// Category of the token.
public TokenType type;
}
/**
 * The outcome of a tokenizer run: the token array, the text the tokens
 * point into, and a table used to translate text offsets to line numbers.
 */
public class TokenSet {
    public final Token[] tokens;
    public final String text;
    private final TreeMap<Integer, Integer> lineNumberTable;

    /**
     * @param tokens          tokens in the order they were produced
     * @param text            text the token offsets refer to
     * @param lineNumberTable offset-to-line table consulted by {@link #getLine(int)}
     */
    TokenSet(Token[] tokens, String text, TreeMap<Integer, Integer> lineNumberTable) {
        this.lineNumberTable = lineNumberTable;
        this.text = text;
        this.tokens = tokens;
    }

    /** @return the token array held by this set (not a copy) */
    public Token[] getTokens() {
        return tokens;
    }

    /**
     * Looks up the line number for a text offset.
     *
     * @param offset offset into the text
     * @return the value stored under the greatest table key that is less
     *         than or equal to {@code offset}, or -1 if there is no such key
     */
    public int getLine(int offset) {
        final Map.Entry<Integer, Integer> nearest = lineNumberTable.floorEntry(offset);
        if (nearest == null) {
            return -1;
        }
        return nearest.getValue();
    }

    /**
     * @param token token whose text is wanted
     * @return the token's text with every {@code '\n'} character removed
     */
    public String getTextNoNewlines(Token token) {
        return getText(token).replace("\n", "");
    }
}
// The complete text being tokenized; token offsets index into this string.
private final String text;
// Maps the offset of each line's first character to its 1-based line number;
// populated by buildLineNumberTable().
private TreeMap<Integer, Integer> lineNumberTable;
// Optional log sink; stays null when the single-argument constructor is used.
private Log log;
/**
 * Creates a tokenizer over {@code text} without a log sink.
 *
 * @param text the text to tokenize
 */
public Tokenizer(String text) {
    this(text, null);
}

/**
 * Creates a tokenizer over {@code text} that reports messages to {@code log}.
 *
 * @param text the text to tokenize
 * @param log  sink for log messages; may be null, in which case messages are dropped
 */
public Tokenizer(String text, Log log) {
    this.log = log;
    this.text = text;
}
/**
 * Forwards a message to the configured log sink, if there is one.
 *
 * @param msg the message to log
 */
void log(String msg) {
    if (log == null) {
        return; // no sink configured
    }
    log.log(msg);
}
/**
 * @param token the token whose text is wanted
 * @return the characters of the tokenizer's text covered by {@code token}
 */
String getText(Token token) {
    final int start = token.ofs;
    final int length = token.len;
    return getText(start, length);
}
/**
 * @param ofs offset of the first character
 * @param len number of characters
 * @return the {@code len} characters of the tokenizer's text starting at {@code ofs}
 */
String getText(int ofs, int len) {
    final int end = ofs + len;
    return text.substring(ofs, end);
}
// NOTE(review): not read or written anywhere else in the visible part of this
// file. It is package-private, so other classes in the package may use it;
// confirm before removing.
TokenType lastTokenType = TokenType.UNDEFINED;
/**
 * Appends a token covering {@code [tokenStart, tokenEnd)} to {@code tokens}.
 * Leading and trailing whitespace is trimmed off the span first; nothing is
 * added when the type is {@code UNDEFINED}, the span is empty, or the span
 * consists only of whitespace. An {@code IDENTIFIER} whose text is a keyword
 * is stored as {@code KEYWORD}.
 *
 * @param tokens      the list that receives the token
 * @param tokenStart  inclusive start offset of the span
 * @param tokenEnd    exclusive end offset of the span
 * @param currentType the type to give the token
 */
private void insertToken(List<Token> tokens, int tokenStart, int tokenEnd, TokenType currentType) {
    if (currentType == TokenType.UNDEFINED || tokenStart >= tokenEnd) {
        return;
    }

    int begin = tokenStart;
    int end = tokenEnd;

    // Trim whitespace from the front of the span...
    while (begin < end && Character.isWhitespace(text.charAt(begin))) {
        begin++;
    }
    // ...and from the back.
    while (end > begin && Character.isWhitespace(text.charAt(end - 1))) {
        end--;
    }

    if (end <= begin) {
        return; // whitespace only
    }

    final Token token = new Token();
    token.ofs = begin;
    token.len = end - begin;
    final boolean keyword = currentType == TokenType.IDENTIFIER && isKeyword(getText(token));
    token.type = keyword ? TokenType.KEYWORD : currentType;
    tokens.add(token);
}
/**
 * Emits whatever token is still open once scanning has reached the end of
 * the text; the token's span runs from {@code tokenStart} to the end of text.
 *
 * @param tokens      the list that receives the token
 * @param tokenStart  start offset of the open token
 * @param currentType type of the open token
 */
private void handleLastToken(List<Token> tokens, int tokenStart, TokenType currentType) {
    final int endOfText = text.length();
    insertToken(tokens, tokenStart, endOfText, currentType);
}
/**
 * Rebuilds the offset-to-line table for the text: offset 0 maps to line 1,
 * and the offset just after every {@code '\n'} maps to the next line number.
 */
void buildLineNumberTable() {
    final TreeMap<Integer, Integer> table = new TreeMap<>();
    this.lineNumberTable = table;
    table.put(0, 1);

    int line = 1;
    int newline = text.indexOf('\n');
    while (newline >= 0) {
        line++;
        table.put(newline + 1, line);
        newline = text.indexOf('\n', newline + 1);
    }
}
// Tokens collected so far; parse() hands this list to insertToken().
List<Token> tokens = new ArrayList<>();
// Initialize tokenization state
// Offset at which the token currently being accumulated starts.
int tokenStart = 0;
// Type of the token currently being accumulated (UNDEFINED when none is open).
TokenType currentType = TokenType.UNDEFINED;
// True while the scanner is inside a "//" line comment.
boolean inComment = false;
// True while the scanner is inside a "/* ... */" block comment.
boolean inBlockComment = false;
// True while the scanner is inside a double-quoted string literal.
boolean inString = false;
class ScanRange {
int start;
int end;
TokenType type;
ScanRange(int start, int end, TokenType type) {
this.start = start;
this.end = end;
this.type = type;
}
// Invalid constructor
ScanRange() {
this.type = TokenType.UNDEFINED;
}
boolean isValid() {
return this.type != TokenType.UNDEFINED;
}
};
/**
 * Tries to match a hexadecimal literal ("0x" or "0X" followed by at least
 * one hex digit) starting at {@code currentIndex}.
 *
 * @param currentIndex offset at which to start matching
 * @return a {@code NUMERIC_LITERAL_HEX} range covering the prefix and all
 *         following hex digits, or an invalid range if there is no match
 */
private ScanRange tryParseHexadecimal(int currentIndex) {
    if (text.charAt(currentIndex) != '0' || currentIndex + 1 >= text.length()) {
        return new ScanRange();
    }
    final char marker = text.charAt(currentIndex + 1);
    if (marker != 'x' && marker != 'X') {
        return new ScanRange();
    }

    final int digitsFrom = currentIndex + 2;
    int stop = digitsFrom;
    while (stop < text.length() && Character.digit(text.charAt(stop), 16) != -1) {
        stop++;
    }

    // A bare "0x" with no digits after it is not a literal.
    if (stop == digitsFrom) {
        return new ScanRange();
    }
    return new ScanRange(currentIndex, stop, TokenType.NUMERIC_LITERAL_HEX);
}
/**
 * Tries to match an identifier starting at {@code currentIndex}: a letter
 * or underscore followed by any number of letters, digits and underscores.
 *
 * @param currentIndex offset at which to start matching
 * @return an {@code IDENTIFIER} range covering the whole name, or an invalid
 *         range if the first character cannot start an identifier
 */
private ScanRange tryParseIdentifier(int currentIndex) {
    final char first = text.charAt(currentIndex);
    if (first != '_' && !Character.isLetter(first)) {
        return new ScanRange();
    }

    int stop = currentIndex + 1;
    for (; stop < text.length(); stop++) {
        final char c = text.charAt(stop);
        final boolean identifierChar = c == '_' || Character.isLetter(c) || Character.isDigit(c);
        if (!identifierChar) {
            break;
        }
    }
    return new ScanRange(currentIndex, stop, TokenType.IDENTIFIER);
}
/**
 * Runs the lookahead matchers at {@code currentIndex}, hexadecimal literal
 * first and identifier second.
 *
 * @param currentIndex offset at which to start matching
 * @return the first valid match, or an invalid range if neither matcher applies
 */
private ScanRange tryParseWithLookahead(int currentIndex) {
    final ScanRange hex = tryParseHexadecimal(currentIndex);
    return hex.isValid() ? hex : tryParseIdentifier(currentIndex);
}
/**
 * Tells whether a word is one of the keywords this tokenizer recognizes.
 *
 * @param text the word to test (case-sensitive)
 * @return true if {@code text} is a recognized keyword
 */
public boolean isKeyword(String text) {
    switch (text) {
        case "while":
        case "for":
        case "if":
        case "else":
        case "return":
        case "struct":
        case "typedef":
        case "enum":
        case "union":
        case "const":
        case "static":
            return true;
        default:
            return false;
    }
}
/**
 * Tokenizes the text and returns the tokens together with the text and the
 * offset-to-line table.
 *
 * <p>Scanning rules:
 * <ul>
 *   <li>{@code //} comments run to the end of the line and {@code /* ... *}{@code /}
 *       comments to their terminator; each comment is one token.</li>
 *   <li>Double-quoted strings are one token each; a backslash escapes the
 *       character after it, so an escaped quote does not end the string.</li>
 *   <li>Identifiers and {@code 0x} hexadecimal literals are matched as a whole
 *       by lookahead.</li>
 *   <li>Parentheses, braces, semicolons and commas are always one-character
 *       tokens, even when the same character repeats.</li>
 *   <li>Consecutive digits form a {@code NUMERIC_LITERAL}; consecutive
 *       {@code '#'} characters form one {@code HASH} token; all remaining
 *       characters are grouped into {@code OTHER} runs.</li>
 * </ul>
 * Whitespace is trimmed off every token by {@code insertToken}, and
 * whitespace-only runs produce no token. Character literals get no special
 * treatment.
 *
 * <p>Every call starts from a clean state, so calling this method again
 * re-tokenizes the text instead of appending to earlier results.
 *
 * @return the tokens in source order, with the text and line table
 */
public TokenSet parse() {
    this.buildLineNumberTable();

    // Reset all scanning state so repeated calls do not accumulate tokens
    // or inherit an "inside comment/string" flag from a previous run.
    tokens = new ArrayList<>();
    tokenStart = 0;
    currentType = TokenType.UNDEFINED;
    inComment = false;
    inBlockComment = false;
    inString = false;

    final int length = text.length();
    int index = 0;
    while (index < length) {
        // Offset of the character being classified; index may be advanced
        // past a second character below, but token boundaries use this one.
        final int charStart = index;
        final char currentChar = text.charAt(index);
        TokenType newType;
        // Set when this character must begin a new token even if the
        // previous token has the same type.
        boolean forceSplit = false;

        if (inBlockComment) {
            newType = TokenType.BLOCK_COMMENT;
            if (currentChar == '*' && index + 1 < length && text.charAt(index + 1) == '/') {
                inBlockComment = false;
                index++; // consume the '/' of the terminator
            }
        } else if (inComment) {
            newType = TokenType.COMMENT;
            if (currentChar == '\n') {
                inComment = false;
            }
        } else if (inString) {
            newType = TokenType.STRING_LITERAL;
            if (currentChar == '\\' && index + 1 < length) {
                index++; // skip the escaped character (e.g. \" or \\)
            } else if (currentChar == '"') {
                inString = false;
            }
        } else if (currentChar == '/' && index + 1 < length && text.charAt(index + 1) == '*') {
            inBlockComment = true;
            newType = TokenType.BLOCK_COMMENT;
            forceSplit = true;
            index++; // consume the '*' so that "/*/" is not seen as a closed comment
        } else if (currentChar == '/' && index + 1 < length && text.charAt(index + 1) == '/') {
            inComment = true;
            newType = TokenType.COMMENT;
            forceSplit = true;
        } else if (currentChar == '"') {
            inString = true;
            newType = TokenType.STRING_LITERAL;
            forceSplit = true;
        } else {
            ScanRange range = tryParseWithLookahead(index);
            if (range.isValid()) {
                // Close the token that was open, then emit the matched one.
                insertToken(tokens, tokenStart, range.start, currentType);
                insertToken(tokens, range.start, range.end, range.type);
                currentType = TokenType.UNDEFINED;
                tokenStart = range.end;
                index = range.end;
                // Resume at range.end without the index++ below; otherwise
                // the character right after the match is never classified.
                continue;
            }
            newType = classifyChar(currentChar);
            forceSplit = isSingleCharToken(newType);
        }

        if (forceSplit || newType != currentType) {
            insertToken(tokens, tokenStart, charStart, currentType);
            tokenStart = charStart;
            currentType = newType;
        }
        index++;
    }

    // Emit whatever token is still open at end of text.
    handleLastToken(tokens, tokenStart, currentType);
    return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable);
}

/**
 * Classifies one character that is outside comments and strings and did not
 * start an identifier or hexadecimal literal.
 */
private static TokenType classifyChar(char c) {
    if (Character.isDigit(c)) {
        return TokenType.NUMERIC_LITERAL;
    }
    switch (c) {
        case '(':
            return TokenType.L_PAREN;
        case ')':
            return TokenType.R_PAREN;
        case '{':
            return TokenType.L_BRACE;
        case '}':
            return TokenType.R_BRACE;
        case ';':
            return TokenType.SEMICOLON;
        case ',':
            return TokenType.COMMA;
        case '#':
            return TokenType.HASH;
        default:
            return TokenType.OTHER;
    }
}

/** Tells whether tokens of this type are always exactly one character long. */
private static boolean isSingleCharToken(TokenType type) {
    switch (type) {
        case L_PAREN:
        case R_PAREN:
        case L_BRACE:
        case R_BRACE:
        case SEMICOLON:
        case COMMA:
            return true;
        default:
            return false;
    }
}
}