// reman3/scripts/re3lib/CTokenizer.java

package re3lib;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import ghidra.app.script.GhidraScript;

/**
 * Coarse tokenizer for C-like source text.
 *
 * <p>The text is split into comments, string literals, identifiers/keywords,
 * decimal digit runs, hexadecimal literals, a handful of single-character
 * punctuation tokens and runs of "other" characters. Tokens are stored as
 * (offset, length) slices of the original text; surrounding whitespace is
 * trimmed from every token and whitespace-only tokens are dropped.
 *
 * <p>This is not a full C lexer: numeric suffixes, floating point literals and
 * character literals receive no special treatment.
 */
public class CTokenizer {
  /** Classification assigned to each token. */
  public enum TokenType {
    UNDEFINED,
    HASH,
    L_PAREN,
    R_PAREN,
    L_BRACE,
    R_BRACE,
    SEMICOLON,
    COMMA,
    COMMENT,
    BLOCK_COMMENT,
    IDENTIFIER,
    STRING_LITERAL,
    NUMERIC_LITERAL,
    NUMERIC_LITERAL_HEX,
    OTHER,
    KEYWORD,
  }

  /** A slice of the tokenizer's text together with its classification. */
  public class Token {
    /** Offset of the first character of the token within the text. */
    public int ofs;
    /** Number of characters in the token. */
    public int len;
    public TokenType type;
  }

  /** Result of {@link #parse()}: the tokens, the text they index into, and a line lookup. */
  public class TokenSet {
    public final Token[] tokens;
    public final String text;
    // Maps the offset of the first character of each line to its 1-based line number.
    private final TreeMap<Integer, Integer> lineNumberTable;

    TokenSet(Token[] tokens, String text, TreeMap<Integer, Integer> lineNumberTable) {
      this.tokens = tokens;
      this.text = text;
      this.lineNumberTable = lineNumberTable;
    }

    public Token[] getTokens() {
      return this.tokens;
    }

    /**
     * Returns the 1-based line number containing {@code offset}, or -1 when no
     * line starts at or before it (i.e. the offset is negative).
     */
    public int getLine(int offset) {
      Map.Entry<Integer, Integer> entry = lineNumberTable.floorEntry(offset);
      return entry != null ? entry.getValue() : -1;
    }

    /** Returns the token's text with every {@code '\n'} removed. */
    public String getTextNoNewlines(Token token) {
      String tokenText = getText(token);
      return tokenText.replace("\n", "");
    }
  }

  private final String text;
  private TreeMap<Integer, Integer> lineNumberTable;
  // Optional script handle supplied by the caller; this class never reads it.
  public GhidraScript log;

  public CTokenizer(String text) {
    this.text = text;
  }

  /**
   * @param text   the source text to tokenize
   * @param script script handle stored in {@link #log}
   */
  public CTokenizer(String text, GhidraScript script) {
    this.text = text;
    // The original assigned the field to itself, silently discarding the argument.
    this.log = script;
  }

  /** Returns the text covered by {@code token}. */
  String getText(Token token) {
    return getText(token.ofs, token.len);
  }

  /** Returns {@code len} characters of the text starting at {@code ofs}. */
  String getText(int ofs, int len) {
    return text.substring(ofs, ofs + len);
  }

  // Not used inside this class; kept because it is visible to the rest of the package.
  TokenType lastTokenType = TokenType.UNDEFINED;

  /**
   * Appends the token covering {@code [tokenStart, tokenEnd)} to {@code tokens}.
   *
   * <p>Leading and trailing whitespace is trimmed first; nothing is added when
   * the type is {@link TokenType#UNDEFINED} or the trimmed range is empty.
   * Identifiers whose text is a keyword are stored as {@link TokenType#KEYWORD}.
   *
   * @param tokens      The list of tokens.
   * @param tokenStart  The starting index of the token.
   * @param tokenEnd    The index one past the last character of the token.
   * @param currentType The type of the token.
   */
  private void insertToken(List<Token> tokens, int tokenStart, int tokenEnd, TokenType currentType) {
    if (currentType == TokenType.UNDEFINED || tokenStart >= tokenEnd) {
      return;
    }

    while (tokenStart < tokenEnd && Character.isWhitespace(text.charAt(tokenStart))) {
      tokenStart++;
    }
    while (tokenEnd > tokenStart && Character.isWhitespace(text.charAt(tokenEnd - 1))) {
      tokenEnd--;
    }
    if (tokenEnd <= tokenStart) {
      return;
    }

    Token token = new Token();
    token.ofs = tokenStart;
    token.len = tokenEnd - tokenStart;
    token.type = currentType;
    if (currentType == TokenType.IDENTIFIER && isKeyword(getText(token))) {
      token.type = TokenType.KEYWORD;
    }
    tokens.add(token);
  }

  /**
   * Handles the insertion of the last token after parsing is complete.
   *
   * @param tokens      The list of tokens.
   * @param tokenStart  The starting index of the last token.
   * @param currentType The type of the last token.
   */
  private void handleLastToken(List<Token> tokens, int tokenStart, TokenType currentType) {
    insertToken(tokens, tokenStart, text.length(), currentType);
  }

  /**
   * Rebuilds the table mapping the offset of each line's first character to
   * its 1-based line number. Lines are separated by {@code '\n'}.
   */
  void buildLineNumberTable() {
    this.lineNumberTable = new TreeMap<>();
    int lineNumber = 1;
    lineNumberTable.put(0, 1);
    for (int i = 0; i < text.length(); i++) {
      if (text.charAt(i) == '\n') {
        lineNumber++;
        lineNumberTable.put(i + 1, lineNumber);
      }
    }
  }

  List<Token> tokens = new ArrayList<>();

  // Scanner state used by parse(); reset at the start of every parse() call.
  int tokenStart = 0;
  TokenType currentType = TokenType.UNDEFINED;
  boolean inComment = false;
  boolean inBlockComment = false;
  boolean inString = false;

  /** Half-open character range {@code [start, end)} recognised by a lookahead scan. */
  class ScanRange {
    int start;
    int end;
    TokenType type;

    ScanRange(int start, int end, TokenType type) {
      this.start = start;
      this.end = end;
      this.type = type;
    }

    /** Creates an invalid range, used to signal "no match". */
    ScanRange() {
      this.type = TokenType.UNDEFINED;
    }

    boolean isValid() {
      return this.type != TokenType.UNDEFINED;
    }
  }

  /**
   * Matches {@code 0x}/{@code 0X} followed by at least one hexadecimal digit
   * starting at {@code currentIndex}; returns an invalid range otherwise.
   */
  private ScanRange tryParseHexadecimal(int currentIndex) {
    if (text.charAt(currentIndex) != '0' || currentIndex + 1 >= text.length()) {
      return new ScanRange();
    }
    char prefix = text.charAt(currentIndex + 1);
    if (prefix != 'x' && prefix != 'X') {
      return new ScanRange();
    }

    int firstDigit = currentIndex + 2;
    int end = firstDigit;
    while (end < text.length() && Character.digit(text.charAt(end), 16) != -1) {
      end++;
    }
    if (end == firstDigit) {
      // "0x" with no digits after it is not a hex literal.
      return new ScanRange();
    }
    return new ScanRange(currentIndex, end, TokenType.NUMERIC_LITERAL_HEX);
  }

  /**
   * Matches an identifier starting at {@code currentIndex}: a letter or
   * underscore followed by any number of letters, digits and underscores.
   * Returns an invalid range when the first character cannot start one.
   */
  private ScanRange tryParseIdentifier(int currentIndex) {
    char first = text.charAt(currentIndex);
    if (!Character.isLetter(first) && first != '_') {
      return new ScanRange();
    }
    int end = currentIndex + 1;
    while (end < text.length()) {
      char c = text.charAt(end);
      if (!(Character.isLetter(c) || Character.isDigit(c) || c == '_')) {
        break;
      }
      end++;
    }
    return new ScanRange(currentIndex, end, TokenType.IDENTIFIER);
  }

  /** Tries the hexadecimal matcher first, then the identifier matcher. */
  private ScanRange tryParseWithLookahead(int currentIndex) {
    ScanRange sr = tryParseHexadecimal(currentIndex);
    if (!sr.isValid()) {
      sr = tryParseIdentifier(currentIndex);
    }
    return sr;
  }

  /** Returns whether {@code text} is one of the keywords this tokenizer recognises. */
  public boolean isKeyword(String text) {
    switch (text) {
      case "while":
      case "for":
      case "if":
      case "else":
      case "return":
      case "struct":
      case "typedef":
      case "enum":
      case "union":
      case "const":
      case "static":
        return true;
      default:
        return false;
    }
  }

  /** Classifies a character that starts neither a comment, a string nor a lookahead range. */
  private static TokenType classifyChar(char c) {
    if (Character.isDigit(c)) {
      return TokenType.NUMERIC_LITERAL;
    }
    switch (c) {
      case '(':
        return TokenType.L_PAREN;
      case ')':
        return TokenType.R_PAREN;
      case '{':
        return TokenType.L_BRACE;
      case '}':
        return TokenType.R_BRACE;
      case ';':
        return TokenType.SEMICOLON;
      case ',':
        return TokenType.COMMA;
      case '#':
        return TokenType.HASH;
      default:
        return TokenType.OTHER;
    }
  }

  /** Returns whether tokens of {@code type} are always exactly one character long. */
  private static boolean isSingleCharToken(TokenType type) {
    switch (type) {
      case L_PAREN:
      case R_PAREN:
      case L_BRACE:
      case R_BRACE:
      case SEMICOLON:
      case COMMA:
      case HASH:
        return true;
      default:
        return false;
    }
  }

  /** Clears all scanner state so that parse() can be called more than once. */
  private void resetState() {
    tokens.clear();
    tokenStart = 0;
    currentType = TokenType.UNDEFINED;
    inComment = false;
    inBlockComment = false;
    inString = false;
  }

  /**
   * Tokenizes the text given to the constructor.
   *
   * <p>Each call starts from a clean state, so calling it repeatedly yields
   * equal results instead of accumulating tokens.
   *
   * @return the tokens in source order together with the text and line table
   */
  public TokenSet parse() {
    resetState();
    this.buildLineNumberTable();

    final int length = text.length();
    int index = 0;
    while (index < length) {
      char currentChar = text.charAt(index);
      boolean hasNext = index + 1 < length;
      char nextChar = hasNext ? text.charAt(index + 1) : '\0';

      TokenType newType;
      // Forces a token boundary even when the type equals the running type, so that
      // "((", "\"a\"\"b\"" or back-to-back comments do not collapse into one token.
      boolean startsNewToken = false;
      // Number of characters consumed by this step.
      int advance = 1;

      if (inBlockComment) {
        newType = TokenType.BLOCK_COMMENT;
        if (currentChar == '*' && hasNext && nextChar == '/') {
          inBlockComment = false;
          advance = 2;
        }
      } else if (inComment) {
        newType = TokenType.COMMENT;
        // The newline stays in the comment's range and is trimmed on insertion.
        if (currentChar == '\n') {
          inComment = false;
        }
      } else if (inString) {
        newType = TokenType.STRING_LITERAL;
        if (currentChar == '\\' && hasNext) {
          // An escaped character (notably \") never terminates the literal.
          advance = 2;
        } else if (currentChar == '"') {
          inString = false;
        }
      } else if (currentChar == '/' && hasNext && nextChar == '*') {
        inBlockComment = true;
        newType = TokenType.BLOCK_COMMENT;
        startsNewToken = true;
        // Consume the '*' too, otherwise "/*/" would be read as a complete comment.
        advance = 2;
      } else if (currentChar == '/' && hasNext && nextChar == '/') {
        inComment = true;
        newType = TokenType.COMMENT;
        startsNewToken = true;
      } else if (currentChar == '"') {
        inString = true;
        newType = TokenType.STRING_LITERAL;
        startsNewToken = true;
      } else {
        ScanRange range = tryParseWithLookahead(index);
        if (range.isValid()) {
          // Flush whatever was pending, then emit the identifier/hex token whole.
          insertToken(tokens, tokenStart, range.start, currentType);
          insertToken(tokens, range.start, range.end, range.type);

          currentType = TokenType.UNDEFINED;
          tokenStart = range.end;
          index = range.end;
          // Re-enter the loop so the character at range.end is classified normally
          // instead of being skipped and absorbed into an OTHER token.
          continue;
        }
        newType = classifyChar(currentChar);
        startsNewToken = isSingleCharToken(newType);
      }

      if (startsNewToken || newType != currentType) {
        insertToken(tokens, tokenStart, index, currentType);
        tokenStart = index;
        currentType = newType;
      }

      index += advance;
    }

    // Handle the last token
    handleLastToken(tokens, tokenStart, currentType);

    return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable);
  }
}