Add cparser java project

This commit is contained in:
2024-10-08 00:58:50 +08:00
parent 8f0e8f68bb
commit 0133a237ac
31 changed files with 299 additions and 324 deletions

2
java/cparser/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*.class
target/

52
java/cparser/pom.xml Normal file
View File

@@ -0,0 +1,52 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates for the standalone C-parser module -->
<groupId>cparser</groupId>
<artifactId>cparser</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- JUnit for testing (test scope only; not shipped) -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<!-- Specify the custom test source directory.
     NOTE(review): ./src/test/java is already Maven's default test source
     directory, so this element is redundant but harmless. -->
<testSourceDirectory>./src/test/java</testSourceDirectory>
<plugins>
<!-- Compiler Plugin to specify Java version (compiles to Java 8) -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<!-- Surefire Plugin to run JUnit tests; only ParserTests is picked up -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
<configuration>
<includes>
<include>**/ParserTests.java</include>
</includes>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,5 @@
package cparser;
/**
 * Minimal logging callback used to surface diagnostic messages from the
 * tokenizer and parser. Implementations decide where messages go (stdout,
 * a test buffer, a real logger, ...).
 *
 * Marked {@code @FunctionalInterface} so it can be implemented with a
 * lambda or method reference; the redundant {@code public} modifier on the
 * method (implicit in an interface) was removed.
 */
@FunctionalInterface
public interface Log {
    /**
     * Records a single diagnostic message.
     *
     * @param msg the message to record; callers pass human-readable text
     */
    void log(String msg);
}

View File

@@ -0,0 +1,264 @@
package cparser;
import java.util.*;
import cparser.Tokenizer.Token;
import cparser.Log;
/**
 * Work-in-progress recursive-descent style parser over the token stream
 * produced by {@link Tokenizer}. Currently it recognizes preprocessor
 * directives and scans statement boundaries; function/variable extraction
 * is still TODO (the previous draft helpers existed only as commented-out
 * code and were removed).
 */
public class Parser {
    /** Token stream being parsed. */
    private Tokenizer.TokenSet tokenSet;
    /** Parsed top-level statements; population is still TODO. */
    private List<Object> statements;
    /** Optional diagnostic sink; null disables logging. */
    private Log log;
    /** Token array cached from {@link #tokenSet}. */
    private Tokenizer.Token[] tokens;
    /** Current scan position in {@link #tokens}, shared by the parse helpers. */
    int index = 0;

    /**
     * @param tokenSet tokens produced by {@code Tokenizer.parse()}
     * @param log      optional diagnostic sink; may be null
     */
    public Parser(Tokenizer.TokenSet tokenSet, Log log) {
        this.tokenSet = tokenSet;
        this.statements = new ArrayList<>();
        this.tokens = tokenSet.getTokens();
        // Bug fix: 'log' was accepted but never stored, so log() could never emit.
        this.log = log;
    }

    /** Emits a diagnostic message if a sink was supplied. */
    void log(String msg) {
        if (log != null) {
            log.log(msg);
        }
    }

    /**
     * Walks the whole token stream once: comment tokens are skipped, '#'
     * starts a preprocessor directive, and anything else is handed to
     * {@link #parseStmt()}. Helpers return the index of the last token they
     * consumed; the loop's index++ then advances past it.
     */
    public void parse() {
        for (index = 0; index < tokens.length; index++) {
            Tokenizer.Token token = tokens[index];
            if (token.type == Tokenizer.TokenType.BLOCK_COMMENT || token.type == Tokenizer.TokenType.COMMENT) {
                continue;
            } else if (token.type == Tokenizer.TokenType.HASH) {
                index = parsePreprocessorExpression();
            } else {
                index = parseStmt();
            }
        }
    }

    /**
     * Scans forward looking for statement boundaries (';' and '{').
     * Still incomplete: function call/declaration recognition is TODO and
     * the computed isVarAssign flag is not yet consumed.
     *
     * Bug fix: the original was a non-void method with no return statement
     * at all (a compile error). It now returns {@link #index}, which the
     * '{'-branch may have advanced past a parenthesized parameter list.
     *
     * @return index of the last token consumed by this statement
     */
    private int parseStmt() {
        int startIndex = index;
        List<Tokenizer.Token> idStack = new ArrayList<>();
        for (int i = startIndex; i < tokens.length; i++) {
            Tokenizer.Token token = tokens[i];
            if (token.type == Tokenizer.TokenType.L_PAREN && idStack.size() > 0) {
                // TODO: distinguish function call vs. declaration here.
            } else if (token.type == Tokenizer.TokenType.SEMICOLON) {
                boolean isVarAssign = false;
                for (int j = startIndex; j < i; j++) {
                    if (tokens[j].type == Tokenizer.TokenType.EQUALS) {
                        isVarAssign = true;
                    }
                }
                // TODO: build a variable-assignment node from isVarAssign.
            } else if (token.type == Tokenizer.TokenType.L_BRACE) {
                // A '(' before a '{' suggests a function definition header;
                // skip past the parameter list if it is balanced.
                for (int j = startIndex; j < i; j++) {
                    if (tokens[j].type == Tokenizer.TokenType.L_PAREN) {
                        int endIndex = findClosingParenthesis(j);
                        if (endIndex != -1) {
                            index = endIndex;
                        }
                    }
                }
            }
        }
        return index;
    }

    /**
     * Placeholder for variable-declaration parsing over
     * tokens[startIndex..endIndex). TODO: currently only skips one token.
     */
    private int parseVarDecl(int startIndex, int endIndex) {
        if (tokens[startIndex].type == Tokenizer.TokenType.R_PAREN) {
            return startIndex;
        }
        return startIndex + 1;
    }

    /**
     * Placeholder for argument-list parsing; commas are recognized but no
     * argument nodes are built yet.
     */
    private ArgumentList parseArgumentList(int startIndex, int endIndex) {
        List<Object> arguments = new ArrayList<>();
        for (int i = startIndex; i < endIndex; i++) {
            if (tokens[i].type == Tokenizer.TokenType.COMMA) {
                // TODO: close the current argument at each comma.
            }
        }
        return new ArgumentList(arguments);
    }

    /**
     * Consumes a preprocessor directive: every token that sits on the same
     * source line as the leading '#'.
     *
     * Bug fix: {@code TokenSet.getLine()} expects a character offset (it
     * floors into a table keyed by offsets), but the original passed a token
     * index. The token's character offset is used now.
     *
     * @return index of the last token belonging to the directive
     */
    private int parsePreprocessorExpression() {
        int index = this.index;
        if (tokenSet.tokens[index].type == Tokenizer.TokenType.HASH) {
            int startLine = tokenSet.getLine(tokenSet.tokens[index].ofs);
            while (index < tokenSet.tokens.length) {
                if (tokenSet.getLine(tokenSet.tokens[index].ofs) > startLine) {
                    break;
                }
                index++;
            }
            // Step back to the last token still on the directive's line.
            index--;
        }
        return index;
    }

    /**
     * @return index of the ')' matching the '(' at startIndex, or -1 if the
     *         stream ends before the parentheses balance
     */
    private int findClosingParenthesis(int startIndex) {
        Tokenizer.Token[] tokens = tokenSet.getTokens();
        int parenCount = 1;
        for (int i = startIndex + 1; i < tokens.length; i++) {
            if (tokens[i].type == Tokenizer.TokenType.L_PAREN) {
                parenCount++;
            } else if (tokens[i].type == Tokenizer.TokenType.R_PAREN) {
                parenCount--;
                if (parenCount == 0) {
                    return i;
                }
            }
        }
        return -1;
    }

    /**
     * @return index of the '}' matching the '{' at startIndex, or -1 if the
     *         stream ends before the braces balance
     */
    private int findClosingBrace(int startIndex) {
        Tokenizer.Token[] tokens = tokenSet.getTokens();
        int braceCount = 1;
        for (int i = startIndex + 1; i < tokens.length; i++) {
            if (tokens[i].type == Tokenizer.TokenType.L_BRACE) {
                braceCount++;
            } else if (tokens[i].type == Tokenizer.TokenType.R_BRACE) {
                braceCount--;
                if (braceCount == 0) {
                    return i;
                }
            }
        }
        return -1;
    }

    /** Half-open character range [startOffset, endOffset) in the source text. */
    public static class Span {
        public final int startOffset;
        public final int endOffset;
        public Span(int startOffset, int endOffset) {
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }
    }

    /** A type name, identified only by its source span for now. */
    public static class Type {
        public final Span span;
        public Type(Span span) {
            this.span = span;
        }
    }

    /** An identifier occurrence, identified by its source span. */
    public static class Identifier {
        public final Span span;
        public Identifier(Span span) {
            this.span = span;
        }
    }

    /** Ordered argument nodes of a call or declaration. */
    public static class ArgumentList {
        public final List<Object> arguments;
        public ArgumentList(List<Object> arguments) {
            this.arguments = arguments;
        }
    }

    /** A 'type name' variable declaration. */
    public static class VariableDeclaration {
        public final Type type;
        public final Identifier name;
        public VariableDeclaration(Type type, Identifier name) {
            this.type = type;
            this.name = name;
        }
    }

    /** A function declaration: name, return type, and parameter list. */
    public static class FunctionDecl {
        public final Identifier name;
        public final Type returnValue;
        public final ArgumentList args;
        public FunctionDecl(Identifier name, Type returnValue, ArgumentList args) {
            this.name = name;
            this.returnValue = returnValue;
            this.args = args;
        }
    }

    /** A call site: callee name and the arguments passed. */
    public static class FunctionCall {
        public final Identifier name;
        public final ArgumentList args;
        public FunctionCall(Identifier name, ArgumentList args) {
            this.name = name;
            this.args = args;
        }
    }
}

View File

@@ -0,0 +1,367 @@
package cparser;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * Single-pass character-level lexer for C-like source text. {@link #parse()}
 * runs a state machine that accumulates runs of same-typed characters and
 * emits a token whenever the type changes, with lookahead scans for
 * identifiers and hexadecimal literals.
 *
 * NOTE(review): tokenization state (tokens, tokenStart, currentType, the
 * in* flags) lives in instance fields, so parse() is effectively single-use
 * per Tokenizer instance — a second call would append into the same list.
 * Confirm whether reuse is intended.
 */
public class Tokenizer {
    /** Classification assigned to each token produced by {@link #parse()}. */
    public enum TokenType {
        UNDEFINED,
        HASH,
        L_PAREN,
        R_PAREN,
        L_BRACE,
        R_BRACE,
        L_IDX,
        R_IDX,
        SEMICOLON,
        EQUALS,
        ARROW,
        STAR,
        COMMA,
        COMMENT,
        BLOCK_COMMENT,
        IDENTIFIER,
        STRING_LITERAL,
        NUMERIC_LITERAL,
        NUMERIC_LITERAL_HEX,
        OTHER,
        KEYWORD,
    }

    /**
     * One token: the [ofs, ofs+len) slice of the source text plus its type.
     * NOTE(review): non-static inner class — every Token carries a hidden
     * reference to its enclosing Tokenizer; consider making it static nested.
     */
    public class Token {
        public int ofs;
        public int len;
        public TokenType type;
    }

    /**
     * Result of a parse: the token array, the original text, and a table for
     * mapping character offsets to line numbers.
     */
    public class TokenSet {
        public final Token[] tokens;
        public final String text;
        // Maps each line's starting character offset to its 1-based line number.
        private final TreeMap<Integer, Integer> lineNumberTable;

        TokenSet(Token[] tokens, String text, TreeMap<Integer, Integer> lineNumberTable) {
            this.tokens = tokens;
            this.text = text;
            this.lineNumberTable = lineNumberTable;
        }

        public Token[] getTokens() {
            return this.tokens;
        }

        /**
         * Maps a character offset (NOT a token index) to a 1-based line
         * number via floorEntry, or -1 if the offset precedes every entry.
         */
        public int getLine(int offset) {
            Map.Entry<Integer, Integer> entry = lineNumberTable.floorEntry(offset);
            return entry != null ? entry.getValue() : -1;
        }

        /** Returns the token's text with newline characters removed. */
        public String getTextNoNewlines(Token token) {
            String text = getText(token);
            return text.replace("\n", "");
        }
    };

    /** Source text being tokenized. */
    private final String text;
    /** Built by buildLineNumberTable(); keyed by line-start character offsets. */
    private TreeMap<Integer, Integer> lineNumberTable;
    /** Optional diagnostic sink; null disables logging. */
    private Log log;

    public Tokenizer(String text) {
        this.text = text;
    }

    public Tokenizer(String text, Log log) {
        this.text = text;
        this.log = log;
    }

    /** Emits a diagnostic message if a sink was supplied. */
    void log(String msg) {
        if (log != null) {
            log.log(msg);
        }
    }

    /** Returns the source text covered by the given token. */
    String getText(Token token) {
        return getText(token.ofs, token.len);
    }

    /** Returns the source substring [ofs, ofs+len). */
    String getText(int ofs, int len) {
        return text.substring(ofs, ofs + len); // Fixed recursion issue
    }

    // NOTE(review): appears unused within this class — candidate for removal.
    TokenType lastTokenType = TokenType.UNDEFINED;

    /**
     * Inserts a new token into the tokens list after trimming whitespace from
     * both ends of the [tokenStart, tokenEnd) range. Identifier tokens whose
     * text matches a reserved word are promoted to KEYWORD.
     *
     * @param tokens      The list of tokens.
     * @param tokenStart  The starting index of the token.
     * @param tokenEnd    The current index in the text (exclusive end).
     * @param currentType The type of the current token.
     */
    private void insertToken(List<Token> tokens, int tokenStart, int tokenEnd, TokenType currentType) {
        if (currentType != TokenType.UNDEFINED && tokenStart < tokenEnd) {
            // Strip whitespace from the start of the range.
            for (int i = tokenStart; i < tokenEnd; i++) {
                if (Character.isWhitespace(text.charAt(i))) {
                    tokenStart = i + 1;
                } else {
                    break;
                }
            }
            // Strip whitespace from the end of the range.
            for (int i = tokenEnd - 1; i >= tokenStart; i--) {
                if (Character.isWhitespace(text.charAt(i))) {
                    tokenEnd = i;
                } else {
                    break;
                }
            }
            if (tokenEnd - tokenStart > 0) {
                Token token = new Token();
                token.ofs = tokenStart;
                token.len = tokenEnd - tokenStart;
                token.type = currentType;
                // Promote identifiers that match a reserved word.
                if (currentType == TokenType.IDENTIFIER && isKeyword(getText(token))) {
                    token.type = TokenType.KEYWORD;
                }
                tokens.add(token);
            }
            // Consume the token.
            // NOTE(review): this assigns the local parameter only; it has no
            // effect on the caller's currentType.
            currentType = TokenType.UNDEFINED;
        }
    }

    /**
     * Handles the insertion of the last token after parsing is complete.
     *
     * @param tokens      The list of tokens.
     * @param tokenStart  The starting index of the last token.
     * @param currentType The type of the last token.
     */
    private void handleLastToken(List<Token> tokens, int tokenStart, TokenType currentType) {
        insertToken(tokens, tokenStart, text.length(), currentType);
    }

    /** Builds the offset-to-line table used by {@code TokenSet.getLine()}. */
    void buildLineNumberTable() {
        this.lineNumberTable = new TreeMap<>();
        int lineNumber = 1;
        lineNumberTable.put(0, 1);
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == '\n') {
                lineNumber++;
                lineNumberTable.put(i + 1, lineNumber);
            }
        }
    }

    // Mutable tokenization state shared between parse() and its helpers.
    List<Token> tokens = new ArrayList<>();
    // Initialize tokenization state
    int tokenStart = 0;
    TokenType currentType = TokenType.UNDEFINED;
    boolean inComment = false;
    boolean inBlockComment = false;
    boolean inString = false;

    /** Result of a lookahead scan: a [start, end) range plus its token type. */
    class ScanRange {
        int start;
        int end;
        TokenType type;

        ScanRange(int start, int end, TokenType type) {
            this.start = start;
            this.end = end;
            this.type = type;
        }

        // Invalid constructor: produces a sentinel "no match" range.
        ScanRange() {
            this.type = TokenType.UNDEFINED;
        }

        boolean isValid() {
            return this.type != TokenType.UNDEFINED;
        }
    };

    /**
     * Scans a hexadecimal literal ("0x..."/"0X...") starting at currentIndex;
     * returns an invalid ScanRange when there is no such literal.
     */
    private ScanRange tryParseHexadecimal(int currentIndex) {
        if (text.charAt(currentIndex) == '0' && currentIndex + 1 < text.length()) {
            char nextChar = text.charAt(currentIndex + 1);
            if (nextChar == 'x' || nextChar == 'X') {
                int tempIndex = currentIndex + 2;
                while (tempIndex < text.length()) {
                    char c = text.charAt(tempIndex);
                    if (Character.digit(c, 16) == -1) {
                        break;
                    }
                    tempIndex++;
                }
                // Require at least one hex digit after the "0x" prefix.
                if (tempIndex > currentIndex + 2) {
                    return new ScanRange(currentIndex, tempIndex, TokenType.NUMERIC_LITERAL_HEX);
                }
            }
        }
        return new ScanRange();
    }

    // Identifier that starts with a letter or underscore, and can contain letters,
    // digits, and underscores
    private ScanRange tryParseIdentifier(int currentIndex) {
        if (Character.isLetter(text.charAt(currentIndex)) || text.charAt(currentIndex) == '_') {
            int tempIndex = currentIndex + 1;
            while (tempIndex < text.length()) {
                char c = text.charAt(tempIndex);
                if (!(Character.isLetter(c) || Character.isDigit(c) || c == '_')) {
                    break;
                }
                tempIndex++;
            }
            return new ScanRange(currentIndex, tempIndex, TokenType.IDENTIFIER);
        }
        return new ScanRange();
    }

    /** Tries the lookahead scanners in order: hex literal first, then identifier. */
    private ScanRange tryParseWithLookahead(int currentIndex) {
        ScanRange sr = tryParseHexadecimal(currentIndex);
        if (!sr.isValid()) {
            sr = tryParseIdentifier(currentIndex);
        }
        return sr;
    }

    /** Returns true for the subset of C reserved words this lexer recognizes. */
    public boolean isKeyword(String text) {
        return text.equals("while") || text.equals("for") || text.equals("if") || text.equals("else") ||
                text.equals("return") || text.equals("struct") || text.equals("typedef") ||
                text.equals("enum") || text.equals("union") || text.equals("const") || text.equals("static");
    }

    /**
     * Tokenizes the entire source text and returns a TokenSet. A token is
     * emitted each time the type computed for the current character differs
     * from the type of the run being accumulated.
     */
    public TokenSet parse() {
        this.buildLineNumberTable();
        int index = 0;
        while (index < text.length()) {
            char currentChar = text.charAt(index);
            TokenType newType = TokenType.OTHER;
            // Handle comments
            if (inBlockComment) {
                newType = TokenType.BLOCK_COMMENT;
                if (currentChar == '*') {
                    // "*/" ends the block comment; also consume the '/'.
                    if (index + 1 < text.length() && text.charAt(index + 1) == '/') {
                        inBlockComment = false;
                        index++;
                    }
                }
            } else if (inComment) {
                newType = TokenType.COMMENT;
                if (currentChar == '\n') {
                    inComment = false;
                }
            }
            // Handle string literals
            else if (inString) {
                if (currentChar == '"') {
                    // Closing quote stays part of the literal; the token ends
                    // when the next character produces a different type.
                    inString = false;
                    newType = TokenType.STRING_LITERAL;
                } else {
                    newType = TokenType.STRING_LITERAL;
                }
            }
            // Detect start of comments
            else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '*') {
                inBlockComment = true;
                newType = TokenType.BLOCK_COMMENT;
            } else if (currentChar == '/' && index + 1 < text.length() && text.charAt(index + 1) == '/') {
                inComment = true;
                newType = TokenType.COMMENT;
            }
            // Detect start of string literals
            else if (currentChar == '"') {
                inString = true;
                newType = TokenType.STRING_LITERAL;
            } else {
                // Lookahead scans (hex literals, identifiers) emit their token
                // immediately instead of going through run accumulation.
                ScanRange range = tryParseWithLookahead(index);
                if (range.isValid()) {
                    // Flush whatever run was accumulating before the range.
                    insertToken(tokens, tokenStart, range.start, currentType);
                    // Insert the ranged token itself.
                    insertToken(tokens, range.start, range.end, range.type);
                    // Restart accumulation after the ranged token.
                    currentType = TokenType.UNDEFINED;
                    tokenStart = range.end;
                    index = range.end;
                }
                // Detect numeric literals
                else if (Character.isDigit(currentChar)) {
                    newType = TokenType.NUMERIC_LITERAL;
                }
                // Detect identifiers
                else if (Character.isLetter(currentChar) || currentChar == '_') {
                    newType = TokenType.IDENTIFIER;
                }
                // Detect parentheses
                else if (currentChar == '(') {
                    newType = TokenType.L_PAREN;
                } else if (currentChar == ')') {
                    newType = TokenType.R_PAREN;
                }
                // Detect braces
                else if (currentChar == '{') {
                    newType = TokenType.L_BRACE;
                } else if (currentChar == '}') {
                    newType = TokenType.R_BRACE;
                }
                // Detect semicolon
                else if (currentChar == ';') {
                    newType = TokenType.SEMICOLON;
                }
                // Detect comma
                else if (currentChar == ',') {
                    newType = TokenType.COMMA;
                } else if (currentChar == '#') {
                    newType = TokenType.HASH;
                } else if (currentChar == '[') {
                    newType = TokenType.L_IDX;
                } else if (currentChar == ']') {
                    newType = TokenType.R_IDX;
                } else if (currentChar == '=') {
                    newType = TokenType.EQUALS;
                } else if (currentChar == '>' && index > 0 && text.charAt(index - 1) == '-') {
                    // "->": retroactively fold the preceding '-' into this token.
                    newType = TokenType.ARROW;
                    currentType = TokenType.ARROW;
                    tokenStart = index - 1;
                } else if (currentChar == '*') {
                    newType = TokenType.STAR;
                }
                // Handle other characters
                else {
                    newType = TokenType.OTHER;
                }
            }
            // Insert a new token whenever the character's type differs from the run's.
            if (newType != currentType) {
                insertToken(tokens, tokenStart, index, currentType);
                tokenStart = index;
                currentType = newType;
            }
            index++;
        }
        // Handle the last token
        handleLastToken(tokens, tokenStart, currentType);
        return new TokenSet(tokens.toArray(new Token[0]), text, lineNumberTable);
    }
}

View File

@@ -0,0 +1,124 @@
package cparser_tests;
import static org.junit.Assert.*;
import org.junit.Before;
import org.junit.Test;
import java.lang.String;
import cparser.Parser;
import cparser.Tokenizer;
import cparser.Log;
import java.util.List;
/**
 * JUnit 4 tests describing the intended Parser API: variable, function, and
 * function-call extraction from C snippets.
 *
 * NOTE(review): these tests call parser.getVariables(), getFunctions() and
 * getFunctionCalls(), and reference Parser.Variable / Parser.Function — none
 * of which exist in the Parser class in this snapshot (it exposes only
 * parse() plus nested Span/Type/Identifier/FunctionDecl/FunctionCall data
 * classes). Confirm the accessors are implemented, or these tests will not
 * compile.
 */
public class ParserTests {
    private Parser parser;
    private Tokenizer.TokenSet tokenSet;
    private Log testLog;

    @Before
    public void setUp() {
        // Route parser diagnostics to stdout so failures are easier to debug.
        testLog = new Log() {
            @Override
            public void log(String msg) {
                System.out.println(msg);
            }
        };
    }

    /** A simple declaration-with-initializer should surface one variable. */
    @Test
    public void testParseVariableReference() {
        String code = "int x = 5;";
        tokenSet = new Tokenizer(code).parse();
        parser = new Parser(tokenSet, testLog);
        parser.parse();
        List<Parser.Variable> variables = parser.getVariables();
        assertEquals(1, variables.size());
        assertEquals("x", variables.get(0).name);
    }

    /** A prototype (no body) should be recorded as a non-definition function. */
    @Test
    public void testParseFunctionDeclaration() {
        String code = "void foo(int a, int b);";
        tokenSet = new Tokenizer(code).parse();
        parser = new Parser(tokenSet, testLog);
        parser.parse();
        List<Parser.Function> functions = parser.getFunctions();
        assertEquals(1, functions.size());
        assertEquals("foo", functions.get(0).name);
        assertFalse(functions.get(0).isDefinition);
    }

    /** A function with a body should be recorded as a definition. */
    @Test
    public void testParseFunctionDefinition() {
        String code = "int bar(int x) { return x + 1; }";
        tokenSet = new Tokenizer(code).parse();
        parser = new Parser(tokenSet, testLog);
        parser.parse();
        List<Parser.Function> functions = parser.getFunctions();
        assertEquals(1, functions.size());
        assertEquals("bar", functions.get(0).name);
        assertTrue(functions.get(0).isDefinition);
    }

    /** An assignment whose RHS is a call should surface one function call. */
    @Test
    public void testParseFunctionCall() {
        String code = "result = calculate(5, 10);";
        tokenSet = new Tokenizer(code).parse();
        parser = new Parser(tokenSet, testLog);
        parser.parse();
        List<Parser.FunctionCall> functionCalls = parser.getFunctionCalls();
        assertEquals(1, functionCalls.size());
        assertEquals("calculate", functionCalls.get(0).name);
    }

    /** A leading #include must be skipped without hiding the following function. */
    @Test
    public void testParsePreprocessorDirective() {
        String code = "#include <stdio.h>\nint main() { return 0; }";
        tokenSet = new Tokenizer(code).parse();
        parser = new Parser(tokenSet, testLog);
        parser.parse();
        List<Parser.Function> functions = parser.getFunctions();
        assertEquals(1, functions.size());
        assertEquals("main", functions.get(0).name);
    }

    /** End-to-end sample mixing globals, locals, prototypes, definitions, and calls. */
    @Test
    public void testParseComplexCode() {
        String code =
                "#include <stdio.h>\n" +
                "int globalVar = 10;\n" +
                "void helper(int x);\n" +
                "int main() {\n" +
                "    int localVar = 5;\n" +
                "    helper(localVar);\n" +
                "    return 0;\n" +
                "}\n" +
                "void helper(int x) {\n" +
                "    printf(\"%d\", x);\n" +
                "}";
        tokenSet = new Tokenizer(code).parse();
        parser = new Parser(tokenSet, testLog);
        parser.parse();
        List<Parser.Variable> variables = parser.getVariables();
        List<Parser.Function> functions = parser.getFunctions();
        List<Parser.FunctionCall> functionCalls = parser.getFunctionCalls();
        assertEquals(2, variables.size());
        assertEquals(2, functions.size());
        assertEquals(2, functionCalls.size());
        assertTrue(variables.stream().anyMatch(v -> v.name.equals("globalVar")));
        assertTrue(variables.stream().anyMatch(v -> v.name.equals("localVar")));
        assertTrue(functions.stream().anyMatch(f -> f.name.equals("main")));
        assertTrue(functions.stream().anyMatch(f -> f.name.equals("helper")));
        assertTrue(functionCalls.stream().anyMatch(fc -> fc.name.equals("helper")));
        assertTrue(functionCalls.stream().anyMatch(fc -> fc.name.equals("printf")));
    }
}