Last active
September 10, 2023 05:47
-
-
Save quat1024/a1966a5718407ad63c1848504f637374 to your computer and use it in GitHub Desktop.
pratt parsing in java, but succinct
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package agency.highlysuspect.parsefunny; | |
import org.jetbrains.annotations.NotNull; | |
import org.jetbrains.annotations.Nullable; | |
import java.util.ArrayDeque; | |
import java.util.Deque; | |
import java.util.stream.Collectors; | |
//This is sort of a synthesis of the parsers described in these two articles: | |
// | |
//matklad - "Simple but Powerful Pratt Parsing" | |
// https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html | |
// Very featureful Pratt parser that can recognize prefix, postfix, infix, and "around"fix operators. | |
// Uses a somewhat delicate numeric precedence scheme. Implemented in Rust. | |
// *He also has this nice article https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html | |
// | |
//Jamie Brandon (scattered-thoughts) - "Better operator precedence" | |
// https://www.scattered-thoughts.net/writing/better-operator-precedence/ | |
// Introduces a "comparePrecedence" function instead of using numeric precedence. | |
// But the given implementation (in Zig) doesn't cover anything other than infix operators. | |
// | |
//Wanted the best of both. I think I got it. | |
// | |
//There's also this nice article about pratt parsing by the Crafting Interpreters developer: | |
//Bob Nystrom - Pratt Parsers: Expression Parsing Made Easy | |
// https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/ | |
// | |
//But it gets a little bogged down in all the Java, so much so that someone wrote a "pratt parsing | |
//without virtual dispatch or Java" article about it (http://www.oilshell.org/blog/2016/11/03.html), | |
//and I wanted to redeem Java a bit :) So here's a complete lexer and parser in ~200 lines of commented java 17. | |
public class ParserExploration2 { | |
static void check(boolean arg, String msg) { | |
if(!arg) throw new IllegalStateException(msg); | |
} | |
/// Lexing /// | |
enum TokenType { | |
EOF, //token after the end of the file (very common technique to avoid Optional<Token> everywhere) | |
ATOM, //stand-in for a "number" or "name literal" token you might have in a real parser | |
STAR, SLASH, SQUIGGLE, PLUS, MINUS, DOT, BANG, PERCENT, //grab bag of operators | |
LPAREN, RPAREN, //parenthesis | |
} | |
record Token(TokenType type, String display) { | |
@Override public String toString() { return display; } | |
} | |
static class Lexer { | |
//Toy-quality lexer ported from matklad's post, lexes 1-character long operations and literals only. | |
//Don't get bogged down with this, it's not the important part. | |
Lexer(String in) { | |
this.tokens = in.chars() | |
.filter(i -> !Character.isWhitespace(i)) | |
.mapToObj(i -> switch(i) { | |
case '*' -> new Token(TokenType.STAR, "*"); | |
case '/' -> new Token(TokenType.SLASH, "/"); | |
case '~' -> new Token(TokenType.SQUIGGLE, "~"); | |
case '+' -> new Token(TokenType.PLUS, "+"); | |
case '-' -> new Token(TokenType.MINUS, "-"); | |
case '.' -> new Token(TokenType.DOT, "."); | |
case '!' -> new Token(TokenType.BANG, "!"); | |
case '%' -> new Token(TokenType.PERCENT, "%"); | |
case '(' -> new Token(TokenType.LPAREN, "("); | |
case ')' -> new Token(TokenType.RPAREN, ")"); | |
default -> new Token(TokenType.ATOM, Character.toString(i)); | |
}).collect(Collectors.toCollection(ArrayDeque::new)); | |
} | |
//A "peeking iterator" API over the lexemes. Note that there's no way to see two tokens ahead. | |
final Deque<Token> tokens; | |
Token peek() { | |
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.peekFirst(); | |
} | |
Token pop() { | |
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.pollFirst(); | |
} | |
} | |
/// AST sketch. Renders toString as an S-expression. /// | |
interface Expr {} | |
record Atom(String s) implements Expr { | |
@Override public String toString() { return s; } | |
} | |
record UnaryOp(OperationType op, Expr operand) implements Expr { | |
@Override public String toString() { return "(%s %s)".formatted(op, operand); } | |
} | |
record BinaryOp(OperationType op, Expr left, Expr right) implements Expr { | |
@Override public String toString() { return "(%s %s %s)".formatted(op, left, right); } | |
} | |
/// Parsing /// | |
//Entrypoint for the recursive expression parser. In a real parsing application, this would be one of many little functions. | |
Expr expr(Lexer lex) { | |
return pratt(lex, null); //`null` means "there is no operation to my left to consider binding to". | |
} | |
//Pratt parser bullshit magic: | |
Expr pratt(Lexer lex, @Nullable OperationType leftOp) { | |
Token lhsToken = lex.pop(); | |
Expr lhs = switch(lhsToken.type) { | |
case ATOM -> new Atom(lhsToken.toString()); | |
case LPAREN -> { | |
//When crossing into parenthesis, revert to the null operation type. | |
//Nothing should be able to bind leftwards *through* an opening paren. | |
Expr e = pratt(lex, null); | |
check(lex.pop().type == TokenType.RPAREN, "expected closing parenthesis"); | |
yield e; | |
} | |
default -> { | |
//is it prefix? | |
OperationType asPrefix = OperationType.prefixOpFrom(lhsToken); | |
check(asPrefix != null, lhsToken + " is not a prefix operation!"); | |
//we already consumed the prefix operator's token | |
Expr rhs = pratt(lex, asPrefix); //recurse to find the right-hand side of the expression | |
yield new UnaryOp(asPrefix, rhs); //apply the prefix operator to the *next* operand | |
//look for another infix or postfix operation | |
} | |
}; | |
while(true) { | |
Token opToken = lex.peek(); | |
//(Special casing for EOF, RParen, etc is not actually needed; xxxOpFrom returning `null` takes care of it.) | |
//is it infix? | |
OperationType infixOp = OperationType.infixOpFrom(opToken); | |
if(infixOp != null && bindsToRight(leftOp, infixOp)) { | |
lex.pop(); //consume the infix operator's token | |
Expr rhs = pratt(lex, infixOp); //recurse to find the right-hand side of the expression (!HARD to understand!) | |
lhs = new BinaryOp(infixOp, lhs, rhs); //apply the infix operator to *both* operands | |
continue; //look for another infix or postfix operation | |
} | |
//is it postfix? | |
OperationType postfixOp = OperationType.postfixOpFrom(opToken); | |
if(postfixOp != null && bindsToRight(leftOp, postfixOp)) { | |
lex.pop(); //consume the postfix operator's token | |
lhs = new UnaryOp(postfixOp, lhs); //apply the postfix operator to the *previous* operand | |
continue; //look for another infix or postfix operation | |
} | |
return lhs; | |
} | |
} | |
//Whether the operation on the right is "stronger" than the one on the left. | |
static boolean bindsToRight(@Nullable OperationType left, @NotNull OperationType right) { | |
if(left == null) return true; //There isn't an operator on the left to even consider binding to! | |
//Key observation: this is the only place precedence and rightAssociative are read. | |
//The precedence implementation can be swapped out for anything (such as a handwritten table), | |
//if manual precedence-number fiddling isn't up your alley. Exceptions for specific pairs of | |
//operators can also be added to this method. | |
if(left.precedence == right.precedence) return left.rightAssociative; //break ties like this | |
else return right.precedence > left.precedence; | |
} | |
//A token corresponds to zero or more "logical operations" depending on the token's fixity. | |
//For example, the - token can be unary minus or subtraction, and unary minus binds tighter than | |
//multiplication, but multiplication binds tighter than subtraction. In other words, tokens don't | |
//really "have precedence", but (token, fixity) tuples do. The typical way of modelling this is | |
//maintaining separate precedence tables: one for prefix operators, one for infix operators, and | |
//one for postfix operators. This is a little fiddly, so in the spirit of reducing the amount of | |
//"magic numbers" in pratt parsing, I tried mapping each operator and fixity into a *single* | |
//OperationType table, giving a name to the operation at the same time (so I can say "factorial" | |
//instead of "postfix bang"). Maybe it makes more sense this way? It's probably a little easier to | |
//reason about when it comes time to add more operations? I think it's just as flexible? Idk. | |
enum OperationType { | |
ADD (20, false, "+"), | |
SUBTRACT (20, false, "-"), | |
SQUIGGLE (30, false, "~"), //Arbitrary binary operator with precedence between + and * | |
PERCENT (30, true, "%"), //Same, but right-associative | |
MULTIPLY (40, false, "*"), | |
DIVIDE (40, false, "/"), | |
UNARY_PLUS (50, false, "+"), | |
UNARY_MINUS (50, false, "-"), | |
FACTORIAL (60, false, "!"), //Postfix | |
UNARY_SQUIGGLE(70, false, "~"), | |
COMPOSITION (100, true, "."); //Right-associative | |
OperationType(int precedence, boolean rightAssociative, String display) { | |
this.precedence = precedence; | |
this.rightAssociative = rightAssociative; | |
this.display = display; | |
} | |
final int precedence; | |
final boolean rightAssociative; | |
final String display; | |
@Override public String toString() { return this.display; } | |
//Tables for converting tokens into logical operations, depending on the token's fixity. | |
//Note that if a token has infix *and* postfix forms, the main parser loop will just pick one. | |
//I think correctly disambiguating cases like that requires backtracking or infinite lookahead. | |
//Prefix and infix is okay though. | |
static @Nullable OperationType prefixOpFrom(Token tok) { | |
return switch(tok.type) { | |
case PLUS -> OperationType.UNARY_PLUS; | |
case MINUS -> OperationType.UNARY_MINUS; | |
case SQUIGGLE -> OperationType.UNARY_SQUIGGLE; | |
default -> null; | |
}; | |
} | |
static @Nullable OperationType infixOpFrom(Token tok) { | |
return switch(tok.type) { | |
case PLUS -> OperationType.ADD; | |
case MINUS -> OperationType.SUBTRACT; | |
case SQUIGGLE -> OperationType.SQUIGGLE; | |
case PERCENT -> OperationType.PERCENT; | |
case STAR -> OperationType.MULTIPLY; | |
case SLASH -> OperationType.DIVIDE; | |
case DOT -> OperationType.COMPOSITION; | |
default -> null; | |
}; | |
} | |
static @Nullable OperationType postfixOpFrom(Token tok) { | |
return switch(tok.type) { | |
case BANG -> OperationType.FACTORIAL; | |
default -> null; | |
}; | |
} | |
} | |
/// demonstration /// | |
void checkParse(String in, String expect) { | |
String out = expr(new Lexer(in)).toString(); | |
System.out.println("in: " + in); | |
System.out.println("out: " + out); | |
System.out.println("expect: " + expect); | |
check(out.equals(expect), "Expected " + expect + " but found " + out); | |
} | |
void go() { | |
//left-associative operators | |
checkParse("1 + 2 + 3 + 4", "(+ (+ (+ 1 2) 3) 4)"); | |
checkParse("1 ~ 2 ~ 3 ~ 4", "(~ (~ (~ 1 2) 3) 4)"); | |
checkParse("1 * 2 * 3 * 4", "(* (* (* 1 2) 3) 4)"); | |
checkParse("1 + 2 - 3 + 4", "(+ (- (+ 1 2) 3) 4)"); //operators with the same priority | |
checkParse("1 * 2 / 3 * 4", "(* (/ (* 1 2) 3) 4)"); | |
//right-associative operators | |
checkParse("1 . 2 . 3 . 4", "(. 1 (. 2 (. 3 4)))"); | |
checkParse("1 % 2 % 3 % 4", "(% 1 (% 2 (% 3 4)))"); | |
//mixed associativity among operators with the same precedence | |
checkParse("1 ~ 2 ~ 3 % 4", "(% (~ (~ 1 2) 3) 4)"); | |
checkParse("1 ~ 2 % 3 % 4", "(% (~ 1 2) (% 3 4))"); | |
checkParse("1 % 2 ~ 3 ~ 4", "(% 1 (~ (~ 2 3) 4))"); | |
checkParse("1 % 2 % 3 ~ 4", "(% 1 (% 2 (~ 3 4)))"); | |
//operator precedence | |
checkParse("1 + 2 ~ 3", "(+ 1 (~ 2 3))"); //plus is weaker than squiggle | |
checkParse("1 * 2 ~ 3", "(~ (* 1 2) 3)"); //star is stronger than squiggle | |
checkParse("1 + 2 * 3 + 4", "(+ (+ 1 (* 2 3)) 4)"); | |
checkParse("1 * 2 + 3 * 4", "(+ (* 1 2) (* 3 4))"); | |
//prefix operators | |
checkParse("-1", "(- 1)"); | |
checkParse("- - 1", "(- (- 1))"); | |
checkParse("- + 1", "(- (+ 1))"); | |
checkParse("- - 1 - 2", "(- (- (- 1)) 2)"); | |
checkParse("- - 1 - - 2", "(- (- (- 1)) (- 2))"); | |
checkParse("1--2", "(- 1 (- 2))"); | |
checkParse("- 1 * - 2", "(* (- 1) (- 2))"); | |
//postfix operators | |
checkParse("5 !", "(! 5)"); | |
checkParse("5 ! !", "(! (! 5))"); | |
checkParse("- 5 !", "(- (! 5))"); //unary - is weaker than ! | |
checkParse("~ 5 !", "(! (~ 5))"); //unary ~ is stronger than ! | |
checkParse("---5!!!", "(- (- (- (! (! (! 5))))))"); | |
checkParse("~~~5!!!", "(! (! (! (~ (~ (~ 5))))))"); | |
checkParse("-~~5!!!", "(- (! (! (! (~ (~ 5))))))"); | |
checkParse("~-~5!!!", "(~ (- (! (! (! (~ 5))))))"); | |
checkParse("1 + 3!", "(+ 1 (! 3))"); | |
checkParse("1 + 3 ! + 5", "(+ (+ 1 (! 3)) 5)"); | |
//parentheticals | |
checkParse("( 1 + 2 ) * 3", "(* (+ 1 2) 3)"); | |
checkParse("1 + ( 2 * 3 )", "(+ 1 (* 2 3))"); | |
checkParse("(1+2)*(3+4)", "(* (+ 1 2) (+ 3 4))"); | |
checkParse("~(1)", "(~ 1)"); | |
checkParse("~((((((1))))))", "(~ 1)"); | |
checkParse("~(((~(((1))))))", "(~ (~ 1))"); | |
checkParse("(4 + 5)!", "(! (+ 4 5))"); | |
//stupid viral math problems from the internet | |
checkParse("6 / 2 * (1 + 2)", "(* (/ 6 2) (+ 1 2))"); | |
checkParse("2 + 2 * 4", "(+ 2 (* 2 4))"); | |
checkParse("1 + 1 * 0 + 1", "(+ (+ 1 (* 1 0)) 1)"); | |
} | |
public static void main(String[] args) { | |
new ParserExploration2().go(); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package agency.highlysuspect.parsefunny; | |
import java.util.ArrayDeque; | |
import java.util.Deque; | |
import java.util.stream.Collectors; | |
//This shorter version strips away the pre/postfix parsing & parenthesis parsing, leaving only the | |
//core of the algorithm. It's also implemented more traditionally, using a numeric "binding power". | |
public class ParserExploration2_InfixOnly { | |
static void check(boolean arg, String msg) { | |
if(!arg) throw new IllegalStateException(msg); | |
} | |
/// Lexing /// | |
enum TokenType { | |
EOF, //token after the end of the file (very common technique to avoid Optional<Token> everywhere) | |
ATOM, //stand-in for a "number" or "name literal" token you might have in a real parser | |
STAR, SLASH, SQUIGGLE, PLUS, MINUS, DOT, PERCENT, //grab bag of infix binary operators | |
} | |
record Token(TokenType type, String display) { | |
@Override public String toString() { return display; } | |
} | |
static class Lexer { | |
//Toy-quality lexer ported from matklad's post, lexes 1-character long operations and literals only. | |
//Don't get bogged down with this, it's not the important part. | |
Lexer(String in) { | |
this.tokens = in.chars() | |
.filter(i -> !Character.isWhitespace(i)) | |
.mapToObj(i -> switch(i) { | |
case '*' -> new Token(TokenType.STAR, "*"); | |
case '/' -> new Token(TokenType.SLASH, "/"); | |
case '~' -> new Token(TokenType.SQUIGGLE, "~"); | |
case '+' -> new Token(TokenType.PLUS, "+"); | |
case '-' -> new Token(TokenType.MINUS, "-"); | |
case '.' -> new Token(TokenType.DOT, "."); | |
case '%' -> new Token(TokenType.PERCENT, "%"); | |
default -> new Token(TokenType.ATOM, Character.toString(i)); | |
}).collect(Collectors.toCollection(ArrayDeque::new)); | |
} | |
//A "peeking iterator" API over the lexemes. Note that there's no way to see two tokens ahead. | |
final Deque<Token> tokens; | |
Token peek() { | |
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.peekFirst(); | |
} | |
Token pop() { | |
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.pollFirst(); | |
} | |
} | |
/// AST sketch. Renders toString as an S-expression. /// | |
interface Expr {} | |
record Atom(String s) implements Expr { | |
@Override public String toString() { return s; } | |
} | |
record BinaryOp(String op, Expr left, Expr right) implements Expr { | |
@Override public String toString() { return "(%s %s %s)".formatted(op, left, right); } | |
} | |
/// Parsing /// | |
static final int NOT_INFIX_OP = 0; | |
Expr expr(Lexer lex) { | |
return pratt(lex, NOT_INFIX_OP); //"there is no operation to my left to consider binding to" | |
} | |
Expr pratt(Lexer lex, int minBindingPower) { | |
Token lhsToken = lex.pop(); | |
check(lhsToken.type == TokenType.ATOM, "expected atom"); | |
Expr lhs = new Atom(lhsToken.toString()); | |
while(true) { | |
Token opToken = lex.peek(); | |
if(leftPower(opToken) == NOT_INFIX_OP || leftPower(opToken) < minBindingPower) return lhs; | |
lex.pop(); | |
Expr rhs = pratt(lex, rightPower(opToken)); //recurse to find the right-hand side of the expression | |
lhs = new BinaryOp(opToken.toString(), lhs, rhs); //^ that is the HARD part to understand about this function | |
} | |
} | |
static int leftPower(Token t) { | |
return switch(t.type) { | |
case PLUS, MINUS -> 20; | |
case SQUIGGLE, PERCENT -> 30; | |
case STAR, SLASH -> 40; | |
case DOT -> 100; | |
default -> NOT_INFIX_OP; | |
}; | |
} | |
static int rightPower(Token t) { | |
return switch(t.type) { | |
case PLUS, MINUS, SQUIGGLE, STAR, SLASH -> leftPower(t) + 1; | |
case DOT, PERCENT -> leftPower(t) - 1; | |
default -> NOT_INFIX_OP; | |
}; | |
} | |
/// demonstration /// | |
void checkParse(String in, String expect) { | |
String out = expr(new Lexer(in)).toString(); | |
System.out.println("in: " + in); | |
System.out.println("out: " + out); | |
System.out.println("expect: " + expect); | |
check(out.equals(expect), "Expected " + expect + " but found " + out); | |
} | |
void go() { | |
//left-associative operators | |
checkParse("1 + 2 + 3 + 4", "(+ (+ (+ 1 2) 3) 4)"); | |
checkParse("1 ~ 2 ~ 3 ~ 4", "(~ (~ (~ 1 2) 3) 4)"); | |
checkParse("1 * 2 * 3 * 4", "(* (* (* 1 2) 3) 4)"); | |
checkParse("1 + 2 - 3 + 4", "(+ (- (+ 1 2) 3) 4)"); //operators with the same priority | |
checkParse("1 * 2 / 3 * 4", "(* (/ (* 1 2) 3) 4)"); | |
//right-associative operators | |
checkParse("1 . 2 . 3 . 4", "(. 1 (. 2 (. 3 4)))"); | |
checkParse("1 % 2 % 3 % 4", "(% 1 (% 2 (% 3 4)))"); | |
//mixed associativity among operators with the same precedence | |
checkParse("1 ~ 2 ~ 3 % 4", "(% (~ (~ 1 2) 3) 4)"); | |
checkParse("1 ~ 2 % 3 % 4", "(% (~ 1 2) (% 3 4))"); | |
checkParse("1 % 2 ~ 3 ~ 4", "(% 1 (~ (~ 2 3) 4))"); | |
checkParse("1 % 2 % 3 ~ 4", "(% 1 (% 2 (~ 3 4)))"); | |
//operator precedence | |
checkParse("1 + 2 ~ 3", "(+ 1 (~ 2 3))"); //plus is weaker than squiggle | |
checkParse("1 * 2 ~ 3", "(~ (* 1 2) 3)"); //star is stronger than squiggle | |
checkParse("1 + 2 * 3 + 4", "(+ (+ 1 (* 2 3)) 4)"); | |
checkParse("1 * 2 + 3 * 4", "(+ (* 1 2) (* 3 4))"); | |
} | |
public static void main(String[] args) { | |
new ParserExploration2_InfixOnly().go(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment