PHP subset parser / lexer in Go

package main

import (
    "fmt"
    "strconv"
)

// TokenType represents the type of a token
type TokenType int

const (
    TOKEN_EOF TokenType = iota
    TOKEN_ILLEGAL // returned for characters the lexer does not recognize
    TOKEN_IDENTIFIER
    TOKEN_STRING
    TOKEN_NUMBER
    TOKEN_EQUALS
    TOKEN_SEMICOLON
    TOKEN_DOLLAR
    TOKEN_LPAREN
    TOKEN_RPAREN
    TOKEN_COMMA
    TOKEN_ECHO
    TOKEN_PHP_START
    TOKEN_PHP_END
    TOKEN_DOT
)

// Token represents a lexical token
type Token struct {
    Type    TokenType
    Literal string
}

// Lexer performs lexical analysis
type Lexer struct {
    input        string
    position     int
    readPosition int
    ch           byte
}

// NewLexer creates a new Lexer
func NewLexer(input string) *Lexer {
    l := &Lexer{input: input}
    l.readChar()
    return l
}

// readChar reads the next character
func (l *Lexer) readChar() {
    if l.readPosition >= len(l.input) {
        l.ch = 0
    } else {
        l.ch = l.input[l.readPosition]
    }
    l.position = l.readPosition
    l.readPosition++
}

// NextToken returns the next token
func (l *Lexer) NextToken() Token {
    var tok Token
    l.skipWhitespace()
    switch l.ch {
    case '=':
        tok = Token{Type: TOKEN_EQUALS, Literal: string(l.ch)}
    case ';':
        tok = Token{Type: TOKEN_SEMICOLON, Literal: string(l.ch)}
    case '$':
        tok = Token{Type: TOKEN_DOLLAR, Literal: string(l.ch)}
    case '(':
        tok = Token{Type: TOKEN_LPAREN, Literal: string(l.ch)}
    case ')':
        tok = Token{Type: TOKEN_RPAREN, Literal: string(l.ch)}
    case ',':
        tok = Token{Type: TOKEN_COMMA, Literal: string(l.ch)}
    case '.':
        tok = Token{Type: TOKEN_DOT, Literal: string(l.ch)}
    case '"':
        tok.Type = TOKEN_STRING
        tok.Literal = l.readString()
    case 0:
        tok.Literal = ""
        tok.Type = TOKEN_EOF
    default:
        if isLetter(l.ch) {
            tok.Literal = l.readIdentifier()
            tok.Type = l.lookupIdentifier(tok.Literal)
            return tok
        } else if isDigit(l.ch) {
            tok.Type = TOKEN_NUMBER
            tok.Literal = l.readNumber()
            return tok
        } else if l.ch == '<' && l.peekChar() == '?' {
            l.readChar() // consume '<'
            l.readChar() // consume '?'
            if l.ch == 'p' && l.peekChar() == 'h' {
                l.readChar() // consume 'p'
                l.readChar() // consume 'h'
                l.readChar() // consume the final 'p'
                // return here so the readChar below does not skip the
                // character that follows the "<?php" tag
                return Token{Type: TOKEN_PHP_START, Literal: "<?php"}
            }
            return Token{Type: TOKEN_ILLEGAL, Literal: "<?"}
        } else if l.ch == '?' && l.peekChar() == '>' {
            l.readChar() // consume '?'
            l.readChar() // consume '>'
            return Token{Type: TOKEN_PHP_END, Literal: "?>"}
        } else {
            tok = Token{Type: TOKEN_ILLEGAL, Literal: string(l.ch)}
        }
    }
    l.readChar()
    return tok
}

func (l *Lexer) skipWhitespace() {
    for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
        l.readChar()
    }
}

func (l *Lexer) readIdentifier() string {
    position := l.position
    for isLetter(l.ch) || isDigit(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

func (l *Lexer) readNumber() string {
    position := l.position
    for isDigit(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

func (l *Lexer) readString() string {
    position := l.position + 1
    for {
        l.readChar()
        if l.ch == '"' || l.ch == 0 {
            break
        }
    }
    return l.input[position:l.position]
}

func (l *Lexer) peekChar() byte {
    if l.readPosition >= len(l.input) {
        return 0
    }
    return l.input[l.readPosition]
}

func (l *Lexer) lookupIdentifier(ident string) TokenType {
    switch ident {
    case "echo":
        return TOKEN_ECHO
    default:
        return TOKEN_IDENTIFIER
    }
}

func isLetter(ch byte) bool {
    return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}

func isDigit(ch byte) bool {
    return '0' <= ch && ch <= '9'
}
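
// dumpTokens is a minimal sketch, not from the original gist, of how the lexer
// alone could be exercised; it assumes it is added to this same file, and the
// name dumpTokens is made up for illustration.
func dumpTokens(src string) {
    l := NewLexer(src)
    for {
        tok := l.NextToken()
        // TokenType has no String method, so the type prints as its numeric value.
        fmt.Printf("%d %q\n", tok.Type, tok.Literal)
        if tok.Type == TOKEN_EOF {
            break
        }
    }
}

// For an input such as `$x = 1;`, the expected sequence is TOKEN_DOLLAR,
// TOKEN_IDENTIFIER ("x"), TOKEN_EQUALS, TOKEN_NUMBER ("1"), TOKEN_SEMICOLON,
// and finally TOKEN_EOF.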

// Node represents a node in the AST
type Node interface {
    TokenLiteral() string
}

// Statement represents a statement node
type Statement interface {
    Node
    statementNode()
}

// Expression represents an expression node
type Expression interface {
    Node
    expressionNode()
}

// Program represents the root node of the AST
type Program struct {
    Statements []Statement
}

func (p *Program) TokenLiteral() string {
    if len(p.Statements) > 0 {
        return p.Statements[0].TokenLiteral()
    }
    return ""
}

// ExpressionStatement represents an expression statement
type ExpressionStatement struct {
    Token      Token
    Expression Expression
}

func (es *ExpressionStatement) statementNode()       {}
func (es *ExpressionStatement) TokenLiteral() string { return es.Token.Literal }

// AssignmentStatement represents an assignment statement
type AssignmentStatement struct {
    Token Token // the '$' token that starts the assignment
    Name  *Identifier
    Value Expression
}

func (as *AssignmentStatement) statementNode()       {}
func (as *AssignmentStatement) TokenLiteral() string { return as.Token.Literal }

// EchoStatement represents an echo statement
type EchoStatement struct {
    Token Token
    Value Expression
}

func (es *EchoStatement) statementNode()       {}
func (es *EchoStatement) TokenLiteral() string { return es.Token.Literal }

// Identifier represents an identifier
type Identifier struct {
    Token Token
    Value string
}

func (i *Identifier) expressionNode()      {}
func (i *Identifier) TokenLiteral() string { return i.Token.Literal }

// StringLiteral represents a string literal
type StringLiteral struct {
    Token Token
    Value string
}

func (sl *StringLiteral) expressionNode()      {}
func (sl *StringLiteral) TokenLiteral() string { return sl.Token.Literal }

// NumberLiteral represents a number literal
type NumberLiteral struct {
    Token Token
    Value float64
}

func (nl *NumberLiteral) expressionNode()      {}
func (nl *NumberLiteral) TokenLiteral() string { return nl.Token.Literal }

// CallExpression represents a function call
type CallExpression struct {
    Token     Token
    Function  Expression
    Arguments []Expression
}

func (ce *CallExpression) expressionNode()      {}
func (ce *CallExpression) TokenLiteral() string { return ce.Token.Literal }

// Parser parses tokens into an AST
type Parser struct {
    l         *Lexer
    curToken  Token
    peekToken Token
    errors    []string
}

// NewParser creates a new Parser
func NewParser(l *Lexer) *Parser {
    p := &Parser{l: l, errors: []string{}}
    p.nextToken()
    p.nextToken()
    return p
}

func (p *Parser) nextToken() {
    p.curToken = p.peekToken
    p.peekToken = p.l.NextToken()
}

func (p *Parser) ParseProgram() *Program {
    program := &Program{}
    program.Statements = []Statement{}
    for p.curToken.Type != TOKEN_EOF {
        if p.curToken.Type == TOKEN_PHP_START {
            p.nextToken() // consume <?php
            for p.curToken.Type != TOKEN_PHP_END && p.curToken.Type != TOKEN_EOF {
                if p.curToken.Type == TOKEN_ECHO {
                    stmt := p.parseEchoStatement()
                    program.Statements = append(program.Statements, stmt)
                } else if p.curToken.Type == TOKEN_DOLLAR {
                    stmt := p.parseAssignmentStatement()
                    program.Statements = append(program.Statements, stmt)
                }
                p.nextToken()
            }
        } else {
            p.nextToken()
        }
    }
    return program
}

// parseStatement dispatches on the current token; note that ParseProgram above
// does not currently call it.
func (p *Parser) parseStatement() Statement {
    switch p.curToken.Type {
    case TOKEN_ECHO:
        return p.parseEchoStatement()
    case TOKEN_DOLLAR:
        return p.parseAssignmentStatement()
    case TOKEN_STRING:
        return p.parseEchoStatement()
    default:
        return p.parseExpressionStatement()
    }
}

func (p *Parser) parseEchoStatement() *EchoStatement {
    stmt := &EchoStatement{Token: p.curToken}
    p.nextToken()
    stmt.Value = p.parseExpression()
    // String concatenation with '.' is folded into a CallExpression: the
    // left-hand side goes in Function and the right-hand side in Arguments.
    for p.peekToken.Type == TOKEN_DOT {
        p.nextToken() // consume '.'
        p.nextToken() // move to the token after '.'
        stmt.Value = &CallExpression{Token: Token{Type: TOKEN_IDENTIFIER, Literal: "."}, Function: stmt.Value, Arguments: []Expression{p.parseExpression()}}
    }
    return stmt
}

func (p *Parser) parseAssignmentStatement() *AssignmentStatement {
    stmt := &AssignmentStatement{Token: p.curToken}
    p.nextToken() // consume '$'
    stmt.Name = &Identifier{Token: p.curToken, Value: p.curToken.Literal}
    p.nextToken() // consume identifier
    if p.curToken.Type != TOKEN_EQUALS {
        p.peekError(TOKEN_EQUALS)
        return nil
    }
    p.nextToken() // consume '='
    stmt.Value = p.parseExpression()
    return stmt
}

func (p *Parser) parseExpressionStatement() *ExpressionStatement {
    stmt := &ExpressionStatement{Token: p.curToken}
    stmt.Expression = p.parseExpression()
    return stmt
}

func (p *Parser) parseExpression() Expression {
    switch p.curToken.Type {
    case TOKEN_IDENTIFIER:
        return &Identifier{Token: p.curToken, Value: p.curToken.Literal}
    case TOKEN_STRING:
        return &StringLiteral{Token: p.curToken, Value: p.curToken.Literal}
    case TOKEN_NUMBER:
        return p.parseNumberLiteral()
    case TOKEN_DOLLAR:
        // A '$' introduces a variable: advance to the identifier that follows
        // so the resulting node carries the variable name rather than "$".
        p.nextToken()
        return &Identifier{Token: p.curToken, Value: p.curToken.Literal}
    default:
        return nil
    }
}

func (p *Parser) parseNumberLiteral() Expression {
    lit := &NumberLiteral{Token: p.curToken}
    value, err := strconv.ParseFloat(p.curToken.Literal, 64)
    if err != nil {
        msg := fmt.Sprintf("could not parse %q as float", p.curToken.Literal)
        p.errors = append(p.errors, msg)
        return nil
    }
    lit.Value = value
    return lit
}

func (p *Parser) expectPeek(t TokenType) bool {
    if p.peekToken.Type == t {
        p.nextToken()
        return true
    }
    p.peekError(t)
    return false
}

func (p *Parser) peekError(t TokenType) {
    // TokenType has no String method, so report the numeric token values.
    msg := fmt.Sprintf("expected next token to be %d, got %d instead",
        t, p.peekToken.Type)
    p.errors = append(p.errors, msg)
}

// For testing and demonstration
func main() {
    input := `<?php
$name = "World";
echo "Hello, " . $name;
$age = 30;
echo "Age: " . $age;
?>`
    l := NewLexer(input)
    p := NewParser(l)
    program := p.ParseProgram()
    if len(p.errors) != 0 {
        for _, err := range p.errors {
            fmt.Println("parser error:", err)
        }
        return
    }
    fmt.Println("AST:")
    for _, stmt := range program.Statements {
        fmt.Printf("%T\n", stmt)
    }
}
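
// The two helpers below are a minimal sketch, not from the original gist, of
// how the AST built above could be rendered back into PHP-like source. They
// assume they live in this same file; the names describeStmt and describeExpr
// are made up for illustration.
func describeStmt(s Statement) string {
    switch v := s.(type) {
    case *AssignmentStatement:
        return "$" + v.Name.Value + " = " + describeExpr(v.Value)
    case *EchoStatement:
        return "echo " + describeExpr(v.Value)
    default:
        return fmt.Sprintf("%T", s)
    }
}

func describeExpr(e Expression) string {
    switch v := e.(type) {
    case *Identifier:
        return "$" + v.Value
    case *StringLiteral:
        return `"` + v.Value + `"`
    case *NumberLiteral:
        return v.Token.Literal
    case *CallExpression:
        // Concatenation is stored with the left-hand side in Function and the
        // right-hand side in Arguments (see parseEchoStatement above).
        out := describeExpr(v.Function)
        for _, a := range v.Arguments {
            out += " . " + describeExpr(a)
        }
        return out
    default:
        return "?"
    }
}

// Calling fmt.Println(describeStmt(stmt)) inside main's loop should print
// lines along the lines of `$name = "World"` and `echo "Hello, " . $name`.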