Skip to content

Instantly share code, notes, and snippets.

@d0rc
Created September 25, 2024 02:08
Show Gist options
  • Save d0rc/2626935cf6bcf94126c802a4080526a8 to your computer and use it in GitHub Desktop.
Save d0rc/2626935cf6bcf94126c802a4080526a8 to your computer and use it in GitHub Desktop.
PHP subset parser / lexer in Go
package main
import (
"fmt"
"strconv"
)
// TokenType identifies the lexical category of a Token.
type TokenType int

// Token categories produced by the Lexer. The values follow declaration
// order starting at zero, so the zero value of TokenType is TOKEN_EOF.
const (
	TOKEN_EOF TokenType = iota
	TOKEN_IDENTIFIER
	TOKEN_STRING
	TOKEN_NUMBER
	TOKEN_EQUALS
	TOKEN_SEMICOLON
	TOKEN_DOLLAR
	TOKEN_LPAREN
	TOKEN_RPAREN
	TOKEN_COMMA
	TOKEN_ECHO
	TOKEN_PHP_START
	TOKEN_PHP_END
	TOKEN_DOT
)

// String returns a readable name for the token type. It makes TokenType a
// fmt.Stringer, so formatting a value with %s or %v in a diagnostic prints
// the name instead of a malformed-verb marker or a bare number. Values
// outside the declared constants are rendered as "TokenType(N)".
func (t TokenType) String() string {
	switch t {
	case TOKEN_EOF:
		return "EOF"
	case TOKEN_IDENTIFIER:
		return "IDENTIFIER"
	case TOKEN_STRING:
		return "STRING"
	case TOKEN_NUMBER:
		return "NUMBER"
	case TOKEN_EQUALS:
		return "EQUALS"
	case TOKEN_SEMICOLON:
		return "SEMICOLON"
	case TOKEN_DOLLAR:
		return "DOLLAR"
	case TOKEN_LPAREN:
		return "LPAREN"
	case TOKEN_RPAREN:
		return "RPAREN"
	case TOKEN_COMMA:
		return "COMMA"
	case TOKEN_ECHO:
		return "ECHO"
	case TOKEN_PHP_START:
		return "PHP_START"
	case TOKEN_PHP_END:
		return "PHP_END"
	case TOKEN_DOT:
		return "DOT"
	}
	return "TokenType(" + strconv.Itoa(int(t)) + ")"
}
// Token is a single lexical token produced by the Lexer.
type Token struct {
// Type is the category of the token (one of the TOKEN_* constants).
Type TokenType
// Literal is the text carried by the token. For string tokens it is the
// content between the quotes; for TOKEN_EOF at end of input it is empty.
Literal string
}
// Lexer performs lexical analysis over an input string, one byte at a time.
type Lexer struct {
// input is the full source text being tokenized.
input string
// position is the index in input of the byte currently held in ch.
position int
// readPosition is the index in input of the next byte readChar will load.
readPosition int
// ch is the byte under examination; readChar sets it to 0 once the input
// is exhausted.
ch byte
}
// NewLexer returns a Lexer over input with its first byte already loaded,
// so NextToken can be called immediately.
func NewLexer(input string) *Lexer {
	lexer := new(Lexer)
	lexer.input = input
	lexer.readChar() // prime ch/position with the first byte
	return lexer
}
// readChar advances the lexer by one byte: ch becomes the byte at
// readPosition (or 0 when the input is exhausted), position is moved onto
// that index and readPosition onto the one after it.
func (l *Lexer) readChar() {
	next := l.readPosition

	l.ch = 0
	if next < len(l.input) {
		l.ch = l.input[next]
	}

	l.position = next
	l.readPosition = next + 1
}
// NextToken skips any leading whitespace and returns the next token in the
// input. Once the input is exhausted it returns a TOKEN_EOF token with an
// empty literal.
//
// Multi-character tokens (identifiers, numbers, "<?php" and "?>") consume
// exactly their own bytes and return immediately, so the byte following them
// is never skipped. "<?php" is only recognised when all five bytes match.
//
// NOTE(review): there is no dedicated "illegal" token type, so a byte the
// lexer does not recognise is returned as a TOKEN_EOF-typed token carrying
// that byte as its literal; a parser that stops on TOKEN_EOF will stop there.
func (l *Lexer) NextToken() Token {
	const (
		phpOpen  = "<?php"
		phpClose = "?>"
	)

	l.skipWhitespace()

	var tok Token
	switch l.ch {
	case '=':
		tok = Token{Type: TOKEN_EQUALS, Literal: string(l.ch)}
	case ';':
		tok = Token{Type: TOKEN_SEMICOLON, Literal: string(l.ch)}
	case '$':
		tok = Token{Type: TOKEN_DOLLAR, Literal: string(l.ch)}
	case '(':
		tok = Token{Type: TOKEN_LPAREN, Literal: string(l.ch)}
	case ')':
		tok = Token{Type: TOKEN_RPAREN, Literal: string(l.ch)}
	case ',':
		tok = Token{Type: TOKEN_COMMA, Literal: string(l.ch)}
	case '.':
		tok = Token{Type: TOKEN_DOT, Literal: string(l.ch)}
	case '"':
		tok.Type = TOKEN_STRING
		// readString stops on the closing quote; the readChar at the bottom
		// of this function steps past it.
		tok.Literal = l.readString()
	case 0:
		tok.Type = TOKEN_EOF
		tok.Literal = ""
	default:
		// ch is non-zero here, so position is a valid index into input.
		rest := l.input[l.position:]
		switch {
		case isLetter(l.ch):
			tok.Literal = l.readIdentifier()
			tok.Type = l.lookupIdentifier(tok.Literal)
			return tok // readIdentifier already moved past the identifier
		case isDigit(l.ch):
			tok.Type = TOKEN_NUMBER
			tok.Literal = l.readNumber()
			return tok // readNumber already moved past the number
		case len(rest) >= len(phpOpen) && rest[:len(phpOpen)] == phpOpen:
			for i := 0; i < len(phpOpen); i++ {
				l.readChar()
			}
			return Token{Type: TOKEN_PHP_START, Literal: phpOpen}
		case len(rest) >= len(phpClose) && rest[:len(phpClose)] == phpClose:
			for i := 0; i < len(phpClose); i++ {
				l.readChar()
			}
			return Token{Type: TOKEN_PHP_END, Literal: phpClose}
		default:
			tok = Token{Type: TOKEN_EOF, Literal: string(l.ch)}
		}
	}

	l.readChar() // step past the single byte (or closing quote) just handled
	return tok
}
// skipWhitespace advances past any run of spaces, tabs, line feeds and
// carriage returns, leaving ch on the first byte that is none of those.
func (l *Lexer) skipWhitespace() {
	for {
		switch l.ch {
		case ' ', '\t', '\n', '\r':
			l.readChar()
		default:
			return
		}
	}
}
// readIdentifier consumes a run of letters and digits beginning at the
// current byte and returns it. On return ch is the first byte after the run.
func (l *Lexer) readIdentifier() string {
	start := l.position
	for {
		if !isLetter(l.ch) && !isDigit(l.ch) {
			break
		}
		l.readChar()
	}
	return l.input[start:l.position]
}
// readNumber consumes a numeric literal beginning at the current byte and
// returns its text. The literal is a run of digits, optionally followed by a
// fractional part: a '.' that is immediately followed by at least one digit.
// A '.' not followed by a digit is left in the input so it can still be lexed
// as its own token. On return ch is the first byte after the literal.
func (l *Lexer) readNumber() string {
	start := l.position
	for isDigit(l.ch) {
		l.readChar()
	}

	// Only treat '.' as a decimal point when a digit follows it.
	if l.ch == '.' && isDigit(l.peekChar()) {
		l.readChar() // consume '.'
		for isDigit(l.ch) {
			l.readChar()
		}
	}

	return l.input[start:l.position]
}
// readString reads a double-quoted string literal. It must be called with ch
// on the opening quote. It returns the raw text between the quotes and leaves
// ch on the closing quote, or on 0 if the input ends before the literal is
// closed.
//
// A backslash escapes the byte that follows it, so an escaped quote (\") does
// not terminate the literal. Escape sequences are not decoded: the returned
// text still contains the backslashes.
func (l *Lexer) readString() string {
	start := l.position + 1
	for {
		l.readChar()
		if l.ch == '\\' {
			// Skip the escaped byte so it cannot be mistaken for the
			// closing quote.
			l.readChar()
			if l.ch == 0 {
				break
			}
			continue
		}
		if l.ch == '"' || l.ch == 0 {
			break
		}
	}
	return l.input[start:l.position]
}
// peekChar returns the byte at readPosition without advancing the lexer, or
// 0 when readPosition is at or beyond the end of the input.
func (l *Lexer) peekChar() byte {
	if next := l.readPosition; next < len(l.input) {
		return l.input[next]
	}
	return 0
}
// lookupIdentifier classifies an identifier: the exact text "echo" is the
// TOKEN_ECHO keyword, anything else is a plain TOKEN_IDENTIFIER.
func (l *Lexer) lookupIdentifier(ident string) TokenType {
	if ident == "echo" {
		return TOKEN_ECHO
	}
	return TOKEN_IDENTIFIER
}
// isLetter reports whether ch is an ASCII letter or an underscore.
func isLetter(ch byte) bool {
	switch {
	case ch >= 'a' && ch <= 'z':
		return true
	case ch >= 'A' && ch <= 'Z':
		return true
	}
	return ch == '_'
}
// isDigit reports whether ch is an ASCII decimal digit.
func isDigit(ch byte) bool {
	// Byte subtraction wraps around, so anything below '0' becomes a large
	// value and fails the comparison just like anything above '9'.
	return ch-'0' < 10
}
// Node is implemented by every node in the AST.
type Node interface {
// TokenLiteral returns the literal text of the token the node is
// associated with.
TokenLiteral() string
}
// Statement is an AST node that represents a statement.
type Statement interface {
Node
// statementNode is an unexported marker method; it distinguishes
// statement nodes from expression nodes at compile time.
statementNode()
}
// Expression is an AST node that represents an expression.
type Expression interface {
Node
// expressionNode is an unexported marker method; it distinguishes
// expression nodes from statement nodes at compile time.
expressionNode()
}
// Program is the root node of the AST: the ordered list of statements that
// were parsed.
type Program struct {
	Statements []Statement
}

// TokenLiteral returns the token literal of the first statement, or the
// empty string when the program has no statements.
func (p *Program) TokenLiteral() string {
	if len(p.Statements) == 0 {
		return ""
	}
	return p.Statements[0].TokenLiteral()
}
// ExpressionStatement is a statement that consists of a single expression.
type ExpressionStatement struct {
	// Token is the token the statement started on.
	Token Token
	// Expression is the parsed expression; it may be nil when the parser
	// could not produce one.
	Expression Expression
}

// statementNode marks ExpressionStatement as a Statement.
func (s *ExpressionStatement) statementNode() {}

// TokenLiteral returns the literal of the statement's token.
func (s *ExpressionStatement) TokenLiteral() string {
	return s.Token.Literal
}
// AssignmentStatement is a variable assignment of the form `$name = value`.
type AssignmentStatement struct {
	// Token is the token the statement started on; the parser stores the
	// leading '$' token here.
	Token Token
	// Name is the variable being assigned to.
	Name *Identifier
	// Value is the expression on the right-hand side of '='.
	Value Expression
}

// statementNode marks AssignmentStatement as a Statement.
func (a *AssignmentStatement) statementNode() {}

// TokenLiteral returns the literal of the statement's token.
func (a *AssignmentStatement) TokenLiteral() string {
	return a.Token.Literal
}
// EchoStatement is an `echo <expression>` statement.
type EchoStatement struct {
	// Token is the token the statement started on (normally the echo
	// keyword).
	Token Token
	// Value is the expression whose result is echoed.
	Value Expression
}

// statementNode marks EchoStatement as a Statement.
func (e *EchoStatement) statementNode() {}

// TokenLiteral returns the literal of the statement's token.
func (e *EchoStatement) TokenLiteral() string {
	return e.Token.Literal
}
// Identifier is an expression node that names something, such as a variable.
type Identifier struct {
	// Token is the token the identifier was built from.
	Token Token
	// Value is the identifier text, taken from the token's literal.
	Value string
}

// expressionNode marks Identifier as an Expression.
func (id *Identifier) expressionNode() {}

// TokenLiteral returns the literal of the identifier's token.
func (id *Identifier) TokenLiteral() string {
	return id.Token.Literal
}
// StringLiteral is an expression node holding a string constant.
type StringLiteral struct {
	// Token is the string token the literal was built from.
	Token Token
	// Value is the string content, taken from the token's literal.
	Value string
}

// expressionNode marks StringLiteral as an Expression.
func (s *StringLiteral) expressionNode() {}

// TokenLiteral returns the literal of the string's token.
func (s *StringLiteral) TokenLiteral() string {
	return s.Token.Literal
}
// NumberLiteral is an expression node holding a numeric constant.
type NumberLiteral struct {
	// Token is the number token the literal was built from.
	Token Token
	// Value is the numeric value of the literal as a float64.
	Value float64
}

// expressionNode marks NumberLiteral as an Expression.
func (n *NumberLiteral) expressionNode() {}

// TokenLiteral returns the literal of the number's token.
func (n *NumberLiteral) TokenLiteral() string {
	return n.Token.Literal
}
// CallExpression is an expression node that applies Function to Arguments.
// The echo parser also uses it to represent "." concatenation: the left
// operand is stored in Function and the right operand is the only argument.
type CallExpression struct {
	// Token is the token associated with the call.
	Token Token
	// Function is the expression being applied.
	Function Expression
	// Arguments are the operands passed to Function, in order.
	Arguments []Expression
}

// expressionNode marks CallExpression as an Expression.
func (c *CallExpression) expressionNode() {}

// TokenLiteral returns the literal of the call's token.
func (c *CallExpression) TokenLiteral() string {
	return c.Token.Literal
}
// Parser builds an AST from the tokens produced by a Lexer.
type Parser struct {
// l is the lexer that supplies tokens.
l *Lexer
// curToken is the token currently being examined.
curToken Token
// peekToken is the token that follows curToken.
peekToken Token
// errors collects the messages recorded while parsing.
errors []string
}
// NewParser returns a Parser reading from l, with curToken and peekToken
// already loaded with the first two tokens of the input.
func NewParser(l *Lexer) *Parser {
	parser := &Parser{errors: []string{}}
	parser.l = l

	// The first advance fills peekToken; the second shifts it into curToken
	// and fills peekToken again.
	for i := 0; i < 2; i++ {
		parser.nextToken()
	}
	return parser
}
// nextToken advances the parser by one token: peekToken becomes curToken and
// a fresh token is read from the lexer into peekToken.
func (p *Parser) nextToken() {
	upcoming := p.l.NextToken()
	p.curToken = p.peekToken
	p.peekToken = upcoming
}
// ParseProgram parses the whole input and returns the resulting Program.
//
// Tokens outside a <?php ... ?> region are skipped. Inside a region, a
// statement is parsed whenever the current token is `echo` or `$`; any other
// token is skipped. Problems encountered on the way are recorded in p.errors,
// and a statement that failed to parse is not added to the program.
func (p *Parser) ParseProgram() *Program {
	program := &Program{}
	program.Statements = []Statement{}

	for p.curToken.Type != TOKEN_EOF {
		if p.curToken.Type != TOKEN_PHP_START {
			p.nextToken()
			continue
		}

		p.nextToken() // consume <?php
		for p.curToken.Type != TOKEN_PHP_END && p.curToken.Type != TOKEN_EOF {
			switch p.curToken.Type {
			case TOKEN_ECHO:
				program.Statements = append(program.Statements, p.parseEchoStatement())
			case TOKEN_DOLLAR:
				// A failed parse yields a nil *AssignmentStatement. Appending
				// it would store a non-nil Statement wrapping a nil pointer,
				// which panics as soon as a method is called on it.
				if stmt := p.parseAssignmentStatement(); stmt != nil {
					program.Statements = append(program.Statements, stmt)
				}
			}

			// A malformed statement can leave the parser sitting on ?> or
			// EOF; do not step over it, so the region still ends correctly.
			if p.curToken.Type != TOKEN_PHP_END && p.curToken.Type != TOKEN_EOF {
				p.nextToken()
			}
		}
	}
	return program
}
// parseStatement parses one statement starting at the current token and
// returns it, or nil when the statement could not be parsed.
//
// `echo` starts an echo statement and `$` starts an assignment; everything
// else, including a bare string literal, is parsed as an expression
// statement.
func (p *Parser) parseStatement() Statement {
	switch p.curToken.Type {
	case TOKEN_ECHO:
		return p.parseEchoStatement()
	case TOKEN_DOLLAR:
		// Return an untyped nil on failure: a nil *AssignmentStatement
		// stored in the Statement interface would compare non-nil.
		if stmt := p.parseAssignmentStatement(); stmt != nil {
			return stmt
		}
		return nil
	default:
		return p.parseExpressionStatement()
	}
}
// parseEchoStatement parses an echo statement starting at the current token.
// The token after it is parsed as the first operand; each following
// `. operand` pair is folded into a CallExpression whose token literal is
// ".", with the expression so far in Function and the new operand as the
// single argument. On return the current token is the last operand token.
func (p *Parser) parseEchoStatement() *EchoStatement {
	echo := &EchoStatement{Token: p.curToken}

	p.nextToken() // step onto the first operand
	echo.Value = p.parseExpression()

	for p.peekToken.Type == TOKEN_DOT {
		p.nextToken() // now on '.'
		p.nextToken() // now on the right-hand operand

		concat := &CallExpression{
			Token:    Token{Type: TOKEN_IDENTIFIER, Literal: "."},
			Function: echo.Value,
		}
		concat.Arguments = []Expression{p.parseExpression()}
		echo.Value = concat
	}
	return echo
}
// parseAssignmentStatement parses `$name = expression`. It must be called
// with the current token on '$', which is stored as the statement's Token.
//
// It returns nil and records a message in p.errors when '$' is not followed
// by a name, or the name is not followed by '='. The message reports the
// current token, which is the one that was actually checked. On success the
// current token is left where parseExpression leaves it.
func (p *Parser) parseAssignmentStatement() *AssignmentStatement {
	stmt := &AssignmentStatement{Token: p.curToken}

	p.nextToken() // step from '$' onto the variable name
	// Keywords such as echo are lexed as their own token type but are still
	// accepted as variable names after '$'.
	if t := p.curToken.Type; t != TOKEN_IDENTIFIER && t != TOKEN_ECHO {
		msg := fmt.Sprintf("expected variable name after '$', got %q instead", p.curToken.Literal)
		p.errors = append(p.errors, msg)
		return nil
	}
	stmt.Name = &Identifier{Token: p.curToken, Value: p.curToken.Literal}

	p.nextToken() // step from the name onto what should be '='
	if p.curToken.Type != TOKEN_EQUALS {
		msg := fmt.Sprintf("expected '=' after variable $%s, got %q instead", stmt.Name.Value, p.curToken.Literal)
		p.errors = append(p.errors, msg)
		return nil
	}

	p.nextToken() // step from '=' onto the first token of the value
	stmt.Value = p.parseExpression()
	return stmt
}
// parseExpressionStatement wraps the expression starting at the current
// token in an ExpressionStatement whose Token is that starting token.
func (p *Parser) parseExpressionStatement() *ExpressionStatement {
	first := p.curToken
	return &ExpressionStatement{
		Token:      first,
		Expression: p.parseExpression(),
	}
}
// parseExpression parses the primary expression at the current token: an
// identifier, a string literal, a number literal, or a `$name` variable.
// It returns nil for any other token.
//
// For a variable, the '$' and the following name are consumed together and
// the returned Identifier carries the name (not "$"). The current token is
// left on the name, so a caller inspecting peekToken sees what follows the
// variable. When '$' is not followed by a name, an error is recorded and nil
// is returned.
func (p *Parser) parseExpression() Expression {
	switch p.curToken.Type {
	case TOKEN_IDENTIFIER:
		return &Identifier{Token: p.curToken, Value: p.curToken.Literal}
	case TOKEN_STRING:
		return &StringLiteral{Token: p.curToken, Value: p.curToken.Literal}
	case TOKEN_NUMBER:
		return p.parseNumberLiteral()
	case TOKEN_DOLLAR:
		// Keywords such as echo are lexed as their own token type but are
		// still accepted as variable names after '$'.
		if t := p.peekToken.Type; t != TOKEN_IDENTIFIER && t != TOKEN_ECHO {
			msg := fmt.Sprintf("expected variable name after '$', got %q instead", p.peekToken.Literal)
			p.errors = append(p.errors, msg)
			return nil
		}
		p.nextToken() // step from '$' onto the variable name
		return &Identifier{Token: p.curToken, Value: p.curToken.Literal}
	default:
		return nil
	}
}
// parseNumberLiteral converts the current token's literal to a float64 and
// returns it as a NumberLiteral. If the conversion fails, a message is
// appended to p.errors and nil is returned.
func (p *Parser) parseNumberLiteral() Expression {
	tok := p.curToken

	parsed, err := strconv.ParseFloat(tok.Literal, 64)
	if err != nil {
		p.errors = append(p.errors, fmt.Sprintf("could not parse %q as float", tok.Literal))
		return nil
	}

	return &NumberLiteral{Token: tok, Value: parsed}
}
// expectPeek advances to the next token and returns true when that token has
// type t. Otherwise it records an error via peekError, leaves the parser
// where it is and returns false.
func (p *Parser) expectPeek(t TokenType) bool {
	if p.peekToken.Type != t {
		p.peekError(t)
		return false
	}
	p.nextToken()
	return true
}
// peekError records that the next token was expected to have type t but
// peekToken has a different type.
//
// The types are formatted with %v: TokenType is an integer type, so %s is
// only valid for it when it implements fmt.Stringer, whereas %v prints the
// value either way.
func (p *Parser) peekError(t TokenType) {
	msg := fmt.Sprintf("expected next token to be %v, got %v instead",
		t, p.peekToken.Type)
	p.errors = append(p.errors, msg)
}
// For testing and demonstration
func main() {
input := `<?php
$name = "World";
echo "Hello, " . $name;
$age = 30;
echo "Age: " . $age;
?>`
l := NewLexer(input)
p := NewParser(l)
program := p.ParseProgram()
if len(p.errors) != 0 {
for _, err := range p.errors {
fmt.Println("parser error:", err)
}
return
}
fmt.Println("AST:")
for _, stmt := range program.Statements {
fmt.Printf("%T\n", stmt)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment