Skip to content

Instantly share code, notes, and snippets.

@caudamus
Last active September 14, 2025 14:50
Show Gist options
  • Select an option

  • Save caudamus/1294f197cbb3fc16c1a82e7e2f1d0ec4 to your computer and use it in GitHub Desktop.

Select an option

Save caudamus/1294f197cbb3fc16c1a82e7e2f1d0ec4 to your computer and use it in GitHub Desktop.
BNF Parser

BNF parsing

Toy weekend project: make a parser-parser!

This takes in a grammar given in Bakus-Naur Form (BNF), and a program that was written in the provided grammar, and produces an image of Syntax Tree using graphviz.

References:

Examples

Simple arithmetic grammar from chapter 2 of the dragon book.

  • simple.bnf - simple arithmetic grammar
  • simple.txt - example arithmetic
  • simple.png - generated with ./bnf.py simple.bnf simple.txt simple

Grammar for the grammar definitions themselves, which is both grammar and program.

  • bnf.bnf - grammar for BNF grammars
  • bnf.png - generated with ./bnf.py bnf.bnf bnf.bnf bnf

Toy Language (TL) from University of Texas at San Antonio.

  • tl.bnf - grammar for TL
  • example.tl - example program in TL
  • tl.png - generated with ./bnf.py tl.bnf example.tl tl
grammar = rule grammar | ;
rule = ruleName "=" expression ";" ;
expression = list "|" expression | list | ;
list = term list | ;
term = literal | ruleName ;
ruleName = identifier ;
#!/usr/bin/env python
import re
import argparse
from collections import namedtuple
from graphviz import Digraph
# Utility types to keep things a little bit neater
Token = namedtuple('Token', ['token', 'value'])
Rule = namedtuple('Rule', ['name', 'expression'])
Grammar = namedtuple('Grammar', ['root', 'rules'])
Tree = namedtuple('Tree', ['root', 'children', 'length'])
terminals = ['identifier', 'integer', 'symbol', 'literal']
def tokenize(s):
''' Return a list of tokens from a string '''
tokens = []
offset = 0 # Offset into string
s = re.sub(r'%[^\n]*\n', '', s) # Remove comments
while offset < len(s):
# Newlines and spaces (ignored)
if s[offset] == '\n' or s[offset] == ' ':
offset = offset + 1
continue
# Identifiers
match = re.match(r'^[a-z_A-Z][a-zA-Z0-9]*', s[offset:])
if match is not None:
tokens.append(Token('identifier', match.group()))
offset = offset + len(match.group())
continue
# Integers
match = re.match(r'^[1-9][0-9]*|0', s[offset:])
if match is not None:
tokens.append(Token('integer', match.group()))
offset = offset + len(match.group())
continue
# Symbols
match = re.match(r'^[=|;:!<>\+\-\*\/\(\)]+', s[offset:])
if match is not None:
tokens.append(Token('symbol', match.group()))
offset = offset + len(match.group())
continue
# Literals
match = re.match(r'^"([^"\n]*)"', s[offset:])
if match is not None:
tokens.append(Token('literal', match.groups()[0]))
offset = offset + len(match.group())
continue
# Error case
raise ValueError('Unable to tokenize: {}'.format(s[offset:]))
return tokens
def is_term(token, symbol):
''' Return true if token is the given terminal symbol '''
return token.token == 'symbol' and token.value == symbol
def parse_grammar(tokens):
''' Parse a list of tokens into a BNF grammar '''
rules = {}
total = 0
first_rule = None
while total < len(tokens):
# Parse rule name
assert tokens[total].token == 'identifier', '{} {}'.format(
tokens[total], total)
name = tokens[total].value
if first_rule is None:
first_rule = name
total += 1
# Parse assignment symbol
assert is_term(tokens[total], '=')
total += 1
# Parse expression
expression = []
while True:
# Parse list
list_ = []
while not (is_term(tokens[total], '|') or is_term(tokens[total], ';')):
list_.append(tokens[total].value)
total += 1
expression.append(list_)
if is_term(tokens[total], ';'):
break
if is_term(tokens[total], '|'):
total += 1
# Parse terminating semicolon
assert is_term(tokens[total], ';')
total += 1
# Add rule to list
rules[name] = expression
return Grammar(first_rule, rules)
def parse(grammar, tokens, items=None):
''' Given a grammar, parse a list of tokens into syntax trees given nonterminals '''
if items is None:
items = [grammar.root]
if len(items) == 0:
return []
if len(tokens) == 0:
return None
item, *tail = items
if item not in grammar.rules:
if ((item in terminals and tokens[0].token == item) or
(item == tokens[0].value and tokens[0].token in terminals)):
tail_trees = parse(grammar, tokens[1:], tail)
if tail_trees is not None:
root = item
if item == tokens[0].token and tokens[0].token != 'literal':
root = root + '\n' + tokens[0].value
return [Tree(root, [], 1)] + tail_trees
return None
for exp in grammar.rules[item]:
exp_trees = parse(grammar, tokens, exp)
if exp_trees is None:
continue
offset = sum(tree.length for tree in exp_trees)
tail_trees = parse(grammar, tokens[offset:], tail)
if tail_trees is None:
continue
return [Tree(item, exp_trees, offset)] + tail_trees
return None
def render(tree, filepath):
''' Draw the syntax tree as a graph to an output image file '''
dot = Digraph(format='png')
to_visit = [tree]
while to_visit:
node = to_visit.pop(0)
dot.node(str(id(node)), label=str(node.root))
for child in node.children:
if isinstance(child, Tree):
dot.edge(str(id(node)), str(id(child)))
to_visit.append(child)
dot.render(filepath, cleanup=True)
def main():
parser = argparse.ArgumentParser(description='Make syntax tree graphs')
parser.add_argument('grammar_file', help='BNF-encoded grammar to parse')
parser.add_argument('program_file', help='program to parse with grammar')
parser.add_argument('output_image', help='image file to save syntax tree')
args = parser.parse_args()
grammar = parse_grammar(tokenize(open(args.grammar_file).read()))
program = tokenize(open(args.program_file).read())
syntax, = parse(grammar, program)
render(syntax, args.output_image)
if __name__ == '__main__':
main()
program
var N as int ;
var SQRT as int ;
begin
N := readInt ;
SQRT := 0 ;
% go until SQRT exceeds the square root of N
while SQRT * SQRT <= N do
SQRT := SQRT + 1 ;
end ;
SQRT := SQRT - 1 ; % subtract one SQRT is <= sqrt(N)
writeInt SQRT ;
end
expr = term "+" expr | term "-" expr | term ;
term = factor "*" term | factor "/" term | factor ;
factor = integer | "(" expr ")" ;
9 / 3 + 5 * (2 - 0)
programText = "program" declarations "begin" statementSequence "end" ;
declarations = "var" identifier "as" type ";" declarations | ;
type = "int" | "bool" ;
statementSequence = statement ";" statementSequence | ;
statement = assignment | ifStatement | whileStatement | writeIntExpr ;
assignment = identifier ":=" "readInt" | identifier ":=" expression ;
ifStatement = "if" expression "then" statementSequence elseClause "end" ;
elseClause = "else" statementSequence | ;
whileStatement = "while" expression "do" statementSequence "end" ;
writeIntExpr = "writeInt" expression ;
expression = simpleExpression | simpleExpression compare expression ;
simpleExpression = term additive simpleExpression | term ;
term = factor multiplicative term | factor ;
factor = identifier | integer | boolean | "(" expression ")" ;
compare = "=" | "!=" | "<" | ">" | "<=" | ">=" ;
additive = "+" | "-" ;
multiplicative = "*" | "div" | "mod" ;
boolean = "true" | "false" ;
@anhldbk
Copy link
Copy Markdown

anhldbk commented Jan 11, 2022

Nice. Thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment