Skip to content

Instantly share code, notes, and snippets.

@blazs
Created January 13, 2018 18:10
Show Gist options
  • Save blazs/2c33b8be5add79211c4354ff55f9d226 to your computer and use it in GitHub Desktop.
Save blazs/2c33b8be5add79211c4354ff55f9d226 to your computer and use it in GitHub Desktop.
An old Java implementation of the Earley parser from student days.
// -- Description --
// Assume 'S' is always the start symbol
// You may assume that your method will be tested in the following setting:
// - grammar will contain between 1 and 100 strings,
// - each string will represent one production (possibly with multiple right
// hand sides, separated by | (see examples)),
// - word can be empty or up to 10000 terminal symbols (characters) long.
// -- References --
// [1] Jay Earley. An Efficient Context-Free Parsing Algorithm. Communications of the ACM, 1970.
// [2] John Aycock and Nigel Horspool. Practical Earley Parsing. Computer Journal, 2002.
//
import java.util.;
// Single production
// Assume nonterminal is always a single character from {A,B,...,Z}, and terminal is always a string over {a,b,...,z}
// prodHead -> prodRhs
class Production {
public Character prodHead; // head
public Character[] prodRhs; // right hand side
// Constructor
public Production(Character prodHead, Character[] prodRhs) {
this.prodHead = prodHead;
this.prodRhs = prodRhs;
}
public Production(String prodHead, String prodRhs) {
// assert(prodHead.length() == 1);
this.prodHead = prodHead.charAt(0);
this.prodRhs = toCharacterArray(prodRhs.toCharArray());
}
public boolean equals(Object o) {
if (this == o) {
return true;
} else if (!(o instanceof Production)) {
return false;
} else {
Production p = (Production)o;
return prodHead.equals(p.prodHead) && Arrays.equals(prodRhs, p.prodRhs);
}
}
// Helper
private static Character[] toCharacterArray(char[] array) {
Character[] chArray = new Character[array.length];
for (int i = 0; i < array.length; ++i) { chArray[i] = array[i]; }
return chArray;
}
}
// Let G = (V, T, P, S) be a context-free grammar
class Grammar {
public ArrayList<Production> productions; // P
// Constructor
public Grammar() {
productions = new ArrayList<Production>();
}
public Grammar(ArrayList<Production> productions) {
this.productions = productions;
}
public boolean addProduction(Production prod) {
return !productions.contains(prod) && productions.add(prod);
}
public boolean addProduction(Character prodHead, Character[] prodRhs) {
return addProduction(new Production(prodHead, prodRhs));
}
public boolean addProduction(String prodHead, String prodRhs) {
return addProduction(new Production(prodHead, prodRhs));
}
// Compute the set of nullable nonterminals: { A in V | A =>* eps }
// I decided to go with a simple fixed-point algorithm
public Set<Character> getNullable() {
Set<Character> nullSet = new TreeSet<Character>();
// find the ``base'' symbols --- all A in V such that A -> eps is in P
for (Production p : productions) {
if (p.prodRhs[0] == '~') { nullSet.add(p.prodHead); }
}
if (nullSet.size() != 0) {
boolean isNullable = true;
int crrSize = nullSet.size();
do {
crrSize = nullSet.size();
for (Production p : productions) {
isNullable = true;
for (Character c : p.prodRhs) {
if (c.isLowerCase(c) && c.isLetter(c) || !nullSet.contains(c)) {
isNullable = false; break;
}
}
if (isNullable) { nullSet.add(p.prodHead); }
}
} while (crrSize != nullSet.size());
}
return nullSet;
}
}
// State
class State {
public Production prod;
public int rhsIdx;
public int prevSet;
// Constructor
public State(Production prod, int rhsIdx, int prevSet) {
this.prod = prod; // production
this.rhsIdx = rhsIdx; // position of the dot on the right-hand-side of the production
this.prevSet = prevSet;
}
public boolean equals(Object o) {
if (this == o) {
return true;
} else if (!(o instanceof State)) {
return false;
} else {
State s = (State)o;
// System.out.println("[DEBUG] " + prod.equals(s.prod));
return rhsIdx == s.rhsIdx && prevSet == s.prevSet && prod.equals(s.prod);
}
}
}
public class Earley {
private Grammar g;
private ArrayList<LinkedList<State> > stateSets;
public Earley() {
g = new Grammar();
stateSets = new ArrayList<LinkedList<State> >();
}
// helper methods
private boolean isNonterminal(Character c) {
return Character.isUpperCase(c) || c.equals('@');
}
// prints state set contents
/*private void print() {
for (int idx = 0; idx < stateSets.size(); ++idx) {
System.out.println("------ State set " + idx + " -----------");
for (State s : stateSets.get(idx)) {
String rhs = Arrays.toString(s.prod.prodRhs).replaceAll(", ", "").substring(1, s.prod.prodRhs.length + 1);
System.out.println(s.prod.prodHead + " -> " + rhs + ", " + s.rhsIdx + " ; " + s.prevSet);
}
}
}*/
// If production (P, j) is in S_i and x_{i+1}= current terminal, then add P with rhsIdx+1 to S_{i+1}
private void scanner(State state, int crrIdx) {
State newState = new State(state.prod, state.rhsIdx+1, state.prevSet);
if (!stateSets.get(crrIdx+1).contains(newState)) {
stateSets.get(crrIdx+1).add(newState);
}
}
// We use the predictor from [2]; it nicely handles $\varepsilon$-productions
// For [A -> alpha o E beta, i], it adds [E -> o gamma, j] for all productions in S_i
// with E on the left-hand side
private void predictor(State state, int crrIdx, Set<Character> nullableVars) {
Character B = state.prod.prodRhs[state.rhsIdx];
for (Production p : g.productions) {
if (p.prodHead.equals(B)) {
State newState = new State(p, 0, crrIdx);
if (!stateSets.get(crrIdx).contains(newState)) {
stateSets.get(crrIdx).add(newState);
}
}
}
// Need this to handle $\varepsilon$-productions [2]
if (nullableVars.contains(B)) { // B is nullable, i.e., B =>* eps
State newState = new State(state.prod, state.rhsIdx+1, state.prevSet);
if (!stateSets.get(crrIdx).contains(newState)) {
stateSets.get(crrIdx).add(newState);
}
}
}
private void completer(State state, int crrIdx) {
int j = state.prevSet;
LinkedList<State> stateSet = stateSets.get(j);
for (int i = 0; i < stateSet.size(); ++i) {
State s = stateSet.get(i);
if (s.rhsIdx < s.prod.prodRhs.length && s.prod.prodRhs[s.rhsIdx].equals(state.prod.prodHead)) {
State newState = new State(s.prod, s.rhsIdx+1, s.prevSet);
if (!stateSets.get(crrIdx).contains(newState)) {
stateSets.get(crrIdx).add(newState);
}
}
}
}
private void initialize(int n) {
// add the initial state set S_0
Production newProd = new Production("@", "S");
LinkedList<State> initState = new LinkedList<State>();
initState.add(new State(newProd, 0, 0));
stateSets.add(initState);
for (int i = 0; i < n; ++i) {
stateSets.add(new LinkedList<State>());
}
}
private void process(int i, Character a, Set<Character> nullableVars) {
LinkedList<State> stateSet = stateSets.get(i);
int crrSize = 0;
do {
crrSize = stateSet.size();
// iterate over the states from the current state set
for (int j = 0; j < crrSize; ++j) {
State state = stateSet.get(j);
// apply either scanner, predictor, or completer
if (state.rhsIdx == state.prod.prodRhs.length) {
completer(state, i);
} else {
if (isNonterminal(state.prod.prodRhs[state.rhsIdx])) {
predictor(state, i, nullableVars);
} else if (state.prod.prodRhs[state.rhsIdx].equals(a)) {
scanner(state, i);
} // else Nothing left to do
}
}
} while (crrSize != stateSet.size());
}
public boolean solve(String[] grammar, String word) {
// load the grammar
for (String s : grammar) {
// System.out.println(s);
Character prodHead = s.charAt(0);
for (String rhs : s.split("->")[1].split("\\|")) {
g.addProduction(s, rhs);
}
}
// compute nullable symbols
Set<Character> nullableVars = g.getNullable();
// run the earley recognizer
initialize(word.length());
for (int i = 0; i < word.length(); ++i) {
int crrSet = i+1; // set index
Character a = word.charAt(i);
// Initialize the set S_{i+1} by applying the three operations to S_i until S_i stops changing
process(i, a, nullableVars);
}
process(word.length(), '/', nullableVars); // character '/' is never in T
State finalState = new State(new Production("@", "S"), 1, 0);
LinkedList<State> lastStateSet = stateSets.get(word.length());
// print();
for (State s : lastStateSet) {
if (s.equals(finalState)) {
return true;
}
}
return false;
}
}
import java.util.Arrays;
import java.util.HashSet;
public class Main {
public static boolean wordInGrammar(String g, String w) {
Earley ep = new Earley();
return ep.solve(g.split(";"), w);
}
public static void main(String [] args) {
/* Test our implementation of the Early parser. */
// wordInGrammar("S->aA|Aa|aAAA|AAAABBBABABABA;A->aA|bA|~;B->bbbA|ABA|A", "aaa");
// wordInGrammar("S->S+M|M;M->M*T|T;T->a|b|c|d", "a++b*c");
System.out.println("All palindromic binary strings of length 5:");
StringGenerator sg = new StringGenerator(new HashSet<>(Arrays.asList('0', '1')), 5);
for (String w : sg) {
if (wordInGrammar("S->0S0|1S1|0|1|~", w)) {
System.out.println(w);
}
}
}
}
import java.util.Set;
import java.util.Iterator;
import java.util.NoSuchElementException;
public class StringGenerator implements Iterable<String>, Iterator<String> {
private Character[] alphabet;
private int stringLength;
private int[] currentString;
private boolean isLast;
public StringGenerator(Set<Character> alphabet, int stringLength) {
if (alphabet == null || alphabet.isEmpty()) {
throw new IllegalArgumentException("Alphabet is a nonempty set of characters");
} else if (stringLength < 1) {
throw new IllegalArgumentException("String length must be positive");
}
this.alphabet = alphabet.toArray(new Character[0]);
this.stringLength = stringLength;
this.isLast = false;
this.currentString = new int[stringLength];
}
public Iterator<String> iterator() {
return this;
}
public String next() {
if (!hasNext()) {
throw new NoSuchElementException("Iterator exhausted");
}
StringBuilder sb = new StringBuilder(stringLength);
for (int idx : this.currentString) {
sb.append(this.alphabet[idx]);
}
prepareNext();
return sb.toString();
}
public boolean hasNext() {
return !this.isLast;
}
private void prepareNext() {
final int n = alphabet.length;
int index = stringLength - 1;
while (index >= 0 && currentString[index] == n-1) {
index--;
}
if (index < 0) {
this.isLast = true;
return;
}
this.currentString[index] += 1;
for (int j = stringLength-1; j > index; j--) {
this.currentString[j] = 0;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment