Created
January 13, 2018 18:10
-
-
Save blazs/2c33b8be5add79211c4354ff55f9d226 to your computer and use it in GitHub Desktop.
An old Java implementation of the Earley parser from student days.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -- Description -- | |
// Assume 'S' is always the start symbol | |
// You may assume that your method will be tested in the following setting: | |
// - grammar will contain between 1 and 100 strings, | |
// - each string will represent one production (possibly with multiple right | |
// hand sides, separated by | (see examples)), | |
// - word can be empty or up to 10000 terminal symbols (characters) long. | |
// -- References -- | |
// [1] Jay Earley. An Efficient Context-Free Parsing Algorithm. Communications of the ACM, 1970. | |
// [2] John Aycock and Nigel Horspool. Practical Earley Parsing. Computer Journal, 2002. | |
// | |
import java.util.; | |
// Single production | |
// Assume nonterminal is always a single character from {A,B,...,Z}, and terminal is always a string over {a,b,...,z} | |
// prodHead -> prodRhs | |
class Production { | |
public Character prodHead; // head | |
public Character[] prodRhs; // right hand side | |
// Constructor | |
public Production(Character prodHead, Character[] prodRhs) { | |
this.prodHead = prodHead; | |
this.prodRhs = prodRhs; | |
} | |
public Production(String prodHead, String prodRhs) { | |
// assert(prodHead.length() == 1); | |
this.prodHead = prodHead.charAt(0); | |
this.prodRhs = toCharacterArray(prodRhs.toCharArray()); | |
} | |
public boolean equals(Object o) { | |
if (this == o) { | |
return true; | |
} else if (!(o instanceof Production)) { | |
return false; | |
} else { | |
Production p = (Production)o; | |
return prodHead.equals(p.prodHead) && Arrays.equals(prodRhs, p.prodRhs); | |
} | |
} | |
// Helper | |
private static Character[] toCharacterArray(char[] array) { | |
Character[] chArray = new Character[array.length]; | |
for (int i = 0; i < array.length; ++i) { chArray[i] = array[i]; } | |
return chArray; | |
} | |
} | |
// Let G = (V, T, P, S) be a context-free grammar | |
class Grammar { | |
public ArrayList<Production> productions; // P | |
// Constructor | |
public Grammar() { | |
productions = new ArrayList<Production>(); | |
} | |
public Grammar(ArrayList<Production> productions) { | |
this.productions = productions; | |
} | |
public boolean addProduction(Production prod) { | |
return !productions.contains(prod) && productions.add(prod); | |
} | |
public boolean addProduction(Character prodHead, Character[] prodRhs) { | |
return addProduction(new Production(prodHead, prodRhs)); | |
} | |
public boolean addProduction(String prodHead, String prodRhs) { | |
return addProduction(new Production(prodHead, prodRhs)); | |
} | |
// Compute the set of nullable nonterminals: { A in V | A =>* eps } | |
// I decided to go with a simple fixed-point algorithm | |
public Set<Character> getNullable() { | |
Set<Character> nullSet = new TreeSet<Character>(); | |
// find the ``base'' symbols --- all A in V such that A -> eps is in P | |
for (Production p : productions) { | |
if (p.prodRhs[0] == '~') { nullSet.add(p.prodHead); } | |
} | |
if (nullSet.size() != 0) { | |
boolean isNullable = true; | |
int crrSize = nullSet.size(); | |
do { | |
crrSize = nullSet.size(); | |
for (Production p : productions) { | |
isNullable = true; | |
for (Character c : p.prodRhs) { | |
if (c.isLowerCase(c) && c.isLetter(c) || !nullSet.contains(c)) { | |
isNullable = false; break; | |
} | |
} | |
if (isNullable) { nullSet.add(p.prodHead); } | |
} | |
} while (crrSize != nullSet.size()); | |
} | |
return nullSet; | |
} | |
} | |
// State | |
class State { | |
public Production prod; | |
public int rhsIdx; | |
public int prevSet; | |
// Constructor | |
public State(Production prod, int rhsIdx, int prevSet) { | |
this.prod = prod; // production | |
this.rhsIdx = rhsIdx; // position of the dot on the right-hand-side of the production | |
this.prevSet = prevSet; | |
} | |
public boolean equals(Object o) { | |
if (this == o) { | |
return true; | |
} else if (!(o instanceof State)) { | |
return false; | |
} else { | |
State s = (State)o; | |
// System.out.println("[DEBUG] " + prod.equals(s.prod)); | |
return rhsIdx == s.rhsIdx && prevSet == s.prevSet && prod.equals(s.prod); | |
} | |
} | |
} | |
public class Earley { | |
private Grammar g; | |
private ArrayList<LinkedList<State> > stateSets; | |
public Earley() { | |
g = new Grammar(); | |
stateSets = new ArrayList<LinkedList<State> >(); | |
} | |
// helper methods | |
private boolean isNonterminal(Character c) { | |
return Character.isUpperCase(c) || c.equals('@'); | |
} | |
// prints state set contents | |
/*private void print() { | |
for (int idx = 0; idx < stateSets.size(); ++idx) { | |
System.out.println("------ State set " + idx + " -----------"); | |
for (State s : stateSets.get(idx)) { | |
String rhs = Arrays.toString(s.prod.prodRhs).replaceAll(", ", "").substring(1, s.prod.prodRhs.length + 1); | |
System.out.println(s.prod.prodHead + " -> " + rhs + ", " + s.rhsIdx + " ; " + s.prevSet); | |
} | |
} | |
}*/ | |
// If production (P, j) is in S_i and x_{i+1}= current terminal, then add P with rhsIdx+1 to S_{i+1} | |
private void scanner(State state, int crrIdx) { | |
State newState = new State(state.prod, state.rhsIdx+1, state.prevSet); | |
if (!stateSets.get(crrIdx+1).contains(newState)) { | |
stateSets.get(crrIdx+1).add(newState); | |
} | |
} | |
// We use the predictor from [2]; it nicely handles $\varepsilon$-productions | |
// For [A -> alpha o E beta, i], it adds [E -> o gamma, j] for all productions in S_i | |
// with E on the left-hand side | |
private void predictor(State state, int crrIdx, Set<Character> nullableVars) { | |
Character B = state.prod.prodRhs[state.rhsIdx]; | |
for (Production p : g.productions) { | |
if (p.prodHead.equals(B)) { | |
State newState = new State(p, 0, crrIdx); | |
if (!stateSets.get(crrIdx).contains(newState)) { | |
stateSets.get(crrIdx).add(newState); | |
} | |
} | |
} | |
// Need this to handle $\varepsilon$-productions [2] | |
if (nullableVars.contains(B)) { // B is nullable, i.e., B =>* eps | |
State newState = new State(state.prod, state.rhsIdx+1, state.prevSet); | |
if (!stateSets.get(crrIdx).contains(newState)) { | |
stateSets.get(crrIdx).add(newState); | |
} | |
} | |
} | |
private void completer(State state, int crrIdx) { | |
int j = state.prevSet; | |
LinkedList<State> stateSet = stateSets.get(j); | |
for (int i = 0; i < stateSet.size(); ++i) { | |
State s = stateSet.get(i); | |
if (s.rhsIdx < s.prod.prodRhs.length && s.prod.prodRhs[s.rhsIdx].equals(state.prod.prodHead)) { | |
State newState = new State(s.prod, s.rhsIdx+1, s.prevSet); | |
if (!stateSets.get(crrIdx).contains(newState)) { | |
stateSets.get(crrIdx).add(newState); | |
} | |
} | |
} | |
} | |
private void initialize(int n) { | |
// add the initial state set S_0 | |
Production newProd = new Production("@", "S"); | |
LinkedList<State> initState = new LinkedList<State>(); | |
initState.add(new State(newProd, 0, 0)); | |
stateSets.add(initState); | |
for (int i = 0; i < n; ++i) { | |
stateSets.add(new LinkedList<State>()); | |
} | |
} | |
private void process(int i, Character a, Set<Character> nullableVars) { | |
LinkedList<State> stateSet = stateSets.get(i); | |
int crrSize = 0; | |
do { | |
crrSize = stateSet.size(); | |
// iterate over the states from the current state set | |
for (int j = 0; j < crrSize; ++j) { | |
State state = stateSet.get(j); | |
// apply either scanner, predictor, or completer | |
if (state.rhsIdx == state.prod.prodRhs.length) { | |
completer(state, i); | |
} else { | |
if (isNonterminal(state.prod.prodRhs[state.rhsIdx])) { | |
predictor(state, i, nullableVars); | |
} else if (state.prod.prodRhs[state.rhsIdx].equals(a)) { | |
scanner(state, i); | |
} // else Nothing left to do | |
} | |
} | |
} while (crrSize != stateSet.size()); | |
} | |
public boolean solve(String[] grammar, String word) { | |
// load the grammar | |
for (String s : grammar) { | |
// System.out.println(s); | |
Character prodHead = s.charAt(0); | |
for (String rhs : s.split("->")[1].split("\\|")) { | |
g.addProduction(s, rhs); | |
} | |
} | |
// compute nullable symbols | |
Set<Character> nullableVars = g.getNullable(); | |
// run the earley recognizer | |
initialize(word.length()); | |
for (int i = 0; i < word.length(); ++i) { | |
int crrSet = i+1; // set index | |
Character a = word.charAt(i); | |
// Initialize the set S_{i+1} by applying the three operations to S_i until S_i stops changing | |
process(i, a, nullableVars); | |
} | |
process(word.length(), '/', nullableVars); // character '/' is never in T | |
State finalState = new State(new Production("@", "S"), 1, 0); | |
LinkedList<State> lastStateSet = stateSets.get(word.length()); | |
// print(); | |
for (State s : lastStateSet) { | |
if (s.equals(finalState)) { | |
return true; | |
} | |
} | |
return false; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.Arrays; | |
import java.util.HashSet; | |
public class Main { | |
public static boolean wordInGrammar(String g, String w) { | |
Earley ep = new Earley(); | |
return ep.solve(g.split(";"), w); | |
} | |
public static void main(String [] args) { | |
/* Test our implementation of the Early parser. */ | |
// wordInGrammar("S->aA|Aa|aAAA|AAAABBBABABABA;A->aA|bA|~;B->bbbA|ABA|A", "aaa"); | |
// wordInGrammar("S->S+M|M;M->M*T|T;T->a|b|c|d", "a++b*c"); | |
System.out.println("All palindromic binary strings of length 5:"); | |
StringGenerator sg = new StringGenerator(new HashSet<>(Arrays.asList('0', '1')), 5); | |
for (String w : sg) { | |
if (wordInGrammar("S->0S0|1S1|0|1|~", w)) { | |
System.out.println(w); | |
} | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.Set; | |
import java.util.Iterator; | |
import java.util.NoSuchElementException; | |
public class StringGenerator implements Iterable<String>, Iterator<String> { | |
private Character[] alphabet; | |
private int stringLength; | |
private int[] currentString; | |
private boolean isLast; | |
public StringGenerator(Set<Character> alphabet, int stringLength) { | |
if (alphabet == null || alphabet.isEmpty()) { | |
throw new IllegalArgumentException("Alphabet is a nonempty set of characters"); | |
} else if (stringLength < 1) { | |
throw new IllegalArgumentException("String length must be positive"); | |
} | |
this.alphabet = alphabet.toArray(new Character[0]); | |
this.stringLength = stringLength; | |
this.isLast = false; | |
this.currentString = new int[stringLength]; | |
} | |
public Iterator<String> iterator() { | |
return this; | |
} | |
public String next() { | |
if (!hasNext()) { | |
throw new NoSuchElementException("Iterator exhausted"); | |
} | |
StringBuilder sb = new StringBuilder(stringLength); | |
for (int idx : this.currentString) { | |
sb.append(this.alphabet[idx]); | |
} | |
prepareNext(); | |
return sb.toString(); | |
} | |
public boolean hasNext() { | |
return !this.isLast; | |
} | |
private void prepareNext() { | |
final int n = alphabet.length; | |
int index = stringLength - 1; | |
while (index >= 0 && currentString[index] == n-1) { | |
index--; | |
} | |
if (index < 0) { | |
this.isLast = true; | |
return; | |
} | |
this.currentString[index] += 1; | |
for (int j = stringLength-1; j > index; j--) { | |
this.currentString[j] = 0; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment