Skip to content

Instantly share code, notes, and snippets.

@fjavieralba
Created March 23, 2012 10:52
Show Gist options
  • Save fjavieralba/2169559 to your computer and use it in GitHub Desktop.
Save fjavieralba/2169559 to your computer and use it in GitHub Desktop.
[JAVA] Non overlapping tagging of a sentence based on a dictionary of expressions
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
/*
Result is only one tagging of all the possible ones.
The resulting tagging is determined by these two priority rules:
- longest matches have higher priority
- search is made from left to right
*/
public class NonOverlappingTagging {
private static ArrayList non_overlapping_tagging(String[] sentence, HashMap dict,int max_key_size)
{
ArrayList tag_sentence = new ArrayList();
int N = sentence.length;
if (max_key_size == -1)
{
max_key_size = N;
}
int i = 0;
while (i < N)
{
boolean tagged = false;
int j = Math.min(i + max_key_size, N); //avoid overflow
while (j > i)
{
String[] literal_tokens = Arrays.copyOfRange(sentence, i, j);
String literal = join(literal_tokens, " ");
System.out.println(literal);
if (dict.get(literal) != null)
{
tag_sentence.add("["+literal+"]");
i = j;
tagged = true;
}
else
{
j -= 1;
}
}
if (!tagged)
{
tag_sentence.add(sentence[i]);
i += 1;
}
}
return tag_sentence;
}
private static String join(String[] sentence, String separator)
{
String result = "";
for (int i = 0; i < sentence.length; i++)
{
String word = sentence[i];
result += word + separator;
}
return result.trim();
}
public static void main(String[] args) {
String[] sentence = {"Los", "Angeles", "Lakers", "visited", "Washington", "State", "last", "week"};
HashMap <String, Integer>dict = new HashMap();
dict.put("Los Angeles", 1);
dict.put("Lakers", 1);
dict.put("Los Angeles Lakers", 1);
dict.put("Washington", 1);
dict.put("State", 1);
dict.put("Washington State University", 1);
ArrayList tagging = non_overlapping_tagging(sentence, dict, -1);
System.out.println(tagging);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment