Created
December 13, 2016 11:27
-
-
Save thmavri/b6d5e4438e5278d4dff286b0e46f8b2e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train a MITIE named-entity recognizer on a single hand-labelled query,
# save the model to disk, and then use it to tag the tokens of a query.
import multiprocessing

from mitie import ner_trainer, ner_training_instance

# Query to classify.
q = "hotel amsterdam wifi"

# Labels to use: "prop", "dest", "fac" (one per token of the query, in order).
labels = ["prop", "dest", "fac"]

# This will contain all our training data.
trainer = ner_trainer("total_word_feature_extractor.dat")

# Split the string into words; any tokenizer can be used here.
q_list = q.split()

# Initialize a NER (named entity recognition) training instance.
train_item = ner_training_instance(q_list)

# Entity ranges are half-open ranges of TOKEN indices into q_list, not
# character offsets: token i is covered by range(i, i + 1). Using the word
# lengths here would produce ranges that fall outside the 3-token query.
for token_index, label in enumerate(labels):
    train_item.add_entity(range(token_index, token_index + 1), label)
trainer.add(train_item)

# Take advantage of a multi-core CPU: use one training thread per core
# instead of a hard-coded thread count.
trainer.num_threads = multiprocessing.cpu_count()
ner = trainer.train()
ner.save_to_disk('ner_model.dat')

# Print the classes (tags) the trained model can output.
print(ner.get_possible_ner_tags())

# Recognize the entities in an already-tokenized query.
q = ["hotel", "amsterdam", "wifi"]
entities = ner.extract_entities(q)

# Print out the results. Single-argument print calls behave the same on
# Python 2 and Python 3.
print("\nEntities found: {0}".format(entities))
print("\nNumber of entities detected: {0}".format(len(entities)))
for e in entities:
    # e[0] is the range of token indices, e[1] is the predicted tag.
    token_range = e[0]
    tag = e[1]
    entity_text = " ".join(q[i] for i in token_range)
    print(" " + tag + ":" + entity_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment