Skip to content

Instantly share code, notes, and snippets.

@davidw93
Created June 10, 2013 09:33
Show Gist options
  • Save davidw93/5747549 to your computer and use it in GitHub Desktop.
Save davidw93/5747549 to your computer and use it in GitHub Desktop.
NLTK for PubMed articles
from lxml import etree
from StringIO import StringIO
from tokenize import generate_tokens
import nltk
from collections import Counter
import re
def filter_insignificant(chunk, tag_suffixes=(
        'DT', 'CC', ',', '.', ':', 'VBP', '!', 'CD', 'IN', 'PRP', 'PRP$',
        'VBD', 'NNP', 'EX')):
    """Drop (word, tag) pairs whose tag ends with one of *tag_suffixes*.

    Args:
        chunk: iterable of ``(word, tag)`` pairs.
        tag_suffixes: iterable of tag suffixes to reject.  The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A new list with the ``(word, tag)`` pairs that were kept, in their
        original order.
    """
    # str.endswith accepts a tuple of candidates, which replaces the manual
    # flag-and-break loop; tuple() lets callers keep passing lists.
    suffixes = tuple(tag_suffixes)
    return [(word, tag) for word, tag in chunk if not tag.endswith(suffixes)]
def get_acronyms(chunk):
    """Return the strings in *chunk* for which ``str.isupper()`` is true.

    The result is a new list that keeps the iteration order of *chunk*.
    """
    return [candidate for candidate in chunk if candidate.isupper()]
def expand_acronyms(acronyms, background, methods, results, used_collection):
    """Swap acronyms in *used_collection* for the phrase that introduces them.

    For each acronym, *background* is searched for the acronym preceded by up
    to one word per acronym letter and followed by one non-word character,
    e.g. ``"polymerase chain reaction (PCR)"``.  When such a phrase is found
    it is stored in *used_collection* with a weight of 100 and the bare
    acronym entry is removed.  Acronyms with no match are left untouched.

    Args:
        acronyms: iterable of acronym strings to look up.
        background: text that is searched for the expansions.
        methods: accepted for interface compatibility; not searched.
        results: accepted for interface compatibility; not searched.
        used_collection: mutable mapping (e.g. a ``Counter``) of term to
            weight; it is modified in place.

    Returns:
        The same *used_collection* object that was passed in.
    """
    seen_phrases = []
    for acronym in acronyms:
        # Build the pattern afresh for every acronym; carrying it over would
        # glue this acronym's regex onto the previous one.  The acronym is
        # escaped so characters such as "+" or "(" are matched literally.
        pattern = r"(\w*)\W*" * len(acronym) + re.escape(acronym) + r"\W"
        match = re.search(pattern, background)
        if match is None:
            # re.search returns None when nothing matches; calling .group()
            # on that would raise AttributeError.
            continue
        phrase = match.group()
        if phrase not in seen_phrases:
            seen_phrases.append(phrase)
            used_collection[phrase] = 100
            # pop() with a default also works for plain dicts that do not
            # contain the acronym.
            used_collection.pop(acronym, None)
    return used_collection
def upgrade_hyphenations(used_collection):
    """Set the weight of every key containing a hyphen to 75.

    *used_collection* is modified in place and the same object is returned.
    """
    # Snapshot the matching keys first so the mapping is not iterated while
    # it is being assigned to.
    hyphenated = [term for term in list(used_collection) if '-' in term]
    for term in hyphenated:
        used_collection[term] = 75
    return used_collection
def _tokenize_and_tag(text):
    """Return ``(tokens, significant_tagged_tokens)`` for *text*."""
    tokens = nltk.word_tokenize(text)
    # Skip the tagger entirely when a section is missing or empty.
    tagged = filter_insignificant(nltk.pos_tag(tokens)) if tokens else []
    return tokens, tagged


# Read the whole document up front; ``with`` closes the file even on error.
with open("pubmedtest.xml") as f:
    xml_content = f.read()

# Defaults so that a document lacking a section does not leave a name unbound.
title = None
background = methods = results = ""

context = etree.iterparse(StringIO(xml_content))
for action, elem in context:
    if elem.tag == "ArticleTitle":
        title = elem.text
    elif elem.tag == "AbstractText":
        # .get() instead of [] so AbstractText elements without a Label
        # attribute are skipped rather than raising KeyError.
        label = elem.attrib.get("Label")
        # elem.text is None for an element with no leading text.
        if label == "BACKGROUND":
            background = elem.text or ""
        elif label == "METHODS":
            methods = elem.text or ""
        elif label == "CONCLUSIONS":
            # The CONCLUSIONS section is what this script treats as results.
            results = elem.text or ""

background_tokenized, background_tagged = _tokenize_and_tag(background)
# Parenthesised single-argument print is valid as both the Python 2 statement
# and the Python 3 function.
print(background_tagged)
methods_tokenized, methods_tagged = _tokenize_and_tag(methods)
results_tokenized, results_tagged = _tokenize_and_tag(results)

# Keep only the words (dropping the POS tags) of each section.
background_tokens = [token[0] for token in background_tagged]
methods_tokens = [token[0] for token in methods_tagged]
results_tokens = [token[0] for token in results_tagged]

# Tally word frequencies across all three sections.
word_count = Counter()
for section_tokens in (background_tokens, methods_tokens, results_tokens):
    word_count.update(section_tokens)

# get_acronyms() returns a list, so word_count can be mutated safely while
# the acronyms are expanded.
word_count = expand_acronyms(get_acronyms(word_count), background, methods, results, word_count)
word_count = upgrade_hyphenations(word_count)
print(word_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment