Created June 10, 2013 09:33
NLTK for PubMed articles
Tokenizes and POS-tags the BACKGROUND, METHODS, and CONCLUSIONS sections of a PubMed abstract, filters out insignificant tags, counts word frequencies, then boosts acronym expansions and hyphenated terms in the final counts.
from lxml import etree
from StringIO import StringIO  # Python 2 module; the print statements below are Python 2 as well
import nltk
from collections import Counter
import re
def filter_insignificant(chunk, tag_suffixes=['DT', 'CC', ',', '.', ':', 'VBP', '!', 'CD', 'IN', 'PRP', 'PRP$', 'VBD', 'NNP', 'EX']):
    # Drop (word, tag) pairs whose POS tag ends with one of the given
    # suffixes (determiners, conjunctions, punctuation, pronouns, ...).
    good = []
    for word, tag in chunk:
        ok = True
        for suffix in tag_suffixes:
            if tag.endswith(suffix):
                ok = False
                break
        if ok:
            good.append((word, tag))
    return good
def get_acronyms(chunk):
    # Treat any all-uppercase token as an acronym candidate.
    good = []
    for word in chunk:
        if word.isupper():
            good.append(word)
    return good
def expand_acronyms(acronyms, background, methods, results, used_collection):
    # For each acronym, build a regex that captures one word per letter just
    # before the acronym itself, e.g. "MRI" -> (\w*)\W*(\w*)\W*(\w*)\W*MRI\W,
    # and search the background text for it (only the background is searched).
    # A found expansion replaces the bare acronym in the counts, with top weight.
    final_list = []
    for acronym in acronyms:
        pattern = ""  # rebuild the pattern for every acronym
        for letter in acronym:
            pattern = pattern + r"(\w*)\W*"
        pattern = pattern + acronym + r"\W"
        m = re.search(pattern, background)  # guard: search returns None on no match
        if m is not None and m.group() not in final_list:
            final_list.append(m.group())
            used_collection[m.group()] = 100
            del used_collection[acronym]
    return used_collection
def upgrade_hyphenations(used_collection):
    # Hyphenated terms tend to be domain-specific, so bump their weight.
    for word in list(used_collection):
        if '-' in word:
            used_collection[word] = 75
    return used_collection
f = open("pubmedtest.xml")
xml_content = f.read()
f.close()

# Pull the title and the labelled abstract sections out of the PubMed XML.
context = etree.iterparse(StringIO(xml_content))
for action, elem in context:
    if elem.tag == "ArticleTitle":
        title = elem.text
    elif elem.tag == "AbstractText":
        label = elem.attrib.get("Label")  # not every AbstractText carries a Label
        if label == "BACKGROUND":
            background = elem.text
        elif label == "METHODS":
            methods = elem.text
        elif label == "CONCLUSIONS":
            results = elem.text
# Tokenize and POS-tag each section, then discard the insignificant tags.
background_tokenized = nltk.word_tokenize(background)
background_tagged = filter_insignificant(nltk.pos_tag(background_tokenized))
print background_tagged

methods_tokenized = nltk.word_tokenize(methods)
methods_tagged = filter_insignificant(nltk.pos_tag(methods_tokenized))

results_tokenized = nltk.word_tokenize(results)
results_tagged = filter_insignificant(nltk.pos_tag(results_tokenized))

# Count how often each surviving word appears across the three sections.
background_tokens = []
methods_tokens = []
results_tokens = []
word_count = Counter()

for token in background_tagged:
    background_tokens.append(token[0])
    word_count[token[0]] += 1

for token in methods_tagged:
    methods_tokens.append(token[0])
    word_count[token[0]] += 1

for token in results_tagged:
    results_tokens.append(token[0])
    word_count[token[0]] += 1

# Swap acronyms for their expansions, then boost hyphenated terms.
word_count = expand_acronyms(get_acronyms(word_count), background, methods, results, word_count)
word_count = upgrade_hyphenations(word_count)
print word_count
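
For reference, here is a minimal sketch of a pubmedtest.xml that the iterparse loop above would accept. The content is hypothetical (a real file would be a PubMed efetch XML export with full MedlineCitation records); the script only matches on the ArticleTitle tag and on AbstractText tags labelled BACKGROUND, METHODS, and CONCLUSIONS.

# Hypothetical stand-in for pubmedtest.xml; real PubMed exports carry many
# more elements, but the parser above only looks at these tag names.
sample = """<PubmedArticleSet>
  <PubmedArticle>
    <ArticleTitle>Magnetic resonance imaging (MRI) in routine follow-up</ArticleTitle>
    <Abstract>
      <AbstractText Label="BACKGROUND">Magnetic Resonance Imaging (MRI) is widely used in follow-up care.</AbstractText>
      <AbstractText Label="METHODS">Patients underwent whole-body MRI at six-month intervals.</AbstractText>
      <AbstractText Label="CONCLUSIONS">MRI detected recurrence earlier than conventional imaging.</AbstractText>
    </Abstract>
  </PubmedArticle>
</PubmedArticleSet>"""

with open("pubmedtest.xml", "w") as out:
    out.write(sample)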