Skip to content

Instantly share code, notes, and snippets.

@davidw93
Last active December 19, 2015 12:38
Show Gist options
  • Save davidw93/5956007 to your computer and use it in GitHub Desktop.
Save davidw93/5956007 to your computer and use it in GitHub Desktop.
from lxml import etree
from StringIO import StringIO
from tokenize import generate_tokens
import nltk
from collections import Counter
import re
def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', ',', '.', ':', 'VBP',
                                              '!', 'CD', 'IN', 'PRP', 'PRP$',
                                              'VBD', 'NNP', 'EX')):
    """Drop the (word, tag) pairs whose tag ends with an unwanted suffix.

    Args:
        chunk: iterable of ``(word, tag)`` pairs.
        tag_suffixes: iterable of tag suffixes to reject. A pair is removed
            when its tag ends with any of them. The default is an immutable
            tuple so it cannot be altered between calls.

    Returns:
        A new list of ``(word, tag)`` tuples, in their original order, whose
        tag ends with none of ``tag_suffixes``.
    """
    # str.endswith accepts a tuple of candidates, so one call replaces the
    # per-suffix loop; tuple() lets callers keep passing a list.
    suffixes = tuple(tag_suffixes)
    return [(word, tag) for word, tag in chunk if not tag.endswith(suffixes)]
def get_acronyms(chunk):
    """Return the items of ``chunk`` for which ``isupper()`` is true.

    Items are returned as a new list, in the order ``chunk`` yields them.
    """
    return [candidate for candidate in chunk if candidate.isupper()]
def expand_acronyms(acronyms, background, methods, results, used_collection):
    """Swap acronyms in ``used_collection`` for their expansion in ``background``.

    For each acronym, ``background`` is searched for as many word groups as
    the acronym has characters, followed by the acronym itself and one
    non-word character (for example
    ``acute respiratory distress syndrome (ARDS)``). On a hit, the whole
    matched text is stored in ``used_collection`` with weight 100 and the
    bare acronym key is removed. Acronyms with no match are left untouched.

    Args:
        acronyms: iterable of acronym strings.
        background: text searched for expansions; if empty or None nothing
            is changed.
        methods: not used by this function; kept for call compatibility.
        results: not used by this function; kept for call compatibility.
        used_collection: mutable mapping of term -> weight, modified in place.

    Returns:
        The same ``used_collection`` object that was passed in.
    """
    if not background:
        return used_collection

    seen_expansions = set()
    # Snapshot the acronyms: the mapping is mutated inside the loop and the
    # caller may hand us something derived from its keys.
    for acronym in list(acronyms):
        # Build a fresh pattern per acronym (one word group per character).
        # The acronym is escaped so characters such as '.' or '+' are literal.
        # NOTE: every group may match the empty string, so an acronym that
        # appears without a preceding expansion can still match.
        pattern = r"(\w*)\W*" * len(acronym) + re.escape(acronym) + r"\W"
        match = re.search(pattern, background)
        if match is None:
            continue
        expansion = match.group()
        if expansion in seen_expansions:
            continue
        seen_expansions.add(expansion)
        used_collection[expansion] = 100
        # pop() with a default: the acronym may not be a key of the mapping.
        used_collection.pop(acronym, None)
    return used_collection
def upgrade_hyphenations(used_collection):
    """Set the weight of every key containing a hyphen to 75.

    ``used_collection`` is modified in place and also returned; keys without
    a ``'-'`` keep their current value.
    """
    hyphenated = [term for term in list(used_collection) if '-' in term]
    for term in hyphenated:
        # Plain assignment on purpose: this overwrites the value rather than
        # adding to it.
        used_collection[term] = 75
    return used_collection
# Script body: read one PubMed article from pubmedtest.xml, pull out the
# labelled abstract sections, count the significant words in them, then
# re-weight acronym expansions and hyphenated terms before printing.


def _significant_tokens(text):
    """Tokenize and POS-tag ``text``; keep what filter_insignificant keeps."""
    if not text:
        return []
    return filter_insignificant(nltk.pos_tag(nltk.word_tokenize(text)))


# `with` closes the file even when read() raises.
with open("pubmedtest.xml") as f:
    xml_content = f.read()

# Defaults so a document lacking one of the sections does not fail with a
# NameError further down.
title = None
background = ""
methods = ""
results = ""

context = etree.iterparse(StringIO(xml_content))
for action, elem in context:
    if elem.tag == "ArticleTitle":
        title = elem.text
    elif elem.tag == "AbstractText":
        # .get(): an AbstractText element without a Label attribute is skipped
        # instead of raising KeyError.
        label = elem.attrib.get("Label")
        section_text = elem.text or ""
        if label == "BACKGROUND":
            background = section_text
        elif label == "METHODS":
            methods = section_text
        elif label == "CONCLUSIONS":
            # NOTE(review): `results` is filled from the CONCLUSIONS section;
            # elements labelled RESULTS are ignored - confirm this is intended.
            results = section_text

background_tagged = _significant_tokens(background)
print(background_tagged)
methods_tagged = _significant_tokens(methods)
results_tagged = _significant_tokens(results)

# Keep only the word of each (word, tag) pair.
background_tokens = [token[0] for token in background_tagged]
methods_tokens = [token[0] for token in methods_tagged]
results_tokens = [token[0] for token in results_tagged]

# One combined frequency table over the three sections.
word_count = Counter(background_tokens + methods_tokens + results_tokens)

word_count = expand_acronyms(get_acronyms(word_count), background, methods, results, word_count)
word_count = upgrade_hyphenations(word_count)
print(word_count)
<PubmedArticle>
<MedlineCitation Owner="NLM" Status="MEDLINE">
<PMID Version="1">21470008</PMID>
<DateCreated>
<Year>2011</Year>
<Month>04</Month>
<Day>07</Day>
</DateCreated>
<DateCompleted>
<Year>2011</Year>
<Month>04</Month>
<Day>11</Day>
</DateCompleted>
<DateRevised>
<Year>2011</Year>
<Month>07</Month>
<Day>21</Day>
</DateRevised>
<Article PubModel="Print">
<Journal>
<ISSN IssnType="Electronic">1533-4406</ISSN>
<JournalIssue CitedMedium="Internet">
<Volume>364</Volume>
<Issue>14</Issue>
<PubDate>
<Year>2011</Year>
<Month>Apr</Month>
<Day>7</Day>
</PubDate>
</JournalIssue>
<Title>The New England journal of medicine</Title>
<ISOAbbreviation>N. Engl. J. Med.</ISOAbbreviation>
</Journal>
<ArticleTitle>Functional disability 5 years after acute respiratory distress syndrome.</ArticleTitle>
<Pagination>
<MedlinePgn>1293-304</MedlinePgn>
</Pagination>
<ELocationID EIdType="doi" ValidYN="Y">10.1056/NEJMoa1011802</ELocationID>
<Abstract>
<AbstractText Label="BACKGROUND" NlmCategory="BACKGROUND">There have been few detailed, in-person interviews and examinations to obtain follow-up data on 5-year outcomes among survivors of the acute respiratory distress syndrome (ARDS).</AbstractText>
<AbstractText Label="METHODS" NlmCategory="METHODS">We evaluated 109 survivors of ARDS at 3, 6, and 12 months and at 2, 3, 4, and 5 years after discharge from the intensive care unit. At each visit, patients were interviewed and examined; underwent pulmonary-function tests, the 6-minute walk test, resting and exercise oximetry, chest imaging, and a quality-of-life evaluation; and reported their use of health care services.</AbstractText>
<AbstractText Label="RESULTS" NlmCategory="RESULTS">At 5 years, the median 6-minute walk distance was 436 m (76% of predicted distance) and the Physical Component Score on the Medical Outcomes Study 36-Item Short-Form Health Survey was 41 (mean norm score matched for age and sex, 50). With respect to this score, younger patients had a greater rate of recovery than older patients, but neither group returned to normal predicted levels of physical function at 5 years. Pulmonary function was normal to near-normal. A constellation of other physical and psychological problems developed or persisted in patients and family caregivers for up to 5 years. Patients with more coexisting illnesses incurred greater 5-year costs.</AbstractText>
<AbstractText Label="CONCLUSIONS" NlmCategory="CONCLUSIONS">Exercise limitation, physical and psychological sequelae, decreased physical quality of life, and increased costs and use of health care services are important legacies of severe lung injury.</AbstractText>
</Abstract>
<Affiliation>Department of Medicine, University Health Network, University of Toronto, Toronto, ON, Canada. [email protected]</Affiliation>
<AuthorList CompleteYN="Y">
<Author ValidYN="Y">
<LastName>Herridge</LastName>
<ForeName>Margaret S</ForeName>
<Initials>MS</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Tansey</LastName>
<ForeName>Catherine M</ForeName>
<Initials>CM</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Matté</LastName>
<ForeName>Andrea</ForeName>
<Initials>A</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Tomlinson</LastName>
<ForeName>George</ForeName>
<Initials>G</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Diaz-Granados</LastName>
<ForeName>Natalia</ForeName>
<Initials>N</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Cooper</LastName>
<ForeName>Andrew</ForeName>
<Initials>A</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Guest</LastName>
<ForeName>Cameron B</ForeName>
<Initials>CB</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Mazer</LastName>
<ForeName>C David</ForeName>
<Initials>CD</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Mehta</LastName>
<ForeName>Sangeeta</ForeName>
<Initials>S</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Stewart</LastName>
<ForeName>Thomas E</ForeName>
<Initials>TE</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Kudlow</LastName>
<ForeName>Paul</ForeName>
<Initials>P</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Cook</LastName>
<ForeName>Deborah</ForeName>
<Initials>D</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Slutsky</LastName>
<ForeName>Arthur S</ForeName>
<Initials>AS</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Cheung</LastName>
<ForeName>Angela M</ForeName>
<Initials>AM</Initials>
</Author>
<Author ValidYN="Y">
<CollectiveName>Canadian Critical Care Trials Group</CollectiveName>
</Author>
</AuthorList>
<Language>eng</Language>
<PublicationTypeList>
<PublicationType>Journal Article</PublicationType>
<PublicationType>Research Support, Non-U.S. Gov't</PublicationType>
</PublicationTypeList>
</Article>
<MedlineJournalInfo>
<Country>United States</Country>
<MedlineTA>N Engl J Med</MedlineTA>
<NlmUniqueID>0255562</NlmUniqueID>
<ISSNLinking>0028-4793</ISSNLinking>
</MedlineJournalInfo>
<CitationSubset>AIM</CitationSubset>
<CitationSubset>IM</CitationSubset>
<CommentsCorrectionsList>
<CommentsCorrections RefType="CommentIn">
<RefSource>N Engl J Med. 2011 Jul 21;365(3):274-5; author reply 275-6</RefSource>
<PMID Version="1">21774725</PMID>
</CommentsCorrections>
<CommentsCorrections RefType="CommentIn">
<RefSource>N Engl J Med. 2011 Jul 21;365(3):275; author reply 275-6</RefSource>
<PMID Version="1">21774724</PMID>
</CommentsCorrections>
<CommentsCorrections RefType="CommentIn">
<RefSource>N Engl J Med. 2011 Apr 7;364(14):1358-9</RefSource>
<PMID Version="1">21470014</PMID>
</CommentsCorrections>
</CommentsCorrectionsList>
<MeshHeadingList>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Activities of Daily Living</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Adult</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="Y">Disabled Persons</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Exercise Test</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Female</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Follow-Up Studies</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Health Services</DescriptorName>
<QualifierName MajorTopicYN="N">utilization</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Humans</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Kaplan-Meier Estimate</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Lung</DescriptorName>
<QualifierName MajorTopicYN="N">physiology</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Male</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Middle Aged</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="Y">Quality of Life</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Recovery of Function</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Respiratory Distress Syndrome, Adult</DescriptorName>
<QualifierName MajorTopicYN="Y">complications</QualifierName>
<QualifierName MajorTopicYN="N">physiopathology</QualifierName>
<QualifierName MajorTopicYN="N">psychology</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Respiratory Function Tests</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Survivors</DescriptorName>
<QualifierName MajorTopicYN="N">psychology</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Walking</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName MajorTopicYN="N">Work</DescriptorName>
</MeshHeading>
</MeshHeadingList>
</MedlineCitation>
<PubmedData>
<History>
<PubMedPubDate PubStatus="entrez">
<Year>2011</Year>
<Month>4</Month>
<Day>8</Day>
<Hour>6</Hour>
<Minute>0</Minute>
</PubMedPubDate>
<PubMedPubDate PubStatus="pubmed">
<Year>2011</Year>
<Month>4</Month>
<Day>8</Day>
<Hour>6</Hour>
<Minute>0</Minute>
</PubMedPubDate>
<PubMedPubDate PubStatus="medline">
<Year>2011</Year>
<Month>4</Month>
<Day>13</Day>
<Hour>6</Hour>
<Minute>0</Minute>
</PubMedPubDate>
</History>
<PublicationStatus>ppublish</PublicationStatus>
<ArticleIdList>
<ArticleId IdType="doi">10.1056/NEJMoa1011802</ArticleId>
<ArticleId IdType="pubmed">21470008</ArticleId>
</ArticleIdList>
</PubmedData>
</PubmedArticle>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment