Last active
December 19, 2015 12:38
-
-
Save davidw93/5956007 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree | |
from StringIO import StringIO | |
from tokenize import generate_tokens | |
import nltk | |
from collections import Counter | |
import re | |
def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', ',', '.', ':', 'VBP', '!', 'CD', 'IN', 'PRP', 'PRP$', 'VBD', 'NNP', 'EX')):
    """Return the (word, tag) pairs of *chunk* whose tag is not filtered out.

    A pair is dropped when its tag ends with any entry of *tag_suffixes*.

    Args:
        chunk: iterable of ``(word, tag)`` pairs (the script passes the
            output of ``nltk.pos_tag``).
        tag_suffixes: iterable of tag suffixes to reject.  The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A new list of the surviving ``(word, tag)`` tuples, in input order.
    """
    # str.endswith accepts a tuple, which replaces the inner suffix loop;
    # an empty tuple never matches, so nothing is filtered in that case.
    suffixes = tuple(tag_suffixes)
    return [(word, tag) for word, tag in chunk if not tag.endswith(suffixes)]
def get_acronyms(chunk):
    """Return the words of *chunk* that are entirely upper-case.

    Args:
        chunk: iterable of strings.  Iterating a dict/Counter yields its
            keys, which is how the script calls this function.

    Returns:
        A list of the words for which ``str.isupper()`` is true, in
        iteration order.  Words without any cased character (digits,
        punctuation) are not included because ``isupper()`` is false for
        them.
    """
    return [word for word in chunk if word.isupper()]
def expand_acronyms(acronyms, background, methods, results, used_collection, weight=100):
    """Replace acronyms in *used_collection* by their expansion in *background*.

    For an acronym of N characters the text is searched for up to N words
    immediately followed by the acronym itself, e.g.
    ``"acute respiratory distress syndrome (ARDS)"`` for ``"ARDS"``.  When
    such a span is found it is stored in *used_collection* with *weight* and
    the bare acronym entry is removed.  Acronyms with no match are left
    untouched.

    Args:
        acronyms: iterable of acronym strings.
        background: text that is searched for the expansions.
        methods: unused; kept for interface compatibility.
        results: unused; kept for interface compatibility.
        used_collection: mutable mapping (dict/Counter) of term -> weight;
            modified in place.
        weight: value assigned to each expansion found (default 100).

    Returns:
        The same *used_collection* object.
    """
    if not background:
        # Nothing to search in; also avoids passing None to re.search.
        return used_collection

    seen = set()
    for acronym in acronyms:
        # Build the pattern afresh for every acronym: one optional
        # "word + separator" slot per character, then the literal acronym
        # followed by a non-word character or the end of the text.
        pattern = (r"(\w*)\W*" * len(acronym)
                   + re.escape(acronym)
                   + r"(?:\W|$)")
        match = re.search(pattern, background)
        if match is None:
            continue
        expansion = match.group()
        if expansion in seen:
            continue
        seen.add(expansion)
        used_collection[expansion] = weight
        # pop() instead of del: a plain dict would raise KeyError when the
        # acronym is not a key.
        used_collection.pop(acronym, None)
    return used_collection
def upgrade_hyphenations(used_collection, weight=75):
    """Raise the weight of every hyphenated term to at least *weight*.

    Args:
        used_collection: mutable mapping (dict/Counter) of term -> weight;
            modified in place.
        weight: minimum weight given to keys containing ``'-'`` (default 75).

    Returns:
        The same *used_collection* object.
    """
    for word in used_collection:
        # Only ever raise: an entry that already carries a higher weight
        # (e.g. an acronym expansion containing a hyphen) must not be
        # lowered by an "upgrade".  Assigning to an existing key does not
        # change the mapping's size, so iterating it directly is safe.
        if '-' in word and used_collection[word] < weight:
            used_collection[word] = weight
    return used_collection
# --- Script body: extract keyword weights from one PubMed abstract --------

# Read the XML fixture; the context manager closes the file even if the
# read fails.
with open("pubmedtest.xml") as f:
    xml_content = f.read()

# Defaults so that a record lacking a section does not raise NameError
# further down; an empty string simply tokenizes to no tokens.
title = None
background = ""
methods = ""
results = ""

context = etree.iterparse(StringIO(xml_content))
for action, elem in context:
    if elem.tag == "ArticleTitle":
        title = elem.text
    elif elem.tag == "AbstractText":
        # .get(): an AbstractText element without a Label attribute is
        # skipped instead of raising KeyError.
        label = elem.attrib.get("Label")
        if label == "BACKGROUND":
            background = elem.text or ""
        elif label == "METHODS":
            methods = elem.text or ""
        elif label == "CONCLUSIONS":
            # NOTE(review): the CONCLUSIONS text is stored in `results` and
            # the RESULTS section is ignored -- confirm this is intended.
            results = elem.text or ""

# Tokenize, POS-tag and drop insignificant tags for each section.
background_tokenized = nltk.word_tokenize(background)
background_tagged = filter_insignificant(nltk.pos_tag(background_tokenized))
print(background_tagged)

methods_tokenized = nltk.word_tokenize(methods)
methods_tagged = filter_insignificant(nltk.pos_tag(methods_tokenized))

results_tokenized = nltk.word_tokenize(results)
results_tagged = filter_insignificant(nltk.pos_tag(results_tokenized))

# Keep only the word of every (word, tag) pair.
background_tokens = [word for word, _tag in background_tagged]
methods_tokens = [word for word, _tag in methods_tagged]
results_tokens = [word for word, _tag in results_tagged]

# Frequency of every surviving word across the three sections.
word_count = Counter(background_tokens + methods_tokens + results_tokens)

word_count = expand_acronyms(get_acronyms(word_count), background, methods, results, word_count)
word_count = upgrade_hyphenations(word_count)
print(word_count)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<PubmedArticle> | |
<MedlineCitation Owner="NLM" Status="MEDLINE"> | |
<PMID Version="1">21470008</PMID> | |
<DateCreated> | |
<Year>2011</Year> | |
<Month>04</Month> | |
<Day>07</Day> | |
</DateCreated> | |
<DateCompleted> | |
<Year>2011</Year> | |
<Month>04</Month> | |
<Day>11</Day> | |
</DateCompleted> | |
<DateRevised> | |
<Year>2011</Year> | |
<Month>07</Month> | |
<Day>21</Day> | |
</DateRevised> | |
<Article PubModel="Print"> | |
<Journal> | |
<ISSN IssnType="Electronic">1533-4406</ISSN> | |
<JournalIssue CitedMedium="Internet"> | |
<Volume>364</Volume> | |
<Issue>14</Issue> | |
<PubDate> | |
<Year>2011</Year> | |
<Month>Apr</Month> | |
<Day>7</Day> | |
</PubDate> | |
</JournalIssue> | |
<Title>The New England journal of medicine</Title> | |
<ISOAbbreviation>N. Engl. J. Med.</ISOAbbreviation> | |
</Journal> | |
<ArticleTitle>Functional disability 5 years after acute respiratory distress syndrome.</ArticleTitle> | |
<Pagination> | |
<MedlinePgn>1293-304</MedlinePgn> | |
</Pagination> | |
<ELocationID EIdType="doi" ValidYN="Y">10.1056/NEJMoa1011802</ELocationID> | |
<Abstract> | |
<AbstractText Label="BACKGROUND" NlmCategory="BACKGROUND">There have been few detailed, in-person interviews and examinations to obtain follow-up data on 5-year outcomes among survivors of the acute respiratory distress syndrome (ARDS).</AbstractText> | |
<AbstractText Label="METHODS" NlmCategory="METHODS">We evaluated 109 survivors of ARDS at 3, 6, and 12 months and at 2, 3, 4, and 5 years after discharge from the intensive care unit. At each visit, patients were interviewed and examined; underwent pulmonary-function tests, the 6-minute walk test, resting and exercise oximetry, chest imaging, and a quality-of-life evaluation; and reported their use of health care services.</AbstractText> | |
<AbstractText Label="RESULTS" NlmCategory="RESULTS">At 5 years, the median 6-minute walk distance was 436 m (76% of predicted distance) and the Physical Component Score on the Medical Outcomes Study 36-Item Short-Form Health Survey was 41 (mean norm score matched for age and sex, 50). With respect to this score, younger patients had a greater rate of recovery than older patients, but neither group returned to normal predicted levels of physical function at 5 years. Pulmonary function was normal to near-normal. A constellation of other physical and psychological problems developed or persisted in patients and family caregivers for up to 5 years. Patients with more coexisting illnesses incurred greater 5-year costs.</AbstractText> | |
<AbstractText Label="CONCLUSIONS" NlmCategory="CONCLUSIONS">Exercise limitation, physical and psychological sequelae, decreased physical quality of life, and increased costs and use of health care services are important legacies of severe lung injury.</AbstractText> | |
</Abstract> | |
<Affiliation>Department of Medicine, University Health Network, University of Toronto, Toronto, ON, Canada. [email protected]</Affiliation> | |
<AuthorList CompleteYN="Y"> | |
<Author ValidYN="Y"> | |
<LastName>Herridge</LastName> | |
<ForeName>Margaret S</ForeName> | |
<Initials>MS</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Tansey</LastName> | |
<ForeName>Catherine M</ForeName> | |
<Initials>CM</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Matté</LastName> | |
<ForeName>Andrea</ForeName> | |
<Initials>A</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Tomlinson</LastName> | |
<ForeName>George</ForeName> | |
<Initials>G</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Diaz-Granados</LastName> | |
<ForeName>Natalia</ForeName> | |
<Initials>N</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Cooper</LastName> | |
<ForeName>Andrew</ForeName> | |
<Initials>A</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Guest</LastName> | |
<ForeName>Cameron B</ForeName> | |
<Initials>CB</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Mazer</LastName> | |
<ForeName>C David</ForeName> | |
<Initials>CD</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Mehta</LastName> | |
<ForeName>Sangeeta</ForeName> | |
<Initials>S</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Stewart</LastName> | |
<ForeName>Thomas E</ForeName> | |
<Initials>TE</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Kudlow</LastName> | |
<ForeName>Paul</ForeName> | |
<Initials>P</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Cook</LastName> | |
<ForeName>Deborah</ForeName> | |
<Initials>D</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Slutsky</LastName> | |
<ForeName>Arthur S</ForeName> | |
<Initials>AS</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<LastName>Cheung</LastName> | |
<ForeName>Angela M</ForeName> | |
<Initials>AM</Initials> | |
</Author> | |
<Author ValidYN="Y"> | |
<CollectiveName>Canadian Critical Care Trials Group</CollectiveName> | |
</Author> | |
</AuthorList> | |
<Language>eng</Language> | |
<PublicationTypeList> | |
<PublicationType>Journal Article</PublicationType> | |
<PublicationType>Research Support, Non-U.S. Gov't</PublicationType> | |
</PublicationTypeList> | |
</Article> | |
<MedlineJournalInfo> | |
<Country>United States</Country> | |
<MedlineTA>N Engl J Med</MedlineTA> | |
<NlmUniqueID>0255562</NlmUniqueID> | |
<ISSNLinking>0028-4793</ISSNLinking> | |
</MedlineJournalInfo> | |
<CitationSubset>AIM</CitationSubset> | |
<CitationSubset>IM</CitationSubset> | |
<CommentsCorrectionsList> | |
<CommentsCorrections RefType="CommentIn"> | |
<RefSource>N Engl J Med. 2011 Jul 21;365(3):274-5; author reply 275-6</RefSource> | |
<PMID Version="1">21774725</PMID> | |
</CommentsCorrections> | |
<CommentsCorrections RefType="CommentIn"> | |
<RefSource>N Engl J Med. 2011 Jul 21;365(3):275; author reply 275-6</RefSource> | |
<PMID Version="1">21774724</PMID> | |
</CommentsCorrections> | |
<CommentsCorrections RefType="CommentIn"> | |
<RefSource>N Engl J Med. 2011 Apr 7;364(14):1358-9</RefSource> | |
<PMID Version="1">21470014</PMID> | |
</CommentsCorrections> | |
</CommentsCorrectionsList> | |
<MeshHeadingList> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Activities of Daily Living</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Adult</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="Y">Disabled Persons</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Exercise Test</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Female</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Follow-Up Studies</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Health Services</DescriptorName> | |
<QualifierName MajorTopicYN="N">utilization</QualifierName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Humans</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Kaplan-Meier Estimate</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Lung</DescriptorName> | |
<QualifierName MajorTopicYN="N">physiology</QualifierName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Male</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Middle Aged</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="Y">Quality of Life</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Recovery of Function</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Respiratory Distress Syndrome, Adult</DescriptorName> | |
<QualifierName MajorTopicYN="Y">complications</QualifierName> | |
<QualifierName MajorTopicYN="N">physiopathology</QualifierName> | |
<QualifierName MajorTopicYN="N">psychology</QualifierName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Respiratory Function Tests</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Survivors</DescriptorName> | |
<QualifierName MajorTopicYN="N">psychology</QualifierName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Walking</DescriptorName> | |
</MeshHeading> | |
<MeshHeading> | |
<DescriptorName MajorTopicYN="N">Work</DescriptorName> | |
</MeshHeading> | |
</MeshHeadingList> | |
</MedlineCitation> | |
<PubmedData> | |
<History> | |
<PubMedPubDate PubStatus="entrez"> | |
<Year>2011</Year> | |
<Month>4</Month> | |
<Day>8</Day> | |
<Hour>6</Hour> | |
<Minute>0</Minute> | |
</PubMedPubDate> | |
<PubMedPubDate PubStatus="pubmed"> | |
<Year>2011</Year> | |
<Month>4</Month> | |
<Day>8</Day> | |
<Hour>6</Hour> | |
<Minute>0</Minute> | |
</PubMedPubDate> | |
<PubMedPubDate PubStatus="medline"> | |
<Year>2011</Year> | |
<Month>4</Month> | |
<Day>13</Day> | |
<Hour>6</Hour> | |
<Minute>0</Minute> | |
</PubMedPubDate> | |
</History> | |
<PublicationStatus>ppublish</PublicationStatus> | |
<ArticleIdList> | |
<ArticleId IdType="doi">10.1056/NEJMoa1011802</ArticleId> | |
<ArticleId IdType="pubmed">21470008</ArticleId> | |
</ArticleIdList> | |
</PubmedData> | |
</PubmedArticle> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment