Skip to content

Instantly share code, notes, and snippets.

View davidw93's full-sized avatar

David Waddington davidw93

  • Airdrie, Scotland
View GitHub Profile
from lxml import etree
from StringIO import StringIO
from tokenize import generate_tokens
import nltk
from collections import Counter
import re
def filter_insignificant(chunk, tag_suffixes=['DT', 'CC', ',', '.', ':', 'VBP', '!', 'CD', 'IN', 'PRP', 'PRP$', 'VBD', 'NNP', 'EX']):
good = []
@davidw93
davidw93 / gist:5747549
Created June 10, 2013 09:33
NLTK for PubMed articles
from lxml import etree
from StringIO import StringIO
from tokenize import generate_tokens
import nltk
from collections import Counter
import re
def filter_insignificant(chunk, tag_suffixes=['DT', 'CC', ',', '.', ':', 'VBP', '!', 'CD', 'IN', 'PRP', 'PRP$', 'VBD', 'NNP', 'EX']):
good = []
@davidw93
davidw93 / gist:5695038
Created June 2, 2013 21:25
Basic XML parsing and token removal + counting
from lxml import etree
from StringIO import StringIO
from tokenize import generate_tokens
import nltk
from collections import Counter
def filter_insignificant(chunk, tag_suffixes=['DT', 'CC', ',', '.', ':', 'VBP', '!', 'CD']):
good = []
for word, tag in chunk: