-
-
Save xim/1279283 to your computer and use it in GitHub Desktop.
| import sys | |
| import numpy | |
| from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
| import nltk.corpus | |
| from nltk import decorators | |
| import nltk.stem | |
# EnglishStemmer lives in the nltk.stem.snowball submodule; it is not
# re-exported from the nltk.stem package itself.
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
# Set of English stopwords, used by vectorspaced() for membership tests.
stopwords = set(nltk.corpus.stopwords.words('english'))
@decorators.memoize
def normalize_word(word):
    """Return the stem of the lower-cased ``word``.

    The function is wrapped in ``decorators.memoize``.
    """
    lowered = word.lower()
    return stemmer_func(lowered)
def get_words(titles):
    """Return the distinct normalized words that occur in ``titles``.

    Each title is split on whitespace and every token is passed through
    ``normalize_word``. The words are collected in a set, so the order of
    the returned list is unspecified.

    titles -- iterable of title strings.
    """
    unique_words = set()
    # Iterate over the ``titles`` argument; the earlier version read the
    # module-level ``job_titles`` and ignored its own parameter.
    for title in titles:
        unique_words.update(normalize_word(word) for word in title.split())
    return list(unique_words)
@decorators.memoize
def vectorspaced(title):
    """Encode ``title`` as a vector with one entry per item of ``words``.

    An entry is 1 when the corresponding word appears among the normalized
    tokens of ``title`` and is not in ``stopwords``; otherwise it is 0.
    ``words`` is read from module scope, so it must be assigned before the
    first call.
    """
    title_components = [normalize_word(token) for token in title.split()]
    flags = []
    for word in words:
        in_title = word in title_components
        flags.append(in_title and word not in stopwords)
    return numpy.array(flags, numpy.short)
if __name__ == '__main__':
    # Imported locally so this block carries its own dependency; io.open
    # accepts an ``encoding`` argument on both Python 2 and Python 3.
    import io

    # Read titles from 'example.txt' unless exactly one path is given.
    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # Decode as UTF-8 so the stemmer receives text instead of raw bytes
    # (byte strings with non-ASCII characters raise UnicodeDecodeError in
    # the stemmer). Blank lines are dropped here, once, so that clustering
    # and classification below operate on exactly the same titles.
    with io.open(filename, encoding='utf-8') as title_file:
        stripped_lines = (line.strip() for line in title_file)
        job_titles = [title for title in stripped_lines if title]

    # ``words`` has to be a module-level name: vectorspaced() reads it.
    words = get_words(job_titles)

    # Alternative clusterer:
    # cluster = KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(title) for title in job_titles])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(title)) for title in job_titles
    ]

    # Print "<cluster id> <title>", grouped by cluster id via the sort.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        print('%s %s' % (cluster_id, title))
| Not so skilled worker | |
| Skilled worker | |
| Banana picker | |
| Police officer | |
| Office worker | |
| Fireman | |
| IT consultant | |
| Rapist of old ladies | |
| Engineer | |
| Stupid bastard son | |
| Genious computer analyst | |
| Computer banana peeler | |
| Potato peeler | |
| CEO of a major business | |
| Business economist | |
| Data analyst | |
| Economist analyst bastard | |
| Psychologist data enumerator | |
| Psychologist genious | |
| Evil genious | |
| Murderer and rapist of cats | |
| Cat psychologist | |
| Top Software Engineer in IT with NLTK experience | |
| xim | |
| fission6 |
Stemmers have moved, line #9 should be changed to: stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
If my text file is encoded as UTF-8, the following error occurs:
Traceback (most recent call last):
File "cluster_example.py", line 40, in
words = get_words(job_titles)
File "cluster_example.py", line 20, in get_words
words.add(normalize_word(word))
File "", line 1, in
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 183, in memoize
result = func(*args)
File "cluster_example.py", line 14, in normalize_word
return stemmer_func(word.lower())
File "/usr/local/lib/python2.7/dist-packages/nltk/stem/snowball.py", line 694, in stem
word = (word.replace(u"\u2019", u"\x27")
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 13: ordinal not in range(128)
Can you suggest what to do in this case? Thank you.
Line 9 won't work with nltk==2.0.4. It needs to be changed to:
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
Lines 16-18:
def get_words(titles):
words = set()
for title in job_titles:
Shouldn't job_titles be titles?
For those suffering with UTF-8 files, a simple solution is to use codecs package to open the file:
..
import codecs
..
and replace line 39 with this:
with codecs.open(filename,encoding='latin1') as title_file:
this is a great example of clustering - thanks to xim for his excellent assistance and mentorship.