-
-
Save mamonu/dbe4884e3e18e106f8e9a2ce75e8b0b9 to your computer and use it in GitHub Desktop.
K-Means clustering by Euclidean distance, yay!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy

from nltk.cluster import KMeansClusterer, euclidean_distance
# Sample corpus: free-text job titles to be grouped by the words they share.
# The strings are data -- their exact spelling (including the repeated
# "genious") determines the vocabulary, so they are kept verbatim.
job_titles = [
    'Not so skilled worker',
    'Skilled worker',
    'Banana picker',
    'Police officer',
    'Office worker',
    'Fireman',
    'IT consultant',
    'Rapist of old ladies',
    'Engineer',
    'Stupid bastard son',
    'Genious computer analyst',
    'Computer banana peeler',
    'Potato peeler',
    'CEO of a major business',
    'Business economist',
    'Data analyst',
    'Economist analyst bastard',
    'Psychologist data enumerator',
    'Psychologist genious',
    'Evil genious',
    'Murderer and rapist of cats',
    'Cat psychologist',
    'Top Software Engineer in IT with NLTK experience',
    'xim',
    'fission6',
]
# Vocabulary: every distinct lower-cased, whitespace-separated word that
# occurs in any job title. Each entry becomes one dimension of the vectors
# built by vectorspaced().
# Sorted so that a given dimension maps to the same word on every run; the
# iteration order of a set of strings is not guaranteed to be stable between
# interpreter runs.
words = sorted({word.lower() for title in job_titles for word in title.split()})
def vectorspaced(title, vocabulary=None):
    """Return a binary bag-of-words vector for *title*.

    The title is split on whitespace and lower-cased; the result has one
    entry per vocabulary word, 1 if that word occurs in the title and 0
    otherwise.

    Args:
        title: The string to vectorise.
        vocabulary: Sequence of lower-cased words defining the vector
            dimensions, in order. Defaults to the module-level ``words``.

    Returns:
        A one-dimensional ``numpy`` array of 0/1 integers with
        ``len(vocabulary)`` entries.
    """
    if vocabulary is None:
        # Looked up at call time so the module-level vocabulary is used.
        vocabulary = words
    # A set gives constant-time membership tests for each vocabulary word.
    title_components = {word.lower() for word in title.split()}
    return numpy.array(
        [1 if word in title_components else 0 for word in vocabulary]
    )
# Group the job titles into 5 clusters using k-means with Euclidean distance.
cluster = KMeansClusterer(5, euclidean_distance)

# With assign_clusters=True, cluster() returns the cluster index assigned to
# each input vector, so the training titles do not need to be vectorised and
# classified a second time.
# NOTE: cluster.classify should only be called when you are classifying
# previously unseen examples.
classified_examples = cluster.cluster(
    [vectorspaced(title) for title in job_titles],
    assign_clusters=True,
)

# Print "<cluster index> <title>", ordered by cluster index and then title.
# The loop variable is deliberately not named `cluster`, so the clusterer
# object above is not overwritten and stays usable afterwards.
for cluster_id, title in sorted(zip(classified_examples, job_titles)):
    # Formatting into a single string prints the same text on Python 2 and 3.
    print("%s %s" % (cluster_id, title))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment