# Source: GitHub Gist nsaje/9069439 by @nsaje, created February 18, 2014 11:44.
# (GitHub page navigation text that preceded the script has been removed.)
#!/usr/bin/env python
import mrjob
from mrjob.job import MRJob
import os
import sys
import numpy as np
class MRkMeans(MRJob):
    """One k-means iteration expressed as a single mrjob map/reduce step.

    Mappers load the current centroids from the file named by the
    ``CENTROID_FILE`` environment variable and assign each input point (a
    line of whitespace-separated floats) to the centroid with the smallest
    squared Euclidean distance. Reducers average the points of each cluster
    and emit the new centroid as a line of space-separated floats. The sum of
    all points' squared distances to their assigned centroid is written to
    stderr as ``min_sq_dist <total>``.
    """

    # Set by mapper_init(): array holding one centroid per row.
    centroids = None
    # Output lines carry only the value (the centroid), not the key.
    OUTPUT_PROTOCOL = mrjob.protocol.RawValueProtocol

    def mapper_init(self):
        """Load the current centroids before any input line is mapped.

        Raises:
            KeyError: if the ``CENTROID_FILE`` environment variable is unset.
        """
        centroid_file = os.environ['CENTROID_FILE']
        with open(centroid_file, 'r') as f:
            # Convert each non-blank line into one centroid row. split()
            # without arguments tolerates repeated spaces and the trailing
            # newline.
            rows = [[float(x) for x in line.split()]
                    for line in f if line.strip()]
        self.centroids = np.array(rows)

    def mapper(self, _, value):
        """Assign one point to its nearest centroid.

        Args:
            _: input key, unused.
            value: one input line of whitespace-separated floats.

        Yields:
            ``("min_sq_dist", d)`` where ``d`` is the squared distance to the
            nearest centroid, then ``(index, point_line)`` where ``index`` is
            that centroid's row number.
        """
        fields = value.split()
        if not fields:
            # A blank line carries no point; skip it instead of crashing.
            return
        point = np.array([float(x) for x in fields])
        square_distances = np.sum(np.power(self.centroids - point, 2), 1)
        # Plain int/float (not numpy scalars) so the intermediate key/value
        # can be serialized on Python 3 as well as Python 2.
        min_idx = int(np.argmin(square_distances))
        yield "min_sq_dist", float(square_distances[min_idx])
        # repr() keeps full float precision on Python 2 (str() rounds there).
        yield min_idx, ' '.join(repr(c) for c in point.tolist())

    def reducer(self, key, values):
        """Recompute one centroid, or report the total squared distance.

        Args:
            key: either the string ``"min_sq_dist"`` or a centroid index.
            values: squared distances for the ``"min_sq_dist"`` key, otherwise
                the point lines assigned to that centroid.

        Yields:
            ``('', centroid_line)`` for centroid keys; nothing for the
            ``"min_sq_dist"`` key, whose total goes to stderr instead.
        """
        if key == "min_sq_dist":
            # Sum as floats: truncating each distance to int would
            # under-report the clustering objective.
            total = float(np.sum([float(v) for v in values]))
            sys.stderr.write("min_sq_dist %s\n" % total)
        else:
            cluster = np.array(
                [[float(x) for x in point.split()] for point in values])
            # Column-wise mean of the cluster's points.
            new_centroid = np.sum(cluster, 0) / float(np.shape(cluster)[0])
            yield '', ' '.join(repr(c) for c in new_centroid.tolist())
def _read_vectors(path):
    """Read whitespace-separated floats from *path*, one row per non-blank line."""
    with open(path, 'r') as f:
        return np.array(
            [[float(x) for x in line.split()] for line in f if line.strip()])


def get_2nd_doc_tags(centroids_file, data_file, vocab_file,
                     doc_index=1, num_tags=10):
    """Print the top vocabulary terms of the centroid nearest to a document.

    The document at row ``doc_index`` of ``data_file`` is matched to the
    centroid with the smallest squared Euclidean distance; the vocabulary
    entries for that centroid's largest components are printed, largest
    first.

    Args:
        centroids_file: path to a text file with one centroid per line
            (whitespace-separated floats).
        data_file: path to a text file with one document vector per line, in
            the same format.
        vocab_file: path to a text file with one vocabulary term per line;
            line ``i`` names component ``i`` of the vectors.
        doc_index: row of ``data_file`` to look up. Defaults to 1, i.e. the
            second document.
        num_tags: maximum number of terms to print. Defaults to 10.

    Raises:
        IndexError: if ``data_file`` has no row ``doc_index``.
    """
    data = _read_vectors(data_file)
    centroids = _read_vectors(centroids_file)
    with open(vocab_file, 'r') as f:
        vocab = np.array([term.strip() for term in f])

    square_distances = np.sum(np.power(centroids - data[doc_index], 2), 1)
    centroid_idx = np.argmin(square_distances)
    # Negate so the ascending argsort ranks the largest components first.
    tags = (-centroids[centroid_idx, :]).argsort()[:num_tags]
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print(vocab[tags])
# Script entry point: run the k-means job through mrjob's launcher.
if __name__ == "__main__":
    MRkMeans.run()
    # Alternative manual check, kept disabled:
    # get_2nd_doc_tags('output/20', 'data.txt', 'vocab.txt')
# (GitHub page footer: sign in to GitHub to comment on this gist.)