Garfounkel · July 8, 2020 23:50
diff --git a/cluster_topics.py b/cluster_topics.py
 import numpy as np


 # Column indices of every centroid, ordered from largest weight to smallest.
 sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
 # Vocabulary of the vectorizer; indexed below with those column indices.
 terms = cv.get_feature_names()

 # Keep the 100 heaviest term indices of each cluster.
 # NOTE(review): .get() presumably copies a device (CuPy) array to host so
 # the NumPy call below can consume it -- confirm.
 clusters_terms = sorted_centroids[:, :100].get()

 for i, own_terms in enumerate(clusters_terms):
     # Gather every term index that appears in some other cluster's top 100.
     taken = set()
     for j, other_terms in enumerate(clusters_terms):
         if j != i:
             taken.update(other_terms)
     exclusive = [t for t in own_terms if t not in taken]
     # Filter through a mask so the survivors keep their centroid ranking,
     # then show at most five of them.
     cluster = own_terms[np.isin(own_terms, exclusive)][:5]
     print(f'Cluster {i}:', ' | '.join(terms[cluster].tolist()))
diff --git a/kmeans.py b/kmeans.py
 from cuml.cluster import KMeans


 # Clustering hyper-parameters shared with the later plotting step.
 num_clusters = 10
 sample_size = 100_000

 # Work on the first `sample_size` rows only: the tweets get a fresh
 # 0-based index and the matching TF-IDF rows are densified.
 sample_tweets = tweets[:sample_size].reset_index(drop=True)
 sample_data = tfidf_matrix[:sample_size].todense()

 kmeans_model = KMeans(
     n_clusters=num_clusters,
     n_init=1,
     max_iter=1000,
 )

 # Fit once, then reuse the fitted estimator for the per-row cluster labels
 # and for the transformed representation fed to t-SNE.
 kmeans = kmeans_model.fit(sample_data)
 kmeans_clusters = kmeans.predict(sample_data)
 kmeans_distances = kmeans.transform(sample_data)
diff --git a/plotting.py b/plotting.py
 import bokeh.plotting as bp
 from bokeh.palettes import Turbo256
 from bokeh.models import HoverTool


 # setup data
 # Spread the cluster ids evenly across the Turbo256 palette.
 step = len(Turbo256) / num_clusters
 point_colors = [Turbo256[int(label * step)] for label in kmeans_clusters.tolist()]

 kmeans_df = DataFrame(tsne_kmeans, columns=['x', 'y'])
 kmeans_df['cluster'] = kmeans_clusters
 kmeans_df['tweets'] = sample_tweets
 kmeans_df['color'] = point_colors
 # The frame is handed to Bokeh below as a pandas DataFrame.
 kmeans_df = kmeans_df.to_pandas()

 # setup plot
 plot_kmeans = bp.figure(
     title="KMeans clustering of item description",
     plot_width=700,
     plot_height=600,
     tools="pan,wheel_zoom,box_zoom,reset,hover",
     x_axis_type=None,
     y_axis_type=None,
     min_border=1,
 )

 # display plot and tooltips
 plot_kmeans.scatter(x='x', y='y', color='color', source=kmeans_df)
 # Look up the hover tool requested in `tools` and point it at the
 # 'tweets' and 'cluster' columns.
 hover = plot_kmeans.select({'type': HoverTool})
 hover.tooltips = {"tweets": "@tweets", "cluster": "@cluster"}
 bp.show(plot_kmeans)
diff --git a/sample.py b/sample.py
 # Draw a random row from the tweets assigned to cluster 4. The result is not
 # bound to a name, so it is only visible in an interactive session.
 in_cluster = kmeans_clusters == 4
 sample_tweets[in_cluster].to_pandas().sample()
diff --git a/tsne.py b/tsne.py
 from cuml.manifold import TSNE


 # Reduce the k-means transform output to two components for plotting;
 # the seed is fixed via random_state.
 tsne_model = TSNE(
     n_components=2,
     n_iter=1000,
     random_state=42,
     verbose=1,
 )
 tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
diff --git a/vectorize.py b/vectorize.py
 # Token counts over the tweets: English stop words removed, vocabulary
 # capped at 18000 features.
 cv = CountVectorizer(
     stop_words='english',
     max_features=18000,
 )
 count = cv.fit_transform(tweets)

 # Re-weight the raw counts with TF-IDF.
 tf = TfidfTransformer()
 tfidf_matrix = tf.fit_transform(count)
	import numpy as np


	# Column indices of every centroid, ordered from largest weight to smallest.
	sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
	# Vocabulary of the vectorizer; indexed below with those column indices.
	terms = cv.get_feature_names()

	# Keep the 100 heaviest term indices of each cluster.
	# NOTE(review): .get() presumably copies a device (CuPy) array to host so
	# the NumPy call below can consume it -- confirm.
	clusters_terms = sorted_centroids[:, :100].get()

	# For each cluster, print up to five of its top terms that appear in no
	# other cluster's top 100. The nesting below was restored: the loop and
	# branch bodies had lost their indentation, which is a syntax error.
	for i, c1 in enumerate(clusters_terms):
	    cluster = set(c1)
	    for j, c2 in enumerate(clusters_terms):
	        if i == j:
	            # Never subtract a cluster's terms from itself.
	            continue
	        cluster -= set(c2)
	    # Mask c1 (rather than iterating the set) so the surviving terms keep
	    # their centroid ranking.
	    cluster = c1[np.isin(c1, list(cluster))][:5]
	    # Separator is a plain ' | '; the stray backslash was an invalid escape
	    # sequence that ended up printed literally.
	    print(f'Cluster {i}:', ' | '.join(terms[cluster].tolist()))
	from cuml.cluster import KMeans


	# Clustering hyper-parameters shared with the later plotting step.
	num_clusters = 10
	sample_size = 100_000

	# Work on the first `sample_size` rows only: the tweets get a fresh
	# 0-based index and the matching TF-IDF rows are densified.
	sample_tweets = tweets[:sample_size].reset_index(drop=True)
	sample_data = tfidf_matrix[:sample_size].todense()

	kmeans_model = KMeans(
	    n_clusters=num_clusters,
	    n_init=1,
	    max_iter=1000,
	)

	# Fit once, then reuse the fitted estimator for the per-row cluster labels
	# and for the transformed representation fed to t-SNE.
	kmeans = kmeans_model.fit(sample_data)
	kmeans_clusters = kmeans.predict(sample_data)
	kmeans_distances = kmeans.transform(sample_data)
	import bokeh.plotting as bp
	from bokeh.palettes import Turbo256
	from bokeh.models import HoverTool


	# setup data
	# Spread the cluster ids evenly across the Turbo256 palette.
	step = len(Turbo256) / num_clusters
	point_colors = [Turbo256[int(label * step)] for label in kmeans_clusters.tolist()]

	kmeans_df = DataFrame(tsne_kmeans, columns=['x', 'y'])
	kmeans_df['cluster'] = kmeans_clusters
	kmeans_df['tweets'] = sample_tweets
	kmeans_df['color'] = point_colors
	# The frame is handed to Bokeh below as a pandas DataFrame.
	kmeans_df = kmeans_df.to_pandas()

	# setup plot
	plot_kmeans = bp.figure(
	    title="KMeans clustering of item description",
	    plot_width=700,
	    plot_height=600,
	    tools="pan,wheel_zoom,box_zoom,reset,hover",
	    x_axis_type=None,
	    y_axis_type=None,
	    min_border=1,
	)

	# display plot and tooltips
	plot_kmeans.scatter(x='x', y='y', color='color', source=kmeans_df)
	# Look up the hover tool requested in `tools` and point it at the
	# 'tweets' and 'cluster' columns.
	hover = plot_kmeans.select({'type': HoverTool})
	hover.tooltips = {"tweets": "@tweets", "cluster": "@cluster"}
	bp.show(plot_kmeans)
	from cuml.manifold import TSNE


	# Reduce the k-means transform output to two components for plotting;
	# the seed is fixed via random_state.
	tsne_model = TSNE(
	    n_components=2,
	    n_iter=1000,
	    random_state=42,
	    verbose=1,
	)
	tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
	# Token counts over the tweets: English stop words removed, vocabulary
	# capped at 18000 features.
	cv = CountVectorizer(
	    stop_words='english',
	    max_features=18000,
	)
	count = cv.fit_transform(tweets)

	# Re-weight the raw counts with TF-IDF.
	tf = TfidfTransformer()
	tfidf_matrix = tf.fit_transform(count)