Last active
July 8, 2020 23:50
-
-
Save Garfounkel/3074c6a8b113e8db527740bcc1e86b2c to your computer and use it in GitHub Desktop.
Nvidia NLP blog clustering
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np

# For every centroid, feature indices ordered from the largest centroid
# coordinate to the smallest (ascending argsort, reversed along axis 1).
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = cv.get_feature_names()

# Keep the 100 leading indices per cluster.
# NOTE(review): `.get()` presumably copies a device array to host memory --
# confirm the centroids are a CuPy array.
clusters_terms = sorted_centroids[:, :100].get()

# One index set per cluster, built once and reused for every comparison.
term_sets = [set(row) for row in clusters_terms]

for i, c1 in enumerate(clusters_terms):
    # Indices that also appear in the top 100 of any other cluster.
    shared = set()
    for j, other in enumerate(term_sets):
        if j != i:
            shared |= other
    # Drop the shared indices while keeping c1's ranking, then keep the
    # first five that remain.
    cluster = c1[~np.isin(c1, list(shared))][:5]
    print(f'Cluster {i}:', ' | '.join(terms[cluster].tolist()))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from cuml.cluster import KMeans

# Clustering settings.
num_clusters = 10
sample_size = 100_000

# Restrict both the tweets and their TF-IDF rows to the first
# `sample_size` entries so the two stay aligned by position.
sample_tweets = tweets[:sample_size].reset_index(drop=True)
sample_data = tfidf_matrix[:sample_size].todense()

kmeans_model = KMeans(
    n_clusters=num_clusters,
    n_init=1,
    max_iter=1000,
)
kmeans = kmeans_model.fit(sample_data)

# Per-sample outputs of the fitted model: `transform` output is fed to
# t-SNE elsewhere, `predict` output is used as the cluster label.
kmeans_distances = kmeans.transform(sample_data)
kmeans_clusters = kmeans.predict(sample_data)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bokeh.plotting as bp
from bokeh.palettes import Turbo256
from bokeh.models import HoverTool

# setup data
# Spread the clusters over the palette: cluster i uses entry int(i * step).
step = len(Turbo256) / num_clusters
cluster_ids = kmeans_clusters.tolist()
point_colors = [Turbo256[int(cluster_id * step)] for cluster_id in cluster_ids]

# NOTE(review): `DataFrame` is not imported in this snippet; the later
# `.to_pandas()` call suggests it is cudf.DataFrame -- confirm.
kmeans_df = DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['tweets'] = sample_tweets
kmeans_df['color'] = point_colors
kmeans_df = kmeans_df.to_pandas()

# setup plot
plot_kmeans = bp.figure(
    title="KMeans clustering of item description",
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    plot_width=700,
    plot_height=600,
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# display plot and tooltips
plot_kmeans.scatter(x='x', y='y', color='color', source=kmeans_df)
# The hover tool was requested through `tools=` above; look it up to
# attach the per-point tooltip fields.
hover = plot_kmeans.select({'type': HoverTool})
hover.tooltips = {"tweets": "@tweets", "cluster": "@cluster"}
bp.show(plot_kmeans)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pick one random tweet among those assigned to cluster 4.
in_cluster_4 = kmeans_clusters == 4
sample_tweets[in_cluster_4].to_pandas().sample()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from cuml.manifold import TSNE

# Project the KMeans `transform` output down to two components so each
# sample can be drawn as an (x, y) point.
tsne_model = TSNE(
    n_components=2,
    n_iter=1000,
    random_state=42,
    verbose=1,
)
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Step 1: count vectorization of the tweets, limited to 18000 features
# with English stop words removed.
cv = CountVectorizer(
    stop_words='english',
    max_features=18000,
)
count = cv.fit_transform(tweets)

# Step 2: run the count matrix through a TF-IDF transformer.
tf = TfidfTransformer()
tfidf_matrix = tf.fit_transform(count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment