Last active
          July 8, 2020 23:50 
        
      - 
      
- 
        Save Garfounkel/3074c6a8b113e8db527740bcc1e86b2c to your computer and use it in GitHub Desktop. 
    Nvidia NLP blog clustering
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  import numpy as np

  # For every centroid, feature indices ordered from largest to smallest value.
  sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
  terms = cv.get_feature_names()
  # Only the 100 leading indices per cluster are kept; .get() is applied to the
  # slice before iterating (presumably a device-to-host copy -- confirm).
  clusters_terms = sorted_centroids[:, :100].get()

  for i, own_ids in enumerate(clusters_terms):
      # Remove every index that also appears in another cluster's leading 100.
      exclusive_ids = set(own_ids)
      for j, other_ids in enumerate(clusters_terms):
          if j != i:
              exclusive_ids -= set(other_ids)
      # Filter through the original row so the ranking order is preserved,
      # then keep the first five survivors.
      keep_mask = np.isin(own_ids, list(exclusive_ids))
      top_ids = own_ids[keep_mask][:5]
      print(f'Cluster {i}:', ' | '.join(terms[top_ids].tolist()))
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  from cuml.cluster import KMeans

  num_clusters = 10
  sample_size = 100_000

  # Restrict both the TF-IDF rows and the tweets to the first `sample_size`
  # entries; the tweets get a fresh 0-based index.
  sample_data = tfidf_matrix[:sample_size].todense()
  sample_tweets = tweets[:sample_size].reset_index(drop=True)

  # Fit once on the sample, then reuse the fitted estimator for both the
  # cluster assignments and the transform() output.
  kmeans_model = KMeans(n_clusters=num_clusters, n_init=1, max_iter=1000)
  kmeans = kmeans_model.fit(sample_data)
  kmeans_clusters = kmeans.predict(sample_data)
  kmeans_distances = kmeans.transform(sample_data)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  import bokeh.plotting as bp
  from bokeh.palettes import Turbo256
  from bokeh.models import HoverTool

  # Data: one row per sample with its 2-D coordinates, cluster id, tweet text
  # and a palette colour derived from the cluster id.
  step = len(Turbo256) / num_clusters  # spacing between palette entries per cluster
  kmeans_df = DataFrame(tsne_kmeans, columns=['x', 'y'])
  kmeans_df['cluster'] = kmeans_clusters
  kmeans_df['tweets'] = sample_tweets
  cluster_ids = kmeans_clusters.tolist()
  kmeans_df['color'] = [Turbo256[int(cid * step)] for cid in cluster_ids]
  # The frame is converted to pandas before being handed to bokeh as `source`.
  kmeans_df = kmeans_df.to_pandas()

  # Plot: axes are disabled (axis types set to None) and the hover tool is
  # requested through the `tools` string.
  figure_options = dict(
      plot_width=700,
      plot_height=600,
      title="KMeans clustering of item description",
      tools="pan,wheel_zoom,box_zoom,reset,hover",
      x_axis_type=None,
      y_axis_type=None,
      min_border=1,
  )
  plot_kmeans = bp.figure(**figure_options)

  # Render the points, then configure the tooltips on the selected hover tool.
  plot_kmeans.scatter(x='x', y='y', color='color', source=kmeans_df)
  hover = plot_kmeans.select({'type': HoverTool})
  hover.tooltips = {"tweets": "@tweets", "cluster": "@cluster"}
  bp.show(plot_kmeans)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Select the tweets assigned to cluster 4, convert them to pandas and draw a
  # sample from them (pandas' sample() returns a single row by default).
  in_cluster_4 = kmeans_clusters == 4
  sample_tweets[in_cluster_4].to_pandas().sample()
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  from cuml.manifold import TSNE

  # Reduce `kmeans_distances` to two components with a fixed random seed.
  tsne_settings = dict(n_components=2, verbose=1, random_state=42, n_iter=1000)
  tsne_model = TSNE(**tsne_settings)
  tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  # Term counts over the tweets: at most 18000 features, English stop words
  # excluded.
  cv = CountVectorizer(stop_words='english', max_features=18000)
  count = cv.fit_transform(tweets)

  # Turn the count matrix into a TF-IDF matrix.
  tf = TfidfTransformer()
  tfidf_matrix = tf.fit_transform(count)
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment