-
-
Save allenday/2f302524c16157828c1ef6eae46cfa1f to your computer and use it in GitHub Desktop.
Efficient python implementation of canopy clustering. (A method for efficiently generating centroids and clusters, most commonly as input to a more robust clustering algorithm.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics.pairwise import pairwise_distances | |
import numpy as np | |
# X shoudl be a numpy matrix, very likely sparse matrix: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix | |
# T1 > T2 for overlapping clusters | |
# T1 = Distance to centroid point to not include in other clusters | |
# T2 = Distance to centroid point to include in cluster | |
# T1 > T2 for overlapping clusters | |
# T1 < T2 will have points which reside in no clusters | |
# T1 == T2 will cause all points to reside in mutually exclusive clusters | |
# Distance metric can be any from here: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html | |
# filemap may be a list of point names in their order in X. If included, row numbers from X will be replaced with names from filemap. | |
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None): | |
canopies = dict() | |
X1_dist = pairwise_distances(X, metric=distance_metric) | |
canopy_points = set(range(X.shape[0])) | |
while canopy_points: | |
point = canopy_points.pop() | |
i = len(canopies) | |
canopies[i] = {"c":point, "points": list(np.where(X1_dist[point] < T2)[0])} | |
canopy_points = canopy_points.difference(set(np.where(X1_dist[point] < T1)[0])) | |
if filemap: | |
for canopy_id in canopies.keys(): | |
canopy = canopies.pop(canopy_id) | |
canopy2 = {"c":filemap[canopy['c']], "points":list()} | |
for point in canopy['points']: | |
canopy2["points"].append(filemap[point]) | |
canopies[canopy_id] = canopy2 | |
return canopies |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment