Last active
October 20, 2023 20:06
-
-
Save Vini2/0dc28a1d8b7d78cf30b3d633cd62c271 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Clustering using convex hulls - high dimensional data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Imports" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np\n", | |
"import seaborn as sns\n", | |
"\n", | |
"from sklearn.datasets import make_blobs\n", | |
"from mpl_toolkits.mplot3d import Axes3D\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.cluster import KMeans\n", | |
"from itertools import permutations\n", | |
"from scipy.spatial import ConvexHull\n", | |
"from quadprog import solve_qp\n", | |
"from sklearn.metrics import accuracy_score" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Creating dataset \n", | |
"centers = [[0.8, 0.5, 0.7, 0.6, 0.5, 0.5, 0.3, 0.3], \n", | |
" [0.2, 0.2, 0.2, 0.1, 0.1, 0.2, 0.3, 0.2], \n", | |
" [0.1, -0.1, -0.1, -0.2, -0.2, -0.1, 0, 0.1]]\n", | |
"stds = [0.4, 0.3, 0.4] \n", | |
"\n", | |
"X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=stds, random_state=0) \n", | |
"point_indices = np.arange(1000)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get training and testing datasets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_seeds, X_rest, y_seeds, y_rest, id_seeds, id_rest = train_test_split(X, labels_true, point_indices, test_size=0.33, random_state=42)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Remap clustering result to original labels" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def remap_labels(pred_labels, true_labels):\n", | |
" \"\"\"Rename prediction labels (clustered output) to best match true labels.\"\"\"\n", | |
" # from itertools import permutations # import this into script.\n", | |
" pred_labels, true_labels = np.array(pred_labels), np.array(true_labels)\n", | |
" assert pred_labels.ndim == 1 == true_labels.ndim\n", | |
" assert len(pred_labels) == len(true_labels)\n", | |
" cluster_names = np.unique(pred_labels)\n", | |
" accuracy = 0\n", | |
"\n", | |
" perms = np.array(list(permutations(np.unique(true_labels))))\n", | |
"\n", | |
" remapped_labels = true_labels\n", | |
" for perm in perms:\n", | |
" flipped_labels = np.zeros(len(true_labels))\n", | |
" for label_index, label in enumerate(cluster_names):\n", | |
" flipped_labels[pred_labels == label] = perm[label_index]\n", | |
"\n", | |
" testAcc = np.sum(flipped_labels == true_labels) / len(true_labels)\n", | |
" if testAcc > accuracy:\n", | |
" accuracy = testAcc\n", | |
" remapped_labels = flipped_labels\n", | |
"\n", | |
" return accuracy, remapped_labels" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get initial clustering using K-means" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.8641791044776119\n" | |
] | |
} | |
], | |
"source": [ | |
"kmeans = KMeans(n_clusters=3, random_state=9).fit(X_seeds)\n", | |
"initial_result = kmeans.labels_\n", | |
"\n", | |
"intial_accuracy, remapped_initial_result = remap_labels(initial_result, y_seeds)\n", | |
"print(intial_accuracy)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get convex hulls of the initial clustering" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"dict_keys([2, 1, 0])\n" | |
] | |
} | |
], | |
"source": [ | |
"# Get the idices of the data points belonging to each cluster\n", | |
"indices = {}\n", | |
"\n", | |
"for i in range(len(id_seeds)):\n", | |
" if int(remapped_initial_result[i]) not in indices:\n", | |
" indices[int(remapped_initial_result[i])] = [i]\n", | |
" else:\n", | |
" indices[int(remapped_initial_result[i])].append(i)\n", | |
"\n", | |
"# Get convex hulls for each cluster\n", | |
"hulls = {}\n", | |
"\n", | |
"for i in indices:\n", | |
" hull = ConvexHull(X_seeds[indices[i]])\n", | |
" hulls[i] = hull\n", | |
"\n", | |
"print(hulls.keys())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Assign remaining points to the cluster of the closest convex hull" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def point_in_hull(point, hull, tolerance=1e-12):\n", | |
" return all(\n", | |
" (np.dot(eq[:-1], point) + eq[-1] <= tolerance) for eq in hull.equations)\n", | |
"\n", | |
"def proj2hull(z, equations):\n", | |
" \"\"\"\n", | |
" Project `z` to the convex hull defined by the\n", | |
" hyperplane equations of the facets\n", | |
"\n", | |
" Arguments\n", | |
" z: array, shape (ndim,)\n", | |
" equations: array shape (nfacets, ndim + 1)\n", | |
"\n", | |
" Returns\n", | |
" x: array, shape (ndim,)\n", | |
" \"\"\"\n", | |
" G = np.eye(len(z), dtype=float)\n", | |
" a = np.array(z, dtype=float)\n", | |
" C = np.array(-equations[:, :-1], dtype=float)\n", | |
" b = np.array(equations[:, -1], dtype=float)\n", | |
" x, f, xu, itr, lag, act = solve_qp(G, a, C.T, b, meq=0, factorized=True)\n", | |
" return x" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"prediction = []\n", | |
"\n", | |
"for z1 in X_rest:\n", | |
" \n", | |
" min_cluster_distance = 100000\n", | |
" min_distance_point = \"\"\n", | |
" min_cluster_distance_hull = \"\"\n", | |
" \n", | |
" for i in indices:\n", | |
"\n", | |
" p = proj2hull(z1, hulls[i].equations)\n", | |
"\n", | |
" dist = np.linalg.norm(z1-p)\n", | |
"\n", | |
" if dist < min_cluster_distance:\n", | |
" min_cluster_distance = dist\n", | |
" min_distance_point = p\n", | |
" min_cluster_distance_hull = i\n", | |
" \n", | |
" prediction.append(min_cluster_distance_hull)\n", | |
"\n", | |
"prediction = np.array(prediction)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Evaluate final result" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy of Convex Hull method: 0.867\n" | |
] | |
} | |
], | |
"source": [ | |
"Y_pred = np.concatenate((remapped_initial_result, prediction))\n", | |
"Y_real = np.concatenate((y_seeds, y_rest))\n", | |
"\n", | |
"final_accuracy, remapped_final_result = remap_labels(Y_pred, Y_real)\n", | |
"\n", | |
"print(\"Accuracy of Convex Hull method:\", final_accuracy)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.867" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"accuracy_score(Y_real, Y_pred)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Run K-means on the entire dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy of K-means method: 0.866\n" | |
] | |
} | |
], | |
"source": [ | |
"kmeans = KMeans(n_clusters=3, random_state=9).fit(X)\n", | |
"result = kmeans.labels_\n", | |
"\n", | |
"accuracy, remapped_result = remap_labels(result, labels_true)\n", | |
"\n", | |
"print(\"Accuracy of K-means method:\", accuracy)\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment