Skip to content

Instantly share code, notes, and snippets.

@ahue
Last active May 4, 2021 08:40
Show Gist options
  • Save ahue/3a538e1a7c01f9ac13e71f38d91c0f03 to your computer and use it in GitHub Desktop.
Save ahue/3a538e1a7c01f9ac13e71f38d91c0f03 to your computer and use it in GitHub Desktop.
Rearranges the correlation matrix, corr_array, so that groups of highly correlated variables are next to each other
import numpy as np
import pandas as pd
import scipy
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
def cluster_corr(corr_array, inplace=False, impute_nan=False):
    """
    Reorder a correlation matrix so that groups of highly correlated
    variables are next to each other.

    Original author: @WYegelwel (https://wil.yegelwel.com/cluster-correlation-matrix/)

    Rows are grouped by complete-linkage hierarchical clustering of their
    pairwise distances (``pdist`` default metric), the dendrogram is cut at
    half of the largest pairwise distance, and rows are then ordered by
    cluster label. For a square matrix the same ordering is applied to the
    columns; for an MxN matrix with M != N the columns are clustered
    independently.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a MxN correlation matrix
    inplace : bool
        Kept for backward compatibility only. The input is never modified;
        a reordered copy is returned regardless of this flag.
    impute_nan : bool
        When True, nan values are replaced with the mean of the non-nan
        values of the matrix. The imputation is used for clustering only;
        the returned matrix keeps the original values (including nan).

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        a MxN correlation matrix with the columns and rows rearranged

    Raises
    ------
    ValueError
        Propagated from SciPy's ``linkage`` when the pairwise distances are
        not all finite, e.g. nan values are present and ``impute_nan`` is
        False.
    """
    is_frame = isinstance(corr_array, pd.DataFrame)

    # Cluster on a private float copy so that imputation never leaks into
    # the caller's data and np.isnan works for object-dtype input as well.
    if is_frame:
        values = corr_array.to_numpy(dtype=float, copy=True)
    else:
        values = np.array(corr_array, dtype=float, copy=True)

    if impute_nan:
        values[np.isnan(values)] = np.nanmean(values)

    def compute_idx(arr):
        """Return the row permutation that groups the rows of ``arr``."""
        n_obs = arr.shape[0]
        if n_obs < 2:
            # Nothing to cluster: pdist would be empty and linkage would
            # raise, so the identity ordering is the only sensible answer.
            return np.arange(n_obs)
        pairwise_distances = pdist(arr)
        linkage = sch.linkage(pairwise_distances, method='complete')
        cluster_distance_threshold = pairwise_distances.max() / 2
        idx_to_cluster_array = sch.fcluster(linkage,
                                            cluster_distance_threshold,
                                            criterion='distance')
        # A stable sort keeps the original relative order inside each
        # cluster, which makes the result reproducible across NumPy builds.
        return np.argsort(idx_to_cluster_array, kind='stable')

    row_order = compute_idx(values)
    if values.shape[0] == values.shape[1]:
        col_order = row_order
    else:
        # in case we have an MxN array where M != N
        col_order = compute_idx(values.T)

    # Positional indexing with integer arrays already yields a new object,
    # so no defensive copy of the input is required beforehand.
    if is_frame:
        return corr_array.iloc[row_order, :].iloc[:, col_order]
    return corr_array[row_order, :][:, col_order]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment