Last active
May 4, 2021 08:40
-
-
Save ahue/3a538e1a7c01f9ac13e71f38d91c0f03 to your computer and use it in GitHub Desktop.
Rearranges the correlation matrix, corr_array, so that groups of highly correlated variables are next to each other
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import scipy
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
def _cluster_order(observations):
    """Return a row ordering that places rows of the same cluster together.

    Rows are compared with ``pdist`` (Euclidean distance by default), merged
    with complete-linkage hierarchical clustering, and cut into flat clusters
    at half of the largest pairwise distance.

    Parameters
    ----------
    observations : numpy.ndarray
        2-D array whose rows are the items to cluster.

    Returns
    -------
    numpy.ndarray
        Integer positions of the rows, sorted by cluster label.
    """
    n_obs = observations.shape[0]
    if n_obs < 2:
        # With fewer than two rows there are no pairwise distances; linkage
        # and ``.max()`` would raise on the empty array, so keep the order.
        return np.arange(n_obs)

    pairwise_distances = pdist(observations)
    linkage = sch.linkage(pairwise_distances, method='complete')
    cluster_distance_threshold = pairwise_distances.max() / 2
    labels = sch.fcluster(linkage, cluster_distance_threshold,
                          criterion='distance')
    # A stable sort keeps members of one cluster in their original relative
    # order, so the result does not depend on the sort implementation.
    return np.argsort(labels, kind='stable')


def cluster_corr(corr_array, inplace=False, impute_nan=False):
    """Reorder a correlation matrix so correlated variables sit together.

    Original author: @WYegelwel (https://wil.yegelwel.com/cluster-correlation-matrix/)

    Rearranges the correlation matrix, corr_array, so that groups of highly
    correlated variables are next to each other.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a MxN correlation matrix
    inplace : bool
        Accepted for backward compatibility. The input is never modified and
        the reordered matrix is always returned as a new object, whatever
        value is passed.
    impute_nan : bool
        When True, nan values are replaced with the mean value of the matrix
        before clustering. The imputation only affects how the ordering is
        computed; the returned matrix keeps the original values (including
        any nan).

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        a MxN correlation matrix with the columns and rows rearranged

    Raises
    ------
    ValueError
        If ``corr_array`` is not two-dimensional.
    """
    # Work on a private, writable float copy: the caller's data must not be
    # touched by the imputation below, and a DataFrame's underlying buffer
    # may be read-only.
    clustering_input = np.array(corr_array, dtype=float)
    if clustering_input.ndim != 2:
        raise ValueError(
            "corr_array must be a 2-dimensional matrix, got "
            "%d dimension(s)" % clustering_input.ndim
        )

    if impute_nan:
        nan_mask = np.isnan(clustering_input)
        if nan_mask.any():
            clustering_input[nan_mask] = np.nanmean(clustering_input)

    row_order = _cluster_order(clustering_input)
    n_rows, n_cols = clustering_input.shape
    if n_rows == n_cols:
        # Square matrix: apply the same permutation to rows and columns.
        col_order = row_order
    else:
        # in case we have an MxN array where M!=N, columns are clustered
        # on their own by treating them as rows of the transpose.
        col_order = _cluster_order(clustering_input.T)

    if isinstance(corr_array, pd.DataFrame):
        # iloc with two list-likes selects the rows x columns cross product,
        # carrying the index and column labels along.
        return corr_array.iloc[row_order, col_order]
    return np.asarray(corr_array)[np.ix_(row_order, col_order)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment