Skip to content

Instantly share code, notes, and snippets.

@ahue
Created May 3, 2021 11:52
Show Gist options
  • Save ahue/a27654c453bf5c89227ab72ce17c5753 to your computer and use it in GitHub Desktop.
Save ahue/a27654c453bf5c89227ab72ce17c5753 to your computer and use it in GitHub Desktop.
Analyze cardinalities of a pandas DataFrame
import pandas as pd
import seaborn as sns
def cardinalities(df: pd.DataFrame, plot: bool = False):
    """
    Analyse the pairwise cardinalities between the columns of a data frame.

    For every ordered pair of columns (i, j) the rows are grouped by the
    values of column i and the distinct values of column j are counted per
    group. If any group holds more than one distinct value the relation
    i -> j is marked "n", otherwise "1". The diagonal is always "1".

    df    DataFrame  the data frame to be analysed
    plot  bool       whether to additionally draw the matrix as a seaborn
                     heatmap ("1" is encoded as 1.0, "n" as 2.0, no colour bar)

    returns a tuple: ``(matrix,)`` when ``plot`` is false, ``(matrix, axes)``
    when it is true. ``matrix`` is a DataFrame M(i, j) of the strings
    "1" / "n" whose rows and columns are both the sorted column names of
    ``df``; it reads row-wise, i.e. for each entry in attribute i there are
    1 or n values in attribute j.
    """
    names = df.columns.sort_values()

    # Start from an all-"1" object matrix and fill it by *label*. Assembling
    # per-column rows with concat and renaming the columns positionally
    # afterwards depends on how concat orders non-aligned columns and can
    # attach results to the wrong column names.
    car = pd.DataFrame("1", index=names, columns=names, dtype=object)

    for src in names:
        # here is the heart:
        # largest number of distinct values of every other column that
        # occurs inside a single group of `src`
        widest = df.groupby(src).nunique().max()
        many = widest[widest > 1].index
        if len(many):
            car.loc[src, many] = "n"

    if plot:
        # numeric encoding for the heatmap: "1" -> 1.0, "n" -> 2.0
        shades = (car == "n").astype(float) + 1.0
        ax = sns.heatmap(shades, cbar=False)
        return (car, ax)
    return (car, )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment