Created
August 14, 2024 07:11
-
-
Save uros-r/6e6adf2cf2df8eb4dfe473a077ed89ca to your computer and use it in GitHub Desktop.
Normalise Pandas column names to snake case
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is released into the public domain under the Creative Commons Zero (CC0) license.
# You can copy, modify, distribute, and perform the work, even for commercial purposes, all without asking permission.
# For more information, see https://creativecommons.org/publicdomain/zero/1.0/
import re | |
from collections import defaultdict | |
def normalise_column_name(x):
    """Convert a single column label into a snake_case-style name.

    The label is lower-cased and stripped of surrounding whitespace, the
    characters ``?``, ``!``, ``(`` and ``)`` are dropped, every run of
    whitespace, ``-``, ``_``, ``/`` or ``.`` collapses into a single ``_``,
    and any leading underscores are removed.  Trailing underscores are kept.

    :param x: The column label (a string).
    :return: The normalised label.
    """
    cleaned = x.lower().strip()
    # Applied in order: each step works on the output of the previous one.
    substitutions = (
        (r"[?!()]", ""),  # drop these punctuation characters outright
        (r"[\s\t\-_/\.]+", "_"),  # collapse separator runs into one underscore
        (r"^_+", ""),  # strip leading underscores
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def normalise_column_names(df, reject_dupes=True, unmangle_numbered_dupes=False):
    """
    Return a copy of ``df`` whose column names are normalised to snake case.

    Every column name is passed through ``normalise_column_name``. Names that
    end up identical are either rejected or renamed to ``<name>_1``,
    ``<name>_2``, ... in column order. A numbered name is skipped when it is
    already in use by another column, so the result never contains duplicates.

    :param df: DataFrame to process; it is copied, the input is not modified.
    :param reject_dupes: If true, raise when normalisation yields duplicate
        names; otherwise disambiguate them with numeric suffixes.
    :param unmangle_numbered_dupes: This reverses the dupe_col.1 numbering scheme of pd.read_csv, so we can apply our own
    :return: The copy of ``df`` with the new column names.
    :raises ValueError: If duplicate names exist and ``reject_dupes`` is true.
    """
    df = df.copy()
    if unmangle_numbered_dupes:
        # Drop one trailing ".<digits>" group so mangled dupes collapse back
        # onto their base name before normalisation.
        df.columns = [re.sub(r"\.\d+$", "", col) for col in df.columns]
    df.columns = [normalise_column_name(col) for col in df.columns]

    # keep=False flags every occurrence of a repeated name, not just the later ones.
    duplicate_cols = set(df.columns[df.columns.duplicated(keep=False)])
    if not duplicate_cols:
        return df
    if reject_dupes:
        raise ValueError("One or more column names are duplicated")

    # Names that stay untouched. A generated "<name>_<n>" must not land on one
    # of these (e.g. columns a, a, a_1), or the output would still have dupes.
    taken = {col for col in df.columns if col not in duplicate_cols}
    dupe_col_count = defaultdict(int)
    new_cols = []
    for col in df.columns:
        if col not in duplicate_cols:
            new_cols.append(col)
            continue
        # Advance the per-name counter until the numbered name is free.
        while True:
            dupe_col_count[col] += 1
            candidate = f"{col}_{dupe_col_count[col]}"
            if candidate not in taken:
                break
        taken.add(candidate)
        new_cols.append(candidate)
    df.columns = new_cols
    return df
## | |
## Tests | |
## | |
import pandas as pd | |
import pytest | |
def test_normalise_column_names():
    """Mixed-case, padded and punctuated headers are normalised as expected."""
    raw_headers = ["A", "_B-", " cc!! /", "DDdd", " e e e ", "f.x"]
    expected = ["a", "b_", "cc_", "dddd", "e_e_e", "f_x"]
    normalised = normalise_column_names(pd.DataFrame(columns=raw_headers))
    assert normalised.columns.tolist() == expected
def test_normalise_column_names_reject_duplicate_cols():
    """With default arguments, headers that normalise to the same name raise."""
    headers = ["A", "B", "A"]
    frame = pd.DataFrame(columns=headers)
    with pytest.raises(ValueError, match=r".*?duplicate.*?"):
        normalise_column_names(frame)
def test_normalise_column_names_number_duplicate_cols():
    """With reject_dupes=False, repeated names get _1, _2, ... in column order."""
    frame = pd.DataFrame(columns=["A", "B", "A"])
    renamed = normalise_column_names(frame, reject_dupes=False)
    expected = ["a_1", "b", "a_2"]
    assert renamed.columns.tolist() == expected
def test_normalise_column_names_unmangle_dupe_col_names():
    """A trailing ".<n>" suffix is removed before duplicates are renumbered."""
    frame = pd.DataFrame(columns=["A", "B", "A.1", "A", "B.22.33"])
    renamed = normalise_column_names(
        frame,
        reject_dupes=False,
        unmangle_numbered_dupes=True,
    )
    # Only the last ".<digits>" group is dropped, so "B.22.33" ends up as "b_22".
    expected = ["a_1", "b", "a_2", "a_3", "b_22"]
    assert renamed.columns.tolist() == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment