Skip to content

Instantly share code, notes, and snippets.

@uros-r
Created August 14, 2024 07:11
Show Gist options
  • Save uros-r/6e6adf2cf2df8eb4dfe473a077ed89ca to your computer and use it in GitHub Desktop.
Save uros-r/6e6adf2cf2df8eb4dfe473a077ed89ca to your computer and use it in GitHub Desktop.
Normalise Pandas column names to snake case
# This code is released into the public domain under the Creative Commons Zero (CC0) license.
# You can copy, modify, distribute, and perform the work, even for commercial purposes, all without asking permission.
# For more information, see https://creativecommons.org/publicdomain/zero/1.0/
import re
from collections import defaultdict
def normalise_column_name(x):
    """
    Convert a single column label to a lower-case, underscore-separated name.

    The label is lower-cased and stripped of surrounding whitespace, then the
    substitutions below are applied in order. Only leading underscores are
    removed; a trailing underscore produced by the second step is kept.

    :param x: The original column label (a string).
    :return: The normalised label.
    """
    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r"[?!()]", ""),  # drop punctuation outright
        (r"[\s\t\-_/\.]+", "_"),  # collapse separator runs into one underscore
        (r"^_+", ""),  # strip leading underscores
    )
    name = x.lower().strip()
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name
def normalise_column_names(df, reject_dupes=True, unmangle_numbered_dupes=False):
    """
    Return a copy of ``df`` whose column names have been normalised.

    Every column name is passed through ``normalise_column_name``. The input
    DataFrame itself is not modified.

    :param df: DataFrame whose columns should be renamed.
    :param reject_dupes: If True (the default), raise ``ValueError`` when two or more columns
        share a name after normalisation. If False, every occurrence of a duplicated name gets
        a ``_<n>`` suffix, counting up from 1 in column order; a number is skipped when the
        resulting name is already used by another column.
    :param unmangle_numbered_dupes: This reverses the dupe_col.1 numbering scheme of pd.read_csv, so we can apply our own
    :return: A new DataFrame with the renamed columns.
    :raises ValueError: If duplicated names are found and ``reject_dupes`` is True.
    """
    df = df.copy()
    if unmangle_numbered_dupes:
        # Strip one trailing ".<digits>" group so mangled copies collapse back
        # onto the same base name before normalisation.
        df.columns = [re.sub(r"\.\d+$", "", col) for col in df.columns]
    df.columns = [normalise_column_name(col) for col in df.columns]

    # keep=False flags every occurrence of a repeated name, not just the later ones.
    duplicate_cols = set(df.columns[df.columns.duplicated(keep=False)])
    if not duplicate_cols:
        return df
    if reject_dupes:
        raise ValueError(
            f"One or more column names are duplicated: {sorted(duplicate_cols)}"
        )

    # Names the numbering must not produce: every column that keeps its name,
    # plus each numbered name as it is handed out.
    taken = {col for col in df.columns if col not in duplicate_cols}
    dupe_col_count = defaultdict(int)
    new_cols = []
    for col in df.columns:
        if col not in duplicate_cols:
            new_cols.append(col)
            continue
        # Advance the counter until the suffixed name is free, so that columns
        # such as "a", "a", "a_1" do not end up with a second "a_1".
        while True:
            dupe_col_count[col] += 1
            candidate = f"{col}_{dupe_col_count[col]}"
            if candidate not in taken:
                break
        taken.add(candidate)
        new_cols.append(candidate)
    df.columns = new_cols
    return df
##
## Tests
##
import pandas as pd
import pytest
def test_normalise_column_names():
    """Raw labels are lower-cased, stripped and converted to underscore-separated names."""
    raw_columns = ["A", "_B-", " cc!! /", "DDdd", " e e e ", "f.x"]
    expected = ["a", "b_", "cc_", "dddd", "e_e_e", "f_x"]

    result = normalise_column_names(pd.DataFrame(columns=raw_columns))

    assert result.columns.tolist() == expected
def test_normalise_column_names_reject_duplicate_cols():
    """With the default reject_dupes=True, repeated names raise ValueError."""
    frame_with_dupes = pd.DataFrame(columns=["A", "B", "A"])

    with pytest.raises(ValueError, match=r".*?duplicate.*?"):
        normalise_column_names(frame_with_dupes)
def test_normalise_column_names_number_duplicate_cols():
    """With reject_dupes=False, every occurrence of a repeated name is numbered from 1."""
    frame_with_dupes = pd.DataFrame(columns=["A", "B", "A"])
    expected = ["a_1", "b", "a_2"]

    result = normalise_column_names(frame_with_dupes, reject_dupes=False)

    assert result.columns.tolist() == expected
def test_normalise_column_names_unmangle_dupe_col_names():
    """A trailing ".<n>" suffix is stripped before normalising and renumbering duplicates."""
    mangled = pd.DataFrame(columns=["A", "B", "A.1", "A", "B.22.33"])
    expected = ["a_1", "b", "a_2", "a_3", "b_22"]

    result = normalise_column_names(
        mangled,
        reject_dupes=False,
        unmangle_numbered_dupes=True,
    )

    assert result.columns.tolist() == expected
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment