Skip to content

Instantly share code, notes, and snippets.

@greyhoundforty
Created May 8, 2026 08:57
Show Gist options
  • Select an option

  • Save greyhoundforty/d328a4af036efdce79e49e44e7e1b9bd to your computer and use it in GitHub Desktop.

Select an option

Save greyhoundforty/d328a4af036efdce79e49e44e7e1b9bd to your computer and use it in GitHub Desktop.
Organize Excel Files
#!/usr/bin/env python3
"""
organize_excel.py
Scans a directory of Excel files, extracts a customer name from each
filename, creates a per-customer subfolder in the output directory,
and moves each file into the appropriate folder.
Customer name extraction is controlled by --separator and --position:
- The filename stem (no extension) is split on the separator string
- The segment at --position is used as the customer name
Example filename patterns and the flags to match them:
"Acme Corp_2024-01.xlsx" --separator "_" --position 0 → "Acme Corp"
"2024_Acme Corp_Q1.xlsx" --separator "_" --position 1 → "Acme Corp"
"Acme Corp - Invoice.xlsx" --separator " - " --position 0 → "Acme Corp"
Usage:
python organize_excel.py -i ./raw_files -o ./sorted
python organize_excel.py -i ./raw_files -o ./sorted --separator " - "
python organize_excel.py -i ./raw_files -o ./sorted --dry-run
"""
import argparse
import shutil
import sys
from pathlib import Path
# ── Helpers ────────────────────────────────────────────────────────────────────
def extract_customer_name(stem: str, separator: str, position: int) -> str | None:
"""
Split the filename stem (no extension) on `separator` and return the
segment at `position` as the customer name.
Returns None when the filename doesn't contain enough separator
occurrences to reach the requested position — which lets the caller
log a useful skip message instead of crashing.
Args:
stem: Filename without extension, e.g. "Acme Corp_2024-01"
separator: String to split on, e.g. "_" or " - "
position: Zero-based index of the customer name segment.
Returns:
Stripped customer name string, or None if not enough parts.
"""
parts = stem.split(separator)
if position >= len(parts):
return None # Not enough segments — caller will log a skip warning
# Strip surrounding whitespace so "Acme Corp " → "Acme Corp"
return parts[position].strip()
def sanitize_folder_name(name: str) -> str:
    """
    Turn a raw customer name into a string that is safe to use as a single
    directory name on Windows (the most restrictive common OS).

    Steps applied, in order:
      1. Each illegal character  \\ / : * ? " < > |  becomes an underscore.
      2. Surrounding whitespace is removed.
      3. Trailing dots and spaces are removed. Windows does not allow
         them, and this also collapses the special names "." and ".."
         so the result can never point at the current or parent directory.
      4. If nothing is left, "_" is returned so the result is never empty.

    Args:
        name: Raw customer name string.

    Returns:
        A non-empty, safe-to-use folder name string.
    """
    illegal_chars = r'\/:*?"<>|'
    # One translation pass instead of one str.replace() call per character.
    to_underscore = str.maketrans(illegal_chars, "_" * len(illegal_chars))

    cleaned = name.translate(to_underscore).strip().rstrip(". ")

    # An empty name would make `output_dir / name` resolve to output_dir
    # itself; fall back to a placeholder folder instead.
    return cleaned or "_"
def collect_excel_files(input_dir: Path) -> list[Path]:
    """
    Return all Excel files in `input_dir` (top-level only, non-recursive).

    Recognises the four most common Excel extensions (case-insensitive):
        .xlsx - Excel workbook (modern, default)
        .xls  - Legacy Excel 97-2003 format
        .xlsm - Macro-enabled workbook
        .xlsb - Binary workbook (large files)

    Files whose name starts with "~$" are ignored: Excel creates these
    owner/lock files next to a workbook while it is open, and they carry
    the workbook's extension without being workbooks themselves.

    Args:
        input_dir: Directory to scan.

    Returns:
        Sorted list of Path objects for matched files. Directories are
        never returned, even if their name ends in an Excel extension.
    """
    excel_extensions = {".xlsx", ".xls", ".xlsm", ".xlsb"}
    return sorted(
        f for f in input_dir.iterdir()
        if f.is_file()
        and f.suffix.lower() in excel_extensions
        and not f.name.startswith("~$")
    )
# ── Core logic ─────────────────────────────────────────────────────────────────
def organize_files(
    input_dir: Path,
    output_dir: Path,
    separator: str,
    position: int,
    dry_run: bool,
) -> None:
    """
    Main orchestration: scan input_dir, group files by customer, and move them.

    For each Excel file found:
        1. Extract customer name from the filename using separator + position
        2. Sanitize the name so it's safe to use as a directory name
        3. Skip the file if the destination already exists (never overwrite)
        4. Create the customer folder under output_dir (if it doesn't exist)
        5. Move the file into that folder

    A file is skipped (and counted as skipped) when no customer segment
    can be found, when the segment is empty, when the destination already
    exists, or when creating the folder / moving the file fails with an
    OSError. A failure on one file does not abort the rest of the batch.

    When dry_run=True every step is printed but no files or directories
    are actually created or moved — useful for verifying your separator
    and position settings before committing. The destination-exists check
    still runs in dry-run mode so the preview matches a real run.

    Args:
        input_dir: Source directory containing Excel files.
        output_dir: Root destination; customer folders are created inside.
        separator: String used to split filenames into segments.
        position: Which segment (0-based) holds the customer name.
        dry_run: If True, only print actions without executing them.
    """
    excel_files = collect_excel_files(input_dir)
    if not excel_files:
        print(f"No Excel files found in: {input_dir}")
        return

    print(f"Found {len(excel_files)} Excel file(s) in '{input_dir}'")
    if dry_run:
        print("── DRY RUN — no files will be moved ──\n")

    # Number of segments a filename must have for `position` to be valid;
    # computed once so the skip message is also sensible for negative values.
    required_parts = position + 1 if position >= 0 else -position

    # Counters for the summary line printed at the end
    moved = 0
    skipped = 0

    for file_path in excel_files:
        # stem = filename without the extension → "Acme Corp_2024-01"
        stem = file_path.stem
        customer = extract_customer_name(stem, separator, position)

        # ── Not enough segments in the filename → skip this file ──
        if customer is None:
            print(f" [SKIP] '{file_path.name}'")
            print(f" Splitting '{stem}' on '{separator}' gave fewer "
                  f"than {required_parts} part(s). Try adjusting --separator or --position.")
            skipped += 1
            continue

        # ── Segment exists but is blank (e.g. "_2024.xlsx") → skip ──
        # Reported separately because "fewer parts" would be misleading here.
        if not customer:
            print(f" [SKIP] '{file_path.name}'")
            print(f" Segment {position} of '{stem}' is empty after splitting "
                  f"on '{separator}'. Try adjusting --separator or --position.")
            skipped += 1
            continue

        # Build the destination path
        folder_name = sanitize_folder_name(customer)  # safe for the filesystem
        dest_dir = output_dir / folder_name
        dest_file = dest_dir / file_path.name

        print(f" {file_path.name}")
        print(f" customer → {customer}")
        print(f" dest → {dest_file}")

        # ── Guard: don't silently overwrite an existing file ──
        # Checked before the dry-run shortcut so a preview reports the same
        # skips a real run would.
        if dest_file.exists():
            print(" [WARN] Destination already exists — skipping to avoid overwrite.")
            skipped += 1
            continue

        # In dry-run mode we stop here — nothing is written to disk
        if dry_run:
            moved += 1
            continue

        try:
            # parents=True creates missing intermediate directories;
            # exist_ok=True tolerates a folder that is already there.
            dest_dir.mkdir(parents=True, exist_ok=True)
            # shutil.move handles both same-filesystem renames and
            # cross-device copies.
            shutil.move(str(file_path), str(dest_file))
        except OSError as exc:
            # e.g. the workbook is open in Excel, or permissions are missing.
            # Report and carry on so one bad file doesn't strand the batch.
            print(f" [ERROR] Could not move '{file_path.name}': {exc}", file=sys.stderr)
            skipped += 1
            continue

        moved += 1

    # ── Final summary ──
    print()
    verb = "Would move" if dry_run else "Moved"
    print(f"Done. {verb} {moved} file(s). Skipped {skipped} file(s).")
# ── CLI ────────────────────────────────────────────────────────────────────────
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="organize_excel.py",
description=(
"Organize Excel files into per-customer subdirectories.\n"
"The customer name is extracted from each filename using a\n"
"configurable separator and segment position.\n\n"
"Examples:\n"
" # Files like 'Acme Corp_2024-01.xlsx' (default settings)\n"
" python organize_excel.py -i ./raw -o ./sorted\n\n"
" # Files like 'Acme Corp - Invoice 001.xlsx'\n"
" python organize_excel.py -i ./raw -o ./sorted --separator ' - '\n\n"
" # Files like '2024_Acme Corp_Q1.xlsx' (customer is 2nd segment)\n"
" python organize_excel.py -i ./raw -o ./sorted --position 1\n\n"
" # Preview without moving anything\n"
" python organize_excel.py -i ./raw -o ./sorted --dry-run"
),
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"-i", "--input",
required=True,
metavar="DIR",
help="Directory containing the Excel files to organize.",
)
parser.add_argument(
"-o", "--output",
required=True,
metavar="DIR",
help=(
"Root output directory. "
"A subfolder will be created here for each unique customer name."
),
)
parser.add_argument(
"--separator",
default="_",
metavar="SEP",
help=(
"String that separates the customer name from the rest of the filename. "
"Default: '_' → 'Acme Corp_2024-01.xlsx' gives customer 'Acme Corp'."
),
)
parser.add_argument(
"--position",
type=int,
default=0,
metavar="N",
help=(
"Zero-based index of the segment that contains the customer name "
"after splitting on --separator. "
"Default: 0 (first segment)."
),
)
parser.add_argument(
"--dry-run",
action="store_true",
help=(
"Print what would happen without moving any files. "
"Use this to verify your --separator and --position settings first."
),
)
return parser.parse_args()
def main() -> None:
    """
    Script entry point: parse the CLI, validate the input directory,
    prepare the output root, and hand off to organize_files().

    Exits with status 1 (message on stderr) when the input path is
    missing or is not a directory.
    """
    cli = parse_args()

    # expanduser() handles "~/" shortcuts; resolve() makes the paths absolute.
    source_root = Path(cli.input).expanduser().resolve()
    target_root = Path(cli.output).expanduser().resolve()

    # ── Validate input directory ──
    problem = None
    if not source_root.exists():
        problem = f"Error: input directory does not exist: {source_root}"
    elif not source_root.is_dir():
        problem = f"Error: input path is not a directory: {source_root}"

    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    # ── Create output root now, unless this is a dry run (which must not
    #    write anything to disk) ──
    if not cli.dry_run:
        target_root.mkdir(parents=True, exist_ok=True)

    organize_files(
        input_dir=source_root,
        output_dir=target_root,
        separator=cli.separator,
        position=cli.position,
        dry_run=cli.dry_run,
    )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment