greyhoundforty · May 8, 2026 08:57
diff --git a/organize_excel.py b/organize_excel.py
 #!/usr/bin/env python3
 """
 organize_excel.py

 Scans a directory of Excel files, extracts a customer name from each
 filename, creates a per-customer subfolder in the output directory,
 and moves each file into the appropriate folder.

 Customer name extraction is controlled by --separator and --position:
  - The filename stem (no extension) is split on the separator string
  - The segment at --position is used as the customer name

 Example filename patterns and the flags to match them:
  "Acme Corp_2024-01.xlsx"    --separator "_"   --position 0  → "Acme Corp"
  "2024_Acme Corp_Q1.xlsx"    --separator "_"   --position 1  → "Acme Corp"
  "Acme Corp - Invoice.xlsx"  --separator " - " --position 0  → "Acme Corp"

 Usage:
  python organize_excel.py -i ./raw_files -o ./sorted
  python organize_excel.py -i ./raw_files -o ./sorted --separator " - "
  python organize_excel.py -i ./raw_files -o ./sorted --dry-run
 """

 import argparse
 import shutil
 import sys
 from pathlib import Path


 # ── Helpers ────────────────────────────────────────────────────────────────────

 def extract_customer_name(stem: str, separator: str, position: int) -> str | None:
    """
    Split the filename stem (no extension) on `separator` and return the
    segment at `position` as the customer name.

    Returns None when the filename doesn't contain enough separator
    occurrences to reach the requested position — which lets the caller
    log a useful skip message instead of crashing.

    Args:
        stem:      Filename without extension, e.g. "Acme Corp_2024-01"
        separator: String to split on, e.g. "_" or " - "
        position:  Zero-based index of the customer name segment.

    Returns:
        Stripped customer name string, or None if not enough parts.
    """
    parts = stem.split(separator)

    if position >= len(parts):
        return None  # Not enough segments — caller will log a skip warning

    # Strip surrounding whitespace so "Acme Corp " → "Acme Corp"
    return parts[position].strip()


 def sanitize_folder_name(name: str) -> str:
    """
    Turn a raw customer name into a folder name that is safe to create.

    The rules follow Windows (the most restrictive common OS):

      - The characters \\ / : * ? " < > | and ASCII control characters
        (codes 0-31) are replaced with underscores.
      - Surrounding whitespace and trailing dots/spaces are removed,
        because Windows does not allow names ending in "." or " ".
        This also neutralises "." and "..", which would otherwise make
        the destination resolve to the output root or its parent.
      - Reserved device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9, with
        or without an extension) are prefixed with an underscore.
      - If nothing is left, "_" is returned so the result is never empty.

    Args:
        name: Raw customer name string.

    Returns:
        A non-empty, safe-to-use folder name string.
    """
    illegal_chars = r'\/:*?"<>|'
    replacements = {ord(char): "_" for char in illegal_chars}
    replacements.update({code: "_" for code in range(32)})

    # Strip first so leading/trailing whitespace is dropped rather than
    # turned into underscores by the control-character mapping.
    cleaned = name.strip().translate(replacements).rstrip(". ")

    reserved = {"CON", "PRN", "AUX", "NUL"} | {
        f"{device}{number}"
        for device in ("COM", "LPT")
        for number in range(1, 10)
    }
    # Windows reserves the device name even when an extension follows
    # it, so only the part before the first dot is compared.
    base = cleaned.split(".", 1)[0].rstrip().upper()
    if base in reserved:
        cleaned = f"_{cleaned}"

    return cleaned or "_"


 def collect_excel_files(input_dir: Path) -> list[Path]:
    """
    Return all Excel files in `input_dir` (top-level only, non-recursive).

    Recognises the four most common Excel extensions, case-insensitively:
      .xlsx  - Excel workbook (modern, default)
      .xls   - Legacy Excel 97-2003 format
      .xlsm  - Macro-enabled workbook
      .xlsb  - Binary workbook (large files)

    Files whose name starts with "~$" are ignored: those are the
    owner/lock files Excel keeps next to an open workbook, not real
    workbooks, and they should not be moved.

    Args:
        input_dir: Directory to scan.

    Returns:
        Sorted list of Path objects for matched files. Directories are
        never returned, even if their name ends in an Excel extension.
    """
    excel_extensions = {".xlsx", ".xls", ".xlsm", ".xlsb"}
    return sorted(
        entry for entry in input_dir.iterdir()
        if entry.is_file()
        and entry.suffix.lower() in excel_extensions
        and not entry.name.startswith("~$")
    )


 # ── Core logic ─────────────────────────────────────────────────────────────────

 def organize_files(
    input_dir: Path,
    output_dir: Path,
    separator: str,
    position: int,
    dry_run: bool,
 ) -> None:
    """
    Move every Excel file in input_dir into a per-customer folder.

    For each Excel file found:
      1. Extract the customer name from the filename (separator + position)
      2. Sanitize the name so it is safe to use as a directory name
      3. Skip the file if the destination already exists (never overwrite)
      4. Create the customer folder under output_dir and move the file

    When dry_run=True the same checks run and every planned action is
    printed, but no directory is created and no file is moved — useful
    for verifying separator and position settings before committing.

    A file whose folder cannot be created or that cannot be moved is
    reported on stderr and counted as skipped; the remaining files are
    still processed and the summary line is always printed.

    Args:
        input_dir:  Source directory containing Excel files.
        output_dir: Root destination; customer folders are created inside.
        separator:  String used to split filenames into segments.
        position:   Which segment (0-based) holds the customer name.
        dry_run:    If True, only print actions without executing them.
    """
    excel_files = collect_excel_files(input_dir)

    if not excel_files:
        print(f"No Excel files found in: {input_dir}")
        return

    print(f"Found {len(excel_files)} Excel file(s) in '{input_dir}'")

    if dry_run:
        print("── DRY RUN — no files will be moved ──\n")

    # Counters for the summary line printed at the end
    moved = 0
    skipped = 0

    for file_path in excel_files:
        # stem = filename without the extension  →  "Acme Corp_2024-01"
        stem = file_path.stem

        customer = extract_customer_name(stem, separator, position)

        # ── Not enough segments → skip this file ──
        if customer is None:
            print(f"  [SKIP] '{file_path.name}'")
            print(f"         Splitting '{stem}' on '{separator}' gave fewer "
                  f"than {position + 1} part(s). Try adjusting --separator or --position.")
            skipped += 1
            continue

        # ── Segment exists but is blank → skip with an accurate reason ──
        if not customer:
            print(f"  [SKIP] '{file_path.name}'")
            print(f"         Segment {position} of '{stem}' is empty after splitting "
                  f"on '{separator}'. Try adjusting --separator or --position.")
            skipped += 1
            continue

        folder_name = sanitize_folder_name(customer)   # safe for the filesystem

        # "." and ".." would make the destination resolve to output_dir
        # itself or its parent, so they are never accepted as folders.
        if folder_name in {"", ".", ".."}:
            print(f"  [SKIP] '{file_path.name}'")
            print(f"         Customer name '{customer}' cannot be used as a folder name.")
            skipped += 1
            continue

        # Build the destination path
        dest_dir = output_dir / folder_name
        dest_file = dest_dir / file_path.name

        print(f"  {file_path.name}")
        print(f"    customer → {customer}")
        print(f"    dest     → {dest_file}")

        # ── Guard: don't silently overwrite an existing file ──
        # Checked before the dry-run branch (it only reads the disk) so a
        # dry run reports the same skips a real run would produce.
        if dest_file.exists():
            print("    [WARN] Destination already exists — skipping to avoid overwrite.")
            skipped += 1
            continue

        # In dry-run mode we stop here — nothing is written to disk
        if dry_run:
            moved += 1
            continue

        try:
            # parents=True creates missing intermediate directories;
            # exist_ok=True tolerates a folder that is already there.
            dest_dir.mkdir(parents=True, exist_ok=True)
            # shutil.move handles both same-filesystem renames and
            # cross-device copies.
            shutil.move(str(file_path), str(dest_file))
        except OSError as exc:
            # One failing file must not abort the rest of the batch.
            print(f"    [ERROR] Could not move file: {exc}", file=sys.stderr)
            skipped += 1
            continue

        moved += 1

    # ── Final summary ──
    print()
    verb = "Would move" if dry_run else "Moved"
    print(f"Done.  {verb} {moved} file(s).  Skipped {skipped} file(s).")


 # ── CLI ────────────────────────────────────────────────────────────────────────

 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="organize_excel.py",
        description=(
            "Organize Excel files into per-customer subdirectories.\n"
            "The customer name is extracted from each filename using a\n"
            "configurable separator and segment position.\n\n"
            "Examples:\n"
            "  # Files like 'Acme Corp_2024-01.xlsx' (default settings)\n"
            "  python organize_excel.py -i ./raw -o ./sorted\n\n"
            "  # Files like 'Acme Corp - Invoice 001.xlsx'\n"
            "  python organize_excel.py -i ./raw -o ./sorted --separator ' - '\n\n"
            "  # Files like '2024_Acme Corp_Q1.xlsx' (customer is 2nd segment)\n"
            "  python organize_excel.py -i ./raw -o ./sorted --position 1\n\n"
            "  # Preview without moving anything\n"
            "  python organize_excel.py -i ./raw -o ./sorted --dry-run"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "-i", "--input",
        required=True,
        metavar="DIR",
        help="Directory containing the Excel files to organize.",
    )
    parser.add_argument(
        "-o", "--output",
        required=True,
        metavar="DIR",
        help=(
            "Root output directory. "
            "A subfolder will be created here for each unique customer name."
        ),
    )
    parser.add_argument(
        "--separator",
        default="_",
        metavar="SEP",
        help=(
            "String that separates the customer name from the rest of the filename. "
            "Default: '_'  →  'Acme Corp_2024-01.xlsx' gives customer 'Acme Corp'."
        ),
    )
    parser.add_argument(
        "--position",
        type=int,
        default=0,
        metavar="N",
        help=(
            "Zero-based index of the segment that contains the customer name "
            "after splitting on --separator. "
            "Default: 0 (first segment)."
        ),
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help=(
            "Print what would happen without moving any files. "
            "Use this to verify your --separator and --position settings first."
        ),
    )

    return parser.parse_args()


 def main() -> None:
    """
    Command-line entry point: validate the paths, then organize the files.

    Exits with status 1 and a message on stderr when the input directory
    is missing or not a directory, when the output path exists but is
    not a directory, or when the output directory cannot be created.
    """
    args = parse_args()

    # Resolve ~/ home-directory shortcuts and relative paths to absolute paths
    input_dir = Path(args.input).expanduser().resolve()
    output_dir = Path(args.output).expanduser().resolve()

    # ── Validate input directory ──
    if not input_dir.exists():
        print(f"Error: input directory does not exist: {input_dir}", file=sys.stderr)
        sys.exit(1)
    if not input_dir.is_dir():
        print(f"Error: input path is not a directory: {input_dir}", file=sys.stderr)
        sys.exit(1)

    # ── Validate output path (read-only check, so it also runs in dry-run) ──
    if output_dir.exists() and not output_dir.is_dir():
        print(f"Error: output path is not a directory: {output_dir}", file=sys.stderr)
        sys.exit(1)

    # ── Create output root now (skip in dry-run so nothing is written) ──
    if not args.dry_run:
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            # Report creation failures the same way as the input checks
            # instead of letting a traceback escape.
            print(f"Error: could not create output directory {output_dir}: {exc}",
                  file=sys.stderr)
            sys.exit(1)

    organize_files(
        input_dir=input_dir,
        output_dir=output_dir,
        separator=args.separator,
        position=args.position,
        dry_run=args.dry_run,
    )


 # Run the CLI only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	organize_excel.py

	Scans a directory of Excel files, extracts a customer name from each
	filename, creates a per-customer subfolder in the output directory,
	and moves each file into the appropriate folder.

	Customer name extraction is controlled by --separator and --position:
	- The filename stem (no extension) is split on the separator string
	- The segment at --position is used as the customer name

	Example filename patterns and the flags to match them:
	"Acme Corp_2024-01.xlsx" --separator "_" --position 0 → "Acme Corp"
	"2024_Acme Corp_Q1.xlsx" --separator "_" --position 1 → "Acme Corp"
	"Acme Corp - Invoice.xlsx" --separator " - " --position 0 → "Acme Corp"

	Usage:
	python organize_excel.py -i ./raw_files -o ./sorted
	python organize_excel.py -i ./raw_files -o ./sorted --separator " - "
	python organize_excel.py -i ./raw_files -o ./sorted --dry-run
	"""

	import argparse
	import shutil
	import sys
	from pathlib import Path


	# ── Helpers ────────────────────────────────────────────────────────────────────

	def extract_customer_name(stem: str, separator: str, position: int) -> str \| None:
	"""
	Split the filename stem (no extension) on `separator` and return the
	segment at `position` as the customer name.

	Returns None when the filename doesn't contain enough separator
	occurrences to reach the requested position — which lets the caller
	log a useful skip message instead of crashing.

	Args:
	stem: Filename without extension, e.g. "Acme Corp_2024-01"
	separator: String to split on, e.g. "_" or " - "
	position: Zero-based index of the customer name segment.

	Returns:
	Stripped customer name string, or None if not enough parts.
	"""
	parts = stem.split(separator)

	if position >= len(parts):
	return None # Not enough segments — caller will log a skip warning

	# Strip surrounding whitespace so "Acme Corp " → "Acme Corp"
	return parts[position].strip()


	def sanitize_folder_name(name: str) -> str:
	    """
	    Turn a raw customer name into a folder name that is safe to create.

	    The rules follow Windows (the most restrictive common OS):

	      - The characters \\ / : * ? " < > | and ASCII control characters
	        (codes 0-31) are replaced with underscores.
	      - Surrounding whitespace and trailing dots/spaces are removed,
	        because Windows does not allow names ending in "." or " ".
	        This also neutralises "." and "..", which would otherwise make
	        the destination resolve to the output root or its parent.
	      - Reserved device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9, with
	        or without an extension) are prefixed with an underscore.
	      - If nothing is left, "_" is returned so the result is never empty.

	    Args:
	        name: Raw customer name string.

	    Returns:
	        A non-empty, safe-to-use folder name string.
	    """
	    illegal_chars = r'\/:*?"<>|'
	    replacements = {ord(char): "_" for char in illegal_chars}
	    replacements.update({code: "_" for code in range(32)})

	    # Strip first so leading/trailing whitespace is dropped rather than
	    # turned into underscores by the control-character mapping.
	    cleaned = name.strip().translate(replacements).rstrip(". ")

	    reserved = {"CON", "PRN", "AUX", "NUL"} | {
	        f"{device}{number}"
	        for device in ("COM", "LPT")
	        for number in range(1, 10)
	    }
	    # Windows reserves the device name even when an extension follows
	    # it, so only the part before the first dot is compared.
	    base = cleaned.split(".", 1)[0].rstrip().upper()
	    if base in reserved:
	        cleaned = f"_{cleaned}"

	    return cleaned or "_"


	def collect_excel_files(input_dir: Path) -> list[Path]:
	    """
	    Return all Excel files in `input_dir` (top-level only, non-recursive).

	    Recognises the four most common Excel extensions, case-insensitively:
	      .xlsx  - Excel workbook (modern, default)
	      .xls   - Legacy Excel 97-2003 format
	      .xlsm  - Macro-enabled workbook
	      .xlsb  - Binary workbook (large files)

	    Files whose name starts with "~$" are ignored: those are the
	    owner/lock files Excel keeps next to an open workbook, not real
	    workbooks, and they should not be moved.

	    Args:
	        input_dir: Directory to scan.

	    Returns:
	        Sorted list of Path objects for matched files. Directories are
	        never returned, even if their name ends in an Excel extension.
	    """
	    excel_extensions = {".xlsx", ".xls", ".xlsm", ".xlsb"}
	    return sorted(
	        entry for entry in input_dir.iterdir()
	        if entry.is_file()
	        and entry.suffix.lower() in excel_extensions
	        and not entry.name.startswith("~$")
	    )


	# ── Core logic ─────────────────────────────────────────────────────────────────

	def organize_files(
	    input_dir: Path,
	    output_dir: Path,
	    separator: str,
	    position: int,
	    dry_run: bool,
	) -> None:
	    """
	    Move every Excel file in input_dir into a per-customer folder.

	    For each Excel file found:
	      1. Extract the customer name from the filename (separator + position)
	      2. Sanitize the name so it is safe to use as a directory name
	      3. Skip the file if the destination already exists (never overwrite)
	      4. Create the customer folder under output_dir and move the file

	    When dry_run=True the same checks run and every planned action is
	    printed, but no directory is created and no file is moved — useful
	    for verifying separator and position settings before committing.

	    A file whose folder cannot be created or that cannot be moved is
	    reported on stderr and counted as skipped; the remaining files are
	    still processed and the summary line is always printed.

	    Args:
	        input_dir:  Source directory containing Excel files.
	        output_dir: Root destination; customer folders are created inside.
	        separator:  String used to split filenames into segments.
	        position:   Which segment (0-based) holds the customer name.
	        dry_run:    If True, only print actions without executing them.
	    """
	    excel_files = collect_excel_files(input_dir)

	    if not excel_files:
	        print(f"No Excel files found in: {input_dir}")
	        return

	    print(f"Found {len(excel_files)} Excel file(s) in '{input_dir}'")

	    if dry_run:
	        print("── DRY RUN — no files will be moved ──\n")

	    # Counters for the summary line printed at the end
	    moved = 0
	    skipped = 0

	    for file_path in excel_files:
	        # stem = filename without the extension  →  "Acme Corp_2024-01"
	        stem = file_path.stem

	        customer = extract_customer_name(stem, separator, position)

	        # ── Not enough segments → skip this file ──
	        if customer is None:
	            print(f"  [SKIP] '{file_path.name}'")
	            print(f"         Splitting '{stem}' on '{separator}' gave fewer "
	                  f"than {position + 1} part(s). Try adjusting --separator or --position.")
	            skipped += 1
	            continue

	        # ── Segment exists but is blank → skip with an accurate reason ──
	        if not customer:
	            print(f"  [SKIP] '{file_path.name}'")
	            print(f"         Segment {position} of '{stem}' is empty after splitting "
	                  f"on '{separator}'. Try adjusting --separator or --position.")
	            skipped += 1
	            continue

	        folder_name = sanitize_folder_name(customer)   # safe for the filesystem

	        # "." and ".." would make the destination resolve to output_dir
	        # itself or its parent, so they are never accepted as folders.
	        if folder_name in {"", ".", ".."}:
	            print(f"  [SKIP] '{file_path.name}'")
	            print(f"         Customer name '{customer}' cannot be used as a folder name.")
	            skipped += 1
	            continue

	        # Build the destination path
	        dest_dir = output_dir / folder_name
	        dest_file = dest_dir / file_path.name

	        print(f"  {file_path.name}")
	        print(f"    customer → {customer}")
	        print(f"    dest     → {dest_file}")

	        # ── Guard: don't silently overwrite an existing file ──
	        # Checked before the dry-run branch (it only reads the disk) so a
	        # dry run reports the same skips a real run would produce.
	        if dest_file.exists():
	            print("    [WARN] Destination already exists — skipping to avoid overwrite.")
	            skipped += 1
	            continue

	        # In dry-run mode we stop here — nothing is written to disk
	        if dry_run:
	            moved += 1
	            continue

	        try:
	            # parents=True creates missing intermediate directories;
	            # exist_ok=True tolerates a folder that is already there.
	            dest_dir.mkdir(parents=True, exist_ok=True)
	            # shutil.move handles both same-filesystem renames and
	            # cross-device copies.
	            shutil.move(str(file_path), str(dest_file))
	        except OSError as exc:
	            # One failing file must not abort the rest of the batch.
	            print(f"    [ERROR] Could not move file: {exc}", file=sys.stderr)
	            skipped += 1
	            continue

	        moved += 1

	    # ── Final summary ──
	    print()
	    verb = "Would move" if dry_run else "Moved"
	    print(f"Done.  {verb} {moved} file(s).  Skipped {skipped} file(s).")


	# ── CLI ────────────────────────────────────────────────────────────────────────

	def parse_args() -> argparse.Namespace:
	parser = argparse.ArgumentParser(
	prog="organize_excel.py",
	description=(
	"Organize Excel files into per-customer subdirectories.\n"
	"The customer name is extracted from each filename using a\n"
	"configurable separator and segment position.\n\n"
	"Examples:\n"
	" # Files like 'Acme Corp_2024-01.xlsx' (default settings)\n"
	" python organize_excel.py -i ./raw -o ./sorted\n\n"
	" # Files like 'Acme Corp - Invoice 001.xlsx'\n"
	" python organize_excel.py -i ./raw -o ./sorted --separator ' - '\n\n"
	" # Files like '2024_Acme Corp_Q1.xlsx' (customer is 2nd segment)\n"
	" python organize_excel.py -i ./raw -o ./sorted --position 1\n\n"
	" # Preview without moving anything\n"
	" python organize_excel.py -i ./raw -o ./sorted --dry-run"
	),
	formatter_class=argparse.RawDescriptionHelpFormatter,
	)

	parser.add_argument(
	"-i", "--input",
	required=True,
	metavar="DIR",
	help="Directory containing the Excel files to organize.",
	)
	parser.add_argument(
	"-o", "--output",
	required=True,
	metavar="DIR",
	help=(
	"Root output directory. "
	"A subfolder will be created here for each unique customer name."
	),
	)
	parser.add_argument(
	"--separator",
	default="_",
	metavar="SEP",
	help=(
	"String that separates the customer name from the rest of the filename. "
	"Default: '_' → 'Acme Corp_2024-01.xlsx' gives customer 'Acme Corp'."
	),
	)
	parser.add_argument(
	"--position",
	type=int,
	default=0,
	metavar="N",
	help=(
	"Zero-based index of the segment that contains the customer name "
	"after splitting on --separator. "
	"Default: 0 (first segment)."
	),
	)
	parser.add_argument(
	"--dry-run",
	action="store_true",
	help=(
	"Print what would happen without moving any files. "
	"Use this to verify your --separator and --position settings first."
	),
	)

	return parser.parse_args()


	def main() -> None:
	    """
	    Command-line entry point: validate the paths, then organize the files.

	    Exits with status 1 and a message on stderr when the input directory
	    is missing or not a directory, when the output path exists but is
	    not a directory, or when the output directory cannot be created.
	    """
	    args = parse_args()

	    # Resolve ~/ home-directory shortcuts and relative paths to absolute paths
	    input_dir = Path(args.input).expanduser().resolve()
	    output_dir = Path(args.output).expanduser().resolve()

	    # ── Validate input directory ──
	    if not input_dir.exists():
	        print(f"Error: input directory does not exist: {input_dir}", file=sys.stderr)
	        sys.exit(1)
	    if not input_dir.is_dir():
	        print(f"Error: input path is not a directory: {input_dir}", file=sys.stderr)
	        sys.exit(1)

	    # ── Validate output path (read-only check, so it also runs in dry-run) ──
	    if output_dir.exists() and not output_dir.is_dir():
	        print(f"Error: output path is not a directory: {output_dir}", file=sys.stderr)
	        sys.exit(1)

	    # ── Create output root now (skip in dry-run so nothing is written) ──
	    if not args.dry_run:
	        try:
	            output_dir.mkdir(parents=True, exist_ok=True)
	        except OSError as exc:
	            # Report creation failures the same way as the input checks
	            # instead of letting a traceback escape.
	            print(f"Error: could not create output directory {output_dir}: {exc}",
	                  file=sys.stderr)
	            sys.exit(1)

	    organize_files(
	        input_dir=input_dir,
	        output_dir=output_dir,
	        separator=args.separator,
	        position=args.position,
	        dry_run=args.dry_run,
	    )


	# Run the CLI only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()
No results found