import csv
import re
from datetime import datetime

try:
    import chardet
except ImportError:
    chardet = None


def detect_encoding(file_path):
    """Detect file encoding using multiple methods"""
    if chardet is None:
        print("chardet not available, will try common encodings...")
        return None
    try:
        # Try to detect encoding (requires: pip install chardet)
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)  # Read first 10KB
            result = chardet.detect(raw_data)
            detected_encoding = result['encoding']
            confidence = result['confidence']
            print(f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})")
            return detected_encoding
    except Exception as e:
        print(f"Auto-detection failed: {e}")
        return None
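
# Note: chardet.detect() returns a dict such as
#   {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
# (illustrative values only). detect_encoding() passes the guessed encoding
# straight through even when confidence is low; read_csv_safe() below falls
# back to the common-encoding list if that guess fails to decode the file.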


def read_csv_safe(file_path):
    """Try multiple encodings to read the CSV file"""
    encodings_to_try = [
        'utf-8',
        'utf-8-sig',     # UTF-8 with BOM
        'iso-8859-1',    # Latin-1
        'windows-1252',  # Windows encoding
        'cp1252'
    ]

    # First try auto-detection
    try:
        detected = detect_encoding(file_path)
        if detected and detected not in encodings_to_try:
            encodings_to_try.insert(0, detected)
    except Exception:
        print("Auto-detection failed, trying common encodings...")

    for encoding in encodings_to_try:
        try:
            print(f"Trying encoding: {encoding}")
            with open(file_path, 'r', newline='', encoding=encoding) as f:
                # Try to read first few lines to validate
                reader = csv.reader(f)
                test_rows = []
                for i, row in enumerate(reader):
                    test_rows.append(row)
                    if i >= 3:  # Just test first few rows
                        break
            print(f"✅ Successfully read with {encoding}")
            return encoding
        except UnicodeDecodeError as e:
            print(f"❌ Failed with {encoding}: {e}")
            continue
        except Exception as e:
            print(f"❌ Other error with {encoding}: {e}")
            continue

    raise ValueError("Could not read file with any common encoding")
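
# Minimal usage sketch (hypothetical file name): confirm the encoding before
# doing any real processing.
#
#     enc = read_csv_safe("export_from_legacy_system.csv")
#     print(f"Will read the file as: {enc}")
#
# Note that read_csv_safe() only validates the first few rows, so a bad byte
# deep in a large file can still surface later, when standardize_dates_csv()
# streams the whole file.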


def standardize_dates_csv(input_file, output_file, date_column_index=10):  # K is the 11th column (0-indexed = 10)
    """
    Convert mixed date formats to yyyymmdd format using pure Python
    Handles: yyyy-mm-dd, mm/dd/yyyy, and blanks
    Auto-detects file encoding to handle various CSV sources

    Args:
        input_file: Path to input CSV
        output_file: Path to output CSV
        date_column_index: 0-based index of date column (K = 10)
    """
    stats = {
        'total_rows': 0,
        'blank_cells': 0,
        'yyyy_mm_dd_format': 0,
        'mm_dd_yyyy_format': 0,
        'conversion_errors': 0,
        'successful_conversions': 0,
        'header_row': False
    }

    def convert_date(date_str):
        """Convert individual date string to yyyymmdd format"""
        if not date_str or date_str.strip() == '':
            stats['blank_cells'] += 1
            return ''

        date_str = date_str.strip()
        try:
            # Pattern 1: yyyy-mm-dd
            if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
                date_obj = datetime.strptime(date_str, '%Y-%m-%d')
                stats['yyyy_mm_dd_format'] += 1  # count only after a successful parse
                return date_obj.strftime('%Y%m%d')
            # Pattern 2: mm/dd/yyyy
            elif re.match(r'^\d{1,2}/\d{1,2}/\d{4}$', date_str):
                date_obj = datetime.strptime(date_str, '%m/%d/%Y')
                stats['mm_dd_yyyy_format'] += 1  # count only after a successful parse
                return date_obj.strftime('%Y%m%d')
            else:
                print(f"Unrecognized format: '{date_str}'")
                stats['conversion_errors'] += 1
                return date_str  # Return original if unrecognized
        except ValueError as e:
            # A value that matches a pattern but is not a real date
            # (e.g. 2023-13-45) lands here and is counted as an error.
            print(f"Error converting '{date_str}': {e}")
            stats['conversion_errors'] += 1
            return date_str
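
    # Illustrative conversions for convert_date():
    #   '2023-01-15'  -> '20230115'    (yyyy-mm-dd)
    #   '1/5/2023'    -> '20230105'    (mm/dd/yyyy, single-digit month/day accepted)
    #   ''            -> ''            (blank preserved)
    #   '15 Jan 2023' -> '15 Jan 2023' (unrecognized, returned unchanged)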

    # Detect the proper encoding first
    try:
        input_encoding = read_csv_safe(input_file)
    except ValueError as e:
        print(f"Error: {e}")
        print("Manual inspection needed - file may be corrupted or in an unsupported format")
        return None

    # Process the CSV file with the detected encoding
    with open(input_file, 'r', newline='', encoding=input_encoding) as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row_num, row in enumerate(reader):
            stats['total_rows'] += 1

            # Handle header row
            if row_num == 0:
                # Check if first row looks like headers
                if any(not cell.replace('/', '').replace('-', '').isdigit() for cell in row if cell.strip()):
                    stats['header_row'] = True
                    writer.writerow(row)  # Write header as-is
                    continue

            # Ensure row has enough columns
            while len(row) <= date_column_index:
                row.append('')

            # Convert the date in the specified column
            if len(row) > date_column_index:
                original_date = row[date_column_index]
                row[date_column_index] = convert_date(original_date)

            writer.writerow(row)

    # Calculate successful conversions
    stats['successful_conversions'] = stats['yyyy_mm_dd_format'] + stats['mm_dd_yyyy_format']

    # Adjust total rows count if header was present
    if stats['header_row']:
        stats['total_rows'] -= 1

    # Print comprehensive statistics
    print(f"\n=== CONVERSION SUMMARY ===")
    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")
    print(f"Date column: {chr(65 + date_column_index)} (index {date_column_index})")
    print(f"Header row detected: {'Yes' if stats['header_row'] else 'No'}")
    print(f"\nDATA PROCESSING:")
    print(f"  Total data rows: {stats['total_rows']:,}")
    print(f"  Blank cells: {stats['blank_cells']:,}")
    print(f"  yyyy-mm-dd format: {stats['yyyy_mm_dd_format']:,}")
    print(f"  mm/dd/yyyy format: {stats['mm_dd_yyyy_format']:,}")
    print(f"  Successful conversions: {stats['successful_conversions']:,}")
    print(f"  Conversion errors: {stats['conversion_errors']:,}")

    if stats['total_rows'] > 0:
        success_rate = (stats['successful_conversions'] / stats['total_rows']) * 100
        print(f"  Success rate: {success_rate:.1f}%")

    return stats
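
# Hypothetical example of using the returned stats programmatically, e.g. to
# fail a pipeline step when too many rows could not be converted:
#
#     stats = standardize_dates_csv("raw.csv", "clean.csv")
#     if stats and stats['conversion_errors'] > 0.05 * max(stats['total_rows'], 1):
#         raise SystemExit("Too many unparseable dates; inspect the input file")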


def preview_dates(input_file, date_column_index=10, num_samples=10):
    """Preview date formats in the file before conversion"""
    print(f"=== PREVIEW OF COLUMN {chr(65 + date_column_index)} ===")

    try:
        encoding = read_csv_safe(input_file)
    except ValueError as e:
        print(f"Cannot preview: {e}")
        return

    with open(input_file, 'r', newline='', encoding=encoding) as infile:
        reader = csv.reader(infile)
        samples = []
        for i, row in enumerate(reader):
            if i == 0:  # Skip header if present
                continue
            if len(row) > date_column_index and len(samples) < num_samples:
                date_val = row[date_column_index].strip()
                if date_val:  # Only show non-empty dates
                    samples.append(f"Row {i+1}: '{date_val}'")

    for sample in samples:
        print(sample)
    print(f"\nFound {len(samples)} non-empty date samples")


# Example usage:
if __name__ == "__main__":
    input_file = "your_input_file.csv"
    output_file = "standardized_dates.csv"

    # Preview dates first (optional)
    print("Previewing date formats...")
    preview_dates(input_file)

    # Run the conversion
    print(f"\nStarting conversion...")
    conversion_stats = standardize_dates_csv(input_file, output_file)

    if conversion_stats:
        print(f"\n✅ Conversion complete!")
    else:
        print("\n❌ Conversion failed - check encoding issues above")
        print("\nTROUBLESHOOTING:")
        print("1. Try opening your CSV in a text editor to see if it looks corrupted")
        print("2. If from Excel, try 'Save As' -> CSV UTF-8")
        print("3. Install chardet for better encoding detection: pip install chardet")
        print("4. Check if the file has unusual characters or was created on a different OS")