Skip to content

Instantly share code, notes, and snippets.

@tabiodun
Last active June 16, 2025 15:52
Show Gist options
  • Save tabiodun/1a96023c910647141ca5559043e41706 to your computer and use it in GitHub Desktop.
Save tabiodun/1a96023c910647141ca5559043e41706 to your computer and use it in GitHub Desktop.
import csv
import re
from datetime import datetime
try:
import chardet
except ImportError:
chardet = None
def detect_encoding(file_path):
    """Guess the text encoding of a file with chardet.

    Only the first 10,000 bytes of the file are sampled.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet, or None when chardet is not
        installed or the detection attempt raised an error.
    """
    # chardet is an optional dependency; without it the caller falls back
    # to its own list of encodings.
    if chardet is None:
        print("chardet not available, will try common encodings...")
        return None

    try:
        # Try to detect encoding (requires: pip install chardet)
        with open(file_path, 'rb') as handle:
            sample = handle.read(10000)  # Read first 10KB
        guess = chardet.detect(sample)
        name = guess['encoding']
        score = guess['confidence']
        print(f"Detected encoding: {name} (confidence: {score:.2f})")
    except Exception as e:
        # Best effort only: any failure means "no guess available".
        print(f"Auto-detection failed: {e}")
        return None
    return name
def read_csv_safe(file_path):
    """Return the first candidate encoding that decodes the entire CSV file.

    Candidates are tried in order. The whole file is parsed for each
    candidate (not just the first rows), so an encoding is only returned if
    every byte of the file decodes with it.

    Args:
        file_path: Path of the CSV file to probe.

    Returns:
        The name of the first encoding that reads the file without error.

    Raises:
        ValueError: If no candidate encoding could read the file.
    """
    # Order matters: the strict UTF-8 variants go first; windows-1252 rejects
    # a few undefined bytes so it is tried before Latin-1, which accepts
    # every possible byte and therefore has to be the last resort.
    encodings_to_try = [
        'utf-8',
        'utf-8-sig',     # UTF-8 with BOM
        'windows-1252',  # Windows encoding
        'cp1252',
        'iso-8859-1',    # Latin-1
    ]

    # Plain 'utf-8' decodes a byte-order mark as a stray U+FEFF character
    # glued to the first cell, so prefer 'utf-8-sig' when a BOM is present.
    try:
        with open(file_path, 'rb') as f:
            starts_with_bom = f.read(3) == b'\xef\xbb\xbf'
    except OSError:
        # The open() calls in the loop below will report the problem.
        starts_with_bom = False
    if starts_with_bom:
        encodings_to_try.remove('utf-8-sig')
        encodings_to_try.insert(0, 'utf-8-sig')

    # First try auto-detection
    try:
        detected = detect_encoding(file_path)
        if detected and detected not in encodings_to_try:
            encodings_to_try.insert(0, detected)
    except Exception:
        # Detection is best effort; fall back to the fixed candidate list.
        print("Auto-detection failed, trying common encodings...")

    for encoding in encodings_to_try:
        try:
            print(f"Trying encoding: {encoding}")
            with open(file_path, 'r', newline='', encoding=encoding) as f:
                # Consume every row: a byte that is invalid for this
                # encoding may first appear far beyond the start of the file.
                for _ in csv.reader(f):
                    pass
            print(f"✅ Successfully read with {encoding}")
            return encoding
        except UnicodeDecodeError as e:
            print(f"❌ Failed with {encoding}: {e}")
            continue
        except Exception as e:
            print(f"❌ Other error with {encoding}: {e}")
            continue

    raise ValueError("Could not read file with any common encoding")
def standardize_dates_csv(input_file, output_file, date_column_index=10):  # K is 11th column (0-indexed = 10)
    """
    Convert mixed date formats to yyyymmdd format using pure Python.

    Handles: yyyy-mm-dd, mm/dd/yyyy, and blanks. A value in any other shape,
    or one that matches a pattern but is not a real calendar date, is written
    back unchanged and counted as a conversion error.
    The input encoding is chosen by read_csv_safe(); the output is written
    as UTF-8. Rows shorter than the date column are padded with empty cells.

    Args:
        input_file: Path to input CSV
        output_file: Path to output CSV (must differ from input_file)
        date_column_index: 0-based index of date column (K = 10)

    Returns:
        A dict of counters ('total_rows', 'blank_cells', 'yyyy_mm_dd_format',
        'mm_dd_yyyy_format', 'conversion_errors', 'successful_conversions',
        'header_row'), or None if the file could not be read or the input
        and output paths are the same.
    """
    import os  # local import: only needed for the same-file guard below

    stats = {
        'total_rows': 0,
        'blank_cells': 0,
        'yyyy_mm_dd_format': 0,
        'mm_dd_yyyy_format': 0,
        'conversion_errors': 0,
        'successful_conversions': 0,
        'header_row': False
    }

    # Compiled once instead of on every cell.
    iso_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}$')
    us_pattern = re.compile(r'^\d{1,2}/\d{1,2}/\d{4}$')

    def column_label(index):
        """Spreadsheet-style column label: 0 -> A, 25 -> Z, 26 -> AA."""
        if index < 0:
            return '?'
        label = ''
        remaining = index + 1
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, 26)
            label = chr(65 + offset) + label
        return label

    def convert_date(date_str):
        """Convert individual date string to yyyymmdd format"""
        if not date_str or date_str.strip() == '':
            stats['blank_cells'] += 1
            return ''
        date_str = date_str.strip()

        # Pattern 1: yyyy-mm-dd
        if iso_pattern.match(date_str):
            date_format, counter = '%Y-%m-%d', 'yyyy_mm_dd_format'
        # Pattern 2: mm/dd/yyyy
        elif us_pattern.match(date_str):
            date_format, counter = '%m/%d/%Y', 'mm_dd_yyyy_format'
        else:
            print(f"Unrecognized format: '{date_str}'")
            stats['conversion_errors'] += 1
            return date_str  # Return original if unrecognized

        try:
            parsed = datetime.strptime(date_str, date_format)
        except ValueError as e:
            print(f"Error converting '{date_str}': {e}")
            stats['conversion_errors'] += 1
            return date_str

        # Count the format only after a successful parse so every cell lands
        # in exactly one bucket (blank, converted, or error).
        stats[counter] += 1
        # Explicit zero padding: strftime('%Y') does not pad years below
        # 1000 on every platform.
        return f"{parsed.year:04d}{parsed.month:02d}{parsed.day:02d}"

    # Opening the output for writing would truncate the input before it is
    # read if both paths point at the same file.
    if os.path.abspath(input_file) == os.path.abspath(output_file):
        print("Error: input_file and output_file must be different paths")
        return None

    # Detect the proper encoding first
    try:
        input_encoding = read_csv_safe(input_file)
    except ValueError as e:
        print(f"Error: {e}")
        print("Manual inspection needed - file may be corrupted or in unsupported format")
        return None

    # Process the CSV file with detected encoding
    with open(input_file, 'r', newline='', encoding=input_encoding) as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        for row_num, row in enumerate(reader):
            stats['total_rows'] += 1

            # Handle header row
            if row_num == 0:
                # Check if first row looks like headers: any non-empty cell
                # that is not purely digits, '/' and '-' marks it as one.
                if any(not cell.replace('/', '').replace('-', '').isdigit() for cell in row if cell.strip()):
                    stats['header_row'] = True
                    writer.writerow(row)  # Write header as-is
                    continue

            # Ensure row has enough columns (a non-positive count adds nothing)
            row.extend([''] * (date_column_index + 1 - len(row)))

            # Convert the date in specified column
            row[date_column_index] = convert_date(row[date_column_index])
            writer.writerow(row)

    # Calculate successful conversions
    stats['successful_conversions'] = stats['yyyy_mm_dd_format'] + stats['mm_dd_yyyy_format']

    # Adjust total rows count if header was present
    if stats['header_row']:
        stats['total_rows'] -= 1

    # Print comprehensive statistics
    print("\n=== CONVERSION SUMMARY ===")
    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")
    print(f"Date column: {column_label(date_column_index)} (index {date_column_index})")
    print(f"Header row detected: {'Yes' if stats['header_row'] else 'No'}")
    print("\nDATA PROCESSING:")
    print(f" Total data rows: {stats['total_rows']:,}")
    print(f" Blank cells: {stats['blank_cells']:,}")
    print(f" yyyy-mm-dd format: {stats['yyyy_mm_dd_format']:,}")
    print(f" mm/dd/yyyy format: {stats['mm_dd_yyyy_format']:,}")
    print(f" Successful conversions: {stats['successful_conversions']:,}")
    print(f" Conversion errors: {stats['conversion_errors']:,}")
    if stats['total_rows'] > 0:
        success_rate = (stats['successful_conversions'] / stats['total_rows']) * 100
        print(f" Success rate: {success_rate:.1f}%")
    return stats
def preview_dates(input_file, date_column_index=10, num_samples=10):
    """Preview date formats in the file before conversion.

    Prints up to num_samples non-empty values from the date column, each
    with its 1-based row number. The first row is always skipped as a
    presumed header. Reading stops as soon as enough samples are collected.

    Args:
        input_file: Path to the CSV file to inspect.
        date_column_index: 0-based index of the date column (K = 10).
        num_samples: Maximum number of non-empty values to show.

    Returns:
        None. Results are printed; if the file cannot be read with any
        candidate encoding a message is printed instead.
    """
    def column_label(index):
        """Spreadsheet-style column label: 0 -> A, 25 -> Z, 26 -> AA."""
        if index < 0:
            return '?'
        label = ''
        remaining = index + 1
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, 26)
            label = chr(65 + offset) + label
        return label

    print(f"=== PREVIEW OF COLUMN {column_label(date_column_index)} ===")
    try:
        encoding = read_csv_safe(input_file)
    except ValueError as e:
        print(f"Cannot preview: {e}")
        return

    samples = []
    with open(input_file, 'r', newline='', encoding=encoding) as infile:
        for i, row in enumerate(csv.reader(infile)):
            # Stop early instead of scanning the rest of a large file.
            if len(samples) >= num_samples:
                break
            if i == 0:  # Skip header if present
                continue
            if len(row) > date_column_index:
                date_val = row[date_column_index].strip()
                if date_val:  # Only show non-empty dates
                    samples.append(f"Row {i+1}: '{date_val}'")

    for sample in samples:
        print(sample)
    print(f"\nFound {len(samples)} non-empty date samples")
# Example usage: preview the date column, then convert it and report the
# outcome. Edit the two paths below before running.
if __name__ == "__main__":
    input_file = "your_input_file.csv"
    output_file = "standardized_dates.csv"

    # Preview dates first (optional)
    print("Previewing date formats...")
    preview_dates(input_file)

    # Run the conversion
    print("\nStarting conversion...")
    conversion_stats = standardize_dates_csv(input_file, output_file)

    # standardize_dates_csv returns None when the input could not be read.
    if conversion_stats is not None:
        print("\n✅ Conversion complete!")
    else:
        print("\n❌ Conversion failed - check encoding issues above")
        print("\nTROUBLESHOOTING:")
        print("1. Try opening your CSV in a text editor to see if it looks corrupted")
        print("2. If from Excel, try 'Save As' -> CSV UTF-8")
        print("3. Install chardet for better encoding detection: pip install chardet")
        print("4. Check if file has unusual characters or was created on different OS")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment