import csv
import re
from datetime import datetime

try:
    import chardet
except ImportError:
    chardet = None


def detect_encoding(file_path):
    """Detect file encoding using multiple methods"""
    if chardet is None:
        print("chardet not available, will try common encodings...")
        return None
    try:
        # Try to detect encoding (requires: pip install chardet)
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)  # Read first 10KB
            result = chardet.detect(raw_data)
            detected_encoding = result['encoding']
            confidence = result['confidence']
            print(f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})")
            return detected_encoding
    except Exception as e:
        print(f"Auto-detection failed: {e}")
        return None
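
# Note: chardet.detect() returns a dict such as
#   {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
# (illustrative values only). detect_encoding() passes the guessed encoding
# straight through even when confidence is low; read_csv_safe() below falls
# back to the common-encoding list if that guess fails to decode the file.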


def read_csv_safe(file_path):
    """Try multiple encodings to read the CSV file"""
    encodings_to_try = [
        'utf-8',
        'utf-8-sig',     # UTF-8 with BOM
        'iso-8859-1',    # Latin-1
        'windows-1252',  # Windows encoding
        'cp1252'
    ]

    # First try auto-detection
    try:
        detected = detect_encoding(file_path)
        if detected and detected not in encodings_to_try:
            encodings_to_try.insert(0, detected)
    except Exception:
        print("Auto-detection failed, trying common encodings...")

    for encoding in encodings_to_try:
        try:
            print(f"Trying encoding: {encoding}")
            with open(file_path, 'r', newline='', encoding=encoding) as f:
                # Try to read first few lines to validate
                reader = csv.reader(f)
                test_rows = []
                for i, row in enumerate(reader):
                    test_rows.append(row)
                    if i >= 3:  # Just test first few rows
                        break
            print(f"✅ Successfully read with {encoding}")
            return encoding
        except UnicodeDecodeError as e:
            print(f"❌ Failed with {encoding}: {e}")
            continue
        except Exception as e:
            print(f"❌ Other error with {encoding}: {e}")
            continue

    raise ValueError("Could not read file with any common encoding")
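
# Minimal usage sketch (hypothetical file name): confirm the encoding before
# doing any real processing.
#
#     enc = read_csv_safe("export_from_legacy_system.csv")
#     print(f"Will read the file as: {enc}")
#
# Note that read_csv_safe() only validates the first few rows, so a bad byte
# deep in a large file can still surface later, when standardize_dates_csv()
# streams the whole file.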


def standardize_dates_csv(input_file, output_file, date_column_index=10):  # K is the 11th column (0-indexed = 10)
    """
    Convert mixed date formats to yyyymmdd format using pure Python
    Handles: yyyy-mm-dd, mm/dd/yyyy, and blanks
    Auto-detects file encoding to handle various CSV sources

    Args:
        input_file: Path to input CSV
        output_file: Path to output CSV
        date_column_index: 0-based index of date column (K = 10)
    """
    stats = {
        'total_rows': 0,
        'blank_cells': 0,
        'yyyy_mm_dd_format': 0,
        'mm_dd_yyyy_format': 0,
        'conversion_errors': 0,
        'successful_conversions': 0,
        'header_row': False
    }

    def convert_date(date_str):
        """Convert individual date string to yyyymmdd format"""
        if not date_str or date_str.strip() == '':
            stats['blank_cells'] += 1
            return ''

        date_str = date_str.strip()
        try:
            # Pattern 1: yyyy-mm-dd
            if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
                date_obj = datetime.strptime(date_str, '%Y-%m-%d')
                stats['yyyy_mm_dd_format'] += 1  # count only after a successful parse
                return date_obj.strftime('%Y%m%d')
            # Pattern 2: mm/dd/yyyy
            elif re.match(r'^\d{1,2}/\d{1,2}/\d{4}$', date_str):
                date_obj = datetime.strptime(date_str, '%m/%d/%Y')
                stats['mm_dd_yyyy_format'] += 1  # count only after a successful parse
                return date_obj.strftime('%Y%m%d')
            else:
                print(f"Unrecognized format: '{date_str}'")
                stats['conversion_errors'] += 1
                return date_str  # Return original if unrecognized
        except ValueError as e:
            # A value that matches a pattern but is not a real date
            # (e.g. 2023-13-45) lands here and is counted as an error.
            print(f"Error converting '{date_str}': {e}")
            stats['conversion_errors'] += 1
            return date_str
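
    # Illustrative conversions for convert_date():
    #   '2023-01-15'  -> '20230115'    (yyyy-mm-dd)
    #   '1/5/2023'    -> '20230105'    (mm/dd/yyyy, single-digit month/day accepted)
    #   ''            -> ''            (blank preserved)
    #   '15 Jan 2023' -> '15 Jan 2023' (unrecognized, returned unchanged)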

    # Detect the proper encoding first
    try:
        input_encoding = read_csv_safe(input_file)
    except ValueError as e:
        print(f"Error: {e}")
        print("Manual inspection needed - file may be corrupted or in an unsupported format")
        return None

    # Process the CSV file with the detected encoding
    with open(input_file, 'r', newline='', encoding=input_encoding) as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row_num, row in enumerate(reader):
            stats['total_rows'] += 1

            # Handle header row
            if row_num == 0:
                # Check if first row looks like headers
                if any(not cell.replace('/', '').replace('-', '').isdigit() for cell in row if cell.strip()):
                    stats['header_row'] = True
                    writer.writerow(row)  # Write header as-is
                    continue

            # Ensure row has enough columns
            while len(row) <= date_column_index:
                row.append('')

            # Convert the date in the specified column
            if len(row) > date_column_index:
                original_date = row[date_column_index]
                row[date_column_index] = convert_date(original_date)

            writer.writerow(row)

    # Calculate successful conversions
    stats['successful_conversions'] = stats['yyyy_mm_dd_format'] + stats['mm_dd_yyyy_format']

    # Adjust total rows count if header was present
    if stats['header_row']:
        stats['total_rows'] -= 1

    # Print comprehensive statistics
    print(f"\n=== CONVERSION SUMMARY ===")
    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")
    print(f"Date column: {chr(65 + date_column_index)} (index {date_column_index})")
    print(f"Header row detected: {'Yes' if stats['header_row'] else 'No'}")
    print(f"\nDATA PROCESSING:")
    print(f"  Total data rows: {stats['total_rows']:,}")
    print(f"  Blank cells: {stats['blank_cells']:,}")
    print(f"  yyyy-mm-dd format: {stats['yyyy_mm_dd_format']:,}")
    print(f"  mm/dd/yyyy format: {stats['mm_dd_yyyy_format']:,}")
    print(f"  Successful conversions: {stats['successful_conversions']:,}")
    print(f"  Conversion errors: {stats['conversion_errors']:,}")

    if stats['total_rows'] > 0:
        success_rate = (stats['successful_conversions'] / stats['total_rows']) * 100
        print(f"  Success rate: {success_rate:.1f}%")

    return stats
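
# Hypothetical example of using the returned stats programmatically, e.g. to
# fail a pipeline step when too many rows could not be converted:
#
#     stats = standardize_dates_csv("raw.csv", "clean.csv")
#     if stats and stats['conversion_errors'] > 0.05 * max(stats['total_rows'], 1):
#         raise SystemExit("Too many unparseable dates; inspect the input file")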


def preview_dates(input_file, date_column_index=10, num_samples=10):
    """Preview date formats in the file before conversion"""
    print(f"=== PREVIEW OF COLUMN {chr(65 + date_column_index)} ===")

    try:
        encoding = read_csv_safe(input_file)
    except ValueError as e:
        print(f"Cannot preview: {e}")
        return

    with open(input_file, 'r', newline='', encoding=encoding) as infile:
        reader = csv.reader(infile)
        samples = []
        for i, row in enumerate(reader):
            if i == 0:  # Skip header if present
                continue
            if len(row) > date_column_index and len(samples) < num_samples:
                date_val = row[date_column_index].strip()
                if date_val:  # Only show non-empty dates
                    samples.append(f"Row {i+1}: '{date_val}'")

    for sample in samples:
        print(sample)
    print(f"\nFound {len(samples)} non-empty date samples")


# Example usage:
if __name__ == "__main__":
    input_file = "your_input_file.csv"
    output_file = "standardized_dates.csv"

    # Preview dates first (optional)
    print("Previewing date formats...")
    preview_dates(input_file)

    # Run the conversion
    print(f"\nStarting conversion...")
    conversion_stats = standardize_dates_csv(input_file, output_file)

    if conversion_stats:
        print(f"\n✅ Conversion complete!")
    else:
        print("\n❌ Conversion failed - check encoding issues above")
        print("\nTROUBLESHOOTING:")
        print("1. Try opening your CSV in a text editor to see if it looks corrupted")
        print("2. If from Excel, try 'Save As' -> CSV UTF-8")
        print("3. Install chardet for better encoding detection: pip install chardet")
        print("4. Check if the file has unusual characters or was created on a different OS")