Created
November 21, 2024 17:27
-
-
Save dharamsk/d1dd552cb122b07708960f269140d680 to your computer and use it in GitHub Desktop.
Florida corporate data scrape
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# portal page: https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/
# login to SFTP: https://sftp.floridados.gov/ (login creds in webpage above)
# download contents of /doc/cor/
# run this script in that directory
# paste schema from webpage into LLM and ask for it in a structured format
# https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/file-structure/

# Fixed-width layout of one Sunbiz corporate record: (field_name, start_offset, width).
# Offsets are 0-based; the last field ends at byte 1440.
FIELD_SPECS = [
    ('corporation_number', 0, 12),
    ('corporation_name', 12, 192),
    ('status', 204, 1),
    ('filing_type', 205, 15),
    ('address_1', 220, 42),
    ('address_2', 262, 42),
    ('city', 304, 28),
    ('state', 332, 2),
    ('zip', 334, 10),
    ('country', 344, 2),
    ('mail_address_1', 346, 42),
    ('mail_address_2', 388, 42),
    ('mail_city', 430, 28),
    ('mail_state', 458, 2),
    ('mail_zip', 460, 10),
    ('mail_country', 470, 2),
    ('file_date', 472, 8),
    ('fei_number', 480, 14),
    ('more_than_six_officers', 494, 1),
    ('last_transaction_date', 495, 8),
    ('state_country', 503, 2),
    ('report_year_1', 505, 4),
    ('filler_1', 509, 1),
    ('report_date_1', 510, 8),
    ('report_year_2', 518, 4),
    ('filler_2', 522, 1),
    ('report_date_2', 523, 8),
    ('report_year_3', 531, 4),
    ('filler_3', 535, 1),
    ('report_date_3', 536, 8),
    ('registered_agent_name', 544, 42),
    ('registered_agent_type', 586, 1),
    ('registered_agent_address', 587, 42),
    ('registered_agent_city', 629, 28),
    ('registered_agent_state', 657, 2),
    ('registered_agent_zip', 659, 9),
]

# Officers 1-6 are identical 128-byte sub-records starting at offset 668;
# generate their specs instead of spelling out 42 near-identical tuples.
for _n in range(1, 7):
    _base = 668 + 128 * (_n - 1)
    FIELD_SPECS += [
        (f'officer_{_n}_title', _base, 4),
        (f'officer_{_n}_type', _base + 4, 1),
        (f'officer_{_n}_name', _base + 5, 42),
        (f'officer_{_n}_address', _base + 47, 42),
        (f'officer_{_n}_city', _base + 89, 28),
        (f'officer_{_n}_state', _base + 117, 2),
        (f'officer_{_n}_zip', _base + 119, 9),
    ]

FIELD_SPECS.append(('filler_end', 1436, 4))
def parse_line(line): | |
"""Parse a single line according to field specifications and clean null bytes""" | |
return { | |
name: line[start:start+length].strip().replace('\x00', '') | |
for name, start, length in FIELD_SPECS | |
} | |
import os
import shutil

import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine


def _read_records(filename):
    """Read one fixed-width file, trying encodings until one decodes.

    Returns (records, encoding_used); raises UnicodeDecodeError if every
    candidate encoding fails. (Encoding fallback list came from cursor.)
    """
    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(filename, 'r', encoding=encoding) as file:
                return ([parse_line(line) for line in file if line.strip()],
                        encoding)
        except UnicodeDecodeError:
            if encoding == encodings[-1]:  # last candidate: give up
                raise
            continue  # try next encoding


# Create processed directory if it doesn't exist (exist_ok avoids the
# check-then-create race of os.path.exists + makedirs)
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)

# Get all .txt files in current directory, sorted alphabetically
txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))

# Create SQLAlchemy engine (local trust auth; no password)
engine = create_engine('postgresql://dharam:@localhost:5432/savvy')

# Process each file; on failure the file is left in place and reported
for filename in txt_files:
    try:
        records, encoding = _read_records(filename)
        df = pd.DataFrame(records)
        # Track provenance of every row
        df['filename'] = filename
        # File names begin with YYYYMMDD; store it as a proper DATE column
        df['file_date_from_name'] = pd.to_datetime(filename[:8], format='%Y%m%d')
        df.to_sql('corporations', engine, if_exists='append', index=False,
                  dtype={'file_date_from_name': sqlalchemy.types.Date})
        # Only move the file once the insert has succeeded
        shutil.move(filename, os.path.join(processed_dir, filename))
        print(f"Processed {filename} using {encoding}: {len(df)} records inserted")
    except Exception as e:
        print(f"Error processing {filename}: {e}")

print("All files processed")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment