Skip to content

Instantly share code, notes, and snippets.

@dharamsk
Created November 21, 2024 17:27
Show Gist options
  • Save dharamsk/d1dd552cb122b07708960f269140d680 to your computer and use it in GitHub Desktop.
Florida corporate data scrape
# portal page: https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/
# login to SFTP: https://sftp.floridados.gov/ (login creds in webpage above)
# download contents of /doc/cor/
# run this script in that directory
# paste schema from webpage into LLM and ask for it in a structured format
# https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/file-structure/
def _build_field_specs():
    """Return the fixed-width layout of one record as (name, start, width) tuples.

    Offsets are 0-based character positions within a 1440-character line.
    Schema source:
    https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/file-structure/
    """
    specs = [
        ('corporation_number', 0, 12),
        ('corporation_name', 12, 192),
        ('status', 204, 1),
        ('filing_type', 205, 15),
        ('address_1', 220, 42),
        ('address_2', 262, 42),
        ('city', 304, 28),
        ('state', 332, 2),
        ('zip', 334, 10),
        ('country', 344, 2),
        ('mail_address_1', 346, 42),
        ('mail_address_2', 388, 42),
        ('mail_city', 430, 28),
        ('mail_state', 458, 2),
        ('mail_zip', 460, 10),
        ('mail_country', 470, 2),
        ('file_date', 472, 8),
        ('fei_number', 480, 14),
        ('more_than_six_officers', 494, 1),
        ('last_transaction_date', 495, 8),
        ('state_country', 503, 2),
        ('report_year_1', 505, 4),
        ('filler_1', 509, 1),
        ('report_date_1', 510, 8),
        ('report_year_2', 518, 4),
        ('filler_2', 522, 1),
        ('report_date_2', 523, 8),
        ('report_year_3', 531, 4),
        ('filler_3', 535, 1),
        ('report_date_3', 536, 8),
        ('registered_agent_name', 544, 42),
        ('registered_agent_type', 586, 1),
        ('registered_agent_address', 587, 42),
        ('registered_agent_city', 629, 28),
        ('registered_agent_state', 657, 2),
        ('registered_agent_zip', 659, 9),
    ]
    # Six officer sections follow, each repeating the same 128-character
    # sub-layout; offsets below are relative to the start of each section.
    officer_layout = [
        ('title', 0, 4),
        ('type', 4, 1),
        ('name', 5, 42),
        ('address', 47, 42),
        ('city', 89, 28),
        ('state', 117, 2),
        ('zip', 119, 9),
    ]
    for i in range(6):
        base = 668 + 128 * i  # officer 1 starts at 668
        specs.extend(
            (f'officer_{i + 1}_{field}', base + offset, width)
            for field, offset, width in officer_layout
        )
    specs.append(('filler_end', 1436, 4))
    return specs


# Fixed-width field layout for each line of the corporate data file.
FIELD_SPECS = _build_field_specs()
def parse_line(line, field_specs=None):
    """Parse one fixed-width record into a dict of cleaned field values.

    Args:
        line: One line of the data file; fields are extracted by character
            position, so the line must follow the FIELD_SPECS layout.
        field_specs: Optional iterable of (name, start, width) tuples;
            defaults to the module-level FIELD_SPECS.

    Returns:
        Dict mapping field name to its value with NUL bytes removed and
        surrounding whitespace stripped.
    """
    specs = FIELD_SPECS if field_specs is None else field_specs
    # Remove NUL padding BEFORE stripping: the files pad with '\x00', and
    # stripping first would leave whitespace that sits ahead of trailing
    # NULs (e.g. 'AB \x00' would come out as 'AB ').
    return {
        name: line[start:start + width].replace('\x00', '').strip()
        for name, start, width in specs
    }
import pandas as pd
from sqlalchemy import create_engine
import os
import shutil
import sqlalchemy
# --- Script body: load every .txt export in the current directory into
# --- Postgres, archiving each file after a successful insert.

# Files are moved here after loading so a rerun skips completed work.
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)

# Data files, processed in name order (names start with YYYYMMDD, so this
# is chronological order).
txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))

# NOTE(review): connection string is hard-coded; consider reading it from an
# environment variable before sharing or deploying this script.
engine = create_engine('postgresql://dharam:@localhost:5432/savvy')

for filename in txt_files:
    try:
        # The exports are not consistently encoded, so try likely encodings
        # in order and keep the first one that decodes cleanly.
        encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                with open(filename, 'r', encoding=encoding) as file:
                    # Blank lines are skipped; every other line is a record.
                    records = [parse_line(line) for line in file if line.strip()]
                break  # decoded successfully
            except UnicodeDecodeError:
                if encoding == encodings[-1]:
                    raise  # every candidate failed; surface the error
                continue  # try the next encoding

        df = pd.DataFrame(records)
        # Provenance columns: the source file, plus the date encoded in the
        # first eight characters of its name (YYYYMMDD).
        df['filename'] = filename
        df['file_date_from_name'] = pd.to_datetime(filename[:8], format='%Y%m%d')

        df.to_sql('corporations', engine, if_exists='append', index=False,
                  dtype={'file_date_from_name': sqlalchemy.types.Date})

        # Archive only after the insert succeeded, so failures are retried.
        shutil.move(filename, os.path.join(processed_dir, filename))
        print(f"Processed {filename} using {encoding}: {len(df)} records inserted")
    except Exception as e:
        # Best-effort batch: report this file's failure and keep going.
        print(f"Error processing {filename}: {e}")
print("All files processed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment