Created
November 21, 2024 17:27
-
-
Save dharamsk/d1dd552cb122b07708960f269140d680 to your computer and use it in GitHub Desktop.
Florida corporate data scrape
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# portal page: https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/
# login to SFTP: https://sftp.floridados.gov/ (login creds in webpage above)
# download contents of /doc/cor/
# run this script in that directory
# paste schema from webpage into LLM and ask for it in a structured format
# https://dos.fl.gov/sunbiz/other-services/data-downloads/corporate-data-file/file-structure/

# Fixed-width layout of one Sunbiz corporate record: (field_name, start_offset, width).
# Offsets are 0-based; the last field ends at byte 1440.
FIELD_SPECS = [
    ('corporation_number', 0, 12),
    ('corporation_name', 12, 192),
    ('status', 204, 1),
    ('filing_type', 205, 15),
    ('address_1', 220, 42),
    ('address_2', 262, 42),
    ('city', 304, 28),
    ('state', 332, 2),
    ('zip', 334, 10),
    ('country', 344, 2),
    ('mail_address_1', 346, 42),
    ('mail_address_2', 388, 42),
    ('mail_city', 430, 28),
    ('mail_state', 458, 2),
    ('mail_zip', 460, 10),
    ('mail_country', 470, 2),
    ('file_date', 472, 8),
    ('fei_number', 480, 14),
    ('more_than_six_officers', 494, 1),
    ('last_transaction_date', 495, 8),
    ('state_country', 503, 2),
    ('report_year_1', 505, 4),
    ('filler_1', 509, 1),
    ('report_date_1', 510, 8),
    ('report_year_2', 518, 4),
    ('filler_2', 522, 1),
    ('report_date_2', 523, 8),
    ('report_year_3', 531, 4),
    ('filler_3', 535, 1),
    ('report_date_3', 536, 8),
    ('registered_agent_name', 544, 42),
    ('registered_agent_type', 586, 1),
    ('registered_agent_address', 587, 42),
    ('registered_agent_city', 629, 28),
    ('registered_agent_state', 657, 2),
    ('registered_agent_zip', 659, 9),
]

# Officers 1-6 are identical 128-byte sub-records starting at offset 668;
# generate their specs instead of spelling out 42 near-identical tuples.
for _n in range(1, 7):
    _base = 668 + 128 * (_n - 1)
    FIELD_SPECS += [
        (f'officer_{_n}_title', _base, 4),
        (f'officer_{_n}_type', _base + 4, 1),
        (f'officer_{_n}_name', _base + 5, 42),
        (f'officer_{_n}_address', _base + 47, 42),
        (f'officer_{_n}_city', _base + 89, 28),
        (f'officer_{_n}_state', _base + 117, 2),
        (f'officer_{_n}_zip', _base + 119, 9),
    ]

FIELD_SPECS.append(('filler_end', 1436, 4))
def parse_line(line): | |
"""Parse a single line according to field specifications and clean null bytes""" | |
return { | |
name: line[start:start+length].strip().replace('\x00', '') | |
for name, start, length in FIELD_SPECS | |
} | |
import os
import shutil

import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine


def _read_records(filename):
    """Read one fixed-width file, trying encodings until one decodes.

    Returns (records, encoding_used); raises UnicodeDecodeError if every
    candidate encoding fails. (Encoding fallback list came from cursor.)
    """
    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(filename, 'r', encoding=encoding) as file:
                return ([parse_line(line) for line in file if line.strip()],
                        encoding)
        except UnicodeDecodeError:
            if encoding == encodings[-1]:  # last candidate: give up
                raise
            continue  # try next encoding


# Create processed directory if it doesn't exist (exist_ok avoids the
# check-then-create race of os.path.exists + makedirs)
processed_dir = 'processed'
os.makedirs(processed_dir, exist_ok=True)

# Get all .txt files in current directory, sorted alphabetically
txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))

# Create SQLAlchemy engine (local trust auth; no password)
engine = create_engine('postgresql://dharam:@localhost:5432/savvy')

# Process each file; on failure the file is left in place and reported
for filename in txt_files:
    try:
        records, encoding = _read_records(filename)
        df = pd.DataFrame(records)
        # Track provenance of every row
        df['filename'] = filename
        # File names begin with YYYYMMDD; store it as a proper DATE column
        df['file_date_from_name'] = pd.to_datetime(filename[:8], format='%Y%m%d')
        df.to_sql('corporations', engine, if_exists='append', index=False,
                  dtype={'file_date_from_name': sqlalchemy.types.Date})
        # Only move the file once the insert has succeeded
        shutil.move(filename, os.path.join(processed_dir, filename))
        print(f"Processed {filename} using {encoding}: {len(df)} records inserted")
    except Exception as e:
        print(f"Error processing {filename}: {e}")

print("All files processed")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment