Created
May 27, 2025 01:40
-
-
Save tbbooher/6f8221af28c145c501baf9620620e1b6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import pandas as pd | |
import psycopg2 | |
from psycopg2 import sql | |
# Load scraped bike listings from a CSV file into a freshly (re)created
# PostgreSQL table, reporting any rows the database rejects.

CSV_PATH = "data/bikes.csv"
DSN = "host=localhost port=5432 dbname=bike_prices user=tim"

# Columns parsed as timezone-aware (UTC) timestamps.
TIMESTAMP_COLUMNS = ("post_date", "scraped_at")

# Drops and recreates the target table, so every run starts from empty.
CREATE_TABLE_SQL = """
DROP TABLE IF EXISTS bike_listings_new;
CREATE TABLE bike_listings_new (
    id SERIAL PRIMARY KEY,
    city TEXT NOT NULL,
    post_date TIMESTAMPTZ,
    price NUMERIC NOT NULL,
    currency TEXT NOT NULL,
    title TEXT NOT NULL,
    url TEXT,
    location TEXT,
    year INTEGER,
    brand TEXT,
    original_model TEXT,
    model TEXT,
    frame_material TEXT,
    wheel_size TEXT,
    drivetrain TEXT,
    brake_type TEXT,
    suspension TEXT,
    scraped_at TIMESTAMPTZ,
    tire_brand TEXT,
    frame_size TEXT,
    travel TEXT,
    source TEXT NOT NULL
);
"""


def clean_listings(df):
    """Return a cleaned copy of the raw listings frame.

    * renames ``original model`` to the valid identifier ``original_model``;
    * strips ``$`` and ``,`` from ``price`` and converts it to float;
    * converts ``year`` to nullable ``Int64``;
    * parses the timestamp columns as UTC datetimes.

    Values that cannot be parsed become missing (NaN / <NA> / NaT) instead of
    raising, so one malformed cell cannot abort the whole load.

    Raises:
        KeyError: if ``price``, ``year`` or a timestamp column is absent.
    """
    # rename() without inplace returns a new frame; the caller's is untouched.
    df = df.rename(columns={"original model": "original_model"})

    price_text = (
        df["price"]
        .astype(str)
        .str.replace(r"[\$,]", "", regex=True)
        .str.strip()
    )
    # errors="coerce" maps "", "nan" and free text such as "call for price"
    # to NaN, matching how `year` is handled below.
    df["price"] = pd.to_numeric(price_text, errors="coerce").astype("float")

    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

    for col in TIMESTAMP_COLUMNS:
        df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

    return df


def to_db_value(value):
    """Convert one cell to a value suitable as a query parameter.

    Missing markers (NaN, None, pd.NA, pd.NaT) become ``None`` so they are
    sent as SQL NULL. NumPy scalars are unwrapped to plain Python objects
    because psycopg2 registers no adapter for NumPy integer types by default.
    """
    if pd.isna(value):
        return None
    unwrap = getattr(value, "item", None)  # present on NumPy scalars
    return unwrap() if callable(unwrap) else value


def to_db_params(row):
    """Return the values of *row* as a list of query parameters."""
    return [to_db_value(v) for v in row]


def build_insert_statement(cols):
    """Compose ``INSERT INTO bike_listings_new (...) VALUES (...)`` for *cols*.

    Column names are quoted with ``sql.Identifier``; values are bound through
    one positional placeholder per column.
    """
    return sql.SQL("""
        INSERT INTO bike_listings_new ({fields})
        VALUES ({placeholders})
    """).format(
        fields=sql.SQL(", ").join(map(sql.Identifier, cols)),
        placeholders=sql.SQL(", ").join(sql.Placeholder() * len(cols)),
    )


def insert_rows(conn, cur, df):
    """Insert every row of *df*, committing row by row.

    A row that fails is rolled back and reported on stdout, and the load
    continues with the next row.

    Returns:
        The number of rows actually inserted.
    """
    # Frame columns are used verbatim as table columns; expected to be
    # ['city', 'post_date', 'price', ..., 'source'].
    insert_stmt = build_insert_statement(list(df.columns))
    inserted = 0
    for i, row in df.iterrows():
        params = to_db_params(row)
        try:
            cur.execute(insert_stmt, params)
        except Exception as e:  # deliberate best effort: report and go on
            # The failed statement aborts the transaction; reset it so the
            # following rows can still be inserted.
            conn.rollback()
            print(f"[ERROR] row {i}: {e}")
            print(" ->", params)
        else:
            conn.commit()
            inserted += 1
    return inserted


def main():
    """Run the load: read and clean the CSV, recreate the table, insert."""
    # 1) Load & clean CSV
    df = clean_listings(pd.read_csv(CSV_PATH))

    # 2) Connect & (re)create table
    conn = psycopg2.connect(DSN)
    try:
        with conn.cursor() as cur:
            cur.execute(CREATE_TABLE_SQL)
            conn.commit()
            # 3) + 4) Prepare INSERT and insert rows
            inserted = insert_rows(conn, cur, df)
    finally:
        # Always release the connection, even if the DDL or the load raised.
        conn.close()

    print(f"✔ Inserted {inserted} rows into bike_listings_new")
    failed = len(df) - inserted
    if failed:
        print(f"✘ {failed} rows failed and were skipped")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment