Skip to content

Instantly share code, notes, and snippets.

@tbbooher
Created May 27, 2025 01:40
Show Gist options
  • Save tbbooher/6f8221af28c145c501baf9620620e1b6 to your computer and use it in GitHub Desktop.
Save tbbooher/6f8221af28c145c501baf9620620e1b6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import pandas as pd
import psycopg2
from psycopg2 import sql
# Load data/bikes.csv, normalise its columns, rebuild the bike_listings_new
# table in the local bike_prices database, and insert the rows one at a time
# so that a single bad row is reported and skipped instead of aborting the run.

# 1) Load & clean CSV
df = pd.read_csv("data/bikes.csv")

# Rename 'original model' → valid identifier
df = df.rename(columns={"original model": "original_model"})

# Clean price: strip '$' and ',' then parse as a number. Values that are still
# not numeric (blank, "free", "OBO", ...) become NaN rather than raising and
# killing the whole load; those rows are rejected individually further down by
# the NOT NULL constraint on price and show up as [ERROR] lines.
df["price"] = pd.to_numeric(
    df["price"].astype(str).str.replace(r"[\$,]", "", regex=True),
    errors="coerce",
).astype("float")

# Cast year → nullable Int64 (missing / unparseable years become <NA>)
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Parse timestamps as timezone-aware UTC; unparseable values become NaT
for col in ("post_date", "scraped_at"):
    df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

# 2) Statements used below.
# NOTE: this drops and recreates bike_listings_new on every run.
create_table_sql = """
DROP TABLE IF EXISTS bike_listings_new;
CREATE TABLE bike_listings_new (
id SERIAL PRIMARY KEY,
city TEXT NOT NULL,
post_date TIMESTAMPTZ,
price NUMERIC NOT NULL,
currency TEXT NOT NULL,
title TEXT NOT NULL,
url TEXT,
location TEXT,
year INTEGER,
brand TEXT,
original_model TEXT,
model TEXT,
frame_material TEXT,
wheel_size TEXT,
drivetrain TEXT,
brake_type TEXT,
suspension TEXT,
scraped_at TIMESTAMPTZ,
tire_brand TEXT,
frame_size TEXT,
travel TEXT,
source TEXT NOT NULL
);
"""

# The INSERT column list is taken from the CSV header, so every CSV column
# must exist in the table above. Identifiers are quoted via psycopg2.sql
# instead of being spliced into the statement as raw text.
cols = list(df.columns)  # should be ['city','post_date','price',...,'source']
insert_stmt = sql.SQL("""
INSERT INTO bike_listings_new ({fields})
VALUES ({placeholders})
""").format(
    fields=sql.SQL(", ").join(map(sql.Identifier, cols)),
    placeholders=sql.SQL(", ").join(sql.Placeholder() * len(cols)),
)

# 3) Connect, (re)create the table, insert rows
inserted = 0
conn = psycopg2.connect("host=localhost port=5432 dbname=bike_prices user=tim")
try:
    # The cursor context manager closes the cursor; the outer finally closes
    # the connection, so neither leaks if the DDL or an insert raises.
    with conn.cursor() as cur:
        cur.execute(create_table_sql)
        conn.commit()

        # 4) Insert rows, converting pandas-missing → None
        for i, row in df.iterrows():
            # pd.isna covers NaN, None, pd.NA, pd.NaT
            raw = [None if pd.isna(v) else v for v in row]
            try:
                cur.execute(insert_stmt, raw)
            except Exception as e:
                # Best-effort load: undo the failed statement, report the
                # offending row, and carry on with the next one.
                conn.rollback()
                print(f"[ERROR] row {i}: {e}")
                print(" ->", raw)
            else:
                # Commit per row so earlier rows survive a later failure.
                conn.commit()
                inserted += 1
finally:
    conn.close()

# Report what was actually written, not just how many rows the CSV had.
print(f"✔ Inserted {inserted} rows into bike_listings_new")
failed = len(df) - inserted
if failed:
    print(f"[WARN] {failed} of {len(df)} rows were skipped; see [ERROR] lines above")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment