tbbooher · May 27, 2025 01:40
diff --git a/upload_bikes.py b/upload_bikes.py
 #!/usr/bin/env python3
 import pandas as pd
 import psycopg2
 from psycopg2 import sql

 # 1) Load & clean CSV
 df = pd.read_csv("data/bikes.csv")

 # Rename 'original model' → valid identifier (the table column is original_model)
 df.rename(columns={"original model": "original_model"}, inplace=True)

 # Clean price: strip '$' and ',' → float.
 # errors="coerce" maps anything that still is not a number (empty string,
 # "nan", free text such as "Free" or "1200 OBO") to NaN instead of raising
 # ValueError and aborting the whole load — the same policy already used for
 # year and the timestamp columns below.
 price_text = (
     df["price"]
     .astype(str)
     .str.replace(r"[\$,]", "", regex=True)
     .str.strip()
 )
 df["price"] = pd.to_numeric(price_text, errors="coerce").astype("float")

 # Cast year → nullable Int64; unparseable or missing values become pd.NA
 df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

 # Parse timestamps as timezone-aware UTC; unparseable values become NaT
 for col in ("post_date", "scraped_at"):
     df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

 # 2) Open the database session and rebuild the target table
 dsn = "host=localhost port=5432 dbname=bike_prices user=tim"
 conn = psycopg2.connect(dsn)
 cur = conn.cursor()

 # Any existing bike_listings_new table is dropped before it is recreated,
 # so every run starts from an empty table. Both statements are sent in a
 # single execute() call and committed before any rows are inserted.
 create_table_sql = """
 DROP TABLE IF EXISTS bike_listings_new;
 CREATE TABLE bike_listings_new (
    id              SERIAL      PRIMARY KEY,
    city            TEXT        NOT NULL,
    post_date       TIMESTAMPTZ,
    price           NUMERIC     NOT NULL,
    currency        TEXT        NOT NULL,
    title           TEXT        NOT NULL,
    url             TEXT,
    location        TEXT,
    year            INTEGER,
    brand           TEXT,
    original_model  TEXT,
    model           TEXT,
    frame_material  TEXT,
    wheel_size      TEXT,
    drivetrain      TEXT,
    brake_type      TEXT,
    suspension      TEXT,
    scraped_at      TIMESTAMPTZ,
    tire_brand      TEXT,
    frame_size      TEXT,
    travel          TEXT,
    source          TEXT        NOT NULL
 );
 """
 cur.execute(create_table_sql)
 conn.commit()

 # 3) Prepare INSERT
 # The column list comes straight from the DataFrame header; it is expected
 # to be ['city','post_date','price',...,'source'].
 cols = list(df.columns)

 # Identifiers are quoted by psycopg2 and one positional placeholder is
 # emitted per column, so values are always passed as bound parameters.
 quoted_fields = sql.SQL(", ").join(sql.Identifier(name) for name in cols)
 value_slots = sql.SQL(", ").join(sql.Placeholder() for _ in cols)
 insert_template = sql.SQL("""
 INSERT INTO bike_listings_new ({fields})
 VALUES ({placeholders})
 """)
 insert_stmt = insert_template.format(
     fields=quoted_fields,
     placeholders=value_slots,
 )

 # 4) Insert rows, converting pandas-missing → None
 inserted = 0
 failed = 0
 try:
     for i, row in df.iterrows():
         # pd.isna covers NaN, None, pd.NA, pd.NaT
         raw = [None if pd.isna(v) else v for v in row]
         try:
             cur.execute(insert_stmt, raw)
         except Exception as e:
             # Deliberately broad: a bad row is logged and skipped, it must
             # never abort the load. The rollback clears the failed
             # transaction so the following rows can still be inserted.
             conn.rollback()
             failed += 1
             print(f"[ERROR] row {i}: {e}")
             print(" ->", raw)
         else:
             # Commit per row so one bad row never discards earlier good ones
             conn.commit()
             inserted += 1
 finally:
     # Release the cursor and connection even if the loop is interrupted
     cur.close()
     conn.close()

 # Report the rows that were actually committed, not len(df)
 print(f"✔ Inserted {inserted} rows into bike_listings_new")
 if failed:
     print(f"✘ Skipped {failed} rows due to errors (see [ERROR] lines above)")
	#!/usr/bin/env python3
	import pandas as pd
	import psycopg2
	from psycopg2 import sql

	# 1) Load & clean CSV
	df = pd.read_csv("data/bikes.csv")

	# Rename 'original model' → valid identifier (the table column is original_model)
	df.rename(columns={"original model": "original_model"}, inplace=True)

	# Clean price: strip '$' and ',' → float.
	# errors="coerce" maps anything that still is not a number (empty string,
	# "nan", free text such as "Free" or "1200 OBO") to NaN instead of raising
	# ValueError and aborting the whole load — the same policy already used for
	# year and the timestamp columns below.
	price_text = (
	    df["price"]
	    .astype(str)
	    .str.replace(r"[\$,]", "", regex=True)
	    .str.strip()
	)
	df["price"] = pd.to_numeric(price_text, errors="coerce").astype("float")

	# Cast year → nullable Int64; unparseable or missing values become pd.NA
	df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

	# Parse timestamps as timezone-aware UTC; unparseable values become NaT
	for col in ("post_date", "scraped_at"):
	    df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

	# 2) Open the database session and rebuild the target table
	dsn = "host=localhost port=5432 dbname=bike_prices user=tim"
	conn = psycopg2.connect(dsn)
	cur = conn.cursor()

	# Any existing bike_listings_new table is dropped before it is recreated,
	# so every run starts from an empty table. Both statements are sent in a
	# single execute() call and committed before any rows are inserted.
	create_table_sql = """
	DROP TABLE IF EXISTS bike_listings_new;
	CREATE TABLE bike_listings_new (
	id SERIAL PRIMARY KEY,
	city TEXT NOT NULL,
	post_date TIMESTAMPTZ,
	price NUMERIC NOT NULL,
	currency TEXT NOT NULL,
	title TEXT NOT NULL,
	url TEXT,
	location TEXT,
	year INTEGER,
	brand TEXT,
	original_model TEXT,
	model TEXT,
	frame_material TEXT,
	wheel_size TEXT,
	drivetrain TEXT,
	brake_type TEXT,
	suspension TEXT,
	scraped_at TIMESTAMPTZ,
	tire_brand TEXT,
	frame_size TEXT,
	travel TEXT,
	source TEXT NOT NULL
	);
	"""
	cur.execute(create_table_sql)
	conn.commit()

	# 3) Prepare INSERT
	# The column list comes straight from the DataFrame header; it is expected
	# to be ['city','post_date','price',...,'source'].
	cols = list(df.columns)

	# Identifiers are quoted by psycopg2 and one positional placeholder is
	# emitted per column, so values are always passed as bound parameters.
	quoted_fields = sql.SQL(", ").join(sql.Identifier(name) for name in cols)
	value_slots = sql.SQL(", ").join(sql.Placeholder() for _ in cols)
	insert_template = sql.SQL("""
	INSERT INTO bike_listings_new ({fields})
	VALUES ({placeholders})
	""")
	insert_stmt = insert_template.format(
	    fields=quoted_fields,
	    placeholders=value_slots,
	)

	# 4) Insert rows, converting pandas-missing → None
	inserted = 0
	failed = 0
	try:
	    for i, row in df.iterrows():
	        # pd.isna covers NaN, None, pd.NA, pd.NaT
	        raw = [None if pd.isna(v) else v for v in row]
	        try:
	            cur.execute(insert_stmt, raw)
	        except Exception as e:
	            # Deliberately broad: a bad row is logged and skipped, it must
	            # never abort the load. The rollback clears the failed
	            # transaction so the following rows can still be inserted.
	            conn.rollback()
	            failed += 1
	            print(f"[ERROR] row {i}: {e}")
	            print(" ->", raw)
	        else:
	            # Commit per row so one bad row never discards earlier good ones
	            conn.commit()
	            inserted += 1
	finally:
	    # Release the cursor and connection even if the loop is interrupted
	    cur.close()
	    conn.close()

	# Report the rows that were actually committed, not len(df)
	print(f"✔ Inserted {inserted} rows into bike_listings_new")
	if failed:
	    print(f"✘ Skipped {failed} rows due to errors (see [ERROR] lines above)")