Skip to content

Instantly share code, notes, and snippets.

@tbbooher
Created May 27, 2025 02:56
Show Gist options
  • Save tbbooher/ccf6972b229858334e248d4be4a67f87 to your computer and use it in GitHub Desktop.
Save tbbooher/ccf6972b229858334e248d4be4a67f87 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Fit a resale-price model for used mountain bikes.
"""
import os, psycopg2, pandas as pd, numpy as np, datetime as dt
from sklearn.model_selection import GroupKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import RidgeCV
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import joblib, pathlib
# ---------- 1. Load data from Postgres ---------- #
# Pull listings whose price and model year fall inside plausible ranges.
# Only `brand` and `price` are guaranteed non-NULL by the query; the other
# selected columns may still contain NULLs.
conn = psycopg2.connect("host=localhost port=5432 dbname=bike_prices user=tim")
try:
    df = pd.read_sql("""
SELECT price, year, brand, model, frame_material, frame_size,
city, scraped_at
FROM bike_listings
WHERE price BETWEEN 300 AND 20000
AND year BETWEEN 1995 AND 2025
AND brand IS NOT NULL
AND price IS NOT NULL
""", conn)
finally:
    # Close explicitly, even when the query raises: psycopg2's `with conn:`
    # only ends the transaction and leaves the connection open.
    conn.close()
# ---------- 2. Clean & feature engineering ---------- #
# Brands are compared case-insensitively; brands with fewer than 5 listings
# are dropped entirely.
brand_lower = df['brand'].str.lower()
brand_counts = brand_lower.value_counts()
valid_brands = brand_counts[brand_counts >= 5].index
# .copy() yields an independent frame so the column assignments below do not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[brand_lower.isin(valid_brands)].copy()
# Store the lower-cased spelling so the one-hot encoder sees the same brand
# identity that the frequency filter above counted ("Trek" == "trek").
df['brand'] = df['brand'].str.lower()
# Age is measured in whole years relative to the year the script is run.
df['age'] = dt.datetime.now().year - df['year']
# log1p keeps brand-new bikes (age 0) finite while compressing old ages.
df['log_age'] = np.log1p(df['age'])
# Column groups consumed by the preprocessing pipeline.
numeric = ['age', 'log_age']
categoric = ['brand', 'frame_material', 'frame_size', 'city']
# ---------- 3. Train/test split ---------- #
# Hold out whole cities: the first GroupKFold fold becomes the test set, so no
# city appears in both train and test.
# Listings with a NULL city get their own placeholder group; GroupKFold cannot
# handle missing group labels, and nunique() would not count them.
split_groups = df['city'].fillna('missing')
# GroupKFold needs at least as many groups as folds (and at least 2 folds).
n_splits = min(5, split_groups.nunique())
train_idx, test_idx = next(GroupKFold(n_splits=n_splits).split(df, groups=split_groups))
# The split yields positional indices, hence iloc rather than loc.
train, test = df.iloc[train_idx], df.iloc[test_idx]
# ---------- 4. Preprocessing pipeline ---------- #
# Numeric branch: median-impute gaps, then hand the values on unchanged.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('passthrough', 'passthrough'),
])
# Categorical branch: replace missing entries with the literal 'missing', then
# one-hot encode. Levels seen fewer than 5 times are pooled as infrequent, and
# levels unseen at fit time are ignored instead of raising.
# NOTE(review): SimpleImputer's default missing marker is NaN; confirm that SQL
# NULLs reach this step as NaN rather than None.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', min_frequency=5)),
])
# Columns not listed in `numeric` or `categoric` are not routed to any branch.
pre = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric),
    ('cat', categorical_steps, categoric),
])
# ---------- 5A. Ridge baseline ---------- #
# Linear reference model: shared preprocessing followed by ridge regression,
# with the penalty chosen from 20 log-spaced values between 1e-2 and 1e3.
ridge_alphas = np.logspace(-2, 3, 20)
ridge_pipe = Pipeline(steps=[
    ('pre', pre),
    ('model', RidgeCV(alphas=ridge_alphas)),
])
# The whole frame is passed in; `pre` picks out the feature columns it needs.
ridge_pipe.fit(train, train['price'])
pred_ridge = ridge_pipe.predict(test)
ridge_mae = mean_absolute_error(test['price'], pred_ridge)
print(f"Ridge MAE = {ridge_mae:.0f}")
# ---------- 5B. LightGBM quantile model ---------- #
# `pre` is the same object embedded in ridge_pipe; it is refit here on the
# same training rows and then reused to encode the test rows.
train_matrix = pre.fit_transform(train)
lgb_train = lgb.Dataset(train_matrix, label=train['price'])
# Quantile objective at alpha=0.5, i.e. the model predicts the median price.
params = {
    'objective': 'quantile',
    'alpha': 0.5,
    'learning_rate': 0.05,
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
}
# The only validation set is the training set itself, so the metric LightGBM
# reports during training is an in-sample figure.
gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=[lgb_train])
test_matrix = pre.transform(test)
pred_gbm = gbm.predict(test_matrix)
mae_gbm = mean_absolute_error(test['price'], pred_gbm)
print(f"LightGBM-median MAE = {mae_gbm:.0f}")
# ---------- 6. Save model, predictions, and outputs ---------- #
# Make sure the output directory exists; an existing one is left alone.
models_dir = pathlib.Path('models')
models_dir.mkdir(exist_ok=True)
# Persist the fitted preprocessor together with the booster so new listings
# can be encoded the same way at prediction time.
bundle = {'pre': pre, 'gbm': gbm}
joblib.dump(bundle, 'models/bike_price_gbm.pkl')
print("Model saved → models/bike_price_gbm.pkl")
# Held-out rows plus the LightGBM prediction as an extra trailing column.
test_results = test.assign(predicted_price=pred_gbm)
test_results.to_csv("bike_price_predictions.csv", index=False)
print("Predictions saved → bike_price_predictions.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment