Created
May 27, 2025 02:56
-
-
Save tbbooher/ccf6972b229858334e248d4be4a67f87 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
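#!/usr/bin/env python3
"""
Fit a resale-price model for used mountain bikes.

Loads listings from Postgres, trains a Ridge baseline and a LightGBM median
(quantile) model, then saves the fitted model and the held-out predictions.
"""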
import datetime as dt
import os
import pathlib
from contextlib import closing

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import psycopg2
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# ---------- 1. Load data from Postgres ---------- #
# Pull every listing with a brand whose price and model year fall inside the
# bounds below. `closing` guarantees the connection is released even when the
# query raises; psycopg2's own `with conn:` only scopes a transaction and does
# not close the connection.
with closing(
    psycopg2.connect("host=localhost port=5432 dbname=bike_prices user=tim")
) as conn:
    df = pd.read_sql("""
SELECT price, year, brand, model, frame_material, frame_size,
city, scraped_at
FROM bike_listings
WHERE price BETWEEN 300 AND 20000
AND year BETWEEN 1995 AND 2025
AND brand IS NOT NULL
AND price IS NOT NULL
""", conn)
# ---------- 2. Clean & feature engineering ---------- #
# Keep only brands with at least 5 listings, counted case-insensitively.
brand_lower = df['brand'].str.lower()
brand_counts = brand_lower.value_counts()
valid_brands = brand_counts[brand_counts >= 5].index
# .copy() makes the filtered frame independent of the raw query result, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[brand_lower.isin(valid_brands)].copy()
# NOTE(review): 'brand' keeps its original casing, so e.g. 'Trek' and 'trek'
# are still distinct one-hot categories even though they are counted together
# above -- confirm whether that is intended.

# Age in whole years relative to the current calendar year. Clipped at 0 so a
# model year ahead of the clock cannot produce a negative age, for which
# log1p(-1) would be -inf.
df['age'] = (dt.datetime.now().year - df['year']).clip(lower=0)
df['log_age'] = np.log1p(df['age'])

# Column groups consumed by the preprocessing ColumnTransformer.
numeric = ['age', 'log_age']
categoric = ['brand', 'frame_material', 'frame_size', 'city']
# ---------- 3. Train/test split ---------- #
# Hold out whole cities: the first GroupKFold fold becomes the test set, so no
# city contributes rows to both train and test.
# The query does not exclude NULL cities and GroupKFold cannot handle missing
# group labels, so those rows are collected into a single 'missing' group.
groups = df['city'].fillna('missing')
n_splits = min(5, groups.nunique())
if n_splits < 2:
    # GroupKFold itself raises ValueError for n_splits < 2; fail with a
    # message that names the actual cause instead.
    raise ValueError(
        "Need listings from at least 2 distinct cities for a grouped "
        f"train/test split, found {n_splits}"
    )
train_idx, test_idx = next(
    GroupKFold(n_splits=n_splits).split(df, groups=groups)
)
train, test = df.iloc[train_idx], df.iloc[test_idx]
# ---------- 4. Preprocessing pipeline ---------- #
# Numeric columns: fill gaps with the column median, then pass through as-is.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('passthrough', 'passthrough'),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode with min_frequency=5 and unknown categories ignored at transform time.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', min_frequency=5)),
])

# Only the listed columns are selected; each group goes to its sub-pipeline.
pre = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric),
    ('cat', categorical_steps, categoric),
])
# ---------- 5A. Ridge baseline ---------- #
# Linear baseline: shared preprocessing followed by RidgeCV, which picks its
# regularisation strength from 20 log-spaced candidates in [1e-2, 1e3].
ridge_alphas = np.logspace(-2, 3, 20)
ridge_pipe = Pipeline(steps=[
    ('pre', pre),
    ('model', RidgeCV(alphas=ridge_alphas)),
])

# The full frame is passed in; `pre` selects the feature columns it needs.
y_train = train['price']
ridge_pipe.fit(train, y_train)

# Score on the held-out rows.
pred_ridge = ridge_pipe.predict(test)
print(f"Ridge MAE = {mean_absolute_error(test['price'], pred_ridge):.0f}")
# ---------- 5B. LightGBM quantile model ---------- #
# Quantile objective with alpha=0.5, i.e. the model predicts the median price.
params = {
    'objective': 'quantile',
    'alpha': 0.5,
    'learning_rate': 0.05,
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
}

# Fit the shared preprocessor on the training rows and wrap the encoded
# features for LightGBM.
X_train = pre.fit_transform(train)
lgb_train = lgb.Dataset(X_train, label=train['price'])

# The training set doubles as the validation set here, so the metric LightGBM
# reports during boosting is a training metric.
gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=[lgb_train])

# Score on the held-out rows using the already-fitted preprocessor.
X_test = pre.transform(test)
pred_gbm = gbm.predict(X_test)
mae_gbm = mean_absolute_error(test['price'], pred_gbm)
print(f"LightGBM-median MAE = {mae_gbm:.0f}")
# ---------- 6. Save model, predictions, and outputs ---------- #
# Persist the fitted preprocessor together with the booster so both can be
# reloaded from a single file.
model_dir = pathlib.Path('models')
model_dir.mkdir(exist_ok=True)
artifact = {'pre': pre, 'gbm': gbm}
joblib.dump(artifact, 'models/bike_price_gbm.pkl')
print("Model saved → models/bike_price_gbm.pkl")

# Write the held-out rows with the LightGBM prediction alongside them.
test_results = test.assign(predicted_price=pred_gbm)
test_results.to_csv("bike_price_predictions.csv", index=False)
print("Predictions saved → bike_price_predictions.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment