tbbooher · May 27, 2025 02:56
diff --git a/model.py b/model.py
 #!/usr/bin/env python3
 """
 Fit a resale-price model for used mountain bikes.
 """

 import os, psycopg2, pandas as pd, numpy as np, datetime as dt
 from sklearn.model_selection import GroupKFold
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.metrics import mean_absolute_error
 from sklearn.linear_model import RidgeCV
 from sklearn.impute import SimpleImputer
 import lightgbm as lgb
 import joblib, pathlib

 # ---------- 1. Load data from Postgres ---------- #
 # Pull listings whose price lies in 300-20000 and whose model year lies in
 # 1995-2025, with a non-null brand. Other columns may still contain NULLs.
 query = """
    SELECT price, year, brand, model, frame_material, frame_size,
           city, scraped_at
    FROM   bike_listings
    WHERE  price BETWEEN 300 AND 20000
      AND year BETWEEN 1995 AND 2025
      AND brand IS NOT NULL
      AND price IS NOT NULL
 """
 conn = psycopg2.connect("host=localhost port=5432 dbname=bike_prices user=tim")
 try:
     df = pd.read_sql(query, conn)
 finally:
     # Close the connection even when the query fails, so an error while
     # reading does not leave the Postgres session open.
     conn.close()

 # ---------- 2. Clean & feature engineering ---------- #
 # Keep only brands with at least 5 listings, counted case-insensitively.
 # NOTE(review): the 'brand' column itself keeps its original casing, so the
 # one-hot encoder may still treat "Trek" and "trek" as separate categories —
 # confirm whether brand should be normalised before encoding.
 brand_lower = df['brand'].str.lower()
 brand_counts = brand_lower.value_counts()
 valid_brands = brand_counts[brand_counts >= 5].index
 # .copy() detaches the filtered frame from the query result so the column
 # assignments below do not raise pandas' SettingWithCopyWarning.
 df = df[brand_lower.isin(valid_brands)].copy()

 # Age is the current calendar year minus the model year; log1p maps age 0
 # to 0 and compresses the spread among old bikes.
 df['age'] = dt.datetime.now().year - df['year']
 df['log_age'] = np.log1p(df['age'])

 # Column lists handed to the numeric and categorical preprocessing branches.
 numeric = ['age', 'log_age']
 categoric = ['brand', 'frame_material', 'frame_size', 'city']

 # ---------- 3. Train/test split ---------- #
 # Hold out whole cities: the first GroupKFold fold becomes the test set, so
 # no city contributes rows to both train and test.
 # Listings without a city are pooled into a single 'missing' group (the same
 # label the categorical imputer uses); a mix of strings and nulls in the
 # group labels is presumably not sortable by GroupKFold — TODO confirm.
 split_groups = df['city'].fillna('missing')
 n_splits = min(5, split_groups.nunique())
 if n_splits < 2:
     # GroupKFold needs at least two groups; fail with an actionable message.
     raise ValueError(
         "Need listings from at least 2 distinct cities for a grouped "
         f"train/test split, found {n_splits}"
     )
 train_idx, test_idx = next(
     GroupKFold(n_splits=n_splits).split(df, groups=split_groups)
 )
 train, test = df.iloc[train_idx], df.iloc[test_idx]

 # ---------- 4. Preprocessing pipeline ---------- #
 # Numeric branch: fill gaps with the column median, then pass values through.
 numeric_steps = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='median')),
     ('passthrough', 'passthrough'),
 ])
 # Categorical branch: label gaps as 'missing', then one-hot encode with
 # min_frequency=5 and unknown categories ignored at transform time.
 categorical_steps = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
     ('ohe', OneHotEncoder(handle_unknown='ignore', min_frequency=5)),
 ])
 # Columns not listed in `numeric` or `categoric` are not routed to a branch.
 pre = ColumnTransformer(transformers=[
     ('num', numeric_steps, numeric),
     ('cat', categorical_steps, categoric),
 ])

 # ---------- 5A. Ridge baseline ---------- #
 # Linear reference model: the shared preprocessor followed by RidgeCV, which
 # picks its penalty from 20 log-spaced candidates between 1e-2 and 1e3.
 ridge_alphas = np.logspace(-2, 3, 20)
 ridge_pipe = Pipeline(steps=[
     ('pre', pre),
     ('model', RidgeCV(alphas=ridge_alphas)),
 ])
 ridge_pipe.fit(train, train['price'])
 pred_ridge = ridge_pipe.predict(test)
 mae_ridge = mean_absolute_error(test['price'], pred_ridge)
 print(f"Ridge MAE = {mae_ridge:.0f}")

 # ---------- 5B. LightGBM quantile model ---------- #
 # Refit the preprocessor on the training rows and wrap the encoded matrix.
 X_train = pre.fit_transform(train)
 lgb_train = lgb.Dataset(X_train, label=train['price'])

 # Quantile objective with alpha=0.5, i.e. the booster targets the median.
 params = {
     'objective': 'quantile',
     'alpha': 0.5,
     'learning_rate': 0.05,
     'num_leaves': 64,
     'min_data_in_leaf': 20,
     'feature_fraction': 0.9,
     'bagging_fraction': 0.8,
     'bagging_freq': 1,
 }

 # The only validation set is the training data itself, so any metric
 # reported during boosting is a training score, not a held-out one.
 gbm = lgb.train(
     params,
     lgb_train,
     num_boost_round=600,
     valid_sets=[lgb_train],
 )

 # Score the held-out rows with the already-fitted preprocessor.
 X_test = pre.transform(test)
 pred_gbm = gbm.predict(X_test)
 mae_gbm = mean_absolute_error(test['price'], pred_gbm)
 print(f"LightGBM-median MAE = {mae_gbm:.0f}")

 # ---------- 6. Save model, predictions, and outputs ---------- #
 # Store the fitted preprocessor and the booster together in one pickle.
 model_dir = pathlib.Path('models')
 model_dir.mkdir(exist_ok=True)
 artifact = {'pre': pre, 'gbm': gbm}
 joblib.dump(artifact, 'models/bike_price_gbm.pkl')
 print("Model saved → models/bike_price_gbm.pkl")

 # Write the held-out rows plus the LightGBM prediction as an extra column.
 test_results = test.assign(predicted_price=pred_gbm)
 test_results.to_csv("bike_price_predictions.csv", index=False)
 print("Predictions saved → bike_price_predictions.csv")
	#!/usr/bin/env python3
	"""
	Fit a resale-price model for used mountain bikes.
	"""

	import os, psycopg2, pandas as pd, numpy as np, datetime as dt
	from sklearn.model_selection import GroupKFold
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.metrics import mean_absolute_error
	from sklearn.linear_model import RidgeCV
	from sklearn.impute import SimpleImputer
	import lightgbm as lgb
	import joblib, pathlib

	# ---------- 1. Load data from Postgres ---------- #
	# Pull listings whose price lies in 300-20000 and whose model year lies in
	# 1995-2025, with a non-null brand. Other columns may still contain NULLs.
	query = """
	SELECT price, year, brand, model, frame_material, frame_size,
	city, scraped_at
	FROM bike_listings
	WHERE price BETWEEN 300 AND 20000
	AND year BETWEEN 1995 AND 2025
	AND brand IS NOT NULL
	AND price IS NOT NULL
	"""
	conn = psycopg2.connect("host=localhost port=5432 dbname=bike_prices user=tim")
	try:
	    df = pd.read_sql(query, conn)
	finally:
	    # Close the connection even when the query fails, so an error while
	    # reading does not leave the Postgres session open.
	    conn.close()

	# ---------- 2. Clean & feature engineering ---------- #
	# Keep only brands with at least 5 listings, counted case-insensitively.
	# NOTE(review): the 'brand' column itself keeps its original casing, so the
	# one-hot encoder may still treat "Trek" and "trek" as separate categories —
	# confirm whether brand should be normalised before encoding.
	brand_lower = df['brand'].str.lower()
	brand_counts = brand_lower.value_counts()
	valid_brands = brand_counts[brand_counts >= 5].index
	# .copy() detaches the filtered frame from the query result so the column
	# assignments below do not raise pandas' SettingWithCopyWarning.
	df = df[brand_lower.isin(valid_brands)].copy()

	# Age is the current calendar year minus the model year; log1p maps age 0
	# to 0 and compresses the spread among old bikes.
	df['age'] = dt.datetime.now().year - df['year']
	df['log_age'] = np.log1p(df['age'])

	# Column lists handed to the numeric and categorical preprocessing branches.
	numeric = ['age', 'log_age']
	categoric = ['brand', 'frame_material', 'frame_size', 'city']

	# ---------- 3. Train/test split ---------- #
	# Hold out whole cities: the first GroupKFold fold becomes the test set, so
	# no city contributes rows to both train and test.
	# Listings without a city are pooled into a single 'missing' group (the same
	# label the categorical imputer uses); a mix of strings and nulls in the
	# group labels is presumably not sortable by GroupKFold — TODO confirm.
	split_groups = df['city'].fillna('missing')
	n_splits = min(5, split_groups.nunique())
	if n_splits < 2:
	    # GroupKFold needs at least two groups; fail with an actionable message.
	    raise ValueError(
	        "Need listings from at least 2 distinct cities for a grouped "
	        f"train/test split, found {n_splits}"
	    )
	train_idx, test_idx = next(
	    GroupKFold(n_splits=n_splits).split(df, groups=split_groups)
	)
	train, test = df.iloc[train_idx], df.iloc[test_idx]

	# ---------- 4. Preprocessing pipeline ---------- #
	# Numeric branch: fill gaps with the column median, then pass values through.
	numeric_steps = Pipeline(steps=[
	    ('imputer', SimpleImputer(strategy='median')),
	    ('passthrough', 'passthrough'),
	])
	# Categorical branch: label gaps as 'missing', then one-hot encode with
	# min_frequency=5 and unknown categories ignored at transform time.
	categorical_steps = Pipeline(steps=[
	    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
	    ('ohe', OneHotEncoder(handle_unknown='ignore', min_frequency=5)),
	])
	# Columns not listed in `numeric` or `categoric` are not routed to a branch.
	pre = ColumnTransformer(transformers=[
	    ('num', numeric_steps, numeric),
	    ('cat', categorical_steps, categoric),
	])

	# ---------- 5A. Ridge baseline ---------- #
	# Linear reference model: the shared preprocessor followed by RidgeCV, which
	# picks its penalty from 20 log-spaced candidates between 1e-2 and 1e3.
	ridge_alphas = np.logspace(-2, 3, 20)
	ridge_pipe = Pipeline(steps=[
	    ('pre', pre),
	    ('model', RidgeCV(alphas=ridge_alphas)),
	])
	ridge_pipe.fit(train, train['price'])
	pred_ridge = ridge_pipe.predict(test)
	mae_ridge = mean_absolute_error(test['price'], pred_ridge)
	print(f"Ridge MAE = {mae_ridge:.0f}")

	# ---------- 5B. LightGBM quantile model ---------- #
	# Refit the preprocessor on the training rows and wrap the encoded matrix.
	X_train = pre.fit_transform(train)
	lgb_train = lgb.Dataset(X_train, label=train['price'])

	# Quantile objective with alpha=0.5, i.e. the booster targets the median.
	params = {
	    'objective': 'quantile',
	    'alpha': 0.5,
	    'learning_rate': 0.05,
	    'num_leaves': 64,
	    'min_data_in_leaf': 20,
	    'feature_fraction': 0.9,
	    'bagging_fraction': 0.8,
	    'bagging_freq': 1,
	}

	# The only validation set is the training data itself, so any metric
	# reported during boosting is a training score, not a held-out one.
	gbm = lgb.train(
	    params,
	    lgb_train,
	    num_boost_round=600,
	    valid_sets=[lgb_train],
	)

	# Score the held-out rows with the already-fitted preprocessor.
	X_test = pre.transform(test)
	pred_gbm = gbm.predict(X_test)
	mae_gbm = mean_absolute_error(test['price'], pred_gbm)
	print(f"LightGBM-median MAE = {mae_gbm:.0f}")

	# ---------- 6. Save model, predictions, and outputs ---------- #
	# Store the fitted preprocessor and the booster together in one pickle.
	model_dir = pathlib.Path('models')
	model_dir.mkdir(exist_ok=True)
	artifact = {'pre': pre, 'gbm': gbm}
	joblib.dump(artifact, 'models/bike_price_gbm.pkl')
	print("Model saved → models/bike_price_gbm.pkl")

	# Write the held-out rows plus the LightGBM prediction as an extra column.
	test_results = test.assign(predicted_price=pred_gbm)
	test_results.to_csv("bike_price_predictions.csv", index=False)
	print("Predictions saved → bike_price_predictions.csv")