Created
March 24, 2025 16:28
-
-
Save eliseomartelli/58d288e41286b35198a91c37d54fa146 to your computer and use it in GitHub Desktop.
Predict the next WWDC date
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from datetime import datetime, timedelta | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.linear_model import Ridge | |
from sklearn.model_selection import cross_val_score, LeaveOneOut | |
def preprocess_data(data):
    """Build the feature DataFrame from raw WWDC history records.

    Args:
        data: Sequence of dicts with a ``year`` value and ``wwdc`` /
            ``announced`` strings in ``"M/D"`` form. Any other keys (for
            example ``lead_time``) are carried through unchanged.

    Returns:
        A new ``pandas.DataFrame`` holding the input columns plus
        ``wwdc_date`` and ``announced_date`` (datetime columns) and the
        derived ``wwdc_month``, ``wwdc_day`` and ``wwdc_weekday``
        (Monday == 0) columns.

    Raises:
        KeyError: if ``year``, ``wwdc`` or ``announced`` is missing.
        ValueError: if a composed date does not match ``%m/%d/%Y``.
    """
    df = pd.DataFrame(data)
    year = df["year"].astype(str)

    # Parse whole columns at once. Assigning datetimes cell by cell into a
    # column that does not exist yet leaves the dtype up to pandas'
    # enlargement inference, and the ``.dt`` accessor below requires a real
    # datetime column.
    for source, target in (
        ("wwdc", "wwdc_date"),
        ("announced", "announced_date"),
    ):
        df[target] = pd.to_datetime(
            df[source].astype(str) + "/" + year, format="%m/%d/%Y"
        )

    df["wwdc_month"] = df["wwdc_date"].dt.month
    df["wwdc_day"] = df["wwdc_date"].dt.day
    df["wwdc_weekday"] = df["wwdc_date"].dt.weekday
    return df
def train_models(df):
    """Pick and fit the best regressor for each prediction target.

    For every target (``day``, ``month``, ``lead_time``, ``weekday``) a
    random forest and a ridge regression are scored with leave-one-out
    cross-validation on the single ``year`` feature. The candidate with
    the lowest mean absolute error is refitted on all rows. Scores are
    printed as they are computed.

    Args:
        df: DataFrame with ``year``, ``wwdc_day``, ``wwdc_month``,
            ``lead_time`` and ``wwdc_weekday`` columns.

    Returns:
        Tuple ``(best_models, feature_columns)``: a dict mapping target
        name to its fitted estimator, and the feature column index.
    """
    features = df[["year"]].copy()
    targets = {
        "day": df["wwdc_day"].values,
        "month": df["wwdc_month"].values,
        "lead_time": df["lead_time"].values,
        "weekday": df["wwdc_weekday"].values,
    }

    def make_candidates():
        # Fresh, unfitted estimators for one target.
        return {
            "rf": RandomForestRegressor(
                n_estimators=100, random_state=42, min_samples_leaf=2
            ),
            "ridge": Ridge(alpha=1.0),
        }

    splitter = LeaveOneOut()
    selected = {}

    print("\nModel Evaluation:")
    print("-" * 50)

    for target_name, y in targets.items():
        candidates = make_candidates()
        lowest_mae = float("inf")
        winner = None

        for model_name, estimator in candidates.items():
            fold_scores = cross_val_score(
                estimator,
                features,
                y,
                cv=splitter,
                scoring="neg_mean_absolute_error",
            )
            # The scorer reports negated MAE; flip the sign back.
            mae = -np.mean(fold_scores)
            if mae < lowest_mae:
                lowest_mae, winner = mae, model_name
            print(f"{target_name.capitalize()} - {model_name}: MAE = {mae:.2f}")

        selected[target_name] = candidates[winner].fit(features, y)
        print(
            f"Best for {target_name}: {winner}",
            f"MAE = {lowest_mae:.2f}",
        )
        print("-" * 50)

    return selected, features.columns
def predict_dates(models, year, feature_columns):
    """Predict the WWDC start date and its announcement date for a year.

    Args:
        models: Mapping with ``"month"``, ``"day"`` and ``"lead_time"``
            entries, each exposing ``predict``.
        year: Calendar year to predict for.
        feature_columns: Accepted for interface compatibility; not used.

    Returns:
        Tuple ``(wwdc_date, announced_date)`` of ``datetime`` objects. The
        WWDC date is always a Monday. The announcement date is the WWDC
        date minus the predicted lead time in days.
    """
    query = pd.DataFrame({"year": [year]})

    def rounded(target):
        # Nearest-integer prediction for one target.
        return int(round(models[target].predict(query)[0]))

    month = rounded("month")
    day = rounded("day")
    lead_days = rounded("lead_time")

    # Only June and August are accepted; anything else falls back to June.
    if month not in (6, 8):
        month = 6

    monday = 0
    try:
        candidate = datetime(year, month, day)
    except ValueError as e:
        # The predicted day does not exist in that month: use the first
        # Monday of the month instead.
        print(f"Error in initial date prediction: {e}")
        first_of_month = datetime(year, month, 1)
        wwdc_date = first_of_month + timedelta(
            days=(monday - first_of_month.weekday()) % 7
        )
    else:
        # Move forward to the next Monday (no shift if already Monday).
        wwdc_date = candidate + timedelta(
            days=(monday - candidate.weekday()) % 7
        )

    announced_date = wwdc_date - timedelta(days=lead_days)
    return wwdc_date, announced_date
def validate_prediction(wwdc_date, announced_date):
    """Return human-readable sanity warnings for a predicted date pair.

    Args:
        wwdc_date: Predicted WWDC start ``datetime``.
        announced_date: Predicted announcement ``datetime``.

    Returns:
        List of issue strings, empty when nothing is flagged. A date is
        flagged when it falls on a weekend, and the pair is flagged when
        fewer than 30 days separate them.
    """
    lead_days = (wwdc_date - announced_date).days
    checks = (
        (
            wwdc_date.weekday() >= 5,
            f"WWDC date is {wwdc_date.strftime('%A')}",
        ),
        (
            announced_date.weekday() >= 5,
            f"Announcement date is {announced_date.strftime('%A')}",
        ),
        (
            lead_days < 30,
            f"Lead time is {lead_days} days",
        ),
    )
    return [message for flagged, message in checks if flagged]
def main():
    """Train on the built-in WWDC history and print the 2025 prediction.

    Each history record holds the year, the WWDC start date and the
    announcement date (both ``"M/D"``), and the lead time in days.
    """
    fields = ("year", "wwdc", "announced", "lead_time")
    history = (
        (2024, "6/10", "3/26", 76),
        (2023, "6/5", "3/29", 68),
        (2022, "6/6", "4/5", 62),
        (2021, "6/7", "3/30", 69),
        (2020, "6/22", "5/5", 48),
        (2019, "6/3", "3/14", 81),
        (2018, "6/4", "3/15", 81),
        (2017, "6/5", "2/16", 109),
        (2016, "6/13", "4/18", 56),
        (2015, "6/8", "4/14", 55),
        (2014, "6/2", "4/3", 60),
        (2013, "6/10", "4/24", 47),
        (2012, "6/11", "4/25", 47),
        (2011, "6/6", "3/28", 70),
        (2010, "6/7", "4/28", 40),
        (2009, "6/8", "3/26", 74),
        (2008, "6/9", "3/13", 88),
        (2007, "6/11", "2/6", 125),
        (2006, "8/7", "3/7", 153),
        (2005, "6/6", "2/15", 111),
        (2004, "6/28", "2/17", 132),
    )
    data = [dict(zip(fields, record)) for record in history]

    frame = preprocess_data(data)
    models, feature_columns = train_models(frame)

    year = 2025
    wwdc_date, announced_date = predict_dates(models, year, feature_columns)
    issues = validate_prediction(wwdc_date, announced_date)

    date_format = "%A, %B %d, %Y"
    lead_days = (wwdc_date - announced_date).days

    print("\nPredictions:")
    print("-" * 50)
    print(f"WWDC {year} Date: {wwdc_date.strftime(date_format)}")
    print(f"Announcement Date: {announced_date.strftime(date_format)}")
    print(f"Lead Time: {lead_days} days")

    if not issues:
        return
    print("\nValidation Issues:")
    for issue in issues:
        print(f"- {issue}")
# Run the prediction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment