# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, classification_report
)
# ------------------------- 1. Load and Explore Data -------------------------
# Load the dataset
df = pd.read_csv('./admit-train.csv') # Replace with your file path
print("Dataset Preview:")
print(df.head())
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
# Describe the dataset
print("\nStatistical Summary:")
print(df.describe())
# Check data types
print("\nData Types:")
print(df.dtypes)
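# Optional sanity check (not in the original script): class balance of the
# target. This assumes 'admit' is the binary target column, as used in step 2,
# and motivates the stratified split below.
print("\nTarget Class Balance:")
print(df['admit'].value_counts(normalize=True))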
# ------------------------- 2. Preprocess the Data -------------------------
# Convert 'rank' to categorical
df['rank'] = df['rank'].astype('category')
# Separate features and target
X = df[['gre', 'gpa', 'rank']]
y = df['admit']
# Preprocessing: Scale numerical variables, one-hot encode 'rank'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['gre', 'gpa']),
        ('cat', OneHotEncoder(drop='first'), ['rank'])
    ]
)
# Split into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nTraining and Test Set Sizes:")
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
# Preprocess training and test data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
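# Optional: 5-fold cross-validation on the preprocessed training data, using
# the cross_val_score import above. This is a minimal sketch; the fold count
# and ROC-AUC scoring are assumptions, not part of the original workflow.
cv_scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5, scoring='roc_auc')
print(f"\nLogistic Regression 5-fold CV ROC-AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")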
# ------------------------- 3. Train Models -------------------------
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Support Vector Machine (SVM)
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
# ------------------------- 4. Evaluate Models -------------------------
# Define a function for metrics
def evaluate_model(y_true, y_pred, model_name):
    print(f"\nEvaluation Metrics for {model_name}:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
# Evaluate each model
evaluate_model(y_test, y_pred_logreg, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_svm, "SVM")
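# Optional: ROC-AUC computed from predicted probabilities rather than hard
# labels. With 0/1 predictions, roc_auc_score reduces to balanced accuracy;
# scoring on predict_proba gives the usual threshold-independent AUC. This loop
# is an illustrative addition and relies on all three fitted models supporting
# predict_proba (SVC does here because probability=True).
print("\nROC-AUC from predicted probabilities:")
for name, model in [('Logistic Regression', logreg), ('Random Forest', rf), ('SVM', svm)]:
    probs = model.predict_proba(X_test)[:, 1]
    print(f"{name}: {roc_auc_score(y_test, probs):.3f}")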
# Compare accuracies
models = ['Logistic Regression', 'Random Forest', 'SVM']
accuracies = [
    accuracy_score(y_test, y_pred_logreg),
    accuracy_score(y_test, y_pred_rf),
    accuracy_score(y_test, y_pred_svm)
]
comparison = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
print("\nModel Comparison:")
print(comparison)
# Plot accuracy comparison
comparison.plot(x='Model', y='Accuracy', kind='bar', legend=False)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.show()
# ------------------------- 5. Predict on New Data -------------------------
# Load new test data
new_data = pd.read_csv('./admit-test.csv') # Replace with test file path
new_data['rank'] = new_data['rank'].astype('category') # Ensure rank is categorical
# Preprocess new data (select the same feature columns used during training,
# so any extra columns in the test file are ignored)
new_data_preprocessed = preprocessor.transform(new_data[['gre', 'gpa', 'rank']])
# Predict probabilities and final classes using the best model (Random Forest here)
new_data['admit_prob'] = rf.predict_proba(new_data_preprocessed)[:, 1]
new_data['admit_pred'] = (new_data['admit_prob'] >= 0.5).astype(int)
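# Optional sanity check (illustrative addition): distribution of predicted
# classes on the new data before saving.
print("\nPredicted class counts on new data:")
print(new_data['admit_pred'].value_counts())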
# Save predictions
new_data[['admit_prob', 'admit_pred']].to_csv("predictions.csv", index=False)
print("\nPredictions saved to 'predictions.csv'.")
# ------------------------- 6. Summary -------------------------
print("\nTask Completed Successfully!")
print("1. Models trained: Logistic Regression, Random Forest, and SVM.")
print("2. Metrics evaluated: Accuracy, Precision, Recall, F1-Score, ROC-AUC.")
print("3. Predictions made and saved for new test data.")