# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, classification_report
)
# ------------------------- 1. Load and Explore Data -------------------------
# Load the dataset
df = pd.read_csv('./admit-train.csv')  # Replace with your file path
print("Dataset Preview:")
print(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Describe the dataset
print("\nStatistical Summary:")
print(df.describe())

# Check data types
print("\nData Types:")
print(df.dtypes)
# ------------------------- 2. Preprocess the Data -------------------------
# Convert 'rank' to categorical
df['rank'] = df['rank'].astype('category')

# Separate features and target
X = df[['gre', 'gpa', 'rank']]
y = df['admit']

# Preprocessing: scale numerical variables, one-hot encode 'rank'
# (drop='first' drops one dummy column to avoid redundancy for linear models)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['gre', 'gpa']),
        ('cat', OneHotEncoder(drop='first'), ['rank'])
    ]
)
# Split into training and test sets (80/20 split), stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nTraining and Test Set Sizes:")
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Fit the preprocessor on the training data only, then apply it to both sets
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
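
# Optional sanity check: list the feature names produced by the preprocessor, so
# the scaled and one-hot-encoded columns can be matched to model coefficients
# later. (Assumes scikit-learn >= 1.0, where ColumnTransformer exposes
# get_feature_names_out.)
print("\nTransformed Feature Names:")
print(preprocessor.get_feature_names_out())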
# ------------------------- 3. Train Models -------------------------
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Support Vector Machine (SVM); probability=True enables predict_proba
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
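
# Optional sketch: a 5-fold cross-validation sanity check using the
# cross_val_score imported above. On a small dataset a single 80/20 split can be
# noisy, so CV gives a steadier accuracy estimate; cross_val_score refits clones
# of each estimator, leaving the fitted models above untouched.
for name, model in [("Logistic Regression", logreg),
                    ("Random Forest", rf),
                    ("SVM", svm)]:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"{name} 5-fold CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")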
# ------------------------- 4. Evaluate Models -------------------------
# Define a function for metrics
def evaluate_model(y_true, y_pred, model_name):
    print(f"\nEvaluation Metrics for {model_name}:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    # Note: ROC-AUC from hard 0/1 labels is a coarse estimate; a probability-based
    # version is shown after the per-model evaluations below.
    print("ROC-AUC Score:", roc_auc_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
# Evaluate each model
evaluate_model(y_test, y_pred_logreg, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_svm, "SVM")
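
# The classification_report imported above is otherwise unused; it gives a
# per-class breakdown, and ROC-AUC is more informative when computed from
# predicted probabilities than from hard 0/1 labels. Shown for the Random Forest
# as a sketch; the same pattern applies to the other two models.
y_proba_rf = rf.predict_proba(X_test)[:, 1]
print("\nRandom Forest ROC-AUC (from probabilities):", roc_auc_score(y_test, y_proba_rf))
print("\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))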
# Compare accuracies
models = ['Logistic Regression', 'Random Forest', 'SVM']
accuracies = [
    accuracy_score(y_test, y_pred_logreg),
    accuracy_score(y_test, y_pred_rf),
    accuracy_score(y_test, y_pred_svm)
]
comparison = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
print("\nModel Comparison:")
print(comparison)

# Plot accuracy comparison
comparison.plot(x='Model', y='Accuracy', kind='bar', legend=False)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.show()
# ------------------------- 5. Predict on New Data -------------------------
# Load new test data
new_data = pd.read_csv('./admit-test.csv')  # Replace with test file path
new_data['rank'] = new_data['rank'].astype('category')  # Ensure rank is categorical

# Preprocess new data (the ColumnTransformer selects 'gre', 'gpa', and 'rank' by
# name and drops any other columns)
new_data_preprocessed = preprocessor.transform(new_data)

# Predict probabilities and final classes using the best model (Random Forest here)
new_data['admit_prob'] = rf.predict_proba(new_data_preprocessed)[:, 1]
new_data['admit_pred'] = (new_data['admit_prob'] >= 0.5).astype(int)

# Save predictions
new_data[['admit_prob', 'admit_pred']].to_csv("predictions.csv", index=False)
print("\nPredictions saved to 'predictions.csv'.")
# ------------------------- 6. Summary -------------------------
print("\nTask Completed Successfully!")
print("1. Models trained: Logistic Regression, Random Forest, and SVM.")
print("2. Metrics evaluated: Accuracy, Precision, Recall, F1-Score, ROC-AUC.")
print("3. Predictions made and saved for new test data.")