# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, classification_report
)
# ------------------------- 1. Load and Explore Data -------------------------
# Load the dataset
df = pd.read_csv('./admit-train.csv') # Replace with your file path
print("Dataset Preview:")
print(df.head())
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
# Describe the dataset
print("\nStatistical Summary:")
print(df.describe())
# Check data types
print("\nData Types:")
print(df.dtypes)
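# Optional sanity check (not in the original script): class balance of the
# target. This assumes 'admit' is the binary target column, as used in step 2,
# and motivates the stratified split below.
print("\nTarget Class Balance:")
print(df['admit'].value_counts(normalize=True))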
# ------------------------- 2. Preprocess the Data -------------------------
# Convert 'rank' to categorical
df['rank'] = df['rank'].astype('category')
# Separate features and target
X = df[['gre', 'gpa', 'rank']]
y = df['admit']
# Preprocessing: Scale numerical variables, one-hot encode 'rank'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['gre', 'gpa']),
        ('cat', OneHotEncoder(drop='first'), ['rank'])
    ]
)
# Split into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nTraining and Test Set Sizes:")
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
# Preprocess training and test data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
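# Optional: 5-fold cross-validation on the preprocessed training data, using
# the cross_val_score import above. This is a minimal sketch; the fold count
# and ROC-AUC scoring are assumptions, not part of the original workflow.
cv_scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5, scoring='roc_auc')
print(f"\nLogistic Regression 5-fold CV ROC-AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")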
# ------------------------- 3. Train Models -------------------------
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Support Vector Machine (SVM)
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
# ------------------------- 4. Evaluate Models -------------------------
# Define a function for metrics
def evaluate_model(y_true, y_pred, model_name):
    print(f"\nEvaluation Metrics for {model_name}:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
# Evaluate each model
evaluate_model(y_test, y_pred_logreg, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_svm, "SVM")
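# Optional: ROC-AUC computed from predicted probabilities rather than hard
# labels. With 0/1 predictions, roc_auc_score reduces to balanced accuracy;
# scoring on predict_proba gives the usual threshold-independent AUC. This loop
# is an illustrative addition and relies on all three fitted models supporting
# predict_proba (SVC does here because probability=True).
print("\nROC-AUC from predicted probabilities:")
for name, model in [('Logistic Regression', logreg), ('Random Forest', rf), ('SVM', svm)]:
    probs = model.predict_proba(X_test)[:, 1]
    print(f"{name}: {roc_auc_score(y_test, probs):.3f}")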
# Compare accuracies
models = ['Logistic Regression', 'Random Forest', 'SVM']
accuracies = [
    accuracy_score(y_test, y_pred_logreg),
    accuracy_score(y_test, y_pred_rf),
    accuracy_score(y_test, y_pred_svm)
]
comparison = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
print("\nModel Comparison:")
print(comparison)
# Plot accuracy comparison
comparison.plot(x='Model', y='Accuracy', kind='bar', legend=False)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.show()
# ------------------------- 5. Predict on New Data -------------------------
# Load new test data
new_data = pd.read_csv('./admit-test.csv') # Replace with test file path
new_data['rank'] = new_data['rank'].astype('category') # Ensure rank is categorical
# Preprocess new data (select the same feature columns used during training,
# so any extra columns in the test file are ignored)
new_data_preprocessed = preprocessor.transform(new_data[['gre', 'gpa', 'rank']])
# Predict probabilities and final classes using the best model (Random Forest here)
new_data['admit_prob'] = rf.predict_proba(new_data_preprocessed)[:, 1]
new_data['admit_pred'] = (new_data['admit_prob'] >= 0.5).astype(int)
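# Optional sanity check (illustrative addition): distribution of predicted
# classes on the new data before saving.
print("\nPredicted class counts on new data:")
print(new_data['admit_pred'].value_counts())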
# Save predictions
new_data[['admit_prob', 'admit_pred']].to_csv("predictions.csv", index=False)
print("\nPredictions saved to 'predictions.csv'.")
# ------------------------- 6. Summary -------------------------
print("\nTask Completed Successfully!")
print("1. Models trained: Logistic Regression, Random Forest, and SVM.")
print("2. Metrics evaluated: Accuracy, Precision, Recall, F1-Score, ROC-AUC.")
print("3. Predictions made and saved for new test data.")