Skip to content

Instantly share code, notes, and snippets.

@promto-c
Last active July 23, 2024 06:46
Show Gist options
  • Save promto-c/ddf24dd75bff52d7b338b72a2826b27f to your computer and use it in GitHub Desktop.
Save promto-c/ddf24dd75bff52d7b338b72a2826b27f to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import category_encoders as ce
import time
# Load the dataset from a CSV file resolved relative to the current working directory
file_path = 'car1.csv'
data = pd.read_csv(file_path)
# Display the first few rows of the dataset
print(data.head())
# Identify the categorical columns (every column whose dtype is 'object')
categorical_cols = data.select_dtypes(include=['object']).columns
# Use TargetEncoder to encode the categorical columns against 'selling_price'
# NOTE(review): the encoder is fitted on the full dataset, i.e. before the
# train/test split below, so target statistics of the test rows flow into the
# encoded features. Fitting on the training rows only would avoid that --
# confirm whether this is intentional.
encoder = ce.TargetEncoder(cols=categorical_cols)
data_encoded = encoder.fit_transform(data, data['selling_price'])
# data_encoded = data
# Display the first few rows of the encoded dataset
print(data_encoded.head())
# Split the dataset into features (everything but the target) and target
X = data_encoded.drop(columns=['selling_price'])
y = data_encoded['selling_price']
# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Custom Linear Regression Class
class CustomLinearRegression:
    """A simple implementation of Linear Regression using Ordinary Least Squares (OLS).

    The weights are obtained from the normal equations
    ``(X_b^T X_b + 1e-8 * I) w = X_b^T y`` where ``X_b`` is ``X`` with a
    leading column of ones (the intercept term).

    Attributes:
        _coefficients (numpy.ndarray): Fitted weights, intercept first.
            ``None`` until ``fit`` has been called.

    Methods:
        fit(X, y):
            Fits the linear model to the data.
        predict(X):
            Predicts the target values for the given input data.
        score(X, y):
            Computes the coefficient of determination (R^2 score) for the model.
        intercept_:
            Returns the intercept (bias) of the model.
        coef_:
            Returns the coefficients of the model.
    """

    # Small value added to the diagonal of X_b^T X_b so the linear system
    # stays solvable when features are collinear.
    _RIDGE = 1e-8

    def __init__(self):
        self._coefficients = None

    @staticmethod
    def _design_matrix(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.column_stack((np.ones(X.shape[0]), X))

    def fit(self, X, y):
        """Fit the model and return ``self``.

        Args:
            X (array-like): Feature matrix, one row per sample.
            y (array-like): Target values, one per sample.

        Returns:
            self: The fitted estimator.
        """
        X_b = self._design_matrix(X)
        y = np.asarray(y, dtype=float)
        gram = X_b.T @ X_b + self._RIDGE * np.eye(X_b.shape[1])
        # Solving the system directly is cheaper and numerically more
        # accurate than forming the explicit inverse and multiplying.
        self._coefficients = np.linalg.solve(gram, X_b.T @ y)
        return self

    def predict(self, X):
        """Return predictions for ``X``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self._coefficients is None:
            raise ValueError("The model is not fitted yet.")
        return self._design_matrix(X) @ self._coefficients

    def score(self, X, y):
        """Return the coefficient of determination (R^2) of the predictions.

        When ``y`` is constant the total variance is zero and the result is
        not a finite number.
        """
        y = np.asarray(y, dtype=float)
        y_pred = self.predict(X)
        total_variance = ((y - y.mean()) ** 2).sum()
        residual_variance = ((y - y_pred) ** 2).sum()
        return 1 - (residual_variance / total_variance)

    @property
    def intercept_(self):
        """Intercept of the fitted model, or ``None`` before fitting."""
        if self._coefficients is None:
            return None
        return self._coefficients[0]

    @property
    def coef_(self):
        """Feature coefficients of the fitted model, or ``None`` before fitting."""
        if self._coefficients is None:
            return None
        return self._coefficients[1:]
# Fit the custom linear regression model
custom_model = CustomLinearRegression()
# perf_counter() is the clock intended for measuring short intervals; unlike
# time.time() it is not affected by system clock adjustments.
start_time = time.perf_counter()
custom_model.fit(X_train, y_train)
custom_time = time.perf_counter() - start_time
# Make predictions on the test set using custom model
y_pred_custom = custom_model.predict(X_test)
print(X_test)
print(y_test)
print(y_pred_custom)
# Calculate the mean squared error for custom model
mse_custom = mean_squared_error(y_test, y_pred_custom)
print(f"Custom Model Mean Squared Error: {mse_custom}")
# Print the custom model parameters
print(f"Custom Model Intercept: {custom_model.intercept_}")
print(f"Custom Model Coefficients: {custom_model.coef_}")
# Fit the scikit-learn linear regression model
model = LinearRegression()
start_time = time.perf_counter()
model.fit(X_train, y_train)
sklearn_time = time.perf_counter() - start_time
# Make predictions on the test set using scikit-learn model
y_pred_sklearn = model.predict(X_test)
print(X_test)
print(y_test)
print(y_pred_sklearn)
# Calculate the mean squared error for scikit-learn model
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
print(f"Scikit-learn Model Mean Squared Error: {mse_sklearn}")
# Print the scikit-learn model parameters
print(f"Scikit-learn Model Intercept: {model.intercept_}")
print(f"Scikit-learn Model Coefficients: {model.coef_}")
# Compare execution time (seconds spent inside each fit call)
print(f"Custom Model Training Time: {custom_time}")
print(f"Scikit-learn Model Training Time: {sklearn_time}")
from typing import Tuple, Optional
import numpy as np
import pandas as pd
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))``, element-wise.

    Args:
        x: A number, numpy array or pandas object.

    Returns:
        The sigmoid of ``x``; arrays and pandas objects keep their type.
    """
    # exp(-log(1 + exp(-x))) is algebraically equal to 1 / (1 + exp(-x)), but
    # logaddexp never materialises exp(-x) on its own, so very negative
    # inputs give 0.0 without triggering an overflow.
    return np.exp(-np.logaddexp(0, -x))
def train_test_split_pandas(features: pd.DataFrame,
                            target: pd.Series,
                            test_size: float = 0.2,
                            random_state: Optional[int] = None
                            ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Shuffles features and target together and splits them into train and test sets.

    Args:
        features (pd.DataFrame): The features DataFrame.
        target (pd.Series): The target Series, row-aligned with ``features``.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int, optional): Seed for the random number generator.

    Returns:
        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: The training and testing
        features and targets. The first ``int(len(features) * test_size)``
        shuffled rows form the test set; indexes are reset after shuffling.

    Raises:
        ValueError: If ``features`` and ``target`` have different lengths.
    """
    if len(features) != len(target):
        raise ValueError(
            f"features and target must have the same length, "
            f"got {len(features)} and {len(target)}."
        )
    # Draw ONE permutation of row positions and apply it to both inputs.
    # Shuffling each input with its own .sample() call only keeps the rows
    # paired when a seed is given; with random_state=None the two shuffles
    # are independent and every target ends up next to the wrong features.
    order = (
        pd.Series(np.arange(len(features)))
        .sample(frac=1, random_state=random_state)
        .to_numpy()
    )
    features = features.iloc[order].reset_index(drop=True)
    target = target.iloc[order].reset_index(drop=True)

    test_len = int(len(features) * test_size)
    training_features = features.iloc[test_len:]
    testing_features = features.iloc[:test_len]
    training_target = target.iloc[test_len:]
    testing_target = target.iloc[:test_len]
    return training_features, testing_features, training_target, testing_target
class StandardScaler:
    """Standardize features by removing the mean and scaling to unit variance.

    Args:
        copy (bool): If True, ``transform`` and ``inverse_transform`` return new
            data and leave their input untouched. If False, the input is
            modified in place and returned.
        with_mean (bool): If True, center the data before scaling.
        with_std (bool): If True, scale the data to unit variance.

    Attributes:
        mean_ (numpy.ndarray): The mean value for each feature in the training set.
        scale_ (numpy.ndarray): The scaling factor (standard deviation) for each
            feature in the training set. Features with zero standard deviation
            get a factor of 1 so that they are not divided by zero.

    Example:
        >>> import numpy as np
        >>> X = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
        >>> scaler = StandardScaler()
        >>> scaler.fit_transform(X)
        array([[-1.22474487, -1.22474487, -1.22474487],
               [ 0.        ,  0.        ,  0.        ],
               [ 1.22474487,  1.22474487,  1.22474487]])
        >>> scaler.mean_
        array([4., 5., 6.])
        >>> scaler.scale_
        array([2.44948974, 2.44948974, 2.44948974])
    """

    def __init__(self, copy=True, with_mean=True, with_std=True):
        """Initialize the scaler with the given parameters."""
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.scale_ = None

    def _fitted(self, name):
        """Return the fitted attribute ``name`` or raise if ``fit`` has not run."""
        value = getattr(self, name)
        if value is None:
            raise ValueError("The scaler is not fitted yet.")
        return value

    def fit(self, X, y=None):
        """Compute the mean and std to be used for later scaling.

        Args:
            X (numpy.ndarray): The data used to compute the mean and standard deviation.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            self: Returns an instance of self.
        """
        self.mean_ = X.mean(axis=0) if self.with_mean else np.zeros(X.shape[1])
        if self.with_std:
            scale = X.std(axis=0, ddof=0)
            # A constant feature has zero spread; dividing by it would yield
            # NaN/inf. ``scale == 0`` is 1 exactly where the spread is zero,
            # so adding it turns those entries into 1 and leaves the rest as is.
            self.scale_ = scale + (scale == 0)
        else:
            self.scale_ = np.ones(X.shape[1])
        return self

    def transform(self, X, y=None):
        """Perform standardization by centering and scaling.

        Args:
            X (numpy.ndarray): The data to be transformed.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            numpy.ndarray: The transformed data.

        Raises:
            ValueError: If a required statistic has not been fitted yet.
        """
        if not self.copy:
            # In-place path: the caller's object is modified and returned.
            # Integer arrays cannot hold the float result and will raise here.
            if self.with_mean:
                X -= self._fitted("mean_")
            if self.with_std:
                X /= self._fitted("scale_")
            return X
        # Out-of-place arithmetic never touches the input and lets integer
        # data be promoted to float.
        out = X
        if self.with_mean:
            out = out - self._fitted("mean_")
        if self.with_std:
            out = out / self._fitted("scale_")
        return out.copy() if out is X else out

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it.

        Args:
            X (numpy.ndarray): The data to fit and transform.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            numpy.ndarray: The transformed data.
        """
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X, y=None):
        """Scale back the data to the original representation.

        Args:
            X (numpy.ndarray): The data to be inverse transformed.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            numpy.ndarray: The inverse transformed data.

        Raises:
            ValueError: If a required statistic has not been fitted yet.
        """
        if not self.copy:
            if self.with_std:
                X *= self._fitted("scale_")
            if self.with_mean:
                X += self._fitted("mean_")
            return X
        out = X
        if self.with_std:
            out = out * self._fitted("scale_")
        if self.with_mean:
            out = out + self._fitted("mean_")
        return out.copy() if out is X else out

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Args:
            deep (bool): Accepted for interface compatibility; not used.

        Returns:
            dict: Parameter names mapped to their values.
        """
        return {"copy": self.copy, "with_mean": self.with_mean, "with_std": self.with_std}

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Each keyword is assigned as an attribute; names are not validated.

        Args:
            params (dict): The parameters to set for this estimator.

        Returns:
            self: Returns an instance of self.
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self
class TargetEncoder:
    """Implementation of TargetEncoder for encoding categorical variables.

    Every category is replaced by a blend of the category's mean target and
    the global mean target. The blend weight is a sigmoid of the category's
    sample count, shaped by ``min_samples_leaf`` and ``smoothing``.

    Attributes:
        cols (list): List of columns to encode.
        target_means (dict): Maps each encoded column to a Series of
            smoothed target means indexed by category.
        global_mean (float): Mean of the target seen in ``fit``; ``None`` until fitted.

    Methods:
        fit(X, y):
            Fits the encoder to the data.
        transform(X):
            Transforms the categorical features using the learned target means.
        fit_transform(X, y):
            Fits the encoder and transforms the data in one step.
        get_params(deep=True):
            Get parameters for this estimator.
        set_params(**params):
            Set the parameters of this estimator.
    """

    def __init__(self, cols=None, handle_missing='value', handle_unknown='value', min_samples_leaf=20, smoothing=10):
        """Initialize the TargetEncoder with the given columns.

        Args:
            cols (list, optional): List of columns to encode. If None, all object-type columns are used.
            handle_missing (str, optional): How ``transform`` treats missing (NaN)
                values: 'value' encodes them as the global mean, 'return_nan'
                leaves them as NaN, 'error' raises ``ValueError``. Any other
                setting behaves like 'value'. Defaults to 'value'.
            handle_unknown (str, optional): How ``transform`` treats categories
                not seen in ``fit``; same options as ``handle_missing``.
                Defaults to 'value'.
            min_samples_leaf (int, optional): Minimum samples leaf for smoothing. Defaults to 20.
            smoothing (int, optional): Smoothing parameter. Defaults to 10.
        """
        self.cols = cols
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.min_samples_leaf = min_samples_leaf
        self.smoothing = smoothing
        self.target_means = {}
        self.global_mean = None

    def fit(self, X, y):
        """Fit the encoder to the data.

        Args:
            X (pandas.DataFrame): Feature matrix.
            y (pandas.Series): Target vector.

        Returns:
            self: Returns an instance of self.
        """
        if self.cols is None:
            self.cols = X.select_dtypes(include=['object']).columns.tolist()
        self.global_mean = y.mean()
        # Start from an empty mapping so that refitting never keeps columns
        # learned by an earlier fit.
        self.target_means = {}
        for col in self.cols:
            stats = y.groupby(X[col]).agg(['count', 'mean'])
            weight = self._weighting(stats['count'])
            # Only real categories are stored. Missing and unseen values are
            # resolved in transform(); storing placeholder rows here (such as
            # an 'unknown' label) would overwrite a genuine category that
            # happens to carry the same name.
            self.target_means[col] = self.global_mean * (1 - weight) + stats['mean'] * weight
        return self

    def transform(self, X):
        """Transform the categorical features using the learned target means.

        Args:
            X (pandas.DataFrame): Feature matrix.

        Returns:
            pandas.DataFrame: Transformed feature matrix.

        Raises:
            ValueError: If the encoder is not fitted, or if missing/unknown
                values are found while the matching option is 'error'.
        """
        if self.global_mean is None:
            raise ValueError("The encoder is not fitted yet.")
        X_encoded = X.copy()
        for col in self.cols:
            source = X[col]
            encoded = source.map(self.target_means[col])
            # Both missing inputs and unseen categories come out of map() as
            # NaN; tell them apart so each follows its own option.
            missing = source.isna()
            unknown = encoded.isna() & ~missing
            encoded = self._resolve(encoded, missing, self.handle_missing, col, "missing values")
            encoded = self._resolve(encoded, unknown, self.handle_unknown, col, "unknown categories")
            X_encoded[col] = encoded
        return X_encoded

    def _resolve(self, encoded, mask, policy, col, what):
        """Apply a handle_missing/handle_unknown option to the rows selected by ``mask``."""
        if not mask.any():
            return encoded
        if policy == 'error':
            raise ValueError(f"Column {col!r} contains {what}.")
        if policy == 'return_nan':
            # The selected rows are already NaN after the mapping step.
            return encoded
        return encoded.mask(mask, self.global_mean)

    def fit_transform(self, X, y):
        """Fit the encoder and transforms the data in one step.

        Args:
            X (pandas.DataFrame): Feature matrix.
            y (pandas.Series): Target vector.

        Returns:
            pandas.DataFrame: Transformed feature matrix.
        """
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Args:
            deep (bool): Accepted for interface compatibility; not used.

        Returns:
            dict: Parameter names mapped to their values.
        """
        return {
            "cols": self.cols,
            "handle_missing": self.handle_missing,
            "handle_unknown": self.handle_unknown,
            "min_samples_leaf": self.min_samples_leaf,
            "smoothing": self.smoothing,
        }

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Each keyword is assigned as an attribute; names are not validated.

        Args:
            params (dict): The parameters to set for this estimator.

        Returns:
            self: Returns an instance of self.
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self

    def _weighting(self, n):
        """Compute the weighting for smoothing.

        Args:
            n: Number of samples per category (``fit`` passes a Series of counts).

        Returns:
            Smoothing weight ``sigmoid((n - min_samples_leaf) / smoothing)``;
            the weight grows with the sample count.
        """
        return sigmoid((n - self.min_samples_leaf) / self.smoothing)
class LinearRegression:
    """A simple implementation of Linear Regression using Ordinary Least Squares (OLS).

    The weights are obtained from the normal equations
    ``(X_b^T X_b + 1e-8 * I) w = X_b^T y`` where ``X_b`` is the feature matrix,
    with a leading column of ones when an intercept is fitted.

    Attributes:
        _coefficients (numpy.ndarray): Fitted weights (intercept first when
            ``fit_intercept`` is True). ``None`` until ``fit`` has been called.
        fit_intercept (bool): Whether to calculate the intercept for this model.
        copy_X (bool): Kept for interface compatibility; ``fit`` does not
            modify its inputs, so no copy is needed.
        n_jobs: Stored but not used by this implementation.
        positive (bool): When set to True, negative feature coefficients are
            clipped to zero after the solve. The intercept is left
            unconstrained. Clipping is a simple approximation, not a
            constrained least-squares solve.

    Methods:
        fit(X, y):
            Fits the linear model to the data.
        predict(X):
            Predicts the target values for the given input data.
        score(X, y):
            Computes the coefficient of determination (R^2 score) for the model.
        intercept_:
            Returns the intercept (bias) of the model.
        coef_:
            Returns the coefficients of the model.
    """

    # Small value added to the diagonal of X_b^T X_b so the linear system
    # stays solvable when features are collinear.
    _RIDGE = 1e-8

    def __init__(self, fit_intercept=True, copy_X=True, n_jobs=None, positive=False):
        self._coefficients = None
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        self.positive = positive

    def _design_matrix(self, X):
        """Return ``X`` as a 2-D float array, with a bias column if an intercept is fitted."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat a flat vector as a single feature column.
            X = X.reshape(-1, 1)
        if self.fit_intercept:
            return np.column_stack((np.ones(X.shape[0]), X))
        return X

    def fit(self, X, y):
        """Fits the linear model to the data using the Ordinary Least Squares (OLS) method.

        Args:
            X (numpy.ndarray): Feature matrix.
            y (numpy.ndarray): Target vector.

        Returns:
            self: Returns an instance of self.
        """
        X_b = self._design_matrix(X)
        y = np.asarray(y, dtype=float)
        gram = X_b.T @ X_b + self._RIDGE * np.eye(X_b.shape[1])
        # Solving the system directly is cheaper and numerically more
        # accurate than forming the explicit inverse and multiplying.
        coefficients = np.linalg.solve(gram, X_b.T @ y)
        if self.positive:
            # Constrain the feature coefficients only; clipping the intercept
            # as well would force every prediction offset to be >= 0.
            first_feature = 1 if self.fit_intercept else 0
            coefficients[first_feature:] = np.maximum(coefficients[first_feature:], 0)
        self._coefficients = coefficients
        return self

    def predict(self, X):
        """Predicts the target values for the given input data.

        Args:
            X (numpy.ndarray): Feature matrix.

        Returns:
            numpy.ndarray: Predicted target values.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self._coefficients is None:
            raise ValueError("The model is not fitted yet.")
        return self._design_matrix(X) @ self._coefficients

    def score(self, X, y):
        """Computes the coefficient of determination (R^2 score) for the model.

        Args:
            X (numpy.ndarray): Feature matrix.
            y (numpy.ndarray): True target values.

        Returns:
            float: R^2 score. Not a finite number when ``y`` is constant,
            because the total variance is then zero.
        """
        y = np.asarray(y, dtype=float)
        y_pred = self.predict(X)
        total_variance = ((y - y.mean()) ** 2).sum()
        residual_variance = ((y - y_pred) ** 2).sum()
        return 1 - (residual_variance / total_variance)

    @property
    def intercept_(self):
        """Returns the intercept (bias) of the model.

        Returns:
            float: Intercept of the model; 0.0 when no intercept is fitted
            or the model is not fitted yet.
        """
        if self.fit_intercept and self._coefficients is not None:
            return self._coefficients[0]
        return 0.0

    @property
    def coef_(self):
        """Returns the coefficients of the model.

        Returns:
            numpy.ndarray: Feature coefficients (without the intercept), or
            ``None`` when the model is not fitted yet.
        """
        if self.fit_intercept and self._coefficients is not None:
            return self._coefficients[1:]
        return self._coefficients
import numpy as np
import pandas as pd
from typing import List, Union
class MultiLabelBinarizer:
    """Implementation of MultiLabelBinarizer for encoding multilabel data.

    Attributes:
        classes_ (np.ndarray): Sorted array of all unique classes found during
            fitting. ``None`` until ``fit`` has been called.

    Methods:
        fit(X):
            Fits the binarizer to the data.
        transform(X):
            Transforms the data using the learned classes.
        fit_transform(X):
            Fits the binarizer and transforms the data in one step.
    """

    def __init__(self):
        self.classes_: Union[np.ndarray, None] = None

    def fit(self, X: List[List[Union[str, int]]]) -> 'MultiLabelBinarizer':
        """Fits the binarizer to the data.

        Args:
            X (List[List[Union[str, int]]]): Multilabel data to fit.

        Returns:
            self: Returns an instance of self.
        """
        unique_classes = set()
        for labels in X:
            unique_classes.update(labels)
        self.classes_ = np.array(sorted(unique_classes))
        return self

    def transform(self, X: List[List[Union[str, int]]]) -> np.ndarray:
        """Transforms the data using the learned classes.

        Labels that were not seen during ``fit`` are ignored and reported
        with a single warning instead of aborting the whole transform.

        Args:
            X (List[List[Union[str, int]]]): Multilabel data to transform.

        Returns:
            np.ndarray: Binarized data, one row per sample and one column per
            class in ``classes_``; 1 marks a label that is present.

        Raises:
            ValueError: If the binarizer has not been fitted yet.
        """
        import warnings

        if self.classes_ is None:
            raise ValueError("The binarizer is not fitted yet.")
        binarized = np.zeros((len(X), len(self.classes_)), dtype=int)
        class_to_index = {cls: idx for idx, cls in enumerate(self.classes_)}
        unknown = set()
        for row, labels in enumerate(X):
            for label in labels:
                column = class_to_index.get(label)
                if column is None:
                    unknown.add(label)
                else:
                    binarized[row, column] = 1
        if unknown:
            warnings.warn(
                f"unknown class(es) {sorted(unknown, key=str)} will be ignored"
            )
        return binarized

    def fit_transform(self, X: List[List[Union[str, int]]]) -> np.ndarray:
        """Fits the binarizer and transforms the data in one step.

        Args:
            X (List[List[Union[str, int]]]): Multilabel data to fit and transform.

        Returns:
            np.ndarray: Binarized data.
        """
        return self.fit(X).transform(X)
# Example usage
if __name__ == "__main__":
    # Demo input: one comma-separated tag string per row
    data = pd.DataFrame({'tags': ['a,b,c', 'b,c,d', 'a,d', 'a,b']})

    # Turn every tag string into a list of individual labels
    data['tags'] = data['tags'].str.split(',')

    # Learn the label vocabulary and build the 0/1 indicator matrix
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(data['tags'].tolist())

    # One indicator column per label, placed next to the remaining columns
    tags_df = pd.DataFrame(tags_encoded, columns=mlb.classes_)
    other_columns = data.drop(columns=['tags'])
    encoded_data = pd.concat([other_columns, tags_df], axis=1)

    print("MultiLabelBinarizer Encoded Data:\n", encoded_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment