Last active
July 23, 2024 06:46
-
-
Save promto-c/ddf24dd75bff52d7b338b72a2826b27f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.linear_model import LinearRegression | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import mean_squared_error | |
import category_encoders as ce | |
import time | |
# Load the dataset
file_path = 'car1.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Separate the raw (not yet encoded) features from the target
features = data.drop(columns=['selling_price'])
target = data['selling_price']

# Split BEFORE encoding. A target encoder replaces each category by a statistic
# of the target, so fitting it on all rows would leak the test-set targets into
# the test-set features and make the reported error optimistic.
raw_train, raw_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

# Identify the categorical columns
categorical_cols = features.select_dtypes(include=['object']).columns.tolist()

# Fit the TargetEncoder on the training rows only, then apply it to both splits
encoder = ce.TargetEncoder(cols=categorical_cols)
X_train = encoder.fit_transform(raw_train, y_train)
X_test = encoder.transform(raw_test)

# Encoded view of the whole dataset (using the training-only statistics),
# kept under the original module-level names for inspection
data_encoded = encoder.transform(features)
data_encoded['selling_price'] = target

# Display the first few rows of the encoded dataset
print(data_encoded.head())

# Encoded features and target for the full dataset
X = data_encoded.drop(columns=['selling_price'])
y = data_encoded['selling_price']
# Custom Linear Regression Class | |
class CustomLinearRegression:
    """A simple implementation of Linear Regression using Ordinary Least Squares (OLS).

    An intercept is always fitted: a column of ones is prepended to the feature
    matrix and the least-squares problem is solved with ``numpy.linalg.lstsq``.
    Unlike inverting ``X.T @ X`` explicitly, this does not square the condition
    number of the problem and returns the minimum-norm solution when features
    are collinear, so no artificial ridge term is needed.

    Attributes:
        _coefficients (numpy.ndarray): Fitted parameters, intercept first, or
            None before ``fit`` has been called.

    Methods:
        fit(X, y):
            Fits the linear model to the data.
        predict(X):
            Predicts the target values for the given input data.
        score(X, y):
            Computes the coefficient of determination (R^2 score) for the model.
        intercept_:
            Returns the intercept (bias) of the model.
        coef_:
            Returns the coefficients of the model.
    """

    def __init__(self):
        self._coefficients = None

    @staticmethod
    def _with_bias(X):
        """Return X as a float matrix with a leading column of ones."""
        features = np.asarray(X, dtype=float)
        if features.ndim == 1:
            # A 1-D input is treated as a single feature column.
            features = features.reshape(-1, 1)
        return np.hstack([np.ones((features.shape[0], 1)), features])

    def fit(self, X, y):
        """Fit the model by least squares.

        Args:
            X (array-like): Feature matrix of shape (n_samples, n_features).
            y (array-like): Target values with n_samples entries.

        Returns:
            self: Returns an instance of self.
        """
        X_b = self._with_bias(X)
        targets = np.asarray(y, dtype=float)
        self._coefficients, *_ = np.linalg.lstsq(X_b, targets, rcond=None)
        return self

    def predict(self, X):
        """Predict target values for X.

        Args:
            X (array-like): Feature matrix of shape (n_samples, n_features).

        Returns:
            numpy.ndarray: Predicted target values.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if self._coefficients is None:
            raise ValueError("The model is not fitted yet.")
        return self._with_bias(X) @ self._coefficients

    def score(self, X, y):
        """Compute the coefficient of determination (R^2) of the predictions.

        Args:
            X (array-like): Feature matrix.
            y (array-like): True target values.

        Returns:
            float: 1 minus the ratio of the residual to the total sum of squares.
        """
        y_true = np.asarray(y, dtype=float)
        y_pred = self.predict(X)
        total_variance = ((y_true - y_true.mean()) ** 2).sum()
        residual_variance = ((y_true - y_pred) ** 2).sum()
        return 1 - (residual_variance / total_variance)

    @property
    def intercept_(self):
        """Intercept (bias) of the fitted model, or None before fitting."""
        if self._coefficients is None:
            return None
        return self._coefficients[0]

    @property
    def coef_(self):
        """Feature coefficients of the fitted model, or None before fitting."""
        if self._coefficients is None:
            return None
        return self._coefficients[1:]
# Fit the custom linear regression model
custom_model = CustomLinearRegression()
# perf_counter() is the clock intended for measuring short durations; unlike
# time.time() it is monotonic and is not affected by system clock updates.
start_time = time.perf_counter()
custom_model.fit(X_train, y_train)
custom_time = time.perf_counter() - start_time

# Make predictions on the test set using custom model
y_pred_custom = custom_model.predict(X_test)
print(X_test)
print(y_test)
print(y_pred_custom)

# Calculate the mean squared error for custom model
mse_custom = mean_squared_error(y_test, y_pred_custom)
print(f"Custom Model Mean Squared Error: {mse_custom}")

# Print the custom model parameters
print(f"Custom Model Intercept: {custom_model.intercept_}")
print(f"Custom Model Coefficients: {custom_model.coef_}")

# Fit the scikit-learn linear regression model
model = LinearRegression()
start_time = time.perf_counter()
model.fit(X_train, y_train)
sklearn_time = time.perf_counter() - start_time

# Make predictions on the test set using scikit-learn model
y_pred_sklearn = model.predict(X_test)
print(X_test)
print(y_test)
print(y_pred_sklearn)

# Calculate the mean squared error for scikit-learn model
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
print(f"Scikit-learn Model Mean Squared Error: {mse_sklearn}")

# Print the scikit-learn model parameters
print(f"Scikit-learn Model Intercept: {model.intercept_}")
print(f"Scikit-learn Model Coefficients: {model.coef_}")

# Compare execution time (seconds spent inside each fit() call)
print(f"Custom Model Training Time: {custom_time}")
print(f"Scikit-learn Model Training Time: {sklearn_time}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Tuple, Optional | |
import numpy as np | |
import pandas as pd | |
def sigmoid(x):
    """Evaluate the logistic function 1 / (1 + exp(-x)).

    Args:
        x: A number, or any object NumPy can negate and exponentiate
            element-wise (e.g. an array or a pandas Series).

    Returns:
        The logistic function applied to ``x``.
    """
    decay = np.exp(-x)
    return 1 / (decay + 1)
def train_test_split_pandas(features: pd.DataFrame,
                            target: pd.Series,
                            test_size: float = 0.2,
                            random_state: Optional[int] = None
                            ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Shuffles features and target together and splits them into train and test sets.

    Args:
        features (pd.DataFrame): The features DataFrame.
        target (pd.Series): The target Series, one entry per row of `features`.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int, optional): Seed for the random number generator.

    Returns:
        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: The training features,
        testing features, training target and testing target, in that order.

    Raises:
        ValueError: If `features` and `target` have different lengths.
    """
    if len(features) != len(target):
        raise ValueError(
            f"features and target must have the same length, "
            f"got {len(features)} and {len(target)}."
        )

    # Draw ONE permutation of row positions and apply it to both objects.
    # Shuffling each object with its own sample() call only keeps rows and
    # labels paired when a fixed seed is given; with random_state=None the two
    # shuffles are independent and every label ends up on the wrong row.
    positions = pd.Series(np.arange(len(features)))
    order = positions.sample(frac=1, random_state=random_state).to_numpy()

    features = features.iloc[order].reset_index(drop=True)
    target = target.iloc[order].reset_index(drop=True)

    # The first `test_len` shuffled rows form the test set, the rest the training set.
    test_len = int(len(features) * test_size)
    training_features = features.iloc[test_len:]
    testing_features = features.iloc[:test_len]
    training_target = target.iloc[test_len:]
    testing_target = target.iloc[:test_len]
    return training_features, testing_features, training_target, testing_target
class StandardScaler:
    """Standardize features by removing the mean and scaling to unit variance.

    Args:
        copy (bool): If True (default), the input is copied and left untouched.
            If False, scaling is performed in place on the object passed in.
        with_mean (bool): If True, center the data before scaling.
        with_std (bool): If True, scale the data to unit variance.

    Attributes:
        mean_ (numpy.ndarray): The mean value for each feature in the training set
            (zeros when `with_mean` is False).
        scale_ (numpy.ndarray): The scaling factor (population standard deviation)
            for each feature in the training set. Features with zero variance get
            a factor of 1 so that scaling them does not divide by zero.

    Example:
        >>> import numpy as np
        >>> X = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
        >>> scaler = StandardScaler()
        >>> scaler.fit_transform(X)
        array([[-1.22474487, -1.22474487, -1.22474487],
               [ 0.        ,  0.        ,  0.        ],
               [ 1.22474487,  1.22474487,  1.22474487]])
        >>> scaler.mean_
        array([4., 5., 6.])
        >>> scaler.scale_
        array([2.44948974, 2.44948974, 2.44948974])
    """

    def __init__(self, copy=True, with_mean=True, with_std=True):
        """Initialize the scaler with the given parameters."""
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.scale_ = None

    @staticmethod
    def _replace_zero_scale(scale):
        """Return `scale` with zeros replaced by 1 (constant features stay unscaled)."""
        if np.ndim(scale) == 0:
            return 1.0 if scale == 0 else scale
        scale = scale.copy()
        scale[scale == 0] = 1.0
        return scale

    def _working_copy(self, X):
        """Return the object the scaling arithmetic should be applied to."""
        if not self.copy:
            return X
        if isinstance(X, np.ndarray) and X.dtype.kind in "biu":
            # In-place float arithmetic is not possible on boolean/integer
            # arrays; astype() both copies and promotes to float.
            return X.astype(float)
        return X.copy()

    def _fitted(self, name):
        """Return the fitted attribute `name`, raising if `fit` has not run."""
        value = getattr(self, name)
        if value is None:
            raise ValueError("The scaler is not fitted yet.")
        return value

    def fit(self, X, y=None):
        """Compute the mean and std to be used for later scaling.

        Args:
            X (numpy.ndarray): The data used to compute the mean and standard deviation.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            self: Returns an instance of self.
        """
        self.mean_ = X.mean(axis=0) if self.with_mean else np.zeros(X.shape[1])
        if self.with_std:
            self.scale_ = self._replace_zero_scale(X.std(axis=0, ddof=0))
        else:
            self.scale_ = np.ones(X.shape[1])
        return self

    def transform(self, X, y=None):
        """Perform standardization by centering and scaling.

        Args:
            X (numpy.ndarray): The data to be transformed. With `copy=False` it is
                modified in place and therefore must support in-place float
                arithmetic (an integer array does not).
            y (numpy.ndarray, optional): Ignored.

        Returns:
            numpy.ndarray: The transformed data.

        Raises:
            ValueError: If centering or scaling is requested before `fit`.
        """
        X = self._working_copy(X)
        if self.with_mean:
            X -= self._fitted("mean_")
        if self.with_std:
            X /= self._fitted("scale_")
        return X

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it.

        Args:
            X (numpy.ndarray): The data to fit and transform.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            numpy.ndarray: The transformed data.
        """
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X, y=None):
        """Scale back the data to the original representation.

        Args:
            X (numpy.ndarray): The data to be inverse transformed.
            y (numpy.ndarray, optional): Ignored.

        Returns:
            numpy.ndarray: The inverse transformed data.

        Raises:
            ValueError: If centering or scaling is requested before `fit`.
        """
        X = self._working_copy(X)
        if self.with_std:
            X *= self._fitted("scale_")
        if self.with_mean:
            X += self._fitted("mean_")
        return X

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Args:
            deep (bool): Accepted for API compatibility; this estimator has no
                nested estimators, so the value does not change the result.

        Returns:
            dict: Parameter names mapped to their values.
        """
        return {"copy": self.copy, "with_mean": self.with_mean, "with_std": self.with_std}

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Args:
            params (dict): The parameters to set for this estimator.

        Returns:
            self: Returns an instance of self.
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self
class TargetEncoder:
    """Implementation of TargetEncoder for encoding categorical variables.

    Each category is replaced by a blend of the category's mean target and the
    global mean target. The blend weight is a logistic function of the category
    count, so rare categories are pulled towards the global mean.

    Attributes:
        cols (list): List of columns to encode.
        target_means (dict): Maps each encoded column to a Series of smoothed
            target means indexed by category. Only categories seen during
            `fit` are stored.
        global_mean (float): Mean of the target seen during `fit`.

    Methods:
        fit(X, y):
            Fits the encoder to the data.
        transform(X):
            Transforms the categorical features using the learned target means.
        fit_transform(X, y):
            Fits the encoder and transforms the data in one step.
        get_params(deep=True):
            Get parameters for this estimator.
        set_params(**params):
            Set the parameters of this estimator.
    """

    def __init__(self, cols=None, handle_missing='value', handle_unknown='value', min_samples_leaf=20, smoothing=10):
        """Initialize the TargetEncoder with the given columns.

        Args:
            cols (list, optional): List of columns to encode. If None, all object-type columns are used.
            handle_missing (str, optional): How to encode missing (NaN/None) values:
                'error' raises ValueError, 'return_nan' leaves NaN, any other
                setting (default 'value') uses the global target mean.
            handle_unknown (str, optional): How to encode categories not seen during
                `fit`; same options as `handle_missing`. Defaults to 'value'.
            min_samples_leaf (int, optional): Category count at which the category mean
                and the global mean are weighted equally. Defaults to 20.
            smoothing (int, optional): Controls how quickly the weight moves from the
                global mean to the category mean as the count grows. Defaults to 10.
        """
        self.cols = cols
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.min_samples_leaf = min_samples_leaf
        self.smoothing = smoothing
        self.target_means = {}
        self.global_mean = None

    def _raise_on_missing(self, X):
        """Raise ValueError if any encoded column of X contains missing values."""
        for col in self.cols:
            if X[col].isna().any():
                raise ValueError(f"Column '{col}' contains missing values.")

    def fit(self, X, y):
        """Fit the encoder to the data.

        Args:
            X (pandas.DataFrame): Feature matrix.
            y (pandas.Series or array-like): Target vector. A Series is aligned with
                `X` on its index; any other array-like is matched to the rows of
                `X` by position.

        Returns:
            self: Returns an instance of self.

        Raises:
            ValueError: If `handle_missing` is 'error' and an encoded column
                contains missing values.
        """
        if self.cols is None:
            self.cols = X.select_dtypes(include=['object']).columns.tolist()
        if not isinstance(y, pd.Series):
            y = pd.Series(np.asarray(y), index=X.index)
        if self.handle_missing == 'error':
            self._raise_on_missing(X)

        self.global_mean = y.mean()
        # Start from a clean mapping so a refit cannot keep stale columns.
        self.target_means = {}
        for col in self.cols:
            # groupby drops missing keys, so only real categories are stored;
            # missing and unseen values are resolved in transform().
            stats = y.groupby(X[col]).agg(['count', 'mean'])
            weight = self._weighting(stats['count'])
            self.target_means[col] = self.global_mean * (1 - weight) + stats['mean'] * weight
        return self

    def transform(self, X):
        """Transform the categorical features using the learned target means.

        Args:
            X (pandas.DataFrame): Feature matrix.

        Returns:
            pandas.DataFrame: A copy of `X` with the encoded columns replaced.

        Raises:
            ValueError: If the encoder is not fitted, or if `handle_missing` /
                `handle_unknown` is 'error' and such values are present.
        """
        if self.global_mean is None:
            raise ValueError("The encoder is not fitted yet.")

        X_encoded = X.copy()
        for col in self.cols:
            column = X[col]
            encoded = column.map(self.target_means[col])
            missing = column.isna().to_numpy()
            # Present in the input but without a learned value.
            unknown = encoded.isna().to_numpy() & ~missing

            if self.handle_missing == 'error' and missing.any():
                raise ValueError(f"Column '{col}' contains missing values.")
            if self.handle_unknown == 'error' and unknown.any():
                raise ValueError(f"Column '{col}' contains categories not seen during fit.")

            if self.handle_missing != 'return_nan':
                encoded = encoded.mask(missing, self.global_mean)
            if self.handle_unknown != 'return_nan':
                encoded = encoded.mask(unknown, self.global_mean)
            X_encoded[col] = encoded
        return X_encoded

    def fit_transform(self, X, y):
        """Fit the encoder and transforms the data in one step.

        Args:
            X (pandas.DataFrame): Feature matrix.
            y (pandas.Series): Target vector.

        Returns:
            pandas.DataFrame: Transformed feature matrix.
        """
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Args:
            deep (bool): Accepted for API compatibility; this estimator has no
                nested estimators, so the value does not change the result.

        Returns:
            dict: Parameter names mapped to their values.
        """
        return {
            "cols": self.cols,
            "handle_missing": self.handle_missing,
            "handle_unknown": self.handle_unknown,
            "min_samples_leaf": self.min_samples_leaf,
            "smoothing": self.smoothing
        }

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Args:
            params (dict): The parameters to set for this estimator.

        Returns:
            self: Returns an instance of self.
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self

    def _weighting(self, n):
        """Compute the weighting for smoothing.

        Args:
            n (int or pandas.Series): Number of samples per category.

        Returns:
            float or pandas.Series: Weight given to the category mean, computed as
            the logistic function of (n - min_samples_leaf) / smoothing.
        """
        return 1 / (1 + np.exp(-(n - self.min_samples_leaf) / self.smoothing))
class LinearRegression:
    """A simple implementation of Linear Regression using Ordinary Least Squares (OLS).

    The unconstrained problem is solved with ``numpy.linalg.lstsq``, which avoids
    forming and inverting ``X.T @ X`` and returns the minimum-norm solution when
    features are collinear. With ``positive=True`` the coefficients are obtained
    by non-negative least squares; the intercept itself is never constrained.

    Attributes:
        _coefficients (numpy.ndarray): Fitted parameters (intercept first when
            `fit_intercept` is True), or None before `fit` has been called.
        fit_intercept (bool): Whether to calculate the intercept for this model.
        copy_X (bool): Kept for API compatibility. The solvers used here do not
            modify their inputs, so X is never overwritten.
        n_jobs: Kept for API compatibility; not used.
        positive (bool): When set to True, forces the coefficients to be non-negative.

    Methods:
        fit(X, y):
            Fits the linear model to the data.
        predict(X):
            Predicts the target values for the given input data.
        score(X, y):
            Computes the coefficient of determination (R^2 score) for the model.
        intercept_:
            Returns the intercept (bias) of the model.
        coef_:
            Returns the coefficients of the model.
    """

    def __init__(self, fit_intercept=True, copy_X=True, n_jobs=None, positive=False):
        self._coefficients = None
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        self.positive = positive

    @staticmethod
    def _as_matrix(X):
        """Return X as a 2-D float array (a 1-D input becomes a single column)."""
        features = np.asarray(X, dtype=float)
        if features.ndim == 1:
            features = features.reshape(-1, 1)
        return features

    def _design_matrix(self, features):
        """Prepend a column of ones to `features` when an intercept is fitted."""
        if not self.fit_intercept:
            return features
        return np.hstack([np.ones((features.shape[0], 1)), features])

    def _solve_non_negative(self, features, targets):
        """Solve least squares with non-negative coefficients and a free intercept."""
        # Imported here so that scipy is only required when positive=True.
        from scipy.optimize import nnls

        if not self.fit_intercept:
            coef, _ = nnls(features, targets)
            return coef

        # For any fixed coefficients the best intercept is
        # mean(y) - mean(X) @ coef, so the constrained problem reduces to
        # NNLS on centred data and the intercept stays unconstrained.
        feature_means = features.mean(axis=0)
        target_mean = targets.mean()
        coef, _ = nnls(features - feature_means, targets - target_mean)
        intercept = target_mean - feature_means @ coef
        return np.concatenate(([intercept], coef))

    def fit(self, X, y):
        """Fits the linear model to the data using the Ordinary Least Squares (OLS) method.

        Args:
            X (array-like): Feature matrix of shape (n_samples, n_features).
            y (array-like): Target vector. Must be one-dimensional when
                `positive` is True.

        Returns:
            self: Returns an instance of self.
        """
        features = self._as_matrix(X)
        targets = np.asarray(y, dtype=float)

        if self.positive:
            self._coefficients = self._solve_non_negative(features, targets)
        else:
            self._coefficients, *_ = np.linalg.lstsq(
                self._design_matrix(features), targets, rcond=None
            )
        return self

    def predict(self, X):
        """Predicts the target values for the given input data.

        Args:
            X (array-like): Feature matrix.

        Returns:
            numpy.ndarray: Predicted target values.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if self._coefficients is None:
            raise ValueError("The model is not fitted yet.")
        return self._design_matrix(self._as_matrix(X)) @ self._coefficients

    def score(self, X, y):
        """Computes the coefficient of determination (R^2 score) for the model.

        Args:
            X (array-like): Feature matrix.
            y (array-like): True target values.

        Returns:
            float: R^2 score.
        """
        y_true = np.asarray(y, dtype=float)
        y_pred = self.predict(X)
        total_variance = ((y_true - y_true.mean()) ** 2).sum()
        residual_variance = ((y_true - y_pred) ** 2).sum()
        return 1 - (residual_variance / total_variance)

    @property
    def intercept_(self):
        """Returns the intercept (bias) of the model.

        Returns:
            float: Intercept of the model; 0.0 when no intercept is fitted or the
            model has not been fitted yet.
        """
        if self.fit_intercept and self._coefficients is not None:
            return self._coefficients[0]
        return 0.0

    @property
    def coef_(self):
        """Returns the coefficients of the model.

        Returns:
            numpy.ndarray: Coefficients of the model (without the intercept), or
            None when the model has not been fitted yet.
        """
        if self.fit_intercept and self._coefficients is not None:
            return self._coefficients[1:]
        return self._coefficients
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from typing import List, Union | |
class MultiLabelBinarizer:
    """Implementation of MultiLabelBinarizer for encoding multilabel data.

    Each sample is a collection of labels; the encoded form is a 0/1 matrix with
    one column per class learned during fitting.

    Attributes:
        classes_ (np.ndarray): Sorted array of all unique classes found during
            fitting, or None before `fit` has been called.

    Methods:
        fit(X):
            Fits the binarizer to the data.
        transform(X):
            Transforms the data using the learned classes.
        fit_transform(X):
            Fits the binarizer and transforms the data in one step.
    """

    def __init__(self):
        self.classes_: Union[np.ndarray, None] = None

    def fit(self, X: List[List[Union[str, int]]]) -> 'MultiLabelBinarizer':
        """Fits the binarizer to the data.

        Args:
            X (List[List[Union[str, int]]]): Multilabel data to fit.

        Returns:
            self: Returns an instance of self.
        """
        unique_classes = set().union(*X)
        self.classes_ = np.array(sorted(unique_classes))
        return self

    def transform(self, X: List[List[Union[str, int]]]) -> np.ndarray:
        """Transforms the data using the learned classes.

        Labels that were not seen during `fit` have no column in the output;
        they are ignored and reported once through a warning.

        Args:
            X (List[List[Union[str, int]]]): Multilabel data to transform.

        Returns:
            np.ndarray: Binarized data of shape (len(X), len(classes_)).

        Raises:
            ValueError: If the binarizer has not been fitted.
        """
        import warnings

        if self.classes_ is None:
            raise ValueError("The binarizer is not fitted yet.")

        class_to_index = {cls: idx for idx, cls in enumerate(self.classes_)}
        binarized = np.zeros((len(X), len(self.classes_)), dtype=int)
        unseen = set()
        for row, labels in enumerate(X):
            for label in labels:
                column = class_to_index.get(label)
                if column is None:
                    unseen.add(label)
                else:
                    binarized[row, column] = 1

        if unseen:
            warnings.warn(
                f"unknown class(es) {sorted(unseen, key=str)} will be ignored"
            )
        return binarized

    def fit_transform(self, X: List[List[Union[str, int]]]) -> np.ndarray:
        """Fits the binarizer and transforms the data in one step.

        Args:
            X (List[List[Union[str, int]]]): Multilabel data to fit and transform.

        Returns:
            np.ndarray: Binarized data.
        """
        return self.fit(X).transform(X)
# Example usage
if __name__ == "__main__":
    # Sample data: one column of comma-separated tag strings
    raw_tags = ['a,b,c', 'b,c,d', 'a,d', 'a,b']
    data = pd.DataFrame({'tags': raw_tags})

    # Turn every tag string into a list of individual labels
    data['tags'] = data['tags'].str.split(',')

    # Binarize the label lists: one 0/1 column per distinct tag
    mlb = MultiLabelBinarizer()
    label_lists = list(data['tags'])
    tags_encoded = mlb.fit_transform(label_lists)
    tags_df = pd.DataFrame(tags_encoded, columns=mlb.classes_)

    # Swap the original 'tags' column for the indicator columns
    remaining = data.drop(columns=['tags'])
    encoded_data = pd.concat([remaining, tags_df], axis=1)
    print("MultiLabelBinarizer Encoded Data:\n", encoded_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment