Created
March 16, 2021 14:24
-
-
Save oscar-defelice/a376c6e81bcf7c35e73d3ad1e23e043b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn.model_selection import train_test_split
# Default configuration consumed by import_data().
# NOTE(review): `data` is not defined in this snippet; it must already exist
# when this line runs. import_data() reads `data.data` and `data.target` from
# it, so it is presumably a loaded dataset object -- confirm against the caller.
config = {
    'data': data,
    'train_test_ratio': 0.2,  # forwarded to train_test_split as test_size
}
def feature_selection(data):
    """
    feature_selection function.
    It takes the data array and returns it restricted to the selected features.

    At the moment every feature is kept, so the very same array object is
    handed back (no copy is made).

    Arguments:
        data            np.array of shape (n_training_example, n_features)

    Returns:
        data_reduced    np.array of shape (n_training_example, n_reduced_features)
    """
    # In this case we keep all the features. Change this according to your analysis.
    data_reduced = data

    # Selecting features may change the number of columns, never the number of rows.
    assert data_reduced.shape[0] == data.shape[0], "Data leaking!"

    return data_reduced
def feature_normalisation(data):
    """
    feature_normalisation function.
    It takes the data array and returns a standardised version of it: every
    feature (column) is shifted to zero mean and scaled to unit standard
    deviation.

    The input array is never modified; the computation runs on a copy.
    Non floating point input (e.g. integers) is converted to float first,
    since the in-place arithmetic below cannot store fractional values in an
    integer array.
    Constant features (zero standard deviation) are only centred, so they
    become columns of zeros instead of NaN.

    Arguments:
        data                np.array of shape (n_training_example, n_features)

    Returns:
        data_normalised     np.array of shape (n_training_example, n_features)
    """
    # np.array copies by default, so the caller's array is left untouched.
    data_normalised = np.array(data)
    if not np.issubdtype(data_normalised.dtype, np.floating):
        data_normalised = data_normalised.astype(float)

    data_normalised -= data_normalised.mean(axis=0)

    std = data_normalised.std(axis=0)
    # A constant column has std == 0; dividing by 1 instead avoids 0/0 = NaN.
    std = np.where(std == 0, 1.0, std)
    data_normalised /= std

    assert data_normalised.shape == np.shape(data), "Data leaking!"

    return data_normalised
def import_data(input=config):
    """
    import_data function.
    It selects and normalises the features, then splits the samples into a
    train and a test set with sklearn.model_selection.train_test_split
    (fixed random_state=42, so the split is reproducible).

    Arguments:
        input   dict containing the following variables
            data                object exposing two np.arrays as attributes
                                data.data is the array made of feature vectors rows.
                                data.target is the array of target values.
            train_test_ratio    float, optional
                                passed to train_test_split as test_size, i.e.
                                the fraction of samples put in the test set.
                                default: 0.2

    Returns:
        tuple of four np.arrays (X_train, X_test, Y_train, Y_test) of shape
            - X_train   (n_training_examples, n_features)
            - X_test    (n_test_examples, n_features)
            - Y_train   (n_training_examples, )
            - Y_test    (n_test_examples, )
    """
    data = input['data']
    # Fall back to the documented default when the key is not supplied.
    train_test_ratio = input.get('train_test_ratio', 0.2)

    X = feature_selection(data.data)
    # NOTE(review): the normalisation statistics are computed on the full
    # data set, before the split, so the test samples contribute to them.
    # Kept as is to preserve the returned values; consider fitting the
    # statistics on the training part only.
    X = feature_normalisation(X)
    Y = data.target

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_test_ratio, random_state=42
    )

    assert X_train.shape[1] == X_test.shape[1], "Train and test shapes do not correspond!"

    return X_train, X_test, Y_train, Y_test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment