Last active
November 1, 2017 22:59
-
-
Save VictorGarritano/83650d15920168a0af053eaa7e9d86c9 to your computer and use it in GitHub Desktop.
updating __init__ documentation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Binary encoding of numeric features by their distance from the mean.

Each feature is encoded over a grid of bins spaced a fraction of its
standard deviation apart, spanning two standard deviations on each side
of its mean.

Usage (assuming this module has been imported as ``disc``):

>>> from sklearn.datasets import load_iris
>>> data = load_iris()
>>> obj = disc.Discretization(data.data, {0:7,1:3, 2:10, 3:4})
>>> X_t = obj.fit_transform()
>>> sample = X_t[65,:]
>>> sample
#array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
#       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
#       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
#       1, 0, 0, 0, 0, 0, 0, 0])
>>> data.data[65,:]
array([ 6.7, 3.1, 4.4, 1.4])
>>> obj.inverse_transform(sample)
array([ 6.66863463, 3.054 , 4.28622542, 1.38881982])
"""
import numpy as np | |
class Discretization(object):
    """Encode numeric features as binary vectors around each feature's mean.

    For every feature, ``fit`` builds ``4 * g + 1`` bin edges spaced
    ``std / g`` apart and centred on the feature mean, where ``g`` is the
    feature's granularity (subdivisions per standard deviation). The edges
    therefore span two standard deviations on each side of the mean.

    ``transform`` encodes a value by always setting the centre bit and then
    setting every bit between the centre and the value: upper bins the value
    has reached (``value >= edge``) or lower bins it has reached
    (``value <= edge``). ``inverse_transform`` decodes such a vector back to
    ``mean + k * std / g`` where ``k`` is the signed distance, in bins, of
    the outermost set bit from the centre.
    """

    # Granularity applied to features missing from ``granularity_dict``.
    DEFAULT_GRANULARITY = 5

    def __init__(self, features, granularity_dict=None):
        """Initializer.

        Arguments:
            features (numpy.ndarray): The set of columns to be discretized,
                either 1-D (a single feature) or 2-D (samples x features).
                Anything ``numpy.asarray`` accepts is allowed.
            granularity_dict (dictionary): Maps a feature index (column of
                ``features``) to its granularity, i.e. the number of
                subdivisions per standard deviation, below and above the
                average. Features without an entry use a granularity of 5.
                May be omitted to use the default for every feature.

        The discretized version of a feature has ``4*v + 1`` columns, where
        ``v`` is that feature's granularity.
        """
        super(Discretization, self).__init__()
        self.features = np.asarray(features)
        if granularity_dict is None:
            granularity_dict = {}
        self.granularity_dict = granularity_dict
        self.statistics = list()
        self.grans = list()
        self.bins = list()

    def _granularity_for(self, idx):
        """Return the validated granularity of feature ``idx`` as an int."""
        gran = self.granularity_dict.get(idx, self.DEFAULT_GRANULARITY)
        if gran < 1 or gran != int(gran):
            raise ValueError(
                "granularity for feature {0} must be a positive integer, "
                "got {1!r}".format(idx, gran))
        return int(gran)

    def _split_by_feature(self, sample):
        """Cut ``sample`` along its last axis into one view per fitted feature.

        Raises:
            IndexError: if ``sample`` has fewer entries than the fitted bins
                require. Extra trailing entries are ignored.
        """
        widths = [edges.shape[0] for edges in self.bins]
        needed = sum(widths)
        if sample.shape[-1] < needed:
            raise IndexError(
                "discretized sample has {0} entries but the fitted bins "
                "require {1}".format(sample.shape[-1], needed))
        pieces = list()
        start = 0
        for width in widths:
            pieces.append(sample[..., start:start + width])
            start += width
        return pieces

    def fit(self):
        """Compute, for every feature, what the discretization needs:
            mean,
            standard deviation,
            discretization granularity,
            discretization bins.

        A 1-D ``features`` array is reshaped into a single column first.
        Calling ``fit`` again replaces the previously computed state rather
        than appending to it.

        Raises:
            ValueError: if a granularity is not a positive integer.
        """
        if self.features.ndim == 1:
            self.features = self.features.reshape(self.features.shape[0], 1)
        statistics = list()
        grans = list()
        bins = list()
        for idx in range(self.features.shape[1]):
            gran = self._granularity_for(idx)
            column = self.features[:, idx]
            mean = column.mean()
            std = column.std()
            # Integer offsets guarantee exactly 4*gran + 1 edges with the
            # centre edge equal to the mean, regardless of float rounding,
            # and still work for a constant column (std == 0).
            offsets = np.arange(-2 * gran, 2 * gran + 1)
            statistics.append([mean, std])
            grans.append(gran)
            bins.append(mean + offsets * (std / gran))
        # Rebind only after every feature succeeded, so a failed fit leaves
        # the previous state intact.
        self.statistics = statistics
        self.grans = grans
        self.bins = bins

    def transform(self):
        """Do the binary encoding of features.

        ``fit`` must have been called first.

        Return:
            A numpy.ndarray of ints (one row per sample) with the encodings
            of all features concatenated column-wise.
        """
        encoded = list()
        for idx in range(self.features.shape[1]):
            mean = self.statistics[idx][0]
            edges = self.bins[idx]
            middle = (edges.shape[0] - 1) // 2
            column = self.features[:, idx]
            values = column[:, np.newaxis]
            above = (column >= mean)[:, np.newaxis]
            block = np.zeros((column.shape[0], edges.shape[0]), dtype=bool)
            # The centre bit is always set.
            block[:, middle] = True
            # Only the side of the mean the sample lies on can get bits.
            block[:, middle + 1:] = above & (values >= edges[middle + 1:])
            block[:, :middle] = ~above & (values <= edges[:middle])
            encoded.append(block.astype(int))
        return np.concatenate(encoded, axis=1)

    def fit_transform(self):
        """Extract the information of features needed to discretization,
        and discretize the features.

        Return:
            The numpy.ndarray produced by ``transform``.
        """
        self.fit()
        return self.transform()

    def inverse_transform(self, sample):
        """Given a discretization, returns the real value associated with this
        discretization (binary decoding).

        Argument:
            sample (numpy.ndarray): binary discretization, either a single
                1-D encoded sample or a 2-D array with one encoded sample
                per row.
        Return:
            A numpy.ndarray approximating the instance(s) that generated the
            discretization: 1-D (one value per feature) for a 1-D input,
            2-D (samples x features) for a 2-D input.
        Raises:
            IndexError: if ``sample`` is shorter than the fitted encoding.
        """
        sample = np.asarray(sample)
        single = sample.ndim == 1
        batch = np.atleast_2d(sample)
        columns = list()
        for idx, piece in enumerate(self._split_by_feature(batch)):
            width = piece.shape[1]
            middle = (width - 1) // 2
            active = piece == 1
            # Position of the first / last set bit in every row.
            first = active.argmax(axis=1)
            last = width - 1 - active[:, ::-1].argmax(axis=1)
            # A set bit right above the centre means "above the mean"; else a
            # set bit right below it means "below the mean"; else the mean.
            upper = active[:, middle + 1]
            lower = ~upper & active[:, middle - 1]
            steps = np.where(upper, last - middle,
                             np.where(lower, first - middle, 0))
            mean, std = self.statistics[idx]
            columns.append(mean + steps * float(std / self.grans[idx]))
        if columns:
            decoded = np.column_stack(columns)
        else:
            decoded = np.empty((batch.shape[0], 0))
        return decoded[0] if single else decoded

    def get_slices_from_discretized_sample(self, sample):
        """Helper function to get feature slices from a discretized sample.

        Arguments:
            sample (numpy.ndarray): the discretized sample.
        Return:
            A numpy.ndarray with the discretizations of each feature
            separately. When every feature has the same number of bins this
            is a regular array with one row per feature; otherwise it is a
            1-D object array holding one array per feature.
        Raises:
            IndexError: if ``sample`` is shorter than the fitted encoding.
        """
        pieces = self._split_by_feature(np.asarray(sample))
        widths = set(piece.shape[-1] for piece in pieces)
        if len(widths) <= 1:
            return np.array(pieces)
        # Slices of different lengths cannot form a regular array (NumPy
        # raises on ragged input), so store them as separate objects.
        ragged = np.empty(len(pieces), dtype=object)
        for position, piece in enumerate(pieces):
            ragged[position] = piece.copy()
        return ragged
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment