Last active
October 13, 2022 11:53
-
-
Save mesgarpour/f24769cd186e2db853957b10ff6b7a95 to your computer and use it in GitHub Desktop.
Yeo-Johnson Transformation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
import warnings | |
import numpy as np | |
import pandas as pd | |
import sys | |
__author__ = "Mohsen Mesgarpour" | |
__copyright__ = "Copyright 2016, https://github.com/mesgarpour" | |
__credits__ = ["Mohsen Mesgarpour"] | |
__license__ = "GPL" | |
__version__ = "1.0" | |
__maintainer__ = "Mohsen Mesgarpour" | |
__email__ = "[email protected]" | |
class YeoJohnson:
    """Yeo-Johnson power transformation.

    An extension of the Box-Cox transformation that can handle both
    positive and negative values.

    References:
        Weisberg, S. (2001). Yeo-Johnson Power Transformations.
        Department of Applied Statistics, University of Minnesota.
        https://www.stat.umn.edu/arc/yjpower.pdf
        Adapted from CRAN - Package VGAM.
    """

    def fit(self, y, lmbda, derivative=0, epsilon=np.finfo(float).eps, inverse=False):
        """Compute the Yeo-Johnson transformation, its inverse, or its derivative.

        :param y: The variable to be transformed (list, ndarray or Series).
        :param lmbda: The function's lambda value (scalar, or a sequence whose
            length matches ``y`` for a per-element lambda).
        :param derivative: Order of the derivative with respect to lambda
            (non-negative integer; 0 = ordinary function evaluation).
        :param epsilon: Tolerance used to treat lambda as equal to 0 (or 2)
            (positive value).
        :param inverse: If True, apply the inverse transformation
            (requires ``derivative == 0``).
        :return: ndarray with the Yeo-Johnson transformation, its inverse,
            or its derivative with respect to lambda, of ``y``.
        :raises ValueError: If any argument fails validation.
        """
        self.__validate(y, lmbda, derivative, epsilon, inverse)

        y = np.array(y, dtype=float)
        # Always broadcast lambda against y so per-element fancy indexing
        # works for scalar, list and ndarray lambdas alike.  (The original
        # code only converted the scalar case; a list lambda crashed on
        # `lmbda[mask]`.)
        lmbda, y = np.broadcast_arrays(np.asarray(lmbda, dtype=float), y)
        lmbda = np.array(lmbda, dtype=float)  # writable, materialised copy
        y = np.array(y, dtype=float)          # writable, materialised copy

        # Work on a copy: writing through `result` must not mutate `y`,
        # because the sign masks below are taken from `y`.
        result = y.copy()

        l0 = np.abs(lmbda) > epsilon        # lambda != 0 (within tolerance)
        l2 = np.abs(lmbda - 2) > epsilon    # lambda != 2 (within tolerance)
        pos = y >= 0                        # sign masks fixed up-front
        neg = ~pos

        with warnings.catch_warnings():  # suppress overflow/invalid warnings
            warnings.simplefilter("ignore")
            if inverse:
                # Inverse transformation (four piecewise branches).
                mask = pos & l0
                result[mask] = np.power(y[mask] * lmbda[mask] + 1, 1 / lmbda[mask]) - 1
                mask = pos & ~l0
                result[mask] = np.expm1(y[mask])
                mask = neg & l2
                result[mask] = 1 - np.power(-(2 - lmbda[mask]) * y[mask] + 1,
                                            1 / (2 - lmbda[mask]))
                mask = neg & ~l2
                result[mask] = -np.expm1(-y[mask])
            elif derivative == 0:
                # Ordinary (forward) transformation.
                mask = pos & l0
                result[mask] = (np.power(y[mask] + 1, lmbda[mask]) - 1) / lmbda[mask]
                mask = pos & ~l0
                result[mask] = np.log1p(y[mask])
                mask = neg & l2
                result[mask] = -(np.power(-y[mask] + 1, 2 - lmbda[mask]) - 1) / (2 - lmbda[mask])
                mask = neg & ~l2
                result[mask] = -np.log1p(-y[mask])
            else:
                # k-th derivative w.r.t. lambda, built recursively from the
                # (k-1)-th derivative.
                p = self.fit(y, lmbda, derivative=derivative - 1,
                             epsilon=epsilon, inverse=inverse)
                mask = pos & l0
                result[mask] = (np.power(y[mask] + 1, lmbda[mask]) *
                                np.power(np.log1p(y[mask]), derivative) -
                                derivative * p[mask]) / lmbda[mask]
                mask = pos & ~l0
                result[mask] = np.power(np.log1p(y[mask]), derivative + 1) / (derivative + 1)
                mask = neg & l2
                result[mask] = -(np.power(-y[mask] + 1, 2 - lmbda[mask]) *
                                 np.power(-np.log1p(-y[mask]), derivative) -
                                 derivative * p[mask]) / (2 - lmbda[mask])
                mask = neg & ~l2
                result[mask] = np.power(-np.log1p(-y[mask]), derivative + 1) / (derivative + 1)
        return result

    @staticmethod
    def __validate(y, lmbda, derivative, epsilon, inverse):
        """Validate arguments, raising ValueError on the first failure.

        The original wrapped the raises in ``try/except (): sys.exit()``;
        ``except ()`` catches nothing, so the exceptions always propagated
        anyway — the dead exit path has been removed.
        """
        if not isinstance(y, (list, np.ndarray, pd.Series)):
            raise ValueError("Argument 'y' must be a list!")
        if not isinstance(lmbda, (int, float, np.integer, np.floating)):
            if not isinstance(lmbda, (list, np.ndarray, pd.Series)) or len(lmbda) != len(y):
                raise ValueError("Argument 'lmbda' must be a number "
                                 "or a list, which its length matches 'y' argument!")
        if not isinstance(derivative, (int, float, np.integer, np.floating)) or derivative < 0:
            raise ValueError("Argument 'derivative' must be a non-negative integer!")
        if not isinstance(epsilon, (int, float, np.integer, np.floating)) or epsilon <= 0:
            raise ValueError("Argument 'epsilon' must be a positive number!")
        if not isinstance(inverse, bool):
            raise ValueError("Argument 'inverse' must be boolean!")
        if inverse is True and derivative != 0:
            raise ValueError("Argument 'derivative' must be zero "
                             "when argument 'inverse' is 'True'!")
Sorry @croessert, I was not monitoring the comments.
The Lambda value indicates the power to which all data should be raised to transform it to normal distribution.
You have three principal options:
- Try different sets of values for lambda, and assess the feature distribution (e.g. use a Q-Q plot).
- If your feature has positive values only, then use the Box-Cox transformation to maximize the log-likelihood function using a simple linear regression (y~1). Under the hood, Box-Cox searches for lambda from within a range of values.
- Use an automated grid-search approach in combination with a normality test performed on the transformed data to determine optimal lambda, such that the p-value from the normality test is the highest. For instance, you can use "scipy.optimize.brute" and "scipy.stats.mstats.normaltest" functions to achieve that.
Some general notes:
- Always check for outliers and missing values, to make sure that some artifact is not driving the transformation.
- Consider the confidence interval around the optimal lambda, and whether a particular transformation makes sense.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Could you tell me how to extract lambda from R using rpy2? This value should be somewhere in r_scale.rx('yj'), right?
Thanks!