Last active
January 16, 2025 15:15
-
-
Save jmquintana79/f637d2ed4176cf024e0b35334ab2074c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import copy | |
import logging | |
def counter_consecutive_frozen_values(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Count consecutive frozen (repeated) values in a column.

    A new column 'counter' is added holding, for every row, its 1-based
    position inside the current run of identical consecutive values
    (1 for the first occurrence, 2 for the first repetition, and so on).

    The input dataframe is NOT modified: the counter is added to a copy, and
    no temporary helper column is written, so an existing column named
    'change' is preserved.

    Arguments:
        df {pd.DataFrame} -- Dataframe to be analyzed (must be non-empty).
        column {str} -- Column to be analyzed.
    Returns:
        pd.DataFrame -- Copy of the dataframe with the new "counter" column.
    Raises:
        AssertionError -- If `df` is not a non-empty dataframe or `column`
            is not one of its columns.
    """
    # validate arguments
    assert isinstance(df, pd.DataFrame) and len(df) > 0, "It is required a non-empty dataframe."
    assert column in df.columns, f"Column '{column}' is required."

    values = df[column]
    # A new run starts wherever a value differs from the previous row.
    # NaN compares as "not equal" to NaN, so every missing value opens its
    # own run and is never counted as a repetition.
    run_id = values.ne(values.shift()).cumsum()

    # Position within each run, shifted to start at 1. Assigned positionally
    # (to_numpy) so the result does not depend on index alignment.
    result = df.copy()
    result["counter"] = (values.groupby(run_id).cumcount() + 1).to_numpy()
    return result
def remove_frozen_values(df: pd.DataFrame, column: str, freq_minutes_dt_index: int) -> pd.DataFrame:
    """Blank out frozen (consecutively repeated) values of a timeseries column.

    The column is first laid out on a regular time grid of
    `freq_minutes_dt_index` minutes, then every value that repeats the one
    in the preceding grid slot is replaced by NaN. The first value of each
    run is kept.

    Note: the replacement is done in place on `df`, and that same object is
    returned.

    Arguments:
        df {pd.DataFrame} -- Non-empty dataframe with a DatetimeIndex.
        column {str} -- Column to be cleaned.
        freq_minutes_dt_index {int} -- Frequency in minutes of the datetime index.
    Returns:
        pd.DataFrame -- The input dataframe with the cleaned column.
    Raises:
        AssertionError -- If any argument fails validation.
    """
    # argument checks (same order as the messages callers may see)
    assert isinstance(df, pd.DataFrame) and len(df) > 0, "It is required a non-empty dataframe."
    assert isinstance(df.index, pd.DatetimeIndex), "It is required DateTime field in the index."
    assert isinstance(column, str)
    assert column in df.columns.tolist(), f"Column '{column}' is required."
    assert isinstance(freq_minutes_dt_index, int)

    # work on an isolated single-column copy, expanded to the regular grid
    # so that missing timestamps show up as explicit rows
    rule = f"{freq_minutes_dt_index}min"
    regular = df[[column]].copy(deep=True).resample(rule).asfreq()

    # run-length position of every grid slot
    counted = counter_consecutive_frozen_values(regular, column)

    # timestamps that repeat the previous value (position > 1 within a run)
    is_repeat = (counted["counter"] > 1).to_numpy()
    frozen_stamps = counted.index[is_repeat].tolist()

    # overwrite those rows in the caller's dataframe
    hit_rows = df.index.isin(frozen_stamps)
    df.loc[hit_rows, column] = np.nan

    logging.info(f"It was removed {len(frozen_stamps)} frozen values in column '{column}'.")
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment