Skip to content

Instantly share code, notes, and snippets.

@jmquintana79
Last active January 16, 2025 15:15
Show Gist options
  • Save jmquintana79/f637d2ed4176cf024e0b35334ab2074c to your computer and use it in GitHub Desktop.
Save jmquintana79/f637d2ed4176cf024e0b35334ab2074c to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import copy
import logging
def counter_consecutive_frozen_values(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Count consecutive frozen values

    A new column 'counter' is added holding, for every row, its 1-based
    position inside the current run of identical consecutive values of
    ``column`` (1 for the first value of a run, 2 for the first repetition,
    and so on). NaN never compares equal to anything, so every NaN starts
    a new run and gets a counter of 1.

    The input dataframe is not modified: the result is a copy, and no
    temporary helper column is written to (or dropped from) either frame.

    Arguments:
        df {pd.DataFrame} -- Dataframe to be analyzed.
        column {str} -- Column to be analyzed.

    Returns:
        pd.DataFrame -- Copy of the dataframe with new "counter" column.

    Raises:
        AssertionError -- If ``df`` is not a non-empty dataframe or
            ``column`` is not one of its columns.
    """
    # validate arguments
    assert isinstance(df, pd.DataFrame) and len(df) > 0, "It is required a non-empty dataframe."
    assert column in df.columns.tolist(), f"Column '{column}' is required."
    values = df[column]
    # a row starts a new run whenever it differs from the previous row;
    # the cumulative sum of those flags gives each run its own id
    run_ids = values.ne(values.shift()).cumsum()
    # position inside each run; group by the raw array so the result does
    # not depend on index alignment (e.g. duplicated index labels)
    counts = run_ids.groupby(run_ids.to_numpy()).cumcount() + 1
    # attach the counter to a copy so the caller's dataframe stays untouched
    result = df.copy()
    result["counter"] = counts.to_numpy()
    return result
def remove_frozen_values(df: pd.DataFrame, column: str, freq_minutes_dt_index: int) -> pd.DataFrame:
    """Remove frozen values from a timeseries df column.

    Only the consecutive duplicated values are removed (set to NaN); the
    first value of each run is kept. The column is first resampled onto a
    regular grid of ``freq_minutes_dt_index`` minutes so that missing
    timestamps become NaN and break a run: equal values separated by a
    temporal hole are not treated as frozen.

    The input dataframe is modified in place and also returned.

    Arguments:
        df {pd.DataFrame} -- Dataframe to be analyzed with a dt index.
        column {str} -- Column to be analyzed.
        freq_minutes_dt_index {int} -- Frequency in minutes for datetime index.

    Returns:
        pd.DataFrame -- Input dataframe with cleaned column.

    Raises:
        AssertionError -- If any argument fails validation.
    """
    # validate arguments
    assert isinstance(df, pd.DataFrame) and len(df) > 0, "It is required a non-empty dataframe."
    assert isinstance(df.index, pd.DatetimeIndex), "It is required DateTime field in the index."
    assert isinstance(column, str), "Column name must be a string."
    assert column in df.columns.tolist(), f"Column '{column}' is required."
    assert isinstance(freq_minutes_dt_index, (int, np.integer)), "Frequency in minutes must be an integer."
    # fill temporal holes; resample().asfreq() returns a new object, so the
    # working frame is independent from df without an explicit deep copy
    data = df[[column]].resample(f"{freq_minutes_dt_index}min").asfreq()
    # frozen values
    data = counter_consecutive_frozen_values(data, column)
    # timestamps of records whose values will be removed (repetitions only)
    frozen_index = data.index[data["counter"] > 1]
    # remove values in original df; the whole column is replaced through
    # Series.mask so that a non-float column (e.g. int64) is upcast cleanly
    # instead of relying on an incompatible-dtype cell assignment
    frozen_mask = df.index.isin(frozen_index)
    if frozen_mask.any():
        df[column] = df[column].mask(frozen_mask)
    # display
    logging.info("It was removed %d frozen values in column '%s'.", len(frozen_index), column)
    # return
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment