Last active
January 22, 2022 00:04
-
-
Save jmquintana79/0ae540676993bf6cbbc61a1513326859 to your computer and use it in GitHub Desktop.
Filter data stored into pandas dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Filter by condition """ | |
# conditional operators for pandas: | |
# EQUAL: == | |
# DIFFERENT TO: != | |
# BIGGER THAN: > | |
# LOWER THAN: < | |
# BIGGER OR EQUAL TO: >= | |
# LOWER OR EQUEAL TO: <= | |
# AND: & | |
# OR: | | |
# condition is set through single values | |
DF_filtered = DF[DF.column == value] # or other condition | |
DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition | |
""" Filtering by "isin()" method """ | |
# condition is set through multiple values | |
#When values is a list: | |
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) | |
df.isin([1, 3, 12, 'a']) | |
""" | |
A B | |
0 True True | |
1 False False | |
2 True False | |
""" | |
# When values is a dict: | |
df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]}) | |
df.isin({'A': [1, 3], 'B': [4, 7, 12]}) | |
""" | |
A B | |
0 True False # Note that B didn't match the 1 here. | |
1 False True | |
2 True True | |
""" | |
# When values is a Series or DataFrame: | |
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) | |
other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']}) | |
df.isin(other) | |
""" | |
A B | |
0 True False | |
1 False False # Column A in `other` has a 3, but not at index 1. | |
2 True True | |
""" | |
# filter dataframe through a condition with values of list | |
df_filtered = df[df.column.isin(list_values_to_filter)] | |
# filter dataframe through a condition with values out of list | |
df_filtered = df[~df.column.isin(list_values_to_filter)] | |
# filter according to a column values if appear in a list | |
df[df.columna.isin(list_targets)] | |
# filter according to a string column value contain a substring | |
df[df.columna.map(lambda x: 'substr' in x)] | |
df[df.COLUMN.str.contains('substring')] | |
## filtering nan values | |
lindex = list(df[np.isnan(df["column"])].index) # only possible with float values | |
lindex = np.where(pd.isnull(df["column"])) # only possible with float values | |
lindex = df[pd.isnull(df["column"])].index.tolist() # always possible | |
df_filtered = df[~df.index.isin(lindex)] | |
## other way | |
df_filtered = df.dropna(inplace=False) | |
# dropping rows with NaN values in only a list of column | |
df.dropna(subset=['column_name']) | |
## other way with notna() | |
df[df.notna()] | |
df[df['column'].notna()] | |
""" duplicated """ | |
# return True / False column | |
DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’ | |
# filter duplicated values | |
DF = DF[~DF['duplicated']] | |
# DROP DUPLICATES | |
df.drop_duplicates(['col1','col2'],keep='first',inplace=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment