jmquintana79 · January 22, 2022 00:04
diff --git a/filtering_pandas.py b/filtering_pandas.py
 """ Filter by condition """
 # conditional operators for pandas:
 # EQUAL: ==
 # DIFFERENT TO: !=
 # BIGGER THAN: >
 # LOWER THAN: <
 # BIGGER OR EQUAL TO: >=
 # LOWER OR EQUEAL TO: <=
 # AND: &
 # OR: |

 # condition is set through single values
 DF_filtered = DF[DF.column == value] # or other condition
 DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition



 """ Filtering by "isin()" method """
 # condition is set through multiple values

 #When values is a list:
 df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
 df.isin([1, 3, 12, 'a'])
 """
       A      B
 0   True   True
 1  False  False
 2   True  False
 """

 # When values is a dict:
 df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
 df.isin({'A': [1, 3], 'B': [4, 7, 12]})
 """
       A      B
 0   True  False  # Note that B didn't match the 1 here.
 1  False   True
 2   True   True
 """

 # When values is a Series or DataFrame:
 df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
 other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
 df.isin(other)
 """
       A      B
 0   True  False
 1  False  False  # Column A in `other` has a 3, but not at index 1.
 2   True   True
 """

 # filter dataframe through a condition with values of list
 df_filtered = df[df.column.isin(list_values_to_filter)]

 # filter dataframe through a condition with values out of list
 df_filtered = df[~df.column.isin(list_values_to_filter)]

 # filter according to a column values if appear in a list
 df[df.columna.isin(list_targets)]

 # filter according to a string column value contain a substring
 df[df.columna.map(lambda x: 'substr' in x)]
 df[df.COLUMN.str.contains('substring')]

 ## filtering nan values
 lindex = list(df[np.isnan(df["column"])].index) # only possible with float values
 lindex = np.where(pd.isnull(df["column"])) # only possible with float values
 lindex = df[pd.isnull(df["column"])].index.tolist() # always possible
 df_filtered = df[~df.index.isin(lindex)]

 ## other way
 df_filtered = df.dropna(inplace=False)
 # dropping rows with NaN values in only a list of column
 df.dropna(subset=['column_name'])

 ## other way with notna()
 df[df.notna()]
 df[df['column'].notna()]

 """ duplicated """
 # return True / False column
 DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’
 # filter duplicated values
 DF = DF[~DF['duplicated']]

 # DROP DUPLICATES
 df.drop_duplicates(['col1','col2'],keep='first',inplace=True)
	""" Filter by condition """
	# conditional operators for pandas:
	# EQUAL: ==
	# DIFFERENT TO: !=
	# BIGGER THAN: >
	# LOWER THAN: <
	# BIGGER OR EQUAL TO: >=
	# LOWER OR EQUEAL TO: <=
	# AND: &
	# OR: \|

	# condition is set through single values
	DF_filtered = DF[DF.column == value] # or other condition
	DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition



	""" Filtering by "isin()" method """
	# condition is set through multiple values

	#When values is a list:
	df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
	df.isin([1, 3, 12, 'a'])
	"""
	A B
	0 True True
	1 False False
	2 True False
	"""

	# When values is a dict:
	df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
	df.isin({'A': [1, 3], 'B': [4, 7, 12]})
	"""
	A B
	0 True False # Note that B didn't match the 1 here.
	1 False True
	2 True True
	"""

	# When values is a Series or DataFrame:
	df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
	other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
	df.isin(other)
	"""
	A B
	0 True False
	1 False False # Column A in `other` has a 3, but not at index 1.
	2 True True
	"""

	# filter dataframe through a condition with values of list
	df_filtered = df[df.column.isin(list_values_to_filter)]

	# filter dataframe through a condition with values out of list
	df_filtered = df[~df.column.isin(list_values_to_filter)]

	# filter according to a column values if appear in a list
	df[df.columna.isin(list_targets)]

	# filter according to a string column value contain a substring
	df[df.columna.map(lambda x: 'substr' in x)]
	df[df.COLUMN.str.contains('substring')]

	## filtering nan values
	lindex = list(df[np.isnan(df["column"])].index) # only possible with float values
	lindex = np.where(pd.isnull(df["column"])) # only possible with float values
	lindex = df[pd.isnull(df["column"])].index.tolist() # always possible
	df_filtered = df[~df.index.isin(lindex)]

	## other way
	df_filtered = df.dropna(inplace=False)
	# dropping rows with NaN values in only a list of column
	df.dropna(subset=['column_name'])

	## other way with notna()
	df[df.notna()]
	df[df['column'].notna()]

	""" duplicated """
	# return True / False column
	DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’
	# filter duplicated values
	DF = DF[~DF['duplicated']]

	# DROP DUPLICATES
	df.drop_duplicates(['col1','col2'],keep='first',inplace=True)