Last active
November 21, 2017 23:29
-
-
Save reservoirinvest/d641e08d9ccc0bde723a5575811e7f1e to your computer and use it in GitHub Desktop.
Column manipulation #pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# replace all values of a pandas column | |
df['Column Name'].replace('replacement', inplace=True) | |
# change a column value based on some other column values | |
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value | |
# List unique values in a DataFrame column | |
df['Column Name'].unique() | |
# Convert Series datatype to numeric, changing non-numeric values to NaN | |
pd.to_numeric(df['Column Name'], errors='coerce') | |
# Grab DataFrame rows where column has certain values | |
valuelist = ['value1', 'value2', 'value3'] | |
df = df[df.column.isin(valuelist)] | |
# Grab DataFrame rows where column doesn't have certain values | |
valuelist = ['value1', 'value2', 'value3'] | |
df = df[~df.column.isin(value_list)] | |
# Delete column from DataFrame | |
del df['column'] | |
# Select from DataFrame using criteria from multiple columns | |
# (use `|` instead of `&` to do an OR) | |
newdf = df[(df['column_one']>2004) & (df['column_two']==9)] | |
# Rename several DataFrame columns | |
df = df.rename(columns = { | |
'col1 old name':'col1 new name', | |
'col2 old name':'col2 new name', | |
'col3 old name':'col3 new name', | |
}) | |
# Lower-case all DataFrame column names | |
df.columns = map(str.lower, df.columns) | |
# Even more fancy DataFrame column re-naming | |
# lower-case all DataFrame column names (for example) | |
df.rename(columns=lambda x: x.split('.')[-1], inplace=True) | |
# Slice values in a DataFrame column (aka Series) | |
df.column.str[0:2] | |
# Lower-case everything in a DataFrame column | |
df.column_name = df.column_name.str.lower() | |
# Get length of data in a DataFrame column | |
df.column_name.str.len() | |
# Sort dataframe by multiple columns | |
df = df.sort(['col1','col2','col3'],ascending=[1,1,0]) | |
# Get top n for each group of columns in a sorted dataframe | |
# (make sure dataframe is sorted first) | |
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5) | |
# Grab DataFrame rows where specific column is null/notnull | |
newdf = df[df['column'].isnull()] | |
# Select from DataFrame using multiple keys of a hierarchical index | |
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2')) | |
# Change all NaNs to None (useful before | |
# loading to a db) | |
df = df.where((pd.notnull(df)), None) | |
# More pre-db insert cleanup...make a pass through the dataframe, stripping whitespace | |
# from strings and changing any empty values to None | |
# (not especially recommended but including here b/c I had to do this in real life one time) | |
df = df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None) | |
# Get quick count of rows in a DataFrame | |
len(df.index) | |
# Pivot data (with flexibility about what what | |
# becomes a column and what stays a row). | |
# Syntax works on Pandas >= .14 | |
pd.pivot_table( | |
df,values='cell_value', | |
index=['col1', 'col2', 'col3'], #these stay as columns; will fail silently if any of these cols have null values | |
columns=['col4']) #data values in this column become their own column | |
# Change data type of DataFrame column | |
df.column_name = df.column_name.astype(np.int64) | |
# Get rid of non-numeric values throughout a DataFrame: | |
for col in refunds.columns.values: | |
refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True) | |
# Set DataFrame column values based on other column values | |
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value | |
# Clean up missing values in multiple DataFrame columns | |
df = df.fillna({ | |
'col1': 'missing', | |
'col2': '99.999', | |
'col3': '999', | |
'col4': 'missing', | |
'col5': 'missing', | |
'col6': '99' | |
}) | |
# Concatenate two DataFrame columns into a new, single column | |
# (useful when dealing with composite keys, for example) | |
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str) | |
# Doing calculations with DataFrame columns that have missing values | |
# In example below, swap in 0 for df['col1'] cells that contain null | |
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2'] | |
# Split delimited values in a DataFrame column into two new columns | |
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1))) | |
# Get a report of all duplicate records in a dataframe, based on specific columns | |
dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment