Skip to content

Instantly share code, notes, and snippets.

@augustoqm
Forked from why-not/gist:4582705
Last active April 18, 2016 21:48
Show Gist options
  • Save augustoqm/ec60b1cf221b2e40de7b1312f55fe75a to your computer and use it in GitHub Desktop.
Save augustoqm/ec60b1cf221b2e40de7b1312f55fe75a to your computer and use it in GitHub Desktop.
Pandas recipe.
# ======================================================================================================
# Pandas Recipe
# ======================================================================================================
# Imports used by the recipes below (the snippets rely on the conventional
# `np` / `pd` aliases and on the standard `re` module).
import re

import numpy as np
import pandas as pd

# Quick way to create a small DataFrame to try things out:
# 5 rows of random normal values in columns 'a'..'d'.
df = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
# Bring out a single column as a Series. Labels are case-sensitive, and the
# frame above has a column 'a', not 'A'.
df['a']
# Bring out a single row by position (row #0 in this case). `.ix` has been
# removed from pandas; use `.iloc` for positions and `.loc` for labels.
df.iloc[0]
# To get a NumPy array out of a DataFrame or a Series, use `.values`.
# It is an attribute, not a method, so it takes no parentheses.
is_given_point = df_allpoints['names'] == given_point
point = df_allpoints[is_given_point]  # keep only the row(s) for the requested point
point = point['desc'].values[0]  # first matching descriptor, taken from the array form
# Given a DataFrame df, keep only the rows whose 'col_name' value appears in
# the Series s:
df[df['col_name'].isin(s)]
# The same kind of filter on the index instead of an arbitrary column.
# `.ix` has been removed from pandas; `.loc` is the label-based replacement.
# (`df[df.index.isin(s)]` is an alternative that silently skips labels that
# are not present in the index.)
df.loc[s]
# Display only certain columns. Note that it is a *list* of labels inside
# the brackets; 'A' and 'B' stand in for your own column labels.
df[['A', 'B']]
# Drop rows with at least one null value. Pass parameters to change the
# rule, e.g. how='all' drops a row only when every value in it is null, and
# thresh=N keeps rows that have at least N non-null values.
df.dropna()
# Delete a column. Note that `df.column-name` won't work here: Python would
# parse it as the subtraction `df.column - name`.
del df['column-name']
# Making rows out of whole objects instead of parsing them into separate columns.
# Create the dataset with only an index (no data yet). Uses the `pd` alias
# like every other recipe in this file.
dataset = pd.DataFrame(index=names)
# Add a column in which each entry is a 1-D array: each element of `svds`
# is assigned to a different DataFrame row.
dataset['Norm'] = svds
# Filter a DataFrame df on several conditions at once. When the comparisons
# are written inline, each one needs its own parentheses:
#     df[(cond_1) & (cond_2)]
is_male = df['gender'] == 'M'
is_us = df['cc_iso'] == 'US'
df[is_male & is_us]
# Combine a column condition with a condition on the row labels (index).
positive_a = df['a'] > 0
wanted_labels = df.index.isin([0, 2, 4])
df[positive_a & wanted_labels]
# Vectorized regular-expression filter on a string column
# (write .* in the pattern rather than a bare *).
matches_pattern = df['category'].str.contains(r'some.regex.*pattern')
df[matches_pattern]
# Logical NOT is spelled with ~.
df[~matches_pattern]
# Complex filters can be built from a function evaluated on each row:
# http://goo.gl/r57b1
df[df.apply(lambda row: row['b'] > row['c'], axis=1)]
# Pandas replace operation http://goo.gl/DJphs
# Assign the result back instead of calling replace(..., inplace=True) on
# df[2]: the selected column may be a temporary copy, in which case the
# in-place edit never reaches df.
df[2] = df[2].replace(4, 17)
# Conditional replacement: select rows and column in a single .loc call
# rather than the chained form df[1][mask] = ..., which can write to a copy.
df.loc[df[1] == 4, 1] = 19
# apply and map examples
# Add 1 to every element (applymap calls the function once per cell;
# pandas 2.1 renamed this method to DataFrame.map).
df.applymap(lambda x: x + 1)
# Add 2 to row 3 and return the Series: with axis=0 the function receives
# one column at a time, and x[3] picks that column's entry for row label 3.
df.apply(lambda x: x[3] + 2, axis=0)
# Add 3 to column 'a' and return the Series: with axis=1 the function
# receives one row at a time. (The code previously added 1, contradicting
# this description.)
df.apply(lambda x: x['a'] + 3, axis=1)
# Assigning a value to a slice is tricky: sometimes a copy is returned and
# sometimes a view, based on NumPy rules. More here:
# http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-advanced
# Selecting the rows and the column in ONE .loc call makes the assignment
# land in df itself. (`.ix` has been removed from pandas; `.loc` replaces it.)
df.loc[df['part'].isin(ids), 'assigned_name'] = "some new value"
# Example of applying a complex external function to each row of a DataFrame.
def stripper(x):
    """Return the first IP-address-like token found in a row's text.

    Args:
        x: A row (Series or any mapping) that has an entry under the key
            'Text with IP adress embedded'.

    Returns:
        The first substring made of four dot-separated digit groups, or ''
        when the text is missing (None, NaN, or any other non-string) or
        contains no such substring.
    """
    text = x['Text with IP adress embedded']
    # Special cases are handled here: missing values arrive as None/NaN,
    # which the regex engine cannot search, so they count as "no match".
    if not isinstance(text, str):
        return ''
    # Only the first hit is wanted, so search() replaces findall()[0].
    match = re.search(r'[0-9]+(?:\.[0-9]+){3}', text)
    return match.group(0) if match else ''
# Run stripper once per row; axis='columns' is the named spelling of axis=1.
df.apply(stripper, axis='columns')
# Extra positional and named arguments can be passed through apply, e.g.:
def subtract_and_divide(x, sub, divide=1):
    """Return ``(x - sub) / divide``.

    Args:
        x: Value to transform; anything that supports ``-`` and ``/``.
        sub: Amount subtracted from ``x``.
        divide: Divisor applied after the subtraction (default 1).

    Returns:
        The result of ``(x - sub) / divide``.
    """
    return (x - sub) / divide
# Usage: the `args` tuple supplies the extra positional argument (sub=5),
# and any other keyword (divide=3) is forwarded to the function as well.
df.apply(subtract_and_divide, divide=3, args=(5,))
# Compute the means by group and join them back onto every row, so the group
# mean is available for every sample (joined columns get a '_mean' suffix).
sil_means = df.groupby('labels').mean()
df = df.join(sil_means, on='labels', rsuffix='_mean')
# groupby used like a histogram to obtain counts on sub-ranges of a
# variable -- pretty handy.
df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
# Finding the distribution based on quantiles: count the rows in the bottom
# 99% and in the top 1% of ages. (The closing parenthesis was missing, and
# .size() is added to match the cut() example above.)
df.groupby(pd.qcut(df.age, [0, 0.99, 1])).size()
# If you don't need specific bins like above and just want to count the
# occurrences of each distinct value:
df.age.value_counts()
# One-liner to normalize a DataFrame: centre each column on its mean and
# scale it by its range (max - min).
(df - df.mean()) / (df.max() - df.min())
# Iterating and working with groups is easy when you realize each group is
# itself a DataFrame.
# NOTE(review): `dg` is not defined anywhere in this recipe; presumably it is
# a GroupBy object such as df.groupby('labels') -- confirm.
for name, group in dg:
    # One print() call showing the group key and the type of the group.
    print(name, type(group))
# Grouping and applying a group-specific value to each group element: every
# row is tagged with the upper quantile of the age bin it falls into.
# A shorter route, if a categorical column is acceptable and the original row
# order should be kept:
#     df['age_quantile'] = pd.qcut(df.age, quantile, labels=quantile[1:])
quantile = [0, 0.50, 0.75, 0.90, 0.95, 0.99, 1]
grouped = df.groupby(pd.qcut(df.age, quantile))
frame_list = []
# NOTE(review): quantile[i + 1] assumes the groups are yielded in bin order
# with none skipped -- confirm for your data.
for i, (label, frame) in enumerate(grouped):
    # assign() returns a new frame, so the tag is never written into a
    # slice of df (which would risk SettingWithCopy problems).
    frame_list.append(frame.assign(age_quantile=quantile[i + 1]))
df = pd.concat(frame_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment