Skip to content

Instantly share code, notes, and snippets.

@shiyuangu
Last active May 30, 2017 21:16
Show Gist options
  • Save shiyuangu/cec0e9d4d5939f31c7781bebc97f69e9 to your computer and use it in GitHub Desktop.
Save shiyuangu/cec0e9d4d5939f31c7781bebc97f69e9 to your computer and use it in GitHub Desktop.
data profile
import os,sys
import pandas as pd
import numpy as np
def _is_numeric(series):
    """Return True when *series* has a NumPy numeric dtype.

    ``np.issubdtype`` raises ``TypeError`` for dtypes NumPy cannot interpret
    (e.g. pandas extension dtypes); such columns are treated as non-numeric
    so they fall through to the value-count summary.
    """
    try:
        return bool(np.issubdtype(series.dtype, np.number))
    except TypeError:
        return False


def profile_dataframe(df):
    """Build a per-column profile of *df*.

    Returns a DataFrame with one row per column of *df* and the columns
    ``colname``, ``n_unique``, ``n_null`` and ``vals``.  ``vals`` is the
    string ``"(min;max)"`` for numeric columns; for every other column it is
    the string form of a dict mapping the (up to) 10 most frequent values to
    their counts.  Commas in ``vals`` are replaced by semicolons.
    """
    rows = []
    for col in df.columns:
        s = df[col]
        if _is_numeric(s):
            s_vals = "(%g,%g)" % (s.min(), s.max())
        else:
            # value_counts() is already sorted by descending frequency.
            top = s.value_counts().head(10)
            # Convert counts to plain ints so the text does not depend on
            # how the installed NumPy version prints its scalar types.
            s_vals = str({key: int(count) for key, count in top.items()})
        rows.append({
            'colname': col,
            'n_unique': s.nunique(),
            'n_null': s.isnull().sum(),
            # Commas are swapped for semicolons in the summary text.
            'vals': s_vals.replace(",", ";"),
        })
    # Passing columns= keeps the header (and its order) even when df has no
    # columns at all.
    return pd.DataFrame(rows, columns=['colname', 'n_unique', 'n_null', 'vals'])


def profile_output_name(fname):
    """Return the output path for *fname*: its extension replaced by ``_profile.csv``.

    Only the final extension is stripped, so dots elsewhere in the path
    (``./dir/file.csv``, ``data.v2.csv``) are preserved.
    """
    root, _ext = os.path.splitext(fname)
    return root + "_profile.csv"


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python profile.py csv_file")
        sys.exit(0)
    fname = sys.argv[1]
    df = pd.read_csv(fname)
    dfo = profile_dataframe(df)
    ofname = profile_output_name(fname)
    print("writing to %s" % (ofname,))
    dfo.to_csv(ofname)
    # Drop into an interactive shell with df/dfo in scope for follow-up
    # exploration.  IPython is optional: the profile is already on disk.
    try:
        from IPython import embed
    except ImportError:
        print("IPython not available; skipping interactive shell")
    else:
        embed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment