Last active
January 30, 2025 22:49
-
-
Save BibMartin/b0219727266515fa2af059df7f75b967 to your computer and use it in GitHub Desktop.
Hack pandas.DataFrame to have unstacked JSON structure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import DataFrame, MultiIndex
def __getattribute__(self, x):
    """Look up attribute ``x``, falling back to dotted column groups.

    The regular ``DataFrame`` lookup is tried first, so real attributes and
    methods always win over a column prefix of the same name.  Only when that
    lookup raises ``AttributeError`` is ``x`` matched against the first
    ``.``-separated segment of each (string) column label.

    Args:
        x: Name of the attribute being looked up.

    Returns:
        The regular attribute when it exists.  Otherwise a ``DataFrame`` made
        of the columns labelled ``x.<rest>``, relabelled to ``<rest>``.  If
        ``x`` is a leading segment but no label starts with ``x.``, the plain
        column ``self[x]``.

    Raises:
        AttributeError: If ``x`` is neither a regular attribute nor the
            leading segment of any column label.
    """
    try:
        return _parent__getattribute__(self, x)
    except AttributeError:
        # Read the labels through the saved original lookup rather than
        # ``self.columns`` (presumably to avoid re-entering this hook).
        columns = _parent__getattribute__(self, 'columns')
        # Non-string labels cannot carry a dotted prefix, so skip them
        # instead of failing on ``.split``.
        labels = [col for col in columns if isinstance(col, str)]
        cols = sorted({col.split('.')[0] for col in labels})
        if x not in cols:
            # The attribute protocol (hasattr, getattr with a default, the
            # ``__getattr__`` fallback) only recognises AttributeError.
            raise AttributeError('{} not in {}'.format(x, cols))
        _prefix = x + '.'
        nested = [col for col in labels if col.startswith(_prefix)]
        if not nested:
            # ``x`` is an undotted column label.
            return self[x]
        # ``rename`` maps the labels; ``rename_axis`` with a callable is
        # rejected by current pandas, where it only sets the axis name.
        return self[nested].rename(columns=lambda col: col[len(_prefix):])


# Keep the original lookup, then install the hook on every DataFrame.
_parent__getattribute__ = DataFrame.__getattribute__
DataFrame.__getattribute__ = __getattribute__
def __dir__(self):
    """List attribute names, including leading segments of dotted columns.

    Returns:
        The names reported by the original ``DataFrame.__dir__`` followed by
        the first ``.``-separated segment of every string column label that
        the original listing did not already contain.
    """
    listed = list(_parent__dir__(self))
    known = set(listed)
    # Non-string labels have no ``split``; ignore them rather than crash.
    heads = {col.split('.')[0] for col in self.columns if isinstance(col, str)}
    # Skip names the parent already reports so nothing is listed twice.
    return listed + sorted(heads - known)


# Keep the original listing, then install the extended one.
_parent__dir__ = DataFrame.__dir__
DataFrame.__dir__ = __dir__
def unstack(x, prefix=""):
    """Flatten nested dicts and lists into one dict keyed by dotted paths.

    Nested dict keys are joined with ``.``; list items are keyed ``_0``,
    ``_1``, ... at their position in the path.  Empty dicts and lists
    contribute no entries, so their key disappears from the result.

    Args:
        x: A dict, a list, or any other value.
        prefix: String prepended to every key of the returned dict.

    Returns:
        A flat dict when ``x`` is a dict or list; otherwise ``x`` unchanged.
    """
    if isinstance(x, list):
        # A list is handled as a dict keyed by '_<position>'.
        x = {'_' + str(i): val for i, val in enumerate(x)}
    if not isinstance(x, dict):
        return x
    out = {}
    for key, val in x.items():
        # Coerce so that non-string keys can be joined into a path.
        key = str(key)
        z = unstack(val, prefix=key + '.')
        if isinstance(z, dict):
            # Sub-keys already start with ``key.``; add this level's prefix.
            for subkey, subval in z.items():
                out[prefix + subkey] = subval
        else:
            out[prefix + key] = z
    return out
def unwind(self):
    """Return a copy whose dotted column labels become a ``MultiIndex``.

    Each label is split once, at its first ``.``, into a tuple; the tuples are
    handed to ``MultiIndex.from_tuples``.  Labels must be strings.

    Returns:
        A new ``DataFrame`` with the same index and data and the split
        labels as columns.  The original frame is left untouched.
    """
    columns = MultiIndex.from_tuples(
        [tuple(col.split('.', 1)) for col in self.columns])
    # Copy and relabel instead of rebuilding from ``self.values``: going
    # through one array would upcast mixed-dtype columns to ``object``.
    out = self.copy()
    out.columns = columns
    return out


# Expose the helper as a DataFrame method.
DataFrame.unwind = unwind
# Example
#########
# Two nested records; 'info' is a nested dict, 'counties' a list of dicts.
data = [
    {'state': 'Florida',
     'shortname': 'FL',
     'info': {
         'governor': 'Rick Scott'
     },
     'counties': [{'name': 'Dade', 'population': 12345},
                  {'name': 'Broward', 'population': 40000},
                  {'name': 'Palm Beach', 'population': 60000}]},
    {'state': 'Ohio',
     'shortname': 'OH',
     'info': {
         'governor': 'John Kasich'
     },
     'counties': [{'name': 'Summit', 'population': 1234},
                  {'name': 'Cuyahoga', 'population': 1337}]},
]

# One row per record; nested values land in dotted columns such as
# 'counties._0.name'.
df = DataFrame([unstack(x) for x in data])

# Attribute access walks the dotted prefixes: the first county of each state.
print(df.counties._0)

# Split 'counties' labels into (position, field) levels, then stack level 0.
print(df.counties.unwind().stack(0))
<script src="https://gist.github.com/BibMartin/b0219727266515fa2af059df7f75b967.js"></script>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
from pandas import DataFrame, MultiIndex
def __getattribute__(self, x):
    """Look up attribute ``x``, falling back to dotted column groups.

    The regular ``DataFrame`` lookup runs first; only when it raises
    ``AttributeError`` is ``x`` compared with the first ``.``-separated
    segment of each string column label.

    Args:
        x: Name of the attribute being looked up.

    Returns:
        The regular attribute when it exists.  Otherwise the columns labelled
        ``x.<rest>`` as a ``DataFrame`` relabelled to ``<rest>``, or the
        plain column ``self[x]`` when no label starts with ``x.``.

    Raises:
        AttributeError: If ``x`` is neither a regular attribute nor the
            leading segment of any column label.
    """
    try:
        return _parent__getattribute__(self, x)
    except AttributeError:
        # Labels are fetched via the saved original lookup rather than
        # ``self.columns`` (presumably to avoid re-entering this hook).
        columns = _parent__getattribute__(self, 'columns')
        # Only string labels can hold a dotted prefix.
        labels = [col for col in columns if isinstance(col, str)]
        cols = sorted({col.split('.')[0] for col in labels})
        if x in cols:
            _prefix = x + '.'
            nested = [col for col in labels if col.startswith(_prefix)]
            if nested:
                # ``rename`` maps labels; ``rename_axis`` with a callable is
                # rejected by current pandas.
                return self[nested].rename(
                    columns=lambda col: col[len(_prefix):])
            return self[x]
        # AttributeError keeps hasattr/getattr-with-default working.
        raise AttributeError('{} not in {}'.format(x, cols))


# The dunder names are required: Python only consults ``__getattribute__``.
_parent__getattribute__ = DataFrame.__getattribute__
DataFrame.__getattribute__ = __getattribute__
def __dir__(self):
    """List attribute names, including leading segments of dotted columns.

    Returns:
        The names from the original ``DataFrame.__dir__`` plus the first
        ``.``-separated segment of each string column label not already
        present in that listing.
    """
    names = list(_parent__dir__(self))
    seen = set(names)
    for col in self.columns:
        # Non-string labels have no ``split``; leave them out.
        if not isinstance(col, str):
            continue
        head = col.split('.')[0]
        if head not in seen:
            seen.add(head)
            names.append(head)
    return names


# The dunder name is required: ``dir()`` only consults ``__dir__``.
_parent__dir__ = DataFrame.__dir__
DataFrame.__dir__ = __dir__
def unstack(x, prefix=""):
    """Flatten nested dicts and lists into one dict keyed by dotted paths.

    Dict keys along a path are joined with ``.``; list items take the keys
    ``_0``, ``_1``, ...  Empty dicts and lists produce no entries.

    Args:
        x: A dict, a list, or any other value.
        prefix: String prepended to every key of the returned dict.

    Returns:
        A flat dict when ``x`` is a dict or list; otherwise ``x`` unchanged.
    """
    if isinstance(x, dict):
        out = {}
        for key, val in x.items():
            # Coerce so that non-string keys can be joined into a path.
            key = str(key)
            z = unstack(val, prefix=key + '.')
            if isinstance(z, dict):
                # Sub-keys already start with ``key.``.
                out.update(
                    (prefix + subkey, subval) for subkey, subval in z.items())
            else:
                out[prefix + key] = z
        return out
    if isinstance(x, list):
        # Re-enter with the list viewed as a dict keyed by '_<position>'.
        return unstack({'_' + str(i): val for i, val in enumerate(x)},
                       prefix=prefix)
    return x
def unwind(self):
    """Return a copy whose dotted column labels become a ``MultiIndex``.

    Every label is split once at its first ``.`` and the resulting tuples are
    passed to ``MultiIndex.from_tuples``.  Labels must be strings.

    Returns:
        A new ``DataFrame`` with the same index and data, using the split
        labels as columns.  The original frame is not modified.
    """
    pairs = [tuple(col.split('.', 1)) for col in self.columns]
    # Relabel a copy rather than rebuilding from ``self.values``, which
    # would upcast mixed-dtype columns to ``object``.
    result = self.copy()
    result.columns = MultiIndex.from_tuples(pairs)
    return result


# Expose the helper as a DataFrame method.
DataFrame.unwind = unwind
# Example
#########
# Two nested records; 'info' is a nested dict, 'counties' a list of dicts.
data = [
    {'state': 'Florida',
     'shortname': 'FL',
     'info': {
         'governor': 'Rick Scott'
     },
     'counties': [{'name': 'Dade', 'population': 12345},
                  {'name': 'Broward', 'population': 40000},
                  {'name': 'Palm Beach', 'population': 60000}]},
    {'state': 'Ohio',
     'shortname': 'OH',
     'info': {
         'governor': 'John Kasich'
     },
     'counties': [{'name': 'Summit', 'population': 1234},
                  {'name': 'Cuyahoga', 'population': 1337}]},
]

# Flatten each record into one row of dotted columns.
df = DataFrame([unstack(x) for x in data])

# First county of every state, reached through the dotted prefixes.
print(df.counties._0)

# Split 'counties' labels into (position, field) levels, then stack level 0.
print(df.counties.unwind().stack(0))
Comment