@trcook
Created May 2, 2018 13:52
denoising autoencoder for data imputation
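# The idea: the Dropout layer randomly zeroes a fraction of the inputs during
# training, so the network learns to reconstruct complete rows from partially
# observed ones. Rows with genuinely missing cells can then be filled with a
# placeholder value and passed through the trained model to get imputed values.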
import keras as K
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import fredapi

fred_key = "ADD FRED KEY FILE HERE"  # path to a file containing your FRED API key
fred = fredapi.Fred(api_key_file=fred_key)
def get_data(series_list=["UNRATE", "TTLCON", "REALLN", "PERMIT", "HOUSTNSA"]):
    # pull each FRED series and inner-join them on their shared date index
    m = []
    for i in series_list:
        x = pd.DataFrame(fred.get_series(i), columns=[i])
        m.append(x)
    dat = pd.concat(m, axis=1, join='inner')
    return dat
def shape_data(dat, lag, lead):
    # widen the frame with lagged and led copies of each series,
    # then drop the edge rows left incomplete by the shifts
    dat_ = dat.copy()
    for i in dat.columns:
        for j in range(lag):
            dat_.loc[:, "{}_lag_{}".format(i, j)] = dat.loc[:, [i]].shift(j)
        for k in range(lead):
            dat_.loc[:, "{}_lead_{}".format(i, k)] = dat.loc[:, [i]].shift(-1 * k)
    msk = dat_.isnull().sum(1) <= 0
    dat_ = dat_[msk]
    return dat_
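# For reference: with lag=3 and lead=3, each original column gains six shifted
# companions (e.g. UNRATE_lag_0..UNRATE_lag_2 and UNRATE_lead_0..UNRATE_lead_2,
# where the _0 copies duplicate the original column).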
def gen_mcar(dat, score=.05, **kwargs):
    # knock out a random `score` fraction of cells, missing completely at random (MCAR)
    n, m = dat.shape
    _dat = dat.copy()
    num_missing = int(n * m * score)
    print(num_missing)
    d = [(i, j) for i in range(n) for j in range(m)]
    _ = np.random.choice(range(len(d)), size=num_missing, replace=False)
    missing = [d[i] for i in _]
    for i in missing:
        if isinstance(_dat, pd.DataFrame):
            _dat.iloc[i[0], i[1]] = np.nan
        else:
            _dat[i[0], i[1]] = np.nan
    return _dat
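# gen_mcar is defined but never called below. A hypothetical benchmarking use,
# once the model is trained: corrupt clean data, impute, and score against the
# truth (variable names here are illustrative, not part of the gist):
# corrupted = gen_mcar(dat, score=.05)
# recovered = mod.predict(corrupted.fillna(0).values)
# rmse = np.sqrt(((recovered - dat.values) ** 2).mean())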
def make_scaled(dat):
    # min-max scale each column to [0, 1], keeping the fitted scalers for later inversion
    scale_list = []
    for i in dat:
        scaler = MinMaxScaler()
        dat.loc[:, [i]] = scaler.fit_transform(dat.loc[:, [i]])
        scale_list.append(scaler)
    return scale_list, dat
def unscale(scale_list, dat):
    # invert the column-wise scaling using the scalers from make_scaled
    for i, iscaler in enumerate(scale_list):
        dat.iloc[:, [i]] = iscaler.inverse_transform(dat.iloc[:, [i]])
    return dat
# make data
dat = get_data()
dat = shape_data(dat, 3, 3)
scale_list, dat = make_scaled(dat)
training_dat = dat.values  # plain array the autoencoder trains on
# make autoencoder network
scale = 2
input_ = K.layers.Input(shape=(dat.shape[1],))
D = K.layers.Dropout(.1, name='dropout')(input_)  # randomly zero inputs to simulate missing data
D = K.layers.BatchNormalization()(D)
# D = K.layers.GaussianNoise(.5, name='gaussian')(D)  # alternative corruption scheme
for i in range(2):
    D = K.layers.Dense(int(D.shape[1]) * scale, activation='relu', name='encoding_{}'.format(i))(D)
D = K.layers.Dense(128, activation='relu', name='encoded')(D)
D = K.layers.Dense(128, activation='relu', name='decoding')(D)
for i in range(2):
    D = K.layers.Dense(int(D.shape[1]) // scale, activation='relu', name='decoding_{}'.format(i))(D)
D = K.layers.Dense(dat.shape[1], activation=None, name='output')(D)
# train network
mod = K.Model(input_, D)
mod.compile('adam', loss=K.losses.mse)
# train in rounds: 100 silent epochs, then one verbose epoch so progress prints periodically
for i in range(100):
    mod.fit(training_dat, training_dat, epochs=100, verbose=0)
    mod.fit(training_dat, training_dat, epochs=1, verbose=2)
# generate predictions:
y = mod.predict(training_dat)
# grab references to the encoded and output layers
encoded = mod.get_layer(name='encoded')
output = mod.get_layer(name='output')
# This backend function passes values through the fitted model and returns the
# activations of the encoded layer; the learning-phase flag turns dropout on or off.
pout = K.backend.function([mod.layers[0].input, K.backend.learning_phase()],
                          [encoded.output])
# Call it on the training data with dropout turned on (learning phase = 1):
layer_output = pout([training_dat, 1])
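# The gist stops short of the imputation step itself. A minimal sketch, assuming
# missing cells are first filled with column means before the forward pass
# (names below are illustrative, not part of the original):
filled = dat.fillna(dat.mean())  # dat has no gaps here; real data with NaNs would
imputed = pd.DataFrame(mod.predict(filled.values),
                       index=dat.index, columns=dat.columns)
imputed = unscale(scale_list, imputed)  # map predictions back to original units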
marviwaheed commented May 9, 2019

@trcook

NameError: name 'training_dat' is not defined
