Created
May 2, 2018 13:52
-
-
Save trcook/ea46e1941a4794775d99c3a130dfdb2c to your computer and use it in GitHub Desktop.
denoising autoencoder for data imputation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import keras as K | |
from keras.models import Sequential | |
from sklearn.preprocessing import MinMaxScaler | |
import pandas as pd | |
import numpy as np | |
import fredapi | |
# FRED client setup: fredapi reads the actual API key from the file at this
# path, so put the path to your key file here (not the key itself).
fred_key = "ADD FRED KEY FILE HERE"
fred = fredapi.fred.Fred(api_key_file=fred_key)
def get_data(series_list=None):
    """Download a set of FRED series and inner-join them on date.

    Parameters
    ----------
    series_list : list of str, optional
        FRED series IDs to fetch.  Defaults to a small housing/labor set.
        (Fix: the original used a mutable list as the default argument.)

    Returns
    -------
    pandas.DataFrame
        One column per series, restricted to dates present in every series.
    """
    if series_list is None:
        series_list = ["UNRATE", "TTLCON", "REALLN", "PERMIT", "HOUSTNSA"]
    # one single-column frame per series, joined on the shared date index
    frames = [pd.DataFrame(fred.get_series(sid), columns=[sid])
              for sid in series_list]
    return pd.concat(frames, axis=1, join='inner')
def shape_data(dat, lag, lead):
    """Append lagged and led copies of every column, dropping incomplete rows.

    For each column ``c`` of ``dat`` this adds ``c_lag_j`` (``c`` shifted
    down by ``j``, for j = 0..lag-1) and ``c_lead_k`` (``c`` shifted up by
    ``k``, for k = 0..lead-1), then drops every row that contains a NaN
    (the edge rows created by the shifting).  Note that ``*_lag_0`` and
    ``*_lead_0`` duplicate the original column.

    Bug fix: the original lead loop iterated ``k`` but used the stale lag
    index ``j`` in both the column name and the shift, so it repeatedly
    overwrote a single mislabeled column with lagged (not led) data.

    Parameters
    ----------
    dat : pandas.DataFrame
    lag, lead : int
        Number of lag/lead columns to generate per input column.

    Returns
    -------
    pandas.DataFrame
        ``dat`` plus the shifted columns, rows with any NaN removed.
    """
    dat_ = dat.copy()
    # iterate the ORIGINAL frame's columns so adding columns to dat_
    # cannot disturb the iteration
    for col in dat.columns:
        for j in range(lag):
            dat_["{}_lag_{}".format(col, j)] = dat[col].shift(j)
        for k in range(lead):
            dat_["{}_lead_{}".format(col, k)] = dat[col].shift(-k)
    # keep only rows with no missing values
    msk = dat_.isnull().sum(1) <= 0
    return dat_[msk]
def gen_mcar(dat, score=.05, **kwargs):
    """Return a copy of ``dat`` with entries knocked out completely at random.

    ``score`` is the approximate fraction of cells to blank (MCAR —
    "missing completely at random").  Works on either a pandas DataFrame
    or a numpy array; the input object itself is not modified.
    """
    n_rows, n_cols = dat.shape
    out = dat.copy()
    # same arithmetic as the original: floor-divide by 1/score
    num_missing = int((n_rows * n_cols) // (1 / score))
    print(num_missing)
    # enumerate every (row, col) cell, then sample without replacement
    cells = [(r, c) for r in range(n_rows) for c in range(n_cols)]
    picks = np.random.choice(range(len(cells)), size=num_missing, replace=False)
    is_frame = isinstance(out, pd.DataFrame)
    for idx in picks:
        r, c = cells[idx]
        if is_frame:
            out.iloc[r, c] = np.nan
        else:
            out[r, c] = np.nan
    return out
def make_scaled(dat):
    """Min-max scale each column of ``dat`` to [0, 1], column by column.

    NOTE: mutates ``dat`` in place.  Returns ``(scalers, dat)`` where
    ``scalers[i]`` is the fitted MinMaxScaler for column ``i`` — keep it
    around so ``unscale`` can invert the transformation later.
    """
    scalers = []
    for col in dat:
        fitted = MinMaxScaler()
        dat.loc[:, [col]] = fitted.fit_transform(dat.loc[:, [col]])
        scalers.append(fitted)
    return scalers, dat
def unscale(scale_list, dat):
    """Invert the column-wise scaling applied by ``make_scaled``.

    ``scale_list[i]`` must be the fitted scaler for column position ``i``
    of ``dat``.  Mutates ``dat`` in place and returns it.
    """
    pos = 0
    for fitted in scale_list:
        dat.iloc[:, [pos]] = fitted.inverse_transform(dat.iloc[:, [pos]])
        pos += 1
    return dat
# make data
# Pull the FRED series, add 3 lags and 3 leads per column, then min-max
# scale every column; scale_list keeps the fitted per-column scalers so
# the output can be unscaled later.  Requires a valid FRED API key file.
dat=get_data()
dat=shape_data(dat,3,3)
scale_list,dat=make_scaled(dat)
# make autoencoder network
# Symmetric dense autoencoder: widen by `scale` twice, bottleneck at 128,
# then narrow back down; dropout on the input simulates missing data.
scale = 2
input_ = K.layers.Input(shape=(dat.shape[1],))
D = K.layers.Dropout(.1, name='dropout')(input_)  # simulate missing data
D = K.layers.BatchNormalization()(D)
# D=K.layers.GaussianNoise(.5,name='gaussian')(D)
for i in range(2):
    D = K.layers.Dense(int(D.shape[1] * scale), activation='relu',
                       name='encoding_{}'.format(i))(D)
# fixed-width bottleneck / first decode layer (the original applied a no-op
# .format(i) to these constant names)
D = K.layers.Dense(128, activation='relu', name='encoded')(D)
D = K.layers.Dense(128, activation='relu', name='decoding')(D)
for i in range(2):
    D = K.layers.Dense(int(D.shape[1] // scale), activation='relu',
                       name='decoding_{}'.format(i))(D)
D = K.layers.Dense(dat.shape[1], activation=None, name='output')(D)

# train network
mod = K.Model(input_, D)
mod.compile('adam', loss=K.losses.mse)

# train the model
# Bug fixes: the original wrote `for i in 100:` (an int is not iterable) and
# never defined `training_dat` — the gist comments show the resulting
# NameError.  We train the denoiser to reconstruct its own (scaled) input.
training_dat = dat.values
for i in range(100):
    mod.fit(training_dat, training_dat, epochs=100, verbose=0)
    # one verbose epoch per outer iteration to show progress
    mod.fit(training_dat, training_dat, epochs=1, verbose=2)

# generate predictions:
y = mod.predict(training_dat)

# create object references to the bottleneck and output layers
encoded = mod.get_layer(name='encoded')
output = mod.get_layer(name='output')

# This lets us pass values into the fitted model and get activations from the
# encoded layer.  The learning-phase flag also lets us turn dropout on/off.
pout = K.backend.function([mod.layers[0].input, K.backend.learning_phase()],
                          [encoded.output])
# Call the function on the training data with dropout turned ON (phase=1)
layer_output = pout([training_dat, 1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@trcook
NameError: name 'training_dat' is not defined