Skip to content

Instantly share code, notes, and snippets.

@caiobvilar
Last active December 11, 2019 21:41
Show Gist options
  • Save caiobvilar/79e4f6ad655cbc6798674a270455763f to your computer and use it in GitHub Desktop.
Save caiobvilar/79e4f6ad655cbc6798674a270455763f to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 23:03:35 2019
@author: caiob
"""
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
# --- Download the raw Lockheed-Martin price history -------------------------
import os

from worldtradingdata import WorldTradingData

# SECURITY NOTE(review): an API credential is committed in this file. Supply
# it through the WTD_API_TOKEN environment variable instead; the embedded
# value is kept only as a fallback so the script still runs unchanged when the
# variable is unset. The embedded token should be revoked and then removed.
token = (
    os.environ.get('WTD_API_TOKEN')
    or '6ObFwPQc1KdMsO4AcGgV0m15QV4HB9iA8Wb6M3FlesaNtL1sNxmgbSBTN4BN'
)
# Creating WTD object with token
wtd = WorldTradingData(token)
# Request the response as CSV text
optional_params = {'output': 'csv'}
# Getting Lockheed-Martin (ticker LMT) historical prices as csv
LMT_history = wtd.history('LMT', optional_params)
# Transforming into a Pandas DataFrame (the CSV text is wrapped in a
# file-like object because read_csv expects a path or a buffer)
df_hist = pd.read_csv(io.StringIO(LMT_history))
# Analyzing DataFrame data structure and types
# Parse the 'Date' column into datetimes and use it as the index
df_hist['Date'] = pd.to_datetime(df_hist.Date, format='%Y-%m-%d')
df_hist.index = df_hist['Date']
# Sort chronologically and bind the result back to df_hist. sort_index
# returns a new frame, and every later step reads df_hist; the per-year
# train/test splits use shuffle=False, so row order decides which rows end up
# in the held-out tail.
df_hist = df_hist.sort_index(ascending=True, axis=0)
# Previous name of the sorted frame, kept so existing references still work
df_hist_index = df_hist
# Print a summary of the columns (dtypes, non-null counts, memory usage)
df_hist.info()
# Extracting the analysis window from the DataFrame.
# NOTE: despite the "three year" naming, the window covers dates strictly
# after 2016-01-01 up to and including 2019-12-10.
# Create a mask
mask = (df_hist['Date'] > '2016-1-1') & (df_hist['Date'] <= '2019-12-10')
# Creating new DataFrame
df_threeyear = df_hist.loc[mask]
# Plotting the dataset for Opening Price
plt.figure(figsize=(20, 8))
plt.plot(df_threeyear['Open'], label='Preço Abertura 3 anos')
# The label passed to plot() is only rendered once a legend is drawn
plt.legend()
## Creating yearly datasets
def _year_mask(year):
    """Return a boolean mask over df_hist selecting the rows of *year*.

    The lower bound is exclusive (dates strictly after January 1st) and the
    upper bound is inclusive (dates up to December 31st).
    """
    after_start = df_hist['Date'] > '{}-1-1'.format(year)
    up_to_end = df_hist['Date'] <= '{}-12-31'.format(year)
    return after_start & up_to_end


def _plot_open(year_frame, label):
    """Plot the 'Open' column of *year_frame* on a new 20x8 figure."""
    plt.figure(figsize=(20, 8))
    plt.plot(year_frame['Open'], label=label)
    # Without a legend the label passed to plot() is never displayed
    plt.legend()


# The masks are built on df_hist but applied to df_threeyear; this relies on
# .loc aligning the boolean Series to df_threeyear's index by label.
###2016
mask_2016 = _year_mask(2016)
df_hist_2016 = df_threeyear.loc[mask_2016]
# Plotting 2016's opening price graph
_plot_open(df_hist_2016, 'Preço Abertura 2016')
###2017
mask_2017 = _year_mask(2017)
df_hist_2017 = df_threeyear.loc[mask_2017]
# Plotting 2017's opening price graph
_plot_open(df_hist_2017, 'Preço Abertura 2017')
###2018
mask_2018 = _year_mask(2018)
df_hist_2018 = df_threeyear.loc[mask_2018]
# Plotting 2018's opening price graph
_plot_open(df_hist_2018, 'Preço Abertura 2018')
###2019
mask_2019 = _year_mask(2019)
df_hist_2019 = df_threeyear.loc[mask_2019]
# Plotting 2019's opening price graph
_plot_open(df_hist_2019, 'Preço Abertura 2019')
## Creating the feature matrix and target vector for each year
def _features_and_target(year_frame):
    """Split one year's frame into ``(features, target)``.

    The target is the 'Open' column. The features are every remaining column
    except 'Date'. The target is deliberately left out of the features: a
    model that is handed the value it must predict only learns to copy it,
    and its test error says nothing about predictive power.

    NOTE(review): selecting by name assumes 'Date' and 'Open' are the first
    two columns that the earlier positional slicing referred to - confirm
    against the CSV header.
    TODO(review): the remaining columns are presumably same-day values; verify
    that they would really be known at prediction time.
    """
    features = year_frame.drop(columns=['Date', 'Open'])
    target = year_frame['Open']
    return features, target


X_2016, y_2016 = _features_and_target(df_hist_2016)
X_2017, y_2017 = _features_and_target(df_hist_2017)
X_2018, y_2018 = _features_and_target(df_hist_2018)
X_2019, y_2019 = _features_and_target(df_hist_2019)
#
def _ordered_split(features, target):
    """Split without shuffling: first 80% of rows train, last 20% test."""
    return train_test_split(features, target, test_size=0.2, shuffle=False)


# One unshuffled 80/20 split per year
(X_train_2016, X_test_2016,
 y_train_2016, y_test_2016) = _ordered_split(X_2016, y_2016)
(X_train_2017, X_test_2017,
 y_train_2017, y_test_2017) = _ordered_split(X_2017, y_2017)
(X_train_2018, X_test_2018,
 y_train_2018, y_test_2018) = _ordered_split(X_2018, y_2018)
(X_train_2019, X_test_2019,
 y_train_2019, y_test_2019) = _ordered_split(X_2019, y_2019)
# Wrap every yearly train/test subset in an XGBoost DMatrix
def _as_dmatrix(features, target):
    """Build an xgb.DMatrix with *features* as data and *target* as label."""
    return xgb.DMatrix(data=features, label=target)


# 2016 set
DM_train_2016 = _as_dmatrix(X_train_2016, y_train_2016)
DM_test_2016 = _as_dmatrix(X_test_2016, y_test_2016)
# 2017 set
DM_train_2017 = _as_dmatrix(X_train_2017, y_train_2017)
DM_test_2017 = _as_dmatrix(X_test_2017, y_test_2017)
# 2018 set
DM_train_2018 = _as_dmatrix(X_train_2018, y_train_2018)
DM_test_2018 = _as_dmatrix(X_test_2018, y_test_2018)
# 2019 set
DM_train_2019 = _as_dmatrix(X_train_2019, y_train_2019)
DM_test_2019 = _as_dmatrix(X_test_2019, y_test_2019)
# Booster configuration shared by all yearly models:
# linear booster with a squared-error regression objective
params = dict(booster="gblinear", objective="reg:squarederror")


def _fit(train_matrix):
    """Train one booster on *train_matrix* for 10 boosting rounds."""
    return xgb.train(params=params, dtrain=train_matrix, num_boost_round=10)


# One independent model per year, each fitted on that year's training rows
xg_reg_2016 = _fit(DM_train_2016)
xg_reg_2017 = _fit(DM_train_2017)
xg_reg_2018 = _fit(DM_train_2018)
xg_reg_2019 = _fit(DM_train_2019)
# Score every yearly model on its own test set
def _predict_and_score(booster, holdout, truth):
    """Return ``(predictions, rmse)`` of *booster* on the *holdout* DMatrix.

    The RMSE is the square root of the mean squared error between *truth*
    and the predictions.
    """
    forecast = booster.predict(holdout)
    mse = mean_squared_error(truth, forecast)
    return forecast, np.sqrt(mse)


## 2016
preds_2016, rmse_2016 = _predict_and_score(xg_reg_2016, DM_test_2016, y_test_2016)
## 2017
preds_2017, rmse_2017 = _predict_and_score(xg_reg_2017, DM_test_2017, y_test_2017)
## 2018
preds_2018, rmse_2018 = _predict_and_score(xg_reg_2018, DM_test_2018, y_test_2018)
## 2019
preds_2019, rmse_2019 = _predict_and_score(xg_reg_2019, DM_test_2019, y_test_2019)
# Plot, for each year, the real opening price against a series in which the
# test-period values are replaced by the model's predictions.
def _plot_predictions(year_frame, train_target, test_preds):
    """Plot 'Open' against the stitched prediction series for one year.

    The 'Preds' series is the training targets followed by the test-set
    predictions, so it differs from 'Open' only over the test period.

    Returns ``(frame, stitched)``: the plotted frame with columns 'Date',
    'Open' and 'Preds', and the stitched array itself.
    """
    frame = pd.DataFrame(year_frame, columns=['Date', 'Open'])
    stitched = np.append(train_target, test_preds)
    frame['Preds'] = stitched
    # Start a fresh figure; otherwise every year is drawn on top of whatever
    # figure happens to be current.
    plt.figure(figsize=(20, 8))
    plt.plot(frame[['Open', 'Preds']])
    # Lines are created in column order, so the labels follow the same order
    plt.legend(['Open', 'Preds'])
    return frame, stitched


## 2016
new_data_2016, aux_preds_2016 = _plot_predictions(df_hist_2016, y_train_2016, preds_2016)
## 2017
new_data_2017, aux_preds_2017 = _plot_predictions(df_hist_2017, y_train_2017, preds_2017)
## 2018
new_data_2018, aux_preds_2018 = _plot_predictions(df_hist_2018, y_train_2018, preds_2018)
## 2019
new_data_2019, aux_preds_2019 = _plot_predictions(df_hist_2019, y_train_2019, preds_2019)
# Display all figures when the file is run as a plain script
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment