Last active
December 11, 2019 21:41
-
-
Save caiobvilar/79e4f6ad655cbc6798674a270455763f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 23:03:35 2019
@author: caiob
"""
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from worldtradingdata import WorldTradingData

# API credential for World Trading Data. Prefer supplying it through the
# WTD_API_TOKEN environment variable so the secret stays out of the source.
# NOTE(review): the literal fallback is a credential that was committed with
# this file; it is kept only so existing runs keep working and should be
# revoked and removed.
token = os.environ.get(
    'WTD_API_TOKEN',
    '6ObFwPQc1KdMsO4AcGgV0m15QV4HB9iA8Wb6M3FlesaNtL1sNxmgbSBTN4BN',
)

# Create the WTD client with the token.
wtd = WorldTradingData(token)

# Request CSV output from the history endpoint.
optional_params = {'output': 'csv'}

# Fetch Lockheed Martin (ticker 'LMT') historical prices.
# Presumably returned as CSV text, since it is later wrapped in io.StringIO.
LMT_history = wtd.history('LMT', optional_params)
# Parse the CSV text returned by the API into a DataFrame.
df_hist = pd.read_csv(io.StringIO(LMT_history))

# Convert the Date column to datetimes and use it as the index. The column
# itself is kept because the date masks filter on df_hist['Date'].
df_hist['Date'] = pd.to_datetime(df_hist.Date, format='%Y-%m-%d')
df_hist.index = df_hist['Date']

# sort_index returns a new frame, so df_hist must be rebound. Otherwise every
# later slice and the shuffle=False train/test splits keep whatever row order
# the API delivered instead of chronological order.
df_hist = df_hist.sort_index(ascending=True, axis=0)
# Old name kept as an alias for anything that still refers to it.
df_hist_index = df_hist

# Print column dtypes and non-null counts as a quick sanity check.
df_hist.info()

# Select the study window: dates after 2016-01-01 up to and including
# 2019-12-10 (this spans 2016 through 2019, not three years).
mask = (df_hist['Date'] > '2016-1-1') & (df_hist['Date'] <= '2019-12-10')
df_threeyear = df_hist.loc[mask]
def _plot_open(frame, label):
    """Plot the 'Open' column of *frame* on a new 20x8 figure with a legend."""
    plt.figure(figsize=(20, 8))
    plt.plot(frame['Open'], label=label)
    # Without a legend the label passed to plot() is never displayed.
    plt.legend()


def _year_mask(frame, year):
    """Return a boolean mask for rows dated after Jan 1 through Dec 31 of *year*."""
    # The mask is built from the frame it will filter, so the boolean indexer
    # shares its index with the rows being selected.
    start = '{}-1-1'.format(year)
    end = '{}-12-31'.format(year)
    return (frame['Date'] > start) & (frame['Date'] <= end)


# Opening price over the whole study window.
_plot_open(df_threeyear, 'Preço Abertura 3 anos')

## Creating yearly datasets (each one is a slice of df_threeyear, so a year is
## also limited by that frame's own date bounds).
### 2016
mask_2016 = _year_mask(df_threeyear, 2016)
df_hist_2016 = df_threeyear.loc[mask_2016]
_plot_open(df_hist_2016, 'Preço Abertura 2016')
### 2017
mask_2017 = _year_mask(df_threeyear, 2017)
df_hist_2017 = df_threeyear.loc[mask_2017]
_plot_open(df_hist_2017, 'Preço Abertura 2017')
### 2018
mask_2018 = _year_mask(df_threeyear, 2018)
df_hist_2018 = df_threeyear.loc[mask_2018]
_plot_open(df_hist_2018, 'Preço Abertura 2018')
### 2019
mask_2019 = _year_mask(df_threeyear, 2019)
df_hist_2019 = df_threeyear.loc[mask_2019]
_plot_open(df_hist_2019, 'Preço Abertura 2019')
## Creating feature/target sets and train/test subsets for each year
def _features_and_target(frame):
    """Return (features, target) for one year's frame.

    The target is the column at position 1 (presumably 'Open' — TODO confirm
    against the CSV column order). Features start at position 2 so the target
    is not also handed to the model as an input, which would leak the label.
    Position 0 is skipped as before (presumably the 'Date' column).
    """
    return frame.iloc[:, 2:], frame.iloc[:, 1]


X_2016, y_2016 = _features_and_target(df_hist_2016)
X_2017, y_2017 = _features_and_target(df_hist_2017)
X_2018, y_2018 = _features_and_target(df_hist_2018)
X_2019, y_2019 = _features_and_target(df_hist_2019)

# Hold out 20% of each year's rows. shuffle=False keeps the frame's row order,
# so the test set is the trailing block of rows rather than a random sample.
X_train_2016, X_test_2016, y_train_2016, y_test_2016 = train_test_split(
    X_2016, y_2016, test_size=0.2, shuffle=False)
X_train_2017, X_test_2017, y_train_2017, y_test_2017 = train_test_split(
    X_2017, y_2017, test_size=0.2, shuffle=False)
X_train_2018, X_test_2018, y_train_2018, y_test_2018 = train_test_split(
    X_2018, y_2018, test_size=0.2, shuffle=False)
X_train_2019, X_test_2019, y_train_2019, y_test_2019 = train_test_split(
    X_2019, y_2019, test_size=0.2, shuffle=False)
# XGBoost's native training API works on DMatrix objects, so each year's
# train and test splits are wrapped in one.
def _dmatrix_pair(train_X, train_y, test_X, test_y):
    """Return (train DMatrix, test DMatrix) built from one year's splits."""
    train_dm = xgb.DMatrix(data=train_X, label=train_y)
    test_dm = xgb.DMatrix(data=test_X, label=test_y)
    return train_dm, test_dm


# 2016
DM_train_2016, DM_test_2016 = _dmatrix_pair(
    X_train_2016, y_train_2016, X_test_2016, y_test_2016)
# 2017
DM_train_2017, DM_test_2017 = _dmatrix_pair(
    X_train_2017, y_train_2017, X_test_2017, y_test_2017)
# 2018
DM_train_2018, DM_test_2018 = _dmatrix_pair(
    X_train_2018, y_train_2018, X_test_2018, y_test_2018)
# 2019
DM_train_2019, DM_test_2019 = _dmatrix_pair(
    X_train_2019, y_train_2019, X_test_2019, y_test_2019)
# Booster settings shared by every yearly model: a linear booster trained on
# the squared-error regression objective.
params = dict(booster="gblinear", objective="reg:squarederror")

# Train one model per year, 10 boosting rounds each, in chronological order.
xg_reg_2016, xg_reg_2017, xg_reg_2018, xg_reg_2019 = [
    xgb.train(params=params, dtrain=yearly_train, num_boost_round=10)
    for yearly_train in (DM_train_2016, DM_train_2017, DM_train_2018, DM_train_2019)
]
# Score each year's held-out test DMatrix with the model trained on that year.
_models_and_tests = (
    (xg_reg_2016, DM_test_2016),
    (xg_reg_2017, DM_test_2017),
    (xg_reg_2018, DM_test_2018),
    (xg_reg_2019, DM_test_2019),
)
preds_2016, preds_2017, preds_2018, preds_2019 = [
    booster.predict(held_out) for booster, held_out in _models_and_tests
]
def _rmse(actual, predicted):
    """Return the root of the mean squared error between *actual* and *predicted*."""
    return np.sqrt(mean_squared_error(actual, predicted))


# RMSE of each yearly model on its own test set.
rmse_2016 = _rmse(y_test_2016, preds_2016)
rmse_2017 = _rmse(y_test_2017, preds_2017)
rmse_2018 = _rmse(y_test_2018, preds_2018)
rmse_2019 = _rmse(y_test_2019, preds_2019)
def _plot_open_vs_preds(year_frame, train_target, test_preds):
    """Plot actual 'Open' against the predicted series on a fresh figure.

    The 'Preds' series is the training targets followed by the test-set
    predictions, so it matches 'Open' over the training rows and shows the
    model output over the held-out rows.

    Returns:
        (frame, combined): the Date/Open/Preds frame that was plotted and the
        concatenated array stored in its 'Preds' column.
    """
    overlay = pd.DataFrame(year_frame, columns=['Date', 'Open'])
    combined = np.append(train_target, test_preds)
    overlay['Preds'] = combined
    # Start a new figure; otherwise every year's overlay is drawn on top of
    # whichever figure happened to be current.
    plt.figure(figsize=(20, 8))
    lines = plt.plot(overlay[['Open', 'Preds']])
    plt.legend(lines, ['Open', 'Preds'])
    return overlay, combined


# Plot each year's actual opening price with the test-set predictions in
# place of the real data for the held-out rows.
## 2016
new_data_2016, aux_preds_2016 = _plot_open_vs_preds(df_hist_2016, y_train_2016, preds_2016)
## 2017
new_data_2017, aux_preds_2017 = _plot_open_vs_preds(df_hist_2017, y_train_2017, preds_2017)
## 2018
new_data_2018, aux_preds_2018 = _plot_open_vs_preds(df_hist_2018, y_train_2018, preds_2018)
## 2019
new_data_2019, aux_preds_2019 = _plot_open_vs_preds(df_hist_2019, y_train_2019, preds_2019)

# Display all figures when run as a plain script.
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment