caiobvilar · December 11, 2019 21:41
diff --git a/full_training.py b/full_training.py
 # -*- coding: utf-8 -*-
 """
 Created on Mon Dec  9 23:03:35 2019

 @author: caiob
 """
 import pandas as pd
 import numpy as np
 import io
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error
 import xgboost as xgb
 import matplotlib.pyplot as plt
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import cross_val_score,RandomizedSearchCV
 import os

 from worldtradingdata import WorldTradingData

 # World Trading Data API token. The WTD_API_TOKEN environment variable takes
 # precedence so the credential can be supplied without editing the script.
 # NOTE(review): the fallback literal below is a credential committed to the
 # source; it should be rotated, and the fallback removed once every run sets
 # WTD_API_TOKEN.
 token = (
     os.environ.get('WTD_API_TOKEN')
     or '6ObFwPQc1KdMsO4AcGgV0m15QV4HB9iA8Wb6M3FlesaNtL1sNxmgbSBTN4BN'
 )

 # Create the WTD client with the token
 wtd = WorldTradingData(token)
 # Ask the API for CSV output
 optional_params = {'output': 'csv'}

 # Download the Lockheed Martin (LMT) price history as CSV text
 LMT_history = wtd.history('LMT', optional_params)
 # Parse the CSV text into a pandas DataFrame
 df_hist = pd.read_csv(io.StringIO(LMT_history))
 # Parse the Date column and use it as the index
 df_hist['Date'] = pd.to_datetime(df_hist.Date, format='%Y-%m-%d')
 df_hist.index = df_hist['Date']
 # sort_index returns a new frame, so rebind df_hist: everything derived from
 # it below (yearly subsets, the unshuffled train/test splits, the plots) must
 # be in chronological order. df_hist_index is kept as an alias of the sorted
 # frame for anything that still refers to it.
 df_hist = df_hist.sort_index(ascending=True, axis=0)
 df_hist_index = df_hist
 # Print a summary of the frame's structure and column types
 df_hist.info()
 # Select rows dated after 2016-01-01, up to and including 2019-12-10
 mask = (df_hist['Date'] > '2016-1-1') & (df_hist['Date'] <= '2019-12-10')
 df_threeyear = df_hist.loc[mask]
 # Plot the opening price over the selected period
 plt.figure(figsize=(20, 8))
 plt.plot(df_threeyear['Open'], label='Preço Abertura 3 anos')
 ## Building the yearly datasets
 def _rows_between(after, until):
     """Return a boolean mask over df_hist for after < Date <= until."""
     dates = df_hist['Date']
     return (dates > after) & (dates <= until)


 def _plot_open(frame, label):
     """Draw the Open column of frame on a new 20x8 figure."""
     plt.figure(figsize=(20, 8))
     plt.plot(frame['Open'], label=label)


 ### 2016
 mask_2016 = _rows_between('2016-1-1', '2016-12-31')
 df_hist_2016 = df_threeyear.loc[mask_2016]
 _plot_open(df_hist_2016, 'Preço Abertura 2016')
 ### 2017
 mask_2017 = _rows_between('2017-1-1', '2017-12-31')
 df_hist_2017 = df_threeyear.loc[mask_2017]
 _plot_open(df_hist_2017, 'Preço Abertura 2017')
 ### 2018
 mask_2018 = _rows_between('2018-1-1', '2018-12-31')
 df_hist_2018 = df_threeyear.loc[mask_2018]
 _plot_open(df_hist_2018, 'Preço Abertura 2018')
 ### 2019
 mask_2019 = _rows_between('2019-1-1', '2019-12-31')
 df_hist_2019 = df_threeyear.loc[mask_2019]
 _plot_open(df_hist_2019, 'Preço Abertura 2019')
 ## Creating training and testing subsets for each year
 def _features_and_target(frame):
     """Split a yearly frame into (features, target).

     The target is the column at position 1. The features are the columns
     after it: column 0 is skipped as before, and column 1 is skipped as well
     so that the value being predicted is not also one of the model inputs.
     """
     return frame.iloc[:, 2:], frame.iloc[:, 1]


 X_2016, y_2016 = _features_and_target(df_hist_2016)
 X_2017, y_2017 = _features_and_target(df_hist_2017)
 X_2018, y_2018 = _features_and_target(df_hist_2018)
 X_2019, y_2019 = _features_and_target(df_hist_2019)
 # shuffle=False keeps row order: the leading 80% of rows train, the rest test
 X_train_2016, X_test_2016, y_train_2016, y_test_2016 = train_test_split(
     X_2016, y_2016, test_size=0.2, shuffle=False)
 X_train_2017, X_test_2017, y_train_2017, y_test_2017 = train_test_split(
     X_2017, y_2017, test_size=0.2, shuffle=False)
 X_train_2018, X_test_2018, y_train_2018, y_test_2018 = train_test_split(
     X_2018, y_2018, test_size=0.2, shuffle=False)
 X_train_2019, X_test_2019, y_train_2019, y_test_2019 = train_test_split(
     X_2019, y_2019, test_size=0.2, shuffle=False)
 # Wrap each year's train/test split in XGBoost DMatrix objects
 def _dmatrix_pair(train_X, train_y, test_X, test_y):
     """Return (train, test) DMatrix objects built from the given splits."""
     train = xgb.DMatrix(data=train_X, label=train_y)
     test = xgb.DMatrix(data=test_X, label=test_y)
     return train, test


 # 2016
 DM_train_2016, DM_test_2016 = _dmatrix_pair(
     X_train_2016, y_train_2016, X_test_2016, y_test_2016)
 # 2017
 DM_train_2017, DM_test_2017 = _dmatrix_pair(
     X_train_2017, y_train_2017, X_test_2017, y_test_2017)
 # 2018
 DM_train_2018, DM_test_2018 = _dmatrix_pair(
     X_train_2018, y_train_2018, X_test_2018, y_test_2018)
 # 2019
 DM_train_2019, DM_test_2019 = _dmatrix_pair(
     X_train_2019, y_train_2019, X_test_2019, y_test_2019)
 # Booster configuration shared by every yearly model
 params = dict(booster="gblinear", objective="reg:squarederror")


 def _fit(dtrain):
     """Train a booster on dtrain with the shared params and 10 rounds."""
     return xgb.train(params=params, dtrain=dtrain, num_boost_round=10)


 # One model per year, each trained on that year's training DMatrix
 xg_reg_2016 = _fit(DM_train_2016)
 xg_reg_2017 = _fit(DM_train_2017)
 xg_reg_2018 = _fit(DM_train_2018)
 xg_reg_2019 = _fit(DM_train_2019)
 # Score each yearly model on its own test set
 def _predict_and_score(model, dtest, actual):
     """Return (predictions, RMSE) for model on dtest against actual."""
     predicted = model.predict(dtest)
     error = np.sqrt(mean_squared_error(actual, predicted))
     return predicted, error


 ## 2016
 preds_2016, rmse_2016 = _predict_and_score(
     xg_reg_2016, DM_test_2016, y_test_2016)
 ## 2017
 preds_2017, rmse_2017 = _predict_and_score(
     xg_reg_2017, DM_test_2017, y_test_2017)
 ## 2018
 preds_2018, rmse_2018 = _predict_and_score(
     xg_reg_2018, DM_test_2018, y_test_2018)
 ## 2019
 preds_2019, rmse_2019 = _predict_and_score(
     xg_reg_2019, DM_test_2019, y_test_2019)
 # For each year, plot the actual opening price against a series made of the
 # training targets followed by the model's test-set predictions.
 def _plot_with_predictions(frame, train_target, predictions):
     """Plot Open and Preds for one year on a figure of its own.

     Returns (plot_frame, combined): plot_frame holds the Date and Open
     columns of frame plus a Preds column; combined is train_target followed
     by predictions, which is what the Preds column is filled with.
     """
     plot_frame = pd.DataFrame(frame, columns=['Date', 'Open'])
     combined = np.append(train_target, predictions)
     plot_frame['Preds'] = combined
     # Open a fresh figure: otherwise every year is drawn onto whichever
     # figure happens to be current, on top of what is already there.
     plt.figure(figsize=(20, 8))
     plt.plot(plot_frame[['Open', 'Preds']])
     plt.legend(['Open', 'Preds'])
     return plot_frame, combined


 ## 2016
 new_data_2016, aux_preds_2016 = _plot_with_predictions(
     df_hist_2016, y_train_2016, preds_2016)
 ## 2017
 new_data_2017, aux_preds_2017 = _plot_with_predictions(
     df_hist_2017, y_train_2017, preds_2017)
 ## 2018
 new_data_2018, aux_preds_2018 = _plot_with_predictions(
     df_hist_2018, y_train_2018, preds_2018)
 ## 2019
 new_data_2019, aux_preds_2019 = _plot_with_predictions(
     df_hist_2019, y_train_2019, preds_2019)
	# -*- coding: utf-8 -*-
	"""
	Created on Mon Dec 9 23:03:35 2019

	@author: caiob
	"""
	import pandas as pd
	import numpy as np
	import io
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_squared_error
	import xgboost as xgb
	import matplotlib.pyplot as plt
	from sklearn.preprocessing import StandardScaler
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import cross_val_score,RandomizedSearchCV
	import os

	from worldtradingdata import WorldTradingData

	# World Trading Data API token. The WTD_API_TOKEN environment variable takes
	# precedence so the credential can be supplied without editing the script.
	# NOTE(review): the fallback literal below is a credential committed to the
	# source; it should be rotated, and the fallback removed once every run sets
	# WTD_API_TOKEN.
	token = (
	    os.environ.get('WTD_API_TOKEN')
	    or '6ObFwPQc1KdMsO4AcGgV0m15QV4HB9iA8Wb6M3FlesaNtL1sNxmgbSBTN4BN'
	)

	# Create the WTD client with the token
	wtd = WorldTradingData(token)
	# Ask the API for CSV output
	optional_params = {'output': 'csv'}

	# Download the Lockheed Martin (LMT) price history as CSV text
	LMT_history = wtd.history('LMT', optional_params)
	# Parse the CSV text into a pandas DataFrame
	df_hist = pd.read_csv(io.StringIO(LMT_history))
	# Parse the Date column and use it as the index
	df_hist['Date'] = pd.to_datetime(df_hist.Date, format='%Y-%m-%d')
	df_hist.index = df_hist['Date']
	# sort_index returns a new frame, so rebind df_hist: everything derived from
	# it below (yearly subsets, the unshuffled train/test splits, the plots) must
	# be in chronological order. df_hist_index is kept as an alias of the sorted
	# frame for anything that still refers to it.
	df_hist = df_hist.sort_index(ascending=True, axis=0)
	df_hist_index = df_hist
	# Print a summary of the frame's structure and column types
	df_hist.info()
	# Select rows dated after 2016-01-01, up to and including 2019-12-10
	mask = (df_hist['Date'] > '2016-1-1') & (df_hist['Date'] <= '2019-12-10')
	df_threeyear = df_hist.loc[mask]
	# Plot the opening price over the selected period
	plt.figure(figsize=(20, 8))
	plt.plot(df_threeyear['Open'], label='Preço Abertura 3 anos')
	## Building the yearly datasets
	def _rows_between(after, until):
	    """Return a boolean mask over df_hist for after < Date <= until."""
	    dates = df_hist['Date']
	    return (dates > after) & (dates <= until)


	def _plot_open(frame, label):
	    """Draw the Open column of frame on a new 20x8 figure."""
	    plt.figure(figsize=(20, 8))
	    plt.plot(frame['Open'], label=label)


	### 2016
	mask_2016 = _rows_between('2016-1-1', '2016-12-31')
	df_hist_2016 = df_threeyear.loc[mask_2016]
	_plot_open(df_hist_2016, 'Preço Abertura 2016')
	### 2017
	mask_2017 = _rows_between('2017-1-1', '2017-12-31')
	df_hist_2017 = df_threeyear.loc[mask_2017]
	_plot_open(df_hist_2017, 'Preço Abertura 2017')
	### 2018
	mask_2018 = _rows_between('2018-1-1', '2018-12-31')
	df_hist_2018 = df_threeyear.loc[mask_2018]
	_plot_open(df_hist_2018, 'Preço Abertura 2018')
	### 2019
	mask_2019 = _rows_between('2019-1-1', '2019-12-31')
	df_hist_2019 = df_threeyear.loc[mask_2019]
	_plot_open(df_hist_2019, 'Preço Abertura 2019')
	## Creating training and testing subsets for each year
	def _features_and_target(frame):
	    """Split a yearly frame into (features, target).

	    The target is the column at position 1. The features are the columns
	    after it: column 0 is skipped as before, and column 1 is skipped as well
	    so that the value being predicted is not also one of the model inputs.
	    """
	    return frame.iloc[:, 2:], frame.iloc[:, 1]


	X_2016, y_2016 = _features_and_target(df_hist_2016)
	X_2017, y_2017 = _features_and_target(df_hist_2017)
	X_2018, y_2018 = _features_and_target(df_hist_2018)
	X_2019, y_2019 = _features_and_target(df_hist_2019)
	# shuffle=False keeps row order: the leading 80% of rows train, the rest test
	X_train_2016, X_test_2016, y_train_2016, y_test_2016 = train_test_split(
	    X_2016, y_2016, test_size=0.2, shuffle=False)
	X_train_2017, X_test_2017, y_train_2017, y_test_2017 = train_test_split(
	    X_2017, y_2017, test_size=0.2, shuffle=False)
	X_train_2018, X_test_2018, y_train_2018, y_test_2018 = train_test_split(
	    X_2018, y_2018, test_size=0.2, shuffle=False)
	X_train_2019, X_test_2019, y_train_2019, y_test_2019 = train_test_split(
	    X_2019, y_2019, test_size=0.2, shuffle=False)
	# Wrap each year's train/test split in XGBoost DMatrix objects
	def _dmatrix_pair(train_X, train_y, test_X, test_y):
	    """Return (train, test) DMatrix objects built from the given splits."""
	    train = xgb.DMatrix(data=train_X, label=train_y)
	    test = xgb.DMatrix(data=test_X, label=test_y)
	    return train, test


	# 2016
	DM_train_2016, DM_test_2016 = _dmatrix_pair(
	    X_train_2016, y_train_2016, X_test_2016, y_test_2016)
	# 2017
	DM_train_2017, DM_test_2017 = _dmatrix_pair(
	    X_train_2017, y_train_2017, X_test_2017, y_test_2017)
	# 2018
	DM_train_2018, DM_test_2018 = _dmatrix_pair(
	    X_train_2018, y_train_2018, X_test_2018, y_test_2018)
	# 2019
	DM_train_2019, DM_test_2019 = _dmatrix_pair(
	    X_train_2019, y_train_2019, X_test_2019, y_test_2019)
	# Booster configuration shared by every yearly model
	params = dict(booster="gblinear", objective="reg:squarederror")


	def _fit(dtrain):
	    """Train a booster on dtrain with the shared params and 10 rounds."""
	    return xgb.train(params=params, dtrain=dtrain, num_boost_round=10)


	# One model per year, each trained on that year's training DMatrix
	xg_reg_2016 = _fit(DM_train_2016)
	xg_reg_2017 = _fit(DM_train_2017)
	xg_reg_2018 = _fit(DM_train_2018)
	xg_reg_2019 = _fit(DM_train_2019)
	# Score each yearly model on its own test set
	def _predict_and_score(model, dtest, actual):
	    """Return (predictions, RMSE) for model on dtest against actual."""
	    predicted = model.predict(dtest)
	    error = np.sqrt(mean_squared_error(actual, predicted))
	    return predicted, error


	## 2016
	preds_2016, rmse_2016 = _predict_and_score(
	    xg_reg_2016, DM_test_2016, y_test_2016)
	## 2017
	preds_2017, rmse_2017 = _predict_and_score(
	    xg_reg_2017, DM_test_2017, y_test_2017)
	## 2018
	preds_2018, rmse_2018 = _predict_and_score(
	    xg_reg_2018, DM_test_2018, y_test_2018)
	## 2019
	preds_2019, rmse_2019 = _predict_and_score(
	    xg_reg_2019, DM_test_2019, y_test_2019)
	# For each year, plot the actual opening price against a series made of the
	# training targets followed by the model's test-set predictions.
	def _plot_with_predictions(frame, train_target, predictions):
	    """Plot Open and Preds for one year on a figure of its own.

	    Returns (plot_frame, combined): plot_frame holds the Date and Open
	    columns of frame plus a Preds column; combined is train_target followed
	    by predictions, which is what the Preds column is filled with.
	    """
	    plot_frame = pd.DataFrame(frame, columns=['Date', 'Open'])
	    combined = np.append(train_target, predictions)
	    plot_frame['Preds'] = combined
	    # Open a fresh figure: otherwise every year is drawn onto whichever
	    # figure happens to be current, on top of what is already there.
	    plt.figure(figsize=(20, 8))
	    plt.plot(plot_frame[['Open', 'Preds']])
	    plt.legend(['Open', 'Preds'])
	    return plot_frame, combined


	## 2016
	new_data_2016, aux_preds_2016 = _plot_with_predictions(
	    df_hist_2016, y_train_2016, preds_2016)
	## 2017
	new_data_2017, aux_preds_2017 = _plot_with_predictions(
	    df_hist_2017, y_train_2017, preds_2017)
	## 2018
	new_data_2018, aux_preds_2018 = _plot_with_predictions(
	    df_hist_2018, y_train_2018, preds_2018)
	## 2019
	new_data_2019, aux_preds_2019 = _plot_with_predictions(
	    df_hist_2019, y_train_2019, preds_2019)