Last active
December 20, 2020 07:30
-
-
Save AdrianAntico/960c82591ff6b77d69f91e36b015a54c to your computer and use it in GitHub Desktop.
Tuning AutoCatBoostCARMA() using the Walmart panel sales data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?R | |
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Out-of-Sample Feature + Grid Tuning of RemixAutoML::AutoCatBoostCARMA()
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# Set up your output file path for saving results as a .csv
Path <- "C:/YourPathHere"

# Run on GPU or CPU (some options in the grid tuning force usage of CPU for some runs)
TaskType <- "GPU"

# Define number of CPU threads to allow data.table to utilize.
# Leave two cores free for the rest of the system, but always keep at least one thread.
# The count is passed via `threads` (an absolute number of threads); `percent` expects a
# percentage of the logical CPUs, so a core count supplied there would throttle data.table.
# parallel::detectCores() may return NA, in which case a single thread is used.
AvailableCores <- parallel::detectCores()
data.table::setDTthreads(threads = if(is.na(AvailableCores)) 1L else max(1L, AvailableCores - 2L))

# Load data
data <- data.table::fread("https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1")

# Ensure series have no missing dates (also remove series with more than 25% missing values)
data <- RemixAutoML::TimeSeriesFill(
  data,
  DateColumnName = "Date",
  GroupVariables = c("Store","Dept"),
  TimeUnit = "weeks",
  FillType = "maxmax",
  MaxMissingPercent = 0.25,
  SimpleImpute = TRUE)

# Set negative numbers to 0 (updated by reference)
data[, Weekly_Sales := data.table::fifelse(Weekly_Sales < 0, 0, Weekly_Sales)]

# Remove IsHoliday column
data[, IsHoliday := NULL]

# Create xregs (this is to include the categorical variables themselves instead of utilizing only their interaction)
xregs <- data[, .SD, .SDcols = c("Date", "Store", "Dept")]

# Change data types
data[, ":=" (Store = as.character(Store), Dept = as.character(Dept))]
xregs[, ":=" (Store = as.character(Store), Dept = as.character(Dept))]

# Subset data so we have an out of time sample:
# data1 keeps the first 125 rows of every Store-Dept series (in the current row order),
# while data keeps the full series and is used later to score the forecasts.
data[, ID := seq_len(.N), by = c("Store","Dept")]
data1 <- data[ID <= 125L]
data1[, ID := NULL]
data[, ID := NULL]

# Define values for SplitRatios and FCWindow Args
# N1 = rows in the first training series, N2 = rows in the first full series
N1 <- data1[, .N, by = c("Store","Dept")][1L, N]
N2 <- xregs[, .N, by = c("Store","Dept")][1L, N]
# Setup Grid Tuning & Feature Tuning data.table using a cross join of vectors
# Note: vectors mixing strings and numbers (TimeWeights, RSM) are stored as character
# and are converted back to numeric / NULL when each run is configured.
Tuning <- data.table::CJ(
  TimeWeights = c("None",0.999),
  MaxTimeGroups = c("weeks","months"),
  TargetTransformation = c("TRUE","FALSE"),
  Difference = c("TRUE","FALSE"),
  HoldoutTrain = c(6,18),
  Langevin = c("TRUE","FALSE"),
  NTrees = c(2500,5000),
  Depth = c(6,9),
  RandomStrength = c(0.75,1),
  L2_Leaf_Reg = c(3.0,4.0),
  RSM = c(0.75,"NULL"),
  GrowPolicy = c("SymmetricTree","Lossguide","Depthwise"),
  BootStrapType = c("Bayesian","MVS","No"))

# Remove options that are not compatible with GPU (skip over this otherwise)
Tuning <- Tuning[Langevin == "TRUE" | (Langevin == "FALSE" & RSM == "NULL" & BootStrapType %in% c("Bayesian","No"))]

# Randomize order of Tuning data.table
Tuning <- Tuning[order(runif(.N))]

# Load grid results and remove rows that have already been tested
MetricsFile <- file.path(Path, "Walmart_CARMA_Metrics.csv")
if(file.exists(MetricsFile)) {
  Metrics <- data.table::fread(MetricsFile)

  # Stack finished runs on top of the candidate grid; candidate rows get NA for the metric columns
  temp <- data.table::rbindlist(list(Metrics, Tuning), fill = TRUE)

  # De-duplicate on the tuning columns by name (not by position) so the key stays correct
  # even if the metrics file has extra or reordered columns. unique() keeps the first
  # occurrence, i.e. the finished run, so already-tested candidates are dropped.
  temp <- unique(temp, by = names(Tuning))

  # Rows without a RunTime are the candidates that have not been run yet
  Tuning <- temp[is.na(RunTime)][, .SD, .SDcols = names(Tuning)]
  rm(Metrics, temp)
}

# Define the total number of runs
TotalRuns <- nrow(Tuning)
# Kick off feature + grid tuning
# Each iteration trains one CARMA model with the settings in row `Run` of Tuning,
# scores the forecast against the held-out actuals in `data`, and appends one row
# of metrics to Walmart_CARMA_Metrics.csv under Path.
for(Run in seq_len(TotalRuns)) {

  # Print run number (printed 100 times; presumably so it stands out in the debug output)
  for(zz in seq_len(100)) print(Run)

  # Use fresh data for each run
  xregs_new <- data.table::copy(xregs)
  data_new <- data.table::copy(data1)

  # Settings for this run (single-row data.table, looked up once)
  Params <- Tuning[Run]

  # Character-encoded grid values: "None" / "NULL" mean the argument is passed as NULL
  RunTimeWeights <- if(Params$TimeWeights == "None") NULL else as.numeric(Params$TimeWeights)
  RunRSM <- if(Params$RSM == "NULL") NULL else as.numeric(Params$RSM)

  # Time aggregation levels with their matching lag and moving average windows
  WeekLags <- c(1,2,3,4,5,8,9,12,13,51,52,53)
  WeekMAs <- c(2,3,4,5,8,9,12,13,51,52,53)
  if(Params$MaxTimeGroups == "weeks") {
    RunTimeGroups <- "weeks"
    RunLags <- WeekLags
    RunMAPeriods <- WeekMAs
  } else if(Params$MaxTimeGroups == "months") {
    RunTimeGroups <- c("weeks","months")
    RunLags <- list("weeks" = WeekLags, "months" = c(1,2,6,12))
    RunMAPeriods <- list("weeks" = WeekMAs, "months" = c(2,6,12))
  } else {
    RunTimeGroups <- c("weeks","months","quarters")
    RunLags <- list("weeks" = WeekLags, "months" = c(1,2,6,12), "quarters" = c(1,2,3,4))
    RunMAPeriods <- list("weeks" = WeekMAs, "months" = c(2,6,12), "quarters" = c(2,3,4))
  }

  # Timer start
  StartTime <- Sys.time()

  # Run carma system
  CatBoostResults <- RemixAutoML::AutoCatBoostCARMA(

    # data args
    data = data_new,
    TimeWeights = RunTimeWeights,
    TargetColumnName = "Weekly_Sales",
    DateColumnName = "Date",
    HierarchGroups = NULL,
    GroupVariables = c("Store","Dept"),
    TimeUnit = "weeks",
    TimeGroups = RunTimeGroups,

    # Production args
    TrainOnFull = TRUE,
    SplitRatios = c(1 - Params$HoldoutTrain / N2, Params$HoldoutTrain / N2),
    PartitionType = "random",
    FC_Periods = N2-N1,
    TaskType = TaskType,
    NumGPU = 1,
    Timer = TRUE,
    DebugMode = TRUE,

    # Target variable transformations
    TargetTransformation = as.logical(Params$TargetTransformation),
    Methods = c("BoxCox","Asinh","Log","LogPlus1","YeoJohnson"),
    Difference = as.logical(Params$Difference),
    NonNegativePred = TRUE,
    RoundPreds = FALSE,

    # Calendar-related features
    CalendarVariables = c("week","wom","month","quarter"),
    HolidayVariable = c("USPublicHolidays"),
    HolidayLags = c(1,2,3),
    HolidayMovingAverages = c(2,3),

    # Lags, moving averages, and other rolling stats
    Lags = RunLags,
    MA_Periods = RunMAPeriods,
    SD_Periods = NULL,
    Skew_Periods = NULL,
    Kurt_Periods = NULL,
    Quantile_Periods = NULL,
    Quantiles_Selected = NULL,

    # Bonus features
    AnomalyDetection = NULL,
    XREGS = xregs_new,
    FourierTerms = 0,
    TimeTrendVariable = TRUE,
    ZeroPadSeries = NULL,
    DataTruncate = FALSE,

    # ML insights
    PDFOutputPath = NULL,
    SaveDataPath = NULL,
    NumOfParDepPlots = 0L,

    # ML grid tuning args
    GridTune = FALSE,
    PassInGrid = NULL,
    ModelCount = 5,
    MaxRunsWithoutNewWinner = 50,
    MaxRunMinutes = 60*60,

    # ML loss functions
    EvalMetric = "RMSE",
    EvalMetricValue = 1,
    LossFunction = "RMSE",
    LossFunctionValue = 1,

    # ML tuning args
    NTrees = Params$NTrees,
    Depth = Params$Depth,
    Langevin = as.logical(Params$Langevin),
    DiffusionTemperature = 10000,

    # ML overfitting args
    LearningRate = 0.03,
    L2_Leaf_Reg = Params$L2_Leaf_Reg,
    ModelSizeReg = 0.0,
    RSM = RunRSM,
    MinDataInLeaf = 5,
    BorderCount = 254,
    RandomStrength = Params$RandomStrength,
    SamplingUnit = "Group",

    # ML styles
    GrowPolicy = Params$GrowPolicy,
    BootStrapType = Params$BootStrapType,
    FeatureBorderType = "GreedyLogSum",
    SubSample = NULL,
    ScoreFunction = if(TaskType == "GPU") "NewtonL2" else "Cosine")

  # Timer End
  EndTime <- Sys.time()

  # Prepare data for evaluation:
  # rename the forecast's own target column, attach the actuals from the full data set,
  # and keep only the rows where the forecast's target is missing (the forecasted periods)
  Results <- CatBoostResults$Forecast
  data.table::setnames(Results, "Weekly_Sales", "bla")
  Results <- merge(Results, data, by = c("Store","Dept","Date"), all = FALSE)
  Results <- Results[is.na(bla)][, bla := NULL]

  # Create totals and subtotals
  Results <- data.table::groupingsets(
    x = Results,
    j = list(Predictions = sum(Predictions), Weekly_Sales = sum(Weekly_Sales)),
    by = c("Date", "Store", "Dept"),
    sets = list(c("Date", "Store", "Dept"), c("Store", "Dept"), "Store", "Dept", "Date"))

  # Fill NAs with "Total" for totals and subtotals
  for(cols in c("Store","Dept")) Results[is.na(get(cols)), (cols) := "Total"]

  # Add error measures
  Results[, Weekly_MAE := abs(Weekly_Sales - Predictions)]
  Results[, Weekly_MAPE := Weekly_MAE / Weekly_Sales]

  # Weekly results
  Weekly_MAPE <- Results[, list(Weekly_MAPE = mean(Weekly_MAPE)), by = list(Store,Dept)]

  # Monthly results (roll dates down to the first of the month, then re-aggregate)
  temp <- data.table::copy(Results)
  temp[, Date := lubridate::floor_date(Date, unit = "months")]
  temp <- temp[, lapply(.SD, sum), by = c("Date","Store","Dept"), .SDcols = c("Predictions", "Weekly_Sales")]
  temp[, Monthly_MAE := abs(Weekly_Sales - Predictions)]
  temp[, Monthly_MAPE := Monthly_MAE / Weekly_Sales]
  Monthly_MAPE <- temp[, list(Monthly_MAPE = mean(Monthly_MAPE)), by = list(Store,Dept)]

  # Collect metrics for Total (feel free to switch to something else or no filter at all)
  # RunTime is stored as a plain number of minutes: a raw difftime picks its own units
  # (secs / mins / hours) per run and only the number reaches the csv, which would make
  # run times incomparable across rows.
  Metrics <- data.table::data.table(
    RunNumber = Run,
    Total_Weekly_MAPE = Weekly_MAPE[Store == "Total" & Dept == "Total", Weekly_MAPE],
    Total_Monthly_MAPE = Monthly_MAPE[Store == "Total" & Dept == "Total", Monthly_MAPE],
    Params,
    RunTime = as.numeric(difftime(EndTime, StartTime, units = "mins")))

  # Append to file (not overwrite)
  data.table::fwrite(Metrics, file = file.path(Path, "Walmart_CARMA_Metrics.csv"), append = TRUE)

  # Remove objects (clear space before new runs)
  # Only real objects are listed; Weekly_MAE and Monthly_MAE are columns, not variables
  rm(CatBoostResults, Results, temp, Weekly_MAPE, Monthly_MAPE, Metrics,
     Params, RunTimeWeights, RunRSM, RunTimeGroups, RunLags, RunMAPeriods,
     WeekLags, WeekMAs, xregs_new, data_new)

  # Garbage collection because of GPU
  gc()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment