Skip to content

empirical_results

Rebecca Pontes Salles edited this page Mar 25, 2021 · 4 revisions

Some empirical results of TSPred:

This page provides code for obtaining some empirical results of TSPred regarding the task of predicting the fifth gap of missing observations of the CATS dataset.

Different approaches were adopted combining data preprocessing and data modeling, using either the machine learning model SVM or the deep learning model CNN. The time series in CATS are mostly nonstationary, so a data preprocessing step was also adopted, based on the splitting-based nonstationary time series transform EMD. The linear ARIMA model is selected as the baseline, and the results of each approach are ranked by their MSE prediction errors.

Loading the package

library(TSPred)

# Install TensorFlow 2.0.0 when it is missing or older than 2.0.
# tensorflow::tf_version() returns NULL when no TensorFlow installation is
# found; comparing NULL with a version yields a zero-length condition and
# `if` would stop with "argument is of length zero". The missing case is
# therefore checked explicitly before the version comparison.
tf_installed_version <- tensorflow::tf_version()
if (is.null(tf_installed_version) ||
    tf_installed_version < numeric_version("2.0")) {
  tensorflow::install_tensorflow(version = "2.0.0")
}

Defining the dataset

# Load the CATS series and their continuations.
data(CATS, CATS.cont)

# Only the fifth series is used. The number of rows of its continuation is
# the test-set length, and the full series is the known part followed by
# that continuation.
test_len <- nrow(CATS.cont[5])
data <- rbind(CATS[5], CATS.cont[5])

Defining required objects for time series prediction

Defining data processing objects

# Subsetting step, configured with the test-set length.
data_subsetting <- subsetting(test_len = test_len)

# Mapping-based transform (Box-Cox).
# NOTE(review): lambda = NULL presumably lets the transform choose lambda
# itself -- confirm against the TSPred documentation.
mapping_based <- list(BCT = BoxCoxT(lambda = NULL))

# Splitting-based transforms.
splitting_based <- list(
  EMD = EMD(),
  WT = WT()
)

# Normalization methods.
normalization <- list(
  MM = MinMax(),
  AN = AN()
)

# Sliding windows of length 5.
sliding_window <- SW(window_len = 5)

Defining modeling objects

# Baseline linear model.
ARIMA_model <- ARIMA()

# Both learning models share the same sliding windows and min-max
# normalization.
SVM_model <- SVM(
  sw = sliding_window,
  proc = list(norm = normalization[["MM"]])
)
CNN_model <- Tensor_CNN(
  sw = sliding_window,
  proc = list(norm = normalization[["MM"]])
)

Defining evaluation objects

# MSE evaluation object; every workflow below registers it under the name
# "MSE", which is also the metric used for ranking in the benchmark.
mse_eval <- MSE_eval()

Time series prediction approaches

ARIMA (baseline)

# Baseline workflow: ARIMA with no extra processing step, evaluated by MSE.
tspred_arima <- tspred(
  subsetting = data_subsetting,
  modeling = ARIMA_model,
  evaluating = list(MSE = mse_eval)
)

# Run the baseline workflow on the dataset.
tspred_arima_run <- workflow(
  tspred_arima,
  data = data,
  prep_test = TRUE,
  onestep = TRUE,
  eval_fitness = FALSE
)

CNN (does not beat the baseline)

# CNN workflow with no processing step besides the model's own
# normalization and sliding windows.
tspred_cnn <- tspred(
  subsetting = data_subsetting,
  processing = NULL,
  modeling = CNN_model,
  evaluating = list(MSE = mse_eval)
)

# Run the CNN workflow on the dataset.
tspred_cnn_run <- workflow(
  tspred_cnn,
  data = data,
  prep_test = TRUE,
  onestep = TRUE,
  eval_fitness = FALSE
)

CNN+EMD (does not beat the baseline)

# CNN workflow preceded by the EMD transform.
tspred_cnn_proc <- tspred(
  subsetting = data_subsetting,
  processing = list(EMD = splitting_based[["EMD"]]),
  modeling = CNN_model,
  evaluating = list(MSE = mse_eval)
)

# Run the CNN+EMD workflow on the dataset.
tspred_cnn_proc_run <- workflow(
  tspred_cnn_proc,
  data = data,
  prep_test = TRUE,
  onestep = TRUE,
  eval_fitness = FALSE
)

SVM (does not beat the baseline)

# SVM workflow with no processing step besides the model's own
# normalization and sliding windows.
tspred_svm <- tspred(
  subsetting = data_subsetting,
  processing = NULL,
  modeling = SVM_model,
  evaluating = list(MSE = mse_eval)
)

# Run the SVM workflow on the dataset.
tspred_svm_run <- workflow(
  tspred_svm,
  data = data,
  prep_test = TRUE,
  onestep = TRUE,
  eval_fitness = FALSE
)

SVM+EMD (beats the baseline)

# SVM+EMD (beats the baseline) ----
# SVM workflow preceded by the EMD transform.
tspred_svm_proc <- tspred(
  subsetting = data_subsetting,
  processing = list(EMD = splitting_based[["EMD"]]),
  modeling = SVM_model,
  evaluating = list(MSE = mse_eval)
)

# Run the SVM+EMD workflow on the dataset.
tspred_svm_proc_run <- workflow(
  tspred_svm_proc,
  data = data,
  prep_test = TRUE,
  onestep = TRUE,
  eval_fitness = FALSE
)

SVM+(BCT+EMD) (beats the baseline)

# SVM workflow with two processing steps, listed in the order BCT then EMD.
tspred_svm_proc_2 <- tspred(
  subsetting = data_subsetting,
  processing = list(
    BCT = mapping_based[["BCT"]],
    EMD = splitting_based[["EMD"]]
  ),
  modeling = SVM_model,
  evaluating = list(MSE = mse_eval)
)

# Run the SVM+(BCT+EMD) workflow on the dataset.
tspred_svm_proc_2_run <- workflow(
  tspred_svm_proc_2,
  data = data,
  prep_test = TRUE,
  onestep = TRUE,
  eval_fitness = FALSE
)

Benchmarking prediction results

# Workflow runs compared against the ARIMA baseline.
# NOTE(review): tspred_svm_proc_2_run (SVM+(BCT+EMD)) is not included in
# this list -- confirm whether that is intended.
MLM_tests <- list(
  tspred_cnn_run,
  tspred_cnn_proc_run,
  tspred_svm_run,
  tspred_svm_proc_run
)

# Benchmark the baseline run against the candidates, ranking by MSE.
benchmarking <- benchmark(tspred_arima_run, MLM_tests, rank.by = "MSE")
Discussions:
  • TSPred prediction results are competitive when compared with the errors produced by the CATS competitors.
  • The demand for the adoption of a suitable baseline model is noticeable.
  • The CNN and SVM could not outperform the baseline.
  • Introducing nonstationarity treatment resulted in smaller prediction errors.

Plotting prediction results

# Gather the predictions of every ranked workflow into one long data frame
# (columns: time, pred, model) that the plots below consume.
# The element is referenced by its full name; the original loop read
# `ranked_tspred_obj`, which only resolved through `$` partial matching.
ranked_objs <- benchmarking$ranked_tspred_objs

# Legend labels derived from the workflow names: drop the "MinMax-" and
# "Tensor_" parts and turn the first "-" into "+".
models <- stringr::str_remove(names(ranked_objs), "MinMax-")
models <- stringr::str_remove(models, "Tensor_")
models <- stringr::str_replace(models, "-", "+")

# One typed data frame per workflow, bound once at the end. This avoids
# growing a data frame inside a loop and avoids the character matrix that
# cbind() builds from mixed types (which needed converting back to numbers).
predictions <- do.call(rbind, lapply(seq_along(ranked_objs), function(m) {
  obj <- ranked_objs[[m]]

  # Prefer the postprocessed predictions and fall back to the raw ones.
  # Fail loudly when neither exists instead of silently reusing the
  # predictions of the previous workflow.
  pred <- if (!is.null(obj$pred$postp)) {
    obj$pred$postp[[1]]
  } else if (!is.null(obj$pred$raw)) {
    obj$pred$raw[[1]]
  } else {
    stop("No predictions found for workflow '", models[m], "'", call. = FALSE)
  }

  data.frame(
    time = as.numeric(981:1000),
    pred = as.numeric(pred),
    model = models[m],
    stringsAsFactors = FALSE
  )
}))

# Keep the ranking order of the benchmark in the legend.
predictions$model <- factor(predictions$model, levels = models)

library(ggplot2)
library(RColorBrewer)

# Five-colour palette shared by both plots.
colors <- brewer.pal(5, "Set1")

# Dashed grey line with column V5 of CATS.cont (the data appended to the
# series as its test portion), drawn beneath the predictions in both plots.
observed_line <- geom_line(
  data = CATS.cont[5],
  aes(x = 981:1000, y = V5),
  size = 1.1, col = "gray45", linetype = "dashed"
)

# ARIMA baseline against the SVM-based approaches.
svm_models <- c("ARIMA", "SVM", "EMD+SVM")
ggplot(
  predictions[predictions$model %in% svm_models, ],
  aes(x = time, y = pred, col = model)
) +
  observed_line +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(x = "Observation", y = "", col = "Model", shape = "Model") +
  scale_colour_manual(values = colors[1:3]) +
  theme_bw()

# ARIMA baseline against the CNN-based approaches.
cnn_models <- c("ARIMA", "CNN", "EMD+CNN")
ggplot(
  predictions[predictions$model %in% cnn_models, ],
  aes(x = time, y = pred, col = model)
) +
  observed_line +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(x = "Observation", y = "", col = "Model") +
  scale_colour_manual(values = colors[c(2, 4:5)]) +
  theme_bw()