2_model_training.Rmd

---
title: "2_model_training"
author: "Simon Topp"
date: "11/8/2019"
output: html_document
editor_options: 
  chunk_output_type: console
---

#First, use FFS and LLTO_CV to reduce the parameter space and remove spatially autocorrelated variables.

```{r}
## Select parameters to go into model
paste0(names(srMunged), collapse = ', ')

## These parameters make sense in terms of their mechanisms, resolution (with some assumptions), and reliability
features <- "NR, BG, dWL, inStreamCat, PctCarbResidCat, Precip8110Cat, Tmean8110Cat, RunoffCat, ClayCat, SandCat, OmCat, PermCat, RckdepCat, WtDepCat, BFICat, ElevCat, AOD, BFICat, KffactCat, CatAreaSqKm, PctAg2006Slp10Cat, ElevCat, NPDESDensCat, HydrlCondCat, PctImp2006Cat, pctUrban2006, pctForest2006, PctCrop2006Cat, pctWetland2006, WetIndexCat, areasqkm, meandused"

features <- str_split(features, pattern = ', ') %>% unlist()

##Pull 20% as holdout data
set.seed(340987)
holdOut <- srMunged %>%
  group_by(region) %>%
  sample_frac(.2)

df <- srMunged %>% filter(!UniqueID %in% holdOut$UniqueID)

#One hot encoding for xgboost
oneHot <- onehot(df %>% select(features))
df.encoded <- predict(oneHot, df %>% select(features)) %>% as.data.frame(.) 

# Split up cross validation folds based on stratified sampling
df <- df %>% mutate(SpaceTime = paste0(region, sat),
                    group1 = cut_number(lat, 3),
                    date = ymd(date),
                    julian = as.numeric(julian.Date(date))) %>%
            group_by(group1) %>%
            mutate(group2 = cut_number(long, 3)) %>%
            ungroup() %>%
            mutate(spaceCluster = paste0(group1,group2), #Use this or COMID
                   timeCluster = cut_number(julian, 9))

folds <- CreateSpacetimeFolds(df, spacevar = 'COMID', timevar = 'timeCluster', k= 10, seed = 34985)

control <- trainControl(method="cv", savePredictions = 'none', 
                        returnResamp = 'final', index = folds$index, 
                        indexOut = folds$indexOut)

## Do initial feature selection with conservative hyperparameters
ffs.tuneGrid <- expand.grid(nrounds = 300, max_depth = 4, eta = .1, 
                            gamma = 1, min_child_weight = 1, colsample_bytree = .6,
                            subsample = .6)

# Takes awhile, so set it up all to run in parrallel
cl <- makePSOCKcluster(availableCores() - 4)
registerDoParallel(cl)

ffs <- ffs(df.encoded, df$value, method = 'xgbTree', metric = 'RMSE', 
           tuneGrid = ffs.tuneGrid, trControl = control, verbose = T)

stopCluster(cl)

ffsResults <- ffs$perf_all

# Save the results
write_feather(ffsResults, 'out/ffsResultsFull.feather')

ffsResults %>%
  group_by(nvar) %>%
  summarise(RMSE = median(RMSE),
            SE = median(SE)) %>%
  ggplot(.) + geom_line(aes(x = nvar, y = RMSE)) +
  geom_errorbar(aes(x = nvar, ymin = RMSE - SE, ymax = RMSE + SE), color = 'red')

#ggsave('figures/rfeRMSE.png', device = 'png', width = 6, height = 4, units = 'in')
```

### Build the final model for the analysis

```{r baseline model, message=FALSE}
#Pull features from FFS and LLTO-CV
ffsResults <- read_feather('out/ffsResultsFull.feather')
features <- ffsResults[ffsResults$RMSE == min(ffsResults$RMSE),] %>%
  select(-c(nvar, RMSE, SE)) %>%
  paste(.) %>% .[.!= 'NA']

oneHot <- onehot(df %>% select(features))
df.encoded <- predict(oneHot, df %>% select(features)) %>% as.data.frame(.) 
```

# Grid search for hyperparameters

```{r}

## Set up a conservative grid search for hyperparameters
grid_train <- expand.grid(
  nrounds = seq(100,300,100),
  alpha = c(.1,.5,1),
  lambda = c(.1,.5,1),
  eta = c(.1, 0.3)
)

#Set up a cluster to run everything in parrallel
cl <- makePSOCKcluster(availableCores()-1)
registerDoParallel(cl)

train_control <- caret::trainControl(method="cv", savePredictions = F, 
                          returnResamp = 'final', index = folds$index, 
                          indexOut = folds$indexOut, verboseIter = T)

base_model <- caret::train(
  x = df.encoded,
  y = df$value,
  trControl = train_control,
  tuneGrid = grid_train,
  method = "xgbLinear",
  preProcess = c('center', 'scale'),
  importance = F,
  verbose = TRUE
)

base_model$bestTune

## Build final model with best tune
train_control1 <- caret::trainControl(method="cv", savePredictions = T, 
                          returnResamp = 'final', index = folds$index, 
                          indexOut = folds$indexOut, verboseIter = T)

grid_final <- expand.grid(
  nrounds = base_model$bestTune$nrounds,
  alpha = base_model$bestTune$alpha,
  lambda = base_model$bestTune$lambda,
  eta = base_model$bestTune$eta
)


model <- caret::train(
  x = df.encoded,
  y = df$value,
  trControl = train_control1,
  tuneGrid = grid_final,
  method = "xgbLinear",
  preProcess = c('center', 'scale'),
  importance = T,
  verbose = TRUE
)

stopCluster(cl)

## Now look at holdout data

holdout_x <- predict(oneHot, holdOut %>% select(features))

holdout_y <- holdOut$value

output <- tibble(Actual = holdout_y, Predicted = predict(model, holdout_x), log = 'none', UniqueID = holdOut$UniqueID)

save(model, file = paste0('models/',iteration, '.Rdata'))
write_feather(output, paste0('out/outputs/',iteration, '.feather'))
```