-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_model_training.Rmd
161 lines (123 loc) · 5.1 KB
/
2_model_training.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
---
title: "2_model_training"
author: "Simon Topp"
date: "11/8/2019"
output: html_document
editor_options:
chunk_output_type: console
---
#First, use FFS and LLTO_CV to reduce the parameter space and remove spatially autocorrelated variables.
```{r}
## Select parameters to go into model
paste0(names(srMunged), collapse = ', ')
## These parameters make sense in terms of their mechanisms, resolution (with some assumptions), and reliability
features <- "NR, BG, dWL, inStreamCat, PctCarbResidCat, Precip8110Cat, Tmean8110Cat, RunoffCat, ClayCat, SandCat, OmCat, PermCat, RckdepCat, WtDepCat, BFICat, ElevCat, AOD, BFICat, KffactCat, CatAreaSqKm, PctAg2006Slp10Cat, ElevCat, NPDESDensCat, HydrlCondCat, PctImp2006Cat, pctUrban2006, pctForest2006, PctCrop2006Cat, pctWetland2006, WetIndexCat, areasqkm, meandused"
features <- str_split(features, pattern = ', ') %>% unlist()
##Pull 20% as holdout data
set.seed(340987)
holdOut <- srMunged %>%
group_by(region) %>%
sample_frac(.2)
df <- srMunged %>% filter(!UniqueID %in% holdOut$UniqueID)
#One hot encoding for xgboost
oneHot <- onehot(df %>% select(features))
df.encoded <- predict(oneHot, df %>% select(features)) %>% as.data.frame(.)
# Split up cross validation folds based on stratified sampling
df <- df %>% mutate(SpaceTime = paste0(region, sat),
group1 = cut_number(lat, 3),
date = ymd(date),
julian = as.numeric(julian.Date(date))) %>%
group_by(group1) %>%
mutate(group2 = cut_number(long, 3)) %>%
ungroup() %>%
mutate(spaceCluster = paste0(group1,group2), #Use this or COMID
timeCluster = cut_number(julian, 9))
folds <- CreateSpacetimeFolds(df, spacevar = 'COMID', timevar = 'timeCluster', k= 10, seed = 34985)
control <- trainControl(method="cv", savePredictions = 'none',
returnResamp = 'final', index = folds$index,
indexOut = folds$indexOut)
## Do initial feature selection with conservative hyperparameters
ffs.tuneGrid <- expand.grid(nrounds = 300, max_depth = 4, eta = .1,
gamma = 1, min_child_weight = 1, colsample_bytree = .6,
subsample = .6)
# Takes awhile, so set it up all to run in parrallel
cl <- makePSOCKcluster(availableCores() - 4)
registerDoParallel(cl)
ffs <- ffs(df.encoded, df$value, method = 'xgbTree', metric = 'RMSE',
tuneGrid = ffs.tuneGrid, trControl = control, verbose = T)
stopCluster(cl)
ffsResults <- ffs$perf_all
# Save the results
write_feather(ffsResults, 'out/ffsResultsFull.feather')
ffsResults %>%
group_by(nvar) %>%
summarise(RMSE = median(RMSE),
SE = median(SE)) %>%
ggplot(.) + geom_line(aes(x = nvar, y = RMSE)) +
geom_errorbar(aes(x = nvar, ymin = RMSE - SE, ymax = RMSE + SE), color = 'red')
#ggsave('figures/rfeRMSE.png', device = 'png', width = 6, height = 4, units = 'in')
```
### Build the final model for the analysis
```{r baseline model, message=FALSE}
#Pull features from FFS and LLTO-CV
ffsResults <- read_feather('out/ffsResultsFull.feather')
features <- ffsResults[ffsResults$RMSE == min(ffsResults$RMSE),] %>%
select(-c(nvar, RMSE, SE)) %>%
paste(.) %>% .[.!= 'NA']
oneHot <- onehot(df %>% select(features))
df.encoded <- predict(oneHot, df %>% select(features)) %>% as.data.frame(.)
```
# Grid search for hyperparameters
```{r}
## Set up a conservative grid search for hyperparameters
grid_train <- expand.grid(
nrounds = seq(100,300,100),
alpha = c(.1,.5,1),
lambda = c(.1,.5,1),
eta = c(.1, 0.3)
)
#Set up a cluster to run everything in parrallel
cl <- makePSOCKcluster(availableCores()-1)
registerDoParallel(cl)
train_control <- caret::trainControl(method="cv", savePredictions = F,
returnResamp = 'final', index = folds$index,
indexOut = folds$indexOut, verboseIter = T)
base_model <- caret::train(
x = df.encoded,
y = df$value,
trControl = train_control,
tuneGrid = grid_train,
method = "xgbLinear",
preProcess = c('center', 'scale'),
importance = F,
verbose = TRUE
)
base_model$bestTune
## Build final model with best tune
train_control1 <- caret::trainControl(method="cv", savePredictions = T,
returnResamp = 'final', index = folds$index,
indexOut = folds$indexOut, verboseIter = T)
grid_final <- expand.grid(
nrounds = base_model$bestTune$nrounds,
alpha = base_model$bestTune$alpha,
lambda = base_model$bestTune$lambda,
eta = base_model$bestTune$eta
)
model <- caret::train(
x = df.encoded,
y = df$value,
trControl = train_control1,
tuneGrid = grid_final,
method = "xgbLinear",
preProcess = c('center', 'scale'),
importance = T,
verbose = TRUE
)
stopCluster(cl)
## Now look at holdout data
holdout_x <- predict(oneHot, holdOut %>% select(features))
holdout_y <- holdOut$value
output <- tibble(Actual = holdout_y, Predicted = predict(model, holdout_x), log = 'none', UniqueID = holdOut$UniqueID)
save(model, file = paste0('models/',iteration, '.Rdata'))
write_feather(output, paste0('out/outputs/',iteration, '.feather'))
```