library(mlr)
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.3 ✔ purrr 0.3.4
## ✔ tibble 3.1.0 ✔ dplyr 1.0.5
## ✔ tidyr 1.1.3 ✔ stringr 1.4.0
## ✔ readr 1.4.0 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
#' Impute missing values with a random forest (randomForest::rfImpute)
#' @param data_frame data frame with the response (severity) in the first
#'   column and the predictors in the remaining columns
#' @param ntree number of trees to grow in each imputation iteration
#' @return data frame with NAs imputed and severity restored as a factor
impute_NAs <- function(data_frame, ntree = 500) {
  transformed_data <- rfImpute(data.matrix(data_frame[, -1]),
                               as.factor(data_frame[["severity"]]),
                               iter = 5, ntree = ntree)
  # cbind()-ing the factor response into the numeric matrix replaces it with
  # its integer codes (1, 2, ...); subtract 1 to get a 0/1 coding
  transformed_data[, 1] <- transformed_data[, 1] - 1
  transformed_data %<>%
    as.data.frame()
  names(transformed_data)[[1]] <- "severity"
  transformed_data$severity <- as.factor(transformed_data$severity)
  transformed_data
}
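The subtraction above handles a base R quirk: when a factor is cbind()-ed into a numeric matrix, it is replaced by its integer codes (1, 2, ...). A minimal illustration:
# Factor levels come back as their integer codes inside a numeric matrix
cbind(as.factor(c(1, 2, 2)), matrix(0, nrow = 3))[, 1]
## [1] 1 2 2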
mood_data_train_class <- readRDS("mood_train.RDS")
# mood_data_test_class <- readRDS("mood_test.RDS")
# mood_data_test_full <- readRDS("mood_test_full.RDS")
Now we impute the missing values in the training set:
mood_data_train_class_matrix <- impute_NAs(mood_data_train_class)
## ntree OOB 1 2
## 500: 6.30% 0.20% 91.59%
## ntree OOB 1 2
## 500: 6.67% 0.40% 94.39%
## ntree OOB 1 2
## 500: 6.30% 0.20% 91.59%
## ntree OOB 1 2
## 500: 6.73% 0.40% 95.33%
## ntree OOB 1 2
## 500: 6.55% 0.27% 94.39%
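A quick sanity check (not part of the original pipeline, but cheap) confirms that no missing values survived the imputation:
# Should pass silently: rfImpute fills every NA among the predictors
stopifnot(!anyNA(mood_data_train_class_matrix))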
severity_forest <- makeLearner("classif.randomForest")
severity_task <- makeClassifTask(data = mood_data_train_class_matrix, target = "severity")
forest_param_space <- makeParamSet(
makeIntegerParam("ntree", lower = 500, upper = 500), # fixing the number of trees to 500
makeIntegerParam("mtry", lower = 6, upper = 12),
makeIntegerParam("nodesize", lower = 1, upper = 5),
makeIntegerParam("maxnodes", lower = 5, upper = 20)
)
random_search <- makeTuneControlRandom(maxit = 100)
cv_for_tuning <- makeResampleDesc("CV", iters = 5)
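With 100 random draws over an integer grid of 7 x 5 x 16 = 560 combinations, random search already covers the space fairly densely. If a deterministic sweep were preferred, mlr's grid control would be a drop-in replacement (a sketch; it lays out evenly spaced values over each parameter's range):
# Alternative to random search: an evenly spaced grid per parameter
grid_search <- makeTuneControlGrid(resolution = 10L)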
library(parallel)
library(parallelMap)
print(paste0("Detected ", detectCores(), " cores"))
## [1] "Detected 16 cores"
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=16.
tuned_forest_pars <- tuneParams(severity_forest, task = severity_task,
resampling = cv_for_tuning,
par.set = forest_param_space,
control = random_search)
## [Tune] Started tuning learner classif.randomForest for parameter set:
## Type len Def Constr Req Tunable Trafo
## ntree integer - - 500 to 500 - TRUE -
## mtry integer - - 6 to 12 - TRUE -
## nodesize integer - - 1 to 5 - TRUE -
## maxnodes integer - - 5 to 20 - TRUE -
## With control class: TuneControlRandom
## Imputation value: 1
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 16; elements = 100.
## [Tune] Result: ntree=500; mtry=10; nodesize=1; maxnodes=16 : mmce.test.mean=0.0660962
parallelStop()
## Stopped parallelization. All cleaned up.
tuned_forest_pars
## Tune result:
## Op. pars: ntree=500; mtry=10; nodesize=1; maxnodes=16
## mmce.test.mean=0.0660962
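mmce is mlr's mean misclassification error, so the accuracy implied by the tuning result is simply its complement:
1 - tuned_forest_pars$y[["mmce.test.mean"]]
## [1] 0.9339038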
# Refit on the full training set with the tuned hyperparameters
tuned_forest <- setHyperPars(severity_forest, par.vals = tuned_forest_pars$x)
tuned_forest_model <- train(tuned_forest, severity_task)
saveRDS(tuned_forest_model, "tuned_forest_model.RDS")
# Pull out the underlying randomForest object for inspection
forest_model_data <- getLearnerModel(tuned_forest_model)
# Plot OOB and per-class error as a function of the number of trees
res <- colnames(forest_model_data$err.rate)
plot(forest_model_data, col = 1:length(res), lty = 1:length(res))
legend("topright", res,
       col = 1:length(res),
       lty = 1:length(res))
We can see that the error estimate stabilizes once we hit roughly 100 trees, so 500 trees are more than enough for this forest (we could have used fewer), as the check below confirms.
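The plot can be backed with the numbers themselves: err.rate stores the cumulative OOB estimate after each tree, so the entries for trees 100 and 500 should be nearly identical.
# Cumulative OOB error after 100 vs. 500 trees
forest_model_data$err.rate[c(100, 500), "OOB"]
Finally, we cross-validate the entire tuning procedure (nested cross-validation) so that the performance estimate is not biased by the hyperparameter search itself.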
# Outer resampling; the wrapper re-runs the random search on each outer fold
outer <- makeResampleDesc("CV", iters = 5)
forest_wrapper <- makeTuneWrapper("classif.randomForest",
                                  resampling = cv_for_tuning,
                                  par.set = forest_param_space,
                                  control = random_search)
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=16.
cv_with_tuning <- resample(forest_wrapper, severity_task, resampling = outer)
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Resampling: cross-validation
## Measures: mmce
## Mapping in parallel: mode = socket; level = mlr.resample; cpus = 16; elements = 5.
##
## Aggregated Result: mmce.test.mean=0.0673384
##
parallelStop()
## Stopped parallelization. All cleaned up.
cv_with_tuning
## Resample Result
## Task: mood_data_train_class_matrix
## Learner: classif.randomForest.tuned
## Aggr perf: mmce.test.mean=0.0673384
## Runtime: 3015.38
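With the nested-CV estimate in hand, the tuned model can be applied to held-out data. A sketch, assuming the commented-out mood_test.RDS has the same columns as the training set (including severity) and receives the same imputation treatment:
# Not run above: score the held-out set with the tuned model
mood_data_test_class <- readRDS("mood_test.RDS")
mood_data_test_matrix <- impute_NAs(mood_data_test_class)
test_pred <- predict(tuned_forest_model, newdata = mood_data_test_matrix)
performance(test_pred, measures = mmce)
Note that imputing the test set on its own is a simplification; strictly, test-set NAs should be filled using only information from the training data.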