library(mlr)
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:mlr':
## 
##     train

Model performance

Loading test data

test_data <- readRDS("mood_test.RDS")
#' Impute missing values using a random forest
#'
#' Thin wrapper around randomForest::rfImpute(). Column 1 of `data_frame`
#' must be the binary (0/1) `severity` response; all remaining columns are
#' treated as predictors and are coerced via data.matrix().
#'
#' @param data_frame data frame with `severity` in column 1 and predictors
#'   (possibly containing NAs) in the remaining columns
#' @param ntree number of trees grown per imputation iteration
#' @param iter number of imputation iterations (previously hard-coded to 5;
#'   the default preserves the old behavior)
#' @return a data frame with NAs imputed; `severity` is restored as a factor
#'   with its original 0/1 coding
impute_NAs <- function(data_frame, ntree = 500, iter = 5) {
  transformed_data <- rfImpute(
    data.matrix(data_frame[, -1]),
    as.factor(data_frame[["severity"]]),
    iter = iter,
    ntree = ntree
  )
  # rfImpute() puts the response in column 1, recoded to factor codes 1/2;
  # shift back to the original 0/1 labels.
  transformed_data[, 1] <- transformed_data[, 1] - 1
  transformed_data <- as.data.frame(transformed_data)
  names(transformed_data)[[1]] <- "severity"
  transformed_data$severity <- as.factor(transformed_data$severity)
  transformed_data
}

Loading trained models

# Previously tuned mlr WrappedModel objects, fitted and saved in the
# training report: rpart decision tree, random forest, and xgboost.
tuned_tree_model <- readRDS('tuned_tree_model.RDS')
tuned_forest_model <- readRDS('tuned_forest_model.RDS')
tuned_xgb_model <- readRDS('tuned_xgb_model.RDS')

Tree model by the rpart algorithm

# Score the tuned rpart tree on the raw test set (no imputation needed:
# rpart handles NAs via surrogate splits). The tbl_df-to-data.frame
# conversion warning below is harmless.
predicted_mood_tree <- predict(tuned_tree_model, newdata = test_data)
## Warning in predict.WrappedModel(tuned_tree_model, newdata = test_data): Provided
## data for prediction is not a pure data.frame but from class tbl_df, hence it
## will be converted.
# NOTE(review): the variable `performance` shadows mlr::performance(); R still
# dispatches the call to the function, but a distinct name would be clearer.
performance <- performance(predicted_mood_tree, measures = list(mmce, acc))
performance
##       mmce        acc 
## 0.03074013 0.9692599
# Confusion matrix: rows = truth, columns = prediction; classes here are the
# FALSE/TRUE severity labels.
calculateConfusionMatrix(predicted_mood_tree)
##         predicted
## true     FALSE TRUE -err.-
##   FALSE    632    7      7
##   TRUE      14   30     14
##   -err.-    14    7     21

Random forest model

# In the course of building the training model, the factors got rewritten,
# so the simpler na.roughfix() route below was abandoned in favor of the
# rfImpute()-based helper defined above.
#mood_data_test_class_matrix <- data.matrix(test_data) %>% 
#  na.roughfix() 

# Random forest cannot predict on rows with NAs, so impute the test set
# first; the per-iteration OOB traces are printed by rfImpute() below.
mood_data_test_class_matrix <- impute_NAs(test_data)
## ntree      OOB      1      2
##   500:   6.59%  0.47% 95.45%
## ntree      OOB      1      2
##   500:   6.44%  0.31% 95.45%
## ntree      OOB      1      2
##   500:   6.44%  0.31% 95.45%
## ntree      OOB      1      2
##   500:   6.44%  0.31% 95.45%
## ntree      OOB      1      2
##   500:   6.59%  0.31% 97.73%
predicted_forest_tree <- predict(tuned_forest_model, newdata = mood_data_test_class_matrix)
performance <- performance(predicted_forest_tree, measures = list(mmce, acc))
performance
##       mmce        acc 
## 0.06442167 0.93557833
# NOTE(review): the forest predicts class 0 for every row (all 44 positives
# missed) — its accuracy is just the majority-class rate, so this model is
# degenerate on the test set despite the 93.6% accuracy.
calculateConfusionMatrix(predicted_forest_tree)
##         predicted
## true       0 1 -err.-
##   0      639 0      0
##   1       44 0     44
##   -err.-  44 0     44

XGBoost model

# Score the tuned xgboost model on the same imputed numeric test matrix
# used for the random forest (classes recoded to 0/1 during training).
predicted_mood_xgb <- predict(tuned_xgb_model, newdata = mood_data_test_class_matrix)
performance <- performance(predicted_mood_xgb, measures = list(mmce, acc))
performance
##       mmce        acc 
## 0.06149341 0.93850659
# Slightly better than the forest, but still only 7 of 44 positives caught.
calculateConfusionMatrix(predicted_mood_xgb)
##         predicted
## true       0 1 -err.-
##   0      634 5      5
##   1       37 7     37
##   -err.-  37 5     42

Assessing the quality of classifying the most severe cases

mood_test_full <- readRDS('mood_test_full.RDS')

Tree

most severe

# Recall (%) on the most severe cases (mood == 6): tree responses are
# "TRUE"/"FALSE", so as.logical() maps them to 1/0 before summing.
# NOTE(review): unlike the xgboost blocks below, the denominator here omits
# na.rm = TRUE — confirm mood has no NAs on these rows.
predicted_mood_tree$data$response[mood_test_full$mood == 6] %>% as.logical() %>% sum() / sum(mood_test_full$mood == 6) * 100
## [1] 75

less severe

# Recall (%) on the less severe positive cases (mood == 5), same
# TRUE/FALSE-to-1/0 trick as for mood == 6 above.
predicted_mood_tree$data$response[mood_test_full$mood == 5] %>% as.logical() %>% sum() / sum(mood_test_full$mood == 5) * 100
## [1] 66.66667

xgboost

most severe

# Recall (%) of xgboost on the most severe cases: responses are factor
# levels "0"/"1", so as.numeric() yields codes 1/2 and subtract(1)
# (magrittr) restores the 0/1 labels before summing.
predicted_mood_xgb$data$response[mood_test_full$mood == 6] %>% as.numeric %>% subtract(1) %>% sum() / sum(mood_test_full$mood == 6, na.rm = TRUE) * 100
## [1] 12.5

less severe

# Recall (%) of xgboost on the less severe positive cases (mood == 5),
# using the same factor-code-minus-one trick as for mood == 6 above.
predicted_mood_xgb$data$response[mood_test_full$mood == 5] %>% as.numeric %>% subtract(1) %>% sum() / sum(mood_test_full$mood == 5, na.rm = TRUE) * 100
## [1] 16.66667