The goal of this vignette is to explain how to use
ResamplingSameOtherSizesCV for various kinds of cross-validation.
We begin with a simple simulated data set.
# Simulate N feature values, drawn uniformly from [-abs.x, abs.x].
N <- 2100
abs.x <- 70
# Fixed seed so that the simulated values are reproducible.
set.seed(2)
x.vec <- runif(n = N, min = -abs.x, max = abs.x)
str(x.vec)
#>  num [1:2100] -44.1 28.3 10.3 -46.5 62.1 ...
library(data.table)
# Simulated regression data: feature x, and label y which is a sine
# wave of x plus Gaussian noise (standard deviation 0.5).
(task.dt <- data.table(
  x = x.vec,
  y = sin(x.vec) + rnorm(N, sd = 0.5)
))
#>               x           y
#>           <num>       <num>
#>    1: -44.11648 -0.40781530
#>    2:  28.33237 -0.08520601
#>    3:  10.26569 -1.23266284
#>    4: -46.47273 -1.36225125
#>    5:  62.13751 -1.33779346
#>   ---                      
#> 2096:  60.83765 -0.10678010
#> 2097:  55.71469 -0.92403513
#> 2098:  14.31045  1.04519820
#> 2099:  27.18008  1.67815828
#> 2100:  23.67202 -0.26881102
# Scatterplot of the simulated data (drawn only if ggplot2 can be loaded).
# text.size and my_theme are defined here and re-used by later figures.
if (require(ggplot2)) {
  text.size <- 6
  my_theme <- theme_bw(base_size = 20)
  theme_set(my_theme)
  ggplot() +
    geom_point(
      mapping = aes(x = x, y = y),
      data = task.dt,
      shape = 1
    )
}
#> Le chargement a nécessité le package : ggplot2
Above we see a scatterplot of the simulated data. The goal of the learning algorithm will be to predict y from x.
The code below assigns three test groups to the randomly simulated data.
# Each run of atomic.group.size consecutive rows shares one agroup id
# (this column is used below as the group role of the task).
atomic.group.size <- 2
task.dt[
  , agroup := rep(
    seq(from = 1, to = N / atomic.group.size),
    each = atomic.group.size
  )
][]
#>               x           y agroup
#>           <num>       <num>  <int>
#>    1: -44.11648 -0.40781530      1
#>    2:  28.33237 -0.08520601      1
#>    3:  10.26569 -1.23266284      2
#>    4: -46.47273 -1.36225125      2
#>    5:  62.13751 -1.33779346      3
#>   ---                             
#> 2096:  60.83765 -0.10678010   1048
#> 2097:  55.71469 -0.92403513   1049
#> 2098:  14.31045  1.04519820   1049
#> 2099:  27.18008  1.67815828   1050
#> 2100:  23.67202 -0.26881102   1050
# Assign a group label (A, B or C) to every row, repeating the pattern
# A,B,B,C,C,C,C (each label covering atomic.group.size rows) until all
# .N rows are labelled. The argument is spelled out as length.out
# rather than relying on partial matching of the abbreviation `l`.
task.dt[, random_group := rep(
  rep(c("A","B","B","C","C","C","C"), each=atomic.group.size),
  length.out=.N
)][]
#>               x           y agroup random_group
#>           <num>       <num>  <int>       <char>
#>    1: -44.11648 -0.40781530      1            A
#>    2:  28.33237 -0.08520601      1            A
#>    3:  10.26569 -1.23266284      2            B
#>    4: -46.47273 -1.36225125      2            B
#>    5:  62.13751 -1.33779346      3            B
#>   ---                                          
#> 2096:  60.83765 -0.10678010   1048            C
#> 2097:  55.71469 -0.92403513   1049            C
#> 2098:  14.31045  1.04519820   1049            C
#> 2099:  27.18008  1.67815828   1050            C
#> 2100:  23.67202 -0.26881102   1050            C
# Count rows per random group. Note that the assignment happens inside
# the call, so group.tab holds the raw random_group column, not the counts.
table(group.tab <- task.dt$random_group)
#> 
#>    A    B    C 
#>  300  600 1200
The output above shows the number of rows in each random group. Below we define a task,
# Regression task on the simulated data, with target y.
reg.task <- mlr3::TaskRegr$new(
  id = "sin",
  backend = task.dt,
  target = "y"
)
# Column roles: agroup is the group, random_group is the stratum,
# and x is the only feature.
reg.task$col_roles$group <- "agroup"
reg.task$col_roles$stratum <- "random_group"
reg.task$col_roles$feature <- "x"
Note that if we assign the subset role at this point, we will get an
error, because this is not a standard mlr3 role.
# Intentionally raises an error: at this point "subset" is not among
# the accepted column role names.
reg.task$col_roles$subset <- "random_group" 
#> Error in .__Task__col_roles(self = self, private = private, super = super, : Assertion on 'names(rhs)' failed: Names must be a permutation of set {'feature','target','name','order','stratum','group','offset','weights_learner','weights_measure'}, but has extra elements {'subset'}.
Below we define the cross-validation object, which loads the
mlr3resampling package, and then we assign the random group column to
be used as the subset role.
# Create the resampling object first (this loads mlr3resampling), then
# the subset role can be assigned to the task.
same_other_sizes_cv <-
  mlr3resampling::ResamplingSameOtherSizesCV$new()
reg.task$col_roles[["subset"]] <- "random_group"
Below we instantiate the resampler, in
order to show details about how it works (but normally you should not
instantiate it yourself, as this will be done automatically inside the
call to mlr3::benchmark).
# Compute the train/test splits for this task, then print the table
# which has one row per split.
same_other_sizes_cv$instantiate(task = reg.task)
same_other_sizes_cv$instance[["iteration.dt"]]
#>     test.subset train.subsets groups test.fold                        test
#>          <char>        <char>  <int>     <int>                      <list>
#>  1:           A           all    700         1       43,44,57,58,71,72,...
#>  2:           B           all    700         1        3, 4, 5, 6,17,18,...
#>  3:           C           all    700         1       23,24,25,26,37,38,...
#>  4:           A           all    700         2        1, 2,15,16,29,30,...
#>  5:           B           all    700         2       33,34,47,48,61,62,...
#>  6:           C           all    700         2       13,14,21,22,35,36,...
#>  7:           A           all    700         3  99,100,155,156,169,170,...
#>  8:           B           all    700         3       19,20,45,46,75,76,...
#>  9:           C           all    700         3        7, 8, 9,10,11,12,...
#> 10:           A         other    600         1       43,44,57,58,71,72,...
#> 11:           B         other    500         1        3, 4, 5, 6,17,18,...
#> 12:           C         other    300         1       23,24,25,26,37,38,...
#> 13:           A         other    600         2        1, 2,15,16,29,30,...
#> 14:           B         other    500         2       33,34,47,48,61,62,...
#> 15:           C         other    300         2       13,14,21,22,35,36,...
#> 16:           A         other    600         3  99,100,155,156,169,170,...
#> 17:           B         other    500         3       19,20,45,46,75,76,...
#> 18:           C         other    300         3        7, 8, 9,10,11,12,...
#> 19:           A          same    100         1       43,44,57,58,71,72,...
#> 20:           B          same    200         1        3, 4, 5, 6,17,18,...
#> 21:           C          same    400         1       23,24,25,26,37,38,...
#> 22:           A          same    100         2        1, 2,15,16,29,30,...
#> 23:           B          same    200         2       33,34,47,48,61,62,...
#> 24:           C          same    400         2       13,14,21,22,35,36,...
#> 25:           A          same    100         3  99,100,155,156,169,170,...
#> 26:           B          same    200         3       19,20,45,46,75,76,...
#> 27:           C          same    400         3        7, 8, 9,10,11,12,...
#>     test.subset train.subsets groups test.fold                        test
#>          <char>        <char>  <int>     <int>                      <list>
#>                     train  seed n.train.groups iteration
#>                    <list> <int>          <int>     <int>
#>  1:  1, 2, 7, 8, 9,10,...     1            700         1
#>  2:  1, 2, 7, 8, 9,10,...     1            700         2
#>  3:  1, 2, 7, 8, 9,10,...     1            700         3
#>  4:       3,4,5,6,7,8,...     1            700         4
#>  5:       3,4,5,6,7,8,...     1            700         5
#>  6:       3,4,5,6,7,8,...     1            700         6
#>  7:       1,2,3,4,5,6,...     1            700         7
#>  8:       1,2,3,4,5,6,...     1            700         8
#>  9:       1,2,3,4,5,6,...     1            700         9
#> 10:  7, 8, 9,10,11,12,...     1            600        10
#> 11:  1, 2, 7, 8, 9,10,...     1            500        11
#> 12:  1, 2,15,16,19,20,...     1            300        12
#> 13:       3,4,5,6,7,8,...     1            600        13
#> 14:  7, 8, 9,10,11,12,...     1            500        14
#> 15:  3, 4, 5, 6,17,18,...     1            300        15
#> 16:  3, 4, 5, 6,13,14,...     1            600        16
#> 17:  1, 2,13,14,15,16,...     1            500        17
#> 18:       1,2,3,4,5,6,...     1            300        18
#> 19:  1, 2,15,16,29,30,...     1            100        19
#> 20: 19,20,33,34,45,46,...     1            200        20
#> 21:  7, 8, 9,10,11,12,...     1            400        21
#> 22: 43,44,57,58,71,72,...     1            100        22
#> 23:  3, 4, 5, 6,17,18,...     1            200        23
#> 24:  7, 8, 9,10,11,12,...     1            400        24
#> 25:  1, 2,15,16,29,30,...     1            100        25
#> 26:  3, 4, 5, 6,17,18,...     1            200        26
#> 27: 13,14,21,22,23,24,...     1            400        27
#>                     train  seed n.train.groups iteration
#>                    <list> <int>          <int>     <int>
So using the K-fold cross-validation, we will do one train/test split for each row of the table above. There is one row for each combination of test subset (A, B, C), train subset (same, other, all), and test fold (1, 2, 3).
We compute and plot the results using the code below,
# Learners to compare; the featureless learner serves as the baseline.
(reg.learner.list <- list(
  mlr3::LearnerRegrFeatureless$new()
))
#> [[1]]
#> 
#> ── <LearnerRegrFeatureless> (regr.featureless): Featureless Regression Learner ─
#> • Model: -
#> • Parameters: robust=FALSE
#> • Packages: mlr3 and stats
#> • Predict Types: [response], se, and quantiles
#> • Feature Types: logical, integer, numeric, character, factor, ordered,
#> POSIXct, and Date
#> • Encapsulation: none (fallback: -)
#> • Properties: featureless, importance, missings, selected_features, and weights
#> • Other settings: use_weights = 'use'
# Add the rpart learner only when the rpart package is installed.
if (requireNamespace("rpart")) {
  reg.learner.list[["rpart"]] <- mlr3::LearnerRegrRpart$new()
}
#> Le chargement a nécessité le package : rpart
# Benchmark design: every learner on the task, using the
# same/other/all resampling.
(same.other.grid <- mlr3::benchmark_grid(
  tasks = reg.task,
  learners = reg.learner.list,
  resamplings = same_other_sizes_cv
))
#>      task          learner          resampling
#>    <char>           <char>              <char>
#> 1:    sin regr.featureless same_other_sizes_cv
#> 2:    sin       regr.rpart same_other_sizes_cv
# Optional parallel computation (left disabled):
##if(require(future))plan("multisession")
# Only show mlr3 log messages at level "warn" or above.
lgr::get_logger("mlr3")$set_threshold("warn")
# Run every train/test split in the design, keeping the fitted models.
(same.other.result <- mlr3::benchmark(
  design = same.other.grid,
  store_models = TRUE
))
#> 
#> ── <BenchmarkResult> of 54 rows with 2 resampling run ──────────────────────────
#>  nr task_id       learner_id       resampling_id iters warnings errors
#>   1     sin regr.featureless same_other_sizes_cv    27        0      0
#>   2     sin       regr.rpart same_other_sizes_cv    27        0      0
# Compute test RMSE for each train/test split, then draw the plot
# provided by the package, using the theme defined earlier.
same.other.score <- mlr3resampling::score(
  same.other.result,
  mlr3::msr("regr.rmse")
)
plot(same.other.score) + my_theme
The plot method above shows a multi-panel figure (vertical facet for each algorithm), whereas below we make a custom ggplot with no vertical facets, and color for algorithm.
# Number of train rows in each split. lengths() always returns an
# integer vector, whereas sapply(train, length) would return a list if
# the table were empty.
same.other.score[, n.train := lengths(train)]
#> Warning in `[.data.table`(same.other.score, , `:=`(n.train, sapply(train, : A
#> shallow copy of this data.table was taken so that := can add or remove 1
#> columns by reference. At an earlier point, this data.table was copied by R (or
#> was created manually using structure() or similar). Avoid names<- and attr<-
#> which in R currently (and oddly) may copy the whole data.table. Use set* syntax
#> instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
#> for data.table-agnostic packages to produce tables affected by this issue. If
#> this message doesn't help, please report your use case to the data.table issue
#> tracker so the root cause can be fixed or this message improved.
# Show the first row of the score table, to see all available columns.
same.other.score[1]
#>    test.subset train.subsets groups test.fold                  test
#>         <char>        <char>  <int>     <int>                <list>
#> 1:           A           all    700         1 43,44,57,58,71,72,...
#>                    train  seed n.train.groups iteration
#>                   <list> <int>          <int>     <int>
#> 1:  1, 2, 7, 8, 9,10,...     1            700         1
#>                                   uhash    nr           task task_id
#>                                  <char> <int>         <list>  <char>
#> 1: 71f8e485-3b1e-4f51-b80f-6c67241d614c     1 <TaskRegr:sin>     sin
#>                                      learner       learner_id
#>                                       <list>           <char>
#> 1: <LearnerRegrFeatureless:regr.featureless> regr.featureless
#>                      resampling       resampling_id  prediction_test regr.rmse
#>                          <list>              <char>           <list>     <num>
#> 1: <ResamplingSameOtherSizesCV> same_other_sizes_cv <PredictionRegr> 0.9054364
#>      algorithm n.train
#>         <char>   <int>
#> 1: featureless    1400
# One point per train/test split (color = algorithm), one panel per
# test subset, with a text label giving the train set size.
if (require(ggplot2)) {
  ggplot() +
    geom_point(
      mapping = aes(x = regr.rmse, y = train.subsets, color = algorithm),
      data = same.other.score,
      shape = 1
    ) +
    # Rows of one algorithm and one fold are used for the labels, so
    # each label is drawn once per panel and train subset.
    geom_text(
      mapping = aes(
        x = Inf,
        y = train.subsets,
        label = sprintf("n.train=%d ", n.train)
      ),
      data = same.other.score[algorithm == "featureless" & test.fold == 1],
      size = text.size,
      hjust = 1,
      vjust = 1.5
    ) +
    facet_grid(. ~ test.subset, labeller = label_both, scales = "free") +
    scale_x_continuous(
      name = "Root mean squared prediction error (test set)"
    )
}
The figure above shows the effect of train set size on test error.
# Mean and standard deviation of test RMSE over folds, for each
# combination of algorithm, test subset and train subsets.
same.other.wide <- dcast(
  same.other.score,
  algorithm + test.subset + train.subsets ~ .,
  list(mean, sd),
  value.var = "regr.rmse"
)
# Plot mean (point) plus/minus one standard deviation (segment).
if (require(ggplot2)) {
  ggplot() +
    geom_segment(
      mapping = aes(
        x = regr.rmse_mean + regr.rmse_sd,
        y = train.subsets,
        xend = regr.rmse_mean - regr.rmse_sd,
        yend = train.subsets,
        color = algorithm
      ),
      data = same.other.wide
    ) +
    geom_point(
      mapping = aes(
        x = regr.rmse_mean,
        y = train.subsets,
        color = algorithm
      ),
      data = same.other.wide,
      shape = 1
    ) +
    geom_text(
      mapping = aes(
        x = Inf,
        y = train.subsets,
        label = sprintf("n.train=%d ", n.train)
      ),
      data = same.other.score[algorithm == "featureless" & test.fold == 1],
      size = text.size,
      hjust = 1,
      vjust = 1.5
    ) +
    facet_grid(. ~ test.subset, labeller = label_both, scales = "free") +
    scale_x_continuous(
      name = "Root mean squared prediction error (test set)"
    )
}
The figure above shows a test subset in each panel, the train subsets on the y axis, and the test error on the x axis; the two different algorithms are shown in two different colors. We can clearly see that

* For train.subsets=same, test error is largest, sometimes almost as
  large as featureless, which is the error rate when no relationship
  has been learned between inputs and outputs (not enough data).
* For train.subsets=other, rpart test error is significantly smaller
  than featureless, indicating that some non-trivial relationship
  between inputs and outputs has been learned. Sometimes other has
  larger error than same, sometimes smaller (depending on sample
  size).
* For train.subsets=all, rpart test error tends to be minimal, which
  indicates that combining all of the subsets is beneficial in this
  case (when the pattern is exactly the same in the different
  subsets).

Overall in the plot above, all tends to have less prediction error than same, which suggests that the subsets are similar (and indeed the subsets are i.i.d. in this simulation). Another visualization method is shown below,
# Compute P-values from the score table, then plot them using the
# theme defined earlier.
plist <- mlr3resampling::pvalue(
  same.other.score,
  digits = 3
)
plot(plist) + my_theme
The visualization above includes P-values (two-sided T-test) for the differences between Same and Other/All.
Below we visualize test error as a function of train size.
# Test error as a function of train set size: one line per algorithm
# and test fold, with labels showing which train subsets were used.
if (require(ggplot2)) {
  ggplot() +
    geom_line(
      mapping = aes(
        x = n.train,
        y = regr.rmse,
        color = algorithm,
        group = paste(algorithm, test.fold)
      ),
      data = same.other.score
    ) +
    geom_label(
      mapping = aes(
        x = n.train,
        y = regr.rmse,
        color = algorithm,
        label = train.subsets
      ),
      data = same.other.score,
      size = text.size
    ) +
    facet_grid(. ~ test.subset, labeller = label_both, scales = "free") +
    scale_y_continuous(
      name = "Root mean squared prediction error (test set)"
    )
}
In the previous section we defined a task using the subset role,
which means that the different values in that column will be used to
define different subsets for training/testing using same/other/all CV.
In contrast, below we define a task without the subset role, which
means that we will not have separate CV iterations for same/other/all
(full data is treated as one subset / train subset is same).
# Same data and roles as before, except that no subset role is assigned.
task.no.subset <- mlr3::TaskRegr$new(
  id = "sin",
  backend = task.dt,
  target = "y"
)
task.no.subset$col_roles$group <- "agroup"
task.no.subset$col_roles$stratum <- "random_group"
task.no.subset$col_roles$feature <- "x"
# Display all roles; subset should be empty.
str(task.no.subset$col_roles)
#> List of 10
#>  $ feature        : chr "x"
#>  $ target         : chr "y"
#>  $ name           : chr(0) 
#>  $ order          : chr(0) 
#>  $ stratum        : chr "random_group"
#>  $ group          : chr "agroup"
#>  $ offset         : chr(0) 
#>  $ weights_learner: chr(0) 
#>  $ weights_measure: chr(0) 
#>  $ subset         : chr(0)
Below we define cross-validation, and we set the sizes to 5 so we
can see what happens when we have train sets of 5 different sizes,
each smaller than the full train set size.
# New resampling object with sizes=5, instantiated on the task which
# has no subset role; then print the table of train/test splits.
same_other_sizes_cv <-
  mlr3resampling::ResamplingSameOtherSizesCV$new()
same_other_sizes_cv$param_set$values$sizes <- 5
same_other_sizes_cv$instantiate(task = task.no.subset)
same_other_sizes_cv$instance[["iteration.dt"]]
#>     test.subset train.subsets groups test.fold                  test
#>          <char>        <char>  <int>     <int>                <list>
#>  1:        full          same    700         1  3, 4,13,14,15,16,...
#>  2:        full          same    700         1  3, 4,13,14,15,16,...
#>  3:        full          same    700         1  3, 4,13,14,15,16,...
#>  4:        full          same    700         1  3, 4,13,14,15,16,...
#>  5:        full          same    700         1  3, 4,13,14,15,16,...
#>  6:        full          same    700         1  3, 4,13,14,15,16,...
#>  7:        full          same    700         2  1, 2,17,18,21,22,...
#>  8:        full          same    700         2  1, 2,17,18,21,22,...
#>  9:        full          same    700         2  1, 2,17,18,21,22,...
#> 10:        full          same    700         2  1, 2,17,18,21,22,...
#> 11:        full          same    700         2  1, 2,17,18,21,22,...
#> 12:        full          same    700         2  1, 2,17,18,21,22,...
#> 13:        full          same    700         3  5, 6, 7, 8, 9,10,...
#> 14:        full          same    700         3  5, 6, 7, 8, 9,10,...
#> 15:        full          same    700         3  5, 6, 7, 8, 9,10,...
#> 16:        full          same    700         3  5, 6, 7, 8, 9,10,...
#> 17:        full          same    700         3  5, 6, 7, 8, 9,10,...
#> 18:        full          same    700         3  5, 6, 7, 8, 9,10,...
#>                           train  seed n.train.groups iteration
#>                          <list> <int>          <int>     <int>
#>  1: 565,566,583,584,743,744,...     1             21         1
#>  2: 133,134,171,172,305,306,...     1             43         2
#>  3:       77,78,93,94,95,96,...     1             87         3
#>  4:        7, 8,25,26,29,30,...     1            175         4
#>  5:        1, 2, 7, 8,17,18,...     1            350         5
#>  6:             1,2,5,6,7,8,...     1            700         6
#>  7:  39, 40,109,110,285,286,...     1             21         7
#>  8:       29,30,37,38,39,40,...     1             43         8
#>  9:       29,30,37,38,39,40,...     1             87         9
#> 10:        3, 4,13,14,23,24,...     1            175        10
#> 11:        3, 4,13,14,19,20,...     1            350        11
#> 12:             3,4,5,6,7,8,...     1            700        12
#> 13: 209,210,397,398,519,520,...     1             21        13
#> 14: 139,140,209,210,343,344,...     1             43        14
#> 15:        1, 2,17,18,43,44,...     1             87        15
#> 16:        1, 2,17,18,37,38,...     1            175        16
#> 17:        1, 2,13,14,17,18,...     1            350        17
#> 18:        1, 2, 3, 4,13,14,...     1            700        18
So using the K-fold cross-validation, we will do one train/test split
for each row of the table above. There is one row for each combination
of n.train.groups (full train set size + 5 smaller sizes), and test
fold (1, 2, 3).
We compute and plot the results using the code below,
# Start the learner list again with the featureless baseline.
(reg.learner.list <- list(
  mlr3::LearnerRegrFeatureless$new()
))
#> [[1]]
#> 
#> ── <LearnerRegrFeatureless> (regr.featureless): Featureless Regression Learner ─
#> • Model: -
#> • Parameters: robust=FALSE
#> • Packages: mlr3 and stats
#> • Predict Types: [response], se, and quantiles
#> • Feature Types: logical, integer, numeric, character, factor, ordered,
#> POSIXct, and Date
#> • Encapsulation: none (fallback: -)
#> • Properties: featureless, importance, missings, selected_features, and weights
#> • Other settings: use_weights = 'use'
# Add the rpart learner only when the rpart package is installed.
if (requireNamespace("rpart")) {
  reg.learner.list[["rpart"]] <- mlr3::LearnerRegrRpart$new()
}
# Benchmark design: every learner on the task without subset role,
# using the resampling with several train set sizes.
(same.other.grid <- mlr3::benchmark_grid(
  tasks = task.no.subset,
  learners = reg.learner.list,
  resamplings = same_other_sizes_cv
))
#>      task          learner          resampling
#>    <char>           <char>              <char>
#> 1:    sin regr.featureless same_other_sizes_cv
#> 2:    sin       regr.rpart same_other_sizes_cv
# Optional parallel computation (left disabled):
##if(require(future))plan("multisession")
# Only show mlr3 log messages at level "warn" or above.
lgr::get_logger("mlr3")$set_threshold("warn")
# Run every train/test split in the design, keeping the fitted models.
(same.other.result <- mlr3::benchmark(
  design = same.other.grid,
  store_models = TRUE
))
#> 
#> ── <BenchmarkResult> of 36 rows with 2 resampling run ──────────────────────────
#>  nr task_id       learner_id       resampling_id iters warnings errors
#>   1     sin regr.featureless same_other_sizes_cv    18        0      0
#>   2     sin       regr.rpart same_other_sizes_cv    18        0      0
# Compute test RMSE for each train/test split.
same.other.score <- mlr3resampling::score(
  same.other.result, mlr3::msr("regr.rmse"))
# Number of train rows in each split. lengths() always returns an
# integer vector, whereas sapply(train, length) would return a list if
# the table were empty.
same.other.score[, n.train := lengths(train)]
#> Warning in `[.data.table`(same.other.score, , `:=`(n.train, sapply(train, : A
#> shallow copy of this data.table was taken so that := can add or remove 1
#> columns by reference. At an earlier point, this data.table was copied by R (or
#> was created manually using structure() or similar). Avoid names<- and attr<-
#> which in R currently (and oddly) may copy the whole data.table. Use set* syntax
#> instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
#> for data.table-agnostic packages to produce tables affected by this issue. If
#> this message doesn't help, please report your use case to the data.table issue
#> tracker so the root cause can be fixed or this message improved.
# Show the first row of the score table, to see all available columns.
same.other.score[1]
#>    test.subset train.subsets groups test.fold                  test
#>         <char>        <char>  <int>     <int>                <list>
#> 1:        full          same    700         1  3, 4,13,14,15,16,...
#>                          train  seed n.train.groups iteration
#>                         <list> <int>          <int>     <int>
#> 1: 565,566,583,584,743,744,...     1             21         1
#>                                   uhash    nr           task task_id
#>                                  <char> <int>         <list>  <char>
#> 1: 1dee4ec1-7ba0-4bb8-b4eb-e3de9472c522     1 <TaskRegr:sin>     sin
#>                                      learner       learner_id
#>                                       <list>           <char>
#> 1: <LearnerRegrFeatureless:regr.featureless> regr.featureless
#>                      resampling       resampling_id  prediction_test regr.rmse
#>                          <list>              <char>           <list>     <num>
#> 1: <ResamplingSameOtherSizesCV> same_other_sizes_cv <PredictionRegr> 0.8746503
#>      algorithm n.train
#>         <char>   <int>
#> 1: featureless      42
# Test error as a function of the number of train rows (log scale),
# with one line per algorithm and test fold.
if (require(ggplot2)) {
  ggplot() +
    geom_line(
      mapping = aes(
        x = n.train,
        y = regr.rmse,
        color = algorithm,
        group = paste(algorithm, test.fold)
      ),
      data = same.other.score
    ) +
    geom_point(
      mapping = aes(
        x = n.train,
        y = regr.rmse,
        color = algorithm
      ),
      data = same.other.score
    ) +
    facet_grid(. ~ test.subset, labeller = label_both, scales = "free") +
    # Put one axis break at each train set size which was used.
    scale_x_log10(
      name = "Number of train rows",
      breaks = unique(same.other.score$n.train)
    ) +
    scale_y_continuous(
      name = "Root mean squared prediction error (test set)"
    )
}
From the plot above, it looks like about 700 rows is enough to get minimal test error, using the rpart learner.
# Second simulation: N sorted feature values, drawn uniformly from
# [-abs.x, abs.x].
N <- 600
abs.x <- 20
# Fixed seed so that the simulated values are reproducible.
set.seed(1)
x.vec <- sort(runif(n = N, min = -abs.x, max = abs.x))
str(x.vec)
#>  num [1:600] -19.9 -19.9 -19.7 -19.6 -19.6 ...
library(data.table)
# Simulated regression data: feature x, and label y which is a sine
# wave of x plus Gaussian noise (standard deviation 0.5).
(task.dt <- data.table(
  x = x.vec,
  y = sin(x.vec) + rnorm(N, sd = 0.5)
))
#>              x          y
#>          <num>      <num>
#>   1: -19.92653 -0.4336887
#>   2: -19.92269 -1.4023484
#>   3: -19.67486  0.2509134
#>   4: -19.55856 -0.8428921
#>   5: -19.55402  0.1794473
#>  ---                     
#> 596:  19.70736  0.7497818
#> 597:  19.74997  0.3178435
#> 598:  19.75656  1.3950030
#> 599:  19.83862 -0.2086586
#> 600:  19.84309  0.5748863
# Scatterplot of the new simulated data, with equal scaling of the
# x and y axes.
if (require(ggplot2)) {
  ggplot() +
    geom_point(
      mapping = aes(x = x, y = y),
      data = task.dt,
      shape = 1
    ) +
    coord_equal()
}
# Each run of atomic.subset.size consecutive rows shares one agroup id
# (this column is used below as the group role of the task).
atomic.subset.size <- 2
task.dt[
  , agroup := rep(
    seq(from = 1, to = N / atomic.subset.size),
    each = atomic.subset.size
  )
][]
#>              x          y agroup
#>          <num>      <num>  <int>
#>   1: -19.92653 -0.4336887      1
#>   2: -19.92269 -1.4023484      1
#>   3: -19.67486  0.2509134      2
#>   4: -19.55856 -0.8428921      2
#>   5: -19.55402  0.1794473      3
#>  ---                            
#> 596:  19.70736  0.7497818    298
#> 597:  19.74997  0.3178435    299
#> 598:  19.75656  1.3950030    299
#> 599:  19.83862 -0.2086586    300
#> 600:  19.84309  0.5748863    300
# Assign a subset label (A or B) to every row, repeating the pattern
# A,B,B,B (each label covering atomic.subset.size rows) until all .N
# rows are labelled. The argument is spelled out as length.out rather
# than relying on partial matching of the abbreviation `l`.
task.dt[, random_subset := rep(
  rep(c("A","B","B","B"), each=atomic.subset.size),
  length.out=.N
)][]
#>              x          y agroup random_subset
#>          <num>      <num>  <int>        <char>
#>   1: -19.92653 -0.4336887      1             A
#>   2: -19.92269 -1.4023484      1             A
#>   3: -19.67486  0.2509134      2             B
#>   4: -19.55856 -0.8428921      2             B
#>   5: -19.55402  0.1794473      3             B
#>  ---                                          
#> 596:  19.70736  0.7497818    298             B
#> 597:  19.74997  0.3178435    299             B
#> 598:  19.75656  1.3950030    299             B
#> 599:  19.83862 -0.2086586    300             B
#> 600:  19.84309  0.5748863    300             B
# Count rows per random subset. Note that the assignment happens inside
# the call, so subset.tab holds the raw random_subset column, not the counts.
table(subset.tab <- task.dt$random_subset)
#> 
#>   A   B 
#> 150 450
# Regression task on the new simulated data, with target y.
reg.task <- mlr3::TaskRegr$new(
  id = "sin",
  backend = task.dt,
  target = "y"
)
# Column roles: random_subset is both the subset and the stratum,
# agroup is the group, and x is the only feature.
reg.task$col_roles$subset <- "random_subset"
reg.task$col_roles$group <- "agroup"
reg.task$col_roles$stratum <- "random_subset"
reg.task$col_roles$feature <- "x"
# New resampling object, with default parameter values.
same_other_sizes_cv <-
  mlr3resampling::ResamplingSameOtherSizesCV$new()
In the previous section we analyzed prediction accuracy of
same/other/all, which corresponds to keeping sizes parameter at
default of -1.  The main difference in this section is that we change
sizes to 0, which means to down-sample same/other/all, so we can see
if there is an effect for sample size (there should be for iid
problems with intermediate difficulty). We set sizes to 0 in the next
line:
# Set sizes to 0, then compute the train/test splits for the task.
same_other_sizes_cv$param_set$values$sizes <- 0
same_other_sizes_cv$instantiate(reg.task)
# Print the table which has one row per train/test split. The element
# name is written in full, rather than relying on partial matching of
# the abbreviation `it` by the $ operator.
same_other_sizes_cv$instance$iteration.dt
#>     test.subset train.subsets groups test.fold                  test
#>          <char>        <char>  <int>     <int>                <list>
#>  1:           A           all    200         1  1, 2,49,50,57,58,...
#>  2:           A           all    200         1  1, 2,49,50,57,58,...
#>  3:           A           all    200         1  1, 2,49,50,57,58,...
#>  4:           B           all    200         1 19,20,31,32,37,38,...
#>  5:           B           all    200         1 19,20,31,32,37,38,...
#>  6:           B           all    200         1 19,20,31,32,37,38,...
#>  7:           A           all    200         2 17,18,41,42,89,90,...
#>  8:           A           all    200         2 17,18,41,42,89,90,...
#>  9:           A           all    200         2 17,18,41,42,89,90,...
#> 10:           B           all    200         2       3,4,5,6,7,8,...
#> 11:           B           all    200         2       3,4,5,6,7,8,...
#> 12:           B           all    200         2       3,4,5,6,7,8,...
#> 13:           A           all    200         3  9,10,25,26,33,34,...
#> 14:           A           all    200         3  9,10,25,26,33,34,...
#> 15:           A           all    200         3  9,10,25,26,33,34,...
#> 16:           B           all    200         3 15,16,21,22,23,24,...
#> 17:           B           all    200         3 15,16,21,22,23,24,...
#> 18:           B           all    200         3 15,16,21,22,23,24,...
#> 19:           A         other    150         1  1, 2,49,50,57,58,...
#> 20:           A         other    150         1  1, 2,49,50,57,58,...
#> 21:           B         other     50         1 19,20,31,32,37,38,...
#> 22:           A         other    150         2 17,18,41,42,89,90,...
#> 23:           A         other    150         2 17,18,41,42,89,90,...
#> 24:           B         other     50         2       3,4,5,6,7,8,...
#> 25:           A         other    150         3  9,10,25,26,33,34,...
#> 26:           A         other    150         3  9,10,25,26,33,34,...
#> 27:           B         other     50         3 15,16,21,22,23,24,...
#> 28:           A          same     50         1  1, 2,49,50,57,58,...
#> 29:           B          same    150         1 19,20,31,32,37,38,...
#> 30:           B          same    150         1 19,20,31,32,37,38,...
#> 31:           A          same     50         2 17,18,41,42,89,90,...
#> 32:           B          same    150         2       3,4,5,6,7,8,...
#> 33:           B          same    150         2       3,4,5,6,7,8,...
#> 34:           A          same     50         3  9,10,25,26,33,34,...
#> 35:           B          same    150         3 15,16,21,22,23,24,...
#> 36:           B          same    150         3 15,16,21,22,23,24,...
#>     test.subset train.subsets groups test.fold                  test
#>          <char>        <char>  <int>     <int>                <list>
#>                     train  seed n.train.groups iteration
#>                    <list> <int>          <int>     <int>
#>  1:  5, 6, 9,10,15,16,...     1             50         1
#>  2:       3,4,5,6,7,8,...     1            150         2
#>  3:       3,4,5,6,7,8,...     1            200         3
#>  4:  3, 4, 7, 8,15,16,...     1             50         4
#>  5:       3,4,5,6,7,8,...     1            150         5
#>  6:       3,4,5,6,7,8,...     1            200         6
#>  7:  1, 2,35,36,39,40,...     1             50         7
#>  8:  1, 2, 9,10,19,20,...     1            150         8
#>  9:  1, 2, 9,10,15,16,...     1            200         9
#> 10: 19,20,63,64,73,74,...     1             50        10
#> 11:  1, 2, 9,10,15,16,...     1            150        11
#> 12:  1, 2, 9,10,15,16,...     1            200        12
#> 13: 29,30,37,38,49,50,...     1             50        13
#> 14:  5, 6,11,12,13,14,...     1            150        14
#> 15:       1,2,3,4,5,6,...     1            200        15
#> 16: 13,14,29,30,49,50,...     1             50        16
#> 17:       1,2,3,4,5,6,...     1            150        17
#> 18:       1,2,3,4,5,6,...     1            200        18
#> 19: 15,16,21,22,55,56,...     1             50        19
#> 20:       3,4,5,6,7,8,...     1            150        20
#> 21:  9,10,17,18,25,26,...     1             50        21
#> 22: 15,16,19,20,23,24,...     1             50        22
#> 23: 15,16,19,20,21,22,...     1            150        23
#> 24:  1, 2, 9,10,25,26,...     1             50        24
#> 25: 11,12,19,20,27,28,...     1             50        25
#> 26:       3,4,5,6,7,8,...     1            150        26
#> 27:  1, 2,17,18,41,42,...     1             50        27
#> 28:  9,10,17,18,25,26,...     1             50        28
#> 29: 59,60,63,64,75,76,...     1             50        29
#> 30:       3,4,5,6,7,8,...     1            150        30
#> 31:  1, 2, 9,10,25,26,...     1             50        31
#> 32: 23,24,37,38,51,52,...     1             50        32
#> 33: 15,16,19,20,21,22,...     1            150        33
#> 34:  1, 2,17,18,41,42,...     1             50        34
#> 35: 11,12,19,20,45,46,...     1             50        35
#> 36:       3,4,5,6,7,8,...     1            150        36
#>                     train  seed n.train.groups iteration
#>                    <list> <int>          <int>     <int>
(reg.learner.list <- list(
  mlr3::LearnerRegrFeatureless$new()))
#> [[1]]
#> 
#> ── <LearnerRegrFeatureless> (regr.featureless): Featureless Regression Learner ─
#> • Model: -
#> • Parameters: robust=FALSE
#> • Packages: mlr3 and stats
#> • Predict Types: [response], se, and quantiles
#> • Feature Types: logical, integer, numeric, character, factor, ordered,
#> POSIXct, and Date
#> • Encapsulation: none (fallback: -)
#> • Properties: featureless, importance, missings, selected_features, and weights
#> • Other settings: use_weights = 'use'
if(requireNamespace("rpart")){
  reg.learner.list$rpart <- mlr3::LearnerRegrRpart$new()
}
(same.other.grid <- mlr3::benchmark_grid(
  reg.task,
  reg.learner.list,
  same_other_sizes_cv))
#>      task          learner          resampling
#>    <char>           <char>              <char>
#> 1:    sin regr.featureless same_other_sizes_cv
#> 2:    sin       regr.rpart same_other_sizes_cv
##if(require(future))plan("multisession")
lgr::get_logger("mlr3")$set_threshold("warn")
(same.other.result <- mlr3::benchmark(
  same.other.grid, store_models = TRUE))
#> 
#> ── <BenchmarkResult> of 72 rows with 2 resampling run ──────────────────────────
#>  nr task_id       learner_id       resampling_id iters warnings errors
#>   1     sin regr.featureless same_other_sizes_cv    36        0      0
#>   2     sin       regr.rpart same_other_sizes_cv    36        0      0
same.other.score <- mlr3resampling::score(
  same.other.result, mlr3::msr("regr.rmse"))
same.other.score[1]
#>    test.subset train.subsets groups test.fold                  test
#>         <char>        <char>  <int>     <int>                <list>
#> 1:           A           all    200         1  1, 2,49,50,57,58,...
#>                    train  seed n.train.groups iteration
#>                   <list> <int>          <int>     <int>
#> 1:  5, 6, 9,10,15,16,...     1             50         1
#>                                   uhash    nr           task task_id
#>                                  <char> <int>         <list>  <char>
#> 1: 3bfd0125-b9e6-450b-bd32-22cfa44298fb     1 <TaskRegr:sin>     sin
#>                                      learner       learner_id
#>                                       <list>           <char>
#> 1: <LearnerRegrFeatureless:regr.featureless> regr.featureless
#>                      resampling       resampling_id  prediction_test regr.rmse
#>                          <list>              <char>           <list>     <num>
#> 1: <ResamplingSameOtherSizesCV> same_other_sizes_cv <PredictionRegr> 0.7625015
#>      algorithm
#>         <char>
#> 1: featureless
The plot below shows the same results (no down-sampling) as if we did
sizes=-1 (like in the previous section).
if(require(ggplot2)){
  ## Only the splits whose train set uses every available group (no
  ## down-sampling) are drawn, with one panel per test subset.
  ggplot()+
    geom_point(
      data=same.other.score[groups==n.train.groups],
      mapping=aes(x=regr.rmse, y=train.subsets, color=algorithm),
      shape=1)+
    facet_grid(. ~ test.subset, labeller=label_both)
}
The plots below compare all six train subsets (including three down-sampled), and it is clear there is an effect for sample size.
## The table returned by score() was copied by R at some point, so
## adding a column with := would warn that a shallow copy was taken.
## copy() returns a properly allocated data.table (class and
## attributes are kept), so columns can then be added by reference
## without any warning.
same.other.score <- copy(same.other.score)
same.other.score[, subset.N := paste(train.subsets, n.train.groups)]
(levs <- same.other.score[order(train.subsets, n.train.groups), unique(subset.N)])
#> [1] "all 50"    "all 150"   "all 200"   "other 50"  "other 150" "same 50"  
#> [7] "same 150"
same.other.score[, subset.N.fac := factor(subset.N, levs)]
if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      regr.rmse, subset.N.fac, color=algorithm),
      shape=1,
      data=same.other.score)+
    facet_wrap("test.subset", labeller=label_both, scales="free", nrow=1)
}
(levs <- same.other.score[order(n.train.groups, train.subsets), unique(subset.N)])
#> [1] "all 50"    "other 50"  "same 50"   "all 150"   "other 150" "same 150" 
#> [7] "all 200"
same.other.score[, N.subset.fac := factor(subset.N, levs)]
if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      regr.rmse, N.subset.fac, color=algorithm),
      shape=1,
      data=same.other.score)+
    facet_wrap("test.subset", labeller=label_both, scales="free", nrow=1)
}
Another way to view the effect of sample size is to plot the test/prediction error as a function of the number of train groups, as in the plots below.
## Test RMSE (y axis) as a function of the number of train groups (x
## axis), with one panel per combination of test fold and test subset.
if(require(ggplot2)){
  ggplot()+
    ## One open circle per train/test split.
    geom_point(aes(
      n.train.groups, regr.rmse,
      color=train.subsets),
      shape=1,
      data=same.other.score)+
    ## One line per combination of train subsets, seed and algorithm;
    ## the line type distinguishes the algorithms.
    geom_line(aes(
      n.train.groups, regr.rmse,
      group=paste(train.subsets, seed, algorithm),
      linetype=algorithm,
      color=train.subsets),
      data=same.other.score)+
    facet_grid(test.fold ~ test.subset, labeller=label_both)
}
## Keep only the rpart results which were trained on the "same" or
## "all" subsets, then draw the same kind of plot as above (a single
## algorithm remains, so no line type is needed).
rpart.score <- same.other.score[algorithm=="rpart" & train.subsets != "other"]
if(require(ggplot2)){
  ggplot()+
    ## One open circle per train/test split.
    geom_point(aes(
      n.train.groups, regr.rmse,
      color=train.subsets),
      shape=1,
      data=rpart.score)+
    ## One line per combination of train subsets, seed and algorithm.
    geom_line(aes(
      n.train.groups, regr.rmse,
      group=paste(train.subsets, seed, algorithm),
      color=train.subsets),
      data=rpart.score)+
    facet_grid(test.fold ~ test.subset, labeller=label_both)
}
In this section we show how ResamplingSameOtherSizesCV can be used on a task with stratification and grouping, for hyper-parameter learning. First we recall the previously defined task and evaluation CV.
str(reg.task$col_roles)
#> List of 10
#>  $ feature        : chr "x"
#>  $ target         : chr "y"
#>  $ name           : chr(0) 
#>  $ order          : chr(0) 
#>  $ stratum        : chr "random_subset"
#>  $ group          : chr "agroup"
#>  $ offset         : chr(0) 
#>  $ weights_learner: chr(0) 
#>  $ weights_measure: chr(0) 
#>  $ subset         : chr "random_subset"
We see in the output above that the task has column roles for both
stratum and group, which normally errors when used with
ResamplingCV:
mlr3::ResamplingCV$new()$instantiate(reg.task)
#> Error in private$.get_instance(task): Cannot combine stratification with grouping
Below we show how ResamplingSameOtherSizesCV can be used instead:
ignore.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
ignore.cv$param_set$values$ignore_subset <- TRUE
ignore.cv$instantiate(reg.task)
ignore.cv$instance$iteration.dt
#>    test.subset train.subsets groups test.fold                  test
#>         <char>        <char>  <int>     <int>                <list>
#> 1:        full          same    200         1  5, 6, 7, 8, 9,10,...
#> 2:        full          same    200         2  3, 4,11,12,13,14,...
#> 3:        full          same    200         3  1, 2,25,26,31,32,...
#>                    train  seed n.train.groups iteration
#>                   <list> <int>          <int>     <int>
#> 1:  1, 2, 3, 4,11,12,...     1            200         1
#> 2:       1,2,5,6,7,8,...     1            200         2
#> 3:       3,4,5,6,7,8,...     1            200         3
To use the above CV object with a learning algorithm in a benchmark
experiment, we need to use it as the resampling argument to
auto_tuner, as in the code below.
do_benchmark <- function(subtrain.valid.cv){
  ## Run a benchmark on reg.task, using same_other_sizes_cv for the
  ## train/test splits. subtrain.valid.cv is the resampling given to
  ## auto_tuner for its internal subtrain/validation splits.
  ## The benchmark result is returned invisibly.
  learner.list <- list(mlr3::LearnerRegrFeatureless$new())
  have.rpart <- requireNamespace("rpart")
  if(have.rpart){
    learner.list$rpart <- mlr3::LearnerRegrRpart$new()
  }
  if(have.rpart && requireNamespace("mlr3tuning")){
    ## Second rpart learner, whose cp is marked for tuning (log=TRUE).
    cp.learner <- mlr3::LearnerRegrRpart$new()
    cp.learner$param_set$values$cp <- paradox::to_tune(
      1e-4, 0.1, log=TRUE)
    learner.list$rpart.tuned <- mlr3tuning::auto_tuner(
      tuner = mlr3tuning::tnr("grid_search"),
      learner = cp.learner,
      resampling = subtrain.valid.cv,
      measure = mlr3::msr("regr.rmse"))
  }
  bench.grid <- mlr3::benchmark_grid(
    reg.task,
    learner.list,
    same_other_sizes_cv)
  ## Set the threshold of the bbotk logger to "warn" before running.
  lgr::get_logger("bbotk")$set_threshold("warn")
  invisible(mlr3::benchmark(bench.grid, store_models = TRUE))
}
do_benchmark(mlr3::ResamplingCV$new())
#> Le chargement a nécessité le package : mlr3tuning
#> Warning: Caught mlr3error. Canceling all iterations ...
#> Error in private$.get_instance(task): Cannot combine stratification with grouping
The error above is because ResamplingCV does not support
stratification and grouping. To fix that, we can use the code below:
ignore.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
ignore.cv$param_set$values$ignore_subset <- TRUE
(same.other.result <- do_benchmark(ignore.cv))
#> 
#> ── <BenchmarkResult> of 108 rows with 3 resampling run ─────────────────────────
#>  nr task_id       learner_id       resampling_id iters warnings errors
#>   1     sin regr.featureless same_other_sizes_cv    36        0      0
#>   2     sin       regr.rpart same_other_sizes_cv    36        0      0
#>   3     sin regr.rpart.tuned same_other_sizes_cv    36        0      0
The output above shows that the benchmark worked. The code below plots the results.
same.other.score <- mlr3resampling::score(
  same.other.result, mlr3::msr("regr.rmse"))
same.other.score[1]
#>    test.subset train.subsets groups test.fold                  test
#>         <char>        <char>  <int>     <int>                <list>
#> 1:           A           all    200         1  1, 2,49,50,57,58,...
#>                    train  seed n.train.groups iteration
#>                   <list> <int>          <int>     <int>
#> 1:  5, 6, 9,10,15,16,...     1             50         1
#>                                   uhash    nr           task task_id
#>                                  <char> <int>         <list>  <char>
#> 1: b3fa9c0a-2934-4659-8e31-a71d9704af5c     1 <TaskRegr:sin>     sin
#>                                      learner       learner_id
#>                                       <list>           <char>
#> 1: <LearnerRegrFeatureless:regr.featureless> regr.featureless
#>                      resampling       resampling_id  prediction_test regr.rmse
#>                          <list>              <char>           <list>     <num>
#> 1: <ResamplingSameOtherSizesCV> same_other_sizes_cv <PredictionRegr> 0.7625015
#>      algorithm
#>         <char>
#> 1: featureless
## One row per combination of algorithm, test subset and train
## subsets. Aggregating with list(mean, sd) yields the columns
## regr.rmse_mean and regr.rmse_sd, which are used in the plot below.
same.other.wide <- dcast(
  same.other.score,
  algorithm + test.subset + train.subsets ~ .,
  list(mean, sd),
  value.var="regr.rmse")
if(require(ggplot2)){
  ggplot()+
    ## Horizontal segment from mean+SD to mean-SD.
    geom_segment(aes(
      regr.rmse_mean+regr.rmse_sd, train.subsets,
      xend=regr.rmse_mean-regr.rmse_sd, yend=train.subsets),
      data=same.other.wide)+
    ## Open circle at the mean.
    geom_point(aes(
      regr.rmse_mean, train.subsets),
      shape=1,
      data=same.other.wide)+
    facet_grid(algorithm ~ test.subset, labeller=label_both)
}
The plot above has different panels for rpart (without tuning) and
tuned (rpart with tuning of cp).
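mlr3resampling::ResamplingSameOtherSizesCV can be used for model evaluation (train/test split):
- to compare models trained on Same/Other/All subsets (set the column role subset).
- to compare models trained on down-sampled train sets (set the parameter sizes).

It can also be used for model training (subtrain/validation split):
- to learn hyper-parameters on a task which has both stratum and group roles (use it as resampling argument of auto_tuner).

The goal of this section is to explain the differences between various column roles: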
- group is used to designate observations which should stay together
  when splitting. In other words, two rows in the same group should
  never appear in different sets.
- subset designates a column whose values are each treated as a test
  set (the train data come from Same/Other/All subsets).

Below we load the data set.
data(AZtrees,package="mlr3resampling")
library(data.table)
AZdt <- data.table(AZtrees)
AZdt[1]
#>       xcoord   ycoord region3 region4 polygon        y SAMPLE_1 SAMPLE_2
#>        <num>    <num>  <char>  <char>  <fctr>   <fctr>    <int>    <int>
#> 1: -111.6643 35.23736      NE      NE       1 Not tree     3331     3919
#>    SAMPLE_3 SAMPLE_4 SAMPLE_5 SAMPLE_6 SAMPLE_7 SAMPLE_8 SAMPLE_9 SAMPLE_10
#>       <int>    <int>    <int>    <int>    <int>    <int>    <int>     <int>
#> 1:     3957     4514     4700     4607     4420     4494     4139      3906
#>    SAMPLE_11 SAMPLE_12 SAMPLE_13 SAMPLE_14 SAMPLE_15 SAMPLE_16 SAMPLE_17
#>        <int>     <int>     <int>     <int>     <int>     <int>     <int>
#> 1:        14       -40       -71       125        21        25        10
#>    SAMPLE_18 SAMPLE_19 SAMPLE_20 SAMPLE_21
#>        <int>     <int>     <int>     <int>
#> 1:      -263      -324      -362       370
Above we see one row of data. Below we see a scatterplot of the data:
## Center and half-width (in the units of xcoord/ycoord) of the
## square which is highlighted in red on the map below.
x.center <- -111.72
y.center <- 35.272
rect.size <- 0.01/2
## Lower and upper limits of the square along each axis.
x.min.max <- x.center+c(-1, 1)*rect.size
y.min.max <- y.center+c(-1, 1)*rect.size
## One-row table with the corners of the square, for geom_rect.
rect.dt <- data.table(
  xmin=x.min.max[1], xmax=x.min.max[2],
  ymin=y.min.max[1], ymax=y.min.max[2])
if(require(ggplot2)){
  ## Fill scale for the label y; it is reused in the plots below.
  tree.fill.scale <- scale_fill_manual(
    values=c(Tree="black", "Not tree"="white"))
  ggplot()+
    tree.fill.scale+
    ## The red square is added first, so the points are drawn over it.
    geom_rect(aes(
      xmin=xmin, xmax=xmax, ymin=ymin,ymax=ymax),
      data=rect.dt,
      fill="red",
      linewidth=3,
      color="red")+
    ## One point per row of data, filled according to its label.
    geom_point(aes(
      xcoord, ycoord, fill=y),
      shape=21,
      data=AZdt)+
    coord_equal()
}
Note the red square in the plot above. Below we zoom into that square.
## Same map as above, restricted to the red square. Rows outside of
## the axis limits are removed, with the warnings shown below.
if(require(ggplot2)){
  gg <- ggplot()+
    tree.fill.scale+
    geom_point(aes(
      xcoord, ycoord, fill=y),
      shape=21,
      data=AZdt)+
    coord_equal()+
    scale_x_continuous(
      limits=x.min.max)+
    scale_y_continuous(
      limits=y.min.max)
  ## The polygon labels are only added if directlabels is available.
  if(require(directlabels)){
    gg <- gg+geom_dl(aes(
      xcoord, ycoord, label=paste("polygon",polygon)),
      data=AZdt,
      method=list(cex=2, "smart.grid"))
  }
  gg
}
#> Le chargement a nécessité le package : directlabels
#> Warning: Removed 5927 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Warning: Removed 5927 rows containing missing values or values outside the scale range
#> (`geom_dl()`).
In the plot above, we see that there are several groups of points, each with a black number. Each group of points comes from a single polygon (label drawn in GIS software), and the black number is the polygon ID number. So each polygon represents one label, either tree or not, and there are one or more points/pixels with that label inside each polygon.
A polygon is an example of a group. Each polygon results in one or more rows of training data (pixels), but since pixels in a given group were all labeled together, we would like to keep them together when splitting the data.
Below we plot the same data, but this time colored by region.
##dput(RColorBrewer::brewer.pal(3,"Dark2"))
region.colors <- c(NW="#1B9E77", NE="#D95F02", S="#7570B3")
if(require(ggplot2)){
  ggplot()+
    tree.fill.scale+
    scale_color_manual(
      values=region.colors)+
    geom_point(aes(
      xcoord, ycoord, color=region3, fill=y),
      shape=21,
      data=AZdt)+
    coord_equal()
}
We can see in the plot above that there are three values in the
region3 column: NE, NW, and S (different geographical regions on the
map which are well-separated). We would like to know if it is possible
to train on one region, and then accurately predict on another region.
First we create a task:
## Classification task on the AZtrees data, with y as the target.
ctask <- mlr3::TaskClassif$new(
  "AZtrees", AZdt, target="y")
## Column roles: region3 is the subset, polygon is the group, and y
## is the stratum.
ctask$col_roles$subset <- "region3"
ctask$col_roles$group <- "polygon"
ctask$col_roles$stratum <- "y"
## Only the columns whose name contains SAMPLE are used as features.
ctask$col_roles$feature <- grep("SAMPLE",names(AZdt),value=TRUE)
str(ctask$col_roles)
#> List of 10
#>  $ feature        : chr [1:21] "SAMPLE_1" "SAMPLE_2" "SAMPLE_3" "SAMPLE_4" ...
#>  $ target         : chr "y"
#>  $ name           : chr(0) 
#>  $ order          : chr(0) 
#>  $ stratum        : chr "y"
#>  $ group          : chr "polygon"
#>  $ offset         : chr(0) 
#>  $ weights_learner: chr(0) 
#>  $ weights_measure: chr(0) 
#>  $ subset         : chr "region3"
Then we can instantiate the CV to see how it works (but usually you do
not need to instantiate, if you are using benchmark it does it for
you).
## Instantiate a 3-fold version of the CV on the classification task,
## in order to inspect the train/test splits which it defines.
same.other.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
same.other.cv$param_set$values$folds <- 3
same.other.cv$instantiate(ctask)
## lengths() always returns an integer vector (one count per split),
## whereas the type returned by sapply() depends on its input.
same.other.cv$instance$iteration.dt[, .(
  train.subsets, test.fold, test.subset, n.train.groups,
  train.rows=lengths(train))]
#>     train.subsets test.fold test.subset n.train.groups train.rows
#>            <char>     <int>      <char>          <int>      <int>
#>  1:           all         1          NE            125       3108
#>  2:           all         1          NW            125       3108
#>  3:           all         1           S            125       3108
#>  4:           all         2          NE            125       4325
#>  5:           all         2          NW            125       4325
#>  6:           all         2           S            125       4325
#>  7:           all         3          NE            125       4479
#>  8:           all         3          NW            125       4479
#>  9:           all         3           S            125       4479
#> 10:         other         1          NE             55       1934
#> 11:         other         1          NW            104       1652
#> 12:         other         1           S             91       2630
#> 13:         other         2          NE             55       3550
#> 14:         other         2          NW            104       3524
#> 15:         other         2           S             91       1576
#> 16:         other         3          NE             55       3500
#> 17:         other         3          NW            104       3610
#> 18:         other         3           S             91       1848
#> 19:          same         1          NE             70       1174
#> 20:          same         1          NW             21       1456
#> 21:          same         1           S             34        478
#> 22:          same         2          NE             70        775
#> 23:          same         2          NW             21        801
#> 24:          same         2           S             34       2749
#> 25:          same         3          NE             70        979
#> 26:          same         3          NW             21        869
#> 27:          same         3           S             34       2631
#>     train.subsets test.fold test.subset n.train.groups train.rows
#>            <char>     <int>      <char>          <int>      <int>
The table above has one row per train/test split for which
error/accuracy metrics will be computed.  The n.train.groups column
is the number of polygons which are used in the train set, which is
defined as the intersection of the train subsets and the train folds.
To double check, below we compute the total number of groups/polygons per
subset/region, and the expected number of train groups/polygons.
## Count the distinct polygons in each region, then multiply by the
## fraction of folds used for training, (folds-1)/folds, to get the
## expected number of train polygons.
AZdt[
, .(polygons=uniqueN(polygon)),
  by=region3
][
, train.polygons := {
    n.folds <- same.other.cv$param_set$values$folds
    polygons*((n.folds-1)/n.folds)
  }
][]
#>    region3 polygons train.polygons
#>     <char>    <int>          <num>
#> 1:      NE      105       70.00000
#> 2:      NW       32       21.33333
#> 3:       S       52       34.66667
It is clear that the counts in the train.polygons column above match
(after rounding down) the numbers in the previous table column n.train.groups. To
determine the number of rows of train data, we can look at the
train.rows column in the previous table.
Below we define the benchmark experiment.
same.other.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
(learner.list <- list(
  mlr3::LearnerClassifFeatureless$new()))
#> [[1]]
#> 
#> ── <LearnerClassifFeatureless> (classif.featureless): Featureless Classification
#> • Model: -
#> • Parameters: method=mode
#> • Packages: mlr3
#> • Predict Types: [response] and prob
#> • Feature Types: logical, integer, numeric, character, factor, ordered,
#> POSIXct, and Date
#> • Encapsulation: none (fallback: -)
#> • Properties: featureless, importance, missings, multiclass, selected_features,
#> twoclass, and weights
#> • Other settings: use_weights = 'use'
if(requireNamespace("rpart")){
  learner.list$rpart <- mlr3::LearnerClassifRpart$new()
}
for(learner.i in seq_along(learner.list)){
  learner.list[[learner.i]]$predict_type <- "prob"
}
(bench.grid <- mlr3::benchmark_grid(ctask, learner.list, same.other.cv))
#>       task             learner          resampling
#>     <char>              <char>              <char>
#> 1: AZtrees classif.featureless same_other_sizes_cv
#> 2: AZtrees       classif.rpart same_other_sizes_cv
Above we see one row per combination of task, learner, and resampling. Below we compute the benchmark result and test accuracy.
bench.result <- mlr3::benchmark(bench.grid)
measure.list <- mlr3::msrs(c("classif.acc","classif.auc"))
score.dt <- mlr3resampling::score(bench.result, measure.list)
score.dt[1]
#>    test.subset train.subsets groups test.fold                        test
#>         <char>        <char>  <int>     <int>                      <list>
#> 1:          NE           all    125         1 123,124,125,126,127,128,...
#>              train  seed n.train.groups iteration
#>             <list> <int>          <int>     <int>
#> 1: 1,2,3,4,5,6,...     1            125         1
#>                                   uhash    nr                  task task_id
#>                                  <char> <int>                <list>  <char>
#> 1: e6a7f420-e384-48a2-84c9-c93c4ba7b24a     1 <TaskClassif:AZtrees> AZtrees
#>                                            learner          learner_id
#>                                             <list>              <char>
#> 1: <LearnerClassifFeatureless:classif.featureless> classif.featureless
#>                      resampling       resampling_id     prediction_test
#>                          <list>              <char>              <list>
#> 1: <ResamplingSameOtherSizesCV> same_other_sizes_cv <PredictionClassif>
#>    classif.acc classif.auc   algorithm
#>          <num>       <num>      <char>
#> 1:   0.6576819         0.5 featureless
Above we see one row of the result, for one train/test split. Below we plot the accuracy results using two different methods.
## Reshape to one row per train/test split and measure. The dot is
## escaped so that it only matches a literal "." in the measure
## column names; the captured suffix (acc or auc) is stored in the
## variable column.
score.long <- melt(
  score.dt,
  measure.vars=measure(variable, pattern="classif\\.(acc|auc)"))
if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      value, train.subsets, color=algorithm),
      data=score.long)+
    facet_grid(test.subset ~ variable, labeller=label_both, scales="free")
}
Above we show one dot per train/test split, and another way to do that is via the plot method, as below.
plot(score.dt)+my_theme
Below we take the mean/SD over folds.
## One row per combination of algorithm, test subset, train subsets
## and measure. Aggregating with list(mean, sd) yields the columns
## value_mean and value_sd, which are used in the plot below.
score.wide <- dcast(
  score.long,
  algorithm + test.subset + train.subsets + variable ~ .,
  list(mean, sd),
  value.var="value")
if(require(ggplot2)){
  ggplot()+
    ## Circle with white fill at the mean.
    geom_point(aes(
      value_mean, train.subsets, color=algorithm),
      size=3,
      fill="white",
      shape=21,
      data=score.wide)+
    ## Horizontal segment from mean+SD to mean-SD; the line width
    ## depends on the algorithm (values set in the manual scale below).
    geom_segment(aes(
      value_mean+value_sd, train.subsets,
      color=algorithm,
      linewidth=algorithm,
      xend=value_mean-value_sd, yend=train.subsets),
      data=score.wide)+
    scale_linewidth_manual(values=c(featureless=2, rpart=1))+
    facet_grid(test.subset ~ variable, labeller=label_both, scales="free")+
    scale_x_continuous(
      "Mean +/- SD of test accuracy/AUC over folds/splits")
}
The plot above shows an interesting pattern:
Another way to visualize these patterns is via the plot method for pvalue objects, as below.
AZ_pval <- mlr3resampling::pvalue(score.dt, digits=3)
plot(AZ_pval)+my_theme
The figure above shows P-values for classification accuracy (by default the first measure is used). If we want to compute P-values for AUC, we can use the code below:
AZ_pval_AUC <- mlr3resampling::pvalue(score.dt, "classif.auc", digits=3)
plot(AZ_pval_AUC)+my_theme
Column roles group, stratum, and subset may be used together, in
the same task, in order to perform a cross-validation experiment which
captures the structure in the data.
sessionInfo()
#> R Under development (unstable) (2025-05-21 r88220)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
#>  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Europe/Paris
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] directlabels_2025.5.20   mlr3resampling_2025.6.23 mlr3_1.0.0.9000         
#> [4] future_1.58.0            ggplot2_3.5.1            data.table_1.17.99      
#> 
#> loaded via a namespace (and not attached):
#>  [1] future.apply_1.20.0  gtable_0.3.6         crayon_1.5.3        
#>  [4] dplyr_1.1.4          compiler_4.6.0       rpart_4.1.24        
#>  [7] tidyselect_1.2.1     parallel_4.6.0       globals_0.18.0      
#> [10] scales_1.3.0         uuid_1.2-1           R6_2.6.1            
#> [13] mlr3tuning_1.3.0     labeling_0.4.3       generics_0.1.3      
#> [16] knitr_1.50           palmerpenguins_0.1.1 backports_1.5.0     
#> [19] checkmate_2.3.2      tibble_3.2.1         munsell_0.5.1       
#> [22] paradox_1.0.1        pillar_1.10.2        mlr3measures_1.0.0  
#> [25] rlang_1.1.6          lgr_0.4.4            xfun_0.51           
#> [28] quadprog_1.5-8       mlr3misc_0.18.0      cli_3.6.5           
#> [31] withr_3.0.2          magrittr_2.0.3       digest_0.6.37       
#> [34] grid_4.6.0           bbotk_1.5.0          lifecycle_1.0.4     
#> [37] vctrs_0.6.5          evaluate_1.0.3       glue_1.8.0          
#> [40] farver_2.1.2         listenv_0.9.1        codetools_0.2-20    
#> [43] parallelly_1.45.0    colorspace_2.1-1     tools_4.6.0         
#> [46] pkgconfig_2.0.3