1 Preamble

Dependencies:

suppressPackageStartupMessages(library(plantmix))
suppressPackageStartupMessages(library(lme4))


2 Overview

In this vignette, varietal mixtures will be assembled from a panel of 200 genotypes. Only 1000 mixtures will be observed, among all 1.99^{4} possible ones, i.e., approximately 5%. This incomplete design will be optimized so that it is balanced: all genotype will be observed in the same number of mixtures. Phenotypes will then be simulated organized into a field trial in a randomized complete block design, according to all six GMA-SMA models that can be conceived: 1, 2, 2’, 2’‘, 3 and 3’. See the article by Forst et al (2019), especially for the first three models, as well as the introductory vignette.

3 Simulate data

set.seed(12345)

3.1 Simulate the genotypes

3.1.1 Set the dimensions

nbGenos <- 200
levGenos <- sprintf(
  fmt = paste0("g%0", floor(log10(nbGenos)) + 1, "i"),
  1:nbGenos
)

nbSnps <- 1000
levSnps <- sprintf(
  fmt = paste0("s%0", floor(log10(nbSnps)) + 1, "i"),
  1:nbSnps
)

3.1.2 SNP genotypes

nb_pops <- 10
weak_div_pops <- diag(nb_pops)
weak_div_pops[upper.tri(weak_div_pops)] <- 0.9
weak_div_pops[lower.tri(weak_div_pops)] <- weak_div_pops[upper.tri(weak_div_pops)]
tmp <- rep(nbGenos / nb_pops, nb_pops - 1)
tmp <- c(tmp, nbGenos - sum(tmp))
snpGenos <- simulGenosDoseStruct(
  nb_genos = tmp,
  nb_snps = nbSnps,
  div_pops = weak_div_pops,
  geno_IDs = levGenos,
  snp_IDs = levSnps
)
dim(snpGenos)
## [1]  200 1000
snpGenos[1:3, 1:4]
##      s0001 s0002 s0003 s0004
## g001     1     1     1     1
## g002     1     0     1     1
## g003     0     1     2     0

3.1.3 Additive genetic relationships

For simplicity, the first estimator of VanRaden (2008) is used, that assumes linkage equilibrium and Hardy-Weinberg equilibrium.

GRM <- estimGRM(snpGenos)
GRM <- as.matrix(Matrix::nearPD(GRM)$mat)
image(Matrix(GRM), main = "GRM")

hist(diag(GRM), main = "GRM")

hist(GRM[upper.tri(GRM)], main = "GRM")

3.2 Simulate the phenotypes

3.2.1 Incomplete, yet balanced design

nbMixes <- 1000
design <- getDesignBinaryVarMix(levGenos, nbMixes = nbMixes)
tmp <- getMixturesPerGeno(getMixtureList(design$combs))
table(sapply(tmp, length)) # each genotype is in the same nb of mixtures -> balanced design
## 
##  10 
## 200

3.2.2 Field trial

monoStands <- paste(levGenos, levGenos, sep = "_")

pairs <- design$combs
mixStands <- paste(pairs[, 1], pairs[, 2], sep = "_")
IDs <- c(monoStands, mixStands)

nbBlocks <- 3
blocks <- LETTERS[1:nbBlocks]

dat <- do.call(rbind, lapply(blocks, function(block) {
  data.frame(
    ID = IDs,
    block = block,
    stringsAsFactors = TRUE
  )
}))

3.2.3 Design matrices

listContr <- list(block = "contr.sum")
X <- model.matrix(~ 1 + block, data = dat, contrasts = listContr)

allZ <- list()
allZ$GMA <- mkZGMA(dat, col = "ID", sep = "_")
system.time(
  allZ <- append(
    allZ,
    mkAllZSMA(dat, col = "ID", sep = "_", verbose = 1)
  )
)
## [1] "model 2'"
## [1] "model 2"
## [1] "model 2''"
## [1] "model 3"
## [1] "model 3'"
##    user  system elapsed 
##  27.706   1.384  29.094
sapply(allZ, dim)
##       GMA SMA_mod2p SMA_mod2 SMA_mod2pp_ij SMA_mod2pp_ii SMA_mod3 SMA_mod3p_ij
## [1,] 3600      3600     3600          3600          3600     3600         3600
## [2,]  200      1000     1200          1000           200     1200         1000
##      SMA_mod3p_ii
## [1,]         3600
## [2,]          200

3.2.4 Parameters

Several packages will be tested below, most notably lme4 and TMB. However, lme4 does not natively take a GRM as input. Hence the GRM will be replaced by the identity matrix to ensure a fair comparison between packages.

truth <- list(
  "intercept" = 100,
  "var_GMA" = 10,
  "var_SMA" = 2,
  "var_SMA_ij" = 1.5,
  "var_SMA_ii" = 0.8,
  "var_error" = 1
)
set.seed(1234)
truth[["blockEffs"]] <- sample(x = c(-1, 1), size = nbBlocks - 1, replace = TRUE) *
  rnorm(n = nbBlocks - 1, mean = 3, sd = 2)
GRM <- diag(nbGenos)
truth[["GMAs"]] <- MASS::mvrnorm(
  n = 1, mu = rep(0, nbGenos),
  Sigma = truth$var_GMA * GRM
)
truth[["SMAs"]] <- rnorm(n = length(IDs), mean = 0, sd = sqrt(truth$var_SMA))
truth[["SMAs_ij"]] <- rnorm(n = length(mixStands), mean = 0, sd = sqrt(truth$var_SMA_ij))
truth[["SMAs_ii"]] <- rnorm(n = length(monoStands), mean = 0, sd = sqrt(truth$var_SMA_ii))
truth[["errors"]] <- rnorm(n = nrow(dat), mean = 0, sd = sqrt(truth$var_error))

3.2.5 Phenotypes

y1 <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  truth$errors
dat$pheno1 <- y1[, 1]

y2 <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod2 %*% truth$SMAs +
  truth$errors
dat$pheno2 <- y2[, 1]

y3 <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod3 %*% truth$SMAs +
  truth$errors
dat$pheno3 <- y3[, 1]

y2p <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod2p %*% truth$SMAs_ij +
  truth$errors
dat$pheno2p <- y2p[, 1]

y2pp <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod2pp_ij %*% truth$SMAs_ij +
  allZ$SMA_mod2pp_ii %*% truth$SMAs_ii +
  truth$errors
dat$pheno2pp <- y2pp[, 1]

y3p <- X %*% c(truth$intercept, truth$blockEffs) +
  allZ$GMA %*% truth$GMAs +
  allZ$SMA_mod3p_ij %*% truth$SMAs_ij +
  allZ$SMA_mod3p_ii %*% truth$SMAs_ii +
  truth$errors
dat$pheno3p <- y3p[, 1]

4 Infer the parameters

Function fitGMASMA allows to fit the models with various packages. In this vignette, only lme4 and TMB will be used and, as you can see below, both return the same estimates. (To save time, MM4LMM is commented.)

pkgs <- c("lme4", "TMB") # , "MM4LMM")
if ("MM4LMM" %in% pkgs) {
  suppressPackageStartupMessages(library(MM4LMM))
}

runAllPkgs <- function(pkgs, form, dat, listZ, listVCov, listContr, ...) {
  ## mcMap(function(pkg) {
  Map(function(pkg) {
    print(paste0("fit with ", pkg, "..."))
    fitGMASMA(form, dat, listZ, pkg, listVCov, listContr)
    st <- system.time(
      try(fit <- fitGMASMA(form, dat, listZ, pkg, listVCov, listContr, REML = TRUE, ...))
    )
    print(st)
    fit
  }, pkgs) # , mc.cores = nbCores)
}

4.1 Model 1

4.1.1 Fit

form <- pheno1 ~ 1 + block
listZ <- list("GMA" = allZ$GMA)
listVCov <- list("GMA" = GRM)
listContr <- list(block = "contr.sum")
fits1 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
##    user  system elapsed 
##   0.556   0.000   0.556 
## [1] "fit with TMB..."
##    user  system elapsed 
##   3.271   0.000   3.271

4.1.2 Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits1$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_error")]),
  estim = tmp[c("GMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.5474529  5.474529
## var_error     1  0.9599666 -4.003335

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_error")]),
  estim = do.call(c, fits1$TMB$report[c("var_GMA", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.5474516  5.474516
## var_error     1  0.9599667 -4.003335

if ("MM4LMM" %in% names(fits1)) {
  print(fits1$MM4LMM$Sigma2)
}

4.2 Model 2

4.2.1 Fit

form <- pheno2 ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA" = allZ$SMA_mod2
)
listVCov <- list(
  "GMA" = GRM,
  "SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits2 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
##    user  system elapsed 
##   1.440   0.000   1.439 
## [1] "fit with TMB..."
##    user  system elapsed 
##  10.752   0.108  10.862

4.2.2 Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.4978544  4.9785441
## var_SMA       2  2.0197144  0.9857177
## var_error     1  0.9453828 -5.4617169

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = do.call(c, fits2$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.4979307  4.9793071
## var_SMA       2  2.0197059  0.9852969
## var_error     1  0.9453837 -5.4616269

if ("MM4LMM" %in% names(fits2)) {
  print(fits2$MM4LMM$Sigma2)
}

4.3 Model 3

4.3.1 Fit

form <- pheno3 ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA" = allZ$SMA_mod3
)
listVCov <- list(
  "GMA" = GRM,
  "SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits3 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
##    user  system elapsed 
##   2.454   0.008   2.463 
## [1] "fit with TMB..."
##    user  system elapsed 
##  11.432   0.128  11.561

4.3.2 Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits3$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.2611009  2.611009
## var_SMA       2  2.1054392  5.271960
## var_error     1  0.9469001 -5.309988

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = do.call(c, fits3$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim       nBE
## var_GMA      10 10.2611022  2.611022
## var_SMA       2  2.1054387  5.271936
## var_error     1  0.9469002 -5.309984

if ("MM4LMM" %in% names(fits3)) {
  print(fits3$MM4LMM$Sigma2)
}

4.4 Model 2’

4.4.1 Fit

form <- pheno2p ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA" = allZ$SMA_mod2p
)
listVCov <- list(
  "GMA" = GRM,
  "SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits2p <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
##    user  system elapsed 
##   2.061   0.000   2.061 
## [1] "fit with TMB..."
##    user  system elapsed 
##   9.650   0.072   9.723

4.4.2 Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2p$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.3596090   3.596090
## var_SMA       2  1.6026602 -19.866989
## var_error     1  0.9456708  -5.432918

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
  estim = do.call(c, fits2p$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##           truth      estim        nBE
## var_GMA      10 10.3596074   3.596074
## var_SMA       2  1.6026609 -19.866954
## var_error     1  0.9456708  -5.432919

if ("MM4LMM" %in% names(fits2p)) {
  print(fits2p$MM4LMM$Sigma2)
}

4.5 Model 2’’

4.5.1 Fit

form <- pheno2pp ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA_ij" = allZ$SMA_mod2pp_ij,
  "SMA_ii" = allZ$SMA_mod2pp_ii
)
listVCov <- list(
  "GMA" = GRM,
  "SMA_ij" = diag(ncol(listZ$SMA_ij)),
  "SMA_ii" = diag(ncol(listZ$SMA_ii))
)
listContr <- list(block = "contr.sum")
fits2pp <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
##    user  system elapsed 
##   2.375   0.000   2.375 
## [1] "fit with TMB..."
##    user  system elapsed 
##  10.384   0.092  10.476

4.5.2 Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2pp$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = tmp[c("GMA", "SMA_ij", "SMA_ii", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim         nBE
## var_GMA     10.0 10.0082780  0.08278019
## var_SMA_ij   1.5  1.6172865  7.81910037
## var_SMA_ii   0.8  0.8345874  4.32342341
## var_error    1.0  0.9453837 -5.46163448

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = do.call(c, fits2pp$TMB$report[c("var_GMA", "var_SMA1", "var_SMA2", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim         nBE
## var_GMA     10.0 10.0082809  0.08280897
## var_SMA_ij   1.5  1.6172866  7.81910639
## var_SMA_ii   0.8  0.8345874  4.32341949
## var_error    1.0  0.9453836 -5.46163514

if ("MM4LMM" %in% names(fits2pp)) {
  print(fits2pp$MM4LMM$Sigma2)
}

4.6 Model 3’

4.6.1 Fit

form <- pheno3p ~ 1 + block
listZ <- list(
  "GMA" = allZ$GMA,
  "SMA_ij" = allZ$SMA_mod3p_ij,
  "SMA_ii" = allZ$SMA_mod3p_ii
)
listVCov <- list(
  "GMA" = GRM,
  "SMA_ij" = diag(ncol(listZ$SMA_ij)),
  "SMA_ii" = diag(ncol(listZ$SMA_ii))
)
listContr <- list(block = "contr.sum")
fits3p <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
##    user  system elapsed 
##   3.593   0.000   3.594 
## [1] "fit with TMB..."
##    user  system elapsed 
##  11.519   0.116  11.635

4.6.2 Check

print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits3p$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = tmp[c("GMA", "SMA_ij", "SMA_ii", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim        nBE
## var_GMA     10.0 10.0390341  0.3903412
## var_SMA_ij   1.5  1.6726159 11.5077264
## var_SMA_ii   0.8  0.9481598 18.5199796
## var_error    1.0  0.9440193 -5.5980736

print("TMB:")
## [1] "TMB:"
checks <- data.frame(
  truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
  estim = do.call(c, fits3p$TMB$report[c("var_GMA", "var_SMA1", "var_SMA2", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
##            truth      estim        nBE
## var_GMA     10.0 10.0390775  0.3907745
## var_SMA_ij   1.5  1.6726225 11.5081637
## var_SMA_ii   0.8  0.9481763 18.5220407
## var_error    1.0  0.9440195 -5.5980522

if ("MM4LMM" %in% names(fits3p)) {
  print(fits3p$MM4LMM$Sigma2)
}

5 Appendix

t1 <- proc.time()
t1 - t0
##    user  system elapsed 
## 185.289   2.547 190.107
print(sessionInfo(), locale = FALSE)
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Debian GNU/Linux 12 (bookworm)
## 
## Matrix products: default
## BLAS:   /usr/local/R/4.5.1/lib/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.11.0  LAPACK version 3.11.0
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plantmix_1.0.2 lme4_1.1-37    Matrix_1.7-3   ggplot2_3.5.2  knitr_1.50    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     dplyr_1.1.4        compiler_4.5.1    
##  [5] Rcpp_1.1.0         tidyselect_1.2.1   dichromat_2.0-0.1  jquerylib_0.1.4   
##  [9] splines_4.5.1      scales_1.4.0       boot_1.3-31        yaml_2.3.10       
## [13] fastmap_1.2.0      lattice_0.22-7     R6_2.6.1           generics_0.1.4    
## [17] igraph_2.1.4       rbibutils_2.3      MASS_7.3-65        tibble_3.3.0      
## [21] nloptr_2.2.1       minqa_1.2.8        TMB_1.9.17         bslib_0.9.0       
## [25] pillar_1.11.0      RColorBrewer_1.1-3 rlang_1.1.7        cachem_1.1.0      
## [29] xfun_0.52          sass_0.4.10        cli_3.6.5          withr_3.0.2       
## [33] magrittr_2.0.3     Rdpack_2.6.4       digest_0.6.37      grid_4.5.1        
## [37] lifecycle_1.0.5    nlme_3.1-168       reformulas_0.4.1   vctrs_0.6.5       
## [41] evaluate_1.0.4     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
## [45] tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1