This document covers the topic of finding combinations or permutations that meet a specific set of criteria. For example, retrieving all combinations of a vector that have a product between two bounds.
There are 5 compiled constraint functions that can be utilized efficiently to test a given result.
They are passed as strings to the constraintFun parameter. When these are employed without any other parameters being set, an additional column is added that represents the result of applying the given function to that combination/permutation. You can also set keepResults = TRUE (more on this later).
library(RcppAlgos)
## base R using combn and FUN
combnSum <- combn(20, 10, sum)
algosSum <- comboGeneral(20, 10, constraintFun = "sum")
## Notice the additional column (i.e. the 11th column)
head(algosSum)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
#> [1,]    1    2    3    4    5    6    7    8    9    10    55
#> [2,]    1    2    3    4    5    6    7    8    9    11    56
#> [3,]    1    2    3    4    5    6    7    8    9    12    57
#> [4,]    1    2    3    4    5    6    7    8    9    13    58
#> [5,]    1    2    3    4    5    6    7    8    9    14    59
#> [6,]    1    2    3    4    5    6    7    8    9    15    60
identical(as.integer(combnSum), algosSum[,11])
#> [1] TRUE
## Using parallel
paralSum <- comboGeneral(20, 10, constraintFun = "sum", Parallel = TRUE)
identical(paralSum, algosSum)
#> [1] TRUE
library(microbenchmark)
microbenchmark(serial = comboGeneral(20, 10, constraintFun = "sum"),
             parallel = comboGeneral(20, 10, constraintFun = "sum", Parallel = TRUE),
             combnSum = combn(20, 10, sum), unit = "relative")
#> Unit: relative
#>      expr        min         lq       mean    median         uq        max neval cld
#>    serial   3.177339   3.173798   3.054257   3.14192   2.948129   2.859115   100  a
#>  parallel   1.000000   1.000000   1.000000   1.00000   1.000000   1.000000   100  a
#>  combnSum 234.997953 208.891710 189.269384 201.36549 188.176221 176.208449   100   browSums and rowMeansFinding row sums or row means is even faster than simply applying the highly efficient rowSums/rowMeans after the combinations have already been generated:
## Pre-generate combinations
combs <- comboGeneral(25, 10)
## Testing rowSums alone against generating combinations as well as summing
microbenchmark(serial = comboGeneral(25, 10, constraintFun = "sum"),
             parallel = comboGeneral(25, 10, constraintFun = "sum", Parallel = TRUE),
              rowsums = rowSums(combs), unit = "relative")
#> Unit: relative
#>      expr     min       lq     mean   median       uq       max neval cld
#>    serial 3.18877 3.009145 2.592448 2.556819 2.446044 1.4423210   100   c
#>  parallel 1.00000 1.000000 1.000000 1.000000 1.000000 1.0000000   100 a
#>   rowsums 2.90689 2.636867 2.212371 2.069613 2.254097 0.6039177   100  b
all.equal(rowSums(combs),
          comboGeneral(25, 10,
                       constraintFun = "sum",
                       Parallel = TRUE)[,11])
#> [1] TRUE
## Testing rowMeans alone against generating combinations as well as obtain row means
microbenchmark(serial = comboGeneral(25, 10, constraintFun = "mean"),
             parallel = comboGeneral(25, 10, constraintFun = "mean", Parallel = TRUE),
             rowmeans = rowMeans(combs), unit = "relative")
#> Unit: relative
#>      expr      min       lq     mean   median       uq      max neval cld
#>    serial 3.063023 2.561861 2.411453 2.542078 2.529883 1.553592   100   c
#>  parallel 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100 a
#>  rowmeans 1.759344 1.443430 1.348248 1.418801 1.404468 0.532154   100  b
all.equal(rowMeans(combs),
          comboGeneral(25, 10,
                       constraintFun = "mean",
                       Parallel = TRUE)[,11])
#> [1] TRUEIn both cases above, RcppAlgos is doing double the work nearly twice as fast!!!
limitConstraintsThe standard 5 comparison operators (i.e. "<", ">", "<=", ">=", & "==") can be used in a variety of ways. In order for them to have any effect, they must be used in conjunction with constraintFun as well as limitConstraints. The latter is the value(s) that will be used for comparison. It can be passed as a single value or a vector of two numerical values. This is useful when one wants to find results that are between (or outside) of a given range.
First we will look at cases with only one comparison and one value for the limitConstraint.
## Generate some random data. N.B. Using R >= 4.0.0
set.seed(101)
myNums <- sample(500, 20)
myNums
#> [1] 329 313 430  95 209 442 351 317 444 315 246 355 128 131 288   9 352 489 354 244
## Find all 5-tuples combinations without repetition of myNums
## (defined above) such that the sum is equal to 1176.
p1 <- comboGeneral(v = myNums, m = 5,
                   constraintFun = "sum",
                   comparisonFun = "==",
                   limitConstraints = 1176)
tail(p1)
#>       [,1] [,2] [,3] [,4] [,5]
#> [10,]   95  128  246  352  355
#> [11,]   95  128  288  313  352
#> [12,]   95  131  244  351  355
#> [13,]   95  131  244  352  354
#> [14,]   95  209  244  313  315
#> [15,]  128  131  246  317  354
## Authenticate with brute force
allCombs <- comboGeneral(sort(myNums), 5)
identical(p1, allCombs[which(rowSums(allCombs) == 1176), ])
#> [1] TRUE
## How about finding combinations with repetition
## whose mean is less than or equal to 150.
p2 <- comboGeneral(v = myNums, m = 5, TRUE,
                   constraintFun = "mean",
                   comparisonFun = "<=",
                   limitConstraints = 150)
## Again, we authenticate with brute force
allCombs <- comboGeneral(sort(myNums), 5, TRUE)
identical(p2, allCombs[which(rowMeans(allCombs) <= 150), ])
#> [1] FALSE  ### <<--- What? They should be the same
## N.B.
class(p2[1, ])
#> [1] "numeric"
class(allCombs[1, ])
#> [1] "integer"
## When mean is employed or it can be determined that integral
## values will not suffice for the comparison, we fall back to
## numeric types, thus all.equal should return TRUE
all.equal(p2, allCombs[which(rowMeans(allCombs) <= 150), ])
#> [1] TRUESometimes, we need to generate combinations/permutations such that when we apply a constraint function, the results are between (or outside) a given range. There is a natural two step process when finding results outside a range, however for finding results between a range, this two step approach could become computationally demanding. The underlying algorithms in RcppAlgos are optimized for both cases and avoids adding results that will eventually be removed.
Using two comparisons is easy. The first comparison operator is applied to the first limit and the second operator is applied to the second limit.
Note that in the examples below, we have keepResults = TRUE. This means an additional column will be added to the output that is the result of applying constraintFun to that particular combination.
## Get combinations such that the product is
## strictly between 3600 and 4000
comboGeneral(5, 7, TRUE, constraintFun = "prod",
             comparisonFun = c(">","<"),          ## Find results > 3600 and < 4000
             limitConstraints = c(3600, 4000),
             keepResults = TRUE)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,]    1    2    3    5    5    5    5 3750
#> [2,]    1    3    4    4    4    4    5 3840
#> [3,]    2    2    3    4    4    4    5 3840
#> [4,]    3    3    3    3    3    3    5 3645
#> [5,]    3    3    3    3    3    4    4 3888
# ## The above is the same as doing the following:
# comboGeneral(5, 7, TRUE, constraintFun = "prod",
#              comparisonFun = c("<",">"),          ## Note that the comparison vector
#              limitConstraints = c(4000, 3600),    ## and the limits have flipped
#              keepResults = TRUE)
## What about finding combinations outside a range
outside <- comboGeneral(5, 7, TRUE, constraintFun = "prod",
                        comparisonFun = c("<=",">="),
                        limitConstraints = c(3600, 4000),
                        keepResults = TRUE)
all(apply(outside[, -8], 1, prod) <= 3600
     | apply(outside[, -8], 1, prod) >= 4000)
#> [1] TRUE
dim(outside)
#> [1] 325   8
## Note that we obtained 5 results when searching "between"
## 3600 and 4000. Thus we have: 325 + 5 = 330
comboCount(5, 7, T)
#> [1] 330toleranceWhen the underlying type is numeric, round-off errors can occur. As stated in floating-point error mitigation:
“By definition, floating-point error cannot be eliminated, and, at best, can only be managed.”
Here is a great stackoverflow post that further illuminates this tricky topic:
For these reasons, the argument tolerance can be utilized to refine a given constraint. It is added to the upper limit and subtracted from the lower limit. The default value is sqrt(.Machine$double.eps) ~= 0.00000001490116.
This default value is good and bad.
For the good side:
dim(comboGeneral(seq(0, 0.5, 0.05), 6, TRUE,
                 constraintFun = "sum",
                 comparisonFun = "==",
                 limitConstraints = 1))
#> [1] 199   6
## Confirm with integers and brute force
allCbs <- comboGeneral(seq(0L, 50L, 5L), 6, TRUE, constraintFun = "sum")
sum(allCbs[, 7] == 100L)
#> [1] 199If we had a tolerance of zero, we would have obtained an incorrect result:
## We miss 31 combinations that add up to 1
dim(comboGeneral(seq(0, 0.5, 0.05), 6, TRUE,
                 constraintFun = "sum",
                 comparisonFun = "==",
                 limitConstraints = 1, tolerance = 0))
#> [1] 168   6And now for a less desirable result. The example below appears to give incorrect results. That is, we shouldn’t return any combination with a mean of 4.1 or 5.1.
comboGeneral(c(2.1, 3.1, 5.1, 7.1), 3, T,
             constraintFun = "mean", comparisonFun = c("<", ">"),
             limitConstraints = c(5.1, 4.1), keepResults = TRUE)
#>      [,1] [,2] [,3]     [,4]
#> [1,]  2.1  3.1  7.1 4.100000
#> [2,]  2.1  5.1  5.1 4.100000
#> [3,]  2.1  5.1  7.1 4.766667
#> [4,]  3.1  3.1  7.1 4.433333
#> [5,]  3.1  5.1  5.1 4.433333
#> [6,]  3.1  5.1  7.1 5.100000
#> [7,]  5.1  5.1  5.1 5.100000In the above example, the range that is actually tested against is c(4.0999999950329462, 5.1000000049670531).
If you want to be absolutely sure you are getting the correct results, one must rely on integers as simple changes in arithmetic can throw off precision in floating point operations.
comboGeneral(c(21, 31, 51, 71), 3, T,
             constraintFun = "mean", comparisonFun = c("<", ">"),
             limitConstraints = c(51, 41), keepResults = TRUE) / 10
#>      [,1] [,2] [,3]     [,4]
#> [1,]    2    5    7 4.666667
#> [2,]    3    3    7 4.333333
#> [3,]    3    5    5 4.333333permuteGeneralTypically, when we call permuteGeneral, the output is in lexicographical order, however when we apply a constraint, the underlying algorithm checks against combinations only, as this is more efficient. If a particular combination meets a constraint, then all permutations of that vector also meet that constraint, so there is no need to check them. For this reason, the output isn’t in order. Observe:
permuteGeneral(c(2, 3, 5, 7), 3, freqs = rep(2, 4),
               constraintFun = "mean", comparisonFun = c(">", "<"),
               limitConstraints = c(4, 5), keepResults = TRUE, tolerance = 0)
#>       [,1] [,2] [,3]     [,4]
#>  [1,]    2    5    7 4.666667   <-- First combination that meets the criteria
#>  [2,]    2    7    5 4.666667
#>  [3,]    5    2    7 4.666667
#>  [4,]    5    7    2 4.666667
#>  [5,]    7    2    5 4.666667
#>  [6,]    7    5    2 4.666667
#>  [7,]    3    3    7 4.333333   <-- Second combination that meets the criteria
#>  [8,]    3    7    3 4.333333
#>  [9,]    7    3    3 4.333333
#> [10,]    3    5    5 4.333333   <-- Third combination that meets the criteria
#> [11,]    5    3    5 4.333333
#> [12,]    5    5    3 4.333333As you can see, the 2nd through the 6th entries are simply permutations of the 1st entry. Similarly, entries 8 and 9 are permutations of the 7th and entries 11 and 12 are permutations of the 10th.
In version 2.4.* it was possible to interrupt long running jobs by taking advantage of Rcpp::checkUserInterrupt. Since we no longer rely on Rcpp in version 2.5.0, we are currently not checking for user interruption. Our goal is to eventually include this feature as it is very convenient, however it will take great care to implement as it is non-trivial.
For now, we encourage user to use iterators (See Combinatorial Iterators in RcppAlgos) as they offer greater flexibility. For example, with iterators it is easy to avoid resource consuming calls by only fetching a few results at a time.
Here is an example of how to investigate difficult problems due to combinatorial explosion without fear of having to restart R.
set.seed(123)
s <- rnorm(1000)
## In previous versions, you could execute commands and interrupt
## them if it took a really long.
##
## Now, we use iterators.
iter <- comboIter(s, 20, TRUE,
                  constraintFun = "sum",
                  comparisonFun = "==",
                  limitConstraints = 0)
## Test one iteration to see if we need to relax the tolerance
system.time(iter@nextIter())
#>    user  system elapsed
#>   7.448   0.013   7.468
## 8 seconds per iteration is a bit much... Let's loosen things
## a little by increasing the tolerance from sqrt(.Machine$double.eps)
## ~= 1.49e-8 to 1e-5.
relaxedIter <- comboIter(s, 20, TRUE,
                         constraintFun = "sum",
                         comparisonFun = "==",
                         limitConstraints = 0,
                         tolerance = 1e-5)
system.time(relaxedIter@nextIter())
#>    user  system elapsed
#>   0.001   0.000   0.001Specialized algorithms are employed when it can be determined that we are looking for integer partitions.
As of version 2.5.0, we now have added partitionsGeneral which is a wrapper of comboGeneral with constraintFun = "sum" and comparisonFun = "==". Instead of using the very general limitConstraints parameter, we use target with a default of max(abs(v)) as it seems more fitting for partitions.
We need v = 0:N, repetition = TRUE. When we leave m = NULL, m is internally set to the length of the longest non-zero combination (this is true for all cases below).
partitionsGeneral(0:5, repetition = TRUE)
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    0    0    0    0    5
#> [2,]    0    0    0    1    4
#> [3,]    0    0    0    2    3
#> [4,]    0    0    1    1    3
#> [5,]    0    0    1    2    2
#> [6,]    0    1    1    1    2
#> [7,]    1    1    1    1    1
## Note that we could also use comboGeneral:
## comboGeneral(0:5, repetition = TRUE,
##              constraintFun = "sum",
##              comparisonFun = "==", limitConstraints = 5)
##
## The same goes for any of the examples belowEverything is the same as above except for explicitly setting the desired length and deciding whether to include zero or not.
## Including zero
partitionsGeneral(0:5, 3, repetition = TRUE)
#>      [,1] [,2] [,3]
#> [1,]    0    0    5
#> [2,]    0    1    4
#> [3,]    0    2    3
#> [4,]    1    1    3
#> [5,]    1    2    2
## Zero not included
partitionsGeneral(5, 3, repetition = TRUE)
#>      [,1] [,2] [,3]
#> [1,]    1    1    3
#> [2,]    1    2    2Same as Case 1 & 2 except now we have repetition = FALSE.
partitionsGeneral(0:10)
#>      [,1] [,2] [,3] [,4]
#> [1,]    0    1    2    7
#> [2,]    0    1    3    6
#> [3,]    0    1    4    5
#> [4,]    0    2    3    5
#> [5,]    1    2    3    4
## Zero not included and restrict the length
partitionsGeneral(10, 3)
#>      [,1] [,2] [,3]
#> [1,]    1    2    7
#> [2,]    1    3    6
#> [3,]    1    4    5
#> [4,]    2    3    5
## Include zero and restrict the length
partitionsGeneral(0:10, 3)
#>      [,1] [,2] [,3]
#> [1,]    0    1    9
#> [2,]    0    2    8
#> [3,]    0    3    7
#> [4,]    0    4    6
#> [5,]    1    2    7
#> [6,]    1    3    6
#> [7,]    1    4    5
#> [8,]    2    3    5
## partitions of 10 into distinct parts of every length
lapply(1:4, function(x) {
    partitionsGeneral(10, x)
})
#> [[1]]
#>      [,1]
#> [1,]   10
#>
#> [[2]]
#>      [,1] [,2]
#> [1,]    1    9
#> [2,]    2    8
#> [3,]    3    7
#> [4,]    4    6
#>
#> [[3]]
#>      [,1] [,2] [,3]
#> [1,]    1    2    7
#> [2,]    1    3    6
#> [3,]    1    4    5
#> [4,]    2    3    5
#>
#> [[4]]
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    2    3    4freqs to Refine LengthWe can utilize the freqs argument to obtain more distinct partitions by allowing for repeated zeros. The super optimized algorithm will only be carried out if zero is included and the number of repetitions for every number except zero is one.
For example, given v = 0:N and J >= 1, if freqs = c(J, rep(1, N)), then the super optimized algorithm will be used, however if freqs = c(J, 2, rep(1, N - 1)), the general algorithm will be used. It should be noted that the general algorithms are still highly optimized so one should not fear using it.
A pattern that is guaranteed to retrieve all distinct partitions of N is to set v = 0:N and freqs = c(N, rep(1, N)) (the extra zeros will be left off).
## Obtain all distinct partitions of 10
partitionsGeneral(0:10, freqs = c(10, rep(1, 10)))    ## Same as c(3, rep(1, 10))
#>       [,1] [,2] [,3] [,4]
#>  [1,]    0    0    0   10
#>  [2,]    0    0    1    9
#>  [3,]    0    0    2    8
#>  [4,]    0    0    3    7
#>  [5,]    0    0    4    6
#>  [6,]    0    1    2    7
#>  [7,]    0    1    3    6
#>  [8,]    0    1    4    5
#>  [9,]    0    2    3    5
#> [10,]    1    2    3    4freqsAs noted in Case 1, if m = NULL, the length of the output will be determined by the longest non-zero combination that sums to N.
## m is NOT NULL and output has at most 2 zeros
partitionsGeneral(0:10, 3, freqs = c(2, rep(1, 10)))
#>      [,1] [,2] [,3]
#> [1,]    0    0   10
#> [2,]    0    1    9
#> [3,]    0    2    8
#> [4,]    0    3    7
#> [5,]    0    4    6
#> [6,]    1    2    7
#> [7,]    1    3    6
#> [8,]    1    4    5
#> [9,]    2    3    5
## m is NULL and output has at most 2 zeros
partitionsGeneral(0:10, freqs = c(2, rep(1, 10)))
#>      [,1] [,2] [,3] [,4]
#> [1,]    0    0    1    9
#> [2,]    0    0    2    8
#> [3,]    0    0    3    7
#> [4,]    0    0    4    6
#> [5,]    0    1    2    7
#> [6,]    0    1    3    6
#> [7,]    0    1    4    5
#> [8,]    0    2    3    5
#> [9,]    1    2    3    4Note, as of version 2.5.0, one can generate partitions in parallel using the nThreads argument.
## partitions of 60
partitionsCount(0:60, repetition = TRUE)
#> [1] 966467
## Single threaded
system.time(partitionsGeneral(0:60, repetition = TRUE))
#>  user  system elapsed
#> 0.092   0.051   0.143
## Using nThreads
system.time(partitionsGeneral(0:60, repetition = TRUE, nThreads=4))
#>  user  system elapsed
#> 0.116   0.103   0.056
## partitions of 120 into distinct parts
partitionsCount(0:120, freqs = c(120, rep(1, 120)))
#> [1] 2194432
system.time(partitionsGeneral(0:120, freqs = c(120, rep(1, 120))))
#>  user  system elapsed
#>  0.11    0.00    0.11
system.time(partitionsGeneral(0:120, freqs = c(120, rep(1, 120)), nThreads=4))
#>  user  system elapsed
#> 0.146   0.001   0.038