Estimates the runtimes of jobs using the random forest implemented in ranger.
Observed runtimes are retrieved from the Registry
and runtimes are
predicted for unfinished jobs.
The estimated remaining time is calculated in the print
method.
You may also pass n
here to determine the number of parallel jobs which is then used
in a simple Longest Processing Time (LPT) algorithm to give an estimate for the parallel runtime.
Usage
estimateRuntimes(tab, ..., reg = getDefaultRegistry())
# S3 method for class 'RuntimeEstimate'
print(x, n = 1L, ...)
Arguments
- tab
[
data.table
]
Table with column “job.id” and additional columns to predict the runtime. Observed runtimes will be looked up in the registry and serve as dependent variable. All columns in tab
except “job.id” will be passed to ranger
as independent variables to fit the model.- ...
[ANY]
Additional parameters passed to ranger
. Ignored for the print
method.- reg
[
Registry
]
Registry. If not explicitly passed, uses the default registry (see setDefaultRegistry
).- x
[
RuntimeEstimate
]
Object to print.- n
[
integer(1)
]
Number of parallel jobs to assume for runtime estimation.
Value
[RuntimeEstimate
] which is a list
with two named elements:
“runtimes” is a data.table
with columns “job.id”,
“runtime” (in seconds) and “type” (“estimated” if runtime is estimated,
“observed” if runtime was observed).
The other element of the list, named “model”, contains the fitted random forest object.
Examples
# Create a simple toy registry
set.seed(1)
tmp = makeExperimentRegistry(file.dir = NA, make.default = FALSE, seed = 1)
#> No readable configuration file found
#> Created registry in '/tmp/batchtools-example/reg' using cluster functions 'Interactive'
addProblem(name = "iris", data = iris, fun = function(data, ...) nrow(data), reg = tmp)
#> Adding problem 'iris'
addAlgorithm(name = "nrow", function(instance, ...) nrow(instance), reg = tmp)
#> Adding algorithm 'nrow'
addAlgorithm(name = "ncol", function(instance, ...) ncol(instance), reg = tmp)
#> Adding algorithm 'ncol'
addExperiments(algo.designs = list(nrow = data.table::CJ(x = 1:50, y = letters[1:5])), reg = tmp)
#> Adding 250 experiments ('iris'[1] x 'nrow'[250] x repls[1]) ...
addExperiments(algo.designs = list(ncol = data.table::CJ(x = 1:50, y = letters[1:5])), reg = tmp)
#> Adding 250 experiments ('iris'[1] x 'ncol'[250] x repls[1]) ...
# We use the job parameters to predict runtimes
tab = unwrap(getJobPars(reg = tmp))
# First we need to submit some jobs so that the forest can train on some data.
# Thus, we just sample some jobs from the registry while grouping by factor variables.
library(data.table)
ids = tab[, .SD[sample(nrow(.SD), 5)], by = c("problem", "algorithm", "y")]
setkeyv(ids, "job.id")
submitJobs(ids, reg = tmp)
#> Submitting 50 jobs in 50 chunks using cluster functions 'Interactive' ...
waitForJobs(reg = tmp)
#> [1] TRUE
# We "simulate" some more realistic runtimes here to demonstrate the functionality:
# - Algorithm "ncol" is 5 times more expensive than "nrow"
# - x has no effect on the runtime
# - If y is "a" or "b", the runtimes are really high
# Simulated runtime in seconds: "nrow" jobs have a base cost of 100s, all
# other algorithms 500s; values "a" or "b" for y add a flat 1000s penalty.
# x is accepted but deliberately unused (it has no effect on the runtime).
runtime = function(algorithm, x, y) {
  base = ifelse(algorithm == "nrow", 100L, 500L)
  penalty = 1000L * (y %in% c("a", "b"))
  base + penalty
}
tmp$status[ids, done := done + tab[ids, runtime(algorithm, x, y)]]
#> Key: <job.id>
#> job.id def.id submitted started done error mem.used resource.id
#> <int> <int> <num> <num> <num> <char> <num> <int>
#> 1: 1 1 NA NA NA <NA> NA NA
#> 2: 2 2 NA NA NA <NA> NA NA
#> 3: 3 3 NA NA NA <NA> NA NA
#> 4: 4 4 NA NA NA <NA> NA NA
#> 5: 5 5 NA NA NA <NA> NA NA
#> ---
#> 496: 496 496 NA NA NA <NA> NA NA
#> 497: 497 497 NA NA NA <NA> NA NA
#> 498: 498 498 NA NA NA <NA> NA NA
#> 499: 499 499 1755681747 1755681747 1755682247 <NA> NA 1
#> 500: 500 500 NA NA NA <NA> NA NA
#> batch.id log.file job.hash job.name repl
#> <char> <char> <char> <char> <int>
#> 1: <NA> <NA> <NA> <NA> 1
#> 2: <NA> <NA> <NA> <NA> 1
#> 3: <NA> <NA> <NA> <NA> 1
#> 4: <NA> <NA> <NA> <NA> 1
#> 5: <NA> <NA> <NA> <NA> 1
#> ---
#> 496: <NA> <NA> <NA> <NA> 1
#> 497: <NA> <NA> <NA> <NA> 1
#> 498: <NA> <NA> <NA> <NA> 1
#> 499: cfInteractive <NA> job5af85cb44724cbffc9d33af5cfa4c77d <NA> 1
#> 500: <NA> <NA> <NA> <NA> 1
rjoin(sjoin(tab, ids), getJobStatus(ids, reg = tmp)[, c("job.id", "time.running")])
#> Key: <job.id>
#> job.id problem algorithm x y time.running
#> <int> <char> <char> <int> <char> <difftime>
#> 1: 32 iris nrow 7 b 1100.0341 secs
#> 2: 42 iris nrow 9 b 1100.0341 secs
#> 3: 47 iris nrow 10 b 1100.0338 secs
#> 4: 66 iris nrow 14 a 1100.0342 secs
#> 5: 73 iris nrow 15 c 100.0331 secs
#> 6: 75 iris nrow 15 e 100.0329 secs
#> 7: 86 iris nrow 18 a 1100.0329 secs
#> 8: 100 iris nrow 20 e 100.0344 secs
#> 9: 101 iris nrow 21 a 1100.0336 secs
#> 10: 103 iris nrow 21 c 100.0342 secs
#> 11: 123 iris nrow 25 c 100.0330 secs
#> 12: 125 iris nrow 25 e 100.0328 secs
#> 13: 161 iris nrow 33 a 1100.0365 secs
#> 14: 165 iris nrow 33 e 100.0341 secs
#> 15: 169 iris nrow 34 d 100.0340 secs
#> 16: 183 iris nrow 37 c 100.0339 secs
#> 17: 184 iris nrow 37 d 100.0329 secs
#> 18: 203 iris nrow 41 c 100.0327 secs
#> 19: 207 iris nrow 42 b 1100.0373 secs
#> 20: 209 iris nrow 42 d 100.0337 secs
#> 21: 220 iris nrow 44 e 100.0342 secs
#> 22: 227 iris nrow 46 b 1100.0332 secs
#> 23: 229 iris nrow 46 d 100.0330 secs
#> 24: 231 iris nrow 47 a 1100.0330 secs
#> 25: 244 iris nrow 49 d 100.0350 secs
#> 26: 260 iris ncol 2 e 500.0336 secs
#> 27: 276 iris ncol 6 a 1500.0339 secs
#> 28: 278 iris ncol 6 c 500.0329 secs
#> 29: 279 iris ncol 6 d 500.0329 secs
#> 30: 296 iris ncol 10 a 1500.0342 secs
#> 31: 320 iris ncol 14 e 500.0340 secs
#> 32: 340 iris ncol 18 e 500.0345 secs
#> 33: 347 iris ncol 20 b 1500.0329 secs
#> 34: 363 iris ncol 23 c 500.0329 secs
#> 35: 369 iris ncol 24 d 500.0370 secs
#> 36: 373 iris ncol 25 c 500.0353 secs
#> 37: 387 iris ncol 28 b 1500.0343 secs
#> 38: 410 iris ncol 32 e 500.0332 secs
#> 39: 421 iris ncol 35 a 1500.0327 secs
#> 40: 436 iris ncol 38 a 1500.0365 secs
#> 41: 444 iris ncol 39 d 500.0365 secs
#> 42: 448 iris ncol 40 c 500.0343 secs
#> 43: 456 iris ncol 42 a 1500.0331 secs
#> 44: 459 iris ncol 42 d 500.0330 secs
#> 45: 467 iris ncol 44 b 1500.0547 secs
#> 46: 468 iris ncol 44 c 500.0348 secs
#> 47: 475 iris ncol 45 e 500.0336 secs
#> 48: 482 iris ncol 47 b 1500.0329 secs
#> 49: 492 iris ncol 49 b 1500.0330 secs
#> 50: 499 iris ncol 50 d 500.0332 secs
#> job.id problem algorithm x y time.running
# Estimate runtimes:
est = estimateRuntimes(tab, reg = tmp)
print(est)
#> Runtime Estimate for 500 jobs with 1 CPUs
#> Done : 0d 09h 43m 21.7s
#> Remaining: 3d 17h 38m 27.5s
#> Total : 4d 03h 21m 49.2s
rjoin(tab, est$runtimes)
#> Key: <job.id>
#> job.id problem algorithm x y type runtime
#> <int> <char> <char> <int> <char> <fctr> <num>
#> 1: 1 iris nrow 1 a estimated 1106.8478
#> 2: 2 iris nrow 1 b estimated 1090.8022
#> 3: 3 iris nrow 1 c estimated 337.8401
#> 4: 4 iris nrow 1 d estimated 318.5059
#> 5: 5 iris nrow 1 e estimated 318.5499
#> ---
#> 496: 496 iris ncol 50 a estimated 1381.0745
#> 497: 497 iris ncol 50 b estimated 1387.9654
#> 498: 498 iris ncol 50 c estimated 614.6164
#> 499: 499 iris ncol 50 d observed 500.0332
#> 500: 500 iris ncol 50 e estimated 574.8083
print(est, n = 10)
#> Runtime Estimate for 500 jobs with 10 CPUs
#> Done : 0d 09h 43m 21.7s
#> Remaining: 3d 17h 38m 27.5s
#> Parallel : 0d 08h 58m 29.4s
#> Total : 4d 03h 21m 49.2s
# Submit jobs with longest runtime first:
ids = est$runtimes[type == "estimated"][order(runtime, decreasing = TRUE)]
print(ids)
#> job.id type runtime
#> <int> <fctr> <num>
#> 1: 466 estimated 1423.2531
#> 2: 461 estimated 1419.8582
#> 3: 472 estimated 1417.1139
#> 4: 462 estimated 1416.9590
#> 5: 457 estimated 1416.1590
#> ---
#> 446: 194 estimated 133.3099
#> 447: 185 estimated 132.9607
#> 448: 174 estimated 131.1211
#> 449: 204 estimated 130.3195
#> 450: 179 estimated 129.9743
if (FALSE) { # \dontrun{
submitJobs(ids, reg = tmp)
} # }
# Group jobs into chunks with runtime < 1h
ids = est$runtimes[type == "estimated"]
ids[, chunk := binpack(runtime, 3600)]
#> Key: <job.id>
#> job.id type runtime chunk
#> <int> <fctr> <num> <int>
#> 1: 1 estimated 1106.8478 47
#> 2: 2 estimated 1090.8022 51
#> 3: 3 estimated 337.8401 54
#> 4: 4 estimated 318.5059 69
#> 5: 5 estimated 318.5499 32
#> ---
#> 446: 495 estimated 582.5429 15
#> 447: 496 estimated 1381.0745 21
#> 448: 497 estimated 1387.9654 17
#> 449: 498 estimated 614.6164 4
#> 450: 500 estimated 574.8083 26
print(ids)
#> Key: <job.id>
#> job.id type runtime chunk
#> <int> <fctr> <num> <int>
#> 1: 1 estimated 1106.8478 47
#> 2: 2 estimated 1090.8022 51
#> 3: 3 estimated 337.8401 54
#> 4: 4 estimated 318.5059 69
#> 5: 5 estimated 318.5499 32
#> ---
#> 446: 495 estimated 582.5429 15
#> 447: 496 estimated 1381.0745 21
#> 448: 497 estimated 1387.9654 17
#> 449: 498 estimated 614.6164 4
#> 450: 500 estimated 574.8083 26
print(ids[, list(runtime = sum(runtime)), by = chunk])
#> chunk runtime
#> <int> <num>
#> 1: 47 3494.080
#> 2: 51 3597.429
#> 3: 54 3591.045
#> 4: 69 3498.425
#> 5: 32 3597.399
#> 6: 48 3491.006
#> 7: 52 3599.996
#> 8: 55 3589.659
#> 9: 70 3494.986
#> 10: 33 3596.556
#> 11: 56 3579.751
#> 12: 71 3492.356
#> 13: 34 3593.020
#> 14: 57 3571.296
#> 15: 72 3489.163
#> 16: 46 3519.206
#> 17: 50 3599.309
#> 18: 38 3596.719
#> 19: 53 3595.735
#> 20: 35 3594.687
#> 21: 43 3564.789
#> 22: 65 3515.758
#> 23: 36 3593.596
#> 24: 42 3574.589
#> 25: 63 3529.568
#> 26: 62 3532.593
#> 27: 39 3596.573
#> 28: 59 3550.710
#> 29: 58 3560.229
#> 30: 49 3486.985
#> 31: 41 3587.229
#> 32: 60 3539.632
#> 33: 64 3525.702
#> 34: 40 3595.411
#> 35: 37 3599.172
#> 36: 61 3536.266
#> 37: 44 3535.875
#> 38: 66 3513.120
#> 39: 45 3535.075
#> 40: 67 3509.344
#> 41: 68 3507.273
#> 42: 28 3592.495
#> 43: 25 3594.225
#> 44: 26 3589.899
#> 45: 27 3595.539
#> 46: 24 3597.866
#> 47: 29 3570.517
#> 48: 75 3598.764
#> 49: 20 3522.870
#> 50: 74 3472.064
#> 51: 10 3583.204
#> 52: 31 3599.119
#> 53: 6 3594.824
#> 54: 9 3590.023
#> 55: 12 3562.949
#> 56: 5 3598.228
#> 57: 11 3574.173
#> 58: 7 3595.615
#> 59: 4 3598.811
#> 60: 81 3477.688
#> 61: 83 3599.440
#> 62: 3 3599.955
#> 63: 82 3599.987
#> 64: 73 3491.335
#> 65: 79 3497.623
#> 66: 80 3486.885
#> 67: 76 3584.110
#> 68: 87 3567.462
#> 69: 91 2032.264
#> 70: 77 3522.605
#> 71: 89 3551.033
#> 72: 88 3558.315
#> 73: 78 3507.098
#> 74: 84 3593.708
#> 75: 2 3599.786
#> 76: 86 3576.357
#> 77: 90 3528.326
#> 78: 85 3584.227
#> 79: 1 3599.806
#> 80: 8 3598.984
#> 81: 22 3597.463
#> 82: 18 3573.997
#> 83: 13 3596.548
#> 84: 30 3556.171
#> 85: 19 3569.384
#> 86: 16 3585.008
#> 87: 21 3597.847
#> 88: 17 3579.601
#> 89: 23 3599.390
#> 90: 14 3598.870
#> 91: 15 3593.749
#> chunk runtime
if (FALSE) { # \dontrun{
submitJobs(ids, reg = tmp)
} # }
# Group jobs into 10 chunks with similar runtime
ids = est$runtimes[type == "estimated"]
ids[, chunk := lpt(runtime, 10)]
#> Key: <job.id>
#> job.id type runtime chunk
#> <int> <fctr> <num> <int>
#> 1: 1 estimated 1106.8478 4
#> 2: 2 estimated 1090.8022 10
#> 3: 3 estimated 337.8401 9
#> 4: 4 estimated 318.5059 1
#> 5: 5 estimated 318.5499 7
#> ---
#> 446: 495 estimated 582.5429 10
#> 447: 496 estimated 1381.0745 2
#> 448: 497 estimated 1387.9654 7
#> 449: 498 estimated 614.6164 5
#> 450: 500 estimated 574.8083 8
print(ids[, list(runtime = sum(runtime)), by = chunk])
#> chunk runtime
#> <int> <num>
#> 1: 4 32234.78
#> 2: 10 32309.39
#> 3: 9 32235.20
#> 4: 1 32309.02
#> 5: 7 32240.06
#> 6: 3 32300.30
#> 7: 8 32309.07
#> 8: 6 32234.50
#> 9: 5 32234.59
#> 10: 2 32300.62