Skip to contents

Timings hydrorecipes vs recipes

Benchmark expressions for the hydrorecipes package are prefixed with an “h”. The first few comparisons include the R6 interface in hydrorecipes to check whether there is a loss of speed compared to the standard API. Most users are likely to use the standard API, so the remaining benchmarks present only that. Typical speed improvements are between 2x and 10x, and memory consumption is typically half that of the recipes package.

creating a recipe

# Shared benchmark settings reused by the chunks below: whether bench::mark()
# reports relative values, and the row counts to benchmark.
relative <- TRUE
n <- c(1e2, 1e4, 5e6)
formula <- y ~ x

# Time recipe construction through the hydrorecipes R6 constructor (hrec1),
# the hydrorecipes `recipe()` function (hrec2) and recipes (rec).
# Results are not compared with each other (check = FALSE).
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows)
    )
    bench::mark(
      hrec1 = hydrorecipes:::Recipe$new(formula = formula, data = dat),
      hrec2 = recipe(formula = formula, data = dat),
      rec = recipes::recipe(formula = formula, data = dat),
      relative = relative,
      check = FALSE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 9 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1          100  1      1.00      5.80    532.       1   
#> 2 hrec2          100  1.01   1         5.76      1        1.02
#> 3 rec            100  6.01   5.78      1      5187.       1.54
#> 4 hrec1        10000  1      1         5.70      1        2.02
#> 5 hrec2        10000  1.01   1.01      5.48      1        2.02
#> 6 rec          10000  5.92   5.70      1         4.34     1   
#> 7 hrec1      5000000  1      1.00      5.69      1        2.02
#> 8 hrec2      5000000  1.00   1         5.75      1        2.02
#> 9 rec        5000000  5.93   5.72      1         4.34     1

add a step

# Time creating a recipe and attaching a single centering step to it.
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows)
    )
    bench::mark(
      # R6 interface: build the recipe object, then add the step object.
      hrec1 = hydrorecipes:::Recipe$new(formula = formula, data = dat)$
        add_step(hydrorecipes:::StepCenter$new(x)),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_center(x),
      rec = {
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_center(x)
      },
      relative = relative,
      check = FALSE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 9 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1          100  1      1         3.89     54.9      2.01
#> 2 hrec2          100  1.04   1.04      3.68      1        2.01
#> 3 rec            100  3.99   3.88      1        10.5      1   
#> 4 hrec1        10000  1      1         3.85      1        1   
#> 5 hrec2        10000  1.04   1.04      3.75      1        1.00
#> 6 rec          10000  4.00   3.87      1         1.32     1.01
#> 7 hrec1      5000000  1      1         3.87      1        2.02
#> 8 hrec2      5000000  1.04   1.04      3.73      1        2.02
#> 9 rec        5000000  4.02   3.86      1         1.32     1

step_center prep

# Time `prep()` on its own for a recipe with one centering step.
# The three recipes are built once per input size, before `bench::mark()`,
# so recipe construction is not part of the timed expressions.
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows)
    )
    hrec1 <- hydrorecipes:::Recipe$new(formula = formula, data = dat)$
      add_step(hydrorecipes:::StepCenter$new(x))
    hrec2 <- recipe(formula = formula, data = dat) |>
      step_center(x)
    rec <- recipes::recipe(formula = formula, data = dat) |>
      recipes::step_center(x)
    # The timed expressions are left unnamed, so each row of the results is
    # labelled with the expression itself.
    bench::mark(
      hrec1$prep(),
      hrec2 |> prep(),
      rec |> recipes::prep(),
      check = FALSE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 9 × 7
#>   expression            rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>           <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1$prep()           100  1.00   1         80.8      53.8     1.00
#> 2 prep(hrec2)            100  1      1.00      82.7       1       1   
#> 3 recipes::prep(rec)     100 85.2   83.9        1      1516.      1.05
#> 4 hrec1$prep()         10000  1.00   1         80.0     NaN       1.01
#> 5 prep(hrec2)          10000  1      1.00      79.2     NaN       1   
#> 6 recipes::prep(rec)   10000 83.8   81.8        1       Inf       1.06
#> 7 hrec1$prep()       5000000  1.01   1         98.8     NaN     NaN   
#> 8 prep(hrec2)        5000000  1      1.00      98.9     NaN     NaN   
#> 9 recipes::prep(rec) 5000000 50.6  123.         1       Inf     Inf

step_center prep and bake

# Time `prep()` followed by `bake()` for a recipe with one centering step.
# The recipes themselves are built once per input size, outside the timed
# expressions.
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows)
    )
    hrec1 <- hydrorecipes:::Recipe$new(formula = formula, data = dat)$
      add_step(hydrorecipes:::StepCenter$new(x))
    hrec2 <- recipe(formula = formula, data = dat) |>
      step_center(x)
    rec <- recipes::recipe(formula = formula, data = dat) |>
      recipes::step_center(x)

    # The timed expressions are left unnamed, so each row of the results is
    # labelled with the expression itself.
    bench::mark(
      hrec1$prep()$bake(),
      hrec2 |> prep() |> bake(),
      rec |> recipes::prep() |> recipes::bake(new_data = NULL),
      check = FALSE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 9 × 7
#>   expression                      rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                     <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1$prep()$bake()              1e2  1      1.00      71.5     47.6      1.00
#> 2 bake(prep(hrec2))                1e2  1.00   1         73.2      1        1   
#> 3 recipes::bake(recipes::prep(r…   1e2 77.0   74.4        1       45.6      1.06
#> 4 hrec1$prep()$bake()              1e4  1      1         74.6      1        1.00
#> 5 bake(prep(hrec2))                1e4  1.01   1.01      74.1      1        1   
#> 6 recipes::bake(recipes::prep(r…   1e4 77.5   75.1        1        3.44     1.07
#> 7 hrec1$prep()$bake()              5e6  1.00   1         65.0      1      NaN   
#> 8 bake(prep(hrec2))                5e6  1      1.00      64.9      1      NaN   
#> 9 recipes::bake(recipes::prep(r…   5e6 47.6   47.2        1        3.00   Inf

step_center

# Full pipeline for centering `x`; only the centered `x` column is extracted
# from each result, and check = TRUE requires both columns to be equal.
formula <- y ~ x + z

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows),
      z = rnorm(rows)
    )

    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_center(x) |>
          plate()
      )[["x"]],
      rec = (
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_center(x) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[["x"]],
      relative = relative,
      min_iterations = 1L,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100   1      1        14.7      5.19     2.00
#> 2 rec            100  14.9   14.6       1        1        1   
#> 3 hrec         10000   1      1        14.5      1        1   
#> 4 rec          10000  14.7   14.5       1        3.29     1.06
#> 5 hrec       5000000   1      1        11.1      1        4.51
#> 6 rec        5000000  11.7   11.2       1        3.00     1

step_scale

# Full pipeline for scaling `x`; only the scaled `x` column is extracted from
# each result, and check = TRUE requires both columns to be equal.
formula <- y ~ x

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_scale(x, fun = fsd, n_sd = 2L) |>
          plate()
      )[["x"]],
      rec = (
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_scale(x, factor = 2L) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[["x"]],
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1        14.1       4.50     1.98
#> 2 rec            100 14.5   14.0       1         1        1   
#> 3 hrec         10000  1      1        13.4       1        1   
#> 4 rec          10000 13.6   13.3       1         2.82     1.05
#> 5 hrec       5000000  1      1         2.88      1        1   
#> 6 rec        5000000  2.86   2.89      1         2.50     1.04

step_intercept

# Full pipeline for adding an intercept column; only the "intercept" column
# is extracted from each result and compared (check = TRUE).
formula <- y ~ x
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows)
    )
    bench::mark(
      hrec = (
        recipe(formula = formula, data = dat) |>
          step_intercept() |>
          plate("tbl")
      )[["intercept"]],
      rec = (
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_intercept() |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[["intercept"]],
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1        14.4       4.65     1.97
#> 2 rec            100 14.7   14.5       1         1        1   
#> 3 hrec         10000  1      1        14.5       1        1   
#> 4 rec          10000 14.5   14.4       1         1.40     1.05
#> 5 hrec       5000000  1      1         5.84      1        4.52
#> 6 rec        5000000  8.72   5.79      1         1.00     1

step_normalize

# Normalizing three columns: a single hydrorecipes normalize step (hrec1),
# separate hydrorecipes center + scale steps (hrec2), and recipes (rec).
# check = TRUE requires all three results to be equal.
formula <- y ~ x + z
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows),
      z = rnorm(rows)
    )

    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_normalize(c(x, z, y)) |>
          plate("tbl")
      )[, c("x", "z", "y")],

      hrec2 = (
        recipe(formula = formula, data = dat) |>
          step_center(c(x, z, y)) |>
          step_scale(c(x, z, y)) |>
          plate("tbl")
      )[, c("x", "z", "y")],

      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_normalize(x, y, z) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),

      check = TRUE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 9 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1          100  1      1        14.0      52.9      2.02
#> 2 hrec2          100  1.36   1.35     10.3       1        1   
#> 3 rec            100 14.2   14.0       1         8.58     2.17
#> 4 hrec1        10000  1      1        12.1       1        2.02
#> 5 hrec2        10000  1.31   1.30      9.37      1.00     2.02
#> 6 rec          10000 12.1   12.3       1         1.41     1   
#> 7 hrec1      5000000  1      1         1.85      1        1.85
#> 8 hrec2      5000000  1.03   1.00      1.85      1.00     1.39
#> 9 rec        5000000  1.86   2.36      1         1.33     1

step_drop_columns

# Dropping column `z`: hydrorecipes `step_drop_columns()` against
# `recipes::step_rm()`. check = TRUE requires both results to be equal.
#
# The formula is set explicitly so this chunk does not depend on whatever
# `formula` the previously run chunk left behind; `z` must be part of the
# recipe for it to be dropped.
formula <- as.formula(y ~ x + z)

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_drop_columns(z) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_rm(z) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      check = TRUE,
      relative = relative
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100   1      1        14.8      7.53     2.02
#> 2 rec            100  15.1   14.8       1        1        1   
#> 3 hrec         10000   1      1        14.7      1        1   
#> 4 rec          10000  15.0   14.8       1       57.2      1.05
#> 5 hrec       5000000   1      1        40.9      1        1   
#> 6 rec        5000000  41.7   40.8       1    14563.       1.00

step_subset_na_omit

# Removing rows with missing `x`: hydrorecipes `step_subset_na_omit()`
# against `recipes::step_naomit()`. Absolute timings are reported
# (relative = FALSE) and both results must be equal (check = TRUE).
formula <- y ~ x + z

results <- bench::press(
  rows = c(1e6, 1e7),
  {
    dat <- tibble(
      x = rnorm(rows),
      z = rnorm(rows),
      y = rnorm(rows)
    )
    # Insert NA values into a different row range of each column.
    dat[1:5, "x"] <- NA_real_
    dat[100:150, "z"] <- NA_real_
    dat[10000:15000, "y"] <- NA_real_

    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_subset_na_omit(terms = x) |>
          prep() |>
          bake()
      )$get_result("tbl"),

      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_naomit(x) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),

      check = TRUE,
      min_iterations = 1L,
      relative = FALSE
    )
  }
)
#> Running with:
#>       rows
#> 1  1000000
#> 2 10000000

results
#> # A tibble: 4 × 7
#>   expression     rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1       1000000    6.1ms   6.46ms    139.      34.9MB    19.8 
#> 2 rec         1000000   29.5ms  30.61ms     32.3       35MB     4.61
#> 3 hrec1      10000000   75.4ms  75.56ms     13.2    343.3MB    22.0 
#> 4 rec        10000000  153.6ms 153.58ms      6.51   343.4MB    19.5

step_subset_rows

# Selecting 5e5 randomly chosen rows by row number: hydrorecipes
# `step_subset_rows()`, `recipes::step_slice()` and plain `[` indexing.
# Absolute timings are reported and all three results must be equal.
formula <- y ~ x + z

results <- bench::press(
  rows = c(1e6, 1e7),
  {
    dat <- tibble(
      x = rnorm(rows),
      z = rnorm(rows),
      y = rnorm(rows)
    )
    sub <- sample(seq_len(rows), size = 5e5)

    bench::mark(
      hrec1 = (
        recipe(formula = formula, data = dat) |>
          step_subset_rows(row_numbers = sub) |>
          prep() |>
          bake()
      )$get_result("tbl"),

      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_slice(sub) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),

      base = dat[sub, ],

      check = TRUE,
      min_iterations = 1L,
      relative = FALSE
    )
  }
)
#> Running with:
#>       rows
#> 1  1000000
#> 2 10000000

results
#> # A tibble: 6 × 7
#>   expression     rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1       1000000    4.6ms    5.5ms     161.       12MB     6.42
#> 2 rec         1000000   36.2ms   37.1ms      25.3    30.7MB     0   
#> 3 base        1000000   9.38ms   13.8ms      80.3    19.1MB     2.11
#> 4 hrec1      10000000   9.45ms   12.5ms      81.6    11.4MB     2.09
#> 5 rec        10000000  94.72ms   95.2ms      10.4    64.9MB     2.09
#> 6 base       10000000  11.53ms   12.2ms      79.3    19.1MB     4.29

step_subset_sample

# Random row sub-sampling: hydrorecipes `step_subset_sample(size = 10000L)`
# against `recipes::step_sample()`. Each expression returns only the number of
# rows of its result, so `check = TRUE` compares the two row counts.
formula <- as.formula(y~x+z)

results <- bench::press(
  rows = c(1e6, 1e7),
  {
    dat <- data.frame(x = rnorm(rows), 
                      z = rnorm(rows),
                      y = rnorm(rows))
    
    # NOTE(review): the first expression is written as `h <- {...}` (an
    # assignment) instead of `name = {...}`, so it is passed to bench::mark()
    # unnamed and its row in the results is labelled with the deparsed code.
    # Naming it (e.g. `hrec = {...}`) would give a short label like `rec`;
    # confirm before changing, since the label appears in the printed table.
    bench::mark(
      h <- {hrec1 = recipe(formula = formula, data = dat) |>
        step_subset_sample(size = 10000L) |>
        prep() |>
        bake()
      h = nrow(hrec1$get_result("tbl"))},
      
      # The recipes step is given its size as the fraction 10000 / rows.
      rec = nrow(recipes::recipe(formula = formula, data = dat) |>
                   recipes::step_sample(size = 10000 / rows) |>
                   recipes::prep() |>
                   recipes::bake(new_data = NULL)),
      
      relative = FALSE,
      min_iterations = 1L,
      check = TRUE
    )
  }
)
#> Running with:
#>       rows
#> 1  1000000
#> 2 10000000

results
#> # A tibble: 4 × 7
#>   expression                   rows     min  median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                  <dbl> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl>
#> 1 h <- { hrec1 = bake(prep(s…   1e6  1.67ms  1.74ms     571.   851.53KB     2.03
#> 2 rec                           1e6 25.63ms 26.18ms      38.0    8.45MB     2.11
#> 3 h <- { hrec1 = bake(prep(s…   1e7  1.99ms  2.05ms     473.   315.12KB     3.31
#> 4 rec                           1e7 86.01ms 86.49ms      11.5    76.9MB     0

step_cross_correlation


# Time the hydrorecipes cross-correlation step with `lag_max = 1000`.
# Only hydrorecipes is benchmarked in this chunk, and absolute timings are
# reported.
formula <- as.formula(y ~ x)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = rnorm(rows)
    )

    bench::mark(
      # NOTE(review): `z` is selected below, but `dat` only contains the
      # columns `x` and `y` -- confirm whether `z` is intended here or is
      # silently ignored by the selection.
      hrec1 = recipe(formula = formula, data = dat) |>
        step_cross_correlation(c(x, z, y), lag_max = 1000) |>
        plate("tbl"),

      # No trailing comma after the last argument: it would pass an extra,
      # empty argument to bench::mark().
      min_iterations = 1L
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 3 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.28ms   1.32ms    752.       550KB     2.02
#> 2 hrec1        10000   1.98ms   2.02ms    494.      18.1KB     4.10
#> 3 hrec1      5000000 524.51ms 524.51ms      1.91    18.1KB     0


# Compare the internal hydrorecipes:::convolve_correlation() with
# ccf() on two random vectors of length 5e5, using up to 1000 lags.
# `check = TRUE` requires both expressions to return equal results.
x <- rnorm(5e5)
y <- rnorm(5e5)
lag_max <- 1000
# NOTE(review): both expressions use `<-` rather than `name = expr`, so they
# are passed to bench::mark() unnamed: each results row is labelled with the
# deparsed code, and `fft_ccf` / `ccf_base` are presumably created as
# variables when the expressions are evaluated. Confirm whether named
# arguments (`fft_ccf = ...`, `ccf_base = ...`) were intended.
results <- bench::mark(fft_ccf  <- hydrorecipes:::convolve_correlation(x, y, lag_max),
                       ccf_base <- as.numeric(ccf(x, y, lag.max = lag_max, plot = FALSE)$acf),
                       min_iterations = 1L,
                       check = TRUE
)

results
#> # A tibble: 2 × 6
#>   expression                            min  median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                        <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl>
#> 1 fft_ccf <- hydrorecipes:::convol… 32.05ms 32.88ms    30.3      15.7KB        0
#> 2 ccf_base <- as.numeric(ccf(x, y,…   1.94s   1.94s     0.516   143.7MB        0

step_lag

# Creating 30 lagged copies of `x` (lags 1 to 30): hydrorecipes
# `step_lead_lag()` against `recipes::step_lag()`. Names are stripped with
# `unname()` before the results are compared (check = TRUE).
formula <- y ~ x

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = as.numeric(seq_len(rows)),
      z = rnorm(rows)
    )
    bench::mark(
      hrec1 = unname(
        recipe(formula = formula, data = dat) |>
          step_lead_lag(x, lag = 1:30) |>
          plate("tbl")
      ),
      rec = unname(
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_lag(x, lag = 1:30) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      ),
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1          100  1      1         8.28      2.34     1   
#> 2 rec            100  8.36   8.28      1         1        1.04
#> 3 hrec1        10000  1      1         7.44      1        1.98
#> 4 rec          10000  7.44   7.41      1         2.58     1   
#> 5 hrec1      5000000  1      1         3.03      1        1   
#> 6 rec        5000000  3.60   3.03      1         2.55     1.32

step_distributed_lag

# Time the hydrorecipes distributed-lag step with knots from
# `log_lags(5, 86401)`. Only hydrorecipes is benchmarked in this chunk, and
# absolute timings are reported (relative = FALSE).
formula <- y ~ x

results <- bench::press(
  rows = c(5e5, 5e6, 1e7),
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_distributed_lag(x, knots = log_lags(5, 86401)) |>
        prep() |>
        bake(),
      min_iterations = 1L,
      relative = FALSE,
      check = FALSE
    )
  }
)
#> Running with:
#>       rows
#> 1   500000
#> 2  5000000
#> 3 10000000

results
#> # A tibble: 3 × 7
#>   expression     rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec         500000   64.4ms   75.7ms     13.3     19.1MB        0
#> 2 hrec        5000000  518.1ms  518.1ms      1.93   155.9MB        0
#> 3 hrec       10000000  883.6ms  883.6ms      1.13   308.5MB        0

step_harmonic

# Harmonic (sine/cosine) terms of `x` for three frequencies: hydrorecipes
# `step_harmonic()` against `recipes::step_harmonic()`.
formula <- y ~ x

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_harmonic(
          x,
          frequency = c(1.0, 2.0, 3.0),
          cycle_size = 0.1,
          starting_value = 0.0
        ) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_harmonic(
          x,
          frequency = c(1.0, 2.0, 3.0),
          cycle_size = 0.1,
          starting_val = 0.0,
          keep_original_cols = TRUE
        ) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),

      # The two packages order the sin and cos terms differently, so the
      # results cannot be compared directly.
      check = FALSE,
      min_iterations = 1L,
      relative = relative
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000
results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1        14.5       3.26     2.00
#> 2 rec            100 14.5   14.4       1         1        1   
#> 3 hrec         10000  1      1         7.54      1      NaN   
#> 4 rec          10000  7.52   7.59      1         3.55   Inf   
#> 5 hrec       5000000  1      1         1.20      1      NaN   
#> 6 rec        5000000  1.20   1.20      1         3.50   Inf

# rows <- 1e6
# dat <- data.frame(x = rnorm(rows), 
#                   y = 1:rows,
#                   z = rnorm(rows))
# bench::mark(
#   
#   {hrec = recipe(formula = formula, data = dat) |>
#     step_harmonic(x, 
#                   frequency = c(1.0, 2.0, 3.0), 
#                   cycle_size = 0.1, 
#                   starting_value = 0.0,
#                   varying = "cycle_size") |>
#         step_harmonic(x, 
#                   frequency = c(1.0, 2.0, 3.0), 
#                   cycle_size = 0.1, 
#                   starting_value = 0.0) |>
#     step_intercept() |>
#     step_center(x) |>
#     prep() |>
#     bake()}, 
#   
#   {hrec$steps[[2]]$update_step("cycle_size", 0.2)
#     hrec$bake()
#   },
#   check = FALSE
# )

step_pca

# PCA on twelve predictors (`a` to `l`) with 10 or 5 components.
# hrec1/hrec2 use the hydrorecipes defaults for `center` and `scale`, while
# hrec3/hrec4 switch both off. rec1/rec2 request centering and scaling
# through `options`; rec3/rec4 pass no `options`.
# Results are not compared with each other (check = FALSE).
set.seed(1)
formula <- x ~ a + b + c + d + e + f + g + h + i + j + k + l

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      a = rnorm(rows),
      b = rnorm(rows),
      c = rnorm(rows),
      d = rnorm(rows),
      e = rnorm(rows),
      f = rnorm(rows),
      g = rnorm(rows),
      h = rnorm(rows),
      i = rnorm(rows),
      j = rnorm(rows),
      k = rnorm(rows),
      l = rnorm(rows)
    )
    bench::mark(
      # hydrorecipes
      hrec1 = recipe(formula = formula, data = dat) |>
        step_pca(c(a, b, c, d, e, f, g, h, i, j, k, l), n_comp = 10L) |>
        plate(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_pca(c(a, b, c, d, e, f, g, h, i, j, k, l), n_comp = 5L) |>
        plate(),
      hrec3 = recipe(formula = formula, data = dat) |>
        step_pca(
          c(a, b, c, d, e, f, g, h, i, j, k, l),
          n_comp = 10L,
          center = FALSE,
          scale = FALSE
        ) |>
        plate(),
      hrec4 = recipe(formula = formula, data = dat) |>
        step_pca(
          c(a, b, c, d, e, f, g, h, i, j, k, l),
          n_comp = 5L,
          center = FALSE,
          scale = FALSE
        ) |>
        plate(),

      # recipes
      rec1 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(
          recipes::all_predictors(),
          num_comp = 10L,
          options = list(center = TRUE, scale. = TRUE)
        ) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      rec2 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(
          recipes::all_predictors(),
          num_comp = 5L,
          options = list(center = TRUE, scale. = TRUE)
        ) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      rec3 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(recipes::all_predictors(), num_comp = 10L) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      rec4 = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_pca(recipes::all_predictors(), num_comp = 5L) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),

      min_iterations = 1L,
      relative = relative,
      check = FALSE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000


print(results, n = 100)
#> # A tibble: 24 × 14
#>    expression    rows   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#>    <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>
#>  1 hrec1          100  1.03   1.03     14.7      15.3      1      323     1
#>  2 hrec2          100  1.03   1.03     14.6       1        2.03   316     2
#>  3 hrec3          100  1.01   1.00     15.0       1.20     2.03   327     2
#>  4 hrec4          100  1      1        15.0       1        2.07   318     2
#>  5 rec1           100 14.9   14.7       1.02     13.3      2.15    21     2
#>  6 rec2           100 14.5   14.6       1         6.04     2.20    20     2
#>  7 rec3           100 14.8   14.7       1.01      3.35     1.01    22     1
#>  8 rec4           100 14.7   14.6       1.01      2.99     2.13    21     2
#>  9 hrec1        10000  1.62   1.88      7.87      1.22   NaN       86     0
#> 10 hrec2        10000  1.60   1.57      8.95      1      Inf       94     1
#> 11 hrec3        10000  1.03   1.61      9.95      1.22   Inf      100     1
#> 12 hrec4        10000  1      1        12.8       1      NaN      139     0
#> 13 rec1         10000 14.1   13.9       1         6.02   Inf       10     1
#> 14 rec2         10000 13.9   13.8       1.08      5.69   Inf       11     1
#> 15 rec3         10000 11.8   11.7       1.27      2.24   Inf       13     1
#> 16 rec4         10000 11.5   11.5       1.29      1.92   Inf       13     1
#> 17 hrec1      5000000  1.81   1.81      4.96      1.22     1        1     1
#> 18 hrec2      5000000  1.84   1.84      4.88      1        1.97     1     2
#> 19 hrec3      5000000  1      1         8.99      1.22     1.81     1     1
#> 20 hrec4      5000000  1.05   1.05      8.56      1        1.73     1     1
#> 21 rec1       5000000  8.99   8.99      1         6.01     1.81     1     9
#> 22 rec2       5000000  8.40   8.40      1.07      5.68     1.73     1     8
#> 23 rec3       5000000  3.73   3.73      2.41      2.23     1.46     1     3
#> 24 rec4       5000000  3.59   3.59      2.50      1.90     1.01     1     2
#> # ℹ 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
#> #   time <list>, gc <list>

step_dummy

# Dummy-encoding the factor `y` (built from integers sampled from 1:10):
# hydrorecipes `step_dummy()` against `recipes::step_dummy()`. Names are
# stripped and only columns 3 to 11 of each result are compared.
formula <- y ~ x

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = qF(sample(1:10, rows, replace = TRUE)),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = unname(
        recipe(formula = formula, data = dat) |>
          step_dummy(y) |>
          plate("tbl")
      )[, 3:11],
      rec = unname(
        recipes::recipe(formula = formula, data = dat) |>
          recipes::step_dummy(y, keep_original_cols = TRUE) |>
          recipes::prep() |>
          recipes::bake(new_data = NULL)
      )[, 3:11],
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows    min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100   1      1         8.01      1        1   
#> 2 rec            100   7.94   7.92      1         1.30     1.06
#> 3 hrec         10000   1      1        11.8       1        2.02
#> 4 rec          10000  11.6   11.6       1        17.6      1   
#> 5 hrec       5000000   1      1       183.        1        9.13
#> 6 rec        5000000 223.   183.        1        12.9      1

step_find_interval

  • There is no direct equivalent in recipes, so the comparison is made against step_cut.
# Binning `x` at the break points -0.1, 0 and 0.1: hydrorecipes
# `step_find_interval()` against `recipes::step_cut()`.
# Results are not compared with each other (check = FALSE).
formula <- y ~ x

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rnorm(rows),
      y = seq_len(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_find_interval(x, vec = c(-0.1, 0, 0.1)) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_cut(x, breaks = c(-0.1, 0, 0.1)) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      min_iterations = 1L,
      relative = relative,
      check = FALSE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1        14.2       3.25      NaN
#> 2 rec            100 14.2   13.9       1         1         Inf
#> 3 hrec         10000  1      1        13.1       1         Inf
#> 4 rec          10000 13.1   13.0       1         3.69      NaN
#> 5 hrec       5000000  1      1         5.25      1         Inf
#> 6 rec        5000000  5.42   5.13      1         3.50      NaN

step_varying

# hydrorecipes `step_varying()` against `recipes::step_zv()` on data where
# `x` is constant (all ones). check = TRUE requires both results to be equal.
formula <- y ~ x + z

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(
      x = rep(1, rows),
      y = seq_len(rows),
      z = rnorm(rows)
    )
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_varying(c(x, y, z)) |>
        plate("tbl"),
      rec = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_zv(x, y, z) |>
        recipes::prep() |>
        recipes::bake(new_data = NULL),
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100   1      1        14.7      6.28   NaN   
#> 2 rec            100  15.0   14.8       1        1      Inf   
#> 3 hrec         10000   1      1        15.5      1        1   
#> 4 rec          10000  15.6   15.4       1       32.9      1.03
#> 5 hrec       5000000   1      1        73.4      1      NaN   
#> 6 rec        5000000  74.7   74.0       1       34.1    Inf

step_kernel_filter

step_kernel_filter uses a Fast Fourier Transform (FFT) based convolution instead of an explicit sliding window. This should be much faster for large datasets, particularly when the kernel size is also large.

# Moving average of `z` (a random walk) over 5001 points: hydrorecipes
# `step_kernel_filter()` with a uniform kernel against
# `recipes::step_window()` with `statistic = "mean"`.
# Only the value in row 10000 of each result is extracted and compared
# (check = TRUE).
formula <- as.formula(y~x+z)

results <- bench::press(
  rows = c(2e4, 2e5),
  {
    dat <- data.frame(x = rep(1, rows), 
                      y = 1:rows,
                      z = cumsum(rnorm(rows)))
    # NOTE(review): the second expression is a braced block without a
    # `name =` prefix, so it is passed to bench::mark() unnamed and its row in
    # the results is labelled with the deparsed code. Naming it (e.g.
    # `rec = {...}`) would match `hrec`; confirm before changing, since the
    # label appears in the printed table.
    bench::mark(
      hrec = unname((recipe(formula = formula, data = dat) |>
                       step_kernel_filter(z, kernel = list(rep(1, 5001L)/5001L), align = "center") |>
                       plate("tbl"))[10000, "kernel_filter_z"]),
      {rec  = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_window(z, size = 5001L, statistic = "mean") |>
        recipes::prep() |>
        recipes::bake(new_data = NULL)
      unname(rec[10000, "z"])},
      
      min_iterations = 1L,
      relative = relative,
      check = TRUE
    )
  }
)
#> Running with:
#>     rows
#> 1  20000
#> 2 200000

results
#> # A tibble: 4 × 7
#>   expression                      rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                     <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec                             2e4   1      1        85.7      1.48      NaN
#> 2 { rec = recipes::bake(recipes…   2e4  86.3   85.9       1        1         NaN
#> 3 hrec                             2e5   1      1       630.       1         Inf
#> 4 { rec = recipes::bake(recipes…   2e5 656.   645.        1        1.99      NaN

step_convolve_gamma

# Time step_convolve_gamma on its own; only one expression is benchmarked, so
# absolute timings are reported (relative = FALSE).
formula <- as.formula(y~x+z)

results <- bench::press(
  rows = c(2e4, 2e6),
  {
    # z is a random walk (cumulative sum of normal draws).
    dat <- data.frame(x = rep(1, rows), 
                      y = 1:rows,
                      z = cumsum(rnorm(rows)))
    bench::mark(
      hrec = (recipe(formula = formula, data = dat) |>
                step_convolve_gamma(z, amplitude = 1, k = 1, theta = 1) |>
                plate("tbl")),
      min_iterations = 1,
      relative = FALSE,
      check = TRUE
    )
  }
)
#> Running with:
#>      rows
#> 1   20000
#> 2 2000000

results
#> # A tibble: 2 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec         20000   1.79ms   1.86ms     537.    714.2KB        0
#> 2 hrec       2000000  27.86ms  28.49ms      34.9    15.3MB        0

step_compare_columns

multiple steps

step_harmonic dominates these results.

# Chain three steps (lags 1:20, harmonics at three frequencies, centering) in
# both packages. `n` and `relative` are set earlier in the article.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = as.numeric(1:rows), 
                      y = 1:rows)
    bench::mark(
      hrec = recipe(formula = formula, data = dat) |>
        step_lead_lag(x, lag = 1:20) |>
        step_harmonic(x, 
                      frequency = c(1.0, 2.0, 3.0), 
                      cycle_size = 0.1, 
                      starting_value = 0.0) |>
        step_center(x) |> 
        plate("tbl"),
      rec  = recipes::recipe(formula = formula, data = dat) |>
        recipes::step_lag(x, lag = 1:20, keep_original_cols = TRUE) |>
        recipes::step_harmonic(x, 
                               frequency = c(1.0, 2.0, 3.0), 
                               cycle_size = 0.1, 
                               starting_val = 0.0,
                               keep_original_cols = TRUE) |>
        recipes::step_center(x) |> 
        recipes::prep() |> 
        recipes::bake(new_data = NULL),
      # check = FALSE: the two results are not compared for equality here.
      check = FALSE,
      relative = relative,
      min_iterations = 1
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1        15.1       1      NaN   
#> 2 rec            100 15.3   15.1       1         6.39   Inf   
#> 3 hrec         10000  1      1        11.3       1        1   
#> 4 rec          10000 11.4   11.4       1         2.70     1.02
#> 5 hrec       5000000  1      1         1.32      1        1   
#> 6 rec        5000000  1.32   1.32      1         2.64     1.13

step_spline_b

# B-spline basis expansion of `x` with 13 degrees of freedom in both packages.
formula <- as.formula(y~x)
# Row counts for bench::press; this re-assigns the global `n`.
n <- c(100, 1e4, 5e6)
results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows),
                      y = 1:rows)
    bench::mark(
      # unname() strips the column names before bench::mark compares the two
      # results (check = TRUE), so only the values have to agree.
      hrec = unname(recipe(formula = formula, data = dat) |>
                      step_spline_b(x, df = 13) |>
                      plate("tbl")),
      rec  = unname(recipes::recipe(formula = formula, data = dat) |>
                      recipes::step_spline_b(x, deg_free = 13, keep_original_cols = TRUE)|> 
                      recipes::prep() |> 
                      recipes::bake(new_data = NULL)),
      check = TRUE,
      relative = relative,
      min_iterations = 2
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1         7.78      2.25   NaN   
#> 2 rec            100  7.77   7.75      1         1      Inf   
#> 3 hrec         10000  1      1         6.19      1      Inf   
#> 4 rec          10000  6.26   6.16      1         4.24   NaN   
#> 5 hrec       5000000  1      1         3.03      1        1.82
#> 6 rec        5000000  2.92   3.03      1         4.21     1

step_spline_n

# Natural-spline basis expansion of `x` with 11 degrees of freedom:
# step_spline_n (hydrorecipes) vs recipes::step_spline_natural.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows),
                      y = 1:rows)
    bench::mark(
      # unname() strips the column names before bench::mark compares the two
      # results (check = TRUE), so only the values have to agree.
      hrec = unname(recipe(formula = formula, data = dat) |>
                      step_spline_n(x, df = 11L) |>
                      plate("tbl")),
      rec  = unname(recipes::recipe(formula = formula, data = dat) |>
                      recipes::step_spline_natural(x, deg_free = 11L, keep_original_cols = TRUE)|> 
                      recipes::prep() |> 
                      recipes::bake(new_data = NULL)),
      check = TRUE,
      relative = relative,
      min_iterations = 2L
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec           100  1      1         7.47      2.93   NaN   
#> 2 rec            100  7.51   7.46      1         1      Inf   
#> 3 hrec         10000  1      1         5.86      1      Inf   
#> 4 rec          10000  5.93   5.92      1         4.03   NaN   
#> 5 hrec       5000000  1      1         3.22      1        1   
#> 6 rec        5000000  3.05   3.22      1         4.00     1.55

step_add_noise

# Time step_add_noise applied to `y`; a single expression is benchmarked, with
# bench::mark left at its default settings.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    # y is a constant column (0.01 in every row).
    dat <- data.frame(x = as.numeric(1:rows),
                      y = rep(0.01, rows))
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_add_noise(y) |>
        plate("dt"))
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 3 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.22ms   1.26ms    790.     544.7KB     0   
#> 2 hrec1        10000   1.56ms   1.61ms    616.     160.5KB     2.07
#> 3 hrec1      5000000 189.19ms 189.31ms      5.28    76.3MB     0

step_aquifer_grf & step_aquifer_theis

The Theis solution is a special case of the grf solution.

# Time step_aquifer_grf against step_aquifer_theis on the same input, both
# called with only `time` and `flow_rate` supplied.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    # x is the time column (1..rows); y is a constant flow rate of 0.01.
    dat <- data.frame(x = as.numeric(1:rows),
                      y = rep(0.01, rows))
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_aquifer_grf(time = x, flow_rate = y) |>
        plate("dt"),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_aquifer_theis(time = x, flow_rate = y) |>
        plate("dt"),
      # check = TRUE: bench::mark requires the two results to be equal.
      check = TRUE)
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.35ms    1.4ms    714.     557.3KB     2.08
#> 2 hrec2          100   1.46ms   1.51ms    660.     531.9KB     2.19
#> 3 hrec1        10000    2.3ms   2.36ms    420.     160.6KB     0   
#> 4 hrec2        10000   2.38ms   2.48ms    402.      83.6KB     0   
#> 5 hrec1      5000000 665.94ms 665.94ms      1.50    76.3MB     1.50
#> 6 hrec2      5000000 637.58ms 637.58ms      1.57    38.2MB     0

step_aquifer_theis_aniso

# Time the anisotropic Theis step against the plain Theis step. The major and
# minor hydraulic conductivities passed to the anisotropic step are equal
# (1e-4), and check = TRUE requires both expressions to return equal values
# once column names are stripped with unname().
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    # x is the time column (1..rows); y is a constant flow rate of 0.01.
    dat <- data.frame(x = as.numeric(1:rows),
                      y = rep(0.01, rows))
    bench::mark(
      hrec1 = unname(recipe(formula = formula, data = dat) |>
                       step_aquifer_theis_aniso(time = x,
                                                flow_rate = y,
                                                distance_x = 0,
                                                distance_y = 100,
                                                hydraulic_conductivity_major = 1e-4,
                                                hydraulic_conductivity_minor = 1e-4) |>
                       plate("dt")),
      # No trailing comma after `flow_rate = y`: a trailing comma would pass
      # an extra empty argument to the step.
      hrec2 = unname(recipe(formula = formula, data = dat) |>
                       step_aquifer_theis(time = x, flow_rate = y) |>
                       plate("dt")),
      check = TRUE)
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.47ms   1.52ms    654.    626.09KB     0   
#> 2 hrec2          100   1.56ms    1.6ms    620.      7.98KB     2.07
#> 3 hrec1        10000   2.39ms   2.43ms    409.    162.39KB     2.08
#> 4 hrec2        10000   2.46ms    2.5ms    398.     85.33KB     0   
#> 5 hrec1      5000000 632.39ms 632.39ms      1.58    76.3MB     0   
#> 6 hrec2      5000000 630.01ms 630.01ms      1.59   38.15MB     0

step_aquifer_leaky

# Time step_aquifer_leaky (leakage = 1e8) against step_aquifer_theis.
# check = TRUE requires the two unnamed results to be equal.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    # x is the time column (1..rows); y is a constant flow rate of 0.01.
    dat <- data.frame(x = as.numeric(1:rows),
                      y = rep(0.01, rows))
    bench::mark(
      hrec1 = unname(recipe(formula = formula, data = dat) |>
                       step_aquifer_leaky(time = x,
                                          flow_rate = y,
                                          leakage = 100000000) |>
                       plate("dt")),
      hrec2 = unname(recipe(formula = formula, data = dat) |>
                       step_aquifer_theis(time = x,
                                          flow_rate = y) |>
                       plate("dt")),
      check = TRUE)
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 6 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.49ms   1.53ms   652.     563.34KB     2.10
#> 2 hrec2          100   1.57ms   1.61ms   617.       7.98KB     2.11
#> 3 hrec1        10000   3.25ms   3.31ms   298.     396.84KB     0   
#> 4 hrec2        10000   2.47ms   2.51ms   398.      85.33KB     0   
#> 5 hrec1      5000000    1.33s    1.33s     0.749  190.74MB     0   
#> 6 hrec2      5000000 655.34ms 655.34ms     1.53    38.15MB     1.53

step_aquifer_patch

# Time step_aquifer_patch against step_aquifer_grf. The patch step is given
# identical inner and outer properties (specific storage 1e-6, hydraulic
# conductivity 1e-4) and n_stehfest = 8L.
formula <- as.formula(y~x)

results <- bench::press(
  rows = c(1e4, 1e5, 1e6),
  {
    dat <- data.frame(x = as.numeric(1:rows),
                      y = rep(0.01, rows))
    bench::mark(
      hrec1 = (recipe(formula = formula, data = dat) |>
                 step_aquifer_grf(time = x, flow_rate = y) |>
                 plate("dt")),
      hrec3 = (recipe(formula = formula, data = dat) |>
                 step_aquifer_patch(time = x,
                                    flow_rate = 0.01,
                                    thickness = 1.0,
                                    radius = 100.0,
                                    radius_patch = 200.0,
                                    specific_storage_inner = 1e-6,
                                    specific_storage_outer = 1e-6,
                                    hydraulic_conductivity_inner = 1e-4,
                                    hydraulic_conductivity_outer = 1e-4,
                                    n_stehfest = 8L
                 ) |>
                 plate("dt")),
      # check = FALSE: the two results are not compared for equality here.
      check = FALSE,
      relative = relative)
  }
)
#> Running with:
#>      rows
#> 1   10000
#> 2  100000
#> 3 1000000


results
#> # A tibble: 6 × 7
#>   expression    rows   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 hrec1        10000   1      1        52.6      1         NaN
#> 2 hrec3        10000  53.8   53.2       1        4.00      NaN
#> 3 hrec1       100000   1      1        97.3      1.99      NaN
#> 4 hrec3       100000 101.    98.2       1        1         NaN
#> 5 hrec1      1000000   1      1        77.8      2.00      NaN
#> 6 hrec3      1000000  81.2   77.3       1        1         NaN

step_aquifer_wellbore_storage

  • Currently this step is slow for long series.
# Time step_aquifer_wellbore_storage (n_terms = 18, casing and well radii of
# 1e-15) against step_aquifer_theis.
# `formula` is defined here, as in the other sections, so the chunk does not
# depend on the value left behind by the previous section.
formula <- as.formula(y~x)

results <- bench::press(
  rows = c(1e3, 1e4, 1e5),
  {
    dat <- data.frame(x = as.numeric(1:rows),
                      y = as.numeric(1:rows))
    bench::mark(
      hrec1 = unname(recipe(formula = formula, data = dat) |>
                       step_aquifer_wellbore_storage(time = x,
                                                     flow_rate = 0.01,
                                                     hydraulic_conductivity = 1e-4,
                                                     specific_storage = 1e-6,
                                                     radius = 100,
                                                     radius_casing = 1e-15,
                                                     radius_well = 1e-15,
                                                     n_terms = 18) |>
                       plate("dt")),
      hrec2 = unname(recipe(formula = formula, data = dat) |>
                       step_aquifer_theis(time = x,
                                          flow_rate = y) |>
                       plate("dt")),
      # check = FALSE: the two expressions use different flow rates (0.01 vs
      # the `y` column), so their results are not compared.
      check = FALSE
    )
  }
)
#> Running with:
#>     rows
#> 1   1000
#> 2  10000
#> 3 100000

results
#> # A tibble: 6 × 7
#>   expression   rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1        1000   8.59ms   8.64ms    116.    574.91KB     0   
#> 2 hrec2        1000   1.68ms   1.74ms    564.     22.88KB     2.20
#> 3 hrec1       10000  59.42ms  59.71ms     16.7   162.36KB     0   
#> 4 hrec2       10000   2.51ms   2.56ms    387.     163.5KB     2.22
#> 5 hrec1      100000 532.97ms 532.97ms      1.88    1.53MB     0   
#> 6 hrec2      100000  10.08ms  10.33ms     96.4     1.53MB     0

step_vadose_weeks

# Time step_vadose_weeks on its own (air_diffusivity = 0.8, thickness = 5,
# precision = 1e-12). Absolute timings are reported.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    dat <- data.frame(x = as.numeric(1:rows), 
                      y = as.numeric(1:rows))
    bench::mark(
      hrec1 = (recipe(formula = formula, data = dat) |>
                 step_vadose_weeks(time = x, 
                                   air_diffusivity = 0.8, 
                                   thickness = 5, 
                                   precision = 1e-12) |>
                 plate("dt")),
      check = FALSE,
      min_iterations = 2
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 3 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.23ms   1.26ms    790.     548.6KB     2.06
#> 2 hrec1        10000   1.39ms   1.42ms    700.     160.5KB     0   
#> 3 hrec1      5000000 173.68ms 173.89ms      5.75    76.3MB     0

step_transport_ogata_banks

# Time step_transport_ogata_banks on a time x distance grid.
formula <- as.formula(y~x)

results <- bench::press(
  rows = n,
  {
    # expand.grid crosses 1:rows with 1:10, so the table has 10 * rows rows:
    # `x` is used as time and `y` as distance below.
    dat <- data.frame(expand.grid(as.numeric(1:rows), 
                                  as.numeric(1:10)))
    names(dat) <- c('x', 'y')
    bench::mark(
      hrec1 = (recipe(formula = formula, data = dat) |>
                 step_transport_ogata_banks(time = x,
                                            distance = y) |>
                 plate("dt")),
      check = FALSE,
      min_iterations = 2
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 3 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.31ms   1.35ms   730.        562KB    2.09 
#> 2 hrec1        10000   5.08ms   5.12ms   194.        786KB    0    
#> 3 hrec1      5000000    1.57s    1.57s     0.635     381MB    0.635

step_transport_fractures_solute

# Time step_transport_fractures_solute on a small fixed grid (no bench::press).
# The formula is one-sided: there is no response column.
formula <- as.formula(~time+z+x)

# 6 times (10^3 .. 10^8) x 11 fracture distances (0..10) x 1 matrix distance
# (0) = 66 rows; setDT converts the grid to a data.table by reference.
dat <- setDT(expand.grid(10^(3:8),
                         seq(0.0, 10, 1),
                         c(0.0)))

names(dat) <- c("time", "z", "x")

results <- 
  bench::mark(
    hrec1 = recipe(formula = formula, data = dat) |>
      step_transport_fractures_solute(time = time,
                                      distance_fracture = z,
                                      distance_matrix = x) |>
      plate("dt"),
    check = FALSE,
    min_iterations = 2
  )

results
#> # A tibble: 1 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1        1.72ms   1.76ms      566.     586KB        0

step_transport_fractures_heat

# Time step_transport_fractures_heat on a small fixed grid (no bench::press).
# The formula is one-sided: there is no response column.
formula <- as.formula(~time+z+x)

# 6 times (10^3 .. 10^8) x 101 fracture distances (0..100) x 2 matrix
# distances (0, 0.05) = 1212 rows.
dat <- setDT(expand.grid(10^(3:8),
                         seq(0.0, 100, 1),
                         c(0.0, 0.05)))

names(dat) <- c("time", "z", "x")

results <- 
  bench::mark(
    hrec1 = recipe(formula = formula, data = dat) |>
      step_transport_fractures_heat(time = time,
                                    distance_fracture = z,
                                    distance_matrix = x) |>
      plate("dt"),
    check = FALSE,
    min_iterations = 2
  )

results
#> # A tibble: 1 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1        9.86ms   9.89ms      101.     594KB        0

step_fft_pgram, step_fft_welch

# Time two step_fft_pgram configurations and one step_fft_welch configuration
# on columns x and y. Absolute timings are reported.
formula <- as.formula(y~x + z)

results <- bench::press(
  rows = n,
  {
    # q, r and s are generated but are not referenced by the formula or by
    # any of the steps below.
    dat <- data.frame(x = rnorm(rows), y = rnorm(rows), z = rnorm(rows),
                      q = rnorm(rows), r = rnorm(rows), s = rnorm(rows))
    bench::mark(
      # hrec1 and hrec2 differ only in the fifth positional argument
      # (FALSE vs TRUE).
      hrec1 = recipe(formula = formula, data = dat) |>
        step_fft_pgram(c(x, y), 
                       3,
                       TRUE,
                       TRUE,
                       FALSE,
                       0.1, 
                       time_step = 1) |> 
        prep() |>
        bake(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_fft_pgram(c(x, y), 
                       3,
                       TRUE,
                       TRUE,
                       TRUE,
                       0.1, 
                       time_step = 1) |> 
        prep() |>
        bake(),
      # Welch variant: subset length and window are both sized to one tenth
      # of the series, with overlap = 0.60.
      hrec3 = recipe(formula = formula, data = dat) |>
        step_fft_welch(c(x, y),
                       length_subset =  nrow(dat) / 10,
                       overlap = 0.60,
                       window = window_nuttall(nrow(dat) / 10),
                       time_step = 1) |>
        prep() |>
        bake(),
      check = FALSE,
      min_iterations = 1
    )
  }
)
#> Running with:
#>      rows
#> 1     100
#> 2   10000
#> 3 5000000

results
#> # A tibble: 9 × 7
#>   expression    rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1          100   1.68ms   1.93ms    516.    579.46KB     2.79
#> 2 hrec2          100   1.55ms   1.61ms    621.     17.76KB     0   
#> 3 hrec3          100   1.38ms   1.42ms    702.    558.04KB     2.15
#> 4 hrec1        10000   2.72ms   2.81ms    356.      1.45MB     0   
#> 5 hrec2        10000   2.39ms   2.52ms    379.      1.15MB     0   
#> 6 hrec3        10000   2.62ms   2.69ms    371.    268.44KB     0   
#> 7 hrec1      5000000  534.7ms  534.7ms      1.87   724.8MB     0   
#> 8 hrec2      5000000 573.09ms 573.09ms      1.74  572.21MB     1.74
#> 9 hrec3      5000000 450.98ms  454.4ms      2.20   129.7MB     0

step_fft_transfer_welch, step_fft_transfer_pgram, step_fft_transfer_experimental

# Time the three transfer-function steps (periodogram, Welch, experimental) on
# the same pair of random series. Absolute timings are reported and results
# are not compared (check = FALSE).
formula <- as.formula(y~x)

results <- bench::press(
  rows = c(1e5, 1e6, 1e7),
  {
    dat <- data.frame(x = rnorm(rows), y = rnorm(rows))
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_fft_transfer_pgram(c(x, y),
                                3,
                                TRUE,
                                TRUE,
                                0.1,
                                time_step = 1) |>
        prep() |>
        bake(),
      # Welch variant: subset length and window are both sized to one tenth
      # of the series, with overlap = 0.60.
      hrec2 = recipe(formula = formula, data = dat) |>
        step_fft_transfer_welch(c(x, y),
                                length_subset =  nrow(dat) / 10,
                                overlap = 0.60,
                                window = window_nuttall(nrow(dat) / 10),
                                time_step = 1) |>
        prep() |>
        bake(),
      # `hrec3 =` (not `hrec3 <-`): inside a call, `<-` is an assignment
      # passed as an unnamed expression, so bench::mark would label the row
      # with the deparsed code instead of "hrec3".
      hrec3 = recipe(formula = formula, data = dat) |>
        step_fft_transfer_experimental(c(x, y),
                                       spans = 3,
                                       taper = 0.1,
                                       n_groups = 300,
                                       time_step = 1) |>
        prep() |>
        bake(),
      check = FALSE,
      min_iterations = 1
    )
  }
)
#> Running with:
#>       rows
#> 1   100000
#> 2  1000000
#> 3 10000000

results
#> # A tibble: 9 × 7
#>   expression                 rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1                       1e5  18.52ms  20.03ms    50.1      6.66MB     2.78
#> 2 hrec2                       1e5   9.62ms  10.43ms    96.7      2.61MB     0   
#> 3 hrec3 <- bake(prep(step_…   1e5   7.19ms   7.49ms   128.       2.48MB     0   
#> 4 hrec1                       1e6 190.28ms 204.38ms     4.97    61.04MB     0   
#> 5 hrec2                       1e6  86.24ms     87ms    11.5      20.6MB     2.86
#> 6 hrec3 <- bake(prep(step_…   1e6  58.59ms  60.28ms    16.7     19.09MB     0   
#> 7 hrec1                       1e7    2.06s    2.06s     0.486  610.35MB     0   
#> 8 hrec2                       1e7    1.03s    1.03s     0.973     206MB     0   
#> 9 hrec3 <- bake(prep(step_…   1e7 903.11ms 903.11ms     1.11   190.75MB     0

step_ols

# Time step_ols (with do_response FALSE and TRUE) against stats::lm and
# stats::lm.fit on one response (y) and eight random predictors.
formula <- as.formula(y~.)


results <- bench::press(
  rows = c(1e5, 1e6, 1e7),
  {
    dat <- data.frame(
      y = rnorm(rows),
      x = rnorm(rows), 
      z = rnorm(rows),
      a = rnorm(rows),
      b = rnorm(rows),
      d = rnorm(rows),
      e = rnorm(rows),
      f = rnorm(rows),
      g = rnorm(rows))
    # Matrix copy of `dat` for lm.fit; built outside bench::mark so the
    # conversion is not included in the lm.fit timing.
    m <- qM(dat)
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_ols(formula = as.formula(y~.), 
                 do_response = FALSE) |>
        prep() |>
        bake(),
      hrec2 = recipe(formula = formula, data = dat) |>
        step_ols(formula = as.formula(y~.), 
                 do_response = TRUE) |>
        prep() |>
        bake(),
      # `- 1` removes the intercept from the lm fit.
      lm = lm(y~. - 1, dat),
      # Column 1 of `m` is the response y; the remaining columns are the
      # predictors. This expression is unnamed, so bench labels the row with
      # its deparsed code.
      lm.fit(x = m[, c(2:ncol(dat))], y = m[, 1]),
      check = FALSE,
      relative = FALSE
    )
  }
)
#> Running with:
#>       rows
#> 1   100000
#> 2  1000000
#> 3 10000000


results
#> # A tibble: 12 × 7
#>    expression                rows      min   median `itr/sec` mem_alloc `gc/sec`
#>    <bch:expr>               <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#>  1 hrec1                      1e5   6.71ms   6.86ms   131.      17.36MB    2.15 
#>  2 hrec2                      1e5    7.5ms   7.77ms   107.      19.13MB    0    
#>  3 lm                         1e5  30.53ms  31.31ms    28.5     47.75MB    2.38 
#>  4 lm.fit(x = m[, c(2:ncol…   1e5  12.29ms  12.62ms    74.8     17.55MB    0    
#>  5 hrec1                      1e6  54.74ms  59.81ms    15.8    167.85MB    2.26 
#>  6 hrec2                      1e6  64.48ms   68.6ms    14.5    190.74MB    2.42 
#>  7 lm                         1e6 203.43ms 203.43ms     4.92    473.4MB    9.83 
#>  8 lm.fit(x = m[, c(2:ncol…   1e6 137.59ms 142.62ms     7.06   175.48MB    2.35 
#>  9 hrec1                      1e7 681.79ms 681.79ms     1.47     1.64GB    0    
#> 10 hrec2                      1e7 930.26ms 930.26ms     1.07     1.86GB    1.07 
#> 11 lm                         1e7    2.47s    2.47s     0.405    4.67GB    1.21 
#> 12 lm.fit(x = m[, c(2:ncol…   1e7    1.55s    1.55s     0.647    1.71GB    0.647


# formula <- as.formula(y~x+z)
# 
# 
# results <- bench::press(
#   rows = n,
#   {
#     dat <- data.frame(x = rnorm(rows), 
#                       y = rnorm(rows),
#                       z = rnorm(rows))
#     bench::mark(
#       hrec = recipe(formula = formula, data = dat) |>
#         step_intercept() |>
#         step_nls(formula = as.formula(y~.)) |>
#         prep() |>
#         bake(),
#       check = FALSE,
#       relative = FALSE
#     )
#   }
# )

step_nls


# Synthetic data for the step_nls benchmark: a zero-mean random walk `b` is
# convolved with a kernel from hydrorecipes:::gamma_3 evaluated on 0:max_t
# with parameters (0.816, 9.221, 2.554) to produce the response `a`.
n0 <- 5e5
n <- 2e4
b <- cumsum(rnorm(n0))
b <- b - mean(b)
# Kernel length; assigned once and reused in the call below.
max_t <- 720 * ceiling(2.554)
a <- hydrorecipes:::convolve_overlap_save(x = b,
                                          y = hydrorecipes:::gamma_3(0:max_t, 0.816, 9.221, 2.554),
                                          0)

dat <- data.frame(a = a, b = b)
formula <- formula(a~b)

# Model function for gsl_nls.
#   z: parameter vector; z[1], z[2], z[3] are passed on to gamma_3, and z[3]
#      also sets the kernel length (720 * ceiling(z[3])).
#   x: input series to convolve with the kernel.
# Returns the convolution with its first 7200 values dropped, matching the
# `a[-(1:7200)]` response used in the gsl_nls call.
# NOTE(review): 7200 looks like 720 * 10, the upper bound placed on the third
# parameter in the fit -- confirm.
f <- function(z, x) {
  max_t <- 720 * ceiling(z[3])
  hydrorecipes:::convolve_overlap_save(x = x,
                                       y = hydrorecipes:::gamma_3(0:max_t, z[1], z[2], z[3]),
                                       align = 0)[-(1:7200)]
}


# Fit the three kernel parameters two ways and time each:
#   gsl_fun            - gsl_nls() called directly on the model function `f`
#   h_1 / h_10 / h_100 - the same fit through step_convolve_gamma + step_nls
#                        with n_subset = 1, 10 and 100
# Every expression is passed as a *named* argument (`=`) so bench::mark labels
# the rows by name; `gsl_fun <- ...` would be an assignment evaluated as an
# unnamed expression. check = TRUE requires all four expressions to return the
# same coefficients after rounding to 3 decimals and stripping names.
results <- bench::mark(
  gsl_fun = unname(round(coef(gsl_nls(
    fn = f,                   ## model function
    y = a[-(1:7200)],           ## response vector
    x = b,
    start = c(A = 0.5, n = 2.0, a = 2.0),  ## starting values
    lower = c(A = 0.01, n = 1.0, a = 1.0),
    upper = c(A = 1.0, n = 10.0, a = 10.0),
    control = gsl_nls_control(xtol = 1e-8),
    trace = FALSE,
    algorithm = "lm"               ## algorithm
  )), 3)),

  h_1 = {
    h <- recipe(formula = formula, data = dat) |>
      step_convolve_gamma(b, amplitude = 0.5, k = 2.0, theta = 2.0,
                          varying = list(name = c("amplitude","k", "theta"),
                                         start = c(0.5, 2.0, 2.0),
                                         lower = c(0.01, 1.0, 1.0),
                                         upper = c(1.0, 10.0, 10.0))) |>
      step_nls(formula = formula(a~b), n_subset = 1L,
               trace = FALSE,
               algorithm = "lm",
               control = gsl_nls_control(xtol = 1e-8))

    h$prep()$bake()
    # The fitted model is read from the third entry of the recipe's step list.
    unname(round(coef(h$steps[[3]]$fit), 3))
  },

  h_10 = {
    h <- recipe(formula = formula, data = dat) |>
      step_convolve_gamma(b, amplitude = 0.5, k = 2.0, theta = 2.0,
                          varying = list(name = c("amplitude","k", "theta"),
                                         start = c(0.5, 2.0, 2.0),
                                         lower = c(0.01, 1.0, 1.0),
                                         upper = c(1.0, 10.0, 10.0))) |>
      step_nls(formula = formula(a~b), n_subset = 10L,
               trace = FALSE,
               algorithm = "lm",
               control = gsl_nls_control(xtol = 1e-8))

    h$prep()$bake()
    unname(round(coef(h$steps[[3]]$fit), 3))
  },

  h_100 = {
    h <- recipe(formula = formula, data = dat) |>
      step_convolve_gamma(b, amplitude = 0.5, k = 2.0, theta = 2.0,
                          varying = list(name = c("amplitude","k", "theta"),
                                         start = c(0.5, 2.0, 2.0),
                                         lower = c(0.01, 1.0, 1.0),
                                         upper = c(1.0, 10.0, 10.0))) |>
      step_nls(formula = formula(a~b), n_subset = 100L,
               trace = FALSE,
               algorithm = "lm",
               control = gsl_nls_control(xtol = 1e-8))

    h$prep()$bake()
    unname(round(coef(h$steps[[3]]$fit), 3))
  },

  check = TRUE
)

results
#> # A tibble: 4 × 6
#>   expression                           min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 gsl_fun <- unname(round(coef(g…    1.36s    1.36s     0.734     893MB        0
#> 2 h_1                                1.69s    1.69s     0.591     900MB        0
#> 3 h_10                            919.88ms 919.88ms     1.09      309MB        0
#> 4 h_100                           775.05ms 775.05ms     1.29      250MB        0

step_ols_gap_fill

# Time step_ols_gap_fill on a 100000-row series with a block of missing
# values. The seed is fixed so the synthetic series is reproducible.
set.seed(123)
n <- 100000
frm <- formula(x ~ y + z)


# y is a random walk; x is the same walk plus a +20 offset on the first half
# of the rows and a slow sinusoid in z.
x <- cumsum(rnorm(n))
dat <- data.table(x = x, y = x, z = as.numeric(1:n))
dat[, x := x + c(rep(20, n/2), rep(0, n/2))]
dat[, x := x + 3.0 * sin(z * 1/n)]
# Copy of x taken before the gap is introduced; not used again in this chunk.
tmp <- copy(dat$x)

# Set value to NA.  These values will be estimated.
dat[60000:70000, x := NA_real_]

# Drop the class attribute so `dat` is passed to recipe() without the
# data.table class.
dat <- unclass(dat)

bench::mark(
  # Single unnamed expression: build the helper recipe `h`, then run the
  # gap-fill recipe that takes `h` as its `recipe` argument. bench labels the
  # row with the deparsed code.
  {h = recipe(formula = frm, data = dat) |>
    step_find_interval(z, vec = c(0, n/2, n)) |>
    step_intercept() |>
    step_spline_b(z, df = 4) |>
    step_drop_columns(z)
  
  hrec = recipe(formula = frm, data = dat) |>
    step_ols_gap_fill(c(x, y, z), recipe = h) |>
    prep() |>
    bake()},
  check = FALSE
)
#> # A tibble: 1 × 6
#>   expression                             min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                          <bch:> <bch:>     <dbl> <bch:byt>    <dbl>
#> 1 { h = step_drop_columns(step_splin… 8.23ms 8.43ms      118.     9.4MB        0

check

step_check_spacing

# Time step_check_spacing followed by step_check_na on a column with one
# missing value (y) and on a column with a run of missing values (x).
formula <- as.formula(y~x)

results <- bench::press(
  # `n` is the scalar 100000 assigned in the step_ols_gap_fill section, so
  # only one row count is run here.
  rows = n,
  {
    dat <- data.frame(x = rnorm(rows),
                      y = 1:rows)
    # Rows 9..50 of x and row 9 of y are set to NA.
    dat[9:50, "x"] <- NA
    dat[9L, "y"] <- NA
    
    bench::mark(
      hrec1 = recipe(formula = formula, data = dat) |>
        step_check_spacing(y) |>
        step_check_na(y) |>
        prep() |>
        bake(),
      hrec2 =recipe(formula = formula, data = dat) |>
        step_check_spacing(x) |>
        step_check_na(x) |>
        prep() |>
        bake(),
      check = FALSE,
      relative = FALSE,
      min_iterations = 2
    )
  }
)
#> Running with:
#>     rows
#> 1 100000

results
#> # A tibble: 2 × 7
#>   expression   rows      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 hrec1      100000   3.05ms   3.15ms      267.    3.37MB     2.17
#> 2 hrec2      100000   3.06ms   3.13ms      319.    2.29MB     0
# Print the R version, platform and package versions for this session.
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] splines2_0.5.3     RcppRoll_0.3.1     tibble_3.2.1       bench_1.1.4       
#> [5] hydrorecipes_0.0.6 Bessel_0.6-1       data.table_1.16.4  gslnls_1.4.1      
#> [9] collapse_2.0.19   
#> 
#> loaded via a namespace (and not attached):
#>  [1] xfun_0.50           bslib_0.8.0         htmlwidgets_1.6.4  
#>  [4] recipes_1.1.0       lattice_0.22-6      vctrs_0.6.5        
#>  [7] tools_4.4.2         generics_0.1.3      parallel_4.4.2     
#> [10] pkgconfig_2.0.3     Matrix_1.7-1        desc_1.4.3         
#> [13] lifecycle_1.0.4     compiler_4.4.2      textshaping_1.0.0  
#> [16] codetools_0.2-20    RcppThread_2.2.0    htmltools_0.5.8.1  
#> [19] class_7.3-22        sass_0.4.9          yaml_2.3.10        
#> [22] gmp_0.7-5           profmem_0.6.0       prodlim_2024.06.25 
#> [25] tidyr_1.3.1         pillar_1.10.1       pkgdown_2.1.1      
#> [28] jquerylib_0.1.4     MASS_7.3-61         cachem_1.1.0       
#> [31] gower_1.0.2         rpart_4.1.23        parallelly_1.41.0  
#> [34] lava_1.8.1          tidyselect_1.2.1    digest_0.6.37      
#> [37] earthtide_0.1.7     future_1.34.0       dplyr_1.1.4        
#> [40] purrr_1.0.2         listenv_0.9.1       splines_4.4.2      
#> [43] fastmap_1.2.0       grid_4.4.2          cli_3.6.3          
#> [46] magrittr_2.0.3      utf8_1.2.4          survival_3.7-0     
#> [49] future.apply_1.11.3 withr_3.0.2         Rmpfr_1.0-0        
#> [52] lubridate_1.9.4     timechange_0.3.0    rmarkdown_2.29     
#> [55] globals_0.16.3      nnet_7.3-19         timeDate_4041.110  
#> [58] ragg_1.3.3          evaluate_1.0.3      knitr_1.49         
#> [61] hardhat_1.4.0       rlang_1.1.5         Rcpp_1.0.14        
#> [64] glue_1.8.0          ipred_0.9-15        jsonlite_1.8.9     
#> [67] R6_2.5.1            systemfonts_1.2.1   fs_1.6.5