From 53bb8b431487ca1caa578e79ad88d8ed872b9b12 Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Tue, 3 Sep 2024 09:03:35 +0200
Subject: [PATCH] Update benchmark and timings in README

---
 DESCRIPTION         |  2 +-
 README.md           |  2 +-
 backlog/benchmark.R | 63 ++++++++++++++++++++++-----------------------
 packaging.R         |  2 +-
 4 files changed, 34 insertions(+), 35 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index c6fcd07..c15f2fc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: hstats
 Title: Interaction Statistics
-Version: 1.2.1
+Version: 1.2.2
 Authors@R: c(
     person("Michael", "Mayer", , "mayermichael79@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0009-0007-2540-9629")),
diff --git a/README.md b/README.md
index 819fa2d..20ec7b9 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ average_loss(fit, X = X_valid, y = y_valid)
 Let's calculate different H-statistics via `hstats()`:
 
 ```r
-# 4 seconds on simple laptop - a random forest will take 2 minutes
+# 2 seconds on a laptop - a random forest will take much longer
 set.seed(782)
 system.time(
   s <- hstats(fit, X = X_train)  #, approx = TRUE: twice as fast
diff --git a/backlog/benchmark.R b/backlog/benchmark.R
index 3923a1c..1dd7608 100644
--- a/backlog/benchmark.R
+++ b/backlog/benchmark.R
@@ -115,11 +115,11 @@ bench::mark(
   check = FALSE,
   min_iterations = 3
 )
-#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result
-# 1 iml           1.76s    1.76s     0.565   211.6MB    3.39      3    18      5.31s <NULL>
-# 2 dalex      688.54ms 697.71ms     1.44     35.2MB    1.91      3     4      2.09s <NULL>
-# 3 flashlight 667.51ms 676.07ms     1.47     28.1MB    1.96      3     4      2.04s <NULL>
-# 4 hstats     392.15ms 414.41ms     2.39     26.6MB    0.796     3     1      1.26s <NULL>  
+# expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
+# iml           1.26s    1.29s     0.778   235.3MB     3.63     3    14
+# dalex      594.39ms 620.76ms     1.63     30.5MB     1.63     3     3
+# flashlight  800.1ms  808.4ms     1.20     62.6MB     3.20     3     8
+# hstats     151.06ms 160.19ms     5.94       26MB     1.98     3     1
 
 # Partial dependence (cont)
 v <- "tot_lvg_area"
@@ -131,12 +131,12 @@ bench::mark(
   check = FALSE,
   min_iterations = 3
 )
-#     expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result
-# <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>
-#   1 iml            1.2s     1.4s     0.726   376.9MB     4.12     3    17      4.13s <NULL>
-#   2 dalex       759.3ms  760.6ms     1.28    192.8MB     2.55     3     6      2.35s <NULL>
-#   3 flashlight  369.1ms  403.1ms     2.55     66.8MB     2.55     3     3      1.18s <NULL>
-#   4 hstats      242.1ms  243.8ms     4.03     14.2MB     0        3     0   744.25ms <NULL>#   
+# expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
+# iml         921.7ms  923.5ms      1.06   369.7MB     3.18     3     9
+# dalex       503.2ms  516.7ms      1.90   195.1MB     3.80     3     6
+# flashlight  190.3ms  228.9ms      4.62    67.9MB     1.54     3     1
+# hstats       74.6ms   76.8ms     12.2     14.1MB     1.75     7     1
+
 # Partial dependence (discrete)
 v <- "structure_quality"
 bench::mark(
@@ -145,39 +145,38 @@ bench::mark(
   flashlight = light_profile(fl, v = v, pd_n_max = Inf),
   hstats = partial_dep(fit, v = v, X = X_valid, n_max = Inf),
   check = FALSE,
-  min_iterations = 3
+  min_iterations = 10
 )
-#     expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result
-# <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>
-#   1 iml         107.9ms    108ms      9.26   13.64MB     9.26     2     2      216ms <NULL>
-#   2 dalex         172ms  172.2ms      5.81   21.14MB     2.90     2     1      344ms <NULL>
-#   3 flashlight   40.3ms   41.6ms     23.8     8.61MB     2.16    11     1      462ms <NULL>
-#   4 hstats       24.5ms   25.9ms     35.5     1.64MB     0       18     0      507ms <NULL>
-  
+# expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
+# iml          77.1ms   80.6ms     12.4    12.66MB     8.25     6     4
+# dalex       122.4ms  137.7ms      7.49   20.48MB     7.49     5     5
+# flashlight   25.8ms   27.5ms     34.0      6.3MB     2.13    16     1
+# hstats       14.3ms   15.1ms     66.1     1.49MB     2.13    31     1
+
 # H-Stats -> we use a subset of 500 rows
 X_v500 <- X_valid[1:500, ]
 mod500 <- Predictor$new(fit, data = as.data.frame(X_v500), predict.function = predf)
 fl500 <- flashlight(fl, data = as.data.frame(valid[1:500, ]))
 
-# iml  # 243s total, using slow exact calculations
-system.time(  # 110s
+# iml: 173s total, using slow exact calculations (to be able to compare results)
+system.time(  # 85s
   iml_overall <- Interaction$new(mod500, grid.size = 500)
 )
-system.time(  # 133s for all combinations of latitude
+system.time(  # 88s for all combinations of latitude
   iml_pairwise <- Interaction$new(mod500, grid.size = 500, feature = "latitude")
 )
 
-# flashlight: 14s total, doing only one pairwise calculation, otherwise would take 63s
-system.time(  # 11.7s
+# flashlight: 8s total, doing only one pairwise calculation, otherwise would take 63s
+system.time(  # 7s
   fl_overall <- light_interaction(fl500, v = x, grid_size = Inf, n_max = Inf)
 )
-system.time(  # 2.3s
+system.time(  # 1s
   fl_pairwise <- light_interaction(
     fl500, v = coord, grid_size = Inf, n_max = Inf, pairwise = TRUE
   )
 )
 
-# hstats: 3.5s total
+# hstats: 1.5s total
 system.time({
   H <- hstats(fit, v = x, X = X_v500, n_max = Inf)
   hstats_overall <- h2_overall(H, squared = FALSE, zero = FALSE)
@@ -185,7 +184,7 @@ system.time({
 }
 )
 
-# Using 50 quantiles to approximate dense numerics: 0.8s
+# Using 50 quantiles to approximate dense numerics: 0.5s
 system.time(
   H_approx <- hstats(fit, v = x, X = X_v500, n_max = Inf, approx = TRUE)
 )
@@ -198,10 +197,10 @@ iml_overall$results |>
 # 2: longitude    0.2458269
 
 fl_overall$data |>
-  subset(value_ > 0, select = c(variable_, value_))
+  subset(value > 0, select = c(variable, value))
 #   variable_    value_
-# 3  latitude 0.2458269
-# 4 longitude 0.2458269
+# 3  latitude 0.246
+# 4 longitude 0.246
 
 hstats_overall
 # longitude  latitude 
@@ -214,8 +213,8 @@ iml_pairwise$results |>
 # 1: longitude:latitude    0.3942526
 
 fl_pairwise$data |>
-  subset(value_ > 0, select = c(variable_, value_))
-# latitude:longitude 0.3942526
+  subset(value > 0, select = c(variable, value))
+# latitude:longitude 0.394
 
 hstats_pairwise
 # latitude:longitude 
diff --git a/packaging.R b/packaging.R
index 259d273..924c797 100644
--- a/packaging.R
+++ b/packaging.R
@@ -15,7 +15,7 @@ library(usethis)
 use_description(
   fields = list(
     Title = "Interaction Statistics",
-    Version = "1.2.1",
+    Version = "1.2.2",
     Description = "Fast, model-agnostic implementation of different H-statistics
     introduced by Jerome H. Friedman and Bogdan E. Popescu (2008) <doi:10.1214/07-AOAS148>. 
     These statistics quantify interaction strength per feature, feature pair,