Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update benchmark and timings in README #125

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: hstats
Title: Interaction Statistics
Version: 1.2.1
Version: 1.2.2
Authors@R: c(
person("Michael", "Mayer", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0009-0007-2540-9629")),
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ average_loss(fit, X = X_valid, y = y_valid)
Let's calculate different H-statistics via `hstats()`:

```r
# 4 seconds on simple laptop - a random forest will take 2 minutes
# 2 seconds on laptop - a random forest will take much longer
set.seed(782)
system.time(
s <- hstats(fit, X = X_train) #, approx = TRUE: twice as fast
Expand Down
63 changes: 31 additions & 32 deletions backlog/benchmark.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ bench::mark(
check = FALSE,
min_iterations = 3
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result
# 1 iml 1.76s 1.76s 0.565 211.6MB 3.39 3 18 5.31s <NULL>
# 2 dalex 688.54ms 697.71ms 1.44 35.2MB 1.91 3 4 2.09s <NULL>
# 3 flashlight 667.51ms 676.07ms 1.47 28.1MB 1.96 3 4 2.04s <NULL>
# 4 hstats 392.15ms 414.41ms 2.39 26.6MB 0.796 3 1 1.26s <NULL>
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# iml 1.26s 1.29s 0.778 235.3MB 3.63 3 14
# dalex 594.39ms 620.76ms 1.63 30.5MB 1.63 3 3
# flashlight 800.1ms 808.4ms 1.20 62.6MB 3.20 3 8
# hstats 151.06ms 160.19ms 5.94 26MB 1.98 3 1

# Partial dependence (cont)
v <- "tot_lvg_area"
Expand All @@ -131,12 +131,12 @@ bench::mark(
check = FALSE,
min_iterations = 3
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list>
# 1 iml 1.2s 1.4s 0.726 376.9MB 4.12 3 17 4.13s <NULL>
# 2 dalex 759.3ms 760.6ms 1.28 192.8MB 2.55 3 6 2.35s <NULL>
# 3 flashlight 369.1ms 403.1ms 2.55 66.8MB 2.55 3 3 1.18s <NULL>
# 4 hstats 242.1ms 243.8ms 4.03 14.2MB 0 3 0 744.25ms <NULL>#
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# iml 921.7ms 923.5ms 1.06 369.7MB 3.18 3 9
# dalex 503.2ms 516.7ms 1.90 195.1MB 3.80 3 6
# flashlight 190.3ms 228.9ms 4.62 67.9MB 1.54 3 1
# hstats 74.6ms 76.8ms 12.2 14.1MB 1.75 7 1

# Partial dependence (discrete)
v <- "structure_quality"
bench::mark(
Expand All @@ -145,47 +145,46 @@ bench::mark(
flashlight = light_profile(fl, v = v, pd_n_max = Inf),
hstats = partial_dep(fit, v = v, X = X_valid, n_max = Inf),
check = FALSE,
min_iterations = 3
min_iterations = 10
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list>
# 1 iml 107.9ms 108ms 9.26 13.64MB 9.26 2 2 216ms <NULL>
# 2 dalex 172ms 172.2ms 5.81 21.14MB 2.90 2 1 344ms <NULL>
# 3 flashlight 40.3ms 41.6ms 23.8 8.61MB 2.16 11 1 462ms <NULL>
# 4 hstats 24.5ms 25.9ms 35.5 1.64MB 0 18 0 507ms <NULL>

# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# iml 77.1ms 80.6ms 12.4 12.66MB 8.25 6 4
# dalex 122.4ms 137.7ms 7.49 20.48MB 7.49 5 5
# flashlight 25.8ms 27.5ms 34.0 6.3MB 2.13 16 1
# hstats 14.3ms 15.1ms 66.1 1.49MB 2.13 31 1

# H-Stats -> we use a subset of 500 rows
X_v500 <- X_valid[1:500, ]
mod500 <- Predictor$new(fit, data = as.data.frame(X_v500), predict.function = predf)
fl500 <- flashlight(fl, data = as.data.frame(valid[1:500, ]))

# iml # 243s total, using slow exact calculations
system.time( # 110s
# iml 173 s total, using slow exact calculations (to be able to compare results)
system.time( # 85s
iml_overall <- Interaction$new(mod500, grid.size = 500)
)
system.time( # 133s for all combinations of latitude
system.time( # 88s for all combinations of latitude
iml_pairwise <- Interaction$new(mod500, grid.size = 500, feature = "latitude")
)

# flashlight: 14s total, doing only one pairwise calculation, otherwise would take 63s
system.time( # 11.7s
# flashlight: 8s total, doing only one pairwise calculation, otherwise would take 63s
system.time( # 7s
fl_overall <- light_interaction(fl500, v = x, grid_size = Inf, n_max = Inf)
)
system.time( # 2.3s
system.time( # 1s
fl_pairwise <- light_interaction(
fl500, v = coord, grid_size = Inf, n_max = Inf, pairwise = TRUE
)
)

# hstats: 3.5s total
# hstats: 1.5s total
system.time({
H <- hstats(fit, v = x, X = X_v500, n_max = Inf)
hstats_overall <- h2_overall(H, squared = FALSE, zero = FALSE)
hstats_pairwise <- h2_pairwise(H, squared = FALSE, zero = FALSE)
}
)

# Using 50 quantiles to approximate dense numerics: 0.8s
# Using 50 quantiles to approximate dense numerics: 0.5s
system.time(
H_approx <- hstats(fit, v = x, X = X_v500, n_max = Inf, approx = TRUE)
)
Expand All @@ -198,10 +197,10 @@ iml_overall$results |>
# 2: longitude 0.2458269

fl_overall$data |>
subset(value_ > 0, select = c(variable_, value_))
subset(value > 0, select = c(variable, value))
# variable value
# 3 latitude 0.2458269
# 4 longitude 0.2458269
# 3 latitude 0.246
# 4 longitude 0.246

hstats_overall
# longitude latitude
Expand All @@ -214,8 +213,8 @@ iml_pairwise$results |>
# 1: longitude:latitude 0.3942526

fl_pairwise$data |>
subset(value_ > 0, select = c(variable_, value_))
# latitude:longitude 0.3942526
subset(value > 0, select = c(variable, value))
# latitude:longitude 0.394

hstats_pairwise
# latitude:longitude
Expand Down
2 changes: 1 addition & 1 deletion packaging.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ library(usethis)
use_description(
fields = list(
Title = "Interaction Statistics",
Version = "1.2.1",
Version = "1.2.2",
Description = "Fast, model-agnostic implementation of different H-statistics
introduced by Jerome H. Friedman and Bogdan E. Popescu (2008) <doi:10.1214/07-AOAS148>.
These statistics quantify interaction strength per feature, feature pair,
Expand Down
Loading