From 5f64153bac8a66b086728b2ae21ad12f6d5356f5 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 12 Jul 2024 13:23:03 +0200 Subject: [PATCH 1/3] Release candidate --- .Rbuildignore | 1 + DESCRIPTION | 2 +- README.md | 42 +++++++++++++++++++++++++++--------------- cran-comments.md | 22 +++++++--------------- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 9d5e7202..9af5dbbc 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,4 @@ ^test.R$ ^backlog$ ^CRAN-SUBMISSION$ +^pkgdown$ diff --git a/DESCRIPTION b/DESCRIPTION index cd4ef1db..602f3da2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,5 +30,5 @@ Imports: Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 -URL: https://github.com/ModelOriented/hstats, https://mayer79.github.io/hstats +URL: https://github.com/ModelOriented/hstats, https://ModelOriented.github.io/hstats BugReports: https://github.com/ModelOriented/hstats/issues diff --git a/README.md b/README.md index a6ff0ae4..13e66972 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,8 @@ Let's study different plots to understand *how* the strong interaction between d They all reveal a substantial interaction between the two variables in the sense that the age effect gets weaker the closer to the ocean. Note that numeric `BY` features are automatically binned into quartile groups. 
```r -plot(partial_dep(fit, v = "age", X = X_train, BY = "log_ocean"), show_points = FALSE) +partial_dep(fit, v = "age", X = X_train, BY = "log_ocean") |> + plot(show_points = FALSE) ``` ![](man/figures/pdp_ocean_age.svg) @@ -167,8 +168,8 @@ plot(pd, d2_geom = "line", show_points = FALSE) ![](man/figures/pdp_2d_line.svg) ```r -ic <- ice(fit, v = "age", X = X_train, BY = "log_ocean") -plot(ic, center = TRUE) +ice(fit, v = "age", X = X_train, BY = "log_ocean") |> + plot(center = TRUE) ``` ![](man/figures/ice.svg) @@ -178,11 +179,14 @@ plot(ic, center = TRUE) In the spirit of [1], and related to [4], we can extract from the "hstats" objects a partial dependence based variable importance measure. It measures not only the main effect strength (see [4]), but also all its interaction effects. It is rather experimental, so use it with care (details in the section "Background"): ```r -plot(pd_importance(s)) +pd_importance(s) |> + plot() # Compared with four times repeated permutation importance regarding MSE set.seed(10) -plot(perm_importance(fit, X = X_valid, y = y_valid)) + +perm_importance(fit, X = X_valid, y = y_valid) |> + plot() ``` ![](man/figures/importance.svg) @@ -209,14 +213,17 @@ s <- hstats(ex) s # 0.054 plot(s) -# Strongest relative interaction -plot(ice(ex, v = "Sepal.Width", BY = "Petal.Width"), center = TRUE) -plot(partial_dep(ex, v = "Sepal.Width", BY = "Petal.Width"), show_points = FALSE) -plot(partial_dep(ex, v = c("Sepal.Width", "Petal.Width"), grid_size = 200)) +# Strongest relative interaction (different visualizations) +ice(ex, v = "Sepal.Width", BY = "Petal.Width") |> + plot(center = TRUE) + +partial_dep(ex, v = "Sepal.Width", BY = "Petal.Width") |> + plot(show_points = FALSE) -perm_importance(ex) +partial_dep(ex, v = c("Sepal.Width", "Petal.Width"), grid_size = 200) |> + plot() -# Permutation importance +perm_importance(ex) # Petal.Length Petal.Width Sepal.Width Species # 0.59836442 0.11625137 0.07966910 0.03982554 ``` @@ -298,7 +305,7 @@ fit <- 
lgb.train( average_loss(fit, X = X_valid, y = y_valid, loss = "mlogloss") perm_importance(fit, X = X_valid, y = y_valid, loss = "mlogloss", m_rep = 100) -# Permutation importance regarding mlogloss + # Petal.Length Petal.Width Sepal.Width Sepal.Length # 2.624241332 1.011168660 0.082477177 0.009757393 @@ -396,7 +403,9 @@ fit <- iris_wf |> s <- hstats(fit, X = iris[, -1]) s # 0 -> no interactions -plot(partial_dep(fit, v = "Petal.Width", X = iris)) + +partial_dep(fit, v = "Petal.Width", X = iris) |> + plot() imp <- perm_importance(fit, X = iris, y = "Sepal.Length") imp @@ -425,8 +434,11 @@ fit <- train( h2(hstats(fit, X = iris[, -1])) # 0 -plot(ice(fit, v = "Petal.Width", X = iris), center = TRUE) -plot(perm_importance(fit, X = iris, y = "Sepal.Length")) +ice(fit, v = "Petal.Width", X = iris) |> + plot(center = TRUE) + +perm_importance(fit, X = iris, y = "Sepal.Length") |> + plot() ``` ### mlr3 diff --git a/cran-comments.md b/cran-comments.md index 602b8177..524efcf6 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,23 +1,15 @@ -# hstats 1.1.2 +# hstats 1.2.0 -Hello CRAN team +Hello CRAN -This is a small release with two convenient API improvements. +This release mainly updates the new repository ("ModelOriented" of TU Warsaw instead of my personal one), and adds Prof Biecek as co-author. -## Local checks: 0 errors, 0 warnings, 0 notes +## Local checks -## Rhub: 3 NOTES (sounding harmless) - -- checking HTML version of manual ... NOTE - Skipping checking math rendering: package 'V8' unavailable -- checking for non-standard things in the check directory ... NOTE - Found the following files/directories: - ''NULL'' -- checking for detritus in the temp directory ... 
NOTE - Found the following files/directories: - 'lastMiKTeXException' +0 errors, 0 warnings, 0 notes ## Winbuilder -Status: OK +Status: 1 NOTE +R Under development (unstable) (2024-07-11 r86890 ucrt) From daf4cee64500abb8d78f92d8b1e8f1e588a59884 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 12 Jul 2024 14:02:08 +0200 Subject: [PATCH 2/3] Changed old links in NEWS --- CRAN-SUBMISSION | 6 +++--- DESCRIPTION | 4 ++-- NEWS.md | 26 +++++++++++--------------- cran-comments.md | 6 +++++- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index dc82f73f..37ac85d7 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 1.1.2 -Date: 2024-02-03 15:31:07 UTC -SHA: 9eebdab2e583bf97d51dff9e7783df2b87751097 +Version: 1.2.0 +Date: 2024-07-12 11:23:55 UTC +SHA: 5f64153bac8a66b086728b2ae21ad12f6d5356f5 diff --git a/DESCRIPTION b/DESCRIPTION index 602f3da2..6944524f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,5 +30,5 @@ Imports: Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 -URL: https://github.com/ModelOriented/hstats, https://ModelOriented.github.io/hstats -BugReports: https://github.com/ModelOriented/hstats/issues +URL: https://github.com/ModelOriented/hstats/, https://modeloriented.github.io/hstats/ +BugReports: https://github.com/ModelOriented/hstats/issues/ diff --git a/NEWS.md b/NEWS.md index 0c5bcdb9..9238f4ad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,18 +1,14 @@ # hstats 1.2.0 -## New home +## My new home - My brand new home: https://github.com/ModelOriented/hstats -## Major changes +## Other changes - Factor-valued predictions are no longer possible. - Consequently, also removed "classification_error" loss. -## Minor changes - -- Code simplifications. - # hstats 1.1.2 ## ICE plots @@ -28,29 +24,29 @@ ## Performance improvements -- For pure data.frames (no tibbles, data.tables etc.), most functions are significantly faster ([#110](https://github.com/mayer79/hstats/pull/110)). 
-- Slight speed-up of permutation importance for non-matrix `X` ([#109](https://github.com/mayer79/hstats/pull/109)). +- For pure data.frames (no tibbles, data.tables etc.), most functions are significantly faster ([#110](https://github.com/ModelOriented/hstats/pull/110)). +- Slight speed-up of permutation importance for non-matrix `X` ([#109](https://github.com/ModelOriented/hstats/pull/109)). ## Other changes -- In multivariate cases, it was possible that normalized H-statistics could equal `0/0 (= NaN)`. Such values are now replaced by 0 ([#107](https://github.com/mayer79/hstats/issues/107)). -- Removed an unnecessary special case when calculating column means ([#106](https://github.com/mayer79/hstats/pull/106)). +- In multivariate cases, it was possible that normalized H-statistics could equal `0/0 (= NaN)`. Such values are now replaced by 0 ([#107](https://github.com/ModelOriented/hstats/issues/107)). +- Removed an unnecessary special case when calculating column means ([#106](https://github.com/ModelOriented/hstats/pull/106)). # hstats 1.1.0 ## Enhancements -- {hstats} now also works for factor predictions. The levels are represented by one-hot-encoded columns ([PR#101](https://github.com/mayer79/hstats/pull/101)). -- The plot method of a two-dimensional PDP has recieved the option `d2_geom = "line"`. Instead of a heatmap of the two features, one of the features is moved to color grouping. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/mayer79/hstats/issues/91), see also [issue #94](https://github.com/mayer79/hstats/issues/94). +- {hstats} now also works for factor predictions. The levels are represented by one-hot-encoded columns ([PR#101](https://github.com/ModelOriented/hstats/pull/101)). +- The plot method of a two-dimensional PDP has received the option `d2_geom = "line"`. 
Instead of a heatmap of the two features, one of the features is moved to color grouping. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/ModelOriented/hstats/issues/91), see also [issue #94](https://github.com/ModelOriented/hstats/issues/94). ## Bug fixes -- Using `BY` and `w` via column names would fail for tibbles. This problem was described in [#92](https://github.com/mayer79/hstats/issues/92) by [Roel Verbelen](https://github.com/RoelVerbelen). Thx! +- Using `BY` and `w` via column names would fail for tibbles. This problem was described in [#92](https://github.com/ModelOriented/hstats/issues/92) by [Roel Verbelen](https://github.com/RoelVerbelen). Thx! ## Other changes -- Much faster one-hot-encoding, thanks to Mathias Ambühl ([PR#101](https://github.com/mayer79/hstats/pull/101)). -- Most functions are slightly faster ([PR#101](https://github.com/mayer79/hstats/pull/101)). +- Much faster one-hot-encoding, thanks to Mathias Ambühl ([PR#101](https://github.com/ModelOriented/hstats/pull/101)). +- Most functions are slightly faster ([PR#101](https://github.com/ModelOriented/hstats/pull/101)). - Add unit tests to compare against {iml}. - Made all examples "tibble" and "data.table" friendly. - Revised input checks in loss functions (relevant for `perm_importance()` and `average_loss()`). diff --git a/cran-comments.md b/cran-comments.md index 524efcf6..d5dfef7e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,4 +1,8 @@ -# hstats 1.2.0 +# Re-submission: hstats 1.2.0 + +Moving the github repo has left some old links in the NEWS file. This is fixed here. 
+ +# Original message Hello CRAN From ac27f3eeaf1d2d7efecc63b5858713f163e3a429 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 12 Jul 2024 14:24:43 +0200 Subject: [PATCH 3/3] On its way to CRAN --- CRAN-SUBMISSION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 37ac85d7..30246340 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ Version: 1.2.0 -Date: 2024-07-12 11:23:55 UTC -SHA: 5f64153bac8a66b086728b2ae21ad12f6d5356f5 +Date: 2024-07-12 12:10:00 UTC +SHA: daf4cee64500abb8d78f92d8b1e8f1e588a59884