diff --git a/.Rbuildignore b/.Rbuildignore index 9d5e720..9af5dbb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,4 @@ ^test.R$ ^backlog$ ^CRAN-SUBMISSION$ +^pkgdown$ diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index dc82f73..3024634 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 1.1.2 -Date: 2024-02-03 15:31:07 UTC -SHA: 9eebdab2e583bf97d51dff9e7783df2b87751097 +Version: 1.2.0 +Date: 2024-07-12 12:10:00 UTC +SHA: daf4cee64500abb8d78f92d8b1e8f1e588a59884 diff --git a/DESCRIPTION b/DESCRIPTION index cd4ef1d..6944524 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,5 +30,5 @@ Imports: Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 -URL: https://github.com/ModelOriented/hstats, https://mayer79.github.io/hstats -BugReports: https://github.com/ModelOriented/hstats/issues +URL: https://github.com/ModelOriented/hstats/, https://modeloriented.github.io/hstats/ +BugReports: https://github.com/ModelOriented/hstats/issues/ diff --git a/NEWS.md b/NEWS.md index 0c5bcdb..9238f4a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,18 +1,14 @@ # hstats 1.2.0 -## New home +## My new home - My brand new home: https://github.com/ModelOriented/hstats -## Major changes +## Other changes - Factor-valued predictions are no longer possible. - Consequently, also removed "classification_error" loss. -## Minor changes - -- Code simplifications. - # hstats 1.1.2 ## ICE plots @@ -28,29 +24,29 @@ ## Performance improvements -- For pure data.frames (no tibbles, data.tables etc.), most functions are significantly faster ([#110](https://github.com/mayer79/hstats/pull/110)). -- Slight speed-up of permutation importance for non-matrix `X` ([#109](https://github.com/mayer79/hstats/pull/109)). +- For pure data.frames (no tibbles, data.tables etc.), most functions are significantly faster ([#110](https://github.com/ModelOriented/hstats/pull/110)). 
+- Slight speed-up of permutation importance for non-matrix `X` ([#109](https://github.com/ModelOriented/hstats/pull/109)). ## Other changes -- In multivariate cases, it was possible that normalized H-statistics could equal `0/0 (= NaN)`. Such values are now replaced by 0 ([#107](https://github.com/mayer79/hstats/issues/107)). -- Removed an unnecessary special case when calculating column means ([#106](https://github.com/mayer79/hstats/pull/106)). +- In multivariate cases, it was possible that normalized H-statistics could equal `0/0 (= NaN)`. Such values are now replaced by 0 ([#107](https://github.com/ModelOriented/hstats/issues/107)). +- Removed an unnecessary special case when calculating column means ([#106](https://github.com/ModelOriented/hstats/pull/106)). # hstats 1.1.0 ## Enhancements -- {hstats} now also works for factor predictions. The levels are represented by one-hot-encoded columns ([PR#101](https://github.com/mayer79/hstats/pull/101)). -- The plot method of a two-dimensional PDP has recieved the option `d2_geom = "line"`. Instead of a heatmap of the two features, one of the features is moved to color grouping. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/mayer79/hstats/issues/91), see also [issue #94](https://github.com/mayer79/hstats/issues/94). +- {hstats} now also works for factor predictions. The levels are represented by one-hot-encoded columns ([PR#101](https://github.com/ModelOriented/hstats/pull/101)). +- The plot method of a two-dimensional PDP has received the option `d2_geom = "line"`. Instead of a heatmap of the two features, one of the features is moved to color grouping. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. 
The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/ModelOriented/hstats/issues/91), see also [issue #94](https://github.com/ModelOriented/hstats/issues/94). ## Bug fixes -- Using `BY` and `w` via column names would fail for tibbles. This problem was described in [#92](https://github.com/mayer79/hstats/issues/92) by [Roel Verbelen](https://github.com/RoelVerbelen). Thx! +- Using `BY` and `w` via column names would fail for tibbles. This problem was described in [#92](https://github.com/ModelOriented/hstats/issues/92) by [Roel Verbelen](https://github.com/RoelVerbelen). Thx! ## Other changes -- Much faster one-hot-encoding, thanks to Mathias Ambühl ([PR#101](https://github.com/mayer79/hstats/pull/101)). -- Most functions are slightly faster ([PR#101](https://github.com/mayer79/hstats/pull/101)). +- Much faster one-hot-encoding, thanks to Mathias Ambühl ([PR#101](https://github.com/ModelOriented/hstats/pull/101)). +- Most functions are slightly faster ([PR#101](https://github.com/ModelOriented/hstats/pull/101)). - Add unit tests to compare against {iml}. - Made all examples "tibble" and "data.table" friendly. - Revised input checks in loss functions (relevant for `perm_importance()` and `average_loss()`). diff --git a/README.md b/README.md index a6ff0ae..13e6697 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,8 @@ Let's study different plots to understand *how* the strong interaction between d They all reveal a substantial interaction between the two variables in the sense that the age effect gets weaker the closer to the ocean. Note that numeric `BY` features are automatically binned into quartile groups. 
```r -plot(partial_dep(fit, v = "age", X = X_train, BY = "log_ocean"), show_points = FALSE) +partial_dep(fit, v = "age", X = X_train, BY = "log_ocean") |> + plot(show_points = FALSE) ``` ![](man/figures/pdp_ocean_age.svg) @@ -167,8 +168,8 @@ plot(pd, d2_geom = "line", show_points = FALSE) ![](man/figures/pdp_2d_line.svg) ```r -ic <- ice(fit, v = "age", X = X_train, BY = "log_ocean") -plot(ic, center = TRUE) +ice(fit, v = "age", X = X_train, BY = "log_ocean") |> + plot(center = TRUE) ``` ![](man/figures/ice.svg) @@ -178,11 +179,14 @@ plot(ic, center = TRUE) In the spirit of [1], and related to [4], we can extract from the "hstats" objects a partial dependence based variable importance measure. It measures not only the main effect strength (see [4]), but also all its interaction effects. It is rather experimental, so use it with care (details in the section "Background"): ```r -plot(pd_importance(s)) +pd_importance(s) |> + plot() # Compared with four times repeated permutation importance regarding MSE set.seed(10) -plot(perm_importance(fit, X = X_valid, y = y_valid)) + +perm_importance(fit, X = X_valid, y = y_valid) |> + plot() ``` ![](man/figures/importance.svg) @@ -209,14 +213,17 @@ s <- hstats(ex) s # 0.054 plot(s) -# Strongest relative interaction -plot(ice(ex, v = "Sepal.Width", BY = "Petal.Width"), center = TRUE) -plot(partial_dep(ex, v = "Sepal.Width", BY = "Petal.Width"), show_points = FALSE) -plot(partial_dep(ex, v = c("Sepal.Width", "Petal.Width"), grid_size = 200)) +# Strongest relative interaction (different visualizations) +ice(ex, v = "Sepal.Width", BY = "Petal.Width") |> + plot(center = TRUE) + +partial_dep(ex, v = "Sepal.Width", BY = "Petal.Width") |> + plot(show_points = FALSE) -perm_importance(ex) +partial_dep(ex, v = c("Sepal.Width", "Petal.Width"), grid_size = 200) |> + plot() -# Permutation importance +perm_importance(ex) # Petal.Length Petal.Width Sepal.Width Species # 0.59836442 0.11625137 0.07966910 0.03982554 ``` @@ -298,7 +305,7 @@ fit <- 
lgb.train( average_loss(fit, X = X_valid, y = y_valid, loss = "mlogloss") perm_importance(fit, X = X_valid, y = y_valid, loss = "mlogloss", m_rep = 100) -# Permutation importance regarding mlogloss + # Petal.Length Petal.Width Sepal.Width Sepal.Length # 2.624241332 1.011168660 0.082477177 0.009757393 @@ -396,7 +403,9 @@ fit <- iris_wf |> s <- hstats(fit, X = iris[, -1]) s # 0 -> no interactions -plot(partial_dep(fit, v = "Petal.Width", X = iris)) + +partial_dep(fit, v = "Petal.Width", X = iris) |> + plot() imp <- perm_importance(fit, X = iris, y = "Sepal.Length") imp @@ -425,8 +434,11 @@ fit <- train( h2(hstats(fit, X = iris[, -1])) # 0 -plot(ice(fit, v = "Petal.Width", X = iris), center = TRUE) -plot(perm_importance(fit, X = iris, y = "Sepal.Length")) +ice(fit, v = "Petal.Width", X = iris) |> + plot(center = TRUE) + +perm_importance(fit, X = iris, y = "Sepal.Length") |> + plot() ``` ### mlr3 diff --git a/cran-comments.md b/cran-comments.md index 602b817..d5dfef7 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,23 +1,19 @@ -# hstats 1.1.2 +# Re-submission: hstats 1.2.0 -Hello CRAN team +Moving the github repo has left some old links in the NEWS file. This is fixed here. -This is a small release with two convenient API improvements. +# Original message -## Local checks: 0 errors, 0 warnings, 0 notes +Hello CRAN -## Rhub: 3 NOTES (sounding harmless) +This release mainly updates the new repository ("ModelOriented" of TU Warsaw instead of my personal one), and adds Prof Biecek as co-author. -- checking HTML version of manual ... NOTE - Skipping checking math rendering: package 'V8' unavailable -- checking for non-standard things in the check directory ... NOTE - Found the following files/directories: - ''NULL'' -- checking for detritus in the temp directory ... 
NOTE - Found the following files/directories: - 'lastMiKTeXException' +## Local checks + +0 errors, 0 warnings, 0 notes ## Winbuilder -Status: OK +Status: 1 NOTE +R Under development (unstable) (2024-07-11 r86890 ucrt)