From 686c5bd599ba54317b7cd06c2b95949d9fd5f083 Mon Sep 17 00:00:00 2001 From: Ben Schneider Date: Thu, 28 Nov 2024 12:53:38 -0500 Subject: [PATCH] Change default value of `tau` argument in generalized bootstrap functions. --- NEWS.md | 3 +++ R/generalized_bootstrap.R | 23 +++++++++++++++-------- man/as_gen_boot_design.Rd | 11 ++++++++--- man/make_gen_boot_factors.Rd | 9 ++++++--- vignettes/bootstrap-replicates.Rmd | 2 +- 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5640ef1..3c3a10b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # svrep (development version) +- For the generalized bootstrap functions, the default value for the argument `tau` is now `1` instead of `auto`. + This means that generalized bootstrap replicates are not rescaled by default. + - Added a kernel-based variance estimator (of Breidt, Opsomer, and Sanchez-Borrego 2016) for finely stratified or systematic samples. This can be used by calling `as_fays_gen_rep_design(..., variance_estimator = "BOSB")` or `as_gen_boot_design(..., variance_estimator = "BOSB")`. Or you can directly create the quadratic form matrix for this estimator using the function `make_kernel_var_matrix()`. Currently only supports a single auxiliary variable and only the Epanechnikov kernel function. - Added the option `variance_estimator = "Beaumont-Emond"` to the generalized replication methods. diff --git a/R/generalized_bootstrap.R b/R/generalized_bootstrap.R index 8c2d741..3607bb4 100644 --- a/R/generalized_bootstrap.R +++ b/R/generalized_bootstrap.R @@ -7,13 +7,16 @@ #' @param Sigma The matrix of the quadratic form used to represent the variance estimator. #' Must be positive semidefinite. #' @param num_replicates The number of bootstrap replicates to create. -#' @param tau Either \code{"auto"}, or a single number. This is the rescaling constant +#' @param tau Either \code{"auto"}, or a single number; the default value is 1. 
+#' This is the rescaling constant #' used to avoid negative weights through the transformation \eqn{\frac{w + \tau - 1}{\tau}}, #' where \eqn{w} is the original weight and \eqn{\tau} is the rescaling constant \code{tau}. \cr #' If \code{tau="auto"}, the rescaling factor is determined automatically as follows: #' if all of the adjustment factors are nonnegative, then \code{tau} is set equal to 1; #' otherwise, \code{tau} is set to the smallest value needed to rescale #' the adjustment factors such that they are all at least \code{0.01}. +#' Instead of using \code{tau="auto"}, the user can instead use the function +#' \code{rescale_reps()} to rescale the replicates later. #' @param exact_vcov If \code{exact_vcov=TRUE}, the replicate factors will be generated #' such that their variance-covariance matrix exactly matches the target variance estimator's #' quadratic form (within numeric precision). @@ -172,7 +175,7 @@ #' svytotal(x = ~ Bush + Kerry, #' design = election_pps_ht_design) #' } -make_gen_boot_factors <- function(Sigma, num_replicates, tau = "auto", exact_vcov = FALSE) { +make_gen_boot_factors <- function(Sigma, num_replicates, tau = 1, exact_vcov = FALSE) { n <- nrow(Sigma) @@ -294,13 +297,16 @@ make_gen_boot_factors <- function(Sigma, num_replicates, tau = "auto", exact_vco #' A vector of the names of auxiliary variables used in sampling. #' @param replicates Number of bootstrap replicates (should be as large as possible, given computer memory/storage limitations). #' A commonly-recommended default is 500. -#' @param tau Either \code{"auto"}, or a single number. This is the rescaling constant +#' @param tau Either \code{"auto"}, or a single number; the default value is 1. +#' This is the rescaling constant #' used to avoid negative weights through the transformation \eqn{\frac{w + \tau - 1}{\tau}}, #' where \eqn{w} is the original weight and \eqn{\tau} is the rescaling constant \code{tau}. 
\cr #' If \code{tau="auto"}, the rescaling factor is determined automatically as follows: #' if all of the adjustment factors are nonnegative, then \code{tau} is set equal to 1; #' otherwise, \code{tau} is set to the smallest value needed to rescale #' the adjustment factors such that they are all at least \code{0.01}. +#' Instead of using \code{tau="auto"}, the user can instead use the function +#' \code{rescale_reps()} to rescale the replicates later. #' @param exact_vcov If \code{exact_vcov=TRUE}, the replicate factors will be generated #' such that variance estimates for totals exactly match the results from the target variance estimator. #' This requires that \code{num_replicates} exceeds the rank of \code{Sigma}. @@ -397,7 +403,8 @@ make_gen_boot_factors <- function(Sigma, num_replicates, tau = "auto", exact_vco #' \textbf{After rescaling: } v_B\left(\hat{T}_y\right) = \frac{\tau^2}{B}\sum_{b=1}^B\left(\hat{T}_y^{S*(b)}-\hat{T}_y\right)^2 #' } #' When sharing a dataset that uses rescaled weights from a generalized survey bootstrap, the documentation for the dataset should instruct the user to use replication scale factor \eqn{\frac{\tau^2}{B}} rather than \eqn{\frac{1}{B}} when estimating sampling variances. -#' +#' This rescaling method does not affect variance estimates for linear statistics, +#' but its impact on non-smooth statistics such as quantiles is unclear. 
#' @section Two-Phase Designs: #' For a two-phase design, \code{variance_estimator} should be a list of variance estimators' names, @@ -534,7 +541,7 @@ make_gen_boot_factors <- function(Sigma, num_replicates, tau = "auto", exact_vco #' } as_gen_boot_design <- function(design, variance_estimator = NULL, aux_var_names = NULL, - replicates = 500, tau = "auto", exact_vcov = FALSE, + replicates = 500, tau = 1, exact_vcov = FALSE, psd_option = "warn", mse = getOption("survey.replicates.mse"), compress = TRUE) { @@ -544,7 +551,7 @@ as_gen_boot_design <- function(design, variance_estimator = NULL, #' @export as_gen_boot_design.twophase2 <- function(design, variance_estimator = NULL, aux_var_names = NULL, - replicates = 500, tau = "auto", + replicates = 500, tau = 1, exact_vcov = FALSE, psd_option = "warn", mse = getOption("survey.replicates.mse"), compress = TRUE) { @@ -610,7 +617,7 @@ as_gen_boot_design.twophase2 <- function(design, variance_estimator = NULL, #' @export as_gen_boot_design.survey.design <- function(design, variance_estimator = NULL, aux_var_names = NULL, - replicates = 500, tau = "auto", exact_vcov = FALSE, + replicates = 500, tau = 1, exact_vcov = FALSE, psd_option = 'warn', mse = getOption("survey.replicates.mse"), compress = TRUE) { @@ -697,7 +704,7 @@ as_gen_boot_design.survey.design <- function(design, variance_estimator = NULL, #' @export as_gen_boot_design.DBIsvydesign <- function(design, variance_estimator = NULL, aux_var_names = NULL, - replicates = 500, tau = "auto", + replicates = 500, tau = 1, exact_vcov = FALSE, psd_option = "warn", mse = getOption("survey.replicates.mse"), compress = TRUE) { diff --git a/man/as_gen_boot_design.Rd b/man/as_gen_boot_design.Rd index 311f69b..8512655 100644 --- a/man/as_gen_boot_design.Rd +++ b/man/as_gen_boot_design.Rd @@ -9,7 +9,7 @@ as_gen_boot_design( variance_estimator = NULL, aux_var_names = NULL, replicates = 500, - tau = "auto", + tau = 1, exact_vcov = FALSE, psd_option = "warn", mse = 
getOption("survey.replicates.mse"), @@ -64,13 +64,16 @@ A vector of the names of auxiliary variables used in sampling.} \item{replicates}{Number of bootstrap replicates (should be as large as possible, given computer memory/storage limitations). A commonly-recommended default is 500.} -\item{tau}{Either \code{"auto"}, or a single number. This is the rescaling constant +\item{tau}{Either \code{"auto"}, or a single number; the default value is 1. +This is the rescaling constant used to avoid negative weights through the transformation \eqn{\frac{w + \tau - 1}{\tau}}, where \eqn{w} is the original weight and \eqn{\tau} is the rescaling constant \code{tau}. \cr If \code{tau="auto"}, the rescaling factor is determined automatically as follows: if all of the adjustment factors are nonnegative, then \code{tau} is set equal to 1; otherwise, \code{tau} is set to the smallest value needed to rescale -the adjustment factors such that they are all at least \code{0.01}.} +the adjustment factors such that they are all at least \code{0.01}. +Instead of using \code{tau="auto"}, the user can instead use the function +\code{rescale_reps()} to rescale the replicates later.} \item{exact_vcov}{If \code{exact_vcov=TRUE}, the replicate factors will be generated such that variance estimates for totals exactly match the results from the target variance estimator. @@ -174,6 +177,8 @@ If the adjustment factors are rescaled in this manner, it is important to adjust \textbf{After rescaling: } v_B\left(\hat{T}_y\right) = \frac{\tau^2}{B}\sum_{b=1}^B\left(\hat{T}_y^{S*(b)}-\hat{T}_y\right)^2 } When sharing a dataset that uses rescaled weights from a generalized survey bootstrap, the documentation for the dataset should instruct the user to use replication scale factor \eqn{\frac{\tau^2}{B}} rather than \eqn{\frac{1}{B}} when estimating sampling variances. 
+This rescaling method does not affect variance estimates for linear statistics, +but its impact on non-smooth statistics such as quantiles is unclear. } \section{Two-Phase Designs}{ diff --git a/man/make_gen_boot_factors.Rd b/man/make_gen_boot_factors.Rd index 80ec215..e2fe3eb 100644 --- a/man/make_gen_boot_factors.Rd +++ b/man/make_gen_boot_factors.Rd @@ -4,7 +4,7 @@ \alias{make_gen_boot_factors} \title{Creates replicate factors for the generalized survey bootstrap} \usage{ -make_gen_boot_factors(Sigma, num_replicates, tau = "auto", exact_vcov = FALSE) +make_gen_boot_factors(Sigma, num_replicates, tau = 1, exact_vcov = FALSE) } \arguments{ \item{Sigma}{The matrix of the quadratic form used to represent the variance estimator. @@ -12,13 +12,16 @@ Must be positive semidefinite.} \item{num_replicates}{The number of bootstrap replicates to create.} -\item{tau}{Either \code{"auto"}, or a single number. This is the rescaling constant +\item{tau}{Either \code{"auto"}, or a single number; the default value is 1. +This is the rescaling constant used to avoid negative weights through the transformation \eqn{\frac{w + \tau - 1}{\tau}}, where \eqn{w} is the original weight and \eqn{\tau} is the rescaling constant \code{tau}. \cr If \code{tau="auto"}, the rescaling factor is determined automatically as follows: if all of the adjustment factors are nonnegative, then \code{tau} is set equal to 1; otherwise, \code{tau} is set to the smallest value needed to rescale -the adjustment factors such that they are all at least \code{0.01}.} +the adjustment factors such that they are all at least \code{0.01}. 
+Instead of using \code{tau="auto"}, the user can instead use the function +\code{rescale_reps()} to rescale the replicates later.} \item{exact_vcov}{If \code{exact_vcov=TRUE}, the replicate factors will be generated such that their variance-covariance matrix exactly matches the target variance estimator's diff --git a/vignettes/bootstrap-replicates.Rmd b/vignettes/bootstrap-replicates.Rmd index cc53978..d57565d 100644 --- a/vignettes/bootstrap-replicates.Rmd +++ b/vignettes/bootstrap-replicates.Rmd @@ -339,7 +339,7 @@ $$ \end{aligned} $$ -When sharing a dataset that uses rescaled weights from a generalized survey bootstrap, the documentation for the dataset should instruct the user to use replication scale factor $\frac{\tau^2}{B}$ rather than $\frac{1}{B}$ when estimating sampling variances. +When sharing a dataset that uses rescaled weights from a generalized survey bootstrap, the documentation for the dataset should instruct the user to use replication scale factor $\frac{\tau^2}{B}$ rather than $\frac{1}{B}$ when estimating sampling variances. This rescaling method does not affect variance estimates for linear statistics such as totals, but its effect on non-smooth statistics such as quantiles is unclear. ## Implementation