From 5d16f4f719e2398331658466bce2d0ba3fabe653 Mon Sep 17 00:00:00 2001 From: BERENZ Date: Mon, 27 Jan 2025 22:29:03 +0100 Subject: [PATCH] more informative error messages --- NAMESPACE | 9 ----- NEWS.md | 5 +++ R/boot_mi.R | 6 ++-- R/cloglogModel.R | 22 ++++++------ R/control_inference.R | 4 +-- R/control_selection.R | 6 ++-- R/data_manip.R | 2 +- R/generateData.r | 78 ------------------------------------------- R/internals.R | 6 ++-- R/logitModel.R | 24 ++++++------- R/nonprob.R | 34 +++++++++---------- R/nonprobDR.R | 8 ++--- R/nonprobIPW.R | 8 ++--- R/nonprobMI.R | 6 ++-- R/probitModel.R | 22 ++++++------ R/simple_methods.R | 42 +++++++++++------------ man/control_sel.Rd | 4 +-- man/genSimData.Rd | 47 -------------------------- 18 files changed, 102 insertions(+), 231 deletions(-) delete mode 100644 R/generateData.r delete mode 100644 man/genSimData.Rd diff --git a/NAMESPACE b/NAMESPACE index 42f8292..b447a71 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,7 +21,6 @@ export(cloglog_model_nonprobsvy) export(control_inf) export(control_out) export(control_sel) -export(genSimData) export(logit_model_nonprobsvy) export(nonprob) export(pop_size) @@ -53,7 +52,6 @@ importFrom(stats,coef) importFrom(stats,confint) importFrom(stats,contrasts) importFrom(stats,cooks.distance) -importFrom(stats,cor) importFrom(stats,cov) importFrom(stats,delete.response) importFrom(stats,deviance) @@ -61,7 +59,6 @@ importFrom(stats,dnorm) importFrom(stats,get_all_vars) importFrom(stats,glm.fit) importFrom(stats,hatvalues) -importFrom(stats,lm.fit) importFrom(stats,loess) importFrom(stats,loess.control) importFrom(stats,logLik) @@ -77,18 +74,12 @@ importFrom(stats,printCoefmat) importFrom(stats,pt) importFrom(stats,qlogis) importFrom(stats,qnorm) -importFrom(stats,rbinom) -importFrom(stats,rchisq) importFrom(stats,reformulate) importFrom(stats,residuals) -importFrom(stats,rexp) -importFrom(stats,rnorm) -importFrom(stats,runif) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,summary.glm) importFrom(stats,terms) -importFrom(stats,uniroot) importFrom(stats,update) importFrom(stats,var) importFrom(stats,vcov) diff --git a/NEWS.md b/NEWS.md index 3cc785f..143aeab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ ### Breaking changes - functions `pop.size`, `controlSel`, `controlOut` and `controlInf` were renamed to `pop_size`, `control_sel`, `control_out` and `control_inf` respectively. +- the `genSimData` function was removed completely as it was not used anywhere in the package. +- the `maxLik_method` argument was renamed to `maxlik_method` in the `control_sel` function. ### Features @@ -15,6 +17,9 @@ ### Bugfixes - basic methods and functions related to variance estimation, weights and probability linking methods have been rewritten in a more optimal and readable way. +### Other +- more informative error messages have been added. + ### Documentation - annotation has been added that arguments such as `strata`, `subset` and `na_action` are not supported for the time being. 
diff --git a/R/boot_mi.R b/R/boot_mi.R index 1fbd233..2a35170 100644 --- a/R/boot_mi.R +++ b/R/boot_mi.R @@ -57,7 +57,7 @@ bootMI <- function(X_rand, if (class(svydesign)[1] != "pps") { rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights } else { - stop("pps bootstrap variance in development") + stop("The pps bootstrap variance estimator is under development.") } if (method == "glm") { while (k <= num_boot) { @@ -444,7 +444,7 @@ bootMI_multicore <- function(X_rand, family_nonprobsvy <- family_nonprobsvy() } - if (verbose) message("Multicores bootstrap in progress..") + if (verbose) message("Multicore bootstrap in progress...") cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) @@ -460,7 +460,7 @@ bootMI_multicore <- function(X_rand, if (class(svydesign)[1] != "pps") { rep_weights <- survey::as.svrepdesign(svydesign, type = rep_type, replicates = num_boot)$repweights$weights } else { - stop("pps bootstrap variance in development") + stop("The pps bootstrap variance estimator is under development.") } if (method == "glm") { k <- 1:num_boot diff --git a/R/cloglogModel.R b/R/cloglogModel.R index b7b2aa3..06efcf5 100644 --- a/R/cloglogModel.R +++ b/R/cloglogModel.R @@ -125,12 +125,12 @@ cloglog_model_nonprobsvy <- function(...) { if (maxLik_an$code %in% c(3:7, 100)) { switch(as.character(maxLik_an$code), - "3" = warning("Warning in fitting selection model with maxLik: probably not converged."), - "4" = warning("Maxiteration limit reached in fitting selection model by maxLik."), - "5" = stop("Inifinite value of log_like in fitting selection model by maxLik, error code 5"), - "6" = stop("Inifinite value of gradient in fitting selection model by maxLik, error code 6"), - "7" = stop("Inifinite value of hessian in fitting selection model by maxLik, error code 7"), - "100" = stop("Error in fitting selection model with maxLik, error code 100:: Bad start."), + "3" = warning("Warning in fitting selection model with the `maxLik` package: probably not converged."), + "4" = warning("Max iteration limit reached in fitting selection model by the `maxLik` package."), + "5" = stop("Infinite value of log_like in fitting selection model by the `maxLik` package, error code 5."), + "6" = stop("Infinite value of gradient in fitting selection model by the `maxLik` package, error code 6."), + "7" = stop("Infinite value of hessian in fitting selection model by the `maxLik` package, error code 7."), + "100" = stop("Error in fitting selection model with the `maxLik` package, error code 100: Bad start."), ) } @@ -153,10 +153,10 @@ cloglog_model_nonprobsvy <- function(...) 
{ ) if (maxLik_an$convergence %in% c(1, 10, 51, 52)) { switch(as.character(maxLik_an$convergence), - "1" = warning("Warning in fitting selection model with optim: the iteration limit maxit had been reached."), - "10" = warning("Degeneracy of the Nelder Mead simplex in fitting selection model by optim."), # TODO - - "51" = warning("Warning from the L BFGS B when fitting by optim."), # TODO - - "52" = stop("Indicates an error from the L-BFGS-B method when fitting by optim.") + "1" = warning("Warning in fitting selection model with the `optim` function: the iteration limit maxit had been reached."), + "10" = warning("Degeneracy of the Nelder-Mead simplex in fitting selection model by the `optim` function."), # TODO - + "51" = warning("Warning from the L-BFGS-B method when fitting by the `optim` function."), # TODO - + "52" = stop("Indicates an error from the L-BFGS-B method when fitting by the `optim` function.") ) } theta <- maxLik_an$par @@ -164,7 +164,7 @@ cloglog_model_nonprobsvy <- function(...) { grad <- gradient(theta) hess <- hessian(theta) } else { - stop("Provided invalid optimizer.") + stop("Provide a valid optimizer (`optim` or `maxLik`).") } list( diff --git a/R/control_inference.R b/R/control_inference.R index 165dad0..2178fe6 100644 --- a/R/control_inference.R +++ b/R/control_inference.R @@ -68,13 +68,13 @@ control_inf <- function(vars_selection = FALSE, TRUE } else { if (!is.logical(keep_boot)) { - stop("keep_boot argument for controlInf must be logical") + stop("The `keep_boot` argument for the `control_inf` function must be logical.") } else { keep_boot } }, nn_exact_se = if (!is.logical(nn_exact_se) & length(nn_exact_se) == 1) { - stop("Argument nn_exact_se must be a logical scalar") + stop("The `nn_exact_se` argument must be a logical scalar.") } else { nn_exact_se }, diff --git a/R/control_selection.R b/R/control_selection.R index be3ca16..3a64186 100644 --- a/R/control_selection.R +++ b/R/control_selection.R @@ -10,7 +10,7 @@ #' @param trace logical value. If `TRUE` trace steps of the fitting algorithms. Default is `FALSE` #' @param optimizer - optimization function for maximum likelihood estimation. #' @param optim_method maximisation method that will be passed to [stats::optim()] function. Default is `BFGS`. -#' @param maxLik_method maximisation method that will be passed to [maxLik::maxLik()] function. Default is `NR`. +#' @param maxlik_method maximisation method that will be passed to [maxLik::maxLik()] function. Default is `NR`. #' @param dependence logical value - `TRUE` if samples are dependent. #' @param key binary key variable #' @param est_method_sel Method of estimation for propensity score model. 
@@ -51,7 +51,7 @@ control_sel <- function(method = "glm.fit", # perhaps another control function f maxit = 500, trace = FALSE, optimizer = c("maxLik", "optim"), - maxLik_method = "NR", + maxlik_method = "NR", optim_method = "BFGS", dependence = FALSE, key = NULL, @@ -77,7 +77,7 @@ control_sel <- function(method = "glm.fit", # perhaps another control function f maxit = maxit, trace = trace, optimizer = if (missing(optimizer)) "optim" else optimizer, - maxLik_method = maxLik_method, + maxlik_method = maxlik_method, optim_method = optim_method, dependence = dependence, key = key, diff --git a/R/data_manip.R b/R/data_manip.R index 1c14761..2b1122f 100644 --- a/R/data_manip.R +++ b/R/data_manip.R @@ -52,7 +52,7 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot # X_rand <- model.matrix(formula, svydesign$variables[, nons_names])# matrix of probability sample with intercept # } } else { - stop("Variable names in data and svydesign do not match") + stop("The names of the variables in the data and in svydesign do not match.") } list( diff --git a/R/generateData.r b/R/generateData.r deleted file mode 100644 index 951be3a..0000000 --- a/R/generateData.r +++ /dev/null @@ -1,78 +0,0 @@ -#' @import mathjaxr -NULL -#' @title Simulation data -#' @author Łukasz Chrostowski, Maciej Beręsewicz -# -#' @description Generate simulated data according to Chen, Li & Wu (2020), section 5. -#' -#' \loadmathjax -# -#' @param N \code{integer}, population size, default 10000 -#' @param n \code{integer}, big data sample, default 1000 -#' @return \code{genSimData} returns a data.frame, with the following columns: -#' \itemize{ -#' \item{x0 -- intercept} -#' \item{x1 -- the first variable based on z1} -#' \item{x2 -- the second variable based on z2 and x1} -#' \item{x3 -- the third variable based on z3 and x1 and x2} -#' \item{x4 -- the third variable based on z4 and x1, x2 and x3} -#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.30} -#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.60} -#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.80} -#' \item{rho -- true propensity scores for big data such that sum(rho)=n} -#' \item{srs -- probabilities of inclusion to random sample such that max(srs)/min(srs)=50} -#' } -#' -#' @references Chen, Y., Li, P., & Wu, C. (2020). Doubly Robust Inference With Nonprobability Survey Samples. Journal of the American Statistical Association, 115(532), 2011–2021. 
doi:10.1080/01621459.2019.1677241 -#' -#' @examples -#' ## generate data with N=20000 and n=2000 -#' genSimData(N = 20000, n = 2000) -#' -#' ## generate data when big data is almost as N -#' genSimData(N = 10000, n = 9000) -#' -#' @importFrom stats cor lm.fit rbinom rchisq rexp rnorm runif uniroot -#' -#' @export -genSimData <- function(N = 10000, n = 1000) { - find_sigma_regsim <- function(sigma, rho) { - y <- 2 + x1 + x2 + x3 + x4 + sigma * epsilon - model_fit <- lm.fit(x = cbind(1, x1, x2, x3, x4), y = y) - res <- cor(y, model_fit$fitted.values) - rho - res - } - - find_theta_logsim <- function(theta, n) { - eta <- theta + 0.1 * x1 + 0.2 * x2 + 0.1 * x3 + 0.2 * x4 - rho <- exp(eta) / (1 + exp(eta)) - res <- sum(rho) - n - res - } - - z1 <- rbinom(N, 1, 0.5) - z2 <- runif(N, 0, 2) - z3 <- rexp(N) - z4 <- rchisq(N, 4) - epsilon <- rnorm(N) - - x0 <- rep(1, N) - x1 <- z1 - x2 <- z2 + 0.3 * x1 - x3 <- z3 + 0.2 * (x1 + x2) - x4 <- z4 + 0.1 * (x1 + x2 + x3) - sigma30 <- uniroot(find_sigma_regsim, lower = 0, upper = 30, rho = 0.3)$root - sigma60 <- uniroot(find_sigma_regsim, lower = 0, upper = 30, rho = 0.6)$root - sigma80 <- uniroot(find_sigma_regsim, lower = 0, upper = 30, rho = 0.8)$root - y30 <- 2 + x1 + x2 + x3 + x4 + sigma30 * epsilon - y60 <- 2 + x1 + x2 + x3 + x4 + sigma60 * epsilon - y80 <- 2 + x1 + x2 + x3 + x4 + sigma80 * epsilon - theta <- uniroot(find_theta_logsim, lower = -100, upper = 100, n = n)$root - rho <- exp(theta + 0.1 * x1 + 0.2 * x2 + 0.1 * x3 + 0.2 * x4) / (1 + exp(theta + 0.1 * x1 + 0.2 * x2 + 0.1 * x3 + 0.2 * x4)) - - p <- uniroot(f = function(x) max(x3 + x) - 50 * min(x3 + x), lower = -200, upper = 100)$root - - sim_df <- data.frame(x0, x1, x2, x3, x4, y30, y60, y80, rho, srs = (x3 + p) / 10) - - return(sim_df) -} diff --git a/R/internals.R b/R/internals.R index 22d1581..8d50f64 100644 --- a/R/internals.R +++ b/R/internals.R @@ -155,7 +155,7 @@ ff <- function(formula) { formula_string <- paste(deparse(formula), collapse = " ") formula_parts <- strsplit(formula_string, "~")[[1]] if (length(formula_parts) != 2) { - stop("The formula must contain exactly one '~' operator.") + stop("The `formula` must contain exactly one '~' operator.") } lhs <- trimws(formula_parts[1]) @@ -165,7 +165,7 @@ ff <- function(formula) { independent_vars <- strsplit(rhs, "\\s*\\+\\s*")[[1]] if (any(duplicated(dependent_vars))) { - warning("Duplicate dependent variable names detected. They have been made unique.") + warning("Duplicate dependent variable names have been detected. They have been made unique.") dependent_vars <- unique(dependent_vars) } outcome_formulas <- vector("list", length(dependent_vars)) @@ -261,7 +261,7 @@ process_family <- function(family_spec) { } else if (inherits(family_spec, "family")) { family <- family_spec } else { - stop("Invalid family specification") + stop("Invalid family specification.") } return(family) } diff --git a/R/logitModel.R b/R/logitModel.R index a1ea3d3..8a1ec70 100644 --- a/R/logitModel.R +++ b/R/logitModel.R @@ -97,7 +97,7 @@ logit_model_nonprobsvy <- function(...) { logLik = log_like, grad = gradient, hess = hessian, - method = control$maxLik_method, + method = control$maxlik_method, start = start, printLevel = control$print_level ) @@ -108,12 +108,12 @@ logit_model_nonprobsvy <- function(...) 
{ log_likelihood <- log_like(theta) if (maxLik_an$code %in% c(3:7, 100)) { switch(as.character(maxLik_an$code), - "3" = warning("Warning in fitting selection model with maxLik: probably not converged."), - "4" = warning("Max iteration limit reached in fitting selection model by maxLik."), - "5" = stop("Inifinite value of log_like in fitting selection model by maxLik, error code 5"), - "6" = stop("Inifinite value of gradient in fitting selection model by maxLik, error code 6"), - "7" = stop("Inifinite value of hessian in fitting selection model by maxLik, error code 7"), - "100" = stop("Error in fitting selection model with maxLik, error code 100:: Bad start."), + "3" = warning("Warning in fitting selection model with the `maxLik` package: probably not converged."), + "4" = warning("Max iteration limit reached in fitting selection model by the `maxLik` package."), + "5" = stop("Infinite value of log_like in fitting selection model by the `maxLik` package, error code 5."), + "6" = stop("Infinite value of gradient in fitting selection model by the `maxLik` package, error code 6."), + "7" = stop("Infinite value of hessian in fitting selection model by the `maxLik` package, error code 7."), + "100" = stop("Error in fitting selection model with the `maxLik` package, error code 100: Bad start."), ) } } else if (control$optimizer == "optim") { # TODO add optimParallel for high-dimensional data @@ -132,10 +132,10 @@ logit_model_nonprobsvy <- function(...) { ) if (maxLik_an$convergence %in% c(1, 10, 51, 52)) { switch(as.character(maxLik_an$convergence), - "1" = warning("Warning in fitting selection model with optim: the iteration limit maxit had been reached."), - "10" = warning("degeneracy of the Nelder Mead simplex in fitting selection model by optim."), # TODO - - "51" = warning("Warning from the L BFGS B when fitting by optim."), # TODO - - "52" = stop("Indicates an error from the L BFGS B method when fitting by optim.") + "1" = warning("Warning in fitting selection model with the `optim` function: the iteration limit maxit had been reached."), + "10" = warning("Degeneracy of the Nelder-Mead simplex in fitting selection model by the `optim` function."), # TODO - + "51" = warning("Warning from the L-BFGS-B method when fitting by the `optim` function."), # TODO - + "52" = stop("Indicates an error from the L-BFGS-B method when fitting by the `optim` function.") ) } @@ -144,7 +144,7 @@ logit_model_nonprobsvy <- function(...) { grad <- gradient(theta) hess <- hessian(theta) } else { - stop("Provided invalid optimizer.") + stop("Provide a valid optimizer (`optim` or `maxLik`).") } list( diff --git a/R/nonprob.R b/R/nonprob.R index 60309d6..0fae7b9 100644 --- a/R/nonprob.R +++ b/R/nonprob.R @@ -36,43 +36,43 @@ nonprob <- function(data, if (missing(method_selection)) method_selection <- "logit" if (missing(family_outcome)) family_outcome <- "gaussian" if (missing(method_outcome)) method_outcome <- "glm" - if (!(method_outcome %in% c("glm", "nn", "pmm"))) stop("Invalid method for outcome variable.") + if (!(method_outcome %in% c("glm", "nn", "pmm"))) stop("Invalid method for the `outcome` variable.") if (!is.null(svydesign)) { if ("svyrep.design" %in% class(svydesign)) stop("We do not currently support the `svyrep.design` class. Provide the survey data in the `survey.design2` class.") if ("pps" %in% class(svydesign)) stop("The `as.svrepdesign` function does not allow `pps` designs. 
For more details, see the `survey` package.") } if (!is.null(pop_totals)) { - if (!is.vector(pop_totals)) stop("pop_totals must be a vector.") + if (!is.vector(pop_totals)) stop("The `pop_totals` argument must be a vector.") } if (!is.null(pop_means)) { - if (!is.vector(pop_means)) stop("pop_means must be a vector.") + if (!is.vector(pop_means)) stop("The `pop_means` argument must be a vector.") } - if (!(method_selection %in% c("logit", "cloglog", "probit"))) stop("Invalid method for selection formula.") - if (!(family_outcome %in% c("gaussian", "binomial", "poisson"))) stop("Invalid family for outcome formula.") + if (!(method_selection %in% c("logit", "cloglog", "probit"))) stop("Invalid method for the `selection` formula.") + if (!(family_outcome %in% c("gaussian", "binomial", "poisson"))) stop("Invalid family for the `outcome` formula.") if (!is.null(control_selection$key)) { if (!(control_selection$key %in% colnames(data)) || !(control_selection$key %in% colnames(svydesign$variables))) { - stop("Key variable for overlapping units must be defined with this same name in prob and nonprob sample.") + stop("Key variable for overlapping units must be defined with the same name in both the probability and nonprobability samples.") } } ## basic checkers if (is.null(selection) & is.null(outcome)) { - stop("Please provide selection or outcome formula.") + stop("Please provide the `selection` or `outcome` argument.") } # Check formula inputs if (!is.null(selection) && !inherits(selection, "formula")) { - stop("'selection' must be a formula") + stop("The `selection` argument must be a formula.") } if (!is.null(outcome) && !inherits(outcome, "formula")) { - stop("'outcome' must be a formula") + stop("The `outcome` argument must be a formula.") } if (!is.null(target) && !inherits(target, "formula")) { - stop("'target' must be a formula") + stop("The `target` argument must be a formula.") } if (inherits(selection, "formula") && (is.null(outcome) || inherits(outcome, "formula") == FALSE)) { - if (inherits(target, "formula") == FALSE) stop("Please provide target variable") + if (inherits(target, "formula") == FALSE) stop("Please provide the `target` argument.") model_used <- "P" } @@ -86,27 +86,27 @@ nonprob <- function(data, # Check numeric inputs if (!is.null(pop_size) && !is.numeric(pop_size)) { - stop("'pop_size' must be numeric") + stop("The `pop_size` argument must be a numeric scalar.") } if (!is.null(weights) && !is.numeric(weights)) { - stop("'weights' must be numeric") + stop("The `weights` argument must be a numeric vector.") } # Check weights length if (!is.null(weights) && length(weights) != nrow(data)) { - stop("Length of weights must match number of rows in data") + stop("Length of the `weights` argument must match the number of rows in `data`.") } if (!is.null(pop_totals) && !is.null(pop_means)) { - stop("Cannot specify both pop_totals and pop_means") + stop("Specify either the `pop_totals` or the `pop_means` argument, not both.") } if (!is.null(pop_size)) { if (pop_size <= 0) { - stop("pop_size must be positive") + stop("The `pop_size` argument must be positive.") } if (pop_size < nrow(data)) { - stop("pop_size cannot be smaller than sample size") + stop("The `pop_size` argument cannot be smaller than the sample size.") } } diff --git a/R/nonprobDR.R b/R/nonprobDR.R index c6f198f..67e6fac 100644 --- a/R/nonprobDR.R +++ b/R/nonprobDR.R @@ -177,7 +177,7 @@ nonprobDR <- function(selection, pop_totals <- c(pop_size, pop_size * pop_means) names(pop_totals) <- c("(Intercept)", names(pop_means)) } else { - stop("pop_size 
must be defined when estimating with pop_means.") + stop("The `pop_size` argument must be specified when the `pop_means` argument is provided.") } } @@ -232,7 +232,7 @@ nonprobDR <- function(selection, } ############ } else { - stop("Please, provide only one of svydesign object or pop_totals/pop_means.") + stop("Specify only one of the `svydesign`, `pop_totals` or `pop_means` arguments.") } for (k in 1:outcomes$l) { outcome <- outcomes$outcome[[k]] @@ -691,7 +691,7 @@ nonprobDR <- function(selection, mu_hat <- 1 / N_nons * sum((1 / ps_nons) * (weights * (OutcomeModel$y_nons - y_nons_pred))) + y_rand_pred } else { - stop("Please, provide only one of svydesign object or pop_totals/pop_means.") + stop("Specify only one of the `svydesign`, `pop_totals` or `pop_means` arguments.") } ys[[k]] <- as.numeric(y_nons) @@ -803,7 +803,7 @@ nonprobDR <- function(selection, stat[, k] <- boot_obj$stat # mu_hat <- boot_obj$mu } else { - stop("Invalid method for variance estimation.") + stop("Invalid `var_method` argument for variance estimation.") } SE <- sqrt(var) alpha <- control_inference$alpha diff --git a/R/nonprobIPW.R b/R/nonprobIPW.R index ff0f14e..c0b20c2 100644 --- a/R/nonprobIPW.R +++ b/R/nonprobIPW.R @@ -44,7 +44,7 @@ nonprobIPW <- function(selection, nfolds <- control_selection$nfolds eps <- control_selection$epsilon rep_type <- control_inference$rep_type - if (!(target[3] == "NULL()")) stop("ill-defined formula for the target") + if (!(target[3] == "NULL()")) stop("Ill-defined formula for the `target` argument.") # formula for outcome variable if target defined dependents <- paste(selection, collapse = " ") outcome <- outcome_init <- stats::as.formula(paste(target[2], dependents)) @@ -194,7 +194,7 @@ nonprobIPW <- function(selection, pop_totals <- c(pop_size, pop_size * pop_means) names(pop_totals) <- c("(Intercept)", names(pop_means)) } else { - stop("pop_size must be defined when estimating with pop_means.") + stop("The `pop_size` argument must be specified when the `pop_means` argument is provided.") } } @@ -360,7 +360,7 @@ nonprobIPW <- function(selection, ) ####################################### } else { - stop("Please, provide svydesign object or pop_totals/pop_means.") + stop("Specify only one of the `svydesign`, `pop_totals` or `pop_means` arguments.") } mu_hats <- numeric(length = outcomes$l) for (k in 1:outcomes$l) { @@ -494,7 +494,7 @@ nonprobIPW <- function(selection, SE_values[[k]] <- data.frame(t(data.frame("SE" = c(nonprob = NA, prob = NA)))) } } else { - stop("Invalid method for variance estimation.") + stop("Invalid `var_method` argument for variance estimation.") } SE <- sqrt(var) alpha <- control_inference$alpha diff --git a/R/nonprobMI.R b/R/nonprobMI.R index 45c83cc..4beaaf2 100644 --- a/R/nonprobMI.R +++ b/R/nonprobMI.R @@ -258,7 +258,7 @@ nonprobMI <- function(outcome, pop_totals <- c(pop_size, pop_size * pop_means) names(pop_totals) <- c("(Intercept)", names(pop_means)) } else { - stop("pop_size must be defined when estimating with pop_means.") + stop("The `pop_size` argument must be specified when the `pop_means` argument is provided.") } } Model <- model_frame(formula = outcome, data = data, pop_totals = pop_totals) @@ -330,7 +330,7 @@ nonprobMI <- function(outcome, OutcomeList[[k]]$model_frame <- Model$model_frame_rand mu_hat <- y_rand_pred } else { - stop("Please, provide svydesign object or pop_totals/pop_means.") + stop("Specify only one of the `svydesign`, `pop_totals` or `pop_means` arguments.") } if (isTRUE(attr(model_obj$model, "method") == 
"pmm") & !(control_inference$nn_exact_se)) { @@ -425,7 +425,7 @@ nonprobMI <- function(outcome, prob = NA )))) } else { - stop("Invalid method for variance estimation.") + stop("Invalid `var_method` for the variance estimation.") } SE <- sqrt(var) alpha <- control_inference$alpha diff --git a/R/probitModel.R b/R/probitModel.R index eae8435..3ebda1a 100644 --- a/R/probitModel.R +++ b/R/probitModel.R @@ -109,19 +109,19 @@ probit_model_nonprobsvy <- function(...) { logLik = log_like, grad = gradient, # fixed hess = hessian, # fixed - method = control$maxLik_method, + method = control$maxlik_method, start = start, printLevel = control$print_level ) # NA in gradient for Newton-Raphson method if (maxLik_an$code %in% c(3:7, 100)) { switch(as.character(maxLik_an$code), - "3" = warning("Warning in fitting selection model with maxLik: probably not converged."), - "4" = warning("Maxiteration limit reached in fitting selection model by maxLik."), - "5" = stop("Inifinite value of log_like in fitting selection model by maxLik, error code 5"), - "6" = stop("Inifinite value of gradient in fitting selection model by maxLik, error code 6"), - "7" = stop("Inifinite value of hessian in fitting selection model by maxLik, error code 7"), - "100" = stop("Error in fitting selection model with maxLik, error code 100:: Bad start."), + "3" = warning("Warning in fitting selection model with the `maxLik` package: probably not converged."), + "4" = warning("Maxiteration limit reached in fitting selection model by the `maxLik` package."), + "5" = stop("Infinite value of log_like in fitting selection model by the `maxLik` package, error code 5."), + "6" = stop("Infinite value of gradient in fitting selection model by the `maxLik` package, error code 6."), + "7" = stop("Infinite value of hessian in fitting selection model by the `maxLik` package, error code 7."), + "100" = stop("Error in fitting selection model with the `maxLik` package, error code 100: Bad start."), ) } @@ -144,10 +144,10 @@ probit_model_nonprobsvy <- function(...) 
{ ) if (maxLik_an$convergence %in% c(1, 10, 51, 52)) { switch(as.character(maxLik_an$convergence), - "1" = warning("Warning in fitting selection model with optim: the iteration limit maxit had been reached."), - "10" = warning("Degeneracy of the Nelder Mead simplex in fitting selection model by optim."), # TODO - - "51" = warning("Warning from the L BFGS B when fitting by optim."), # TODO - - "52" = stop("Indicates an error from the L-BFGS-B method when fitting by optim.") + "1" = warning("Warning in fitting selection model with the `optim` function: the iteration limit maxit had been reached."), + "10" = warning("Degeneracy of the Nelder-Mead simplex in fitting selection model by the `optim` function."), # TODO - + "51" = warning("Warning from the L-BFGS-B method when fitting by the `optim` function."), # TODO - + "52" = stop("Indicates an error from the L-BFGS-B method when fitting by the `optim` function.") ) } diff --git a/R/simple_methods.R b/R/simple_methods.R index 1dc902f..89bb7e8 100644 --- a/R/simple_methods.R +++ b/R/simple_methods.R @@ -326,39 +326,39 @@ deviance.nonprobsvy <- function(object, check_balance.nonprobsvy <- function(x, object, dig = 10) { # Input validation if (!inherits(x, "formula")) { - stop("'x' must be a formula") + stop("The `x` argument must be a formula.") } if (missing(object) || is.null(object)) { - stop("'object' is required") + stop("The `object` argument is required.") } if (!any(c("nonprobsvy_dr", "nonprobsvy_ipw") %in% class(object))) { - stop("No estimated weights available. Only IPW or DR methods are supported.") + stop("No estimated weights available. Only the IPW and DR methods are supported.") } if (!is.numeric(dig) || dig < 0) { - stop("'dig' must be a non-negative number") + stop("The `dig` argument must be a non-negative number.") } if (nrow(object$data) == 0) { - stop("Empty dataset") + stop("An empty dataset was detected (zero rows).") } if (sum(object$weights) == 0) { - stop("Sum of weights is zero") + stop("The sum of weights is zero.") } # Extract variables from formula vars <- all.vars(x) if (length(vars) == 0) { - stop("No variables specified in formula") + stop("No variables specified in the `x` formula.") } # Check if all variables exist in the data missing_vars <- setdiff(vars, names(object$data)) if (length(missing_vars) > 0) { - stop(sprintf("The following variables are not present in the dataset: %s", + stop(sprintf("The following variables are not present in the dataset: %s.", paste(missing_vars, collapse = ", "))) } @@ -366,7 +366,7 @@ check_balance.nonprobsvy <- function(x, object, dig = 10) { calculate_totals <- function(var, data) { # Check for NAs if (sum(is.na(data[[var]])) > 0) { - warning(sprintf("NA values found in variable %s", var)) + warning(sprintf("NA values found in variable %s.", var)) } # For categorical variables, handle each level @@ -375,7 +375,7 @@ check_balance.nonprobsvy <- function(x, object, dig = 10) { totals <- sapply(levels, function(lvl) { data_subset <- data[data[[var]] == lvl, ] if (nrow(data_subset) < 5) { - warning(sprintf("Small group size (< 5) for level %s in variable %s", lvl, var)) + warning(sprintf("Small group size (< 5) for level %s in variable %s.", lvl, var)) } sum(data_subset$weights) }) @@ -395,7 +395,7 @@ check_balance.nonprobsvy <- function(x, object, dig = 10) { nonprob_totals <- tryCatch({ unlist(lapply(vars, function(var) calculate_totals(var, data))) }, error = function(e) { - stop(sprintf("Error calculating nonprobability totals: %s", e$message)) + stop(sprintf("Error calculating 
nonprobability totals: %s.", e$message)) }) # Calculate probability totals @@ -407,13 +407,13 @@ check_balance.nonprobsvy <- function(x, object, dig = 10) { names(svy_totals) <- names(svy_total) svy_totals }, error = function(e) { - stop(sprintf("Error calculating survey totals: %s", e$message)) + stop(sprintf("Error calculating survey totals: %s.", e$message)) }) } else { # Use population totals prob_totals <- object$selection$pop_totals if (is.null(prob_totals)) { - stop("No population totals available") + stop("No population totals (`pop_totals`) are available in the fitted object.") } } @@ -421,7 +421,7 @@ check_balance.nonprobsvy <- function(x, object, dig = 10) { diff <- tryCatch({ round(nonprob_totals - prob_totals[names(nonprob_totals)], digits = dig) }, error = function(e) { - stop(sprintf("Error calculating differences: %s", e$message)) + stop(sprintf("Error calculating differences: %s.", e$message)) }) # Return results with meaningful names @@ -455,10 +455,10 @@ nonprobsvytotal.nonprobsvy <- function(x, object, interaction = FALSE) { var <- lhs.vars(x) # Validate inputs if (!is.null(var)) { - stop("no dependend variable needed for this method, please remove it and try again") + stop("No dependent variable is needed for this method; please remove it and try again.") } if (nrow(object$data) == 0) { - stop("Empty dataset") + stop("An empty dataset was detected (zero rows).") } class_nonprob <- class(object)[2] if (!class_nonprob %in% c("nonprobsvy_ipw", "nonprobsvy_dr", "nonprobsvy_mi")) { @@ -570,7 +570,7 @@ nonprobsvymean.nonprobsvy <- function(x, object, interaction = FALSE) { } if (nrow(object$data) == 0) { - stop("Empty dataset") + stop("An empty dataset was detected (zero rows).") } class_nonprob <- class(object)[2] @@ -659,12 +659,12 @@ nonprobsvyby.nonprobsvy <- function(y, by, object, FUN) { } if (nrow(object$data) == 0) { - stop("Empty dataset") + stop("An empty dataset was detected (zero rows).") } class_nonprob <- class(object)[2] if (!class_nonprob %in% c("nonprobsvy_ipw", "nonprobsvy_dr", "nonprobsvy_mi")) { - stop("Invalid nonprob object class") + stop("Invalid `nonprob` object class.") } variables <- rhs.vars(y) @@ -673,13 +673,13 @@ nonprobsvyby.nonprobsvy <- function(y, by, object, FUN) { # Validate inputs missing_vars <- setdiff(groups, names(object$data)) if (length(missing_vars) > 0) { - stop(sprintf("The following variables are not present in the dataset: %s", + stop(sprintf("The following variables are not present in the dataset: %s.", paste(missing_vars, collapse = ", "))) } if (!is.null(variables) && !all(variables %in% names(object$data))) { missing_dep_vars <- setdiff(variables, names(object$y)) - stop(sprintf("The following dependent variables are not present: %s", + stop(sprintf("The following dependent variables are not present: %s.", paste(missing_dep_vars, collapse = ", "))) } diff --git a/man/control_sel.Rd b/man/control_sel.Rd index 8de7095..bba60ec 100644 --- a/man/control_sel.Rd +++ b/man/control_sel.Rd @@ -10,7 +10,7 @@ control_sel( maxit = 500, trace = FALSE, optimizer = c("maxLik", "optim"), - maxLik_method = "NR", + maxlik_method = "NR", optim_method = "BFGS", dependence = FALSE, key = NULL, @@ -43,7 +43,7 @@ control_sel( \item optimization function for maximum likelihood estimation. }} -\item{maxLik_method}{maximisation method that will be passed to \code{\link[maxLik:maxLik]{maxLik::maxLik()}} function. Default is \code{NR}.} +\item{maxlik_method}{maximisation method that will be passed to \code{\link[maxLik:maxLik]{maxLik::maxLik()}} function. 
Default is \code{NR}.} \item{optim_method}{maximisation method that will be passed to \code{\link[stats:optim]{stats::optim()}} function. Default is \code{BFGS}.} diff --git a/man/genSimData.Rd b/man/genSimData.Rd deleted file mode 100644 index 337a7e1..0000000 --- a/man/genSimData.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/generateData.r -\name{genSimData} -\alias{genSimData} -\title{Simulation data} -\usage{ -genSimData(N = 10000, n = 1000) -} -\arguments{ -\item{N}{\code{integer}, population size, default 10000} - -\item{n}{\code{integer}, big data sample, default 1000} -} -\value{ -\code{genSimData} returns a data.frame, with the following columns: -\itemize{ -\item{x0 -- intercept} -\item{x1 -- the first variable based on z1} -\item{x2 -- the second variable based on z2 and x1} -\item{x3 -- the third variable based on z3 and x1 and x2} -\item{x4 -- the third variable based on z4 and x1, x2 and x3} -\item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.30} -\item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.60} -\item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.80} -\item{rho -- true propensity scores for big data such that sum(rho)=n} -\item{srs -- probabilities of inclusion to random sample such that max(srs)/min(srs)=50} -} -} -\description{ -Generate simulated data according to Chen, Li & Wu (2020), section 5. - -\loadmathjax -} -\examples{ -## generate data with N=20000 and n=2000 -genSimData(N = 20000, n = 2000) - -## generate data when big data is almost as N -genSimData(N = 10000, n = 9000) - -} -\references{ -Chen, Y., Li, P., & Wu, C. (2020). Doubly Robust Inference With Nonprobability Survey Samples. Journal of the American Statistical Association, 115(532), 2011–2021. doi:10.1080/01621459.2019.1677241 -} -\author{ -Łukasz Chrostowski, Maciej Beręsewicz -}
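Note: the snippet below is a minimal, hypothetical usage sketch of the renamed interface described in the NEWS.md hunk above. The names `nonprob`, `control_sel`, `control_inf` and the `maxlik_method` argument come from this patch; the data set `nonprob_sample`, the design object `prob_design`, the formulas, and the `control_selection`/`control_inference` argument names of `nonprob()` are illustrative assumptions rather than content shown in the patch.

library(nonprobsvy)
library(survey)

# control objects built with the renamed constructors; `maxlik_method`
# replaces the old `maxLik_method` spelling
sel_ctrl <- control_sel(maxlik_method = "NR", optim_method = "BFGS")
inf_ctrl <- control_inf(vars_selection = FALSE)

# hypothetical call combining a nonprobability sample (`nonprob_sample`)
# with a probability sample supplied as a `survey.design2` object
est <- nonprob(
  selection = ~ x1 + x2,
  outcome = y ~ x1 + x2,
  data = nonprob_sample,
  svydesign = prob_design,
  method_selection = "logit",
  method_outcome = "glm",
  family_outcome = "gaussian",
  control_selection = sel_ctrl,
  control_inference = inf_ctrl
)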