ncn-foreigners · LukaszChrostowski · Nov 12, 2024 · Apr 3, 2024 · Apr 10, 2024 · May 23, 2024
diff --git a/.Rapp.history b/.Rapp.history
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master, package-development]
+    branches: [main, master, dev]
   pull_request:
-    branches: [main, master, package-development]
+    branches: [main, master, dev]
 
 name: R-CMD-check
 

diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main, master, dev]
   pull_request:
-    branches: [main, master]
+    branches: [main, master, dev]
   release:
     types: [published]
   workflow_dispatch:

diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main, master, dev]
   pull_request:
-    branches: [main, master]
+    branches: [main, master, dev]
 
 name: test-coverage
 
@@ -13,6 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      TEST_NONPROBSVY_MULTICORE_DEVELOPER: "true"
 
     steps:
       - uses: actions/checkout@v3

diff --git a/NAMESPACE b/NAMESPACE
@@ -71,6 +71,7 @@ importFrom(stats,pt)
 importFrom(stats,qnorm)
 importFrom(stats,rbinom)
 importFrom(stats,rchisq)
+importFrom(stats,reformulate)
 importFrom(stats,residuals)
 importFrom(stats,rexp)
 importFrom(stats,rnorm)
@@ -86,6 +87,7 @@ importFrom(stats,weighted.mean)
 importFrom(survey,as.svrepdesign)
 importFrom(survey,svymean)
 importFrom(survey,svyrecvar)
+importFrom(survey,svytotal)
 importFrom(utils,setTxtProgressBar)
 importFrom(utils,txtProgressBar)
 useDynLib(nonprobsvy)

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,19 @@
-## version 0.1.0
+# nonprobsvy 0.1.1
+
+-   bugfixes
+    -   bug Fix occuring when estimation was based on auxiliary variable, which led to compression of the data from the frame to the vector.
+    -   bug Fix related to not passing `maxit` argument from `controlSel` function to internally used `nleqslv` function
+    -   bug Fix related to storing `vector` in `model_frame` when predicting `y_hat` in mass imputation `glm` model when X is based in one auxiliary variable only - fix provided converting it to `data.frame` object.
+-   features
+    -   add information to `summary` about quality of estimation basing on difference between estimated and known total values of auxiliary variables
+    -   add estimation of exact standard error for k-nearest neighbor estimator.
+    -   add breaking change to `controlOut` function by switching values for `predictive_match` argument. From now on, the `predictive_match = 1` means $\hat{y}-\hat{y}$ in predictive mean matching imputation and `predictive_match = 2` corresponds to $\hat{y}-y$ matching.
+    - implement `div` option when variable selection (more in documentation) for doubly robust estimation.
+    - add more insights to `nonprob` output such as gradient, hessian and jacobian derived from IPW estimation for `mle` and `gee` methods when `IPW` or `DR` model executed.
+    - add estimated inclusion probabilities and its derivatives for probability and non-probability samples to `nonprob` output when `IPW` or `DR` model executed.
+    - add `model_frame` matrix data from probability sample used for mass imputation to `nonprob` when `MI` or `DR` model executed.
+
+## nonprobsvy 0.1.0
 
 ------------------------------------------------------------------------
 

diff --git a/R/bias_correction_ipw.R b/R/bias_correction_ipw.R
@@ -1,5 +1,20 @@
 # bias correction
-mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, start_selection, start_outcome, boot = FALSE) {
+mm <- function(X,
+               y,
+               weights,
+               weights_rand,
+               R,
+               n_nons,
+               n_rand,
+               method_selection,
+               family,
+               start_selection,
+               start_outcome,
+               maxit,
+               nleqslv_method,
+               nleqslv_global,
+               nleqslv_xscalm,
+               boot = FALSE) {
   method_selection_function <- paste(method_selection, "_model_nonprobsvy", sep = "")
   method <- get_method(method_selection_function)
   inv_link <- method$make_link_inv
@@ -63,13 +78,16 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
   # par_sel <- multiroot$par
   ######### NLESQLV
   multiroot <- nleqslv::nleqslv(
-    x = par0, # TODO add user-specified parameters to control functions
+    x = par0,
     fn = u_theta_beta_dr,
-    method = "Newton", # TODO consider the method Broyden
-    global = "dbldog", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none")
-    xscalm = "auto", # c("fixed","auto")
+    method = nleqslv_method,
+    global = nleqslv_global,
+    xscalm = nleqslv_xscalm,
     jacobian = TRUE,
-    control = list(scalex = rep(1, length(par0))), # TODO algorithm did not converge in maxit iterations for cloglog
+    control = list(
+      scalex = rep(1, length(par0)),
+      maxit = maxit
+    ), # TODO algorithm did not converge in maxit iterations for cloglog
     R = R,
     X = X,
     y = y,

diff --git a/R/boot_mi.R b/R/boot_mi.R
@@ -46,10 +46,9 @@ bootMI <- function(X_rand,
 
 
   predictive_match <- control_outcome$predictive_match
-  pmm_exact_se <- control_inference$pmm_exact_se
+  nn_exact_se <- control_inference$nn_exact_se
   pmm_reg_engine <- control_outcome$pmm_reg_engine
   pi_ij <- control_inference$pi_ij
-  pmm_exact_se <- control_inference$pmm_exact_se
   comp2_stat <- numeric(length = num_boot)
 
   if (is.null(pop_totals)) {
@@ -399,7 +398,7 @@ bootMI <- function(X_rand,
   }
   # mu_hat_boot <- mean(mu_hats)
   if (method == "pmm") {
-    if (pmm_exact_se) {
+    if (nn_exact_se) {
       comp2 <- pmm_exact(pi_ij,
         weights_rand,
         n_nons = n_nons,

diff --git a/R/control_inference.R b/R/control_inference.R
@@ -17,9 +17,9 @@
 #' @param cores Number of cores in parallel computing.
 #' @param keep_boot Logical indicating whether statistics from bootstrap should be kept.
 #' By default set to \code{TRUE}
-#' @param pmm_exact_se Logical value indicating whether to compute the exact
-#' standard error estimate for \code{pmm} estimator. The variance estimator for
-#' estimation based on \code{pmm} can be decomposed into three parts, with the
+#' @param nn_exact_se Logical value indicating whether to compute the exact
+#' standard error estimate for \code{nn} or \code{pmm} estimator. The variance estimator for
+#' estimation based on \code{nn} or \code{pmm} can be decomposed into three parts, with the
 #' third being computed using covariance between imputed values for units in
 #' probability sample using predictive matches from non-probability sample.
 #' In most situations this term is negligible and is very computationally
@@ -51,7 +51,7 @@ controlInf <- function(vars_selection = FALSE,
                        alpha = 0.05,
                        cores = 1,
                        keep_boot,
-                       pmm_exact_se = FALSE,
+                       nn_exact_se = FALSE,
                        pi_ij) {
   list(
     vars_selection = if (missing(vars_selection)) FALSE else vars_selection,
@@ -71,10 +71,10 @@ controlInf <- function(vars_selection = FALSE,
         keep_boot
       }
     },
-    pmm_exact_se = if (!is.logical(pmm_exact_se) & length(pmm_exact_se) == 1) {
-      stop("Argument pmm_exact_se must be a logical scalar")
+    nn_exact_se = if (!is.logical(nn_exact_se) & length(nn_exact_se) == 1) {
+      stop("Argument nn_exact_se must be a logical scalar")
     } else {
-      pmm_exact_se
+      nn_exact_se
     },
     pi_ij = if (missing(pi_ij)) NULL else pi_ij
   )

diff --git a/R/control_outcome.R b/R/control_outcome.R
@@ -16,8 +16,8 @@
 #' @param predictive_match (Only for predictive mean matching)
 #' Indicates how to select 'closest' unit from nonprobability sample for each
 #' unit in probability sample. Either \code{1} (default) or \code{2} where
-#' \code{1} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for
-#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{2}
+#' \code{2} is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for
+#' \mjseqn{i \in S_{A}} and \mjseqn{y_{j}} for \mjseqn{j \in S_{B}} and \code{1}
 #' is matching by minimizing distance between \mjseqn{\hat{y}_{i}} for
 #' \mjseqn{i \in S_{A}} and \mjseqn{\hat{y}_{i}} for \mjseqn{i \in S_{A}}.
 #' @param pmm_weights (Only for predictive mean matching)

diff --git a/R/control_selection.R b/R/control_selection.R
@@ -36,6 +36,9 @@
 #' \item if \code{naive} then start consists of a vector which has the value of an estimated parameter for one-dimensional data (on intercept) and 0 for the rest.
 #' \item if \code{zero} then start is a vector of zeros.
 #' }
+#' @param nleqslv_method The method that will be passed to [nleqslv::nleqslv()] function.
+#' @param nleqslv_global The global strategy that will be passed to [nleqslv::nleqslv()] function.
+#' @param nleqslv_xscalm The type of x scaling that will be passed to [nleqslv::nleqslv()] function.
 #'
 #' @return List with selected parameters.
 #'
@@ -64,7 +67,13 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
                        nlambda = 50,
                        nfolds = 10,
                        print_level = 0,
-                       start_type = c("glm", "naive", "zero")) {
+                       start_type = c("glm", "naive", "zero"),
+                       nleqslv_method = c("Broyden", "Newton"),
+                       nleqslv_global = c(
+                         "dbldog", "pwldog",
+                         "cline", "qline", "gline", "hook", "none"
+                       ),
+                       nleqslv_xscalm = c("fixed", "auto")) {
   list(
     epsilon = epsilon,
     maxit = maxit,
@@ -84,6 +93,9 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
     nfolds = nfolds,
     lambda = lambda,
     print_level = print_level,
-    start_type = if (missing(start_type)) "naive" else start_type
+    start_type = if (missing(start_type)) "naive" else start_type,
+    nleqslv_method = if (missing(nleqslv_method)) "Newton" else nleqslv_method,
+    nleqslv_global = if (missing(nleqslv_global)) "dbldog" else nleqslv_global,
+    nleqslv_xscalm = if (missing(nleqslv_xscalm)) "auto" else nleqslv_xscalm
   )
 }
diff --git a/R/data_manip.R b/R/data_manip.R
@@ -5,8 +5,10 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot
     model_Frame <- model.frame(formula, data)
     y_nons <- model.response(model_Frame)
     outcome_name <- names(model_Frame)[1]
-    mt <- attr(model_Frame, "terms")
-    nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data)
+
+    mt <- terms(formula) # attr(model_Frame, "terms")
+    nons_names <- all.vars(as.formula(paste("~", paste(attr(mt, "term.labels"), collapse = " + "))))
+    # nons_names <- attr(mt, "term.labels") # colnames(get_all_vars(formula, data)) names of variables of nonprobability sample terms(formula, data = data)
     ##### Model frame for probability sample #####
     if (outcome_name %in% colnames(svydesign$variables)) {
       # design_to_frame <- svydesign$variables
@@ -19,16 +21,25 @@ model_frame <- function(formula, data, weights = NULL, svydesign = NULL, pop_tot
       design_to_frame <- svydesign$variables
       design_to_frame[, outcome_name][is.na(design_to_frame[, outcome_name])] <- 0 # replace NA in dependent outcome with 0
       names_rand <- all.vars(formula)
-      model_Frame_rand <- design_to_frame[, names_rand]
+      model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE]
 
-      nons_names_rand <- attr(attr(model.frame(formula, design_to_frame), "terms"), "term.labels")
+      nons_names_rand <- intersect(names_rand, colnames(design_to_frame))
     } else {
       design_to_frame <- svydesign$variables
-      names_rand <- all.vars(formula[-2])
-      model_Frame_rand <- design_to_frame[, names_rand]
+      ##
+      terms_object <- terms(formula)
+      # names_rand <- all.vars(terms_object)
+      names_rand <- all.vars(as.formula(paste("~", paste(attr(terms_object, "term.labels"), collapse = " + "))))
+      ##
+      # names_rand <- all.vars(formula[-2]) # old
+      ##
+
+      model_Frame_rand <- design_to_frame[, names_rand, drop = FALSE]
+      nons_names_rand <- intersect(names_rand, colnames(design_to_frame))
 
-      nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels")
+      # nons_names_rand <- attr(attr(model.frame(formula[-2], design_to_frame), "terms"), "term.labels") # old
 
+      # old
       # model_Frame_rand <- model.frame(formula[-2], svydesign$variables)
       # mt_rand <- attr(model_Frame_rand, "terms")
       # nons_names_rand <- attr(mt_rand, "term.labels")

diff --git a/R/gee_ipw.R b/R/gee_ipw.R
@@ -111,6 +111,9 @@ gee <- function(...) {
           h = h,
           method_selection = method_selection,
           maxit = maxit,
+          nleqslv_method = control_selection$nleqslv_method,
+          nleqslv_global = control_selection$nleqslv_global,
+          nleqslv_xscalm = control_selection$nleqslv_xscalm,
           start = 0
         )$theta_h)
         start <- c(start_h, rep(0, ncol(X) - 1))
@@ -127,6 +130,9 @@ gee <- function(...) {
       h = h,
       method_selection = method_selection,
       maxit = maxit,
+      nleqslv_method = control_selection$nleqslv_method,
+      nleqslv_global = control_selection$nleqslv_method,
+      nleqslv_xscalm = control_selection$nleqslv_method,
       start = start
     )
     theta_hat <- h_object$theta_h

diff --git a/R/glm.R b/R/glm.R
@@ -68,6 +68,15 @@ glm_nonprobsvy <- function(outcome,
     y_rand_pred <- family_nonprobsvy$linkinv(eta)
     y_nons_pred <- model$fitted.values
 
+    # artificial formula and data created for pmm_extact_se
+    predictors <- colnames(X_nons)[-1]
+    outcome_name <- names(model_frame)[1]
+    formula_str <- paste(outcome_name, "~", paste(predictors, collapse = " + "))
+    formula <- as.formula(formula_str)
+    model$formula <- formula
+    model$data <- as.data.frame(cbind(y_nons, X_nons[, -1, drop = FALSE]))
+    colnames(model$data) <- c(outcome_name, predictors)
+
     model_out <- list(
       glm = model,
       glm_summary = model_summ

diff --git a/R/internals.R b/R/internals.R
@@ -151,20 +151,29 @@ get_method <- function(method) {
 }
 
 ff <- function(formula) {
-  fff <- as.character(formula)
-  f <- strsplit(fff[2], "\\s*\\+\\s*")[[1]]
-  outcome_formulas <- list()
-  if (any(duplicated(f))) {
-    warning("No unique names of the outcome variables in formula. The error has been corrected")
-    f <- unique(f)
-  }
-  l <- length(f)
-  for (i in 1:l) {
-    outcome_formulas[[i]] <- as.formula(paste(f[i], fff[3], sep = " ~ "))
+  formula_string <- paste(deparse(formula), collapse = " ")
+  formula_parts <- strsplit(formula_string, "~")[[1]]
+  if (length(formula_parts) != 2) {
+    stop("The formula must contain exactly one '~' operator.")
   }
 
+  lhs <- trimws(formula_parts[1])
+  rhs <- trimws(formula_parts[2])
+
+  dependent_vars <- strsplit(lhs, "\\s*\\+\\s*")[[1]]
+  independent_vars <- strsplit(rhs, "\\s*\\+\\s*")[[1]]
+
+  if (any(duplicated(dependent_vars))) {
+    warning("Duplicate dependent variable names detected. They have been made unique.")
+    dependent_vars <- unique(dependent_vars)
+  }
+  outcome_formulas <- vector("list", length(dependent_vars))
+  l <- length(dependent_vars)
+  for (i in seq_along(dependent_vars)) {
+    outcome_formulas[[i]] <- reformulate(termlabels = independent_vars, response = dependent_vars[i])
+  }
   list(
-    f = f,
+    f = dependent_vars,
     outcomes = outcome_formulas,
     l = l
   )