Dev #26 (Merged)
5 commits merged on Nov 27, 2023
Changes from all commits
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -3,12 +3,12 @@ Type: Package
Title: Package for Inference Based on Nonprobability Samples
Version: 0.1.0
Authors@R:
c(person(given = "Lukasz",
c(person(given = "Łukasz",
family = "Chrostowski",
role = c("aut", "cre"),
email = "[email protected]"),
person(given = "Maciej",
family = "Beresewicz",
family = "Beręsewicz",
role = c("aut", "ctb"),
email = "[email protected]"),
person(given = "Piotr",
35 changes: 20 additions & 15 deletions R/EstimationMethods.R
@@ -135,7 +135,7 @@ mle <- function(...) {
ps_nons_der <- dinv_link(eta_nons)
est_ps_rand_der <- dinv_link(eta_rand)

resids <- c(est_ps_rand, ps_nons) - R
resids <- R - c(est_ps_rand, ps_nons)

variance <- (t(resids) %*% resids) / df_reduced
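Note on the sign convention above: flipping `resids` from `ps - R` to `R - ps` leaves this variance term unchanged, since the quadratic form `t(r) %*% r` is sign-invariant; the flip only matters downstream, where the raw residuals are used directly. A quick check with toy values:

```r
r <- c(0.2, -0.1, 0.4)                  # toy residuals
all.equal(crossprod(r), crossprod(-r))  # TRUE: t(r) %*% r == t(-r) %*% (-r)
```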

@@ -293,7 +293,7 @@ gee <- function(...) {
ps_nons <- inv_link(eta_nons)
est_ps_rand <- inv_link(eta_rand)
variance_covariance <- solve(-hess)
resids <- c(est_ps_rand, ps_nons) - R
resids <- R - c(est_ps_rand, ps_nons)

df_reduced <- nrow(X) - length(theta_hat)
variance <- as.vector((t(resids) %*% resids) / df_reduced)
@@ -382,15 +382,16 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
names(theta_hat) <- names(beta_hat) <- colnames(X)
df_residual <- nrow(X) - length(theta_hat)

# selection parameters
ps <- inv_link(theta_hat %*% t(X)) # inv_link(as.vector(X_design %*% as.matrix(theta_hat)))
eta_sel <- theta_hat %*% t(X)
ps_der <- dinv_link(eta_sel)
ps_nons <- ps[loc_nons]
est_ps_rand <- ps[loc_rand]
ps_nons_der <- ps_der[loc_nons]
weights_nons <- 1/ps_nons
resids <- c(est_ps_rand, ps_nons) - R
variance <- (t(resids) %*% resids) / df_residual
resids <- R - c(est_ps_rand, ps_nons)
variance <- as.vector( (t(resids) %*% resids) / df_residual)

if (!boot) {
N_nons <- sum(weights * weights_nons)
@@ -407,17 +408,19 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
y_nons_pred <- y_hat[loc_nons]

if (!boot) {
sigma <- family$variance(mu = y_hat, y = y[loc_rand])
residuals <- family$residuals(mu = y_rand_pred, y = y[loc_rand])
sigma_nons <- family$variance(mu = y_nons_pred, y = y[loc_nons])
sigma_rand <- family$variance(mu = y_rand_pred, y = y[loc_rand])
residuals <- family$residuals(mu = y_nons_pred, y = y[loc_nons])
}

if (!boot) {
# variance-covariance matrix for outcome model
# vcov_outcome <- solve(t(X_design) %*% diag(sigma) %*% X_design)
vcov_outcome <- solve(t(X) %*% (sigma * X))
vcov_outcome <- solve(t(X[loc_nons,]) %*% (sigma_nons * X[loc_nons,]))
beta_errors <- sqrt(diag(vcov_outcome))
}

# grad = multiroot$f.root[(p+1):(2*p)]
if (!boot) {
hess <- NULL
selection <- list(theta_hat = theta_hat, # TODO list as close as possible to SelectionList
@@ -428,24 +431,26 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
variance_covariance = vcov_selection,
df_residual = df_residual,
log_likelihood = "NULL",
linear.predictors = eta_sel,
eta = eta_sel,
aic = NULL,
residuals = resids,
variance = variance,
method = method)

outcome <- list(coefficients = beta_hat, # TODO list as close as possible to glm
grad = multiroot$f.root[(p+1):(2*p)],
hess = hess, # TODO
std_err = beta_errors,
variance_covariance = vcov_outcome,
df_residual = df_residual,
family = list(mu = y_hat,
variance = sigma[loc_rand],
residuals = residuals),
family = list(mu = y_nons_pred,
variance = sigma_nons,
family = family$family),
residuals = residuals,
fitted.values = y_nons_pred,
sigma_rand = sigma_rand,
y_rand_pred = y_rand_pred,
y_nons_pred = y_nons_pred,
log_likelihood = "NULL",
linear.predictors = eta_out)
linear.predictors = eta_out[loc_nons],
X = X[loc_nons,])
} else {
selection <- list(coefficients = theta_hat, # TODO list as close as possible to SelectionList
ps_nons = ps_nons)
4 changes: 4 additions & 0 deletions R/cloglogModel.R
@@ -5,6 +5,10 @@
#'
#' @param ... Additional, optional arguments.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @return List with selected methods/objects/functions.
#' @importFrom maxLik maxLik
#' @importFrom Matrix Matrix
13 changes: 13 additions & 0 deletions R/control.R
@@ -37,6 +37,10 @@
#'
#' @return List with selected parameters.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @export

controlSel <- function(method = "glm.fit", # perhaps another control function for model with variables selection
@@ -102,6 +106,11 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
#'
#' @return List with selected parameters.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#'
#' @export

controlOut <- function(epsilon = 1e-6,
@@ -155,6 +164,10 @@ controlOut <- function(epsilon = 1e-6,
#'
#' @return List with selected parameters.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @export

controlInf <- function(vars_selection = FALSE,
11 changes: 7 additions & 4 deletions R/family.R
@@ -10,21 +10,23 @@ poisson_nonprobsvy <- function(link = "log") {
structure(list(mu = mu,
variance = variance,
mu_der = mu_der,
residuals = residuals),
residuals = residuals,
family = "poisson"),
class = "family")
}

gaussian_nonprobsvy <- function(link = "identity") {
mu <- function(eta) eta
variance <- function(mu, y = NULL) rep.int(1, length(mu)) #mean((y - mu)^2) rep.int(1, length(mu))
variance <- function(mu, y = NULL) mean((y - mu)^2) #rep.int(1, length(mu)) rep.int(1, length(mu))
Review comment (Member):

Just as a reminder: you don't have to write all of those for every exponential family you want to use; you can also just do something simpler, like:

gaussian_nonprobsvy <- function(link = "identity") {
  x <- stats::gaussian(link = link)
  x$mu_der <- function(mu) 1
  # Sooner or later you will also probably need a class for those as well
  class(x) <- c(class(x), "nonprobsvy_family")
  x  # return the patched family object
}
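A quick usage sketch of this suggestion (note the explicit `x` return, which the wrapper needs to hand the object back):

```r
fam <- gaussian_nonprobsvy()
fam$family                          # "gaussian", inherited from stats::gaussian()
fam$mu_der(1.5)                     # 1, the extra slot patched in
inherits(fam, "nonprobsvy_family")  # TRUE, thanks to the added class
```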

mu_der <- function(mu) 1 # first derivative
mu_der2 <- function(mu) 0 # second derivative
residuals <- function(mu, y) as.vector(y - mu)

structure(list(mu = mu,
variance = variance,
mu_der = mu_der,
residuals = residuals),
residuals = residuals,
family = "gaussian"),
class = "family")
}

Expand All @@ -41,6 +43,7 @@ binomial_nonprobsvy <- function(link = "logit") {
structure(list(mu = mu,
variance = variance,
mu_der = mu_der,
residuals = residuals),
residuals = residuals,
family = "binomial"),
class = "family")
}
6 changes: 3 additions & 3 deletions R/generateData.r
@@ -16,9 +16,9 @@ NULL
#' \item{x2 -- the second variable based on z2 and x1}
#' \item{x3 -- the third variable based on z3 and x1 and x2}
#' \item{x4 -- the fourth variable based on z4 and x1, x2 and x3}
#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\epsilon}, so the cor(y,y_hat) = 0.30}
#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\\epsilon}, so the cor(y,y_hat) = 0.60}
#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\epsilon}, so the cor(y,y_hat) = 0.80}
#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.30}
#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.60}
#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.80}
#' \item{rho -- true propensity scores for big data such that sum(rho)=n}
#' \item{srs -- probabilities of inclusion to random sample such that max(srs)/min(srs)=50}
#' }
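As an aside on how such correlation targets can be hit (a sketch of the standard trick, not necessarily the package's generator, which is collapsed out of this diff): with y = 2 + x1 + x2 + x3 + x4 + sigma*epsilon and epsilon ~ N(0, 1), cor(y, y_hat) = sqrt(var(lp) / (var(lp) + sigma^2)) for the linear predictor lp, so sigma can be solved from the target correlation:

```r
# pick sigma so that cor(y, lp) is approximately rho (assumes eps ~ N(0, 1))
sigma_for_cor <- function(lp, rho) sqrt(var(lp) * (1 / rho^2 - 1))

set.seed(1)
lp <- 2 + rowSums(matrix(rnorm(4000), ncol = 4))  # toy linear predictor
sigma <- sigma_for_cor(lp, rho = 0.30)
y <- lp + sigma * rnorm(length(lp))
cor(y, lp)  # ~ 0.30
```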
2 changes: 1 addition & 1 deletion R/internals.R
@@ -387,7 +387,7 @@ internal_varDR <- function(OutcomeModel,
svydesign_mean <- survey::svymean(~y_rand, svydesign)

var_prob <- as.vector(attr(svydesign_mean, "var")) # based on survey package, probability component
var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigma))/N_nons^2 # nonprobability component
var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigma))/N_nons^2 # TODO potential bug here nonprobability component
} else {
eta <- as.vector(SelectionModel$X_nons %*% as.matrix(theta))
h_n <- 1/N_nons * sum(OutcomeModel$y_nons - y_nons_pred) # TODO add weights # errors mean
6 changes: 6 additions & 0 deletions R/logitModel.R
@@ -7,9 +7,15 @@
#'
#' @return List with selected methods/objects/functions.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @importFrom maxLik maxLik
#' @importFrom Matrix Matrix
#' @importFrom survey svyrecvar
#'
#'
#' @export

logit_model_nonprobsvy <- function(...) {
39 changes: 29 additions & 10 deletions R/main_function_documentation.R
@@ -6,8 +6,12 @@ NULL
#' \loadmathjax
#' @description \code{nonprob} fits a model for inference based on non-probability surveys (including big data) using various methods.
#' The function allows you to estimate the population mean, given access to a reference probability sample or to total/mean values of covariates.
#' In the package implemented state-of-the-art approaches recently proposed in the literature: Chen et al. (2020), Yang et al. (2020), Wu (2022) and use `survey` package [Lumley 2004](https://CRAN.R-project.org/package=survey) for inference.
#' Provided propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and doubly robust estimators that take into account minimization of the asymptotic bias of the population mean estimators,
#'
#' The package implements state-of-the-art approaches recently proposed in the literature: Chen et al. (2020),
#' Yang et al. (2020) and Wu (2022), and uses the `survey` package [Lumley 2004](https://CRAN.R-project.org/package=survey) for inference.
#'
#' It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and
#' doubly robust estimators that take into account minimization of the asymptotic bias of the population mean estimators,
#' variable selection, and overlap between the random and non-random samples.
#' The package uses `survey` package functionalities when a probability sample is available.
#'
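Before the estimator details, a sketch of the call shape (the argument names `selection`, `outcome`, `data` and `svydesign` are assumed from this documentation, not verified against the final signature):

```r
# hypothetical doubly robust call: propensity model plus imputation model;
# `prob_design` would be a survey::svydesign object for the reference sample
fit <- nonprob(
  selection = ~ x1 + x2,       # propensity score (selection) model
  outcome   = y ~ x1 + x2,     # mass imputation (outcome) model
  data      = nonprob_sample,  # the non-probability sample
  svydesign = prob_design
)
```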
@@ -84,7 +88,7 @@ NULL
#' The inverse probability approach is based on the assumption that a reference probability sample
#' is available, so that the propensity score of the selection mechanism can be estimated.
#' The estimator has the following form:
#' \mjsdeqn{\mu_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
#' \mjsdeqn{\hat{\mu}_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
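Once the propensity scores pi_i^A are estimated, the estimator itself is a one-liner; a minimal sketch with hypothetical vectors `y_nons` (outcomes) and `ps_hat` (estimated propensity scores):

```r
# IPW estimate of the population mean from the non-probability sample S_A
N_hat  <- sum(1 / ps_hat)               # estimated population size N^A
mu_ipw <- sum(y_nons / ps_hat) / N_hat
```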
#' For this purpose we consider multiple ways of estimation. The first approach is maximum likelihood estimation with a corrected
#' log-likelihood function, which is given by the following formula
#' \mjsdeqn{
@@ -101,14 +105,14 @@ NULL
#' where imputed values of the outcome variables are created for the whole probability sample.
#' In this case we treat the big-data sample as a training dataset used to build an imputation model. Using the imputed values
#' for the probability sample and the known design weights, we can build a population mean estimator of the form:
#' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
#' \mjsdeqn{\hat{\mu}_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
#' This opens the door to very flexible methods for the imputation model. The package uses generalized linear models from [stats::glm()],
#' the nearest neighbour algorithm using [RANN::nn2()], and predictive mean matching.
#'
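A minimal sketch of this estimator, with hypothetical vectors `d_rand` (design weights d_i^B) and `y_hat_rand` (imputed outcomes for the probability sample):

```r
# MI estimate of the population mean: design-weighted mean of imputed values
N_B   <- sum(d_rand)                    # estimated population size N^B
mu_mi <- sum(d_rand * y_hat_rand) / N_B
```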
#' 3. Doubly robust estimation -- The IPW and MI estimators are sensitive to misspecification of the models for the propensity score and outcome variable, respectively.
#' So-called doubly robust methods, which take these problems into account, address this issue.
#' The idea is quite simple: combining the propensity score and imputation models during inference leads to the following estimator
#' \mjsdeqn{\mu_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
#' \mjsdeqn{\hat{\mu}_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
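Combining the two pieces above, a sketch under the same hypothetical names, with `y_hat_nons` for the imputation-model predictions on the non-probability sample:

```r
# DR estimate: IPW-weighted residual correction plus the MI term
mu_dr <- sum((y_nons - y_hat_nons) / ps_hat) / sum(1 / ps_hat) +
  sum(d_rand * y_hat_rand) / sum(d_rand)
```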
#' In addition, an approach based directly on bias minimisation has been implemented. The following formula
#' \mjsdeqn{
#' \begin{aligned}
@@ -191,14 +195,29 @@ NULL
#' \item{\code{nonprob_size} -- size of non-probability sample}
#' \item{\code{prob_size} -- size of probability sample}
#' \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)}
#' \item{\code{outcome} -- list containing information about fitting of mass imputation model, in case of regression model, object containing list returned by the function.
#' Additionaly when variables selection model for outcome variable is fitting, list includes of \code{cve} -- the error for each value of `lambda`, averaged accross the cross-validation folds.
#' [stats::glm()], in case of nearest neigbour imputation object containing list returned by [RANN::nn2()].}
#' \item{\code{outcome} -- list containing information about the fitting of the mass imputation model; in the case of a regression model, it contains the list returned by
#' [stats::glm()], and in the case of nearest neighbour imputation, the list returned by [RANN::nn2()]. If `bias_correction` in [controlInf()] is set to `TRUE`, then estimation is based on
#' the joint estimating equations for the `selection` and `outcome` models, and therefore the list differs from the one returned by [stats::glm()] and contains elements such as
#' \itemize{
#' \item{\code{coefficients} -- estimated coefficients of the regression model}
#' \item{\code{std_err} -- standard errors of the estimated coefficients}
#' \item{\code{residuals} -- the response residuals}
#' \item{\code{variance_covariance} -- the variance-covariance matrix of the coefficient estimates}
#' \item{\code{df_residual} -- the degrees of freedom for residuals}
#' \item{\code{family} -- the error distribution and link function used in the model}
#' \item{\code{fitted.values} -- the predicted values of the response variable based on the fitted model}
#' \item{\code{linear.predictors} -- the linear fit on the link scale}
#' \item{\code{X} -- the design matrix}
#' \item{\code{method} -- set to `glm`, since a regression method was used}
#' }
#' }
#' Additionally, when a variable selection model is fitted for the outcome variable, the list includes \code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds.
#' \item{\code{selection} -- list containing information about the fitting of the propensity score model, such as
#' \itemize{
#' \item{\code{coefficients} -- a named vector of coefficients}
#' \item{\code{std_err} -- standard errors of the estimated model coefficients}
#' \item{\code{residuals} -- the working residuals}
#' \item{\code{residuals} -- the response residuals}
#' \item{\code{variance} -- the root mean square error}
#' \item{\code{fitted_values} -- the fitted mean values, obtained by transforming the linear predictors by the inverse of the link function.}
#' \item{\code{link} -- the `link` object used.}
#' \item{\code{linear_predictors} -- the linear fit on link scale.}
Expand All @@ -208,7 +227,7 @@ NULL
#' \item{\code{formula} -- the formula supplied.}
#' \item{\code{df_residual} -- the residual degrees of freedom.}
#' \item{\code{log_likelihood} -- value of the log-likelihood function if the `mle` method was used, otherwise `NULL`.}
#' \item{\code{cve} -- the error for each value of `lambda`, averaged accross the cross-validation folds for variables selection model
#' \item{\code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds for the variable selection model
#' when the propensity score model is fitted.}
#' }
#' }
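A hypothetical peek at these slots after a fit, with `fit` standing in for a returned `nonprob` object (slot names as documented above):

```r
fit$pop_size                 # estimated population size
fit$selection$coefficients   # propensity score model coefficients
fit$selection$std_err        # their standard errors
fit$outcome$coefficients     # imputation model coefficients (bias-correction case)
```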