#' @param ... Additional, optional arguments.
#'
#' @details Let \mjseqn{y} be the response variable for which we want to estimate the population mean
#' given by \mjsdeqn{\mu_{y} = \frac{1}{N} \sum_{i=1}^N y_{i}.} For this purpose we consider data integration
#' with the following structure. Let \mjseqn{S_A} be the non-probability sample with the design matrix of covariates as
#' \mjsdeqn{
#' \boldsymbol{X}_A =
#' \begin{bmatrix}
#' x_{11} & x_{12} & \cdots & x_{1p} \cr
#' x_{21} & x_{22} & \cdots & x_{2p} \cr
#' \vdots & \vdots & \ddots & \vdots \cr
#' x_{n_{A}1} & x_{n_{A}2} & \cdots & x_{n_{A}p} \cr
#' \end{bmatrix}
#' }
#' and the vector of the outcome variable
#' \mjsdeqn{
#' \boldsymbol{y} =
#' \begin{bmatrix}
#' y_{1} \cr
#' y_{2} \cr
#' \vdots \cr
#' y_{n_{A}}.
#' \end{bmatrix}
#' }
#' On the other hand, let \mjseqn{S_B} be the probability sample with the design matrix of covariates as
#' \mjsdeqn{
#' \boldsymbol{X}_B =
#' \begin{bmatrix}
#' x_{11} & x_{12} & \cdots & x_{1p} \cr
#' x_{21} & x_{22} & \cdots & x_{2p} \cr
#' \vdots & \vdots & \ddots & \vdots \cr
#' x_{n_{B}1} & x_{n_{B}2} & \cdots & x_{n_{B}p}. \cr
#' \end{bmatrix}
#' }
#' Instead of a sample of units we can consider a vector of population totals in the form of \mjseqn{\tau_x = (\sum_{i \in \mathcal{U}} x_{i1}, \sum_{i \in \mathcal{U}} x_{i2}, ..., \sum_{i \in \mathcal{U}} x_{ip})} or means
#' \mjseqn{\frac{\tau_x}{N}}, where \mjseqn{\mathcal{U}} refers to some finite population. Notice that we do not assume access to the response variable for \mjseqn{S_B}.
#' Generally, we make the following assumptions:
#' 1. The selection indicator of belonging to the non-probability sample \mjseqn{R_{i}} and the response variable \mjseqn{y_i} are independent given the set of covariates \mjseqn{\boldsymbol{x}_i}.
#' 2. All units have a non-zero propensity score, that is, \mjseqn{\pi_{i}^{A} > 0} for all \mjseqn{i}.
#' 3. The indicator variables \mjseqn{R_{i}^{A}} and \mjseqn{R_{j}^{A}} are independent given \mjseqn{\boldsymbol{x}_i} and \mjseqn{\boldsymbol{x}_j} for \mjseqn{i \neq j}.
#'
#' There are three possible approaches to the problem of population mean estimation using non-probability samples:
#'
#' 1. Inverse probability weighting -- The inverse probability approach is based on the assumption that a reference probability sample
#' is available and therefore we can estimate the propensity score of the selection mechanism.
#' The estimator has the following form:
#' \mjsdeqn{\mu_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
#' For this purpose we consider multiple ways of estimation. The first approach is Maximum Likelihood Estimation with a corrected
#' log-likelihood function, which is given by the following formula
#' \mjsdeqn{
#' \ell^{*}(\boldsymbol{\theta}) = \sum_{i \in S_{A}}\log \left\lbrace \frac{\pi(\boldsymbol{x}_{i}, \boldsymbol{\theta})}{1 - \pi(\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace + \sum_{i \in S_{B}}d_{i}^{B}\log \left\lbrace 1 - \pi(\boldsymbol{x}_{i},\boldsymbol{\theta})\right\rbrace.}
#' In the literature the main approach to modelling the propensity scores \mjseqn{\pi_i^A} is based on the `logit` link function;
#' however, we extend the propensity score model with additional link functions, such as `cloglog` and `probit`.
#' The pseudo-score equations derived from ML methods can be replaced by the idea of generalized estimating equations with calibration constraints defined by the equations
#' \mjsdeqn{
#' \mathbf{U}(\boldsymbol{\theta})=\sum_{i \in S_A} \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right)-\sum_{i \in S_B} d_i^B \pi\left(\mathbf{x}_i, \boldsymbol{\theta}\right) \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right).}
#' Notice that for \mjseqn{\mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right) = \frac{\mathbf{x}_i}{\pi(\boldsymbol{x}_i, \boldsymbol{\theta})}} we do not require a probability
#' sample and can use a vector of population totals/means.
#'
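#'
#' A minimal numeric sketch of the IPW estimator above (Python is used purely for illustration; the outcomes and propensity scores below are hypothetical toy values, and in practice \mjseqn{\hat{\pi}_{i}^{A}} comes from the estimation methods described here):

```python
# Illustrative sketch of the IPW estimator (hypothetical toy data;
# in practice the propensity scores pi_i^A are estimated, e.g. via MLE or GEE).
y = [2.0, 3.5, 1.0, 4.0]        # outcomes observed in the non-probability sample S_A
pi_hat = [0.8, 0.5, 0.4, 0.5]   # assumed (estimated) propensity scores pi_i^A

# Hajek-type choice: N^A is estimated by the sum of inverse propensities.
N_hat = sum(1.0 / p for p in pi_hat)
mu_ipw = sum(yi / p for yi, p in zip(y, pi_hat)) / N_hat
```

#' Units with small propensity scores receive large weights, which is why assumption 2 (\mjseqn{\pi_{i}^{A} > 0}) is essential.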
#' 2. Mass imputation -- This method relies on a framework
#' where imputed values of the outcome variable are created for the whole probability sample.
#' In this case we treat the big-data sample as a training dataset, which is used to build an imputation model. Using imputed values
#' for the probability sample and the (known) design weights, we can build a population mean estimator of the form:
#' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
#' It opens the door to very flexible methods for the imputation model. The package uses generalized linear models from [stats::glm()],
#' the nearest neighbour algorithm using [RANN::nn2()] and predictive mean matching.
#'
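#'
#' A toy sketch of the MI estimator (Python for illustration only; the data are hypothetical, and a simple linear regression stands in for the imputation models listed above):

```python
# Illustrative sketch of the mass imputation (MI) estimator with a simple
# linear imputation model fitted on the non-probability sample S_A.
x_a = [1.0, 2.0, 3.0, 4.0]                 # covariates in S_A (training data)
y_a = [2.1, 3.9, 6.1, 8.0]                 # outcomes in S_A
x_b = [1.5, 2.5, 3.5]                      # covariates in the probability sample S_B
d_b = [10.0, 20.0, 10.0]                   # design weights d_i^B

# Fit y = a + b*x on S_A by ordinary least squares.
n = len(x_a)
xm = sum(x_a) / n
ym = sum(y_a) / n
b = sum((x - xm) * (y - ym) for x, y in zip(x_a, y_a)) / sum((x - xm) ** 2 for x in x_a)
a = ym - b * xm

# Impute outcomes for S_B and take the design-weighted mean.
y_hat = [a + b * x for x in x_b]
N_hat = sum(d_b)                           # N^B estimated by the sum of design weights
mu_mi = sum(d * yh for d, yh in zip(d_b, y_hat)) / N_hat
```

#' Only the covariates (never the response) are needed on \mjseqn{S_B}, matching the data structure described above.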
#' 3. Doubly robust estimation -- The IPW and MI estimators are sensitive to misspecified models for the propensity score and the outcome variable, respectively.
#' For this reason, so-called doubly robust methods, which take these problems into account, are presented.
#' The idea is a simple combination of the propensity score and imputation models during inference, which leads to the following estimator
#' \mjsdeqn{\mu_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
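#'
#' A toy numeric sketch of the DR estimator (Python, hypothetical numbers; \mjseqn{\hat{\pi}_{i}^{A}} and \mjseqn{\hat{y}_{i}} would normally come from the fitted propensity and imputation models):

```python
# Illustrative sketch of the doubly robust (DR) estimator: an IPW-weighted
# correction on S_A plus the design-weighted imputation mean on S_B.
y_a = [2.0, 3.5, 1.0, 4.0]            # outcomes in S_A
yhat_a = [2.2, 3.0, 1.5, 3.8]         # imputation-model predictions on S_A
pi_hat = [0.8, 0.5, 0.4, 0.5]         # estimated propensity scores on S_A
d_a = [1.0 / p for p in pi_hat]       # hat{d}_i^A = 1 / hat{pi}_i^A

yhat_b = [2.5, 3.1, 2.8]              # predictions on the probability sample S_B
d_b = [10.0, 20.0, 10.0]              # design weights d_i^B

N_a = sum(d_a)                        # N^A estimated from inverse propensities
N_b = sum(d_b)                        # N^B estimated from design weights
correction = sum(d * (y - yh) for d, y, yh in zip(d_a, y_a, yhat_a)) / N_a
mu_dr = correction + sum(d * yh for d, yh in zip(d_b, yhat_b)) / N_b
```

#' If the imputation model is correct, the correction term has mean zero; if the propensity model is correct, the correction removes the imputation bias, which is the source of the double robustness.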
#' In addition, an approach based directly on bias minimisation has been implemented. The following formula
#' \mjsdeqn{
#' \begin{aligned}
#' bias(\hat{\mu}_{DR}) = & \ \mathbb{E}(\hat{\mu}_{DR} - \mu) \cr
#' = & \ \mathbb{E} \left\lbrace \frac{1}{N} \sum_{i=1}^N \left( \frac{R_i^{A}}{\pi_i^{A}(\boldsymbol{x}_i, \boldsymbol{\theta})} - 1 \right) \left( y_i - m(\boldsymbol{x}_i, \boldsymbol{\beta}) \right) \right\rbrace \cr
#' + & \ \mathbb{E} \left\lbrace \frac{1}{N} \sum_{i=1}^N \left( R_i^{B} d_i^{B} - 1 \right) m(\boldsymbol{x}_i, \boldsymbol{\beta}) \right\rbrace
#' \end{aligned}
#' }
#' leads us to a system of equations
#' \mjsdeqn{
#' \begin{aligned}
#' J(\theta, \beta) =
#' \left\lbrace
#' \begin{array}{c}
#' \sum_{i=1}^{N} R_i^{\mathrm{A}} \left\lbrace \frac{1}{\pi(\boldsymbol{x}_i, \boldsymbol{\theta})} - 1 \right\rbrace \boldsymbol{x}_i \cr
#' \sum_{i \in \mathcal{S}_{\mathrm{A}}} \frac{1}{\pi(\boldsymbol{x}_i, \boldsymbol{\theta})} \frac{\partial m(\boldsymbol{x}_i, \boldsymbol{\beta})}{\partial \boldsymbol{\beta}}
#' - \sum_{i \in \mathcal{S}_{\mathrm{B}}} d_i^{\mathrm{B}} \frac{\partial m(\boldsymbol{x}_i, \boldsymbol{\beta})}{\partial \boldsymbol{\beta}}
#' \end{array} \right\rbrace,
#' \end{aligned}
#' }
#' where \mjseqn{m\left(\boldsymbol{x}_{i}, \boldsymbol{\beta}\right)} is the mass imputation (regression) model for the outcome variable and
#' the propensity scores \mjseqn{\pi_i^A} are estimated using the `logit` link for the model. As in the `MLE` and `GEE` approaches, we have extended
#' this method to the `cloglog` and `probit` links.
#'
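#'
#' The three link functions mentioned above are standard; as a sketch, their inverse links map a linear predictor \mjseqn{\eta = \boldsymbol{x}^{\top}\boldsymbol{\theta}} to a propensity in (0, 1):

```python
import math

def inv_logit(eta):
    # logit link: pi = 1 / (1 + exp(-eta))
    return 1.0 / (1.0 + math.exp(-eta))

def inv_cloglog(eta):
    # complementary log-log link: pi = 1 - exp(-exp(eta))
    return 1.0 - math.exp(-math.exp(eta))

def inv_probit(eta):
    # probit link: pi = Phi(eta), the standard normal CDF
    return 0.5 * (1.0 + math.erf(eta / math.sqrt(2.0)))
```

#' Unlike `logit` and `probit`, the `cloglog` inverse link is asymmetric: eta = 0 maps to 1 - exp(-1), not 0.5.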