
Commit

Merge pull request #23 from ncn-foreigners/dev
add unit tests for cloglog models, bugFix for start point customisation
LukaszChrostowski authored Nov 8, 2023
2 parents 0a03ca8 + ce5398c commit 963b65a
Showing 10 changed files with 326 additions and 334 deletions.
10 changes: 5 additions & 5 deletions R/EstimationMethods.R
@@ -104,12 +104,12 @@ mle <- function(...) {
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
- intercept_start <- max_lik(X_nons = X_nons[, 1, drop = FALSE],
+ intercept_start <- suppressWarnings(max_lik(X_nons = X_nons[, 1, drop = FALSE],
X_rand = X_rand[, 1, drop = FALSE],
weights = weights,
weights_rand = weights_rand,
start = 0,
- control = control_selection)$theta_hat
+ control = control_selection)$theta_hat)
start <- c(intercept_start, rep(0, ncol(X_nons) - 1))
}
}
@@ -237,20 +237,20 @@ gee <- function(...) {

if (is.null(start)) {
if (control_selection$start_type == "glm") {
- start0 <- start_fit(X = X, # <--- does not work with pop_totals
+ start <- start_fit(X = X, # <--- does not work with pop_totals
R = R,
weights = weights,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
- start_h <- theta_h_estimation(R = R,
+ start_h <- suppressWarnings(theta_h_estimation(R = R,
X = X[, 1, drop = FALSE],
weights_rand = weights_rand,
weights = weights,
h = h,
method_selection = method_selection,
maxit = maxit,
- start = 0)
+ start = 0)$theta_h)
start <- c(start_h, rep(0, ncol(X) - 1))
}
}
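Both hunks above implement the same "naive" start: estimate only the intercept, then pad the remaining coefficients with zeros; the fix wraps the intercept-only fit in suppressWarnings() and, in gee(), extracts $theta_h before padding. A minimal sketch of that pattern (not the package code; fit_intercept_only is a hypothetical stand-in for max_lik()/theta_h_estimation()):

naive_start <- function(X, fit_intercept_only) {
  # intercept-only fit, warnings silenced as in the patched code
  intercept_start <- suppressWarnings(fit_intercept_only(X[, 1, drop = FALSE]))
  # pad the remaining p - 1 coefficients with zeros
  c(intercept_start, rep(0, ncol(X) - 1))
}

# toy usage with a trivial stand-in estimator
X <- cbind(1, matrix(rnorm(20), ncol = 2))
naive_start(X, function(x) 0.5)
#> [1] 0.5 0.0 0.0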
10 changes: 5 additions & 5 deletions R/cloglogModel.R
@@ -246,16 +246,16 @@ cloglog_model_nonprobsvy <- function(...) {
V2
}

- b_vec_ipw <- function(y, mu, ps, psd = NULL, eta = NULL, X, hess, pop_size, weights) {
+ b_vec_ipw <- function(y, mu, ps, psd = NULL, eta = NULL, X, hess, pop_size, weights) { # TODO to fix

hess_inv_neg <- solve(-hess)
#print(mean(-((1 - ps)/ps^2 * log(1 - ps) * weights * (y - mu))))
if (is.null(pop_size)) {
- # b <- ((1 - ps)/ps^2 * exp(eta) * weights * (y - mu)) %*% X %*% hess_inv # TODO opposite sign here (?)
- b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * (y - mu)) %*% X %*% hess_inv_neg
+ b <- - ((1 - ps)/ps^2 * exp(eta) * weights * (y - mu)) %*% X %*% hess_inv_neg # TODO opposite sign here (?)
+ # b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * (y - mu)) %*% X %*% hess_inv_neg
} else {
- # b <- ((1 - ps)/ps^2 * exp(eta) * weights * y) %*% X %*% hess_inv # TODO opposite sign here (?)
- b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * y) %*% X %*% hess_inv_neg
+ b <- - ((1 - ps)/ps^2 * exp(eta) * weights * y) %*% X %*% hess_inv_neg # TODO opposite sign here (?)
+ #b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * y) %*% X %*% hess_inv_neg
}

list(b = b)
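Assuming the cloglog link ps = 1 - exp(-exp(eta)), the factor introduced in this hunk is algebraically identical to the one it replaces, so the swap changes readability rather than the algebra, though the ps-based form avoids computing exp(exp(eta)), which can overflow for large eta. A quick numeric check (a sketch, not package code):

# (1 - ps)/ps^2 * exp(eta) == exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2
eta <- seq(-2, 2, by = 0.5)
ps  <- 1 - exp(-exp(eta))
all.equal((1 - ps) / ps^2 * exp(eta),
          exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2)
#> [1] TRUE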
35 changes: 24 additions & 11 deletions R/internals.R
@@ -112,15 +112,25 @@ theta_h_estimation <- function(R,
method_selection = method_selection,
pop_totals = pop_totals)

- root <- nleqslv::nleqslv(x = start,
- fn = u_theta,
- method = "Newton", # TODO consider the methods
- global = "cline", #qline",
- xscalm = "fixed",
- jacobian = TRUE,
- jac = u_theta_der
- #control = list(sigma = 0.1, trace = 1)
- )
+ if (method_selection == "cloglog") {
+ root <- nleqslv::nleqslv(x = start,
+ fn = u_theta,
+ method = "Newton", # TODO consider the methods
+ global = "cline", #qline",
+ xscalm = "fixed",
+ jacobian = TRUE
+ )
+ } else {
+ root <- nleqslv::nleqslv(x = start,
+ fn = u_theta,
+ method = "Newton", # TODO consider the methods
+ global = "cline", #qline",
+ xscalm = "fixed",
+ jacobian = TRUE,
+ jac = u_theta_der
+ #control = list(sigma = 0.1, trace = 1)
+ )
+ }


theta_root <- root$x
@@ -136,8 +146,11 @@ theta_h_estimation <- function(R,
}
theta_h <- as.vector(theta_root)
grad <- u_theta(theta_h)
- hess <- u_theta_der(theta_h) # TODO compare with root$jac
- #hess <- root$jac
+ if (method_selection == "cloglog") {
+ hess <- root$jac
+ } else {
+ hess <- u_theta_der(theta_h) # TODO compare with root$jac
+ }

list(theta_h = theta_h,
hess = hess,
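A minimal, self-contained sketch of the nleqslv pattern used above (not the package code): with jacobian = TRUE the solver returns the Jacobian evaluated at the root in root$jac, which is what the new cloglog branch reuses in place of the analytic derivative u_theta_der.

library(nleqslv)

# toy system standing in for u_theta; solved with Newton + "cline" line search
u <- function(theta) c(theta[1]^2 + theta[2] - 3,
                       theta[1] - theta[2] + 1)
root <- nleqslv(x = c(0, 0), fn = u,
                method = "Newton", global = "cline",
                xscalm = "fixed", jacobian = TRUE)
root$x    # root of the system, here c(1, 2)
root$jac  # Jacobian at the root, used as the hessian in the cloglog branch above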
32 changes: 10 additions & 22 deletions R/main_function_documentation.R
@@ -38,49 +38,43 @@ NULL
#' @param ... Additional, optional arguments.
#'
#' @details Let \mjseqn{y} be the response variable for which we want to estimate population mean
- #' given by \mjsdeqn{\mu_{y} = \frac{1}{N}\sum_{i=1}^N y_i.} For this purposes we consider data integration
+ #' given by \mjsdeqn{\mu_{y} = \frac{1}{N} \sum_{i=1}^N y_{i}.} For this purposes we consider data integration
#' with the following structure. Let \mjseqn{S_A} be the non-probability sample with design matrix of covariates as
#' \mjsdeqn{
- #' \begin{equation}
#' \boldsymbol{X}_A =
#' \begin{bmatrix}
#' x_{11} & x_{12} & \cdots & x_{1p} \cr
#' x_{21} & x_{22} & \cdots & x_{2p} \cr
#' \vdots & \vdots & \ddots & \vdots \cr
#' x_{n_{A}1} & x_{n_{A}2} & \cdots & x_{n_{A}p} \cr
#' \end{bmatrix}
- #' \end{equation}
#' }
#' and vector of outcome variable
#' \mjsdeqn{
- #' \begin{equation}
#' \boldsymbol{y} =
#' \begin{bmatrix}
#' y_{1} \cr
#' y_{2} \cr
#' \vdots \cr
#' y_{n_{A}}.
#' \end{bmatrix}
- #' \end{equation}
#' }
#' On the other hand let \mjseqn{S_B} be the probability sample with design matrix of covariates as
#' \mjsdeqn{
- #' \begin{equation}
#' \boldsymbol{X}_B =
#' \begin{bmatrix}
#' x_{11} & x_{12} & \cdots & x_{1p} \cr
#' x_{21} & x_{22} & \cdots & x_{2p} \cr
#' \vdots & \vdots & \ddots & \vdots \cr
#' x_{n_{B}1} & x_{n_{B}2} & \cdots & x_{n_{B}p}. \cr
#' \end{bmatrix}
- #' \end{equation}
#' }
#' Instead of a sample of units we can consider a vector of population totals in the form of \mjseqn{\tau_x = (\sum_{i \in \mathcal{U}}\boldsymbol{x}_{i1}, \sum_{i \in \mathcal{U}}\boldsymbol{x}_{i2}, ..., \sum_{i \in \mathcal{U}}\boldsymbol{x}_{ip})} or means
#' \mjseqn{\frac{\tau_x}{N}}, where \mjseqn{\mathcal{U}} denotes some finite population. Note that we do not assume access to the response variable for \mjseqn{S_B}.
#' In general we make the following assumptions:
- #' 1. The selection indicator of belonging to non-probability sample \mjseqn{R_i} and the response variable \mjseqn{y_i} are independent given the set of covariates \mjseqn{\boldsymbol{x}_i}.
- #' 2. All units have a non-zero propensity score, that is, \mjseqn{\pi_i^A > 0} for all i.
- #' 3. The indicator variables \mjseqn{R_i^A} and \mjseqn{R_j^A} are independent for given \mjseqn{\boldsymbol{x}_i} and \mjseqn{\boldsymbol{x}_j} for \mjseqn{i \neq j}.
+ #' 1. The selection indicator of belonging to non-probability sample \mjseqn{R_{i}} and the response variable \mjseqn{y_i} are independent given the set of covariates \mjseqn{\boldsymbol{x}_i}.
+ #' 2. All units have a non-zero propensity score, that is, \mjseqn{\pi_{i}^{A} > 0} for all i.
+ #' 3. The indicator variables \mjseqn{R_{i}^{A}} and \mjseqn{R_{j}^{A}} are independent for given \mjseqn{\boldsymbol{x}_i} and \mjseqn{\boldsymbol{x}_j} for \mjseqn{i \neq j}.
#'
#' There are three possible approaches to the problem of population mean estimation using non-probability samples:
#'
Expand All @@ -89,35 +83,31 @@ NULL
#' The inverse probability approach is based on the assumption that a reference probability sample
#' is available and therefore we can estimate the propensity score of the selection mechanism.
#' The estimator has the following form:
- #' \mjsdeqn{\mu_{IPW} = \frac{1}{N^A}\sum_{i \in S_A}\frac{y_i}{\hat{\pi}_{i}^A}.}
+ #' \mjsdeqn{\mu_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
#' For this purpose we consider multiple ways of estimation. The first approach is maximum likelihood estimation with a corrected
#' log-likelihood function, which is given by the following formula
#' \mjsdeqn{
- #' \begin{equation}
- #' \ell^{*}(\boldsymbol{\theta}) = \sum_{i \in S_{A}}\log \left\lbrace \frac{\pi(\boldsymbol{x}_{i}, \boldsymbol{\theta})}{1 - \pi(\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace + \sum_{i \in S_{B}}d_{i}^{B}\log \left\lbrace 1 - \pi({\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace.
- #' \end{equation}}
+ #' \ell^{*}(\boldsymbol{\theta}) = \sum_{i \in S_{A}}\log \left\lbrace \frac{\pi(\boldsymbol{x}_{i}, \boldsymbol{\theta})}{1 - \pi(\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace + \sum_{i \in S_{B}}d_{i}^{B}\log \left\lbrace 1 - \pi({\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace.}
#' In the literature the main approach to modelling the propensity scores \mjseqn{\pi_i^A} is based on the `logit` link function;
#' however, we extend the propensity score model with additional link functions such as `cloglog` and `probit`.
#' The pseudo-score equations derived from ML methods can be replaced by generalized estimating equations with calibration constraints defined by the equations
#' \mjsdeqn{
- #' \begin{equation}
- #' \mathbf{U}(\boldsymbol{\theta})=\sum_{i \in S_A} \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right)-\sum_{i \in S_B} d_i^B \pi\left(\mathbf{x}_i, \boldsymbol{\theta}\right) \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right).
- #' \end{equation}}
+ #' \mathbf{U}(\boldsymbol{\theta})=\sum_{i \in S_A} \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right)-\sum_{i \in S_B} d_i^B \pi\left(\mathbf{x}_i, \boldsymbol{\theta}\right) \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right).}
#' Notice that for \mjseqn{ \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right) = \frac{\pi(\boldsymbol{x}, \boldsymbol{\theta})}{\boldsymbol{x}}} we do not require a probability
#' sample and can use a vector of population totals/means.
#'
#' 2. Mass imputation -- This method relies on a framework
#' where imputed values of the outcome variables are created for the whole probability sample.
#' In this case we treat the big-data sample as a training dataset used to build an imputation model. Using the imputed values
#' for the probability sample and the (known) design weights, we can build a population mean estimator of the form:
- #' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
+ #' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
#' This opens the door to very flexible imputation models. The package uses generalized linear models from [stats::glm()],
#' the nearest neighbour algorithm from [RANN::nn2()], and predictive mean matching.
#'
#' 3. Doubly robust estimation -- The IPW and MI estimators are sensitive to misspecification of the propensity score and outcome models, respectively.
#' To address this, so-called doubly robust methods, which take these problems into account, are provided.
#' The idea is quite simple: combine the propensity score and imputation models during inference, which leads to the following estimator
- #' \mjsdeqn{\mu_{DR} = \frac{1}{N^A} \sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
+ #' \mjsdeqn{\mu_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
#' In addition, an approach based directly on bias minimisation has been implemented. The following formula
#' \mjsdeqn{
#' \begin{aligned}
@@ -127,7 +117,6 @@ NULL
#' }
#' leads us to the system of equations
#' \mjsdeqn{
- #' \begin{equation}
#' \begin{aligned}
#' J(\theta, \beta) =
#' \left\lbrace
@@ -140,9 +129,8 @@ NULL
#' - \sum_{i \in \mathcal{S}_{\mathrm{B}}} d_i^{\mathrm{B}} \frac{\partial m(\boldsymbol{x}_i, \boldsymbol{\beta})}{\partial \boldsymbol{\beta}}
#' \end{array} \right\rbrace,
#' \end{aligned}
- #' \end{equation}
#' }
- #' where \mjseqn{m\left(\boldsymbol{x}_i, \boldsymbol{\beta}\right)} is mass imputation (regression) model for outcome variable and
+ #' where \mjseqn{m\left(\boldsymbol{x}_{i}, \boldsymbol{\beta}\right)} is mass imputation (regression) model for outcome variable and
#' the propensity scores \mjseqn{\pi_i^A} are estimated using the `logit` link for this model. As in the `MLE` and `GEE` approaches we have extended
#' this method to the `cloglog` and `probit` links.
#'
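A toy illustration of the three estimators described in the documentation above (a sketch, not the package implementation; for brevity the true propensity scores stand in for estimated ones and the reference sample is a simple random sample):

set.seed(1)
N <- 1e4
x <- rnorm(N)
y <- 2 + x + rnorm(N)

pi_A <- plogis(-2 + x)                  # selection propensity for S_A
S_A  <- which(rbinom(N, 1, pi_A) == 1)  # non-probability sample
S_B  <- sample(N, 500)                  # probability (SRS) reference sample
d_B  <- rep(N / 500, 500)               # design weights for S_B

y_hat <- cbind(1, x) %*% coef(lm(y ~ x, subset = S_A))  # imputation model fit on S_A

N_A    <- sum(1 / pi_A[S_A])
mu_ipw <- sum(y[S_A] / pi_A[S_A]) / N_A                         # IPW
mu_mi  <- sum(d_B * y_hat[S_B]) / sum(d_B)                      # mass imputation
mu_dr  <- sum((y[S_A] - y_hat[S_A]) / pi_A[S_A]) / N_A + mu_mi  # doubly robust

c(true = mean(y), IPW = mu_ipw, MI = mu_mi, DR = mu_dr)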
20 changes: 11 additions & 9 deletions R/nonprobDR.R
@@ -485,15 +485,15 @@ nonprobDR <- function(selection,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
- start_h <- theta_h_estimation(R = R,
- X = SelectionModel$X_nons[,1,drop=FALSE],
- weights_rand = weights_rand,
- weights = weights,
- h = h,
- method_selection = method_selection,
- start = 0,
- maxit = maxit,
- pop_totals = SelectionModel$pop_totals[1])$theta_h
+ start_h <- suppressWarnings(theta_h_estimation(R = R,
+ X = SelectionModel$X_nons[,1,drop=FALSE],
+ weights_rand = weights_rand,
+ weights = weights,
+ h = h,
+ method_selection = method_selection,
+ start = 0,
+ maxit = maxit,
+ pop_totals = SelectionModel$pop_totals[1])$theta_h)

Review comment from @Kertoo (Member), Nov 8, 2023, on the suppressWarnings line above (comment minimized):
You should look up tryCatch; maybe you'll want to check whether these warning messages are important before suppressing them.
start <- c(start_h, rep(0, ncol(X) - 1))
}
}
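One way to act on the review comment above (a sketch, not part of this commit): capture the warnings with withCallingHandlers() or tryCatch() so they can be inspected or reported instead of being silently discarded by suppressWarnings().

captured <- character(0)
start_h <- withCallingHandlers(
  sqrt(-1),  # stand-in for theta_h_estimation(...)$theta_h that happens to warn
  warning = function(w) {
    captured <<- c(captured, conditionMessage(w))
    invokeRestart("muffleWarning")  # keep going, but keep the message
  }
)
captured  # e.g. report these when a verbose flag is set
#> [1] "NaNs produced"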
@@ -703,6 +703,8 @@ nonprobDR <- function(selection,
SE_values <- do.call(rbind, SE_values)
rownames(output) <- rownames(confidence_interval) <- rownames(SE_values) <- outcomes$f
names(OutcomeList) <- outcomes$f
+ if (is.null(pop_size)) pop_size <- N_nons
+ names(pop_size) <- "pop_size"

SelectionList <- list(coefficients = selection_model$theta_hat,
std_err = theta_standard_errors,
5 changes: 3 additions & 2 deletions R/nonprobIPW.R
@@ -261,15 +261,15 @@ nonprobIPW <- function(selection,
weights_rand = weights_rand,
method_selection = method_selection)
} else if (control_selection$start_type == "naive") {
- start_h <- theta_h_estimation(R = R,
+ start_h <- suppressWarnings(theta_h_estimation(R = R,
X = X[,1,drop=FALSE],
weights_rand = weights_rand,
weights = weights,
h = h,
method_selection = method_selection,
start = 0,
maxit = maxit,
- pop_totals = pop_totals[1])$theta_h
+ pop_totals = pop_totals[1])$theta_h)
start <- c(start_h, rep(0, ncol(X) - 1))
}
}
@@ -457,6 +457,7 @@ nonprobIPW <- function(selection,
SE_values <- do.call(rbind, SE_values)
rownames(output) <- rownames(confidence_interval) <- rownames(SE_values) <- outcomes$f
if (is.null(pop_size)) pop_size <- N # estimated pop_size
+ names(pop_size) <- "pop_size"

SelectionList <- list(coefficients = selection_model$theta_hat,
std_err = theta_standard_errors,
1 change: 1 addition & 0 deletions R/nonprobMI.R
@@ -287,6 +287,7 @@ nonprobMI <- function(outcome,
SE_values <- do.call(rbind, SE_values)
rownames(output) <- rownames(confidence_interval) <- rownames(SE_values) <- outcomes$f
names(OutcomeList) <- outcomes$f
+ names(pop_size) <- "pop_size"

structure(
list(X = if(isTRUE(x)) X else NULL,
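The added names(pop_size) lines in the three files above are cosmetic: the returned (estimated or supplied) population size now carries a label when printed. A minimal sketch:

pop_size <- 1e5                 # stand-in for N, N_nons, or a user-supplied value
names(pop_size) <- "pop_size"
pop_size
#> pop_size
#>    1e+05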

