Skip to content

Commit ce5398c

Browse files
add unit tests for cloglog models, bugFix for start point customisation
1 parent 40d5c06 commit ce5398c

10 files changed

+326
-334
lines changed

R/EstimationMethods.R

+5-5
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,12 @@ mle <- function(...) {
104104
weights_rand = weights_rand,
105105
method_selection = method_selection)
106106
} else if (control_selection$start_type == "naive") {
107-
intercept_start <- max_lik(X_nons = X_nons[, 1, drop = FALSE],
107+
intercept_start <- suppressWarnings(max_lik(X_nons = X_nons[, 1, drop = FALSE],
108108
X_rand = X_rand[, 1, drop = FALSE],
109109
weights = weights,
110110
weights_rand = weights_rand,
111111
start = 0,
112-
control = control_selection)$theta_hat
112+
control = control_selection)$theta_hat)
113113
start <- c(intercept_start, rep(0, ncol(X_nons) - 1))
114114
}
115115
}
@@ -237,20 +237,20 @@ gee <- function(...) {
237237

238238
if (is.null(start)) {
239239
if (control_selection$start_type == "glm") {
240-
start0 <- start_fit(X = X, # <--- does not work with pop_totals
240+
start <- start_fit(X = X, # <--- does not work with pop_totals
241241
R = R,
242242
weights = weights,
243243
weights_rand = weights_rand,
244244
method_selection = method_selection)
245245
} else if (control_selection$start_type == "naive") {
246-
start_h <- theta_h_estimation(R = R,
246+
start_h <- suppressWarnings(theta_h_estimation(R = R,
247247
X = X[, 1, drop = FALSE],
248248
weights_rand = weights_rand,
249249
weights = weights,
250250
h = h,
251251
method_selection = method_selection,
252252
maxit = maxit,
253-
start = 0)
253+
start = 0)$theta_h)
254254
start <- c(start_h, rep(0, ncol(X) - 1))
255255
}
256256
}

R/cloglogModel.R

+5-5
Original file line numberDiff line numberDiff line change
@@ -246,16 +246,16 @@ cloglog_model_nonprobsvy <- function(...) {
246246
V2
247247
}
248248

249-
b_vec_ipw <- function(y, mu, ps, psd = NULL, eta = NULL, X, hess, pop_size, weights) {
249+
b_vec_ipw <- function(y, mu, ps, psd = NULL, eta = NULL, X, hess, pop_size, weights) { # TODO to fix
250250

251251
hess_inv_neg <- solve(-hess)
252252
#print(mean(-((1 - ps)/ps^2 * log(1 - ps) * weights * (y - mu))))
253253
if (is.null(pop_size)) {
254-
# b <- ((1 - ps)/ps^2 * exp(eta) * weights * (y - mu)) %*% X %*% hess_inv # TODO opposite sign here (?)
255-
b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * (y - mu)) %*% X %*% hess_inv_neg
254+
b <- - ((1 - ps)/ps^2 * exp(eta) * weights * (y - mu)) %*% X %*% hess_inv_neg # TODO opposite sign here (?)
255+
# b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * (y - mu)) %*% X %*% hess_inv_neg
256256
} else {
257-
# b <- ((1 - ps)/ps^2 * exp(eta) * weights * y) %*% X %*% hess_inv # TODO opposite sign here (?)
258-
b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * y) %*% X %*% hess_inv_neg
257+
b <- - ((1 - ps)/ps^2 * exp(eta) * weights * y) %*% X %*% hess_inv_neg # TODO opposite sign here (?)
258+
#b <- - (exp(eta + exp(eta)) / (exp(exp(eta)) - 1)^2 * weights * y) %*% X %*% hess_inv_neg
259259
}
260260

261261
list(b = b)

R/internals.R

+24-11
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,25 @@ theta_h_estimation <- function(R,
112112
method_selection = method_selection,
113113
pop_totals = pop_totals)
114114

115-
root <- nleqslv::nleqslv(x = start,
116-
fn = u_theta,
117-
method = "Newton", # TODO consider the methods
118-
global = "cline", #qline",
119-
xscalm = "fixed",
120-
jacobian = TRUE,
121-
jac = u_theta_der
122-
#control = list(sigma = 0.1, trace = 1)
123-
)
115+
if (method_selection == "cloglog") {
116+
root <- nleqslv::nleqslv(x = start,
117+
fn = u_theta,
118+
method = "Newton", # TODO consider the methods
119+
global = "cline", #qline",
120+
xscalm = "fixed",
121+
jacobian = TRUE
122+
)
123+
} else {
124+
root <- nleqslv::nleqslv(x = start,
125+
fn = u_theta,
126+
method = "Newton", # TODO consider the methods
127+
global = "cline", #qline",
128+
xscalm = "fixed",
129+
jacobian = TRUE,
130+
jac = u_theta_der
131+
#control = list(sigma = 0.1, trace = 1)
132+
)
133+
}
124134

125135

126136
theta_root <- root$x
@@ -136,8 +146,11 @@ theta_h_estimation <- function(R,
136146
}
137147
theta_h <- as.vector(theta_root)
138148
grad <- u_theta(theta_h)
139-
hess <- u_theta_der(theta_h) # TODO compare with root$jac
140-
#hess <- root$jac
149+
if (method_selection == "cloglog") {
150+
hess <- root$jac
151+
} else {
152+
hess <- u_theta_der(theta_h) # TODO compare with root$jac
153+
}
141154

142155
list(theta_h = theta_h,
143156
hess = hess,

R/main_function_documentation.R

+10-22
Original file line numberDiff line numberDiff line change
@@ -38,49 +38,43 @@ NULL
3838
#' @param ... Additional, optional arguments.
3939
#'
4040
#' @details Let \mjseqn{y} be the response variable for which we want to estimate population mean
41-
#' given by \mjsdeqn{\mu_{y} = \frac{1}{N}\sum_{i=1}^N y_i.} For this purposes we consider data integration
41+
#' given by \mjsdeqn{\mu_{y} = \frac{1}{N} \sum_{i=1}^N y_{i}.} For this purposes we consider data integration
4242
#' with the following structure. Let \mjseqn{S_A} be the non-probability sample with design matrix of covariates as
4343
#' \mjsdeqn{
44-
#' \begin{equation}
4544
#' \boldsymbol{X}_A =
4645
#' \begin{bmatrix}
4746
#' x_{11} & x_{12} & \cdots & x_{1p} \cr
4847
#' x_{21} & x_{22} & \cdots & x_{2p} \cr
4948
#' \vdots & \vdots & \ddots & \vdots \cr
5049
#' x_{n_{A}1} & x_{n_{A}2} & \cdots & x_{n_{A}p} \cr
5150
#' \end{bmatrix}
52-
#' \end{equation}
5351
#' }
5452
#' and vector of outcome variable
5553
#' \mjsdeqn{
56-
#' \begin{equation}
5754
#' \boldsymbol{y} =
5855
#' \begin{bmatrix}
5956
#' y_{1} \cr
6057
#' y_{2} \cr
6158
#' \vdots \cr
6259
#' y_{n_{A}}.
6360
#' \end{bmatrix}
64-
#' \end{equation}
6561
#' }
6662
#' On the other hand let \mjseqn{S_B} be the probability sample with design matrix of covariates as
6763
#' \mjsdeqn{
68-
#' \begin{equation}
6964
#' \boldsymbol{X}_B =
7065
#' \begin{bmatrix}
7166
#' x_{11} & x_{12} & \cdots & x_{1p} \cr
7267
#' x_{21} & x_{22} & \cdots & x_{2p} \cr
7368
#' \vdots & \vdots & \ddots & \vdots \cr
7469
#' x_{n_{B}1} & x_{n_{B}2} & \cdots & x_{n_{B}p}. \cr
7570
#' \end{bmatrix}
76-
#' \end{equation}
7771
#' }
7872
#' Instead of sample of units we can consider vector of population totals in the form of \mjseqn{\tau_x = (\sum_{i \in \mathcal{U}}\boldsymbol{x}_{i1}, \sum_{i \in \mathcal{U}}\boldsymbol{x}_{i2}, ..., \sum_{i \in \mathcal{U}}\boldsymbol{x}_{ip})} or means
7973
#' \mjseqn{\frac{\tau_x}{N}}, where \mjseqn{\mathcal{U}} regards to some finite population. Notice that we do not assume access to the response variable for \mjseqn{S_B}.
8074
#' Generally we provide following assumptions:
81-
#' 1. The selection indicator of belonging to non-probability sample \mjseqn{R_i} and the response variable \mjseqn{y_i} are independent given the set of covariates \mjseqn{\boldsymbol{x}_i}.
82-
#' 2. All units have a non-zero propensity score, that is, \mjseqn{\pi_i^A > 0} for all i.
83-
#' 3. The indicator variables \mjseqn{R_i^A} and \mjseqn{R_j^A} are independent for given \mjseqn{\boldsymbol{x}_i} and \mjseqn{\boldsymbol{x}_j} for \mjseqn{i \neq j}.
75+
#' 1. The selection indicator of belonging to non-probability sample \mjseqn{R_{i}} and the response variable \mjseqn{y_i} are independent given the set of covariates \mjseqn{\boldsymbol{x}_i}.
76+
#' 2. All units have a non-zero propensity score, that is, \mjseqn{\pi_{i}^{A} > 0} for all i.
77+
#' 3. The indicator variables \mjseqn{R_{i}^{A}} and \mjseqn{R_{j}^{A}} are independent for given \mjseqn{\boldsymbol{x}_i} and \mjseqn{\boldsymbol{x}_j} for \mjseqn{i \neq j}.
8478
#'
8579
#' There are three possible approaches to problem of population mean estimation using non-probability samples:
8680
#'
@@ -89,35 +83,31 @@ NULL
8983
#' Inverse probability approach is based on assumption that reference probability sample
9084
#' is available and therefore we can estimate propensity score of selection mechanism.
9185
#' Estimator has following form:
92-
#' \mjsdeqn{\mu_{IPW} = \frac{1}{N^A}\sum_{i \in S_A}\frac{y_i}{\hat{\pi}_{i}^A}.}
86+
#' \mjsdeqn{\mu_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
9387
#' For this purpose with consider multiple ways of estimation. The first approach is Maximum Likelihood Estimation with corrected
9488
#' log-likelihood function which is given by the following formula
9589
#' \mjsdeqn{
96-
#' \begin{equation}
97-
#' \ell^{*}(\boldsymbol{\theta}) = \sum_{i \in S_{A}}\log \left\lbrace \frac{\pi(\boldsymbol{x}_{i}, \boldsymbol{\theta})}{1 - \pi(\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace + \sum_{i \in S_{B}}d_{i}^{B}\log \left\lbrace 1 - \pi({\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace.
98-
#' \end{equation}}
90+
#' \ell^{*}(\boldsymbol{\theta}) = \sum_{i \in S_{A}}\log \left\lbrace \frac{\pi(\boldsymbol{x}_{i}, \boldsymbol{\theta})}{1 - \pi(\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace + \sum_{i \in S_{B}}d_{i}^{B}\log \left\lbrace 1 - \pi({\boldsymbol{x}_{i},\boldsymbol{\theta})}\right\rbrace.}
9991
#' In the literature main approach is based on `logit` link function when it comes to modelling propensity scores \mjseqn{\pi_i^A},
10092
#' however we expand propensity score model with the additional link functions, such as `cloglog` and `probit`.
10193
#' The pseudo score equations derived from ML methods can be replaced by idea of generalized estimating equations with calibration constraints defined by equations
10294
#' \mjsdeqn{
103-
#' \begin{equation}
104-
#' \mathbf{U}(\boldsymbol{\theta})=\sum_{i \in S_A} \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right)-\sum_{i \in S_B} d_i^B \pi\left(\mathbf{x}_i, \boldsymbol{\theta}\right) \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right).
105-
#' \end{equation}}
95+
#' \mathbf{U}(\boldsymbol{\theta})=\sum_{i \in S_A} \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right)-\sum_{i \in S_B} d_i^B \pi\left(\mathbf{x}_i, \boldsymbol{\theta}\right) \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right).}
10696
#' Notice that for \mjseqn{ \mathbf{h}\left(\mathbf{x}_i, \boldsymbol{\theta}\right) = \frac{\pi(\boldsymbol{x}, \boldsymbol{\theta})}{\boldsymbol{x}}} we do not require probability
10797
#' sample and can use vector of population totals/means.
10898
#'
10999
#' 2. Mass imputation -- This method relies on framework,
110100
#' where imputed values of the outcome variables are created for whole probability sample.
111101
#' In this case we treat big-data sample as a training dataset, which is used to build an imputation model. Using imputed values
112102
#' for probability sample and design (known) weights, we can build population mean estimator of form:
113-
#' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
103+
#' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
114104
#' It opens the the door to very flexible method for imputation model. In the package used generalized linear models from [stats::glm()]
115105
#' nearest neighbour algorithm using [RANN::nn2()] and predictive mean matching.
116106
#'
117107
#' 3. Doubly robust estimation -- The IPW and MI estimators are sensible on misspecified models for propensity score and outcome variable respectively.
118108
#' For this purpose so called doubly-robust methods, which take into account these problems, are presented.
119109
#' It is quite simple idea of combination propensity score and imputation models during inference which lead to the following estimator
120-
#' \mjsdeqn{\mu_{DR} = \frac{1}{N^A} \sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
110+
#' \mjsdeqn{\mu_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
121111
#' In addition, an approach based directly on bias minimisation has been implemented. Following formula
122112
#' \mjsdeqn{
123113
#' \begin{aligned}
@@ -127,7 +117,6 @@ NULL
127117
#' }
128118
#' lead us to system of equations
129119
#' \mjsdeqn{
130-
#' \begin{equation}
131120
#' \begin{aligned}
132121
#' J(\theta, \beta) =
133122
#' \left\lbrace
@@ -140,9 +129,8 @@ NULL
140129
#' - \sum_{i \in \mathcal{S}_{\mathrm{B}}} d_i^{\mathrm{B}} \frac{\partial m(\boldsymbol{x}_i, \boldsymbol{\beta})}{\partial \boldsymbol{\beta}}
141130
#' \end{array} \right\rbrace,
142131
#' \end{aligned}
143-
#' \end{equation}
144132
#' }
145-
#' where \mjseqn{m\left(\boldsymbol{x}_i, \boldsymbol{\beta}\right)} is mass imputation (regression) model for outcome variable and
133+
#' where \mjseqn{m\left(\boldsymbol{x}_{i}, \boldsymbol{\beta}\right)} is mass imputation (regression) model for outcome variable and
146134
#' propensity scores \mjseqn{\pi_i^A} are estimated using `logit` function for the model. As in the `MLE` and `GEE` approaches we have expanded
147135
#' this method on `cloglog` and `probit` links.
148136
#'

R/nonprobDR.R

+11-9
Original file line numberDiff line numberDiff line change
@@ -485,15 +485,15 @@ nonprobDR <- function(selection,
485485
weights_rand = weights_rand,
486486
method_selection = method_selection)
487487
} else if (control_selection$start_type == "naive") {
488-
start_h <- theta_h_estimation(R = R,
489-
X = SelectionModel$X_nons[,1,drop=FALSE],
490-
weights_rand = weights_rand,
491-
weights = weights,
492-
h = h,
493-
method_selection = method_selection,
494-
start = 0,
495-
maxit = maxit,
496-
pop_totals = SelectionModel$pop_totals[1])$theta_h
488+
start_h <- suppressWarnings(theta_h_estimation(R = R,
489+
X = SelectionModel$X_nons[,1,drop=FALSE],
490+
weights_rand = weights_rand,
491+
weights = weights,
492+
h = h,
493+
method_selection = method_selection,
494+
start = 0,
495+
maxit = maxit,
496+
pop_totals = SelectionModel$pop_totals[1])$theta_h)
497497
start <- c(start_h, rep(0, ncol(X) - 1))
498498
}
499499
}
@@ -703,6 +703,8 @@ nonprobDR <- function(selection,
703703
SE_values <- do.call(rbind, SE_values)
704704
rownames(output) <- rownames(confidence_interval) <- rownames(SE_values) <- outcomes$f
705705
names(OutcomeList) <- outcomes$f
706+
if (is.null(pop_size)) pop_size <- N_nons
707+
names(pop_size) <- "pop_size"
706708

707709
SelectionList <- list(coefficients = selection_model$theta_hat,
708710
std_err = theta_standard_errors,

R/nonprobIPW.R

+3-2
Original file line numberDiff line numberDiff line change
@@ -261,15 +261,15 @@ nonprobIPW <- function(selection,
261261
weights_rand = weights_rand,
262262
method_selection = method_selection)
263263
} else if (control_selection$start_type == "naive") {
264-
start_h <- theta_h_estimation(R = R,
264+
start_h <- suppressWarnings(theta_h_estimation(R = R,
265265
X = X[,1,drop=FALSE],
266266
weights_rand = weights_rand,
267267
weights = weights,
268268
h = h,
269269
method_selection = method_selection,
270270
start = 0,
271271
maxit = maxit,
272-
pop_totals = pop_totals[1])$theta_h
272+
pop_totals = pop_totals[1])$theta_h)
273273
start <- c(start_h, rep(0, ncol(X) - 1))
274274
}
275275
}
@@ -457,6 +457,7 @@ nonprobIPW <- function(selection,
457457
SE_values <- do.call(rbind, SE_values)
458458
rownames(output) <- rownames(confidence_interval) <- rownames(SE_values) <- outcomes$f
459459
if (is.null(pop_size)) pop_size <- N # estimated pop_size
460+
names(pop_size) <- "pop_size"
460461

461462
SelectionList <- list(coefficients = selection_model$theta_hat,
462463
std_err = theta_standard_errors,

R/nonprobMI.R

+1
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ nonprobMI <- function(outcome,
287287
SE_values <- do.call(rbind, SE_values)
288288
rownames(output) <- rownames(confidence_interval) <- rownames(SE_values) <- outcomes$f
289289
names(OutcomeList) <- outcomes$f
290+
names(pop_size) <- "pop_size"
290291

291292
structure(
292293
list(X = if(isTRUE(x)) X else NULL,

0 commit comments

Comments
 (0)