Commit c5df762

Merge pull request #26 from ncn-foreigners/dev
Dev
2 parents 79d924d + e8d019d commit c5df762

23 files changed (+239 −113 lines)

DESCRIPTION (+2 −2)

@@ -3,12 +3,12 @@ Type: Package
 Title: Package for Inference Based on Nonprobability Samples
 Version: 0.1.0
 Authors@R:
-    c(person(given = "Lukasz",
+    c(person(given = "Łukasz",
             family = "Chrostowski",
             role = c("aut", "cre"),
             email = "[email protected]"),
      person(given = "Maciej",
-            family = "Beresewicz",
+            family = "Beręsewicz",
             role = c("aut", "ctb"),
             email = "[email protected]"),
      person(given = "Piotr",

R/EstimationMethods.R (+20 −15)

@@ -135,7 +135,7 @@ mle <- function(...) {
   ps_nons_der <- dinv_link(eta_nons)
   est_ps_rand_der <- dinv_link(eta_rand)
 
-  resids <- c(est_ps_rand, ps_nons) - R
+  resids <- R - c(est_ps_rand, ps_nons)
 
   variance <- (t(resids) %*% resids) / df_reduced
 
@@ -293,7 +293,7 @@ gee <- function(...) {
   ps_nons <- inv_link(eta_nons)
   est_ps_rand <- inv_link(eta_rand)
   variance_covariance <- solve(-hess)
-  resids <- c(est_ps_rand, ps_nons) - R
+  resids <- R - c(est_ps_rand, ps_nons)
 
   df_reduced <- nrow(X) - length(theta_hat)
   variance <- as.vector((t(resids) %*% resids) / df_reduced)
@@ -382,15 +382,16 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
   names(theta_hat) <- names(beta_hat) <- colnames(X)
   df_residual <- nrow(X) - length(theta_hat)
 
+  # selection parameters
   ps <- inv_link(theta_hat %*% t(X)) # inv_link(as.vector(X_design %*% as.matrix(theta_hat)))
   eta_sel <- theta_hat %*% t(X)
   ps_der <- dinv_link(eta_sel)
   ps_nons <- ps[loc_nons]
   est_ps_rand <- ps[loc_rand]
   ps_nons_der <- ps_der[loc_nons]
   weights_nons <- 1/ps_nons
-  resids <- c(est_ps_rand, ps_nons) - R
-  variance <- (t(resids) %*% resids) / df_residual
+  resids <- R - c(est_ps_rand, ps_nons)
+  variance <- as.vector((t(resids) %*% resids) / df_residual)
 
   if (!boot) {
     N_nons <- sum(weights * weights_nons)
@@ -407,17 +408,19 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
   y_nons_pred <- y_hat[loc_nons]
 
   if (!boot) {
-    sigma <- family$variance(mu = y_hat, y = y[loc_rand])
-    residuals <- family$residuals(mu = y_rand_pred, y = y[loc_rand])
+    sigma_nons <- family$variance(mu = y_nons_pred, y = y[loc_nons])
+    sigma_rand <- family$variance(mu = y_rand_pred, y = y[loc_rand])
+    residuals <- family$residuals(mu = y_nons_pred, y = y[loc_nons])
   }
 
   if (!boot) {
     # variance-covariance matrix for outcome model
     # vcov_outcome <- solve(t(X_design) %*% diag(sigma) %*% X_design)
-    vcov_outcome <- solve(t(X) %*% (sigma * X))
+    vcov_outcome <- solve(t(X[loc_nons,]) %*% (sigma_nons * X[loc_nons,]))
     beta_errors <- sqrt(diag(vcov_outcome))
   }
 
+  # grad = multiroot$f.root[(p+1):(2*p)]
   if (!boot) {
     hess <- NULL
     selection <- list(theta_hat = theta_hat, # TODO list as close as possible to SelecttionList
@@ -428,24 +431,26 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
                       variance_covariance = vcov_selection,
                       df_residual = df_residual,
                       log_likelihood = "NULL",
-                      linear.predictors = eta_sel,
+                      eta = eta_sel,
                       aic = NULL,
                       residuals = resids,
                       variance = variance,
                       method = method)
 
     outcome <- list(coefficients = beta_hat, # TODO list as close as possible to glm
-                    grad = multiroot$f.root[(p+1):(2*p)],
-                    hess = hess, # TODO
+                    std_err = beta_errors,
                     variance_covariance = vcov_outcome,
                     df_residual = df_residual,
-                    family = list(mu = y_hat,
-                                  variance = sigma[loc_rand],
-                                  residuals = residuals),
+                    family = list(mu = y_nons_pred,
+                                  variance = sigma_nons,
+                                  family = family$family),
+                    residuals = residuals,
+                    fitted.values = y_nons_pred,
+                    sigma_rand = sigma_rand,
                     y_rand_pred = y_rand_pred,
                     y_nons_pred = y_nons_pred,
-                    log_likelihood = "NULL",
-                    linear.predictors = eta_out)
+                    linear.predictors = eta_out[loc_nons],
+                    X = X[loc_nons,])
   } else {
     selection <- list(coefficients = theta_hat, # TODO list as close as possible to SelecttionList
                       ps_nons = ps_nons)
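
The recurring change across `mle()`, `gee()` and `mm()` flips the propensity score residuals to the usual response-residual convention: observed membership indicator minus fitted probability. A small self-contained R sketch of that convention and the dispersion estimate built from it; all values below are made up for illustration, and only the two marked lines mirror the diff:

# R is the sample membership indicator: 0 for probability units, 1 for
# non-probability units, aligned with c(est_ps_rand, ps_nons).
set.seed(123)
est_ps_rand <- runif(3, 0.1, 0.5)        # fitted propensities, probability sample
ps_nons     <- runif(5, 0.2, 0.9)        # fitted propensities, non-probability sample
R <- c(rep(0, 3), rep(1, 5))

resids <- R - c(est_ps_rand, ps_nons)    # as in the diff: observed - fitted

df_reduced <- length(R) - 2              # n minus number of model parameters (illustrative)
variance <- as.vector((t(resids) %*% resids) / df_reduced)  # as in the diff: RSS / df
variance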

R/cloglogModel.R (+4)

@@ -5,6 +5,10 @@
 #'
 #' @param ... Additional, optional arguments.
 #'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
 #' @return List with selected methods/objects/functions.
 #' @importFrom maxLik maxLik
 #' @importFrom Matrix Matrix

R/control.R (+13)

@@ -37,6 +37,10 @@
 #'
 #' @return List with selected parameters.
 #'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
 #' @export
 
 controlSel <- function(method = "glm.fit", # perhaps another control function for model with variables selection
@@ -102,6 +106,11 @@ controlSel <- function(method = "glm.fit", # perhaps another control function for model with variables selection
 #'
 #' @return List with selected parameters.
 #'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
+#'
 #' @export
 
 controlOut <- function(epsilon = 1e-6,
@@ -155,6 +164,10 @@ controlOut <- function(epsilon = 1e-6,
 #'
 #' @return List with selected parameters.
 #'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
 #' @export
 
 controlInf <- function(vars_selection = FALSE,

R/family.R (+7 −4)

@@ -10,21 +10,23 @@ poisson_nonprobsvy <- function(link = "log") {
   structure(list(mu = mu,
                  variance = variance,
                  mu_der = mu_der,
-                 residuals = residuals),
+                 residuals = residuals,
+                 family = "poisson"),
             class = "family")
 }
 
 gaussian_nonprobsvy <- function(link = "identity") {
   mu <- function(eta) eta
-  variance <- function(mu, y = NULL) rep.int(1, length(mu)) #mean((y - mu)^2) rep.int(1, length(mu))
+  variance <- function(mu, y = NULL) mean((y - mu)^2) #rep.int(1, length(mu)) rep.int(1, length(mu))
   mu_der <- function(mu) 1 # first derivative
   mu_der2 <- function(mu) 0 # second derivative
   residuals <- function(mu, y) as.vector(y - mu)
 
   structure(list(mu = mu,
                  variance = variance,
                  mu_der = mu_der,
-                 residuals = residuals),
+                 residuals = residuals,
+                 family = "gaussian"),
             class = "family")
 }
 
@@ -41,6 +43,7 @@ binomial_nonprobsvy <- function(link = "logit") {
   structure(list(mu = mu,
                  variance = variance,
                  mu_der = mu_der,
-                 residuals = residuals),
+                 residuals = residuals,
+                 family = "binomial"),
             class = "family")
 }
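
For context, these helpers return a lightweight list of closures carrying class `"family"`; the commit adds a `family` name string to each object and switches the gaussian `variance` from a constant 1 to the residual mean square. A simplified reconstruction of the gaussian helper as it stands after this diff (only fields visible in the diff and its context lines are included; the usage values below are made up):

gaussian_nonprobsvy <- function(link = "identity") {
  mu        <- function(eta) eta                        # identity link: mean = linear predictor
  variance  <- function(mu, y = NULL) mean((y - mu)^2)  # residual mean square (was rep.int(1, length(mu)))
  mu_der    <- function(mu) 1                           # first derivative
  mu_der2   <- function(mu) 0                           # second derivative
  residuals <- function(mu, y) as.vector(y - mu)        # response residuals

  structure(list(mu = mu,
                 variance = variance,
                 mu_der = mu_der,
                 residuals = residuals,
                 family = "gaussian"),                  # new: family name carried in the object
            class = "family")
}

fam <- gaussian_nonprobsvy()
fam$variance(mu = c(1, 1, 1), y = c(1.2, 0.8, 1.5))     # 0.11
fam$family                                              # "gaussian"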

R/generateData.r (+3 −3)

@@ -16,9 +16,9 @@ NULL
 #' \item{x2 -- the second variable based on z2 and x1}
 #' \item{x3 -- the third variable based on z3 and x1 and x2}
 #' \item{x4 -- the fourth variable based on z4 and x1, x2 and x3}
-#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\epsilon}, so the cor(y,y_hat) = 0.30}
-#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\\epsilon}, so the cor(y,y_hat) = 0.60}
-#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\epsilon}, so the cor(y,y_hat) = 0.80}
+#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.30}
+#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.60}
+#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.80}
 #' \item{rho -- true propensity scores for big data such that sum(rho)=n}
 #' \item{srs -- probabilities of inclusion to random sample such that max(srs)/min(srs)=50}
 #' }
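
The y30/y60/y80 construction fixes sigma so that the correlation between y and the linear predictor hits the target. Under the documented model, cor(y, y_hat) = sd(lp) / sqrt(var(lp) + sigma^2), which can be inverted for sigma. A hedged R sketch of that calibration; the covariates below are plain normals for illustration only, and the package's actual z1..z4-based construction is not reproduced here:

# sigma = sd(lp) * sqrt(1 / rho^2 - 1) gives cor(lp + sigma * eps, lp) = rho
# when eps ~ N(0, 1) is independent of the linear predictor lp.
set.seed(1)
n  <- 1e5
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n); x4 <- rnorm(n)  # placeholder covariates
lp <- 2 + x1 + x2 + x3 + x4                                     # linear predictor y_hat

sigma_for <- function(rho, lp) sd(lp) * sqrt(1 / rho^2 - 1)

y30 <- lp + sigma_for(0.30, lp) * rnorm(n)
y80 <- lp + sigma_for(0.80, lp) * rnorm(n)
round(c(cor(y30, lp), cor(y80, lp)), 2)   # approximately 0.30 and 0.80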

R/internals.R (+1 −1)

@@ -387,7 +387,7 @@ internal_varDR <- function(OutcomeModel,
     svydesign_mean <- survey::svymean(~y_rand, svydesign)
 
     var_prob <- as.vector(attr(svydesign_mean, "var")) # based on survey package, probability component
-    var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigma))/N_nons^2 # nonprobability component
+    var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigma))/N_nons^2 # TODO potential bug here nonprobability component
   } else {
     eta <- as.vector(SelectionModel$X_nons %*% as.matrix(theta))
     h_n <- 1/N_nons * sum(OutcomeModel$y_nons - y_nons_pred) # TODO add weights # errors mean

R/logitModel.R (+6)

@@ -7,9 +7,15 @@
 #'
 #' @return List with selected methods/objects/functions.
 #'
+#' @seealso
+#'
+#' [nonprob()] -- for fitting procedure with non-probability samples.
+#'
 #' @importFrom maxLik maxLik
 #' @importFrom Matrix Matrix
 #' @importFrom survey svyrecvar
+#'
+#'
 #' @export
 
 logit_model_nonprobsvy <- function(...) {

R/main_function_documentation.R (+29 −10)

@@ -6,8 +6,12 @@ NULL
 #' \loadmathjax
 #' @description \code{nonprob} fits a model for inference based on non-probability surveys (including big data) using various methods.
 #' The function allows you to estimate the population mean, having access to a reference probability sample as well as totals/means of covariates.
-#' In the package implemented state-of-the-art approaches recently proposed in the literature: Chen et al. (2020), Yang et al. (2020), Wu (2022) and use `survey` package [Lumley 2004](https://CRAN.R-project.org/package=survey) for inference.
-#' Provided propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and doubly robust estimators that take into account minimization of the asymptotic bias of the population mean estimators,
+#'
+#' The package implements state-of-the-art approaches recently proposed in the literature: Chen et al. (2020),
+#' Yang et al. (2020) and Wu (2022), and uses the `survey` package [Lumley 2004](https://CRAN.R-project.org/package=survey) for inference.
+#'
+#' It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and
+#' doubly robust estimators that take into account minimization of the asymptotic bias of the population mean estimators,
 #' variable selection or overlap between random and non-random samples.
 #' The package uses `survey` package functionality when a probability sample is available.
 #'
@@ -84,7 +88,7 @@ NULL
 #' The inverse probability approach is based on the assumption that a reference probability sample
 #' is available, so the propensity score of the selection mechanism can be estimated.
 #' The estimator has the following form:
-#' \mjsdeqn{\mu_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
+#' \mjsdeqn{\hat{\mu}_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
 #' For this purpose we consider multiple ways of estimation. The first approach is Maximum Likelihood Estimation with a corrected
 #' log-likelihood function, which is given by the following formula
 #' \mjsdeqn{
@@ -101,14 +105,14 @@ NULL
 #' where imputed values of the outcome variables are created for the whole probability sample.
 #' In this case we treat the big-data sample as a training dataset used to build an imputation model. Using imputed values
 #' for the probability sample and the known design weights, we can build a population mean estimator of the form:
-#' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
+#' \mjsdeqn{\hat{\mu}_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
 #' This opens the door to very flexible imputation models. The package uses generalized linear models from [stats::glm()],
 #' the nearest neighbour algorithm from [RANN::nn2()] and predictive mean matching.
 #'
 #' 3. Doubly robust estimation -- The IPW and MI estimators are sensitive to misspecification of the propensity score and outcome models, respectively.
 #' To address this, so-called doubly robust methods, which take these problems into account, are provided.
 #' The idea is to combine the propensity score and imputation models during inference, which leads to the following estimator
-#' \mjsdeqn{\mu_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
+#' \mjsdeqn{\hat{\mu}_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
 #' In addition, an approach based directly on bias minimisation has been implemented, using the following formula
 #' \mjsdeqn{
 #' \begin{aligned}
@@ -191,14 +195,29 @@ NULL
 #' \item{\code{nonprob_size} -- size of non-probability sample}
 #' \item{\code{prob_size} -- size of probability sample}
 #' \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)}
-#' \item{\code{outcome} -- list containing information about fitting of mass imputation model, in case of regression model, object containing list returned by the function.
-#' Additionaly when variables selection model for outcome variable is fitting, list includes of \code{cve} -- the error for each value of `lambda`, averaged accross the cross-validation folds.
-#' [stats::glm()], in case of nearest neigbour imputation object containing list returned by [RANN::nn2()].}
+#' \item{\code{outcome} -- list containing information about the fitted mass imputation model: in the case of a regression model, the list returned by
+#' [stats::glm()]; in the case of nearest neighbour imputation, the list returned by [RANN::nn2()]. If `bias_correction` in [controlInf()] is set to `TRUE`, then estimation is based on
+#' the joint estimating equations for the `selection` and `outcome` models, and therefore the list differs from the one returned by [stats::glm()] and contains elements such as
+#' \itemize{
+#' \item{\code{coefficients} -- estimated coefficients of the regression model}
+#' \item{\code{std_err} -- standard errors of the estimated coefficients}
+#' \item{\code{residuals} -- the response residuals}
+#' \item{\code{variance_covariance} -- the variance-covariance matrix of the coefficient estimates}
+#' \item{\code{df_residual} -- the degrees of freedom for residuals}
+#' \item{\code{family} -- specifies the error distribution and link function used in the model}
+#' \item{\code{fitted.values} -- the predicted values of the response variable based on the fitted model}
+#' \item{\code{linear.predictors} -- the linear fit on link scale}
+#' \item{\code{X} -- the design matrix}
+#' \item{\code{method} -- set to `glm`, since a regression method is used}
+#' }
+#' }
+#' Additionally, when a variable selection model is fitted for the outcome variable, the list includes \code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds.
 #' \item{\code{selection} -- list containing information about fitting of propensity score model, such as
 #' \itemize{
 #' \item{\code{coefficients} -- a named vector of coefficients}
 #' \item{\code{std_err} -- standard errors of the estimated model coefficients}
-#' \item{\code{residuals} -- the working residuals}
+#' \item{\code{residuals} -- the response residuals}
+#' \item{\code{variance} -- the root mean square error}
 #' \item{\code{fitted_values} -- the fitted mean values, obtained by transforming the linear predictors by the inverse of the link function.}
 #' \item{\code{link} -- the `link` object used.}
 #' \item{\code{linear_predictors} -- the linear fit on link scale.}
@@ -208,7 +227,7 @@ NULL
 #' \item{\code{formula} -- the formula supplied.}
 #' \item{\code{df_residual} -- the residual degrees of freedom.}
 #' \item{\code{log_likelihood} -- value of the log-likelihood function for the `mle` method, otherwise `NULL`.}
-#' \item{\code{cve} -- the error for each value of `lambda`, averaged accross the cross-validation folds for variables selection model
+#' \item{\code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds for the variable selection model
 #' when the propensity score model is fitted.}
 #' }
 #' }
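
The three estimators documented above reduce to a few lines of arithmetic once propensity scores, design weights and imputation predictions are in hand. A minimal R sketch; every input below is a simulated placeholder rather than output of the package:

set.seed(42)
n_A <- 500                               # non-probability sample S_A
n_B <- 300                               # probability sample S_B
y_A      <- rnorm(n_A, mean = 5)         # outcome observed in S_A
pi_hat_A <- runif(n_A, 0.1, 0.9)         # estimated propensity scores for S_A
d_hat_A  <- 1 / pi_hat_A                 # estimated weights for S_A
d_B      <- runif(n_B, 10, 50)           # known design weights for S_B
yhat_A   <- rnorm(n_A, mean = 5)         # imputation-model predictions on S_A
yhat_B   <- rnorm(n_B, mean = 5)         # imputation-model predictions on S_B

N_A <- sum(d_hat_A)                      # estimated population size from IPW weights
N_B <- sum(d_B)                          # estimated population size from design weights

mu_IPW <- sum(y_A / pi_hat_A) / N_A                                     # inverse probability weighting
mu_MI  <- sum(d_B * yhat_B) / N_B                                       # mass imputation
mu_DR  <- sum(d_hat_A * (y_A - yhat_A)) / N_A + sum(d_B * yhat_B) / N_B # doubly robust
round(c(IPW = mu_IPW, MI = mu_MI, DR = mu_DR), 3)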
