Dev #26 (Merged)
5 commits merged on Nov 27, 2023
Changes from all commits
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -3,12 +3,12 @@ Type: Package
Title: Package for Inference Based on Nonprobability Samples
Version: 0.1.0
Authors@R:
c(person(given = "Lukasz",
c(person(given = "Łukasz",
family = "Chrostowski",
role = c("aut", "cre"),
email = "[email protected]"),
person(given = "Maciej",
family = "Beresewicz",
family = "Beręsewicz",
role = c("aut", "ctb"),
email = "[email protected]"),
person(given = "Piotr",
35 changes: 20 additions & 15 deletions R/EstimationMethods.R
@@ -135,7 +135,7 @@ mle <- function(...) {
ps_nons_der <- dinv_link(eta_nons)
est_ps_rand_der <- dinv_link(eta_rand)

resids <- c(est_ps_rand, ps_nons) - R
resids <- R - c(est_ps_rand, ps_nons)

variance <- (t(resids) %*% resids) / df_reduced
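Note on the sign convention above: flipping `resids` from `ps - R` to `R - ps` leaves this variance term unchanged, since the quadratic form `t(r) %*% r` is sign-invariant; the flip only matters downstream, where the raw residuals are used directly. A quick check with toy values:

```r
r <- c(0.2, -0.1, 0.4)                  # toy residuals
all.equal(crossprod(r), crossprod(-r))  # TRUE: t(r) %*% r == t(-r) %*% (-r)
```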

@@ -293,7 +293,7 @@ gee <- function(...) {
ps_nons <- inv_link(eta_nons)
est_ps_rand <- inv_link(eta_rand)
variance_covariance <- solve(-hess)
resids <- c(est_ps_rand, ps_nons) - R
resids <- R - c(est_ps_rand, ps_nons)

df_reduced <- nrow(X) - length(theta_hat)
variance <- as.vector((t(resids) %*% resids) / df_reduced)
@@ -382,15 +382,16 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
names(theta_hat) <- names(beta_hat) <- colnames(X)
df_residual <- nrow(X) - length(theta_hat)

# selection parameters
ps <- inv_link(theta_hat %*% t(X)) # inv_link(as.vector(X_design %*% as.matrix(theta_hat)))
eta_sel <- theta_hat %*% t(X)
ps_der <- dinv_link(eta_sel)
ps_nons <- ps[loc_nons]
est_ps_rand <- ps[loc_rand]
ps_nons_der <- ps_der[loc_nons]
weights_nons <- 1/ps_nons
resids <- c(est_ps_rand, ps_nons) - R
variance <- (t(resids) %*% resids) / df_residual
resids <- R - c(est_ps_rand, ps_nons)
variance <- as.vector( (t(resids) %*% resids) / df_residual)

if (!boot) {
N_nons <- sum(weights * weights_nons)
@@ -407,17 +408,19 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
y_nons_pred <- y_hat[loc_nons]

if (!boot) {
sigma <- family$variance(mu = y_hat, y = y[loc_rand])
residuals <- family$residuals(mu = y_rand_pred, y = y[loc_rand])
sigma_nons <- family$variance(mu = y_nons_pred, y = y[loc_nons])
sigma_rand <- family$variance(mu = y_rand_pred, y = y[loc_rand])
residuals <- family$residuals(mu = y_nons_pred, y = y[loc_nons])
}

if (!boot) {
# variance-covariance matrix for outcome model
# vcov_outcome <- solve(t(X_design) %*% diag(sigma) %*% X_design)
vcov_outcome <- solve(t(X) %*% (sigma * X))
vcov_outcome <- solve(t(X[loc_nons,]) %*% (sigma_nons * X[loc_nons,]))
beta_errors <- sqrt(diag(vcov_outcome))
}

# grad = multiroot$f.root[(p+1):(2*p)]
if (!boot) {
hess <- NULL
selection <- list(theta_hat = theta_hat, # TODO list as close as possible to SelectionList
@@ -428,24 +431,26 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
variance_covariance = vcov_selection,
df_residual = df_residual,
log_likelihood = "NULL",
linear.predictors = eta_sel,
eta = eta_sel,
aic = NULL,
residuals = resids,
variance = variance,
method = method)

outcome <- list(coefficients = beta_hat, # TODO list as close as possible to glm
grad = multiroot$f.root[(p+1):(2*p)],
hess = hess, # TODO
std_err = beta_errors,
variance_covariance = vcov_outcome,
df_residual = df_residual,
family = list(mu = y_hat,
variance = sigma[loc_rand],
residuals = residuals),
family = list(mu = y_nons_pred,
variance = sigma_nons,
family = family$family),
residuals = residuals,
fitted.values = y_nons_pred,
sigma_rand = sigma_rand,
y_rand_pred = y_rand_pred,
y_nons_pred = y_nons_pred,
log_likelihood = "NULL",
linear.predictors = eta_out)
linear.predictors = eta_out[loc_nons],
X = X[loc_nons,])
} else {
selection <- list(coefficients = theta_hat, # TODO list as close as possible to SelectionList
ps_nons = ps_nons)
4 changes: 4 additions & 0 deletions R/cloglogModel.R
@@ -5,6 +5,10 @@
#'
#' @param ... Additional, optional arguments.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @return List with selected methods/objects/functions.
#' @importFrom maxLik maxLik
#' @importFrom Matrix Matrix
13 changes: 13 additions & 0 deletions R/control.R
@@ -37,6 +37,10 @@
#'
#' @return List with selected parameters.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @export

controlSel <- function(method = "glm.fit", # perhaps another control function for model with variables selection
@@ -102,6 +106,11 @@ controlSel <- function(method = "glm.fit", # perhaps another control function fo
#'
#' @return List with selected parameters.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#'
#' @export

controlOut <- function(epsilon = 1e-6,
@@ -155,6 +164,10 @@ controlOut <- function(epsilon = 1e-6,
#'
#' @return List with selected parameters.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @export

controlInf <- function(vars_selection = FALSE,
11 changes: 7 additions & 4 deletions R/family.R
@@ -10,21 +10,23 @@ poisson_nonprobsvy <- function(link = "log") {
structure(list(mu = mu,
variance = variance,
mu_der = mu_der,
residuals = residuals),
residuals = residuals,
family = "poisson"),
class = "family")
}

gaussian_nonprobsvy <- function(link = "identity") {
mu <- function(eta) eta
variance <- function(mu, y = NULL) rep.int(1, length(mu)) #mean((y - mu)^2) rep.int(1, length(mu))
variance <- function(mu, y = NULL) mean((y - mu)^2) #rep.int(1, length(mu)) rep.int(1, length(mu))
Review comment (Member):

Just as a reminder: you don't have to write all of those for every exponential family you want to use; you can also just do something simpler, like:

gaussian_nonprobsvy <- function(link = "identity") {
  x <- stats::gaussian(link = link)
  x$mu_der <- function(mu) 1
  # Sooner or later you will also probably need a class for those as well
  class(x) <- c(class(x), "nonprobsvy_family")
  x  # return the patched family object
}
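A quick usage sketch of this suggestion (note the explicit `x` return, which the wrapper needs to hand the object back):

```r
fam <- gaussian_nonprobsvy()
fam$family                          # "gaussian", inherited from stats::gaussian()
fam$mu_der(1.5)                     # 1, the extra slot patched in
inherits(fam, "nonprobsvy_family")  # TRUE, thanks to the added class
```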

mu_der <- function(mu) 1 # first derivative
mu_der2 <- function(mu) 0 # second derivative
residuals <- function(mu, y) as.vector(y - mu)

structure(list(mu = mu,
variance = variance,
mu_der = mu_der,
residuals = residuals),
residuals = residuals,
family = "gaussian"),
class = "family")
}

Expand All @@ -41,6 +43,7 @@ binomial_nonprobsvy <- function(link = "logit") {
structure(list(mu = mu,
variance = variance,
mu_der = mu_der,
residuals = residuals),
residuals = residuals,
family = "binomial"),
class = "family")
}
6 changes: 3 additions & 3 deletions R/generateData.r
@@ -16,9 +16,9 @@ NULL
#' \item{x2 -- the second variable based on z2 and x1}
#' \item{x3 -- the third variable based on z3 and x1 and x2}
#' \item{x4 -- the fourth variable based on z4 and x1, x2 and x3}
#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\epsilon}, so the cor(y,y_hat) = 0.30}
#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\\epsilon}, so the cor(y,y_hat) = 0.60}
#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma*\epsilon}, so the cor(y,y_hat) = 0.80}
#' \item{\mjseqn{y30} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.30}
#' \item{\mjseqn{y60} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.60}
#' \item{\mjseqn{y80} -- \mjseqn{y} generated from the model \mjseqn{y=2+x1+x2+x3+x4+\sigma \cdot \varepsilon}, so the cor(y,y_hat) = 0.80}
#' \item{rho -- true propensity scores for big data such that sum(rho)=n}
#' \item{srs -- probabilities of inclusion to random sample such that max(srs)/min(srs)=50}
#' }
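As an aside on how such correlation targets can be hit (a sketch of the standard trick, not necessarily the package's generator, which is collapsed out of this diff): with y = 2 + x1 + x2 + x3 + x4 + sigma*epsilon and epsilon ~ N(0, 1), cor(y, y_hat) = sqrt(var(lp) / (var(lp) + sigma^2)) for the linear predictor lp, so sigma can be solved from the target correlation:

```r
# pick sigma so that cor(y, lp) is approximately rho (assumes eps ~ N(0, 1))
sigma_for_cor <- function(lp, rho) sqrt(var(lp) * (1 / rho^2 - 1))

set.seed(1)
lp <- 2 + rowSums(matrix(rnorm(4000), ncol = 4))  # toy linear predictor
sigma <- sigma_for_cor(lp, rho = 0.30)
y <- lp + sigma * rnorm(length(lp))
cor(y, lp)  # ~ 0.30
```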
2 changes: 1 addition & 1 deletion R/internals.R
@@ -387,7 +387,7 @@ internal_varDR <- function(OutcomeModel,
svydesign_mean <- survey::svymean(~y_rand, svydesign)

var_prob <- as.vector(attr(svydesign_mean, "var")) # based on survey package, probability component
var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigma))/N_nons^2 # nonprobability component
var_nonprob <- (sum((infl1) - 2*infl2) + sum(weights_rand * sigma))/N_nons^2 # TODO potential bug here nonprobability component
} else {
eta <- as.vector(SelectionModel$X_nons %*% as.matrix(theta))
h_n <- 1/N_nons * sum(OutcomeModel$y_nons - y_nons_pred) # TODO add weights # errors mean
6 changes: 6 additions & 0 deletions R/logitModel.R
@@ -7,9 +7,15 @@
#'
#' @return List with selected methods/objects/functions.
#'
#' @seealso
#'
#' [nonprob()] -- for the fitting procedure with non-probability samples.
#'
#' @importFrom maxLik maxLik
#' @importFrom Matrix Matrix
#' @importFrom survey svyrecvar
#'
#'
#' @export

logit_model_nonprobsvy <- function(...) {
39 changes: 29 additions & 10 deletions R/main_function_documentation.R
@@ -6,8 +6,12 @@ NULL
#' \loadmathjax
#' @description \code{nonprob} fits a model for inference based on non-probability surveys (including big data) using various methods.
#' The function allows you to estimate the population mean, given access to a reference probability sample or to total/mean values of covariates.
#' In the package implemented state-of-the-art approaches recently proposed in the literature: Chen et al. (2020), Yang et al. (2020), Wu (2022) and use `survey` package [Lumley 2004](https://CRAN.R-project.org/package=survey) for inference.
#' Provided propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and doubly robust estimators that take into account minimization of the asymptotic bias of the population mean estimators,
#'
#' The package implements state-of-the-art approaches recently proposed in the literature: Chen et al. (2020),
#' Yang et al. (2020) and Wu (2022), and uses the `survey` package [Lumley 2004](https://CRAN.R-project.org/package=survey) for inference.
#'
#' It provides propensity score weighting (e.g. with calibration constraints), mass imputation (e.g. nearest neighbour) and
#' doubly robust estimators that take into account minimization of the asymptotic bias of the population mean estimators,
#' variable selection, and overlap between the random and non-random samples.
#' The package uses `survey` package functionalities when a probability sample is available.
#'
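Before the estimator details, a sketch of the call shape (the argument names `selection`, `outcome`, `data` and `svydesign` are assumed from this documentation, not verified against the final signature):

```r
# hypothetical doubly robust call: propensity model plus imputation model;
# `prob_design` would be a survey::svydesign object for the reference sample
fit <- nonprob(
  selection = ~ x1 + x2,       # propensity score (selection) model
  outcome   = y ~ x1 + x2,     # mass imputation (outcome) model
  data      = nonprob_sample,  # the non-probability sample
  svydesign = prob_design
)
```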
@@ -84,7 +88,7 @@ NULL
#' The inverse probability approach is based on the assumption that a reference probability sample
#' is available, so that the propensity score of the selection mechanism can be estimated.
#' The estimator has the following form:
#' \mjsdeqn{\mu_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
#' \mjsdeqn{\hat{\mu}_{IPW} = \frac{1}{N^{A}}\sum_{i \in S_{A}} \frac{y_{i}}{\hat{\pi}_{i}^{A}}.}
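Once the propensity scores pi_i^A are estimated, the estimator itself is a one-liner; a minimal sketch with hypothetical vectors `y_nons` (outcomes) and `ps_hat` (estimated propensity scores):

```r
# IPW estimate of the population mean from the non-probability sample S_A
N_hat  <- sum(1 / ps_hat)               # estimated population size N^A
mu_ipw <- sum(y_nons / ps_hat) / N_hat
```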
#' For this purpose we consider multiple ways of estimation. The first approach is maximum likelihood estimation with a corrected
#' log-likelihood function, which is given by the following formula
#' \mjsdeqn{
@@ -101,14 +105,14 @@ NULL
#' where imputed values of the outcome variables are created for the whole probability sample.
#' In this case we treat the big-data sample as a training dataset used to build an imputation model. Using the imputed values
#' for the probability sample and the known design weights, we can build a population mean estimator of the form:
#' \mjsdeqn{\mu_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
#' \mjsdeqn{\hat{\mu}_{MI} = \frac{1}{N^B}\sum_{i \in S_{B}} d_{i}^{B} \hat{y}_i.}
#' This opens the door to very flexible methods for the imputation model. The package uses generalized linear models from [stats::glm()],
#' the nearest neighbour algorithm using [RANN::nn2()], and predictive mean matching.
#'
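A minimal sketch of this estimator, with hypothetical vectors `d_rand` (design weights d_i^B) and `y_hat_rand` (imputed outcomes for the probability sample):

```r
# MI estimate of the population mean: design-weighted mean of imputed values
N_B   <- sum(d_rand)                    # estimated population size N^B
mu_mi <- sum(d_rand * y_hat_rand) / N_B
```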
#' 3. Doubly robust estimation -- The IPW and MI estimators are sensitive to misspecification of the models for the propensity score and outcome variable, respectively.
#' So-called doubly robust methods, which take these problems into account, address this issue.
#' The idea is quite simple: combining the propensity score and imputation models during inference leads to the following estimator
#' \mjsdeqn{\mu_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
#' \mjsdeqn{\hat{\mu}_{DR} = \frac{1}{N^A}\sum_{i \in S_A} \hat{d}_i^A (y_i - \hat{y}_i) + \frac{1}{N^B}\sum_{i \in S_B} d_i^B \hat{y}_i.}
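Combining the two pieces above, a sketch under the same hypothetical names, with `y_hat_nons` for the imputation-model predictions on the non-probability sample:

```r
# DR estimate: IPW-weighted residual correction plus the MI term
mu_dr <- sum((y_nons - y_hat_nons) / ps_hat) / sum(1 / ps_hat) +
  sum(d_rand * y_hat_rand) / sum(d_rand)
```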
#' In addition, an approach based directly on bias minimisation has been implemented. The following formula
#' \mjsdeqn{
#' \begin{aligned}
@@ -191,14 +195,29 @@ NULL
#' \item{\code{nonprob_size} -- size of non-probability sample}
#' \item{\code{prob_size} -- size of probability sample}
#' \item{\code{pop_size} -- estimated population size derived from estimated weights (non-probability sample) or known design weights (probability sample)}
#' \item{\code{outcome} -- list containing information about fitting of mass imputation model, in case of regression model, object containing list returned by the function.
#' Additionaly when variables selection model for outcome variable is fitting, list includes of \code{cve} -- the error for each value of `lambda`, averaged accross the cross-validation folds.
#' [stats::glm()], in case of nearest neigbour imputation object containing list returned by [RANN::nn2()].}
#' \item{\code{outcome} -- list containing information about the fitting of the mass imputation model; in the case of a regression model, it contains the list returned by
#' [stats::glm()], and in the case of nearest neighbour imputation, the list returned by [RANN::nn2()]. If `bias_correction` in [controlInf()] is set to `TRUE`, then estimation is based on
#' the joint estimating equations for the `selection` and `outcome` models, and therefore the list differs from the one returned by [stats::glm()] and contains elements such as
#' \itemize{
#' \item{\code{coefficients} -- estimated coefficients of the regression model}
#' \item{\code{std_err} -- standard errors of the estimated coefficients}
#' \item{\code{residuals} -- the response residuals}
#' \item{\code{variance_covariance} -- the variance-covariance matrix of the coefficient estimates}
#' \item{\code{df_residual} -- the degrees of freedom for residuals}
#' \item{\code{family} -- the error distribution and link function used in the model}
#' \item{\code{fitted.values} -- the predicted values of the response variable based on the fitted model}
#' \item{\code{linear.predictors} -- the linear fit on the link scale}
#' \item{\code{X} -- the design matrix}
#' \item{\code{method} -- set to `glm`, since a regression method was used}
#' }
#' }
#' Additionally, when a variable selection model is fitted for the outcome variable, the list includes \code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds.
#' \item{\code{selection} -- list containing information about the fitting of the propensity score model, such as
#' \itemize{
#' \item{\code{coefficients} -- a named vector of coefficients}
#' \item{\code{std_err} -- standard errors of the estimated model coefficients}
#' \item{\code{residuals} -- the working residuals}
#' \item{\code{residuals} -- the response residuals}
#' \item{\code{variance} -- the root mean square error}
#' \item{\code{fitted_values} -- the fitted mean values, obtained by transforming the linear predictors by the inverse of the link function.}
#' \item{\code{link} -- the `link` object used.}
#' \item{\code{linear_predictors} -- the linear fit on link scale.}
Expand All @@ -208,7 +227,7 @@ NULL
#' \item{\code{formula} -- the formula supplied.}
#' \item{\code{df_residual} -- the residual degrees of freedom.}
#' \item{\code{log_likelihood} -- value of the log-likelihood function if the `mle` method was used, otherwise `NULL`.}
#' \item{\code{cve} -- the error for each value of `lambda`, averaged accross the cross-validation folds for variables selection model
#' \item{\code{cve} -- the error for each value of `lambda`, averaged across the cross-validation folds for the variable selection model
#' when the propensity score model is fitted.}
#' }
#' }
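A hypothetical peek at these slots after a fit, with `fit` standing in for a returned `nonprob` object (slot names as documented above):

```r
fit$pop_size                 # estimated population size
fit$selection$coefficients   # propensity score model coefficients
fit$selection$std_err        # their standard errors
fit$outcome$coefficients     # imputation model coefficients (bias-correction case)
```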