From dbdee1112483587a1a6045cf6944833f33287dd6 Mon Sep 17 00:00:00 2001 From: BERENZ Date: Sat, 1 Feb 2025 10:46:57 +0100 Subject: [PATCH] testing quarto vignette --- DESCRIPTION | 4 +- vignettes/bibliography.bib | 1057 ++++++++++++++++++++++ vignettes/nonprobsvy-getting-started.qmd | 198 ++++ 3 files changed, 1258 insertions(+), 1 deletion(-) create mode 100644 vignettes/bibliography.bib create mode 100644 vignettes/nonprobsvy-getting-started.qmd diff --git a/DESCRIPTION b/DESCRIPTION index 55bf22e..6dff0a7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,6 +28,7 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 URL: https://github.com/ncn-foreigners/nonprobsvy, https://ncn-foreigners.github.io/nonprobsvy/ BugReports: https://github.com/ncn-foreigners/nonprobsvy/issues +VignetteBuilder: quarto Depends: R (>= 4.0.0), survey @@ -49,7 +50,8 @@ Suggests: tinytest, covr, sampling, - spelling + spelling, + quarto LinkingTo: Rcpp, RcppArmadillo diff --git a/vignettes/bibliography.bib b/vignettes/bibliography.bib new file mode 100644 index 0000000..d9ab0dc --- /dev/null +++ b/vignettes/bibliography.bib @@ -0,0 +1,1057 @@ +@article{chen2020doubly, + title={Doubly robust inference with nonprobability survey samples}, + author={Chen, Yilin and Li, Pengfei and Wu, Changbao}, + journal={Journal of the American Statistical Association}, + volume={115}, + number={532}, + pages={2011--2021}, + year={2020}, + publisher={Taylor \& Francis} +} +@article{harms2006calibration, + title={On calibration estimation for quantiles}, + author={Harms, Torsten and Duchesne, Pierre}, + journal={Survey Methodology}, + volume={32}, + number={1}, + pages={37--52}, + year={2006} +} + +@report{ballerini2024mapping, + title={Report on mapping, harmonising and integrating novel data sources for research purposes}, + author={Ballerini, V. and Beraldo, D. and Bocci, C. and Braito, L. and Milana, R. and Trans, M.}, + journal={SPES Report no. 
4.1, SPES project -- Sustainability Performances, Evidence and Scenarios}, + year={2024}, + institution={University of Florence}, + address={Florence} +} + +@misc{chlebicki2025, + title={Data integration of non-probability and probability samples with predictive mean matching}, + author={Piotr Chlebicki and Łukasz Chrostowski and Maciej Beręsewicz}, + year={2024}, + eprint={2403.13750}, + archivePrefix={arXiv}, + primaryClass={stat.ME}, + url={https://arxiv.org/abs/2403.13750}, +} + +@article{lumley2004, + author = {Thomas Lumley}, + title = {survey R package}, + journal = {}, + year = {2004}, +} + + +@article{deville1992calibration, + title={Calibration estimators in survey sampling}, + author={Deville, Jean-Claude and S{\"a}rndal, Carl-Erik}, + journal={Journal of the American Statistical Association}, + volume={87}, + number={418}, + pages={376--382}, + year={1992}, + publisher={Taylor \& Francis} +} + + +@article{yang_asymptotic_2020, + title = {Asymptotic theory and inference of predictive mean matching imputation using a superpopulation model framework}, + volume = {47}, + issn = {0303-6898, 1467-9469}, + url = {https://onlinelibrary.wiley.com/doi/10.1111/sjos.12429}, + doi = {10.1111/sjos.12429}, + abstract = {Predictive mean matching imputation is popular for handling item nonresponse in survey sampling. In this article, we study the asymptotic properties of the predictive mean matching estimator for finite-population inference using a superpopulation model framework. We also clarify conditions for its robustness. For variance estimation, the conventional bootstrap inference is invalid for matching estimators with a fixed number of matches due to the nonsmoothness nature of the matching estimator. We propose a new replication variance estimator, which is asymptotically valid. The key strategy is to construct replicates directly based on the linear terms of the martingale representation for the matching estimator, instead of individual records of variables. 
Simulation studies confirm that the proposed method provides valid inference.}, + language = {en}, + number = {3}, + urldate = {2023-12-10}, + journal = {Scandinavian Journal of Statistics}, + author = {Yang, Shu and Kim, Jae Kwang}, + month = sep, + year = {2020}, + pages = {839--861}, +} + +@article{kim2019sampling, + title={Sampling techniques for big data analysis}, + author={Kim, Jae-Kwang and Wang, Zhonglei}, + journal={International Statistical Review}, + volume={87}, + pages={S177--S191}, + year={2019}, + publisher={Wiley Online Library} +} + +@article{yang2021integration, + title={Integration of data from probability surveys and big found data for finite population inference using mass imputation}, + author={Yang, Shu and Kim, Jae-Kwang and Hwang, Youngdeok}, + journal = {Survey Methodology}, +volume = {47}, + issue = {1}, +pages = {29-58}, + year={2021} +} + +@Manual{rcran, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2023}, + url = {https://www.R-project.org/}, + } + +@Manual{sampling, + title = {sampling: Survey Sampling}, + author = {Yves Tillé and Alina Matei}, + year = {2021}, + note = {R package version 2.9}, + url = {https://CRAN.R-project.org/package=sampling}, + } + +@Manual{GJRM, + title = {GJRM: Generalized Joint Regression Modelling}, + pages = {R package version 0.2-6.4}, + year = {2023}, + author = {Giampero Marra and Rosalba Rodicw} + } + +@Manual{Rcpp, +title = {Rcpp: Seamless R and C++ Integration}, +author = {Dirk Eddelbuettel and Romain Francois and JJ Allaire and + Kevin Ushey and Qiang Kou and Nathan Russell and Inaki Ucar and + Douglas Bates and John Chambers}, +year = {2024}, +note = {R package version 1.0.13}, +url = {https://CRAN.R-project.org/package=Rcpp}, +} + +@Article{RcppArmadillo, + title = {RcppArmadillo: Accelerating R with high-performance C++ linear algebra}, + author = 
{Dirk Eddelbuettel and Conrad Sanderson}, + journal = {Computational Statistics and Data Analysis}, + year = {2014}, + volume = {71}, + month = {March}, + pages = {1054--1063}, + doi = {10.1016/j.csda.2013.02.005}, +} +@Manual{foreach, + title = {foreach: Provides Foreach Looping Construct for R}, + author = {Folashade Daniel Hong Ooi, Rich Calaway and Steve Weston}, + year = {2023}, + note = {R package version 1.5.2}, + url = {https://CRAN.R-project.org/package=foreach}, +} + +@Manual{maxLik, + title = {maxLik: Maximum Likelihood Estimation and Related Tools}, + author = {Arne Henningsen and Ott Toomet}, + year = {2023}, + note = {R package version 1.8-5}, + url = {https://CRAN.R-project.org/package=maxLik}, +} + +@Manual{Matrix, + title = {Matrix: Sparse and Dense Matrix Classes and Methods}, + author = {Douglas Bates and Martin Maechler and others}, + year = {2023}, + note = {R package version 1.6-1.1}, + url = {https://CRAN.R-project.org/package=Matrix}, +} + +@Manual{MASS, + title = {MASS: Support Functions and Datasets for Venables and Ripley's MASS}, + author = {Brian Ripley and William Venables and others}, + year = {2023}, + note = {R package version 7.3-60}, + url = {https://CRAN.R-project.org/package=MASS}, +} + +@Article{ncvreg, + author = {Patrick Breheny and Jian Huang}, + title = {Coordinate descent algorithms for nonconvex penalized regression, + with applications to biological feature selection}, + journal = {Annals of Applied Statistics}, + year = {2011}, + volume = {5}, + pages = {232--253}, + number = {1}, + doi = {10.1214/10-AOAS388}, + url = {https://doi.org/10.1214/10-AOAS388}, + } + +@Manual{mathjaxr, + title = {mathjaxr: Using MathJax in Rd Files for Dynamic Rendering of Equations}, + author = {Sacha Epskamp}, + year = {2023}, + note = {R package version 1.6-0}, + url = {https://CRAN.R-project.org/package=mathjaxr}, +} + +@Manual{nleqslv, + title = {nleqslv: Solve Systems of Nonlinear Equations}, + author = {Uwe Groemping}, + year = {2023}, 
+ note = {R package version 3.3.3}, + url = {https://CRAN.R-project.org/package=nleqslv}, +} + +@Manual{doParallel, + title = {doParallel: Foreach Parallel Adaptor for the 'parallel' Package}, + author = {Steve Weston, Folashade Daniel, Steve Weston and Dan Tenenbaum}, + year = {2022}, + note = {R package version 1.0.17}, + url = {https://CRAN.R-project.org/package=doParallel}, +} + + +@Manual{NonProbEst-pkg, + title = {NonProbEst: Estimation in Nonprobability Sampling}, + pages = {R package version 0.2.4}, + year = {2020}, + author = {Luis Castro Martín, Ramón Ferri García and María del Mar Rueda}, +} + +@article{anastasiade2017decomposition, + title={Decomposition of gender wage inequalities through calibration: application to the Swiss structure of earnings survey}, + author={Anastasiade, Mihaela-Catalina and Tille, Yves}, + journal={Survey Methodology}, + volume={43}, + number={2}, + pages={211--235}, + year={2017}, + publisher={Statistics Canada} +} + + +@article{haziza2016discussion, + title={A discussion of weighting procedures for unit nonresponse}, + author={Haziza, David and Lesage, {\'E}ric}, + journal={Journal of Official Statistics}, + volume={32}, + number={1}, + pages={129--145}, + year={2016} +} + +@book{sarndal2005estimation, + title={Estimation in surveys with nonresponse}, + author={S{\"a}rndal, Carl-Erik and Lundstr{\"o}m, Sixten}, + year={2005}, + publisher={John Wiley \& Sons} +} + +@article{duchesne1999robust, + title={Robust calibration estimators}, + author={Duchesne, PIERRE}, + journal={Survey Methodology}, + volume={25}, + pages={43--56}, + year={1999}, + publisher={Statistics Canada} +} + +@article{chen2017approaches, + ISSN = {08834237, 21688745}, + URL = {http://www.jstor.org/stable/26408227}, + author = {Qixuan Chen and Michael R. Elliott and David Haziza and Ye Yang and Malay Ghosh and Roderick J. A. 
Little and Joseph Sedransk and Mary Thompson}, + journal = {Statistical Science}, + number = {2}, + pages = {227--248}, + publisher = {Institute of Mathematical Statistics}, + title = {Approaches to Improving Survey-Weighted Estimates}, + urldate = {2023-08-08}, + volume = {32}, + year = {2017} +} + +@article{tsung2018model, + title={Model-assisted calibration of non-probability sample survey data using adaptive {L}{A}{S}{S}{O}.}, + author={Tsung, Chen and Kuang, Jack and Valliant, Richard L and Elliott, Michael R}, + journal={Survey Methodology}, + volume={44}, + number={1}, + pages={117--145}, + year={2018}, + publisher={Statistics Canada} +} + +@article{chen2019calibrating, + title={Calibrating non-probability surveys to estimated control totals using {L}{A}{S}{S}{O}, with an application to political polling}, + author={Chen, Jack Kuang Tsung and Valliant, Richard L and Elliott, Michael R}, + journal={Journal of the Royal Statistical Society Series C: Applied Statistics}, + volume={68}, + number={3}, + pages={657--681}, + year={2019}, + publisher={Oxford University Press} +} + +@article{sarndal2007calibration, + title={The calibration approach in survey theory and practice}, + author={S{\"a}rndal, Carl-Erik}, + journal={Survey methodology}, + volume={33}, + number={2}, + pages={99--119}, + year={2007} +} + +@article{kott2010using, + title={Using calibration weighting to adjust for nonignorable unit nonresponse}, + author={Kott, Phillip S and Chang, Ted}, + journal={Journal of the American Statistical Association}, + volume={105}, + number={491}, + pages={1265--1275}, + year={2010}, + publisher={Taylor \& Francis} +} + +@article{chen2002using, + title={Using empirical likelihood methods to obtain range restricted weights in regression estimators for surveys}, + author={Chen, J and Sitter, RR and Wu, C}, + journal={Biometrika}, + volume={89}, + number={1}, + pages={230--237}, + year={2002}, + publisher={Oxford University Press} +} + +@book{wu2020sampling, + 
title={Sampling theory and practice}, + author={Wu, Changbao and Thompson, Mary E}, + year={2020}, + publisher={Springer} +} + +@article{wu2022statistical, + title={Statistical inference with non-probability survey samples}, + author={Wu, Changbao}, + journal={Survey Methodology}, + volume={48}, + pages={283--311}, + year={2022} +} + +@article{beaumont2020probability, + title={Are probability surveys bound to disappear for the production of official statistics}, + author={Beaumont, Jean-Francois}, + journal={Survey Methodology}, + volume={46}, + number={1}, + pages={1--28}, + year={2020}, + publisher={Statistics Canada} +} + +@article{bethlehem2014using, + title={Using Web Panels for Official Statistics}, + author={Bethlehem, Jelke}, +pages={1--9}, +journal={Proceedings of Statistics Canada Symposium 2014 +}, + year={2014} +} + +@article{elliott_inference_2017, + title = {Inference for {Nonprobability} {Samples}}, + volume = {32}, + issn = {0883-4237}, + url = {https://projecteuclid.org/journals/statistical-science/volume-32/issue-2/Inference-for-Nonprobability-Samples/10.1214/16-STS598.full}, + doi = {10.1214/16-STS598}, + abstract = {Although selecting a probability sample has been the standard for decades when making inferences from a sample to a finite population, incentives are increasing to use nonprobability samples. In a world of “big data”, large amounts of data are available that are faster and easier to collect than are probability samples. Design-based inference, in which the distribution for inference is generated by the random mechanism used by the sampler, cannot be used for nonprobability samples. One alternative is quasi-randomization in which pseudo-inclusion probabilities are estimated based on covariates available for samples and nonsample units. Another is superpopulation modeling for the analytic variables collected on the sample units in which the model is used to predict values for the nonsample units. 
We discuss the pros and cons of each approach.}, + language = {en}, + number = {2}, + urldate = {2024-01-09}, + journal = {Statistical Science}, + author = {Elliott, Michael R. and Valliant, Richard}, + month = may, + year = {2017}, +} + +@article{kim_combining_2021, + title = {Combining {Non}-{Probability} and {Probability} {Survey} {Samples} {Through} {Mass} {Imputation}}, + volume = {184}, + issn = {0964-1998, 1467-985X}, + url = {https://academic.oup.com/jrsssa/article/184/3/941/7068406}, + doi = {10.1111/rssa.12696}, + abstract = {Abstract + Analysis of non-probability survey samples requires auxiliary information at the population level. Such information may also be obtained from an existing probability survey sample from the same finite population. Mass imputation has been used in practice for combining non-probability and probability survey samples and making inferences on the parameters of interest using the information collected only in the non-probability sample for the study variables. Under the assumption that the conditional mean function from the non-probability sample can be transported to the probability sample, we establish the consistency of the mass imputation estimator and derive its asymptotic variance formula. Variance estimators are developed using either linearization or bootstrap. Finite sample performances of the mass imputation estimator are investigated through simulation studies. 
We also address important practical issues of the method through the analysis of a real-world non-probability survey sample collected by the Pew Research Centre.}, + language = {en}, + number = {3}, + urldate = {2023-11-25}, + journal = {Journal of the Royal Statistical Society Series A: Statistics in Society}, + author = {Kim, Jae Kwang and Park, Seho and Chen, Yilin and Wu, Changbao}, + month = jul, + year = {2021}, + pages = {941--963}, +} + + +@article{kim_theory_2012, + title = {Some theory for propensity-score-adjustment estimators in survey sampling}, + volume = {38}, + abstract = {The propensity-scoring-adjustment approach is commonly used to handle selection bias in survey sampling applications, including unit nonresponse and undercoverage. The propensity score is computed using auxiliary variables observed throughout the sample. We discuss some asymptotic properties of propensity-score-adjusted estimators and derive optimal estimators based on a regression model for the finite population. An optimal propensity-score-adjusted estimator can be implemented using an augmented propensity model. Variance estimation is discussed and the results from two simulation studies are presented.}, + language = {en}, + number = {2}, + year = {2012}, + journal = {Survey Methodology}, + author = {Kim, Jae Kwang and Riddles, Minsun Kim}, + pages = {157--165}, +} + + +@article{ai_simple_2020, + title = {A {Simple} and {Efficient} {Estimation} {Method} for {Models} with {Nonignorable} {Missing} {Data}}, + volume = {30}, + issn = {10170405}, + url = {http://www3.stat.sinica.edu.tw/statistica/J30N4/J30N412/J30N412.html}, + doi = {10.5705/ss.202018.0107}, + abstract = {This paper proposes a simple and efficient generalized method of moments (GMM) estimation for a model with non-ignorable missing data. In contrast to the existing the GMM estimation with a fixed number of moments, we allow the number of moments to grow with the sample size and use optimal weighting. 
Hence, our estimator is efficient, attaining the semiparametric efficiency bound derived in the literature. Existing semiparametric estimators estimate an efficient score. However, this approach is either locally efficient, or it suffers from the curse of dimensionality and the bandwidth selection problem. In contrast, our estimator does not suffer from these problems. Moreover, the proposed estimator and its consistent covariance matrix are easily computed using commercially available GMM packages. We propose two data-driven methods to select the number of moments. A small-scale simulation study reveals that the proposed estimator outperforms existing alternatives in finite samples.}, + language = {en}, + urldate = {2024-01-10}, + journal = {Statistica Sinica}, + author = {Ai, Chunrong and Linton, Oliver and Zhang, Zheng}, + year = {2020}, + pages = {1949--1970}, +} + +@article{imai_covariate_2014, + title = {Covariate {Balancing} {Propensity} {Score}}, + volume = {76}, + issn = {1369-7412, 1467-9868}, + url = {https://academic.oup.com/jrsssb/article/76/1/243/7075938}, + doi = {10.1111/rssb.12027}, + abstract = {The propensity score plays a central role in a variety of causal inference settings. In particular, matching and weighting methods based on the estimated propensity score have become increasingly common in the analysis of observational data. Despite their popularity and theoretical appeal, the main practical difficulty of these methods is that the propensity score must be estimated. Researchers have found that slight misspecification of the propensity score model can result in substantial bias of estimated treatment effects. We introduce covariate balancing propensity score (CBPS) methodology, which models treatment assignment while optimizing the covariate balance. The CBPS exploits the dual characteristics of the propensity score as a covariate balancing score and the conditional probability of treatment assignment. 
The estimation of the CBPS is done within the generalized method-of-moments or empirical likelihood framework. We find that the CBPS dramatically improves the poor empirical performance of propensity score matching and weighting methods reported in the literature. We also show that the CBPS can be extended to other important settings, including the estimation of the generalized propensity score for non-binary treatments and the generalization of experimental estimates to a target population. Open source software is available for implementing the methods proposed.}, + language = {en}, + number = {1}, + urldate = {2024-01-10}, + journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology}, + author = {Imai, Kosuke and Ratkovic, Marc}, + month = jan, + year = {2014}, + pages = {243--263}, +} + + @Manual{r-cran, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2023}, + url = {https://www.R-project.org/}, + } + + +@Manual{jointcalib, + title = {{jointCalib}: A Joint Calibration of Totals and Quantiles}, + author = {Maciej Beręsewicz}, + year = {2023}, + note = {R package version 0.1.0}, + url = {https://CRAN.R-project.org/package=jointCalib}, + } + +@Manual{nonprobsvy, + title = {{nonprobsvy}: Package for Inference Based on Non-Probability Samples}, + author = {Chrostowski, {\L}ukasz and Ber\k{e}sewicz, Maciej and Chlebicki, Piotr}, + year = {2024}, + note = {R package version 0.1.1, +https://ncn-foreigners.github.io/nonprobsvy/}, + url = {https://github.com/ncn-foreigners/nonprobsvy}, + } + + +@article{chen2018, + title = {Model-assisted calibration of non-probability sample survey data using adaptive {LASSO}}, + volume = {44}, + abstract = {The probability-sampling-based framework has dominated survey research because it provides precise mathematical tools to assess sampling variability. 
However increasing costs and declining response rates are expanding the use of non-probability samples, particularly in general population settings, where samples of individuals pulled from web surveys are becoming increasingly cheap and easy to access. But non-probability samples are at risk for selection bias due to differential access, degrees of interest, and other factors. Calibration to known statistical totals in the population provide a means of potentially diminishing the effect of selection bias in non-probability samples. Here we show that model calibration using adaptive LASSO can yield a consistent estimator of a population total as long as a subset of the true predictors is included in the prediction model, thus allowing large numbers of possible covariates to be included without risk of overfitting. We show that the model calibration using adaptive LASSO provides improved estimation with respect to mean square error relative to standard competitors such as generalized regression (GREG) estimators when a large number of covariates are required to determine the true model, with effectively no loss in efficiency over GREG when smaller models will suffice. We also derive closed form variance estimators of population totals, and compare their behavior with bootstrap estimators. 
We conclude with a real world example using data from the National Health Interview Survey.}, + language = {en}, + number = {1}, + journal = {Survey Methodology}, + author = {Chen, Jack and Valliant, Richard and Elliott, Michael}, + year = {2018}, + keywords = {lasso}, + pages = {117--144}, +} + +@article{chen2019, + title = {Calibrating non‐probability surveys to estimated control totals using {LASSO}, with an application to political polling}, + volume = {68}, + issn = {0035-9254, 1467-9876}, + url = {https://onlinelibrary.wiley.com/doi/10.1111/rssc.12327}, + doi = {10.1111/rssc.12327}, + abstract = {Declining response rates and increasing costs have led to greater use of nonprobability samples in election polling. But non-probability samples may suffer from selection bias due to differential access, degrees of interest and other factors. Here we estimate voting preference for 19 elections in the US 2014 midterm elections by using large non-probability surveys obtained from SurveyMonkey users, calibrated to estimated control totals using modelassisted calibration combined with adaptive LASSO regression, or the estimated controlled LASSO, ECLASSO. Comparing the bias and root-mean-square error of ECLASSO with traditional calibration methods shows that ECLASSO can be a powerful method for adjusting non-probability surveys even when only a small sample is available from a probability survey. 
The methodology proposed has potentially broad application across social science and health research, as response rates for probability samples decline and access to non-probability samples increases.}, + language = {en}, + number = {3}, + urldate = {2023-02-23}, + journal = {Journal of the Royal Statistical Society: Series C (Applied Statistics)}, + author = {Chen, Jack and Valliant, Richard and Elliott, Michael}, + year = {2019}, + keywords = {lasso}, + pages = {657--681}, +} + + +@article{hazlett_kernel_2020, + title = {Kernel {Balancing}: {A} {Flexible} {Non}-{Parametric} {Weighting} {Procedure} for {Estimating} {Causal} {Effects}}, + volume = {30}, + issn = {1556-5068}, + shorttitle = {Kernel {Balancing}}, + url = {https://www.ssrn.com/abstract=2746753}, + doi = {10.2139/ssrn.2746753}, + abstract = {Matching and weighting methods are widely used to estimate causal effects when needing to adjust for a set of observables. Matching is appealing for its nonparametric nature, but with continuous variables, is not guaranteed to remove bias. Weighting techniques choose weights on units to ensure that prespecified functions of the covariates have equal (weighted) means for the treated and control groups. This ensures an unbiased effect estimate only when the potential outcomes are linear in those prespecified functions of the observables. Kernel balancing begins by assuming that the expectation of the nontreatment potential outcome, conditional on the covariates, falls in a large, flexible space of functions associated with a kernel. It then constructs linear bases for this function space, and achieves approximate balance on these bases. A worst-case bound on the bias due to this approximation is given and minimized. Relative to current practice, kernel balancing offers a reasonable solution to the long-standing question of which functions of the covariates investigators should balance. 
Furthermore, these weights are also those that would make the estimated multivariate density of covariates approximately the same for the treated and control groups, when the same choice of kernel is used to estimate those densities. The approach is fully automated, given the user’s choice of kernel and smoothing parameter, for which default options and guidelines are provided. An R package, kbal, implements this approach.}, + language = {en}, + number = {3}, + urldate = {2023-09-24}, + journal = {SSRN Electronic Journal}, + author = {Hazlett, Chad}, + year = {2020}, +} + + +@article{santanna_covariate_2022, + title = {Covariate {Distribution} {Balance} via {Propensity} {Scores}}, + volume = {37}, + abstract = {This paper proposes new estimators for the propensity score that aim to maximize the covariate distribution balance among different treatment groups. Heuristically, our proposed procedure attempts to estimate a propensity score model by making the underlying covariate distribution of different treatment groups as close to each other as possible. Our estimators are data-driven and can be used to estimate different treatment effect parameters under different identifying assumptions, including unconfoundedness and local treatment effects. 
We derive the asymptotic properties of inverse probability weighted estimators for the average, distributional, and quantile treatment effects based on the proposed propensity score estimator and illustrate their finite sample performance via Monte Carlo simulations and an empirical application.}, + language = {en}, + number = {6}, + journal = {Journal of Applied Econometrics}, + author = {Sant’Anna, Pedro H C and Song, Xiaojun and Xu, Qi}, + year = {2022}, + pages = {1093--1120}, +} + +@article{park_note_2019, + title = {A note on propensity score weighting method using paradata in survey sampling}, + volume = {45}, + abstract = {Paradata is often collected during the survey process to monitor the quality of the survey response. One such paradata is a respondent behavior, which can be used to construct response models. The propensity score weight using the respondent behavior information can be applied to the final analysis to reduce the nonresponse bias. However, including the surrogate variable in the propensity score weighting does not always guarantee the efficiency gain. We show that the surrogate variable is useful only when it is correlated with the study variable. Results from a limited simulation study confirm the finding. 
A real data application using the Korean Workplace Panel Survey data is also presented.}, + language = {en}, + number = {3}, + journal = {Survey Methodology}, + author = {Park, Seho and Kim, Jae Kwang and Kim, Kimin}, + year = {2019}, + pages = {451--463}, +} + +@article{yang_doubly_2020, + title = {Doubly {Robust} {Inference} when {Combining} {Probability} and {Non}-{Probability} {Samples} with {High} {Dimensional} {Data}}, + volume = {82}, + issn = {1369-7412, 1467-9868}, + url = {https://academic.oup.com/jrsssb/article/82/2/445/7056072}, + doi = {10.1111/rssb.12354}, + abstract = {We consider integrating a non-probability sample with a probability sample which provides high dimensional representative covariate information of the target population. We propose a two-step approach for variable selection and finite population inference. In the first step, we use penalized estimating equations with folded concave penalties to select important variables and show selection consistency for general samples. In the second step, we focus on a doubly robust estimator of the finite population mean and re-estimate the nuisance model parameters by minimizing the asymptotic squared bias of the doubly robust estimator. 
This estimating strategy mitigates the possible first-step selection error and renders the doubly robust estimator root n consistent if either the sampling probability or the outcome model is correctly specified.}, + language = {en}, + number = {2}, + urldate = {2023-02-24}, + journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology}, + author = {Yang, Shu and Kim, Jae Kwang and Song, Rui}, + month = apr, + year = {2020}, + pages = {445--465}, +} + +@article{berkesewicz2017two, + title={A two-step procedure to measure representativeness of internet data sources}, + author={Ber{\k{e}}sewicz, Maciej}, + journal={International Statistical Review}, + volume={85}, + number={3}, + pages={473--493}, + year={2017}, + publisher={Wiley Online Library} +} + + +@article{citro2014multiple, + title={From multiple modes for surveys to multiple data sources for estimates}, + author={Citro, Constance F}, + journal={Survey Methodology}, + volume={40}, + number={2}, + pages={137--162}, + year={2014}, + publisher={Statistics Canada} +} + + +@article{daas2015big, + title={Big data as a source for official statistics}, + author={Daas, Piet JH and Puts, Marco J and Buelens, Bart and Hurk, Paul AM van den}, + journal={Journal of Official Statistics}, + volume={31}, + number={2}, + pages={249--262}, + year={2015}, + publisher={SAGE Publications Sage UK: London, England} +} + + +@article{chen_nonparametric_2022, + title = {Nonparametric {Mass} {Imputation} for {Data} {Integration}}, + volume = {10}, + issn = {2325-0984, 2325-0992}, + url = {https://academic.oup.com/jssam/article/10/1/1/5983829}, + doi = {10.1093/jssam/smaa036}, + abstract = {Abstract + Data integration combining a probability sample with another nonprobability sample is an emerging area of research in survey sampling. We consider the case when the study variable of interest is measured only in the nonprobability sample, but comparable auxiliary information is available for both data sources. 
We consider mass imputation for the probability sample using the nonprobability data as the training set for imputation. The parametric mass imputation is sensitive to parametric model assumptions. To develop improved and robust methods, we consider nonparametric mass imputation for data integration. In particular, we consider kernel smoothing for a low-dimensional covariate and generalized additive models for a relatively high-dimensional covariate for imputation. Asymptotic theories and variance estimation are developed. Simulation studies and real applications show the benefits of our proposed methods over parametric counterparts.}, + language = {en}, + number = {1}, + urldate = {2024-01-17}, + journal = {Journal of Survey Statistics and Methodology}, + author = {Chen, Sixia and Yang, Shu and Kim, Jae Kwang}, + month = jan, + year = {2022}, + pages = {1--24}, +} + + + +@book{yee2015vector, + title={Vector generalized linear and additive models: with an implementation in R}, + author={Yee, Thomas W}, + volume={10}, + year={2015}, + publisher={Springer} +} + + +@software{lukasz_chrostowski_2023_10280114, + author = {Łukasz Chrostowski and + Maciej Beręsewicz and + Piotr Chlebicki}, + title = {ncn-foreigners/nonprobsvy: Initial release}, + month = dec, + year = 2023, + publisher = {Zenodo}, + version = {0.1.0}, + doi = {10.5281/zenodo.10280114}, + url = {https://doi.org/10.5281/zenodo.10280114} +} + + +@article{chen_note_2021, + title = {A note on multiply robust predictive mean matching imputation with complex survey data}, + volume = {47}, + abstract = {Predictive mean matching is a commonly used imputation procedure for addressing the problem of item nonresponse in surveys. The customary approach relies upon the specification of a single outcome regression model. In this note, we propose a novel predictive mean matching procedure that allows the user to specify multiple outcome regression models. 
The resulting estimator is multiply robust in the sense that it remains consistent if one of the specified outcome regression models is correctly specified. The results from a simulation study suggest that the proposed method performs well in terms of bias and efficiency.}, + language = {en}, + number = {1}, + year = 2021, + journal = {Survey Methodology}, + author = {Chen, Sixia and Haziza, David and Stubblefield, Alexander}, + pages = {215--222}, +} + + +@Manual{jvs-meth, + title = {Methodological report The demand for labour}, + author = {{Statistics Poland}}, + year = {2021}, + url = {https://stat.gov.pl/obszary-tematyczne/rynek-pracy/popyt-na-prace/zeszyt-metodologiczny-popyt-na-prace,3,1.html}, + } + + +@misc{beresewicz2024inference, + title={Inference for non-probability samples using the calibration approach for quantiles}, + author={Maciej Beręsewicz and Marcin Szymkowiak}, + year={2024}, + eprint={2403.09726}, + archivePrefix={arXiv}, + primaryClass={stat.ME}, + howpublished={\url{https://arxiv.org/abs/2403.09726}} +} + +@article{little1988missing, + title={Missing-data adjustments in large surveys}, + author={Little, Roderick JA}, + journal={Journal of Business \& Economic Statistics}, + volume={6}, + number={3}, + pages={287--296}, + year={1988}, + publisher={Taylor \& Francis} +} + +@article{rubin1986statistical, + title={Statistical matching using file concatenation with adjusted weights and multiple imputations}, + author={Rubin, Donald B}, + journal={Journal of Business \& Economic Statistics}, + volume={4}, + number={1}, + pages={87--94}, + year={1986}, + publisher={Taylor \& Francis} +} + + +@article{schenker1996partial, + title={Partially parametric techniques for multiple imputation}, + author={Schenker, Nathaniel and Taylor, Jeremy MG}, + journal={{Computational Statistics \& Data analysis}}, + volume={22}, + number={4}, + pages={425--446}, + year={1996}, + publisher={Elsevier} +} + +@article{horton2001multiple, + title={Multiple imputation in 
practice: comparison of software packages for regression models with missing variables}, + author={Horton, Nicholas J and Lipsitz, Stuart R}, + journal={The American Statistician}, + volume={55}, + number={3}, + pages={244--254}, + year={2001}, + publisher={Taylor \& Francis} +} + +@book{klenke_prob, + author = {Klenke, Achim}, + year = {2014}, + month = {01}, + title = {Probability theory. A comprehensive course. 2nd extended ed}, + isbn = {978-1-4471-5360-3}, + doi = {10.1007/978-1-4471-5361-0} +} + +@article{tyrcha_mielniczuk, + title = {Consistency of multilayer perceptron regression estimators}, + journal = {Neural Networks}, + volume = {6}, + number = {7}, + pages = {1019--1022}, + year = {1993}, + issn = {0893-6080}, + doi = {10.1016/S0893-6080(09)80011-7}, + author = {Jan Mielniczuk and Joanna Tyrcha}, + keywords = {Multilayer perceptron, Least squares regression estimator, Entropy, Back propagation, Vapnik-Chervonenkis class}, +} + +@Inbook{Tsiatis2006, + author = {Anastasios A. Tsiatis}, + title="The Geometry of Influence Functions", + bookTitle="Semiparametric Theory and Missing Data", + year="2006", + publisher="Springer New York", + address="New York, NY", + pages="21--51", + isbn="978-0-387-37345-4", + doi="10.1007/0-387-37345-4_3" +} + +@article{gelman1997poststratification, + title={Poststratification into many categories using hierarchical logistic regression}, + author={Gelman, Andrew}, + journal={Survey Methodology}, + volume={23}, + pages={127}, + year={1997} +} + + +@article{cobo2024software, + author = {Cobo, Beatriz and Ferri-García, Ramón and Rueda-Sánchez, Jorge L. 
and Rueda, María del Mar}, + title = {Software review for inference with non-probability surveys}, + journal = {The Survey Statistician}, + volume = {90}, + pages = {40--47}, + year = {2024}, + affiliation = {University of Granada, Spain} +} + + +@misc{sarig2023balancepythonpackage, + title={balance -- a Python package for balancing biased data samples}, + author={Tal Sarig and Tal Galili and Roee Eilat}, + year={2023}, + eprint={2307.06024}, + archivePrefix={arXiv}, + primaryClass={stat.CO}, + url={https://arxiv.org/abs/2307.06024}, +} + + +@article{NonProbEst, + author = {Rueda, María del Mar and Ferri-García, Ramón and Castro, Luis}, + title = {The R package NonProbEst for estimation in non-probability surveys}, + journal = {The R Journal}, + year = {2020}, + note = {https://rjournal.github.io/}, + volume = {12}, + issue = {1}, + issn = {2073-4859}, + pages = {406--418} +} + + +@Manual{castro2024inps, + title = {INPS: Inference from Non-Probability Samples}, + author = {Castro Martín, Luis}, + year = {2024}, + version = {1.19}, + date = {2024-11-17}, + url = {https://pypi.org/project/inps/}, + publisher = {Python Package Index}, + keywords = {python, statistics, non-probability-samples} +} + +@Manual{rstanarm, + title = {rstanarm: {Bayesian} applied regression modeling via {Stan}.}, + author = {Ben Goodrich and Jonah Gabry and Imad Ali and Sam Brilleman}, + note = {R package version 2.32.1}, + year = {2024}, + url = {https://mc-stan.org/rstanarm/}, +} + + + +@techreport{jvs2022, + title = {{The Demand for Labour: Methodological report}}, + author = {{Statistics Poland}}, + year = {2021}, + institution = {Statistical Office in Bydgoszcz}, + address = {Bydgoszcz, Warsaw}, + type = {Methodological Report}, + url = {https://stat.gov.pl/obszary-tematyczne/rynek-pracy/popyt-na-prace/zeszyt-metodologiczny-popyt-na-prace,3,1.html}, + series = {Statistical research methodology} +} + + +@article{beresewicz2025, + author = {Beręsewicz, Maciej and Szymkowiak, Marcin and 
Chlebicki, Piotr}, + title = {Quantile balancing inverse probability weighting for non-probability samples}, + journal = {Survey Methodology}, + note = {Forthcoming}, + year = {2025}, + volume = {51}, + issue = {2}, + pages = {0-0} +} + + +@Manual{cobalt, + title = {cobalt: Covariate Balance Tables and Plots}, + author = {Greifer, Noah }, + year = {2024}, + note = {R package version 4.5.5}, + url = {https://CRAN.R-project.org/package=cobalt}, +} + +@Manual{rann-pkg, + title = {RANN: Fast Nearest Neighbour Search (Wraps ANN Library) Using L2 +Metric}, + author = {Gregory Jefferis and Samuel E. Kemp and Sunil Arya and David Mount}, + year = {2024}, + note = {R package version 2.6.2}, + url = {https://CRAN.R-project.org/package=RANN}, +} + + +@Misc{svrep, + author = {Benjamin Schneider}, + year = {2023}, + title = {svrep: Tools for Creating, Updating, and Analyzing Survey Replicate Weights}, + note = {R package version 0.6.0}, + url = {https://CRAN.R-project.org/package=svrep}, +} + + + + + +@book{biffignandi2021handbook, + title={Handbook of Web Surveys}, + author={Biffignandi, Silvia and Bethlehem, Jelke}, + year={2021}, + publisher={John Wiley \& Sons}, + isbn={9781119371687}, + doi={10.1002/9781119371717} +} + + +@article{lee2006propensity, + title={Propensity score adjustment as a weighting scheme for volunteer panel web surveys}, + author={Lee, Sunghee}, + journal={Journal of official statistics}, + volume={22}, + number={2}, + pages={329}, + year={2006}, + publisher={Statistics Sweden (SCB)} +} + +@article{kim2023empirical, + title={An Empirical Likelihood Approach to Reduce Selection Bias in Voluntary Samples}, + author={Kim, Jae Kwang and Morikawa, Kento}, + journal={Calcutta Statistical Association Bulletin}, + volume={75}, + number={1}, + pages={8--27}, + year={2023}, + publisher={SAGE Publications}, + doi={10.1177/00080683231186488} +} + + +@article{wisniowski2020integrating, + title={Integrating Probability and Nonprobability Samples for Survey Inference}, + 
author={Wiśniowski, Arkadiusz and Sakshaug, Joseph W. and Perez Ruiz, Diego Andres and Blom, Annelies G.}, + journal={Journal of Survey Statistics and Methodology}, + volume={8}, + pages={120--147}, + year={2020}, + doi={10.1093/jssam/smz051} +} + + +@article{meng2018statistical, + title={Statistical Paradises and Paradoxes in Big Data ({I}): {Law} of {Large} {Populations}, {Big} {Data} {Paradox}, and the 2016 {US} {Presidential} {Election}}, + author={Meng, Xiao-Li}, + journal={Annals of Applied Statistics}, + volume={12}, + pages={685--726}, + year={2018}, + doi={10.1214/18-AOAS1161SF} +} + + +@inproceedings{rivers2007sampling, + author = {Rivers, Douglas}, + year = {2007}, + title = {Sampling for web surveys}, + booktitle = {Proceedings of the Survey Research Methods Section, Joint Statistical Meetings}, + publisher = {American Statistical Association}, + address = {Alexandria, VA}, + pages = {1--26} +} + + +@unpublished{chrostowski2024statistical, + author = {Chrostowski, Lukasz}, + title = {Statistical inference with non-probability samples}, + year = {2024}, + note = {Master's thesis, Adam Mickiewicz University} +} + + +@article{Grow2022, + author = {Grow, André and Perrotta, Daniela and Del Fava, Emanuele and Cimentada, Jorge and Rampazzo, Francesco and Gil-Clavel, Sofia and Zagheni, Emilio and Flores, René D. and Ventura, Ilana and Weber, Ingmar}, + title = {Is Facebook’s Advertising Data Accurate Enough for Use in Social Science Research? Insights from a Cross-National Online Survey}, + journal = {Journal of the Royal Statistical Society Series A: Statistics in Society}, + volume = {185}, + number = {2}, + pages = {343--363}, + year = {2022}, + month = {11}, + abstract = {Social scientists increasingly use Facebook’s advertising platform for research, either in the form of conducting digital censuses of the general population, or for recruiting participants for survey research. 
Both approaches depend on the accuracy of the data that Facebook provides about its users, but little is known about how accurate these data are. We address this gap in a large-scale, cross-national online survey (N = 137,224), in which we compare self-reported and Facebook-classified demographic information (sex, age and region of residence). Our results suggest that Facebook’s advertising platform can be fruitfully used for conducing social science research if additional steps are taken to assess the accuracy of the characteristics under consideration.}, + issn = {0964-1998}, + doi = {10.1111/rssa.12948}, + url = {https://doi.org/10.1111/rssa.12948} +} + + + +@article{abadie2006large, + title={Large sample properties of matching estimators for average treatment effects}, + author={Abadie, Alberto and Imbens, Guido W}, + journal={Econometrica}, + volume={74}, + number={1}, + pages={235--267}, + year={2006}, + publisher={Wiley Online Library} +} + +@article{robins1994estimation, + title={Estimation of regression coefficients when some regressors are not always observed}, + author={Robins, James M and Rotnitzky, Andrea and Zhao, Lue Ping}, + journal={Journal of the American statistical Association}, + volume={89}, + number={427}, + pages={846--866}, + year={1994}, + publisher={Taylor \& Francis} +} + + +@article{kim2014doubly, + title={Doubly robust inference with missing data in survey sampling}, + author={Kim, Jae Kwang and Haziza, David}, + journal={Statistica Sinica}, + volume={24}, + number={1}, + pages={375--394}, + year={2014}, + publisher={JSTOR} +} + + +@article{tibshirani1996regression, + title={Regression shrinkage and selection via the lasso}, + author={Tibshirani, Robert}, + journal={Journal of the Royal Statistical Society Series B: Statistical Methodology}, + volume={58}, + number={1}, + pages={267--288}, + year={1996}, + publisher={Oxford University Press} +} + + +@article{lee2009estimation, + title={Estimation for volunteer panel web surveys using 
propensity score adjustment and calibration adjustment}, + author={Lee, Sunghee and Valliant, Richard}, + journal={Sociological Methods \& Research}, + volume={37}, + number={3}, + pages={319--343}, + year={2009}, + publisher={Sage Publications Sage CA: Los Angeles, CA} +} + + +@article{Schonlau2017, +author = {Schonlau, Matthias and Couper, Mick P. }, +title = {{Options for Conducting Web Surveys}}, +volume = {32}, +journal = {Statistical Science}, +number = {2}, +publisher = {Institute of Mathematical Statistics}, +pages = {279 -- 292}, +keywords = {Convenience sample, Internet survey}, +year = {2017}, +doi = {10.1214/16-STS597}, +URL = {https://doi.org/10.1214/16-STS597} +} + + + +@ARTICLE{scipy2020, + author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and + Haberland, Matt and Reddy, Tyler and Cournapeau, David and + Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and + Bright, Jonathan and {van der Walt}, St{\'e}fan J. and + Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and + Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and + Kern, Robert and Larson, Eric and Carey, C J and + Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and + {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and + Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and + Harris, Charles R. and Archibald, Anne M. and + Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and + {van Mulbregt}, Paul and {SciPy 1.0 Contributors}}, + title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific + Computing in Python}}, + journal = {Nature Methods}, + year = {2020}, + volume = {17}, + pages = {261--272}, + adsurl = {https://rdcu.be/b08Wh}, + doi = {10.1038/s41592-019-0686-2}, +} + +@article{Diallo2021, doi = {10.21105/joss.03376}, url = {https://doi.org/10.21105/joss.03376}, year = {2021}, publisher = {The Open Journal}, volume = {6}, number = {68}, pages = {3376}, author = {Mamadou S. 
Diallo}, title = {samplics: a Python Package for selecting, weighting and analyzing data from complex sampling designs.}, journal = {Journal of Open Source Software} } diff --git a/vignettes/nonprobsvy-getting-started.qmd b/vignettes/nonprobsvy-getting-started.qmd new file mode 100644 index 0000000..861ff52 --- /dev/null +++ b/vignettes/nonprobsvy-getting-started.qmd @@ -0,0 +1,198 @@ +--- +title: "nonprobsvy -- An R package for modern methods for non-probability surveys" +author: + - name: Łukasz Chrostowski + affiliation: Independent + email: \email{lukaszchrostowski6@gmail.com} + url: https://github.com/LukaszChrostowski + - name: Piotr Chlebicki + affiliation: Stockholm University + email: \email{piotr.chlebicki@math.su.se} + url: https://github.com/Kertoo, https://www.su.se/profiles/pich3772 + orcid: 0009-0006-4867-7434 + - name: Maciej Beręsewicz + orcid: 0000-0002-8281-4301 + email: \email{maciej.beresewicz@ue.poznan.pl} + url: https://github.com/BERENZ, https://ue.poznan.pl/en/people/dr-maciej-beresewicz/ + affiliation: Poznań University of Economics and Business / Statistical Office in Poznań +format: + html: + html-math-method: mathjax + toc: true + number-sections: true +preamble: > + \usepackage{amsmath, amsthm, amssymb} + \usepackage{calc, ragged2e} + \usepackage[ruled]{algorithm2e} + \usepackage{algpseudocode} + \newcommand{\argmin}{\operatornamewithlimits{arg\,min}} + \newcommand{\argmax}{\operatornamewithlimits{arg\,max}} + \newcommand{\bX}{\boldsymbol{X}} + \newcommand{\bx}{\boldsymbol{x}} + \newcommand{\bY}{\boldsymbol{Y}} + \newcommand{\by}{\boldsymbol{y}} + \newcommand{\bh}{\boldsymbol{h}} + \newcommand{\bH}{\boldsymbol{H}} + \newcommand{\ba}{\boldsymbol{a}} + \newcommand{\bp}{\boldsymbol{p}} + \newcommand{\bA}{\boldsymbol{A}} + \newcommand{\bw}{\boldsymbol{w}} + \newcommand{\bd}{\boldsymbol{d}} + \newcommand{\bZ}{\boldsymbol{Z}} + \newcommand{\bz}{\boldsymbol{z}} + \newcommand{\bv}{\boldsymbol{v}} + \newcommand{\bu}{\boldsymbol{u}} + 
\newcommand{\bU}{\boldsymbol{U}} + \newcommand{\bQ}{\boldsymbol{Q}} + \newcommand{\bG}{\boldsymbol{G}} + \newcommand{\HT}{\text{\rm HT}} + \newcommand{\bbeta}{\boldsymbol{\beta}} + \newcommand{\balpha}{\boldsymbol{\alpha}} + \newcommand{\btau}{\boldsymbol{\tau}} + \newcommand{\bgamma}{\boldsymbol{\gamma}} + \newcommand{\btheta}{\boldsymbol{\theta}} + \newcommand{\blambda}{\boldsymbol{\lambda}} + \newcommand{\bPhi}{\boldsymbol{\Phi}} + \newcommand{\bEta}{\boldsymbol{\eta}} + \newcommand{\bZero}{\boldsymbol{0}} + \newcommand{\colvec}{\operatorname{colvec}} + \newcommand{\logit}{\operatorname{logit}} + \newcommand{\Exp}{\operatorname{Exp}} + \newcommand{\Ber}{\operatorname{Bernoulli}} + \newcommand{\Uni}{\operatorname{Uniform}} +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{nonprobsvy -- An R package for modern methods for non-probability surveys} + %\VignetteEngine{quarto::html} + %\VignetteEncoding{UTF-8} + %\VignetteDepends{nonprobsvy} + %\VignetteKeywords{data integration, doubly robust estimation, propensity score estimation, mass imputation, survey} + %\VignettePackage{nonprobsvy} +--- + +# Introduction {#sec-introduction} + +In official statistics, information about the target population and its +characteristics is mainly collected through probability surveys, +censuses or is obtained from administrative registers and covers all (or +nearly all) units of the population. However, owing to increasing +non-response rates, particularly unit non-response and non-contact, +which result from the growing respondent burden as well as rising costs +of surveys conducted by National Statistical Institutes, non-probability +data sources are becoming more popular [@berkesewicz2017two; +@beaumont2020probability; @biffignandi2021handbook]. 
Non-probability +surveys, such as opt-in web panels, social media, scanner data, mobile +phone data or voluntary register data, are currently being explored for +use in the production of official statistics [@citro2014multiple; +@daas2015big], public opinion studies [@Schonlau2017] or market research +[@Grow2022]. Since the selection mechanism underlying these sources is +unknown, standard design-based inference methods cannot be directly +applied and, in the case of large datasets, can lead to the *big data +paradox* described by @meng2018statistical. + +@tbl-comparison-characteristics compares basic characteristics of +probability and non-probability samples. In particular, it shows the +advantages and disadvantages of each type with respect to the selection +mechanism, the population coverage, bias, variance, costs and +timeliness. In general, the quality of non-probability samples suffers +from an unknown selection mechanism (i.e. unknown probabilities of +inclusion) and under-coverage of certain groups from the population +(e.g. older people). As a result, direct estimates based on +non-probability samples are biased and, in most cases, are characterised +by small variance owing to their size, which is known as the +*big data paradox*, i.e. the larger the sample, the larger the +bias. Certainly, the costs and timeliness of these surveys are +significantly smaller than those of probability samples. + +To address this problem, several approaches have been proposed, which +rely on the estimation of propensity scores (i.e. inclusion +probabilities) for deriving inverse probability weights (IPW; also known +as propensity score weighting/adjustment, cf. @lee2006propensity, +@lee2009estimation), on model-based prediction (in particular, mass +imputation estimators; MI) and on the doubly robust (DR) approach +involving IPW and MI estimators. 
Two main scenarios are usually +considered: 1) only population-level means or totals are available, and +2) unit-level data are available either in the form of registers +covering the whole population or in the form of probability surveys +[@elliott_inference_2017]. @wu2022statistical classified these +approaches into three groups that require a joint randomization +framework involving a *probability sampling design* (denoted as $p$) +and an outcome regression model (denoted as $\xi$) or a propensity score +model (denoted as $q$). According to this classification, IPW estimators +represent the $qp$ framework, MI estimators represent the $\xi p$ +framework, and DR estimators can represent either the $qp$ or the +$\xi p$ framework. + +| Factor | Probability sample | Non-probability sample | +|-----------------------|-----------------------|---------------------------| +| Selection | Known probabilities | Unknown self-selection | +| Coverage | Complete | May be incomplete | +| Estimation bias | Unbiased under design | Potential systematic bias | +| Variance of estimates | Typically high | Typically low | +| Cost | High | Low | +| Timeliness | Long delay | Very short delay | + +: A comparison of probability and non-probability samples and their characteristics {#tbl-comparison-characteristics} + +Most approaches assume that population data are used to reduce the bias +of non-probability sampling by the right reweighting to reproduce known +population totals/means (i.e. IPW estimators); by modelling the target +variable using various techniques (i.e. MI estimators); or by combining +both approaches (e.g. DR estimators, cf. @chen2020doubly; see also +Multilevel Regression and Post-stratification, MRP; *Mister-P*, cf. +@gelman1997poststratification). 
This topic has become very popular and a +number of new methods have been proposed; for instance non-parametric +approaches based on nearest neighbours [@yang2021integration], kernel +density estimation [@chen_nonparametric_2022], empirical likelihood +[@kim2023empirical], model-calibration with LASSO [@chen2018] or +quantile balanced IPW [@beresewicz2025] to name a few. It should be +highlighted that, in contrast to probability samples, there is no single +method that can be used for non-probability samples. Based on the +methods available in the literature several statistical software +solutions have been developed, which are presented in the next section. + +# Summary and future work + +The {nonprobsvy} package provides a comprehensive R software solution +that addresses inference challenges connected with non-probability +samples by integrating them with probability samples or known population +totals/means. As non-probability data sources like administrative +registers, voluntary online panels, and social media data become +increasingly available, statisticians need robust methods to produce +reliable population estimates. The package implements *state-of-the-art* +approaches including mass imputation, inverse probability weighting, and +doubly robust methods, each designed to correct selection bias by +leveraging auxiliary data. By providing a unified framework and its +integration with the {survey} package, the {nonprobsvy} makes complex +statistical methods for non-probability samples more accessible, +enabling researchers to produce robust estimates even when working with +non-representative data. + +There are several avenues for future development of the {nonprobsvy} +package. One key priority is to implement model-based calibration and +additional methods for estimating propensity scores and weights. 
The +package currently assumes no overlap between probability and +non-probability samples, so accounting for potential overlap (e.g., in +big data sources and registers) is another important extension. +Additional planned developments include handling non-ignorable sample +selection mechanisms, developing a theory for maintaining consistency +with calibration weights, and supporting multiple non-probability +samples from various sources for the purpose of data integration. +Further methodological extensions under consideration include empirical +likelihood approaches for doubly/multiply robust estimation, integration +of machine learning methods like debiased/double machine learning from +causal inference, handling measurement errors in big data variables, and +expanding the bootstrap approach beyond simple random sampling with +replacement. + +The package will also be extended to handle the `svyrep.design` class +from the {survey} package and the {svrep} package. These developments +will enhance its capabilities for handling complex survey data +structures and modern estimation challenges. + +# Acknowledgements {#sec-acknowledgements .unnumbered} + +The authors' work has been financed by the National Science Centre in +Poland, OPUS 20, grant no. 2020/39/B/HS4/00941. + +# References {.unnumbered}