diff --git a/.gitignore b/.gitignore index 56db315..94cb359 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ config.yml docs .idea/ renv.lock -extras/ \ No newline at end of file +extras/ +.Renviron \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 0df769f..d4c1a53 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: DeepPatientLevelPrediction Type: Package Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model -Version: 2.0.0 +Version: 2.0.1 Date: 18-04-2023 Authors@R: c( person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 3da704b..70ccf12 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +DeepPatientLevelPrediction 2.0.1 +====================== + - Connection parameter fixed to be in line with newest polars + - Fixed a bug where LRFinder used a hardcoded batch size + - Seed is now used in LRFinder so it's reproducible + - Fixed a bug in NumericalEmbedding + - Fixed a bug for Transformer and numerical features + - Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang ) + - Updated installation documentation after feedback from HADES hackathon + - Fixed a bug where order of numeric features wasn't conserved between training and test set + - TrainingCache now only saves prediction dataframe for the best performing model + DeepPatientLevelPrediction 2.0.0 ====================== - New backend which uses pytorch through reticulate instead of torch in R diff --git a/R/Estimator.R b/R/Estimator.R index f804368..7705279 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -310,7 +310,8 @@ gridCvDeep <- function(mappedData, fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] findLR <- modelSettings$estimatorSettings$findLR - for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { + if (!trainCache$isFull()) { + for (gridId in 
trainCache$getLastGridSearchIndex():length(paramSearch)) { ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) ParallelLogger::logInfo(paste0("HyperParameters: ")) ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) @@ -363,25 +364,38 @@ gridCvDeep <- function(mappedData, ) } maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) - paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]] - gridSearchPredictons[[gridId]] <- list( prediction = prediction, - param = paramSearch[[gridId]] + param = paramSearch[[gridId]], + gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]]) ) + gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) + gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]] + + # remove all predictions that are not the max performance + indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + for (i in seq_along(gridSearchPredictons)) { + if (!is.null(gridSearchPredictons[[i]])) { + if (i != indexOfMax) { + gridSearchPredictons[[i]]$prediction <- list(NULL) + } + } + } + ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax)) trainCache$saveGridSearchPredictions(gridSearchPredictons) } + } + paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) + # get best params + indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + finalParam <- gridSearchPredictons[[indexOfMax]]$param + + paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) - # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function) - 
paramGridSearch <- lapply(gridSearchPredictons, function(x) { - do.call(PatientLevelPrediction::computeGridPerformance, x) - }) # cvAUCmean, cvAUC, param - - optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance))) - finalParam <- paramGridSearch[[optimalParamInd]]$param - - cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction + # get best CV prediction + cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" ParallelLogger::logInfo("Training final model using optimal parameters") diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index 8577f31..be626d5 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -69,6 +69,13 @@ TrainingCache <- R6::R6Class( return(private$.paramPersistence$gridSearchPredictions) }, + #' @description + #' Check if cache is full + #' @returns Boolen + isFull = function() { + return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance))))) + }, + #' @description #' Gets the last index from the cached grid search #' @returns Last grid search index diff --git a/extras/example.R b/extras/example.R deleted file mode 100644 index 99a6c08..0000000 --- a/extras/example.R +++ /dev/null @@ -1,73 +0,0 @@ -# testing code (requires sequential branch of FeatureExtraction): -# rm(list = ls()) -library(PatientLevelPrediction) -library(DeepPatientLevelPrediction) - -data(plpDataSimulationProfile) -sampleSize <- 1e3 -plpData <- simulatePlpData( - plpDataSimulationProfile, - n = sampleSize - ) - - -populationSet <- PatientLevelPrediction::createStudyPopulationSettings( - requireTimeAtRisk = F, - riskWindowStart = 1, - riskWindowEnd = 365*5) - -# -# modelSettings <- setDefaultTransformer(estimatorSettings = setEstimator( -# learningRate = "auto", -# batchSize=64L, -# epochs = 10L -# )) - -modelSettings <- setDefaultResNet(estimatorSettings = setEstimator( - learningRate = "auto", - 
weightDecay = 1e-06, - device="cuda:0", - batchSize=128L, - epochs=50L, - seed=42 -)) - -modelSettings <- setResNet(numLayers = c(1L, 2L), - sizeHidden = 72L, - hiddenFactor = 1L, - residualDropout = 0.0, - hiddenDropout = 0.0, - sizeEmbedding = 64L, - estimatorSettings = setEstimator( - learningRate = 3e-4, - batchSize = 128L, - epochs = 10L, - device = "cpu", - seed = 42 - ), - randomSample = 2) - -res2 <- PatientLevelPrediction::runPlp( - plpData = plpData, - outcomeId = unique(plpData$outcomes$outcomeId)[[1]], - modelSettings = modelSettings, - analysisId = 'Test', - analysisName = 'Testing DeepPlp', - populationSettings = populationSet, - splitSettings = createDefaultSplitSetting(splitSeed = 123), - sampleSettings = createSampleSettings("underSample"), # none - featureEngineeringSettings = createFeatureEngineeringSettings(), # none - preprocessSettings = createPreprocessSettings(normalize = F), - logSettings = createLogSettings(verbosity='TRACE'), - executeSettings = createExecuteSettings( - runSplitData = T, - runSampleData = T, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - ), - saveDirectory = '~/test/resnet/' -) - - diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 15d749b..98d9ed3 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -21,7 +21,7 @@ def __init__(self, if pathlib.Path(data).suffix == '.sqlite': data = urllib.parse.quote(data) data = pl.read_database("SELECT * from covariates", - connection_uri=f"sqlite://{data}").lazy() + connection=f"sqlite://{data}").lazy() else: data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) observations = data.select(pl.col('rowId').max()).collect()[0, 0] @@ -67,7 +67,7 @@ def __init__(self, if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). 
\ + numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). rank(method="dense") - 1, pl.col('rowId') - 1) \ .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 4c24a38..9d5bd0c 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -37,6 +37,7 @@ def __init__(self, smooth = lr_settings.get("smooth", 0.05) divergence_threshold = lr_settings.get("divergence_threshold", 4) torch.manual_seed(seed=estimator_settings["seed"]) + self.seed = estimator_settings["seed"] self.model = model(**model_parameters) if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -55,18 +56,18 @@ def __init__(self, self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) self.criterion = estimator_settings["criterion"]() - self.batch_size = estimator_settings['batch_size'] + self.batch_size = int(estimator_settings['batch_size']) self.losses = None self.loss_index = None def get_lr(self, dataset): batch_index = torch.arange(0, len(dataset), 1).tolist() - + random.seed(self.seed) losses = torch.empty(size=(self.num_lr,), dtype=torch.float) lrs = torch.empty(size=(self.num_lr,), dtype=torch.float) for i in tqdm(range(self.num_lr)): self.optimizer.zero_grad() - random_batch = random.sample(batch_index, 32) + random_batch = random.sample(batch_index, self.batch_size) batch = dataset[random_batch] batch = batch_to_device(batch, self.device) diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index f680eb2..cef4b49 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -130,9 +130,9 @@ def __init__(self, nn.init.kaiming_uniform_(parameter, a=math.sqrt(5)) def forward(self, input): - x = self.weight.unsqueeze(0) * input.unsqueeze(-1) + x = 
self.weight[None] * input[..., None] if self.bias is not None: - x = x + self.bias.unsqueeze(-1) + x = x + self.bias[None] return x diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 5944e1b..1c95b36 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -49,6 +49,9 @@ def __init__(self, if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) + self.use_numerical = True + else: + self.use_numerical = False self.class_token = ClassToken(dim_token) self.layers = nn.ModuleList([]) @@ -78,7 +81,7 @@ def __init__(self, def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) cat = self.categorical_embedding(x["cat"]) - if "num" in x.keys() and self.numerical_embedding is not None: + if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) mask = torch.cat([mask, torch.zeros([x.shape[0], diff --git a/man/TrainingCache.Rd b/man/TrainingCache.Rd index 0a7ec7b..c82bb23 100644 --- a/man/TrainingCache.Rd +++ b/man/TrainingCache.Rd @@ -8,6 +8,8 @@ Whether the provided and cached parameter grid is identical Grid search results from the training cache +Boolen + Last grid search index } \description{ @@ -21,6 +23,7 @@ Parameter caching for training persistence and continuity \item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}} \item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}} \item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}} +\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}} \item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}} \item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}} \item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}} @@ -104,6 +107,16 
@@ Gets the grid search results from the training cache \if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}} } +} +\if{html}{\out{<hr>}} +\if{html}{\out{<a id="method-TrainingCache-isFull"></a>}} +\if{latex}{\out{\hypertarget{method-TrainingCache-isFull}{}}} +\subsection{Method \code{isFull()}}{ +Check if cache is full +\subsection{Usage}{ +\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}} +} + } \if{html}{\out{<hr>
}} \if{html}{\out{}} diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index adf0dcb..7cd0fee 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -78,3 +78,21 @@ dataset <- Dataset$Data( ) small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3))) +modelSettings <- setResNet( + numLayers = 1, sizeHidden = 16, hiddenFactor = 1, + residualDropout = c(0, 0.2), hiddenDropout = 0, + sizeEmbedding = 16, hyperParamSearch = "random", + randomSample = 2, + setEstimator(epochs=1, + learningRate = 3e-4) +) +fitEstimatorPath <- file.path(testLoc, 'fitEstimator') +if (!dir.exists(fitEstimatorPath)) { + dir.create(fitEstimatorPath) +} +fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + analysisId = 1, + analysisPath = fitEstimatorPath) + + diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R index 9cb1382..b4dd0a4 100644 --- a/tests/testthat/test-Estimator.R +++ b/tests/testthat/test-Estimator.R @@ -146,25 +146,12 @@ test_that("early stopping works", { testthat::expect_true(earlyStop$early_stop) }) -modelSettings <- setResNet( - numLayers = 1, sizeHidden = 16, hiddenFactor = 1, - residualDropout = 0, hiddenDropout = 0, - sizeEmbedding = 16, hyperParamSearch = "random", - randomSample = 1, - setEstimator(epochs=1, - learningRate = 3e-4) -) - -sink(nullfile()) -results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1, analysisPath = testLoc) -sink() - test_that("Estimator fit function works", { - expect_true(!is.null(results$trainDetails$trainingTime)) + expect_true(!is.null(fitEstimatorResults$trainDetails$trainingTime)) - expect_equal(class(results), "plpModel") - expect_equal(attr(results, "modelType"), "binary") - expect_equal(attr(results, "saveType"), "file") + expect_equal(class(fitEstimatorResults), "plpModel") + expect_equal(attr(fitEstimatorResults, "modelType"), "binary") + expect_equal(attr(fitEstimatorResults, "saveType"), 
"file") fakeTrainData <- trainData fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake")) expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc)) @@ -184,7 +171,7 @@ test_that("predictDeepEstimator works", { # input is a plpModel and data sink(nullfile()) predictions <- predictDeepEstimator( - plpModel = results, data = trainData$Test, + plpModel = fitEstimatorResults, data = trainData$Test, trainData$Test$labels ) sink() @@ -369,4 +356,4 @@ test_that("estimatorSettings can be saved and loaded with correct python objects testthat::expect_false(reticulate::py_is_null_xptr(optimizer)) testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun)) testthat::expect_false(reticulate::py_is_null_xptr(criterion)) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R index eb4ab17..debe95c 100644 --- a/tests/testthat/test-TrainingCache.R +++ b/tests/testthat/test-TrainingCache.R @@ -47,43 +47,32 @@ test_that("Param grid predictions can be cached", { }) test_that("Estimator can resume training from cache", { - modelPath <- tempdir() - analysisPath <- file.path(modelPath, "Analysis_TrainCacheResNet") - dir.create(analysisPath) - trainCache <- TrainingCache$new(analysisPath) - trainCache$saveModelParams(paramSearch) + trainCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds")) + newPath <- file.path(testLoc, 'resume') + dir.create(newPath) + + # remove last row + trainCache$gridSearchPredictions[[2]] <- NULL + length(trainCache$gridSearchPredictions) <- 2 + + # save new cache + saveRDS(trainCache, file=file.path(newPath, "paramPersistence.rds")) sink(nullfile()) - res2 <- tryCatch( - { - PatientLevelPrediction::runPlp( - plpData = plpData, - outcomeId = 3, - modelSettings = resNetSettings, - analysisId = "Analysis_TrainCacheResNet", - analysisName = "Testing Training Cache", - populationSettings = populationSet, - splitSettings = 
PatientLevelPrediction::createDefaultSplitSetting(), - sampleSettings = PatientLevelPrediction::createSampleSettings(), # none - featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none - preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), - executeSettings = PatientLevelPrediction::createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - ), - saveDirectory = modelPath - ) - }, - error = function(e) { - print(e) - return(NULL) - } - ) + fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + analysisId = 1, + analysisPath = newPath) sink() - trainCache <- TrainingCache$new(analysisPath) - testthat::expect_equal(is.na(trainCache$getLastGridSearchIndex()), TRUE) + + newCache <- readRDS(file.path(newPath, "paramPersistence.rds")) + testthat::expect_equal(nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary), 4) +}) + +test_that("Prediction is cached for optimal parameters", { + testCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds")) + indexOfMax <- which.max(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance))) + indexOfMin <- which.min(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance))) + testthat::expect_equal(class(testCache$gridSearchPredictions[[indexOfMax]]$prediction), class(data.frame())) + testthat::expect_null(testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]]) }) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index b3e421f..043cbe7 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -85,6 +85,9 @@ test_that("transformer nn-module works", { dim_hidden = 32 ) output <- model(input) + expect_equal(output$shape[0], 10L) + input$num <- reticulate::py_none() + 
output <- model(input) expect_equal(output$shape[0], 10L) }) @@ -126,3 +129,32 @@ test_that("dimHidden ratio works as expected", { dimHiddenRatio = 4/3)) }) + +test_that("numerical embedding works as expected", { + embeddings <- 32L # size of embeddings + features <- 2L # number of numerical features + patients <- 9L + + numTensor <- torch$randn(c(patients, features)) + + numericalEmbeddingClass <- reticulate::import_from_path("ResNet", path=path)$NumericalEmbedding + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = TRUE) + out <- numericalEmbedding(numTensor) + + # should be patients x features x embedding size + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) + + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = FALSE) + + out <- numericalEmbedding(numTensor) + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) + + }) diff --git a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd index 5b88419..1aded4d 100644 --- a/vignettes/Installing.Rmd +++ b/vignettes/Installing.Rmd @@ -52,7 +52,7 @@ Under Windows the OHDSI Deep Patient Level Prediction (DeepPLP) package requires ## Mac/Linux Users -Under Mac and Linux the OHDSI deepPLP package requires installing: +Under Mac and Linux the OHDSI DeepPLP package requires installing: - R ( ) - (R \>= 4.0.0, but latest is recommended) - Python - The package is tested with python 3.10, but \>= 3.8 should work @@ -83,9 +83,15 @@ By default `install_minconda()` creates an environment `r-reticulate` with `pyth reticulate::conda_install(envname = 'r-reticulate', packages=c('python=3.10')) ``` -Then when we can install `DeepPatientLevelPrediction` and it should install the required python packages in this environment. 
+If reticulate is having issues finding the conda installation you can use the function `reticulate::miniconda_path()` to find the default installation location for your miniconda installation. Then you can force reticulate to use the newly generated environment by setting the environment variable `RETICULATE_PYTHON` to point to the python binary in the environment. For example by adding the following to the `.Renviron` file: -If instead you want to use a specific python environment you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`. +``` +RETICULATE_PYTHON="/path/to/miniconda/envs/r-reticulate/bin/python" +``` + +Then you need to restart your R session. To verify that `reticulate` finds the correct version, you can call `reticulate::py_config()`. + +Once you have a working python environment that reticulate can locate you can install `DeepPatientLevelPrediction`. If you want to use a specific python environment you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`. ## Installing DeepPatientLevelPrediction using remotes @@ -93,11 +99,18 @@ To install using `remotes` run: ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} install.packages("remotes") -remotes::install_github("OHDSI/FeatureExtraction") -remotes::install_github("OHDSI/PatientLevelPrediction") remotes::install_github("OHDSI/DeepPatientLevelPrediction") ``` +This should install the required python packages. If that doesn't happen it can be triggered by calling: + +``` +library(DeepPatientLevelPrediction) +torch$randn(10L) +``` + +This should print out a tensor with ten different values.
+ When installing make sure to close any other Rstudio sessions that are using `DeepPatientLevelPrediction` or any dependency. Keeping Rstudio sessions open can cause locks on windows that prevent the package installing. # Testing Installation @@ -107,7 +120,7 @@ library(PatientLevelPrediction) library(DeepPatientLevelPrediction) data(plpDataSimulationProfile) -sampleSize <- 1e4 +sampleSize <- 1e3 plpData <- simulatePlpData( plpDataSimulationProfile, n = sampleSize