From 8942e11c4fb36dadb5ad0e8e8018347a317ccc0d Mon Sep 17 00:00:00 2001
From: Egill Axfjord Fridgeirsson
Date: Fri, 3 Nov 2023 05:42:56 -0400
Subject: [PATCH 1/7] Release 2.0.1 (#94)

* Connection parameter fixed to be in line with newest polars
* Fixed a bug where LRFinder used a hardcoded batch size
* Seed is now used in LRFinder so it's reproducible
* Fixed a bug in NumericalEmbedding
* Fixed a bug for Transformer and numerical features
* Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang)
* Updated installation documentation after feedback from HADES hackathon
* Fixed a bug where order of numeric features wasn't conserved between training and test set
* TrainingCache now only saves prediction dataframe for the best performing model

Co-authored-by: Henrik John
Co-authored-by: Xinzhuo Jiang
---
 .gitignore                          |  3 +-
 DESCRIPTION                         |  2 +-
 NEWS.md                             | 12 +++++
 R/Estimator.R                       | 40 +++++++++++-----
 R/TrainingCache-class.R             |  7 +++
 extras/example.R                    | 73 -----------------------------
 inst/python/Dataset.py              |  4 +-
 inst/python/LrFinder.py             |  7 +--
 inst/python/ResNet.py               |  4 +-
 inst/python/Transformer.py          |  5 +-
 man/TrainingCache.Rd                | 13 +++++
 tests/testthat/setup.R              | 18 +++++++
 tests/testthat/test-Estimator.R     | 25 +++-------
 tests/testthat/test-TrainingCache.R | 61 ++++++++++--------------
 tests/testthat/test-Transformer.R   | 32 +++++++++++++
 vignettes/Installing.Rmd            | 25 +++++++---
 16 files changed, 174 insertions(+), 157 deletions(-)
 delete mode 100644 extras/example.R

diff --git a/.gitignore b/.gitignore
index 56db315..94cb359 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ config.yml
 docs
 .idea/
 renv.lock
-extras/
\ No newline at end of file
+extras/
+.Renviron
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 0df769f..d4c1a53 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: DeepPatientLevelPrediction
 Type: Package
 Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
-Version: 2.0.0
+Version: 2.0.1
 Date: 18-04-2023
 Authors@R: c(
   person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")),
diff --git a/NEWS.md b/NEWS.md
index 3da704b..70ccf12 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+DeepPatientLevelPrediction 2.0.1
+======================
+  - Connection parameter fixed to be in line with newest polars
+  - Fixed a bug where LRFinder used a hardcoded batch size
+  - Seed is now used in LRFinder so it's reproducible
+  - Fixed a bug in NumericalEmbedding
+  - Fixed a bug for Transformer and numerical features
+  - Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang)
+  - Updated installation documentation after feedback from HADES hackathon
+  - Fixed a bug where order of numeric features wasn't conserved between training and test set
+  - TrainingCache now only saves prediction dataframe for the best performing model
+
 DeepPatientLevelPrediction 2.0.0
 ======================
 - New backend which uses pytorch through reticulate instead of torch in R
diff --git a/R/Estimator.R b/R/Estimator.R
index f804368..7705279 100644
--- a/R/Estimator.R
+++ b/R/Estimator.R
@@ -310,7 +310,8 @@ gridCvDeep <- function(mappedData,
   fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))]
   findLR <- modelSettings$estimatorSettings$findLR
-  for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
+  if (!trainCache$isFull()) {
+    for (gridId in 
trainCache$getLastGridSearchIndex():length(paramSearch)) {
      ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId))
      ParallelLogger::logInfo(paste0("HyperParameters: "))
      ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | "))
@@ -363,25 +364,38 @@ gridCvDeep <- function(mappedData,
       )
     }
     maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2)))
-    paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]]
-
     gridSearchPredictons[[gridId]] <- list(
       prediction = prediction,
-      param = paramSearch[[gridId]]
+      param = paramSearch[[gridId]],
+      gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]])
     )
+    gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)),
+                                                                                  nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary))
+    gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]]
+
+    # remove all predictions that are not the max performance
+    indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
+    for (i in seq_along(gridSearchPredictons)) {
+      if (!is.null(gridSearchPredictons[[i]])) {
+        if (i != indexOfMax) {
+          gridSearchPredictons[[i]]$prediction <- list(NULL)
+        }
+      }
+    }
+    ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax))
     trainCache$saveGridSearchPredictions(gridSearchPredictons)
   }
+  }
+  paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)
+  # get best params
+  indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
+  finalParam <- gridSearchPredictons[[indexOfMax]]$param
+
+  paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)
 
-  # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function)
-  paramGridSearch <- lapply(gridSearchPredictons, function(x) {
-    do.call(PatientLevelPrediction::computeGridPerformance, x)
-  }) # cvAUCmean, cvAUC, param
-
-  optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance)))
-  finalParam <- paramGridSearch[[optimalParamInd]]$param
-
-  cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction
+  # get best CV prediction
+  cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction
   cvPrediction$evaluationType <- "CV"
 
   ParallelLogger::logInfo("Training final model using optimal parameters")
diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R
index 8577f31..be626d5 100644
--- a/R/TrainingCache-class.R
+++ b/R/TrainingCache-class.R
@@ -69,6 +69,13 @@ TrainingCache <- R6::R6Class(
       return(private$.paramPersistence$gridSearchPredictions)
     },
 
+    #' @description
+    #' Check if cache is full
+    #' @returns Boolean
+    isFull = function() {
+      return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance)))))
+    },
+
     #' @description
     #' Gets the last index from the cached grid search
     #' @returns Last grid search index
diff --git a/extras/example.R b/extras/example.R
deleted file mode 100644
index 99a6c08..0000000
--- a/extras/example.R
+++ /dev/null
@@ -1,73 +0,0 @@
-# testing code (requires sequential branch of FeatureExtraction):
-# rm(list = ls())
-library(PatientLevelPrediction)
-library(DeepPatientLevelPrediction)
-
-data(plpDataSimulationProfile)
-sampleSize <- 1e3
-plpData <- simulatePlpData(
plpDataSimulationProfile, - n = sampleSize - ) - - -populationSet <- PatientLevelPrediction::createStudyPopulationSettings( - requireTimeAtRisk = F, - riskWindowStart = 1, - riskWindowEnd = 365*5) - -# -# modelSettings <- setDefaultTransformer(estimatorSettings = setEstimator( -# learningRate = "auto", -# batchSize=64L, -# epochs = 10L -# )) - -modelSettings <- setDefaultResNet(estimatorSettings = setEstimator( - learningRate = "auto", - weightDecay = 1e-06, - device="cuda:0", - batchSize=128L, - epochs=50L, - seed=42 -)) - -modelSettings <- setResNet(numLayers = c(1L, 2L), - sizeHidden = 72L, - hiddenFactor = 1L, - residualDropout = 0.0, - hiddenDropout = 0.0, - sizeEmbedding = 64L, - estimatorSettings = setEstimator( - learningRate = 3e-4, - batchSize = 128L, - epochs = 10L, - device = "cpu", - seed = 42 - ), - randomSample = 2) - -res2 <- PatientLevelPrediction::runPlp( - plpData = plpData, - outcomeId = unique(plpData$outcomes$outcomeId)[[1]], - modelSettings = modelSettings, - analysisId = 'Test', - analysisName = 'Testing DeepPlp', - populationSettings = populationSet, - splitSettings = createDefaultSplitSetting(splitSeed = 123), - sampleSettings = createSampleSettings("underSample"), # none - featureEngineeringSettings = createFeatureEngineeringSettings(), # none - preprocessSettings = createPreprocessSettings(normalize = F), - logSettings = createLogSettings(verbosity='TRACE'), - executeSettings = createExecuteSettings( - runSplitData = T, - runSampleData = T, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - ), - saveDirectory = '~/test/resnet/' -) - - diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 15d749b..98d9ed3 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -21,7 +21,7 @@ def __init__(self, if pathlib.Path(data).suffix == '.sqlite': data = urllib.parse.quote(data) data = pl.read_database("SELECT * from covariates", - connection_uri=f"sqlite://{data}").lazy() + connection=f"sqlite://{data}").lazy() else: data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) observations = data.select(pl.col('rowId').max()).collect()[0, 0] @@ -67,7 +67,7 @@ def __init__(self, if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). \ + numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). 
rank(method="dense") - 1, pl.col('rowId') - 1) \ .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 4c24a38..9d5bd0c 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -37,6 +37,7 @@ def __init__(self, smooth = lr_settings.get("smooth", 0.05) divergence_threshold = lr_settings.get("divergence_threshold", 4) torch.manual_seed(seed=estimator_settings["seed"]) + self.seed = estimator_settings["seed"] self.model = model(**model_parameters) if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -55,18 +56,18 @@ def __init__(self, self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) self.criterion = estimator_settings["criterion"]() - self.batch_size = estimator_settings['batch_size'] + self.batch_size = int(estimator_settings['batch_size']) self.losses = None self.loss_index = None def get_lr(self, dataset): batch_index = torch.arange(0, len(dataset), 1).tolist() - + random.seed(self.seed) losses = torch.empty(size=(self.num_lr,), dtype=torch.float) lrs = torch.empty(size=(self.num_lr,), dtype=torch.float) for i in tqdm(range(self.num_lr)): self.optimizer.zero_grad() - random_batch = random.sample(batch_index, 32) + random_batch = random.sample(batch_index, self.batch_size) batch = dataset[random_batch] batch = batch_to_device(batch, self.device) diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index f680eb2..cef4b49 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -130,9 +130,9 @@ def __init__(self, nn.init.kaiming_uniform_(parameter, a=math.sqrt(5)) def forward(self, input): - x = self.weight.unsqueeze(0) * input.unsqueeze(-1) + x = self.weight[None] * input[..., None] if self.bias is not None: - x = x + self.bias.unsqueeze(-1) + x = x + self.bias[None] return x diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 5944e1b..1c95b36 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -49,6 +49,9 @@ def __init__(self, if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) + self.use_numerical = True + else: + self.use_numerical = False self.class_token = ClassToken(dim_token) self.layers = nn.ModuleList([]) @@ -78,7 +81,7 @@ def __init__(self, def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) cat = self.categorical_embedding(x["cat"]) - if "num" in x.keys() and self.numerical_embedding is not None: + if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) mask = torch.cat([mask, torch.zeros([x.shape[0], diff --git a/man/TrainingCache.Rd b/man/TrainingCache.Rd index 0a7ec7b..c82bb23 100644 --- a/man/TrainingCache.Rd +++ b/man/TrainingCache.Rd @@ -8,6 +8,8 @@ Whether the provided and cached parameter grid is identical Grid search results from the training cache +Boolen + Last grid search index } \description{ @@ -21,6 +23,7 @@ Parameter caching for training persistence and continuity \item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}} \item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}} \item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}} +\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}} \item 
\href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}}
\item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}}
\item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}}
}
}
@@ -104,6 +107,16 @@ Gets the grid search results from the training cache
\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-TrainingCache-isFull"></a>}}
+\if{latex}{\out{\hypertarget{method-TrainingCache-isFull}{}}}
+\subsection{Method \code{isFull()}}{
+Check if cache is full
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}}
+}
+
}
\if{html}{\out{<hr>
}} \if{html}{\out{}} diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index adf0dcb..7cd0fee 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -78,3 +78,21 @@ dataset <- Dataset$Data( ) small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3))) +modelSettings <- setResNet( + numLayers = 1, sizeHidden = 16, hiddenFactor = 1, + residualDropout = c(0, 0.2), hiddenDropout = 0, + sizeEmbedding = 16, hyperParamSearch = "random", + randomSample = 2, + setEstimator(epochs=1, + learningRate = 3e-4) +) +fitEstimatorPath <- file.path(testLoc, 'fitEstimator') +if (!dir.exists(fitEstimatorPath)) { + dir.create(fitEstimatorPath) +} +fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + analysisId = 1, + analysisPath = fitEstimatorPath) + + diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R index 9cb1382..b4dd0a4 100644 --- a/tests/testthat/test-Estimator.R +++ b/tests/testthat/test-Estimator.R @@ -146,25 +146,12 @@ test_that("early stopping works", { testthat::expect_true(earlyStop$early_stop) }) -modelSettings <- setResNet( - numLayers = 1, sizeHidden = 16, hiddenFactor = 1, - residualDropout = 0, hiddenDropout = 0, - sizeEmbedding = 16, hyperParamSearch = "random", - randomSample = 1, - setEstimator(epochs=1, - learningRate = 3e-4) -) - -sink(nullfile()) -results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1, analysisPath = testLoc) -sink() - test_that("Estimator fit function works", { - expect_true(!is.null(results$trainDetails$trainingTime)) + expect_true(!is.null(fitEstimatorResults$trainDetails$trainingTime)) - expect_equal(class(results), "plpModel") - expect_equal(attr(results, "modelType"), "binary") - expect_equal(attr(results, "saveType"), "file") + expect_equal(class(fitEstimatorResults), "plpModel") + expect_equal(attr(fitEstimatorResults, "modelType"), "binary") + expect_equal(attr(fitEstimatorResults, "saveType"), "file") fakeTrainData <- trainData fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake")) expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc)) @@ -184,7 +171,7 @@ test_that("predictDeepEstimator works", { # input is a plpModel and data sink(nullfile()) predictions <- predictDeepEstimator( - plpModel = results, data = trainData$Test, + plpModel = fitEstimatorResults, data = trainData$Test, trainData$Test$labels ) sink() @@ -369,4 +356,4 @@ test_that("estimatorSettings can be saved and loaded with correct python objects testthat::expect_false(reticulate::py_is_null_xptr(optimizer)) testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun)) testthat::expect_false(reticulate::py_is_null_xptr(criterion)) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R index eb4ab17..debe95c 100644 --- a/tests/testthat/test-TrainingCache.R +++ b/tests/testthat/test-TrainingCache.R @@ -47,43 +47,32 @@ test_that("Param grid predictions can be cached", { }) test_that("Estimator can resume training from cache", { - modelPath <- tempdir() - analysisPath <- file.path(modelPath, "Analysis_TrainCacheResNet") - dir.create(analysisPath) - trainCache <- TrainingCache$new(analysisPath) - trainCache$saveModelParams(paramSearch) + trainCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds")) + newPath <- file.path(testLoc, 'resume') + dir.create(newPath) + + # remove last row + trainCache$gridSearchPredictions[[2]] 
<- NULL + length(trainCache$gridSearchPredictions) <- 2 + + # save new cache + saveRDS(trainCache, file=file.path(newPath, "paramPersistence.rds")) sink(nullfile()) - res2 <- tryCatch( - { - PatientLevelPrediction::runPlp( - plpData = plpData, - outcomeId = 3, - modelSettings = resNetSettings, - analysisId = "Analysis_TrainCacheResNet", - analysisName = "Testing Training Cache", - populationSettings = populationSet, - splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), - sampleSettings = PatientLevelPrediction::createSampleSettings(), # none - featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none - preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), - executeSettings = PatientLevelPrediction::createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - ), - saveDirectory = modelPath - ) - }, - error = function(e) { - print(e) - return(NULL) - } - ) + fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + analysisId = 1, + analysisPath = newPath) sink() - trainCache <- TrainingCache$new(analysisPath) - testthat::expect_equal(is.na(trainCache$getLastGridSearchIndex()), TRUE) + + newCache <- readRDS(file.path(newPath, "paramPersistence.rds")) + testthat::expect_equal(nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary), 4) +}) + +test_that("Prediction is cached for optimal parameters", { + testCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds")) + indexOfMax <- which.max(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance))) + indexOfMin <- which.min(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance))) + testthat::expect_equal(class(testCache$gridSearchPredictions[[indexOfMax]]$prediction), class(data.frame())) + testthat::expect_null(testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]]) }) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index b3e421f..043cbe7 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -85,6 +85,9 @@ test_that("transformer nn-module works", { dim_hidden = 32 ) output <- model(input) + expect_equal(output$shape[0], 10L) + input$num <- reticulate::py_none() + output <- model(input) expect_equal(output$shape[0], 10L) }) @@ -126,3 +129,32 @@ test_that("dimHidden ratio works as expected", { dimHiddenRatio = 4/3)) }) + +test_that("numerical embedding works as expected", { + embeddings <- 32L # size of embeddings + features <- 2L # number of numerical features + patients <- 9L + + numTensor <- torch$randn(c(patients, features)) + + numericalEmbeddingClass <- reticulate::import_from_path("ResNet", path=path)$NumericalEmbedding + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = TRUE) + out <- numericalEmbedding(numTensor) + + # should be patients x features x embedding size + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) + + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = FALSE) + + out <- numericalEmbedding(numTensor) + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) + + }) diff --git 
a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd
index 5b88419..1aded4d 100644
--- a/vignettes/Installing.Rmd
+++ b/vignettes/Installing.Rmd
@@ -52,7 +52,7 @@ Under Windows the OHDSI Deep Patient Level Prediction (DeepPLP) package requires
 
 ## Mac/Linux Users
 
-Under Mac and Linux the OHDSI deepPLP package requires installing:
+Under Mac and Linux the OHDSI DeepPLP package requires installing:
 
 - R ( ) - (R \>= 4.0.0, but latest is recommended)
 - Python - The package is tested with python 3.10, but \>= 3.8 should work
@@ -83,9 +83,15 @@ By default `install_miniconda()` creates an environment `r-reticulate` with `python
 reticulate::conda_install(envname = 'r-reticulate', packages=c('python=3.10'))
 ```
 
-Then when we can install `DeepPatientLevelPrediction` and it should install the required python packages in this environment.
+If reticulate has trouble finding the conda installation, you can use `reticulate::miniconda_path()` to find the default location of your miniconda installation. You can then force reticulate to use the newly created environment by setting the environment variable `RETICULATE_PYTHON` to point to the python binary in that environment, for example by adding the following to the `.Renviron` file:
 
-If instead you want to use a specific python environment you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`.
+```
+RETICULATE_PYTHON="/path/to/miniconda/envs/r-reticulate/bin/python"
+```
+
+Then you need to restart your R session. To verify that `reticulate` finds the correct version, you can call `reticulate::py_config()`.
+
+Once you have a working python environment that reticulate can locate, you can install `DeepPatientLevelPrediction`. If you want to use a specific python environment, you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`.
 
 ## Installing DeepPatientLevelPrediction using remotes
 
 To install using `remotes` run:
 
 ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
 install.packages("remotes")
-remotes::install_github("OHDSI/FeatureExtraction")
-remotes::install_github("OHDSI/PatientLevelPrediction")
 remotes::install_github("OHDSI/DeepPatientLevelPrediction")
 ```
 
+This should install the required python packages. If that doesn't happen, it can be triggered by calling:
+
+```
+library(DeepPatientLevelPrediction)
+torch$randn(10L)
+```
+
+This should print out a tensor with ten different values.
+
 When installing make sure to close any other RStudio sessions that are using `DeepPatientLevelPrediction` or any dependency. Keeping RStudio sessions open can cause locks on Windows that prevent the package from installing.
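+
+To double-check that `reticulate` and the python dependencies work together, you can run something like the following (a minimal sketch; it only uses `reticulate::py_config()` and the `torch` module object that loading `DeepPatientLevelPrediction` makes available, as in the snippet above):
+
+```
+library(DeepPatientLevelPrediction)
+# show which python binary reticulate resolved
+reticulate::py_config()
+# create a small random tensor to confirm the torch backend is usable
+torch$randn(c(2L, 3L))
+```
+
+If both calls succeed the installation is usable; if `py_config()` points at an unexpected python, revisit the `RETICULATE_PYTHON` setting described above.
+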
# Testing Installation @@ -107,7 +120,7 @@ library(PatientLevelPrediction) library(DeepPatientLevelPrediction) data(plpDataSimulationProfile) -sampleSize <- 1e4 +sampleSize <- 1e3 plpData <- simulatePlpData( plpDataSimulationProfile, n = sampleSize From 3a8d873a78a5b9c37aca0e0444fdc3ce0f37d397 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 4 Dec 2023 17:17:23 +0100 Subject: [PATCH 2/7] add test for conserved columns during external validation --- R/Estimator.R | 8 +- R/LRFinder.R | 11 +- inst/python/Dataset.py | 23 +-- tests/testthat/setup.R | 59 +++--- tests/testthat/test-Dataset.R | 104 +++++++++-- tests/testthat/test-Estimator.R | 274 +++++++++++++++------------- tests/testthat/test-LRFinder.R | 119 ++++++------ tests/testthat/test-MLP.R | 127 ++++++------- tests/testthat/test-ResNet.R | 157 ++++++++-------- tests/testthat/test-TrainingCache.R | 60 +++--- tests/testthat/test-Transformer.R | 90 ++++----- 11 files changed, 583 insertions(+), 449 deletions(-) diff --git a/R/Estimator.R b/R/Estimator.R index 374bf2d..6ba9283 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -328,9 +328,9 @@ gridCvDeep <- function(mappedData, fitParams, paramSearch[[gridId]]) currentEstimatorSettings$modelType <- modelSettings$modelType - currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] + currentModelParams$catFeatures <- dataset$get_cat_features()$max() currentModelParams$numFeatures <- - dataset$get_numerical_features()$shape[[1]] + dataset$get_numerical_features()$max() if (findLR) { lrFinder <- createLRFinder(modelType = modelSettings$modelType, modelParameters = currentModelParams, @@ -401,8 +401,8 @@ gridCvDeep <- function(mappedData, dir.create(file.path(modelLocation), recursive = TRUE) } - modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] + modelParams$catFeatures <- dataset$get_cat_features()$max() + modelParams$numFeatures <- dataset$get_numerical_features()$max() estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, diff --git a/R/LRFinder.R b/R/LRFinder.R index 63cb62c..4ab006c 100644 --- a/R/LRFinder.R +++ b/R/LRFinder.R @@ -22,16 +22,21 @@ createLRFinder <- function(modelType, path <- system.file("python", package = "DeepPatientLevelPrediction") lrFinderClass <- reticulate::import_from_path("LrFinder", path = path)$LrFinder + + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) + estimatorSettings <- evalEstimatorSettings(estimatorSettings) + + # estimator <- createEstimator(modelType = estimatorSettings$modelType, + # modelParameters = modelParameters, + # estimatorSettings = estimatorSettings) if (!is.null(lrSettings)) { lrSettings <- camelCaseToSnakeCaseNames(lrSettings) } - - estimatorSettings <- evalEstimatorSettings(estimatorSettings) - + lrFinder <- lrFinderClass(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings, diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index a81dfc1..b488520 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -47,13 +47,8 @@ def __init__(self, data, labels=None, numerical_features=None): # rename newColumnId to columnId and sort by it data_cat = ( data.filter(~pl.col("columnId").is_in(self.numerical_features)) - .sort(by="columnId") - .with_row_count("newColumnId") - .with_columns( - 
pl.col("newColumnId").first().over("columnId").rank(method="dense") - ) - .select(pl.col("rowId"), pl.col("newColumnId").alias("columnId")) - .sort("rowId") + .select(pl.col("rowId"), pl.col("columnId")) + .sort(["rowId", "columnId"]) .with_columns(pl.col("rowId") - 1) .collect() ) @@ -77,18 +72,16 @@ def __init__(self, data, labels=None, numerical_features=None): if pl.count(self.numerical_features) == 0: self.num = None else: + map_numerical = dict(zip(self.numerical_features.sort().to_list(), + list(range(len(self.numerical_features))))) + numerical_data = ( data.filter(pl.col("columnId").is_in(self.numerical_features)) - .sort(by="columnId") - .with_row_count("newColumnId") - .with_columns( - pl.col("newColumnId").first().over("columnId").rank(method="dense") - - 1, - pl.col("rowId") - 1, - ) + .with_columns(pl.col("columnId").replace(map_numerical), + pl.col("rowId") - 1) .select( pl.col("rowId"), - pl.col("newColumnId").alias("columnId"), + pl.col("columnId"), pl.col("covariateValue"), ) .collect() diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 7cd0fee..4a6e74d 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -7,13 +7,13 @@ connectionDetails <- Eunomia::getEunomiaConnectionDetails() Eunomia::createCohorts(connectionDetails) covSet <- FeatureExtraction::createCovariateSettings( - useDemographicsGender = T, - useDemographicsAge = T, - useDemographicsRace = T, - useDemographicsEthnicity = T, - useDemographicsAgeGroup = T, - useConditionGroupEraLongTerm = T, - useDrugEraStartLongTerm = T, + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useDemographicsRace = TRUE, + useDemographicsEthnicity = TRUE, + useDemographicsAgeGroup = TRUE, + useConditionGroupEraLongTerm = TRUE, + useDrugEraStartLongTerm = TRUE, endDays = -1 ) @@ -30,10 +30,11 @@ databaseDetails <- PatientLevelPrediction::createDatabaseDetails( cdmDatabaseName = "eunomia" ) -restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( - firstExposureOnly = T, - washoutPeriod = 365 -) +restrictPlpDataSettings <- + PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = TRUE, + washoutPeriod = 365 + ) plpData <- PatientLevelPrediction::getPlpData( databaseDetails = databaseDetails, @@ -41,9 +42,24 @@ plpData <- PatientLevelPrediction::getPlpData( covariateSettings = covSet ) +# add age squared so I have more than one numerical feature +plpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>% + dplyr::rows_append(data.frame( + covariateId = 2002, + covariateName = "Squared age", + analysisId = 2, + conceptId = 0), copy = TRUE) + +squaredAges <- plpData$covariateData$covariates %>% + dplyr::filter(covariateId == 1002) %>% + dplyr::mutate(covariateId = 2002, + covariateValue = .data$covariateValue**2) + +plpData$covariateData$covariates <- plpData$covariateData$covariates %>% + dplyr::rows_append(squaredAges) populationSet <- PatientLevelPrediction::createStudyPopulationSettings( - requireTimeAtRisk = F, + requireTimeAtRisk = FALSE, riskWindowStart = 1, riskWindowEnd = 365 ) @@ -66,33 +82,32 @@ mappedData <- PatientLevelPrediction::MapIds( ) path <- system.file("python", package = "DeepPatientLevelPrediction") -Dataset <- reticulate::import_from_path("Dataset", path = path) +datasetClass <- reticulate::import_from_path("Dataset", path = path) if (is.null(attributes(mappedData)$path)) { # sqlite object attributes(mappedData)$path <- attributes(mappedData)$dbname } -dataset <- Dataset$Data( +dataset <- datasetClass$Data( 
data = reticulate::r_to_py(normalizePath(attributes(mappedData)$path)),
  labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
)
-small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3)))
+smallDataset <- torch$utils$data$Subset(dataset,
+                                        (1:round(length(dataset) / 3)))
 modelSettings <- setResNet(
   numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
   residualDropout = c(0, 0.2), hiddenDropout = 0,
   sizeEmbedding = 16, hyperParamSearch = "random",
   randomSample = 2,
-  setEstimator(epochs=1,
+  setEstimator(epochs = 1,
                learningRate = 3e-4)
 )
-fitEstimatorPath <- file.path(testLoc, 'fitEstimator')
+fitEstimatorPath <- file.path(testLoc, "fitEstimator")
 if (!dir.exists(fitEstimatorPath)) {
   dir.create(fitEstimatorPath)
 }
-fitEstimatorResults <- fitEstimator(trainData$Train, 
-                                    modelSettings = modelSettings, 
-                                    analysisId = 1, 
+fitEstimatorResults <- fitEstimator(trainData$Train,
+                                    modelSettings = modelSettings,
+                                    analysisId = 1,
                                     analysisPath = fitEstimatorPath)
-
-
diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R
index 940f9d1..fd3ce92 100644
--- a/tests/testthat/test-Dataset.R
+++ b/tests/testthat/test-Dataset.R
@@ -1,7 +1,9 @@
 test_that("number of num and cat features sum correctly", {
   testthat::expect_equal(
-    length(dataset$get_numerical_features()) + length(dataset$get_cat_features()),
-    dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>%
+    length(dataset$get_numerical_features()) +
+      length(dataset$get_cat_features()),
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
                         dplyr::pull(covariateId))
   )
 })
@@ -12,40 +14,112 @@
 test_that("length of dataset correct", {
   expect_equal(length(dataset), dataset$num$shape[0])
   expect_equal(
     length(dataset),
-    dplyr::n_distinct(mappedData$covariates %>%
-                        dplyr::collect() %>% dplyr::pull(rowId))
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
+                        dplyr::pull(.data$rowId))
   )
 })
 
 test_that(".getbatch works", {
-  batch_size <- 16
+  batchSize <- 16
   # get one sample
   out <- dataset[10]
-
-  # output should be a list of two items, the batch in pos 1 and targets in pos 2,
+
+  # output should be a list of two items,
+  # the batch in pos 1 and targets in pos 2,
   # the batch is what goes to the model
   expect_equal(length(out), 2)
-
+
   # targets should be binary
   expect_true(out[[2]]$item() %in% c(0, 1))
-
+
   # shape of batch is correct
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 1)
   expect_equal(out[[1]]$num$shape[0], 1)
-
+
   # shape of target
   expect_equal(out[[2]]$shape$numel(), 1)
-
+
   # get a whole batch
-  out <- dataset[10:(10 + batch_size)]
-
+  out <- dataset[10:(10 + batchSize)]
+
   expect_equal(length(out), 2)
   expect_true(all(out[[2]]$numpy() %in% c(0, 1)))
-
+
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 16)
   expect_equal(out[[1]]$num$shape[0], 16)
-
+
   expect_equal(out[[2]]$shape[0], 16)
 })
+
+test_that("Column order is preserved when features are missing", {
+  # important for transfer learning and external validation
+
+  reducedCovData <- Andromeda::copyAndromeda(trainData$Train$covariateData)
+
+  # remove one numerical and one categorical
+  numFeature <- 1002 # continuous age
+  catFeature <- 4285898210 # a random common cat feature
+  reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
+    dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
+  reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
+    dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
+
+  
mappedReducedData <- PatientLevelPrediction::MapIds(
+    reducedCovData,
+    mapping = mappedData$mapping
+  )
+
+  catColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == catFeature) %>%
+    dplyr::pull("columnId")
+  numColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == numFeature) %>%
+    dplyr::pull("columnId")
+
+  reducedDataset <- datasetClass$Data(
+    data =
+      reticulate::r_to_py(normalizePath(attributes(mappedReducedData)$dbname)),
+    labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
+    numerical_features = dataset$numerical_features$to_list()
+  )
+
+  # should have same number of columns
+  expect_equal(dataset$num$shape[[1]], reducedDataset$num$shape[[1]])
+
+  # all zeros in column with removed feature, -1 because r to py
+  expect_true(reducedDataset$num[, numColumn - 1]$sum()$item() == 0)
+
+  # all other columns are same
+  indexReduced <- !torch$isin(torch$arange(reducedDataset$num$shape[[1]]),
+                              numColumn - 1)
+  index <- !torch$isin(torch$arange(dataset$num$shape[[1]]),
+                       numColumn - 1)
+
+  expect_equal(reducedDataset$num[, indexReduced]$mean()$item(),
+               dataset$num[, index]$mean()$item())
+
+  # cat data should have same counts of all columnIds
+  # except the one that was removed
+  # not same counts for removed feature
+  expect_false(isTRUE(all.equal((reducedDataset$cat == catColumn)$sum()$item(),
+                                (dataset$cat == catColumn)$sum()$item())))
+
+  # get counts
+  counts <- as.array(torch$unique(dataset$cat,
+                                  return_counts = TRUE)[[2]]$numpy())
+  counts <- counts[-(catColumn + 1)] # +1 because py_to_r
+  counts <- counts[-1]
+
+  reducedCounts <- as.array(torch$unique(reducedDataset$cat,
+                                         return_counts = TRUE)[[2]]$numpy())
+  reducedCounts <- reducedCounts[-(catColumn + 1)] # +1 because py_to_r
+  reducedCounts <- reducedCounts[-1]
+
+  expect_false(isTRUE(all.equal(counts, reducedCounts)))
+  expect_equal(dataset$get_cat_features()$max(),
+               reducedDataset$get_cat_features()$max())
+
+})
diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R
index b4dd0a4..9972839 100644
--- a/tests/testthat/test-Estimator.R
+++ b/tests/testthat/test-Estimator.R
@@ -1,5 +1,5 @@
-catFeatures <- small_dataset$dataset$get_cat_features()$shape[[1]]
-numFeatures <- small_dataset$dataset$get_numerical_features()$shape[[1]]
+catFeatures <- smallDataset$dataset$get_cat_features()$max()
+numFeatures <- smallDataset$dataset$get_numerical_features()$max()
 
 modelParameters <- list(
   cat_features = catFeatures,
@@ -10,18 +10,20 @@ modelParameters <- list(
   hidden_factor = 2
 )
 
-estimatorSettings <- setEstimator(learningRate = 3e-4,
-                                  weightDecay = 0.0,
-                                  batchSize = 128,
-                                  epochs = 5,
-                                  device = 'cpu',
-                                  seed=42,
-                                  optimizer=torch$optim$AdamW,
-                                  criterion=torch$nn$BCEWithLogitsLoss,
-                                  metric='loss',
-                                  scheduler= list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau,
-                                                  params=list(patience=1)),
-                                  earlyStopping=NULL)
+estimatorSettings <-
+  setEstimator(learningRate = 3e-4,
+               weightDecay = 0.0,
+               batchSize = 128,
+               epochs = 5,
+               device = "cpu",
+               seed = 42,
+               optimizer = torch$optim$AdamW,
+               criterion = torch$nn$BCEWithLogitsLoss,
+               metric = "loss",
+               scheduler =
+                 list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau,
+                      params = list(patience = 1)),
+               earlyStopping = NULL)
 
 modelType <- "ResNet"
 estimator <- createEstimator(modelType = modelType,
@@ -32,11 +34,12 @@ test_that("Estimator initialization works", {
 
   # count parameters in both instances
   path <- system.file("python", package = "DeepPatientLevelPrediction")
-  ResNet <- reticulate::import_from_path(modelType, 
path=path)[[modelType]] + resNet <- reticulate::import_from_path(modelType, path = path)[[modelType]] testthat::expect_equal( - sum(reticulate::iterate(estimator$model$parameters(), function(x) x$numel())), - sum(reticulate::iterate(do.call(ResNet, modelParameters)$parameters(), + sum(reticulate::iterate(estimator$model$parameters(), + function(x) x$numel())), + sum(reticulate::iterate(do.call(resNet, modelParameters)$parameters(), function(x) x$numel())) ) @@ -47,8 +50,8 @@ test_that("Estimator initialization works", { }) test_that("Estimator detects wrong inputs", { - - testthat::expect_error(setEstimator(learningRate='notAuto')) + + testthat::expect_error(setEstimator(learningRate = "notAuto")) testthat::expect_error(setEstimator(weightDecay = -1)) testthat::expect_error(setEstimator(weightDecay = "text")) testthat::expect_error(setEstimator(batchSize = 0)) @@ -61,7 +64,7 @@ test_that("Estimator detects wrong inputs", { }) sink(nullfile()) -estimator$fit(small_dataset, small_dataset) +estimator$fit(smallDataset, smallDataset) sink() test_that("estimator fitting works", { @@ -70,72 +73,74 @@ test_that("estimator fitting works", { expect_true(!is.null(estimator$best_score$loss)) expect_true(!is.null(estimator$best_score$auc)) - old_weights <- estimator$model$head$weight$mean()$item() + oldWeights <- estimator$model$head$weight$mean()$item() sink(nullfile()) - estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) + estimator$fit_whole_training_set(smallDataset, estimator$learn_rate_schedule) sink() - expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learn_rate_schedule, 1)[[1]]) + expect_equal(estimator$optimizer$param_groups[[1]]$lr, + tail(estimator$learn_rate_schedule, 1)[[1]]) - new_weights <- estimator$model$head$weight$mean()$item() + newWeights <- estimator$model$head$weight$mean()$item() # model should be updated when refitting - expect_true(old_weights != new_weights) + expect_true(oldWeights != newWeights) estimator$save(testLoc, "estimator.pt") expect_true(file.exists(file.path(testLoc, "estimator.pt"))) - + sink(nullfile()) preds <- estimator$predict_proba(dataset) sink() - + expect_lt(max(preds), 1) expect_gt(min(preds), 0) - + sink(nullfile()) - classes <- estimator$predict(small_dataset, threshold = 0.5) + classes <- estimator$predict(smallDataset, threshold = 0.5) sink() expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) - + sink(nullfile()) - classes <- estimator$predict(small_dataset$dataset) + classes <- estimator$predict(smallDataset$dataset) sink() expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) - + estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, epochs = 5, - device = 'cpu', - metric= "loss") - - estimator <- createEstimator(modelType=modelType, - modelParameters=modelParameters, - estimatorSettings=estimatorSettings) - + device = "cpu", + metric = "loss") + + estimator <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings = estimatorSettings) + sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - + expect_equal(estimator$metric$mode, "min") expect_equal(estimator$metric$name, "loss") - + sink(nullfile()) - estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) + estimator$fit_whole_training_set(smallDataset, estimator$learn_rate_schedule) sink() - + expect_equal(estimator$learn_rate_schedule[[estimator$best_epoch]], 
estimator$optimizer$param_groups[[1]]$lr) - + }) test_that("early stopping works", { - EarlyStopping <- reticulate::import_from_path("Estimator", path=path)$EarlyStopping - earlyStop <- EarlyStopping(patience = 3, delta = 0, verbose = FALSE) - - + earlyStopping <- + reticulate::import_from_path("Estimator", path = path)$EarlyStopping + earlyStop <- earlyStopping(patience = 3, delta = 0, verbose = FALSE) + + testthat::expect_true(is.null(earlyStop$best_score)) testthat::expect_false(earlyStop$early_stop) earlyStop(0.5) @@ -153,15 +158,19 @@ test_that("Estimator fit function works", { expect_equal(attr(fitEstimatorResults, "modelType"), "binary") expect_equal(attr(fitEstimatorResults, "saveType"), "file") fakeTrainData <- trainData - fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake")) - expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc)) + fakeTrainData$train$covariateData <- list(fakeCovData = c("Fake")) + expect_error(fitEstimator(fakeTrainData$train, + modelSettings, analysisId = 1, + analysisPath = testLoc)) }) test_that("predictDeepEstimator works", { # input is an estimator and a dataset sink(nullfile()) - predictions <- predictDeepEstimator(estimator, small_dataset, cohort = trainData$Train$labels) + predictions <- predictDeepEstimator(estimator, + smallDataset, + cohort = trainData$Train$labels) sink() expect_lt(max(predictions$value), 1) @@ -181,22 +190,23 @@ test_that("predictDeepEstimator works", { }) test_that("batchToDevice works", { - batch_to_device <- reticulate::import_from_path("Estimator", path=path)$batch_to_device + batchToDevice <- reticulate::import_from_path("Estimator", + path = path)$batch_to_device # since we can't test moving to gpu there is a fake device called meta # which we can use to test of the device is updated estimator$device <- "meta" b <- 1:10 - batch <- batch_to_device(dataset[b], device=estimator$device) - + batch <- batchToDevice(dataset[b], device = estimator$device) + devices <- lapply( lapply(unlist(batch, recursive = TRUE), function(x) x$device), function(x) x == torch$device(type = "meta") ) # test that all are meta expect_true(all(devices == TRUE)) - - numDevice <- batch_to_device(dataset[b][[1]]$num, device=estimator$device) - expect_true(numDevice$device==torch$device(type="meta")) + + numDevice <- batchToDevice(dataset[b][[1]]$num, device = estimator$device) + expect_true(numDevice$device == torch$device(type = "meta")) }) test_that("Estimator without earlyStopping works", { @@ -205,19 +215,19 @@ test_that("Estimator without earlyStopping works", { weightDecay = 0.0, batchSize = 128, epochs = 1, - device = 'cpu', + device = "cpu", earlyStopping = NULL) - + estimator2 <- createEstimator(modelType = modelType, modelParameters = modelParameters, - estimatorSettings=estimatorSettings) + estimatorSettings = estimatorSettings) sink(nullfile()) - estimator2$fit(small_dataset, small_dataset) + estimator2$fit(smallDataset, smallDataset) sink() - + expect_null(estimator2$early_stopper) expect_true(!is.null(estimator2$best_epoch)) - + }) test_that("Early stopper can use loss and stops early", { @@ -225,78 +235,89 @@ test_that("Early stopper can use loss and stops early", { weightDecay = 0.0, batchSize = 128, epochs = 10, - device = 'cpu', - earlyStopping =list(useEarlyStopping=TRUE, - params = list(mode=c('min'), - patience=1)), - metric = 'loss', - seed=42) - + device = "cpu", + earlyStopping = + list(useEarlyStopping = TRUE, + params = list(mode = c("min"), + patience = 1)), + metric 
= "loss", + seed = 42) + estimator <- createEstimator(modelType = modelType, modelParameters = modelParameters, estimatorSettings = estimatorSettings) sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - + expect_true(estimator$best_epoch < estimator$epochs) - + }) -test_that('Custom metric in estimator works', { - - metric_fun <- function(predictions, labels) { - pr <- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), +test_that("Custom metric in estimator works", { + + metricFun <- function(predictions, labels) { + pr <- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), weights.class0 = labels$numpy()) auprc <- pr$auc.integral reticulate::r_to_py(auprc) } - + estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, device = "cpu", epochs = 1, - metric=list(fun=metric_fun, - name="auprc", - mode="max")) + metric = list(fun = metricFun, + name = "auprc", + mode = "max")) estimator <- createEstimator(modelType = modelType, modelParameters = modelParameters, estimatorSettings = estimatorSettings) expect_true(is.function(estimator$metric$fun)) expect_true(is.character(estimator$metric$name)) expect_true(estimator$metric$mode %in% c("min", "max")) - + sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - - expect_true(estimator$best_score[["auprc"]]>0) + + expect_true(estimator$best_score[["auprc"]] > 0) }) -test_that("setEstimator with paramsToTune is correctly added to hyperparameters", { - estimatorSettings <- setEstimator(learningRate=c(3e-4,1e-3), - batchSize=128, - epochs=1, - device="cpu", - metric=c("auc", "auprc"), - earlyStopping = list(useEarlyStopping=TRUE, - params=list(patience=c(4,10)))) +test_that("setEstimator with hyperparameters", { + estimatorSettings <- + setEstimator(learningRate = c(3e-4, 1e-3), + batchSize = 128, + epochs = 1, + device = "cpu", + metric = c("auc", "auprc"), + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = c(4, 10)))) modelSettings <- setResNet(numLayers = 1, sizeHidden = 64, hiddenFactor = 1, residualDropout = 0.2, - hiddenDropout = 0.2,sizeEmbedding = 128, + hiddenDropout = 0.2, sizeEmbedding = 128, estimatorSettings = estimatorSettings, hyperParamSearch = "grid") - + expect_true(length(modelSettings$param) == 8) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.metric)))==2) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.learningRate)))==2) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.earlyStopping.patience)))==2) - fitParams <- names(modelSettings$param[[1]])[grepl("^estimator", names(modelSettings$param[[1]]))] - - estimatorSettings2 <- fillEstimatorSettings(estimatorSettings, fitParams, paramSearch=modelSettings$param[[8]]) - + expect_true(length(unique(lapply(modelSettings$param, + function(x) x$estimator.metric))) == 2) + expect_true(length(unique(lapply(modelSettings$param, + function(x) x$estimator.learningRate))) == 2) + expect_true(length(unique(lapply(modelSettings$param, function(x) { + x$estimator.earlyStopping.patience + }))) == 2) + + fitParams <- + names(modelSettings$param[[1]])[grepl("^estimator", + names(modelSettings$param[[1]]))] + + estimatorSettings2 <- + fillEstimatorSettings(estimatorSettings, fitParams, + paramSearch = modelSettings$param[[8]]) + expect_equal(estimatorSettings2$learningRate, 1e-3) 
expect_equal(as.character(estimatorSettings2$metric), "auprc") expect_equal(estimatorSettings2$earlyStopping$params$patience, 10) @@ -304,55 +325,54 @@ test_that("setEstimator with paramsToTune is correctly added to hyperparameters" test_that("device as a function argument works", { getDevice <- function() { - dev <- Sys.getenv("testDeepPLPDevice") - if (dev == ""){ - dev = "cpu" - } else{ + dev <- Sys.getenv("testDeepPLPDevice") + if (dev == "") { + dev <- "cpu" + } else { dev } } - estimatorSettings <- setEstimator(device=getDevice, + estimatorSettings <- setEstimator(device = getDevice, learningRate = 3e-4) - - model <- setDefaultResNet(estimatorSettings = estimatorSettings) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 estimator <- createEstimator(modelType = modelType, modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) - + expect_equal(estimator$device, "cpu") - + Sys.setenv("testDeepPLPDevice" = "meta") - - estimatorSettings <- setEstimator(device=getDevice, + + estimatorSettings <- setEstimator(device = getDevice, learningRate = 3e-4) - - model <- setDefaultResNet(estimatorSettings = estimatorSettings) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 - + estimator <- createEstimator(modelType = modelType, modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) - + expect_equal(estimator$device, "meta") - + Sys.unsetenv("testDeepPLPDevice") - - }) -test_that("estimatorSettings can be saved and loaded with correct python objects", { +}) + +test_that("estimatorSettings can be saved and loaded with python objects", { settings <- setEstimator() - saveRDS(settings,file=file.path(testLoc, 'settings.RDS')) - - loadedSettings <- readRDS(file = file.path(testLoc, 'settings.RDS')) - + saveRDS(settings, file = file.path(testLoc, "settings.RDS")) + + loadedSettings <- readRDS(file = file.path(testLoc, "settings.RDS")) optimizer <- loadedSettings$optimizer() scheduler <- loadedSettings$scheduler() criterion <- loadedSettings$criterion() - + testthat::expect_false(reticulate::py_is_null_xptr(optimizer)) testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun)) testthat::expect_false(reticulate::py_is_null_xptr(criterion)) diff --git a/tests/testthat/test-LRFinder.R b/tests/testthat/test-LRFinder.R index 0a5d700..a42416b 100644 --- a/tests/testthat/test-LRFinder.R +++ b/tests/testthat/test-LRFinder.R @@ -1,77 +1,84 @@ -ResNet <- reticulate::import_from_path("ResNet", path)$ResNet -lrFinderClass <- reticulate::import_from_path("LrFinder", path=path)$LrFinder +resNet <- reticulate::import_from_path("ResNet", path)$ResNet test_that("LR scheduler that changes per batch works", { - - model <- ResNet(cat_features = 10L, num_features = 1L, + + model <- resNet(cat_features = 10L, num_features = 1L, size_embedding = 32L, size_hidden = 64L, num_layers = 1L, hidden_factor = 1L) - optimizer <- torch$optim$AdamW(model$parameters(), lr=1e-7) - - ExponentialSchedulerPerBatch <- reticulate::import_from_path("LrFinder", - path=path)$ExponentialSchedulerPerBatch - scheduler <- ExponentialSchedulerPerBatch(optimizer, - end_lr = 1e-2, - num_iter = 5) + optimizer <- torch$optim$AdamW(model$parameters(), lr = 1e-7) + + + exponentialSchedulerPerBatch <- + reticulate::import_from_path("LrFinder", + path = path)$ExponentialSchedulerPerBatch + scheduler <- exponentialSchedulerPerBatch(optimizer, + end_lr = 1e-2, + num_iter = 5) expect_equal(scheduler$last_epoch, 
0)
   expect_equal(scheduler$optimizer$param_groups[[1]]$lr, 1e-7)
-
+
   for (i in 1:5) {
     optimizer$step()
     scheduler$step()
   }
-
+
   expect_equal(scheduler$last_epoch, 5)
-  expect_equal(scheduler$optimizer$param_groups[[1]]$lr, (1e-7 * (0.01 / 1e-7) ^ (5 / 4)))
-
+  expect_equal(scheduler$optimizer$param_groups[[1]]$lr,
+               (1e-7 * (0.01 / 1e-7) ^ (5 / 4)))
+
 })
 
 test_that("LR finder works", {
-  lrFinder <- createLRFinder(modelType="ResNet",
-                             modelParameters = list(cat_features=length(dataset$get_cat_features()),
-                                                    num_features=length(dataset$get_numerical_features()),
-                                                    size_embedding=32L,
-                                                    size_hidden=64L,
-                                                    num_layers=1L,
-                                                    hidden_factor=1L),
-                             estimatorSettings = setEstimator(batchSize = 32L,
-                                                              seed=42),
-                             lrSettings = list(minLr=3e-4,
-                                               maxLr=10.0,
-                                               numLr=20L,
-                                               divergenceThreshold=1.1))
-
+  lrFinder <-
+    createLRFinder(modelType = "ResNet",
+                   modelParameters =
+                     list(cat_features =
+                            dataset$get_cat_features()$max(),
+                          num_features =
+                            dataset$get_numerical_features()$max(),
+                          size_embedding = 32L,
+                          size_hidden = 64L,
+                          num_layers = 1L,
+                          hidden_factor = 1L),
+                   estimatorSettings = setEstimator(batchSize = 32L,
+                                                    seed = 42),
+                   lrSettings = list(minLr = 3e-4,
+                                     maxLr = 10.0,
+                                     numLr = 20L,
+                                     divergenceThreshold = 1.1))
+
   lr <- lrFinder$get_lr(dataset)
-
-  expect_true(lr<=10.0)
-  expect_true(lr>=3e-4)
+
+  expect_true(lr <= 10.0)
+  expect_true(lr >= 3e-4)
 })
 
 test_that("LR finder works with device specified by a function", {
-
-  deviceFun <- function(){
-    dev = "cpu"
+
+  deviceFun <- function() {
+    dev <- "cpu"
+    dev
   }
-  lrFinder <- createLRFinder(model = "ResNet",
-                             modelParameters = list(cat_features=length(dataset$get_cat_features()),
-                                                    num_features=length(dataset$get_numerical_features()),
-                                                    size_embedding=8L,
-                                                    size_hidden=16L,
-                                                    num_layers=1L,
-                                                    hidden_factor=1L),
-                             estimatorSettings = setEstimator(batchSize=32L,
-                                                              seed = 42,
-                                                              device = deviceFun),
-                             lrSettings = list(minLr=3e-4,
-                                               maxLr=10.0,
-                                               numLr=20L,
-                                               divergenceThreshold=1.1)
-  )
+  lrFinder <- createLRFinder(
+    model = "ResNet",
+    modelParameters =
+      list(cat_features = dataset$get_cat_features()$max(),
+           num_features = dataset$get_numerical_features()$max(),
+           size_embedding = 8L,
+           size_hidden = 16L,
+           num_layers = 1L,
+           hidden_factor = 1L),
+    estimatorSettings = setEstimator(batchSize = 32L,
+                                     seed = 42,
+                                     device = deviceFun),
+    lrSettings = list(minLr = 3e-4,
+                      maxLr = 10.0,
+                      numLr = 20L,
+                      divergenceThreshold = 1.1)
+  )
   lr <- lrFinder$get_lr(dataset)
-
-  expect_true(lr<=10.0)
-  expect_true(lr>=3e-4)
-
-
-})
\ No newline at end of file
+
+  expect_true(lr <= 10.0)
+  expect_true(lr >= 3e-4)
+})
diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R
index 5cbd7a5..a470664 100644
--- a/tests/testthat/test-MLP.R
+++ b/tests/testthat/test-MLP.R
@@ -5,11 +5,11 @@ modelSettings <- setMultiLayerPerceptron(
   dropout = c(0.1),
   sizeEmbedding = 32,
   estimatorSettings = setEstimator(
-    learningRate=c(3e-4),
+    learningRate = c(3e-4),
     weightDecay = c(1e-6),
-    seed=42,
-    batchSize=128,
-    epochs=1
+    seed = 42,
+    batchSize = 128,
+    epochs = 1
   ),
   hyperParamSearch = "random",
   randomSample = 1
@@ -21,12 +21,13 @@ test_that("setMultiLayerPerceptron works", {
   testthat::expect_equal(modelSettings$fitFunction, "fitEstimator")
 
   testthat::expect_true(length(modelSettings$param) > 0)
-
-  expect_error(setMultiLayerPerceptron(numLayers=1,
+
+  expect_error(setMultiLayerPerceptron(numLayers = 1,
                                        sizeHidden = 128,
-                                       dropout= 0.2,
+                                       dropout = 0.2,
                                        sizeEmbedding = 128,
-                                       estimatorSettings = setEstimator(learningRate=3e-4),
+                                       estimatorSettings =
+                                         setEstimator(learningRate = 3e-4),
                                        randomSample = 2))
 })
 
@@ -41,16 +42,17 @@ results <- tryCatch(
   analysisName = "Testing Deep Learning",
   populationSettings = populationSet,
   splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
-  sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
-  featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+  sampleSettings = PatientLevelPrediction::createSampleSettings(),
+  featureEngineeringSettings =
+    PatientLevelPrediction::createFeatureEngineeringSettings(),
   preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
   executeSettings = PatientLevelPrediction::createExecuteSettings(
-    runSplitData = T,
-    runSampleData = F,
-    runfeatureEngineering = F,
-    runPreprocessData = T,
-    runModelDevelopment = T,
-    runCovariateSummary = F
+    runSplitData = TRUE,
+    runSampleData = FALSE,
+    runfeatureEngineering = FALSE,
+    runPreprocessData = TRUE,
+    runModelDevelopment = TRUE,
+    runCovariateSummary = FALSE
   ),
   saveDirectory = file.path(testLoc, "MLP")
 )
@@ -73,7 +75,9 @@ test_that("MLP with runPlp working checks", {
 
   # check prediction same size as pop
   testthat::expect_equal(nrow(results$prediction %>%
-                                dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population))
+                                dplyr::filter(evaluationType %in% c("Train",
+                                                                    "Test"))),
+                         nrow(population))
 
   # check prediction between 0 and 1
   testthat::expect_gte(min(results$prediction$value), 0)
@@ -82,15 +86,15 @@ test_that("MLP with runPlp working checks", {
 
 test_that("MLP nn-module works ", {
-  MLP <- reticulate::import_from_path("MLP", path=path)$MLP
-  model <- MLP(
-    cat_features = 5,
-    num_features = 1,
+  mlp <- reticulate::import_from_path("MLP", path = path)$MLP
+  model <- mlp(
+    cat_features = 5,
+    num_features = 1,
     size_embedding = 5,
-    size_hidden = 16,
+    size_hidden = 16,
     num_layers = 1,
     activation = torch$nn$ReLU,
-    normalization = torch$nn$BatchNorm1d,
+    normalization = torch$nn$BatchNorm1d,
     dropout = 0.3
   )
@@ -100,8 +104,8 @@ test_that("MLP nn-module works ", {
   expect_equal(pars, 489)
 
   input <- list()
-  input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype=torch$long)
-  input$num <- torch$randn(10L, 1L, dtype=torch$float32)
+  input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long)
+  input$num <- torch$randn(10L, 1L, dtype = torch$float32)
 
   output <- model(input)
 
@@ -110,14 +114,14 @@ test_that("MLP nn-module works ", {
   expect_equal(output$shape[0], 10L)
 
   input$num <- NULL
-  model <- MLP(
-    cat_features = 5L,
-    num_features = 0,
+  model <- mlp(
+    cat_features = 5L,
+    num_features = 0,
     size_embedding = 5L,
-    size_hidden = 16L,
+    size_hidden = 16L,
     num_layers = 1L,
     activation = torch$nn$ReLU,
-    normalization = torch$nn$BatchNorm1d,
+    normalization = torch$nn$BatchNorm1d,
     dropout = 0.3,
     d_out = 1L
   )
@@ -129,49 +133,48 @@ test_that("MLP nn-module works ", {
 
 test_that("Errors are produced by settings function", {
   randomSample <- 2
-
+
   expect_error(setMultiLayerPerceptron(
-    numLayers = 1,
-    sizeHidden = 128,
-    dropout = 0.0,
-    sizeEmbedding = 128,
-    hyperParamSearch = 'random',
-    estimatorSettings = setEstimator(
-      learningRate = 'auto',
-      weightDecay = c(1e-3),
-      batchSize = 1024,
-      epochs = 30,
-      device="cpu")))
-
+    numLayers = 1,
+    sizeHidden = 128,
+    dropout = 0.0,
+    sizeEmbedding = 128,
+    hyperParamSearch = "random",
+    estimatorSettings =
+      setEstimator(
+        learningRate = "auto",
+        weightDecay = c(1e-3),
+        batchSize = 1024,
+        epochs = 30,
+        device = "cpu")))
 })
 
 test_that("Can upload results to database", {
-  cohortDefinitions = data.frame(
-    cohortName = c('blank1'),
-    cohortId = c(1),
-    json = c('json')
+  cohortDefinitions <- data.frame(
+    cohortName = c("blank1"),
+    cohortId = c(1),
+    json = c("json")
   )
-
+
   sink(nullfile())
-  sqliteFile <- insertResultsToSqlite(resultLocation = file.path(testLoc, "MLP"),
-                                      cohortDefinitions = cohortDefinitions)
+  sqliteFile <-
+    insertResultsToSqlite(resultLocation = file.path(testLoc, "MLP"),
+                          cohortDefinitions = cohortDefinitions)
   sink()
-
+
   testthat::expect_true(file.exists(sqliteFile))
-
-  cdmDatabaseSchema <- 'main'
-  ohdsiDatabaseSchema <- 'main'
+
+  cdmDatabaseSchema <- "main"
+  ohdsiDatabaseSchema <- "main"
   connectionDetails <- DatabaseConnector::createConnectionDetails(
-    dbms = 'sqlite',
+    dbms = "sqlite",
     server = sqliteFile
   )
   conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
-  targetDialect <- 'sqlite'
-
+  targetDialect <- "sqlite"
+
   # check the results table is populated
-  sql <- 'select count(*) as N from main.performances;'
+  sql <- "select count(*) as N from main.performances;"
   res <- DatabaseConnector::querySql(conn, sql)
-  testthat::expect_true(res$N[1]>0)
-
-
-})
\ No newline at end of file
+  testthat::expect_true(res$N[1] > 0)
+})
diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R
index 29dfb8e..76e3d9f 100644
--- a/tests/testthat/test-ResNet.R
+++ b/tests/testthat/test-ResNet.R
@@ -6,33 +6,34 @@ resSet <- setResNet(
   residualDropout = 0.1,
   hiddenDropout = 0.1,
   sizeEmbedding = 32,
-  estimatorSettings = setEstimator(learningRate="auto",
+  estimatorSettings = setEstimator(learningRate = "auto",
                                    weightDecay = c(1e-6),
-                                   seed=42,
+                                   seed = 42,
                                    batchSize = 128,
-                                   epochs=1),
+                                   epochs = 1),
   hyperParamSearch = "random",
   randomSample = 1,
 )
 
 test_that("setResNet works", {
   testthat::expect_s3_class(object = resSet, class = "modelSettings")
-
+
   testthat::expect_equal(resSet$fitFunction, "fitEstimator")
-
+
   testthat::expect_true(length(resSet$param) > 0)
-
+
   expect_error(setResNet(numLayers = 2,
                          sizeHidden = 32,
                          hiddenFactor = 2,
                          residualDropout = 0.1,
                          hiddenDropout = 0.1,
                          sizeEmbedding = 32,
-                         estimatorSettings = setEstimator(learningRate=c(3e-4),
-                                                          weightDecay = c(1e-6),
-                                                          seed=42,
-                                                          batchSize = 128,
-                                                          epochs=1),
+                         estimatorSettings =
+                           setEstimator(learningRate = c(3e-4),
+                                        weightDecay = c(1e-6),
+                                        seed = 42,
+                                        batchSize = 128,
+                                        epochs = 1),
                          hyperParamSearch = "random",
                          randomSample = 2))
 })
@@ -48,16 +49,17 @@ res2 <- tryCatch(
   analysisName = "Testing Deep Learning",
   populationSettings = populationSet,
   splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
-  sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
-  featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+  sampleSettings = PatientLevelPrediction::createSampleSettings(),
+  featureEngineeringSettings =
+    PatientLevelPrediction::createFeatureEngineeringSettings(),
   preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
   executeSettings = PatientLevelPrediction::createExecuteSettings(
-    runSplitData = T,
-    runSampleData = F,
-    runfeatureEngineering = F,
-    runPreprocessData = T,
-    runModelDevelopment = T,
-    runCovariateSummary = F
+    runSplitData = TRUE,
+    runSampleData = FALSE,
+    runfeatureEngineering = FALSE,
+    runPreprocessData = TRUE,
+    runModelDevelopment = TRUE,
+    runCovariateSummary = FALSE
   ),
   saveDirectory = file.path(testLoc, "ResNet")
 )
@@ -71,17 +73,19 @@ sink()
 
 test_that("ResNet with runPlp working checks", {
   testthat::expect_false(is.null(res2))
-
+
   # check structure
   testthat::expect_true("prediction" %in% names(res2))
   testthat::expect_true("model" %in% names(res2))
   testthat::expect_true("covariateSummary" %in% names(res2))
   testthat::expect_true("performanceEvaluation" %in% names(res2))
-
+
   # check prediction same size as pop
   testthat::expect_equal(nrow(res2$prediction %>%
-                                dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population))
-
+                                dplyr::filter(evaluationType %in% c("Train",
+                                                                    "Test"))),
+                         nrow(population))
+
   # check prediction between 0 and 1
   testthat::expect_gte(min(res2$prediction$value), 0)
   testthat::expect_lte(max(res2$prediction$value), 1)
@@ -89,45 +93,45 @@ test_that("ResNet with runPlp working checks", {
 
 test_that("ResNet nn-module works ", {
-  ResNet <- reticulate::import_from_path("ResNet", path=path)$ResNet
-  model <- ResNet(
-    cat_features = 5,
-    num_features = 1,
+  resNet <- reticulate::import_from_path("ResNet", path = path)$ResNet
+  model <- resNet(
+    cat_features = 5,
+    num_features = 1,
     size_embedding = 5,
-    size_hidden = 16,
-    num_layers = 1,
+    size_hidden = 16,
+    num_layers = 1,
     hidden_factor = 2,
     activation = torch$nn$ReLU,
-    normalization = torch$nn$BatchNorm1d,
+    normalization = torch$nn$BatchNorm1d,
     hidden_dropout = 0.3,
     residual_dropout = 0.3
   )
-
+
   pars <- sum(reticulate::iterate(model$parameters(), function(x) x$numel()))
-
+
   # expected number of parameters
   expect_equal(pars, 1295)
-
+
   input <- list()
   input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long)
   input$num <- torch$randn(10L, 1L, dtype = torch$float32)
-
-
+
+
   output <- model(input)
-
+
   # output is correct shape
   expect_equal(output$shape[0], 10L)
-
+
   input$num <- NULL
-  model <- ResNet(
-    cat_features = 5,
-    num_features = 0,
+  model <- resNet(
+    cat_features = 5,
+    num_features = 0,
     size_embedding = 5,
-    size_hidden = 16,
-    num_layers = 1,
+    size_hidden = 16,
+    num_layers = 1,
     hidden_factor = 2,
     activation = torch$nn$ReLU,
-    normalization = torch$nn$BatchNorm1d,
+    normalization = torch$nn$BatchNorm1d,
     hidden_dropout = 0.3,
     residual_dropout = 0.3
   )
@@ -139,60 +143,59 @@ test_that("ResNet nn-module works ", {
 test_that("Default Resnet works", {
   defaultResNet <- setDefaultResNet()
   params <- defaultResNet$param[[1]]
-
+
   expect_equal(params$numLayers, 6)
   expect_equal(params$sizeHidden, 512)
   expect_equal(params$hiddenFactor, 2)
   expect_equal(params$residualDropout, 0.1)
   expect_equal(params$hiddenDropout, 0.4)
   expect_equal(params$sizeEmbedding, 256)
-
-})
+
+})
 
 test_that("Errors are produced by settings function", {
   randomSample <- 2
-
-  expect_error(setResNet(
-    numLayers = 1,
-    sizeHidden = 128,
-    hiddenFactor = 1,
-    residualDropout = 0.0,
-    hiddenDropout = 0.0,
-    sizeEmbedding = 128,
-    estimatorSettings = setEstimator(weightDecay = 1e-6,
-                                     learningRate = 0.01,
-                                     seed = 42),
-    hyperParamSearch = 'random',
-    randomSample = randomSample))
+
+  expect_error(setResNet(numLayers = 1,
+                         sizeHidden = 128,
+                         hiddenFactor = 1,
+                         residualDropout = 0.0,
+                         hiddenDropout = 0.0,
+                         sizeEmbedding = 128,
+                         estimatorSettings = setEstimator(weightDecay = 1e-6,
+                                                          learningRate = 0.01,
+                                                          seed = 42),
+                         hyperParamSearch = "random",
+                         randomSample = randomSample))
 })
 
-test_that("Can upload results to database", {
-  cohortDefinitions = data.frame(
-    cohortName = c('blank1'),
-    cohortId = c(1),
-    json = c('json')
+test_that("Can upload results to database", {
+  cohortDefinitions <- data.frame(
+    cohortName = c("blank1"),
+    cohortId = c(1),
+    json = c("json")
  )
-
+
   sink(nullfile())
-  sqliteFile <- insertResultsToSqlite(resultLocation = file.path(testLoc, "ResNet"),
-                                      cohortDefinitions = cohortDefinitions)
+  sqliteFile <-
+    insertResultsToSqlite(resultLocation = file.path(testLoc, "ResNet"),
+                          cohortDefinitions = cohortDefinitions)
   sink()
-
+
   testthat::expect_true(file.exists(sqliteFile))
-
-  cdmDatabaseSchema <- 'main'
-  ohdsiDatabaseSchema <- 'main'
+
+  cdmDatabaseSchema <- "main"
+  ohdsiDatabaseSchema <- "main"
   connectionDetails <- DatabaseConnector::createConnectionDetails(
-    dbms = 'sqlite',
+    dbms = "sqlite",
     server = sqliteFile
   )
   conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
-  targetDialect <- 'sqlite'
-
+  targetDialect <- "sqlite"
+
   # check the results table is populated
-  sql <- 'select count(*) as N from main.performances;'
+  sql <- "select count(*) as N from main.performances;"
   res <- DatabaseConnector::querySql(conn, sql)
-  testthat::expect_true(res$N[1]>0)
+  testthat::expect_true(res$N[1] > 0)
 })
-
diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R
index f96d7b8..ec5b063 100644
--- a/tests/testthat/test-TrainingCache.R
+++ b/tests/testthat/test-TrainingCache.R
@@ -4,12 +4,13 @@ resNetSettings <- setResNet(numLayers = c(1, 2, 4),
                             residualDropout = 0.5,
                             hiddenDropout = 0.5,
                             sizeEmbedding = 64,
-                            estimatorSettings = setEstimator(learningRate=3e-4,
-                                                             weightDecay=1e-3,
-                                                             device='cpu',
-                                                             batchSize=64,
-                                                             epochs=1,
-                                                             seed=NULL),
+                            estimatorSettings =
+                              setEstimator(learningRate = 3e-4,
+                                           weightDecay = 1e-3,
+                                           device = "cpu",
+                                           batchSize = 64,
+                                           epochs = 1,
+                                           seed = NULL),
                             hyperParamSearch = "random",
                             randomSample = 3,
                             randomSampleSeed = NULL)
@@ -19,60 +20,67 @@ paramSearch <- resNetSettings$param
 
 test_that("Training cache exists on disk", {
   testthat::expect_true(
-    file.exists(file.path(testLoc, "paramPersistence.rds")))
+    file.exists(file.path(testLoc, "paramPersistence.rds")))
 })
 
 test_that("Grid search can be cached", {
   gridSearchPredictons <- list()
   length(gridSearchPredictons) <- length(paramSearch)
   trainCache$saveGridSearchPredictions(gridSearchPredictons)
-
+
   index <- 1
-
+
   gridSearchPredictons[[index]] <- list(
     prediction = list(NULL),
     param = paramSearch[[index]]
   )
   trainCache$saveGridSearchPredictions(gridSearchPredictons)
 
-  testthat::expect_identical(trainCache$getGridSearchPredictions(), gridSearchPredictons)
-  testthat::expect_equal(trainCache$getLastGridSearchIndex(), index+1)
+  testthat::expect_identical(trainCache$getGridSearchPredictions(),
+                             gridSearchPredictons)
+  testthat::expect_equal(trainCache$getLastGridSearchIndex(), index + 1)
 })
 
 test_that("Param grid predictions can be cached", {
   testthat::expect_false(trainCache$isParamGridIdentical(paramSearch))
-
+
   trainCache$saveModelParams(paramSearch)
   testthat::expect_true(trainCache$isParamGridIdentical(paramSearch))
 })
 
 test_that("Estimator can resume training from cache", {
   trainCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds"))
-  newPath <- file.path(testLoc, 'resume')
+  newPath <- file.path(testLoc, "resume")
   dir.create(newPath)
-
+
   # remove last row
   trainCache$gridSearchPredictions[[2]] <- NULL
   length(trainCache$gridSearchPredictions) <- 2
-
+
   # save new cache
-  saveRDS(trainCache, file=file.path(newPath, "paramPersistence.rds"))
-
+  saveRDS(trainCache, file = file.path(newPath, "paramPersistence.rds"))
+
   sink(nullfile())
-  fitEstimatorResults <- fitEstimator(trainData$Train,
-                                      modelSettings = modelSettings,
-                                      analysisId = 1,
+  fitEstimatorResults <- fitEstimator(trainData$Train,
+                                      modelSettings = modelSettings,
+                                      analysisId = 1,
                                       analysisPath = newPath)
   sink()
-
   newCache <- readRDS(file.path(newPath, "paramPersistence.rds"))
-  testthat::expect_equal(nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary), 4)
+  cS <- nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary)
+  testthat::expect_equal(cS, 4)
 })
 
 test_that("Prediction is cached for optimal parameters", {
   testCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds"))
-  indexOfMax <- which.max(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
-  indexOfMin <- which.min(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
-  testthat::expect_equal(class(testCache$gridSearchPredictions[[indexOfMax]]$prediction), class(data.frame()))
-  testthat::expect_null(testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]])
+  indexOfMax <-
+    which.max(unlist(lapply(testCache$gridSearchPredictions,
+                            function(x) x$gridPerformance$cvPerformance)))
+  indexOfMin <-
+    which.min(unlist(lapply(testCache$gridSearchPredictions,
+                            function(x) x$gridPerformance$cvPerformance)))
+  myClass <- class(testCache$gridSearchPredictions[[indexOfMax]]$prediction)
+  testthat::expect_equal(myClass, class(data.frame()))
+  lowestIndex <- testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]]
+  testthat::expect_null(lowestIndex)
 })
diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R
index 043cbe7..9bb5e3a 100644
--- a/tests/testthat/test-Transformer.R
+++ b/tests/testthat/test-Transformer.R
@@ -1,15 +1,15 @@
 settings <- setTransformer(
-  numBlocks = 1,
-  dimToken = 8,
+  numBlocks = 1,
+  dimToken = 8,
   dimOut = 1,
-  numHeads = 2,
-  attDropout = 0.0,
+  numHeads = 2,
+  attDropout = 0.0,
   ffnDropout = 0.2,
-  resDropout = 0.0,
-  dimHidden = 32,
+  resDropout = 0.0,
+  dimHidden = 32,
   estimatorSettings = setEstimator(learningRate = 3e-4,
-                                   batchSize=64,
-                                   epochs=1),
+                                   batchSize = 64,
+                                   epochs = 1),
   randomSample = 1
 )
 
@@ -32,7 +32,11 @@ test_that("Transformer settings work", {
 })
 
 test_that("fitEstimator with Transformer works", {
-  results <- fitEstimator(trainData$Train, settings, analysisId = 1, analysisPath = testLoc)
+  results <-
+    fitEstimator(trainData$Train,
+                 settings,
+                 analysisId = 1,
+                 analysisPath = testLoc)
 
   expect_equal(class(results), "plpModel")
   expect_equal(attr(results, "modelType"), "binary")
@@ -44,16 +48,17 @@ test_that("fitEstimator with Transformer works", {
 })
 
 test_that("transformer nn-module works", {
-  Transformer <- reticulate::import_from_path("Transformer", path=path)$Transformer
-  model <- Transformer(
-    cat_features = 5,
-    num_features = 1,
+  transformer <-
+    reticulate::import_from_path("Transformer", path = path)$Transformer
+  model <- transformer(
+    cat_features = 5,
+    num_features = 1,
     num_blocks = 2,
-    dim_token = 16,
-    num_heads = 2,
-    att_dropout = 0,
+    dim_token = 16,
+    num_heads = 2,
+    att_dropout = 0,
     ffn_dropout = 0,
-    res_dropout = 0,
+    res_dropout = 0,
     dim_hidden = 32
   )
@@ -73,19 +78,19 @@ test_that("transformer nn-module works", {
 
   input$num <- NULL
-  model <- Transformer(
-    cat_features = 5,
-    num_features = 0,
+  model <- transformer(
+    cat_features = 5,
+    num_features = 0,
     num_blocks = 2,
-    dim_token = 16,
-    num_heads = 2,
-    att_dropout = 0,
+    dim_token = 16,
+    num_heads = 2,
+    att_dropout = 0,
     ffn_dropout = 0,
-    res_dropout = 0,
+    res_dropout = 0,
     dim_hidden = 32
   )
   output <- model(input)
-  expect_equal(output$shape[0], 10L)
+  expect_equal(output$shape[0], 10L)
 
   input$num <- reticulate::py_none()
   output <- model(input)
 expect_equal(output$shape[0], 10L)
@@ -94,21 +99,21 @@ test_that("transformer nn-module works", {
 test_that("Default Transformer works", {
   defaultTransformer <- setDefaultTransformer()
   params <- defaultTransformer$param[[1]]
-
+
   expect_equal(params$numBlocks, 3)
   expect_equal(params$dimToken, 192)
   expect_equal(params$numHeads, 8)
   expect_equal(params$resDropout, 0.0)
   expect_equal(params$attDropout, 0.2)
-
-  settings <- attr(defaultTransformer, 'settings')
-
-  expect_equal(settings$name, 'defaultTransformer')
-})
+
+  settings <- attr(defaultTransformer, "settings")
+
+  expect_equal(settings$name, "defaultTransformer")
+})
 
 test_that("Errors are produced by settings function", {
   randomSample <- 2
-
+
   expect_error(setTransformer(randomSample = randomSample))
 })
 
@@ -126,35 +131,36 @@ test_that("dimHidden ratio works as expected", {
   testthat::expect_error(setTransformer(dimHidden = NULL,
                                         dimHiddenRatio = NULL))
   testthat::expect_error(setTransformer(dimHidden = 256,
-                                        dimHiddenRatio = 4/3))
+                                        dimHiddenRatio = 4 / 3))
 })
 
 test_that("numerical embedding works as expected", {
   embeddings <- 32L # size of embeddings
   features <- 2L # number of numerical features
-  patients <- 9L
-
+  patients <- 9L
+
   numTensor <- torch$randn(c(patients, features))
-
-  numericalEmbeddingClass <- reticulate::import_from_path("ResNet", path=path)$NumericalEmbedding
+
+  numericalEmbeddingClass <-
+    reticulate::import_from_path("ResNet", path = path)$NumericalEmbedding
   numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features,
                                                 embedding_dim = embeddings,
                                                 bias = TRUE)
   out <- numericalEmbedding(numTensor)
-
+
   # should be patients x features x embedding size
   expect_equal(out$shape[[0]], patients)
   expect_equal(out$shape[[1]], features)
   expect_equal(out$shape[[2]], embeddings)
-
+
   numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features,
                                                 embedding_dim = embeddings,
                                                 bias = FALSE)
-
+
   out <- numericalEmbedding(numTensor)
   expect_equal(out$shape[[0]], patients)
   expect_equal(out$shape[[1]], features)
   expect_equal(out$shape[[2]], embeddings)
-
-  })
+
+})

From 53d3b48516bf7284db89a6e29f41851a929c6cea Mon Sep 17 00:00:00 2001
From: egillax
Date: Mon, 4 Dec 2023 17:28:04 +0100
Subject: [PATCH 3/7] add Andromeda dev dependency

---
 DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2cc1ef5..5a58bc8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,7 +35,8 @@ Suggests:
   testthat,
   PRROC,
   ResultModelManager (>= 0.2.0),
-  DatabaseConnector (>= 6.0.0)
+  DatabaseConnector (>= 6.0.0),
+  Andromeda
 Remotes:
   ohdsi/PatientLevelPrediction,
   ohdsi/FeatureExtraction,

From 46180e4a5b41cc42f4ebefe4bb4da03ac544bf72 Mon Sep 17 00:00:00 2001
From: egillax
Date: Tue, 5 Dec 2023 11:37:01 +0100
Subject: [PATCH 4/7] add test for prediction output

---
 tests/testthat/test-Estimator.R | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R
index 9972839..ba82835 100644
--- a/tests/testthat/test-Estimator.R
+++ b/tests/testthat/test-Estimator.R
@@ -377,3 +377,17 @@ test_that("estimatorSettings can be saved and loaded with python objects", {
   testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun))
   testthat::expect_false(reticulate::py_is_null_xptr(criterion))
 })
+
+test_that("evaluation works on predictDeepEstimator output", {
+
+  prediction <- predictDeepEstimator(plpModel = fitEstimatorResults,
+                                     data = trainData$Test,
+                                     cohort = trainData$Test$labels)
+  prediction$evaluationType <- 'Validation'
+
+  evaluation <- evaluatePlp(prediction, "evaluationType")
+
+  expect_length(evaluation, 5)
+  expect_s3_class(evaluation, "plpEvaluation")
+
+  })
\ No newline at end of file

From 02d3d44af5f73e3c3ce7be662e8980c45be0bc90 Mon Sep 17 00:00:00 2001
From: egillax
Date: Tue, 5 Dec 2023 11:43:04 +0100
Subject: [PATCH 5/7] predictions should be a numeric

---
 R/Estimator.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/Estimator.R b/R/Estimator.R
index 6ba9283..4ef2047 100644
--- a/R/Estimator.R
+++ b/R/Estimator.R
@@ -267,7 +267,7 @@ predictDeepEstimator <- function(plpModel,
   } else {
     prediction$value <- plpModel$model$predict_proba(data)
   }
-
+  prediction$value <- as.numeric(prediction$value)
   attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType")
 
   return(prediction)

From d427793c43b71da6639ca8aac68ecfa345aef723 Mon Sep 17 00:00:00 2001
From: egillax
Date: Tue, 5 Dec 2023 17:08:24 +0100
Subject: [PATCH 6/7] updated news

---
 NEWS.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 70ccf12..961ac75 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,13 @@
+DeepPatientLevelPrediction 2.0.2
+======================
+ - Ensure output from predict_proba is numeric instead of 1d array
+ - Refactoring: Move cross-validation to a separate function
+ - Refactoring: Move paramsToTune to a separate function
+ - linting: Enforcing HADES style
+ - Calculate AUC ourselves with torch, get rid of scikit-learn dependency
+ - added Andromeda to dev dependencies
+
+
 DeepPatientLevelPrediction 2.0.1
 ======================
  - Connection parameter fixed to be in line with newest polars

From 6207324c73b867d69c76ad1f95ac1ce928fc86e7 Mon Sep 17 00:00:00 2001
From: egillax
Date: Wed, 6 Dec 2023 11:24:17 +0100
Subject: [PATCH 7/7] add dev release number

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 91bb9a3..b599319 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: DeepPatientLevelPrediction
 Type: Package
 Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
-Version: 2.0.2
+Version: 2.0.2.9999
 Date: 18-04-2023
 Authors@R: c(
   person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")),
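Background note for PATCH 5/7 above ("predictions should be a numeric"): a minimal base-R sketch, not part of any patch, of why the `as.numeric()` coercion helps. The NEWS entry states that `predict_proba` output arrived as a 1d array; the object `p` below is hypothetical and only mimics that shape.

    # Minimal sketch: a 1d array is numeric in content but not a plain vector.
    p <- array(c(0.1, 0.7, 0.3))  # hypothetical probabilities with a dim attribute
    class(p)                      # "array" -- not "numeric"
    is.numeric(p)                 # TRUE, yet class-based checks on "numeric" fail
    p <- as.numeric(p)            # dropping the dim attribute, as the patch does
    class(p)                      # "numeric"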