Release 2.0.1 #94

Merged: 99 commits, Nov 3, 2023

Commits
64fce91
speed up data conversion about 3-5x
egillax Nov 14, 2022
30e67fd
updated news
egillax Nov 14, 2022
131c434
make sure tensorList matches dataset length
egillax Nov 14, 2022
38504b3
user integers for tensorList
egillax Nov 14, 2022
5d89faa
test a different way of getting torch binaries
egillax Dec 15, 2022
2f8d6d6
test a different way of getting torch binaries
egillax Dec 15, 2022
e79a43f
fix tidyselect warnings
egillax Dec 15, 2022
6c69427
Merge branch 'main' into 43-hotfix-lantern-binaries
egillax Dec 15, 2022
887fac9
Update description and news
egillax Dec 15, 2022
72f813e
remove torch install env variable from actions
egillax Dec 15, 2022
d66c47e
merge hotfix to develop
egillax Dec 16, 2022
ad0ad19
fixed dataset
egillax Jan 20, 2023
03ff99b
fix numericalIndex and address pull warning
egillax Jan 20, 2023
517f675
Allow dimToken and numHeads to take the form of vectors
lhjohn Jan 26, 2023
b87b769
Merge pull request #47 from OHDSI/transformer-bug
lhjohn Jan 26, 2023
dfca29a
Modeltype fix (#48)
egillax Jan 27, 2023
65302bd
update default ResNet and Transformer to have custom LR and WD
egillax Feb 13, 2023
353e0ba
Add seed for sampling hyperparameter combinations (#50)
lhjohn Feb 15, 2023
926e7d0
Lr find (#51)
egillax Mar 1, 2023
e580c2a
Derive dimension of feedforward block from embedding dimension (#53)
lhjohn Mar 5, 2023
fba85ec
Divisible check for Transformer not comprehensive (#55)
lhjohn Mar 5, 2023
e50d2e4
Update NEWS.md
egillax Mar 6, 2023
b66de7f
Merge branch 'main' into develop
egillax Mar 22, 2023
051ea88
update website and docs
egillax Mar 22, 2023
793338a
remove docs folder from code branches
egillax Mar 22, 2023
d0cb45b
render dev website
egillax Mar 22, 2023
609df5b
fix action
egillax Mar 22, 2023
3330c28
fix action
egillax Mar 22, 2023
dcb9423
fix badge in readme
egillax Mar 22, 2023
ce5e235
prepare version for release
egillax Mar 22, 2023
2d07b6b
Update DESCRIPTION
egillax Mar 23, 2023
05fecc2
modelType as attribute and tests to cover database upload
egillax Mar 24, 2023
32a7e23
modelType as attribute and tests to cover database upload
egillax Mar 24, 2023
0a0ac73
Merge branch '59-hotfix-modelType' of https://github.com/OHDSI/DeepPa…
egillax Mar 24, 2023
85ce433
Merge branch '59-hotfix-modelType' of https://github.com/OHDSI/DeepPa…
egillax Mar 24, 2023
6d433a3
Merge branch '59-hotfix-modelType' of https://github.com/OHDSI/DeepPa…
egillax Mar 24, 2023
c5a984f
fix dependanceis
egillax Mar 24, 2023
7541a38
prepare version and news for release
egillax Mar 24, 2023
3b02ce6
merged with hotfix branch
egillax Mar 24, 2023
4cd2e30
modelType attribute back to modelSettings functions
egillax Mar 24, 2023
a390d4e
Merge branch 'develop' of https://github.com/OHDSI/DeepPatientLevelPr…
egillax Mar 24, 2023
db13265
Merge branch 'main' into develop
egillax Mar 24, 2023
9c19173
Update DESCRIPTION
egillax Mar 27, 2023
808ead8
debug actions
egillax Apr 16, 2023
8e1de7f
Update R_CDM_check_hades.yaml
egillax Apr 17, 2023
d18d579
torch install environment variable
egillax Apr 17, 2023
55bfca2
Merge branch 'debug-actions' of https://github.com/OHDSI/DeepPatientL…
egillax Apr 17, 2023
510f4f1
update version and news
egillax Apr 17, 2023
5e09d1d
merged with debug-actions
egillax Apr 17, 2023
a1cb2e7
add device as expression with tests (#66)
egillax Apr 18, 2023
01ce148
Merge branch 'main' into develop
egillax Apr 18, 2023
2113a48
remove torchopt
egillax Apr 18, 2023
575d2e5
update news and version
egillax Apr 18, 2023
20d4a0d
fix docs
egillax Apr 18, 2023
f97b37f
update version number
egillax Apr 19, 2023
783f417
LRFinder works with device fun (#68)
egillax Apr 24, 2023
0a6f9de
update version and news
egillax Apr 24, 2023
58df70c
Merge branch 'main' into develop
egillax Apr 24, 2023
2883b5e
update version
egillax Apr 25, 2023
8a01ed7
fix bug when test subject has no features
egillax Jun 18, 2023
e555710
Add parameter caching for training persistence and continuity (#63)
lhjohn Jun 18, 2023
1e640ee
fix incs issue
egillax Jun 18, 2023
6326cd6
Merge branch 'develop' of https://github.com/OHDSI/DeepPatientLevelPr…
egillax Jun 18, 2023
5d9dc59
Release version and news updated
egillax Jun 18, 2023
af02541
Merge branch 'main' into develop
egillax Jun 18, 2023
506b940
Release and NEWS
egillax Jun 18, 2023
74608ff
Resolve an issue with hidden dimension ratio (#74)
lhjohn Jun 22, 2023
ba60c28
Cache single hyperparameter combination (#78)
lhjohn Jul 20, 2023
bd9b357
Change backend to pytorch (#80)
egillax Aug 28, 2023
04c2b36
Merge branch 'main' into develop
egillax Aug 28, 2023
e501124
fix dataset
egillax Aug 28, 2023
31ef832
update PLP version in DESCRIPTION
egillax Sep 7, 2023
66b8c84
integer handling in python and input checks (#83)
egillax Sep 7, 2023
85be689
Ensure that param search is completed in empty cache test (#84)
lhjohn Sep 7, 2023
4c83897
use ubuntu 22.04 in CI (#85)
egillax Sep 7, 2023
904e926
Update NEWS.md
egillax Sep 8, 2023
ff9e22e
Update DESCRIPTION
egillax Sep 13, 2023
216c7af
Only cache best prediction
lhjohn Oct 6, 2023
2220ff2
Clean up
lhjohn Oct 6, 2023
9f8d23a
Add logger message when caching
lhjohn Oct 6, 2023
f93a53f
Add test to ensure prediction is cached for optimal parameters
lhjohn Oct 6, 2023
53af472
Resolve an issue with case sensitivity on Ubuntu
lhjohn Oct 6, 2023
1f50fa5
Merge pull request #90 from OHDSI/88-reduce-cache-size
lhjohn Oct 6, 2023
84bbb18
Fix lr schedule (#91)
egillax Oct 10, 2023
2d8f9af
Fix numerical embeddings + add tests (#92)
egillax Oct 12, 2023
56279c8
Fix transformer (#93)
egillax Oct 12, 2023
e4f158f
Merge branch 'main' into develop
egillax Oct 12, 2023
d056998
fix sneaky typo
egillax Oct 12, 2023
def727f
fix quosures
egillax Oct 13, 2023
bdf8bba
optimize tests
egillax Oct 13, 2023
ced60d1
fix hardcoded learning rate and add seed for sampling batch
egillax Oct 17, 2023
357b14e
remove incorrect import
egillax Oct 17, 2023
74c346f
Type cast batch size to int (#96)
lhjohn Oct 18, 2023
69b8bec
Add fix to the full cache issue (#99)
egillax Oct 21, 2023
9bbe364
improve docs (#100)
egillax Oct 22, 2023
dac66a0
dont track example
egillax Oct 26, 2023
20e855b
fix numerical features order issue
egillax Oct 30, 2023
7b31c73
updated news
egillax Oct 31, 2023
61bd834
update NEWS
egillax Oct 31, 2023
3 changes: 2 additions & 1 deletion .gitignore
@@ -7,4 +7,5 @@ config.yml
docs
.idea/
renv.lock
extras/
extras/
.Renviron
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: DeepPatientLevelPrediction
Type: Package
Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
Version: 2.0.0
Version: 2.0.1
Date: 18-04-2023
Authors@R: c(
person("Egill", "Fridgeirsson", email = "[email protected]", role = c("aut", "cre")),
12 changes: 12 additions & 0 deletions NEWS.md
@@ -1,3 +1,15 @@
DeepPatientLevelPrediction 2.0.1
======================
- Connection parameter fixed to be in line with newest polars
- Fixed a bug where LRFinder used a hardcoded batch size
- Seed is now used in LRFinder so it's reproducible
- Fixed a bug in NumericalEmbedding
- Fixed a bug for Transformer and numerical features
- Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang)
- Updated installation documentation after feedback from HADES hackathon
- Fixed a bug where order of numeric features wasn't conserved between training and test set
- TrainingCache now only saves prediction dataframe for the best performing model

DeepPatientLevelPrediction 2.0.0
======================
- New backend which uses pytorch through reticulate instead of torch in R
40 changes: 27 additions & 13 deletions R/Estimator.R
@@ -310,7 +310,8 @@ gridCvDeep <- function(mappedData,

fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))]
findLR <- modelSettings$estimatorSettings$findLR
for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
if (!trainCache$isFull()) {
for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId))
ParallelLogger::logInfo(paste0("HyperParameters: "))
ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | "))
@@ -363,25 +364,38 @@
)
}
maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2)))
paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]]

gridSearchPredictons[[gridId]] <- list(
prediction = prediction,
param = paramSearch[[gridId]]
param = paramSearch[[gridId]],
gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]])
)
gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)),
nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary))
gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]]


# remove all predictions that are not the max performance
indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
for (i in seq_along(gridSearchPredictons)) {
if (!is.null(gridSearchPredictons[[i]])) {
if (i != indexOfMax) {
gridSearchPredictons[[i]]$prediction <- list(NULL)
}
}
}
ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax))
trainCache$saveGridSearchPredictions(gridSearchPredictons)
}
}
paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)
# get best params
indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
finalParam <- gridSearchPredictons[[indexOfMax]]$param

paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)

# get best para (this could be modified to enable any metric instead of AUC, just need metric input in function)
paramGridSearch <- lapply(gridSearchPredictons, function(x) {
do.call(PatientLevelPrediction::computeGridPerformance, x)
}) # cvAUCmean, cvAUC, param

optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance)))
finalParam <- paramGridSearch[[optimalParamInd]]$param

cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction
# get best CV prediction
cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction
cvPrediction$evaluationType <- "CV"

ParallelLogger::logInfo("Training final model using optimal parameters")
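
The caching change above is easier to follow in isolation. Below is a conceptual sketch in Python (illustration only; the package implements this in R, and the names are assumed): after each hyperparameter combination is evaluated, every cached prediction except the one belonging to the best-performing combination is dropped before the cache is written, which keeps the training cache small.

grid_results = [
    {"param": {"learning_rate": 1e-3}, "cv_performance": 0.71, "prediction": "prediction frame 1"},
    {"param": {"learning_rate": 3e-4}, "cv_performance": 0.74, "prediction": "prediction frame 2"},
]
# index of the best-performing combination evaluated so far
best = max(range(len(grid_results)), key=lambda i: grid_results[i]["cv_performance"])
for i, result in enumerate(grid_results):
    if i != best:
        result["prediction"] = None  # keep only the best combination's prediction
# the pruned list is what gets saved to the training cache
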
7 changes: 7 additions & 0 deletions R/TrainingCache-class.R
@@ -69,6 +69,13 @@ TrainingCache <- R6::R6Class(
return(private$.paramPersistence$gridSearchPredictions)
},

#' @description
#' Check if cache is full
#' @returns Boolean
isFull = function() {
return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance)))))
},

#' @description
#' Gets the last index from the cached grid search
#' @returns Last grid search index
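
A rough sketch of the new isFull() rule, again in Python for illustration with toy data: the cache counts as full once every stored grid-search slot already carries a gridPerformance result, so a finished search is not rerun when an analysis is resumed.

cached_slots = [
    {"gridPerformance": {"cvPerformance": 0.74}},
    {"gridPerformance": {"cvPerformance": 0.71}},
]
# full means: no slot is still waiting for its grid performance
is_full = all(slot.get("gridPerformance") is not None for slot in cached_slots)
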
73 changes: 0 additions & 73 deletions extras/example.R

This file was deleted.

4 changes: 2 additions & 2 deletions inst/python/Dataset.py
@@ -21,7 +21,7 @@ def __init__(self,
if pathlib.Path(data).suffix == '.sqlite':
data = urllib.parse.quote(data)
data = pl.read_database("SELECT * from covariates",
connection_uri=f"sqlite://{data}").lazy()
connection=f"sqlite://{data}").lazy()
else:
data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow'))
observations = data.select(pl.col('rowId').max()).collect()[0, 0]
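
For context, a minimal sketch of the updated call (hypothetical file path): newer polars releases expect the URI under the connection keyword rather than connection_uri.

import urllib.parse

import polars as pl

# hypothetical sqlite file holding the covariates table
path = urllib.parse.quote("/data/plpData/covariates.sqlite")
covariates = pl.read_database(
    "SELECT * from covariates",
    connection=f"sqlite://{path}",
).lazy()
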
@@ -67,7 +67,7 @@ def __init__(self,
if pl.count(self.numerical_features) == 0:
self.num = None
else:
numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). \
numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \
with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId').
rank(method="dense") - 1, pl.col('rowId') - 1) \
.select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect()
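
A toy sketch (made-up data) of why the added sort matters: the dense rank that assigns the new column indices follows the order in which column ids are first encountered, so sorting by columnId first makes the numeric-feature order deterministic and identical between training and test data.

import polars as pl

df = pl.DataFrame({
    "rowId": [1, 1, 2],
    "columnId": [1002, 1001, 1001],
    "covariateValue": [0.5, 1.2, 0.9],
})
mapped = (
    df.sort(by="columnId")  # without this, the mapping depends on row order
      .with_row_count("newColumnId")
      .with_columns(pl.col("newColumnId").first().over("columnId").rank(method="dense") - 1)
      .select("rowId", pl.col("newColumnId").alias("columnId"), "covariateValue")
)
# columnId 1001 always maps to 0 and 1002 to 1, regardless of the input row order
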
7 changes: 4 additions & 3 deletions inst/python/LrFinder.py
@@ -37,6 +37,7 @@ def __init__(self,
smooth = lr_settings.get("smooth", 0.05)
divergence_threshold = lr_settings.get("divergence_threshold", 4)
torch.manual_seed(seed=estimator_settings["seed"])
self.seed = estimator_settings["seed"]
self.model = model(**model_parameters)
if callable(estimator_settings["device"]):
self.device = estimator_settings["device"]()
@@ -55,18 +56,18 @@
self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr)

self.criterion = estimator_settings["criterion"]()
self.batch_size = estimator_settings['batch_size']
self.batch_size = int(estimator_settings['batch_size'])
self.losses = None
self.loss_index = None

def get_lr(self, dataset):
batch_index = torch.arange(0, len(dataset), 1).tolist()

random.seed(self.seed)
losses = torch.empty(size=(self.num_lr,), dtype=torch.float)
lrs = torch.empty(size=(self.num_lr,), dtype=torch.float)
for i in tqdm(range(self.num_lr)):
self.optimizer.zero_grad()
random_batch = random.sample(batch_index, 32)
random_batch = random.sample(batch_index, self.batch_size)
batch = dataset[random_batch]
batch = batch_to_device(batch, self.device)

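A minimal sketch of the updated sampling (toy values, names assumed): casting the batch size to int guards against it arriving as a float (for example when passed from R via reticulate), and seeding random makes the sampled batch, and therefore the learning-rate search, reproducible.

import random

import torch

estimator_settings = {"seed": 42, "batch_size": 128.0}  # batch size may arrive as a float

random.seed(estimator_settings["seed"])
batch_size = int(estimator_settings["batch_size"])  # replaces the previously hardcoded 32
batch_index = torch.arange(0, 1000, 1).tolist()
random_batch = random.sample(batch_index, batch_size)  # identical batch for a given seed
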
4 changes: 2 additions & 2 deletions inst/python/ResNet.py
@@ -130,9 +130,9 @@ def __init__(self,
nn.init.kaiming_uniform_(parameter, a=math.sqrt(5))

def forward(self, input):
x = self.weight.unsqueeze(0) * input.unsqueeze(-1)
x = self.weight[None] * input[..., None]
if self.bias is not None:
x = x + self.bias.unsqueeze(-1)
x = x + self.bias[None]
return x


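A shape sketch of the fixed numerical embedding (sizes assumed): with weight and bias both of shape (n_features, dim), indexing with [None] prepends a batch axis, so the bias is broadcast over the batch dimension instead of being added along the wrong axis.

import torch

batch, n_features, dim = 4, 3, 8
weight = torch.randn(n_features, dim)
bias = torch.randn(n_features, dim)
x_num = torch.randn(batch, n_features)

x = weight[None] * x_num[..., None]  # (1, n, d) * (b, n, 1) -> (b, n, d)
x = x + bias[None]                   # (1, n, d) broadcasts over the batch
assert x.shape == (batch, n_features, dim)
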
5 changes: 4 additions & 1 deletion inst/python/Transformer.py
@@ -49,6 +49,9 @@ def __init__(self,

if num_features != 0 and num_features is not None:
self.numerical_embedding = NumericalEmbedding(num_features, dim_token)
self.use_numerical = True
else:
self.use_numerical = False
self.class_token = ClassToken(dim_token)

self.layers = nn.ModuleList([])
@@ -78,7 +81,7 @@ def __init__(self,
def forward(self, x):
mask = torch.where(x["cat"] == 0, True, False)
cat = self.categorical_embedding(x["cat"])
if "num" in x.keys() and self.numerical_embedding is not None:
if self.use_numerical:
num = self.numerical_embedding(x["num"])
x = torch.cat([cat, num], dim=1)
mask = torch.cat([mask, torch.zeros([x.shape[0],
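
A minimal sketch of the pattern introduced above (names and layers assumed): deciding use_numerical once at construction means forward() never has to touch self.numerical_embedding on a model built without numeric features, which could otherwise fail because that attribute is never created.

import torch
from torch import nn

class TinyTokenizer(nn.Module):
    def __init__(self, num_features, dim_token):
        super().__init__()
        if num_features is not None and num_features != 0:
            self.numerical_embedding = nn.Linear(num_features, dim_token)
            self.use_numerical = True
        else:
            self.use_numerical = False

    def forward(self, x):
        if self.use_numerical:  # no attribute lookup on a module that may not exist
            return self.numerical_embedding(x["num"])
        return torch.empty(0)
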
13 changes: 13 additions & 0 deletions man/TrainingCache.Rd

Generated documentation file; diff not rendered.

18 changes: 18 additions & 0 deletions tests/testthat/setup.R
@@ -78,3 +78,21 @@ dataset <- Dataset$Data(
)
small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3)))

modelSettings <- setResNet(
numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
residualDropout = c(0, 0.2), hiddenDropout = 0,
sizeEmbedding = 16, hyperParamSearch = "random",
randomSample = 2,
setEstimator(epochs=1,
learningRate = 3e-4)
)
fitEstimatorPath <- file.path(testLoc, 'fitEstimator')
if (!dir.exists(fitEstimatorPath)) {
dir.create(fitEstimatorPath)
}
fitEstimatorResults <- fitEstimator(trainData$Train,
modelSettings = modelSettings,
analysisId = 1,
analysisPath = fitEstimatorPath)


25 changes: 6 additions & 19 deletions tests/testthat/test-Estimator.R
@@ -146,25 +146,12 @@ test_that("early stopping works", {
testthat::expect_true(earlyStop$early_stop)
})

modelSettings <- setResNet(
numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
residualDropout = 0, hiddenDropout = 0,
sizeEmbedding = 16, hyperParamSearch = "random",
randomSample = 1,
setEstimator(epochs=1,
learningRate = 3e-4)
)

sink(nullfile())
results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1, analysisPath = testLoc)
sink()

test_that("Estimator fit function works", {
expect_true(!is.null(results$trainDetails$trainingTime))
expect_true(!is.null(fitEstimatorResults$trainDetails$trainingTime))

expect_equal(class(results), "plpModel")
expect_equal(attr(results, "modelType"), "binary")
expect_equal(attr(results, "saveType"), "file")
expect_equal(class(fitEstimatorResults), "plpModel")
expect_equal(attr(fitEstimatorResults, "modelType"), "binary")
expect_equal(attr(fitEstimatorResults, "saveType"), "file")
fakeTrainData <- trainData
fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake"))
expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc))
@@ -184,7 +171,7 @@ test_that("predictDeepEstimator works", {
# input is a plpModel and data
sink(nullfile())
predictions <- predictDeepEstimator(
plpModel = results, data = trainData$Test,
plpModel = fitEstimatorResults, data = trainData$Test,
trainData$Test$labels
)
sink()
@@ -369,4 +356,4 @@ test_that("estimatorSettings can be saved and loaded with correct python objects
testthat::expect_false(reticulate::py_is_null_xptr(optimizer))
testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun))
testthat::expect_false(reticulate::py_is_null_xptr(criterion))
})
})