OHDSI · lhjohn · Aug 22, 2024 · Aug 26, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/.github/workflows/R_CDM_check_hades.yaml b/.github/workflows/R_CDM_check_hades.yaml
@@ -70,6 +70,10 @@ jobs:
           cache: always
           extra-packages: any::rcmdcheck
           needs: check
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
 
       - name: setup r-reticulate venv
         shell: Rscript {0}

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -38,10 +38,10 @@ Suggests:
 Remotes:
     ohdsi/PatientLevelPrediction,
     ohdsi/ResultModelManager
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Encoding: UTF-8
 Config/testthat/edition: 3
-Config/testthat/parallel: TRUE
+Config/testthat/parallel: FALSE
 Config/reticulate:
   list(
     packages = list(

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(fitEstimator)
 export(gridCvDeep)
 export(predictDeepEstimator)
+export(setCustomEmbeddingModel)
 export(setDefaultResNet)
 export(setDefaultTransformer)
 export(setEstimator)

diff --git a/R/CustomEmbeddingModel.R b/R/CustomEmbeddingModel.R
@@ -0,0 +1,73 @@
+# @file CustomEmbeddingModel.R
+#
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of DeepPatientLevelPrediction
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#' Create default settings for a model using custom embeddings
+#'
+#' @description A model that uses custom embeddings such as Poincare embeddings or
+#' embeddings from a foundation model
+#' @param embeddingFilePath path to the saved embeddings. The embeddings file
+#' should be a pytorch file including a dictionary with two fields:
+#' `concept_ids`: a pytorch long tensor with the concept ids and `embeddings`:
+#' a pytorch float tensor with the embeddings
+#' @param modelSettings for the model to use, needs to have an embedding layer
+#' with a name `embedding` which will be replaced by the custom embeddings
+#' @param embeddingsClass the class of the custom embeddings, e.g. `CustomEmbeddings`
+#' or `PoincareEmbeddings`
+#'
+#' @return settings for a model using custom embeddings
+#'
+#' @export
+setCustomEmbeddingModel <- function(
+    embeddingFilePath,
+    modelSettings = setTransformer(
+      numBlocks = 3,
+      dimToken = 16,
+      dimOut = 1,
+      numHeads = 4,
+      attDropout = 0.2,
+      ffnDropout = 0.1,
+      resDropout = 0.0,
+      dimHidden = 32,
+      estimatorSettings = setEstimator(learningRate = "auto",
+                                       weightDecay = 1e-4,
+                                       batchSize = 256,
+                                       epochs = 2,
+                                       seed = NULL,
+                                       device = "cpu"),
+      hyperParamSearch = "random",
+      randomSample = 1
+    ),
+    embeddingsClass = "CustomEmbeddings"
+) {
+  # validate before normalizePath(), which errors on non-character input
+  checkIsClass(embeddingFilePath, "character")
+  checkFileExists(embeddingFilePath)
+  embeddingFilePath <- normalizePath(embeddingFilePath)
+  checkIsClass(embeddingsClass, "character")
+  checkInStringVector(embeddingsClass, c("CustomEmbeddings", "PoincareEmbeddings"))
+
+  path <- system.file("python", package = "DeepPatientLevelPrediction")
+  initStrategy <- reticulate::import_from_path("InitStrategy", path = path)
+  modelSettings$estimatorSettings$initStrategy <-
+    initStrategy$CustomEmbeddingInitStrategy(
+      embedding_class = embeddingsClass,
+      embedding_file = embeddingFilePath
+    )
+
+  attr(modelSettings, "settings")$name <- "CustomEmbeddingModel"
+  return(modelSettings)
+}
diff --git a/R/Dataset.R b/R/Dataset.R
@@ -36,7 +36,7 @@ createDataset <- function(data, labels, plpModel = NULL) {
       r_to_py(as.array(which(plpModel$covariateImportance$isNumeric)))
     data <- dataset(r_to_py(normalizePath(attributes(data)$path)),
       numerical_features = numericalFeatures
-    )
+      )
   }
 
   return(data)

diff --git a/R/Estimator.R b/R/Estimator.R
@@ -221,6 +221,7 @@ fitEstimator <- function(trainData,
     )
 
   comp <- start - Sys.time()
+  modelSettings$estimatorSettings$initStrategy <- NULL
   result <- list(
     model = cvResult$estimator,
     preprocessing = list(
@@ -296,7 +297,17 @@ predictDeepEstimator <- function(plpModel,
     plpModel <- list(model = plpModel)
     attr(plpModel, "modelType") <- "binary"
   }
-  if ("plpData" %in% class(data)) {
+
+  if (!is.null(plpModel$covariateImportance)) {
+    # this means that the model finished training since only in the end covariateImportance is added
+    mappedData <- PatientLevelPrediction::MapIds(data$covariateData,
+                                                 cohort = cohort,
+                                                 mapping = plpModel$covariateImportance %>%
+                                                   dplyr::select("columnId", "covariateId")
+    )
+    data <- createDataset(mappedData, plpModel = plpModel)
+
+  } else if ("plpData" %in% class(data)) {
     mappedData <- PatientLevelPrediction::MapIds(data$covariateData,
       cohort = cohort,
       mapping = plpModel$covariateImportance %>%
@@ -310,18 +321,20 @@ predictDeepEstimator <- function(plpModel,
   if (is.character(plpModel$model)) {
     model <- torch$load(file.path(plpModel$model,
                                   "DeepEstimatorModel.pt"),                        
-                        map_location = "cpu")
+                        map_location = "cpu",
+                        weights_only = FALSE)
     if (is.null(model$model_parameters$model_type)) {
       # for backwards compatibility
       model$model_parameters$model_type <- plpModel$modelDesign$modelSettings$modelType
     }
     model$estimator_settings$device <-
       plpModel$modelDesign$modelSettings$estimatorSettings$device
+    modelParameters <- snakeCaseToCamelCaseNames(model$model_parameters)
+    estimatorSettings <- snakeCaseToCamelCaseNames(model$estimator_settings)
+    parameters <- list(modelParameters = modelParameters,
+                       estimatorSettings = estimatorSettings)
     estimator <-
-      createEstimator(modelParameters =
-                      snakeCaseToCamelCaseNames(model$model_parameters),
-                      estimatorSettings =
-                      snakeCaseToCamelCaseNames(model$estimator_settings))
+      createEstimator(parameters = parameters)
     estimator$model$load_state_dict(model$model_state_dict)
     prediction$value <- estimator$predict_proba(data)
   } else {
@@ -420,7 +433,7 @@ gridCvDeep <- function(mappedData,
     dplyr::select(-"index")
   prediction$cohortStartDate <- as.Date(prediction$cohortStartDate,
     origin = "1970-01-01")
-  numericalIndex <- dataset$get_numerical_features()
+  numericalIndex <- dataset$numerical_features$to_list()
 
   # save torch code here
   if (!dir.exists(file.path(modelLocation))) {
@@ -433,7 +446,7 @@ gridCvDeep <- function(mappedData,
       prediction = prediction,
       finalParam = finalParam,
       paramGridSearch = paramGridSearch,
-      numericalIndex = numericalIndex$to_list()
+      numericalIndex = numericalIndex
     )
   )
 }
@@ -465,22 +478,20 @@ evalEstimatorSettings <- function(estimatorSettings) {
   estimatorSettings
 }
 
-createEstimator <- function(modelParameters,
-                            estimatorSettings) {
+createEstimator <- function(parameters) {
   path <- system.file("python", package = "DeepPatientLevelPrediction")
   model <-
-    reticulate::import_from_path(modelParameters$modelType,
-                                 path = path)[[modelParameters$modelType]]
+    reticulate::import_from_path(parameters$modelParameters$modelType,
+                                 path = path)[[parameters$modelParameters$modelType]]
   estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator
 
-  modelParameters <- camelCaseToSnakeCaseNames(modelParameters)
-  estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings)
-  estimatorSettings <- evalEstimatorSettings(estimatorSettings)
-
+  parameters$modelParameters <- camelCaseToSnakeCaseNames(parameters$modelParameters)
+  parameters$estimatorSettings <- camelCaseToSnakeCaseNames(parameters$estimatorSettings)
+  parameters$estimatorSettings <- evalEstimatorSettings(parameters$estimatorSettings)
+  parameters <- camelCaseToSnakeCaseNames(parameters)
   estimator <- estimator(
     model = model,
-    model_parameters = modelParameters,
-    estimator_settings = estimatorSettings
+    parameters = parameters
   )
   return(estimator)
 }
@@ -571,16 +582,19 @@ doCrossValidationImpl <- function(dataset,
   )]
   currentModelParams <- parameters[modelSettings$modelParamNames]
   attr(currentModelParams, "metaData")$names <-
-    modelSettings$modelParamNameCH
+    modelSettings$modelParamNames
   currentModelParams$modelType <- modelSettings$modelType
   currentEstimatorSettings <-
     fillEstimatorSettings(modelSettings$estimatorSettings,
                           fitParams,
                           parameters)
-  currentModelParams$catFeatures <- dataset$get_cat_features()$max()
-  currentModelParams$numFeatures <- dataset$get_numerical_features()$len()
+  currentModelParams$feature_info <- dataset$get_feature_info()
+  currentParameters <- list(
+    modelParameters = currentModelParams,
+    estimatorSettings = currentEstimatorSettings
+  )
   if (currentEstimatorSettings$findLR) {
-    lr <- getLR(currentModelParams, currentEstimatorSettings, dataset)
+    lr <- getLR(currentParameters, dataset)
     ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr))
     currentEstimatorSettings$learningRate <- lr
   }
@@ -605,8 +619,7 @@ doCrossValidationImpl <- function(dataset,
     testDataset <- torch$utils$data$Subset(dataset,
                                            indices =
                                              as.integer(which(fold == i) - 1))
-    estimator <- createEstimator(modelParameters = currentModelParams,
-                                 estimatorSettings = currentEstimatorSettings)
+    estimator <- createEstimator(currentParameters)
     fit_estimator(estimator, trainDataset, testDataset)
 
     ParallelLogger::logInfo("Calculating predictions on left out fold set...")
@@ -659,8 +672,7 @@ trainFinalModel <- function(dataset, finalParam, modelSettings, labels) {
 
     fitParams <- names(finalParam)[grepl("^estimator", names(finalParam))]
 
-    modelParams$catFeatures <- dataset$get_cat_features()$max()
-    modelParams$numFeatures <- dataset$get_numerical_features()$len()
+    modelParams$featureInfo <- dataset$get_feature_info()
     modelParams$modelType <- modelSettings$modelType
 
     estimatorSettings <- fillEstimatorSettings(
@@ -669,8 +681,11 @@ trainFinalModel <- function(dataset, finalParam, modelSettings, labels) {
       finalParam
     )
     estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]]
-    estimator <- createEstimator(modelParameters = modelParams,
-                                 estimatorSettings = estimatorSettings)
+    parameters <- list(
+      modelParameters = modelParams,
+      estimatorSettings = estimatorSettings
+    )
+    estimator <- createEstimator(parameters = parameters)
     estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs)
 
     ParallelLogger::logInfo("Calculating predictions on all train data...")

diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R
@@ -105,3 +105,24 @@ checkHigherEqual <- function(parameter, value) {
   }
   return(TRUE)
 }
+
+#' helper function that verifies a file is present on disk
+#' @param file path of the file whose presence is verified
+#' @return TRUE if the file is found; otherwise an error is logged and raised
+checkFileExists <- function(file) {
+  if (file.exists(file)) return(TRUE)
+  missingFileMessage <- paste0("File ", file, " does not exist")
+  ParallelLogger::logError(missingFileMessage)
+  stop(missingFileMessage)
+}
+
+# Checks that `parameter` is non-empty and that every element of it is one of
+# the strings in `values`. Returns TRUE when that holds; otherwise logs the
+# allowed values and raises an error naming the offending argument.
+checkInStringVector <- function(parameter, values) {
+  if (length(parameter) > 0 && all(parameter %in% values)) return(TRUE)
+  name <- deparse(substitute(parameter))
+  allowed <- paste0(as.character(values), collapse = " or ")
+  ParallelLogger::logError(paste0(name, " should be ", allowed))
+  stop(paste0(name, " has incorrect value"))
+}
diff --git a/R/LRFinder.R b/R/LRFinder.R
@@ -15,17 +15,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-getLR <- function(modelParameters,
-                  estimatorSettings,
+getLR <- function(parameters,
                   dataset,
                   lrSettings = NULL) {
   path <- system.file("python", package = "DeepPatientLevelPrediction")
-  estimator <- createEstimator(modelParameters = modelParameters,
-                               estimatorSettings = estimatorSettings)
+  estimator <- createEstimator(parameters = parameters)
   if (!is.null(lrSettings)) {
     lrSettings <- camelCaseToSnakeCaseNames(lrSettings)
   }
   get_lr <- reticulate::import_from_path("LrFinder", path)$get_lr
   lr <- get_lr(estimator, dataset, lrSettings)
   return(lr)
-}
+}