Release version 1.1 (#57)
* speed up data conversion by about 3-5x

* make sure tensorList matches dataset length

* use integers for tensorList

* fix tidyselect warnings

* fix numericalIndex and address pull warning

* Allow dimToken and numHeads to take the form of vectors

* Modeltype fix (#48)

* added modelType to modelSettings

* update default ResNet and Transformer to have custom LR and WD

* Add seed for sampling hyperparameter combinations (#50)

* Suppress warnings message due to NULL seed

* add lrFinder, add setEstimator function for all estimator parameters, add metric to early stopping; adjust tests for setEstimator (#51)

* add tests for coverage

* add custom metric for scheduler/earlyStopping

* Derive dimension of feedforward block from embedding dimension (#53)

* Add dimHiddenRatio parameter to Transformer

* Update Transformer documentation

* Fix incomplete divisibility check for Transformer (#55)

* Update Transformer tests

* Update NEWS.md

* update website and docs

* fix badge in readme

---------
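Among the changes above, #51 adds a learning-rate finder (`lrFinder`). The idea behind such a tool is an LR range test: train briefly while growing the learning rate geometrically, record the loss at each step, and pick a rate near where the loss is lowest, just before training diverges. A toy, self-contained Python sketch on a 1-D quadratic — an illustration of the technique only, not the package's actual Estimator implementation:

```python
def lr_range_test(lr_min=1e-4, lr_max=10.0, steps=50):
    # grow the learning rate geometrically from lr_min to lr_max
    factor = (lr_max / lr_min) ** (1 / (steps - 1))
    w, lr, history = 5.0, lr_min, []
    for _ in range(steps):
        w = w - lr * (2 * w)         # one gradient step on loss(w) = w^2
        history.append((lr, w * w))  # (rate used, resulting loss)
        lr *= factor
    # the loss is lowest just before the rate becomes unstable;
    # practical finders often divide this rate by ~10 as a safety margin
    return min(history, key=lambda p: p[1])[0]

print(lr_range_test())
```

On this quadratic the returned rate lands just below the divergence threshold of 1.0, mirroring how a real finder picks a rate from the loss-vs-rate curve.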

Co-authored-by: Henrik John <[email protected]>
egillax and lhjohn authored Mar 22, 2023
1 parent 46da64d commit e8083d0
Showing 65 changed files with 1,076 additions and 5,147 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/pkgdown.yaml
@@ -2,7 +2,7 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
-    branches: [main]
+    branches: [main, develop]
release:
types: [published]
workflow_dispatch:
@@ -28,18 +28,18 @@ jobs:

- uses: r-lib/actions/setup-r-dependencies@v2
with:
-          extra-packages: any::pkgdown, ohdsi/OhdsiRTools
+          extra-packages: any::pkgdown, ohdsi/OhdsiRTools, local::.
needs: website

- name: Build site
-        run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)'
+        run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE)'

- name: Fix Hades Logo
run: Rscript -e 'OhdsiRTools::fixHadesLogo()'

- name: Deploy to GitHub pages 🚀
if: github.event_name != 'pull_request'
-        uses: JamesIves/github-pages-deploy-action@4.1.4
+        uses: JamesIves/github-pages-deploy-action@v4
with:
clean: false
branch: gh-pages
16 changes: 9 additions & 7 deletions DESCRIPTION
@@ -1,11 +1,11 @@
Package: DeepPatientLevelPrediction
Type: Package
Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
-Version: 1.0.2
+Version: 1.1.0
Date: 15-12-2022
Authors@R: c(
-  person("Jenna", "Reps", email = "[email protected]", role = c("aut")),
   person("Egill", "Fridgeirsson", email = "[email protected]", role = c("aut", "cre")),
+  person("Jenna", "Reps", email = "[email protected]", role = c("aut")),
person("Seng", "Chan You", role = c("aut")),
person("Chungsoo", "Kim", role = c("aut")),
person("Henrik", "John", role = c("aut"))
@@ -17,26 +17,28 @@ URL: https://github.com/OHDSI/DeepPatientLevelPrediction
BugReports: https://github.com/OHDSI/DeepPatientLevelPrediction/issues
VignetteBuilder: knitr
Depends:
-  R (>= 3.5.0)
+  R (>= 4.0.0)
Imports:
dplyr,
-  data.table,
FeatureExtraction (>= 3.0.0),
ParallelLogger (>= 2.0.0),
PatientLevelPrediction (>= 6.0.4),
rlang,
-  torch (>= 0.8.0)
+  torch (>= 0.9.0),
+  torchopt,
+  withr
Suggests:
devtools,
Eunomia,
knitr,
markdown,
plyr,
-  testthat
+  testthat,
+  PRROC
Remotes:
ohdsi/PatientLevelPrediction,
ohdsi/FeatureExtraction,
ohdsi/Eunomia
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
Encoding: UTF-8
Config/testthat/edition: 3
4 changes: 2 additions & 2 deletions NAMESPACE
@@ -4,13 +4,13 @@ export(Dataset)
export(Estimator)
export(fitEstimator)
export(gridCvDeep)
+export(lrFinder)
export(predictDeepEstimator)
export(setDefaultResNet)
export(setDefaultTransformer)
+export(setEstimator)
export(setMultiLayerPerceptron)
export(setResNet)
export(setTransformer)
-import(data.table)
-importFrom(data.table,":=")
importFrom(dplyr,"%>%")
importFrom(rlang,.data)
13 changes: 12 additions & 1 deletion NEWS.md
@@ -1,3 +1,14 @@
+DeepPatientLevelPrediction 1.1
+======================
+- Fixed the check for whether the number of heads is compatible with the embedding dimension (#55)
+- The transformer width can now be specified as a ratio of the embedding dimension (dimToken) (#53)
+- A custom metric can now be defined for earlyStopping and the learning rate schedule (#51)
+- Added a setEstimator function to configure the estimator (#51)
+- Seed added for model weight initialization to improve reproducibility (#51)
+- Added a learning rate finder for automatic calculation of the learning rate (#51)
+- Added a seed for sampling hyperparameters (#50)
+- Used vectorised torch operations to speed up data conversion in the torch dataset
+
DeepPatientLevelPrediction 1.0.2
======================
- Fix torch binaries issue when running tests from other github actions
@@ -15,4 +26,4 @@ DeepPatientLevelPrediction 1.0.0
- created an Estimator R6 class to handle the model fitting
- Added three non-temporal models. An MLP, a ResNet and a Transformer
- ResNet and Transformer have default versions of hyperparameters
-- Created tests and documentation for the package
\ No newline at end of file
+- Created tests and documentation for the package
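Two of the Transformer changes in this release concern how the model's dimensions are derived: the embedding dimension (dimToken) must divide evenly across the attention heads (#55), and the feed-forward width can be given as a ratio of the embedding dimension rather than as an absolute value (#53). A minimal Python sketch of these two sizing rules — the function and parameter names here are illustrative, not the package's R API:

```python
def check_head_dim(dim_token, num_heads):
    # (#55) dimToken must split evenly across attention heads
    if dim_token % num_heads != 0:
        raise ValueError(
            f"dimToken ({dim_token}) must be divisible by numHeads ({num_heads})")
    return dim_token // num_heads  # width of each head

def hidden_from_ratio(dim_token, dim_hidden_ratio):
    # (#53) derive the feed-forward width from the embedding dimension
    return int(dim_token * dim_hidden_ratio)

print(check_head_dim(192, 8))        # per-head width
print(hidden_from_ratio(192, 4 / 3)) # feed-forward width
```

Expressing the hidden width as a ratio keeps the feed-forward block proportional to dimToken when both are sampled during hyperparameter search, instead of allowing incompatible absolute combinations.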
26 changes: 15 additions & 11 deletions R/Dataset.R
@@ -1,5 +1,4 @@
#' A torch dataset
-#' @import data.table
#' @export
Dataset <- torch::dataset(
name = "myDataset",
@@ -10,9 +9,10 @@ Dataset <- torch::dataset(
    # determine numeric
    if (is.null(numericalIndex)) {
      numericalIndex <- data %>%
+       dplyr::arrange(columnId) %>%
        dplyr::group_by(columnId) %>%
-       dplyr::collect() %>%
        dplyr::summarise(n = dplyr::n_distinct(.data$covariateValue)) %>%
+       dplyr::collect() %>%
        dplyr::pull(n) > 1
      self$numericalIndex <- numericalIndex
    } else {
@@ -38,17 +38,21 @@ Dataset <- torch::dataset(
      dplyr::mutate(newColumnId = dplyr::cur_group_id()) %>%
      dplyr::ungroup() %>%
      dplyr::select(c("rowId", "newColumnId")) %>%
-     dplyr::rename(columnId = newColumnId)
-   # the fastest way I found so far to convert data using data.table
-   # 1.5 min for 100k rows :(
-   dt <- data.table::data.table(rows = dataCat$rowId, cols = dataCat$columnId)
-   maxFeatures <- max(dt[, .N, by = rows][, N])
+     dplyr::rename(columnId = newColumnId) %>%
+     dplyr::arrange(rowId)
    start <- Sys.time()
-   tensorList <- lapply(1:max(data %>% dplyr::pull(rowId)), function(x) {
-     torch::torch_tensor(dt[rows == x, cols])
-   })
+   catTensor <- torch::torch_tensor(cbind(dataCat$rowId, dataCat$columnId))
+   catTensor <- catTensor[catTensor[, 1]$argsort(), ]
+   tensorList <- torch::torch_split(catTensor[, 2],
+     as.numeric(torch::torch_unique_consecutive(catTensor[, 1],
+       return_counts = TRUE)[[3]]))

+   # because of subjects without cat features, I need to create a list with
+   # all zeroes and then insert my tensorList. That way I can still index
+   # the dataset correctly.
+   totalList <- as.list(integer(length(self$target)))
+   totalList[unique(dataCat$rowId)] <- tensorList
    self$lengths <- lengths
-   self$cat <- torch::nn_utils_rnn_pad_sequence(tensorList, batch_first = T)
+   self$cat <- torch::nn_utils_rnn_pad_sequence(totalList, batch_first = T)
    delta <- Sys.time() - start
    ParallelLogger::logInfo("Data conversion for dataset took ", signif(delta, 3), " ", attr(delta, "units"))
    if (sum(numericalIndex) == 0) {
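The rewritten Dataset.R conversion replaces a per-row data.table lookup with vectorised steps: sort the (rowId, columnId) pairs by row, split the column ids into one chunk per row (torch_unique_consecutive + torch_split), insert empty slots for subjects with no categorical features, then pad everything into a rectangle with nn_utils_rnn_pad_sequence. A plain-Python re-implementation of the same idea for illustration — hypothetical helper name, no torch dependency:

```python
from itertools import groupby

def pad_categorical(row_ids, col_ids, n_rows, pad_value=0):
    # sort the (row, column) pairs by row -- the argsort step
    pairs = sorted(zip(row_ids, col_ids))
    # split column ids into one chunk per row -- unique_consecutive + split
    ragged = {r: [c for _, c in grp]
              for r, grp in groupby(pairs, key=lambda p: p[0])}
    # rows without categorical features get an empty slot (the totalList trick)
    full = [ragged.get(r, []) for r in range(1, n_rows + 1)]
    # right-pad every row to the same width -- pad_sequence(batch_first = TRUE)
    width = max((len(x) for x in full), default=0)
    return [x + [pad_value] * (width - len(x)) for x in full]

print(pad_categorical([3, 1, 1, 3], [7, 2, 5, 4], n_rows=3))
# → [[2, 5], [0, 0], [4, 7]]
```

The padding step is what lets every subject be indexed by position even when some have no categorical covariates, which is the bug the `totalList` change fixes.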
1 change: 0 additions & 1 deletion R/DeepPatientLevelPrediction.R
@@ -23,6 +23,5 @@
#' @docType package
#' @name DeepPatientLevelPrediction
#' @importFrom dplyr %>%
-#' @importFrom data.table :=
#' @importFrom rlang .data
NULL