From 59dfbd1f8e6dad8853ee135c94a61491902d0d3c Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 27 Nov 2023 14:54:04 +0100 Subject: [PATCH] fixed linting comments for Hades style guide and black for python --- .Rbuildignore | 1 + .gitignore | 1 + NAMESPACE | 2 +- R/Dataset.R | 23 +- R/DeepPatientLevelPrediction.R | 7 +- R/Estimator.R | 416 ++++++++++++--------- R/HelperFunctions.R | 36 +- R/LRFinder.R | 17 +- R/MLP.R | 52 +-- R/ResNet.R | 105 +++--- R/TrainingCache-class.R | 31 +- R/Transformer.R | 105 +++--- inst/python/Dataset.py | 107 ++++-- inst/python/Estimator.py | 230 +++++++----- inst/python/LrFinder.py | 36 +- inst/python/MLP.py | 59 +-- inst/python/ResNet.py | 83 ++-- inst/python/Transformer.py | 144 +++---- man/DeepPatientLevelPrediction.Rd | 3 +- man/setDefaultResNet.Rd | 2 +- man/setEstimator.Rd | 15 +- man/setMultiLayerPerceptron.Rd | 11 +- man/setResNet.Rd | 21 +- man/setTransformer.Rd | 15 +- man/{TrainingCache.Rd => trainingCache.Rd} | 40 +- tests/testthat/test-TrainingCache.R | 2 +- 26 files changed, 871 insertions(+), 693 deletions(-) rename man/{TrainingCache.Rd => trainingCache.Rd} (79%) diff --git a/.Rbuildignore b/.Rbuildignore index ac33fdc..307ed11 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,4 @@ ^extras$ ^deploy.sh$ ^compare_versions$ +^.mypy_cache$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index e54def2..feec6ee 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ renv.lock extras/ .Renviron inst/python/__pycache__ +.mypy_cache diff --git a/NAMESPACE b/NAMESPACE index 4806f0d..5f8c13e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(TrainingCache) export(fitEstimator) export(gridCvDeep) export(predictDeepEstimator) @@ -10,6 +9,7 @@ export(setEstimator) export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) +export(trainingCache) importFrom(dplyr,"%>%") importFrom(reticulate,py_to_r) importFrom(reticulate,r_to_py) diff --git a/R/Dataset.R b/R/Dataset.R index f4b5170..4ba4e81 100644 --- a/R/Dataset.R +++ b/R/Dataset.R @@ -15,21 +15,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
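Note on the NAMESPACE hunk above: the exported R6 generator `TrainingCache` is renamed to `trainingCache`, so callers construct the training cache through the new name. A minimal sketch of the call-site change, with `analysisPath` standing in as a placeholder path:

# before this patch
trainCache <- TrainingCache$new(analysisPath)
# after this patch (same R6 class, lowerCamelCase generator name)
trainCache <- trainingCache$new(analysisPath)
trainCache$isFull()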
-createDataset <- function(data, labels, plpModel=NULL) { +createDataset <- function(data, labels, plpModel = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Dataset <- reticulate::import_from_path("Dataset", path = path)$Data + dataset <- reticulate::import_from_path("Dataset", path = path)$Data if (is.null(attributes(data)$path)) { # sqlite object attributes(data)$path <- attributes(data)$dbname } if (is.null(plpModel)) { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - r_to_py(labels$outcomeCount)) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + r_to_py(labels$outcomeCount)) + } else { + numericalFeatures <- + r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + numerical_features = numericalFeatures) } - else { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - numerical_features = r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) ) - } - - return(data) -} \ No newline at end of file + + return(data) +} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R index e77812e..137b53a 100644 --- a/R/DeepPatientLevelPrediction.R +++ b/R/DeepPatientLevelPrediction.R @@ -18,7 +18,8 @@ #' DeepPatientLevelPrediction #' -#' @description A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +#' @description A package containing deep learning extensions for developing +#' prediction models using data in the OMOP CDM #' #' @docType package #' @name DeepPatientLevelPrediction @@ -28,9 +29,7 @@ NULL .onLoad <- function(libname, pkgname) { - # use superassignment to update global reference + # use superassignment to update global reference reticulate::configure_environment(pkgname) torch <<- reticulate::import("torch", delay_load = TRUE) } - - diff --git a/R/Estimator.R b/R/Estimator.R index 7705279..656b647 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -26,36 +26,42 @@ #' @param weightDecay what weight_decay to use #' @param batchSize batchSize to use #' @param epochs how many epochs to train for -#' @param device what device to train on, can be a string or a function to that evaluates +#' @param device what device to train on, can be a string or a function to +#' that evaluates #' to the device during runtime #' @param optimizer which optimizer to use #' @param scheduler which learning rate scheduler to use #' @param criterion loss function to use -#' @param earlyStopping If earlyStopping should be used which stops the training of your metric is not improving -#' @param metric either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping. -#' Needs to be a list with function `fun`, mode either `min` or `max` and a `name`, -#' `fun` needs to be a function that takes in prediction and labels and outputs a score. +#' @param earlyStopping If earlyStopping should be used which stops the +#' training of your metric is not improving +#' @param metric either `auc` or `loss` or a custom metric to use. This is the +#' metric used for scheduler and earlyStopping. +#' Needs to be a list with function `fun`, mode either `min` or `max` and a +#' `name`, +#' `fun` needs to be a function that takes in prediction and labels and +#' outputs a score. 
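To make the custom `metric` shape described above concrete, a small illustrative sketch (the accuracy function and the 0.5 threshold are hypothetical, not part of the package):

# a custom metric is a list with `fun`, `mode` and `name`;
# `fun` takes predictions and labels and returns a single score
accuracyMetric <- list(
  fun = function(prediction, labels) mean((prediction > 0.5) == labels),
  mode = "max",
  name = "accuracy"
)
estimatorSettings <- setEstimator(learningRate = "auto",
                                  metric = accuracyMetric)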
#' @param seed seed to initialize weights of model with #' @export -setEstimator <- function(learningRate='auto', - weightDecay = 0.0, - batchSize = 512, - epochs = 30, - device='cpu', - optimizer = torch$optim$AdamW, - scheduler = list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, - params=list(patience=1)), - criterion = torch$nn$BCEWithLogitsLoss, - earlyStopping = list(useEarlyStopping=TRUE, - params = list(patience=4)), - metric = "auc", - seed = NULL +setEstimator <- function(learningRate = "auto", + weightDecay = 0.0, + batchSize = 512, + epochs = 30, + device = "cpu", + optimizer = torch$optim$AdamW, + scheduler = list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1)), + criterion = torch$nn$BCEWithLogitsLoss, + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = 4)), + metric = "auc", + seed = NULL ) { - + checkIsClass(learningRate, c("numeric", "character")) if (inherits(learningRate, "character")) { - if (learningRate != "auto"){ - stop(paste0('Learning rate should be either a numeric or "auto", you provided: ', learningRate)) + if (learningRate != "auto") { + stop(paste0('Learning rate should be either a numeric or "auto", + you provided: ', learningRate)) } } checkIsClass(weightDecay, "numeric") @@ -67,49 +73,57 @@ setEstimator <- function(learningRate='auto', checkIsClass(earlyStopping, c("list", "NULL")) checkIsClass(metric, c("character", "list")) checkIsClass(seed, c("numeric", "integer", "NULL")) - - - if (length(learningRate)==1 && learningRate=='auto') {findLR <- TRUE} else {findLR <- FALSE} + + + if (length(learningRate) == 1 && learningRate == "auto") { + findLR <- TRUE + } else { + findLR <- FALSE + } if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } - estimatorSettings <- list(learningRate=learningRate, - weightDecay=weightDecay, - batchSize=batchSize, - epochs=epochs, - device=device, - earlyStopping=earlyStopping, - findLR=findLR, - metric=metric, - seed=seed[1]) - - optimizer <- rlang::enquo(optimizer) + estimatorSettings <- list(learningRate = learningRate, + weightDecay = weightDecay, + batchSize = batchSize, + epochs = epochs, + device = device, + earlyStopping = earlyStopping, + findLR = findLR, + metric = metric, + seed = seed[1]) + + optimizer <- rlang::enquo(optimizer) estimatorSettings$optimizer <- function() rlang::eval_tidy(optimizer) - class(estimatorSettings$optimizer) <- c("delayed", class(estimatorSettings$optimizer)) - + class(estimatorSettings$optimizer) <- c("delayed", + class(estimatorSettings$optimizer)) + criterion <- rlang::enquo(criterion) estimatorSettings$criterion <- function() rlang::eval_tidy(criterion) - class(estimatorSettings$criterion) <- c("delayed", class(estimatorSettings$criterion)) - + class(estimatorSettings$criterion) <- c("delayed", + class(estimatorSettings$criterion)) + scheduler <- rlang::enquo(scheduler) estimatorSettings$scheduler <- function() rlang::eval_tidy(scheduler) - class(estimatorSettings$scheduler) <-c("delayed", class(estimatorSettings$scheduler)) + class(estimatorSettings$scheduler) <- c("delayed", + class(estimatorSettings$scheduler)) if (is.function(device)) { - class(estimatorSettings$device) <- c("delayed", class(estimatorSettings$device)) + class(estimatorSettings$device) <- c("delayed", + class(estimatorSettings$device)) } - + paramsToTune <- list() for (name in names(estimatorSettings)) { param <- estimatorSettings[[name]] if (length(param) > 1 && is.atomic(param)) { - paramsToTune[[paste0('estimator.',name)]] <- param + 
paramsToTune[[paste0("estimator.", name)]] <- param } if ("params" %in% names(param)) { for (name2 in names(param[["params"]])) { param2 <- param[["params"]][[name2]] if (length(param2) > 1) { - paramsToTune[[paste0('estimator.',name,'.',name2)]] <- param2 + paramsToTune[[paste0("estimator.", name, ".", name2)]] <- param2 } } } @@ -118,7 +132,7 @@ setEstimator <- function(learningRate='auto', return(estimatorSettings) } - + #' fitEstimator #' #' @description @@ -137,12 +151,12 @@ fitEstimator <- function(trainData, analysisPath, ...) { start <- Sys.time() - + # check covariate data if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { stop("Needs correct covariateData") } - + if (!is.null(trainData$folds)) { trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") } @@ -150,9 +164,9 @@ fitEstimator <- function(trainData, covariateData = trainData$covariateData, cohort = trainData$labels ) - + covariateRef <- mappedCovariateData$covariateRef - + outLoc <- PatientLevelPrediction::createTempModelLoc() cvResult <- do.call( what = gridCvDeep, @@ -164,10 +178,12 @@ fitEstimator <- function(trainData, analysisPath = analysisPath ) ) - - hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) + + hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, + function(x) x$hyperSummary)) prediction <- cvResult$prediction - incs <- rep(1, covariateRef %>% dplyr::tally() %>% + incs <- rep(1, covariateRef %>% + dplyr::tally() %>% dplyr::collect() %>% as.integer()) covariateRef <- covariateRef %>% @@ -178,25 +194,30 @@ fitEstimator <- function(trainData, covariateValue = 0, isNumeric = .data$columnId %in% cvResult$numericalIndex ) - + comp <- start - Sys.time() result <- list( - model = cvResult$estimator, # file.path(outLoc), - + model = cvResult$estimator, + preprocessing = list( - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = F + featureEngineering = attr(trainData$covariateData, + "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, + "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = FALSE ), prediction = prediction, modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, outcomeId = attr(trainData, "metaData")$outcomeId, - restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, + restrictPlpDataSettings = attr(trainData, + "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, - preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + featureEngineeringSettings = attr(trainData$covariateData, + "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, + "metaData")$preprocessSettings, modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings @@ -214,12 +235,12 @@ fitEstimator <- function(trainData, ), covariateImportance = covariateRef ) - + class(result) <- "plpModel" attr(result, "predictionFunction") <- "predictDeepEstimator" attr(result, "modelType") <- "binary" attr(result, "saveType") <- 
modelSettings$saveType - + return(result) } @@ -242,32 +263,31 @@ predictDeepEstimator <- function(plpModel, } if ("plpData" %in% class(data)) { mappedData <- PatientLevelPrediction::MapIds(data$covariateData, - cohort = cohort, - mapping = plpModel$covariateImportance %>% - dplyr::select( - "columnId", - "covariateId" - ) + cohort = cohort, + mapping = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") ) - data <- createDataset(mappedData, plpModel=plpModel) + data <- createDataset(mappedData, plpModel = plpModel) } - + # get predictions prediction <- cohort if (is.character(plpModel$model)) { modelSettings <- plpModel$modelDesign$modelSettings - model <- torch$load(file.path(plpModel$model, "DeepEstimatorModel.pt"), map_location = "cpu") - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=model$model_parameters, - estimatorSettings=model$estimator_settings) + model <- torch$load(file.path(plpModel$model, + "DeepEstimatorModel.pt"), + map_location = "cpu") + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = model$model_parameters, + estimatorSettings = model$estimator_settings) estimator$model$load_state_dict(model$model_state_dict) prediction$value <- estimator$predict_proba(data) } else { prediction$value <- plpModel$model$predict_proba(data) } - + attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") - + return(prediction) } @@ -289,15 +309,16 @@ gridCvDeep <- function(mappedData, modelSettings, modelLocation, analysisPath) { - ParallelLogger::logInfo(paste0("Running hyperparameter search for ", modelSettings$modelType, " model")) - + ParallelLogger::logInfo(paste0("Running hyperparameter search for ", + modelSettings$modelType, " model")) + ########################################################################### - + paramSearch <- modelSettings$param - + # TODO below chunk should be in a setupCache function - trainCache <- TrainingCache$new(analysisPath) - if (trainCache$isParamGridIdentical(paramSearch)) { + trainCache <- trainingCache$new(analysisPath) + if (trainCache$isParamGridIdentical(paramSearch)) { gridSearchPredictons <- trainCache$getGridSearchPredictions() } else { gridSearchPredictons <- list() @@ -305,123 +326,146 @@ gridCvDeep <- function(mappedData, trainCache$saveGridSearchPredictions(gridSearchPredictons) trainCache$saveModelParams(paramSearch) } - - dataset <- createDataset(data=mappedData, labels=labels) - - fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] + + dataset <- createDataset(data = mappedData, labels = labels) + + fitParams <- names(paramSearch[[1]])[grepl("^estimator", + names(paramSearch[[1]]))] findLR <- modelSettings$estimatorSettings$findLR if (!trainCache$isFull()) { for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { - ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) - ParallelLogger::logInfo(paste0("HyperParameters: ")) - ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) - currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - - currentEstimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, - paramSearch[[gridId]]) - - # initiate prediction - prediction <- NULL - - fold <- labels$index - ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) - currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - 
currentModelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - if (findLR) { - LrFinder <- createLRFinder(modelType = modelSettings$modelType, - modelParameters = currentModelParams, - estimatorSettings = currentEstimatorSettings - ) - lr <- LrFinder$get_lr(dataset) - ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) - currentEstimatorSettings$learningRate <- lr - } - - learnRates <- list() - for (i in 1:max(fold)) { - ParallelLogger::logInfo(paste0("Fold ", i)) - trainDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold != i) - 1)) # -1 for python 0-based indexing - testDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold == i) -1)) # -1 for python 0-based indexing - - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=currentModelParams, - estimatorSettings=currentEstimatorSettings) - estimator$fit(trainDataset, testDataset) - - ParallelLogger::logInfo("Calculating predictions on left out fold set...") - - prediction <- rbind( - prediction, - predictDeepEstimator( - plpModel = estimator, - data = testDataset, - cohort = labels[fold == i, ] + ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", + gridId)) + ParallelLogger::logInfo(paste0("HyperParameters: ")) + ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), + paramSearch[[gridId]], collapse = " | ")) + currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] + + currentEstimatorSettings <- + fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + paramSearch[[gridId]]) + + # initiate prediction + prediction <- NULL + + fold <- labels$index + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] + currentModelParams$numFeatures <- + dataset$get_numerical_features()$shape[[1]] + if (findLR) { + lrFinder <- createLRFinder(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = currentEstimatorSettings ) + lr <- lrFinder$get_lr(dataset) + ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) + currentEstimatorSettings$learningRate <- lr + } + + learnRates <- list() + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0("Fold ", i)) + trainDataset <- + torch$utils$data$Subset(dataset, + indices = as.integer(which(fold != i) - 1)) + # -1 for python 0-based indexing + testDataset <- + torch$utils$data$Subset(dataset, + indices = as.integer(which(fold == i) - 1)) + # -1 for python 0-based indexing + + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = + currentEstimatorSettings) + estimator$fit(trainDataset, testDataset) + + ParallelLogger::logInfo("Calculating predictions on left out + fold set...") + + prediction <- rbind( + prediction, + predictDeepEstimator( + plpModel = estimator, + data = testDataset, + cohort = labels[fold == i, ] + ) + ) + learnRates[[i]] <- list( + LRs = estimator$learn_rate_schedule, + bestEpoch = estimator$best_epoch + ) + } + maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]], + gridPerformance = + PatientLevelPrediction::computeGridPerformance(prediction, + paramSearch[[gridId]]) ) - learnRates[[i]] <- list( - LRs = estimator$learn_rate_schedule, - bestEpoch = estimator$best_epoch - ) - } - maxIndex <- 
which.max(unlist(sapply(learnRates, `[`, 2))) - gridSearchPredictons[[gridId]] <- list( - prediction = prediction, - param = paramSearch[[gridId]], - gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]]) - ) - gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), - nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) - gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]] - - - # remove all predictions that are not the max performance - indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) - for (i in seq_along(gridSearchPredictons)) { - if (!is.null(gridSearchPredictons[[i]])) { - if (i != indexOfMax) { - gridSearchPredictons[[i]]$prediction <- list(NULL) + gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- + rep(list(unlist(learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) + gridSearchPredictons[[gridId]]$param$learnSchedule <- + learnRates[[maxIndex]] + + # remove all predictions that are not the max performance + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) + for (i in seq_along(gridSearchPredictons)) { + if (!is.null(gridSearchPredictons[[i]])) { + if (i != indexOfMax) { + gridSearchPredictons[[i]]$prediction <- list(NULL) + } } } + ParallelLogger::logInfo(paste0("Caching all grid search results and + prediction for best combination ", + indexOfMax)) + trainCache$saveGridSearchPredictions(gridSearchPredictons) } - ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax)) - trainCache$saveGridSearchPredictions(gridSearchPredictons) - } } paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) # get best params - indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) finalParam <- gridSearchPredictons[[indexOfMax]]$param paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) - + # get best CV prediction cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" - + ParallelLogger::logInfo("Training final model using optimal parameters") # get the params modelParams <- finalParam[modelSettings$modelParamNames] - - + + # create the dir if (!dir.exists(file.path(modelLocation))) { - dir.create(file.path(modelLocation), recursive = T) + dir.create(file.path(modelLocation), recursive = TRUE) } - + modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - - - estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + + + estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, finalParam) estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] estimator <- createEstimator(modelType = modelSettings$modelType, modelParameters = modelParams, estimatorSettings = estimatorSettings) - + numericalIndex <- dataset$get_numerical_features() estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) - + ParallelLogger::logInfo("Calculating predictions on all train data...") prediction <- 
predictDeepEstimator( plpModel = estimator, @@ -429,7 +473,7 @@ gridCvDeep <- function(mappedData, cohort = labels ) prediction$evaluationType <- "Train" - + prediction <- rbind( prediction, cvPrediction @@ -438,9 +482,10 @@ gridCvDeep <- function(mappedData, prediction <- prediction %>% dplyr::select(-"index") - prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, origin = "1970-01-01") - - + prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, + origin = "1970-01-01") + + # save torch code here estimator$save(modelLocation, "DeepEstimatorModel.pt") return( @@ -454,23 +499,25 @@ gridCvDeep <- function(mappedData, ) } -# utility function to add instances of parameters to estimatorSettings during grid search +# utility function to add instances of parameters to estimatorSettings +# during grid search fillEstimatorSettings <- function(estimatorSettings, fitParams, paramSearch) { for (fp in fitParams) { components <- strsplit(fp, "[.]")[[1]] - - if (length(components)==2) { + + if (length(components) == 2) { estimatorSettings[[components[[2]]]] <- paramSearch[[fp]] } else { - estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- paramSearch[[fp]] + estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- + paramSearch[[fp]] } } return(estimatorSettings) } -# utility function to evaluate any expressions or call functions passed as settings +# utility function to evaluate any expressions or call functions passed as +# settings evalEstimatorSettings <- function(estimatorSettings) { - for (set in names(estimatorSettings)) { if (inherits(estimatorSettings[[set]], "delayed")) { estimatorSettings[[set]] <- estimatorSettings[[set]]() @@ -484,16 +531,15 @@ createEstimator <- function(modelType, estimatorSettings) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Model <- reticulate::import_from_path(modelType, path=path)[[modelType]] - - Estimator <- reticulate::import_from_path("Estimator", path=path)$Estimator - + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] + estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator + modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - estimator <- Estimator(model = Model, + + estimator <- estimator(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings) return(estimator) -} \ No newline at end of file +} diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 350baf9..a1aa43c 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -41,40 +41,40 @@ camelCaseToSnakeCaseNames <- function(object) { } #' helper function to check class of input -#' +#' #' @param parameter the input parameter to check #' @param classes which classes it should belong to (one or more) -checkIsClass<- function(parameter,classes) { - name = deparse(substitute(parameter)) +checkIsClass <- function(parameter, classes) { + name <- deparse(substitute(parameter)) if (!inherits(x = parameter, what = classes)) { - ParallelLogger::logError(paste0(name, ' should be of class:', classes, ' ')) - stop(paste0(name, ' is wrong class')) + ParallelLogger::logError(paste0(name, " should be of class:", classes, " ")) + stop(paste0(name, " is wrong class")) } return(TRUE) } #' helper function to check that input is higher than a certain value -#' +#' #' @param parameter the input parameter to check, can be 
a vector
#' @param value which value it should be higher than
-checkHigher <- function(parameter,value) {
-  name = deparse(substitute(parameter))
-  if (!is.numeric(parameter) | all(parameter<=value)) {
-    ParallelLogger::logError(paste0(name, ' needs to be > ',value))
-    stop(paste0(name, ' needs to be > ', value))
+checkHigher <- function(parameter, value) {
+  name <- deparse(substitute(parameter))
+  if (!is.numeric(parameter) || all(parameter <= value)) {
+    ParallelLogger::logError(paste0(name, " needs to be > ", value))
+    stop(paste0(name, " needs to be > ", value))
   }
   return(TRUE)
 }
 
 #' helper function to check that input is higher or equal than a certain value
-#' 
+#'
 #' @param parameter the input parameter to check, can be a vector
 #' @param value which value it should be higher or equal than
-checkHigherEqual <- function(parameter,value) {
-  name = deparse(substitute(parameter))
-  if (!is.numeric(parameter) | all(parameter<value)) {
-    ParallelLogger::logError(paste0(name, ' needs to be >= ',value))
-    stop(paste0(name, ' needs to be >= ', value))
+checkHigherEqual <- function(parameter, value) {
+  name <- deparse(substitute(parameter))
+  if (!is.numeric(parameter) || all(parameter < value)) {
+    ParallelLogger::logError(paste0(name, " needs to be >= ", value))
+    stop(paste0(name, " needs to be >= ", value))
   }
   return(TRUE)
-}
\ No newline at end of file
+}
diff --git a/R/LRFinder.R b/R/LRFinder.R
index dfe8ee3..63cb62c 100644
--- a/R/LRFinder.R
+++ b/R/LRFinder.R
@@ -18,22 +18,21 @@
 createLRFinder <- function(modelType,
                            modelParameters,
                            estimatorSettings,
-                           lrSettings=NULL) {
+                           lrSettings = NULL) {
   path <- system.file("python", package = "DeepPatientLevelPrediction")
-  LRFinderClass <- reticulate::import_from_path("LrFinder", path = path)$LrFinder
-
-  model <- reticulate::import_from_path(modelType, path=path)[[modelType]]
-
+  lrFinderClass <-
+    reticulate::import_from_path("LrFinder", path = path)$LrFinder
+
+  model <- reticulate::import_from_path(modelType, path = path)[[modelType]]
   modelParameters <- camelCaseToSnakeCaseNames(modelParameters)
   estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings)
   if (!is.null(lrSettings)) {
     lrSettings <- camelCaseToSnakeCaseNames(lrSettings)
   }
-
+
   estimatorSettings <- evalEstimatorSettings(estimatorSettings)
-
-
-  lrFinder <- LRFinderClass(model=model,
+
+  lrFinder <- lrFinderClass(model = model,
                             model_parameters = modelParameters,
                             estimator_settings = estimatorSettings,
                             lr_settings = lrSettings)
diff --git a/R/MLP.R b/R/MLP.R
index 6ad35e6..771e244 100644
--- a/R/MLP.R
+++ b/R/MLP.R
@@ -26,12 +26,15 @@
 #'
 #'
 #' @param numLayers Number of layers in network, default: 1:8
-#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)
-#' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)
-#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512)
+#' @param sizeHidden Amount of neurons in each default layer,
+#' default: 2^(6:9) (64 to 512)
+#' @param dropout How much dropout to apply after first linear,
+#' default: seq(0, 0.3, 0.05)
+#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512)
 #' @param estimatorSettings settings of Estimator created with `setEstimator`
-#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'
-#' @param randomSample How many random samples from hyperparameter space to use
+#' @param hyperParamSearch Which kind of hyperparameter search to use random
+#' sampling or exhaustive grid search.
default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' #' @export @@ -39,35 +42,34 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), sizeHidden = c(2^(6:9)), dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator( - learningRate = 'auto', - weightDecay = c(1e-6, 1e-3), - batchSize = 1024, - epochs = 30, - device="cpu"), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu"), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { - checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - + checkIsClass(dropout, c("numeric")) checkHigherEqual(dropout, 0) - + checkIsClass(sizeEmbedding, c("numeric", "integer")) checkHigherEqual(sizeEmbedding, 1) - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - + paramGrid <- list( numLayers = numLayers, sizeHidden = sizeHidden, @@ -78,16 +80,20 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the + amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample, "\n Possible hyperparameter + combinations:", length(param), "\n Please lower the amount of randomSamples")) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "MLP" + attr(param, "settings")$modelType <- "MLP" results <- list( fitFunction = "fitEstimator", diff --git a/R/ResNet.R b/R/ResNet.R index d20aa1d..32e7f3a 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -22,18 +22,19 @@ #' Creates settings for a default ResNet model #' #' @details -#' Model architecture from by https://arxiv.org/abs/2106.11959 . +#' Model architecture from by https://arxiv.org/abs/2106.11959 . #' Hyperparameters chosen by a experience on a few prediction problems. 
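The hyperparameter search in setMultiLayerPerceptron above (and in the other set* functions) follows the same pattern: build the full Cartesian grid, then optionally keep a random subset of `randomSample` combinations. A rough sketch of that idea with made-up values, not the package's exact code path:

# full grid of 2 x 2 = 4 combinations
paramGrid <- list(numLayers = 1:2, sizeHidden = c(64, 128))
param <- PatientLevelPrediction::listCartesian(paramGrid)
randomSample <- 2
if (randomSample > length(param)) {
  stop("randomSample is larger than the number of combinations")
}
# draw the subset reproducibly, as the package does with withr::with_seed()
withr::with_seed(42, {
  param <- param[sample(length(param), randomSample)]
})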
#' #' @param estimatorSettings created with ```setEstimator``` #' @export -setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', - weightDecay=1e-6, - device='cpu', - batchSize=1024, - epochs=50, - seed=NULL)) { +setDefaultResNet <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-6, + device = "cpu", + batchSize = 1024, + epochs = 50, + seed = NULL)) { resnetSettings <- setResNet(numLayers = 6, sizeHidden = 512, hiddenFactor = 2, @@ -41,9 +42,9 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', hiddenDropout = 0.4, sizeEmbedding = 256, estimatorSettings = estimatorSettings, - hyperParamSearch = 'random', + hyperParamSearch = "random", randomSample = 1) - attr(resnetSettings, 'settings')$name <- 'defaultResnet' + attr(resnetSettings, "settings")$name <- "defaultResnet" return(resnetSettings) } @@ -58,14 +59,21 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', #' #' #' @param numLayers Number of layers in network, default: 1:16 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) -#' @param hiddenFactor How much to grow the amount of neurons in each ResLayer, default: 1:4 -#' @param residualDropout How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param hiddenDropout How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param sizeHidden Amount of neurons in each default layer, default: +#' 2^(6:10) (64 to 1024) +#' @param hiddenFactor How much to grow the amount of neurons in each +#' ResLayer, default: 1:4 +#' @param residualDropout How much dropout to apply after last linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param hiddenDropout How much dropout to apply after first linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) +#' '(64 to 512) #' @param estimatorSettings created with ```setEstimator``` -#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' -#' @param randomSample How many random samples from hyperparameter space to use +#' @param hyperParamSearch Which kind of hyperparameter search to use random +#' sampling or exhaustive grid search. 
default: 'random' +#' @param randomSample How many random samples from hyperparameter space +#' to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' @export setResNet <- function(numLayers = c(1:8), @@ -74,61 +82,62 @@ setResNet <- function(numLayers = c(1:8), residualDropout = c(seq(0, 0.5, 0.05)), hiddenDropout = c(seq(0, 0.5, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator(learningRate='auto', - weightDecay=c(1e-6, 1e-3), - device='cpu', - batchSize=1024, - epochs=30, - seed=NULL), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + device = "cpu", + batchSize = 1024, + epochs = 30, + seed = NULL), hyperParamSearch = "random", randomSample = 100, - randomSampleSeed = NULL) -{ + randomSampleSeed = NULL) { checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) - + checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - + checkIsClass(residualDropout, "numeric") checkHigherEqual(residualDropout, 0) - + checkIsClass(hiddenDropout, "numeric") checkHigherEqual(hiddenDropout, 0) - + checkIsClass(sizeEmbedding, c("integer", "numeric")) checkHigherEqual(sizeEmbedding, 1) - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - paramGrid <- list( - numLayers = numLayers, - sizeHidden = sizeHidden, - hiddenFactor = hiddenFactor, - residualDropout = residualDropout, - hiddenDropout = hiddenDropout, - sizeEmbedding = sizeEmbedding) - + + paramGrid <- list(numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding) paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) - + param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSamples")) + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSamples")) } - + if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "ResNet" + attr(param, "settings")$modelType <- "ResNet" results <- list( fitFunction = "fitEstimator", param = param, diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index be626d5..c72f588 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -2,9 +2,8 @@ #' @description #' Parameter caching for training persistence and continuity #' @export -TrainingCache <- R6::R6Class( +trainingCache <- R6::R6Class( "TrainingCache", - private = list( .paramPersistence = list( gridSearchPredictions = NULL, @@ -12,30 +11,27 @@ TrainingCache <- R6::R6Class( ), .paramContinuity = list(), 
.saveDir = NULL, - writeToFile = function() { saveRDS(private$.paramPersistence, file.path(private$.saveDir)) }, - readFromFile = function() { - private$.paramPersistence <- readRDS(file.path(private$.saveDir)) + private$.paramPersistence <- readRDS(file.path(private$.saveDir)) } ), - public = list( #' @description #' Creates a new training cache #' @param inDir Path to the analysis directory initialize = function(inDir) { private$.saveDir <- file.path(inDir, "paramPersistence.rds") - + if (file.exists(private$.saveDir)) { private$readFromFile() } else { private$writeToFile() } }, - + #' @description #' Checks whether the parameter grid in the model settings is identical to #' the cached parameters. @@ -44,7 +40,7 @@ TrainingCache <- R6::R6Class( isParamGridIdentical = function(inModelParams) { return(identical(inModelParams, private$.paramPersistence$modelParams)) }, - + #' @description #' Saves the grid search results to the training cache #' @param inGridSearchPredictions Grid search predictions @@ -53,7 +49,7 @@ TrainingCache <- R6::R6Class( inGridSearchPredictions private$writeToFile() }, - + #' @description #' Saves the parameter grid to the training cache #' @param inModelParams Parameter grid from the model settings @@ -61,21 +57,22 @@ TrainingCache <- R6::R6Class( private$.paramPersistence$modelParams <- inModelParams private$writeToFile() }, - + #' @description #' Gets the grid search results from the training cache #' @returns Grid search results from the training cache getGridSearchPredictions = function() { return(private$.paramPersistence$gridSearchPredictions) }, - + #' @description #' Check if cache is full #' @returns Boolen isFull = function() { - return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance))))) - }, - + return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, + function(x) !is.null(x$gridPerformance))))) + }, + #' @description #' Gets the last index from the cached grid search #' @returns Last grid search index @@ -88,11 +85,11 @@ TrainingCache <- R6::R6Class( return(1) } else { return(which(sapply(private$.paramPersistence$gridSearchPredictions, - is.null))[1]) + is.null))[1]) } } }, - + #' @description #' Remove the training cache from the analysis path dropCache = function() { diff --git a/R/Transformer.R b/R/Transformer.R index 8b0d0c4..cd4a968 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -24,13 +24,13 @@ #' @param estimatorSettings created with `setEstimator` #' #' @export -setDefaultTransformer <- function(estimatorSettings=setEstimator( - learningRate = 'auto', - weightDecay = 1e-4, - batchSize=512, - epochs=10, - seed=NULL, - device='cpu') +setDefaultTransformer <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-4, + batchSize = 512, + epochs = 10, + seed = NULL, + device = "cpu") ) { transformerSettings <- setTransformer(numBlocks = 3, dimToken = 192, @@ -40,10 +40,10 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( ffnDropout = 0.1, resDropout = 0.0, dimHidden = 256, - estimatorSettings=estimatorSettings, - hyperParamSearch = 'random', + estimatorSettings = estimatorSettings, + hyperParamSearch = "random", randomSample = 1) - attr(transformerSettings, 'settings')$name <- 'defaultTransformer' + attr(transformerSettings, "settings")$name <- "defaultTransformer" return(transformerSettings) } @@ -54,75 +54,81 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( #' #' @param numBlocks number of 
transformer blocks #' @param dimToken dimension of each token (embedding size) -#' @param dimOut dimension of output, usually 1 for binary problems +#' @param dimOut dimension of output, usually 1 for binary +#' problems #' @param numHeads number of attention heads #' @param attDropout dropout to use on attentions #' @param ffnDropout dropout to use in feedforward block #' @param resDropout dropout to use in residual connections #' @param dimHidden dimension of the feedworward block -#' @param dimHiddenRatio dimension of the feedforward block as a ratio of dimToken (embedding size) +#' @param dimHiddenRatio dimension of the feedforward block as a ratio +#' of dimToken (embedding size) #' @param estimatorSettings created with `setEstimator` -#' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' -#' @param randomSample How many samples to use in hyperparameter search if random -#' @param randomSampleSeed Random seed to sample hyperparameter combinations +#' @param hyperParamSearch what kind of hyperparameter search to do, +#' default 'random' +#' @param randomSample How many samples to use in hyperparameter +#' search if random +#' @param randomSampleSeed Random seed to sample hyperparameter +#' combinations #' #' @export -setTransformer <- function(numBlocks = 3, - dimToken = 96, +setTransformer <- function(numBlocks = 3, + dimToken = 96, dimOut = 1, - numHeads = 8, - attDropout = 0.25, + numHeads = 8, + attDropout = 0.25, ffnDropout = 0.25, - resDropout = 0, - dimHidden = 512, + resDropout = 0, + dimHidden = 512, dimHiddenRatio = NULL, - estimatorSettings=setEstimator(weightDecay = 1e-6, - batchSize=1024, - epochs=10, - seed=NULL), + estimatorSettings = setEstimator(weightDecay = 1e-6, + batchSize = 1024, + epochs = 10, + seed = NULL), hyperParamSearch = "random", - randomSample = 1, + randomSample = 1, randomSampleSeed = NULL) { - + checkIsClass(numBlocks, c("integer", "numeric")) checkHigherEqual(numBlocks, 1) - + checkIsClass(dimToken, c("integer", "numeric")) checkHigherEqual(dimToken, 1) - + checkIsClass(dimOut, c("integer", "numeric")) checkHigherEqual(dimOut, 1) - + checkIsClass(numHeads, c("integer", "numeric")) checkHigherEqual(numHeads, 1) checkIsClass(attDropout, c("numeric")) checkHigherEqual(attDropout, 0) - + checkIsClass(ffnDropout, c("numeric")) checkHigherEqual(ffnDropout, 0) - + checkIsClass(resDropout, c("numeric")) checkHigherEqual(resDropout, 0) - + checkIsClass(dimHidden, c("integer", "numeric", "NULL")) if (!is.null(dimHidden)) { checkHigherEqual(dimHidden, 1) } - + checkIsClass(dimHiddenRatio, c("numeric", "NULL")) if (!is.null(dimHiddenRatio)) { checkHigher(dimHiddenRatio, 0) } - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), dimToken %% numHeads != 0))) { + + if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), + dimToken %% numHeads != 0))) { stop(paste( "dimToken needs to divisible by numHeads. 
dimToken =", dimToken, "is not divisible by numHeads =", numHeads @@ -157,21 +163,24 @@ setTransformer <- function(numBlocks = 3, if (!is.null(dimHiddenRatio)) { param <- lapply(param, function(x) { - x$dimHidden <- round(x$dimToken*x$dimHidden, digits = 0) + x$dimHidden <- round(x$dimToken * x$dimHidden, digits = 0) return(x) }) } - - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSample")) + + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSample")) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "Transformer" + attr(param, "settings")$modelType <- "Transformer" results <- list( fitFunction = "fitEstimator", param = param, @@ -186,4 +195,4 @@ setTransformer <- function(numBlocks = 3, class(results) <- "modelSettings" return(results) -} \ No newline at end of file +} diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 98d9ed3..a81dfc1 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -8,29 +8,32 @@ class Data(Dataset): - def __init__(self, - data, - labels=None, - numerical_features=None): + def __init__(self, data, labels=None, numerical_features=None): """ data: path to a covariates dataframe either arrow dataset or sqlite object labels: a list of either 0 or 1, 1 if the patient got the outcome numerical_features: list of indices where the numerical features are """ start = time.time() - if pathlib.Path(data).suffix == '.sqlite': + if pathlib.Path(data).suffix == ".sqlite": data = urllib.parse.quote(data) - data = pl.read_database("SELECT * from covariates", - connection=f"sqlite://{data}").lazy() + data = pl.read_database( + "SELECT * from covariates", connection=f"sqlite://{data}" + ).lazy() else: - data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) - observations = data.select(pl.col('rowId').max()).collect()[0, 0] + data = pl.scan_ipc(pathlib.Path(data).joinpath("covariates/*.arrow")) + observations = data.select(pl.col("rowId").max()).collect()[0, 0] # detect features are numeric if numerical_features is None: - self.numerical_features = data.groupby(by='columnId') \ - .n_unique().filter(pl.col('covariateValue') > 1).select('columnId').collect()['columnId'] + self.numerical_features = ( + data.groupby(by="columnId") + .n_unique() + .filter(pl.col("covariateValue") > 1) + .select("columnId") + .collect()["columnId"] + ) else: - self.numerical_features = pl.Series('num', numerical_features) + self.numerical_features = pl.Series("num", numerical_features) if labels: self.target = torch.as_tensor(labels) @@ -42,42 +45,68 @@ def __init__(self, # create newColumnId from 1 (or zero?) 
until # catColumns # select rowId and newColumnId # rename newColumnId to columnId and sort by it - data_cat = data.filter(~pl.col('columnId') - .is_in(self.numerical_features)) \ - .sort(by='columnId').with_row_count('newColumnId'). \ - with_columns(pl.col('newColumnId').first().over('columnId') - .rank(method="dense")) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId')).sort('rowId') \ - .with_columns(pl.col('rowId') - 1).collect() + data_cat = ( + data.filter(~pl.col("columnId").is_in(self.numerical_features)) + .sort(by="columnId") + .with_row_count("newColumnId") + .with_columns( + pl.col("newColumnId").first().over("columnId").rank(method="dense") + ) + .select(pl.col("rowId"), pl.col("newColumnId").alias("columnId")) + .sort("rowId") + .with_columns(pl.col("rowId") - 1) + .collect() + ) cat_tensor = torch.as_tensor(data_cat.to_numpy()) - tensor_list = torch.split(cat_tensor[:, 1], torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1]. - tolist()) + tensor_list = torch.split( + cat_tensor[:, 1], + torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1].tolist(), + ) # because of subjects without cat features, I need to create a list with all zeroes and then insert # my tensorList. That way I can still index the dataset correctly. total_list = [torch.as_tensor((0,))] * observations - idx = data_cat['rowId'].unique().to_list() + idx = data_cat["rowId"].unique().to_list() for i, i2 in enumerate(idx): total_list[i2] = tensor_list[i] self.cat = torch.nn.utils.rnn.pad_sequence(total_list, batch_first=True) - self.cat_features = data_cat['columnId'].unique() + self.cat_features = data_cat["columnId"].unique() # numerical data, # N x C, dense matrix with values for N patients/visits for C numerical features if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ - with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). 
- rank(method="dense") - 1, pl.col('rowId') - 1) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() - indices = torch.as_tensor(numerical_data.select(['rowId', 'columnId']).to_numpy(), dtype=torch.long) - values = torch.as_tensor(numerical_data.select('covariateValue').to_numpy(), dtype=torch.float) - self.num = torch.sparse_coo_tensor(indices=indices.T, - values=values.squeeze(), - size=(observations, pl.count(self.numerical_features))).to_dense() + numerical_data = ( + data.filter(pl.col("columnId").is_in(self.numerical_features)) + .sort(by="columnId") + .with_row_count("newColumnId") + .with_columns( + pl.col("newColumnId").first().over("columnId").rank(method="dense") + - 1, + pl.col("rowId") - 1, + ) + .select( + pl.col("rowId"), + pl.col("newColumnId").alias("columnId"), + pl.col("covariateValue"), + ) + .collect() + ) + indices = torch.as_tensor( + numerical_data.select(["rowId", "columnId"]).to_numpy(), + dtype=torch.long, + ) + values = torch.as_tensor( + numerical_data.select("covariateValue").to_numpy(), dtype=torch.float + ) + self.num = torch.sparse_coo_tensor( + indices=indices.T, + values=values.squeeze(), + size=(observations, pl.count(self.numerical_features)), + ).to_dense() delta = time.time() - start - print(f'Processed data in {delta:.2f} seconds') + print(f"Processed data in {delta:.2f} seconds") def get_numerical_features(self): return self.numerical_features @@ -90,13 +119,15 @@ def __len__(self): def __getitem__(self, item): if self.num is not None: - batch = {"cat": self.cat[item, :], - "num": self.num[item, :]} + batch = {"cat": self.cat[item, :], "num": self.num[item, :]} else: - batch = {"cat": self.cat[item, :].squeeze(), - "num": None} + batch = {"cat": self.cat[item, :].squeeze(), "num": None} if batch["cat"].dim() == 1 and not isinstance(item, list): batch["cat"] = batch["cat"].unsqueeze(0) - if batch["num"] is not None and batch["num"].dim() == 1 and not isinstance(item, list): + if ( + batch["num"] is not None + and batch["num"].dim() == 1 + and not isinstance(item, list) + ): batch["num"] = batch["num"].unsqueeze(0) return [batch, self.target[item].squeeze()] diff --git a/inst/python/Estimator.py b/inst/python/Estimator.py index cd534e2..983bde6 100644 --- a/inst/python/Estimator.py +++ b/inst/python/Estimator.py @@ -12,10 +12,7 @@ class Estimator: A class that wraps around pytorch models. 
""" - def __init__(self, - model, - model_parameters, - estimator_settings): + def __init__(self, model, model_parameters, estimator_settings): self.seed = estimator_settings["seed"] if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -36,31 +33,51 @@ def __init__(self, self.previous_epochs = int(estimator_settings.get("previous_epochs", 0)) self.model.to(device=self.device) - self.optimizer = estimator_settings["optimizer"](params=self.model.parameters(), - lr=self.learning_rate, - weight_decay=self.weight_decay) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, + ) self.criterion = estimator_settings["criterion"]() - if "metric" in estimator_settings.keys() and estimator_settings["metric"] is not None: + if ( + "metric" in estimator_settings.keys() + and estimator_settings["metric"] is not None + ): self.metric = estimator_settings["metric"] - if type(self.metric) == str: + if isinstance(self.metric, str): if self.metric == "auc": - self.metric = {"name": "auc", - "mode": "max"} + self.metric = {"name": "auc", "mode": "max"} elif self.metric == "loss": - self.metric = {"name": "loss", - "mode": "min"} - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: + self.metric = {"name": "loss", "mode": "min"} + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): estimator_settings["scheduler"]["params"]["mode"] = self.metric["mode"] - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - estimator_settings["early_stopping"]["params"]["mode"] = self.metric["mode"] - - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: - self.scheduler = estimator_settings["scheduler"]["fun"](self.optimizer, - **estimator_settings["scheduler"]["params"]) - - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - self.early_stopper = EarlyStopping(**estimator_settings["early_stopping"]["params"]) + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + estimator_settings["early_stopping"]["params"]["mode"] = self.metric[ + "mode" + ] + + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): + self.scheduler = estimator_settings["scheduler"]["fun"]( + self.optimizer, **estimator_settings["scheduler"]["params"] + ) + + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + self.early_stopper = EarlyStopping( + **estimator_settings["early_stopping"]["params"] + ) else: self.early_stopper = None @@ -69,21 +86,24 @@ def __init__(self, self.learn_rate_schedule = None def fit(self, dataset, test_dataset): - - train_dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) - test_dataloader = DataLoader(dataset=test_dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(test_dataset), - batch_size=self.batch_size, - drop_last=False - )) + train_dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) + test_dataloader = DataLoader( + 
dataset=test_dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(test_dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) trained_epochs = dict() times = list() @@ -105,19 +125,25 @@ def fit(self, dataset, test_dataset): times.append(round(delta_time, 3)) if self.early_stopper: - self.early_stopper(scores['metric']) + self.early_stopper(scores["metric"]) if self.early_stopper.improved: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch if self.early_stopper.early_stop: print("Early stopping, validation metric stopped improving") - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds') - self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds" + ) + self.finish_fit( + all_scores, model_state_dict, trained_epochs, learning_rates + ) return else: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds') + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds" + ) self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) return @@ -170,48 +196,68 @@ def score(self, dataloader): def finish_fit(self, scores, model_state_dict, epoch, learning_rates): if self.metric["mode"] == "max": - best_epoch_index = torch.argmax(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmax( + torch.as_tensor([x["metric"] for x in scores]) + ).item() elif self.metric["mode"] == "min": - best_epoch_index = torch.argmin(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmin( + torch.as_tensor([x["metric"] for x in scores]) + ).item() best_model_state_dict = model_state_dict[best_epoch_index] self.model.load_state_dict(best_model_state_dict) self.best_epoch = epoch[best_epoch_index] - self.best_score = {"loss": scores[best_epoch_index]["loss"], - "auc": scores[best_epoch_index]["auc"]} - self.learn_rate_schedule = learning_rates[:(best_epoch_index+1)] + self.best_score = { + "loss": scores[best_epoch_index]["loss"], + "auc": scores[best_epoch_index]["auc"], + } + self.learn_rate_schedule = learning_rates[: (best_epoch_index + 1)] print(f"Loaded best model (based on AUC) from epoch {self.best_epoch}") print(f"ValLoss: {self.best_score['loss']}") print(f"valAUC: {self.best_score['auc']}") - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): self.best_score[self.metric["name"]] = scores[best_epoch_index]["metric"] print(f"{self.metric['name']}: {self.best_score[self.metric['name']]}") return def print_progress(self, scores, training_loss, delta_time, current_epoch): - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": - print(f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " - f"| Val AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): + print( + f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " + f"| Val 
AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) else: - print(f"Epochs: {current_epoch} " - f"| Val AUC: {scores['auc']:.3f} " - f"| Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} " - f"| Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + print( + f"Epochs: {current_epoch} " + f"| Val AUC: {scores['auc']:.3f} " + f"| Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} " + f"| Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) return def fit_whole_training_set(self, dataset, learning_rates=None): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) if isinstance(learning_rates, list): self.best_epoch = len(learning_rates) elif ~isinstance(learning_rates, list): @@ -221,7 +267,7 @@ def fit_whole_training_set(self, dataset, learning_rates=None): self.best_epoch = self.epochs for epoch in range(self.best_epoch): - self.optimizer.param_groups[0]['lr'] = learning_rates[epoch] + self.optimizer.param_groups[0]["lr"] = learning_rates[epoch] self.fit_epoch(dataloader) return @@ -231,19 +277,21 @@ def save(self, path, name): model_state_dict=self.model.state_dict(), model_parameters=self.model_parameters, estimator_settings=self.estimator_settings, - epoch=self.epochs) - torch.save(out, - f=save_path) + epoch=self.epochs, + ) + torch.save(out, f=save_path) return save_path def predict_proba(self, dataset): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(dataset), - batch_size=self.batch_size, - drop_last=False - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) with torch.no_grad(): predictions = list() self.model.eval() @@ -261,15 +309,11 @@ def predict(self, dataset, threshold=None): # use outcome rate threshold = dataset.target.sum().item() / len(dataset) predicted_class = predictions > threshold + return predicted_class class EarlyStopping: - - def __init__(self, - patience=3, - delta=0, - verbose=True, - mode='max'): + def __init__(self, patience=3, delta=0, verbose=True, mode="max"): self.patience = patience self.counter = 0 self.verbose = verbose @@ -280,9 +324,8 @@ def __init__(self, self.previous_score = 0 self.mode = mode - def __call__(self, - metric): - if self.mode == 'max': + def __call__(self, metric): + if self.mode == "max": score = metric else: score = -1 * metric @@ -293,8 +336,9 @@ def __call__(self, self.counter += 1 self.improved = False if self.verbose: - print(f"Early stopping counter: {self.counter}" - f" out of {self.patience}") + print( + f"Early stopping counter: {self.counter}" f" out of {self.patience}" + ) if self.counter >= self.patience: self.early_stop = True else: @@ -304,12 +348,12 @@ def __call__(self, self.previous_score = score -def batch_to_device(batch, device='cpu'): +def batch_to_device(batch, device="cpu"): if torch.is_tensor(batch): batch = batch.to(device=device) else: for ix, b in enumerate(batch): - if type(b) 
is str: + if isinstance(b, str): key = b b = batch[b] else: @@ -338,7 +382,7 @@ def compute_auc(y_true, y_pred): float: Computed AUC score. """ # Ensure inputs are sorted by predicted score - y_pred_sorted, sorted_indices = torch.sort(y_pred, descending=True) + _, sorted_indices = torch.sort(y_pred, descending=True) y_true_sorted = y_true[sorted_indices] # Get the number of positive and negative examples @@ -351,7 +395,3 @@ def compute_auc(y_true, y_pred): # Compute AUC auc = num_crossings / (n_pos * n_neg) return auc - - - - diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 9d5bd0c..e7141cd 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -2,17 +2,13 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data import DataLoader, BatchSampler, RandomSampler from tqdm import tqdm from Estimator import batch_to_device class ExponentialSchedulerPerBatch(_LRScheduler): - - def __init__(self, optimizer, - end_lr, - num_iter): + def __init__(self, optimizer, end_lr, num_iter): self.end_lr = end_lr self.num_iter = num_iter super(ExponentialSchedulerPerBatch, self).__init__(optimizer, last_epoch=-1) @@ -23,14 +19,9 @@ def get_lr(self): class LrFinder: - - def __init__(self, - model, - model_parameters, - estimator_settings, - lr_settings): + def __init__(self, model, model_parameters, estimator_settings, lr_settings): if lr_settings is None: - lr_settings = {} + lr_settings = {} min_lr = lr_settings.get("min_lr", 1e-7) max_lr = lr_settings.get("max_lr", 1) num_lr = lr_settings.get("num_lr", 100) @@ -50,13 +41,16 @@ def __init__(self, self.smooth = smooth self.divergence_threshold = divergence_threshold - self.optimizer = estimator_settings['optimizer'](params=self.model.parameters(), - lr=self.min_lr) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), lr=self.min_lr + ) - self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) + self.scheduler = ExponentialSchedulerPerBatch( + self.optimizer, self.max_lr, self.num_lr + ) self.criterion = estimator_settings["criterion"]() - self.batch_size = int(estimator_settings['batch_size']) + self.batch_size = int(estimator_settings["batch_size"]) self.losses = None self.loss_index = None @@ -74,7 +68,9 @@ def get_lr(self, dataset): out = self.model(batch[0]) loss = self.criterion(out, batch[1]) if self.smooth is not None and i != 0: - losses[i] = self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + losses[i] = ( + self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + ) else: losses[i] = loss.item() lrs[i] = self.optimizer.param_groups[0]["lr"] @@ -90,13 +86,15 @@ def get_lr(self, dataset): best_loss = losses[i] if losses[i] > (self.divergence_threshold * best_loss): - print(f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}") + print( + f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}" + ) break # find LR where gradient is highest but before global minimum is reached # I added -5 to make sure it is not still in the minimum global_minimum = torch.argmin(losses) - gradient = torch.diff(losses[:(global_minimum - 5)+1]) + gradient = torch.diff(losses[: (global_minimum - 5) + 1]) smallest_gradient = torch.argmin(gradient) suggested_lr = lrs[smallest_gradient] diff --git a/inst/python/MLP.py b/inst/python/MLP.py index 3b8b03b..cd049a6 100644 --- a/inst/python/MLP.py +++ b/inst/python/MLP.py @@ -4,17 +4,18 @@ class MLP(nn.Module): - - def __init__(self, - cat_features: int, - 
num_features: int, - size_embedding: int, - size_hidden: int, - num_layers: int, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - dropout=None, - d_out: int = 1): + def __init__( + self, + cat_features: int, + num_features: int, + size_embedding: int, + size_hidden: int, + num_layers: int, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + dropout=None, + d_out: int = 1, + ): super(MLP, self).__init__() self.name = "MLP" cat_features = int(cat_features) @@ -23,21 +24,25 @@ def __init__(self, size_hidden = int(size_hidden) num_layers = int(num_layers) d_out = int(d_out) - - self.embedding = nn.EmbeddingBag(cat_features + 1, - size_embedding, - padding_idx=0) + + self.embedding = nn.EmbeddingBag( + cat_features + 1, size_embedding, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.num_embedding = NumericalEmbedding(num_features, size_embedding) self.first_layer = nn.Linear(size_embedding, size_hidden) - self.layers = nn.ModuleList(MLPLayer(size_hidden=size_hidden, - normalization=normalization, - activation=activation, - dropout=dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + MLPLayer( + size_hidden=size_hidden, + normalization=normalization, + activation=activation, + dropout=dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) self.head = nn.Linear(size_hidden, d_out) @@ -62,12 +67,14 @@ def forward(self, input): class MLPLayer(nn.Module): - def __init__(self, - size_hidden=64, - normalization=nn.BatchNorm1d, - activation=nn.ReLU, - dropout=0.0, - bias=True): + def __init__( + self, + size_hidden=64, + normalization=nn.BatchNorm1d, + activation=nn.ReLU, + dropout=0.0, + bias=True, + ): super(MLPLayer, self).__init__() self.norm = normalization(size_hidden) self.activation = activation() diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index cef4b49..9df35d8 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -5,22 +5,23 @@ class ResNet(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int = 0, - size_embedding: int = 256, - size_hidden: int = 256, - num_layers: int = 2, - hidden_factor: int = 1, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - hidden_dropout=0, - residual_dropout=0, - dim_out: int = 1, - concat_num=True): + def __init__( + self, + cat_features: int, + num_features: int = 0, + size_embedding: int = 256, + size_hidden: int = 256, + num_layers: int = 2, + hidden_factor: int = 1, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + hidden_dropout=0, + residual_dropout=0, + dim_out: int = 1, + concat_num=True, + ): super(ResNet, self).__init__() - self.name = 'ResNet' + self.name = "ResNet" cat_features = int(cat_features) num_features = int(num_features) size_embedding = int(size_embedding) @@ -28,12 +29,9 @@ def __init__(self, num_layers = int(num_layers) hidden_factor = int(hidden_factor) dim_out = int(dim_out) - - + self.embedding = nn.EmbeddingBag( - num_embeddings=cat_features + 1, - embedding_dim=size_embedding, - padding_idx=0 + num_embeddings=cat_features + 1, embedding_dim=size_embedding, padding_idx=0 ) if num_features != 0 and not concat_num: self.num_embedding = NumericalEmbedding(num_features, size_embedding) @@ -45,9 +43,17 @@ def __init__(self, res_hidden = size_hidden * hidden_factor - self.layers = nn.ModuleList(ResLayer(size_hidden, res_hidden, normalization, - activation, hidden_dropout, residual_dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + ResLayer( + size_hidden, + res_hidden, + 
normalization, + activation, + hidden_dropout, + residual_dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) @@ -58,7 +64,11 @@ def __init__(self, def forward(self, x): x_cat = x["cat"] x_cat = self.embedding(x_cat) - if "num" in x.keys() and x["num"] is not None and self.num_embedding is not None: + if ( + "num" in x.keys() + and x["num"] is not None + and self.num_embedding is not None + ): x_num = x["num"] # take the average af numerical and categorical embeddings x = (x_cat + self.num_embedding(x_num).mean(dim=1)) / 2 @@ -79,14 +89,15 @@ def forward(self, x): class ResLayer(nn.Module): - - def __init__(self, - size_hidden, - res_hidden, - normalization, - activation, - hidden_dropout=None, - residual_dropout=None): + def __init__( + self, + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout=None, + residual_dropout=None, + ): super(ResLayer, self).__init__() self.norm = normalization(size_hidden) @@ -114,10 +125,7 @@ def forward(self, input): class NumericalEmbedding(nn.Module): - def __init__(self, - num_embeddings, - embedding_dim, - bias=True): + def __init__(self, num_embeddings, embedding_dim, bias=True): super(NumericalEmbedding, self).__init__() self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) if bias: @@ -134,6 +142,3 @@ def forward(self, input): if self.bias is not None: x = x + self.bias[None] return x - - - diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 1c95b36..ddbe929 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -18,23 +18,24 @@ def forward(self, x): class Transformer(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int, - num_blocks: int, - dim_token: int, - num_heads: int, - att_dropout, - ffn_dropout, - res_dropout, - dim_hidden: int, - dim_out: int = 1, - head_activation=nn.ReLU, - activation=ReGLU, - ffn_norm=nn.LayerNorm, - head_norm=nn.LayerNorm, - att_norm=nn.LayerNorm): + def __init__( + self, + cat_features: int, + num_features: int, + num_blocks: int, + dim_token: int, + num_heads: int, + att_dropout, + ffn_dropout, + res_dropout, + dim_hidden: int, + dim_out: int = 1, + head_activation=nn.ReLU, + activation=ReGLU, + ffn_norm=nn.LayerNorm, + head_norm=nn.LayerNorm, + att_norm=nn.LayerNorm, + ): super(Transformer, self).__init__() self.name = "Transformer" cat_features = int(cat_features) @@ -44,8 +45,10 @@ def __init__(self, num_heads = int(num_heads) dim_hidden = int(dim_hidden) dim_out = int(dim_out) - - self.categorical_embedding = nn.Embedding(cat_features + 1, dim_token, padding_idx=0) + + self.categorical_embedding = nn.Embedding( + cat_features + 1, dim_token, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) @@ -56,27 +59,35 @@ def __init__(self, self.layers = nn.ModuleList([]) for layer_idx in range(num_blocks): - layer = nn.ModuleDict({ - "attention": nn.MultiheadAttention(dim_token, num_heads, - dropout=att_dropout), - "ffn": FeedForwardBlock(dim_token, dim_hidden, - bias_first=True, - bias_second=True, - dropout=ffn_dropout, - activation=activation), - "attention_res_dropout": nn.Dropout(res_dropout), - "ffn_res_dropout": nn.Dropout(res_dropout), - "ffn_norm": ffn_norm(dim_token) - }) + layer = nn.ModuleDict( + { + "attention": nn.MultiheadAttention( + dim_token, num_heads, dropout=att_dropout + ), + "ffn": FeedForwardBlock( + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + 
dropout=ffn_dropout, + activation=activation, + ), + "attention_res_dropout": nn.Dropout(res_dropout), + "ffn_res_dropout": nn.Dropout(res_dropout), + "ffn_norm": ffn_norm(dim_token), + } + ) if layer_idx != 0: layer["attention_norm"] = att_norm(dim_token) self.layers.append(layer) - self.head = Head(dim_token, - bias=True, - activation=head_activation, - normalization=head_norm, - dim_out=dim_out) + self.head = Head( + dim_token, + bias=True, + activation=head_activation, + normalization=head_norm, + dim_out=dim_out, + ) def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) @@ -84,31 +95,34 @@ def forward(self, x): if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) - mask = torch.cat([mask, torch.zeros([x.shape[0], - num.shape[1]], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [ + mask, + torch.zeros( + [x.shape[0], num.shape[1]], device=mask.device, dtype=mask.dtype + ), + ], + dim=1, + ) else: x = cat x = self.class_token(x) - mask = torch.cat([mask, torch.zeros([x.shape[0], 1], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [mask, torch.zeros([x.shape[0], 1], device=mask.device, dtype=mask.dtype)], + dim=1, + ) for i, layer in enumerate(self.layers): x_residual = self.start_residual(layer, "attention", x) - if i == len(self.layers)-1: + if i == len(self.layers) - 1: dims = x_residual.shape x_residual = layer["attention"]( x_residual[:, -1].view([dims[0], 1, dims[2]]).transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, ) - attn_weights = x_residual[1] x_residual = x_residual[0] x = x[:, -1].view([dims[0], 1, dims[2]]) else: @@ -116,7 +130,7 @@ def forward(self, x): x_residual.transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, )[0] x = self.end_residual(layer, "attention", x, x_residual.transpose(0, 1)) @@ -141,14 +155,15 @@ def end_residual(layer, stage, x, x_residual): class FeedForwardBlock(nn.Module): - - def __init__(self, - dim_token, - dim_hidden, - bias_first=True, - bias_second=True, - dropout=0.0, - activation=ReGLU): + def __init__( + self, + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + dropout=0.0, + activation=ReGLU, + ): super(FeedForwardBlock, self).__init__() self.linear0 = nn.Linear(dim_token, int(dim_hidden * 2), bias=bias_first) self.activation = activation() @@ -164,13 +179,7 @@ def forward(self, x): class Head(nn.Module): - - def __init__(self, - dim_in, - bias, - activation, - normalization, - dim_out): + def __init__(self, dim_in, bias, activation, normalization, dim_out): super(Head, self).__init__() self.normalization = normalization(dim_in) self.activation = activation() @@ -183,17 +192,16 @@ def forward(self, x): x = self.linear(x) return x -class ClassToken(nn.Module): - def __init__(self, - dim_token): +class ClassToken(nn.Module): + def __init__(self, dim_token): super(ClassToken, self).__init__() self.weight = nn.Parameter(torch.empty(dim_token, 1)) nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) def expand(self, dims): new_dims = [1] * (len(dims) - 1) - return self.weight.view(new_dims + [-1]).expand(dims +[-1]) + return self.weight.view(new_dims + [-1]).expand(dims + [-1]) def forward(self, x): return torch.cat([x, self.expand([x.shape[0], 1])], dim=1) diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd index c97f163..9eb3b36 100644 --- a/man/DeepPatientLevelPrediction.Rd +++ b/man/DeepPatientLevelPrediction.Rd 
@@ -5,5 +5,6 @@ \alias{DeepPatientLevelPrediction} \title{DeepPatientLevelPrediction} \description{ -A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +A package containing deep learning extensions for developing +prediction models using data in the OMOP CDM } diff --git a/man/setDefaultResNet.Rd b/man/setDefaultResNet.Rd index c26f354..e1d7f46 100644 --- a/man/setDefaultResNet.Rd +++ b/man/setDefaultResNet.Rd @@ -16,6 +16,6 @@ setDefaultResNet( Creates settings for a default ResNet model } \details{ -Model architecture from by https://arxiv.org/abs/2106.11959 . +Model architecture from by https://arxiv.org/abs/2106.11959 . Hyperparameters chosen by a experience on a few prediction problems. } diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd index fcb33ab..b8424a3 100644 --- a/man/setEstimator.Rd +++ b/man/setEstimator.Rd @@ -28,7 +28,8 @@ setEstimator( \item{epochs}{how many epochs to train for} -\item{device}{what device to train on, can be a string or a function to that evaluates +\item{device}{what device to train on, can be a string or a function to +that evaluates to the device during runtime} \item{optimizer}{which optimizer to use} @@ -37,11 +38,15 @@ to the device during runtime} \item{criterion}{loss function to use} -\item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving} +\item{earlyStopping}{If earlyStopping should be used which stops the +training of your metric is not improving} -\item{metric}{either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping. -Needs to be a list with function `fun`, mode either `min` or `max` and a `name`, -`fun` needs to be a function that takes in prediction and labels and outputs a score.} +\item{metric}{either `auc` or `loss` or a custom metric to use. This is the +metric used for scheduler and earlyStopping. +Needs to be a list with function `fun`, mode either `min` or `max` and a +`name`, +`fun` needs to be a function that takes in prediction and labels and +outputs a score.} \item{seed}{seed to initialize weights of model with} } diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd index f3676cd..a5f96d7 100644 --- a/man/setMultiLayerPerceptron.Rd +++ b/man/setMultiLayerPerceptron.Rd @@ -19,15 +19,18 @@ setMultiLayerPerceptron( \arguments{ \item{numLayers}{Number of layers in network, default: 1:8} -\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)} +\item{sizeHidden}{Amount of neurons in each default layer, +default: 2^(6:9) (64 to 512)} -\item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)} +\item{dropout}{How much dropout to apply after first linear, +default: seq(0, 0.3, 0.05)} -\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} +\item{sizeEmbedding}{Size of embedding default: 2^(6:9) (64 to 512)} \item{estimatorSettings}{settings of Estimator created with `setEstimator`} -\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'} +\item{hyperParamSearch}{Which kind of hyperparameter search to use random +sampling or exhaustive grid search. 
default: 'random'} \item{randomSample}{How many random samples from hyperparameter space to use} diff --git a/man/setResNet.Rd b/man/setResNet.Rd index fbe3d77..4f4c245 100644 --- a/man/setResNet.Rd +++ b/man/setResNet.Rd @@ -21,21 +21,28 @@ setResNet( \arguments{ \item{numLayers}{Number of layers in network, default: 1:16} -\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} +\item{sizeHidden}{Amount of neurons in each default layer, default: +2^(6:10) (64 to 1024)} -\item{hiddenFactor}{How much to grow the amount of neurons in each ResLayer, default: 1:4} +\item{hiddenFactor}{How much to grow the amount of neurons in each +ResLayer, default: 1:4} -\item{residualDropout}{How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)} +\item{residualDropout}{How much dropout to apply after last linear layer +in ResLayer, default: seq(0, 0.3, 0.05)} -\item{hiddenDropout}{How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)} +\item{hiddenDropout}{How much dropout to apply after first linear layer +in ResLayer, default: seq(0, 0.3, 0.05)} -\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} +\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) +'(64 to 512)} \item{estimatorSettings}{created with ```setEstimator```} -\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'} +\item{hyperParamSearch}{Which kind of hyperparameter search to use random +sampling or exhaustive grid search. default: 'random'} -\item{randomSample}{How many random samples from hyperparameter space to use} +\item{randomSample}{How many random samples from hyperparameter space +to use} \item{randomSampleSeed}{Random seed to sample hyperparameter combinations} } diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd index 7de56c4..9afcbd5 100644 --- a/man/setTransformer.Rd +++ b/man/setTransformer.Rd @@ -26,7 +26,8 @@ setTransformer( \item{dimToken}{dimension of each token (embedding size)} -\item{dimOut}{dimension of output, usually 1 for binary problems} +\item{dimOut}{dimension of output, usually 1 for binary +problems} \item{numHeads}{number of attention heads} @@ -38,15 +39,19 @@ setTransformer( \item{dimHidden}{dimension of the feedworward block} -\item{dimHiddenRatio}{dimension of the feedforward block as a ratio of dimToken (embedding size)} +\item{dimHiddenRatio}{dimension of the feedforward block as a ratio +of dimToken (embedding size)} \item{estimatorSettings}{created with `setEstimator`} -\item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'} +\item{hyperParamSearch}{what kind of hyperparameter search to do, +default 'random'} -\item{randomSample}{How many samples to use in hyperparameter search if random} +\item{randomSample}{How many samples to use in hyperparameter +search if random} -\item{randomSampleSeed}{Random seed to sample hyperparameter combinations} +\item{randomSampleSeed}{Random seed to sample hyperparameter +combinations} } \description{ A transformer model diff --git a/man/TrainingCache.Rd b/man/trainingCache.Rd similarity index 79% rename from man/TrainingCache.Rd rename to man/trainingCache.Rd index c82bb23..77d2efc 100644 --- a/man/TrainingCache.Rd +++ b/man/trainingCache.Rd @@ -1,7 +1,7 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/TrainingCache-class.R -\name{TrainingCache} -\alias{TrainingCache} 
+\name{trainingCache}
+\alias{trainingCache}
 \title{TrainingCache}
 \value{
 Whether the provided and cached parameter grid is identical
@@ -18,15 +18,15 @@ Parameter caching for training persistence and continuity
 \section{Methods}{
 \subsection{Public methods}{
 \itemize{
-\item \href{#method-TrainingCache-new}{\code{TrainingCache$new()}}
-\item \href{#method-TrainingCache-isParamGridIdentical}{\code{TrainingCache$isParamGridIdentical()}}
-\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}}
-\item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}}
-\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}}
-\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}}
-\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}}
-\item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}}
-\item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}}
+\item \href{#method-TrainingCache-new}{\code{trainingCache$new()}}
+\item \href{#method-TrainingCache-isParamGridIdentical}{\code{trainingCache$isParamGridIdentical()}}
+\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{trainingCache$saveGridSearchPredictions()}}
+\item \href{#method-TrainingCache-saveModelParams}{\code{trainingCache$saveModelParams()}}
+\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{trainingCache$getGridSearchPredictions()}}
+\item \href{#method-TrainingCache-isFull}{\code{trainingCache$isFull()}}
+\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{trainingCache$getLastGridSearchIndex()}}
+\item \href{#method-TrainingCache-dropCache}{\code{trainingCache$dropCache()}}
+\item \href{#method-TrainingCache-clone}{\code{trainingCache$clone()}}
 }
 }
 \if{html}{\out{<hr>}}
@@ -35,7 +35,7 @@
 \subsection{Method \code{new()}}{
 Creates a new training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$new(inDir)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$new(inDir)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -53,7 +53,7 @@ Creates a new training cache
 Checks whether the parameter grid in the model settings is identical
 to the cached parameters.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -70,7 +70,7 @@ the cached parameters.
 \subsection{Method \code{saveGridSearchPredictions()}}{
 Saves the grid search results to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -87,7 +87,7 @@ Saves the grid search results to the training cache
 \subsection{Method \code{saveModelParams()}}{
 Saves the parameter grid to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -104,7 +104,7 @@ Saves the parameter grid to the training cache
 \subsection{Method \code{getGridSearchPredictions()}}{
 Gets the grid search results from the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
 }
 }
@@ -114,7 +114,7 @@ Gets the grid search results from the training cache
 \subsection{Method \code{isFull()}}{
 Check if cache is full
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isFull()}\if{html}{\out{</div>}}
 }
 }
@@ -124,7 +124,7 @@ Check if cache is full
 \subsection{Method \code{getLastGridSearchIndex()}}{
 Gets the last index from the cached grid search
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>}}
 }
 }
@@ -134,7 +134,7 @@ Gets the last index from the cached grid search
 \subsection{Method \code{dropCache()}}{
 Remove the training cache from the analysis path
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$dropCache()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$dropCache()}\if{html}{\out{</div>}}
 }
 }
@@ -144,7 +144,7 @@ Remove the training cache from the analysis path
 \subsection{Method \code{clone()}}{
 The objects of this class are cloneable with this method.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$clone(deep = FALSE)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$clone(deep = FALSE)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R
index debe95c..f96d7b8 100644
--- a/tests/testthat/test-TrainingCache.R
+++ b/tests/testthat/test-TrainingCache.R
@@ -14,7 +14,7 @@ resNetSettings <- setResNet(numLayers = c(1, 2, 4),
                             randomSample = 3,
                             randomSampleSeed = NULL)

-trainCache <- TrainingCache$new(testLoc)
+trainCache <- trainingCache$new(testLoc)
 paramSearch <- resNetSettings$param

 test_that("Training cache exists on disk", {