diff --git a/.gitignore b/.gitignore index 56db315..94cb359 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ config.yml docs .idea/ renv.lock -extras/ \ No newline at end of file +extras/ +.Renviron \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 0df769f..d4c1a53 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: DeepPatientLevelPrediction Type: Package Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model -Version: 2.0.0 +Version: 2.0.1 Date: 18-04-2023 Authors@R: c( person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 3da704b..70ccf12 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +DeepPatientLevelPrediction 2.0.1 +====================== + - Connection parameter fixed to be in line with newest polars + - Fixed a bug where LRFinder used a hardcoded batch size + - Seed is now used in LRFinder so it's reproducible + - Fixed a bug in NumericalEmbedding + - Fixed a bug for Transformer and numerical features + - Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang) + - Updated installation documentation after feedback from HADES hackathon + - Fixed a bug where order of numeric features wasn't conserved between training and test set + - TrainingCache now only saves prediction dataframe for the best performing model + DeepPatientLevelPrediction 2.0.0 ====================== - New backend which uses pytorch through reticulate instead of torch in R diff --git a/R/Estimator.R b/R/Estimator.R index f804368..7705279 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -310,7 +310,8 @@ gridCvDeep <- function(mappedData, fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] findLR <- modelSettings$estimatorSettings$findLR - for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { + if (!trainCache$isFull()) { + for (gridId in
trainCache$getLastGridSearchIndex():length(paramSearch)) { ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) ParallelLogger::logInfo(paste0("HyperParameters: ")) ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) @@ -363,25 +364,38 @@ gridCvDeep <- function(mappedData, ) } maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) - paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]] - gridSearchPredictons[[gridId]] <- list( prediction = prediction, - param = paramSearch[[gridId]] + param = paramSearch[[gridId]], + gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]]) ) + gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) + gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]] + + # remove all predictions that are not the max performance + indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + for (i in seq_along(gridSearchPredictons)) { + if (!is.null(gridSearchPredictons[[i]])) { + if (i != indexOfMax) { + gridSearchPredictons[[i]]$prediction <- list(NULL) + } + } + } + ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax)) trainCache$saveGridSearchPredictions(gridSearchPredictons) } + } + paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) + # get best params + indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + finalParam <- gridSearchPredictons[[indexOfMax]]$param + + paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) - # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function) - 
paramGridSearch <- lapply(gridSearchPredictons, function(x) { - do.call(PatientLevelPrediction::computeGridPerformance, x) - }) # cvAUCmean, cvAUC, param - - optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance))) - finalParam <- paramGridSearch[[optimalParamInd]]$param - - cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction + # get best CV prediction + cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" ParallelLogger::logInfo("Training final model using optimal parameters") diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index 8577f31..be626d5 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -69,6 +69,13 @@ TrainingCache <- R6::R6Class( return(private$.paramPersistence$gridSearchPredictions) }, + #' @description + #' Check if cache is full + #' @returns Boolean + isFull = function() { + return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance))))) + }, + #' @description #' Gets the last index from the cached grid search #' @returns Last grid search index diff --git a/extras/example.R b/extras/example.R deleted file mode 100644 index 99a6c08..0000000 --- a/extras/example.R +++ /dev/null @@ -1,73 +0,0 @@ -# testing code (requires sequential branch of FeatureExtraction): -# rm(list = ls()) -library(PatientLevelPrediction) -library(DeepPatientLevelPrediction) - -data(plpDataSimulationProfile) -sampleSize <- 1e3 -plpData <- simulatePlpData( - plpDataSimulationProfile, - n = sampleSize - ) - - -populationSet <- PatientLevelPrediction::createStudyPopulationSettings( - requireTimeAtRisk = F, - riskWindowStart = 1, - riskWindowEnd = 365*5) - -# -# modelSettings <- setDefaultTransformer(estimatorSettings = setEstimator( -# learningRate = "auto", -# batchSize=64L, -# epochs = 10L -# )) - -modelSettings <- setDefaultResNet(estimatorSettings = setEstimator( - learningRate = "auto", -
weightDecay = 1e-06, - device="cuda:0", - batchSize=128L, - epochs=50L, - seed=42 -)) - -modelSettings <- setResNet(numLayers = c(1L, 2L), - sizeHidden = 72L, - hiddenFactor = 1L, - residualDropout = 0.0, - hiddenDropout = 0.0, - sizeEmbedding = 64L, - estimatorSettings = setEstimator( - learningRate = 3e-4, - batchSize = 128L, - epochs = 10L, - device = "cpu", - seed = 42 - ), - randomSample = 2) - -res2 <- PatientLevelPrediction::runPlp( - plpData = plpData, - outcomeId = unique(plpData$outcomes$outcomeId)[[1]], - modelSettings = modelSettings, - analysisId = 'Test', - analysisName = 'Testing DeepPlp', - populationSettings = populationSet, - splitSettings = createDefaultSplitSetting(splitSeed = 123), - sampleSettings = createSampleSettings("underSample"), # none - featureEngineeringSettings = createFeatureEngineeringSettings(), # none - preprocessSettings = createPreprocessSettings(normalize = F), - logSettings = createLogSettings(verbosity='TRACE'), - executeSettings = createExecuteSettings( - runSplitData = T, - runSampleData = T, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - ), - saveDirectory = '~/test/resnet/' -) - - diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 15d749b..98d9ed3 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -21,7 +21,7 @@ def __init__(self, if pathlib.Path(data).suffix == '.sqlite': data = urllib.parse.quote(data) data = pl.read_database("SELECT * from covariates", - connection_uri=f"sqlite://{data}").lazy() + connection=f"sqlite://{data}").lazy() else: data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) observations = data.select(pl.col('rowId').max()).collect()[0, 0] @@ -67,7 +67,7 @@ def __init__(self, if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). 
\ + numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). rank(method="dense") - 1, pl.col('rowId') - 1) \ .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 4c24a38..9d5bd0c 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -37,6 +37,7 @@ def __init__(self, smooth = lr_settings.get("smooth", 0.05) divergence_threshold = lr_settings.get("divergence_threshold", 4) torch.manual_seed(seed=estimator_settings["seed"]) + self.seed = estimator_settings["seed"] self.model = model(**model_parameters) if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -55,18 +56,18 @@ def __init__(self, self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) self.criterion = estimator_settings["criterion"]() - self.batch_size = estimator_settings['batch_size'] + self.batch_size = int(estimator_settings['batch_size']) self.losses = None self.loss_index = None def get_lr(self, dataset): batch_index = torch.arange(0, len(dataset), 1).tolist() - + random.seed(self.seed) losses = torch.empty(size=(self.num_lr,), dtype=torch.float) lrs = torch.empty(size=(self.num_lr,), dtype=torch.float) for i in tqdm(range(self.num_lr)): self.optimizer.zero_grad() - random_batch = random.sample(batch_index, 32) + random_batch = random.sample(batch_index, self.batch_size) batch = dataset[random_batch] batch = batch_to_device(batch, self.device) diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index f680eb2..cef4b49 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -130,9 +130,9 @@ def __init__(self, nn.init.kaiming_uniform_(parameter, a=math.sqrt(5)) def forward(self, input): - x = self.weight.unsqueeze(0) * input.unsqueeze(-1) + x = 
self.weight[None] * input[..., None] if self.bias is not None: - x = x + self.bias.unsqueeze(-1) + x = x + self.bias[None] return x diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 5944e1b..1c95b36 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -49,6 +49,9 @@ def __init__(self, if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) + self.use_numerical = True + else: + self.use_numerical = False self.class_token = ClassToken(dim_token) self.layers = nn.ModuleList([]) @@ -78,7 +81,7 @@ def __init__(self, def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) cat = self.categorical_embedding(x["cat"]) - if "num" in x.keys() and self.numerical_embedding is not None: + if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) mask = torch.cat([mask, torch.zeros([x.shape[0], diff --git a/man/TrainingCache.Rd b/man/TrainingCache.Rd index 0a7ec7b..c82bb23 100644 --- a/man/TrainingCache.Rd +++ b/man/TrainingCache.Rd @@ -8,6 +8,8 @@ Whether the provided and cached parameter grid is identical Grid search results from the training cache +Boolean + Last grid search index } \description{ @@ -21,6 +23,7 @@ Parameter caching for training persistence and continuity \item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}} \item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}} \item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}} +\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}} \item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}} \item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}} \item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}} @@ -104,6 +107,16
@@ Gets the grid search results from the training cache \if{html}{\out{