Skip to content

Commit

Permalink
merged with develop
Browse files Browse the repository at this point in the history
  • Loading branch information
egillax committed Dec 6, 2023
2 parents 5836121 + 6207324 commit 1170c83
Show file tree
Hide file tree
Showing 14 changed files with 628 additions and 434 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: DeepPatientLevelPrediction
Type: Package
Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
Version: 2.0.1.9999
Version: 2.0.2.9999
Date: 18-04-2023
Authors@R: c(
person("Egill", "Fridgeirsson", email = "[email protected]", role = c("aut", "cre")),
Expand Down Expand Up @@ -35,7 +35,8 @@ Suggests:
testthat,
PRROC,
ResultModelManager (>= 0.2.0),
DatabaseConnector (>= 6.0.0)
DatabaseConnector (>= 6.0.0),
Andromeda
Remotes:
ohdsi/PatientLevelPrediction,
ohdsi/FeatureExtraction,
Expand Down
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
DeepPatientLevelPrediction 2.0.2
======================
- Ensure output from predict_proba is numeric instead of 1d array
- Refactoring: Move cross-validation to a separate function
- Refactoring: Move paramsToTune to a separate function
- linting: Enforcing HADES style
- Calculate AUC ourselves with torch, get rid of scikit-learn dependancy
- added Andromeda to dev dependencies


DeepPatientLevelPrediction 2.0.1
======================
- Connection parameter fixed to be in line with newest polars
Expand Down
10 changes: 5 additions & 5 deletions R/Estimator.R
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ predictDeepEstimator <- function(plpModel,
} else {
prediction$value <- plpModel$model$predict_proba(data)
}

prediction$value <- as.numeric(prediction$value)
attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType")

return(prediction)
Expand Down Expand Up @@ -344,9 +344,9 @@ gridCvDeep <- function(mappedData,
fitParams,
paramSearch[[gridId]])
currentEstimatorSettings$modelType <- modelSettings$modelType
currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]]
currentModelParams$catFeatures <- dataset$get_cat_features()$max()
currentModelParams$numFeatures <-
dataset$get_numerical_features()$shape[[1]]
dataset$get_numerical_features()$max()
if (findLR) {
lrFinder <- createLRFinder(modelType = modelSettings$modelType,
modelParameters = currentModelParams,
Expand Down Expand Up @@ -417,8 +417,8 @@ gridCvDeep <- function(mappedData,
dir.create(file.path(modelLocation), recursive = TRUE)
}

modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]]
modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]]
modelParams$catFeatures <- dataset$get_cat_features()$max()
modelParams$numFeatures <- dataset$get_numerical_features()$max()


estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings,
Expand Down
7 changes: 5 additions & 2 deletions R/LRFinder.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,17 @@ createLRFinder <- function(modelType,
lrFinderClass <-
reticulate::import_from_path("LrFinder", path = path)$LrFinder



estimatorSettings <- evalEstimatorSettings(estimatorSettings)

model <- reticulate::import_from_path(modelType, path = path)[[modelType]]
modelParameters <- camelCaseToSnakeCaseNames(modelParameters)
estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings)
estimatorSettings <- evalEstimatorSettings(estimatorSettings)
browser()
estimator <- createEstimator(modelType = estimatorSettings$modelType,
modelParameters = modelParameters,
estimatorSettings = estimatorSettings)

if (!is.null(lrSettings)) {
lrSettings <- camelCaseToSnakeCaseNames(lrSettings)
}
Expand Down
2 changes: 1 addition & 1 deletion R/TrainingCache-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ trainingCache <- R6::R6Class(
return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions,
function(x) !is.null(x$gridPerformance)))))
},

#' @description
#' Gets the last index from the cached grid search
#' @returns Last grid search index
Expand Down
8 changes: 6 additions & 2 deletions inst/python/Dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, data, labels=None, numerical_features=None):
data_cat = (
data.filter(~pl.col("columnId").is_in(self.numerical_features))
.select(pl.col("rowId"), pl.col("columnId"))
.sort("rowId")
.sort(["rowId", "columnId"])
.with_columns(pl.col("rowId") - 1)
.collect()
)
Expand All @@ -69,9 +69,13 @@ def __init__(self, data, labels=None, numerical_features=None):
if pl.count(self.numerical_features) == 0:
self.num = None
else:
map_numerical = dict(zip(self.numerical_features.sort().to_list(),
list(range(len(self.numerical_features)))))

numerical_data = (
data.filter(pl.col("columnId").is_in(self.numerical_features))
.sort(by="columnId")
.with_columns(pl.col("columnId").replace(map_numerical),
pl.col("rowId") - 1)
.select(
pl.col("rowId"),
pl.col("columnId"),
Expand Down
61 changes: 38 additions & 23 deletions tests/testthat/setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ connectionDetails <- Eunomia::getEunomiaConnectionDetails()
Eunomia::createCohorts(connectionDetails)

covSet <- FeatureExtraction::createCovariateSettings(
useDemographicsGender = T,
useDemographicsAge = T,
useDemographicsRace = T,
useDemographicsEthnicity = T,
useDemographicsAgeGroup = T,
useConditionGroupEraLongTerm = T,
useDrugEraStartLongTerm = T,
useDemographicsGender = TRUE,
useDemographicsAge = TRUE,
useDemographicsRace = TRUE,
useDemographicsEthnicity = TRUE,
useDemographicsAgeGroup = TRUE,
useConditionGroupEraLongTerm = TRUE,
useDrugEraStartLongTerm = TRUE,
endDays = -1
)

Expand All @@ -30,20 +30,36 @@ databaseDetails <- PatientLevelPrediction::createDatabaseDetails(
cdmDatabaseName = "eunomia"
)

restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings(
firstExposureOnly = T,
washoutPeriod = 365
)
restrictPlpDataSettings <-
PatientLevelPrediction::createRestrictPlpDataSettings(
firstExposureOnly = TRUE,
washoutPeriod = 365
)

plpData <- PatientLevelPrediction::getPlpData(
databaseDetails = databaseDetails,
restrictPlpDataSettings = restrictPlpDataSettings,
covariateSettings = covSet
)

# add age squared so I have more than one numerical feature
plpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>%
dplyr::rows_append(data.frame(
covariateId = 2002,
covariateName = "Squared age",
analysisId = 2,
conceptId = 0), copy = TRUE)

squaredAges <- plpData$covariateData$covariates %>%
dplyr::filter(covariateId == 1002) %>%
dplyr::mutate(covariateId = 2002,
covariateValue = .data$covariateValue**2)

plpData$covariateData$covariates <- plpData$covariateData$covariates %>%
dplyr::rows_append(squaredAges)

populationSet <- PatientLevelPrediction::createStudyPopulationSettings(
requireTimeAtRisk = F,
requireTimeAtRisk = FALSE,
riskWindowStart = 1,
riskWindowEnd = 365
)
Expand All @@ -66,33 +82,32 @@ mappedData <- PatientLevelPrediction::MapIds(
)

path <- system.file("python", package = "DeepPatientLevelPrediction")
Dataset <- reticulate::import_from_path("Dataset", path = path)
datasetClass <- reticulate::import_from_path("Dataset", path = path)
if (is.null(attributes(mappedData)$path)) {
# sqlite object
attributes(mappedData)$path <- attributes(mappedData)$dbname
}

dataset <- Dataset$Data(
dataset <- datasetClass$Data(
data = reticulate::r_to_py(normalizePath(attributes(mappedData)$path)),
labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
)
small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3)))
smallDataset <- torch$utils$data$Subset(dataset,
(1:round(length(dataset) / 3)))

modelSettings <- setResNet(
numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
residualDropout = c(0, 0.2), hiddenDropout = 0,
sizeEmbedding = 16, hyperParamSearch = "random",
randomSample = 2,
setEstimator(epochs=1,
setEstimator(epochs = 1,
learningRate = 3e-4)
)
fitEstimatorPath <- file.path(testLoc, 'fitEstimator')
fitEstimatorPath <- file.path(testLoc, "fitEstimator")
if (!dir.exists(fitEstimatorPath)) {
dir.create(fitEstimatorPath)
}
fitEstimatorResults <- fitEstimator(trainData$Train,
modelSettings = modelSettings,
analysisId = 1,
analysisPath = fitEstimatorPath)


fitEstimatorResults <- fitEstimator(trainData$Train,
modelSettings = modelSettings,
analysisId = 1,
analysisPath = fitEstimatorPath)
106 changes: 87 additions & 19 deletions tests/testthat/test-Dataset.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
test_that("number of num and cat features sum correctly", {
testthat::expect_equal(
length(dataset$get_numerical_features()) + length(dataset$get_cat_features()),
dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>%
length(dataset$get_numerical_features()) +
length(dataset$get_cat_features()),
dplyr::n_distinct(mappedData$covariates %>%
dplyr::collect() %>%
dplyr::pull(covariateId))
)
})
Expand All @@ -12,46 +14,112 @@ test_that("length of dataset correct", {
expect_equal(length(dataset), dataset$num$shape[0])
expect_equal(
length(dataset),
dplyr::n_distinct(mappedData$covariates %>%
dplyr::collect() %>% dplyr::pull(rowId))
dplyr::n_distinct(mappedData$covariates %>%
dplyr::collect() %>%
dplyr::pull(.data$rowId))
)
})

test_that(".getbatch works", {
batch_size <- 16
batchSize <- 16
# get one sample
out <- dataset[10]

# output should be a list of two items, the batch in pos 1 and targets in pos 2,

# output should be a list of two items,
# the batch in pos 1 and targets in pos 2,
# the batch is what goes to the model
expect_equal(length(out), 2)

# targets should be binary
expect_true(out[[2]]$item() %in% c(0, 1))

# shape of batch is correct
expect_equal(length(out[[1]]), 2)
expect_equal(out[[1]]$cat$shape[0], 1)
expect_equal(out[[1]]$num$shape[0], 1)

# shape of target
expect_equal(out[[2]]$shape$numel(), 1)

# get a whole batch
out <- dataset[10:(10 + batch_size)]
out <- dataset[10:(10 + batchSize)]

expect_equal(length(out), 2)
expect_true(all(out[[2]]$numpy() %in% c(0, 1)))

expect_equal(length(out[[1]]), 2)
expect_equal(out[[1]]$cat$shape[0], 16)
expect_equal(out[[1]]$num$shape[0], 16)

expect_equal(out[[2]]$shape[0], 16)
})

test_that("Column order is preserved in presence of missing features", {
# important for both external validation and transfer learning


test_that("Column order is preserved when features are missing", {
# important for transfer learning and external validation

reducedCovData <- Andromeda::copyAndromeda(trainData$Train$covariateData)

# remove one numerical and one categorical
numFeature <- 1002 # continous age
catFeature <- 4285898210 # a random common cat feature
reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))

mappedReducedData <- PatientLevelPrediction::MapIds(
reducedCovData,
mapping = mappedData$mapping
)

catColumn <- mappedData$mapping %>%
dplyr::filter(covariateId == catFeature) %>%
dplyr::pull("columnId")
numColumn <- mappedData$mapping %>%
dplyr::filter(covariateId == numFeature) %>%
dplyr::pull("columnId")

reducedDataset <- datasetClass$Data(
data =
reticulate::r_to_py(normalizePath(attributes(mappedReducedData)$dbname)),
labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
numerical_features = dataset$numerical_features$to_list()
)

# should have same number of columns
expect_equal(dataset$num$shape[[1]], reducedDataset$num$shape[[1]])

# all zeros in column with removed feature, -1 because r to py
expect_true(reducedDataset$num[, numColumn - 1]$sum()$item() == 0)

# all other columns are same
indexReduced <- !torch$isin(torch$arange(reducedDataset$num$shape[[1]]),
numColumn - 1)
index <- !torch$isin(torch$arange(dataset$num$shape[[1]]),
numColumn - 1)

expect_equal(reducedDataset$num[, indexReduced]$mean()$item(),
dataset$num[, index]$mean()$item())

# cat data should have same counts of all columnIds
# expect the one that was removed
# not same counts for removed feature
expect_false(isTRUE(all.equal((reducedDataset$cat == catColumn)$sum()$item(),
(dataset$cat == catColumn)$sum()$item())))

# get counts
counts <- as.array(torch$unique(dataset$cat,
return_counts = TRUE)[[2]]$numpy())
counts <- counts[-(catColumn + 1)] # +1 because py_to_r
counts <- counts[-1]

reducedCounts <- as.array(torch$unique(reducedDataset$cat,
return_counts = TRUE)[[2]]$numpy())
reducedCounts <- reducedCounts[-(catColumn + 1)] # +1 because py_to_r
reducedCounts <- reducedCounts[-1]

expect_false(isTRUE(all.equal(counts, reducedCounts)))
expect_equal(dataset$get_cat_features()$max(),
reducedDataset$get_cat_features()$max())

})
Loading

0 comments on commit 1170c83

Please sign in to comment.