diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml new file mode 100644 index 0000000..1d99ae6 --- /dev/null +++ b/.github/workflows/check-standard.yaml @@ -0,0 +1,58 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + workflow_dispatch: + pull_request: + +name: check-standard + +permissions: read-all + +jobs: + check-standard: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + dependencies: '"hard"' + extra-packages: | + any::rcmdcheck + any::testthat + any::knitr + any::rmarkdown + local::. + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + args: 'c("--no-manual","--no-build-vignettes","--no-examples")' + build_args: 'c("--no-manual","--no-build-vignettes", "--no-resave-data")' + error-on: '"error"' \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 43518e5..d1737d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: nichenetr Type: Package Title: NicheNet: Modeling Intercellular Communication by Linking Ligands to Target Genes -Version: 2.1.6 +Version: 2.1.7 Authors@R: c(person("Robin", "Browaeys", role = c("aut")), person("Chananchida", "Sang-aram", role = c("aut", "cre"), email = "chananchida.sangaram@ugent.be")) Description: This package allows you the investigate intercellular communication from a computational perspective. More specifically, it allows to investigate how interacting cells influence each other's gene expression. Functionalities of this package (e.g. including predicting extracellular upstream regulators and their affected target genes) build upon a probabilistic model of ligand-target links that was inferred by data-integration. @@ -12,6 +12,7 @@ URL: https://github.com/saeyslab/nichenetr BugReports: https://github.com/saeyslab/nichenetr/issues RoxygenNote: 7.3.1 Depends: R (>= 3.0.0) +biocViews: Imports: tidyverse, data.table, @@ -27,7 +28,6 @@ Imports: fdrtool, ROCR, caTools, - limma, Hmisc, caret, randomForest, @@ -52,6 +52,7 @@ Suggests: rmarkdown, testthat, doMC, + limma, mco, parallel, covr, diff --git a/NAMESPACE b/NAMESPACE index 4808f39..3bc51b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -185,7 +185,6 @@ importFrom(igraph,get.data.frame) importFrom(igraph,graph_from_adjacency_matrix) importFrom(igraph,page_rank) importFrom(igraph,sample_degseq) -importFrom(limma,wilcoxGST) importFrom(magrittr,set_colnames) importFrom(magrittr,set_rownames) importFrom(purrr,reduce) diff --git a/R/characterization_data_sources.R b/R/characterization_data_sources.R index e4f2638..0537cb9 100644 --- a/R/characterization_data_sources.R +++ b/R/characterization_data_sources.R @@ -353,10 +353,15 @@ evaluate_model = function(parameters_setting, lr_network, sig_network, gr_networ ligand_importances$spearman[is.na(ligand_importances$spearman)] = 0 ligand_importances$pearson_log_pval[is.na(ligand_importances$pearson_log_pval)] = 0 ligand_importances$spearman_log_pval[is.na(ligand_importances$spearman_log_pval)] = 0 - ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 ligand_importances$pearson_log_pval[is.infinite(ligand_importances$pearson_log_pval)] = 10000 ligand_importances$spearman_log_pval[is.infinite(ligand_importances$spearman_log_pval)] = 10000 - ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + + if ("mean_rank_GST_log_pval" %in% colnames(ligand_importances)){ + ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 + ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + } else{ + warning("mean_rank_GST_log_pval not in ligand_importances; do you have limma installed?") + } all_importances = ligand_importances %>% select_if(.predicate = function(x){sum(is.na(x)) == 0}) # all_importances = full_join(ligand_importances, ligand_importances_glm, by = c("setting","test_ligand","ligand")) %>% full_join(ligand_importances_discrete, by = c("setting","test_ligand", "ligand")) diff --git a/R/evaluate_model_ligand_prediction.R b/R/evaluate_model_ligand_prediction.R index ad8e14e..120f810 100644 --- a/R/evaluate_model_ligand_prediction.R +++ b/R/evaluate_model_ligand_prediction.R @@ -1,828 +1,826 @@ -#' @title Convert settings to correct settings format for ligand prediction. -#' -#' @description \code{convert_settings_ligand_prediction} Converts settings to correct settings format for ligand activity prediction. In this prediction problem, ligands (out of a set of possibly active ligands) will be ranked based on feature importance scores. The format can be made suited for: 1) validation of ligand activity state prediction by calculating individual feature importane scores or 2) feature importance based on models with embedded feature importance determination; applications in which ligands need to be scores based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. -#' -#' @usage -#' convert_settings_ligand_prediction(settings, all_ligands, validation = TRUE, single = TRUE) -#' -#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. -#' @param all_ligands A character vector of possible ligands that will be considered for the ligand activity state prediction. -#' @param validation TRUE if seetings need to be prepared for validation of ligand activity state predictions (this implies that the true active ligand of a setting is known); FALSE for application purposes when the true active ligand(s) is/are not known. -#' @param single TRUE if feature importance scores for ligands will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. - -#' @return A list with following elements: $name, $ligand: name of active ligand(s) (only if validation is TRUE), $from (ligand(s) that will be tested for activity prediction), $response -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) -#' ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, ligands, validation = TRUE, single = TRUE) -#' } -#' @export -#' -#' -convert_settings_ligand_prediction = function(settings,all_ligands,validation = TRUE, single = TRUE){ - - # input check - if(!is.list(settings)) - stop("settings should be a list") - if(!is.character(all_ligands)) - stop("all_ligands should be a character vector") - if(!is.logical(validation) | length(validation) != 1) - stop("validation should be TRUE or FALSE") - if(!is.logical(single) | length(single) != 1) - stop("single should be TRUE or FALSE") - - requireNamespace("dplyr") - - new_settings = list() - if (validation == TRUE && single == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - for (k in 1:length(all_ligands)){ - test_ligand = all_ligands[[k]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_validation(setting,test_ligand)) - } - } - } else if (validation == TRUE && single == FALSE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_validation(setting,all_ligands)) - } - } else if (validation == FALSE && single == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - for (k in 1:length(all_ligands)){ - test_ligand = all_ligands[[k]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_ligand)) - } - } - } else if (validation == FALSE && single == FALSE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_ligands)) - } - } - return(new_settings %>% unlist(recursive = FALSE)) -} -#' @title Get ligand importances based on target gene prediction performance of single ligands. -#' -#' @description \code{get_single_ligand_importances} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. -#' -#' @usage -#' get_single_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @inheritParams evaluate_target_prediction -#' -#' @return A data.frame with for each ligand - data set combination, classification evaluation metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' print(head(ligand_importances)) -#' } -#' @export -#' -get_single_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - metrics = evaluate_target_prediction(setting, ligand_target_matrix, ligands_position) - metrics = metrics %>% rename(test_ligand = ligand) - if (known == TRUE){ - true_ligand = setting$ligand - metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) - metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) - } - return(metrics) -} -#' @title Get ligand importances from a multi-ligand classfication model. -#' -#' @description \code{get_multi_ligand_importances} A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_glm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances,ligand_target_matrix, algorithm = "glm")) -#' print(head(ligand_importances_glm)) -#' } -#' @export -#' -get_multi_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - output = evaluate_multi_ligand_target_prediction(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors, continuous) - metrics = output$var_imps - metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, importance) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, importance) - return(metrics) - -} -#' @title Evaluation of ligand activity prediction based on ligand importance scores. -#' -#' @description \code{evaluate_importances_ligand_prediction} Evaluate how well a trained model of ligand importance scores is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. A classificiation algorithm chosen by the user is trained to construct one model based on the ligand importance scores of all ligands of interest (ligands importance scores are considered as features). Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. -#' -#' @usage -#' evaluate_importances_ligand_prediction(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) -#' -#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' -#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each ligand-setting combination the ligand importance scores for the individual importance scores, the complete model of importance scores and the ligand activity as well (TRUE or FALSE); $model: the caret model object that can be used on new importance scores to predict the ligand activity state. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom limma wilcoxGST -#' @import caret -#' @importFrom purrr safely -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") -#' print(head(evaluation)) -#' } -#' @export -#' -evaluate_importances_ligand_prediction = function(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ - if (!is.data.frame(importances)) - stop("importances must be a data frame") - if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) - stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") - if(normalization != "mean" & normalization != "median") - stop("normalization should be 'mean' or 'median'") - if(!is.character(algorithm)) - stop("algorithm should be a character vector") - if(!is.logical(var_imps) | length(var_imps) > 1) - stop("var_imps should be a logical vector: TRUE or FALSE") - if(!is.logical(cv) | length(cv) > 1) - stop("cv should be a logical vector: TRUE or FALSE") - if(!is.numeric(cv_number) | length(cv_number) > 1) - stop("cv_number should be a numeric vector of length 1") - if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) - stop("cv_repeats should be a numeric vector of length 1") - if(!is.logical(parallel) | length(parallel) > 1) - stop("parallel should be a logical vector: TRUE or FALSE") - if(!is.numeric(n_cores) | length(n_cores) > 1) - stop("n_cores should be a numeric vector of length 1") - if(!is.logical(ignore_errors) | length(ignore_errors) > 1) - stop("ignore_errors should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - -# importances = importances %>% tidyr::drop_na() - added = is_ligand_active(importances) - importances = importances %>% mutate(class = added) - - if (normalization == "mean"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "median"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) - } - - response_vector = importances$class %>% make.names() %>% as.factor() - train_data = normalized_importances %>% mutate(obs = response_vector) %>% data.frame() - - output = wrapper_caret_classification(train_data,algorithm,TRUE,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = bind_cols(importances %>% select(setting,ligand,test_ligand,class), normalized_importances),ignore_errors,return_model = TRUE) - return(output) -} -#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: aggregate all datasets. -#' -#' @description \code{evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. -#' -#' @usage -#' evaluate_single_importances_ligand_prediction(importances,normalization) -#' -#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) or "no" (use unnormalized feature importance scores - only recommended when evaluating ligand activity prediction on individual datasets) -#' -#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom limma wilcoxGST -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_single_importances_ligand_prediction(ligand_importances,normalization = "median") -#' print(head(evaluation)) -#' } -#' @export -#' -evaluate_single_importances_ligand_prediction = function(importances,normalization){ - if (!is.data.frame(importances)) - stop("importances must be a data frame") - if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) - stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") - if(normalization != "mean" & normalization != "median" & normalization != "no") - stop("normalization should be 'mean' or 'median' or 'no'") - - requireNamespace("dplyr") - importances0 = importances %>% select(-setting,-ligand,-test_ligand) -# importances = importances %>% tidyr::drop_na() - added = is_ligand_active(importances) - - if (nrow(importances) == 0){ - performances = lapply(importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) - output = tibble(importance_measure = names(performances)) - performances = bind_rows(performances) - return(bind_cols(output,performances)) - } - - importances = importances %>% select_if(.predicate = function(x) { - sum(is.na(x)) == 0 - }) - - if (normalization == "mean"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "median"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "no") { - normalized_importances = importances %>% select(-c(setting,test_ligand,ligand)) - } - - performances = lapply(normalized_importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) - output = tibble(importance_measure = names(performances)) - performances = bind_rows(performances) - return(bind_cols(output,performances)) -} -#' @title Prediction of ligand activity prediction by a model trained on ligand importance scores. -#' -#' @description \code{model_based_ligand_activity_prediction} Predict the activity state of a ligand based on a classification model that was trained to predict ligand activity state based on ligand importance scores. -#' -#' @usage -#' model_based_ligand_activity_prediction(importances, model, normalization) -#' -#' @param model A model object of a classification object as e.g. generated via caret. -#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) -#' -#' @return A data frame containing the ligand importance scores and the probabilities that according to the trained model, the ligands are active based on their importance scores. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") -#' -#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) -#' activity_predictions = model_based_ligand_activity_prediction(ligand_importances, evaluation$model,"median") -#' print(head(activity_predictions)) -#' } -#' -#' @export -#' -model_based_ligand_activity_prediction = function(importances, model, normalization){ - if (!is.list(model)) - stop("model must be a list, derived as model object from model training (e.g. via the caret package)") - if(model$finalModel$problemType != "Classification" & model$finalModel$problemType != "Regression") - stop("model should be model object (derived from model training)") - if (!is.data.frame(importances)) - stop("importances must be a data frame") - if(!is.character(importances$setting) | !is.character(importances$test_ligand)) - stop("importances$setting and importances$test_ligand should be character vectors") - if(normalization != "mean" & normalization != "median") - stop("normalization should be 'mean' or 'median'") - - requireNamespace("dplyr") - -# importances = importances %>% tidyr::drop_na() - - if (normalization == "mean"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) - } else if (normalization == "median"){ - normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) - } - - final_model_predictions = predict(model,newdata = normalized_importances, type = "prob") - final_model_predictions = final_model_predictions %>% as_tibble() %>% mutate(active = TRUE. > FALSE.) %>% select(-FALSE.) %>% rename(model = TRUE.) - return(bind_cols(importances,final_model_predictions) %>% as_tibble()) - -} -#' @title Get ligand importances from a multi-ligand trained random forest model. -#' -#' @description \code{get_multi_ligand_rf_importances} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_rf_importances(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. -#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). -#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @importFrom randomForest randomForest importance -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances,ligand_target_matrix, ntrees = 100, mtry = 2)) -#' print(head(ligand_importances_rf)) -#' } -#' -#' @export -#' -get_multi_ligand_rf_importances = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - if(ntrees <= 1) - stop("ntrees should be higher than 1") - if(mtry <= 1) - stop("mtry should be higher than 1") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - ligands_oi = setting$from - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - if(is.double(prediction_matrix) == FALSE){ - convert_categorical_factor = function(x){ - x = x %>% make.names() %>% as.factor() - } - prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) - } - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - rf_model = randomForest::randomForest(y = train_data$obs, - x = train_data[,-(which(colnames(train_data) == "obs"))], - ntree = ntrees, - mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), - importance = TRUE - ) - - metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, MeanDecreaseAccuracy, MeanDecreaseGini) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, MeanDecreaseAccuracy, MeanDecreaseGini) - return(metrics) - - -} -#' @title Get ligand importances based on target gene value prediction performance of single ligands (regression). -#' -#' @description \code{get_single_ligand_importances_regression} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. Response: continuous values associated to a gene, e.g. a log fold change value. -#' -#' @usage -#' get_single_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @inheritParams evaluate_target_prediction_regression -#' -#' @return A data.frame with for each ligand - data set combination, regression model fit metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction_regression}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances_regression,ligand_target_matrix)) -#' print(head(ligand_importances)) -#' } -#' @export -#' -get_single_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - metrics = evaluate_target_prediction_regression(setting, ligand_target_matrix, ligands_position) - metrics = metrics %>% rename(test_ligand = ligand) - if (known == TRUE){ - true_ligand = setting$ligand - metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) - metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) - } - return(metrics) -} -#' @title Get ligand importances from a multi-ligand regression model. -#' -#' @description \code{get_multi_ligand_importances_regression} A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (respone: e.g. absolute value of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_lm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances_regression,ligand_target_matrix, algorithm = "lm")) -#' print(head(ligand_importances_lm)) -#' } -#' @export -#' -get_multi_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - output = evaluate_multi_ligand_target_prediction_regression(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors) - metrics = output$var_imps - metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, importance) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, importance) - return(metrics) - -} -#' @title Get ligand importances from a multi-ligand trained random forest regression model. -#' -#' @description \code{get_multi_ligand_rf_importances_regression} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (response: e.g. absolute values of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. -#' -#' @usage -#' get_multi_ligand_rf_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. -#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). -#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. -#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. -#' -#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). -#' -#' @importFrom randomForest randomForest importance -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances_regression,ligand_target_matrix, ntrees = 100, mtry = 2)) -#' print(head(ligand_importances_rf)) -#' } -#' -#' @export -#' -get_multi_ligand_rf_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE){ - - if(!is.logical(known) | length(known) > 1) - stop("known should be a logical vector: TRUE or FALSE") - if(!is.logical(filter_genes) | length(filter_genes) > 1) - stop("filter_genes should be a logical vector: TRUE or FALSE") - if(ntrees <= 1) - stop("ntrees should be higher than 1") - if(mtry <= 1) - stop("mtry should be higher than 1") - - requireNamespace("dplyr") - - if (filter_genes == TRUE){ - ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) - } - - setting_name = setting$name - ligands_oi = setting$from - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - response_df = tibble(gene = names(response_vector), response = response_vector) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - rf_model = randomForest::randomForest(y = train_data$obs, - x = train_data[,-(which(colnames(train_data) == "obs"))], - ntree = ntrees, - mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), - importance = TRUE - ) - - metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) - - if (known == TRUE){ - true_ligand = setting$ligand - metrics = metrics %>% mutate(ligand = true_ligand) - metrics = metrics %>% select(setting, test_ligand, ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) - return(metrics) - } - metrics = metrics %>% select(setting, test_ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) - return(metrics) -} -#' @title Convert settings to correct settings format for TF prediction. -#' -#' @description \code{convert_settings_tf_prediction} Converts settings to correct settings format for TF activity prediction. In this prediction problem, TFs (out of a set of possibly active TFs) will be ranked based on feature importance scores. The format can be made suited for applications in which TFs need to be scored based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. Remark that upstream regulator analysis for TFs here is experimental and was not thoroughly validated in the study accompanying this package. -#' -#' @usage -#' convert_settings_tf_prediction(settings, all_tfs, single = TRUE) -#' -#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the tf(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. -#' @param all_tfs A character vector of possible tfs that will be considered for the tf activity state prediction. -#' @param single TRUE if feature importance scores for tfs will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. - -#' @return A list with following elements: $name, $tf: name of active tf(s) (only if validation is TRUE), $from (tf(s) that will be tested for activity prediction), $response -#' -#' @examples -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_tf_pred = convert_settings_tf_prediction(settings, all_tfs = c("SMAD1","STAT1","RELA"), single = TRUE) -#' # show how this function can be used to predict activities of TFs -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' tf_target = construct_tf_target_matrix(weighted_networks, tfs_as_cols = TRUE, standalone_output = TRUE) -#' tf_importances = dplyr::bind_rows(lapply(settings_tf_pred,get_single_ligand_importances,tf_target,known = FALSE)) -#' print(head(tf_importances)) -#' -#' @export -#' -#' -convert_settings_tf_prediction = function(settings,all_tfs, single = TRUE){ - - # input check - if(!is.list(settings)) - stop("settings should be a list") - if(!is.character(all_tfs)) - stop("all_tfs should be a character vector") - if(!is.logical(single) | length(single) != 1) - stop("single should be TRUE or FALSE") - - requireNamespace("dplyr") - - new_settings = list() - if (single == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - for (k in 1:length(all_tfs)){ - test_tf = all_tfs[[k]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_tf)) - } - } - } else if (single == FALSE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_tfs)) - } - } - return(new_settings %>% unlist(recursive = FALSE)) -} -#' @title Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands. -#' -#' @description \code{convert_expression_settings_evaluation} Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands.(useful for applications when a lot of ligands are potentially active, a lot of settings need to be predicted and a multi-ligand model is trained). -#' -#' @usage -#' convert_settings_topn_ligand_prediction(setting, importances, model, n, normalization) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) -#' @param n The top n number of ligands according to the ligand activity state prediction model will be considered as potential ligand for the generation of a new setting. -#' @inheritParams model_based_ligand_activity_prediction - -#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") -#' -#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) -#' settings = lapply(settings,convert_settings_topn_ligand_prediction, importances = ligand_importances, model = evaluation$model, n = 3, normalization = "median" ) -#' } -#' -#' @export -#' -convert_settings_topn_ligand_prediction = function(setting, importances, model, n, normalization){ - # input check - if(!is.list(setting)) - stop("setting should be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.logical(setting$response)) - stop("setting$response should be a logical vector") - - requireNamespace("dplyr") - setting_name = setting$name - importances_oi = importances %>% filter(setting == setting_name) - output = model_based_ligand_activity_prediction(importances_oi, model,normalization) - top_ligands = output %>% top_n(n,model) %>% .$test_ligand - - new_setting = list() - new_setting$name = setting$name - new_setting$from = top_ligands - new_setting$response = setting$response - - return(new_setting) -} - -#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: each dataset individually. -#' -#' @description \code{wrapper_evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. -#' -#' @usage -#' wrapper_evaluate_single_importances_ligand_prediction(group,ligand_importances) -#' -#' @param group Name of the dataset (setting) you want to calculate ligand activity performance for. -#' @param ligand_importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. -#' -#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) -#' -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) -#' evaluation = ligand_importances$setting %>% unique() %>% lapply(function(x){x}) %>% lapply(wrapper_evaluate_single_importances_ligand_prediction,ligand_importances) %>% bind_rows() %>% inner_join(ligand_importances %>% distinct(setting,ligand)) -#' print(head(evaluation)) -#' } -#' @export -#' -wrapper_evaluate_single_importances_ligand_prediction = function(group,ligand_importances){ - if (!is.data.frame(ligand_importances)) - stop("ligand_importances must be a data frame") - if (!is.character(group)) - stop("group must be a character") - ligand_importances %>% filter(setting %in% group) %>% evaluate_single_importances_ligand_prediction(normalization = "no") %>% mutate(setting = group) -} - +#' @title Convert settings to correct settings format for ligand prediction. +#' +#' @description \code{convert_settings_ligand_prediction} Converts settings to correct settings format for ligand activity prediction. In this prediction problem, ligands (out of a set of possibly active ligands) will be ranked based on feature importance scores. The format can be made suited for: 1) validation of ligand activity state prediction by calculating individual feature importane scores or 2) feature importance based on models with embedded feature importance determination; applications in which ligands need to be scores based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. +#' +#' @usage +#' convert_settings_ligand_prediction(settings, all_ligands, validation = TRUE, single = TRUE) +#' +#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. +#' @param all_ligands A character vector of possible ligands that will be considered for the ligand activity state prediction. +#' @param validation TRUE if seetings need to be prepared for validation of ligand activity state predictions (this implies that the true active ligand of a setting is known); FALSE for application purposes when the true active ligand(s) is/are not known. +#' @param single TRUE if feature importance scores for ligands will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. + +#' @return A list with following elements: $name, $ligand: name of active ligand(s) (only if validation is TRUE), $from (ligand(s) that will be tested for activity prediction), $response +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) +#' ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, ligands, validation = TRUE, single = TRUE) +#' } +#' @export +#' +#' +convert_settings_ligand_prediction = function(settings,all_ligands,validation = TRUE, single = TRUE){ + + # input check + if(!is.list(settings)) + stop("settings should be a list") + if(!is.character(all_ligands)) + stop("all_ligands should be a character vector") + if(!is.logical(validation) | length(validation) != 1) + stop("validation should be TRUE or FALSE") + if(!is.logical(single) | length(single) != 1) + stop("single should be TRUE or FALSE") + + requireNamespace("dplyr") + + new_settings = list() + if (validation == TRUE && single == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + for (k in 1:length(all_ligands)){ + test_ligand = all_ligands[[k]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_validation(setting,test_ligand)) + } + } + } else if (validation == TRUE && single == FALSE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_validation(setting,all_ligands)) + } + } else if (validation == FALSE && single == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + for (k in 1:length(all_ligands)){ + test_ligand = all_ligands[[k]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_ligand)) + } + } + } else if (validation == FALSE && single == FALSE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_ligands)) + } + } + return(new_settings %>% unlist(recursive = FALSE)) +} +#' @title Get ligand importances based on target gene prediction performance of single ligands. +#' +#' @description \code{get_single_ligand_importances} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. +#' +#' @usage +#' get_single_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @inheritParams evaluate_target_prediction +#' +#' @return A data.frame with for each ligand - data set combination, classification evaluation metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' print(head(ligand_importances)) +#' } +#' @export +#' +get_single_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + metrics = evaluate_target_prediction(setting, ligand_target_matrix, ligands_position) + metrics = metrics %>% rename(test_ligand = ligand) + if (known == TRUE){ + true_ligand = setting$ligand + metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) + metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) + } + return(metrics) +} +#' @title Get ligand importances from a multi-ligand classfication model. +#' +#' @description \code{get_multi_ligand_importances} A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_importances(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_glm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances,ligand_target_matrix, algorithm = "glm")) +#' print(head(ligand_importances_glm)) +#' } +#' @export +#' +get_multi_ligand_importances = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + output = evaluate_multi_ligand_target_prediction(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors, continuous) + metrics = output$var_imps + metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, importance) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, importance) + return(metrics) + +} +#' @title Evaluation of ligand activity prediction based on ligand importance scores. +#' +#' @description \code{evaluate_importances_ligand_prediction} Evaluate how well a trained model of ligand importance scores is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. A classificiation algorithm chosen by the user is trained to construct one model based on the ligand importance scores of all ligands of interest (ligands importance scores are considered as features). Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. +#' +#' @usage +#' evaluate_importances_ligand_prediction(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) +#' +#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' +#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each ligand-setting combination the ligand importance scores for the individual importance scores, the complete model of importance scores and the ligand activity as well (TRUE or FALSE); $model: the caret model object that can be used on new importance scores to predict the ligand activity state. +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @import caret +#' @importFrom purrr safely +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") +#' print(head(evaluation)) +#' } +#' @export +#' +evaluate_importances_ligand_prediction = function(importances, normalization, algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ + if (!is.data.frame(importances)) + stop("importances must be a data frame") + if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) + stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") + if(normalization != "mean" & normalization != "median") + stop("normalization should be 'mean' or 'median'") + if(!is.character(algorithm)) + stop("algorithm should be a character vector") + if(!is.logical(var_imps) | length(var_imps) > 1) + stop("var_imps should be a logical vector: TRUE or FALSE") + if(!is.logical(cv) | length(cv) > 1) + stop("cv should be a logical vector: TRUE or FALSE") + if(!is.numeric(cv_number) | length(cv_number) > 1) + stop("cv_number should be a numeric vector of length 1") + if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) + stop("cv_repeats should be a numeric vector of length 1") + if(!is.logical(parallel) | length(parallel) > 1) + stop("parallel should be a logical vector: TRUE or FALSE") + if(!is.numeric(n_cores) | length(n_cores) > 1) + stop("n_cores should be a numeric vector of length 1") + if(!is.logical(ignore_errors) | length(ignore_errors) > 1) + stop("ignore_errors should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + +# importances = importances %>% tidyr::drop_na() + added = is_ligand_active(importances) + importances = importances %>% mutate(class = added) + + if (normalization == "mean"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "median"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand,-class) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) + } + + response_vector = importances$class %>% make.names() %>% as.factor() + train_data = normalized_importances %>% mutate(obs = response_vector) %>% data.frame() + + output = wrapper_caret_classification(train_data,algorithm,TRUE,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = bind_cols(importances %>% select(setting,ligand,test_ligand,class), normalized_importances),ignore_errors,return_model = TRUE) + return(output) +} +#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: aggregate all datasets. +#' +#' @description \code{evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. +#' +#' @usage +#' evaluate_single_importances_ligand_prediction(importances,normalization) +#' +#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) or "no" (use unnormalized feature importance scores - only recommended when evaluating ligand activity prediction on individual datasets) +#' +#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_single_importances_ligand_prediction(ligand_importances,normalization = "median") +#' print(head(evaluation)) +#' } +#' @export +#' +evaluate_single_importances_ligand_prediction = function(importances,normalization){ + if (!is.data.frame(importances)) + stop("importances must be a data frame") + if(!is.character(importances$setting) | !is.character(importances$test_ligand) | !is.character(importances$ligand)) + stop("importances$setting, importances$test_ligand and importances$ligand should be character vectors") + if(normalization != "mean" & normalization != "median" & normalization != "no") + stop("normalization should be 'mean' or 'median' or 'no'") + + requireNamespace("dplyr") + importances0 = importances %>% select(-setting,-ligand,-test_ligand) +# importances = importances %>% tidyr::drop_na() + added = is_ligand_active(importances) + + if (nrow(importances) == 0){ + performances = lapply(importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) + output = tibble(importance_measure = names(performances)) + performances = bind_rows(performances) + return(bind_cols(output,performances)) + } + + importances = importances %>% select_if(.predicate = function(x) { + sum(is.na(x)) == 0 + }) + + if (normalization == "mean"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "median"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-ligand,-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "no") { + normalized_importances = importances %>% select(-c(setting,test_ligand,ligand)) + } + + performances = lapply(normalized_importances, classification_evaluation_continuous_pred, added, iregulon = FALSE) + output = tibble(importance_measure = names(performances)) + performances = bind_rows(performances) + return(bind_cols(output,performances)) +} +#' @title Prediction of ligand activity prediction by a model trained on ligand importance scores. +#' +#' @description \code{model_based_ligand_activity_prediction} Predict the activity state of a ligand based on a classification model that was trained to predict ligand activity state based on ligand importance scores. +#' +#' @usage +#' model_based_ligand_activity_prediction(importances, model, normalization) +#' +#' @param model A model object of a classification object as e.g. generated via caret. +#' @param importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' @param normalization Way of normalization of the importance measures: "mean" (classifcal z-score) or "median" (modified z-score) +#' +#' @return A data frame containing the ligand importance scores and the probabilities that according to the trained model, the ligands are active based on their importance scores. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") +#' +#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) +#' activity_predictions = model_based_ligand_activity_prediction(ligand_importances, evaluation$model,"median") +#' print(head(activity_predictions)) +#' } +#' +#' @export +#' +model_based_ligand_activity_prediction = function(importances, model, normalization){ + if (!is.list(model)) + stop("model must be a list, derived as model object from model training (e.g. via the caret package)") + if(model$finalModel$problemType != "Classification" & model$finalModel$problemType != "Regression") + stop("model should be model object (derived from model training)") + if (!is.data.frame(importances)) + stop("importances must be a data frame") + if(!is.character(importances$setting) | !is.character(importances$test_ligand)) + stop("importances$setting and importances$test_ligand should be character vectors") + if(normalization != "mean" & normalization != "median") + stop("normalization should be 'mean' or 'median'") + + requireNamespace("dplyr") + +# importances = importances %>% tidyr::drop_na() + + if (normalization == "mean"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_zscore)) %>% ungroup() %>% select(-setting) + } else if (normalization == "median"){ + normalized_importances = importances %>% group_by(setting) %>% dplyr::select(-test_ligand) %>% mutate_all(funs(scaling_modified_zscore)) %>% ungroup() %>% select(-setting) + } + + final_model_predictions = predict(model,newdata = normalized_importances, type = "prob") + final_model_predictions = final_model_predictions %>% as_tibble() %>% mutate(active = TRUE. > FALSE.) %>% select(-FALSE.) %>% rename(model = TRUE.) + return(bind_cols(importances,final_model_predictions) %>% as_tibble()) + +} +#' @title Get ligand importances from a multi-ligand trained random forest model. +#' +#' @description \code{get_multi_ligand_rf_importances} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset. Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_rf_importances(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. +#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). +#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained classification model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @importFrom randomForest randomForest importance +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances,ligand_target_matrix, ntrees = 100, mtry = 2)) +#' print(head(ligand_importances_rf)) +#' } +#' +#' @export +#' +get_multi_ligand_rf_importances = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, continuous = TRUE, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + if(ntrees <= 1) + stop("ntrees should be higher than 1") + if(mtry <= 1) + stop("mtry should be higher than 1") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + ligands_oi = setting$from + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + if(is.double(prediction_matrix) == FALSE){ + convert_categorical_factor = function(x){ + x = x %>% make.names() %>% as.factor() + } + prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) + } + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + rf_model = randomForest::randomForest(y = train_data$obs, + x = train_data[,-(which(colnames(train_data) == "obs"))], + ntree = ntrees, + mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), + importance = TRUE + ) + + metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, MeanDecreaseAccuracy, MeanDecreaseGini) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, MeanDecreaseAccuracy, MeanDecreaseGini) + return(metrics) + + +} +#' @title Get ligand importances based on target gene value prediction performance of single ligands (regression). +#' +#' @description \code{get_single_ligand_importances_regression} Get ligand importance measures for ligands based on how well a single, individual, ligand can predict an observed response. Assess how well every ligand of interest is able to predict the observed transcriptional response in a particular dataset, according to the ligand-target model. It can be assumed that the ligand that best predicts the observed response, is more likely to be the true ligand. Response: continuous values associated to a gene, e.g. a log fold change value. +#' +#' @usage +#' get_single_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @inheritParams evaluate_target_prediction_regression +#' +#' @return A data.frame with for each ligand - data set combination, regression model fit metrics indicating how well the query ligand predicts the response in the particular dataset. Evaluation metrics are the same as in \code{\link{evaluate_target_prediction_regression}}. In addition to the metrics, the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances_regression,ligand_target_matrix)) +#' print(head(ligand_importances)) +#' } +#' @export +#' +get_single_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", known = TRUE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + metrics = evaluate_target_prediction_regression(setting, ligand_target_matrix, ligands_position) + metrics = metrics %>% rename(test_ligand = ligand) + if (known == TRUE){ + true_ligand = setting$ligand + metrics_meta = metrics %>% select(setting,test_ligand) %>% bind_cols(tibble(ligand = true_ligand)) + metrics = inner_join(metrics_meta, metrics, by = c("setting","test_ligand")) + } + return(metrics) +} +#' @title Get ligand importances from a multi-ligand regression model. +#' +#' @description \code{get_multi_ligand_importances_regression} A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (respone: e.g. absolute value of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost"; without: "lda","naive_bayes","pls"(because bug in current version of pls package), "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_lm = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_importances_regression,ligand_target_matrix, algorithm = "lm")) +#' print(head(ligand_importances_lm)) +#' } +#' @export +#' +get_multi_ligand_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + output = evaluate_multi_ligand_target_prediction_regression(setting, ligand_target_matrix, ligands_position,algorithm, var_imps = TRUE, cv, cv_number, cv_repeats, parallel, n_cores, ignore_errors) + metrics = output$var_imps + metrics = metrics %>% mutate(setting = setting_name) %>% rename(test_ligand = feature) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, importance) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, importance) + return(metrics) + +} +#' @title Get ligand importances from a multi-ligand trained random forest regression model. +#' +#' @description \code{get_multi_ligand_rf_importances_regression} A random forest is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features) in order to predict the observed response in a particular dataset (response: e.g. absolute values of log fold change). Variable importance scores that indicate for each ligand the importance for response prediction, are extracted. It can be assumed that ligands with higher variable importance scores are more likely to be a true active ligand. +#' +#' @usage +#' get_multi_ligand_rf_importances_regression(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) of which the predictve performance need to be assessed; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. $ligand: NULL or the name of the ligand(s) that are known to be active in the setting of interest. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param ntrees Indicate the number of trees used in the random forest algorithm. The more trees, the longer model training takes, but the more robust the extraced importance scores will be. Default: 1000. Recommended for robustness to have till 10000 trees. +#' @param mtry n**(1/mtry) features of the n features will be sampled at each split during the training of the random forest algorithm. Default: 2 (square root). +#' @param known Indicate whether the true active ligand for a particular dataset is known or not. Default: TRUE. The true ligand will be extracted from the $ligand slot of the setting. +#' @param filter_genes Indicate whether 50 per cent of the genes that are the least variable in ligand-target scores should be removed in order to reduce the training of the model. Default: FALSE. +#' +#' @return A data.frame with for each ligand - data set combination, feature importance scores indicating how important the query ligand is for the prediction of the response in the particular dataset, when prediction is done via a trained regression model with all possible ligands as input. In addition to the importance score(s), the name of the particular setting ($setting), the name of the query ligand($test_ligand), the name of the true active ligand (if known: $ligand). +#' +#' @importFrom randomForest randomForest importance +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation_regression) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = FALSE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances_rf = dplyr::bind_rows(lapply(settings_ligand_pred, get_multi_ligand_rf_importances_regression,ligand_target_matrix, ntrees = 100, mtry = 2)) +#' print(head(ligand_importances_rf)) +#' } +#' +#' @export +#' +get_multi_ligand_rf_importances_regression = function(setting,ligand_target_matrix, ligands_position = "cols", ntrees = 1000, mtry = 2, known = TRUE, filter_genes = FALSE){ + + if(!is.logical(known) | length(known) > 1) + stop("known should be a logical vector: TRUE or FALSE") + if(!is.logical(filter_genes) | length(filter_genes) > 1) + stop("filter_genes should be a logical vector: TRUE or FALSE") + if(ntrees <= 1) + stop("ntrees should be higher than 1") + if(mtry <= 1) + stop("mtry should be higher than 1") + + requireNamespace("dplyr") + + if (filter_genes == TRUE){ + ligand_target_matrix = filter_genes_ligand_target_matrix(ligand_target_matrix,ligands_position) + } + + setting_name = setting$name + ligands_oi = setting$from + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + response_df = tibble(gene = names(response_vector), response = response_vector) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + rf_model = randomForest::randomForest(y = train_data$obs, + x = train_data[,-(which(colnames(train_data) == "obs"))], + ntree = ntrees, + mtry = ncol(train_data[,-(which(colnames(train_data) == "obs"))])**(1/mtry) %>% ceiling(), + importance = TRUE + ) + + metrics = randomForest::importance(rf_model) %>% data.frame() %>% tibble::rownames_to_column("test_ligand") %>% as_tibble() %>% mutate(setting = setting_name) + + if (known == TRUE){ + true_ligand = setting$ligand + metrics = metrics %>% mutate(ligand = true_ligand) + metrics = metrics %>% select(setting, test_ligand, ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) + return(metrics) + } + metrics = metrics %>% select(setting, test_ligand, X.IncMSE, IncNodePurity) %>% rename(IncMSE = X.IncMSE) + return(metrics) +} +#' @title Convert settings to correct settings format for TF prediction. +#' +#' @description \code{convert_settings_tf_prediction} Converts settings to correct settings format for TF activity prediction. In this prediction problem, TFs (out of a set of possibly active TFs) will be ranked based on feature importance scores. The format can be made suited for applications in which TFs need to be scored based on their possible upstream activity: 3) by calculating individual feature importane scores or 4) feature importance based on models with embedded feature importance determination. Remark that upstream regulator analysis for TFs here is experimental and was not thoroughly validated in the study accompanying this package. +#' +#' @usage +#' convert_settings_tf_prediction(settings, all_tfs, single = TRUE) +#' +#' @param settings A list of lists. Eeach sublist contains the following elements: .$name: name of the setting; .$from: name(s) of the tf(s) active in the setting of interest; .$response: the observed target response: indicate for a gene whether it was a target or not in the setting of interest. +#' @param all_tfs A character vector of possible tfs that will be considered for the tf activity state prediction. +#' @param single TRUE if feature importance scores for tfs will be calculated by looking at ligans individually. FALSE if the goal is to calculate the feature importance scores via sophisticated classification algorithms like random forest. + +#' @return A list with following elements: $name, $tf: name of active tf(s) (only if validation is TRUE), $from (tf(s) that will be tested for activity prediction), $response +#' +#' @examples +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_tf_pred = convert_settings_tf_prediction(settings, all_tfs = c("SMAD1","STAT1","RELA"), single = TRUE) +#' # show how this function can be used to predict activities of TFs +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' tf_target = construct_tf_target_matrix(weighted_networks, tfs_as_cols = TRUE, standalone_output = TRUE) +#' tf_importances = dplyr::bind_rows(lapply(settings_tf_pred,get_single_ligand_importances,tf_target,known = FALSE)) +#' print(head(tf_importances)) +#' +#' @export +#' +#' +convert_settings_tf_prediction = function(settings,all_tfs, single = TRUE){ + + # input check + if(!is.list(settings)) + stop("settings should be a list") + if(!is.character(all_tfs)) + stop("all_tfs should be a character vector") + if(!is.logical(single) | length(single) != 1) + stop("single should be TRUE or FALSE") + + requireNamespace("dplyr") + + new_settings = list() + if (single == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + for (k in 1:length(all_tfs)){ + test_tf = all_tfs[[k]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_single_application(setting,test_tf)) + } + } + } else if (single == FALSE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + new_settings[[length(new_settings) + 1]] = list(make_new_setting_ligand_prediction_multi_application(setting,all_tfs)) + } + } + return(new_settings %>% unlist(recursive = FALSE)) +} +#' @title Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands. +#' +#' @description \code{convert_expression_settings_evaluation} Converts expression settings to format in which the total number of potential ligands is reduced up to n top-predicted active ligands.(useful for applications when a lot of ligands are potentially active, a lot of settings need to be predicted and a multi-ligand model is trained). +#' +#' @usage +#' convert_settings_topn_ligand_prediction(setting, importances, model, n, normalization) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) +#' @param n The top n number of ligands according to the ligand activity state prediction model will be considered as potential ligand for the generation of a new setting. +#' @inheritParams model_based_ligand_activity_prediction + +#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = evaluate_importances_ligand_prediction(ligand_importances,"median","lda") +#' +#' settings = lapply(expression_settings_validation[5:10],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = FALSE, single = TRUE) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix, known = FALSE)) +#' settings = lapply(settings,convert_settings_topn_ligand_prediction, importances = ligand_importances, model = evaluation$model, n = 3, normalization = "median" ) +#' } +#' +#' @export +#' +convert_settings_topn_ligand_prediction = function(setting, importances, model, n, normalization){ + # input check + if(!is.list(setting)) + stop("setting should be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.logical(setting$response)) + stop("setting$response should be a logical vector") + + requireNamespace("dplyr") + setting_name = setting$name + importances_oi = importances %>% filter(setting == setting_name) + output = model_based_ligand_activity_prediction(importances_oi, model,normalization) + top_ligands = output %>% top_n(n,model) %>% .$test_ligand + + new_setting = list() + new_setting$name = setting$name + new_setting$from = top_ligands + new_setting$response = setting$response + + return(new_setting) +} + +#' @title Evaluation of ligand activity prediction performance of single ligand importance scores: each dataset individually. +#' +#' @description \code{wrapper_evaluate_single_importances_ligand_prediction} Evaluate how well a single ligand importance score is able to predict the true activity state of a ligand. For this it is assumed, that ligand importance measures for truely active ligands will be higher than for non-active ligands. Several classification evaluation metrics for the prediction are calculated and variable importance scores can be extracted to rank the different importance measures in order of importance for ligand activity state prediction. +#' +#' @usage +#' wrapper_evaluate_single_importances_ligand_prediction(group,ligand_importances) +#' +#' @param group Name of the dataset (setting) you want to calculate ligand activity performance for. +#' @param ligand_importances A data frame containing at least folowing variables: $setting, $test_ligand, $ligand and one or more feature importance scores. $test_ligand denotes the name of a possibly active ligand, $ligand the name of the truely active ligand. +#' +#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' settings_ligand_pred = convert_settings_ligand_prediction(settings, all_ligands = unlist(extract_ligands_from_settings(settings,combination = FALSE)), validation = TRUE, single = TRUE) +#' +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' ligands = extract_ligands_from_settings(settings_ligand_pred,combination = FALSE) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' ligand_importances = dplyr::bind_rows(lapply(settings_ligand_pred,get_single_ligand_importances,ligand_target_matrix)) +#' evaluation = ligand_importances$setting %>% unique() %>% lapply(function(x){x}) %>% lapply(wrapper_evaluate_single_importances_ligand_prediction,ligand_importances) %>% bind_rows() %>% inner_join(ligand_importances %>% distinct(setting,ligand)) +#' print(head(evaluation)) +#' } +#' @export +#' +wrapper_evaluate_single_importances_ligand_prediction = function(group,ligand_importances){ + if (!is.data.frame(ligand_importances)) + stop("ligand_importances must be a data frame") + if (!is.character(group)) + stop("group must be a character") + ligand_importances %>% filter(setting %in% group) %>% evaluate_single_importances_ligand_prediction(normalization = "no") %>% mutate(setting = group) +} + diff --git a/R/evaluate_model_target_prediction.R b/R/evaluate_model_target_prediction.R index 939e831..462ccab 100644 --- a/R/evaluate_model_target_prediction.R +++ b/R/evaluate_model_target_prediction.R @@ -1,683 +1,682 @@ -#' @title Extract ligands of interest from settings -#' -#' @description \code{extract_ligands_from_settings} Extract ligands of interest from (expression) settings in correct to construct the ligand-target matrix. -#' -#' @usage -#' extract_ligands_from_settings(settings,combination = TRUE) -#' -#' @param settings A list of lists for which each sub-list contains the information about (expression) datasets; with minimally the following elements: name of the setting ($name), ligands (possibly) active in the setting of interest ($from). -#' @param combination Indicate whether in case multiple ligands are possibly active ligand combinations should be extracted or only individual ligands. Default: TRUE. -#' -#' @return A list containing the ligands and ligands combinations for which a ligand-target matrix should be constructed. When for a particular dataset multiple ligands are possibly active (i.e. more than ligand in .$from slot of sublist of settings), then both the combination of these multiple ligands and each of these multiple ligands individually will be select for model construction. -#' -#' @examples -#' \dontrun{ -#' ligands = extract_ligands_from_settings(expression_settings_validation) -#' } -#' @export -#' -extract_ligands_from_settings = function(settings, combination = TRUE){ - - # input check - if (!is.list(settings)) - stop("settings must be a list") - if(sum(sapply(settings,function(x){is.character(x$from)})) != length(settings)) - stop("settings$.$from must be a character vector containing ligands") - - ligands_oi = list() - if (combination == TRUE){ - for (i in 1:length(settings)){ - setting = settings[[i]] - ligand = setting$from - if (length(ligand) == 1) { - ligands_oi[length(ligands_oi) + 1] = ligand - } else {# if multiple ligands added - ligands_oi[[length(ligands_oi) + 1]] = ligand # ligands together - for (l in ligand) { - ligands_oi[length(ligands_oi) + 1] = l # ligands separate - } - } - } - } else { - for (i in 1:length(settings)){ - setting = settings[[i]] - ligand = setting$from - if (length(ligand) == 1) { - ligands_oi[length(ligands_oi) + 1] = ligand - } else {# if multiple ligands added - for (l in ligand) { - ligands_oi[length(ligands_oi) + 1] = l # ligands separate - } - } - } - } - - ligands_oi = unique(ligands_oi) - return(ligands_oi) -} -#' @title Convert expression settings to correct settings format for evaluation of target gene prediction. -#' -#' @description \code{convert_expression_settings_evaluation} Converts expression settings to correct settings format for evaluation of target gene prediction. -#' -#' @usage -#' convert_expression_settings_evaluation(setting) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) - -#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) -#' } -#' @export -#' -#' -convert_expression_settings_evaluation = function(setting) { - # input check - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.data.frame(setting$diffexp)) - stop("setting$diffexp should be data frame") - if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) - stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") - - requireNamespace("dplyr") - - diffexp_df = setting$diffexp %>% mutate(diffexp = (abs(lfc) >= 1) & (qval <= 0.1)) - diffexp_vector = diffexp_df$diffexp - names(diffexp_vector) = diffexp_df$gene - diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] - if((diffexp_vector %>% sum) == 0) { - print(setting$name) - warning("No differentially expressed genes, remove this expression dataset") - } - return(list(name = setting$name, from = setting$from, response = diffexp_vector)) -} -#' @title Evaluation of target gene prediction. -#' -#' @description \code{evaluate_target_prediction} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand). It shows several classification evaluation metrics for the prediction. Different classification metrics are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. -#' -#' @usage -#' evaluate_target_prediction(setting,ligand_target_matrix, ligands_position = "cols") -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" - -#' @return A data.frame with following variables: setting, ligand and for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom data.table data.table -#' @importFrom limma wilcoxGST -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(setting,evaluate_target_prediction,ligand_target_matrix) -#' print(head(perf1)) -#' perf2 = lapply(setting,evaluate_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix)) -#' } -#' @export -#' -evaluate_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols"){ - ## still make evaluation multiple ligands possible - # input check - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.logical(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - - requireNamespace("dplyr") - - if (length(setting$from) == 1){ - ligand_oi = setting$from - } else { - ligand_oi = paste0(setting$from,collapse = "-") - } - if (ligands_position == "cols"){ - if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[,ligand_oi] - names(prediction_vector) = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[ligand_oi,] - names(prediction_vector) = colnames(ligand_target_matrix) - } - response_vector = setting$response - - if(sd(prediction_vector) == 0) - warning("all target gene probability score predictions have same value") - if(sd(response_vector) == 0) - stop("all genes have same response") - performance = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector)) - output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) - - return(output) -} -#' @title Evaluation of target gene prediction for multiple ligands. -#' -#' @description \code{evaluate_multi_ligand_target_prediction} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the set of DE genes after treatment of cells by multiple ligands). A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). Several classification evaluation metrics for the prediction are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. -#' -#' @usage -#' evaluate_multi_ligand_target_prediction(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE,continuous = TRUE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost", "pls" (load "pls" package before!); without: "lda","naive_bayes", "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). -#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. -#' -#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom limma wilcoxGST -#' @import caret -#' @importFrom purrr safely -#' -#' @examples -#' \dontrun{ -#' library(dplyr) -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = convert_expression_settings_evaluation(expression_settings_validation$TGFB_IL6_timeseries) %>% list() -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' output = lapply(setting,evaluate_multi_ligand_target_prediction,ligand_target_matrix,ligands_position = "cols",algorithm = "glm") -#' output = lapply(setting,evaluate_multi_ligand_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix),ligands_position = "cols",algorithm = "glm" ) -#' } -#' @export -#' -evaluate_multi_ligand_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE){ - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.logical(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - if(!is.character(algorithm)) - stop("algorithm should be a character vector") - if(!is.logical(var_imps) | length(var_imps) > 1) - stop("var_imps should be a logical vector: TRUE or FALSE") - if(!is.logical(cv) | length(cv) > 1) - stop("cv should be a logical vector: TRUE or FALSE") - if(!is.numeric(cv_number) | length(cv_number) > 1) - stop("cv_number should be a numeric vector of length 1") - if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) - stop("cv_repeats should be a numeric vector of length 1") - if(!is.logical(parallel) | length(parallel) > 1) - stop("parallel should be a logical vector: TRUE or FALSE") - if(!is.numeric(n_cores) | length(n_cores) > 1) - stop("n_cores should be a numeric vector of length 1") - if(!is.logical(ignore_errors) | length(ignore_errors) > 1) - stop("ignore_errors should be a logical vector: TRUE or FALSE") - if(!is.logical(continuous) | length(continuous) > 1) - stop("continuous should be a logical vector: TRUE or FALSE") - requireNamespace("dplyr") - - ligands_oi = setting$from - - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - if(sd(response_vector) == 0) - stop("all genes have same response") - response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - if(is.double(prediction_matrix) == FALSE){ - convert_categorical_factor = function(x){ - x = x %>% make.names() %>% as.factor() - } - prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) - } - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - if (nrow(combined) == 0) - stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") - - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - output = wrapper_caret_classification(train_data,algorithm,continuous = continuous,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,ignore_errors, prediction_response_df = combined) - output$setting = setting$name - output$ligands = ligands_oi - # output$prediction_response_df = combined - return(output) -} -#' @title Evaluation of target gene prediction. -#' -#' @description \code{evaluate_target_prediction_interprete} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand; or the log fold change values). It shows several classification evaluation metrics for the prediction when response is categorical, or several regression model fit metrics when the response is continuous. -#' -#' @usage -#' evaluate_target_prediction_interprete(setting,ligand_target_matrix, ligands_position = "cols") -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" - -#' @return A list with the elements $performances and $prediction_response_df. $performance is a data.frame with classification evaluation metrics if response is categorical, or regression model fit metrics if response is continuous. $prediction_response_df shows for each gene, the model prediction and the response value of the gene (e.g. whether the gene the gene is a target or not according to the observed response, or the absolute value of the log fold change of a gene). -#' -#' @importFrom ROCR prediction performance -#' @importFrom caTools trapz -#' @importFrom data.table data.table -#' @importFrom limma wilcoxGST -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) -#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation_regression) -#' perf2 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) -#' } -#' @export -#' -evaluate_target_prediction_interprete = function(setting,ligand_target_matrix, ligands_position = "cols"){ - ## still make evaluation multiple ligands possible - # input check - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if((!is.logical(setting$response) & !is.numeric(setting$response)) | is.null(names(setting$response))) - stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted or a numeric vector containing response values (e.g. log fold change).") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - - requireNamespace("dplyr") - - if (length(setting$from) == 1){ - ligand_oi = setting$from - } else { - ligand_oi = paste0(setting$from,collapse = "-") - } - if (ligands_position == "cols"){ - if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[,ligand_oi] - names(prediction_vector) = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[ligand_oi,] - names(prediction_vector) = colnames(ligand_target_matrix) - } - response_vector = setting$response - - if(sd(prediction_vector) == 0) - warning("all target gene probability score predictions have same value") - if(sd(response_vector) == 0) - stop("all genes have same response") - - if (is.logical(response_vector)){ - output = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector), prediction_response_df = TRUE) - } else { - output = evaluate_target_prediction_regression_strict(response_vector,prediction_vector, prediction_response_df = TRUE) - } - - output$setting = setting$name - output$ligand = ligand_oi - colnames(output$prediction_response_df) = c("gene","response",ligand_oi) - - return(output) -} -#' @title Convert gene list to correct settings format for evaluation of target gene prediction. -#' -#' @description \code{convert_gene_list_settings_evaluation} Converts a gene list to correct settings format for evaluation of target gene prediction. -#' -#' @usage -#' convert_gene_list_settings_evaluation(gene_list, name, ligands_oi, background) -#' -#' @param gene_list A character vector of target gene names -#' @param name The name that will be given to the setting -#' @param ligands_oi The possibly active ligands -#' @param background A character vector of names of genes that are not target genes. If genes present in the gene list are in this vector of background gene names, these genes will be removed from the background. - -#' @return A list with following elements: $name, $from, $response -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' all_genes = unique(c(weighted_networks$gr$from,weighted_networks$gr$to,weighted_networks$lr_sig$from, weighted_networks$lr_sig$to)) -#' gene_list = c("ID1","ID2","ID3") -#' setting = list(convert_gene_list_settings_evaluation(gene_list = c("ID1","ID2","ID3"), name = "test",ligands_oi = "TGFB1", background = all_genes)) -#' } -#' @export -#' -#' -convert_gene_list_settings_evaluation = function(gene_list, name, ligands_oi, background) { - # input check - if(!is.character(gene_list)) - stop("gene_list should be character vector") - if(!is.character(name) | length(name) > 1) - stop("name should be character vector of length 1") - if(!is.character(ligands_oi)) - stop("ligands_oi should be character vector") - if(!is.character(background)) - stop("background should be character vector") - - requireNamespace("dplyr") - - background = background[(background %in% gene_list) == FALSE] - - background_logical = rep(FALSE,times = length(background)) - names(background_logical) = background - gene_list_logical = rep(TRUE,times = length(gene_list)) - names(gene_list_logical) = gene_list - response = c(background_logical,gene_list_logical) - - return(list(name = name, from = ligands_oi, response = response)) -} -#' @title Convert expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). -#' -#' @description \code{convert_expression_settings_evaluation_regression} Converts expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). -#' -#' @usage -#' convert_expression_settings_evaluation_regression(setting) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) - -#' @return A list with following elements: $name, $from, $response. $response will be a gene-named numeric vector of log fold change values. -#' -#' @examples -#' \dontrun{ -#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation_regression) -#' } -#' @export -#' -#' -convert_expression_settings_evaluation_regression = function(setting) { - # input check - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.data.frame(setting$diffexp)) - stop("setting$diffexp should be data frame") - if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) - stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") - - requireNamespace("dplyr") - - diffexp_vector = setting$diffexp$lfc %>% abs() - names(diffexp_vector) = setting$diffexp$gene - diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] - - return(list(name = setting$name, from = setting$from, response = diffexp_vector)) -} -#' @title Evaluation of target gene value prediction (regression). -#' -#' @description \code{evaluate_target_prediction_regression} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). It shows several regression model fit metrics for the prediction. -#' -#' @usage -#' evaluate_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols") -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" - -#' @return A data.frame with following variables: setting, ligand and as regression model fit metrics: r_squared: R squared, adj_r_squared: adjusted R squared, f_statistic: estimate of F-statistic, lm_coefficient_abs_t: absolute value of t-value of coefficient, inverse_rse: 1 divided by estimated standard deviation of the errors (inversed to become that higher values indicate better fit), reverse_aic: reverse value of Akaike information criterion (-AIC, to become that higher values indicate better fit), reverse_bic: the reverse value of the bayesian information criterion, inverse_mae: mean absolute error, pearson: pearson correlation coefficient, spearman: spearman correlation coefficient. -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' settings = lapply(expression_settings_validation[1:2],convert_expression_settings_evaluation_regression) -#' ligands = extract_ligands_from_settings(settings) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(settings,evaluate_target_prediction_regression,ligand_target_matrix) -#' } -#' @export -#' -evaluate_target_prediction_regression = function(setting,ligand_target_matrix, ligands_position = "cols"){ - # input check - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.double(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix)) - stop("ligand_target matrix should be of type double and containing numeric probabilities as predictions") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - - requireNamespace("dplyr") - - if (length(setting$from) == 1){ - ligand_oi = setting$from - } else { - ligand_oi = paste0(setting$from,collapse = "-") - } - if (ligands_position == "cols"){ - if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[,ligand_oi] - names(prediction_vector) = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) - stop("ligand should be in ligand_target_matrix") - prediction_vector = ligand_target_matrix[ligand_oi,] - names(prediction_vector) = colnames(ligand_target_matrix) - } - response_vector = setting$response - - if(sd(prediction_vector) == 0) - warning("all target gene probability score predictions have same value") - if(sd(response_vector) == 0) - stop("all genes have same response") - - performance = evaluate_target_prediction_regression_strict(response_vector,prediction_vector) - output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) - - return(output) -} -#' @title Evaluation of target gene value prediction for multiple ligands (regression). -#' -#' @description \code{evaluate_multi_ligand_target_prediction_regression} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). It shows several regression model fit metrics for the prediction. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. -#' -#' @usage -#' evaluate_multi_ligand_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) -#' -#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. -#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). -#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" -#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: "lm","glmnet", "rf". -#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. -#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. -#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. -#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. -#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. -#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. -#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. -#' -#' @return A list with the following elements. $performances: data frame containing regression model fit metrics for regression on the test folds during training via cross-validation; $performances_training: data frame containing model fit metrics for regression of the final model on the complete data set (performance can be severly optimistic due to overfitting!); $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. -#' -#' @import caret -#' @importFrom purrr safely -#' -#' @examples -#' \dontrun{ -#' library(dplyr) -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' setting = convert_expression_settings_evaluation_regression(expression_settings_validation$TGFB_IL6_timeseries) %>% list() -#' ligands = extract_ligands_from_settings(setting) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' output = lapply(setting,evaluate_multi_ligand_target_prediction_regression,ligand_target_matrix,ligands_position = "cols",algorithm = "lm" ) -#' } -#' @export -#' -evaluate_multi_ligand_target_prediction_regression = function(setting, ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ - if (!is.list(setting)) - stop("setting must be a list") - if(!is.character(setting$from) | !is.character(setting$name)) - stop("setting$from and setting$name should be character vectors") - if(!is.double(setting$response) | is.null(names(setting$response))) - stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") - if(!is.matrix(ligand_target_matrix)) - stop("ligand_target_matrix should be a matrix") - if(!is.double(ligand_target_matrix)) - stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions") - if (ligands_position != "cols" & ligands_position != "rows") - stop("ligands_position must be 'cols' or 'rows'") - if(!is.character(algorithm)) - stop("algorithm should be a character vector") - if(!is.logical(var_imps) | length(var_imps) > 1) - stop("var_imps should be a logical vector: TRUE or FALSE") - if(!is.logical(cv) | length(cv) > 1) - stop("cv should be a logical vector: TRUE or FALSE") - if(!is.numeric(cv_number) | length(cv_number) > 1) - stop("cv_number should be a numeric vector of length 1") - if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) - stop("cv_repeats should be a numeric vector of length 1") - if(!is.logical(parallel) | length(parallel) > 1) - stop("parallel should be a logical vector: TRUE or FALSE") - if(!is.numeric(n_cores) | length(n_cores) > 1) - stop("n_cores should be a numeric vector of length 1") - if(!is.logical(ignore_errors) | length(ignore_errors) > 1) - stop("ignore_errors should be a logical vector: TRUE or FALSE") - - requireNamespace("dplyr") - - ligands_oi = setting$from - - - if (ligands_position == "cols"){ - if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[,ligands_oi] - target_genes = rownames(ligand_target_matrix) - } else if (ligands_position == "rows") { - if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) - stop("ligands should be in ligand_target_matrix") - prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() - target_genes = colnames(ligand_target_matrix) - } - - response_vector = setting$response - - if(sd(response_vector) == 0) - stop("all genes have same response") - response_df = tibble(gene = names(response_vector), response = response_vector) - - prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() - - prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) - combined = inner_join(response_df,prediction_df, by = "gene") - if (nrow(combined) == 0) - stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") - train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() - - output = wrapper_caret_regression(train_data,algorithm,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = combined,ignore_errors) - output$setting = setting$name - output$ligands = ligands_oi - return(output) -} -#' @title Calculate average performance of datasets of a specific ligand. -#' -#' @description \code{wrapper_average_performances} Calculate average performance of datasets of a specific ligand. Datasets profiling more than one ligand (and thus ligands other than the ligand of interest), will be included as well. -#' -#' @usage -#' wrapper_average_performances(ligand_oi,performances, averaging = "median") -#' -#' @param ligand_oi Name of the ligand for which datasets should be averaged. -#' @param performances A data frame with performance values, containing at least folowing variables: $setting, $ligand and one or more metrics. -#' @param averaging How should performances of datasets of the same ligand be averaged? 'median' or 'mean'. -#' -#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. -#' -#' @examples -#' \dontrun{ -#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) -#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) -#' ligands = extract_ligands_from_settings(settings) -#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) -#' perf1 = lapply(settings,evaluate_target_prediction,ligand_target_matrix) -#' performances_target_prediction_averaged = ligands %>% lapply(wrapper_average_performances, perf1,"median") %>% bind_rows() %>% drop_na() -#' } -#' -#' @export -#' -wrapper_average_performances = function(ligand_oi,performances, averaging = "median"){ - if (!is.character(ligand_oi)) - stop("ligand_oi must be a character") - if (!is.data.frame(performances)) - stop("performances must be a data frame") - if(averaging != "median" & averaging != "mean") - stop("averaging should be 'median' or 'mean' ") - - all_settings = performances$setting %>% unique() - settings_oi_indicators = strsplit(performances$ligand,"[-]") %>% lapply(function(real_ligand){sum(ligand_oi %in% real_ligand) > 0}) %>% unlist() - settings_oi = all_settings[settings_oi_indicators] - performances_oi = performances %>% filter(setting %in% settings_oi) - if(averaging == "median"){ - performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(median, na.rm = TRUE) - } else if (averaging == "mean"){ - performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(mean, na.rm = TRUE) - } - return(performances_oi_averaged %>% mutate(ligand = ligand_oi)) -} - - - - - - - - - - - - - - - - - - - +#' @title Extract ligands of interest from settings +#' +#' @description \code{extract_ligands_from_settings} Extract ligands of interest from (expression) settings in correct to construct the ligand-target matrix. +#' +#' @usage +#' extract_ligands_from_settings(settings,combination = TRUE) +#' +#' @param settings A list of lists for which each sub-list contains the information about (expression) datasets; with minimally the following elements: name of the setting ($name), ligands (possibly) active in the setting of interest ($from). +#' @param combination Indicate whether in case multiple ligands are possibly active ligand combinations should be extracted or only individual ligands. Default: TRUE. +#' +#' @return A list containing the ligands and ligands combinations for which a ligand-target matrix should be constructed. When for a particular dataset multiple ligands are possibly active (i.e. more than ligand in .$from slot of sublist of settings), then both the combination of these multiple ligands and each of these multiple ligands individually will be select for model construction. +#' +#' @examples +#' \dontrun{ +#' ligands = extract_ligands_from_settings(expression_settings_validation) +#' } +#' @export +#' +extract_ligands_from_settings = function(settings, combination = TRUE){ + + # input check + if (!is.list(settings)) + stop("settings must be a list") + if(sum(sapply(settings,function(x){is.character(x$from)})) != length(settings)) + stop("settings$.$from must be a character vector containing ligands") + + ligands_oi = list() + if (combination == TRUE){ + for (i in 1:length(settings)){ + setting = settings[[i]] + ligand = setting$from + if (length(ligand) == 1) { + ligands_oi[length(ligands_oi) + 1] = ligand + } else {# if multiple ligands added + ligands_oi[[length(ligands_oi) + 1]] = ligand # ligands together + for (l in ligand) { + ligands_oi[length(ligands_oi) + 1] = l # ligands separate + } + } + } + } else { + for (i in 1:length(settings)){ + setting = settings[[i]] + ligand = setting$from + if (length(ligand) == 1) { + ligands_oi[length(ligands_oi) + 1] = ligand + } else {# if multiple ligands added + for (l in ligand) { + ligands_oi[length(ligands_oi) + 1] = l # ligands separate + } + } + } + } + + ligands_oi = unique(ligands_oi) + return(ligands_oi) +} +#' @title Convert expression settings to correct settings format for evaluation of target gene prediction. +#' +#' @description \code{convert_expression_settings_evaluation} Converts expression settings to correct settings format for evaluation of target gene prediction. +#' +#' @usage +#' convert_expression_settings_evaluation(setting) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) + +#' @return A list with following elements: $name, $from, $response. $response will be a gene-named logical vector indicating whether the gene's transcription was influenced by the active ligand(s) in the setting of interest. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation) +#' } +#' @export +#' +#' +convert_expression_settings_evaluation = function(setting) { + # input check + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.data.frame(setting$diffexp)) + stop("setting$diffexp should be data frame") + if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) + stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") + + requireNamespace("dplyr") + + diffexp_df = setting$diffexp %>% mutate(diffexp = (abs(lfc) >= 1) & (qval <= 0.1)) + diffexp_vector = diffexp_df$diffexp + names(diffexp_vector) = diffexp_df$gene + diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] + if((diffexp_vector %>% sum) == 0) { + print(setting$name) + warning("No differentially expressed genes, remove this expression dataset") + } + return(list(name = setting$name, from = setting$from, response = diffexp_vector)) +} +#' @title Evaluation of target gene prediction. +#' +#' @description \code{evaluate_target_prediction} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand). It shows several classification evaluation metrics for the prediction. Different classification metrics are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. +#' +#' @usage +#' evaluate_target_prediction(setting,ligand_target_matrix, ligands_position = "cols") +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" + +#' @return A data.frame with following variables: setting, ligand nd for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds.\cr +#' "mean_rank_GST_log_pval" will only be included in the dataframe if limma is installed. From NicheNet v2.1.7 onwards, limma is no longer a hard dependency of NicheNet. +#' +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @importFrom data.table data.table +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(setting,evaluate_target_prediction,ligand_target_matrix) +#' print(head(perf1)) +#' perf2 = lapply(setting,evaluate_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix)) +#' } +#' @export +#' +evaluate_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols"){ + ## still make evaluation multiple ligands possible + # input check + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.logical(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + + requireNamespace("dplyr") + + if (length(setting$from) == 1){ + ligand_oi = setting$from + } else { + ligand_oi = paste0(setting$from,collapse = "-") + } + if (ligands_position == "cols"){ + if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[,ligand_oi] + names(prediction_vector) = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[ligand_oi,] + names(prediction_vector) = colnames(ligand_target_matrix) + } + response_vector = setting$response + + if(sd(prediction_vector) == 0) + warning("all target gene probability score predictions have same value") + if(sd(response_vector) == 0) + stop("all genes have same response") + performance = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector)) + output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) + + return(output) +} +#' @title Evaluation of target gene prediction for multiple ligands. +#' +#' @description \code{evaluate_multi_ligand_target_prediction} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the set of DE genes after treatment of cells by multiple ligands). A classificiation algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). Several classification evaluation metrics for the prediction are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. +#' +#' @usage +#' evaluate_multi_ligand_target_prediction(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE,continuous = TRUE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: with embedded feature selection: "rf","glm","fda","glmnet","sdwd","gam","glmboost", "pls" (load "pls" package before!); without: "lda","naive_bayes", "pcaNNet". Please notice that not all these algorithms work when the features (i.e. ligand vectors) are categorical (i.e. discrete class assignments). +#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' @param continuous Indicate whether during training of the model, model training and evaluation should be done on class probabilities or discrete class labels. For huge class imbalance, we recommend setting this value to TRUE. Default: TRUE. +#' +#' @return A list with the following elements. $performances: data frame containing classification evaluation measure for classification on the test folds during training via cross-validation; $performances_training: data frame containing classification evaluation measures for classification of the final model (discrete class assignments) on the complete data set (performance can be severly optimistic due to overfitting!); $performance_training_continuous: data frame containing classification evaluation measures for classification of the final model (class probability scores) on the complete data set (performance can be severly optimistic due to overfitting!) $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @import caret +#' @importFrom purrr safely +#' +#' @examples +#' \dontrun{ +#' library(dplyr) +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = convert_expression_settings_evaluation(expression_settings_validation$TGFB_IL6_timeseries) %>% list() +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' output = lapply(setting,evaluate_multi_ligand_target_prediction,ligand_target_matrix,ligands_position = "cols",algorithm = "glm") +#' output = lapply(setting,evaluate_multi_ligand_target_prediction,make_discrete_ligand_target_matrix(ligand_target_matrix),ligands_position = "cols",algorithm = "glm" ) +#' } +#' @export +#' +evaluate_multi_ligand_target_prediction = function(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE, continuous = TRUE){ + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.logical(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + if(!is.character(algorithm)) + stop("algorithm should be a character vector") + if(!is.logical(var_imps) | length(var_imps) > 1) + stop("var_imps should be a logical vector: TRUE or FALSE") + if(!is.logical(cv) | length(cv) > 1) + stop("cv should be a logical vector: TRUE or FALSE") + if(!is.numeric(cv_number) | length(cv_number) > 1) + stop("cv_number should be a numeric vector of length 1") + if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) + stop("cv_repeats should be a numeric vector of length 1") + if(!is.logical(parallel) | length(parallel) > 1) + stop("parallel should be a logical vector: TRUE or FALSE") + if(!is.numeric(n_cores) | length(n_cores) > 1) + stop("n_cores should be a numeric vector of length 1") + if(!is.logical(ignore_errors) | length(ignore_errors) > 1) + stop("ignore_errors should be a logical vector: TRUE or FALSE") + if(!is.logical(continuous) | length(continuous) > 1) + stop("continuous should be a logical vector: TRUE or FALSE") + requireNamespace("dplyr") + + ligands_oi = setting$from + + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + if(sd(response_vector) == 0) + stop("all genes have same response") + response_df = tibble(gene = names(response_vector), response = response_vector %>% make.names() %>% as.factor()) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + if(is.double(prediction_matrix) == FALSE){ + convert_categorical_factor = function(x){ + x = x %>% make.names() %>% as.factor() + } + prediction_df = prediction_df %>% mutate_all(funs(convert_categorical_factor)) + } + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + if (nrow(combined) == 0) + stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") + + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + output = wrapper_caret_classification(train_data,algorithm,continuous = continuous,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,ignore_errors, prediction_response_df = combined) + output$setting = setting$name + output$ligands = ligands_oi + # output$prediction_response_df = combined + return(output) +} +#' @title Evaluation of target gene prediction. +#' +#' @description \code{evaluate_target_prediction_interprete} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand; or the log fold change values). It shows several classification evaluation metrics for the prediction when response is categorical, or several regression model fit metrics when the response is continuous. +#' +#' @usage +#' evaluate_target_prediction_interprete(setting,ligand_target_matrix, ligands_position = "cols") +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" + +#' @return A list with the elements $performances and $prediction_response_df. $performance is a data.frame with classification evaluation metrics if response is categorical, or regression model fit metrics if response is continuous. $prediction_response_df shows for each gene, the model prediction and the response value of the gene (e.g. whether the gene the gene is a target or not according to the observed response, or the absolute value of the log fold change of a gene). +#' +#' @importFrom ROCR prediction performance +#' @importFrom caTools trapz +#' @importFrom data.table data.table +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation) +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) +#' setting = lapply(expression_settings_validation[1],convert_expression_settings_evaluation_regression) +#' perf2 = lapply(setting,evaluate_target_prediction_interprete,ligand_target_matrix) +#' } +#' @export +#' +evaluate_target_prediction_interprete = function(setting,ligand_target_matrix, ligands_position = "cols"){ + ## still make evaluation multiple ligands possible + # input check + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if((!is.logical(setting$response) & !is.numeric(setting$response)) | is.null(names(setting$response))) + stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted or a numeric vector containing response values (e.g. log fold change).") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix) & !is.logical(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions; or of type logical when it contains categorical target predictions (TRUE or FALSE)") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + + requireNamespace("dplyr") + + if (length(setting$from) == 1){ + ligand_oi = setting$from + } else { + ligand_oi = paste0(setting$from,collapse = "-") + } + if (ligands_position == "cols"){ + if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[,ligand_oi] + names(prediction_vector) = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[ligand_oi,] + names(prediction_vector) = colnames(ligand_target_matrix) + } + response_vector = setting$response + + if(sd(prediction_vector) == 0) + warning("all target gene probability score predictions have same value") + if(sd(response_vector) == 0) + stop("all genes have same response") + + if (is.logical(response_vector)){ + output = evaluate_target_prediction_strict(response_vector,prediction_vector,is.double(prediction_vector), prediction_response_df = TRUE) + } else { + output = evaluate_target_prediction_regression_strict(response_vector,prediction_vector, prediction_response_df = TRUE) + } + + output$setting = setting$name + output$ligand = ligand_oi + colnames(output$prediction_response_df) = c("gene","response",ligand_oi) + + return(output) +} +#' @title Convert gene list to correct settings format for evaluation of target gene prediction. +#' +#' @description \code{convert_gene_list_settings_evaluation} Converts a gene list to correct settings format for evaluation of target gene prediction. +#' +#' @usage +#' convert_gene_list_settings_evaluation(gene_list, name, ligands_oi, background) +#' +#' @param gene_list A character vector of target gene names +#' @param name The name that will be given to the setting +#' @param ligands_oi The possibly active ligands +#' @param background A character vector of names of genes that are not target genes. If genes present in the gene list are in this vector of background gene names, these genes will be removed from the background. + +#' @return A list with following elements: $name, $from, $response +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' all_genes = unique(c(weighted_networks$gr$from,weighted_networks$gr$to,weighted_networks$lr_sig$from, weighted_networks$lr_sig$to)) +#' gene_list = c("ID1","ID2","ID3") +#' setting = list(convert_gene_list_settings_evaluation(gene_list = c("ID1","ID2","ID3"), name = "test",ligands_oi = "TGFB1", background = all_genes)) +#' } +#' @export +#' +#' +convert_gene_list_settings_evaluation = function(gene_list, name, ligands_oi, background) { + # input check + if(!is.character(gene_list)) + stop("gene_list should be character vector") + if(!is.character(name) | length(name) > 1) + stop("name should be character vector of length 1") + if(!is.character(ligands_oi)) + stop("ligands_oi should be character vector") + if(!is.character(background)) + stop("background should be character vector") + + requireNamespace("dplyr") + + background = background[(background %in% gene_list) == FALSE] + + background_logical = rep(FALSE,times = length(background)) + names(background_logical) = background + gene_list_logical = rep(TRUE,times = length(gene_list)) + names(gene_list_logical) = gene_list + response = c(background_logical,gene_list_logical) + + return(list(name = name, from = ligands_oi, response = response)) +} +#' @title Convert expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). +#' +#' @description \code{convert_expression_settings_evaluation_regression} Converts expression settings to correct settings format for evaluation of target gene log fold change prediction (regression). +#' +#' @usage +#' convert_expression_settings_evaluation_regression(setting) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$diffexp: data frame or tibble containing at least 3 variables= $gene, $lfc (log fold change treated vs untreated) and $qval (fdr-corrected p-value) + +#' @return A list with following elements: $name, $from, $response. $response will be a gene-named numeric vector of log fold change values. +#' +#' @examples +#' \dontrun{ +#' settings = lapply(expression_settings_validation,convert_expression_settings_evaluation_regression) +#' } +#' @export +#' +#' +convert_expression_settings_evaluation_regression = function(setting) { + # input check + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.data.frame(setting$diffexp)) + stop("setting$diffexp should be data frame") + if(is.null(setting$diffexp$lfc) | is.null(setting$diffexp$gene) | is.null(setting$diffexp$qval)) + stop("setting$diffexp should contain the variables 'lfc', 'qval' and 'diffexp'") + + requireNamespace("dplyr") + + diffexp_vector = setting$diffexp$lfc %>% abs() + names(diffexp_vector) = setting$diffexp$gene + diffexp_vector = diffexp_vector[unique(names(diffexp_vector))] + + return(list(name = setting$name, from = setting$from, response = diffexp_vector)) +} +#' @title Evaluation of target gene value prediction (regression). +#' +#' @description \code{evaluate_target_prediction_regression} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). It shows several regression model fit metrics for the prediction. +#' +#' @usage +#' evaluate_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols") +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (or discrete target assignments). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" + +#' @return A data.frame with following variables: setting, ligand and as regression model fit metrics: r_squared: R squared, adj_r_squared: adjusted R squared, f_statistic: estimate of F-statistic, lm_coefficient_abs_t: absolute value of t-value of coefficient, inverse_rse: 1 divided by estimated standard deviation of the errors (inversed to become that higher values indicate better fit), reverse_aic: reverse value of Akaike information criterion (-AIC, to become that higher values indicate better fit), reverse_bic: the reverse value of the bayesian information criterion, inverse_mae: mean absolute error, pearson: pearson correlation coefficient, spearman: spearman correlation coefficient. +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' settings = lapply(expression_settings_validation[1:2],convert_expression_settings_evaluation_regression) +#' ligands = extract_ligands_from_settings(settings) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(settings,evaluate_target_prediction_regression,ligand_target_matrix) +#' } +#' @export +#' +evaluate_target_prediction_regression = function(setting,ligand_target_matrix, ligands_position = "cols"){ + # input check + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.double(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix)) + stop("ligand_target matrix should be of type double and containing numeric probabilities as predictions") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + + requireNamespace("dplyr") + + if (length(setting$from) == 1){ + ligand_oi = setting$from + } else { + ligand_oi = paste0(setting$from,collapse = "-") + } + if (ligands_position == "cols"){ + if((ligand_oi %in% colnames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[,ligand_oi] + names(prediction_vector) = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if((ligand_oi %in% rownames(ligand_target_matrix)) == FALSE) + stop("ligand should be in ligand_target_matrix") + prediction_vector = ligand_target_matrix[ligand_oi,] + names(prediction_vector) = colnames(ligand_target_matrix) + } + response_vector = setting$response + + if(sd(prediction_vector) == 0) + warning("all target gene probability score predictions have same value") + if(sd(response_vector) == 0) + stop("all genes have same response") + + performance = evaluate_target_prediction_regression_strict(response_vector,prediction_vector) + output = bind_cols(tibble(setting = setting$name, ligand = ligand_oi), performance) + + return(output) +} +#' @title Evaluation of target gene value prediction for multiple ligands (regression). +#' +#' @description \code{evaluate_multi_ligand_target_prediction_regression} Evaluate how well a trained model is able to predict the observed response to a combination of ligands (e.g. the absolute log fold change value of genes after treatment of cells by a ligand). A regression algorithm chosen by the user is trained to construct one model based on the target gene predictions of all ligands of interest (ligands are considered as features). It shows several regression model fit metrics for the prediction. In addition, variable importance scores can be extracted to rank the possible active ligands in order of importance for response prediction. +#' +#' @usage +#' evaluate_multi_ligand_target_prediction_regression(setting,ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4,ignore_errors = FALSE) +#' +#' @param setting A list containing the following elements: .$name: name of the setting; .$from: name(s) of the ligand(s) active in the setting of interest; .$response: named logical vector indicating whether a target is a TRUE target of the possibly active ligand(s) or a FALSE. +#' @param ligand_target_matrix A matrix of ligand-target probabilty scores (recommended) or discrete target assignments (not-recommended). +#' @param ligands_position Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols" +#' @param algorithm The name of the classification algorithm to be applied. Should be supported by the caret package. Examples of algorithms we recommend: "lm","glmnet", "rf". +#' @param var_imps Indicate whether in addition to classification evaluation performances, variable importances should be calculated. Default: TRUE. +#' @param cv Indicate whether model training and hyperparameter optimization should be done via cross-validation. Default: TRUE. FALSE might be useful for applications only requiring variable importance, or when final model is not expected to be extremely overfit. +#' @param cv_number The number of folds for the cross-validation scheme: Default: 4; only relevant when cv == TRUE. +#' @param cv_repeats The number of repeats during cross-validation. Default: 2; only relevant when cv == TRUE. +#' @param parallel Indiciate whether the model training will occur parallelized. Default: FALSE. TRUE only possible for non-windows OS. +#' @param n_cores The number of cores used for parallelized model training via cross-validation. Default: 4. Only relevant on non-windows OS. +#' @param ignore_errors Indiciate whether errors during model training by caret should be ignored such that another model training try will be initiated until model is trained without raising errors. Default: FALSE. +#' +#' @return A list with the following elements. $performances: data frame containing regression model fit metrics for regression on the test folds during training via cross-validation; $performances_training: data frame containing model fit metrics for regression of the final model on the complete data set (performance can be severly optimistic due to overfitting!); $var_imps: data frame containing the variable importances of the different ligands (embbed importance score for some classification algorithms, otherwise just the auroc); $prediction_response_df: data frame containing for each gene the ligand-target predictions of the individual ligands, the complete model and the response as well; $setting: name of the specific setting that needed to be evaluated; $ligands: ligands of interest. +#' +#' @import caret +#' @importFrom purrr safely +#' +#' @examples +#' \dontrun{ +#' library(dplyr) +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' setting = convert_expression_settings_evaluation_regression(expression_settings_validation$TGFB_IL6_timeseries) %>% list() +#' ligands = extract_ligands_from_settings(setting) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' output = lapply(setting,evaluate_multi_ligand_target_prediction_regression,ligand_target_matrix,ligands_position = "cols",algorithm = "lm" ) +#' } +#' @export +#' +evaluate_multi_ligand_target_prediction_regression = function(setting, ligand_target_matrix, ligands_position = "cols", algorithm, var_imps = TRUE, cv = TRUE, cv_number = 4, cv_repeats = 2, parallel = FALSE, n_cores = 4, ignore_errors = FALSE){ + if (!is.list(setting)) + stop("setting must be a list") + if(!is.character(setting$from) | !is.character(setting$name)) + stop("setting$from and setting$name should be character vectors") + if(!is.double(setting$response) | is.null(names(setting$response))) + stop("setting$response should be named numeric vector containing response values that needs to be predicted (e.g. log fold change values) ") + if(!is.matrix(ligand_target_matrix)) + stop("ligand_target_matrix should be a matrix") + if(!is.double(ligand_target_matrix)) + stop("ligand_target matrix should be of type double if it contains numeric probabilities as predictions") + if (ligands_position != "cols" & ligands_position != "rows") + stop("ligands_position must be 'cols' or 'rows'") + if(!is.character(algorithm)) + stop("algorithm should be a character vector") + if(!is.logical(var_imps) | length(var_imps) > 1) + stop("var_imps should be a logical vector: TRUE or FALSE") + if(!is.logical(cv) | length(cv) > 1) + stop("cv should be a logical vector: TRUE or FALSE") + if(!is.numeric(cv_number) | length(cv_number) > 1) + stop("cv_number should be a numeric vector of length 1") + if(!is.numeric(cv_repeats) | length(cv_repeats) > 1) + stop("cv_repeats should be a numeric vector of length 1") + if(!is.logical(parallel) | length(parallel) > 1) + stop("parallel should be a logical vector: TRUE or FALSE") + if(!is.numeric(n_cores) | length(n_cores) > 1) + stop("n_cores should be a numeric vector of length 1") + if(!is.logical(ignore_errors) | length(ignore_errors) > 1) + stop("ignore_errors should be a logical vector: TRUE or FALSE") + + requireNamespace("dplyr") + + ligands_oi = setting$from + + + if (ligands_position == "cols"){ + if(sum((ligands_oi %in% colnames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[,ligands_oi] + target_genes = rownames(ligand_target_matrix) + } else if (ligands_position == "rows") { + if(sum((ligands_oi %in% rownames(ligand_target_matrix)) == FALSE) > 0) + stop("ligands should be in ligand_target_matrix") + prediction_matrix = ligand_target_matrix[ligands_oi,] %>% t() + target_genes = colnames(ligand_target_matrix) + } + + response_vector = setting$response + + if(sd(response_vector) == 0) + stop("all genes have same response") + response_df = tibble(gene = names(response_vector), response = response_vector) + + prediction_df = prediction_matrix %>% data.frame() %>% as_tibble() + + prediction_df = tibble(gene = target_genes) %>% bind_cols(prediction_df) + combined = inner_join(response_df,prediction_df, by = "gene") + if (nrow(combined) == 0) + stop("Gene names in response don't accord to gene names in ligand-target matrix (did you consider differences human-mouse namings?)") + train_data = combined %>% select(-gene) %>% rename(obs = response) %>% data.frame() + + output = wrapper_caret_regression(train_data,algorithm,var_imps,cv,cv_number,cv_repeats,parallel,n_cores,prediction_response_df = combined,ignore_errors) + output$setting = setting$name + output$ligands = ligands_oi + return(output) +} +#' @title Calculate average performance of datasets of a specific ligand. +#' +#' @description \code{wrapper_average_performances} Calculate average performance of datasets of a specific ligand. Datasets profiling more than one ligand (and thus ligands other than the ligand of interest), will be included as well. +#' +#' @usage +#' wrapper_average_performances(ligand_oi,performances, averaging = "median") +#' +#' @param ligand_oi Name of the ligand for which datasets should be averaged. +#' @param performances A data frame with performance values, containing at least folowing variables: $setting, $ligand and one or more metrics. +#' @param averaging How should performances of datasets of the same ligand be averaged? 'median' or 'mean'. +#' +#' @return A data frame containing classification evaluation measures for the ligand activity state prediction single, individual feature importance measures. +#' +#' @examples +#' \dontrun{ +#' weighted_networks = construct_weighted_networks(lr_network, sig_network, gr_network, source_weights_df) +#' settings = lapply(expression_settings_validation[1:5],convert_expression_settings_evaluation) +#' ligands = extract_ligands_from_settings(settings) +#' ligand_target_matrix = construct_ligand_target_matrix(weighted_networks, ligands) +#' perf1 = lapply(settings,evaluate_target_prediction,ligand_target_matrix) +#' performances_target_prediction_averaged = ligands %>% lapply(wrapper_average_performances, perf1,"median") %>% bind_rows() %>% drop_na() +#' } +#' +#' @export +#' +wrapper_average_performances = function(ligand_oi,performances, averaging = "median"){ + if (!is.character(ligand_oi)) + stop("ligand_oi must be a character") + if (!is.data.frame(performances)) + stop("performances must be a data frame") + if(averaging != "median" & averaging != "mean") + stop("averaging should be 'median' or 'mean' ") + + all_settings = performances$setting %>% unique() + settings_oi_indicators = strsplit(performances$ligand,"[-]") %>% lapply(function(real_ligand){sum(ligand_oi %in% real_ligand) > 0}) %>% unlist() + settings_oi = all_settings[settings_oi_indicators] + performances_oi = performances %>% filter(setting %in% settings_oi) + if(averaging == "median"){ + performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(median, na.rm = TRUE) + } else if (averaging == "mean"){ + performances_oi_averaged = performances_oi %>% select(-setting,-ligand) %>% summarise_all(mean, na.rm = TRUE) + } + return(performances_oi_averaged %>% mutate(ligand = ligand_oi)) +} + + + + + + + + + + + + + + + + + + + diff --git a/R/parameter_optimization.R b/R/parameter_optimization.R index 3832338..a891ffc 100644 --- a/R/parameter_optimization.R +++ b/R/parameter_optimization.R @@ -383,12 +383,12 @@ process_mlrmbo_nichenet_optimization = function(optimization_results,source_name #' @description \code{model_evaluation_optimization_nsga2} will take as input a vector of data source weights and hyperparameters to construct a ligand-target matrix and evaluate its performance on input validation settings. #' @usage #' model_evaluation_optimization_nsga2(y, source_names, algorithm, correct_topology, lr_network, sig_network, gr_network, settings, secondary_targets = FALSE, remove_direct_links = "no",damping_factor = NULL) -#' +#' #' @param y A numeric vector containing the data source weights as the first elements, and hyperparameters as the last elements. The order of the data source weights accords to the order of source_names. #' @inheritParams model_evaluation_optimization_mlrmbo -#' +#' #' @return A numeric vector of length 4 containing the average auroc for target gene prediction, average aupr (corrected for TP fraction) for target gene prediction, average auroc for ligand activity prediction and average aupr for ligand activity prediction. -#' +#' #' @examples #' \dontrun{ #' nr_datasources = source_weights_df$source %>% unique() %>% length() @@ -397,7 +397,7 @@ process_mlrmbo_nichenet_optimization = function(optimization_results,source_name #' test_evaluation_optimization = model_evaluation_optimization_nsga2(test_input, source_weights_df$source %>% unique(), "PPR", TRUE, lr_network, sig_network, gr_network, #' lapply(expression_settings_validation, convert_expression_settings_evaluation), secondary_targets = FALSE, remove_direct_links = "no") #' } -#' +#' #' @export model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, correct_topology, lr_network, sig_network, gr_network, settings, secondary_targets = FALSE, remove_direct_links = "no", damping_factor = NULL) { # change numeric vector y input to list x @@ -416,123 +416,123 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr x$gr_hub = other_params[2] x$ltf_cutoff = other_params[3] x$damping_factor = other_params[4] - + if (!is.null(damping_factor) & is.null(x$damping_factor)) { x$damping_factor = damping_factor } - if (!is.list(x)) + if (!is.list(x)) stop("x should be a list!") - if (!is.numeric(x$source_weights)) + if (!is.numeric(x$source_weights)) stop("x$source_weights should be a numeric vector") - if (x$lr_sig_hub < 0 | x$lr_sig_hub > 1) + if (x$lr_sig_hub < 0 | x$lr_sig_hub > 1) stop("x$lr_sig_hub must be a number between 0 and 1 (0 and 1 included)") - if (x$gr_hub < 0 | x$gr_hub > 1) + if (x$gr_hub < 0 | x$gr_hub > 1) stop("x$gr_hub must be a number between 0 and 1 (0 and 1 included)") if (is.null(x$ltf_cutoff)) { - if ((algorithm == "PPR" | algorithm == "SPL") & correct_topology == - FALSE) + if ((algorithm == "PPR" | algorithm == "SPL") & correct_topology == + FALSE) warning("Did you not forget to give a value to x$ltf_cutoff?") } else { - if (x$ltf_cutoff < 0 | x$ltf_cutoff > 1) + if (x$ltf_cutoff < 0 | x$ltf_cutoff > 1) stop("x$ltf_cutoff must be a number between 0 and 1 (0 and 1 included)") } if (algorithm == "PPR") { - if (x$damping_factor < 0 | x$damping_factor >= 1) + if (x$damping_factor < 0 | x$damping_factor >= 1) stop("x$damping_factor must be a number between 0 and 1 (0 included, 1 not)") } - if (algorithm != "PPR" & algorithm != "SPL" & algorithm != - "direct") + if (algorithm != "PPR" & algorithm != "SPL" & algorithm != + "direct") stop("algorithm must be 'PPR' or 'SPL' or 'direct'") - if (correct_topology != TRUE & correct_topology != FALSE) + if (correct_topology != TRUE & correct_topology != FALSE) stop("correct_topology must be TRUE or FALSE") - if (!is.data.frame(lr_network)) + if (!is.data.frame(lr_network)) stop("lr_network must be a data frame or tibble object") - if (!is.data.frame(sig_network)) + if (!is.data.frame(sig_network)) stop("sig_network must be a data frame or tibble object") - if (!is.data.frame(gr_network)) + if (!is.data.frame(gr_network)) stop("gr_network must be a data frame or tibble object") - if (!is.list(settings)) + if (!is.list(settings)) stop("settings should be a list!") - if (!is.character(settings[[1]]$from) | !is.character(settings[[1]]$name)) + if (!is.character(settings[[1]]$from) | !is.character(settings[[1]]$name)) stop("setting$from and setting$name should be character vectors") - if (!is.logical(settings[[1]]$response) | is.null(names(settings[[1]]$response))) + if (!is.logical(settings[[1]]$response) | is.null(names(settings[[1]]$response))) stop("setting$response should be named logical vector containing class labels of the response that needs to be predicted ") - if (secondary_targets != TRUE & secondary_targets != FALSE) + if (secondary_targets != TRUE & secondary_targets != FALSE) stop("secondary_targets must be TRUE or FALSE") - if (remove_direct_links != "no" & remove_direct_links != - "ligand" & remove_direct_links != "ligand-receptor") + if (remove_direct_links != "no" & remove_direct_links != + "ligand" & remove_direct_links != "ligand-receptor") stop("remove_direct_links must be 'no' or 'ligand' or 'ligand-receptor'") - if (!is.character(source_names)) + if (!is.character(source_names)) stop("source_names should be a character vector") - if (length(source_names) != length(x$source_weights)) + if (length(source_names) != length(x$source_weights)) stop("Length of source_names should be the same as length of x$source_weights") - if (correct_topology == TRUE && !is.null(x$ltf_cutoff)) + if (correct_topology == TRUE && !is.null(x$ltf_cutoff)) warning("Because PPR-ligand-target matrix will be corrected for topology, the proposed cutoff on the ligand-tf matrix will be ignored (x$ltf_cutoff") - if (correct_topology == TRUE && algorithm != "PPR") + if (correct_topology == TRUE && algorithm != "PPR") warning("Topology correction is PPR-specific and makes no sense when the algorithm is not PPR") # names(x$source_weights) = source_names - + parameters_setting = list(model_name = "query_design", source_weights = x$source_weights) if (algorithm == "PPR") { if (correct_topology == TRUE) { - parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, - lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, - ltf_cutoff = 0, algorithm = algorithm, damping_factor = x$damping_factor, + parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, + lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, + ltf_cutoff = 0, algorithm = algorithm, damping_factor = x$damping_factor, correct_topology = TRUE) } else { - parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, - lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, - ltf_cutoff = x$ltf_cutoff, algorithm = algorithm, + parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, + lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, + ltf_cutoff = x$ltf_cutoff, algorithm = algorithm, damping_factor = x$damping_factor, correct_topology = FALSE) } } if (algorithm == "SPL" | algorithm == "direct") { - parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, - lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, ltf_cutoff = x$ltf_cutoff, + parameters_setting = add_hyperparameters_parameter_settings(parameters_setting, + lr_sig_hub = x$lr_sig_hub, gr_hub = x$gr_hub, ltf_cutoff = x$ltf_cutoff, algorithm = algorithm, damping_factor = NULL, correct_topology = FALSE) } - output_evaluation = evaluate_model(parameters_setting, lr_network, - sig_network, gr_network, settings, calculate_popularity_bias_target_prediction = FALSE, - calculate_popularity_bias_ligand_prediction = FALSE, - ncitations = ncitations, secondary_targets = secondary_targets, + output_evaluation = evaluate_model(parameters_setting, lr_network, + sig_network, gr_network, settings, calculate_popularity_bias_target_prediction = FALSE, + calculate_popularity_bias_ligand_prediction = FALSE, + ncitations = ncitations, secondary_targets = secondary_targets, remove_direct_links = remove_direct_links, n_target_bins = 3) - + ligands_evaluation = settings %>% sapply(function(x){x$from}) %>% unlist() %>% unique() - + ligand_activity_performance_setting_summary = output_evaluation$performances_ligand_prediction_single %>% select(-setting, -ligand) %>% group_by(importance_measure) %>% summarise_all(mean) %>% group_by(importance_measure) %>% mutate(geom_average = exp(mean(log(c(auroc,aupr_corrected))))) best_metric = ligand_activity_performance_setting_summary %>% ungroup() %>% filter(geom_average == max(geom_average)) %>% pull(importance_measure) %>% .[1] performances_ligand_prediction_single_summary = output_evaluation$performances_ligand_prediction_single %>% filter(importance_measure == best_metric) - + performances_target_prediction_averaged = ligands_evaluation %>% lapply(function(x){x}) %>% lapply(wrapper_average_performances, output_evaluation$performances_target_prediction,"median") %>% bind_rows() %>% drop_na() performances_ligand_prediction_single_summary_averaged = ligands_evaluation %>% lapply(function(x){x}) %>% lapply(wrapper_average_performances, performances_ligand_prediction_single_summary %>% select(-importance_measure),"median") %>% bind_rows() %>% drop_na() - + mean_auroc_target_prediction = performances_target_prediction_averaged$auroc %>% mean(na.rm = TRUE) %>% unique() mean_aupr_target_prediction = performances_target_prediction_averaged$aupr_corrected %>% mean(na.rm = TRUE) %>% unique() - + # we want also to look at median ligand prediction, but also the mean: why? median focuses on improving half of the datasets, but can lead to ignorance of a few bad datasets -- try to semi-avoid this with the mean ## combine both mean and median median_auroc_ligand_prediction = performances_ligand_prediction_single_summary_averaged$auroc %>% median(na.rm = TRUE) %>% unique() median_aupr_ligand_prediction = performances_ligand_prediction_single_summary_averaged$aupr_corrected %>% median(na.rm = TRUE) %>% unique() - + mean_auroc_ligand_prediction = performances_ligand_prediction_single_summary_averaged$auroc %>% mean(na.rm = TRUE) %>% unique() mean_aupr_ligand_prediction = performances_ligand_prediction_single_summary_averaged$aupr_corrected %>% mean(na.rm = TRUE) %>% unique() - + score_auroc_ligand_prediction = (median_auroc_ligand_prediction + mean_auroc_ligand_prediction) * 0.5 score_aupr_ligand_prediction = (median_aupr_ligand_prediction + mean_aupr_ligand_prediction) * 0.5 - - return(c(mean_auroc_target_prediction*-1, mean_aupr_target_prediction*-1, + + return(c(mean_auroc_target_prediction*-1, mean_aupr_target_prediction*-1, score_auroc_ligand_prediction*-1, score_aupr_ligand_prediction*-1)) } #' @title Run NSGA-II for parameter optimization. #' @description \code{run_nsga2R_cluster} runs the NSGA-II algorithm for parameter optimization and allows for parallelization. The core of this function is adapted from \code{nsga2R::nsga2R}. -#' @usage +#' @usage #' run_nsga2R_cluster(run_id, fn, varNo, objDim, lowerBounds = rep(-Inf, varNo), upperBounds = rep(Inf, varNo), popSize = 100, tourSize = 2, generations = 20, cprob = 0.7, XoverDistIdx = 5, mprob = 0.2, MuDistIdx = 10, ncores = 24) -#' +#' #' @param fn The function to be optimized, usually \code{model_evaluation_optimization_nsga2}. #' @param varNo The number of variables to be optimized, usually the number of data sources + 4 hyperparameters (lr_sig_hub, gr_hub, ltf_cutoff, damping_factor). #' @param objDim Number of objective functions @@ -547,7 +547,7 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr #' @param MuDistIdx The mutation distribution index, it can be any nonnegative real number (default: 10) #' @param ncores The number of cores to be used for parallelization (default: 24) #' @param ... Additional arguments to \code{fn}. -#' +#' #' @return An 'nsga2R' object containing input settings and the following elements: #' \itemize{ #' \item intermed_output_list_params: a list with intermediate values of parameters for each generation (each element has dimensions popSize x varNo) @@ -557,7 +557,7 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr #' \item paretoFrontRank: nondomination ranks (or levels) that each non-dominated solution belongs to #' \item crowdingDistance: crowding distances of each non-dominated solution #' } -#' +#' #' @examples #' \dontrun{ #' source_names <- c(lr_network$source, sig_network$source, gr_network$source) %>% unique() @@ -568,37 +568,37 @@ model_evaluation_optimization_nsga2r = function(y, source_names, algorithm, corr #' results <- run_nsga2R_cluster(model_evaluation_optimization_nsga2r, varNo=n_param, objDim=n_obj, #' lowerBounds=lower_bounds, upperBounds=upper_bounds, popSize = 360, tourSize = 2, generations = 15, ncores = 8) #' } -#' +#' #' @export -run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo), - upperBounds = rep(Inf, varNo), popSize = 100, tourSize = 2, - generations = 20, ncores = 24, cprob = 0.7, XoverDistIdx = 5, mprob = 0.2, +run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo), + upperBounds = rep(Inf, varNo), popSize = 100, tourSize = 2, + generations = 20, ncores = 24, cprob = 0.7, XoverDistIdx = 5, mprob = 0.2, MuDistIdx = 10, ...) { library(nsga2R) library(dplyr) library(tidyr) - + intermed_output_list_params = list() intermed_output_list_obj = list() - + doMC::registerDoMC(ncores) - + cat("********** R based Nondominated Sorting Genetic Algorithm II *********") cat("\n") cat("initializing the population") cat("\n") - + print(Sys.time()) - - parent <- t(sapply(1:popSize, function(u) array(runif(length(lowerBounds), + + parent <- t(sapply(1:popSize, function(u) array(runif(length(lowerBounds), lowerBounds, upperBounds)))) # parent_old = parent # parent_classic <- cbind(parent, t(apply(parent, 1, fn))) parent <- cbind(parent, t(parallel::mclapply(split(parent, 1:nrow(parent)), fn, mc.cores = ncores, ...) %>% unlist() %>% matrix(nrow = objDim))) - + cat("ranking the initial population") cat("\n") - ranking <- fastNonDominatedSorting(parent[, (varNo + 1):(varNo + + ranking <- fastNonDominatedSorting(parent[, (varNo + 1):(varNo + objDim)]) rnkIndex <- integer(popSize) i <- 1 @@ -609,15 +609,15 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) parent <- cbind(parent, rnkIndex) cat("crowding distance calculation") cat("\n") - objRange <- apply(parent[, (varNo + 1):(varNo + objDim)], - 2, max) - apply(parent[, (varNo + 1):(varNo + objDim)], + objRange <- apply(parent[, (varNo + 1):(varNo + objDim)], + 2, max) - apply(parent[, (varNo + 1):(varNo + objDim)], 2, min) cd <- crowdingDist4frnt(parent, ranking, objRange) parent <- cbind(parent, apply(cd, 1, sum)) for (iter in 1:generations) { print(iter) print(Sys.time()) - cat("---------------generation---------------", + cat("---------------generation---------------", iter, "starts") cat("\n") cat("tournament selection") @@ -625,25 +625,25 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) matingPool <- tournamentSelection(parent, popSize, tourSize) cat("crossover operator") cat("\n") - childAfterX <- boundedSBXover(matingPool[, 1:varNo], + childAfterX <- boundedSBXover(matingPool[, 1:varNo], lowerBounds, upperBounds, cprob, XoverDistIdx) cat("mutation operator") cat("\n") - childAfterM <- boundedPolyMutation(childAfterX, lowerBounds, + childAfterM <- boundedPolyMutation(childAfterX, lowerBounds, upperBounds, mprob, MuDistIdx) cat("evaluate the objective fns of childAfterM") cat("\n") # childAfterM_old = childAfterM - # childAfterM_classic <- cbind(childAfterM, t(apply(childAfterM, + # childAfterM_classic <- cbind(childAfterM, t(apply(childAfterM, # 1, fn))) childAfterM <- cbind(childAfterM, t(parallel::mclapply(split(childAfterM, 1:nrow(childAfterM)), fn, mc.cores = ncores) %>% unlist() %>% matrix(nrow = objDim))) - + cat("Rt = Pt + Qt") cat("\n") parentNext <- rbind(parent[, 1:(varNo + objDim)], childAfterM) cat("ranking again") cat("\n") - ranking <- fastNonDominatedSorting(parentNext[, (varNo + + ranking <- fastNonDominatedSorting(parentNext[, (varNo + 1):(varNo + objDim)]) i <- 1 while (i <= length(ranking)) { @@ -653,51 +653,51 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) parentNext <- cbind(parentNext, rnkIndex) cat("crowded comparison again") cat("\n") - objRange <- apply(parentNext[, (varNo + 1):(varNo + objDim)], - 2, max) - apply(parentNext[, (varNo + 1):(varNo + + objRange <- apply(parentNext[, (varNo + 1):(varNo + objDim)], + 2, max) - apply(parentNext[, (varNo + 1):(varNo + objDim)], 2, min) cd <- crowdingDist4frnt(parentNext, ranking, objRange) parentNext <- cbind(parentNext, apply(cd, 1, sum)) - parentNext.sort <- parentNext[order(parentNext[, varNo + - objDim + 1], -parentNext[, varNo + objDim + 2]), + parentNext.sort <- parentNext[order(parentNext[, varNo + + objDim + 1], -parentNext[, varNo + objDim + 2]), ] cat("environmental selection") cat("\n") parent <- parentNext.sort[1:popSize, ] - cat("---------------generation---------------", + cat("---------------generation---------------", iter, "ends") cat("\n") if (iter != generations) { - # intermed_output_list_params[[iter]] = parent[, 1:varNo] + # intermed_output_list_params[[iter]] = parent[, 1:varNo] # intermed_output_list_obj[[iter]] = parent[, (varNo + 1):(varNo + objDim)] cat("\n") cat("********** new iteration *********") cat("\n") - + } else { cat("********** stop the evolution *********") cat("\n") } - intermed_output_list_params[[iter]] = parent[, 1:varNo] + intermed_output_list_params[[iter]] = parent[, 1:varNo] intermed_output_list_obj[[iter]] = parent[, (varNo + 1):(varNo + objDim)] } - result = list(intermed_output_list_params = intermed_output_list_params, - intermed_output_list_obj = intermed_output_list_obj, - functions = fn, - parameterDim = varNo, - objectiveDim = objDim, - lowerBounds = lowerBounds, - upperBounds = upperBounds, - popSize = popSize, - tournamentSize = tourSize, - generations = generations, - XoverProb = cprob, - XoverDistIndex = XoverDistIdx, - mutationProb = mprob, - mutationDistIndex = MuDistIdx, - parameters = parent[,1:varNo], - objectives = parent[, (varNo + 1):(varNo + objDim)], + result = list(intermed_output_list_params = intermed_output_list_params, + intermed_output_list_obj = intermed_output_list_obj, + functions = fn, + parameterDim = varNo, + objectiveDim = objDim, + lowerBounds = lowerBounds, + upperBounds = upperBounds, + popSize = popSize, + tournamentSize = tourSize, + generations = generations, + XoverProb = cprob, + XoverDistIndex = XoverDistIdx, + mutationProb = mprob, + mutationDistIndex = MuDistIdx, + parameters = parent[,1:varNo], + objectives = parent[, (varNo + 1):(varNo + objDim)], paretoFrontRank = parent[, varNo + objDim + 1], crowdingDistance = parent[, varNo + objDim + 2]) class(result) = "nsga2R" @@ -708,31 +708,31 @@ run_nsga2R_cluster = function (fn, varNo, objDim, lowerBounds = rep(-Inf, varNo) #' @description \code{get_optimized_parameters_nsga2} will take as input the output of \code{run_nsga2R_cluster} and extract the optimal parameter values, either from the best solution at the end of the generations or the best solution across all generations. #' @usage #' get_optimized_parameters_nsga2(result_nsga2r, source_names, search_all_iterations = FALSE, top_n = NULL, summarise_weights = TRUE) -#' +#' #' @param result_nsga2r The output of \code{run_nsga2R_cluster}. #' @param source_names Character vector containing the names of the data sources. #' @param search_all_iterations Logical indicating whether the best solution across all generations should be considered (TRUE) or only the best solution at the end of the generations (FALSE). #' @param top_n If search_all_iterations=TRUE, this indicates how many of the best solutions should be considered. #' @param summarise_weights If search_all_iterations=TRUE, a logical indicating whether the weights should be summarised by taking the mean and median. -#' +#' #' @return A list containing two dataframes, the optimal data source weights and the optimal hyperparameters. -#' +#' #' @examples #' \dontrun{ #' results <- run_nsga2R_cluster(model_evaluation_optimization_nsga2r, varNo=n_param, objDim=n_obj, #' lowerBounds=lower_bounds, upperBounds=upper_bounds, popSize = 360, tourSize = 2, generations = 15, ncores = 8) -#' +#' #' # Get the best solution at the end of the generations #' optimized_parameters <- get_optimized_parameters_nsga2(results, source_names, search_all_iterations = FALSE, top_n = NULL, summarise_weights = TRUE) -#' +#' #' # Get the best solution across all generations, consider top 25 solutions and summarise weights #' optimized_parameters <- get_optimized_parameters_nsga2(results, source_names, search_all_iterations = TRUE, top_n = 25, summarise_weights = TRUE) #' } -#' +#' #' @export -#' +#' get_optimized_parameters_nsga2r = function(result_nsga2r, source_names, search_all_iterations = FALSE, top_n = NULL, summarise_weights = TRUE){ - + if (!search_all_iterations & is.numeric(top_n)){ message("search_all_iterations is FALSE, so top_n will be ignored") } @@ -763,7 +763,7 @@ get_optimized_parameters_nsga2r = function(result_nsga2r, source_names, search_a if (!search_all_iterations){ # take the best parameter setting considering the geometric mean of the objective function results - parameter_set_index = processed_optimization %>% filter(average == max(average)) %>% .$index + parameter_set_index = processed_optimization %>% filter(average == max(average)) %>% .$index params = optimization_results$parameters[parameter_set_index,] # Data source weights @@ -780,16 +780,16 @@ get_optimized_parameters_nsga2r = function(result_nsga2r, source_names, search_a lapply(function(index) { params = optimization_results$intermed_output_list_params %>% lapply(data.frame) %>% bind_rows() %>% .[index,] %>% as.double() source_weights = tibble(source = source_names, weight = params[1:length(source_names)], index = index) - + other_params = params[(length(source_names)+1): length(params)] other_params_df = tibble(parameter = hyperparam_names, weight = c(other_params[1], other_params[2], other_params[3], other_params[4]), index = index) - list(source_weights = source_weights, + list(source_weights = source_weights, hyperparams = other_params_df) }) - + # Extract data source and hyperparameter weights - source_weight_df <- purrr::map(all_weights, "source_weights") %>% bind_rows() %>% + source_weight_df <- purrr::map(all_weights, "source_weights") %>% bind_rows() %>% mutate_cond(weight <= 0.025 & source %in% source_names_zero_possible, weight = 0) %>% mutate_cond(weight >= 0.975 & source %in% source_names, weight = 1) %>% # If summarise_weights is TRUE, summarise weights by taking the mean and median @@ -1091,10 +1091,16 @@ evaluate_model_cv = function(parameters_setting, lr_network, sig_network, gr_net ligand_importances$spearman[is.na(ligand_importances$spearman)] = 0 ligand_importances$pearson_log_pval[is.na(ligand_importances$pearson_log_pval)] = 0 ligand_importances$spearman_log_pval[is.na(ligand_importances$spearman_log_pval)] = 0 - ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 ligand_importances$pearson_log_pval[is.infinite(ligand_importances$pearson_log_pval)] = 10000 ligand_importances$spearman_log_pval[is.infinite(ligand_importances$spearman_log_pval)] = 10000 - ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + + + if ("mean_rank_GST_log_pval" %in% colnames(ligand_importances)){ + ligand_importances$mean_rank_GST_log_pval[is.na(ligand_importances$mean_rank_GST_log_pval)] = 0 + ligand_importances$mean_rank_GST_log_pval[is.infinite(ligand_importances$mean_rank_GST_log_pval)] = 10000 + } else { + warning("mean_rank_GST_log_pval not in ligand_importances; do you have limma installed?") + } # all_importances = ligand_importances %>% select_if(.predicate = function(x){sum(is.na(x)) == 0}) @@ -1111,24 +1117,24 @@ evaluate_model_cv = function(parameters_setting, lr_network, sig_network, gr_net #' @description \code{visualize_parameter_values} will take as input the output of \code{run_nsga2R_cluster} and visualize the data source weights and hyperparameters of the best and worst solutions #' @usage #' visualize_parameter_values(result_nsga2r, source_names, top_ns = c(5, 10, -10, -25)) -#' +#' #' @param result_nsga2r The output of \code{run_nsga2R_cluster}. #' @param source_names Character vector containing the names of the data sources. #' @param top_ns Numeric vector indicating how many of the best and worst solutions should be considered (negative values indicate the worst solutions; default: c(5, 10, -10, -25)). -#' +#' #' @return A list containing two ggplot objects, one for the data source weights and one for the hyperparameters. -#' +#' #' @examples #' \dontrun{ #' results <- run_nsga2R_cluster(model_evaluation_optimization_nsga2r, varNo=n_param, objDim=n_obj, #' lowerBounds=lower_bounds, upperBounds=upper_bounds, popSize = 360, tourSize = 2, generations = 15, ncores = 8) -#' +#' #' # Visualize the best and worst 5 solutions #' visualize_parameter_values(results, source_names, top_ns = c(5, -5)) #' } -#' +#' #' @export -#' +#' visualize_parameter_values = function(result_nsga2r, source_names, top_ns = c(5, 10, -10, -25)){ optimized_params <- lapply(top_ns, function(n) { @@ -1148,16 +1154,16 @@ visualize_parameter_values = function(result_nsga2r, source_names, top_ns = c(5, #' @title Visualize parameter values from the output of \code{run_nsga2R_cluster} across cross-validation folds. #' @description \code{visualize_parameter_values_across_folds} will take as input the output of \code{run_nsga2R_cluster} and visualize the data source weights and hyperparameters of the best solutions across all folds. -#' +#' #' @usage #' visualize_parameter_values_across_folds(result_nsga2r_list, source_names, top_n = 25) -#' +#' #' @param result_nsga2r_list A list containing the outputs of \code{run_nsga2R_cluster} for each cross-validation fold. #' @param source_names Character vector containing the names of the data sources. #' @param top_n Numeric indicating how many of the best solutions should be considered. -#' +#' #' @return A list containing two ggplot objects, one for the data source weights and one for the hyperparameters. -#' +#' #' @examples #' \dontrun{ #' results_list <- lapply(cv_folds, function(fold){ @@ -1171,14 +1177,14 @@ visualize_parameter_values = function(result_nsga2r, source_names, top_ns = c(5, #' source_names = source_names, algorithm = "PPR", correct_topology = FALSE, lr_network = lr_network, sig_network = lr_network, gr_network = gr_network_subset, #' settings = settings, secondary_targets = FALSE, remove_direct_links = "no", damping_factor = NULL) #' }) -#' +#' #' # Visualize the best 25 solutions across all folds #' visualize_parameter_values_across_folds(results_list, source_names, top_n = 25) #' } -#' +#' #' @export visualize_parameter_values_across_folds = function(result_nsga2r_list, source_names, top_n = 25){ - + # Get best weights for each fold optimized_params <- lapply(1:length(result_nsga2r_list), function(i){ get_optimized_parameters_nsga2r(result_nsga2r_list[[i]], source_names, search_all_iterations = TRUE, top_n = top_n, summarise_weights = FALSE) %>% @@ -1191,7 +1197,7 @@ visualize_parameter_values_across_folds = function(result_nsga2r_list, source_na hyperparameters_boxplot = purrr::map(optimized_params, "hyperparams_df") %>% bind_rows() %>% ggplot(aes(x = fold, y = weight, group = fold, color = fold)) + geom_boxplot() + geom_point() + facet_wrap(.~parameter) + theme_bw() - - + + return(list(source_weights_boxplot = source_weights_boxplot, hyperparameters_boxplot = hyperparameters_boxplot)) -} \ No newline at end of file +} diff --git a/R/supporting_functions.R b/R/supporting_functions.R index df38416..dbfb2b2 100644 --- a/R/supporting_functions.R +++ b/R/supporting_functions.R @@ -530,33 +530,33 @@ classification_evaluation_continuous_pred = function(prediction,response, iregul cor_p_pval = suppressWarnings(cor.test(as.numeric(prediction), as.numeric(response))) %>% .$p.value cor_s_pval = suppressWarnings(cor.test(as.numeric(prediction), as.numeric(response), method = "s")) %>% .$p.value - mean_rank_GST = limma::wilcoxGST(response, prediction) - #### now start calculating the AUC-iRegulon + # Mean rank GST calculated if limma is installed + mean_rank_GST = ifelse(rlang::is_installed("limma"), limma::wilcoxGST(response, prediction), NA) + + # Calculate the AUC-iRegulon + output_iregulon = list() + if (iregulon){ + output_iregulon = calculate_auc_iregulon(prediction,response) + } + tbl_perf = tibble(auroc = auroc, aupr = aupr, aupr_corrected = aupr - aupr_random, sensitivity_roc = sensitivity, specificity_roc = specificity, mean_rank_GST_log_pval = -log(mean_rank_GST), + auc_iregulon = output_iregulon$auc_iregulon, + auc_iregulon_corrected = output_iregulon$auc_iregulon_corrected, pearson_log_pval = -log10(cor_p_pval), spearman_log_pval = -log10(cor_s_pval), pearson = cor_p, spearman = cor_s) - if (iregulon == TRUE){ - output_iregulon = calculate_auc_iregulon(prediction,response) - tbl_perf = tibble(auroc = auroc, - aupr = aupr, - aupr_corrected = aupr - aupr_random, - sensitivity_roc = sensitivity, - specificity_roc = specificity, - mean_rank_GST_log_pval = -log(mean_rank_GST), - auc_iregulon = output_iregulon$auc_iregulon, - auc_iregulon_corrected = output_iregulon$auc_iregulon_corrected, - pearson_log_pval = -log10(cor_p_pval), - spearman_log_pval = -log10(cor_s_pval), - pearson = cor_p, - spearman = cor_s) + + # Remove mean_rank_GST if limma is not installed + if (!rlang::is_installed("limma")) { + tbl_perf = tbl_perf %>% select(-mean_rank_GST_log_pval) } + return(tbl_perf) } classification_evaluation_categorical_pred = function(predictions, response) { diff --git a/man/evaluate_target_prediction.Rd b/man/evaluate_target_prediction.Rd index e2af602..89f2849 100644 --- a/man/evaluate_target_prediction.Rd +++ b/man/evaluate_target_prediction.Rd @@ -14,7 +14,8 @@ evaluate_target_prediction(setting,ligand_target_matrix, ligands_position = "col \item{ligands_position}{Indicate whether the ligands in the ligand-target matrix are in the rows ("rows") or columns ("cols"). Default: "cols"} } \value{ -A data.frame with following variables: setting, ligand and for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds. +A data.frame with following variables: setting, ligand nd for probabilistic predictions: auroc, aupr, aupr_corrected (aupr - aupr for random prediction), sensitivity_roc (proxy measure, inferred from ROC), specificity_roc (proxy measure, inferred from ROC), mean_rank_GST_log_pval (-log10 of p-value of mean-rank gene set test), pearson (correlation coefficient), spearman (correlation coefficient); whereas for categorical predictions: accuracy, recall, specificity, precision, F1, F0.5, F2, mcc, informedness, markedness, fisher_pval_log (which is -log10 of p-value fisher exact test), fisher odds.\cr +"mean_rank_GST_log_pval" will only be included in the dataframe if limma is installed. From NicheNet v2.1.7 onwards, limma is no longer a hard dependency of NicheNet. } \description{ \code{evaluate_target_prediction} Evaluate how well the model (i.e. the inferred ligand-target probability scores) is able to predict the observed response to a ligand (e.g. the set of DE genes after treatment of cells by a ligand). It shows several classification evaluation metrics for the prediction. Different classification metrics are calculated depending on whether the input ligand-target matrix contains probability scores for targets or discrete target assignments. diff --git a/vignettes/model_evaluation.Rmd b/vignettes/model_evaluation.Rmd index 7224c70..c3cd82e 100644 --- a/vignettes/model_evaluation.Rmd +++ b/vignettes/model_evaluation.Rmd @@ -66,6 +66,7 @@ performances = settings %>% lapply(evaluate_target_prediction, ligand_target_mat ``` Step 3: visualize the results: show here different classification evaluation metrics +(Note: Mean-rank gene-set enrichment will only be calculated if limma is installed) ```{r target-prediction-v2-results, fig.width=8, fig.height=8} # Visualize some classification evaluation metrics showing the target gene prediction performance diff --git a/vignettes/model_evaluation.md b/vignettes/model_evaluation.md index fbd4070..2bad270 100644 --- a/vignettes/model_evaluation.md +++ b/vignettes/model_evaluation.md @@ -88,6 +88,7 @@ performances = settings %>% lapply(evaluate_target_prediction, ligand_target_mat Step 3: visualize the results: show here different classification evaluation metrics +(Note: Mean-rank gene-set enrichment will only be calculated if limma is installed) ``` r # Visualize some classification evaluation metrics showing the target gene prediction performance