From 33044148929d3f28cbbc064f3c4269745a00300f Mon Sep 17 00:00:00 2001 From: mrustl Date: Thu, 4 Jun 2020 23:38:21 +0200 Subject: [PATCH] update cleaning --- NAMESPACE | 1 + R/.read_budget-files.R | 104 ++++++++++++++++------------- R/get_costs_by_work_package.R | 14 ++-- R/read_partner_budget_from_excel.R | 5 +- man/append_zero_costs.Rd | 6 +- man/get_costs_by_work_package.Rd | 6 +- 6 files changed, 76 insertions(+), 60 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f0431b0..f681263 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ importFrom(fs,dir_create) importFrom(fs,file_copy) importFrom(kwb.utils,catAndRun) importFrom(kwb.utils,defaultIfNA) +importFrom(kwb.utils,extractSubstring) importFrom(kwb.utils,getAttribute) importFrom(kwb.utils,noFactorDataFrame) importFrom(kwb.utils,rbindAll) diff --git a/R/.read_budget-files.R b/R/.read_budget-files.R index 65d10d7..378f8ae 100644 --- a/R/.read_budget-files.R +++ b/R/.read_budget-files.R @@ -40,13 +40,6 @@ if (FALSE) #path <- "proposals/bmbf_digital/Previous-projects/Budget" path <- "proposals/h2020_covid/60_Budget/10_Filled_out_forms" - tdir_root <- kwb.nextcloud:::create_download_dir("nextcloud_") - tdir_forms <- file.path(tdir_root, "10_filled_out_forms") - tdir_summary <- file.path(tdir_root, "20_Summary_Files") - - dir.create(tdir_forms) - dir.create(tdir_summary) - # 2) Generate budget files metadata (in order to detect updates, required for # automation) # List files recursively (only xlsx or csv files) @@ -61,6 +54,14 @@ if (FALSE) #View(file_info_latest) # 3) Upload file metadata to cloud (only once!) + tdir_root <- kwb.nextcloud:::create_download_dir("nextcloud_") + tdir_forms <- file.path(tdir_root, "10_filled_out_forms") + tdir_summary <- file.path(tdir_root, "20_Summary_Files") + + dir.create(tdir_forms) + dir.create(tdir_summary) + + is_this_the_first_time <- FALSE if (is_this_the_first_time) { @@ -126,6 +127,7 @@ if (FALSE) # Download the corresponding files to a temp folder below ~/../Downloads system.time( downloaded_files <- kwb.nextcloud:::download_files( + #href = file_info_latest$href, paths = full_paths, target_dir = tdir_forms ) @@ -162,28 +164,31 @@ if (FALSE) print(has_error) table(has_error) + stopifnot(all(!has_error)) # Exclude elements that caused errors costs_list <- costs_list[! has_error] + + # Get partner metadata (for DWH proposal) + columns <- c("partner_id", "partner_name_short", "partner_type", "country") + + partner_info <- read_partner_info(columns) + + # Transform in dataframe costs <- kwb.utils::rbindAll(costs_list) %>% - dplyr::select(-.data$Country) + dplyr::select(-.data$Country) %>% + dplyr::left_join(partner_info, by = "partner_id") # Table with direct costs by WP - costs_by_wp <- kwb.budget::get_costs_by_work_package(costs_list) - head(costs_by_wp) - View(costs_by_wp) - - # Add reimbursement rate - costs_by_wp <- merge( - costs_by_wp, - costs[, c("partner_short_name", "Reimbursement_rate")], - by.x = "partner", - by.y = "partner_short_name" - ) - - # Add indirect and total costs - costs_by_wp <- costs_by_wp %>% + costs_by_wp <- kwb.budget::get_costs_by_work_package(costs_list, + n_work_packages = 6) %>% + merge(costs[, c("partner_id", "partner_name_short", "partner_type", "country", "Reimbursement_rate")], + by.x = "partner", + by.y = "partner_id" + ) %>% + # Add indirect and total costs + dplyr::rename(partner_id = partner, partner = partner_name_short) %>% dplyr::mutate( Reimbursement_rate = 0.01 * as.numeric( sub("%", "", .data$Reimbursement_rate) @@ -197,7 +202,12 @@ if (FALSE) ), Total_cost = .data$Direct_cost + .data$Indirect_cost, Total_funded_cost = .data$Reimbursement_rate * .data$Total_cost - ) + ) %>% + kwb.utils::moveColumnsToFront(c("partner_id", "partner", "partner_type", "country" + )) + + head(costs_by_wp) + View(costs_by_wp) # used in DWC proposal # # load partner info @@ -207,9 +217,6 @@ if (FALSE) # ) # partner_info <- read.csv2(file_partner_info) - # Get partner metadata (for DWH proposal) - partner_info <- get_partner_info() - # Check if names are the same in the two files before merging check <- costs$partner_short_name %in% partner_info$partner_name_short costs$partner_short_name[! check] @@ -218,44 +225,41 @@ if (FALSE) costs_data <- merge(costs, partner_info, by = "partner_id") head(costs_data) - ### -> this merge is not possible because partners changed "short_name in - # "budget" excel file - costs_data_by_wp <- merge( - costs_by_wp, partner_info, - by.x = "partner", - by.y = "Partner_short_name" - ) - - head(costs_data_by_wp) + head(costs_by_wp) # Prepare simplified table with costs - costs_data_short <- prepare_cost_data_short(costs_data) - costs_data_short %>% + costs_short <- prepare_cost_data_short(costs) + + costs_short %>% dplyr::select(.data$partner_name_short, .data$Total_funded_cost) %>% dplyr::arrange(dplyr::desc(.data$Total_funded_cost)) # Prepare table with person month for each wp - pm_data_by_wp <- costs_data_by_wp %>% + pm_data_by_wp <- costs_by_wp %>% dplyr::select(partner, wp, person_months.personnel) %>% tidyr::spread(wp, person_months.personnel) # Merge with simplified table withz costs costs_data_short <- merge( - costs_data_short, pm_data_by_wp, + costs_short, pm_data_by_wp, by.x = "partner_short_name", - by.y = "partner" + by.y = "partner", + all = TRUE ) %>% - select(-Total_funded_cost, Total_funded_cost) + dplyr::select(-Total_funded_cost, Total_funded_cost) # Prepare table with costs by company type - costs_data_by_type <- prepare_cost_data_by_type(costs_data_short) - costs_data_by_type + costs_data_by_type <- costs %>% + dplyr::rename(Type = .data$partner_type) %>% + prepare_cost_data_by_type() ### Save outputs version_num <- 7 - folder_out <- paste0(folder_bugdet, "20_Summary_Files") - file_out <- file.path(folder_out, paste0("V", version_num, "_DWC_Costs")) + file_out <- file.path( + tdir_summary, + sprintf("DWC_partner-budget_v%d", version_num) + ) write.csv2(costs_data, paste0(file_out, ".csv")) save(costs_data, file = paste0(file_out, ".rdata")) @@ -412,17 +416,21 @@ prepare_cost_data_short <- function(costs_data) } # read_partner_info ------------------------------------------------------------ -read_partner_info <- function() +read_partner_info <- function(columns = NULL) { path <- "proposals/h2020_covid/30_Partners/DWH_Partners-LOI-EAB_List.xlsx" - xls_partners <- kwb.nextcloud::download_files(paths = path) + path_partners <- kwb.nextcloud::download_files(paths = path) result <- openxlsx::read.xlsx( xlsxFile = path_partners, sheet = "Partners-PIC-Main contact" ) + if (! is.null(columns)) { + result <- kwb.utils::selectColumns(result, columns) + } + structure(result, path_partners = path_partners) } @@ -430,7 +438,7 @@ read_partner_info <- function() download_partner_budget_files <- function() { file_info <- kwb.nextcloud::list_files( - "proposals/h2020_covid/60_Budget", + "proposals/h2020_covid/60_Budget/10_Filled_out_forms", pattern = "DWH_partner-budget_\\d\\d", recursive = TRUE, full_info = TRUE diff --git a/R/get_costs_by_work_package.R b/R/get_costs_by_work_package.R index 76f33f1..bf6dd21 100644 --- a/R/get_costs_by_work_package.R +++ b/R/get_costs_by_work_package.R @@ -1,26 +1,28 @@ ##' Helper function: append zero costs ##' @param x x +#' @param n_work_packages number of work packages in EXCEL template +#' (default: 7, as used for DWC) ##' @return data frame with zero costs ##' @export ##' @importFrom kwb.utils noFactorDataFrame safeRowBind ##' -append_zero_costs <- function(x) { +append_zero_costs <- function(x, n_work_packages) { kwb.utils::safeRowBind(x, kwb.utils::noFactorDataFrame( - partner = unique(x$partner), wp = 1:7, cost = 0 + partner = unique(x$partner), wp = seq_len(n_work_packages), cost = 0 )) } ##' Get Costs by Work Package ##' @param costs_list list with "costs" from multiple partner Excelsheets -##' +#' @param n_work_packages number of work packages in EXCEL template +#' (default: 7, as used for DWC) ##' @return data frame with costs per work package ##' @export -##' ##' @importFrom kwb.utils defaultIfNA getAttribute rbindAll ##' @importFrom dplyr group_by summarise ##' @importFrom rlang .data ##' -get_costs_by_work_package <- function(costs_list) +get_costs_by_work_package <- function(costs_list, n_work_packages = 7) { # costs_list must be a list stopifnot(is.list(costs_list)) @@ -30,7 +32,7 @@ get_costs_by_work_package <- function(costs_list) collect_lines_with_work_package <- function(x, name) { result <- lapply(x, kwb.utils::getAttribute, name) - result <- lapply(result, append_zero_costs) + result <- lapply(result, append_zero_costs, n_work_packages) result <- kwb.utils::rbindAll(result) result$cost <- kwb.utils::defaultIfNA(result$cost, 0) result[! is.na(result$wp), ] diff --git a/R/read_partner_budget_from_excel.R b/R/read_partner_budget_from_excel.R index f3ade3b..f124e20 100644 --- a/R/read_partner_budget_from_excel.R +++ b/R/read_partner_budget_from_excel.R @@ -6,7 +6,7 @@ #' @param dbg debug message (default: TRUE) #' @return list with imported EXCEL budget file data #' @export -#' @importFrom kwb.utils noFactorDataFrame renameAndSelect removeColumns toLookupTable +#' @importFrom kwb.utils noFactorDataFrame renameAndSelect removeColumns toLookupTable extractSubstring #' @importFrom stringr str_extract #' read_partner_budget_from_excel <- function( @@ -36,6 +36,7 @@ read_partner_budget_from_excel <- function( budget <- kwb.utils::noFactorDataFrame( filename = filename, partner_id = as.numeric(stringr::str_extract(filename, "[0-9][0-9]")), + #Participant = kwb.utils::extractSubstring("_([^_]+)\\.xlsx$", filename, 1), Participant = general$partner_short_name, Country = "", #Direct_personnel_costs = sum(ranges$range_personnel[["Cost (EUR)"]]), @@ -91,7 +92,7 @@ read_partner_budget_from_excel <- function( ))) bind_partner <- function(x) { - cbind(kwb.utils::noFactorDataFrame(partner = general$partner_short_name), x) + cbind(kwb.utils::noFactorDataFrame(partner = budget$partner_id), x) } structure( diff --git a/man/append_zero_costs.Rd b/man/append_zero_costs.Rd index 11f01ca..2ad6915 100644 --- a/man/append_zero_costs.Rd +++ b/man/append_zero_costs.Rd @@ -4,10 +4,12 @@ \alias{append_zero_costs} \title{Helper function: append zero costs} \usage{ -append_zero_costs(x) +append_zero_costs(x, n_work_packages) } \arguments{ -\item{x}{x} +\item{x}{x +@param n_work_packages number of work packages in EXCEL template +(default: 7, as used for DWC)} } \value{ data frame with zero costs diff --git a/man/get_costs_by_work_package.Rd b/man/get_costs_by_work_package.Rd index 0510c80..c33053a 100644 --- a/man/get_costs_by_work_package.Rd +++ b/man/get_costs_by_work_package.Rd @@ -4,10 +4,12 @@ \alias{get_costs_by_work_package} \title{Get Costs by Work Package} \usage{ -get_costs_by_work_package(costs_list) +get_costs_by_work_package(costs_list, n_work_packages = 7) } \arguments{ -\item{costs_list}{list with "costs" from multiple partner Excelsheets} +\item{costs_list}{list with "costs" from multiple partner Excelsheets +@param n_work_packages number of work packages in EXCEL template +(default: 7, as used for DWC)} } \value{ data frame with costs per work package