From f0e718f6ffb6e4ad2c57d8911e37c02c7be68d7b Mon Sep 17 00:00:00 2001 From: DavZim Date: Tue, 10 Oct 2023 16:38:24 +0200 Subject: [PATCH] expose detect_backend function and add test, also fixes #7 --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/check_data.R | 83 +++++++++++++++++++--------- R/filters.R | 8 +-- README.Rmd | 1 + README.md | 5 +- man/check_data.Rd | 3 + man/detect_backend.Rd | 27 +++++++++ tests/testthat/test-check_data.R | 14 ++--- tests/testthat/test-detect_backend.R | 67 ++++++++++++++++++++++ 10 files changed, 170 insertions(+), 41 deletions(-) create mode 100644 man/detect_backend.Rd create mode 100644 tests/testthat/test-detect_backend.R diff --git a/DESCRIPTION b/DESCRIPTION index 3c1f39e..009b166 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: dataverifyr Type: Package Title: A Lightweight, Flexible, and Fast Data Validation Package that Can Handle All Sizes of Data -Version: 0.1.6.9001 +Version: 0.1.6.9002 Authors@R: c( person(given = "David", family = "Zimmermann-Kollenda", diff --git a/NAMESPACE b/NAMESPACE index 01495cd..5580940 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(print,rule) S3method(print,ruleset) export(check_data) +export(detect_backend) export(filter_fails) export(plot_res) export(read_rules) diff --git a/R/check_data.R b/R/check_data.R index b7514c0..853ab1a 100644 --- a/R/check_data.R +++ b/R/check_data.R @@ -10,6 +10,8 @@ #' @return a data.frame-like object with one row for each rule and its results #' @export #' +#' @seealso [detect_backend()] +#' #' @examples #' rs <- ruleset( #' rule(mpg > 10), @@ -27,18 +29,18 @@ check_data <- function(x, rules, xname = deparse(substitute(x)), # treat single rule if needed if (hasName(rules, "expr")) rules <- ruleset(rules) - type <- detect_type(class(x)) + backend <- detect_backend(x) # make sure the input dataset has the right class if (class(x)[[1]] == "data.frame") { - if (type == "data.table") { + if (backend == "data.table") { x <- data.table::as.data.table(x) - } else if (type == "dplyr") { + } else if (backend == "dplyr") { x <- dplyr::as_tibble(x) } } - res <- check_(x, rules, type = type) + res <- check_(x, rules, backend = backend) # fails on warning and/or error if (fail_on_warn && any(res$warn != "") || @@ -58,39 +60,66 @@ check_data <- function(x, rules, xname = deparse(substitute(x)), res } -# helper function that detects which type the request should use: -# returns either: base-r, data.table, collectibles (any DBI or arrow backend) -detect_type <- function(cc) { +#' Detects the backend which will be used for checking the rules +#' +#' @description +#' The detection will be made based on the class of the object as well as the packages installed. +#' For example, if a `data.frame` is used, it will look if `data.table` or `dplyr` are installed on the system, as they provide more speed. +#' Note the main functions will revert the +#' +#' @param x The data object, ie a data.frame, tibble, data.table, arrow, or DBI object +#' +#' @return a single character element with the name of the backend to use. +#' One of `base-r`, `data.table`, `dplyr`, `collectibles` (for arrow or DBI objects) +#' @export +#' +#' @seealso [check_data()] +#' @examples +#' data <- mtcars +#' detect_backend(data) +detect_backend <- function(x) { + cc <- class(x) if ("data.table" %in% cc) { if (!has_pkg("data.table")) stop("The data.table package needs to be installed in order to test a data.table OR you can convert the data to a data.frame first!") - type <- "data.table" + backend <- "data.table" - } else if ("tibble" %in% cc) { + } else if (any(c("tibble", "tbl_df") %in% cc)) { if (!has_pkg("dplyr")) stop("The dplyr package needs to be installed in order to test a tibble OR you can convert the data to a data.frame first!") - type <- "dplyr" + backend <- "dplyr" } else if (length(cc) == 1 && cc == "data.frame") { if (has_pkg("data.table")) { - type <- "data.table" + backend <- "data.table" } else if (has_pkg("dplyr")) { - type <- "dplyr" + backend <- "dplyr" } else { - type <- "base-r" + backend <- "base-r" } - } else if (any(c("tbl_sql", "ArrowObject") %in% cc)) { + } else if ("tbl_sql" %in% cc) { + + if (!has_pkg("DBI")) + stop("The DBI package needs to be installed in order to test a tbl_sql.") + + backend <- "collectibles" - type <- "collectibles" + } else if ("ArrowObject" %in% cc) { + + if (!has_pkg("arrow")) + stop("The arrow package needs to be installed in order to test an ArrowObject.") + + backend <- "collectibles" } else { + stop(sprintf(paste("Unknown class of x found: '%s'.", "x must be a data.frame/tibble/data.table or a tbl (SQL table) or ArrowObject."), paste(cc, collapse = ", "))) } - type + backend } # small helper to check if a package is installed @@ -104,19 +133,19 @@ get_warnings <- function(code) { strip_dplyr_errors(paste(unique(out), collapse = ", ")) } -# helper function that checks the rules given a specific type (~package) -check_ <- function(x, rules, type = c("base-r", "dplyr", "data.table", "collectibles")) { +# helper function that checks the rules given a specific backend (~package) +check_ <- function(x, rules, backend = c("base-r", "dplyr", "data.table", "collectibles")) { - type <- match.arg(type) + backend <- match.arg(backend) # function to create a data.frame - to_df <- switch(type, + to_df <- switch(backend, "base-r" = data.frame, dplyr = dplyr::tibble, data.table = data.table::data.table, collectibles = dplyr::tibble) # function to combine multiple data.frames into a single one by row - br <- switch(type, + br <- switch(backend, "base-r" = function(l) do.call(rbind, l), dplyr = dplyr::bind_rows, data.table = data.table::rbindlist, @@ -152,7 +181,7 @@ check_ <- function(x, rules, type = c("base-r", "dplyr", "data.table", "collecti Sys.setenv(NO_COLOR = "OFF") pos <- tryCatch({ warns <- get_warnings({ - pos <- filter_data_(x, type, e) + pos <- filter_data_(x, backend, e) }) pos }, error = function(err) { @@ -185,15 +214,15 @@ check_ <- function(x, rules, type = c("base-r", "dplyr", "data.table", "collecti # internal helper function that filters a dataset x # when return_n = FALSE the data is returned, otherwise the number of rows -filter_data_ <- function(x, type, e, return_n = TRUE) { - if (type == "base-r") { +filter_data_ <- function(x, backend, e, return_n = TRUE) { + if (backend == "base-r") { # note that the nrow(with(x, x[eval(parse(text = e)), ])) # includes NA rows and therefore returns the wrong number of rows pos <- with(x, eval(parse(text = e))) pos <- if (return_n) sum(pos, na.rm = TRUE) else x[pos, ] - } else if (type == "dplyr" | type == "collectibles") { + } else if (backend == "dplyr" | backend == "collectibles") { rr <- dplyr::filter(x, !!str2lang(e)) - if (type == "collectibles") { + if (backend == "collectibles") { if (return_n) { pos <- dplyr::pull( dplyr::collect(dplyr::summarise(rr, n = dplyr::n())), @@ -205,7 +234,7 @@ filter_data_ <- function(x, type, e, return_n = TRUE) { } else { # dplyr pos <- if (return_n) nrow(rr) else rr } - } else if (type == "data.table") { + } else if (backend == "data.table") { pos <- x[eval(parse(text = e)), ] if (return_n) pos <- nrow(pos) } diff --git a/R/filters.R b/R/filters.R index ced8a2a..e2c9582 100644 --- a/R/filters.R +++ b/R/filters.R @@ -48,7 +48,7 @@ filter_fails <- function(res, x, per_rule = FALSE) { } - type <- detect_type(class(x)) + backend <- detect_backend(x) # add negated values e <- ifelse(negated, paste0("!(", eorig, ")"), eorig) @@ -70,14 +70,14 @@ filter_fails <- function(res, x, per_rule = FALSE) { # make sure the input dataset has the right class if (class(x)[[1]] == "data.frame") { - if (type == "data.table" && requireNamespace("data.table", quietly = TRUE)) { + if (backend == "data.table" && requireNamespace("data.table", quietly = TRUE)) { x <- data.table::as.data.table(x) - } else if (type == "dplyr" && requireNamespace("dplyr", quietly = TRUE)) { + } else if (backend == "dplyr" && requireNamespace("dplyr", quietly = TRUE)) { x <- dplyr::as_tibble(x) } } - ll <- lapply(e, filter_data_, x = x, type = type, return_n = FALSE) + ll <- lapply(e, filter_data_, x = x, backend = backend, return_n = FALSE) if (per_rule) { ll <- setNames(ll, eorig)[sapply(ll, nrow) != 0] } else { diff --git a/README.Rmd b/README.Rmd index 938561a..3d5dfeb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -129,6 +129,7 @@ data <- read_custom("wrong_data.csv", rules) At the moment the following backends are supported. Note that they are automatically chosen based on data type and package availability. Eg, when the dataset is a `dplyr::tbl()` connected to an `SQLite` database, the package will automatically choose `RSQLite`/`DBI`/`dbplyr` for the task. +To see which backend `dataverifyr` would use for a task, you can use `detect_backend(data)`. ```{r backends, echo=FALSE, results="asis"} # setup the table of backends in R... base markdown is not nice with this formatting... diff --git a/README.md b/README.md index ad9855b..9f64be2 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,8 @@ At the moment the following backends are supported. Note that they are automatically chosen based on data type and package availability. Eg, when the dataset is a `dplyr::tbl()` connected to an `SQLite` database, the package will automatically choose `RSQLite`/`DBI`/`dbplyr` for the -task. +task. To see which backend `dataverifyr` would use for a task, you can +use `detect_backend(data)`. @@ -610,7 +611,7 @@ res #> # A tibble: 3 × 10 #> name expr allow_na negate tests pass fail warn error time #> -#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE FALSE 32 28 4 "" "" 3.5227668 secs +#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE FALSE 32 28 4 "" "" 1.232728 secs #> 2 Rule for: cyl cyl %in% c(4, 8) FALSE FALSE 32 25 7 "" "" 0.2015200 secs #> 3 Rule for: vs vs %in% c(0, 1) TRUE FALSE 32 32 0 "" "" 0.1898661 secs diff --git a/man/check_data.Rd b/man/check_data.Rd index 16e81cc..c96bb7d 100644 --- a/man/check_data.Rd +++ b/man/check_data.Rd @@ -40,3 +40,6 @@ rs check_data(mtcars, rs) } +\seealso{ +\code{\link[=detect_backend]{detect_backend()}} +} diff --git a/man/detect_backend.Rd b/man/detect_backend.Rd new file mode 100644 index 0000000..42d61d6 --- /dev/null +++ b/man/detect_backend.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_data.R +\name{detect_backend} +\alias{detect_backend} +\title{Detects the backend which will be used for checking the rules} +\usage{ +detect_backend(x) +} +\arguments{ +\item{x}{The data object, ie a data.frame, tibble, data.table, arrow, or DBI object} +} +\value{ +a single character element with the name of the backend to use. +One of \code{base-r}, \code{data.table}, \code{dplyr}, \code{collectibles} (for arrow or DBI objects) +} +\description{ +The detection will be made based on the class of the object as well as the packages installed. +For example, if a \code{data.frame} is used, it will look if \code{data.table} or \code{dplyr} are installed on the system, as they provide more speed. +Note the main functions will revert the +} +\examples{ +data <- mtcars +detect_backend(data) +} +\seealso{ +\code{\link[=check_data]{check_data()}} +} diff --git a/tests/testthat/test-check_data.R b/tests/testthat/test-check_data.R index 6ed4614..d11d49e 100644 --- a/tests/testthat/test-check_data.R +++ b/tests/testthat/test-check_data.R @@ -14,7 +14,7 @@ rules <- ruleset( ) test_that("base-r check_ works", { - res <- check_(data, rules, type = "base-r") + res <- check_(data, rules, backend = "base-r") expect_equal(class(res), "data.frame") exp <- data.frame( @@ -36,7 +36,7 @@ test_that("dplyr check_ works", { skip_if_not(requireNamespace("dplyr", quietly = TRUE), "dplyr must be installed to test the functionality") - res <- check_(dplyr::tibble(data), rules, type = "dplyr") + res <- check_(dplyr::tibble(data), rules, backend = "dplyr") expect_equal(class(res), c("tbl_df", "tbl", "data.frame")) exp <- dplyr::tibble( @@ -58,7 +58,7 @@ test_that("data.table check_ works", { skip_if_not(requireNamespace("data.table", quietly = TRUE), "data.table must be installed to test the functionality") - res <- check_(data.table::as.data.table(data), rules, type = "data.table") + res <- check_(data.table::as.data.table(data), rules, backend = "data.table") expect_equal(class(res), c("data.table", "data.frame")) exp <- data.table::data.table( @@ -85,7 +85,7 @@ test_that("arrow::arrow_table check_ works", { skip_if_not(requireNamespace("arrow", quietly = TRUE), "arrow must be installed to test the functionality") - res <- check_(arrow::arrow_table(data), rules, type = "collectibles") + res <- check_(arrow::arrow_table(data), rules, backend = "collectibles") expect_equal(class(res), c("tbl_df", "tbl", "data.frame")) exp <- dplyr::tibble( @@ -111,7 +111,7 @@ test_that("arrow::open_dataset check_ works", { arrow::write_dataset(data, temp) ds <- arrow::open_dataset(temp) - res <- check_(ds, rules, type = "collectibles") + res <- check_(ds, rules, backend = "collectibles") expect_equal(class(res), c("tbl_df", "tbl", "data.frame")) exp <- dplyr::tibble( @@ -141,7 +141,7 @@ test_that("sqlite (RSQLite) check_ works", { tbl <- dplyr::tbl(con, "data") - res <- check_(tbl, rules, type = "collectibles") + res <- check_(tbl, rules, backend = "collectibles") expect_equal(class(res), c("tbl_df", "tbl", "data.frame")) exp <- dplyr::tibble( @@ -175,7 +175,7 @@ test_that("duckdb check_ works", { tbl <- dplyr::tbl(con, "data") - res <- check_(tbl, rules, type = "collectibles") + res <- check_(tbl, rules, backend = "collectibles") expect_equal(class(res), c("tbl_df", "tbl", "data.frame")) exp <- dplyr::tibble( diff --git a/tests/testthat/test-detect_backend.R b/tests/testthat/test-detect_backend.R new file mode 100644 index 0000000..67341af --- /dev/null +++ b/tests/testthat/test-detect_backend.R @@ -0,0 +1,67 @@ +library(testthat) +library(dataverifyr) + +test_that("detect_backend works as expected", { + data <- mtcars + + # with no other packages installed, use base-r + pkgs <- NULL + local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr") + expect_equal(detect_backend(data), "base-r") + + # use data.table if installed + pkgs <- "data.table" + expect_equal(detect_backend(data), "data.table") + + # use dplyr if no data.table is installed + pkgs <- "dplyr" + expect_equal(detect_backend(data), "dplyr") + + # use data.table if both dplyr and data.table are found + pkgs <- c("dplyr", "data.table") + expect_equal(detect_backend(data), "data.table") + + + # test dplyr dataset + class(data) <- c("tbl_df", "tbl", "data.frame") + pkgs <- NULL + expect_error(detect_backend(data), "The dplyr package needs to be installed") + + # if tibble is given but no dplyr is installed -> error + pkgs <- "data.table" + expect_error(detect_backend(data), "The dplyr package needs to be installed") + + pkgs <- "dplyr" + expect_equal(detect_backend(data), "dplyr") + + + # test data.table dataset + class(data) <- c("data.table", "data.frame") + pkgs <- NULL + expect_error(detect_backend(data), "The data.table package needs to be installed") + + # if data.table is given but no data.table is installed -> error + pkgs <- "dplyr" + expect_error(detect_backend(data), "The data.table package needs to be installed") + + pkgs <- "data.table" + expect_equal(detect_backend(data), "data.table") + + + # test arrow dataset + class(data) <- c("FileSystemDataset", "Dataset", "ArrowObject", "R6") + pkgs <- NULL + expect_error(detect_backend(data), "The arrow package needs to be installed") + + pkgs <- "arrow" + expect_equal(detect_backend(data), "collectibles") + + + # test DBI dataset + class(data) <- c("tbl_SQLiteConnection", "tbl_dbi", "tbl_sql", "tbl_lazy", "tbl") + pkgs <- NULL + expect_error(detect_backend(data), "The DBI package needs to be installed") + + pkgs <- "DBI" + expect_equal(detect_backend(data), "collectibles") +})