From 283eaf5c57864de2671a1dddf1809d13af1bfd26 Mon Sep 17 00:00:00 2001 From: DavZim Date: Fri, 13 Oct 2023 11:31:33 +0200 Subject: [PATCH] rename args fail_on_ to stop_on_; add stop_on_fail; add tests; closes #8 --- DESCRIPTION | 2 +- NEWS.md | 8 +++ R/check_data.R | 30 ++++++----- R/rule.R | 7 ++- README.Rmd | 6 +-- README.md | 6 +-- man/check_data.Rd | 11 ++-- man/rule.Rd | 7 ++- tests/testthat/test-check_data.R | 87 ++++++++++++++++++++++++++++++-- 9 files changed, 133 insertions(+), 31 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 009b166..8961994 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: dataverifyr Type: Package Title: A Lightweight, Flexible, and Fast Data Validation Package that Can Handle All Sizes of Data -Version: 0.1.6.9002 +Version: 0.1.6.9003 Authors@R: c( person(given = "David", family = "Zimmermann-Kollenda", diff --git a/NEWS.md b/NEWS.md index 5dd07f8..8d3d2f9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ + +# dataverifyr 0.1.7 + +* rename args `fail_on_X` to `stop_on_X` +* adds `stop_on_fail` to [`check_data()`], so that the examples using `read_custom()` make sense (eg in the Readme); thanks [FedericoComoglio](https://github.com/FedericoComoglio) for pointing it out! +* fix bug when dplyr backend is used + add tests; thanks [FedericoComoglio](https://github.com/FedericoComoglio) for reporting the bug! +* expose [`detect_backend()`] to allow user the check which backend is used + # dataverifyr 0.1.6 * allow multiline rules in yaml file diff --git a/R/check_data.R b/R/check_data.R index 853ab1a..89dd000 100644 --- a/R/check_data.R +++ b/R/check_data.R @@ -4,8 +4,9 @@ #' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection) #' @param rules a list of [`rule`]s #' @param xname optional, a name for the x variable (only used for errors) -#' @param fail_on_warn if the function should throw an error on a warning -#' @param fail_on_error if the function should throw an error on a failed rule +#' @param stop_on_fail when any of the rules fail, throw an error with stop +#' @param stop_on_warn when a warning is found in the code execution, throw an error with stop +#' @param stop_on_error when an error is found in the code execution, throw an error with stop #' #' @return a data.frame-like object with one row for each rule and its results #' @export @@ -22,7 +23,8 @@ #' #' check_data(mtcars, rs) check_data <- function(x, rules, xname = deparse(substitute(x)), - fail_on_warn = FALSE, fail_on_error = FALSE) { + stop_on_fail = FALSE, stop_on_warn = FALSE, + stop_on_error = FALSE) { # if rules is a yaml file, read it in if (length(rules) == 1 && is.character(rules)) rules <- read_rules(rules) @@ -42,19 +44,19 @@ check_data <- function(x, rules, xname = deparse(substitute(x)), res <- check_(x, rules, backend = backend) - # fails on warning and/or error - if (fail_on_warn && any(res$warn != "") || - fail_on_error && any(res$error != "")) { - warn <- fail_on_warn && any(res$warn != "") - err <- fail_on_error && any(res$error != "") + # stops on fail, warning and/or error + fail <- stop_on_fail && any(res$fail != 0) + warn <- stop_on_warn && any(res$warn != "") + err <- stop_on_error && any(res$error != "") - txt <- sprintf("In dataset '%s' found %s%s%s", - xname, - if (warn) sprintf("%s warnings", sum(res$warn != "")), - if (warn && err) " and ", - if (err) sprintf("%s errors", sum(res$error != ""))) - stop(txt) + if (fail || warn || err) { + tt <- paste(c( + if (fail) sprintf("%s rule fails", sum(res$fail != 0)), + if (warn) sprintf("%s warnings", sum(res$warn != "")), + if (err) sprintf("%s errors", sum(res$error != "")) + ), collapse = ", ") + stop(sprintf("In dataset '%s' found %s", xname, tt)) } res diff --git a/R/rule.R b/R/rule.R index aefe8ba..2a3c516 100644 --- a/R/rule.R +++ b/R/rule.R @@ -7,7 +7,12 @@ #' [arrow documentation](https://arrow.apache.org/docs/r/reference/acero.html#function-mappings)). #' The expression can be given as a string as well. #' @param name an optional name for the rule for reference -#' @param allow_na does the rule allow for NA values? default value is FALSE +#' @param allow_na does the rule allow for NA values in the data? default value is FALSE. +#' Note that when NAs are introduced in the expression, `allow_na` has no effect. +#' Eg when the rule `as.numeric(vs) %in% c(0, 1)` finds the values of `vs` as +#' `c("1", "A")`, the rule will throw a fail regardless of the value of `allow_na` +#' as the NA is introduced in the expression and is not found in the original data. +#' However, when the values of `vs` are `c("1", NA)`, `allow_na` will have an effect. #' @param negate is the rule negated, only applies to the expression not allow_na, #' that is, if `expr = mpg > 10`, `allow_na = TRUE`, and `negate = TRUE`, it would #' match all `mpg <= 10` as well as NAs. diff --git a/README.Rmd b/README.Rmd index 3d5dfeb..469b53f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -110,7 +110,7 @@ read_custom <- function(file, rules) { data <- read.csv(file) # or however you read in your data # if the check_data detects a fail: the read_custom function will stop check_data(data, rules, xname = file, - fail_on_error = TRUE, fail_on_warn = TRUE) + stop_on_fail = TRUE, stop_on_warn = TRUE, stop_on_error = TRUE) # ... data } @@ -119,8 +119,8 @@ data <- read_custom("correct_data.csv", rules) # an error is thrown when warnings or errors are found data <- read_custom("wrong_data.csv", rules) -#> Error in check_data(data, rules, fail_on_error = TRUE, fail_on_warn = TRUE) : -#> In dataset 'wrong_data.csv' found 1 warnings and 1 errors +#> Error in check_data(data, rules, stop_on_fail = TRUE, stop_on_error = TRUE, stop_on_warn = TRUE) : +#> In dataset 'wrong_data.csv' found 2 rule fails, 1 warnings, 1 errors ``` diff --git a/README.md b/README.md index 9f64be2..f84e3c4 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ read_custom <- function(file, rules) { data <- read.csv(file) # or however you read in your data # if the check_data detects a fail: the read_custom function will stop check_data(data, rules, xname = file, - fail_on_error = TRUE, fail_on_warn = TRUE) + stop_on_fail = TRUE, stop_on_warn = TRUE, stop_on_error = TRUE) # ... data } @@ -171,8 +171,8 @@ data <- read_custom("correct_data.csv", rules) # an error is thrown when warnings or errors are found data <- read_custom("wrong_data.csv", rules) -#> Error in check_data(data, rules, fail_on_error = TRUE, fail_on_warn = TRUE) : -#> In dataset 'wrong_data.csv' found 1 warnings and 1 errors +#> Error in check_data(data, rules, stop_on_fail = TRUE, stop_on_error = TRUE, stop_on_warn = TRUE) : +#> In dataset 'wrong_data.csv' found 2 rule fails, 1 warnings, 1 errors ``` ## Backends diff --git a/man/check_data.Rd b/man/check_data.Rd index c96bb7d..f58cf97 100644 --- a/man/check_data.Rd +++ b/man/check_data.Rd @@ -8,8 +8,9 @@ check_data( x, rules, xname = deparse(substitute(x)), - fail_on_warn = FALSE, - fail_on_error = FALSE + stop_on_fail = FALSE, + stop_on_warn = FALSE, + stop_on_error = FALSE ) } \arguments{ @@ -20,9 +21,11 @@ check_data( \item{xname}{optional, a name for the x variable (only used for errors)} -\item{fail_on_warn}{if the function should throw an error on a warning} +\item{stop_on_fail}{when any of the rules fail, throw an error with stop} -\item{fail_on_error}{if the function should throw an error on a failed rule} +\item{stop_on_warn}{when a warning is found in the code execution, throw an error with stop} + +\item{stop_on_error}{when an error is found in the code execution, throw an error with stop} } \value{ a data.frame-like object with one row for each rule and its results diff --git a/man/rule.Rd b/man/rule.Rd index 8e014b1..ddbdcb6 100644 --- a/man/rule.Rd +++ b/man/rule.Rd @@ -19,7 +19,12 @@ The expression can be given as a string as well.} \item{name}{an optional name for the rule for reference} -\item{allow_na}{does the rule allow for NA values? default value is FALSE} +\item{allow_na}{does the rule allow for NA values in the data? default value is FALSE. +Note that when NAs are introduced in the expression, \code{allow_na} has no effect. +Eg when the rule \code{as.numeric(vs) \%in\% c(0, 1)} finds the values of \code{vs} as +\code{c("1", "A")}, the rule will throw a fail regardless of the value of \code{allow_na} +as the NA is introduced in the expression and is not found in the original data. +However, when the values of \code{vs} are \code{c("1", NA)}, \code{allow_na} will have an effect.} \item{negate}{is the rule negated, only applies to the expression not allow_na, that is, if \code{expr = mpg > 10}, \code{allow_na = TRUE}, and \code{negate = TRUE}, it would diff --git a/tests/testthat/test-check_data.R b/tests/testthat/test-check_data.R index d11d49e..dcbc139 100644 --- a/tests/testthat/test-check_data.R +++ b/tests/testthat/test-check_data.R @@ -9,7 +9,7 @@ rules <- ruleset( rule(cyl %in% c(4, 6, 8), "r1"), rule(mpg < 10 & mpg > 34, "r2", negate = TRUE), rule(disp > 100, "r3", allow_na = TRUE), # data validation "fails" - rule(as.numeric(hp) > 0 & as.numeric(hp) < 400, "r4"), # creates warning + 1 NA + rule(as.numeric(hp) > 0 & as.numeric(hp) < 400, "r4"), # creates warning + 1 NA -> result in fail rule(does_not_exist %in% c("a", "b", "c"), "r5") # creates a stop ) @@ -210,7 +210,86 @@ test_that("Test extra functionality", { expect_equal(res[, !"time"], res2[, !"time"]) unlink(rule_file) - # fails on warn and fail on error work ===== - expect_error(check_data(data, rules, fail_on_warn = TRUE)) - expect_error(check_data(data, rules, fail_on_error = TRUE)) + # stop on warn and fail on error work ===== + expect_error(check_data(data, rules, stop_on_fail = TRUE)) + expect_error(check_data(data, rules, stop_on_warn = TRUE)) + expect_error(check_data(data, rules, stop_on_error = TRUE)) +}) + +test_that("Special case where a warning with allowed missing values returned a fail", { + rules <- ruleset( + rule(as.numeric(vs) %in% c(0, 1), allow_na = TRUE) + # conversion will introduce warning but allow_na should pass it + ) + data <- mtcars + data$vs <- as.character(data$vs) + data$vs[1] <- "asd" + + res <- check_data(data, rules) + expect_equal(res$fail, 1) + expect_equal(res$warn, "NAs introduced by coercion") +}) + + +test_that("Extra tests for stop_on_fail with custom reader", { + rules <- ruleset( + rule(mpg > 10 & mpg < 30), # mpg goes up to 34 + rule(cyl %in% c(4, 8)), # missing 6 cyl + rule(as.numeric(vs) %in% c(0, 1), allow_na = TRUE) # conversion can introduce warning + ) + + read_custom <- function(file, rules) { + data <- read.csv(file) + # expected: if the check_data detects a fail: the read_custom function will stop + check_data(data, rules, xname = file, + stop_on_fail = TRUE, stop_on_warn = TRUE, stop_on_error = TRUE) + data + } + + d <- mtcars + d$name <- rownames(d) + rownames(d) <- NULL + + # normal use case, no fails, warnings, errors + data_ok <- d[d$mpg <= 30 & d$cyl != 6, ] + rownames(data_ok) <- NULL + + file_ok <- tempfile(fileext = ".csv") + write.csv(data_ok, file_ok, row.names = FALSE) + + data_ok_got <- read_custom(file_ok, rules) + expect_equal(data_ok_got, data_ok) + + # fail use case, no warnings, errors + data_fail <- d + file_fail <- tempfile(fileext = ".csv") + write.csv(data_fail, file_fail, row.names = FALSE) + + expect_error( + read_custom(file_fail, rules), + "In dataset '.*' found 2 rule fails" + ) + + # warn use case, no fails, no errors + data_warn <- data_ok + data_warn$vs <- as.character(data_warn$vs) + data_warn$vs[3] <- "asd" # will throw warning + + file_warn <- tempfile(fileext = ".csv") + write.csv(data_warn, file_warn, row.names = FALSE) + + # see `allow_na` in `?rule` for an explanation why this fails + expect_error( + read_custom(file_warn, rules), + "In dataset '.*' found 1 rule fails, 1 warnings" + ) + + # error use case results in rule fails as well, no warnings + rules_error <- ruleset( + rule(stop("Not going to work...")) + ) + expect_error( + read_custom(file_ok, rules_error), + "In dataset '.*' found 1 rule fails, 1 errors" + ) })