From 8466ce0181234353b7778ab3bd8e8e0f0ad8d3bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 11 Jul 2024 18:42:25 +0200 Subject: [PATCH] as_duckplyr_tibble() --- NAMESPACE | 1 + NEWS.md | 1 + R/as_duckplyr_df.R | 17 ++++++++++------- R/as_duckplyr_tibble.R | 13 +++++++++++++ R/io-.R | 4 ++++ README.Rmd | 14 +++++++------- README.md | 14 +++++++------- man/as_duckplyr_df.Rd | 21 +++++++++++++++++---- man/df_from_file.Rd | 6 ++++-- tests/testthat/test-as_duckplyr_tibble.R | 7 +++++++ 10 files changed, 71 insertions(+), 27 deletions(-) create mode 100644 R/as_duckplyr_tibble.R create mode 100644 tests/testthat/test-as_duckplyr_tibble.R diff --git a/NAMESPACE b/NAMESPACE index d0305ef31..fd2b5b0bd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,6 +107,7 @@ export(anti_join) export(any_of) export(arrange) export(as_duckplyr_df) +export(as_duckplyr_tibble) export(as_tibble) export(between) export(bind_cols) diff --git a/NEWS.md b/NEWS.md index 305d3d34e..2687267f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ ## Features - `df_from_file()` and related functions support multiple files (#194, #195), show a clear error message for non-string `path` arguments (#182), and create a tibble by default (#177). +- New `as_duckplyr_tibble()` to convert a data frame to a duckplyr tibble (#177). - Support descending sort for character and other non-numeric data (@toppyy, #92, #175). - Avoid setting memory limit (#193). - Check compatibility of join columns (#168, #185). diff --git a/R/as_duckplyr_df.R b/R/as_duckplyr_df.R index 9f29f1ffe..bb0ac2ee9 100644 --- a/R/as_duckplyr_df.R +++ b/R/as_duckplyr_df.R @@ -1,16 +1,23 @@ #' Convert to a duckplyr data frame #' -#' For an object of class `duckplyr_df`, +#' @description +#' These functions convert a data-frame-like input to an object of class `"duckplyr_df"`. +#' For such objects, #' dplyr verbs such as [mutate()], [select()] or [filter()] will attempt to use DuckDB.
#' If this is not possible, the original dplyr implementation is used. #' +#' `as_duckplyr_df()` requires the input to be a plain data frame or a tibble, +#' and will fail for any other classes, including subclasses of `"data.frame"` or `"tbl_df"`. +#' This behavior is likely to change; do not rely on it. +#' +#' @details #' Set the `DUCKPLYR_FALLBACK_INFO` and `DUCKPLYR_FORCE` environment variables #' for more control over the behavior, see [config] for more details. #' #' @param .data data frame or tibble to transform #' -#' @return An object of class `"duckplyr_df"`, inheriting from the classes of the -#' `.data` argument. +#' @return For `as_duckplyr_df()`, an object of class `"duckplyr_df"`, +#' inheriting from the classes of the `.data` argument. #' #' @export #' @examples @@ -36,7 +43,3 @@ as_duckplyr_df <- function(.data) { class(.data) <- c("duckplyr_df", class(.data)) .data } - -default_df_class <- function() { - class(new_tibble(list())) -} diff --git a/R/as_duckplyr_tibble.R b/R/as_duckplyr_tibble.R new file mode 100644 index 000000000..6588bb0e1 --- /dev/null +++ b/R/as_duckplyr_tibble.R @@ -0,0 +1,13 @@ +#' as_duckplyr_tibble +#' +#' `as_duckplyr_tibble()` converts the input to a tibble and then to a duckplyr data frame. +#' +#' @return For `as_duckplyr_tibble()`, an object of class +#' `c("duckplyr_df", class(tibble()))`.
+#' +#' @rdname as_duckplyr_df +#' @export +as_duckplyr_tibble <- function(.data) { + # Extra as.data.frame() call for good measure and perhaps https://github.com/tidyverse/tibble/issues/1556 + as_duckplyr_df(as_tibble(as.data.frame(.data))) +} diff --git a/R/io-.R b/R/io-.R index ea991d129..4b9cdfa16 100644 --- a/R/io-.R +++ b/R/io-.R @@ -78,3 +78,7 @@ duckplyr_df_from_file <- function( out <- df_from_file(path, table_function, options = options, class = class) as_duckplyr_df(out) } + +default_df_class <- function() { + class(new_tibble(list())) +} diff --git a/README.Rmd b/README.Rmd index 7489ea9d2..1f4858809 100644 --- a/README.Rmd +++ b/README.Rmd @@ -75,8 +75,8 @@ conflict_prefer("filter", "dplyr") There are two ways to use duckplyr. -1. To enable duckplyr for individual data frames, use `duckplyr::as_duckplyr_df()` as the first step in your pipe, without attaching the package. -1. By calling `library(duckplyr)`, it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_df()`. To turn this off, call `methods_restore()`. +1. To enable duckplyr for individual data frames, use `duckplyr::as_duckplyr_tibble()` as the first step in your pipe, without attaching the package. +1. By calling `library(duckplyr)`, it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_tibble()`. To turn this off, call `methods_restore()`. The examples below illustrate both methods. See also the companion [demo repository](https://github.com/Tmonster/duckplyr_demo) for a use case with a large dataset. @@ -85,20 +85,20 @@ See also the companion [demo repository](https://github.com/Tmonster/duckplyr_de This example illustrates usage of duckplyr for individual data frames. 
-Use `duckplyr::as_duckplyr_df()` to enable processing with duckdb: +Use `duckplyr::as_duckplyr_tibble()` to enable processing with duckdb: ```{r} out <- palmerpenguins::penguins %>% # CAVEAT: factor columns are not supported yet mutate(across(where(is.factor), as.character)) %>% - duckplyr::as_duckplyr_df() %>% + duckplyr::as_duckplyr_tibble() %>% mutate(bill_area = bill_length_mm * bill_depth_mm) %>% summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>% filter(species != "Gentoo") ``` -The result is a data frame or tibble, with its own class. +The result is a tibble, with its own class. ```{r} class(out) @@ -137,7 +137,7 @@ Use `library(duckplyr)` or `duckplyr::methods_overwrite()` to overwrite dplyr me duckplyr::methods_overwrite() ``` -This is the same query as above, without `as_duckplyr_df()`: +This is the same query as above, without `as_duckplyr_tibble()`: ```{r echo = FALSE} Sys.setenv(DUCKPLYR_FALLBACK_COLLECT = 0) @@ -206,7 +206,7 @@ Sys.setenv(DUCKPLYR_FALLBACK_COLLECT = "") ```{r} palmerpenguins::penguins %>% - duckplyr::as_duckplyr_df() %>% + duckplyr::as_duckplyr_tibble() %>% transmute(bill_area = bill_length_mm * bill_depth_mm) %>% head(3) ``` diff --git a/README.md b/README.md index 11b530ecf..32796d9dc 100644 --- a/README.md +++ b/README.md @@ -41,8 +41,8 @@ Or from [GitHub](https://github.com/) with: There are two ways to use duckplyr. -1. To enable duckplyr for individual data frames, use [`duckplyr::as_duckplyr_df()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_df.html) as the first step in your pipe, without attaching the package. -2. By calling [`library(duckplyr)`](https://duckdblabs.github.io/duckplyr/), it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_df()`. To turn this off, call `methods_restore()`. +1. 
To enable duckplyr for individual data frames, use [`duckplyr::as_duckplyr_tibble()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_tibble.html) as the first step in your pipe, without attaching the package. +2. By calling [`library(duckplyr)`](https://duckdblabs.github.io/duckplyr/), it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_tibble()`. To turn this off, call `methods_restore()`. The examples below illustrate both methods. See also the companion [demo repository](https://github.com/Tmonster/duckplyr_demo) for a use case with a large dataset. @@ -50,19 +50,19 @@ The examples below illustrate both methods. See also the companion [demo reposit This example illustrates usage of duckplyr for individual data frames. -Use [`duckplyr::as_duckplyr_df()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_df.html) to enable processing with duckdb: +Use [`duckplyr::as_duckplyr_tibble()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_tibble.html) to enable processing with duckdb:
 out <-
   palmerpenguins::penguins %>%
   # CAVEAT: factor columns are not supported yet
   mutate(across(where(is.factor), as.character)) %>%
-  duckplyr::as_duckplyr_df() %>%
+  duckplyr::as_duckplyr_tibble() %>%
   mutate(bill_area = bill_length_mm * bill_depth_mm) %>%
   summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>%
   filter(species != "Gentoo")
-The result is a data frame or tibble, with its own class. +The result is a tibble, with its own class.
 class(out)
@@ -211,7 +211,7 @@ Use [`library(duckplyr)`](https://duckdblabs.github.io/duckplyr/) or [`duckplyr:
 #>  Overwriting dplyr methods with duckplyr methods.
 #>  Turn off with `duckplyr::methods_restore()`.
-This is the same query as above, without `as_duckplyr_df()`: +This is the same query as above, without `as_duckplyr_tibble()`:
 out <-
@@ -298,7 +298,7 @@ The first time the package encounters an unsupported function, data type, or ope
 
 
 palmerpenguins::penguins %>%
-  duckplyr::as_duckplyr_df() %>%
+  duckplyr::as_duckplyr_tibble() %>%
   transmute(bill_area = bill_length_mm * bill_depth_mm) %>%
   head(3)
 #> The duckplyr package is configured to fall back to dplyr when it encounters an
diff --git a/man/as_duckplyr_df.Rd b/man/as_duckplyr_df.Rd
index 76e38289b..88a1c29c0 100644
--- a/man/as_duckplyr_df.Rd
+++ b/man/as_duckplyr_df.Rd
@@ -1,22 +1,35 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/as_duckplyr_df.R
+% Please edit documentation in R/as_duckplyr_df.R, R/as_duckplyr_tibble.R
 \name{as_duckplyr_df}
 \alias{as_duckplyr_df}
+\alias{as_duckplyr_tibble}
 \title{Convert to a duckplyr data frame}
 \usage{
 as_duckplyr_df(.data)
+
+as_duckplyr_tibble(.data)
 }
 \arguments{
 \item{.data}{data frame or tibble to transform}
 }
 \value{
-An object of class \code{"duckplyr_df"}, inheriting from the classes of the
-\code{.data} argument.
+For \code{as_duckplyr_df()}, an object of class \code{"duckplyr_df"},
+inheriting from the classes of the \code{.data} argument.
+
+For \code{as_duckplyr_tibble()}, an object of class
+\code{c("duckplyr_df", class(tibble()))}.
 }
 \description{
-For an object of class \code{duckplyr_df},
+These functions convert a data-frame-like input to an object of class \code{"duckplyr_df"}.
+For such objects,
 dplyr verbs such as \code{\link[=mutate]{mutate()}}, \code{\link[=select]{select()}} or \code{\link[=filter]{filter()}}  will attempt to use DuckDB.
 If this is not possible, the original dplyr implementation is used.
+
+\code{as_duckplyr_df()} requires the input to be a plain data frame or a tibble,
+and will fail for any other classes, including subclasses of \code{"data.frame"} or \code{"tbl_df"}.
+This behavior is likely to change; do not rely on it.
+
+\code{as_duckplyr_tibble()} converts the input to a tibble and then to a duckplyr data frame.
 }
 \details{
 Set the \code{DUCKPLYR_FALLBACK_INFO} and \code{DUCKPLYR_FORCE} environment variables
diff --git a/man/df_from_file.Rd b/man/df_from_file.Rd
index 8244e4c06..7816c69ab 100644
--- a/man/df_from_file.Rd
+++ b/man/df_from_file.Rd
@@ -42,9 +42,11 @@ DuckDB function such as \code{"read_parquet"},
 \item{options}{Arguments to the DuckDB function
 indicated by \code{table_function}.}
 
-\item{class}{An optional class to add to the data frame.
+\item{class}{The class of the output.
+By default, a tibble is created.
 The returned object will always be a data frame.
-Pass \code{class(tibble())} to create a tibble.}
+Use \code{class = "data.frame"} or \code{class = character()}
+to create a plain data frame.}
 
 \item{data}{A data frame to be written to disk.}
 }
diff --git a/tests/testthat/test-as_duckplyr_tibble.R b/tests/testthat/test-as_duckplyr_tibble.R
new file mode 100644
index 000000000..c713f517d
--- /dev/null
+++ b/tests/testthat/test-as_duckplyr_tibble.R
@@ -0,0 +1,7 @@
+test_that("as_duckplyr_tibble() works", {
+  expected_class <- c("duckplyr_df", class(tibble())) # same for every input type
+  for (input in list(tibble(a = 1), data.frame(a = 1))) {
+    expect_s3_class(as_duckplyr_tibble(input), "duckplyr_df")
+    expect_equal(class(as_duckplyr_tibble(input)), expected_class)
+  }
+})