diff --git a/NAMESPACE b/NAMESPACE index d0305ef31..fd2b5b0bd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,6 +107,7 @@ export(anti_join) export(any_of) export(arrange) export(as_duckplyr_df) +export(as_duckplyr_tibble) export(as_tibble) export(between) export(bind_cols) diff --git a/NEWS.md b/NEWS.md index 305d3d34e..2687267f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ ## Features - `df_from_file()` and related functions support multiple files (#194, #195), show a clear error message for non-string `path` arguments (#182), and create a tibble by default (#177). +- New `as_duckplyr_tibble()` to convert a data frame to a duckplyr tibble (#177). - Support descending sort for character and other non-numeric data (@toppyy, #92, #175). - Avoid setting memory limit (#193). - Check compatibility of join columns (#168, #185). diff --git a/R/as_duckplyr_df.R b/R/as_duckplyr_df.R index 9f29f1ffe..bb0ac2ee9 100644 --- a/R/as_duckplyr_df.R +++ b/R/as_duckplyr_df.R @@ -1,16 +1,23 @@ #' Convert to a duckplyr data frame #' -#' For an object of class `duckplyr_df`, +#' @description +#' These functions convert a data-frame-like input to an object of class `"duckplyr_df"`. +#' For such objects, #' dplyr verbs such as [mutate()], [select()] or [filter()] will attempt to use DuckDB. #' If this is not possible, the original dplyr implementation is used. #' +#' `as_duckplyr_df()` requires the input to be a plain data frame or a tibble, +#' and will fail for any other classes, including subclasses of `"data.frame"` or `"tbl_df"`. +#' This behavior is likely to change, do not rely on it. +#' +#' @details #' Set the `DUCKPLYR_FALLBACK_INFO` and `DUCKPLYR_FORCE` environment variables #' for more control over the behavior, see [config] for more details. #' #' @param .data data frame or tibble to transform #' -#' @return An object of class `"duckplyr_df"`, inheriting from the classes of the -#' `.data` argument. 
+#' @return For `as_duckplyr_df()`, an object of class `"duckplyr_df"`, +#' inheriting from the classes of the `.data` argument. #' #' @export #' @examples @@ -36,7 +43,3 @@ as_duckplyr_df <- function(.data) { class(.data) <- c("duckplyr_df", class(.data)) .data } - -default_df_class <- function() { - class(new_tibble(list())) -} diff --git a/R/as_duckplyr_tibble.R b/R/as_duckplyr_tibble.R new file mode 100644 index 000000000..6588bb0e1 --- /dev/null +++ b/R/as_duckplyr_tibble.R @@ -0,0 +1,13 @@ +#' as_duckplyr_tibble +#' +#' `as_duckplyr_tibble()` converts the input to a tibble and then to a duckplyr data frame. +#' +#' @return For `as_duckplyr_tibble()`, an object of class +#' `c("duckplyr_df", class(tibble()))`. +#' +#' @rdname as_duckplyr_df +#' @export +as_duckplyr_tibble <- function(.data) { + # Extra as.data.frame() call for good measure and perhaps https://github.com/tidyverse/tibble/issues/1556 + as_duckplyr_df(as_tibble(as.data.frame(.data))) +} diff --git a/R/io-.R b/R/io-.R index ea991d129..4b9cdfa16 100644 --- a/R/io-.R +++ b/R/io-.R @@ -78,3 +78,7 @@ duckplyr_df_from_file <- function( out <- df_from_file(path, table_function, options = options, class = class) as_duckplyr_df(out) } + +default_df_class <- function() { + class(new_tibble(list())) +} diff --git a/README.Rmd b/README.Rmd index 7489ea9d2..1f4858809 100644 --- a/README.Rmd +++ b/README.Rmd @@ -75,8 +75,8 @@ conflict_prefer("filter", "dplyr") There are two ways to use duckplyr. -1. To enable duckplyr for individual data frames, use `duckplyr::as_duckplyr_df()` as the first step in your pipe, without attaching the package. -1. By calling `library(duckplyr)`, it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_df()`. To turn this off, call `methods_restore()`. +1. To enable duckplyr for individual data frames, use `duckplyr::as_duckplyr_tibble()` as the first step in your pipe, without attaching the package. +1. 
By calling `library(duckplyr)`, it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_tibble()`. To turn this off, call `methods_restore()`. The examples below illustrate both methods. See also the companion [demo repository](https://github.com/Tmonster/duckplyr_demo) for a use case with a large dataset. @@ -85,20 +85,20 @@ See also the companion [demo repository](https://github.com/Tmonster/duckplyr_de This example illustrates usage of duckplyr for individual data frames. -Use `duckplyr::as_duckplyr_df()` to enable processing with duckdb: +Use `duckplyr::as_duckplyr_tibble()` to enable processing with duckdb: ```{r} out <- palmerpenguins::penguins %>% # CAVEAT: factor columns are not supported yet mutate(across(where(is.factor), as.character)) %>% - duckplyr::as_duckplyr_df() %>% + duckplyr::as_duckplyr_tibble() %>% mutate(bill_area = bill_length_mm * bill_depth_mm) %>% summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>% filter(species != "Gentoo") ``` -The result is a data frame or tibble, with its own class. +The result is a tibble, with its own class. ```{r} class(out) @@ -137,7 +137,7 @@ Use `library(duckplyr)` or `duckplyr::methods_overwrite()` to overwrite dplyr me duckplyr::methods_overwrite() ``` -This is the same query as above, without `as_duckplyr_df()`: +This is the same query as above, without `as_duckplyr_tibble()`: ```{r echo = FALSE} Sys.setenv(DUCKPLYR_FALLBACK_COLLECT = 0) @@ -206,7 +206,7 @@ Sys.setenv(DUCKPLYR_FALLBACK_COLLECT = "") ```{r} palmerpenguins::penguins %>% - duckplyr::as_duckplyr_df() %>% + duckplyr::as_duckplyr_tibble() %>% transmute(bill_area = bill_length_mm * bill_depth_mm) %>% head(3) ``` diff --git a/README.md b/README.md index 11b530ecf..32796d9dc 100644 --- a/README.md +++ b/README.md @@ -41,8 +41,8 @@ Or from [GitHub](https://github.com/) with: There are two ways to use duckplyr. -1. 
To enable duckplyr for individual data frames, use [`duckplyr::as_duckplyr_df()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_df.html) as the first step in your pipe, without attaching the package. -2. By calling [`library(duckplyr)`](https://duckdblabs.github.io/duckplyr/), it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_df()`. To turn this off, call `methods_restore()`. +1. To enable duckplyr for individual data frames, use [`duckplyr::as_duckplyr_tibble()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_tibble.html) as the first step in your pipe, without attaching the package. +2. By calling [`library(duckplyr)`](https://duckdblabs.github.io/duckplyr/), it overwrites dplyr methods and is automatically enabled for the entire session without having to call `as_duckplyr_tibble()`. To turn this off, call `methods_restore()`. The examples below illustrate both methods. See also the companion [demo repository](https://github.com/Tmonster/duckplyr_demo) for a use case with a large dataset. @@ -50,19 +50,19 @@ The examples below illustrate both methods. See also the companion [demo reposit This example illustrates usage of duckplyr for individual data frames. -Use [`duckplyr::as_duckplyr_df()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_df.html) to enable processing with duckdb: +Use [`duckplyr::as_duckplyr_tibble()`](https://duckdblabs.github.io/duckplyr/reference/as_duckplyr_tibble.html) to enable processing with duckdb:
out <- palmerpenguins::penguins %>% # CAVEAT: factor columns are not supported yet mutate(across(where(is.factor), as.character)) %>% - duckplyr::as_duckplyr_df() %>% + duckplyr::as_duckplyr_tibble() %>% mutate(bill_area = bill_length_mm * bill_depth_mm) %>% summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>% filter(species != "Gentoo")-The result is a data frame or tibble, with its own class. +The result is a tibble, with its own class.
class(out) @@ -211,7 +211,7 @@ Use [`library(duckplyr)`](https://duckdblabs.github.io/duckplyr/) or [`duckplyr: #> ✔ Overwriting dplyr methods with duckplyr methods. #> ℹ Turn off with `duckplyr::methods_restore()`.-This is the same query as above, without `as_duckplyr_df()`: +This is the same query as above, without `as_duckplyr_tibble()`:
out <- @@ -298,7 +298,7 @@ The first time the package encounters an unsupported function, data type, or opepalmerpenguins::penguins %>% - duckplyr::as_duckplyr_df() %>% + duckplyr::as_duckplyr_tibble() %>% transmute(bill_area = bill_length_mm * bill_depth_mm) %>% head(3) #> The duckplyr package is configured to fall back to dplyr when it encounters an diff --git a/man/as_duckplyr_df.Rd b/man/as_duckplyr_df.Rd index 76e38289b..88a1c29c0 100644 --- a/man/as_duckplyr_df.Rd +++ b/man/as_duckplyr_df.Rd @@ -1,22 +1,35 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_duckplyr_df.R +% Please edit documentation in R/as_duckplyr_df.R, R/as_duckplyr_tibble.R \name{as_duckplyr_df} \alias{as_duckplyr_df} +\alias{as_duckplyr_tibble} \title{Convert to a duckplyr data frame} \usage{ as_duckplyr_df(.data) + +as_duckplyr_tibble(.data) } \arguments{ \item{.data}{data frame or tibble to transform} } \value{ -An object of class \code{"duckplyr_df"}, inheriting from the classes of the -\code{.data} argument. +For \code{as_duckplyr_df()}, an object of class \code{"duckplyr_df"}, +inheriting from the classes of the \code{.data} argument. + +For \code{as_duckplyr_tibble()}, an object of class +\code{c("duckplyr_df", class(tibble()))}. } \description{ -For an object of class \code{duckplyr_df}, +These functions convert a data-frame-like input to an object of class \code{"duckplyr_df"}. +For such objects, dplyr verbs such as \code{\link[=mutate]{mutate()}}, \code{\link[=select]{select()}} or \code{\link[=filter]{filter()}} will attempt to use DuckDB. If this is not possible, the original dplyr implementation is used. + +\code{as_duckplyr_df()} requires the input to be a plain data frame or a tibble, +and will fail for any other classes, including subclasses of \code{"data.frame"} or \code{"tbl_df"}. +This behavior is likely to change, do not rely on it. + +\code{as_duckplyr_tibble()} converts the input to a tibble and then to a duckplyr data frame. 
} \details{ Set the \code{DUCKPLYR_FALLBACK_INFO} and \code{DUCKPLYR_FORCE} environment variables diff --git a/man/df_from_file.Rd b/man/df_from_file.Rd index 8244e4c06..7816c69ab 100644 --- a/man/df_from_file.Rd +++ b/man/df_from_file.Rd @@ -42,9 +42,11 @@ DuckDB function such as \code{"read_parquet"}, \item{options}{Arguments to the DuckDB function indicated by \code{table_function}.} -\item{class}{An optional class to add to the data frame. +\item{class}{The class of the output. +By default, a tibble is created. The returned object will always be a data frame. -Pass \code{class(tibble())} to create a tibble.} +Use \code{class = "data.frame"} or \code{class = character()} +to create a plain data frame.} \item{data}{A data frame to be written to disk.} } diff --git a/tests/testthat/test-as_duckplyr_tibble.R b/tests/testthat/test-as_duckplyr_tibble.R new file mode 100644 index 000000000..c713f517d --- /dev/null +++ b/tests/testthat/test-as_duckplyr_tibble.R @@ -0,0 +1,7 @@ +test_that("as_duckplyr_tibble() works", { + expect_s3_class(as_duckplyr_tibble(tibble(a = 1)), "duckplyr_df") + expect_equal(class(as_duckplyr_tibble(tibble(a = 1))), c("duckplyr_df", class(tibble()))) + + expect_s3_class(as_duckplyr_tibble(data.frame(a = 1)), "duckplyr_df") + expect_equal(class(as_duckplyr_tibble(data.frame(a = 1))), c("duckplyr_df", class(tibble()))) +})