Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Growable ALTREP vectors #6697

Draft
wants to merge 7 commits into
base: truehash
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1353,6 +1353,7 @@ replace_dot_alias = function(e) {
}
if (!with || missing(j)) return(ans)
if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013
setgrowable(ans)
SDenv$.SDall = ans
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
SDenv$.N = nrow(ans)
Expand Down Expand Up @@ -1591,6 +1592,7 @@ replace_dot_alias = function(e) {
SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset
if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012
if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
setgrowable(SDenv$.SDall)
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
}
if (nrow(SDenv$.SDall)==0L) {
Expand Down
2 changes: 2 additions & 0 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ fitsInInt32 = function(x) .Call(CfitsInInt32R, x)
fitsInInt64 = function(x) .Call(CfitsInInt64R, x)

coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy)

setgrowable = function(x) .Call(Csetgrowable, x)
6 changes: 0 additions & 6 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -19303,12 +19303,6 @@ test(2290.3, DT[, `:=`(a, c := 3)], error="It looks like you re-used `:=` in arg
# partially-named `:=`(...) --> different branch, same error
test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in argument 2")

# segfault when selfref is not ok before set #6410
df = data.frame(a=1:3)
setDT(df)
attr(df, "att") = 1
test(2291.1, set(df, NULL, "new", "new"), error="attributes .* have been reassigned")

# ns-qualified bysub error, #6493
DT = data.table(a = 1)
test(2292.1, DT[, .N, by=base::mget("a")], data.table(a = 1, N = 1L))
Expand Down
52 changes: 29 additions & 23 deletions src/assign.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "data.table.h"

#if R_VERSION < R_Version(3,4,0) // not needed with GROWABLE_BIT
static void finalizer(SEXP p)
{
SEXP x;
Expand All @@ -22,6 +23,7 @@ static void finalizer(SEXP p)
UNPROTECT(1);
return;
}
#endif

void setselfref(SEXP x) {
if(!INHERITS(x, char_datatable)) return; // #5286
Expand All @@ -38,7 +40,9 @@ void setselfref(SEXP x) {
R_NilValue
))
));
#if R_VERSION < R_Version(3,4,0) // not needed with GROWABLE_BIT
R_RegisterCFinalizerEx(p, finalizer, FALSE);
#endif
UNPROTECT(2);

/*
Expand Down Expand Up @@ -126,15 +130,24 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) {
tag = R_ExternalPtrTag(v);
if (!(isNull(tag) || isString(tag))) internal_error(__func__, ".internal.selfref tag is neither NULL nor a character vector"); // # nocov
names = getAttrib(x, R_NamesSymbol);
if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734
// On R >= 3.4, either
// (1) we allocate the data.table and/or its names, so it has the GROWABLE_BIT set, so copies will have zero TRUELENGTH, or
// (2) someone else creates them from scratch, so (only using the API) will have zero TRUELENGTH.
// We then return false and either re-create the data.table from scratch or signal an error, so the current object having a zero TRUELENGTH is fine.
// R < 3.4 doesn't have the GROWABLE_BIT, so let's reset the TRUELENGTH just in case.
#if R_VERSION < R_Version(3,4,0)
if (names!=tag && isString(names))
SET_TRUELENGTH(names, LENGTH(names));
// R copied this vector not data.table; it's not actually over-allocated. It looks over-allocated
// because R copies the original vector's tl over despite allocating length.
#endif
prot = R_ExternalPtrProtected(v);
if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")).
return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r
if (x!=R_ExternalPtrAddr(prot) && !ALTREP(x))
#if R_VERSION < R_Version(3,4,0)
if (x!=R_ExternalPtrAddr(prot))
SET_TRUELENGTH(x, LENGTH(x)); // R copied this vector not data.table, it's not actually over-allocated
#endif
return checkNames ? names==tag : x==R_ExternalPtrAddr(prot);
}

Expand All @@ -151,7 +164,8 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
// called from alloccol where n is checked carefully, or from shallow() at R level
// where n is set to truelength (i.e. a shallow copy only with no size change)
int protecti=0;
SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here?
const int l = isNull(cols) ? length(dt) : length(cols);
SEXP newdt = PROTECT(growable_allocate(VECSXP, l, n)); protecti++; // to do, use growVector here?
SHALLOW_DUPLICATE_ATTRIB(newdt, dt);

// TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It
Expand All @@ -169,8 +183,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
setAttrib(newdt, sym_sorted, duplicate(sorted));

SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++;
SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++;
const int l = isNull(cols) ? LENGTH(dt) : length(cols);
SEXP newnames = PROTECT(growable_allocate(STRSXP, l, n)); protecti++;
if (isNull(cols)) {
for (int i=0; i<l; ++i) SET_VECTOR_ELT(newdt, i, VECTOR_ELT(dt,i));
if (length(names)) {
Expand All @@ -186,13 +199,8 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
for (int i=0; i<l; ++i) SET_STRING_ELT( newnames, i, STRING_ELT(names,INTEGER(cols)[i]-1) );
}
}
// setAttrib used to change length and truelength, but as of R-3.3 no longer does that
setAttrib(newdt, R_NamesSymbol, newnames);
// setAttrib appears to change length and truelength, so need to do that first _then_ SET next,
// otherwise (if the SET were were first) the 100 tl is assigned to length.
SETLENGTH(newnames,l);
SET_TRUELENGTH(newnames,n);
SETLENGTH(newdt,l);
SET_TRUELENGTH(newdt,n);
setselfref(newdt);
UNPROTECT(protecti);
return(newdt);
Expand Down Expand Up @@ -260,10 +268,8 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose)
return shallow(dt,R_NilValue,(n>l) ? n : l); // e.g. test 848 and 851 in R > 3.0.2
// added (n>l) ? ... for #970, see test 1481.
// TO DO: test realloc names if selfrefnamesok (users can setattr(x,"name") themselves for example.
// if (TRUELENGTH(getAttrib(dt,R_NamesSymbol))!=tl)
// internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRUELENGTH(getAttrib(dt,R_NamesSymbol))); // # nocov

tl = TRUELENGTH(dt);
tl = growable_max_size(dt);
// R <= 2.13.2 and we didn't catch uninitialized tl somehow
if (tl<0) internal_error(__func__, "tl of class is marked but tl<0"); // # nocov
if (tl>0 && tl<l) internal_error(__func__, "tl (%d) < l (%d) but tl of class is marked", tl, l); // # nocov
Expand Down Expand Up @@ -313,11 +319,11 @@ SEXP shallowwrapper(SEXP dt, SEXP cols) {
if (!selfrefok(dt, FALSE)) {
int n = isNull(cols) ? length(dt) : length(cols);
return(shallow(dt, cols, n));
} else return(shallow(dt, cols, TRUELENGTH(dt)));
} else return(shallow(dt, cols, growable_max_size(dt)));
}

SEXP truelength(SEXP x) {
return ScalarInteger(isNull(x) ? 0 : TRUELENGTH(x));
return ScalarInteger(is_growable(x) ? growable_max_size(x) : 0);
}

SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
Expand Down Expand Up @@ -514,7 +520,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
// modify DT by reference. Other than if new columns are being added and the allocVec() fails with
// out-of-memory. In that case the user will receive hard halt and know to rerun.
if (length(newcolnames)) {
oldtncol = TRUELENGTH(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.
oldtncol = is_growable(dt) ? growable_max_size(dt) : 0; // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.

if (oldtncol<oldncol) {
if (oldtncol==0) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
Expand All @@ -527,13 +533,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov
// Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as
// strong error message for now.
else if (TRUELENGTH(names) != oldtncol)
else if (growable_max_size(names) != oldtncol)
// Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work
internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)TRUELENGTH(names), oldtncol); // # nocov
internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)growable_max_size(names), oldtncol); // # nocov
if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref
error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker."));
SETLENGTH(dt, oldncol+LENGTH(newcolnames));
SETLENGTH(names, oldncol+LENGTH(newcolnames));
growable_resize(dt, oldncol+LENGTH(newcolnames));
growable_resize(names, oldncol+LENGTH(newcolnames));
for (int i=0; i<LENGTH(newcolnames); ++i)
SET_STRING_ELT(names,oldncol+i,STRING_ELT(newcolnames,i));
// truelengths of both already set by alloccol
Expand Down Expand Up @@ -730,8 +736,8 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
SET_VECTOR_ELT(dt, i, R_NilValue);
SET_STRING_ELT(names, i, NA_STRING); // release reference to the CHARSXP
}
SETLENGTH(dt, ndt-ndelete);
SETLENGTH(names, ndt-ndelete);
growable_resize(dt, ndt-ndelete);
growable_resize(names, ndt-ndelete);
if (LENGTH(names)==0) {
// That was last column deleted, leaving NULL data.table, so we need to reset .row_names, so that it really is the NULL data.table.
PROTECT(nullint=allocVector(INTSXP, 0)); protecti++;
Expand Down
23 changes: 21 additions & 2 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
#if R_VERSION < R_Version(3, 4, 0)
# define SET_GROWABLE_BIT(x) // #3292
#endif
#if R_VERSION >= R_Version(4, 3, 0)
# define USE_GROWABLE_ALTREP
#endif
#include <Rinternals.h>
#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT
#include <stdint.h> // for uint64_t rather than unsigned long long
Expand Down Expand Up @@ -143,7 +146,7 @@ uint64_t dtwiddle(double x);
SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg);
SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg); // reuseSorting wrapper to forder
int getNumericRounding_C(void);
void internal_error_with_cleanup(const char *call_name, const char *format, ...);
NORET void internal_error_with_cleanup(const char *call_name, const char *format, ...);

// reorder.c
SEXP reorder(SEXP x, SEXP order);
Expand Down Expand Up @@ -259,7 +262,7 @@ SEXP islockedR(SEXP x);
bool need2utf8(SEXP x);
SEXP coerceUtf8IfNeeded(SEXP x);
SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg);
void internal_error(const char *call_name, const char *format, ...);
NORET void internal_error(const char *call_name, const char *format, ...);

// types.c
char *end(char *start);
Expand Down Expand Up @@ -298,6 +301,21 @@ void hash_set(hashtab *, SEXP key, R_xlen_t value);
// Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);

// growable.c
// Return a new vector of given type. Initially its xlength() is equal to size. Using growable_resize(), it can be increased to up to max_size.
SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size);
// Return the max_size of a growable vector. Behaviour is undefined if x was not allocated by growable_allocate.
R_xlen_t growable_max_size(SEXP x);
// Resize a growable vector to newsize. Will signal an error if newsize exceeds max_size.
void growable_resize(SEXP x, R_xlen_t newsize);
// Return TRUE if growable_resize(x) and growable_max_size(x) are valid operations.
Rboolean is_growable(SEXP x);
// Transform x into a growable vector. The return value must be reprotected in place of x. What happens to x is deliberately not specified, but no copying occurs.
SEXP make_growable(SEXP x);
#if R_VERSION >= R_Version(4, 3, 0)
void register_altrep_classes(DllInfo*);
#endif

// functions called from R level .Call/.External and registered in init.c
// these now live here to pass -Wstrict-prototypes, #5477
// all arguments must be SEXP since they are called from R level
Expand Down Expand Up @@ -371,4 +389,5 @@ SEXP dt_has_zlib(void);
SEXP startsWithAny(SEXP, SEXP, SEXP);
SEXP convertDate(SEXP, SEXP);
SEXP fastmean(SEXP);
SEXP setgrowable(SEXP x);

27 changes: 12 additions & 15 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,10 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) {
// (see data.table.h), and isNewList() is true for NULL
if (n==0)
return false;
if (hash_lookup(specials, x, 0)<0) return true; // test 2158
if (isVectorAtomic(x))
return ALTREP(x) || hash_lookup(specials, x, 0)<0;
return ALTREP(x); // see test 2156: ALTREP is a source of sharing we can't trace reliably
if (isNewList(x)) {
if (hash_lookup(specials, x, 0)<0)
return true; // test 2158
for (int i=0; i<n; ++i) {
list_el = VECTOR_ELT(x,i);
if (anySpecialStatic(list_el, specials))
Expand Down Expand Up @@ -124,7 +123,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
for (R_len_t i=0; i<n; ++i) {
if (ilens[i] > maxGrpSize) maxGrpSize = ilens[i];
}
defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++;
defineVar(install(".I"), I = PROTECT(growable_allocate(INTSXP, maxGrpSize, maxGrpSize)), env); nprotect++;
hash_set(specials, I, -maxGrpSize); // marker for anySpecialStatic(); see its comments
R_LockBinding(install(".I"), env);

Expand Down Expand Up @@ -197,7 +196,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP
INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch
for (int j=0; j<length(SDall); ++j) {
SETLENGTH(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
growable_resize(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
defineVar(nameSyms[j], VECTOR_ELT(SDall, j), env);
// Redo this defineVar for each group in case user's j assigned to the column names (env is static) (tests 387 and 388)
// nameSyms pre-stored to save repeated install() for efficiency, though.
Expand Down Expand Up @@ -230,14 +229,14 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// and we hope that reference counting on by default from R 4.0 will avoid costly gc()s.
}
grpn = 1; // it may not be 1 e.g. test 722. TODO: revisit.
SETLENGTH(I, grpn);
growable_resize(I, grpn);
INTEGER(I)[0] = 0;
for (int j=0; j<length(xSD); ++j) {
writeNA(VECTOR_ELT(xSD, j), 0, 1, false);
}
} else {
if (verbose) tstart = wallclock();
SETLENGTH(I, grpn);
growable_resize(I, grpn);
int *iI = INTEGER(I);
if (LENGTH(order)==0) {
const int rownum = grpn ? istarts[i]-1 : -1;
Expand Down Expand Up @@ -318,13 +317,13 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
RHS = VECTOR_ELT(jval,j%LENGTH(jval));
if (isNull(target)) {
// first time adding to new column
if (TRUELENGTH(dt) < colj+1) internal_error(__func__, "Trying to add new column by reference but tl is full; setalloccol should have run first at R level before getting to this point"); // # nocov
if (growable_max_size(dt) < colj+1) internal_error(__func__, "Trying to add new column by reference but table is full; setalloccol should have run first at R level before getting to this point"); // # nocov
target = PROTECT(allocNAVectorLike(RHS, n));
// Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
// contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
// Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
SETLENGTH(dtnames, LENGTH(dtnames)+1);
SETLENGTH(dt, LENGTH(dt)+1);
growable_resize(dtnames, LENGTH(dtnames)+1);
growable_resize(dt, LENGTH(dt)+1);
SET_VECTOR_ELT(dt, colj, target);
UNPROTECT(1);
SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
Expand Down Expand Up @@ -482,11 +481,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// Also reset truelength on specials; see comments in anySpecialStatic().
for (int j=0; j<length(SDall); ++j) {
SEXP this = VECTOR_ELT(SDall,j);
SETLENGTH(this, maxGrpSize);
SET_TRUELENGTH(this, maxGrpSize);
growable_resize(this, maxGrpSize);
}
SETLENGTH(I, maxGrpSize);
SET_TRUELENGTH(I, maxGrpSize);
growable_resize(I, maxGrpSize);
if (verbose) {
if (nblock[0] && nblock[1]) internal_error(__func__, "block 0 [%d] and block 1 [%d] have both run", nblock[0], nblock[1]); // # nocov
int w = nblock[1]>0;
Expand Down Expand Up @@ -523,7 +520,7 @@ SEXP growVector(SEXP x, const R_len_t newlen)
SEXP newx;
R_len_t len = length(x);
if (isNull(x)) error(_("growVector passed NULL"));
PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
PROTECT(newx = growable_allocate(TYPEOF(x), newlen, newlen)); // may be shrunk later by fread
if (newlen < len) len=newlen; // i.e. shrink
switch (TYPEOF(x)) {
case RAWSXP: memcpy(RAW(newx), RAW(x), len*SIZEOF(x)); break;
Expand Down
Loading
Loading