@@ -453,18 +453,22 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum, renamecols=false)
453
453
```
454
454
"""
455
455
function combine (f:: Base.Callable , gd:: GroupedDataFrame ;
456
- keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true )
456
+ keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true ,
457
+ nthreads:: Int = NTHREADS)
457
458
return combine_helper (f, gd, keepkeys= keepkeys, ungroup= ungroup,
458
- copycols= true , keeprows= false , renamecols= renamecols)
459
+ copycols= true , keeprows= false , renamecols= renamecols,
460
+ nthreads= nthreads)
459
461
end
460
462
461
463
combine (f:: typeof (nrow), gd:: GroupedDataFrame ;
462
- keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true ) =
464
+ keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true ,
465
+ nthreads:: Int = NTHREADS) =
463
466
combine (gd, [nrow => :nrow ], keepkeys= keepkeys, ungroup= ungroup,
464
467
renamecols= renamecols)
465
468
466
469
function combine (p:: Pair , gd:: GroupedDataFrame ;
467
- keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true )
470
+ keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true ,
471
+ nthreads:: Int = NTHREADS)
468
472
# move handling of aggregate to specialized combine
469
473
p_from, p_to = p
470
474
@@ -484,20 +488,24 @@ function combine(p::Pair, gd::GroupedDataFrame;
484
488
cs = p_from
485
489
end
486
490
return combine_helper (cs => p_to, gd, keepkeys= keepkeys, ungroup= ungroup,
487
- copycols= true , keeprows= false , renamecols= renamecols)
491
+ copycols= true , keeprows= false , renamecols= renamecols,
492
+ nthreads= nthreads)
488
493
end
489
494
490
495
combine (gd:: GroupedDataFrame ,
491
496
cs:: Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex} ...;
492
- keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true ) =
497
+ keepkeys:: Bool = true , ungroup:: Bool = true , renamecols:: Bool = true ,
498
+ nthreads:: Int = NTHREADS) =
493
499
_combine_prepare (gd, cs... , keepkeys= keepkeys, ungroup= ungroup,
494
- copycols= true , keeprows= false , renamecols= renamecols)
500
+ copycols= true , keeprows= false , renamecols= renamecols,
501
+ nthreads= nthreads)
495
502
496
503
function _combine_prepare (gd:: GroupedDataFrame ,
497
504
@nospecialize (cs:: Union {Pair, typeof (nrow),
498
505
ColumnIndex, MultiColumnIndex}. .. );
499
506
keepkeys:: Bool , ungroup:: Bool , copycols:: Bool ,
500
- keeprows:: Bool , renamecols:: Bool )
507
+ keeprows:: Bool , renamecols:: Bool ,
508
+ nthreads:: Int )
501
509
cs_vec = []
502
510
for p in cs
503
511
if p === nrow
@@ -570,7 +578,8 @@ function _combine_prepare(gd::GroupedDataFrame,
570
578
f = Pair[first (x) => first (last (x)) for x in cs_norm]
571
579
nms = Symbol[last (last (x)) for x in cs_norm]
572
580
return combine_helper (f, gd, nms, keepkeys= keepkeys, ungroup= ungroup,
573
- copycols= copycols, keeprows= keeprows, renamecols= renamecols)
581
+ copycols= copycols, keeprows= keeprows, renamecols= renamecols,
582
+ nthreads= nthreads)
574
583
end
575
584
576
585
function gen_groups (idx:: Vector{Int} )
@@ -590,11 +599,12 @@ end
590
599
function combine_helper (f, gd:: GroupedDataFrame ,
591
600
nms:: Union{AbstractVector{Symbol},Nothing} = nothing ;
592
601
keepkeys:: Bool , ungroup:: Bool ,
593
- copycols:: Bool , keeprows:: Bool , renamecols:: Bool )
602
+ copycols:: Bool , keeprows:: Bool , renamecols:: Bool ,
603
+ nthreads:: Int )
594
604
if ! ungroup && ! keepkeys
595
605
throw (ArgumentError (" keepkeys=false when ungroup=false is not allowed" ))
596
606
end
597
- idx, valscat = _combine (f, gd, nms, copycols, keeprows, renamecols)
607
+ idx, valscat = _combine (f, gd, nms, copycols, keeprows, renamecols, nthreads )
598
608
! keepkeys && ungroup && return valscat
599
609
keys = groupcols (gd)
600
610
for key in keys
@@ -985,24 +995,72 @@ function copyto_widen!(res::AbstractVector{T}, x::AbstractVector) where T
985
995
end
986
996
987
997
function groupreduce! (res:: AbstractVector , f, op, condf, adjust, checkempty:: Bool ,
988
- incol:: AbstractVector , gd:: GroupedDataFrame )
998
+ incol:: AbstractVector , gd:: GroupedDataFrame ; nthreads :: Int )
989
999
n = length (gd)
1000
+ groups = gd. groups
990
1001
if adjust != = nothing || checkempty
991
1002
counts = zeros (Int, n)
992
1003
end
993
- groups = gd. groups
994
- @inbounds for i in eachindex (incol, groups)
995
- gix = groups[i]
996
- x = incol[i]
997
- if gix > 0 && (condf === nothing || condf (x))
998
- # this check should be optimized out if U is not Any
999
- if eltype (res) === Any && ! isassigned (res, gix)
1000
- res[gix] = f (x, gix)
1001
- else
1002
- res[gix] = op (res[gix], f (x, gix))
1004
+ nt = min (nthreads, Threads. nthreads ())
1005
+ if nt <= 1 || axes (incol) != axes (groups)
1006
+ @inbounds for i in eachindex (incol, groups)
1007
+ gix = groups[i]
1008
+ x = incol[i]
1009
+ if gix > 0 && (condf === nothing || condf (x))
1010
+ # this check should be optimized out if U is not Any
1011
+ if eltype (res) === Any && ! isassigned (res, gix)
1012
+ res[gix] = f (x, gix)
1013
+ else
1014
+ res[gix] = op (res[gix], f (x, gix))
1015
+ end
1016
+ if adjust != = nothing || checkempty
1017
+ counts[gix] += 1
1018
+ end
1003
1019
end
1020
+ end
1021
+ else
1022
+ res_vec = Vector {typeof(res)} (undef, nt)
1023
+ # needs to be always allocated to fix type instability with @threads
1024
+ counts_vec = Vector {Vector{Int}} (undef, nt)
1025
+ res_vec[1 ] = res
1026
+ if adjust != = nothing || checkempty
1027
+ counts_vec[1 ] = counts
1028
+ end
1029
+ for i in 2 : nt
1030
+ res_vec[i] = copy (res)
1004
1031
if adjust != = nothing || checkempty
1005
- counts[gix] += 1
1032
+ counts_vec[i] = zeros (Int, n)
1033
+ end
1034
+ end
1035
+ Threads. @threads for tid in 1 : nt
1036
+ res′ = res_vec[tid]
1037
+ if adjust != = nothing || checkempty
1038
+ counts′ = counts_vec[tid]
1039
+ end
1040
+ start = 1 + ((tid - 1 ) * length (groups)) ÷ nt
1041
+ stop = (tid * length (groups)) ÷ nt
1042
+ @inbounds for i in start: stop
1043
+ gix = groups[i]
1044
+ x = incol[i]
1045
+ if gix > 0 && (condf === nothing || condf (x))
1046
+ # this check should be optimized out if U is not Any
1047
+ if eltype (res′) === Any && ! isassigned (res′, gix)
1048
+ res′[gix] = f (x, gix)
1049
+ else
1050
+ res′[gix] = op (res′[gix], f (x, gix))
1051
+ end
1052
+ if adjust != = nothing || checkempty
1053
+ counts′[gix] += 1
1054
+ end
1055
+ end
1056
+ end
1057
+ end
1058
+ for i in 2 : length (res_vec)
1059
+ res .= op .(res, res_vec[i])
1060
+ end
1061
+ if adjust != = nothing || checkempty
1062
+ for i in 2 : length (counts_vec)
1063
+ counts .+ = counts_vec[i]
1006
1064
end
1007
1065
end
1008
1066
end
@@ -1042,26 +1100,31 @@ end
1042
1100
1043
1101
# function barrier works around type instability of groupreduce_init due to applicable
1044
1102
groupreduce (f, op, condf, adjust, checkempty:: Bool ,
1045
- incol:: AbstractVector , gd:: GroupedDataFrame ) =
1103
+ incol:: AbstractVector , gd:: GroupedDataFrame ;
1104
+ nthreads:: Int ) =
1046
1105
groupreduce! (groupreduce_init (op, condf, adjust, incol, gd),
1047
- f, op, condf, adjust, checkempty, incol, gd)
1106
+ f, op, condf, adjust, checkempty, incol, gd, nthreads = nthreads )
1048
1107
# Avoids the overhead due to Missing when computing reduction
1049
1108
groupreduce (f, op, condf:: typeof (! ismissing), adjust, checkempty:: Bool ,
1050
- incol:: AbstractVector , gd:: GroupedDataFrame ) =
1109
+ incol:: AbstractVector , gd:: GroupedDataFrame ;
1110
+ nthreads:: Int ) =
1051
1111
groupreduce! (disallowmissing (groupreduce_init (op, condf, adjust, incol, gd)),
1052
- f, op, condf, adjust, checkempty, incol, gd)
1112
+ f, op, condf, adjust, checkempty, incol, gd, nthreads = nthreads )
1053
1113
1054
- (r:: Reduce )(incol:: AbstractVector , gd:: GroupedDataFrame ) =
1055
- groupreduce ((x, i) -> x, r. op, r. condf, r. adjust, r. checkempty, incol, gd)
1114
+ (r:: Reduce )(incol:: AbstractVector , gd:: GroupedDataFrame ; nthreads:: Int = NTHREADS) =
1115
+ groupreduce ((x, i) -> x, r. op, r. condf, r. adjust, r. checkempty, incol, gd,
1116
+ nthreads= nthreads)
1056
1117
1057
1118
# this definition is missing in Julia 1.0 LTS and is required by aggregation for var
1058
1119
# TODO : remove this when we drop 1.0 support
1059
1120
if VERSION < v " 1.1"
1060
1121
Base. zero (:: Type{Missing} ) = missing
1061
1122
end
1062
1123
1063
- function (agg:: Aggregate{typeof(var)} )(incol:: AbstractVector , gd:: GroupedDataFrame )
1064
- means = groupreduce ((x, i) -> x, Base. add_sum, agg. condf, / , false , incol, gd)
1124
+ function (agg:: Aggregate{typeof(var)} )(incol:: AbstractVector , gd:: GroupedDataFrame ;
1125
+ nthreads:: Int = NTHREADS)
1126
+ means = groupreduce ((x, i) -> x, Base. add_sum, agg. condf, / , false , incol, gd,
1127
+ nthreads= nthreads)
1065
1128
# !ismissing check is purely an optimization to avoid a copy later
1066
1129
if eltype (means) >: Missing && agg. condf != = ! ismissing
1067
1130
T = Union{Missing, real (eltype (means))}
@@ -1071,10 +1134,11 @@ function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFra
1071
1134
res = zeros (T, length (gd))
1072
1135
return groupreduce! (res, (x, i) -> @inbounds (abs2 (x - means[i])), + , agg. condf,
1073
1136
(x, l) -> l <= 1 ? oftype (x / (l- 1 ), NaN ) : x / (l- 1 ),
1074
- false , incol, gd)
1137
+ false , incol, gd, nthreads = nthreads )
1075
1138
end
1076
1139
1077
- function (agg:: Aggregate{typeof(std)} )(incol:: AbstractVector , gd:: GroupedDataFrame )
1140
+ function (agg:: Aggregate{typeof(std)} )(incol:: AbstractVector , gd:: GroupedDataFrame ;
1141
+ nthreads:: Int = NTHREADS)
1078
1142
outcol = Aggregate (var, agg. condf)(incol, gd)
1079
1143
if eltype (outcol) <: Union{Missing, Rational}
1080
1144
return sqrt .(outcol)
@@ -1083,20 +1147,25 @@ function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFra
1083
1147
end
1084
1148
end
1085
1149
1086
- for f in (first, last)
1087
- function (agg:: Aggregate{typeof(f)} )(incol:: AbstractVector , gd:: GroupedDataFrame )
1088
- n = length (gd)
1089
- outcol = similar (incol, n)
1090
- fillfirst! (agg. condf, outcol, incol, gd, rev= agg. f === last)
1091
- if isconcretetype (eltype (outcol))
1092
- return outcol
1093
- else
1094
- return copyto_widen! (Tables. allocatecolumn (typeof (first (outcol)), n), outcol)
1150
+ for f in (:first , :last )
1151
+ # Without using @eval the presence of a keyword argument triggers a Julia bug
1152
+ @eval begin
1153
+ function (agg:: Aggregate{typeof($f)} )(incol:: AbstractVector , gd:: GroupedDataFrame ;
1154
+ nthreads:: Int = NTHREADS)
1155
+ n = length (gd)
1156
+ outcol = similar (incol, n)
1157
+ fillfirst! (agg. condf, outcol, incol, gd, rev= agg. f === last)
1158
+ if isconcretetype (eltype (outcol))
1159
+ return outcol
1160
+ else
1161
+ return copyto_widen! (Tables. allocatecolumn (typeof (first (outcol)), n), outcol)
1162
+ end
1095
1163
end
1096
1164
end
1097
1165
end
1098
1166
1099
- function (agg:: Aggregate{typeof(length)} )(incol:: AbstractVector , gd:: GroupedDataFrame )
1167
+ function (agg:: Aggregate{typeof(length)} )(incol:: AbstractVector , gd:: GroupedDataFrame ;
1168
+ nthreads:: Int = NTHREADS)
1100
1169
if getfield (gd, :idx ) === nothing
1101
1170
lens = zeros (Int, length (gd))
1102
1171
@inbounds for gix in gd. groups
@@ -1143,7 +1212,7 @@ end
1143
1212
1144
1213
function _combine (f:: AbstractVector{<:Pair} ,
1145
1214
gd:: GroupedDataFrame , nms:: AbstractVector{Symbol} ,
1146
- copycols:: Bool , keeprows:: Bool , renamecols:: Bool )
1215
+ copycols:: Bool , keeprows:: Bool , renamecols:: Bool , nthreads :: Int )
1147
1216
# here f should be normalized and in a form of source_cols => fun
1148
1217
@assert all (x -> first (x) isa Union{Int, AbstractVector{Int}, AsTable}, f)
1149
1218
@assert all (x -> last (x) isa Base. Callable, f)
@@ -1185,7 +1254,7 @@ function _combine(f::AbstractVector{<:Pair},
1185
1254
if length (gd) > 0 && isagg (p, gd)
1186
1255
incol = parentdf[! , source_cols]
1187
1256
agg = check_aggregate (last (p), incol)
1188
- outcol = agg (incol, gd)
1257
+ outcol = agg (incol, gd, nthreads = nthreads )
1189
1258
res[i] = idx_agg, outcol
1190
1259
elseif keeprows && fun === identity && ! (source_cols isa AsTable)
1191
1260
@assert source_cols isa Union{Int, AbstractVector{Int}}
@@ -1283,7 +1352,7 @@ function _combine(f::AbstractVector{<:Pair},
1283
1352
end
1284
1353
1285
1354
function _combine (fun:: Base.Callable , gd:: GroupedDataFrame , :: Nothing ,
1286
- copycols:: Bool , keeprows:: Bool , renamecols:: Bool )
1355
+ copycols:: Bool , keeprows:: Bool , renamecols:: Bool , nthreads :: Int )
1287
1356
@assert copycols && ! keeprows
1288
1357
# use `similar` as `gd` might have been subsetted
1289
1358
firstres = length (gd) > 0 ? fun (gd[1 ]) : fun (similar (parent (gd), 0 ))
@@ -1293,7 +1362,7 @@ function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing,
1293
1362
end
1294
1363
1295
1364
function _combine (p:: Pair , gd:: GroupedDataFrame , :: Nothing ,
1296
- copycols:: Bool , keeprows:: Bool , renamecols:: Bool )
1365
+ copycols:: Bool , keeprows:: Bool , renamecols:: Bool , nthreads :: Int )
1297
1366
# here p should not be normalized as we allow tabular return value from fun
1298
1367
# map and combine should not dispatch here if p is isagg
1299
1368
@assert copycols && ! keeprows
@@ -1708,7 +1777,7 @@ julia> select(gd, :, AsTable(Not(:a)) => sum, renamecols=false)
1708
1777
```
1709
1778
"""
1710
1779
select (gd:: GroupedDataFrame , args... ; copycols:: Bool = true , keepkeys:: Bool = true ,
1711
- ungroup:: Bool = true , renamecols:: Bool = true ) =
1780
+ ungroup:: Bool = true , renamecols:: Bool = true , nthreads :: Int = NTHREADS ) =
1712
1781
_combine_prepare (gd, args... , copycols= copycols, keepkeys= keepkeys,
1713
1782
ungroup= ungroup, keeprows= true , renamecols= renamecols)
1714
1783
0 commit comments