-
Notifications
You must be signed in to change notification settings - Fork 375
Support multithreading in groupreduce #2491
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
2d57734
713d5b8
3b5addb
ab76ff8
3e225ad
011a9b8
9595755
8000e2e
d7192d5
cc8d2d4
08aa0d9
2678ccb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -129,3 +129,9 @@ pairs | |
```@docs | ||
isapprox | ||
``` | ||
|
||
## Multithreading | ||
```@docs | ||
DataFrames.nthreads | ||
DataFrames.nthreads! | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -157,24 +157,84 @@ function copyto_widen!(res::AbstractVector{T}, x::AbstractVector) where T | |
end | ||
|
||
function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Bool, | ||
incol::AbstractVector, gd::GroupedDataFrame) | ||
incol::AbstractVector, gd::GroupedDataFrame, nthreads::Int) | ||
n = length(gd) | ||
groups = gd.groups | ||
if adjust !== nothing || checkempty | ||
counts = zeros(Int, n) | ||
end | ||
groups = gd.groups | ||
@inbounds for i in eachindex(incol, groups) | ||
gix = groups[i] | ||
x = incol[i] | ||
if gix > 0 && (condf === nothing || condf(x)) | ||
# this check should be optimized out if U is not Any | ||
if eltype(res) === Any && !isassigned(res, gix) | ||
res[gix] = f(x, gix) | ||
else | ||
res[gix] = op(res[gix], f(x, gix)) | ||
nt = min(nthreads, Threads.nthreads()) | ||
if nt <= 1 || axes(incol) != axes(groups) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. when… There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since… |
||
@inbounds for i in eachindex(incol, groups) | ||
gix = groups[i] | ||
x = incol[i] | ||
if gix > 0 && (condf === nothing || condf(x)) | ||
# this check should be optimized out if eltype is not Any | ||
if eltype(res) === Any && !isassigned(res, gix) | ||
res[gix] = f(x, gix) | ||
else | ||
res[gix] = op(res[gix], f(x, gix)) | ||
end | ||
if adjust !== nothing || checkempty | ||
counts[gix] += 1 | ||
end | ||
end | ||
end | ||
else | ||
res_vec = Vector{typeof(res)}(undef, nt) | ||
# needs to be always allocated to fix type instability with @threads | ||
counts_vec = Vector{Vector{Int}}(undef, nt) | ||
res_vec[1] = res | ||
if adjust !== nothing || checkempty | ||
counts_vec[1] = counts | ||
end | ||
for i in 2:nt | ||
res_vec[i] = copy(res) | ||
if adjust !== nothing || checkempty | ||
counts_vec[i] = zeros(Int, n) | ||
end | ||
end | ||
Threads.@threads for tid in 1:nt | ||
res′ = res_vec[tid] | ||
if adjust !== nothing || checkempty | ||
counts[gix] += 1 | ||
counts′ = counts_vec[tid] | ||
end | ||
start = 1 + ((tid - 1) * length(groups)) ÷ nt | ||
stop = (tid * length(groups)) ÷ nt | ||
@inbounds for i in start:stop | ||
gix = groups[i] | ||
x = incol[i] | ||
if gix > 0 && (condf === nothing || condf(x)) | ||
# this check should be optimized out if eltype is not Any | ||
if eltype(res′) === Any && !isassigned(res′, gix) | ||
res′[gix] = f(x, gix) | ||
else | ||
res′[gix] = op(res′[gix], f(x, gix)) | ||
end | ||
if adjust !== nothing || checkempty | ||
counts′[gix] += 1 | ||
end | ||
end | ||
end | ||
end | ||
for i in 2:length(res_vec) | ||
resi = res_vec[i] | ||
@inbounds @simd for j in eachindex(res) | ||
# this check should be optimized out if eltype is not Any | ||
if eltype(res) === Any | ||
if isassigned(resi, j) && isassigned(res, j) | ||
res[j] = op(res[j], resi[j]) | ||
elseif isassigned(resi, j) | ||
res[j] = resi[j] | ||
end | ||
else | ||
res[j] = op(res[j], resi[j]) | ||
end | ||
end | ||
end | ||
if adjust !== nothing || checkempty | ||
for i in 2:length(counts_vec) | ||
counts .+= counts_vec[i] | ||
end | ||
end | ||
end | ||
|
@@ -218,26 +278,31 @@ end | |
|
||
# function barrier works around type instability of groupreduce_init due to applicable | ||
groupreduce(f, op, condf, adjust, checkempty::Bool, | ||
incol::AbstractVector, gd::GroupedDataFrame) = | ||
incol::AbstractVector, gd::GroupedDataFrame, | ||
nthreads::Int) = | ||
groupreduce!(groupreduce_init(op, condf, adjust, incol, gd), | ||
f, op, condf, adjust, checkempty, incol, gd) | ||
f, op, condf, adjust, checkempty, incol, gd, nthreads) | ||
# Avoids the overhead due to Missing when computing reduction | ||
groupreduce(f, op, condf::typeof(!ismissing), adjust, checkempty::Bool, | ||
incol::AbstractVector, gd::GroupedDataFrame) = | ||
incol::AbstractVector, gd::GroupedDataFrame, | ||
nthreads::Int) = | ||
groupreduce!(disallowmissing(groupreduce_init(op, condf, adjust, incol, gd)), | ||
f, op, condf, adjust, checkempty, incol, gd) | ||
f, op, condf, adjust, checkempty, incol, gd, nthreads) | ||
|
||
(r::Reduce)(incol::AbstractVector, gd::GroupedDataFrame) = | ||
groupreduce((x, i) -> x, r.op, r.condf, r.adjust, r.checkempty, incol, gd) | ||
(r::Reduce)(incol::AbstractVector, gd::GroupedDataFrame; | ||
nthreads::Int=nthreads()) = | ||
groupreduce((x, i) -> x, r.op, r.condf, r.adjust, r.checkempty, incol, gd, nthreads) | ||
|
||
# this definition is missing in Julia 1.0 LTS and is required by aggregation for var | ||
# TODO: remove this when we drop 1.0 support | ||
if VERSION < v"1.1" | ||
Base.zero(::Type{Missing}) = missing | ||
end | ||
|
||
function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame) | ||
means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd) | ||
function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame; | ||
nthreads::Int=nthreads()) | ||
means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, | ||
incol, gd, nthreads) | ||
# !ismissing check is purely an optimization to avoid a copy later | ||
if eltype(means) >: Missing && agg.condf !== !ismissing | ||
T = Union{Missing, real(eltype(means))} | ||
|
@@ -247,32 +312,38 @@ function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFra | |
res = zeros(T, length(gd)) | ||
return groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf, | ||
(x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1), | ||
false, incol, gd) | ||
false, incol, gd, nthreads) | ||
end | ||
|
||
function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame) | ||
outcol = Aggregate(var, agg.condf)(incol, gd) | ||
function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame; | ||
nthreads::Int=nthreads()) | ||
outcol = Aggregate(var, agg.condf)(incol, gd; nthreads=nthreads) | ||
if eltype(outcol) <: Union{Missing, Rational} | ||
return sqrt.(outcol) | ||
else | ||
return map!(sqrt, outcol, outcol) | ||
end | ||
end | ||
|
||
for f in (first, last) | ||
function (agg::Aggregate{typeof(f)})(incol::AbstractVector, gd::GroupedDataFrame) | ||
n = length(gd) | ||
outcol = similar(incol, n) | ||
fillfirst!(agg.condf, outcol, incol, gd, rev=agg.f === last) | ||
if isconcretetype(eltype(outcol)) | ||
return outcol | ||
else | ||
return copyto_widen!(Tables.allocatecolumn(typeof(first(outcol)), n), outcol) | ||
for f in (:first, :last) | ||
# Without using @eval the presence of a keyword argument triggers a Julia bug | ||
@eval begin | ||
function (agg::Aggregate{typeof($f)})(incol::AbstractVector, gd::GroupedDataFrame; | ||
nthreads::Int=nthreads()) | ||
n = length(gd) | ||
outcol = similar(incol, n) | ||
fillfirst!(agg.condf, outcol, incol, gd, rev=agg.f === last) | ||
if isconcretetype(eltype(outcol)) | ||
return outcol | ||
else | ||
return copyto_widen!(Tables.allocatecolumn(typeof(first(outcol)), n), outcol) | ||
end | ||
end | ||
end | ||
end | ||
|
||
function (agg::Aggregate{typeof(length)})(incol::AbstractVector, gd::GroupedDataFrame) | ||
function (agg::Aggregate{typeof(length)})(incol::AbstractVector, gd::GroupedDataFrame; | ||
nthreads::Int=nthreads()) | ||
if getfield(gd, :idx) === nothing | ||
lens = zeros(Int, length(gd)) | ||
@inbounds for gix in gd.groups | ||
|
Uh oh!
There was an error while loading. Please reload this page.