Fix bug (#12)

* simpler ArrayStyle * hoist buffer out of benchmark * rm debug code, restore codepath
ericphanson · Oct 26, 2023 · ac1d45e · ac1d45e · ericphanson · Oct 26, 2023
1 parent ba04628
commit ac1d45e
Show file tree

Hide file tree

Showing 6 changed files with 74 additions and 53 deletions.
diff --git a/README.md b/README.md
@@ -60,18 +60,18 @@ For a less-toy example, in `test/flux.jl` we test inference over a Flux model:
 
 ```julia
 # Baseline: Array
-infer!(predictions, model, data): 0.457247 seconds (8.02 k allocations: 370.796 MiB, 6.10% gc time)
+infer!(b, predictions, model, data): 0.499735 seconds (8.05 k allocations: 306.796 MiB, 6.47% gc time)
 # Baseline: StrideArray
 stride_data = StrideArray.(data)
-infer!(predictions, model, stride_data): 0.336535 seconds (8.05 k allocations: 370.796 MiB, 6.20% gc time)
+infer!(b, predictions, model, stride_data): 0.364180 seconds (8.05 k allocations: 306.796 MiB, 8.32% gc time)
 # Using AllocArray:
 alloc_data = AllocArray.(data)
-infer!(predictions, model, alloc_data): 0.318736 seconds (13.35 k allocations: 67.225 MiB)
+infer!(b, predictions, model, alloc_data): 0.351953 seconds (13.60 k allocations: 3.221 MiB)
 checked_alloc_data = CheckedAllocArray.(data)
-infer!(predictions, model, checked_alloc_data): 23.673344 seconds (26.15 k allocations: 67.773 MiB)
+infer!(b, predictions, model, checked_alloc_data): 15.522897 seconds (25.54 k allocations: 3.742 MiB)
 ```
 
-We can see in this example, we got much less allocation (and no GC time), and similar runtime. By running larger examples, the gap in allocations can be much larger; here we use a 64 MiB buffer that we allocate each `infer!` call, which accounts for most of the memory usage.
+We can see that in this example we got roughly 100x less allocation (and no GC time), with similar runtime, for `AllocArray`s. We can also see that `CheckedAllocArray`s are far slower here.
 
 ## Design notes
 

diff --git a/src/AllocArray.jl b/src/AllocArray.jl
@@ -44,8 +44,8 @@ Base.size(a::AllocArray) = size(getfield(a, :arr))
 Base.IndexStyle(::Type{<:AllocArray{T,N,Arr}}) where {T,N,Arr} = Base.IndexStyle(Arr)
 
 # used only by broadcasting?
-function Base.similar(::Type{AllocArray{T,N,Arr}}, dims::Dims) where {T,N,Arr}
-    return alloc_similar(CURRENT_ALLOCATOR[], AllocArray{T,N,Arr}, dims)
+function Base.similar(::Type{<:AllocArray{T}}, dims::Dims) where {T}
+    return alloc_similar(CURRENT_ALLOCATOR[], AllocArray{T}, dims)
 end
 
 function Base.similar(a::AllocArray, ::Type{T}, dims::Dims) where {T}
@@ -56,13 +56,13 @@ end
 ##### Broadcasting
 #####
 
-function Base.BroadcastStyle(::Type{AllocArray{T,N,Arr}}) where {T,N,Arr}
-    return Broadcast.ArrayStyle{AllocArray{T,N,Arr}}()
+function Base.BroadcastStyle(::Type{<:AllocArray})
+    return ArrayStyle{AllocArray}()
 end
 
-function Base.similar(bc::Broadcasted{ArrayStyle{AllocArray{T,N,Arr}}},
-                      ::Type{ElType}) where {T,N,Arr,ElType}
-    return similar(AllocArray{T,N,Arr}, axes(bc))
+function Base.similar(bc::Broadcasted{ArrayStyle{AllocArray}},
+                      ::Type{T}) where {T}
+    return similar(AllocArray{T}, axes(bc))
 end
 
 #####

diff --git a/src/CheckedAllocArray.jl b/src/CheckedAllocArray.jl
@@ -121,8 +121,8 @@ end
 Base.IndexStyle(::Type{<:CheckedAllocArray{T,N,Arr}}) where {T,N,Arr} = Base.IndexStyle(Arr)
 
 # used only by broadcasting?
-function Base.similar(::Type{CheckedAllocArray{T,N,Arr}}, dims::Dims) where {T,N,Arr}
-    return alloc_similar(CURRENT_ALLOCATOR[], CheckedAllocArray{T,N,Arr}, dims)
+function Base.similar(::Type{<:CheckedAllocArray{T}}, dims::Dims) where {T}
+    return alloc_similar(CURRENT_ALLOCATOR[], CheckedAllocArray{T}, dims)
 end
 
 function Base.similar(a::CheckedAllocArray, ::Type{T}, dims::Dims) where {T}
@@ -133,13 +133,13 @@ end
 ##### Broadcasting
 #####
 
-function Base.BroadcastStyle(::Type{CheckedAllocArray{T,N,Arr}}) where {T,N,Arr}
-    return Broadcast.ArrayStyle{CheckedAllocArray{T,N,Arr}}()
+function Base.BroadcastStyle(::Type{<:CheckedAllocArray})
+    return ArrayStyle{CheckedAllocArray}()
 end
 
-function Base.similar(bc::Broadcasted{ArrayStyle{CheckedAllocArray{T,N,Arr}}},
-                      ::Type{ElType}) where {T,N,Arr,ElType}
-    return similar(CheckedAllocArray{T,N,Arr}, axes(bc))::CheckedAllocArray
+function Base.similar(bc::Broadcasted{ArrayStyle{CheckedAllocArray}},
+                      ::Type{T}) where {T}
+    return similar(CheckedAllocArray{T}, axes(bc))::CheckedAllocArray
 end
 
 #####

diff --git a/src/alloc_interface.jl b/src/alloc_interface.jl
@@ -5,10 +5,15 @@
 
 Allocators need to subtype `Allocator` and implement two methods of `alloc_similar`:
 
-- `AllocArrays.alloc_similar(::Allocator, arr, ::Type{T}, dims::Dims)`
-- `AllocArrays.alloc_similar(::Allocator, ::Type{Arr}, dims::Dims) where {Arr<:AbstractArray}`
+- `AllocArrays.alloc_similar(::MyAllocator, a::AllocArray, ::Type{T}, dims::Dims)`
+- `AllocArrays.alloc_similar(::MyAllocator, ::Type{<:AllocArray{T}}, dims::Dims) where {T}`
 
-where the latter is used by broadcasting.
+to support `AllocArray`s (which should each return an `AllocArray`), and likewise
+
+- `AllocArrays.alloc_similar(::MyAllocator, a::CheckedAllocArray, ::Type{T}, dims::Dims)`
+- `AllocArrays.alloc_similar(::MyAllocator, ::Type{<:CheckedAllocArray{T}}, dims::Dims) where {T}`
+
+which should each return a `CheckedAllocArray`.
 """
 abstract type Allocator end
 
@@ -38,20 +43,22 @@ function alloc_similar(::DefaultAllocator, ::AllocArray, ::Type{T}, dims::Dims)
     return AllocArray(similar(Array{T}, dims))
 end
 
-function alloc_similar(::DefaultAllocator, ::Type{AllocArray{T,N,Arr}},
-                       dims::Dims) where {T, N, Arr}
-    return AllocArray(similar(Arr, dims))
-end
-
-function alloc_similar(::DefaultAllocator, ::CheckedAllocArray, ::Type{T}, dims::Dims) where {T}
-    return CheckedAllocArray(similar(Array{T}, dims))
+function alloc_similar(::DefaultAllocator, ::Type{<:AllocArray{T}},
+                       dims::Dims) where {T}
+    return AllocArray(similar(Array{T}, dims))
 end
 
-function alloc_similar(::DefaultAllocator, ::Type{CheckedAllocArray{T,N,Arr}},
-                       dims::Dims) where {T, N, Arr}
+function alloc_similar(D::DefaultAllocator, c::CheckedAllocArray, ::Type{T},
+                       dims::Dims) where {T}
     # We know the memory is valid since it was allocated with the
     # default allocator
-    return CheckedAllocArray(similar(Arr, dims), MemValid(true))
+    a = @lock(c, alloc_similar(D, _get_inner(c), T, dims))
+    return CheckedAllocArray(a, MemValid(true))
+end
+
+function alloc_similar(D::DefaultAllocator, ::Type{<:CheckedAllocArray{T}},
+                       dims::Dims) where {T}
+    return CheckedAllocArray(alloc_similar(D, AllocArray{T}, dims), MemValid(true))
 end
 
 #####
@@ -125,12 +132,14 @@ function reset!(B::UncheckedBumperAllocator)
     return nothing
 end
 
-function alloc_similar(B::UncheckedBumperAllocator, ::AllocArray, ::Type{T}, dims::Dims) where {T}
+function alloc_similar(B::UncheckedBumperAllocator, ::AllocArray, ::Type{T},
+                       dims::Dims) where {T}
     inner = Bumper.alloc(T, B.buf, dims...)
     return AllocArray(inner)
 end
 
-function alloc_similar(B::UncheckedBumperAllocator, ::Type{AllocArray{T,N,Arr}}, dims::Dims) where {T, N, Arr}
+function alloc_similar(B::UncheckedBumperAllocator, ::Type{<:AllocArray{T}},
+                       dims::Dims) where {T}
     inner = Bumper.alloc(T, B.buf, dims...)
     return AllocArray(inner)
 end
@@ -224,10 +233,10 @@ function alloc_similar(B::BumperAllocator, c::CheckedAllocArray, ::Type{T},
     end
 end
 
-function alloc_similar(B::BumperAllocator, ::Type{CheckedAllocArray{T,N,Arr}},
-                       dims::Dims) where {T,N,Arr}
+function alloc_similar(B::BumperAllocator, ::Type{<:CheckedAllocArray{T}},
+                       dims::Dims) where {T}
     @lock B begin
-        inner = alloc_similar(B.bumper, Arr, dims)
+        inner = alloc_similar(B.bumper, AllocArray{T}, dims)
         valid = MemValid(true)
         push!(B.mems, valid)
         return CheckedAllocArray(inner, valid)
@@ -239,9 +248,9 @@ end
 # If we have a `BumperAllocator` and are asked to allocate an unchecked array
 # then we can do that by dispatching to the inner bumper. We will still
 # get the lock for concurrency-safety.
-function alloc_similar(B::BumperAllocator, ::Type{AllocArray{T,N,Arr}},
-                       dims::Dims) where {T,N,Arr}
-    return @lock(B, alloc_similar(B.bumper, AllocArray{T,N,Arr}, dims))
+function alloc_similar(B::BumperAllocator, ::Type{<:AllocArray{T}},
+                       dims::Dims) where {T}
+    return @lock(B, alloc_similar(B.bumper, AllocArray{T}, dims))
 end
 
 function alloc_similar(B::BumperAllocator, a::AllocArray, ::Type{T},

diff --git a/test/flux.jl b/test/flux.jl
@@ -80,8 +80,7 @@ end
 # Our model acts on input just by applying the chain.
 (m::DigitsModel)(x) = m.chain(x)
 
-function infer!(predictions, model, data)
-    b = BumperAllocator(2^26) # 64 MiB
+function infer!(b, predictions, model, data)
     # Here we use a locked bumper for thread-safety, since NNlib multithreads
     # some of its functions. However, we are sure to only deallocate outside of the threaded region. (All concurrency occurs within the `model` call itself.)
     with_allocator(b) do
@@ -96,6 +95,8 @@ end
 @testset "More complicated model" begin
     model = DigitsModel()
 
+    b = BumperAllocator(2^26) # 64 MiB
+
     # Setup some fake data
     N = 1_000
     data_arr = rand(Float32, 28, 28, N)
@@ -112,32 +113,35 @@ end
     checked_alloc_data = CheckedAllocArray.(data)
 
     preds_data = fresh_predictions()
-    infer!(preds_data, model, data)
+    infer!(b, preds_data, model, data)
 
     preds_alloc = fresh_predictions()
-    infer!(preds_alloc, model, alloc_data)
+    infer!(b, preds_alloc, model, alloc_data)
 
     preds_checked_alloc = fresh_predictions()
-    infer!(preds_checked_alloc, model, checked_alloc_data)
+    infer!(b, preds_checked_alloc, model, checked_alloc_data)
 
     preds_stride = fresh_predictions()
     stride_data = StrideArray.(data)
-    infer!(preds_stride, model, stride_data)
+    infer!(b, preds_stride, model, stride_data)
 
     @test preds_data ≈ preds_alloc
     @test preds_data ≈ preds_stride
     @test preds_data ≈ preds_checked_alloc
 
     predictions = fresh_predictions()
-    @showtime infer!(predictions, model, data)
-    @showtime infer!(predictions, model, stride_data)
-    @showtime infer!(predictions, model, alloc_data)
-    @showtime infer!(predictions, model, checked_alloc_data)
+    @showtime infer!(b, predictions, model, data)
+    @showtime infer!(b, predictions, model, stride_data)
+    @showtime infer!(b, predictions, model, alloc_data)
+    @showtime infer!(b, predictions, model, checked_alloc_data)
 
     # Note: for max perf, consider
-    # (using Functors)
+    # using Functors
     # model = fmap(AllocArray ∘ PtrArray, model; exclude = x -> x isa AbstractArray)
-    # and `alloc_data = AllocArray.(PtrArray.(data))`
+    # alloc_data = AllocArray.(PtrArray.(data))
+    # @showtime infer!(b, predictions, model, alloc_data)
+    # @showtime infer!(b, predictions, model, alloc_data)
     # Together, that ensures everything is an `AllocArray(PtrArray(...))`
-    # This seems to help although not a huge amount.
+    # This seems to help with runtime although not a huge amount,
+    # and doesn't really help with allocations.
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -39,4 +39,12 @@ end
 
     include("flux.jl")
     include("checked.jl")
+
+    # Bug reported here:
+    # https://julialang.zulipchat.com/#narrow/stream/137791-general/topic/AllocArrays.2Ejl/near/398698500
+    a = AllocArray(1:4)
+    @test a[1:2] .+ a[3:4]' isa AllocArray
+
+    a = CheckedAllocArray(1:4)
+    @test a[1:2] .+ a[3:4]' isa CheckedAllocArray
 end