diff --git a/Project.toml b/Project.toml
index 9f640f545..bb4e20bd9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,8 @@ version = "0.5.0"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CUDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
+CUDNN_jll = "62b44479-cb7b-5706-934f-f13b2eb2e645"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
@@ -26,7 +28,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Adapt = "3"
-CUDA = "3.3"
+CUDA = "3, 4"
 ChainRulesCore = "1"
 DataStructures = "0.18"
 Flux = "0.13.4"
diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
index a0baa83ee..f2dc59bd7 100644
--- a/src/GNNGraphs/GNNGraphs.jl
+++ b/src/GNNGraphs/GNNGraphs.jl
@@ -15,6 +15,8 @@ using ChainRulesCore
 using LinearAlgebra, Random, Statistics
 import MLUtils
 using MLUtils: getobs, numobs
+using CUDA.CUSPARSE: AbstractCuSparseMatrix
+
 
 include("gnngraph.jl")
 export GNNGraph,
diff --git a/src/GNNGraphs/convert.jl b/src/GNNGraphs/convert.jl
index 8da7345a4..0f83ab1dc 100644
--- a/src/GNNGraphs/convert.jl
+++ b/src/GNNGraphs/convert.jl
@@ -161,7 +161,7 @@ function to_sparse(A::ADJMAT_T, T=nothing; dir=:out, num_nodes=nothing, weighted
         A = sparse(A)
     end
     if !weighted
-        A = map(x -> ifelse(x > 0, T(1), T(0)), A)
+        A = binarize(A, T)
     end
     return A, num_nodes, num_edges
 end
diff --git a/src/GNNGraphs/query.jl b/src/GNNGraphs/query.jl
index a537a90c4..ab2882b43 100644
--- a/src/GNNGraphs/query.jl
+++ b/src/GNNGraphs/query.jl
@@ -143,12 +143,7 @@ User may specify the eltype `T` of the returned matrix.
 If `weighted=true`, the `A` will contain the edge weigths if any, otherwise the elements of `A` will be either 0 or 1.
 """
 function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=eltype(g); dir=:out, weighted=true)
-    if g.graph[1] isa CuVector
-        # TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
-        A, n, m = to_dense(g.graph, T; num_nodes=g.num_nodes, weighted)
-    else
-        A, n, m = to_sparse(g.graph, T; num_nodes=g.num_nodes, weighted)
-    end
+    A, n, m = to_sparse(g.graph, T; num_nodes=g.num_nodes, weighted)
     @assert size(A) == (n, n)
     return dir == :out ? A : A'
 end
@@ -157,7 +152,7 @@ function Graphs.adjacency_matrix(g::GNNGraph{<:ADJMAT_T}, T::DataType=eltype(g);
     @assert dir ∈ [:in, :out]
     A = g.graph
     if !weighted
-        A = binarize(A)
+        A = binarize(A, T)
     end
     A = T != eltype(A) ? T.(A) : A
     return dir == :out ? A : A'
@@ -232,7 +227,7 @@ function Graphs.degree(g::GNNGraph{<:ADJMAT_T}, T::TT=nothing; dir=:out, edge_we
     end
     A = adjacency_matrix(g)
     if edge_weight === false
-        A = binarize(A)
+        A = binarize(A, T)
     end
     A = eltype(A) != T ? T.(A) : A
     return dir == :out ? vec(sum(A, dims=2)) :
diff --git a/src/GNNGraphs/utils.jl b/src/GNNGraphs/utils.jl
index 0e5c498c0..ec50b555f 100644
--- a/src/GNNGraphs/utils.jl
+++ b/src/GNNGraphs/utils.jl
@@ -172,8 +172,46 @@ function edge_decoding(idx, n; directed=true)
     return s, t
 end
 
-binarize(x) = map(>(0), x)
+# binarize(x) = map(>(0), x) # map is not supported by CuSparse types
+binarize(x::AbstractArray) = >(0).(x)
+binarize(x::Number, T::Type)::T = ifelse(x > 0, T(1), T(0))
+binarize(x::AbstractArray, T) = T.(binarize(x)) # didn't find a better cusparse compatible implementation
+
 @non_differentiable binarize(x...)
 @non_differentiable edge_encoding(x...)
 @non_differentiable edge_decoding(x...)
+
+
+## PIRACY. THESE SHOULD GO in CUDA.jl
+
+# Workaround https://github.com/JuliaGPU/CUDA.jl/issues/1406
+Base.sum(x::AbstractCuSparseMatrix; dims=:) = cusparse_sum(x, Val(dims))
+
+cusparse_sum(x, ::Val{:}) = sum(cusparse_sum(x, Val(1)))
+
+function cusparse_sum(x::AbstractCuSparseMatrix, ::Val{1})
+    m, n = size(x)
+    v = ones_like(x, (1, m))
+    return v * x
+end
+
+function cusparse_sum(x::AbstractCuSparseMatrix, ::Val{2})
+    m, n = size(x)
+    v = ones_like(x, (n, 1))
+    return x * v
+end
+
+# workaround https://github.com/JuliaGPU/CUDA.jl/issues/1664
+function CUDA.CuMatrix{T}(x::AbstractCuSparseMatrix{T}) where T <: Integer
+    return T.(CuMatrix(Float32.(x)))
+end
+
+function Base.:(*)(x::AbstractCuSparseMatrix, d::Diagonal)
+    return x .* d.diag'
+end
+
+function Base.:(*)(d::Diagonal, x::AbstractCuSparseMatrix)
+    return d.diag .* CuArray(x) # couldn't do better
+end
+
diff --git a/src/msgpass.jl b/src/msgpass.jl
index d6f6df6d7..6f5cb204f 100644
--- a/src/msgpass.jl
+++ b/src/msgpass.jl
@@ -189,14 +189,14 @@ end
 ## COPY_XJ
 
 function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(+), xi, xj::AbstractMatrix, e)
-    A = adjacency_matrix(g, weighted=false)
+    A = adjacency_matrix(g, eltype(xj), weighted=false)
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
-    propagate((xi,xj,e) -> copy_xj(xi,xj,e), g, +, xi, xj, e)
-end
+# ## avoid the fast path on gpu until we have better cuda support
+# function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
+#     propagate((xi,xj,e) -> copy_xj(xi,xj,e), g, +, xi, xj, e)
+# end
 
 
 ## E_MUL_XJ
@@ -207,10 +207,10 @@ function propagate(::typeof(e_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstrac
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::AbstractVector)
-    propagate((xi,xj,e) -> e_mul_xj(xi,xj,e), g, +, xi, xj, e)
-end
+# ## avoid the fast path on gpu until we have better cuda support
+# function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::AbstractVector)
+#     propagate((xi,xj,e) -> e_mul_xj(xi,xj,e), g, +, xi, xj, e)
+# end
 
 
 ## W_MUL_XJ
@@ -220,10 +220,10 @@ function propagate(::typeof(w_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstrac
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::Nothing)
-    propagate((xi,xj,e) -> w_mul_xj(xi,xj,e), g, +, xi, xj, e)
-end
+# ## avoid the fast path on gpu until we have better cuda support
+# function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::Nothing)
+#     propagate((xi,xj,e) -> w_mul_xj(xi,xj,e), g, +, xi, xj, e)
+# end
 
 
diff --git a/test_cuda.jl b/test_cuda.jl
new file mode 100644
index 000000000..5e547a8ff
--- /dev/null
+++ b/test_cuda.jl
@@ -0,0 +1,85 @@
+
+### how to make it work with CUDA.jl v4.0 ####
+# dev Flux
+# In the Flux project folder:
+    # - change the CUDA compat bound
+    # - comment out usages of CUDA.has_cudnn()
+# dev NNlibCUDA
+# In the NNlibCUDA project folder:
+    # - change the CUDA compat bound
+    # - add CUDA#master
+    # - convert all the using CUDA.CUDNN to using CUDNN
+    # - add https://github.com/JuliaGPU/CUDA.jl:lib/cudnn
+# add CUDA#master
+# add https://github.com/JuliaGPU/CUDA.jl:lib/cudnn  # CUDNN subpackage not registered yet
+
+using GraphNeuralNetworks, CUDA, Flux
+using CUDA.CUSPARSE
+using LinearAlgebra, SparseArrays
+using GraphNeuralNetworks.GNNGraphs: binarize
+CUDA.allowscalar(false)
+
+g_cpu = rand_graph(10, 10, graph_type = :sparse)
+g = g_cpu |> gpu
+
+a = adjacency_matrix(g, Float32)
+# maximum(a)
+# minimum(a)
+# extrema(a)
+# sum(a)
+
+x = rand(2, 10) |> gpu
+z = rand(10, 2) |> gpu
+
+@assert x * z isa CuMatrix
+@assert a .+ 1 isa CuMatrix
+@assert tanh.(a) isa CuSparseMatrix
+@assert a + a isa CuSparseMatrix
+@assert mul!(deepcopy(z), a, z, 0, 1) isa CuArray
+@assert mul!(deepcopy(x), x, a, 0, 1) isa CuArray
+# @assert mm!('N', 'N', 0, a, z, 1, deepcopy(z), 'O') isa CuArray
+
+@assert x * a isa CuMatrix
+@assert a * z isa CuMatrix
+# a * a
+f(x) = x > 0
+@assert f.(a) isa CuSparseMatrixCSC{Bool}
+# map(f, a)
+@assert binarize.(a) isa CuSparseMatrix
+# show(a')
+# CUDA.ones(10) .* a
+# a .* CUDA.ones(10)
+
+
+b = CuSparseMatrixCSR(a)
+@assert x * z isa CuMatrix
+@assert b .+ 1 isa CuMatrix
+@assert tanh.(b) isa CuSparseMatrix
+@assert b + b isa CuSparseMatrix
+@assert x * b isa CuMatrix
+@assert b * z isa CuMatrix
+f(x) = x > 0
+#BUG # @assert f.(b) isa CuSparseMatrixCSC{Bool}
+# map(f, b)
+
+c = CuSparseMatrixCOO(a)
+@assert x * z isa CuMatrix
+# BUG @assert c .+ 1 isa CuMatrix
+# BUG @assert tanh.(c) isa CuSparseMatrix
+# BUG @assert c + c isa CuSparseMatrix
+@assert x * c isa CuMatrix
+@assert c * z isa CuMatrix
+f(x) = x > 0
+# map(f, c)
+# BUG @assert f.(c) isa CuSparseMatrixCSC{Bool}
+
+
+# b * b
+m = GCNConv(2 => 2) |> gpu
+y = m(g, x)
+
+g2 = rand_graph(10, 10, graph_type=:coo) |> gpu
+adjacency_matrix(g2)
+
+
+a
\ No newline at end of file
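
# ---------------------------------------------------------------------------
# Illustrative sketches (not part of the patch above). They restate, on CPU,
# what the patched code is expected to do; helper names and test data below
# are made up for illustration only.

# Sketch 1: semantics of the two-argument `binarize` added in src/GNNGraphs/utils.jl.
# Positive entries map to one(T), everything else to zero(T); broadcasting is used
# instead of `map`, which is not supported by CuSparse types.
using SparseArrays

binarize_sketch(x::AbstractArray, T) = T.(x .> 0)   # same behavior as binarize(x, T) in the patch

A1 = sparse([1, 2], [2, 3], [0.5, 2.0], 3, 3)
@assert binarize_sketch(A1, Float32) == sparse([1, 2], [2, 3], Float32[1.0, 1.0], 3, 3)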
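
# Sketch 2: the identity behind the `Base.sum` workaround for AbstractCuSparseMatrix
# (the `cusparse_sum` methods in src/GNNGraphs/utils.jl): a reduction along one
# dimension is rewritten as a product with a ones vector, so only sparse-dense
# matmul is needed on the GPU. CPU check of that identity (sizes are arbitrary):
using SparseArrays

A2 = sprand(4, 5, 0.5)
@assert ones(1, 4) * A2 ≈ sum(A2, dims=1)   # cusparse_sum(x, Val(1)): v * x with v = ones(1, m)
@assert A2 * ones(5, 1) ≈ sum(A2, dims=2)   # cusparse_sum(x, Val(2)): x * v with v = ones(n, 1)
@assert sum(ones(1, 4) * A2) ≈ sum(A2)      # cusparse_sum(x, Val(:)) sums the dims=1 result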
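
# Sketch 3: the copy_xj fast path in src/msgpass.jl. Summing copy_xj messages at each
# target node equals the product xj * A, where A is the unweighted adjacency matrix
# (dir = :out, so A[j, i] marks an edge j -> i); the patched method now builds A with
# eltype(xj) instead of the graph's default eltype. CPU check on a small made-up graph:
using GraphNeuralNetworks

s_idx = [1, 2, 3, 4, 1]
t_idx = [2, 3, 1, 1, 3]
g_sketch = GNNGraph(s_idx, t_idx)
xj_sketch = rand(Float32, 3, 4)
A3 = adjacency_matrix(g_sketch, Float32, weighted = false)
expected = zeros(Float32, 3, 4)
for (j, i) in zip(s_idx, t_idx)
    expected[:, i] .+= xj_sketch[:, j]   # message copied from source j, aggregated with + at target i
end
@assert xj_sketch * A3 ≈ expected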