Skip to content

Commit dfd9185

Browse files
committed
Sparse: CrsMatrix traversal initial implementation
The current implementation is fairly basic but allows users to get a truly portable way to run functor on CPU and GPU over the values of a matrix. This should lower the barrier to get some distributed custom algorithms in users codes and libraries.
1 parent 19b30d8 commit dfd9185

5 files changed

+357
-3
lines changed

perf_test/sparse/KokkosSparse_par_ilut.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,6 @@ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
7171
size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>;
7272
using float_t = typename Kokkos::ArithTraits<scalar_t>::mag_type;
7373

74-
static constexpr bool IS_GPU =
75-
KokkosKernels::Impl::kk_is_gpu_exec_space<exe_space>();
76-
7774
///////////////////////////////////////////////////////////////////////////////
7875
void run_par_ilut_test(benchmark::State& state, KernelHandle& kh,
7976
const sp_matrix_type& A, int& num_iters)
@@ -132,6 +129,9 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh,
132129

133130
#ifdef USE_GINKGO
134131
///////////////////////////////////////////////////////////////////////////////
132+
static constexpr bool IS_GPU =
133+
KokkosKernels::Impl::kk_is_gpu_exec_space<exe_space>();
134+
135135
using ginkgo_exec =
136136
std::conditional_t<IS_GPU, gko::CudaExecutor, gko::OmpExecutor>;
137137

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
//@HEADER
2+
// ************************************************************************
3+
//
4+
// Kokkos v. 4.0
5+
// Copyright (2022) National Technology & Engineering
6+
// Solutions of Sandia, LLC (NTESS).
7+
//
8+
// Under the terms of Contract DE-NA0003525 with NTESS,
9+
// the U.S. Government retains certain rights in this software.
10+
//
11+
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12+
// See https://kokkos.org/LICENSE for license information.
13+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14+
//
15+
//@HEADER
16+
17+
namespace KokkosSparse {
18+
namespace Impl {
19+
20+
template <class execution_space, class matrix_type, class functor_type>
21+
struct crsmatrix_traversal_functor {
22+
using size_type = typename matrix_type::non_const_size_type;
23+
using ordinal_type = typename matrix_type::non_const_ordinal_type;
24+
using value_type = typename matrix_type::non_const_value_type;
25+
26+
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
27+
using team_member_type = typename team_policy_type::member_type;
28+
29+
matrix_type A;
30+
functor_type func;
31+
ordinal_type rows_per_team;
32+
33+
crsmatrix_traversal_functor(const matrix_type& A_, const functor_type& func_,
34+
const ordinal_type rows_per_team_)
35+
: A(A_), func(func_), rows_per_team(rows_per_team_) {}
36+
37+
// RangePolicy overload
38+
KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type rowIdx) const {
39+
for (size_type entryIdx = A.graph.row_map(rowIdx);
40+
entryIdx < A.graph.row_map(rowIdx + 1); ++entryIdx) {
41+
const ordinal_type colIdx = A.graph.entries(entryIdx);
42+
const value_type value = A.values(entryIdx);
43+
44+
func(rowIdx, entryIdx, colIdx, value);
45+
}
46+
}
47+
48+
// TeamPolicy overload
49+
KOKKOS_INLINE_FUNCTION void operator()(const team_member_type& dev) const {
50+
const ordinal_type teamWork = dev.league_rank() * rows_per_team;
51+
Kokkos::parallel_for(
52+
Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) {
53+
// iRow represents a row of the matrix, so its correct type is
54+
// ordinal_type.
55+
const ordinal_type rowIdx = teamWork + loop;
56+
if (rowIdx >= A.numRows()) {
57+
return;
58+
}
59+
60+
const ordinal_type row_length =
61+
A.graph.row_map(rowIdx + 1) - A.graph.row_map(rowIdx);
62+
Kokkos::parallel_for(
63+
Kokkos::ThreadVectorRange(dev, row_length),
64+
[&](ordinal_type rowEntryIdx) {
65+
const size_type entryIdx = A.graph.row_map(rowIdx) +
66+
static_cast<size_type>(rowEntryIdx);
67+
const ordinal_type colIdx = A.graph.entries(entryIdx);
68+
const value_type value = A.values(entryIdx);
69+
70+
func(rowIdx, entryIdx, colIdx, value);
71+
});
72+
});
73+
}
74+
};
75+
76+
template <class execution_space>
77+
int64_t crsmatrix_traversal_launch_parameters(int64_t numRows, int64_t nnz,
78+
int64_t rows_per_thread,
79+
int& team_size,
80+
int& vector_length) {
81+
int64_t rows_per_team;
82+
int64_t nnz_per_row = nnz / numRows;
83+
84+
if (nnz_per_row < 1) nnz_per_row = 1;
85+
86+
int max_vector_length = 1;
87+
#ifdef KOKKOS_ENABLE_CUDA
88+
if (std::is_same<execution_space, Kokkos::Cuda>::value)
89+
max_vector_length = 32;
90+
#endif
91+
#ifdef KOKKOS_ENABLE_HIP
92+
if (std::is_same<execution_space, Kokkos::Experimental::HIP>::value)
93+
max_vector_length = 64;
94+
#endif
95+
96+
if (vector_length < 1) {
97+
vector_length = 1;
98+
while (vector_length < max_vector_length && vector_length * 6 < nnz_per_row)
99+
vector_length *= 2;
100+
}
101+
102+
// Determine rows per thread
103+
if (rows_per_thread < 1) {
104+
if (KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>())
105+
rows_per_thread = 1;
106+
else {
107+
if (nnz_per_row < 20 && nnz > 5000000) {
108+
rows_per_thread = 256;
109+
} else
110+
rows_per_thread = 64;
111+
}
112+
}
113+
114+
if (team_size < 1) {
115+
if (KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
116+
team_size = 256 / vector_length;
117+
} else {
118+
team_size = 1;
119+
}
120+
}
121+
122+
rows_per_team = rows_per_thread * team_size;
123+
124+
if (rows_per_team < 0) {
125+
int64_t nnz_per_team = 4096;
126+
int64_t conc = execution_space().concurrency();
127+
while ((conc * nnz_per_team * 4 > nnz) && (nnz_per_team > 256))
128+
nnz_per_team /= 2;
129+
rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row;
130+
}
131+
132+
return rows_per_team;
133+
}
134+
135+
template <class execution_space, class crsmatrix_type, class functor_type>
136+
void crsmatrix_traversal_on_host(const execution_space& space,
137+
const crsmatrix_type& A,
138+
const functor_type& func) {
139+
// Wrap user functor with crsmatrix_traversal_functor
140+
crsmatrix_traversal_functor<execution_space, crsmatrix_type, functor_type>
141+
traversal_func(A, func, -1);
142+
143+
// Launch traversal kernel
144+
Kokkos::parallel_for(
145+
"KokkosSparse::crsmatrix_traversal",
146+
Kokkos::RangePolicy<execution_space>(space, 0, A.numRows()),
147+
traversal_func);
148+
}
149+
150+
template <class execution_space, class crsmatrix_type, class functor_type>
151+
void crsmatrix_traversal_on_gpu(const execution_space& space,
152+
const crsmatrix_type& A,
153+
const functor_type& func) {
154+
// Wrap user functor with crsmatrix_traversal_functor
155+
int64_t rows_per_thread = 0;
156+
int team_size = 0, vector_length = 0;
157+
const int64_t rows_per_team =
158+
crsmatrix_traversal_launch_parameters<execution_space>(
159+
A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length);
160+
const int nteams =
161+
(static_cast<int>(A.numRows()) + rows_per_team - 1) / rows_per_team;
162+
crsmatrix_traversal_functor<execution_space, crsmatrix_type, functor_type>
163+
traversal_func(A, func, rows_per_team);
164+
165+
// Launch traversal kernel
166+
Kokkos::parallel_for("KokkosSparse::crsmatrix_traversal",
167+
Kokkos::TeamPolicy<execution_space>(
168+
space, nteams, team_size, vector_length),
169+
traversal_func);
170+
}
171+
172+
} // namespace Impl
173+
} // namespace KokkosSparse
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
//@HEADER
2+
// ************************************************************************
3+
//
4+
// Kokkos v. 4.0
5+
// Copyright (2022) National Technology & Engineering
6+
// Solutions of Sandia, LLC (NTESS).
7+
//
8+
// Under the terms of Contract DE-NA0003525 with NTESS,
9+
// the U.S. Government retains certain rights in this software.
10+
//
11+
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12+
// See https://kokkos.org/LICENSE for license information.
13+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14+
//
15+
//@HEADER
16+
17+
/// \file KokkosSparse_CrsMatrix_traversal.hpp
18+
/// \brief Traversal method to access all entries in a CrsMatrix
19+
///
20+
/// blah
21+
22+
#ifndef KOKKOSSPARSE_CRSMATRIX_TRAVERSAL_HPP
23+
#define KOKKOSSPARSE_CRSMATRIX_TRAVERSAL_HPP
24+
25+
#include "Kokkos_Core.hpp"
26+
27+
#include "KokkosSparse_CrsMatrix.hpp"
28+
#include "KokkosKernels_ExecSpaceUtils.hpp"
29+
30+
#include "KokkosSparse_CrsMatrix_traversal_impl.hpp"
31+
32+
namespace KokkosSparse {
33+
namespace Experimental {
34+
35+
template <class execution_space, class crsmatrix_type, class functor_type>
36+
void crsmatrix_traversal(const execution_space& space,
37+
const crsmatrix_type& matrix, functor_type& functor) {
38+
// Choose between device and host implementation
39+
if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
40+
KokkosSparse::Impl::crsmatrix_traversal_on_gpu(space, matrix, functor);
41+
} else {
42+
KokkosSparse::Impl::crsmatrix_traversal_on_host(space, matrix, functor);
43+
}
44+
}
45+
46+
template <class crsmatrix_type, class functor_type>
47+
void crsmatrix_traversal(const crsmatrix_type& matrix, functor_type& functor) {
48+
using execution_space = typename crsmatrix_type::execution_space;
49+
execution_space space{};
50+
crsmatrix_traversal(space, matrix, functor);
51+
}
52+
53+
} // namespace Experimental
54+
} // namespace KokkosSparse
55+
56+
#endif // KOKKOSSPARSE_CRSMATRIX_TRAVERSAL_HPP

sparse/unit_test/Test_Sparse.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "Test_Sparse_ccs2crs.hpp"
4747
#include "Test_Sparse_crs2ccs.hpp"
4848
#include "Test_Sparse_removeCrsMatrixZeros.hpp"
49+
#include "Test_Sparse_crsmatrix_traversal.hpp"
4950

5051
// TPL specific tests, these require
5152
// particular pairs of backend and TPL
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
//@HEADER
2+
// ************************************************************************
3+
//
4+
// Kokkos v. 4.0
5+
// Copyright (2022) National Technology & Engineering
6+
// Solutions of Sandia, LLC (NTESS).
7+
//
8+
// Under the terms of Contract DE-NA0003525 with NTESS,
9+
// the U.S. Government retains certain rights in this software.
10+
//
11+
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12+
// See https://kokkos.org/LICENSE for license information.
13+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14+
//
15+
//@HEADER
16+
17+
/// \file Test_Sparse_SortCrs.hpp
18+
/// \brief Tests for sort_crs_matrix and sort_crs_graph in
19+
/// KokkosSparse_SortCrs.hpp
20+
21+
#ifndef TEST_SPARSE_CRSMATRIX_TRAVERSAL_HPP
22+
#define TEST_SPARSE_CRSMATRIX_TRAVERSAL_HPP
23+
24+
#include <Kokkos_Core.hpp>
25+
26+
#include "KokkosKernels_Test_Structured_Matrix.hpp"
27+
#include "KokkosSparse_CrsMatrix_traversal.hpp"
28+
29+
namespace TestCrsMatrixTraversal {
30+
31+
template <class CrsMatrix>
32+
struct diag_extraction {
33+
using diag_view = typename CrsMatrix::values_type::non_const_type;
34+
using size_type = typename CrsMatrix::non_const_size_type;
35+
using ordinal_type = typename CrsMatrix::non_const_ordinal_type;
36+
using value_type = typename CrsMatrix::non_const_value_type;
37+
38+
diag_view diag;
39+
40+
diag_extraction(CrsMatrix A) {
41+
diag = diag_view("diag values", A.numRows());
42+
};
43+
44+
KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type rowIdx,
45+
const size_type /*entryIdx*/,
46+
const ordinal_type colIdx,
47+
const value_type value) const {
48+
if (rowIdx == colIdx) {
49+
diag(rowIdx) = value;
50+
}
51+
}
52+
};
53+
54+
} // namespace TestCrsMatrixTraversal
55+
56+
void testCrsMatrixTraversal(int testCase) {
57+
using namespace TestCrsMatrixTraversal;
58+
using Device =
59+
Kokkos::Device<TestExecSpace, typename TestExecSpace::memory_space>;
60+
using Matrix = KokkosSparse::CrsMatrix<default_scalar, default_lno_t, Device,
61+
void, default_size_type>;
62+
using Vector = Kokkos::View<default_scalar*, TestExecSpace::memory_space>;
63+
64+
constexpr int nx = 4, ny = 4;
65+
constexpr bool leftBC = true, rightBC = false, topBC = false, botBC = false;
66+
67+
Kokkos::View<int * [3], Kokkos::HostSpace> mat_structure("Matrix Structure",
68+
2);
69+
mat_structure(0, 0) = nx;
70+
mat_structure(0, 1) = (leftBC ? 1 : 0);
71+
mat_structure(0, 2) = (rightBC ? 1 : 0);
72+
73+
mat_structure(1, 0) = ny;
74+
mat_structure(1, 1) = (topBC ? 1 : 0);
75+
mat_structure(1, 2) = (botBC ? 1 : 0);
76+
77+
Matrix A = Test::generate_structured_matrix2D<Matrix>("FD", mat_structure);
78+
79+
Vector diag_ref("diag ref", A.numRows());
80+
auto diag_ref_h = Kokkos::create_mirror_view(diag_ref);
81+
diag_ref_h(0) = 1;
82+
diag_ref_h(1) = 3;
83+
diag_ref_h(2) = 3;
84+
diag_ref_h(3) = 2;
85+
diag_ref_h(4) = 1;
86+
diag_ref_h(5) = 4;
87+
diag_ref_h(6) = 4;
88+
diag_ref_h(7) = 3;
89+
diag_ref_h(8) = 1;
90+
diag_ref_h(9) = 4;
91+
diag_ref_h(10) = 4;
92+
diag_ref_h(11) = 3;
93+
diag_ref_h(12) = 1;
94+
diag_ref_h(13) = 3;
95+
diag_ref_h(14) = 3;
96+
diag_ref_h(15) = 2;
97+
98+
// Run the diagonal extraction functor
99+
// using traversal function.
100+
diag_extraction<Matrix> func(A);
101+
KokkosSparse::Experimental::crsmatrix_traversal(A, func);
102+
Kokkos::fence();
103+
104+
// Extract the diagonal view from functor
105+
auto diag_h = Kokkos::create_mirror_view(func.diag);
106+
Kokkos::deep_copy(diag_h, func.diag);
107+
108+
// Check for correctness
109+
bool matches = true;
110+
for (int rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) {
111+
if (diag_ref_h(rowIdx) != diag_h(rowIdx)) matches = false;
112+
}
113+
114+
EXPECT_TRUE(matches)
115+
<< "Test case " << testCase
116+
<< ": matrix with zeros filtered out does not match reference.";
117+
}
118+
119+
TEST_F(TestCategory, sparse_crsmatrix_traversal) {
120+
for (int testCase = 0; testCase < 1; testCase++)
121+
testCrsMatrixTraversal(testCase);
122+
}
123+
124+
#endif // TEST_SPARSE_CRSMATRIX_TRAVERSAL_HPP

0 commit comments

Comments
 (0)