ccnuma-demo.cu
#include <iostream>
#include <cstdlib>
#include <cassert>
#include <numa.h>

// Add a scalar to each element of the vector (grid-stride loop)
__global__ void vadd(int *const v, int const a, size_t const len) {
  const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int gsize = gridDim.x * blockDim.x;
  for (size_t i = gid; i < len; i += gsize) {
    v[i] += a;
  }
}

int main() {
  // Vector length
  constexpr size_t LEN = 100'000;
  // NUMA node to allocate on
  constexpr int NODE = 0;
  // GPU kernel launch parameters
  constexpr unsigned int grid = 160;
  constexpr unsigned int block = 1024;

  // Allocate the vector on the requested NUMA node
  int *data = reinterpret_cast<int *>(numa_alloc_onnode(LEN * sizeof(int), NODE));
  if (data == nullptr) {
    std::cerr << "Failed to allocate memory" << std::endl;
    std::exit(EXIT_FAILURE);
  }

  // Initialize the vector with some data
  for (size_t i = 0; i < LEN; ++i) {
    data[i] = i;
  }

  // Launch the GPU kernel directly on the NUMA-allocated host memory
  // (requires a system where the GPU can access host memory coherently)
  vadd<<<grid, block>>>(data, 1, LEN);

  // Wait for the GPU kernel to finish execution
  cudaDeviceSynchronize();

  // Verify the result: data[i] == i + 1, so the sum is 1 + 2 + ... + LEN
  unsigned long long sum = 0;
  for (size_t i = 0; i < LEN; ++i) {
    sum += data[i];
  }
  assert(sum == (LEN * (LEN + 1)) / 2);

  // Free the vector
  numa_free(data, LEN * sizeof(int));
  std::exit(EXIT_SUCCESS);
}
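
Note: the demo launches the kernel and synchronizes without checking the CUDA runtime's return codes. Below is a minimal sketch of how such checking could be added, using only standard CUDA runtime calls; the CUDA_CHECK macro name is an illustration introduced here and is not part of the original file.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper macro (not part of the original demo):
// abort with a message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      std::fprintf(stderr, "CUDA error: %s at %s:%d\n",                    \
                   cudaGetErrorString(err_), __FILE__, __LINE__);          \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)

// Possible usage around the launch in main():
//   vadd<<<grid, block>>>(data, 1, LEN);
//   CUDA_CHECK(cudaGetLastError());       // catch launch/configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catch errors during kernel execution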