13_matmul_shared.cu
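
// Assumed build line (not stated in the source): nvcc 13_matmul_shared.cu -lcublas -o 13_matmul_shared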
#include <iostream>
#include <typeinfo>
#include <random>
#include <stdint.h>
#include <stdlib.h>    // drand48
#include <stdio.h>     // printf
#include <math.h>      // fabs
#include <cublas_v2.h>
#include <chrono>
using namespace std;
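
// Shared-memory tiled SGEMM on column-major matrices (matching the cuBLAS call in main):
// each block computes a 64x64 tile of C with 64 threads, and each thread accumulates one
// row of that tile while 8x64 slabs of A and B are staged in shared memory. The global
// loads assume the problem sizes are multiples of the tile sizes (64 for m and n, 8 for k),
// as they are in main.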
__global__ void kernel(int dim_m, int dim_n, int dim_k,
                       float *d_a, float *d_b, float *d_c) {
  int offset_a_m = 64 * blockIdx.x;   // first row of the C tile owned by this block
  int offset_b_n = 64 * blockIdx.y;   // first column of the C tile owned by this block
  int i = threadIdx.x;
  int m = threadIdx.x;                // each thread owns row m of the 64x64 C tile
  __shared__ float block_a[8][64];    // 8x64 slab of A: block_a[j][i] = A(offset_a_m+i, k+j)
  __shared__ float block_b[8][64];    // 8x64 slab of B: block_b[j][i] = B(k+j, offset_b_n+i)
  float block_c[64];                  // per-thread accumulator: one row of the C tile
  for (int n = 0; n < 64; ++n)
    block_c[n] = 0;
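  // Sweep the k dimension in steps of 8; each pass stages an 8x64 slab of A and of B in shared memory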
  for (int k = 0; k < dim_k; k += 8) {
    int offset_a_k = k, offset_b_k = k;
    __syncthreads();    // make sure the previous slabs are no longer being read
    for (int j = 0; j < 8; ++j) {
      block_a[j][i] = d_a[(offset_a_k + j) * dim_m + offset_a_m + i];   // A is column-major, lda = dim_m
      block_b[j][i] = d_b[(offset_b_n + i) * dim_k + offset_b_k + j];   // B is column-major, ldb = dim_k
    }
    __syncthreads();    // wait until both slabs are fully loaded
#pragma unroll
    for (int j = 0; j < 8; ++j) {
      for (int n = 0; n < 64; ++n) {
        block_c[n] += block_a[j][m] * block_b[j][n];
      }
    }
  }
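  // Write this thread's row of the 64x64 C tile back to global memory (column-major, ldc = dim_m)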
  for (int n = 0; n < 64; ++n) {
    int c_n = offset_b_n + n;
    int c_m = offset_a_m + m;
    if (c_n < dim_n && c_m < dim_m) {
      d_c[c_n * dim_m + c_m] = block_c[n];
    }
  }
}
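
// Benchmark: cuBLAS SGEMM vs. the shared-memory kernel above on an
// m x n x k = 10240 x 8192 x 4096 problem with column-major storage.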
int main(int argc, const char **argv) {
  int m = 10240;
  int k = 4096;
  int n = 8192;
  float alpha = 1.0;
  float beta = 0.0;
  int Nt = 10;    // number of timed iterations (two extra warm-up runs are added)
  float *A, *B, *C, *C2;
  cudaMallocManaged(&A, m * k * sizeof(float));
  cudaMallocManaged(&B, k * n * sizeof(float));
  cudaMallocManaged(&C, m * n * sizeof(float));
  cudaMallocManaged(&C2, m * n * sizeof(float));
  for (int i=0; i<m; i++)
    for (int j=0; j<k; j++)
      A[k*i+j] = drand48();
  for (int i=0; i<k; i++)
    for (int j=0; j<n; j++)
      B[n*i+j] = drand48();
  for (int i=0; i<n; i++)
    for (int j=0; j<m; j++)
      C[m*i+j] = C2[m*i+j] = 0;
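  // Reference run: cuBLAS SGEMM (column-major, no transposes), timed over Nt iterations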
  cublasHandle_t cublas_handle;
  cublasCreate(&cublas_handle);
  auto tic = chrono::steady_clock::now();
  for (int i = 0; i < Nt+2; i++) {
    if (i == 2) tic = chrono::steady_clock::now();   // skip the first two runs as warm-up
    cublasSgemm(cublas_handle,
                CUBLAS_OP_N,
                CUBLAS_OP_N,
                m,
                n,
                k,
                &alpha,
                A,
                m,
                B,
                k,
                &beta,
                C,
                m);
    cudaDeviceSynchronize();
  }
  auto toc = chrono::steady_clock::now();
  int64_t num_flops = (2 * int64_t(m) * int64_t(n) * int64_t(k)) + (2 * int64_t(m) * int64_t(n));
  double tcublas = chrono::duration<double>(toc - tic).count() / Nt;
  double cublas_flops = double(num_flops) / tcublas / 1.0e9;   // Gflop/s
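  // Custom run: the shared-memory kernel above (its rate is printed under the "CUTLASS" label below)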
  int tile = 64;
  dim3 block = dim3(tile);                             // 64 threads per block
  dim3 grid = dim3((m+tile-1)/tile, (n+tile-1)/tile);  // one 64x64 tile of C per block
  for (int i = 0; i < Nt+2; i++) {
    if (i == 2) tic = chrono::steady_clock::now();
    kernel<<< grid, block >>>(m,
                              n,
                              k,
                              A,
                              B,
                              C2);
    cudaDeviceSynchronize();
  }
  toc = chrono::steady_clock::now();
  double tcutlass = chrono::duration<double>(toc - tic).count() / Nt;
  double cutlass_flops = double(num_flops) / tcutlass / 1.0e9;
  printf("CUBLAS: %.2f Gflops, CUTLASS: %.2f Gflops\n", cublas_flops, cutlass_flops);
  double err = 0;
  for (int i=0; i<n; i++) {
    for (int j=0; j<m; j++) {
      err += fabs(C[m*i+j] - C2[m*i+j]);
    }
  }
  printf("error: %lf\n", err/n/m);
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  cudaFree(C2);
  cublasDestroy(cublas_handle);
}