-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconv.cu
490 lines (466 loc) · 21.2 KB
/
conv.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
/*
convolution supported by CUDA implements
matrix is orgnized in 1D array
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// #define DEBUG
#define element_type float
#define OFFSET(row, col, ld) ((row) * (ld) + (col))
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4 *>(&(pointer))[0])
/*
@brief: 串行卷积实现 CPU代码 NCHW
@param in inC inH inW: 输入矩阵(数组) channel height width
@param out outC outH outW: 输出矩阵 channel height width
@param kernel kernelH kernelW: 卷积核 height width
*/
void serial_convolution(element_type *in, element_type *out, element_type *kernel, int batch_size,
int inC, int inH, int inW,
int outC, int outH, int outW,
int kernelH, int kernelW)
{
float val;
int out_pos, in_pos, kernel_pos;
for (int oc = 0; oc < outC; oc++) // 每个输出通道
{
// 对一个位置的操作 用当前输入channel卷积去对相应的输出channel
// 保证每个outChannel都是多inChannel累积的结果
for (int i = 0; i < outH; i++)
{
for (int j = 0; j < outW; j++)
{
val = 0; // 避免累积和需要多次读取写入
out_pos = oc * outH * outW + OFFSET(i, j, outW);
for (int ic = 0; ic < inC; ic++) // 对每个输入通道
{
for (int ii = 0; ii < kernelH; ii++)
{
for (int jj = 0; jj < kernelW; jj++)
{
in_pos = ic * inH * inW + OFFSET(i + ii, j + jj, inW);
kernel_pos = oc * kernelH * kernelW + OFFSET(ii, jj, kernelW);
val += in[in_pos] * kernel[kernel_pos];
}
}
}
out[out_pos] = val; // 与cudnn差个负号
}
}
}
}
/*
@brief: 串行卷积实现 CPU代码
@param in inC inH inW: 输入矩阵(数组) channel height width
@param out outC outH outW: 输出矩阵 channel height width
@param kernel kernelH kernelW: 卷积核 height width
@attention: template可以保证传入静态变量直接用于分配静态存储空间
*/
template <
const int BLOCK_HEIGHT,
const int BLOCK_WIDTH,
const int KERNEL_HEIGHT,
const int KERNEL_WIDTH,
const int MALLOC_TEMP_SIZE,
const int MALLOC_KERNEL_HEIGHT,
const int MALLOC_KERNEL_WIDTH,
const int MALLOC_BLOCK_HEIGHT,
const int MALLOC_BLOCL_WIDTH>
__global__ void v1_convolution(element_type *in, element_type *out, element_type *kernel, int batch_size,
int inC, int inH, int inW,
int outC, int outH, int outW,
int kernelH, int kernelW)
{
// block id 与 thread id的读取与计算 分块是对target矩阵去分的
// 目前按一个线程负责一个in的计算
int block_row = blockIdx.y;
int block_col = blockIdx.x;
int thread_row = threadIdx.y, thread_col = threadIdx.x;
int threadH = BLOCK_HEIGHT, threadW = BLOCK_WIDTH; // 线程网格范围
int thread_num_per_block = threadH * threadW, tid = thread_row * threadW + thread_col;
// 分块边界 boundary是限制正常范围 edge是需要特殊处理的范围
int row_boundary = outH / BLOCK_HEIGHT - 1,
col_boundary = outW / BLOCK_WIDTH - 1;
int row_edge = outH % BLOCK_HEIGHT, col_edge = outW % BLOCK_WIDTH;
// 线程计算值暂存区大小 动态分配不是很方便 需要在外部分配并带进来
// 一般取单个计算元素和oc之积的2倍即可 因为block比较小
const int temp_size = MALLOC_TEMP_SIZE;
// if (tid==0)
// printf("(%d %d)\n", block_row, block_col);
/// 转移存储 GMEM --> SMEM
// __shared__ float s_in[BLOCK_HEIGHT + KERNEL_HEIGHT - 1][BLOCK_WIDTH + KERNEL_WIDTH - 1];
__shared__ float s_kernel[MALLOC_KERNEL_HEIGHT][MALLOC_KERNEL_WIDTH]; // 开奇数内存会出错
__shared__ float s_in[MALLOC_BLOCK_HEIGHT][MALLOC_BLOCL_WIDTH]; // 要满足修正的尺寸
float load_reg[4];
// 当前block的起始位置
// int begin_pos = (block_row + thread_row) * BLOCK_HEIGHT + (block_col) * BLOCK_WIDTH + thread_col;
int begin_pos = block_row * BLOCK_HEIGHT * inW + block_col * BLOCK_WIDTH;
int single_trans_ele_num = 4; // 线程一次转移的数据数
int cur_in_block_height = BLOCK_HEIGHT + KERNEL_HEIGHT - 1, // 读入in的block height
cur_in_block_width = BLOCK_WIDTH + KERNEL_WIDTH - 1, // 读入in的block width
in_tile_thread_per_row, // 以tile为单位转移数据,一行需要的thread数
in_tile_row_start, // tile的行起始位置
in_tile_col, // tile的列
in_tile_row_stride; // tile行跨度
// 修正边缘block尺寸
if (block_row == row_boundary)
{
cur_in_block_height = BLOCK_HEIGHT + row_edge + kernelH - 1;
}
if (block_col == col_boundary)
{
cur_in_block_width = BLOCK_WIDTH + col_edge + kernelW - 1;
}
in_tile_thread_per_row = cur_in_block_width / single_trans_ele_num;
in_tile_row_start = tid / in_tile_thread_per_row;
in_tile_col = tid % in_tile_thread_per_row * single_trans_ele_num;
in_tile_row_stride = thread_num_per_block / in_tile_thread_per_row;
// 下方都是读取第一个channel的数据
// 按行读取 每行令线程以tile为单位读取 tile大小目前为single_trans_ele_num,余量特殊处理
for (int i = 0; i < cur_in_block_height && in_tile_row_start < cur_in_block_height;
i += in_tile_row_stride)
{
// if (block_row == 0 && block_col == 0)
// {
// printf("%d (%d %d) %d %d\n", tid, in_tile_row_start + i, in_tile_col, cur_in_block_height, cur_in_block_width);
// }
FETCH_FLOAT4(load_reg[0]) =
FETCH_FLOAT4(in[begin_pos + OFFSET(in_tile_row_start + i, in_tile_col, inW)]);
s_in[in_tile_row_start + i][in_tile_col] = load_reg[0];
s_in[in_tile_row_start + i][in_tile_col + 1] = load_reg[1];
s_in[in_tile_row_start + i][in_tile_col + 2] = load_reg[2];
s_in[in_tile_row_start + i][in_tile_col + 3] = load_reg[3];
if (in_tile_col + 2 * single_trans_ele_num > cur_in_block_width &&
cur_in_block_width > in_tile_col + 1 * single_trans_ele_num) // 余量不足一次转移数
{
for (int j = in_tile_col + 1 * single_trans_ele_num; j < cur_in_block_width; j++)
{
s_in[in_tile_row_start + i][j] = in[begin_pos + OFFSET(in_tile_row_start + i, j, inW)];
}
}
}
// 读取第一个kernel的数据
if (thread_row >= 0 && thread_row < KERNEL_HEIGHT && thread_col == 0)
{
for (int j = 0; j < KERNEL_WIDTH; j++)
{
s_kernel[thread_row][j] = kernel[OFFSET(thread_row, j, KERNEL_WIDTH)];
}
}
__syncthreads();
// 验证数据转移正确性
// if (block_row == 0 && block_col == 0 && thread_row == 0 && thread_col == 0) // 16 8
// {
// for (int i = 0; i < cur_in_block_height; i++)
// {
// for (int j = 0; j < cur_in_block_width; j++)
// {
// printf("(%d %d) %.2f|%.2f\n", i, j, s_in[i][j], in[begin_pos + OFFSET(i, j, inW)]);
// }
// }
// }
// if (block_row == 2 && block_col == 2 && tid == 0)
// {
// for (int i = 0; i < KERNEL_HEIGHT; i++)
// {
// for (int j = 0; j < KERNEL_WIDTH; j++)
// {
// printf("(%d %d) %.2f|%.2f\n", i, j, s_kernel[i][j], kernel[OFFSET(thread_row, j, KERNEL_WIDTH)]);
// }
// }
// }
// 逐个channel计算 一个线程负责block中的一个元素计算
// 修正out block的大小
int cur_out_block_height = BLOCK_HEIGHT, // 输出block height
cur_out_block_width = BLOCK_WIDTH, // 输出block width
single_calculate_num = 1, // 线程一次负责计算的元素数目
out_tile_thread_per_row, // block按tile划分需要的线程数目
out_tile_row_start, // tile的行起始位置
out_tile_col, // tile的列起始位置
out_tile_row_stride; // tile行跨度
if (block_row == row_boundary)
{
cur_out_block_height = BLOCK_HEIGHT + row_edge;
}
if (block_col == col_boundary)
{
cur_out_block_width = BLOCK_WIDTH + col_edge;
}
out_tile_thread_per_row = cur_out_block_width / single_calculate_num;
out_tile_row_start = tid / out_tile_thread_per_row;
out_tile_col = tid % out_tile_thread_per_row * single_calculate_num;
out_tile_row_stride = thread_num_per_block / out_tile_thread_per_row;
float val[temp_size]; // 存储累积和 避免多次读取GMEM
for (int i = 0; i < temp_size; i++)
val[i] = 0;
int out_pos, temp_pos;
for (int oc = 0; oc < outC; oc++)
{
for (int ic = 0; ic < inC; ic++)
{
// i,j 是相当于当前block起始位置而言
// 每个线程负责一个tile,元素数>线程数会进行轮替,会有少部分的重叠区域,代价不大(只要width不大)
// 用ic的每个block去对oc的kernel进行计算
for (int i = 0; i < cur_out_block_height && (out_tile_row_start + i) < cur_out_block_height;
i += out_tile_row_stride)
{
for (int j = 0; j < single_calculate_num; j++)
{
// 计算线程负责的元素 同一个oc的缓存顺序排列
// 不同oc偏移一个cur_out_block_height / out_tile_row_stride + 1的位置
temp_pos = i / out_tile_row_stride + j +
oc * (cur_out_block_height / out_tile_row_stride + 1);
for (int ii = 0; ii < KERNEL_HEIGHT; ii++)
{
for (int jj = 0; jj < KERNEL_WIDTH; jj++) // 更换的是SMEM中的内容,相对位置不变
{
val[temp_pos] += s_in[out_tile_row_start + i + ii][out_tile_col + j + jj] * s_kernel[ii][jj];
}
}
}
}
// 读取下一个in channel和对应kernel的数据
if (ic + 1 < inC)
{
for (int i = 0; i < cur_in_block_height && in_tile_row_start < cur_in_block_height;
i += in_tile_row_stride)
{
FETCH_FLOAT4(load_reg[0]) =
FETCH_FLOAT4(in[begin_pos + (ic + 1) * inH * inW + OFFSET(in_tile_row_start + i, in_tile_col, inW)]);
s_in[in_tile_row_start + i][in_tile_col] = load_reg[0];
s_in[in_tile_row_start + i][in_tile_col + 1] = load_reg[1];
s_in[in_tile_row_start + i][in_tile_col + 2] = load_reg[2];
s_in[in_tile_row_start + i][in_tile_col + 3] = load_reg[3];
if (in_tile_col + 2 * single_trans_ele_num > cur_in_block_width &&
cur_in_block_width > in_tile_col + 1 * single_trans_ele_num) // 余量不足一次转移数
{
for (int j = in_tile_col + 1 * single_trans_ele_num; j < cur_in_block_width; j++)
{
s_in[in_tile_row_start + i][j] = in[begin_pos + (ic + 1) * inH * inW + OFFSET(in_tile_row_start + i, j, inW)];
}
}
}
if (thread_row >= 0 && thread_row < KERNEL_HEIGHT && thread_col == 0)
{
for (int j = 0; j < KERNEL_WIDTH; j++)
{
s_kernel[thread_row][j] = kernel[(oc * inC + ic + 1) * kernelH * kernelW + OFFSET(thread_row, j, KERNEL_WIDTH)];
}
}
}
__syncthreads();
// 验证数据转移
// if (ic + 1 < inC)
// if (block_row == 2 && block_col == 2 && thread_row == 0 && thread_col == 0) // 16 8
// {
// for (int i = 0; i < cur_in_block_height; i++)
// {
// for (int j = 0; j < cur_in_block_width; j++)
// {
// printf("(%d %d) %.2f|%.2f\n", i, j, s_in[i][j], in[begin_pos + (ic + 1) * inH * inW + OFFSET(i, j, inW)]);
// }
// }
// }
}
// 读取下一个kernel channel数据
if (oc + 1 < outC)
{
if (thread_row >= 0 && thread_row < KERNEL_HEIGHT && thread_col == 0)
{
for (int j = 0; j < KERNEL_WIDTH; j++)
{
s_kernel[thread_row][j] = kernel[(oc + 1) * inC * kernelH * kernelW + OFFSET(thread_row, j, KERNEL_WIDTH)];
}
}
}
__syncthreads();
// 写回 利用线程id计算写回位置
int i = 0, j = 0;
while (i < cur_out_block_height && (out_tile_row_start + i) < cur_out_block_height)
{
while (j < single_calculate_num)
{
out_pos = oc * outH * outW +
block_row * BLOCK_HEIGHT * outW + block_col * BLOCK_WIDTH +
OFFSET(out_tile_row_start + i, out_tile_col + j, outW);
temp_pos = i / out_tile_row_stride + j +
oc * (cur_out_block_height / out_tile_row_stride + 1);
// if (tid == 0 && block_row == 0 && block_col == 0)
// {
// printf("%d %d-(%d %d) %d %d\n", i, j, out_tile_row_start + i, out_tile_col + j,
// temp_pos, out_pos);
// }
out[out_pos] = val[temp_pos];
j++;
}
i += out_tile_row_stride;
j = 0;
}
// 读取下一个in channel数据
for (int i = 0; i < cur_in_block_height && in_tile_row_start < cur_in_block_height;
i += in_tile_row_stride)
{
// if (block_row == 0 && block_col == 0)
// {
// printf("%d (%d %d) %d %d\n", tid, in_tile_row_start + i, in_tile_col, cur_in_block_height, cur_in_block_width);
// }
FETCH_FLOAT4(load_reg[0]) =
FETCH_FLOAT4(in[begin_pos + OFFSET(in_tile_row_start + i, in_tile_col, inW)]);
s_in[in_tile_row_start + i][in_tile_col] = load_reg[0];
s_in[in_tile_row_start + i][in_tile_col + 1] = load_reg[1];
s_in[in_tile_row_start + i][in_tile_col + 2] = load_reg[2];
s_in[in_tile_row_start + i][in_tile_col + 3] = load_reg[3];
if (in_tile_col + 2 * single_trans_ele_num > cur_in_block_width &&
cur_in_block_width > in_tile_col + 1 * single_trans_ele_num) // 余量不足一次转移数
{
for (int j = in_tile_col + 1 * single_trans_ele_num; j < cur_in_block_width; j++)
{
s_in[in_tile_row_start + i][j] = in[begin_pos + OFFSET(in_tile_row_start + i, j, inW)];
}
}
}
}
}
template <
const int BLOCK_SIZE_ROW,
const int BLOCK_SIZE_COL,
const int THREAD_SIZE_ROW,
const int THREAD_SIZEZ_COL,
const int FILTER_SIZE>
__global__ void v2_convolution(element_type *org,
element_type *target,
element_type *filter,
int row, int col)
{
// block id 与 thread id的读取与计算 分块是对target矩阵去分的
int block_row = blockIdx.y;
int block_col = blockIdx.x;
int thread_row = threadIdx.y, thread_col = threadIdx.x, tid = thread_row * THREAD_SIZE_ROW + thread_col;
// 目标矩阵尺寸
int t_row = row - FILTER_SIZE + 1, t_col = col - FILTER_SIZE + 1;
// 分块边界
int row_boundary = t_row / BLOCK_SIZE_ROW - 1, col_boundary = t_col / BLOCK_SIZE_COL - 1;
int row_edge = t_row % BLOCK_SIZE_ROW, col_edge = t_col % BLOCK_SIZE_COL;
if (block_row == 0 && block_col == 0 && thread_row == 0 && thread_col == 0)
printf("filter0:%.2f\n", filter[0]);
// 转移存储 GMEM --> SMEM
// __shared__ float s_filter[filter_size][filter_size];
// __shared__ float s_org[BLOCK_SIZE_ROW + filter_size - 1][BLOCK_SIZE_COL + filter_size - 1];
__shared__ float s_filter[FILTER_SIZE][FILTER_SIZE];
__shared__ float s_org[BLOCK_SIZE_ROW + FILTER_SIZE - 1][BLOCK_SIZE_COL + FILTER_SIZE - 1];
int begin_pos = block_row * BLOCK_SIZE_ROW * col + block_col * BLOCK_SIZE_COL * row; // 当前block的起始位置
// 右下角元素负责filter_size^2的元素转移
if (thread_row == BLOCK_SIZE_ROW - 1 && thread_col == BLOCK_SIZE_COL - 1)
{
for (int i = 0; i < FILTER_SIZE; i++)
{
for (int j = 0; j < FILTER_SIZE; j++)
{
s_org[thread_row + i][thread_col + j] =
org[begin_pos + OFFSET(thread_row + i, thread_col + j, col)];
}
}
}
else if (thread_row == BLOCK_SIZE_ROW - 1) // 下边界向外延伸
{
for (int i = 0; i < FILTER_SIZE; i++)
{
s_org[thread_row + i][thread_col] =
org[begin_pos + OFFSET(thread_row + i, thread_col, col)];
}
}
else if (thread_col == BLOCK_SIZE_COL - 1) // 右边界向外延伸
{
for (int i = 0; i < FILTER_SIZE; i++)
{
s_org[thread_row][thread_col + i] =
org[begin_pos + OFFSET(thread_row, thread_col + i, col)];
}
}
else // 边界内只需负责转移自身数据
{
s_org[thread_row][thread_col] =
org[begin_pos + OFFSET(thread_row, thread_col, col)];
// 0号线程同时转移filter
if (thread_row == 0 && thread_col == 0)
{
for (int i = 0; i < FILTER_SIZE; i++)
{
for (int j = 0; j < FILTER_SIZE; j++)
{
s_filter[i][j] = filter[OFFSET(i, j, FILTER_SIZE)];
}
}
}
}
__syncthreads();
// 计算部分
if (block_row == row_boundary && block_col == col_boundary) // 最右下角的 负责处理edge部分
{
if (thread_row < row_edge && thread_col < col_edge)
{
int value = 0;
// single_conv(FILTER_SIZE, s_org, target, s_filter, begin_pos, thread_row, thread_col, t_col);
for (int i = 0; i < FILTER_SIZE; i++)
{
for (int j = 0; j < FILTER_SIZE; j++)
{
value += s_org[thread_row + i][thread_col + j] * s_filter[thread_row + i][thread_col + j];
}
}
target[begin_pos + OFFSET(thread_row, thread_col, t_col)] = value;
}
}
else if (block_row == row_boundary) // 下边一条的edge
{
if (thread_row < row_edge)
{
int value = 0;
// single_conv(FILTER_SIZE, s_org, target, s_filter, begin_pos, thread_row, thread_col, t_col);
for (int i = 0; i < FILTER_SIZE; i++)
{
for (int j = 0; j < FILTER_SIZE; j++)
{
value += s_org[thread_row + i][thread_col + j] * s_filter[thread_row + i][thread_col + j];
}
}
target[begin_pos + OFFSET(thread_row, thread_col, t_col)] = value;
}
}
else if (block_col == col_boundary) // 右边一条的edge
{
if (thread_col < col_edge)
{
int value = 0;
// single_conv(FILTER_SIZE, s_org, target, s_filter, begin_pos, thread_row, thread_col, t_col);
for (int i = 0; i < FILTER_SIZE; i++)
{
for (int j = 0; j < FILTER_SIZE; j++)
{
value += s_org[thread_row + i][thread_col + j] * s_filter[thread_row + i][thread_col + j];
}
}
target[begin_pos + OFFSET(thread_row, thread_col, t_col)] = value;
}
}
else // 正常block
{
float value = 0;
// single_conv(FILTER_SIZE, s_org, target, s_filter, begin_pos, thread_row, thread_col, t_col);
for (int i = 0; i < FILTER_SIZE; i++)
{
for (int j = 0; j < FILTER_SIZE; j++)
{
// if (block_row == 13 && block_col == 11 && thread_row == 0 && thread_col == 0)
// printf("%d %d %.2f * %.2f\n", thread_row + i, thread_col + j, s_org[thread_row + i][thread_col + j], s_filter[i][j]);
value += s_org[thread_row + i][thread_col + j] * s_filter[i][j];
}
}
target[begin_pos + OFFSET(thread_row, thread_col, t_col)] = value;
// if (block_row == 0 && block_col == 0 && thread_row == 0 && thread_col == 0)
// printf("%d-%d.%d-%d : %.2f\n", block_row, block_col, thread_row, thread_col, value);
}
}