-
Notifications
You must be signed in to change notification settings - Fork 344
/
numba_square.py
38 lines (29 loc) · 1.1 KB
/
numba_square.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from numba import cuda
# CUDA kernel
@cuda.jit
def square_matrix_kernel(matrix, result):
    """Store the element-wise square of ``matrix`` into ``result``.

    Each thread handles at most one element, chosen by the thread's
    absolute 2D position in the launch grid. Threads whose position falls
    outside ``matrix`` do nothing, so the grid may overshoot the matrix.

    Args:
        matrix: 2D input array, indexed as ``matrix[row, col]``.
        result: 2D output array written at the same ``[row, col]``
            positions. Only ``matrix.shape`` is bounds-checked here, so
            ``result`` is assumed to be at least as large as ``matrix``.
    """
    # Absolute (first-axis, second-axis) position of this thread in the grid.
    row, col = cuda.grid(2)

    # Guard clause: surplus threads past either matrix edge exit early.
    if row >= matrix.shape[0] or col >= matrix.shape[1]:
        return

    value = matrix[row, col]
    result[row, col] = value ** 2
# Example usage
import numpy as np
# Host-side input: a small 2x3 single-precision matrix.
matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)

# Stage the input on the GPU and reserve an output buffer of the same shape.
d_matrix = cuda.to_device(matrix)
d_result = cuda.device_array(matrix.shape, dtype=np.float32)

# Launch geometry: 16x16 threads per block, and enough blocks along each
# axis to cover the whole matrix. -(-a // b) is integer ceiling division.
threads_per_block = (16, 16)
blocks_per_grid_x = -(-matrix.shape[0] // threads_per_block[0])
blocks_per_grid_y = -(-matrix.shape[1] // threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

# Run the kernel over the device buffers.
square_matrix_kernel[blocks_per_grid, threads_per_block](d_matrix, d_result)

# Bring the squared values back into host memory.
result = d_result.copy_to_host()

# Show the input followed by its element-wise square.
print(matrix)
print(result)