-
Notifications
You must be signed in to change notification settings - Fork 1
/
cuda.t
122 lines (100 loc) · 3.33 KB
/
cuda.t
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
-- SPDX-FileCopyrightText: 2024 René Hiemstra <[email protected]>
-- SPDX-FileCopyrightText: 2024 Torsten Keßler <[email protected]>
--
-- SPDX-License-Identifier: MIT
local cuda = {}
local alloc = require("alloc")
local err = require("assert")
local C = terralib.includec("cuda_runtime.h")
local instructions = {
["threadIdx"] = "tid",
["blockDim"] = "ntid",
["blockIdx"] = "ctaid",
["gridDim"] = "nctaid",
}
for cuname, instr in pairs(instructions) do
cuda[cuname] = {}
for _, s in ipairs({"x", "y", "z"}) do
local name = "nvvm_read_ptx_sreg_" .. instr .. "_" .. s
cuda[cuname][s] = cudalib[name]
end
end
cuda.barrier0 = cudalib.nvvm_barrier0
-- https://reviews.llvm.org/D80464
cuda.__syncthreads = cuda.barrier0
cuda.DeviceSynchronize = C.cudaDeviceSynchronize
cuda.ThreadSynchronize = C.cudaThreadSynchronize
cuda.StreamSynchronize = terralib.externfunction("cuStreamSynchronize", {&opaque} -> int)
cuda.Malloc = C.cudaMalloc
cuda.MallocManaged = terra(data: &&opaque, size: uint64) return C.cudaMallocManaged(data, size, C.cudaMemAttachGlobal) end
cuda.Free = C.cudaFree
cuda.Success = C.cudaSuccess
cuda.Memcpy = C.cudaMemcpy
-- Wrapper for enum cudaMemcpyKind
local dev = {"Host", "Device"}
for _, src in pairs(dev) do
for _, trg in pairs(dev) do
local name = ("Memcpy%sTo%s"):format(src, trg)
cuda[name] = C["cuda" .. name]
end
end
local function castbuffer(arg)
arg = terralib.newlist(arg)
local typ = arg:map(function(a) return a:gettype() end)
local Buffer = tuple(unpack(typ))
return quote var buf = [Buffer] {[arg]} in [&int8](&buf) end
end
local vprintf = terralib.externfunction("cudart:vprintf", {&int8,&int8} -> int)
cuda.printf = macro(function(fmt, ...)
local buf = castbuffer({...})
return `vprintf(fmt, buf)
end)
cuda.DeviceAllocator = terralib.memoize(function(config)
config = config or {}
local is_managed = config["Managed"] == nil and true or config["Managed"]
assert(type(is_managed) == "boolean")
local cualloc = terralib.types.newstruct("cualloc")
local size_t = alloc.size_t
local block = alloc.block
local cumalloc = macro(function(ptr, size)
if is_managed then
return quote var res = cuda.MallocManaged(&ptr, size) in err.assert(res == cuda.Success) end
else
return quote var res = cuda.Malloc(&ptr, size) in err.assert(res == cuda.Success) end
end
end)
local Imp = {}
terra Imp.__allocate :: {size_t, size_t} -> {block}
terra Imp.__reallocate :: {&block, size_t, size_t} -> {}
terra Imp.__deallocate :: {&block} -> {}
local io = terralib.includec("stdio.h")
terra Imp.__allocate(sz: size_t, num: size_t)
var size = sz * num
var ptr: &opaque = nil
cumalloc(ptr, size)
return block {ptr, size}
end
terra Imp.__deallocate(blk: &block)
var ptr = blk.ptr
var res = cuda.Free(ptr)
err.assert(res == cuda.Success)
blk:__init()
end
terra Imp.__reallocate(blk: &block, sz: size_t, num: size_t)
var old_size = blk:size_in_bytes()
var new_size = sz * num
if blk:owns_resource() and old_size < new_size then
var new_ptr: &opaque = nil
cumalloc(new_ptr, new_size)
var res = cuda.Memcpy(new_ptr, blk.ptr, old_size, cuda.MemcpyDeviceToDevice)
err.assert(res == cuda.Success)
blk:__dtor()
blk.ptr = new_ptr
blk.nbytes = new_size
end
end
alloc.AllocatorBase(cualloc, Imp)
alloc.Allocator:isimplemented(cualloc)
return cualloc
end)
return cuda