Skip to content

Commit 0b85840

Browse files
authored
Merge pull request #482 from NVlabs/fix-api-bottleneck
fix: CUDA API bottleneck on newer CUDA versions on Linux
2 parents c91138b + c1423e1 commit 0b85840

File tree

1 file changed

+18
-11
lines changed

1 file changed

+18
-11
lines changed

src/common_host.cu

+18 -11
Original file line number | Diff line number | Diff line change
@@ -225,15 +225,26 @@ bool cuda_supports_virtual_memory(int device) {
225225
return supports_vmm != 0;
226226
}
227227

228+
// Accessor for the process-wide table of device properties, keyed by device
// index. The table is heap-allocated on first use and intentionally never
// freed, so the returned reference stays usable for the rest of the process.
std::unordered_map<int, cudaDeviceProp>& cuda_device_properties() {
	using device_prop_table_t = std::unordered_map<int, cudaDeviceProp>;

	static device_prop_table_t* const table = new device_prop_table_t{};
	return *table;
}
232+
233+
// Returns the properties of `device`, querying the CUDA runtime only the first
// time a given device index is requested and serving later requests from the
// table returned by cuda_device_properties().
//
// Throws (via CUDA_CHECK_THROW) if cudaGetDeviceProperties fails. A failed
// query is NOT cached: the properties are first fetched into a local and only
// inserted into the table once the call has succeeded. Inserting first (e.g.
// with operator[]) would leave a value-initialized entry behind on failure,
// which every later call for that device would then return silently.
//
// The returned reference points at the table entry; std::unordered_map keeps
// references to elements valid across later insertions.
//
// NOTE(review): no locking is visible here, so concurrent first-time calls
// would race on the table -- confirm callers are single-threaded or guard
// externally.
const cudaDeviceProp& cuda_get_device_properties(int device) {
	auto& cache = cuda_device_properties();

	auto it = cache.find(device);
	if (it == cache.end()) {
		cudaDeviceProp props;
		CUDA_CHECK_THROW(cudaGetDeviceProperties(&props, device));
		it = cache.emplace(device, props).first;
	}

	return it->second;
}
241+
228242
// Returns the `name` field of the given device's properties, as obtained from
// cuda_get_device_properties().
std::string cuda_device_name(int device) {
	const cudaDeviceProp& device_props = cuda_get_device_properties(device);
	return device_props.name;
}
233245

234246
// Returns the device's compute capability packed into a single integer as
// major * 10 + minor (e.g. major 8, minor 6 yields 86).
uint32_t cuda_compute_capability(int device) {
	const cudaDeviceProp& device_props = cuda_get_device_properties(device);
	const int cc_major = device_props.major;
	const int cc_minor = device_props.minor;
	return cc_major * 10 + cc_minor;
}
239250

@@ -255,15 +266,11 @@ uint32_t cuda_supported_compute_capability(int device) {
255266
}
256267

257268
// Returns the `sharedMemPerBlockOptin` field of the given device's properties,
// as obtained from cuda_get_device_properties().
size_t cuda_max_shmem(int device) {
	const cudaDeviceProp& device_props = cuda_get_device_properties(device);
	return device_props.sharedMemPerBlockOptin;
}
262271

263272
// Returns the `regsPerBlock` field of the given device's properties, as
// obtained from cuda_get_device_properties(), converted to uint32_t.
uint32_t cuda_max_registers(int device) {
	const cudaDeviceProp& device_props = cuda_get_device_properties(device);
	return static_cast<uint32_t>(device_props.regsPerBlock);
}
268275

269276
size_t cuda_memory_granularity(int device) {

0 commit comments

Comments
 (0)