From 88f01b2f82710e68f0506681ff9bdd41eb71b922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= Date: Tue, 7 Mar 2023 01:26:00 +0100 Subject: [PATCH 1/5] psnr CUDA implementation --- libvmaf/src/cuda/cuda_helper.cuh | 7 + libvmaf/src/feature/cuda/integer_psnr/psnr.cu | 138 ++++++++ libvmaf/src/feature/cuda/integer_psnr_cuda.c | 329 ++++++++++++++++++ libvmaf/src/feature/feature_extractor.c | 2 + libvmaf/src/libvmaf.c | 4 +- libvmaf/src/meson.build | 4 +- libvmaf/test/test_ring_buffer.c | 4 +- 7 files changed, 483 insertions(+), 5 deletions(-) create mode 100644 libvmaf/src/feature/cuda/integer_psnr/psnr.cu create mode 100644 libvmaf/src/feature/cuda/integer_psnr_cuda.c diff --git a/libvmaf/src/cuda/cuda_helper.cuh b/libvmaf/src/cuda/cuda_helper.cuh index 6fa01b9f6..3d905894c 100644 --- a/libvmaf/src/cuda/cuda_helper.cuh +++ b/libvmaf/src/cuda/cuda_helper.cuh @@ -59,6 +59,13 @@ namespace { return atomicAdd(reinterpret_cast(address), static_cast(val)); } + + typedef unsigned long long int uint64_cu; + __forceinline__ __device__ int64_t atomicAdd_uint64(uint64_t *address, + uint64_t val) { + return atomicAdd(reinterpret_cast(address), + static_cast(val)); + } } // namespace #endif diff --git a/libvmaf/src/feature/cuda/integer_psnr/psnr.cu b/libvmaf/src/feature/cuda/integer_psnr/psnr.cu new file mode 100644 index 000000000..50214269b --- /dev/null +++ b/libvmaf/src/feature/cuda/integer_psnr/psnr.cu @@ -0,0 +1,138 @@ +/** + * + * Copyright 2016-2023 Netflix, Inc. + * Copyright 2022 NVIDIA Corporation. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cuda/integer_motion_cuda.h" +#include "cuda_helper.cuh" + +#include "common.h" + +template +__device__ void sse_calculation(T *ref, T *dis, unsigned int w, unsigned int h, + unsigned int stride, uint64_t *sse) { + constexpr int val_per_thread = sizeof(LOAD_TYPE) / sizeof(T); + unsigned int idx_x = (threadIdx.x + blockDim.x * blockIdx.x) * val_per_thread; + unsigned int idx_y = threadIdx.y + blockDim.y * blockIdx.y; + + if (idx_y < h && idx_x < w) { + int idx = idx_y * (stride / sizeof(T)) + idx_x; + uint64_t thread_sse = 0u; + union { + T value_ref[val_per_thread]; + LOAD_TYPE load_value_dis; + }; + union { + T value_dis[val_per_thread]; + LOAD_TYPE load_value_ref; + }; + load_value_ref = *reinterpret_cast(&ref[idx]); + load_value_dis = *reinterpret_cast(&dis[idx]); + for (unsigned int i = 0; i < val_per_thread; ++i) { + if ((idx_x + i) < w) { + const int e = value_ref[i] - value_dis[i]; + thread_sse += e * e; + } + } + + // Warp-reduce abs_dist + thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 16); + thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 8); + thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 4); + thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 2); + thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 1); + // Let threads in lane zero add warp-reduced abs_dist atomically to global + // sad + const int lane = + (threadIdx.y * blockDim.x + threadIdx.x) % VMAF_CUDA_THREADS_PER_WARP; + if (lane == 0) + atomicAdd_uint64(sse, static_cast(thread_sse)); + } +} + +template +__device__ void psnr8_impl(const VmafPicture ref_pic, + const VmafPicture dist_pic, + const VmafCudaBuffer sse) { + unsigned int stride = ref_pic.stride[chn]; + // if second channel is smaller use smaller load + if (stride <= (ref_pic.stride[0] / 2)) + sse_calculation( + reinterpret_cast(ref_pic.data[chn]), + reinterpret_cast(dist_pic.data[chn]), ref_pic.w[chn], + ref_pic.h[chn], stride, reinterpret_cast(sse.data) + chn); + else + sse_calculation( + reinterpret_cast(ref_pic.data[chn]), + reinterpret_cast(dist_pic.data[chn]), ref_pic.w[chn], + ref_pic.h[chn], stride, reinterpret_cast(sse.data) + chn); +} + +template +__device__ void psnr16_impl(const VmafPicture ref_pic, + const VmafPicture dist_pic, + const VmafCudaBuffer sse) { + unsigned int stride = ref_pic.stride[chn]; + // if second channel is smaller use smaller load + if (stride <= (ref_pic.stride[0] / 2)) + sse_calculation( + reinterpret_cast(ref_pic.data[chn]), + reinterpret_cast(dist_pic.data[chn]), ref_pic.w[chn], + ref_pic.h[chn], stride, reinterpret_cast(sse.data) + chn); + else + sse_calculation( + reinterpret_cast(ref_pic.data[chn]), + reinterpret_cast(dist_pic.data[chn]), ref_pic.w[chn], + ref_pic.h[chn], stride, reinterpret_cast(sse.data) + chn); +} + +extern "C" { + +__global__ void psnr(const VmafPicture ref_pic, const VmafPicture dist_pic, + const VmafCudaBuffer sse) { + // this is needed to not produce local load/store ops when accessing with + // "dynamic" index although blockIdx.z is not really dynamic + switch (blockIdx.z) { + case 0: + psnr8_impl<0>(ref_pic, dist_pic, sse); + return; + case 1: + psnr8_impl<1>(ref_pic, dist_pic, sse); + return; + case 2: + psnr8_impl<2>(ref_pic, dist_pic, sse); + return; + } +} + +__global__ void psnr_hbd(const VmafPicture ref_pic, const VmafPicture dist_pic, + const VmafCudaBuffer sse) { + // this is needed to not produce local load/store ops when accessing with + // "dynamic" index although blockIdx.z is not really dynamic + switch (blockIdx.z) { + case 0: + psnr16_impl<0>(ref_pic, dist_pic, sse); + return; + case 1: + psnr16_impl<1>(ref_pic, dist_pic, sse); + return; + case 2: + psnr16_impl<2>(ref_pic, dist_pic, sse); + return; + } +} +} \ No newline at end of file diff --git a/libvmaf/src/feature/cuda/integer_psnr_cuda.c b/libvmaf/src/feature/cuda/integer_psnr_cuda.c new file mode 100644 index 000000000..5d044c2e3 --- /dev/null +++ b/libvmaf/src/feature/cuda/integer_psnr_cuda.c @@ -0,0 +1,329 @@ +/** + * + * Copyright 2016-2023 Netflix, Inc. + * Copyright 2022 NVIDIA Corporation. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#include + +#include "cuda_helper.cuh" +#include "libvmaf/vmaf_cuda_state.h" +#include "picture_cuda.h" + +#include "feature_collector.h" +#include "feature_extractor.h" +#include "opt.h" + + +typedef struct PsnrStateCuda { + bool enable_chroma; + bool enable_mse; + bool enable_apsnr; + bool reduced_hbd_peak; + uint32_t peak; + CUevent event, finished; + CUfunction func_psnr; + CUstream str, host_stream; + double psnr_max[3]; + double min_sse; + VmafCudaBuffer *sse_device; + uint64_t *sse_host; + void* write_score_parameters; + + struct { + uint64_t sse[3]; + uint64_t n_pixels[3]; + } apsnr; +} PsnrStateCuda; + +typedef struct write_score_parameters_psnr { + VmafFeatureCollector *feature_collector; + PsnrStateCuda *s; + unsigned h[3], w[3]; + unsigned index; +} write_score_parameters_psnr; + +extern unsigned char src_psnr_ptx[]; + +static const VmafOption options[] = { + { + .name = "enable_chroma", + .help = "enable calculation for chroma channels", + .offset = offsetof(PsnrStateCuda, enable_chroma), + .type = VMAF_OPT_TYPE_BOOL, + .default_val.b = true, + }, + { + .name = "enable_mse", + .help = "enable MSE calculation", + .offset = offsetof(PsnrStateCuda, enable_mse), + .type = VMAF_OPT_TYPE_BOOL, + .default_val.b = false, + }, + { + .name = "enable_apsnr", + .help = "enable APSNR calculation", + .offset = offsetof(PsnrStateCuda, enable_apsnr), + .type = VMAF_OPT_TYPE_BOOL, + .default_val.b = false, + }, + { + .name = "reduced_hbd_peak", + .help = "reduce hbd peak value to align with scaled 8-bit content", + .offset = offsetof(PsnrStateCuda, reduced_hbd_peak), + .type = VMAF_OPT_TYPE_BOOL, + .default_val.b = false, + }, + { + .name = "min_sse", + .help = "constrain the minimum possible sse", + .offset = offsetof(PsnrStateCuda, min_sse), + .type = VMAF_OPT_TYPE_DOUBLE, + .default_val.d = 0.0, + .min = 0.0, + .max = DBL_MAX, + }, + { 0 } +}; + + +static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, + unsigned bpc, unsigned w, unsigned h) +{ + PsnrStateCuda *s = fex->priv; + + CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); + CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); + CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + + CUmodule module; + CHECK_CUDA(cuModuleLoadData(&module, src_psnr_ptx)); + if (bpc > 8) { + CHECK_CUDA(cuModuleGetFunction(&s->func_psnr, module, "psnr_hbd")); + } else { + CHECK_CUDA(cuModuleGetFunction(&s->func_psnr, module, "psnr")); + } + CHECK_CUDA(cuCtxPopCurrent(NULL)); + + s->write_score_parameters = malloc(sizeof(write_score_parameters_psnr)); + ((write_score_parameters_psnr*)s->write_score_parameters)->s = s; + + + int ret = 0; + ret |= vmaf_cuda_buffer_alloc(fex->cu_state, &s->sse_device, sizeof(uint64_t) * 3); + if (ret) goto free_ref; + ret |= vmaf_cuda_buffer_host_alloc(fex->cu_state, &s->sse_host, sizeof(uint64_t) * 3); + if (ret) goto free_ref; + + s->peak = s->reduced_hbd_peak ? 255 * 1 << (bpc - 8) : (1 << bpc) - 1; + + if (pix_fmt == VMAF_PIX_FMT_YUV400P) + s->enable_chroma = false; + + for (unsigned i = 0; i < 3; i++) { + if (s->min_sse != 0.0) { + const int ss_hor = pix_fmt != VMAF_PIX_FMT_YUV444P; + const int ss_ver = pix_fmt == VMAF_PIX_FMT_YUV420P; + const double mse = s->min_sse / + (((i && ss_hor) ? w / 2 : w) * ((i && ss_ver) ? h / 2 : h)); + s->psnr_max[i] = ceil(10. * log10(s->peak * s->peak / mse)); + } else { + s->psnr_max[i] = (6 * bpc) + 12; + } + } + + return ret; +free_ref: + if (s->sse_device) { + ret |= vmaf_cuda_buffer_free(fex->cu_state, s->sse_device); + free(s->sse_device); + } + return -ENOMEM; +} + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +static char *mse_name[3] = { "mse_y", "mse_cb", "mse_cr" }; +static char *psnr_name[3] = { "psnr_y", "psnr_cb", "psnr_cr" }; + +static int write_scores(write_score_parameters_psnr* params) +{ + PsnrStateCuda *s = params->s; + VmafFeatureCollector *feature_collector = params->feature_collector; + + const unsigned n = s->enable_chroma ? 3 : 1; + for (unsigned p = 0; p < n; p++) { + if (s->enable_apsnr) { + s->apsnr.sse[p] += s->sse_host[p]; + s->apsnr.n_pixels[p] += params->h[p] * params->w[p]; + } + + const double mse = ((double) s->sse_host[p]) / (params->w[p] * params->h[p]); + const double psnr = + MIN(10. * log10(s->peak * s->peak / MAX(mse, 1e-16)), s->psnr_max[p]); + + + int err = 0; + err |= vmaf_feature_collector_append(feature_collector, psnr_name[p], + psnr, params->index); + if (s->enable_mse) { + err |= vmaf_feature_collector_append(feature_collector, mse_name[p], + mse, params->index); + } + } +} + +static int extract_fex_cuda(VmafFeatureExtractor *fex, + VmafPicture *ref_pic, VmafPicture *ref_pic_90, + VmafPicture *dist_pic, VmafPicture *dist_pic_90, + unsigned index, VmafFeatureCollector *feature_collector) +{ + PsnrStateCuda *s = fex->priv; + + (void) ref_pic_90; + (void) dist_pic_90; + + // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores + CHECK_CUDA(cuStreamSynchronize(s->str)); + CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); + // CHECK_CUDA(cuEventSynchronize(s->finished)); + CHECK_CUDA(cuEventDestroy(s->finished)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuCtxPopCurrent(NULL)); + + // Reset device SSE + CHECK_CUDA(cuMemsetD8Async(s->sse_device->data, 0, sizeof(uint64_t) * 3, s->str)); + + + const int width_y = ref_pic->w[0]; + const int height_y = ref_pic->h[0]; + const int stride_y = ref_pic->stride[0]; + + const int val_per_thread = 8; + const int block_dim_x = 16; + const int block_dim_y = 16; + const int grid_dim_x = DIV_ROUND_UP(width_y, block_dim_x * val_per_thread); + const int grid_dim_y = DIV_ROUND_UP(height_y, block_dim_y); + const int grid_dim_z = s->enable_chroma ? 3 : 1; + + void *kernelParams[] = {ref_pic, dist_pic, s->sse_device}; + CHECK_CUDA(cuStreamWaitEvent(vmaf_cuda_picture_get_stream(ref_pic), vmaf_cuda_picture_get_ready_event(dist_pic), CU_EVENT_WAIT_DEFAULT)); + CHECK_CUDA(cuLaunchKernel(s->func_psnr, + grid_dim_x, grid_dim_y, grid_dim_z, + block_dim_x, block_dim_y, 1, + 0, vmaf_cuda_picture_get_stream(ref_pic), kernelParams, NULL)); + + CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); + // This event ensures the input buffer is consumed + CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); + CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); + CHECK_CUDA(cuEventDestroy(s->event)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuCtxPopCurrent(NULL)); + + // Download sad + // CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + CHECK_CUDA(cuMemcpyDtoHAsync(s->sse_host, (CUdeviceptr)s->sse_device->data, + sizeof(uint64_t) * 3, s->str)); + CHECK_CUDA(cuEventRecord(s->finished, s->str)); + CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); + + write_score_parameters_psnr* params = s->write_score_parameters; + params->feature_collector = feature_collector; + for (unsigned p = 0; p < grid_dim_z; p++) { + params->h[p] = ref_pic->h[p]; + params->w[p] = ref_pic->w[p]; + } + params->index = index; + CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters)); + return 0; +} + +static int flush_fex_cuda(VmafFeatureExtractor *fex, + VmafFeatureCollector *feature_collector) +{ + PsnrStateCuda *s = fex->priv; + const char *apsnr_name[3] = { "apsnr_y", "apsnr_cb", "apsnr_cr" }; + CHECK_CUDA(cuStreamSynchronize(s->str)); + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + + int err = 0; + if (s->enable_apsnr) { + for (unsigned i = 0; i < 3; i++) { + + double apsnr = 10 * (log10(s->peak * s->peak) + + log10(s->apsnr.n_pixels[i]) - + log10(s->apsnr.sse[i])); + + double max_apsnr = + ceil(10 * log10(s->peak * s->peak * + s->apsnr.n_pixels[i] * + 2)); + + err |= + vmaf_feature_collector_set_aggregate(feature_collector, + apsnr_name[i], + MIN(apsnr, max_apsnr)); + } + } + + return (err < 0) ? err : !err; +} + +static int close_fex_cuda(VmafFeatureExtractor *fex) +{ + PsnrStateCuda *s = fex->priv; + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + CHECK_CUDA(cuStreamSynchronize(s->str)); + int ret = 0; + + if (s->sse_host) { + ret |= vmaf_cuda_buffer_host_free(fex->cu_state, s->sse_host); + } + if (s->sse_device) { + ret |= vmaf_cuda_buffer_free(fex->cu_state, s->sse_device); + free(s->sse_device); + } + if(s->write_score_parameters) { + free(s->write_score_parameters); + } + + return ret; +} + +static const char *provided_features[] = { + "psnr_y", "psnr_cb", "psnr_cr", + NULL +}; + +VmafFeatureExtractor vmaf_fex_integer_psnr_cuda = { + .name = "psnr_cuda", + .options = options, + .init = init_fex_cuda, + .extract = extract_fex_cuda, + .flush = flush_fex_cuda, + .priv_size = sizeof(PsnrStateCuda), + .close = close_fex_cuda, + .provided_features = provided_features, + .flags = VMAF_FEATURE_EXTRACTOR_TEMPORAL | VMAF_FEATURE_EXTRACTOR_CUDA, +}; diff --git a/libvmaf/src/feature/feature_extractor.c b/libvmaf/src/feature/feature_extractor.c index 94a4e16ca..acef1d5a6 100644 --- a/libvmaf/src/feature/feature_extractor.c +++ b/libvmaf/src/feature/feature_extractor.c @@ -54,6 +54,7 @@ extern VmafFeatureExtractor vmaf_fex_cambi; extern VmafFeatureExtractor vmaf_fex_integer_adm_cuda; extern VmafFeatureExtractor vmaf_fex_integer_vif_cuda; extern VmafFeatureExtractor vmaf_fex_integer_motion_cuda; +extern VmafFeatureExtractor vmaf_fex_integer_psnr_cuda; #endif extern VmafFeatureExtractor vmaf_fex_null; @@ -79,6 +80,7 @@ static VmafFeatureExtractor *feature_extractor_list[] = { &vmaf_fex_integer_adm_cuda, &vmaf_fex_integer_vif_cuda, &vmaf_fex_integer_motion_cuda, + &vmaf_fex_integer_psnr_cuda, #endif &vmaf_fex_null, NULL diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index 3fbb050c5..c35cbe36e 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -555,7 +555,7 @@ static int translate_picture_host(VmafContext *vmaf, VmafPicture *pic, if (!vmaf->cuda.state.ctx) return -EINVAL; err |= vmaf_ring_buffer_fetch_next_picture(vmaf->cuda.ring_buffer, pic_device); - err |= vmaf_cuda_picture_upload_async(pic_device, pic, 0x1); + err |= vmaf_cuda_picture_upload_async(pic_device, pic, 0xF); if (err) { vmaf_log(VMAF_LOG_LEVEL_ERROR, "problem moving host pic into cuda device buffer\n"); @@ -585,7 +585,7 @@ static int translate_picture_device(VmafContext *vmaf, VmafPicture *pic, return err; } - err = vmaf_cuda_picture_download_async(pic, pic_host, 0x1); + err = vmaf_cuda_picture_download_async(pic, pic_host, 0x3); if (err) { vmaf_log(VMAF_LOG_LEVEL_ERROR, "problem moving cuda pic into host buffer\n"); diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build index 94e2f4d7b..e27f87b8f 100644 --- a/libvmaf/src/meson.build +++ b/libvmaf/src/meson.build @@ -286,6 +286,7 @@ if is_cuda_enabled 'adm_decouple' : [feature_src_dir + 'cuda/integer_adm/adm_decouple.cu'], 'filter1d' : [feature_src_dir + 'cuda/integer_vif/filter1d.cu'], 'motion_score' : [feature_src_dir + 'cuda/integer_motion/motion_score.cu'], + 'psnr' : [feature_src_dir + 'cuda/integer_psnr/psnr.cu'], } message(cuda_cu_sources) cuda_sources = [ @@ -323,7 +324,7 @@ if is_cuda_enabled '-I', '../include', '-I', '../src/feature', '-I', '../src/' + cuda_dir, - ] + ] + cuda_flags ) ptx_files += {name : [t]} cuda_sources += _cu @@ -432,6 +433,7 @@ if is_cuda_enabled feature_src_dir + 'cuda/integer_adm_cuda.c', feature_src_dir + 'cuda/integer_vif_cuda.c', feature_src_dir + 'cuda/integer_motion_cuda.c', + feature_src_dir + 'cuda/integer_psnr_cuda.c', ] endif diff --git a/libvmaf/test/test_ring_buffer.c b/libvmaf/test/test_ring_buffer.c index abea74d50..803acb598 100644 --- a/libvmaf/test/test_ring_buffer.c +++ b/libvmaf/test/test_ring_buffer.c @@ -135,8 +135,8 @@ static void request_picture(void *data) //fprintf(stderr, "request: %i\n", my_thread_pool_data->i); vmaf_ring_buffer_fetch_next_picture(ring_buffer, &pic_cuda_ref); vmaf_ring_buffer_fetch_next_picture(ring_buffer, &pic_cuda_dist); - vmaf_cuda_picture_upload_async(&pic_cuda_ref, &pic, 0x1); - vmaf_cuda_picture_upload_async(&pic_cuda_dist, &pic, 0x1); + vmaf_cuda_picture_upload_async(&pic_cuda_ref, &pic, 0xF); + vmaf_cuda_picture_upload_async(&pic_cuda_dist, &pic, 0xF); //fprintf(stderr, "usleep=%d: %i\n", // my_thread_pool_data->timeout, my_thread_pool_data->i); vmaf_picture_unref(&pic_cuda_ref); From 09274bff10834c4001824f40d45458912d946022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= Date: Tue, 7 Mar 2023 01:31:29 +0100 Subject: [PATCH 2/5] pick feature extractor by feature --- libvmaf/src/libvmaf.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index c35cbe36e..cc167376f 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -275,10 +275,18 @@ int vmaf_use_feature(VmafContext *vmaf, const char *feature_name, int err = 0; - VmafFeatureExtractor *fex = - vmaf_get_feature_extractor_by_name(feature_name); - if (!fex) return -EINVAL; + unsigned fex_flags = 0; +#ifdef HAVE_CUDA + if (!vmaf->cfg.gpumask && vmaf->cuda.state.ctx) + fex_flags |= VMAF_FEATURE_EXTRACTOR_CUDA; +#endif + VmafFeatureExtractor *fex = + vmaf_get_feature_extractor_by_feature_name(feature_name, fex_flags); + if (!fex) { + fex = vmaf_get_feature_extractor_by_name(feature_name); + if (!fex) return -EINVAL; + } VmafDictionary *d = NULL; if (s) { err = vmaf_dictionary_copy(&s, &d); From 11f03827b4f0bfdb65d313a7db1728dba720896a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= Date: Tue, 7 Mar 2023 10:29:46 +0100 Subject: [PATCH 3/5] simplified event handling --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 17 ++--------------- libvmaf/src/feature/cuda/integer_motion_cuda.c | 7 +------ libvmaf/src/feature/cuda/integer_psnr_cuda.c | 15 +++++---------- libvmaf/src/feature/cuda/integer_vif_cuda.c | 13 +++---------- 4 files changed, 11 insertions(+), 41 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index d8b414436..9dec3c11c 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -855,17 +855,9 @@ static void integer_compute_adm_cuda(VmafFeatureExtractor *fex, AdmStateCuda *s, h = (h + 1) / 2; // This event ensures the input buffer is consumed - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuStreamWaitEvent(s->str, s->dis_event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuEventDestroy(s->dis_event)); - CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuStreamWaitEvent(s->str, s->ref_event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuEventDestroy(s->ref_event)); - CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT)); - - CHECK_CUDA(cuCtxPopCurrent(NULL)); + // consumes buf->ref_dwt2 , buf->dis_dwt2 // produces buf->decouple_r , buf->decouple_a adm_decouple_device(s, buf, w, h, buf_stride, &p, s->str); @@ -1154,12 +1146,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, (void) dist_pic_90; // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores - CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + CHECK_CUDA(cuEventSynchronize(s->finished)); // current implementation is limited by the 16-bit data pipeline, thus // cannot handle an angular frequency smaller than 1080p * 3H diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 9615cde98..f2c50cf22 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -259,12 +259,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, MotionStateCuda *s = fex->priv; // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores - CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + CHECK_CUDA(cuEventSynchronize(s->finished)); int err = 0; (void) dist_pic; diff --git a/libvmaf/src/feature/cuda/integer_psnr_cuda.c b/libvmaf/src/feature/cuda/integer_psnr_cuda.c index 5d044c2e3..d121c7c74 100644 --- a/libvmaf/src/feature/cuda/integer_psnr_cuda.c +++ b/libvmaf/src/feature/cuda/integer_psnr_cuda.c @@ -204,12 +204,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, (void) dist_pic_90; // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores - CHECK_CUDA(cuStreamSynchronize(s->str)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + CHECK_CUDA(cuEventSynchronize(s->finished)); // Reset device SSE CHECK_CUDA(cuMemsetD8Async(s->sse_device->data, 0, sizeof(uint64_t) * 3, s->str)); @@ -236,10 +231,10 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); // This event ensures the input buffer is consumed CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); + // CHECK_CUDA(cuEventDestroy(s->event)); + // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); + // CHECK_CUDA(cuCtxPopCurrent(NULL)); // Download sad // CHECK_CUDA(cuStreamSynchronize(s->host_stream)); diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index fcf68f442..6f55f1fa9 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -451,11 +451,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores // before the GPU has finished writing to it. - CHECK_CUDA(cuStreamSynchronize(s->str)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + // CHECK_CUDA(cuStreamSynchronize(s->str)); + CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str)); CHECK_CUDA(cuStreamWaitEvent(vmaf_cuda_picture_get_stream(ref_pic), vmaf_cuda_picture_get_ready_event(dist_pic), CU_EVENT_WAIT_DEFAULT)); @@ -476,10 +473,6 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // This event ensures the input buffer is consumed CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); } } @@ -496,7 +489,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, write_score_parameters_vif *data = s->buf.cpu_param_buf; data->feature_collector = feature_collector; data->index = index; - CHECK_CUDA(cuLaunchHostFunc(s->str, write_scores, data)); + CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data)); return 0; } From 2716d732b01390d4e789dfaf32be25b2619bf8c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= Date: Tue, 14 Mar 2023 20:11:16 +0100 Subject: [PATCH 4/5] fix launch dimensions dwt --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 1 + libvmaf/src/meson.build | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index 9dec3c11c..cdf9f482c 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -149,6 +149,7 @@ void adm_dwt2_s123_combined_device(AdmStateCuda *s,const int32_t *d_i4_scale, in const int BLOCK_Y = (h + 1) / 2; void * args_vert[] = {&d_i4_scale, &tmp_buf, &w, &h, &img_stride, &*p}; + const int num_threads = 128; switch (scale) { case 1: CHECK_CUDA(cuLaunchKernel(s->func_dwt_s123_combined_vert_kernel_0_0_int32_t, diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build index e27f87b8f..7a861b61a 100644 --- a/libvmaf/src/meson.build +++ b/libvmaf/src/meson.build @@ -302,15 +302,14 @@ if is_cuda_enabled cuda_dependency += cuda_rt_api_dependency cuda_flags = [] - if get_option('buildtype').startswith('debug') - cuda_flags += ['-DCUDA_DEBUG', '-lineinfo'] - else - if is_nvtx_enabled - cuda_flags += ['-lineinfo'] - cuda_dependency += declare_dependency(link_args : ['-lnvToolsExt']) - endif + if get_option('buildtype') == 'debug' + cuda_flags += ['-G'] endif - + if is_nvtx_enabled + cuda_flags += ['-lineinfo'] + cuda_dependency += declare_dependency(link_args : ['-lnvToolsExt']) + endif + nvcc_exe = find_program('nvcc') ptx_files = {} foreach name, _cu : cuda_cu_sources @@ -348,7 +347,7 @@ if is_cuda_enabled cuda_static_lib = static_library( 'cuda_common_vmaf_lib', - [cuda_sources, ptx_arrays,], + cuda_sources + ptx_arrays, dependencies: [cuda_drv_api_dependency], include_directories : [ libvmaf_include, From 4e889f9525fd653fdfc58ebff92807f66ed4606d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= Date: Mon, 20 Mar 2023 19:12:24 +0100 Subject: [PATCH 5/5] minor cleanup --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 1 - libvmaf/src/feature/cuda/integer_psnr_cuda.c | 10 ++-------- libvmaf/src/libvmaf.c | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index cdf9f482c..9dec3c11c 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -149,7 +149,6 @@ void adm_dwt2_s123_combined_device(AdmStateCuda *s,const int32_t *d_i4_scale, in const int BLOCK_Y = (h + 1) / 2; void * args_vert[] = {&d_i4_scale, &tmp_buf, &w, &h, &img_stride, &*p}; - const int num_threads = 128; switch (scale) { case 1: CHECK_CUDA(cuLaunchKernel(s->func_dwt_s123_combined_vert_kernel_0_0_int32_t, diff --git a/libvmaf/src/feature/cuda/integer_psnr_cuda.c b/libvmaf/src/feature/cuda/integer_psnr_cuda.c index d121c7c74..d29bdabcb 100644 --- a/libvmaf/src/feature/cuda/integer_psnr_cuda.c +++ b/libvmaf/src/feature/cuda/integer_psnr_cuda.c @@ -23,9 +23,8 @@ #include #include +#include "common.h" #include "cuda_helper.cuh" -#include "libvmaf/vmaf_cuda_state.h" -#include "picture_cuda.h" #include "feature_collector.h" #include "feature_extractor.h" @@ -231,13 +230,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); // This event ensures the input buffer is consumed CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - // CHECK_CUDA(cuEventDestroy(s->event)); - // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - // CHECK_CUDA(cuCtxPopCurrent(NULL)); - // Download sad - // CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + // Download SSE CHECK_CUDA(cuMemcpyDtoHAsync(s->sse_host, (CUdeviceptr)s->sse_device->data, sizeof(uint64_t) * 3, s->str)); CHECK_CUDA(cuEventRecord(s->finished, s->str)); diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index cc167376f..6e0574aca 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -593,7 +593,7 @@ static int translate_picture_device(VmafContext *vmaf, VmafPicture *pic, return err; } - err = vmaf_cuda_picture_download_async(pic, pic_host, 0x3); + err = vmaf_cuda_picture_download_async(pic, pic_host, 0xF); if (err) { vmaf_log(VMAF_LOG_LEVEL_ERROR, "problem moving cuda pic into host buffer\n");