From 88f01b2f82710e68f0506681ff9bdd41eb71b922 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <maximilianm@nvidia.com>
Date: Tue, 7 Mar 2023 01:26:00 +0100
Subject: [PATCH 1/5] psnr CUDA implementation

---
 libvmaf/src/cuda/cuda_helper.cuh              |   7 +
 libvmaf/src/feature/cuda/integer_psnr/psnr.cu | 138 ++++++++
 libvmaf/src/feature/cuda/integer_psnr_cuda.c  | 329 ++++++++++++++++++
 libvmaf/src/feature/feature_extractor.c       |   2 +
 libvmaf/src/libvmaf.c                         |   4 +-
 libvmaf/src/meson.build                       |   4 +-
 libvmaf/test/test_ring_buffer.c               |   4 +-
 7 files changed, 483 insertions(+), 5 deletions(-)
 create mode 100644 libvmaf/src/feature/cuda/integer_psnr/psnr.cu
 create mode 100644 libvmaf/src/feature/cuda/integer_psnr_cuda.c
diff --git a/libvmaf/src/cuda/cuda_helper.cuh b/libvmaf/src/cuda/cuda_helper.cuh
index 6fa01b9f6..3d905894c 100644
--- a/libvmaf/src/cuda/cuda_helper.cuh
+++ b/libvmaf/src/cuda/cuda_helper.cuh
@@ -59,6 +59,13 @@ namespace {
         return atomicAdd(reinterpret_cast<uint64_cu *>(address),
                 static_cast<uint64_cu>(val));
     }
+
+    typedef unsigned long long int uint64_cu;
+    __forceinline__ __device__ int64_t atomicAdd_uint64(uint64_t *address,
+            uint64_t val) {
+        return atomicAdd(reinterpret_cast<uint64_cu *>(address),
+                static_cast<uint64_cu>(val));
+    }
 } // namespace
 #endif
 
diff --git a/libvmaf/src/feature/cuda/integer_psnr/psnr.cu b/libvmaf/src/feature/cuda/integer_psnr/psnr.cu
new file mode 100644
index 000000000..50214269b
--- /dev/null
+++ b/libvmaf/src/feature/cuda/integer_psnr/psnr.cu
@@ -0,0 +1,138 @@
+/**
+ *
+ *  Copyright 2016-2023 Netflix, Inc.
+ *  Copyright 2022 NVIDIA Corporation.
+ *
+ *     Licensed under the BSD+Patent License (the "License");
+ *     you may not use this file except in compliance with the License.
+ *     You may obtain a copy of the License at
+ *
+ *         https://opensource.org/licenses/BSDplusPatent
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License.
+ *
+ */
+
+#include "cuda/integer_motion_cuda.h"
+#include "cuda_helper.cuh"
+
+#include "common.h"
+
+template <typename T, typename LOAD_TYPE>
+__device__ void sse_calculation(T *ref, T *dis, unsigned int w, unsigned int h,
+                                unsigned int stride, uint64_t *sse) {
+  constexpr int val_per_thread = sizeof(LOAD_TYPE) / sizeof(T);
+  unsigned int idx_x = (threadIdx.x + blockDim.x * blockIdx.x) * val_per_thread;
+  unsigned int idx_y = threadIdx.y + blockDim.y * blockIdx.y;
+
+  if (idx_y < h && idx_x < w) {
+    int idx = idx_y * (stride / sizeof(T)) + idx_x;
+    uint64_t thread_sse = 0u;
+    union {
+      T value_ref[val_per_thread];
+      LOAD_TYPE load_value_dis;
+    };
+    union {
+      T value_dis[val_per_thread];
+      LOAD_TYPE load_value_ref;
+    };
+    load_value_ref = *reinterpret_cast<LOAD_TYPE *>(&ref[idx]);
+    load_value_dis = *reinterpret_cast<LOAD_TYPE *>(&dis[idx]);
+    for (unsigned int i = 0; i < val_per_thread; ++i) {
+      if ((idx_x + i) < w) {
+        const int e = value_ref[i] - value_dis[i];
+        thread_sse += e * e;
+      }
+    }
+
+    // Warp-reduce abs_dist
+    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 16);
+    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 8);
+    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 4);
+    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 2);
+    thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 1);
+    // Let threads in lane zero add warp-reduced abs_dist atomically to global
+    // sad
+    const int lane =
+        (threadIdx.y * blockDim.x + threadIdx.x) % VMAF_CUDA_THREADS_PER_WARP;
+    if (lane == 0)
+      atomicAdd_uint64(sse, static_cast<uint64_t>(thread_sse));
+  }
+}
+
+template <int chn>
+__device__ void psnr8_impl(const VmafPicture ref_pic,
+                           const VmafPicture dist_pic,
+                           const VmafCudaBuffer sse) {
+  unsigned int stride = ref_pic.stride[chn];
+  // if second channel is smaller use smaller load
+  if (stride <= (ref_pic.stride[0] / 2))
+    sse_calculation<uint8_t, uint64_t>(
+        reinterpret_cast<uint8_t *>(ref_pic.data[chn]),
+        reinterpret_cast<uint8_t *>(dist_pic.data[chn]), ref_pic.w[chn],
+        ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
+  else
+    sse_calculation<uint8_t, ushort4>(
+        reinterpret_cast<uint8_t *>(ref_pic.data[chn]),
+        reinterpret_cast<uint8_t *>(dist_pic.data[chn]), ref_pic.w[chn],
+        ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
+}
+
+template <int chn>
+__device__ void psnr16_impl(const VmafPicture ref_pic,
+                            const VmafPicture dist_pic,
+                            const VmafCudaBuffer sse) {
+  unsigned int stride = ref_pic.stride[chn];
+  // if second channel is smaller use smaller load
+  if (stride <= (ref_pic.stride[0] / 2))
+    sse_calculation<uint16_t, ushort4>(
+        reinterpret_cast<uint16_t *>(ref_pic.data[chn]),
+        reinterpret_cast<uint16_t *>(dist_pic.data[chn]), ref_pic.w[chn],
+        ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
+  else
+    sse_calculation<uint16_t, uint4>(
+        reinterpret_cast<uint16_t *>(ref_pic.data[chn]),
+        reinterpret_cast<uint16_t *>(dist_pic.data[chn]), ref_pic.w[chn],
+        ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
+}
+
+extern "C" {
+
+__global__ void psnr(const VmafPicture ref_pic, const VmafPicture dist_pic,
+                     const VmafCudaBuffer sse) {
+  // this is needed to not produce local load/store ops when accessing with
+  // "dynamic" index although blockIdx.z is not really dynamic
+  switch (blockIdx.z) {
+  case 0:
+    psnr8_impl<0>(ref_pic, dist_pic, sse);
+    return;
+  case 1:
+    psnr8_impl<1>(ref_pic, dist_pic, sse);
+    return;
+  case 2:
+    psnr8_impl<2>(ref_pic, dist_pic, sse);
+    return;
+  }
+}
+
+__global__ void psnr_hbd(const VmafPicture ref_pic, const VmafPicture dist_pic,
+                         const VmafCudaBuffer sse) {
+  // this is needed to not produce local load/store ops when accessing with
+  // "dynamic" index although blockIdx.z is not really dynamic
+  switch (blockIdx.z) {
+  case 0:
+    psnr16_impl<0>(ref_pic, dist_pic, sse);
+    return;
+  case 1:
+    psnr16_impl<1>(ref_pic, dist_pic, sse);
+    return;
+  case 2:
+    psnr16_impl<2>(ref_pic, dist_pic, sse);
+    return;
+  }
+}
+}
\ No newline at end of file
diff --git a/libvmaf/src/feature/cuda/integer_psnr_cuda.c b/libvmaf/src/feature/cuda/integer_psnr_cuda.c
new file mode 100644
index 000000000..5d044c2e3
--- /dev/null
+++ b/libvmaf/src/feature/cuda/integer_psnr_cuda.c
@@ -0,0 +1,329 @@
+/**
+ *
+ *  Copyright 2016-2023 Netflix, Inc.
+ *  Copyright 2022 NVIDIA Corporation.
+ *
+ *     Licensed under the BSD+Patent License (the "License");
+ *     you may not use this file except in compliance with the License.
+ *     You may obtain a copy of the License at
+ *
+ *         https://opensource.org/licenses/BSDplusPatent
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License.
+ *
+ */
+
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "cuda_helper.cuh"
+#include "libvmaf/vmaf_cuda_state.h"
+#include "picture_cuda.h"
+
+#include "feature_collector.h"
+#include "feature_extractor.h"
+#include "opt.h"
+
+
+typedef struct PsnrStateCuda {
+    bool enable_chroma;
+    bool enable_mse;
+    bool enable_apsnr;
+    bool reduced_hbd_peak;
+    uint32_t peak;
+    CUevent event, finished;
+    CUfunction func_psnr;
+    CUstream str, host_stream;
+    double psnr_max[3];
+    double min_sse;
+    VmafCudaBuffer *sse_device;
+    uint64_t *sse_host;
+    void* write_score_parameters;
+
+    struct {
+        uint64_t sse[3];
+        uint64_t n_pixels[3];
+    } apsnr;
+} PsnrStateCuda;
+
+typedef struct write_score_parameters_psnr {
+    VmafFeatureCollector *feature_collector;
+    PsnrStateCuda *s;
+    unsigned h[3], w[3];
+    unsigned index;
+} write_score_parameters_psnr;
+
+extern unsigned char src_psnr_ptx[];
+
+static const VmafOption options[] = {
+    {
+        .name = "enable_chroma",
+        .help = "enable calculation for chroma channels",
+        .offset = offsetof(PsnrStateCuda, enable_chroma),
+        .type = VMAF_OPT_TYPE_BOOL,
+        .default_val.b = true,
+    },
+    {
+        .name = "enable_mse",
+        .help = "enable MSE calculation",
+        .offset = offsetof(PsnrStateCuda, enable_mse),
+        .type = VMAF_OPT_TYPE_BOOL,
+        .default_val.b = false,
+    },
+    {
+        .name = "enable_apsnr",
+        .help = "enable APSNR calculation",
+        .offset = offsetof(PsnrStateCuda, enable_apsnr),
+        .type = VMAF_OPT_TYPE_BOOL,
+        .default_val.b = false,
+    },
+    {
+        .name = "reduced_hbd_peak",
+        .help = "reduce hbd peak value to align with scaled 8-bit content",
+        .offset = offsetof(PsnrStateCuda, reduced_hbd_peak),
+        .type = VMAF_OPT_TYPE_BOOL,
+        .default_val.b = false,
+    },
+    {
+        .name = "min_sse",
+        .help = "constrain the minimum possible sse",
+        .offset = offsetof(PsnrStateCuda, min_sse),
+        .type = VMAF_OPT_TYPE_DOUBLE,
+        .default_val.d = 0.0,
+        .min = 0.0,
+        .max = DBL_MAX,
+    },
+    { 0 }
+};
+
+
+static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt,
+                unsigned bpc, unsigned w, unsigned h)
+{
+    PsnrStateCuda *s = fex->priv;
+
+    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
+    CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
+    CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
+    CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
+    CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
+
+    CUmodule module;
+    CHECK_CUDA(cuModuleLoadData(&module, src_psnr_ptx));
+    if (bpc > 8) {
+        CHECK_CUDA(cuModuleGetFunction(&s->func_psnr, module, "psnr_hbd"));
+    } else {
+        CHECK_CUDA(cuModuleGetFunction(&s->func_psnr, module, "psnr"));
+    }
+    CHECK_CUDA(cuCtxPopCurrent(NULL));
+
+    s->write_score_parameters = malloc(sizeof(write_score_parameters_psnr));
+    ((write_score_parameters_psnr*)s->write_score_parameters)->s = s;
+
+
+    int ret = 0; 
+    ret |= vmaf_cuda_buffer_alloc(fex->cu_state, &s->sse_device, sizeof(uint64_t) * 3);
+    if (ret) goto free_ref;
+    ret |= vmaf_cuda_buffer_host_alloc(fex->cu_state, &s->sse_host, sizeof(uint64_t) * 3);
+    if (ret) goto free_ref;
+
+    s->peak = s->reduced_hbd_peak ? 255 * 1 << (bpc - 8) : (1 << bpc) - 1;
+
+    if (pix_fmt == VMAF_PIX_FMT_YUV400P)
+        s->enable_chroma = false;
+
+    for (unsigned i = 0; i < 3; i++) {
+        if (s->min_sse != 0.0) {
+            const int ss_hor = pix_fmt != VMAF_PIX_FMT_YUV444P;
+            const int ss_ver = pix_fmt == VMAF_PIX_FMT_YUV420P;
+            const double mse = s->min_sse / 
+                (((i && ss_hor) ? w / 2 : w) * ((i && ss_ver) ? h / 2 : h));
+            s->psnr_max[i] = ceil(10. * log10(s->peak * s->peak / mse));
+        } else {
+            s->psnr_max[i] = (6 * bpc) + 12;
+        }
+    }
+
+    return ret;
+free_ref:
+    if (s->sse_device) {
+        ret |= vmaf_cuda_buffer_free(fex->cu_state, s->sse_device);
+        free(s->sse_device);
+    }
+    return -ENOMEM;
+}
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+static char *mse_name[3] = { "mse_y", "mse_cb", "mse_cr" };
+static char *psnr_name[3] = { "psnr_y", "psnr_cb", "psnr_cr" };
+
+static int write_scores(write_score_parameters_psnr* params)
+{
+    PsnrStateCuda *s = params->s;
+    VmafFeatureCollector *feature_collector = params->feature_collector;
+    
+    const unsigned n = s->enable_chroma ? 3 : 1;
+    for (unsigned p = 0; p < n; p++) {
+        if (s->enable_apsnr) {
+            s->apsnr.sse[p] += s->sse_host[p];
+            s->apsnr.n_pixels[p] += params->h[p] * params->w[p];
+        }
+
+        const double mse = ((double) s->sse_host[p]) / (params->w[p] * params->h[p]);
+        const double psnr =
+            MIN(10. * log10(s->peak * s->peak / MAX(mse, 1e-16)), s->psnr_max[p]);
+
+
+        int err = 0;
+        err |= vmaf_feature_collector_append(feature_collector, psnr_name[p],
+                                                psnr, params->index);
+        if (s->enable_mse) {
+            err |= vmaf_feature_collector_append(feature_collector, mse_name[p],
+                                                    mse, params->index);
+        }
+    }
+}
+
+static int extract_fex_cuda(VmafFeatureExtractor *fex,
+                   VmafPicture *ref_pic, VmafPicture *ref_pic_90,
+                   VmafPicture *dist_pic, VmafPicture *dist_pic_90,
+                   unsigned index, VmafFeatureCollector *feature_collector)
+{
+    PsnrStateCuda *s = fex->priv;
+
+    (void) ref_pic_90;
+    (void) dist_pic_90;
+
+    // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
+    CHECK_CUDA(cuStreamSynchronize(s->str));
+    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
+    // CHECK_CUDA(cuEventSynchronize(s->finished));
+    CHECK_CUDA(cuEventDestroy(s->finished));
+    CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
+    CHECK_CUDA(cuCtxPopCurrent(NULL));
+
+    // Reset device SSE
+    CHECK_CUDA(cuMemsetD8Async(s->sse_device->data, 0, sizeof(uint64_t) * 3, s->str));
+
+
+    const int width_y = ref_pic->w[0];
+    const int height_y = ref_pic->h[0];
+    const int stride_y =  ref_pic->stride[0];
+
+    const int val_per_thread = 8;
+    const int block_dim_x = 16;
+    const int block_dim_y = 16;
+    const int grid_dim_x = DIV_ROUND_UP(width_y, block_dim_x * val_per_thread);
+    const int grid_dim_y = DIV_ROUND_UP(height_y, block_dim_y);
+    const int grid_dim_z = s->enable_chroma ? 3 : 1;
+
+    void *kernelParams[] = {ref_pic,  dist_pic, s->sse_device};
+    CHECK_CUDA(cuStreamWaitEvent(vmaf_cuda_picture_get_stream(ref_pic), vmaf_cuda_picture_get_ready_event(dist_pic), CU_EVENT_WAIT_DEFAULT));
+    CHECK_CUDA(cuLaunchKernel(s->func_psnr, 
+                grid_dim_x, grid_dim_y, grid_dim_z, 
+                block_dim_x, block_dim_y, 1, 
+                0, vmaf_cuda_picture_get_stream(ref_pic), kernelParams, NULL));
+
+    CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
+    // This event ensures the input buffer is consumed
+    CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
+    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
+    CHECK_CUDA(cuEventDestroy(s->event));
+    CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
+    CHECK_CUDA(cuCtxPopCurrent(NULL));
+    
+    // Download sad
+    // CHECK_CUDA(cuStreamSynchronize(s->host_stream));
+    CHECK_CUDA(cuMemcpyDtoHAsync(s->sse_host, (CUdeviceptr)s->sse_device->data,
+                sizeof(uint64_t) * 3, s->str));
+    CHECK_CUDA(cuEventRecord(s->finished, s->str));
+    CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));
+    
+    write_score_parameters_psnr* params = s->write_score_parameters;
+    params->feature_collector = feature_collector;
+    for (unsigned p = 0; p < grid_dim_z; p++) {
+        params->h[p] = ref_pic->h[p];
+        params->w[p] = ref_pic->w[p];
+    }
+    params->index = index;
+    CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters));
+    return 0;
+}
+
+static int flush_fex_cuda(VmafFeatureExtractor *fex,
+                 VmafFeatureCollector *feature_collector)
+{
+    PsnrStateCuda *s = fex->priv;
+    const char *apsnr_name[3] = { "apsnr_y", "apsnr_cb", "apsnr_cr" };
+    CHECK_CUDA(cuStreamSynchronize(s->str));
+    CHECK_CUDA(cuStreamSynchronize(s->host_stream));
+
+    int err = 0;
+    if (s->enable_apsnr) {
+        for (unsigned i = 0; i < 3; i++) {
+
+            double apsnr = 10 * (log10(s->peak * s->peak) +
+                                 log10(s->apsnr.n_pixels[i]) -
+                                 log10(s->apsnr.sse[i]));
+
+            double max_apsnr =
+                ceil(10 * log10(s->peak * s->peak *
+                                s->apsnr.n_pixels[i] *
+                                2));
+
+            err |=
+                vmaf_feature_collector_set_aggregate(feature_collector,
+                                                     apsnr_name[i],
+                                                     MIN(apsnr, max_apsnr));
+        }
+    }
+
+    return (err < 0) ? err : !err;
+}
+
+static int close_fex_cuda(VmafFeatureExtractor *fex)
+{
+    PsnrStateCuda *s = fex->priv;
+    CHECK_CUDA(cuStreamSynchronize(s->host_stream));
+    CHECK_CUDA(cuStreamSynchronize(s->str));
+    int ret = 0;
+
+    if (s->sse_host) {
+        ret |= vmaf_cuda_buffer_host_free(fex->cu_state, s->sse_host);
+    }
+    if (s->sse_device) {
+        ret |= vmaf_cuda_buffer_free(fex->cu_state, s->sse_device);
+        free(s->sse_device);
+    }
+    if(s->write_score_parameters) {
+        free(s->write_score_parameters);
+    }
+
+    return ret;
+}
+
+static const char *provided_features[] = {
+    "psnr_y", "psnr_cb", "psnr_cr",
+    NULL
+};
+
+VmafFeatureExtractor vmaf_fex_integer_psnr_cuda = {
+    .name = "psnr_cuda",
+    .options = options,
+    .init = init_fex_cuda,
+    .extract = extract_fex_cuda,
+    .flush = flush_fex_cuda,
+    .priv_size = sizeof(PsnrStateCuda),
+    .close = close_fex_cuda,
+    .provided_features = provided_features,
+    .flags = VMAF_FEATURE_EXTRACTOR_TEMPORAL | VMAF_FEATURE_EXTRACTOR_CUDA,
+};
diff --git a/libvmaf/src/feature/feature_extractor.c b/libvmaf/src/feature/feature_extractor.c
index 94a4e16ca..acef1d5a6 100644
--- a/libvmaf/src/feature/feature_extractor.c
+++ b/libvmaf/src/feature/feature_extractor.c
@@ -54,6 +54,7 @@ extern VmafFeatureExtractor vmaf_fex_cambi;
 extern VmafFeatureExtractor vmaf_fex_integer_adm_cuda;
 extern VmafFeatureExtractor vmaf_fex_integer_vif_cuda;
 extern VmafFeatureExtractor vmaf_fex_integer_motion_cuda;
+extern VmafFeatureExtractor vmaf_fex_integer_psnr_cuda;
 #endif
 extern VmafFeatureExtractor vmaf_fex_null;
 
@@ -79,6 +80,7 @@ static VmafFeatureExtractor *feature_extractor_list[] = {
     &vmaf_fex_integer_adm_cuda,
     &vmaf_fex_integer_vif_cuda,
     &vmaf_fex_integer_motion_cuda,
+    &vmaf_fex_integer_psnr_cuda,
 #endif
     &vmaf_fex_null,
     NULL
diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c
index 3fbb050c5..c35cbe36e 100644
--- a/libvmaf/src/libvmaf.c
+++ b/libvmaf/src/libvmaf.c
@@ -555,7 +555,7 @@ static int translate_picture_host(VmafContext *vmaf, VmafPicture *pic,
         if (!vmaf->cuda.state.ctx)
             return -EINVAL;
         err |= vmaf_ring_buffer_fetch_next_picture(vmaf->cuda.ring_buffer, pic_device);
-        err |= vmaf_cuda_picture_upload_async(pic_device, pic, 0x1);
+        err |= vmaf_cuda_picture_upload_async(pic_device, pic, 0xF);
         if (err) {
             vmaf_log(VMAF_LOG_LEVEL_ERROR,
                     "problem moving host pic into cuda device buffer\n");
@@ -585,7 +585,7 @@ static int translate_picture_device(VmafContext *vmaf, VmafPicture *pic,
         return err;
     }
 
-    err = vmaf_cuda_picture_download_async(pic, pic_host, 0x1);
+    err = vmaf_cuda_picture_download_async(pic, pic_host, 0x3);
     if (err) {
         vmaf_log(VMAF_LOG_LEVEL_ERROR,
                  "problem moving cuda pic into host buffer\n");
diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build
index 94e2f4d7b..e27f87b8f 100644
--- a/libvmaf/src/meson.build
+++ b/libvmaf/src/meson.build
@@ -286,6 +286,7 @@ if is_cuda_enabled
         'adm_decouple' : [feature_src_dir + 'cuda/integer_adm/adm_decouple.cu'],
         'filter1d' : [feature_src_dir + 'cuda/integer_vif/filter1d.cu'],
         'motion_score' : [feature_src_dir + 'cuda/integer_motion/motion_score.cu'],
+        'psnr' : [feature_src_dir + 'cuda/integer_psnr/psnr.cu'],
     }
     message(cuda_cu_sources)
     cuda_sources = [
@@ -323,7 +324,7 @@ if is_cuda_enabled
                 '-I', '../include',
                 '-I', '../src/feature',
                 '-I', '../src/' + cuda_dir,
-            ]
+            ] + cuda_flags
         )
         ptx_files += {name : [t]}
         cuda_sources += _cu
@@ -432,6 +433,7 @@ if is_cuda_enabled
         feature_src_dir + 'cuda/integer_adm_cuda.c',
         feature_src_dir + 'cuda/integer_vif_cuda.c',
         feature_src_dir + 'cuda/integer_motion_cuda.c',
+        feature_src_dir + 'cuda/integer_psnr_cuda.c',
     ]
 endif
 
diff --git a/libvmaf/test/test_ring_buffer.c b/libvmaf/test/test_ring_buffer.c
index abea74d50..803acb598 100644
--- a/libvmaf/test/test_ring_buffer.c
+++ b/libvmaf/test/test_ring_buffer.c
@@ -135,8 +135,8 @@ static void request_picture(void *data)
     //fprintf(stderr, "request: %i\n", my_thread_pool_data->i);
     vmaf_ring_buffer_fetch_next_picture(ring_buffer, &pic_cuda_ref);
     vmaf_ring_buffer_fetch_next_picture(ring_buffer, &pic_cuda_dist);
-    vmaf_cuda_picture_upload_async(&pic_cuda_ref, &pic, 0x1);
-    vmaf_cuda_picture_upload_async(&pic_cuda_dist, &pic, 0x1);
+    vmaf_cuda_picture_upload_async(&pic_cuda_ref, &pic, 0xF);
+    vmaf_cuda_picture_upload_async(&pic_cuda_dist, &pic, 0xF);
     //fprintf(stderr, "usleep=%d: %i\n",
     //        my_thread_pool_data->timeout, my_thread_pool_data->i);
     vmaf_picture_unref(&pic_cuda_ref);

From 09274bff10834c4001824f40d45458912d946022 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <maximilianm@nvidia.com>
Date: Tue, 7 Mar 2023 01:31:29 +0100
Subject: [PATCH 2/5] pick feature extractor by feature

---
 libvmaf/src/libvmaf.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c
index c35cbe36e..cc167376f 100644
--- a/libvmaf/src/libvmaf.c
+++ b/libvmaf/src/libvmaf.c
@@ -275,10 +275,18 @@ int vmaf_use_feature(VmafContext *vmaf, const char *feature_name,
 
     int err = 0;
 
-    VmafFeatureExtractor *fex =
-        vmaf_get_feature_extractor_by_name(feature_name);
-    if (!fex) return -EINVAL;
+    unsigned fex_flags = 0;
 
+#ifdef HAVE_CUDA
+    if (!vmaf->cfg.gpumask && vmaf->cuda.state.ctx)
+        fex_flags |= VMAF_FEATURE_EXTRACTOR_CUDA;
+#endif
+    VmafFeatureExtractor *fex =
+        vmaf_get_feature_extractor_by_feature_name(feature_name, fex_flags);
+    if (!fex) {
+        fex = vmaf_get_feature_extractor_by_name(feature_name);
+        if (!fex) return -EINVAL;
+    }
     VmafDictionary *d = NULL;
     if (s) {
         err = vmaf_dictionary_copy(&s, &d);

From 11f03827b4f0bfdb65d313a7db1728dba720896a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <maximilianm@nvidia.com>
Date: Tue, 7 Mar 2023 10:29:46 +0100
Subject: [PATCH 3/5] simplified event handling

---
 libvmaf/src/feature/cuda/integer_adm_cuda.c    | 17 ++---------------
 libvmaf/src/feature/cuda/integer_motion_cuda.c |  7 +------
 libvmaf/src/feature/cuda/integer_psnr_cuda.c   | 15 +++++----------
 libvmaf/src/feature/cuda/integer_vif_cuda.c    | 13 +++----------
 4 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c
index d8b414436..9dec3c11c 100644
--- a/libvmaf/src/feature/cuda/integer_adm_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c
@@ -855,17 +855,9 @@ static void integer_compute_adm_cuda(VmafFeatureExtractor *fex, AdmStateCuda *s,
             h = (h + 1) / 2;
 
             // This event ensures the input buffer is consumed
-            CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-
             CHECK_CUDA(cuStreamWaitEvent(s->str, s->dis_event, CU_EVENT_WAIT_DEFAULT));
-            CHECK_CUDA(cuEventDestroy(s->dis_event));
-            CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT));
-
             CHECK_CUDA(cuStreamWaitEvent(s->str, s->ref_event, CU_EVENT_WAIT_DEFAULT));
-            CHECK_CUDA(cuEventDestroy(s->ref_event));
-            CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT));
-
-            CHECK_CUDA(cuCtxPopCurrent(NULL));
+            
             // consumes buf->ref_dwt2 , buf->dis_dwt2
             // produces buf->decouple_r , buf->decouple_a
             adm_decouple_device(s, buf, w, h, buf_stride, &p, s->str);
@@ -1154,12 +1146,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
     (void) dist_pic_90;
 
     // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
-    CHECK_CUDA(cuStreamSynchronize(s->str));
-    // CHECK_CUDA(cuEventSynchronize(s->finished));
-    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-    CHECK_CUDA(cuEventDestroy(s->finished));
-    CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
-    CHECK_CUDA(cuCtxPopCurrent(NULL));
+    CHECK_CUDA(cuEventSynchronize(s->finished));
 
     // current implementation is limited by the 16-bit data pipeline, thus
     // cannot handle an angular frequency smaller than 1080p * 3H
diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c
index 9615cde98..f2c50cf22 100644
--- a/libvmaf/src/feature/cuda/integer_motion_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c
@@ -259,12 +259,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
     MotionStateCuda *s = fex->priv;
 
     // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
-    CHECK_CUDA(cuStreamSynchronize(s->str));
-    // CHECK_CUDA(cuEventSynchronize(s->finished));
-    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-    CHECK_CUDA(cuEventDestroy(s->finished));
-    CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
-    CHECK_CUDA(cuCtxPopCurrent(NULL));
+    CHECK_CUDA(cuEventSynchronize(s->finished));
 
     int err = 0;
     (void) dist_pic;
diff --git a/libvmaf/src/feature/cuda/integer_psnr_cuda.c b/libvmaf/src/feature/cuda/integer_psnr_cuda.c
index 5d044c2e3..d121c7c74 100644
--- a/libvmaf/src/feature/cuda/integer_psnr_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_psnr_cuda.c
@@ -204,12 +204,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
     (void) dist_pic_90;
 
     // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
-    CHECK_CUDA(cuStreamSynchronize(s->str));
-    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-    // CHECK_CUDA(cuEventSynchronize(s->finished));
-    CHECK_CUDA(cuEventDestroy(s->finished));
-    CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
-    CHECK_CUDA(cuCtxPopCurrent(NULL));
+    CHECK_CUDA(cuEventSynchronize(s->finished));
 
     // Reset device SSE
     CHECK_CUDA(cuMemsetD8Async(s->sse_device->data, 0, sizeof(uint64_t) * 3, s->str));
@@ -236,10 +231,10 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
     CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
     // This event ensures the input buffer is consumed
     CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
-    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-    CHECK_CUDA(cuEventDestroy(s->event));
-    CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
-    CHECK_CUDA(cuCtxPopCurrent(NULL));
+    // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
+    // CHECK_CUDA(cuEventDestroy(s->event));
+    // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
+    // CHECK_CUDA(cuCtxPopCurrent(NULL));
     
     // Download sad
     // CHECK_CUDA(cuStreamSynchronize(s->host_stream));
diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c
index fcf68f442..6f55f1fa9 100644
--- a/libvmaf/src/feature/cuda/integer_vif_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c
@@ -451,11 +451,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
 
     // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
     // before the GPU has finished writing to it.
-    CHECK_CUDA(cuStreamSynchronize(s->str));
-    CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-    CHECK_CUDA(cuEventDestroy(s->finished));
-    CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
-    CHECK_CUDA(cuCtxPopCurrent(NULL));
+    // CHECK_CUDA(cuStreamSynchronize(s->str));
+    CHECK_CUDA(cuEventSynchronize(s->finished));
 
     CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str));
     CHECK_CUDA(cuStreamWaitEvent(vmaf_cuda_picture_get_stream(ref_pic), vmaf_cuda_picture_get_ready_event(dist_pic), CU_EVENT_WAIT_DEFAULT));
@@ -476,10 +473,6 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
             // This event ensures the input buffer is consumed
             CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
             CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
-            CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-            CHECK_CUDA(cuEventDestroy(s->event));
-            CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
-            CHECK_CUDA(cuCtxPopCurrent(NULL));
         }
     }
 
@@ -496,7 +489,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
     write_score_parameters_vif *data = s->buf.cpu_param_buf;
     data->feature_collector = feature_collector;
     data->index = index;
-    CHECK_CUDA(cuLaunchHostFunc(s->str, write_scores, data));
+    CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data));
     return 0;
 }
 

From 2716d732b01390d4e789dfaf32be25b2619bf8c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <maximilianm@nvidia.com>
Date: Tue, 14 Mar 2023 20:11:16 +0100
Subject: [PATCH 4/5] fix launch dimensions dwt

---
 libvmaf/src/feature/cuda/integer_adm_cuda.c |  1 +
 libvmaf/src/meson.build                     | 17 ++++++++---------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c
index 9dec3c11c..cdf9f482c 100644
--- a/libvmaf/src/feature/cuda/integer_adm_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c
@@ -149,6 +149,7 @@ void adm_dwt2_s123_combined_device(AdmStateCuda *s,const int32_t *d_i4_scale, in
     const int BLOCK_Y = (h + 1) / 2;
     
     void * args_vert[] = {&d_i4_scale, &tmp_buf, &w, &h, &img_stride, &*p};
+    const int num_threads = 128;
     switch (scale) {
         case 1:
             CHECK_CUDA(cuLaunchKernel(s->func_dwt_s123_combined_vert_kernel_0_0_int32_t,
diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build
index e27f87b8f..7a861b61a 100644
--- a/libvmaf/src/meson.build
+++ b/libvmaf/src/meson.build
@@ -302,15 +302,14 @@ if is_cuda_enabled
     cuda_dependency += cuda_rt_api_dependency
 
     cuda_flags = []
-    if get_option('buildtype').startswith('debug')
-        cuda_flags += ['-DCUDA_DEBUG', '-lineinfo']
-    else
-        if is_nvtx_enabled
-                cuda_flags += ['-lineinfo']
-                cuda_dependency += declare_dependency(link_args : ['-lnvToolsExt'])
-        endif
+    if get_option('buildtype') == 'debug'
+        cuda_flags += ['-G']
     endif
-
+    if is_nvtx_enabled
+            cuda_flags += ['-lineinfo']
+            cuda_dependency += declare_dependency(link_args : ['-lnvToolsExt'])
+    endif
+    
     nvcc_exe = find_program('nvcc')
     ptx_files = {}
     foreach name, _cu : cuda_cu_sources
@@ -348,7 +347,7 @@ if is_cuda_enabled
 
     cuda_static_lib = static_library(
         'cuda_common_vmaf_lib',
-        [cuda_sources, ptx_arrays,],
+        cuda_sources + ptx_arrays,
         dependencies: [cuda_drv_api_dependency],
         include_directories : [
             libvmaf_include,

From 4e889f9525fd653fdfc58ebff92807f66ed4606d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <maximilianm@nvidia.com>
Date: Mon, 20 Mar 2023 19:12:24 +0100
Subject: [PATCH 5/5] minor cleanup

---
 libvmaf/src/feature/cuda/integer_adm_cuda.c  |  1 -
 libvmaf/src/feature/cuda/integer_psnr_cuda.c | 10 ++--------
 libvmaf/src/libvmaf.c                        |  2 +-
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c
index cdf9f482c..9dec3c11c 100644
--- a/libvmaf/src/feature/cuda/integer_adm_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c
@@ -149,7 +149,6 @@ void adm_dwt2_s123_combined_device(AdmStateCuda *s,const int32_t *d_i4_scale, in
     const int BLOCK_Y = (h + 1) / 2;
     
     void * args_vert[] = {&d_i4_scale, &tmp_buf, &w, &h, &img_stride, &*p};
-    const int num_threads = 128;
     switch (scale) {
         case 1:
             CHECK_CUDA(cuLaunchKernel(s->func_dwt_s123_combined_vert_kernel_0_0_int32_t,
diff --git a/libvmaf/src/feature/cuda/integer_psnr_cuda.c b/libvmaf/src/feature/cuda/integer_psnr_cuda.c
index d121c7c74..d29bdabcb 100644
--- a/libvmaf/src/feature/cuda/integer_psnr_cuda.c
+++ b/libvmaf/src/feature/cuda/integer_psnr_cuda.c
@@ -23,9 +23,8 @@
 #include <stddef.h>
 #include <string.h>
 
+#include "common.h"
 #include "cuda_helper.cuh"
-#include "libvmaf/vmaf_cuda_state.h"
-#include "picture_cuda.h"
 
 #include "feature_collector.h"
 #include "feature_extractor.h"
@@ -231,13 +230,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
     CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
     // This event ensures the input buffer is consumed
     CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
-    // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
-    // CHECK_CUDA(cuEventDestroy(s->event));
-    // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
-    // CHECK_CUDA(cuCtxPopCurrent(NULL));
     
-    // Download sad
-    // CHECK_CUDA(cuStreamSynchronize(s->host_stream));
+    // Download SSE
     CHECK_CUDA(cuMemcpyDtoHAsync(s->sse_host, (CUdeviceptr)s->sse_device->data,
                 sizeof(uint64_t) * 3, s->str));
     CHECK_CUDA(cuEventRecord(s->finished, s->str));
diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c
index cc167376f..6e0574aca 100644
--- a/libvmaf/src/libvmaf.c
+++ b/libvmaf/src/libvmaf.c
@@ -593,7 +593,7 @@ static int translate_picture_device(VmafContext *vmaf, VmafPicture *pic,
         return err;
     }
 
-    err = vmaf_cuda_picture_download_async(pic, pic_host, 0x3);
+    err = vmaf_cuda_picture_download_async(pic, pic_host, 0xF);
     if (err) {
         vmaf_log(VMAF_LOG_LEVEL_ERROR,
                  "problem moving cuda pic into host buffer\n");