Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement SAST resizer from FUNQUE #1361

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions libvmaf/meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,9 @@ option('enable_nvtx',
type: 'boolean',
value: false,
description: 'Enable NVTX range support')

# Opt-in build switch (off by default) for the SAST resizer taken from FUNQUE.
option('enable_sast_resizer',
type: 'boolean',
value: false,
description: 'Compile the SAST resizer from FUNQUE into the library')

295 changes: 295 additions & 0 deletions libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
/**
*
* Copyright (c) 2022-2024 Meta, Inc.
*
* Licensed under the BSD 3-Clause License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/license/bsd-3-clause
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <arm_neon.h>
#include <time.h>
#include "../resizer.h"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer specifying this include without the `..` path component if possible. I think it can be changed to `#include "resizer.h"` directly, with no other changes?

#include "resizer_neon.h"

/**
 * Horizontal pass of the resizer (NEON).
 *
 * For each of the `count` rows, reads 8-bit samples from src[k] and writes
 * `dwidth` 32-bit results to dst[k]. Every output column is a 4-tap weighted
 * sum of source samples using the 16-bit weights in `alpha`.
 *
 * With OPTIMISED_COEFF the source position of output column dx is dx * 2 and
 * `alpha` is never advanced (the same four taps serve every column).
 * Otherwise the source position comes from xofs[dx] and `alpha` holds four
 * taps per output column.
 *
 * Columns [0, xmin) and [xmax, dwidth) take the border path, which moves
 * out-of-range tap indices back into [0, swidth) in steps of `cn`.
 * Columns [xmin, xmax) take the fast path: NEON, 8 columns per iteration,
 * then a scalar remainder loop.
 *
 * @param src    array of `count` source row pointers (8-bit samples)
 * @param dst    array of `count` destination row pointers (32-bit results)
 * @param count  number of rows to process
 * @param xofs   (non-OPTIMISED_COEFF only) source index for each output column
 * @param alpha  horizontal filter taps
 * @param swidth source row width, used for the border range check
 * @param dwidth number of output columns
 * @param cn     distance between neighbouring taps (the caller passes its
 *               channel count)
 * @param xmin   first output column handled by the fast path
 * @param xmax   one past the last output column handled by the fast path
 */
#if OPTIMISED_COEFF
void hresize_neon(const unsigned char **src, int **dst, int count,
const short *alpha,
int swidth, int dwidth, int cn, int xmin, int xmax)
#else
void hresize_neon(const unsigned char **src, int **dst, int count,
const int *xofs, const short *alpha,
int swidth, int dwidth, int cn, int xmin, int xmax)
#endif
{
// int first_col_count = 0;
uint8x8_t src1_8x8, src2_8x8, src3_8x8;
// xmax rounded down to a multiple of 8: upper bound for the 8-column NEON loop.
int simd_loop = (xmax / 8) * 8;
// Each vector load below covers 8 source bytes.
int num_pix = 8;

// Source index that belongs to output column 1; the first NEON load starts
// `cn` samples before it.
#if OPTIMISED_COEFF
int sx_start = 2;
#else
int sx_start = xofs[1];
#endif

for (int k = 0; k < count; k++)
{
const unsigned char *S = src[k];
int *D = dst[k];
int dx = 0, limit = xmin;
// Runs at most twice per row: first with limit == xmin (left border, then
// the fast path), then with limit == dwidth (right border), after which the
// `limit == dwidth` test below exits.
for (;;)
{
// Border columns: four taps at sx, sx + cn, sx + 2*cn, sx + 3*cn.
#if OPTIMISED_COEFF
for (; dx < limit; dx++)
{
int j;
int sx = (dx * 2) - cn;
#else
for (; dx < limit; dx++, alpha += 4)
{
int j;
int sx = xofs[dx] - cn;
#endif
int v = 0;
for (j = 0; j < 4; j++)
{
int sxj = sx + j * cn;
// The unsigned compare catches both sxj < 0 and sxj >= swidth in one test.
if ((unsigned)sxj >= (unsigned)swidth)
{
// Step the index back inside the row, `cn` samples at a time.
while (sxj < 0)
sxj += cn;
while (sxj >= swidth)
sxj -= cn;
}
v += S[sxj] * alpha[j];
}
D[dx] = v;
}
if (limit == dwidth)
break;

// Fast path. Prime the pipeline with the first 8 source bytes.
// NOTE(review): `start` is derived from sx_start (output column 1), not from
// the current dx, so this looks like it assumes dx == 1 here, i.e. xmin == 1
// -- confirm against the caller.
int start = sx_start - cn;
src1_8x8 = vld1_u8(S + start);
#if OPTIMISED_COEFF
for (; dx < simd_loop;)
{
#else
for (; dx < simd_loop; alpha += 32)
{
#endif
// Load the next 16 source bytes; together with src1 that is 24 bytes, from
// which 8 output columns are produced.
// NOTE(review): src3 is read as a full 8 bytes although only its first
// de-interleaved lane feeds the vextq results below -- confirm the source
// row has enough readable bytes near its end.
start += num_pix;
src2_8x8 = vld1_u8(S + start);
start += num_pix;
src3_8x8 = vld1_u8(S + start);

// Widen to 16 bits and reinterpret as signed, so the signed widening
// multiplies can be used with the signed taps.
uint16x8_t movl1_16x8 = vmovl_u8(src1_8x8);
uint16x8_t movl2_16x8 = vmovl_u8(src2_8x8);
uint16x8_t movl3_16x8 = vmovl_u8(src3_8x8);
int16x8_t s_movl1_16x8 = vreinterpretq_s16_u16(movl1_16x8);
int16x8_t s_movl2_16x8 = vreinterpretq_s16_u16(movl2_16x8);
int16x8_t s_movl3_16x8 = vreinterpretq_s16_u16(movl3_16x8);
// De-interleave: val[0] takes elements 0,2,4,... of the loaded run and val[1]
// takes elements 1,3,5,... (the original "odd/even" note presumably refers to
// source positions, given an odd starting index).
int16x8x2_t t1 = vuzpq_s16(s_movl1_16x8, s_movl2_16x8); // 0 odd, 1 even
int16x8x2_t t2 = vuzpq_s16(s_movl3_16x8, s_movl3_16x8);
// Shift each de-interleaved vector left by one lane, pulling the next sample
// in from src3; these supply the second and fourth taps.
int16x8_t vx1 = vextq_s16(t1.val[0], t2.val[0], 1); // s_movl3_16x8,1);
int16x8_t vx2 = vextq_s16(t1.val[1], t2.val[1], 1);
// Tap order per output column: t1.val[0], vx1, t1.val[1], vx2.
// NOTE(review): all 8 columns use alpha[0..3], although the non-optimised
// variant advances alpha by 32 per iteration -- this appears to assume every
// column in the fast path shares the same taps and a 2:1 source stride with
// cn == 1; confirm.
int32x4_t m1_l = vmull_n_s16(vget_low_s16(t1.val[0]), alpha[0]);
int32x4_t m1_h = vmull_n_s16(vget_high_s16(t1.val[0]), alpha[0]);
int32x4_t m2_l = vmlal_n_s16(m1_l, vget_low_s16(vx1), alpha[1]);
int32x4_t m2_h = vmlal_n_s16(m1_h, vget_high_s16(vx1), alpha[1]);
int32x4_t m3_l = vmlal_n_s16(m2_l, vget_low_s16(t1.val[1]), alpha[2]);
int32x4_t m3_h = vmlal_n_s16(m2_h, vget_high_s16(t1.val[1]), alpha[2]);
int32x4_t out_l = vmlal_n_s16(m3_l, vget_low_s16(vx2), alpha[3]); // final out
int32x4_t out_h = vmlal_n_s16(m3_h, vget_high_s16(vx2), alpha[3]); // final out

// Store 8 results. The last vector loaded becomes the first one of the next
// iteration.
// NOTE(review): each iteration writes 8 columns as long as dx < simd_loop on
// entry, so dx can end up above simd_loop -- confirm it cannot pass xmax.
vst1q_s32(D + dx, out_l);
dx += 4;
vst1q_s32(D + dx, out_h);
dx += 4;
src1_8x8 = src3_8x8;
}

// Scalar remainder of the fast path: no range checks, taps at sx2 - 1 .. sx2 + 2.
#if OPTIMISED_COEFF
for (; dx < xmax; dx++)
{
int sx2 = dx * 2;
#else
for (; dx < xmax; dx++, alpha += 4)
{
int sx2 = xofs[dx]; // sx - 2, 4, 6, 8....
#endif
D[dx] = S[sx2 - 1] * alpha[0] + S[sx2] * alpha[1] + S[sx2 + 1] * alpha[2] + S[sx2 + 2] * alpha[3];
}
// Second pass of the enclosing loop handles the right border columns.
limit = dwidth;
}
#if !OPTIMISED_COEFF
// Rewind alpha by 4 taps * dwidth columns so the next row starts from the
// first column's taps again.
alpha -= dwidth * 4;
#endif
}
}

/**
 * Vertical pass of the resizer (NEON): blends four rows of 32-bit
 * intermediate samples into one row of 8-bit pixels.
 *
 * For every column x in [0, width):
 *
 *   dst[x] = clamp((src[0][x] * beta[0] + src[1][x] * beta[1] +
 *                   src[2][x] * beta[2] + src[3][x] * beta[3] +
 *                   (1 << 21)) >> 22, 0, 255)
 *
 * Exactly `width` bytes of `dst` are written and exactly `width` ints of each
 * source row are read. Columns are processed four at a time with NEON; any
 * remaining 1-3 columns are handled by a scalar tail, so a width that is not
 * a multiple of 4 no longer writes past the end of the output row.
 *
 * @param src   four source row pointers
 * @param dst   destination row
 * @param beta  four vertical filter taps
 * @param width number of columns to produce
 */
void vresize_neon(const int **src, unsigned char *dst, const short *beta, int width)
{
    /* Fixed-point scale of the accumulated sum; must be a compile-time
     * constant because vshrq_n_s32 takes an immediate shift count. */
    enum { VRESIZE_SHIFT = 22 };

    const int32_t rounding = (int32_t)1 << (VRESIZE_SHIFT - 1);
    const int32x4_t rounding_q = vdupq_n_s32(rounding);
    const int32x4_t zero_q = vdupq_n_s32(0);
    const int32x4_t max_q = vdupq_n_s32(255);
    int x = 0;

    /* Vector body: only full groups of four columns. */
    for (; width - x >= 4; x += 4)
    {
        int32x4_t acc = vmlaq_n_s32(zero_q, vld1q_s32(src[0] + x), beta[0]);
        acc = vmlaq_n_s32(acc, vld1q_s32(src[1] + x), beta[1]);
        acc = vmlaq_n_s32(acc, vld1q_s32(src[2] + x), beta[2]);
        acc = vmlaq_n_s32(acc, vld1q_s32(src[3] + x), beta[3]);

        /* Round to nearest, drop the fixed-point fraction, clamp to 0..255. */
        acc = vaddq_s32(acc, rounding_q);
        acc = vshrq_n_s32(acc, VRESIZE_SHIFT);
        acc = vmaxq_s32(vminq_s32(acc, max_q), zero_q);

        /* Narrow 32 -> 16 -> 8 bits; the four pixels land in the low 32 bits
         * of the 8-byte vector and are stored with a single lane store. */
        uint16x4_t narrow16 = vqmovun_s32(acc);
        uint8x8_t narrow8 = vqmovn_u16(vcombine_u16(narrow16, narrow16));
        vst1_lane_u32((uint32_t *)(dst + x), vreinterpret_u32_u8(narrow8), 0);
    }

    /* Scalar tail for the last (width % 4) columns. The sum is accumulated in
     * unsigned arithmetic so it wraps modulo 2^32 like the vector lanes do;
     * the conversion back to int32_t and the arithmetic right shift of a
     * negative value rely on the usual two's-complement compiler behaviour. */
    for (; x < width; x++)
    {
        uint32_t acc = (uint32_t)rounding;
        for (int k = 0; k < 4; k++)
        {
            acc += (uint32_t)src[k][x] * (uint32_t)(int32_t)beta[k];
        }
        int32_t v = (int32_t)acc >> VRESIZE_SHIFT;
        if (v < 0)
        {
            v = 0;
        }
        else if (v > 255)
        {
            v = 255;
        }
        dst[x] = (unsigned char)v;
    }
}

/**
 * Clamps x to the half-open range [a, b): values below a become a, values at
 * or above b become b - 1, everything else is returned unchanged.
 */
static int clip_neon(int x, int a, int b)
{
    if (x < a)
    {
        return a;
    }
    if (x >= b)
    {
        return b - 1;
    }
    return x;
}

/**
 * Produces output rows [start, end) of the resized image.
 *
 * For every output row dy the function works out which `ksize` source rows
 * feed it (centred on dy * 2 with OPTIMISED_COEFF, on yofs[dy] otherwise, and
 * clamped to [0, iheight)), runs the horizontal pass (hresize_neon) on the
 * rows that are not already in the scratch buffer from the previous output
 * row, and then runs the vertical pass to write `dwidth` bytes at
 * _dst + dwidth * dy.
 *
 * The function allocates a scratch buffer of `ksize` rows of 32-bit
 * intermediates and frees it before returning. If the allocation fails, or
 * `ksize` does not fit the fixed-size row tables, it returns without writing
 * any output.
 *
 * @param _src     source image, 8-bit samples, rows `iwidth` bytes apart
 * @param _dst     destination image, rows `dwidth` bytes apart
 * @param xofs     (non-OPTIMISED_COEFF only) source column per output column
 * @param yofs     (non-OPTIMISED_COEFF only) source row per output row
 * @param _alpha   horizontal filter taps, forwarded to hresize_neon
 * @param _beta    vertical filter taps; with OPTIMISED_COEFF the same taps are
 *                 used for every row, otherwise `ksize` taps per output row
 * @param iwidth   source row width
 * @param iheight  number of source rows
 * @param dwidth   destination row width
 * @param dheight  (non-OPTIMISED_COEFF only) not used by this function
 * @param channels forwarded to hresize_neon as the tap spacing
 * @param ksize    number of source rows per output row; at most 16 and at
 *                 most MAX_ESIZE
 * @param start    first output row to produce
 * @param end      one past the last output row to produce
 * @param xmin     forwarded to hresize_neon
 * @param xmax     forwarded to hresize_neon
 */
#if OPTIMISED_COEFF
void step_neon(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax)
#else
void step_neon(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax)
#endif
{
    enum { STEP_MAX_ROWS = 16 };

    const int cn = channels;
    /* Scratch row stride: dwidth rounded up to a multiple of 16 ints. */
    const int bufstep = (int)((dwidth + 16 - 1) & -16);
    const unsigned char *srows[STEP_MAX_ROWS] = {0};
    int *rows[STEP_MAX_ROWS] = {0};
    int prev_sy[MAX_ESIZE];

    /* srows/rows/prev_sy are fixed-size; a larger ksize would overflow them. */
    if (ksize > STEP_MAX_ROWS || ksize > MAX_ESIZE)
    {
        return;
    }

    int *scratch = malloc((size_t)bufstep * (size_t)ksize * sizeof(*scratch));
    if (scratch == NULL)
    {
        printf("malloc fails\n");
        /* Nothing can be computed without the scratch rows; bail out instead
         * of dereferencing a null buffer below. */
        return;
    }

    for (int k = 0; k < ksize; k++)
    {
        prev_sy[k] = -1; /* no source row cached in this slot yet */
        rows[k] = scratch + bufstep * k;
    }

    for (int dy = start; dy < end; dy++)
    {
#if OPTIMISED_COEFF
        const int sy0 = dy * 2;
        const short *row_beta = _beta;
#else
        const int sy0 = yofs[dy];
        const short *row_beta = _beta + ksize * dy;
#endif
        int k0 = ksize, k1 = 0, ksize2 = ksize / 2;

        for (int k = 0; k < ksize; k++)
        {
            int sy = clip_neon(sy0 - ksize2 + 1 + k, 0, iheight);
            for (k1 = MAX(k1, k); k1 < ksize; k1++)
            {
                /* If source row sy was already filtered for the previous
                 * output row, reuse it instead of recomputing. */
                if (k1 < MAX_ESIZE && sy == prev_sy[k1])
                {
                    if (k1 > k)
                    {
                        memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0]));
                    }
                    break;
                }
            }
            if (k1 == ksize)
            {
                k0 = MIN(k0, k); /* first slot that must be recomputed */
            }
            srows[k] = _src + (sy * iwidth);
            prev_sy[k] = sy;
        }

        /* Horizontal pass for slots k0 .. ksize - 1. */
        if (k0 < ksize)
        {
#if OPTIMISED_COEFF
            hresize_neon((srows + k0), (rows + k0), ksize - k0, _alpha,
                         iwidth, dwidth, cn, xmin, xmax);
#else
            hresize_neon((srows + k0), (rows + k0), ksize - k0, xofs, _alpha,
                         iwidth, dwidth, cn, xmin, xmax);
#endif
        }

        /* Vertical pass: combine the buffered rows into output row dy. */
#if USE_C_VRESIZE
        vresize((const int **)rows, (_dst + dwidth * dy), row_beta, dwidth);
#else
        vresize_neon((const int **)rows, (_dst + dwidth * dy), row_beta, dwidth);
#endif
    }

    free(scratch);
}
30 changes: 30 additions & 0 deletions libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
*
* Copyright (c) 2022-2024 Meta, Inc.
*
* Licensed under the BSD 3-Clause License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/license/bsd-3-clause
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#ifndef FUNQUE_ARM64_RESIZER_NEON_H_
#define FUNQUE_ARM64_RESIZER_NEON_H_

/**
 * NEON implementation of one resizer step: produces output rows
 * [start, end) of the destination image.
 *
 * The signature depends on OPTIMISED_COEFF, which must already be defined
 * when this header is included (presumably by resizer.h -- confirm). With
 * OPTIMISED_COEFF set, source positions are derived from the output index
 * (index * 2) and no offset tables are passed. Otherwise `xofs` / `yofs` give
 * the source column / row for each output column / row.
 *
 * @param _src     source image, 8-bit samples, rows `iwidth` bytes apart
 * @param _dst     destination image, rows `dwidth` bytes apart
 * @param xofs     source column per output column (non-OPTIMISED_COEFF only)
 * @param yofs     source row per output row (non-OPTIMISED_COEFF only)
 * @param _alpha   horizontal filter taps
 * @param _beta    vertical filter taps
 * @param iwidth   source row width
 * @param iheight  number of source rows
 * @param dwidth   destination row width
 * @param dheight  destination height (non-OPTIMISED_COEFF only)
 * @param channels channel count
 * @param ksize    number of source rows combined per output row
 * @param start    first output row to produce
 * @param end      one past the last output row to produce
 * @param xmin     first output column handled without border clamping
 * @param xmax     one past the last output column handled without border
 *                 clamping
 */
#if OPTIMISED_COEFF
void step_neon(const unsigned char *_src, unsigned char *_dst,
               const short *_alpha, const short *_beta,
               int iwidth, int iheight, int dwidth, int channels,
               int ksize, int start, int end, int xmin, int xmax);
#else
void step_neon(const unsigned char *_src, unsigned char *_dst,
               const int *xofs, const int *yofs,
               const short *_alpha, const short *_beta,
               int iwidth, int iheight, int dwidth, int dheight, int channels,
               int ksize, int start, int end, int xmin, int xmax);
#endif

#endif /* FUNQUE_ARM64_RESIZER_NEON_H_ */
Loading