-
Notifications
You must be signed in to change notification settings - Fork 754
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement SAST resizer from FUNQUE #1361
Open
cosmin
wants to merge
1
commit into
Netflix:master
Choose a base branch
from
cosmin:funque-resizer
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
295 changes: 295 additions & 0 deletions
295
libvmaf/src/feature/third_party/funque/arm64/resizer_neon.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,295 @@ | ||
/** | ||
* | ||
* Copyright (c) 2022-2024 Meta, Inc. | ||
* | ||
* Licensed under the BSD 3-Clause License (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://opensource.org/license/bsd-3-clause | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
|
||
#include <stdio.h> | ||
#include <math.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <arm_neon.h> | ||
#include <time.h> | ||
#include "../resizer.h" | ||
#include "resizer_neon.h" | ||
|
||
#if OPTIMISED_COEFF | ||
void hresize_neon(const unsigned char **src, int **dst, int count, | ||
const short *alpha, | ||
int swidth, int dwidth, int cn, int xmin, int xmax) | ||
#else | ||
void hresize_neon(const unsigned char **src, int **dst, int count, | ||
const int *xofs, const short *alpha, | ||
int swidth, int dwidth, int cn, int xmin, int xmax) | ||
#endif | ||
{ | ||
// int first_col_count = 0; | ||
uint8x8_t src1_8x8, src2_8x8, src3_8x8; | ||
int simd_loop = (xmax / 8) * 8; | ||
int num_pix = 8; | ||
|
||
#if OPTIMISED_COEFF | ||
int sx_start = 2; | ||
#else | ||
int sx_start = xofs[1]; | ||
#endif | ||
|
||
for (int k = 0; k < count; k++) | ||
{ | ||
const unsigned char *S = src[k]; | ||
int *D = dst[k]; | ||
int dx = 0, limit = xmin; | ||
for (;;) | ||
{ | ||
#if OPTIMISED_COEFF | ||
for (; dx < limit; dx++) | ||
{ | ||
int j; | ||
int sx = (dx * 2) - cn; | ||
#else | ||
for (; dx < limit; dx++, alpha += 4) | ||
{ | ||
int j; | ||
int sx = xofs[dx] - cn; | ||
#endif | ||
int v = 0; | ||
for (j = 0; j < 4; j++) | ||
{ | ||
int sxj = sx + j * cn; | ||
if ((unsigned)sxj >= (unsigned)swidth) | ||
{ | ||
while (sxj < 0) | ||
sxj += cn; | ||
while (sxj >= swidth) | ||
sxj -= cn; | ||
} | ||
v += S[sxj] * alpha[j]; | ||
} | ||
D[dx] = v; | ||
} | ||
if (limit == dwidth) | ||
break; | ||
|
||
int start = sx_start - cn; | ||
src1_8x8 = vld1_u8(S + start); | ||
#if OPTIMISED_COEFF | ||
for (; dx < simd_loop;) | ||
{ | ||
#else | ||
for (; dx < simd_loop; alpha += 32) | ||
{ | ||
#endif | ||
start += num_pix; | ||
src2_8x8 = vld1_u8(S + start); | ||
start += num_pix; | ||
src3_8x8 = vld1_u8(S + start); | ||
|
||
uint16x8_t movl1_16x8 = vmovl_u8(src1_8x8); | ||
uint16x8_t movl2_16x8 = vmovl_u8(src2_8x8); | ||
uint16x8_t movl3_16x8 = vmovl_u8(src3_8x8); | ||
int16x8_t s_movl1_16x8 = vreinterpretq_s16_u16(movl1_16x8); | ||
int16x8_t s_movl2_16x8 = vreinterpretq_s16_u16(movl2_16x8); | ||
int16x8_t s_movl3_16x8 = vreinterpretq_s16_u16(movl3_16x8); | ||
int16x8x2_t t1 = vuzpq_s16(s_movl1_16x8, s_movl2_16x8); // 0 odd, 1 even | ||
int16x8x2_t t2 = vuzpq_s16(s_movl3_16x8, s_movl3_16x8); | ||
int16x8_t vx1 = vextq_s16(t1.val[0], t2.val[0], 1); // s_movl3_16x8,1); | ||
int16x8_t vx2 = vextq_s16(t1.val[1], t2.val[1], 1); | ||
int32x4_t m1_l = vmull_n_s16(vget_low_s16(t1.val[0]), alpha[0]); | ||
int32x4_t m1_h = vmull_n_s16(vget_high_s16(t1.val[0]), alpha[0]); | ||
int32x4_t m2_l = vmlal_n_s16(m1_l, vget_low_s16(vx1), alpha[1]); | ||
int32x4_t m2_h = vmlal_n_s16(m1_h, vget_high_s16(vx1), alpha[1]); | ||
int32x4_t m3_l = vmlal_n_s16(m2_l, vget_low_s16(t1.val[1]), alpha[2]); | ||
int32x4_t m3_h = vmlal_n_s16(m2_h, vget_high_s16(t1.val[1]), alpha[2]); | ||
int32x4_t out_l = vmlal_n_s16(m3_l, vget_low_s16(vx2), alpha[3]); // final out | ||
int32x4_t out_h = vmlal_n_s16(m3_h, vget_high_s16(vx2), alpha[3]); // final out | ||
|
||
vst1q_s32(D + dx, out_l); | ||
dx += 4; | ||
vst1q_s32(D + dx, out_h); | ||
dx += 4; | ||
src1_8x8 = src3_8x8; | ||
} | ||
|
||
#if OPTIMISED_COEFF | ||
for (; dx < xmax; dx++) | ||
{ | ||
int sx2 = dx * 2; | ||
#else | ||
for (; dx < xmax; dx++, alpha += 4) | ||
{ | ||
int sx2 = xofs[dx]; // sx - 2, 4, 6, 8.... | ||
#endif | ||
D[dx] = S[sx2 - 1] * alpha[0] + S[sx2] * alpha[1] + S[sx2 + 1] * alpha[2] + S[sx2 + 2] * alpha[3]; | ||
} | ||
limit = dwidth; | ||
} | ||
#if !OPTIMISED_COEFF | ||
alpha -= dwidth * 4; | ||
#endif | ||
} | ||
} | ||
|
||
void vresize_neon(const int **src, unsigned char *dst, const short *beta, int width) | ||
{ | ||
int32x4_t src_1, src_2, src_3, src_4, src_1_mul; | ||
int32x4_t d4_q; | ||
int32x4_t add_1; | ||
int32x4_t add_delta; | ||
int32x4_t shift_right_32x4; | ||
uint16x4_t shift_right_16x4; | ||
uint16x8_t shift_right_16x8; | ||
int32x4_t dt; | ||
uint8x8_t dt2; | ||
|
||
|
||
#define BITS 22 | ||
int bits = BITS; | ||
|
||
// int32x4_t SHIFT = vdupq_n_s32(bits); | ||
int DELTA = (1 << (bits - 1)); | ||
// b1_vq = vdupq_n_s32(beta[0]); | ||
// b2_vq = vdupq_n_s32(beta[1]); | ||
// b3_vq = vdupq_n_s32(beta[2]); | ||
// b4_vq = vdupq_n_s32(beta[3]); | ||
d4_q = vdupq_n_s32(DELTA); | ||
src_1_mul = vdupq_n_s32(0); | ||
|
||
int32x4_t lower = vdupq_n_s32(0); | ||
int32x4_t higher = vdupq_n_s32(255); | ||
|
||
for (int x = 0; x < width; x += 4) | ||
{ | ||
src_1 = vld1q_s32(src[0] + x); | ||
src_2 = vld1q_s32(src[1] + x); | ||
src_3 = vld1q_s32(src[2] + x); | ||
src_4 = vld1q_s32(src[3] + x); | ||
|
||
add_1 = vmlaq_n_s32(src_1_mul, src_1, beta[0]); | ||
add_1 = vmlaq_n_s32(add_1, src_2, beta[1]); | ||
add_1 = vmlaq_n_s32(add_1, src_3, beta[2]); | ||
add_1 = vmlaq_n_s32(add_1, src_4, beta[3]); | ||
|
||
add_delta = vaddq_s32(add_1, d4_q); | ||
|
||
shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4 | ||
|
||
dt = vminq_s32(shift_right_32x4, higher); | ||
dt = vmaxq_s32(dt, lower); | ||
|
||
// shift_right_32x4 = vshrq_n_s32(add_delta, BITS); // 32x4 | ||
|
||
shift_right_16x4 = vqmovun_s32(dt); // 16x4 | ||
shift_right_16x8 = vcombine_u16(shift_right_16x4, shift_right_16x4); // 16x8 | ||
dt2 = vqmovn_u16(shift_right_16x8); // 8x8 | ||
|
||
vst1_lane_u32((unsigned int *)(dst + x), vreinterpret_u32_u8(dt2), 0); | ||
} | ||
|
||
#undef BITS | ||
} | ||
|
||
static int clip_neon(int x, int a, int b) | ||
{ | ||
return x >= a ? (x < b ? x : b - 1) : a; | ||
} | ||
|
||
#if OPTIMISED_COEFF | ||
void step_neon(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) | ||
#else | ||
void step_neon(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) | ||
#endif | ||
{ | ||
int dy, cn = channels; | ||
|
||
int bufstep = (int)((dwidth + 16 - 1) & -16); | ||
int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); | ||
if (_buffer == NULL) | ||
{ | ||
printf("malloc fails\n"); | ||
} | ||
const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | ||
int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | ||
int prev_sy[MAX_ESIZE]; | ||
|
||
for (int k = 0; k < ksize; k++) | ||
{ | ||
prev_sy[k] = -1; | ||
rows[k] = _buffer + bufstep * k; | ||
} | ||
|
||
#if !OPTIMISED_COEFF | ||
const short *beta = _beta + ksize * start; | ||
#endif | ||
|
||
|
||
#if OPTIMISED_COEFF | ||
for (dy = start; dy < end; dy++) | ||
{ | ||
int sy0 = dy * 2; | ||
#else | ||
for (dy = start; dy < end; dy++, beta += ksize) | ||
{ | ||
int sy0 = yofs[dy]; | ||
#endif | ||
int k0 = ksize, k1 = 0, ksize2 = ksize / 2; | ||
|
||
for (int k = 0; k < ksize; k++) | ||
{ | ||
int sy = clip_neon(sy0 - ksize2 + 1 + k, 0, iheight); | ||
for (k1 = MAX(k1, k); k1 < ksize; k1++) | ||
{ | ||
if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. | ||
{ | ||
if (k1 > k) | ||
memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); | ||
break; | ||
} | ||
} | ||
if (k1 == ksize) | ||
k0 = MIN(k0, k); // remember the first row that needs to be computed | ||
srows[k] = _src + (sy * iwidth); | ||
prev_sy[k] = sy; | ||
} | ||
|
||
|
||
|
||
#if OPTIMISED_COEFF | ||
if (k0 < ksize) | ||
{ | ||
hresize_neon((srows + k0), (rows + k0), ksize - k0, _alpha, | ||
iwidth, dwidth, cn, xmin, xmax); | ||
} | ||
#if USE_C_VRESIZE | ||
vresize((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); | ||
#elif !USE_C_VRESIZE | ||
vresize_neon((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); | ||
#endif | ||
#else | ||
if (k0 < ksize) | ||
{ | ||
hresize_neon((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, | ||
iwidth, dwidth, cn, xmin, xmax); | ||
} | ||
#if USE_C_VRESIZE | ||
vresize((const int **)rows, (_dst + dwidth * dy), beta, dwidth); | ||
#elif !USE_C_VRESIZE | ||
vresize_neon((const int **)rows, (_dst + dwidth * dy), beta, dwidth); | ||
#endif | ||
#endif | ||
|
||
} | ||
|
||
free(_buffer); | ||
} |
30 changes: 30 additions & 0 deletions
30
libvmaf/src/feature/third_party/funque/arm64/resizer_neon.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
/** | ||
* | ||
* Copyright (c) 2022-2024 Meta, Inc. | ||
* | ||
* Licensed under the BSD 3-Clause License (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://opensource.org/license/bsd-3-clause | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
|
||
#if OPTIMISED_COEFF | ||
void step_neon(const unsigned char *_src, unsigned char *_dst, | ||
const short *_alpha, const short *_beta, | ||
int iwidth, int iheight, int dwidth, int channels, | ||
int ksize, int start, int end, int xmin, int xmax); | ||
#else | ||
void step_neon(const unsigned char *_src, unsigned char *_dst, | ||
const int *xofs, const int *yofs, | ||
const short *_alpha, const short *_beta, | ||
int iwidth, int iheight, int dwidth, int dheight, int channels, | ||
int ksize, int start, int end, int xmin, int xmax); | ||
#endif |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer specifying this without
..
if possible - I think it can be converted to#include "resizer.h"
directly with no other changes?