Skip to content

Commit 62a8ae5

Browse files
authored
Use int16 min/max for _mm_set_epi16() calls (#395)
* Use int16 min/max for _mm_set_epi16() calls. This fixes a bug that causes g0 overflows when compiled on Intel processors and run on AMD Ryzen processors. According to Intel, the `_mm_set_epi16()` function takes signed shorts (int16): https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_set_epi16&ig_expand=6355,6355 But 0-65535 is the range of uint16, not int16, so 65535 overflows to -1. The compiler warns about this: `implicit conversion from int to short changes value from 65535 to -1` For whatever reason, compiling and running on an AMD Ryzen chip succeeded fine. Compiling and running on an Intel chip also worked fine. But compiling on Intel and running on AMD caused an exception. Using the int16 range fixes the problem. However, this is not my skill set and I am not sure if this is the right fix. Please verify. Thank you! * Update sse-motion.cc: _mm_set_epi16 use 0, -1
1 parent 336272b commit 62a8ae5

File tree

1 file changed

+18
-18
lines changed

1 file changed

+18
-18
lines changed

libde265/x86/sse-motion.cc

+18-18
Original file line number | Diff line number | Diff line change
@@ -3527,9 +3527,9 @@ void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
35273527
r0 = _mm_srli_epi32(r0, 6);
35283528

35293529
r1 = _mm_and_si128(r1,
3530-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3530+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
35313531
r0 = _mm_and_si128(r0,
3532-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3532+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
35333533
r0 = _mm_hadd_epi16(r0, r1);
35343534
_mm_store_si128((__m128i *) &dst[x], r0);
35353535

@@ -3710,9 +3710,9 @@ void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
37103710
r0 = _mm_srli_epi32(r0, 6);
37113711

37123712
r1 = _mm_and_si128(r1,
3713-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3713+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
37143714
r0 = _mm_and_si128(r0,
3715-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3715+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
37163716
r0 = _mm_hadd_epi16(r0, r1);
37173717
_mm_store_si128((__m128i *) &dst[x], r0);
37183718

@@ -3887,9 +3887,9 @@ void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
38873887
r0 = _mm_srli_epi32(r0, 6);
38883888

38893889
r1 = _mm_and_si128(r1,
3890-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3890+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
38913891
r0 = _mm_and_si128(r0,
3892-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
3892+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
38933893
r0 = _mm_hadd_epi16(r0, r1);
38943894
_mm_store_si128((__m128i *) &dst[x], r0);
38953895

@@ -4058,9 +4058,9 @@ void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
40584058
r0 = _mm_srli_epi32(r0, 6);
40594059

40604060
r1 = _mm_and_si128(r1,
4061-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4061+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
40624062
r0 = _mm_and_si128(r0,
4063-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4063+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
40644064
r0 = _mm_hadd_epi16(r0, r1);
40654065
_mm_store_si128((__m128i *) &dst[x], r0);
40664066

@@ -4241,9 +4241,9 @@ void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
42414241
r0 = _mm_srli_epi32(r0, 6);
42424242

42434243
r1 = _mm_and_si128(r1,
4244-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4244+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
42454245
r0 = _mm_and_si128(r0,
4246-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4246+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
42474247
r0 = _mm_hadd_epi16(r0, r1);
42484248
_mm_store_si128((__m128i *) &dst[x], r0);
42494249

@@ -4419,9 +4419,9 @@ void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
44194419
r0 = _mm_srli_epi32(r0, 6);
44204420

44214421
r1 = _mm_and_si128(r1,
4422-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4422+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
44234423
r0 = _mm_and_si128(r0,
4424-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4424+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
44254425
r0 = _mm_hadd_epi16(r0, r1);
44264426
_mm_store_si128((__m128i *) &dst[x], r0);
44274427

@@ -4592,9 +4592,9 @@ void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
45924592
r0 = _mm_srli_epi32(r0, 6);
45934593

45944594
r1 = _mm_and_si128(r1,
4595-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4595+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
45964596
r0 = _mm_and_si128(r0,
4597-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4597+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
45984598
r0 = _mm_hadd_epi16(r0, r1);
45994599
_mm_store_si128((__m128i *) &dst[x], r0);
46004600

@@ -4778,9 +4778,9 @@ void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
47784778
r0 = _mm_srli_epi32(r0, 6);
47794779

47804780
r1 = _mm_and_si128(r1,
4781-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4781+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
47824782
r0 = _mm_and_si128(r0,
4783-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4783+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
47844784
r0 = _mm_hadd_epi16(r0, r1);
47854785
_mm_store_si128((__m128i *) &dst[x], r0);
47864786

@@ -4958,9 +4958,9 @@ void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
49584958
r0 = _mm_srli_epi32(r0, 6);
49594959

49604960
r1 = _mm_and_si128(r1,
4961-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4961+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
49624962
r0 = _mm_and_si128(r0,
4963-
_mm_set_epi16(0, 65535, 0, 65535, 0, 65535, 0, 65535));
4963+
_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1));
49644964
r0 = _mm_hadd_epi16(r0, r1);
49654965
_mm_store_si128((__m128i *) &dst[x], r0);
49664966

0 commit comments

Comments
 (0)