Skip to content

Commit 7a3b76d

Browse files
authored
simd.h: renaming and fixing of shuffle template (#4739)
While I was investigating some SonarQube warnings about our simd `shuffle<>` templates (which are false positives, and I have a separate PR to simply silence them), I did get to thinking about the naming, and want to make a couple of changes: * I decided that the 1-template-argument version of this function, `shuffle<int i>(simd_type)`, would actually be more clear and self-documenting if renamed `broadcast_element` to emphasize that it is taking just one simd lane/element and broadcasting it to all lanes. (The multi-argument shuffle really is doing a true shuffle, giving an index for each lane to make a permutation or swizzle, so I'm not renaming that one.) To avoid breaking source compatibility, I am leaving the old name as well, as a synonym, but commenting it as deprecated, and I will phase out its use. It will disappear entirely from a future OIIO version in which it is safe to break compatibility. * For 16-wide simd, the 1-arg template we called shuffle was not doing the same operation -- it was replicating one group of 4 elements instead of a single element. We didn't use it anywhere, so I'm redefining it to do the analogous thing to what it does for 4-wide and 8-wide. Signed-off-by: Larry Gritz <[email protected]>
1 parent c7b08d7 commit 7a3b76d

File tree

4 files changed

+129
-71
lines changed

4 files changed

+129
-71
lines changed

src/include/OpenImageIO/simd.h

Lines changed: 98 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -614,8 +614,9 @@ class vbool4 {
614614
template<int i0, int i1, int i2, int i3>
615615
OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
616616

617-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
618-
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
617+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
618+
/// value a[i].
619+
template<int i> OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a);
619620

620621
/// Helper: as rapid as possible extraction of one component, when the
621622
/// index is fixed.
@@ -765,8 +766,9 @@ class vbool8 {
765766
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
766767
OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
767768

768-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
769-
template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
769+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
770+
/// value a[i].
771+
template<int i> OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a);
770772

771773
/// Helper: as rapid as possible extraction of one component, when the
772774
/// index is fixed.
@@ -1158,8 +1160,9 @@ vint4 srl (const vint4& val, const unsigned int bits);
11581160
template<int i0, int i1, int i2, int i3>
11591161
OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
11601162

1161-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1162-
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
1163+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
1164+
/// value a[i].
1165+
template<int i> OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a);
11631166

11641167
/// Helper: as rapid as possible extraction of one component, when the
11651168
/// index is fixed.
@@ -1458,8 +1461,9 @@ vint8 srl (const vint8& val, const unsigned int bits);
14581461
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
14591462
OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
14601463

1461-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1462-
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
1464+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
1465+
/// value a[i].
1466+
template<int i> OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a);
14631467

14641468
/// Helper: as rapid as possible extraction of one component, when the
14651469
/// index is fixed.
@@ -1768,8 +1772,9 @@ template<int i> vint16 shuffle4 (const vint16& a);
17681772
template<int i0, int i1, int i2, int i3>
17691773
vint16 shuffle (const vint16& a);
17701774

1771-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1772-
template<int i> vint16 shuffle (const vint16& a);
1775+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
1776+
/// value a[i].
1777+
template<int i> vint16 broadcast_element(const vint16& a);
17731778

17741779
/// Helper: as rapid as possible extraction of one component, when the
17751780
/// index is fixed.
@@ -2093,8 +2098,9 @@ class vfloat4 {
20932098
template<int i0, int i1, int i2, int i3>
20942099
OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
20952100

2096-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2097-
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2101+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
2102+
/// value a[i].
2103+
template<int i> OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a);
20982104

20992105
/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
21002106
/// 2-bit indices packed into the template parameter i (going from the low
@@ -2716,8 +2722,8 @@ class vfloat8 {
27162722
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
27172723
OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
27182724

2719-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2720-
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
2725+
/// broadcast_element<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2726+
template<int i> OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a);
27212727

27222728
/// Helper: as rapid as possible extraction of one component, when the
27232729
/// index is fixed.
@@ -3046,8 +3052,9 @@ template<int i> OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);
30463052
template<int i0, int i1, int i2, int i3>
30473053
OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a);
30483054

3049-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3050-
template<int i> vfloat16 shuffle (const vfloat16& a);
3055+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
3056+
/// value a[i].
3057+
template<int i> vfloat16 broadcast_element(const vfloat16& a);
30513058

30523059
/// Helper: as rapid as possible extraction of one component, when the
30533060
/// index is fixed.
@@ -3468,11 +3475,17 @@ OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
34683475
#endif
34693476
}
34703477

3471-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3472-
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3478+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
3479+
/// value a[i].
3480+
template<int i> OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a) {
34733481
return shuffle<i,i,i,i>(a);
34743482
}
34753483

3484+
// DEPRECATED(3.1): old name; use broadcast_element instead
3485+
template<int i> OIIO_FORCEINLINE vbool4 shuffle(const vbool4& a) {
3486+
return broadcast_element<i>(a);
3487+
}
3488+
34763489

34773490
/// Helper: as rapid as possible extraction of one component, when the
34783491
/// index is fixed.
@@ -3796,10 +3809,15 @@ OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
37963809
#endif
37973810
}
37983811

3799-
template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3812+
template<int i> OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a) {
38003813
return shuffle<i,i,i,i,i,i,i,i>(a);
38013814
}
38023815

3816+
// DEPRECATED(3.1): old name; use broadcast_element instead
3817+
template<int i> OIIO_FORCEINLINE vbool8 shuffle(const vbool8& a) {
3818+
return broadcast_element<i>(a);
3819+
}
3820+
38033821

38043822
template<int i>
38053823
OIIO_FORCEINLINE bool extract (const vbool8& a) {
@@ -4739,7 +4757,14 @@ OIIO_FORCEINLINE vint4 shuffle (const vint4& a) {
47394757
#endif
47404758
}
47414759

4742-
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); }
4760+
template<int i> OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a) {
4761+
return shuffle<i,i,i,i>(a);
4762+
}
4763+
4764+
// DEPRECATED(3.1): old name; use broadcast_element instead
4765+
template<int i> OIIO_FORCEINLINE vint4 shuffle(const vint4& a) {
4766+
return broadcast_element<i>(a);
4767+
}
47434768

47444769

47454770
template<int i>
@@ -5579,10 +5604,15 @@ OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
55795604
#endif
55805605
}
55815606

5582-
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
5607+
template<int i> OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a) {
55835608
return shuffle<i,i,i,i,i,i,i,i>(a);
55845609
}
55855610

5611+
// DEPRECATED(3.1): old name; use broadcast_element instead
5612+
template<int i> OIIO_FORCEINLINE vint8 shuffle(const vint8& a) {
5613+
return broadcast_element<i>(a);
5614+
}
5615+
55865616

55875617
template<int i>
55885618
OIIO_FORCEINLINE int extract (const vint8& v) {
@@ -6390,8 +6420,15 @@ vint16 shuffle (const vint16& a) {
63906420
#endif
63916421
}
63926422

6393-
template<int i> vint16 shuffle (const vint16& a) {
6394-
return shuffle<i,i,i,i> (a);
6423+
template<int i> vint16 broadcast_element(const vint16& a) {
6424+
return a[i];
6425+
}
6426+
6427+
// DEPRECATED(3.1): old name and nonstandard use
6428+
template<int i>
6429+
OIIO_DEPRECATED("Use broadcast_element (3.1)")
6430+
vint16 shuffle(const vint16& a) {
6431+
return broadcast_element<i> (a);
63956432
}
63966433

63976434

@@ -7248,19 +7285,26 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) {
72487285
#endif
72497286
}
72507287

7251-
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); }
7288+
template<int i> OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a) {
7289+
return shuffle<i,i,i,i>(a);
7290+
}
7291+
7292+
// DEPRECATED(3.1): old name; use broadcast_element instead
7293+
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) {
7294+
return broadcast_element<i>(a);
7295+
}
72527296

72537297
#if OIIO_SIMD_NEON
7254-
template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) {
7298+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<0> (const vfloat4& a) {
72557299
float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0);
72567300
}
7257-
template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) {
7301+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<1> (const vfloat4& a) {
72587302
float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1);
72597303
}
7260-
template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) {
7304+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<2> (const vfloat4& a) {
72617305
float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0);
72627306
}
7263-
template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
7307+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<3> (const vfloat4& a) {
72647308
float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1);
72657309
}
72667310
#endif
@@ -8260,9 +8304,9 @@ OIIO_FORCEINLINE matrix44 matrix44::transposed () const {
82608304

82618305
OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {
82628306
#if OIIO_SIMD_SSE
8263-
vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8264-
shuffle<2>(V) * m_row[2] + m_row[3];
8265-
R = R / shuffle<3>(R);
8307+
vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] +
8308+
broadcast_element<2>(V) * m_row[2] + m_row[3];
8309+
R = R / broadcast_element<3>(R);
82668310
return vfloat3 (R.xyz0());
82678311
#else
82688312
value_t a, b, c, w;
@@ -8276,8 +8320,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {
82768320

82778321
OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
82788322
#if OIIO_SIMD_SSE
8279-
vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8280-
shuffle<2>(V) * m_row[2];
8323+
vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] +
8324+
broadcast_element<2>(V) * m_row[2];
82818325
return vfloat3 (R.xyz0());
82828326
#else
82838327
value_t a, b, c;
@@ -8291,8 +8335,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
82918335
OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
82928336
#if OIIO_SIMD_SSE
82938337
matrix44 T = transposed();
8294-
vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8295-
shuffle<2>(V) * T[2];
8338+
vfloat4 R = broadcast_element<0>(V) * T[0] + broadcast_element<1>(V) * T[1] +
8339+
broadcast_element<2>(V) * T[2];
82968340
return vfloat3 (R.xyz0());
82978341
#else
82988342
value_t a, b, c;
@@ -8306,8 +8350,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
83068350
OIIO_FORCEINLINE vfloat4 operator* (const vfloat4 &V, const matrix44& M)
83078351
{
83088352
#if OIIO_SIMD_SSE
8309-
return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8310-
shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8353+
return broadcast_element<0>(V) * M[0] + broadcast_element<1>(V) * M[1] +
8354+
broadcast_element<2>(V) * M[2] + broadcast_element<3>(V) * M[3];
83118355
#else
83128356
float a, b, c, w;
83138357
a = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0] + V[3] * M[3][0];
@@ -9029,14 +9073,19 @@ OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
90299073
#endif
90309074
}
90319075

9032-
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
9076+
template<int i> OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a) {
90339077
#if OIIO_SIMD_AVX >= 2
90349078
return _mm256_permutevar8x32_ps (a, vint8(i));
90359079
#else
9036-
return shuffle<i,i,i,i,i,i,i,i>(a);
9080+
return a[i];
90379081
#endif
90389082
}
90399083

9084+
// DEPRECATED(3.1): old name; use broadcast_element instead
9085+
template<int i> OIIO_FORCEINLINE vfloat8 shuffle(const vfloat8& a) {
9086+
return broadcast_element<i>(a);
9087+
}
9088+
90409089

90419090
template<int i>
90429091
OIIO_FORCEINLINE float extract (const vfloat8& v) {
@@ -9099,9 +9148,9 @@ OIIO_FORCEINLINE vfloat8 vreduce_add (const vfloat8& v) {
90999148
vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
91009149
vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
91019150
// get efgh in the 0-idx slot
9102-
vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
9151+
vfloat8 efgh = broadcast_element<4>(abcd_0_0_0_efgh_0_0_0);
91039152
vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
9104-
return shuffle<0>(final_sum);
9153+
return broadcast_element<0>(final_sum);
91059154
#else
91069155
vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
91079156
return vfloat8(hadd4, hadd4);
@@ -9908,7 +9957,14 @@ vfloat16 shuffle (const vfloat16& a) {
99089957
#endif
99099958
}
99109959

9911-
template<int i> vfloat16 shuffle (const vfloat16& a) {
9960+
template<int i> vfloat16 broadcast_element(const vfloat16& a) {
9961+
return a[i];
9962+
}
9963+
9964+
// DEPRECATED(3.1): old name; use broadcast_element instead.
// Forwards to broadcast_element<i>, so every lane of the result holds a[i],
// matching the other deprecated 1-argument shuffle wrappers in this file
// (rather than calling the 4-index shuffle<i,i,i,i>, which is a different
// operation for 16-wide types).
9965+
template<int i>
9966+
OIIO_DEPRECATED("Use broadcast_element (3.1)")
9967+
vfloat16 shuffle(const vfloat16& a) {
99129968
return broadcast_element<i> (a);
99139969
}
99149970

src/libOpenImageIO/imagebufalgo_pixelmath.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1733,7 +1733,7 @@ over_impl_rgbafloat(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
17331733
for (int x = 0; x < w; ++x, r += 4, a += 4, b += 4) {
17341734
vfloat4 a_simd(a);
17351735
vfloat4 b_simd(b);
1736-
vfloat4 alpha = shuffle<3>(a_simd);
1736+
vfloat4 alpha = broadcast_element<3>(a_simd);
17371737
vfloat4 one_minus_alpha = one - clamp(alpha, zero, one);
17381738
vfloat4 result = a_simd + one_minus_alpha * b_simd;
17391739
result.store(r);

src/libtexture/texturesys.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3016,10 +3016,10 @@ TextureSystemImpl::sample_bicubic(
30163016
if (s_onetile & t_onetile) {
30173017
// If we thought it was one tile, realize that it isn't unless
30183018
// it's ascending.
3019-
s_onetile &= all(stex
3020-
== (simd::shuffle<0>(stex) + (*(vint4*)iota)));
3021-
t_onetile &= all(ttex
3022-
== (simd::shuffle<0>(ttex) + (*(vint4*)iota)));
3019+
s_onetile &= all(
3020+
stex == (simd::broadcast_element<0>(stex) + (*(vint4*)iota)));
3021+
t_onetile &= all(
3022+
ttex == (simd::broadcast_element<0>(ttex) + (*(vint4*)iota)));
30233023
}
30243024
bool onetile = (s_onetile & t_onetile);
30253025
if (onetile & allvalid) {
@@ -3199,15 +3199,17 @@ TextureSystemImpl::sample_bicubic(
31993199
simd::vfloat4 col[4];
32003200
for (int j = 0; j < 4; ++j) {
32013201
simd::vfloat4 lx = lerp(texel_simd[j][0], texel_simd[j][1],
3202-
shuffle<0>(h) /*h0x*/);
3202+
broadcast_element<0>(h) /*h0x*/);
32033203
simd::vfloat4 rx = lerp(texel_simd[j][2], texel_simd[j][3],
3204-
shuffle<1>(h) /*h1x*/);
3205-
col[j] = lerp(lx, rx, shuffle<1>(g) /*g1x*/);
3204+
broadcast_element<1>(h) /*h1x*/);
3205+
col[j] = lerp(lx, rx, broadcast_element<1>(g) /*g1x*/);
32063206
}
3207-
simd::vfloat4 ly = lerp(col[0], col[1], shuffle<2>(h) /*h0y*/);
3208-
simd::vfloat4 ry = lerp(col[2], col[3], shuffle<3>(h) /*h1y*/);
3207+
simd::vfloat4 ly = lerp(col[0], col[1],
3208+
broadcast_element<2>(h) /*h0y*/);
3209+
simd::vfloat4 ry = lerp(col[2], col[3],
3210+
broadcast_element<3>(h) /*h1y*/);
32093211
simd::vfloat4 weight_simd = weight;
3210-
accum += weight_simd * lerp(ly, ry, shuffle<3>(g) /*g1y*/);
3212+
accum += weight_simd * lerp(ly, ry, broadcast_element<3>(g) /*g1y*/);
32113213
if (daccumds_) {
32123214
simd::vfloat4 scalex = weight_simd * float(spec.width);
32133215
simd::vfloat4 scaley = weight_simd * float(spec.height);

0 commit comments

Comments
 (0)