Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ARM version of calculating mode scores #2356

Merged
merged 11 commits into from
Feb 19, 2023
20 changes: 20 additions & 0 deletions src/ImageSharp/Common/Helpers/Numerics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

namespace SixLabors.ImageSharp;
Expand Down Expand Up @@ -808,6 +809,25 @@ public static int ReduceSum(Vector256<int> accumulator)
return Sse2.ConvertToInt32(vsum);
}

/// <summary>
/// Reduces elements of the vector into one sum.
/// </summary>
/// <param name="accumulator">The accumulator to reduce.</param>
/// <returns>The sum of all elements.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int ReduceSumArm(Vector128<uint> accumulator)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use Vector128.Sum() instead of this method. In general, try to use the cross-platform Vector128/Vector256 APIs wherever possible. This improves the portability of the code and lets it benefit from improvements to the API itself.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ReduceSum can also be refactored out.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ReduceSum can also be refactored out.

We cannot get rid of ReduceSum yet, because we target net6.0 and Vector128.Sum was only introduced in .NET 7.
I am now using Vector128.Sum for .NET 7.0 and later: b0bfb0a

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, makes sense 👍

{
if (AdvSimd.Arm64.IsSupported)
{
Vector64<uint> sum = AdvSimd.Arm64.AddAcross(accumulator);
return (int)AdvSimd.Extract(sum, 0);
}

Vector128<ulong> sum2 = AdvSimd.AddPairwiseWidening(accumulator);
Vector64<uint> sum3 = AdvSimd.Add(sum2.GetLower().AsUInt32(), sum2.GetUpper().AsUInt32());
return (int)AdvSimd.Extract(sum3, 0);
}

/// <summary>
/// Reduces even elements of the vector into one sum.
/// </summary>
Expand Down
116 changes: 116 additions & 0 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

// ReSharper disable InconsistentNaming
Expand All @@ -26,6 +27,11 @@ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
return Vp8_Sse16xN_Sse2(a, b, 8);
}

if (AdvSimd.IsSupported)
{
return Vp8_Sse16x16_Neon(a, b);
}

return Vp8_SseNxN(a, b, 16, 16);
}

Expand All @@ -43,6 +49,11 @@ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
return Vp8_Sse16xN_Sse2(a, b, 4);
}

if (AdvSimd.IsSupported)
{
return Vp8_Sse16x8_Neon(a, b);
}

return Vp8_SseNxN(a, b, 16, 8);
}

Expand Down Expand Up @@ -119,6 +130,11 @@ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
return Numerics.ReduceSum(sum);
}

if (AdvSimd.IsSupported)
{
return Vp8_Sse4x4_Neon(a, b);
}

return Vp8_SseNxN(a, b, 4, 4);
}

Expand Down Expand Up @@ -199,6 +215,106 @@ private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
return Numerics.ReduceSum(sum);
}

/// <summary>
/// Computes the sum of squared differences between two 16x16 byte blocks using ARM AdvSimd instructions.
/// </summary>
/// <param name="a">The first block. Consecutive rows start <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="b">The second block, laid out the same way as <paramref name="a"/>.</param>
/// <returns>The sum of the squared byte differences over all 16 rows.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b) => Vp8_Sse16xN_Neon(a, b, 16);

/// <summary>
/// Computes the sum of squared differences between two 16x8 byte blocks using ARM AdvSimd instructions.
/// </summary>
/// <param name="a">The first block. Consecutive rows start <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="b">The second block, laid out the same way as <paramref name="a"/>.</param>
/// <returns>The sum of the squared byte differences over all 8 rows.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b) => Vp8_Sse16xN_Neon(a, b, 8);

/// <summary>
/// Shared implementation for the 16-pixel-wide NEON variants: accumulates the squared
/// differences of <paramref name="numRows"/> rows of 16 bytes each.
/// </summary>
/// <param name="a">The first block. Consecutive rows start <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="b">The second block, laid out the same way as <paramref name="a"/>.</param>
/// <param name="numRows">The number of 16-byte rows to process.</param>
/// <returns>The sum of the squared byte differences over all processed rows.</returns>
/// <remarks>
/// Rows are read through raw pointers without bounds checks, so both spans must be long
/// enough to hold 16 readable bytes at offset <c>(numRows - 1) * WebpConstants.Bps</c>.
/// </remarks>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe int Vp8_Sse16xN_Neon(Span<byte> a, Span<byte> b, int numRows)
{
    Vector128<uint> sum = Vector128<uint>.Zero;
    fixed (byte* aRef = &MemoryMarshal.GetReference(a))
    {
        fixed (byte* bRef = &MemoryMarshal.GetReference(b))
        {
            for (int y = 0; y < numRows; y++)
            {
                sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
            }
        }
    }

    // Vector128.Sum only exists from .NET 7 onwards; older targets use the hand-written reduction.
#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(sum);
#else
    return Numerics.ReduceSumArm(sum);
#endif
}

/// <summary>
/// Computes the sum of squared differences between two 4x4 byte blocks using ARM AdvSimd instructions.
/// </summary>
/// <param name="a">The first block, read via <c>Load4x4Neon</c>.</param>
/// <param name="b">The second block, read via <c>Load4x4Neon</c>.</param>
/// <returns>The sum of the squared byte differences over all 16 bytes.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
{
    // Each block is packed into one 128-bit register, so all 16 bytes are handled at once.
    Vector128<byte> blockA = Load4x4Neon(a).AsByte();
    Vector128<byte> blockB = Load4x4Neon(b).AsByte();
    Vector128<byte> delta = AdvSimd.AbsoluteDifference(blockA, blockB);

    Vector64<byte> lowerHalf = delta.GetLower();
    Vector64<byte> upperHalf = delta.GetUpper();

    // Square each difference (widening to 16 bit), then pairwise-add neighbours into 32-bit lanes.
    Vector128<uint> lowerSum = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(lowerHalf, lowerHalf));
    Vector128<uint> upperSum = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(upperHalf, upperHalf));
    Vector128<uint> total = AdvSimd.Add(lowerSum, upperSum);

#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(total);
#else
    return Numerics.ReduceSumArm(total);
#endif
}

/// <summary>
/// Loads a 4x4 byte block into a single <see cref="Vector128{T}"/>, one 4-byte row per 32-bit lane.
/// </summary>
/// <param name="src">The source bytes. Consecutive rows start <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <returns>A vector whose lanes 0 to 3 hold rows 0 to 3 of the block.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
{
    fixed (byte* row0 = &MemoryMarshal.GetReference(src))
    {
        byte* row1 = row0 + WebpConstants.Bps;
        byte* row2 = row0 + (WebpConstants.Bps * 2);
        byte* row3 = row0 + (WebpConstants.Bps * 3);

        // Each row is read as one 32-bit value and placed into its own lane.
        Vector128<uint> block = AdvSimd.LoadAndInsertScalar(Vector128<uint>.Zero, 0, (uint*)row0);
        block = AdvSimd.LoadAndInsertScalar(block, 1, (uint*)row1);
        block = AdvSimd.LoadAndInsertScalar(block, 2, (uint*)row2);
        return AdvSimd.LoadAndInsertScalar(block, 3, (uint*)row3);
    }
}

/// <summary>
/// Adds the squared differences of 16 bytes read from <paramref name="a"/> and <paramref name="b"/>
/// to a running sum.
/// </summary>
/// <param name="a">Pointer to 16 readable bytes of the first row.</param>
/// <param name="b">Pointer to 16 readable bytes of the second row.</param>
/// <param name="sum">The running sum, held as four 32-bit partial sums.</param>
/// <returns><paramref name="sum"/> with this row's squared differences added lane-wise.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
{
    // |a - b| per byte; each value is at most 255, so its square still fits in 16 bits.
    Vector128<byte> delta = AdvSimd.AbsoluteDifference(AdvSimd.LoadVector128(a), AdvSimd.LoadVector128(b));

    Vector64<byte> lowerHalf = delta.GetLower();
    Vector64<byte> upperHalf = delta.GetUpper();
    Vector128<ushort> lowerSquares = AdvSimd.MultiplyWideningLower(lowerHalf, lowerHalf);
    Vector128<ushort> upperSquares = AdvSimd.MultiplyWideningLower(upperHalf, upperHalf);

    // Pairwise-add adjacent 16-bit squares into 32-bit lanes, then fold both halves into the running sum.
    Vector128<uint> rowSum = AdvSimd.Add(
        AdvSimd.AddPairwiseWidening(lowerSquares),
        AdvSimd.AddPairwiseWidening(upperSquares));

    return AdvSimd.Add(sum, rowSum);
}

[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
{
Expand Down
78 changes: 58 additions & 20 deletions tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.

using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Formats.Webp.Lossy;
using SixLabors.ImageSharp.Tests.TestUtilities;

Expand Down Expand Up @@ -222,62 +223,99 @@ private static void RunHadamardTransformTest()
public void HadamardTransform_Works() => RunHadamardTransformTest();

[Fact]
public void TransformTwo_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);
public void TransformTwo_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);

[Fact]
public void TransformTwo_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);
public void TransformTwo_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void TransformOne_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);
public void TransformOne_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);

[Fact]
public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
public void TransformOne_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);

// This will test the AVX2 version.
// This will test the AVX2 or ARM version.
[Fact]
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);

// This will test the SSE2 version.
[Fact]
public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
public void Vp8Sse16X16_WithoutAVX2_Works()
{
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
return;
}

FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
}

// This will test the fallback scalar version.
[Fact]
public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
public void Vp8Sse16X16_WithoutHwIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableHWIntrinsic);

// This will test the AVX2 version.
// This will test the AVX2 or ARM version.
[Fact]
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);

// This will test the SSE2 version.
[Fact]
public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
public void Vp8Sse16X8_WithoutAVX2_Works()
{
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
return;
}

FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
}

// This will test the fallback scalar version.
[Fact]
public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableHWIntrinsic);

// This will test the AVX2 version.
// This will test the AVX2 or ARM version.
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);

// This will test the SSE2 version.
[Fact]
public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
public void Vp8Sse4X4_WithoutAVX2_Works()
{
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
return;
}

FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
}

// This will test the fallback scalar version.
[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
public void Mean16x4_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);

[Fact]
public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
public void Mean16x4_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
public void HadamardTransform_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);

[Fact]
public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
public void HadamardTransform_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
}