Skip to content

Commit

Permalink
Merge pull request #2356 from SixLabors/bp/modeScoreArm
Browse files Browse the repository at this point in the history
Add ARM version of calculating mode scores
  • Loading branch information
brianpopow authored Feb 19, 2023
2 parents 5ebc460 + 963d993 commit 63c8f9e
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 20 deletions.
20 changes: 20 additions & 0 deletions src/ImageSharp/Common/Helpers/Numerics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

namespace SixLabors.ImageSharp;
Expand Down Expand Up @@ -808,6 +809,25 @@ public static int ReduceSum(Vector256<int> accumulator)
return Sse2.ConvertToInt32(vsum);
}

/// <summary>
/// Horizontally adds all 32-bit lanes of the given vector using ARM AdvSimd instructions.
/// </summary>
/// <param name="accumulator">The vector whose lanes are added together.</param>
/// <returns>The lane total, converted to a signed 32-bit integer.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int ReduceSumArm(Vector128<uint> accumulator)
{
    if (!AdvSimd.Arm64.IsSupported)
    {
        // Without the Arm64 across-vector add: widen adjacent lane pairs to 64 bit,
        // then add the two halves together and read lane 0 of the result.
        Vector128<ulong> pairSums = AdvSimd.AddPairwiseWidening(accumulator);
        Vector64<uint> lowerHalf = pairSums.GetLower().AsUInt32();
        Vector64<uint> upperHalf = pairSums.GetUpper().AsUInt32();
        return (int)AdvSimd.Extract(AdvSimd.Add(lowerHalf, upperHalf), 0);
    }

    // Arm64 can add across the whole vector in a single operation.
    Vector64<uint> total = AdvSimd.Arm64.AddAcross(accumulator);
    return (int)AdvSimd.Extract(total, 0);
}

/// <summary>
/// Reduces even elements of the vector into one sum.
/// </summary>
Expand Down
116 changes: 116 additions & 0 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

// ReSharper disable InconsistentNaming
Expand All @@ -26,6 +27,11 @@ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
return Vp8_Sse16xN_Sse2(a, b, 8);
}

if (AdvSimd.IsSupported)
{
return Vp8_Sse16x16_Neon(a, b);
}

return Vp8_SseNxN(a, b, 16, 16);
}

Expand All @@ -43,6 +49,11 @@ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
return Vp8_Sse16xN_Sse2(a, b, 4);
}

if (AdvSimd.IsSupported)
{
return Vp8_Sse16x8_Neon(a, b);
}

return Vp8_SseNxN(a, b, 16, 8);
}

Expand Down Expand Up @@ -119,6 +130,11 @@ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
return Numerics.ReduceSum(sum);
}

if (AdvSimd.IsSupported)
{
return Vp8_Sse4x4_Neon(a, b);
}

return Vp8_SseNxN(a, b, 4, 4);
}

Expand Down Expand Up @@ -199,6 +215,106 @@ private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
return Numerics.ReduceSum(sum);
}

/// <summary>
/// AdvSimd implementation of the sum of squared differences between two 16x16 pixel blocks.
/// Consecutive rows of both inputs are <see cref="WebpConstants.Bps"/> bytes apart.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>The accumulated sum of squared differences.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
{
    const int rowCount = 16;
    Vector128<uint> acc = Vector128<uint>.Zero;

    fixed (byte* aPtr = &MemoryMarshal.GetReference(a))
    {
        fixed (byte* bPtr = &MemoryMarshal.GetReference(b))
        {
            // Walk both blocks row by row, advancing by the row stride each time.
            int rowOffset = 0;
            for (int row = 0; row < rowCount; row++)
            {
                acc = AccumulateSSE16Neon(aPtr + rowOffset, bPtr + rowOffset, acc);
                rowOffset += WebpConstants.Bps;
            }
        }
    }

#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(acc);
#else
    return Numerics.ReduceSumArm(acc);
#endif
}

/// <summary>
/// AdvSimd implementation of the sum of squared differences between two 16x8 pixel blocks.
/// Consecutive rows of both inputs are <see cref="WebpConstants.Bps"/> bytes apart.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>The accumulated sum of squared differences.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
{
    const int rowCount = 8;
    Vector128<uint> acc = Vector128<uint>.Zero;

    fixed (byte* aPtr = &MemoryMarshal.GetReference(a))
    {
        fixed (byte* bPtr = &MemoryMarshal.GetReference(b))
        {
            // Each iteration folds one 16 pixel wide row into the running total.
            int rowOffset = 0;
            for (int row = 0; row < rowCount; row++)
            {
                acc = AccumulateSSE16Neon(aPtr + rowOffset, bPtr + rowOffset, acc);
                rowOffset += WebpConstants.Bps;
            }
        }
    }

#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(acc);
#else
    return Numerics.ReduceSumArm(acc);
#endif
}

/// <summary>
/// AdvSimd implementation of the sum of squared differences between two 4x4 pixel blocks.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>The sum of squared differences over all 16 pixels.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
{
    // All 16 pixels of each block fit into one 128-bit register.
    Vector128<byte> pixelsA = Load4x4Neon(a).AsByte();
    Vector128<byte> pixelsB = Load4x4Neon(b).AsByte();

    Vector128<byte> delta = AdvSimd.AbsoluteDifference(pixelsA, pixelsB);
    Vector64<byte> deltaLow = delta.GetLower();
    Vector64<byte> deltaHigh = delta.GetUpper();

    // Square each difference into 16 bit lanes, then pairwise add while widening to 32 bit.
    Vector128<uint> squaresLow = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(deltaLow, deltaLow));
    Vector128<uint> squaresHigh = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(deltaHigh, deltaHigh));

    Vector128<uint> total = AdvSimd.Add(squaresLow, squaresHigh);
#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(total);
#else
    return Numerics.ReduceSumArm(total);
#endif
}

/// <summary>
/// Gathers a 4x4 pixel block into a single <see cref="Vector128{UInt32}"/>, one 4 byte row per lane.
/// Rows are read <see cref="WebpConstants.Bps"/> bytes apart, starting at the first element of <paramref name="src"/>.
/// </summary>
/// <param name="src">The source pixels; no bounds checks are performed on the row reads.</param>
/// <returns>The vector holding the four rows in lanes 0 to 3.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
{
    fixed (byte* basePtr = &MemoryMarshal.GetReference(src))
    {
        uint* row0 = (uint*)basePtr;
        uint* row1 = (uint*)(basePtr + WebpConstants.Bps);
        uint* row2 = (uint*)(basePtr + (WebpConstants.Bps * 2));
        uint* row3 = (uint*)(basePtr + (WebpConstants.Bps * 3));

        // The lane index has to be a constant, so the four inserts are written out explicitly.
        Vector128<uint> packed = AdvSimd.LoadAndInsertScalar(Vector128<uint>.Zero, 0, row0);
        packed = AdvSimd.LoadAndInsertScalar(packed, 1, row1);
        packed = AdvSimd.LoadAndInsertScalar(packed, 2, row2);
        packed = AdvSimd.LoadAndInsertScalar(packed, 3, row3);
        return packed;
    }
}

/// <summary>
/// Adds the squared differences of 16 consecutive bytes from <paramref name="a"/> and
/// <paramref name="b"/> onto the running total in <paramref name="sum"/>.
/// </summary>
/// <param name="a">Pointer to 16 readable bytes of the first input.</param>
/// <param name="b">Pointer to 16 readable bytes of the second input.</param>
/// <param name="sum">The running total, as four 32-bit partial sums.</param>
/// <returns>The updated running total.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
{
    Vector128<byte> delta = AdvSimd.AbsoluteDifference(AdvSimd.LoadVector128(a), AdvSimd.LoadVector128(b));
    Vector64<byte> deltaLow = delta.GetLower();
    Vector64<byte> deltaHigh = delta.GetUpper();

    // Square each byte difference into 16 bit lanes.
    Vector128<ushort> squaresLow = AdvSimd.MultiplyWideningLower(deltaLow, deltaLow);
    Vector128<ushort> squaresHigh = AdvSimd.MultiplyWideningLower(deltaHigh, deltaHigh);

    // Pairwise add neighbouring squares while widening to 32 bit, then fold into the total.
    Vector128<uint> rowTotal = AdvSimd.Add(
        AdvSimd.AddPairwiseWidening(squaresLow),
        AdvSimd.AddPairwiseWidening(squaresHigh));
    return AdvSimd.Add(sum, rowTotal);
}

[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
{
Expand Down
78 changes: 58 additions & 20 deletions tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.

using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Formats.Webp.Lossy;
using SixLabors.ImageSharp.Tests.TestUtilities;

Expand Down Expand Up @@ -222,62 +223,99 @@ private static void RunHadamardTransformTest()
public void HadamardTransform_Works() => RunHadamardTransformTest();

[Fact]
public void TransformTwo_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);
public void TransformTwo_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);

[Fact]
public void TransformTwo_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);
public void TransformTwo_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void TransformOne_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);
public void TransformOne_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);

[Fact]
public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
public void TransformOne_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);

// This will test the AVX2 version.
// This will test the AVX2 or ARM version.
[Fact]
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);

// This will test the SSE2 version.
[Fact]
public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
public void Vp8Sse16X16_WithoutAVX2_Works()
{
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
return;
}

FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
}

// This will test the fallback scalar version.
[Fact]
public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
public void Vp8Sse16X16_WithoutHwIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableHWIntrinsic);

// This will test the AVX2 version.
// This will test the AVX2 or ARM version.
[Fact]
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);

// This will test the SSE2 version.
[Fact]
public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
public void Vp8Sse16X8_WithoutAVX2_Works()
{
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
return;
}

FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
}

// This will test the fallback scalar version.
[Fact]
public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableHWIntrinsic);

// This will test the AVX2 version.
// This will test the AVX2 or ARM version.
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);

// This will test the SSE2 version.
[Fact]
public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
public void Vp8Sse4X4_WithoutAVX2_Works()
{
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
return;
}

FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
}

// This will test the fallback scalar version.
[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
public void Mean16x4_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);

[Fact]
public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
public void Mean16x4_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
public void HadamardTransform_WithHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);

[Fact]
public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
public void HadamardTransform_WithoutHardwareIntrinsics_Works() =>
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
}

0 comments on commit 63c8f9e

Please sign in to comment.