diff --git a/AUTHORS.md b/AUTHORS.md index fe18a3ab..bd9af4cf 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -4,9 +4,10 @@ * Memory package, socket module - [Eugene Wissner aka belka-ew](https://github.com/belka-ew) * Image binarization, histogram generation, median filter - [LightHouse Software](http://lhs-blog.info/) / [Oleg Baharev aka aquaratixc](https://github.com/aquaratixc) * TGA and BMP encoder, improved BMP decoder, bugfixes, unittests - [Roman Chistokhodov aka FreeSlave](https://github.com/FreeSlave) -* PNG decoder improvements - Vadim Lopatin +* PNG decoder improvements - [Vadim Lopatin](https://github.com/buggins) * Combinatorics module - Nick Papanastasiou * Rectangle drawing - [Aaron Nédélec aka ReactiveAlkali](https://github.com/ReactiveAlkali) * Vector swizzling assignment - [João Lourenço aka iK4tsu](https://github.com/iK4tsu) +* SSE vector math port for GDC - [Alexander Perfilyev](https://github.com/aperfilev) * Bugfixes - [Andrey Penechko aka MrSmith33](https://github.com/MrSmith33), [Valeriy Fedotov](https://github.com/Valera), [Basile Burg aka SixthDot](https://github.com/SixthDot), [Ate Eskola aka dukc](https://github.com/dukc), [Martin Nowak aka dawg](https://github.com/MartinNowak), [Mathias Lang aka Geod24](https://github.com/Geod24), [Nick Treleaven aka ntrel](https://github.com/ntrel), [Nikolay Krasheninnikov aka GoodNike](https://github.com/GoodNike), [ijet](https://github.com/my-ijet), [TETYYS](https://github.com/TETYYS), [Razvan Nitu aka RazvanN7](https://github.com/RazvanN7) * Unittests - [Roman Vlasov](https://github.com/VlasovRoman) diff --git a/README.md b/README.md index 9fb8ed1b..4eed37f5 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ HTML documentation can be generated from source code using ddox (run `dub build License ------- -Copyright (c) 2011-2024 Timur Gafarov, Martin Cejp, Andrey Penechko, Vadim Lopatin, Nick Papanastasiou, Oleg Baharev, Roman Chistokhodov, Eugene Wissner, Roman Vlasov, Basile Burg, Valeriy 
Fedotov, Ferhat Kurtulmuş, João Lourenço, Ate Eskola, Aaron Nédélec. Distributed under the Boost Software License, Version 1.0 (see accompanying file COPYING or at https://www.boost.org/LICENSE_1_0.txt). +Copyright (c) 2011-2024 Timur Gafarov, Martin Cejp, Andrey Penechko, Vadim Lopatin, Nick Papanastasiou, Oleg Baharev, Roman Chistokhodov, Eugene Wissner, Roman Vlasov, Basile Burg, Valeriy Fedotov, Ferhat Kurtulmuş, João Lourenço, Ate Eskola, Aaron Nédélec, Alexander Perfilyev. Distributed under the Boost Software License, Version 1.0 (see accompanying file COPYING or at https://www.boost.org/LICENSE_1_0.txt). Sponsors -------- diff --git a/dlib/math/sse.d b/dlib/math/sse.d index fd921037..9eb65dc1 100644 --- a/dlib/math/sse.d +++ b/dlib/math/sse.d @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2023 Timur Gafarov +Copyright (c) 2015-2024 Timur Gafarov, Alexander Perfilyev Boost Software License - Version 1.0 - August 17th, 2003 @@ -30,13 +30,11 @@ DEALINGS IN THE SOFTWARE. * SSE-based optimizations for common vector and matrix operations * * Description: - * This module implements some frequently used vector and matrix operations - * using SSE instructions. Implementation is in WIP status. - * Module is compatible only with Digital Mars D Compiler. + * This module implements some frequently used vector and matrix operations using SSE instructions. * - * Copyright: Timur Gafarov 2015-2023. + * Copyright: Timur Gafarov, Alexander Perfilyev 2015-2024. * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0). 
- * Authors: Timur Gafarov
+ * Authors: Timur Gafarov, Alexander Perfilyev
  */
 module dlib.math.sse;
 
@@ -47,202 +45,193 @@ version(GNU)
 {
     pragma(inline, true);
 
-    /// Vector addition
-    Vector4f sseAdd4(Vector4f a, Vector4f b)
-    {
-        asm {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "addps %%xmm1, %%xmm0 \n" ~ // Add xmm1 to xmm0
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
+    /// Vector addition for GNU D Compiler (using SSE)
+    Vector4f sseAdd4(Vector4f a, Vector4f b)
+    {
+        asm {
+            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+            "addps %%xmm1, %%xmm0 \n" ~ // Add xmm1 to xmm0
+            "movups %%xmm0, %[a] \n" // Store the result back in vector a
+            : [a] "+m" (a) // In/out operand a, constrained to memory
+            : [b] "m" (b) // Input operand b, constrained to memory
+            : "%xmm0", "%xmm1"; // Clobbered registers
+        }
+
+        return a;
+    }
 
-    /// Vector subtraction for GNU D Compiler (using AVX)
+    /// Vector subtraction for GNU D Compiler (using SSE)
     Vector4f sseSub4(Vector4f a, Vector4f b)
     {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "subps %%xmm1, %%xmm0 \n" ~ // Subtract xmm1 from xmm0
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
+        asm
+        {
+            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+            "subps %%xmm1, %%xmm0 \n" ~ // Subtract xmm1 from xmm0
+            "movups %%xmm0, %[a] \n" // Store the result back in vector a
+            : [a] "+m" (a) // In/out operand a, constrained to memory
+            : [b] "m" (b) // Input operand b, constrained to memory
+            : "%xmm0", "%xmm1"; // Clobbered registers
+        }
 
         return a;
     }
 
-    /// Vector multiplication for GNU D Compiler (using AVX)
+    /// Vector multiplication for GNU D Compiler (using SSE)
-    Vector4f sseMul4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
+    Vector4f sseMul4(Vector4f a, Vector4f b)
+    {
+        asm
+        {
+            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+            "movups %%xmm0, %[a] \n" // Store the result back in vector a
+            : [a] "+m" (a) // In/out operand a, constrained to memory
+            : [b] "m" (b) // Input operand b, constrained to memory
+            : "%xmm0", "%xmm1"; // Clobbered registers
+        }
+
+        return a;
+    }
 
-    /// Vector division for GNU D Compiler (using AVX)
+    /// Vector division for GNU D Compiler (using SSE)
-    Vector4f sseDiv4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "divps %%xmm1, %%xmm0 \n" ~ // Divide xmm0 by xmm1
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a;
-    }
+    Vector4f sseDiv4(Vector4f a, Vector4f b)
+    {
+        asm
+        {
+            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+            "divps %%xmm1, %%xmm0 \n" ~ // Divide xmm0 by xmm1
+            "movups %%xmm0, %[a] \n" // Store the result back in vector a
+            : [a] "+m" (a) // In/out operand a, constrained to memory
+            : [b] "m" (b) // Input operand b, constrained to memory
+            : "%xmm0", "%xmm1"; // Clobbered registers
+        }
+
+        return a;
+    }
 
-    /// Vector dot product for GNU D Compiler (using SSE)
+    /**
+     * Vector dot product for GNU D Compiler (using SSE).
+     *
+     * Multiplies a and b lane by lane, then sums all four products.
+     * Returns: a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]
+     */
-    float sseDot4(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-
-            // Horizontal addition
-            "movhlps %%xmm0, %%xmm1 \n" ~ // Copy the high 64 bits to the low 64 bits of xmm1
-            "addps %%xmm1, %%xmm0 \n" ~ // Add xmm1 to xmm0
-
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1"; // Clobbered registers
-        }
-
-        return a[0];
-    }
+    float sseDot4(Vector4f a, Vector4f b)
+    {
+        asm
+        {
+            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+            "mulps %%xmm1, %%xmm0 \n" ~ // xmm0 = [p0, p1, p2, p3], pN = a[N] * b[N]
+
+            // Horizontal addition, step 1: fold the high pair onto the low pair
+            "movhlps %%xmm0, %%xmm1 \n" ~ // xmm1 lanes 0..1 = [p2, p3]
+            "addps %%xmm1, %%xmm0 \n" ~ // xmm0 lanes 0..1 = [p0 + p2, p1 + p3]
+
+            // Horizontal addition, step 2: fold lane 1 onto lane 0
+            "movaps %%xmm0, %%xmm1 \n" ~ // Copy the partial sums
+            "shufps $0x55, %%xmm1, %%xmm1 \n" ~ // Broadcast lane 1 (p1 + p3)
+            "addss %%xmm1, %%xmm0 \n" ~ // xmm0 lane 0 = p0 + p2 + p1 + p3
+
+            "movups %%xmm0, %[a] \n" // Store the result back in vector a
+            : [a] "+m" (a) // In/out operand a, constrained to memory
+            : [b] "m" (b) // Input operand b, constrained to memory
+            : "%xmm0", "%xmm1"; // Clobbered registers
+        }
+
+        return a[0]; // Lane 0 holds the full sum
+    }
 
     /// Vector cross product for GNU D Compiler (using SSE)
-    Vector4f sseCross3(Vector4f a, Vector4f b)
-    {
-        asm
-        {
-            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
-            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
-            "movaps %%xmm0, %%xmm2 \n" ~ // Copy xmm0 to xmm2
-            "movaps %%xmm1, %%xmm3 \n" ~ // Copy xmm1 to xmm3
-
-            "shufps $0xC9, %%xmm0, %%xmm0 \n" ~ // Shuffle xmm0 according to 0xC9
-            "shufps $0xD2, %%xmm1, %%xmm1 \n" ~ // Shuffle xmm1 according to 0xD2
-            "shufps $0xD2, %%xmm2, %%xmm2 \n" ~ // Shuffle xmm2 according to 0xD2
-            "shufps $0xC9, %%xmm3, %%xmm3 \n" ~ // Shuffle xmm3 according to 0xC9
-
-            "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-            "mulps %%xmm3, %%xmm2 \n" ~ // Multiply xmm2 by xmm3
-
-            "subps %%xmm2, %%xmm0 \n" ~ // Subtract xmm2 from xmm0
-
-            "movups %%xmm0, %[a] \n" // Store the result back in vector a
-            : [a] "+m" (a) // Output operand a, constrained to memory
-            : [b] "m" (b) // Input operand b, constrained to memory
-            : "%xmm0", "%xmm1", "%xmm2", "%xmm3"; // Clobbered registers
-        }
-
-        return a;
-    }
+    Vector4f sseCross3(Vector4f a, Vector4f b)
+    {
+        asm
+        {
+            "movups %[a], %%xmm0 \n" ~ // Load vector a into xmm0
+            "movups %[b], %%xmm1 \n" ~ // Load vector b into xmm1
+            "movaps %%xmm0, %%xmm2 \n" ~ // Copy xmm0 to xmm2
+            "movaps %%xmm1, %%xmm3 \n" ~ // Copy xmm1 to xmm3
+
+            "shufps $0xC9, %%xmm0, %%xmm0 \n" ~ // xmm0 = [a1, a2, a0, a3]
+            "shufps $0xD2, %%xmm1, %%xmm1 \n" ~ // xmm1 = [b2, b0, b1, b3]
+            "shufps $0xD2, %%xmm2, %%xmm2 \n" ~ // xmm2 = [a2, a0, a1, a3]
+            "shufps $0xC9, %%xmm3, %%xmm3 \n" ~ // xmm3 = [b1, b2, b0, b3]
+
+            "mulps %%xmm1, %%xmm0 \n" ~ // xmm0 = [a1*b2, a2*b0, a0*b1, a3*b3]
+            "mulps %%xmm3, %%xmm2 \n" ~ // xmm2 = [a2*b1, a0*b2, a1*b0, a3*b3]
+
+            "subps %%xmm2, %%xmm0 \n" ~ // Lanes 0..2 = cross product, lane 3 = a3*b3 - a3*b3
+
+            "movups %%xmm0, %[a] \n" // Store the result back in vector a
+            : [a] "+m" (a) // In/out operand a, constrained to memory
+            : [b] "m" (b) // Input operand b, constrained to memory
+            : "%xmm0", "%xmm1", "%xmm2", "%xmm3"; // Clobbered registers
+        }
+
+        return a;
+    }
 
-    /// Matrix multiplication for GNU D Compiler (using SSE)
+    /**
+     * Matrix multiplication for GNU D Compiler (using SSE).
+     *
+     * Each group of four consecutive floats of the result is built as
+     * a[0..4] * b[i] + a[4..8] * b[i+1] + a[8..12] * b[i+2] + a[12..16] * b[i+3]
+     * for i = 0, 4, 8, 12, where the indices address the matrices' arrayof storage.
+     */
-    Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
-    {
-        Matrix4x4f r;
-        Vector4f a_line, b_line, r_line;
-        float _b;
-        uint i, j;
-        Vector4f* _rp;
-
-        for (i = 0; i < 16; i += 4)
-        {
-            a_line = *cast(Vector4f*)(a.arrayof.ptr);
-            _b = *(b.arrayof.ptr + i);
-
-            asm
-            {
-                "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
-
-                "mov %[_b], %%eax \n" ~ // Move _b into the EAX register
-                "movd %%eax, %%xmm1 \n" ~ // Move EAX into xmm1
-
-                "shufps $0, %%xmm1, %%xmm1 \n" ~ // Shuffle xmm1 according to 0
-
-                "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-                "movups %%xmm0, %[r_line]" // Store the result in r_line
-
-                : [r_line] "=m" (r_line) // Output operand r_line, constrained to memory
-                : [a_line] "m" (a_line), [_b] "r" (_b) // Input operands a_line and _b, constrained to memory and register
-                : "%xmm0", "%xmm1", "%eax"; // Clobbered registers
-            }
-
-            for (j = 1; j < 4; j++)
-            {
-                a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
-                _b = *(b.arrayof.ptr + i + j);
-
-                asm
-                {
-                    "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
-
-                    "mov %[_b], %%eax \n" ~ // Move _b into the EAX register
-                    "movd %%eax, %%xmm1 \n" ~ // Move EAX into xmm1
-                    "shufps $0, %%xmm1, %%xmm1 \n" ~ // Shuffle xmm1 according to 0
-
-                    "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
-
-                    "movups %[r_line], %%xmm2 \n" ~ // Load r_line into xmm2
-                    "addps %%xmm2, %%xmm0 \n" ~ // Add xmm2 to xmm0
-
-                    "movups %%xmm0, %[r_line]" // Store the result back in r_line
-                    : [r_line] "=m" (r_line) // Output and input operands
-                    : [a_line] "m" (a_line), [_b] "r" (_b) // Input operand b, constrained to memory
-                    : "%xmm0", "%xmm1", "%xmm2", "%eax"; // Clobbered registers
-                }
-            }
-
-            _rp = cast(Vector4f*)(r.arrayof.ptr + i);
-
-            version(X86) asm
-            {
-                "mov %[_rp], %%eax \n" ~ // Move _rp into the EAX register
-                "movups %%xmm0,(%%eax)" // Move xmm0 to the memory location pointed by EAX
-                : [_rp] "+r" (_rp) // Output and input operands
-                : // No additional input operands
-                : "%eax", "%xmm0"; // Clobbered registers
-            }
-            version(X86_64) asm
-            {
-                "mov %[_rp], %%rax \n" ~ // Move _rp into the RAX register
-                "movups %%xmm0, (%%rax)" // Move xmm0 to the memory location pointed by RAX
-                : [_rp] "+r" (_rp) // Output and input operands
-                : // No additional input operands
-                : "%rax", "%xmm0"; // Clobbered registers
-            }
-        }
-
-        return r;
-    }
+    Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
+    {
+        Matrix4x4f r;
+        Vector4f a_line, r_line;
+        float _b;
+
+        for (uint i = 0; i < 16; i += 4)
+        {
+            // Start the accumulator: r_line = a[0..4] * b[i]
+            a_line = *cast(Vector4f*)(a.arrayof.ptr);
+            _b = *(b.arrayof.ptr + i);
+
+            asm
+            {
+                "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
+                "movss %[_b], %%xmm1 \n" ~ // Load scalar _b into lane 0 of xmm1
+                "shufps $0, %%xmm1, %%xmm1 \n" ~ // Broadcast _b to all four lanes
+                "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+                "movups %%xmm0, %[r_line]" // Store the result in r_line
+                : [r_line] "=m" (r_line) // Output operand r_line, write-only here
+                : [a_line] "m" (a_line), [_b] "m" (_b) // Input operands, constrained to memory
+                : "%xmm0", "%xmm1"; // Clobbered registers
+            }
+
+            for (uint j = 1; j < 4; j++)
+            {
+                // Accumulate: r_line += a[j*4 .. j*4+4] * b[i + j]
+                a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
+                _b = *(b.arrayof.ptr + i + j);
+
+                asm
+                {
+                    "movups %[a_line], %%xmm0 \n" ~ // Load vector a_line into xmm0
+                    "movss %[_b], %%xmm1 \n" ~ // Load scalar _b into lane 0 of xmm1
+                    "shufps $0, %%xmm1, %%xmm1 \n" ~ // Broadcast _b to all four lanes
+                    "mulps %%xmm1, %%xmm0 \n" ~ // Multiply xmm0 by xmm1
+                    "movups %[r_line], %%xmm2 \n" ~ // Load the running sum into xmm2
+                    "addps %%xmm2, %%xmm0 \n" ~ // Add the running sum to xmm0
+                    "movups %%xmm0, %[r_line]" // Store the new running sum in r_line
+                    : [r_line] "+m" (r_line) // Read and written, so it must be "+m"
+                    : [a_line] "m" (a_line), [_b] "m" (_b) // Input operands, constrained to memory
+                    : "%xmm0", "%xmm1", "%xmm2"; // Clobbered registers
+                }
+            }
+
+            // Copy the finished line out with a plain assignment; register
+            // contents are not preserved between separate asm statements.
+            *cast(Vector4f*)(r.arrayof.ptr + i) = r_line;
+        }
+
+        return r;
+    }
 }
 
 version(DMD)