From beabcdf9ddc83530d115c8f8cb3a4b2038ab2876 Mon Sep 17 00:00:00 2001 From: Adrien GIVRY Date: Fri, 28 Feb 2025 22:13:27 -0500 Subject: [PATCH 1/2] ComputeBoundingSphere SIMD implementation --- .../src/OvRendering/Resources/Mesh.cpp | 166 +++++++++++++++--- 1 file changed, 144 insertions(+), 22 deletions(-) diff --git a/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp b/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp index 466adf216..268103412 100644 --- a/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp +++ b/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp @@ -5,6 +5,7 @@ */ #include +#include #include #include @@ -81,38 +82,159 @@ void OvRendering::Resources::Mesh::Upload(std::span p_ve } } -void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span p_vertices) +namespace { - m_boundingSphere.position = OvMaths::FVector3::Zero; - m_boundingSphere.radius = 0.0f; - - if (!p_vertices.empty()) + OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD(std::span p_vertices) { - float minX = std::numeric_limits::max(); - float minY = std::numeric_limits::max(); - float minZ = std::numeric_limits::max(); + const size_t vertexCount = p_vertices.size(); + + if (vertexCount == 0) + { + return { + .position = OvMaths::FVector3::Zero, + .radius = 0.0f + }; + } - float maxX = std::numeric_limits::min(); - float maxY = std::numeric_limits::min(); - float maxZ = std::numeric_limits::min(); + // Initialize SIMD registers for min/max with first vertex values + __m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX); + __m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX); - for (const auto& vertex : p_vertices) + // Process all vertices in one loop to find min/max + for (size_t i = 1; i < vertexCount; ++i) { - minX = std::min(minX, vertex.position[0]); - minY = std::min(minY, vertex.position[1]); - minZ = std::min(minZ, vertex.position[2]); + // Load vertex position directly - assumes position is aligned properly + const float* posPtr = p_vertices[i].position; + __m128 vPos = _mm_loadu_ps(posPtr); // Using loadu in case it's not 16-byte aligned + + // Update min and max in one pass + vMinXYZ = _mm_min_ps(vMinXYZ, vPos); + vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos); + } + + // Calculate center = (min + max) * 0.5 + __m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f)); + + // Store center position + float centerArr[4]; + _mm_store_ps(centerArr, vCenter); + auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] }; + + // Calculate radius - use dot product for distance calculation + __m128 vMaxDistSq = _mm_setzero_ps(); + + // Pre-load center vector once outside the loop + const __m128 vCenterXYZ = _mm_setr_ps( + center.x, + center.y, + center.z, + 0.0f + ); + + // Unroll the loop by 4 for better throughput + size_t i = 0; + const size_t unrollCount = vertexCount & ~3ull; // Round down to multiple of 4 + + for (; i < unrollCount; i += 4) + { + // Load 4 vertices at once + const float* pos0 = p_vertices[i].position; + const float* pos1 = p_vertices[i + 1].position; + const float* pos2 = p_vertices[i + 2].position; + const float* pos3 = p_vertices[i + 3].position; + + __m128 vPos0 = _mm_loadu_ps(pos0); + __m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ); + __m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Dot product with mask 0x77 (sum xyz, store in all) + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0); + + __m128 vPos1 = _mm_loadu_ps(pos1); + __m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ); + __m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1); + + __m128 vPos2 = _mm_loadu_ps(pos2); + __m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ); + __m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2); + + __m128 vPos3 = _mm_loadu_ps(pos3); + __m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ); + __m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3); + } - maxX = std::max(maxX, vertex.position[0]); - maxY = std::max(maxY, vertex.position[1]); - maxZ = std::max(maxZ, vertex.position[2]); + // Handle remaining vertices + for (; i < vertexCount; ++i) + { + const float* pos = p_vertices[i].position; + __m128 vPos = _mm_loadu_ps(pos); + __m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ); + __m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq); } - m_boundingSphere.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f; + // Extract radius (sqrt of max squared distance) + float maxDistSq; + _mm_store_ss(&maxDistSq, vMaxDistSq); + + return { + .position = center, + .radius = std::sqrt(maxDistSq) + }; + } + + OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular(std::span p_vertices) + { + auto result = OvRendering::Geometry::BoundingSphere{ + .position = OvMaths::FVector3::Zero, + .radius = 0.0f + }; - for (const auto& vertex : p_vertices) + if (!p_vertices.empty()) { - const auto& position = reinterpret_cast(vertex.position); - m_boundingSphere.radius = std::max(m_boundingSphere.radius, OvMaths::FVector3::Distance(m_boundingSphere.position, position)); + float minX = std::numeric_limits::max(); + float minY = std::numeric_limits::max(); + float minZ = std::numeric_limits::max(); + + float maxX = std::numeric_limits::min(); + float maxY = std::numeric_limits::min(); + float maxZ = std::numeric_limits::min(); + + for (const auto& vertex : p_vertices) + { + minX = std::min(minX, vertex.position[0]); + minY = std::min(minY, vertex.position[1]); + minZ = std::min(minZ, vertex.position[2]); + + maxX = std::max(maxX, vertex.position[0]); + maxY = std::max(maxY, vertex.position[1]); + maxZ = std::max(maxZ, vertex.position[2]); + } + + result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f; + + for (const auto& vertex : p_vertices) + { + const auto& position = reinterpret_cast(vertex.position); + result.radius = std::max(result.radius, OvMaths::FVector3::Distance(result.position, position)); + } } + + return result; + } +} + +void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span p_vertices) +{ + constexpr bool useSIMD = true; + + if constexpr (useSIMD) + { + m_boundingSphere = ComputeBoundingSphereSIMD(p_vertices); + } + else + { + m_boundingSphere = ComputeBoundingSphereRegular(p_vertices); } } From ade1387efd3bf6dac2b6a38beee821e7ed03184b Mon Sep 17 00:00:00 2001 From: Adrien GIVRY Date: Thu, 8 May 2025 17:01:47 -0400 Subject: [PATCH 2/2] Cleaned up implementation --- .../src/OvRendering/Resources/Mesh.cpp | 219 +++++++----------- 1 file changed, 82 insertions(+), 137 deletions(-) diff --git a/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp b/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp index 268103412..e4ac7ae01 100644 --- a/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp +++ b/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp @@ -82,159 +82,104 @@ void OvRendering::Resources::Mesh::Upload(std::span p_ve } } -namespace +void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span p_vertices) { - OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD(std::span p_vertices) - { - const size_t vertexCount = p_vertices.size(); - - if (vertexCount == 0) - { - return { - .position = OvMaths::FVector3::Zero, - .radius = 0.0f - }; - } + const size_t vertexCount = p_vertices.size(); - // Initialize SIMD registers for min/max with first vertex values - __m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX); - __m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX); + if (vertexCount == 0) + { + m_boundingSphere = { + .position = OvMaths::FVector3::Zero, + .radius = 0.0f + }; - // Process all vertices in one loop to find min/max - for (size_t i = 1; i < vertexCount; ++i) - { - // Load vertex position directly - assumes position is aligned properly - const float* posPtr = p_vertices[i].position; - __m128 vPos = _mm_loadu_ps(posPtr); // Using loadu in case it's not 16-byte aligned + return; + } - // Update min and max in one pass - vMinXYZ = _mm_min_ps(vMinXYZ, vPos); - vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos); - } + // Initialize SIMD registers for min/max with first vertex values + __m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX); + __m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX); - // Calculate center = (min + max) * 0.5 - __m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f)); + // Process all vertices in one loop to find min/max + for (size_t i = 1; i < vertexCount; ++i) + { + // Load vertex position directly - assumes position is aligned properly + const float* posPtr = p_vertices[i].position; + __m128 vPos = _mm_loadu_ps(posPtr); // Using loadu in case it's not 16-byte aligned - // Store center position - float centerArr[4]; - _mm_store_ps(centerArr, vCenter); - auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] }; + // Update min and max in one pass + vMinXYZ = _mm_min_ps(vMinXYZ, vPos); + vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos); + } - // Calculate radius - use dot product for distance calculation - __m128 vMaxDistSq = _mm_setzero_ps(); + // Calculate center = (min + max) * 0.5 + __m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f)); - // Pre-load center vector once outside the loop - const __m128 vCenterXYZ = _mm_setr_ps( - center.x, - center.y, - center.z, - 0.0f - ); + // Store center position + float centerArr[4]; + _mm_store_ps(centerArr, vCenter); + auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] }; - // Unroll the loop by 4 for better throughput - size_t i = 0; - const size_t unrollCount = vertexCount & ~3ull; // Round down to multiple of 4 + // Calculate radius - use dot product for distance calculation + __m128 vMaxDistSq = _mm_setzero_ps(); - for (; i < unrollCount; i += 4) - { - // Load 4 vertices at once - const float* pos0 = p_vertices[i].position; - const float* pos1 = p_vertices[i + 1].position; - const float* pos2 = p_vertices[i + 2].position; - const float* pos3 = p_vertices[i + 3].position; - - __m128 vPos0 = _mm_loadu_ps(pos0); - __m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ); - __m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Dot product with mask 0x77 (sum xyz, store in all) - vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0); - - __m128 vPos1 = _mm_loadu_ps(pos1); - __m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ); - __m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77); - vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1); - - __m128 vPos2 = _mm_loadu_ps(pos2); - __m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ); - __m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77); - vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2); - - __m128 vPos3 = _mm_loadu_ps(pos3); - __m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ); - __m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77); - vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3); - } + // Pre-load center vector once outside the loop + const __m128 vCenterXYZ = _mm_setr_ps( + center.x, + center.y, + center.z, + 0.0f + ); - // Handle remaining vertices - for (; i < vertexCount; ++i) - { - const float* pos = p_vertices[i].position; - __m128 vPos = _mm_loadu_ps(pos); - __m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ); - __m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77); - vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq); - } + // Unroll the loop by 4 for better throughput + size_t i = 0; + const size_t unrollCount = vertexCount & ~3ull; // Round down to multiple of 4 - // Extract radius (sqrt of max squared distance) - float maxDistSq; - _mm_store_ss(&maxDistSq, vMaxDistSq); - - return { - .position = center, - .radius = std::sqrt(maxDistSq) - }; + for (; i < unrollCount; i += 4) + { + // Load 4 vertices at once + const float* pos0 = p_vertices[i].position; + const float* pos1 = p_vertices[i + 1].position; + const float* pos2 = p_vertices[i + 2].position; + const float* pos3 = p_vertices[i + 3].position; + + __m128 vPos0 = _mm_loadu_ps(pos0); + __m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ); + __m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Dot product with mask 0x77 (sum xyz, store in all) + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0); + + __m128 vPos1 = _mm_loadu_ps(pos1); + __m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ); + __m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1); + + __m128 vPos2 = _mm_loadu_ps(pos2); + __m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ); + __m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2); + + __m128 vPos3 = _mm_loadu_ps(pos3); + __m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ); + __m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3); } - OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular(std::span p_vertices) + // Handle remaining vertices + for (; i < vertexCount; ++i) { - auto result = OvRendering::Geometry::BoundingSphere{ - .position = OvMaths::FVector3::Zero, - .radius = 0.0f - }; - - if (!p_vertices.empty()) - { - float minX = std::numeric_limits::max(); - float minY = std::numeric_limits::max(); - float minZ = std::numeric_limits::max(); - - float maxX = std::numeric_limits::min(); - float maxY = std::numeric_limits::min(); - float maxZ = std::numeric_limits::min(); - - for (const auto& vertex : p_vertices) - { - minX = std::min(minX, vertex.position[0]); - minY = std::min(minY, vertex.position[1]); - minZ = std::min(minZ, vertex.position[2]); - - maxX = std::max(maxX, vertex.position[0]); - maxY = std::max(maxY, vertex.position[1]); - maxZ = std::max(maxZ, vertex.position[2]); - } - - result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f; - - for (const auto& vertex : p_vertices) - { - const auto& position = reinterpret_cast(vertex.position); - result.radius = std::max(result.radius, OvMaths::FVector3::Distance(result.position, position)); - } - } - - return result; + const float* pos = p_vertices[i].position; + __m128 vPos = _mm_loadu_ps(pos); + __m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ); + __m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77); + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq); } -} -void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span p_vertices) -{ - constexpr bool useSIMD = true; + // Extract radius (sqrt of max squared distance) + float maxDistSq; + _mm_store_ss(&maxDistSq, vMaxDistSq); - if constexpr (useSIMD) - { - m_boundingSphere = ComputeBoundingSphereSIMD(p_vertices); - } - else - { - m_boundingSphere = ComputeBoundingSphereRegular(p_vertices); - } + m_boundingSphere = { + .position = center, + .radius = std::sqrt(maxDistSq) + }; }