Merged SSE matrix-shaper optimizer

It makes a small difference, but here you go! On my computer, the matrix shaper with SSE runs about 2.4x faster than core lcms2; without SSE it is about 2x faster.
mm2 · May 25, 2020 · d881cc6 · d881cc6
1 parent f9e2e80
commit d881cc6
Show file tree

Hide file tree

Showing 9 changed files with 461 additions and 55 deletions.
diff --git a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj
@@ -26,6 +26,7 @@
     <ClCompile Include="..\..\src\fast_16_tethra.c" />
     <ClCompile Include="..\..\src\fast_8_curves.c" />
     <ClCompile Include="..\..\src\fast_8_matsh.c" />
+    <ClCompile Include="..\..\src\fast_8_matsh_sse.c" />
     <ClCompile Include="..\..\src\fast_8_tethra.c" />
     <ClCompile Include="..\..\src\fast_float_15bits.c" />
     <ClCompile Include="..\..\src\fast_float_15mats.c" />

diff --git a/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters b/plugins/fast_float/Projects/VC2019/lcms2_fast_float_plugin.vcxproj.filters
@@ -45,9 +45,6 @@
     <ClCompile Include="..\..\src\fast_float_cmyk.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\fast_8_matsh.c">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\src\fast_8_curves.c">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -57,6 +54,12 @@
     <ClCompile Include="..\..\src\fast_16_tethra.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\fast_8_matsh_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\fast_8_matsh.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="..\..\COPYING.GPL3">

diff --git a/plugins/fast_float/src/Makefile.am b/plugins/fast_float/src/Makefile.am
@@ -24,5 +24,5 @@ liblcms2_fast_float_la_LIBADD = $(LCMS_LIB_DEPLIBS) $(top_builddir)/src/liblcms2
 liblcms2_fast_float_la_SOURCES = \
   fast_float_15bits.c fast_float_15mats.c fast_float_curves.c fast_float_matsh.c fast_float_separate.c \
   fast_float_sup.c fast_float_tethra.c fast_float_cmyk.c fast_float_internal.h \
-  fast_8_curves.c fast_8_matsh.c fast_8_tethra.c
+  fast_8_curves.c fast_8_matsh.c fast_8_matsh_sse.c fast_8_tethra.c
 
diff --git a/plugins/fast_float/src/fast_8_matsh.c b/plugins/fast_float/src/fast_8_matsh.c
@@ -30,10 +30,9 @@ typedef cmsInt32Number cmsS1Fixed14Number;   // Note that this may hold more tha
 // This is the private data container used by this optimization
 typedef struct {
 
-     // This is for SSE2, MUST be aligned at 16 bit boundary
+    // Alignment makes it faster
 
-    cmsFloat32Number  fMatrix[4][4];    
-    cmsFloat32Number  fShaper1[256 * 3];
+    cmsS1Fixed14Number Mat[4][4];     // n.14 to n.14 (needs a saturation after that)
 
     void * real_ptr;
 
@@ -42,10 +41,7 @@ typedef struct {
     cmsS1Fixed14Number Shaper1R[256];  // from 0..255 to 1.14  (0.0...1.0)
     cmsS1Fixed14Number Shaper1G[256];
     cmsS1Fixed14Number Shaper1B[256];
-
-    cmsS1Fixed14Number Mat[3][3];     // n.14 to n.14 (needs a saturation after that)
-    cmsS1Fixed14Number Off[3];
-
+
     cmsUInt8Number Shaper2R[0x4001];    // 1.14 to 0..255 
     cmsUInt8Number Shaper2G[0x4001];
     cmsUInt8Number Shaper2B[0x4001];    
@@ -97,36 +93,24 @@ void FillFirstShaper(cmsS1Fixed14Number* Table, cmsToneCurve* Curve)
     }
 }
 
-static
-void FillFirstShaperFloat(cmsFloat32Number* Table, cmsToneCurve* Curve)
-{
-    int i;
-    cmsFloat32Number R;
-
-    for (i=0; i < 256; i++) {
-
-        R   = (cmsFloat32Number) (i / 255.0);
-
-        Table[i] = cmsEvalToneCurveFloat(Curve, R);
-    }
-}
-
 
 // This table converts from 1.14 (0x4000 being the last entry) to 8 bits after applying the curve
 static
 void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve)
 {
     int i;
     cmsFloat32Number R, Val;
-    cmsUInt16Number w;
+    cmsInt32Number w;
 
     for (i=0; i < 0x4001; i++) {
 
-        R   = (cmsFloat32Number) (i / 16384.0);
+        R   = (cmsFloat32Number) (i / 16384.0f);
         Val = cmsEvalToneCurveFloat(Curve, R);    
-        w = _cmsSaturateWord(Val * 65535.0 + 0.5);        
+        w = (cmsInt32Number) (Val * 255.0f + 0.5f);
+        if (w < 0) w = 0;
+        if (w > 255) w = 255;
 
-        Table[i] = FROM_16_TO_8(w);
+        Table[i] = (cmsInt8Number) w;
 
     }
 }
@@ -153,30 +137,22 @@ XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cms
     FillSecondShaper(p ->Shaper2G, Curve2[1]);
     FillSecondShaper(p ->Shaper2B, Curve2[2]);
 
-
-    FillFirstShaperFloat(p ->fShaper1,         Curve1[0]);
-    FillFirstShaperFloat(p ->fShaper1 + 256,   Curve1[1]);
-    FillFirstShaperFloat(p ->fShaper1 + 256*2, Curve1[2]);
-
+
     // Convert matrix to nFixed14. Note that those values may take more than 16 bits.
     for (i=0; i < 3; i++) {
         for (j=0; j < 3; j++) {         
-            p ->Mat[i][j] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]);
-            p ->fMatrix[j][i] = (cmsFloat32Number) Mat ->v[i].n[j];
+            p ->Mat[j][i] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]);            
         }        
     }
-
-
+
     for (i=0; i < 3; i++) {
 
         if (Off == NULL) {   
-
-            p ->Off[i] =  0x2000;
-            p ->fMatrix[3][i] = 0.0f;
+
+            p->Mat[3][i] = DOUBLE_TO_1FIXED14(0.5);
         }
-        else {      
-            p ->Off[i] = DOUBLE_TO_1FIXED14(Off->n[i]) + 0x2000;
-            p ->fMatrix[3][i] = (cmsFloat32Number) Off->n[i];
+        else {                              
+            p->Mat[3][i] = DOUBLE_TO_1FIXED14(Off->n[i] + 0.5);
         }
     }
 
@@ -237,20 +213,19 @@ void MatShaperXform8(struct _cmstransform_struct *CMMcargo,
            gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
            bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
            if (nalpha)
-                  aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;
-
+                  aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;          
 
            for (ii = 0; ii < PixelsPerLine; ii++) {
-
+                            
                   // Across first shaper, which also converts to 1.14 fixed point. 16 bits guaranteed.
                   r = p->Shaper1R[*rin];
                   g = p->Shaper1G[*gin];
                   b = p->Shaper1B[*bin];
 
                   // Evaluate the matrix in 1.14 fixed point
-                  l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 14;
-                  l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 14;
-                  l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 14;
+                  l1 = (p->Mat[0][0] * r + p->Mat[1][0] * g + p->Mat[2][0] * b + p->Mat[3][0]) >> 14;
+                  l2 = (p->Mat[0][1] * r + p->Mat[1][1] * g + p->Mat[2][1] * b + p->Mat[3][1]) >> 14;
+                  l3 = (p->Mat[0][2] * r + p->Mat[1][2] * g + p->Mat[2][2] * b + p->Mat[3][2]) >> 14;
 
 
                   // Now we have to clip to 0..1.0 range