Skip to content

Commit

Permalink
Merged SSE matrix-shaper optimizer
Browse files Browse the repository at this point in the history
It makes a small difference, but here you go!
On my computer, with SSE the matrix shaper runs about 2.4x faster than core lcms2;
without SSE it is about 2x faster.
  • Loading branch information
mm2 committed May 25, 2020
1 parent f9e2e80 commit d881cc6
Show file tree
Hide file tree
Showing 9 changed files with 461 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
<ClCompile Include="..\..\src\fast_16_tethra.c" />
<ClCompile Include="..\..\src\fast_8_curves.c" />
<ClCompile Include="..\..\src\fast_8_matsh.c" />
<ClCompile Include="..\..\src\fast_8_matsh_sse.c" />
<ClCompile Include="..\..\src\fast_8_tethra.c" />
<ClCompile Include="..\..\src\fast_float_15bits.c" />
<ClCompile Include="..\..\src\fast_float_15mats.c" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@
<ClCompile Include="..\..\src\fast_float_cmyk.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\fast_8_matsh.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\fast_8_curves.c">
<Filter>Source Files</Filter>
</ClCompile>
Expand All @@ -57,6 +54,12 @@
<ClCompile Include="..\..\src\fast_16_tethra.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\fast_8_matsh_sse.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\fast_8_matsh.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\..\COPYING.GPL3">
Expand Down
2 changes: 1 addition & 1 deletion plugins/fast_float/src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ liblcms2_fast_float_la_LIBADD = $(LCMS_LIB_DEPLIBS) $(top_builddir)/src/liblcms2
liblcms2_fast_float_la_SOURCES = \
fast_float_15bits.c fast_float_15mats.c fast_float_curves.c fast_float_matsh.c fast_float_separate.c \
fast_float_sup.c fast_float_tethra.c fast_float_cmyk.c fast_float_internal.h \
fast_8_curves.c fast_8_matsh.c fast_8_tethra.c
fast_8_curves.c fast_8_matsh.c fast_8_matsh_sse.c fast_8_tethra.c

67 changes: 21 additions & 46 deletions plugins/fast_float/src/fast_8_matsh.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@ typedef cmsInt32Number cmsS1Fixed14Number; // Note that this may hold more tha
// This is the private data container used by this optimization
typedef struct {

// This is for SSE2, MUST be aligned at a 16-byte boundary
// Alignment makes it faster

cmsFloat32Number fMatrix[4][4];
cmsFloat32Number fShaper1[256 * 3];
cmsS1Fixed14Number Mat[4][4]; // n.14 to n.14 (needs a saturation after that)

void * real_ptr;

Expand All @@ -42,10 +41,7 @@ typedef struct {
cmsS1Fixed14Number Shaper1R[256]; // from 0..255 to 1.14 (0.0...1.0)
cmsS1Fixed14Number Shaper1G[256];
cmsS1Fixed14Number Shaper1B[256];

cmsS1Fixed14Number Mat[3][3]; // n.14 to n.14 (needs a saturation after that)
cmsS1Fixed14Number Off[3];


cmsUInt8Number Shaper2R[0x4001]; // 1.14 to 0..255
cmsUInt8Number Shaper2G[0x4001];
cmsUInt8Number Shaper2B[0x4001];
Expand Down Expand Up @@ -97,36 +93,24 @@ void FillFirstShaper(cmsS1Fixed14Number* Table, cmsToneCurve* Curve)
}
}

static
void FillFirstShaperFloat(cmsFloat32Number* Table, cmsToneCurve* Curve)
{
int i;
cmsFloat32Number R;

for (i=0; i < 256; i++) {

R = (cmsFloat32Number) (i / 255.0);

Table[i] = cmsEvalToneCurveFloat(Curve, R);
}
}


// This table converts from 1.14 (0x4000 being the last entry) to 8 bits after applying the curve
static
void FillSecondShaper(cmsUInt8Number* Table, cmsToneCurve* Curve)
{
int i;
cmsFloat32Number R, Val;
cmsUInt16Number w;
cmsInt32Number w;

for (i=0; i < 0x4001; i++) {

R = (cmsFloat32Number) (i / 16384.0);
R = (cmsFloat32Number) (i / 16384.0f);
Val = cmsEvalToneCurveFloat(Curve, R);
w = _cmsSaturateWord(Val * 65535.0 + 0.5);
w = (cmsInt32Number) (Val * 255.0f + 0.5f);
if (w < 0) w = 0;
if (w > 255) w = 255;

Table[i] = FROM_16_TO_8(w);
Table[i] = (cmsInt8Number) w;

}
}
Expand All @@ -153,30 +137,22 @@ XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cms
FillSecondShaper(p ->Shaper2G, Curve2[1]);
FillSecondShaper(p ->Shaper2B, Curve2[2]);


FillFirstShaperFloat(p ->fShaper1, Curve1[0]);
FillFirstShaperFloat(p ->fShaper1 + 256, Curve1[1]);
FillFirstShaperFloat(p ->fShaper1 + 256*2, Curve1[2]);


// Convert matrix to nFixed14. Note that those values may take more than 16 bits
for (i=0; i < 3; i++) {
for (j=0; j < 3; j++) {
p ->Mat[i][j] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]);
p ->fMatrix[j][i] = (cmsFloat32Number) Mat ->v[i].n[j];
p ->Mat[j][i] = DOUBLE_TO_1FIXED14(Mat->v[i].n[j]);
}
}



for (i=0; i < 3; i++) {

if (Off == NULL) {

p ->Off[i] = 0x2000;
p ->fMatrix[3][i] = 0.0f;

p->Mat[3][i] = DOUBLE_TO_1FIXED14(0.5);
}
else {
p ->Off[i] = DOUBLE_TO_1FIXED14(Off->n[i]) + 0x2000;
p ->fMatrix[3][i] = (cmsFloat32Number) Off->n[i];
else {
p->Mat[3][i] = DOUBLE_TO_1FIXED14(Off->n[i] + 0.5);
}
}

Expand Down Expand Up @@ -237,20 +213,19 @@ void MatShaperXform8(struct _cmstransform_struct *CMMcargo,
gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
if (nalpha)
aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;

aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;

for (ii = 0; ii < PixelsPerLine; ii++) {

// Across first shaper, which also converts to 1.14 fixed point. 16 bits guaranteed.
r = p->Shaper1R[*rin];
g = p->Shaper1G[*gin];
b = p->Shaper1B[*bin];

// Evaluate the matrix in 1.14 fixed point
l1 = (p->Mat[0][0] * r + p->Mat[0][1] * g + p->Mat[0][2] * b + p->Off[0]) >> 14;
l2 = (p->Mat[1][0] * r + p->Mat[1][1] * g + p->Mat[1][2] * b + p->Off[1]) >> 14;
l3 = (p->Mat[2][0] * r + p->Mat[2][1] * g + p->Mat[2][2] * b + p->Off[2]) >> 14;
l1 = (p->Mat[0][0] * r + p->Mat[1][0] * g + p->Mat[2][0] * b + p->Mat[3][0]) >> 14;
l2 = (p->Mat[0][1] * r + p->Mat[1][1] * g + p->Mat[2][1] * b + p->Mat[3][1]) >> 14;
l3 = (p->Mat[0][2] * r + p->Mat[1][2] * g + p->Mat[2][2] * b + p->Mat[3][2]) >> 14;


// Now we have to clip to 0..1.0 range
Expand Down
Loading

0 comments on commit d881cc6

Please sign in to comment.