Skip to content

Commit 7ac4d63

Browse files
committed
Added simple SSE3 support for the convolutional layer's forward pass (very early, experimental version)
1 parent 647de6a commit 7ac4d63

File tree

13 files changed

+46 / -23 lines changed

13 files changed

+46 / -23 lines changed

lib/.ycm_extra_conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@
44

55
def FlagsForFile(filename):
66
return {
7-
'flags' : ['-ffast-math', '-Wall', '-msse2', '-D HAVE_SSE2', '-D HAVE_LIBJPEG', '-D HAVE_LIBPNG', '-D HAVE_GSL', '-D HAVE_FFTW3', '-D HAVE_LIBLINEAR', '-D HAVE_CBLAS', '-D HAVE_AVCODEC', '-D HAVE_AVFORMAT', '-D HAVE_SWSCALE'],
7+
'flags' : ['-ffast-math', '-Wall', '-msse3', '-D HAVE_SSE3', '-D HAVE_LIBJPEG', '-D HAVE_LIBPNG', '-D HAVE_GSL', '-D HAVE_FFTW3', '-D HAVE_LIBLINEAR', '-D HAVE_CBLAS', '-D HAVE_AVCODEC', '-D HAVE_AVFORMAT', '-D HAVE_SWSCALE'],
88
'do_cache' : True
99
}

lib/3rdparty/dsfmt/dSFMT-common.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
#include "dSFMT.h"
2525

26-
#if defined(HAVE_SSE2)
26+
#if defined(HAVE_SSE3)
2727
# include <emmintrin.h>
2828
union X128I_T {
2929
uint64_t u[2];
@@ -66,7 +66,7 @@ inline static void do_recursion(dw128_t *r, dw128_t *a, dw128_t * b,
6666
r->s = vec_xor(z, x);
6767
lung->s = w;
6868
}
69-
#elif defined(HAVE_SSE2)
69+
#elif defined(HAVE_SSE3)
7070
/**
7171
* This function represents the recursion formula.
7272
* @param r output 128-bit

lib/3rdparty/dsfmt/dSFMT-params.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
#define DSFMT_SR 12
3636

3737
/* for sse2 */
38-
#if defined(HAVE_SSE2)
38+
#if defined(HAVE_SSE3)
3939
#define SSE2_SHUFF 0x1b
4040
#elif defined(HAVE_ALTIVEC)
4141
#if defined(__APPLE__) /* For OSX */

lib/3rdparty/dsfmt/dSFMT.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ inline static int idxof(int i);
4343
static void initial_mask(dsfmt_t *dsfmt);
4444
static void period_certification(dsfmt_t *dsfmt);
4545

46-
#if defined(HAVE_SSE2)
46+
#if defined(HAVE_SSE3)
4747
/** 1 in 64bit for sse2 */
4848
static const union X128I_T sse2_int_one = {{1, 1}};
4949
/** 2.0 double for sse2 */
@@ -66,7 +66,7 @@ inline static int idxof(int i) {
6666
}
6767
#endif
6868

69-
#if defined(HAVE_SSE2)
69+
#if defined(HAVE_SSE3)
7070
/**
7171
* This function converts the double precision floating point numbers which
7272
* distribute uniformly in the range [1, 2) to those which distribute uniformly

lib/3rdparty/dsfmt/dSFMT.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ union dW128_T {
138138
double d[2];
139139
};
140140

141-
#elif defined(HAVE_SSE2)
141+
#elif defined(HAVE_SSE3)
142142
# include <emmintrin.h>
143143

144144
/** 128-bit data structure */

lib/3rdparty/sfmt/SFMT.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ extern "C" {
4141
/**
4242
* parameters used by sse2.
4343
*/
44-
#ifdef HAVE_SSE2
44+
#ifdef HAVE_SSE3
4545
static const w128_t sse2_param_mask = {{SFMT_MSK1, SFMT_MSK2,
4646
SFMT_MSK3, SFMT_MSK4}};
4747
#endif
@@ -59,7 +59,7 @@ inline static void swap(w128_t *array, int size);
5959

6060
#if defined(HAVE_ALTIVEC)
6161
#include "SFMT-alti.h"
62-
#elif defined(HAVE_SSE2)
62+
#elif defined(HAVE_SSE3)
6363
#include "SFMT-sse2.h"
6464
#endif
6565

@@ -77,7 +77,7 @@ inline static int idxof(int i) {
7777
}
7878
#endif
7979

80-
#if (!defined(HAVE_ALTIVEC)) && (!defined(HAVE_SSE2))
80+
#if (!defined(HAVE_ALTIVEC)) && (!defined(HAVE_SSE3))
8181
/**
8282
* This function fills the user-specified array with pseudorandom
8383
* integers.
@@ -228,7 +228,7 @@ int sfmt_get_min_array_size64(sfmt_t * sfmt) {
228228
return SFMT_N64;
229229
}
230230

231-
#if !defined(HAVE_SSE2) && !defined(HAVE_ALTIVEC)
231+
#if !defined(HAVE_SSE3) && !defined(HAVE_ALTIVEC)
232232
/**
233233
* This function fills the internal state array with pseudorandom
234234
* integers.

lib/3rdparty/sfmt/SFMT.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ union W128_T {
7979
uint32_t u[4];
8080
uint64_t u64[2];
8181
};
82-
#elif defined(HAVE_SSE2)
82+
#elif defined(HAVE_SSE3)
8383
#include <emmintrin.h>
8484

8585
/** 128-bit data structure */

lib/ccv.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
#include <string.h>
1515
#include <float.h>
1616
#include <math.h>
17-
#ifdef HAVE_SSE2
18-
#include <xmmintrin.h>
17+
#ifndef __CUDACC__
18+
#ifdef HAVE_SSE3
19+
#include <x86intrin.h>
20+
#endif
1921
#endif
2022
#include <assert.h>
2123
#include <alloca.h>

lib/ccv_basic.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
221221
{
222222
int i = 0;
223223
float scale = (float)(180.0 / CCV_PI);
224-
#ifdef HAVE_SSE2
224+
#ifdef HAVE_SSE3
225225
#ifndef _WIN32
226226
union { int i; float fl; } iabsmask; iabsmask.i = 0x7fffffff;
227227
__m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);

lib/ccv_convnet.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,14 +152,35 @@ static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* la
152152
float* w = layer_w + comx * ch_per_partition + comy;
153153
float* apz = ap + ccv_max(j * strides - border, 0) * ch;
154154
// when we have border, we simply do zero padding
155+
#if HAVE_SSE3
156+
__m128 v4 = _mm_setzero_ps();
157+
#endif
155158
for (y = 0; y < maxy; y++)
156159
{
157160
for (x = 0; x < maxx; x++)
158-
for (c = 0; c < ch_per_partition; c++)
161+
{
162+
c = 0;
163+
#if HAVE_SSE3
164+
for (; c < ch_per_partition - 4; c += 4)
165+
{
166+
__m128 w4 = _mm_loadu_ps(w + x * ch_per_partition + c);
167+
__m128 apz4 = _mm_loadu_ps(apz + x * ch + c);
168+
v4 = _mm_add_ps(_mm_mul_ps(w4, apz4), v4);
169+
}
170+
#endif
171+
for (; c < ch_per_partition; c++)
159172
v += w[x * ch_per_partition + c] * apz[x * ch + c];
173+
}
160174
w += kernel_cols * ch_per_partition;
161175
apz += a->cols * ch;
162176
}
177+
#if HAVE_SSE3
178+
v4 = _mm_hadd_ps(v4, v4);
179+
v4 = _mm_hadd_ps(v4, v4);
180+
float pv;
181+
_mm_store_ss(&pv, v4);
182+
v += pv;
183+
#endif
163184
bp[j * count] = ccv_max(0, v); // ReLU
164185
}
165186
bp += db->cols * count;

0 commit comments

Comments
 (0)