
Optimize conv2dDerInput on CPU to be 100x faster (tensorflow#995)
PERF
piscisaureus authored and dsmilkov committed Apr 25, 2018
1 parent 1ede7c7 commit ac95bf3
Showing 2 changed files with 27 additions and 19 deletions.
43 changes: 26 additions & 17 deletions src/kernels/backend_cpu.ts
@@ -984,44 +984,53 @@ export class MathBackendCPU implements KernelBackend {

   conv2dDerInput(dy: Tensor4D, filter: Tensor4D, convInfo: Conv2DInfo):
       Tensor4D {
-    const filterHeight = convInfo.filterHeight;
-    const filterWidth = convInfo.filterWidth;
+    const dx = ops.buffer<Rank.R4>(convInfo.inShape, 'float32');
+    const dxValues = dx.values;
+    const [dxS0, dxS1, dxS2] = dx.strides;
+    const dyValues = dy.dataSync();
+    const [dyS0, dyS1, dyS2] = dy.strides;
+    const fltValues = filter.dataSync();
+    const [fltS0, fltS1, fltS2] = filter.strides;
+    const {batchSize, filterHeight, filterWidth,
+           inChannels, inHeight, inWidth,
+           outChannels, outHeight, outWidth,
+           strideHeight, strideWidth} = convInfo;
     const topPad = filterHeight - 1 - convInfo.padInfo.top;
     const leftPad = filterWidth - 1 - convInfo.padInfo.left;
-    const strideHeight = convInfo.strideHeight;
-    const strideWidth = convInfo.strideWidth;
-    const dx = ops.buffer<Rank.R4>(convInfo.inShape, 'float32');

-    for (let b = 0; b < convInfo.batchSize; ++b) {
-      for (let d1 = 0; d1 < convInfo.inChannels; ++d1) {
-        for (let xR = 0; xR < convInfo.inHeight; ++xR) {
+    for (let b = 0; b < batchSize; ++b) {
+      for (let d1 = 0; d1 < inChannels; ++d1) {
+        for (let xR = 0; xR < inHeight; ++xR) {
           const xRCorner = xR - leftPad;
           const xRMin = Math.max(0, Math.ceil(xRCorner / strideHeight));
           const yRMax = Math.min(
-              convInfo.outHeight, (filterHeight + xRCorner) / strideHeight);
+              outHeight, (filterHeight + xRCorner) / strideHeight);

-          for (let xC = 0; xC < convInfo.inWidth; ++xC) {
+          for (let xC = 0; xC < inWidth; ++xC) {
             const xCCorner = xC - topPad;
             const xCMin = Math.max(0, Math.ceil(xCCorner / strideWidth));
             const yCMax = Math.min(
-                convInfo.outWidth, (filterWidth + xCCorner) / strideWidth);
+                outWidth, (filterWidth + xCCorner) / strideWidth);

             let dotProd = 0;
             for (let yR = xRMin; yR < yRMax; ++yR) {
               const wR = yR * strideHeight - xRCorner;

               for (let yC = xCMin; yC < yCMax; ++yC) {
                 const wC = yC * strideWidth - xCCorner;

-                for (let d2 = 0; d2 < convInfo.outChannels; ++d2) {
-                  const pixel = dy.get(b, yR, yC, d2);
-                  const weight = filter.get(
-                      filterHeight - 1 - wR, filterWidth - 1 - wC, d1, d2);
+                const dyOffset = dyS0 * b + dyS1 * yR + dyS2 * yC;
+                const fltOffset = fltS0 * (filterHeight - 1 - wR) +
+                    fltS1 * (filterWidth - 1 - wC) +
+                    fltS2 * d1;
+
+                for (let d2 = 0; d2 < outChannels; ++d2) {
+                  const pixel = dyValues[dyOffset + d2];
+                  const weight = fltValues[fltOffset + d2];
                   dotProd += pixel * weight;
                 }
               }
             }
-            dx.set(dotProd, b, xR, xC, d1);
+            dxValues[dxS0 * b + dxS1 * xR + dxS2 * xC + d1] = dotProd;
          }
        }
      }
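The speedup in the hunk above comes from two changes: the per-element dy.get()/filter.get()/dx.set() calls, each of which recomputes a flat offset behind a method call, are replaced by direct reads and writes on the backing TypedArrays, and the convInfo lookups are hoisted out of the hot loops into destructured locals. A minimal sketch of the same access-pattern change, independent of this commit (the Buf interface and the readAt/sumChannels names below are hypothetical, not tfjs-core APIs):

// Sketch only: illustrates the flat-indexing pattern used in the commit.
interface Buf {
  shape: [number, number, number, number];  // [batch, rows, cols, channels]
  strides: [number, number, number];        // strides for dims 0..2; dim 3 is contiguous
  values: Float32Array;
}

// Slow pattern: rebuild the flat offset from scratch on every access.
function readAt(t: Buf, b: number, r: number, c: number, d: number): number {
  return t.values[t.strides[0] * b + t.strides[1] * r + t.strides[2] * c + d];
}

// Fast pattern: destructure the strides once, precompute the partial offset per
// (b, r, c), and let the innermost channel loop index the flat array directly.
function sumChannels(t: Buf, b: number, r: number, c: number): number {
  const [s0, s1, s2] = t.strides;
  const offset = s0 * b + s1 * r + s2 * c;
  let sum = 0;
  for (let d = 0; d < t.shape[3]; ++d) {
    sum += t.values[offset + d];
  }
  return sum;
}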
3 changes: 1 addition & 2 deletions src/tensor.ts
@@ -38,10 +38,9 @@ export interface TensorData {
 export class TensorBuffer<R extends Rank> {
   size: number;
   shape: ShapeMap[R];
+  strides: number[];
   values: TypedArray;

-  private strides: number[];
-
   constructor(shape: ShapeMap[R], public dtype: DataType, values: TypedArray) {
     if (values != null) {
       const n = values.length;
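Making strides public (instead of private) lets code outside TensorBuffer compute flat offsets into values, which is exactly what the new conv2dDerInput does with dx.strides and dx.values. A hypothetical usage sketch, not taken from this commit, using a structural type in place of the real TensorBuffer:

// Assumes `buf` is a rank-4 buffer with shape [batch, rows, cols, channels];
// `strides` has one entry per leading dimension, and the last dimension is contiguous.
function setFlat(buf: {strides: number[], values: Float32Array},
                 b: number, r: number, c: number, d: number, value: number) {
  const [s0, s1, s2] = buf.strides;
  buf.values[s0 * b + s1 * r + s2 * c + d] = value;  // same effect as buf.set(value, b, r, c, d)
}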
