
Optimize conv2dDerInput on CPU to be 100x faster (tensorflow#995)
PERF
piscisaureus authored and dsmilkov committed Apr 25, 2018
1 parent 1ede7c7 commit ac95bf3
Showing 2 changed files with 27 additions and 19 deletions.
43 changes: 26 additions & 17 deletions src/kernels/backend_cpu.ts
@@ -984,44 +984,53 @@ export class MathBackendCPU implements KernelBackend {

   conv2dDerInput(dy: Tensor4D, filter: Tensor4D, convInfo: Conv2DInfo):
       Tensor4D {
-    const filterHeight = convInfo.filterHeight;
-    const filterWidth = convInfo.filterWidth;
+    const dx = ops.buffer<Rank.R4>(convInfo.inShape, 'float32');
+    const dxValues = dx.values;
+    const [dxS0, dxS1, dxS2] = dx.strides;
+    const dyValues = dy.dataSync();
+    const [dyS0, dyS1, dyS2] = dy.strides;
+    const fltValues = filter.dataSync();
+    const [fltS0, fltS1, fltS2] = filter.strides;
+    const {batchSize, filterHeight, filterWidth,
+           inChannels, inHeight, inWidth,
+           outChannels, outHeight, outWidth,
+           strideHeight, strideWidth} = convInfo;
     const topPad = filterHeight - 1 - convInfo.padInfo.top;
     const leftPad = filterWidth - 1 - convInfo.padInfo.left;
-    const strideHeight = convInfo.strideHeight;
-    const strideWidth = convInfo.strideWidth;
-    const dx = ops.buffer<Rank.R4>(convInfo.inShape, 'float32');

-    for (let b = 0; b < convInfo.batchSize; ++b) {
-      for (let d1 = 0; d1 < convInfo.inChannels; ++d1) {
-        for (let xR = 0; xR < convInfo.inHeight; ++xR) {
+    for (let b = 0; b < batchSize; ++b) {
+      for (let d1 = 0; d1 < inChannels; ++d1) {
+        for (let xR = 0; xR < inHeight; ++xR) {
           const xRCorner = xR - leftPad;
           const xRMin = Math.max(0, Math.ceil(xRCorner / strideHeight));
           const yRMax = Math.min(
-              convInfo.outHeight, (filterHeight + xRCorner) / strideHeight);
+              outHeight, (filterHeight + xRCorner) / strideHeight);

-          for (let xC = 0; xC < convInfo.inWidth; ++xC) {
+          for (let xC = 0; xC < inWidth; ++xC) {
             const xCCorner = xC - topPad;
             const xCMin = Math.max(0, Math.ceil(xCCorner / strideWidth));
             const yCMax = Math.min(
-                convInfo.outWidth, (filterWidth + xCCorner) / strideWidth);
+                outWidth, (filterWidth + xCCorner) / strideWidth);

             let dotProd = 0;
             for (let yR = xRMin; yR < yRMax; ++yR) {
               const wR = yR * strideHeight - xRCorner;

               for (let yC = xCMin; yC < yCMax; ++yC) {
                 const wC = yC * strideWidth - xCCorner;

-                for (let d2 = 0; d2 < convInfo.outChannels; ++d2) {
-                  const pixel = dy.get(b, yR, yC, d2);
-                  const weight = filter.get(
-                      filterHeight - 1 - wR, filterWidth - 1 - wC, d1, d2);
+                const dyOffset = dyS0 * b + dyS1 * yR + dyS2 * yC;
+                const fltOffset = fltS0 * (filterHeight - 1 - wR) +
+                    fltS1 * (filterWidth - 1 - wC) +
+                    fltS2 * d1;
+
+                for (let d2 = 0; d2 < outChannels; ++d2) {
+                  const pixel = dyValues[dyOffset + d2];
+                  const weight = fltValues[fltOffset + d2];
                   dotProd += pixel * weight;
                 }
               }
             }
-            dx.set(dotProd, b, xR, xC, d1);
+            dxValues[dxS0 * b + dxS1 * xR + dxS2 * xC + d1] = dotProd;
          }
        }
      }
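The speedup in the hunk above comes from two changes: the per-element dy.get()/filter.get()/dx.set() calls, each of which recomputes a flat offset behind a method call, are replaced by direct reads and writes on the backing TypedArrays, and the convInfo lookups are hoisted out of the hot loops into destructured locals. A minimal sketch of the same access-pattern change, independent of this commit (the Buf interface and the readAt/sumChannels names below are hypothetical, not tfjs-core APIs):

// Sketch only: illustrates the flat-indexing pattern used in the commit.
interface Buf {
  shape: [number, number, number, number];  // [batch, rows, cols, channels]
  strides: [number, number, number];        // strides for dims 0..2; dim 3 is contiguous
  values: Float32Array;
}

// Slow pattern: rebuild the flat offset from scratch on every access.
function readAt(t: Buf, b: number, r: number, c: number, d: number): number {
  return t.values[t.strides[0] * b + t.strides[1] * r + t.strides[2] * c + d];
}

// Fast pattern: destructure the strides once, precompute the partial offset per
// (b, r, c), and let the innermost channel loop index the flat array directly.
function sumChannels(t: Buf, b: number, r: number, c: number): number {
  const [s0, s1, s2] = t.strides;
  const offset = s0 * b + s1 * r + s2 * c;
  let sum = 0;
  for (let d = 0; d < t.shape[3]; ++d) {
    sum += t.values[offset + d];
  }
  return sum;
}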
3 changes: 1 addition & 2 deletions src/tensor.ts
@@ -38,10 +38,9 @@ export interface TensorData {
 export class TensorBuffer<R extends Rank> {
   size: number;
   shape: ShapeMap[R];
+  strides: number[];
   values: TypedArray;

-  private strides: number[];
-
   constructor(shape: ShapeMap[R], public dtype: DataType, values: TypedArray) {
     if (values != null) {
       const n = values.length;
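Making strides public (instead of private) lets code outside TensorBuffer compute flat offsets into values, which is exactly what the new conv2dDerInput does with dx.strides and dx.values. A hypothetical usage sketch, not taken from this commit, using a structural type in place of the real TensorBuffer:

// Assumes `buf` is a rank-4 buffer with shape [batch, rows, cols, channels];
// `strides` has one entry per leading dimension, and the last dimension is contiguous.
function setFlat(buf: {strides: number[], values: Float32Array},
                 b: number, r: number, c: number, d: number, value: number) {
  const [s0, s1, s2] = buf.strides;
  buf.values[s0 * b + s1 * r + s2 * c + d] = value;  // same effect as buf.set(value, b, r, c, d)
}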
