From c027d6a51d826977c7c7dcc47b3a4274044c842f Mon Sep 17 00:00:00 2001
From: Architect
Date: Fri, 12 Apr 2024 15:18:22 -0500
Subject: [PATCH] Add support for GELU and approximate activation functions
 (#8224)

FEATURE

* add docker configs for isolated testing
* implement gelu and gelu_new as separate activations
* Update activations.ts
* Update activations_test.ts
* Update activations_test.ts
* remove docker files
* fix activation tests
* fix lint errors
* remove extra blank line
* fix gelu_new calc
* fix 1D test

---------

Co-authored-by: Ping Yu <4018+pyu10055@users.noreply.github.com>
Co-authored-by: Matthew Soulanille
---
 tfjs-layers/src/activations.ts                     | 74 ++++++++++++--
 tfjs-layers/src/activations_test.ts                | 97 ++++++++++++++++++-
 .../src/keras_format/activation_config.ts          |  4 +-
 .../layers/nlp/models/gpt2/gpt2_backbone.ts        |  3 +-
 4 files changed, 166 insertions(+), 12 deletions(-)

diff --git a/tfjs-layers/src/activations.ts b/tfjs-layers/src/activations.ts
index b5758ae2733..12849a47956 100644
--- a/tfjs-layers/src/activations.ts
+++ b/tfjs-layers/src/activations.ts
@@ -209,23 +209,64 @@ export class LogSoftmax extends Activation {
 serialization.registerClass(LogSoftmax);
 
 /**
- * Swish activation function
+ * Gelu activation function
  */
-export class Swish extends Activation {
+export class Gelu extends Activation {
   /** @nocollapse */
-  static readonly className = 'swish';
+  static readonly className = 'gelu';
   /**
    * Calculate the activation function.
    *
    * @param x Tensor.
-   * @param alpha Scaling factor for the sigmoid function.
    * @returns a Tensor of the same shape as x
    */
-  apply(x: Tensor, alpha = 1): Tensor {
-    return tidy(() => tfc.mul(tfc.sigmoid(tfc.mul(x, alpha)), x));
+  apply(x: Tensor): Tensor {
+    return tidy(() => {
+      const sqrtTwo = Math.sqrt(2);
+      // Compute Φ(x) using the erf function.
+      const cdf = tfc.mul(0.5, tfc.add(1, tfc.erf(tfc.div(x, sqrtTwo))));
+      // Compute GELU(x) = x * Φ(x).
+      return tfc.mul(x, cdf);
+    });
   }
 }
-serialization.registerClass(Swish);
+serialization.registerClass(Gelu);
+
+/**
+ * GeluNew activation function
+ */
+export class GeluNew extends Activation {
+  /** @nocollapse */
+  static readonly className = 'gelu_new';
+  /**
+   * Calculate the activation function.
+   *
+   * @param x Tensor.
+   * @returns a Tensor of the same shape as x
+   */
+  apply(x: Tensor): Tensor {
+    return tidy(() => {
+      // Tanh approximation of GELU:
+      // 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+      return tfc.mul(
+        0.5,
+        tfc.mul(
+          x,
+          tfc.add(
+            1,
+            tfc.tanh(
+              tfc.mul(
+                tfc.sqrt(tfc.div(2, Math.PI)),
+                tfc.add(x, tfc.mul(0.044715, tfc.pow(x, 3)))
+              )
+            )
+          )
+        )
+      );
+    });
+  }
+}
+serialization.registerClass(GeluNew);
 
 /**
  * Mish activation function
@@ -245,6 +286,25 @@ export class Mish extends Activation {
 }
 serialization.registerClass(Mish);
 
+/**
+ * Swish activation function
+ */
+export class Swish extends Activation {
+  /** @nocollapse */
+  static readonly className = 'swish';
+  /**
+   * Calculate the activation function.
+   *
+   * @param x Tensor.
+   * @param alpha Scaling factor for the sigmoid function.
+   * @returns a Tensor of the same shape as x
+   */
+  apply(x: Tensor, alpha = 1): Tensor {
+    return tidy(() => tfc.mul(tfc.sigmoid(tfc.mul(x, alpha)), x));
+  }
+}
+serialization.registerClass(Swish);
+
 export function serializeActivation(activation: Activation): string {
   return activation.getClassName();
 }
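For reference, `gelu` computes the exact form x * Φ(x) through `erf`, while `gelu_new` is the tanh approximation used by GPT-2. The standalone sketch below, not part of the diff above, shows how closely the two variants agree; the scalar `erf` helper is the Abramowitz-Stegun approximation, an assumption made only so the sketch runs without tfjs:

```ts
// Exact GELU vs. the tanh approximation, on plain numbers.

// Abramowitz-Stegun approximation of erf(x); absolute error below ~1.5e-7.
// Stands in for tfc.erf so this sketch has no tfjs dependency.
function erf(x: number): number {
  const sign = x < 0 ? -1 : 1;
  const ax = Math.abs(x);
  const t = 1 / (1 + 0.3275911 * ax);
  const poly = ((((1.061405429 * t - 1.453152027) * t + 1.421413741) * t -
      0.284496736) * t + 0.254829592) * t;
  return sign * (1 - poly * Math.exp(-ax * ax));
}

// gelu: x * Φ(x), where Φ is the standard normal CDF.
const gelu = (x: number) => x * 0.5 * (1 + erf(x / Math.SQRT2));

// gelu_new: 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).
const geluNew = (x: number) =>
    0.5 * x * (1 + Math.tanh(Math.sqrt(2 / Math.PI) * (x + 0.044715 * x ** 3)));

for (const x of [-1, 0, 1, 3, 9]) {
  console.log(x, gelu(x).toFixed(7), geluNew(x).toFixed(7));
}
// At x = 1: 0.8413447 (exact) vs. 0.8411920 (approximation), matching the
// constants asserted in the tests below.
```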
diff --git a/tfjs-layers/src/activations_test.ts b/tfjs-layers/src/activations_test.ts
index bc4d6289812..3ee38dc5d01 100644
--- a/tfjs-layers/src/activations_test.ts
+++ b/tfjs-layers/src/activations_test.ts
@@ -13,7 +13,7 @@
  */
 
 import {scalar, tensor1d, tensor2d, tensor3d} from '@tensorflow/tfjs-core';
-import {Elu, HardSigmoid, Linear, LogSoftmax, Relu, Relu6, Selu, Sigmoid, Softmax, Softplus, Softsign, Tanh, Swish, Mish} from './activations';
+import {Elu, HardSigmoid, Linear, LogSoftmax, Relu, Relu6, Selu, Sigmoid, Softmax, Softplus, Softsign, Tanh, Swish, Mish, Gelu, GeluNew} from './activations';
 import {describeMathCPUAndGPU, expectNoLeakedTensors, expectTensorsClose} from './utils/test_utils';
 
 describeMathCPUAndGPU('linear activation', () => {
@@ -366,3 +366,98 @@ describeMathCPUAndGPU('mish activation', () => {
     expectNoLeakedTensors(() => mish(initX), 1);
   });
 });
+
+describeMathCPUAndGPU('gelu activation', () => {
+  const gelu = new Gelu().apply;
+  // Setup: Array with initial values.
+  // Execute: Gelu elementwise.
+  // Expect: Output array matches size and approximate expected values.
+  it('1D', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    const expectedVals = tensor1d([
+      0,
+      0.8413447141647339,
+      2.995950222015381, 9
+    ]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('1D all equal', () => {
+    const initX = tensor1d([-1, -1, -1, -1]);
+    const expectedVals = tensor1d([
+      -0.15865525603294373,
+      -0.15865525603294373,
+      -0.15865525603294373,
+      -0.15865525603294373
+    ]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('2D', () => {
+    const initX = tensor2d([[0, 1, 3, 9], [0, 1, 3, 9]]);
+    const expectedVals = tensor2d([
+      [0, 0.8413447141647339, 2.995950222015381, 9],
+      [0, 0.8413447141647339, 2.995950222015381, 9]
+    ]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('3D', () => {
+    const initX = tensor3d([[[0, 1, 3, 9], [0, 1, 3, 9]]]);
+    const expectedVals = tensor3d([[
+      [ 0, 0.8413447141647339, 2.995950222015381, 9 ],
+      [ 0, 0.8413447141647339, 2.995950222015381, 9 ]
+    ]]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('Does not leak', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    expectNoLeakedTensors(() => gelu(initX), 1);
+  });
+});
+
+describeMathCPUAndGPU('gelu_new activation', () => {
+  const geluNew = new GeluNew().apply;
+  // Setup: Array with initial values.
+  // Execute: GeluNew elementwise.
+  // Expect: Output array matches size and approximate expected values.
+  it('1D', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    const expectedVals = tensor1d([
+      0,
+      0.8411920070648193,
+      2.9963626861572266,
+      9
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('1D all equal', () => {
+    const initX = tensor1d([-1, -1, -1, -1]);
+    const expectedVals = tensor1d([
+      -0.15880802273750305,
+      -0.15880802273750305,
+      -0.15880802273750305,
+      -0.15880802273750305
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('2D', () => {
+    const initX = tensor2d([[0, 1, 3, 9], [0, 1, 3, 9]]);
+    const expectedVals = tensor2d([
+      [ 0, 0.8411920070648193, 2.9963626861572266, 9 ],
+      [ 0, 0.8411920070648193, 2.9963626861572266, 9 ]
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('3D', () => {
+    const initX = tensor3d([[[0, 1, 3, 9], [0, 1, 3, 9]]]);
+    const expectedVals = tensor3d([
+      [
+        [ 0, 0.8411920070648193, 2.9963626861572266, 9 ],
+        [ 0, 0.8411920070648193, 2.9963626861572266, 9 ]
+      ]
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('Does not leak', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    expectNoLeakedTensors(() => geluNew(initX), 1);
+  });
+});
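Outside the test harness, both new classes resolve through the activation registry. A minimal usage sketch, assuming it sits next to `activations.ts` in the tfjs-layers source tree (the relative import path is illustrative):

```ts
import {tensor1d} from '@tensorflow/tfjs-core';
import {getActivation} from './activations';

// Both identifiers resolve by registered class name.
const gelu = getActivation('gelu');
const geluNew = getActivation('gelu_new');

const x = tensor1d([0, 1, 3, 9]);
gelu.apply(x).print();     // ~ [0, 0.8413447, 2.9959502, 9]
geluNew.apply(x).print();  // ~ [0, 0.8411920, 2.9963627, 9]
```

Detaching `apply` the way the tests do (`const gelu = new Gelu().apply;`) works because neither implementation references `this`.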
diff --git a/tfjs-layers/src/keras_format/activation_config.ts b/tfjs-layers/src/keras_format/activation_config.ts
index 791d622f1dd..92c4cc55de2 100644
--- a/tfjs-layers/src/keras_format/activation_config.ts
+++ b/tfjs-layers/src/keras_format/activation_config.ts
@@ -15,7 +15,7 @@ import {stringLiteralArray} from './utils';
  */
 export const activationOptions = stringLiteralArray([
   'elu', 'hard_sigmoid', 'linear', 'relu', 'relu6', 'selu', 'sigmoid',
-  'softmax', 'softplus', 'softsign', 'tanh', 'swish', 'mish'
+  'softmax', 'softplus', 'softsign', 'tanh', 'swish', 'mish', 'gelu', 'gelu_new'
 ]);
 
 /**
@@ -28,4 +28,4 @@ export type ActivationSerialization = typeof activationOptions[number];
 // e.g. to src/common.ts. Maybe even duplicate *all* of these to be pedantic?
 /** @docinline */
 export type ActivationIdentifier = 'elu'|'hardSigmoid'|'linear'|'relu'|'relu6'|
-  'selu'|'sigmoid'|'softmax'|'softplus'|'softsign'|'tanh'|'swish'|'mish';
+  'selu'|'sigmoid'|'softmax'|'softplus'|'softsign'|'tanh'|'swish'|'mish'|'gelu'|'gelu_new';
diff --git a/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts b/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
index 971f8868cc6..a90158fd17a 100644
--- a/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
+++ b/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
@@ -170,8 +170,7 @@ export class GPT2Backbone extends Backbone {
       numHeads: args.numHeads,
       dropout: args.dropout,
       layerNormEpsilon: 1e-05,
-      // TODO(pforderique): Implement gelu.
-      activation: getActivation('relu'),
+      activation: getActivation('gelu'),
       kernelInitializer: gpt2KernelInitializer(0.02),
       normalizeFirst: true,
       name: `transformer_layer_${i}`,
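With `activation_config.ts` updated, the new strings type-check anywhere an `ActivationIdentifier` is accepted, not just inside the GPT-2 backbone. A sketch, assuming a `@tensorflow/tfjs` build that includes this patch:

```ts
import * as tf from '@tensorflow/tfjs';

// 'gelu' and 'gelu_new' can now be passed as string activations,
// the same way as 'relu' or 'tanh'.
const model = tf.sequential({
  layers: [
    tf.layers.dense({units: 8, inputShape: [4], activation: 'gelu'}),
    tf.layers.dense({units: 1, activation: 'gelu_new'}),
  ],
});
(model.predict(tf.ones([2, 4])) as tf.Tensor).print();
```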