From c027d6a51d826977c7c7dcc47b3a4274044c842f Mon Sep 17 00:00:00 2001
From: Architect
Date: Fri, 12 Apr 2024 15:18:22 -0500
Subject: [PATCH] Add support for GELU and approximate activation functions
 (#8224)

FEATURE

* add docker configs for isolated testing
* implement gelu and gelu_new as separate activations
* Update activations.ts
* Update activations_test.ts
* Update activations_test.ts
* remove docker files
* fix activation tests
* fix lint errors
* remove extra blank line
* fix gelu_new calc
* fix 1D test

---------

Co-authored-by: Ping Yu <4018+pyu10055@users.noreply.github.com>
Co-authored-by: Matthew Soulanille
---
 tfjs-layers/src/activations.ts                     | 74 ++++++++++++--
 tfjs-layers/src/activations_test.ts                | 97 ++++++++++++++++++-
 .../src/keras_format/activation_config.ts          |  4 +-
 .../layers/nlp/models/gpt2/gpt2_backbone.ts        |  3 +-
 4 files changed, 166 insertions(+), 12 deletions(-)

diff --git a/tfjs-layers/src/activations.ts b/tfjs-layers/src/activations.ts
index b5758ae2733..12849a47956 100644
--- a/tfjs-layers/src/activations.ts
+++ b/tfjs-layers/src/activations.ts
@@ -209,23 +209,64 @@ export class LogSoftmax extends Activation {
 serialization.registerClass(LogSoftmax);
 
 /**
- * Swish activation function
+ * Gelu activation function
  */
-export class Swish extends Activation {
+export class Gelu extends Activation {
   /** @nocollapse */
-  static readonly className = 'swish';
+  static readonly className = 'gelu';
   /**
    * Calculate the activation function.
    *
    * @param x Tensor.
-   * @param alpha Scaling factor for the sigmoid function.
    * @returns a Tensor of the same shape as x
    */
-  apply(x: Tensor, alpha = 1): Tensor {
-    return tidy(() => tfc.mul(tfc.sigmoid(tfc.mul(x, alpha)), x));
+  apply(x: Tensor): Tensor {
+    return tidy(() => {
+      const sqrtTwo = Math.sqrt(2);
+      // Compute Φ(x) using the erf function.
+      const cdf = tfc.mul(0.5, tfc.add(1, tfc.erf(tfc.div(x, sqrtTwo))));
+      // Compute GELU(x) = x * Φ(x).
+      return tfc.mul(x, cdf);
+    });
   }
 }
-serialization.registerClass(Swish);
+serialization.registerClass(Gelu);
+
+/**
+ * GeluNew activation function
+ */
+export class GeluNew extends Activation {
+  /** @nocollapse */
+  static readonly className = 'gelu_new';
+  /**
+   * Calculate the activation function.
+   *
+   * @param x Tensor.
+   * @returns a Tensor of the same shape as x
+   */
+  apply(x: Tensor): Tensor {
+    return tidy(() => {
+      // Tanh approximation of GELU:
+      // 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+      return tfc.mul(
+        0.5,
+        tfc.mul(
+          x,
+          tfc.add(
+            1,
+            tfc.tanh(
+              tfc.mul(
+                tfc.sqrt(tfc.div(2, Math.PI)),
+                tfc.add(x, tfc.mul(0.044715, tfc.pow(x, 3)))
+              )
+            )
+          )
+        )
+      );
+    });
+  }
+}
+serialization.registerClass(GeluNew);
 
 /**
  * Mish activation function
@@ -245,6 +286,25 @@ export class Mish extends Activation {
 }
 serialization.registerClass(Mish);
 
+/**
+ * Swish activation function
+ */
+export class Swish extends Activation {
+  /** @nocollapse */
+  static readonly className = 'swish';
+  /**
+   * Calculate the activation function.
+   *
+   * @param x Tensor.
+   * @param alpha Scaling factor for the sigmoid function.
+   * @returns a Tensor of the same shape as x
+   */
+  apply(x: Tensor, alpha = 1): Tensor {
+    return tidy(() => tfc.mul(tfc.sigmoid(tfc.mul(x, alpha)), x));
+  }
+}
+serialization.registerClass(Swish);
+
 export function serializeActivation(activation: Activation): string {
   return activation.getClassName();
 }
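For reference, `gelu` computes the exact form x * Φ(x) through `erf`, while `gelu_new` is the tanh approximation used by GPT-2. The standalone sketch below, not part of the diff above, shows how closely the two variants agree; the scalar `erf` helper is the Abramowitz-Stegun approximation, an assumption made only so the sketch runs without tfjs:

```ts
// Exact GELU vs. the tanh approximation, on plain numbers.

// Abramowitz-Stegun approximation of erf(x); absolute error below ~1.5e-7.
// Stands in for tfc.erf so this sketch has no tfjs dependency.
function erf(x: number): number {
  const sign = x < 0 ? -1 : 1;
  const ax = Math.abs(x);
  const t = 1 / (1 + 0.3275911 * ax);
  const poly = ((((1.061405429 * t - 1.453152027) * t + 1.421413741) * t -
      0.284496736) * t + 0.254829592) * t;
  return sign * (1 - poly * Math.exp(-ax * ax));
}

// gelu: x * Φ(x), where Φ is the standard normal CDF.
const gelu = (x: number) => x * 0.5 * (1 + erf(x / Math.SQRT2));

// gelu_new: 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).
const geluNew = (x: number) =>
    0.5 * x * (1 + Math.tanh(Math.sqrt(2 / Math.PI) * (x + 0.044715 * x ** 3)));

for (const x of [-1, 0, 1, 3, 9]) {
  console.log(x, gelu(x).toFixed(7), geluNew(x).toFixed(7));
}
// At x = 1: 0.8413447 (exact) vs. 0.8411920 (approximation), matching the
// constants asserted in the tests below.
```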
diff --git a/tfjs-layers/src/activations_test.ts b/tfjs-layers/src/activations_test.ts
index bc4d6289812..3ee38dc5d01 100644
--- a/tfjs-layers/src/activations_test.ts
+++ b/tfjs-layers/src/activations_test.ts
@@ -13,7 +13,7 @@
  */
 
 import {scalar, tensor1d, tensor2d, tensor3d} from '@tensorflow/tfjs-core';
-import {Elu, HardSigmoid, Linear, LogSoftmax, Relu, Relu6, Selu, Sigmoid, Softmax, Softplus, Softsign, Tanh, Swish, Mish} from './activations';
+import {Elu, HardSigmoid, Linear, LogSoftmax, Relu, Relu6, Selu, Sigmoid, Softmax, Softplus, Softsign, Tanh, Swish, Mish, Gelu, GeluNew} from './activations';
 import {describeMathCPUAndGPU, expectNoLeakedTensors, expectTensorsClose} from './utils/test_utils';
 
 describeMathCPUAndGPU('linear activation', () => {
@@ -366,3 +366,98 @@ describeMathCPUAndGPU('mish activation', () => {
     expectNoLeakedTensors(() => mish(initX), 1);
   });
 });
+
+describeMathCPUAndGPU('gelu activation', () => {
+  const gelu = new Gelu().apply;
+  // Setup: Array with initial values.
+  // Execute: Gelu elementwise.
+  // Expect: Output array matches size and approximate expected values.
+  it('1D', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    const expectedVals = tensor1d([
+      0,
+      0.8413447141647339,
+      2.995950222015381, 9
+    ]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('1D all equal', () => {
+    const initX = tensor1d([-1, -1, -1, -1]);
+    const expectedVals = tensor1d([
+      -0.15865525603294373,
+      -0.15865525603294373,
+      -0.15865525603294373,
+      -0.15865525603294373
+    ]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('2D', () => {
+    const initX = tensor2d([[0, 1, 3, 9], [0, 1, 3, 9]]);
+    const expectedVals = tensor2d([
+      [0, 0.8413447141647339, 2.995950222015381, 9],
+      [0, 0.8413447141647339, 2.995950222015381, 9]
+    ]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('3D', () => {
+    const initX = tensor3d([[[0, 1, 3, 9], [0, 1, 3, 9]]]);
+    const expectedVals = tensor3d([[
+      [ 0, 0.8413447141647339, 2.995950222015381, 9 ],
+      [ 0, 0.8413447141647339, 2.995950222015381, 9 ]
+    ]]);
+    expectTensorsClose(gelu(initX), expectedVals);
+  });
+  it('Does not leak', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    expectNoLeakedTensors(() => gelu(initX), 1);
+  });
+});
+
+describeMathCPUAndGPU('gelu_new activation', () => {
+  const geluNew = new GeluNew().apply;
+  // Setup: Array with initial values.
+  // Execute: GeluNew elementwise.
+  // Expect: Output array matches size and approximate expected values.
+  it('1D', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    const expectedVals = tensor1d([
+      0,
+      0.8411920070648193,
+      2.9963626861572266,
+      9
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('1D all equal', () => {
+    const initX = tensor1d([-1, -1, -1, -1]);
+    const expectedVals = tensor1d([
+      -0.15880802273750305,
+      -0.15880802273750305,
+      -0.15880802273750305,
+      -0.15880802273750305
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('2D', () => {
+    const initX = tensor2d([[0, 1, 3, 9], [0, 1, 3, 9]]);
+    const expectedVals = tensor2d([
+      [ 0, 0.8411920070648193, 2.9963626861572266, 9 ],
+      [ 0, 0.8411920070648193, 2.9963626861572266, 9 ]
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('3D', () => {
+    const initX = tensor3d([[[0, 1, 3, 9], [0, 1, 3, 9]]]);
+    const expectedVals = tensor3d([
+      [
+        [ 0, 0.8411920070648193, 2.9963626861572266, 9 ],
+        [ 0, 0.8411920070648193, 2.9963626861572266, 9 ]
+      ]
+    ]);
+    expectTensorsClose(geluNew(initX), expectedVals);
+  });
+  it('Does not leak', () => {
+    const initX = tensor1d([0, 1, 3, 9]);
+    expectNoLeakedTensors(() => geluNew(initX), 1);
+  });
+});
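Outside the test harness, both new classes resolve through the activation registry. A minimal usage sketch, assuming it sits next to `activations.ts` in the tfjs-layers source tree (the relative import path is illustrative):

```ts
import {tensor1d} from '@tensorflow/tfjs-core';
import {getActivation} from './activations';

// Both identifiers resolve by registered class name.
const gelu = getActivation('gelu');
const geluNew = getActivation('gelu_new');

const x = tensor1d([0, 1, 3, 9]);
gelu.apply(x).print();     // ~ [0, 0.8413447, 2.9959502, 9]
geluNew.apply(x).print();  // ~ [0, 0.8411920, 2.9963627, 9]
```

Detaching `apply` the way the tests do (`const gelu = new Gelu().apply;`) works because neither implementation references `this`.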
diff --git a/tfjs-layers/src/keras_format/activation_config.ts b/tfjs-layers/src/keras_format/activation_config.ts
index 791d622f1dd..92c4cc55de2 100644
--- a/tfjs-layers/src/keras_format/activation_config.ts
+++ b/tfjs-layers/src/keras_format/activation_config.ts
@@ -15,7 +15,7 @@ import {stringLiteralArray} from './utils';
  */
 export const activationOptions = stringLiteralArray([
   'elu', 'hard_sigmoid', 'linear', 'relu', 'relu6', 'selu', 'sigmoid',
-  'softmax', 'softplus', 'softsign', 'tanh', 'swish', 'mish'
+  'softmax', 'softplus', 'softsign', 'tanh', 'swish', 'mish', 'gelu', 'gelu_new'
 ]);
 
 /**
@@ -28,4 +28,4 @@ export type ActivationSerialization = typeof activationOptions[number];
 // e.g. to src/common.ts. Maybe even duplicate *all* of these to be pedantic?
 /** @docinline */
 export type ActivationIdentifier = 'elu'|'hardSigmoid'|'linear'|'relu'|'relu6'|
-  'selu'|'sigmoid'|'softmax'|'softplus'|'softsign'|'tanh'|'swish'|'mish';
+  'selu'|'sigmoid'|'softmax'|'softplus'|'softsign'|'tanh'|'swish'|'mish'|'gelu'|'gelu_new';
diff --git a/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts b/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
index 971f8868cc6..a90158fd17a 100644
--- a/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
+++ b/tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
@@ -170,8 +170,7 @@ export class GPT2Backbone extends Backbone {
       numHeads: args.numHeads,
       dropout: args.dropout,
       layerNormEpsilon: 1e-05,
-      // TODO(pforderique): Implement gelu.
-      activation: getActivation('relu'),
+      activation: getActivation('gelu'),
       kernelInitializer: gpt2KernelInitializer(0.02),
       normalizeFirst: true,
       name: `transformer_layer_${i}`,
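With `activation_config.ts` updated, the new strings type-check anywhere an `ActivationIdentifier` is accepted, not just inside the GPT-2 backbone. A sketch, assuming a `@tensorflow/tfjs` build that includes this patch:

```ts
import * as tf from '@tensorflow/tfjs';

// 'gelu' and 'gelu_new' can now be passed as string activations,
// the same way as 'relu' or 'tanh'.
const model = tf.sequential({
  layers: [
    tf.layers.dense({units: 8, inputShape: [4], activation: 'gelu'}),
    tf.layers.dense({units: 1, activation: 'gelu_new'}),
  ],
});
(model.predict(tf.ones([2, 4])) as tf.Tensor).print();
```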