From 0af70b0283b7f147850c8718a98bd88a11931336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pedersen?= Date: Thu, 11 May 2023 08:27:51 +0200 Subject: [PATCH 1/5] Added tests for tf <2.8 + refactored tests --- .github/workflows/test.yml | 44 +++++++++++++++++ tests/test_model_expected_results.py | 30 +++++++++++ tests/test_optimizer_wrapper.py | 10 +--- tests/utils.py | 74 ++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+), 8 deletions(-) create mode 100644 tests/test_model_expected_results.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ece3dd8..2a46c6f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,3 +98,47 @@ jobs: pytest -v tests/test_mp_batch_norm.py pytest -v tests/test_optimizer_distribute.py pytest -v tests/test_model_distribute.py + + + tf-compability: + needs: build + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-20.04] + python-version: ["3.6"] + tf-version: [2.2.0, 2.3.0, 2.4.0, 2.5.0, 2.6.2] + + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: pip install wheel setuptools flake8 pytest-cov + + - name: Install tensorflow-datasets + run: | + pip install tensorflow==${{ matrix.tf-version }} "tensorflow-datasets<=4.8.2" + pip install "protobuf<=3.20" --force-reinstall + + - name: Download artifact + uses: actions/download-artifact@master + with: + name: "Python wheel" + + - name: Install wheel + run: pip install --find-links=${{github.workspace}} gradient_accumulator + + - name: Debug pip deps + run: pip list + + - name: Test library accessibility + run: python -c "from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer" + + - name: Run tests + run: | + pytest -v tests/test_model_expected_result.py + pytest -v tests/test_optimizer_wrapper.py diff --git a/tests/test_model_expected_results.py b/tests/test_model_expected_results.py new file mode 100644 index 0000000..c266332 --- /dev/null +++ b/tests/test_model_expected_results.py @@ -0,0 +1,30 @@ +import numpy as np +import tensorflow as tf +from .utils import get_opt, normalize_img, reset, run_experiment +from tensorflow.keras.models import load_model +from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer + + +# get current tf minor version +tf_version = int(tf.version.VERSION.split(".")[1]) + + +def test_model_expected_result(): + # set seed + reset() + + # run once + result1 = run_experiment(bs=100, accum_steps=1, epochs=2, modeloropt="model") + + # reset before second run to get identical results + reset() + + # test with model wrapper instead + result2 = run_experiment(bs=50, accum_steps=2, epochs=2, modeloropt="model") + + # results should be identical (theoretically, even in practice on CPU) + if tf_version <= 10: + assert result1 == result2 + else: + # approximation worse for tf >= 2.11 + np.testing.assert_almost_equal(result1, result2, decimal=2) diff --git a/tests/test_optimizer_wrapper.py b/tests/test_optimizer_wrapper.py index a09e829..f4536b4 100644 --- a/tests/test_optimizer_wrapper.py +++ b/tests/test_optimizer_wrapper.py @@ -39,14 +39,8 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): ]) # wrap optimizer to add gradient accumulation support - # opt = tf.keras.optimizers.Adam(learning_rate=1e-3) - # need to dynamically handle which Optimizer class to use dependent on tf version - if tf_version > 10: - curr_opt = tf.keras.optimizers.legacy.SGD(learning_rate=1e-2) - else: - curr_opt = tf.keras.optimizers.SGD(learning_rate=1e-2) # IDENTICAL RESULTS WITH SGD!!! - - opt = GradientAccumulateOptimizer(optimizer=curr_opt, accum_steps=accum_steps, reduction="MEAN") # MEAN REDUCTION IMPORTANT!!! + opt = get_opt("SGD") + opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps, reduction="MEAN") # MEAN REDUCTION IMPORTANT!!! # compile model model.compile( diff --git a/tests/utils.py b/tests/utils.py index 64f73ac..1103840 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,7 +1,14 @@ import random as python_random import tensorflow as tf +import tensorflow_datasets as tfds import numpy as np import os +from tensorflow.keras.models import load_model +from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer + + +# get current tf minor version +tf_version = int(tf.version.VERSION.split(".")[1]) def reset(seed=123): @@ -74,3 +81,70 @@ def get_opt(opt_name, tf_version=None): def normalize_img(image, label): """Normalizes images: `uint8` -> `float32`.""" return tf.cast(image, tf.float32) / 255., label + + +def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): + # load dataset + (ds_train, ds_test), ds_info = tfds.load( + 'mnist', + split=['train', 'test'], + shuffle_files=True, + as_supervised=True, + with_info=True, + ) + + # build train pipeline + ds_train = ds_train.map(normalize_img) + ds_train = ds_train.batch(bs) + ds_train = ds_train.prefetch(1) + + # build test pipeline + ds_test = ds_test.map(normalize_img) + ds_test = ds_test.batch(bs) + ds_test = ds_test.prefetch(1) + + # create model + input = tf.keras.layers.Input(shape=(28, 28)) + x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) + x = tf.keras.layers.Dense(128, activation='relu')(x) + output = tf.keras.layers.Dense(10)(x) + + opt = get_opt(opt_name="SGD", tf_version=tf_version) + + if accum_steps == 1: + model = tf.keras.Model(inputs=input, outputs=output) + else: + if modeloropt == "model": + # wrap model to use gradient accumulation + model = GradientAccumulateModel(accum_steps=accum_steps, inputs=input, outputs=output) + else: + # wrap optimizer to use gradient accumulation + opt = GradientAccumulateOptimizer(opt, accum_steps=accum_steps) + + # compile model + model = tf.keras.Model(inputs=input, outputs=output) + + # compile model + model.compile( + optimizer=opt, + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.SparseCategoricalAccuracy()], + ) + + # train model + model.fit( + ds_train, + epochs=epochs, + validation_data=ds_test, + ) + + model.save("./trained_model") + + # load trained model and test + del model + trained_model = load_model("./trained_model", compile=True) + + result = trained_model.evaluate(ds_test, verbose=1) + print(result) + + return result[1] From d90f246bf81e7ba24425530289792aec1501e52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pedersen?= Date: Thu, 11 May 2023 08:31:04 +0200 Subject: [PATCH 2/5] Refactored tests; isort + black --- tests/test_adaptive_gradient_clipping.py | 42 ++++++++--- tests/test_batch_norm.py | 50 ++++++++----- tests/test_bn_convnd.py | 63 +++++++++------- tests/test_expected_result.py | 27 ++++--- tests/test_mixed_precision.py | 47 ++++++++---- tests/test_model_distribute.py | 37 +++++----- tests/test_model_expected_results.py | 13 +++- tests/test_mp_batch_norm.py | 93 ++++++++++++++++-------- tests/test_multitask.py | 38 +++++++--- tests/test_optimizer_distribute.py | 55 ++++++++++---- tests/test_optimizer_invariance.py | 53 ++++++++++---- tests/test_optimizer_wrapper.py | 38 ++++++---- tests/test_param_count.py | 28 ++++--- tests/test_sparse_optimizer.py | 59 +++++++++------ tests/utils.py | 30 +++++--- 15 files changed, 439 insertions(+), 234 deletions(-) diff --git a/tests/test_adaptive_gradient_clipping.py b/tests/test_adaptive_gradient_clipping.py index 9346b18..d68a8ea 100644 --- a/tests/test_adaptive_gradient_clipping.py +++ b/tests/test_adaptive_gradient_clipping.py @@ -1,16 +1,24 @@ +import os + import tensorflow as tf import tensorflow_datasets as tfds +from tensorflow.keras import mixed_precision from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator import unitwise_norm -from tensorflow.keras import mixed_precision -import os + from .utils import normalize_img def test_unitwise_norm(): for i in range(7): - x = tf.zeros([1,] * i) + x = tf.zeros( + [ + 1, + ] + * i + ) try: unitwise_norm(x) except ValueError as e: @@ -22,8 +30,8 @@ def test_unitwise_norm(): def test_train_mnist(): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -35,7 +43,7 @@ def test_train_mnist(): # build train pipeline ds_train = ds_train.map(normalize_img) ds_train = ds_train.cache() - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(100) # multiplum of 8 ds_train = ds_train.prefetch(1) @@ -46,14 +54,24 @@ def test_train_mnist(): ds_test = ds_test.prefetch(1) # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32, activation='relu'), # 32 multiplum of 8 - tf.keras.layers.Dense(10, dtype='float32') # output not numerically stable with float16 - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32, activation="relu"), # 32 multiplum of 8 + tf.keras.layers.Dense( + 10, dtype="float32" + ), # output not numerically stable with float16 + ] + ) # wrap model to use gradient accumulation - model = GradientAccumulateModel(accum_steps=4, mixed_precision=False, use_agc=True, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=4, + mixed_precision=False, + use_agc=True, + inputs=model.input, + outputs=model.output, + ) # need to scale optimizer for mixed precision opt = tf.keras.optimizers.SGD(1e-2) diff --git a/tests/test_batch_norm.py b/tests/test_batch_norm.py index 3ed2ada..91dba0c 100644 --- a/tests/test_batch_norm.py +++ b/tests/test_batch_norm.py @@ -1,19 +1,25 @@ +import os +import random as python_random + +import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator.layers import AccumBatchNormalization -import random as python_random -import numpy as np -import os -from .utils import reset, normalize_img + +from .utils import normalize_img +from .utils import reset -def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epochs:int = 3): +def run_experiment( + custom_bn: bool = True, bs: int = 100, accum_steps: int = 1, epochs: int = 3 +): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -21,7 +27,7 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo # build train pipeline ds_train = ds_train.map(normalize_img) - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(bs) ds_train = ds_train.prefetch(1) @@ -39,17 +45,21 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32), + normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(10), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( @@ -79,10 +89,10 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo def test_compare_bn_layers(): # set seed reset() - + # custom BN without accum result1 = run_experiment(custom_bn=True, accum_steps=1, epochs=3)[1] - + # reset before second run to get "identical" results reset() @@ -98,10 +108,10 @@ def test_compare_bn_layers(): def test_compare_accum_bn_expected_result(): # set seed reset() - + # custom BN without accum result1 = run_experiment(custom_bn=True, accum_steps=4, bs=25)[1] - + # reset before second run to get "identical" results reset() diff --git a/tests/test_bn_convnd.py b/tests/test_bn_convnd.py index e7279c4..fe256d7 100644 --- a/tests/test_bn_convnd.py +++ b/tests/test_bn_convnd.py @@ -1,11 +1,14 @@ +import numpy as np import tensorflow as tf from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator.layers import AccumBatchNormalization -import numpy as np -def test_bn_conv2d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): +def test_bn_conv2d( + custom_bn: bool = True, accum_steps: int = 1, epochs: int = 1 +): # make toy dataset data = np.random.randint(2, size=(16, 8, 8, 1)) gt = np.expand_dims(np.random.randint(2, size=16), axis=-1) @@ -19,20 +22,24 @@ def test_bn_conv2d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 1)), - normalization_layer, - tf.keras.layers.Activation("relu"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(4), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(1, activation="sigmoid"), - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 1)), + normalization_layer, + tf.keras.layers.Activation("relu"), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(4), + normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(1, activation="sigmoid"), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( @@ -60,7 +67,9 @@ def test_bn_conv2d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): return result -def test_bn_conv3d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): +def test_bn_conv3d( + custom_bn: bool = True, accum_steps: int = 1, epochs: int = 1 +): # make toy dataset data = np.random.randint(2, size=(16, 8, 8, 8, 1)) gt = np.expand_dims(np.random.randint(2, size=16), axis=-1) @@ -74,20 +83,24 @@ def test_bn_conv3d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Conv3D(4, 3, input_shape=(8, 8, 8, 1)), - normalization_layer, - tf.keras.layers.Activation("relu"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(4), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(1, activation="sigmoid"), - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Conv3D(4, 3, input_shape=(8, 8, 8, 1)), + normalization_layer, + tf.keras.layers.Activation("relu"), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(4), + normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(1, activation="sigmoid"), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( diff --git a/tests/test_expected_result.py b/tests/test_expected_result.py index e226f63..ddcf0de 100644 --- a/tests/test_expected_result.py +++ b/tests/test_expected_result.py @@ -1,12 +1,17 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os -from .utils import get_opt, normalize_img, reset import tensorflow_datasets as tfds from tensorflow.keras.models import load_model -from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +from .utils import get_opt +from .utils import normalize_img +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) @@ -15,8 +20,8 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -35,7 +40,7 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): # create model input = tf.keras.layers.Input(shape=(28, 28)) x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) - x = tf.keras.layers.Dense(128, activation='relu')(x) + x = tf.keras.layers.Dense(128, activation="relu")(x) output = tf.keras.layers.Dense(10)(x) opt = get_opt(opt_name="SGD", tf_version=tf_version) @@ -45,14 +50,16 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): else: if modeloropt == "model": # wrap model to use gradient accumulation - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=input, outputs=output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=input, outputs=output + ) else: # wrap optimizer to use gradient accumulation opt = GradientAccumulateOptimizer(opt, accum_steps=accum_steps) # compile model model = tf.keras.Model(inputs=input, outputs=output) - + # compile model model.compile( optimizer=opt, @@ -91,7 +98,7 @@ def test_expected_result(): # run again with different batch size and number of accumulations result2 = run_experiment(bs=50, accum_steps=2, epochs=2, modeloropt="opt") - + # reset again reset() diff --git a/tests/test_mixed_precision.py b/tests/test_mixed_precision.py index a39b745..d24d50a 100644 --- a/tests/test_mixed_precision.py +++ b/tests/test_mixed_precision.py @@ -2,24 +2,26 @@ def run_experiment(): + import os + import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras import mixed_precision + from gradient_accumulator import GradientAccumulateModel - from .utils import normalize_img - import os + from .utils import normalize_img # disable GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # set mixed global precision policy - mixed_precision.set_global_policy('mixed_float16') + mixed_precision.set_global_policy("mixed_float16") # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -28,8 +30,10 @@ def run_experiment(): # build train pipeline ds_train = ds_train.map(normalize_img) ds_train = ds_train.cache() - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) - ds_train = ds_train.batch(32) # multiplum of 8 on GPU to maximize performance + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) + ds_train = ds_train.batch( + 32 + ) # multiplum of 8 on GPU to maximize performance ds_train = ds_train.prefetch(1) # build test pipeline @@ -39,14 +43,23 @@ def run_experiment(): ds_test = ds_test.prefetch(1) # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32, activation='relu'), # 32 multiplum of 8 - tf.keras.layers.Dense(10, dtype='float32') # output not numerically stable with float16 - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32, activation="relu"), # 32 multiplum of 8 + tf.keras.layers.Dense( + 10, dtype="float32" + ), # output not numerically stable with float16 + ] + ) # wrap model to use gradient accumulation - model = GradientAccumulateModel(accum_steps=4, mixed_precision=True, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=4, + mixed_precision=True, + inputs=model.input, + outputs=model.output, + ) # need to scale optimizer for mixed precision opt = tf.keras.optimizers.Adam(1e-3) @@ -65,7 +78,7 @@ def run_experiment(): epochs=1, validation_data=ds_test, ) - + # save model on disk model.save("./trained_model") @@ -86,9 +99,11 @@ def test_mixed_precision(): pass else: cleanup_on_sigterm() - + try: - mp.set_start_method('spawn', force=True) # set start method to 'spawn' BEFORE instantiating the queue and the event + mp.set_start_method( + "spawn", force=True + ) # set start method to 'spawn' BEFORE instantiating the queue and the event except RuntimeError: pass diff --git a/tests/test_model_distribute.py b/tests/test_model_distribute.py index 9bd3e86..b47d75d 100644 --- a/tests/test_model_distribute.py +++ b/tests/test_model_distribute.py @@ -1,7 +1,9 @@ import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel + from .utils import get_opt @@ -10,15 +12,15 @@ def test_model_distribute(): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, ) # build train pipeline - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(100) ds_train = ds_train.prefetch(1) @@ -28,14 +30,18 @@ def test_model_distribute(): with strategy.scope(): # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(16, activation='relu'), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(16, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) model = GradientAccumulateModel( - accum_steps=4, inputs=model.input, - outputs=model.output, experimental_distributed_support=True, + accum_steps=4, + inputs=model.input, + outputs=model.output, + experimental_distributed_support=True, ) # define optimizer - currently only SGD compatible with GAOptimizerWrapper @@ -44,17 +50,14 @@ def test_model_distribute(): # compile model model.compile( optimizer=opt, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()], ) # train model - model.fit( - ds_train, - epochs=3, - validation_data=ds_test, - verbose=1 - ) + model.fit(ds_train, epochs=3, validation_data=ds_test, verbose=1) model.save("./trained_model") diff --git a/tests/test_model_expected_results.py b/tests/test_model_expected_results.py index c266332..81b67dc 100644 --- a/tests/test_model_expected_results.py +++ b/tests/test_model_expected_results.py @@ -1,9 +1,14 @@ import numpy as np import tensorflow as tf -from .utils import get_opt, normalize_img, reset, run_experiment from tensorflow.keras.models import load_model -from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +from .utils import get_opt +from .utils import normalize_img +from .utils import reset +from .utils import run_experiment # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) @@ -14,7 +19,9 @@ def test_model_expected_result(): reset() # run once - result1 = run_experiment(bs=100, accum_steps=1, epochs=2, modeloropt="model") + result1 = run_experiment( + bs=100, accum_steps=1, epochs=2, modeloropt="model" + ) # reset before second run to get identical results reset() diff --git a/tests/test_mp_batch_norm.py b/tests/test_mp_batch_norm.py index 79e5df7..50bec5c 100644 --- a/tests/test_mp_batch_norm.py +++ b/tests/test_mp_batch_norm.py @@ -1,21 +1,31 @@ import multiprocessing as mp -def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epochs:int = 3, queue=None, mixed_precision_flag=True): +def run_experiment( + custom_bn: bool = True, + bs: int = 100, + accum_steps: int = 1, + epochs: int = 3, + queue=None, + mixed_precision_flag=True, +): + import os + import random as python_random + + import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras import mixed_precision from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator.layers import AccumBatchNormalization - import random as python_random - import numpy as np - import os - from .utils import normalize_img, get_opt + from .utils import get_opt + from .utils import normalize_img ## reset session and seed stuff before running experiment - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # disable GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -39,12 +49,12 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo # set mixed global precision policy if mixed_precision_flag: - mixed_precision.set_global_policy('mixed_float16') + mixed_precision.set_global_policy("mixed_float16") # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -52,7 +62,7 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo # build train pipeline ds_train = ds_train.map(normalize_img) - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(bs) ds_train = ds_train.prefetch(1) @@ -70,19 +80,23 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(10), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(10, dtype=tf.float32) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(10), + normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(10, dtype=tf.float32), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: model = GradientAccumulateModel( - accum_steps=accum_steps, mixed_precision=mixed_precision_flag, - inputs=model.input, outputs=model.output + accum_steps=accum_steps, + mixed_precision=mixed_precision_flag, + inputs=model.input, + outputs=model.output, ) # need to scale optimizer for mixed precision @@ -117,7 +131,9 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo queue.put(result) -def run_experiment_wrapper(custom_bn=True, bs=100, accum_steps=1, epochs=3, mixed_precision=True): +def run_experiment_wrapper( + custom_bn=True, bs=100, accum_steps=1, epochs=3, mixed_precision=True +): # launch experiment in separate process, as we are enabling mixed precision # which will impact other unit tests, unless we do this try: @@ -126,19 +142,29 @@ def run_experiment_wrapper(custom_bn=True, bs=100, accum_steps=1, epochs=3, mixe pass else: cleanup_on_sigterm() - + try: - mp.set_start_method('spawn', force=True) # set start method to 'spawn' BEFORE instantiating the queue and the event + mp.set_start_method( + "spawn", force=True + ) # set start method to 'spawn' BEFORE instantiating the queue and the event except RuntimeError: pass - + queue = mp.Queue() - p = mp.Process(target=run_experiment(custom_bn=custom_bn, bs=bs, accum_steps=accum_steps, epochs=epochs, queue=queue)) + p = mp.Process( + target=run_experiment( + custom_bn=custom_bn, + bs=bs, + accum_steps=accum_steps, + epochs=epochs, + queue=queue, + ) + ) try: p.start() finally: p.join() # necessary so that the Process exists before the test suite exits (thus coverage is collected) - + return queue.get() @@ -146,19 +172,26 @@ def test_mixed_precision(): import numpy as np # custom BN without accum - result1 = run_experiment_wrapper(custom_bn=True, accum_steps=4, bs=25, mixed_precision=False)[1] + result1 = run_experiment_wrapper( + custom_bn=True, accum_steps=4, bs=25, mixed_precision=False + )[1] # keras BN without accum - result2 = run_experiment_wrapper(custom_bn=True, accum_steps=1, bs=100, mixed_precision=False)[1] + result2 = run_experiment_wrapper( + custom_bn=True, accum_steps=1, bs=100, mixed_precision=False + )[1] # assert result1 == result2 np.testing.assert_almost_equal(result1, result2, decimal=2) - # custom BN with accum with mixed precision - result3 = run_experiment_wrapper(custom_bn=True, accum_steps=4, bs=25, mixed_precision=True)[1] + result3 = run_experiment_wrapper( + custom_bn=True, accum_steps=4, bs=25, mixed_precision=True + )[1] # keras BN without accum - result4 = run_experiment_wrapper(custom_bn=True, accum_steps=1, bs=100, mixed_precision=True)[1] + result4 = run_experiment_wrapper( + custom_bn=True, accum_steps=1, bs=100, mixed_precision=True + )[1] np.testing.assert_almost_equal(result3, result4, decimal=2) diff --git a/tests/test_multitask.py b/tests/test_multitask.py index 26a968f..311d3d1 100644 --- a/tests/test_multitask.py +++ b/tests/test_multitask.py @@ -1,13 +1,23 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os import tensorflow_datasets as tfds -from tensorflow.keras.models import Model, load_model +from tensorflow.keras.layers import Activation +from tensorflow.keras.layers import Conv2D +from tensorflow.keras.layers import Dense +from tensorflow.keras.layers import Flatten +from tensorflow.keras.layers import Input +from tensorflow.keras.layers import MaxPooling2D +from tensorflow.keras.layers import UpSampling2D +from tensorflow.keras.models import Model +from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel -from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, UpSampling2D,\ - MaxPooling2D, Activation -from .utils import normalize_img, reset + +from .utils import normalize_img +from .utils import reset def create_multi_input_output(image, label): @@ -17,8 +27,8 @@ def create_multi_input_output(image, label): def run_experiment(bs=16, accum_steps=4, epochs=1): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -58,13 +68,19 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( optimizer=tf.keras.optimizers.SGD(1e-3), - loss={"classifier": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - "reconstructor": "mse"}, + loss={ + "classifier": tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + "reconstructor": "mse", + }, metrics={"classifier": tf.keras.metrics.SparseCategoricalAccuracy()}, ) diff --git a/tests/test_optimizer_distribute.py b/tests/test_optimizer_distribute.py index 4a5d9a3..1dd5441 100644 --- a/tests/test_optimizer_distribute.py +++ b/tests/test_optimizer_distribute.py @@ -1,16 +1,21 @@ +import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateOptimizer -import numpy as np -from .utils import reset, get_opt, normalize_img +from .utils import get_opt +from .utils import normalize_img +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) -def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_name="multi"): +def run_experiment( + opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_name="multi" +): # setup single/multi-GPU strategy if strategy_name == "single": strategy = tf.distribute.get_strategy() # get default strategy @@ -21,8 +26,8 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -40,22 +45,28 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na with strategy.scope(): # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) # define optimizer - currently only SGD compatible with GAOptimizerWrapper opt = get_opt(opt_name=opt_name, tf_version=tf_version) # wrap optimizer to add gradient accumulation support - opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps) + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps + ) # compile model model.compile( optimizer=opt, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()], ) @@ -65,7 +76,7 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na batch_size=bs, epochs=epochs, validation_data=ds_test, - verbose=1 + verbose=1, ) model.save("./trained_model") @@ -82,7 +93,7 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na def test_distributed_optimizer_invariance(): - # run experiment for different optimizers, to see if GA is consistent + # run experiment for different optimizers, to see if GA is consistent # within an optimizer. Note that it is expected for the results to # differ BETWEEN optimizers, as they behave differently. for strategy_name in ["single", "multi"]: @@ -92,13 +103,25 @@ def test_distributed_optimizer_invariance(): reset() # run once - result1 = run_experiment(opt_name=opt_name, bs=100, accum_steps=1, epochs=2, strategy_name=strategy_name) + result1 = run_experiment( + opt_name=opt_name, + bs=100, + accum_steps=1, + epochs=2, + strategy_name=strategy_name, + ) # reset before second run to get identical results reset() # run again with different batch size and number of accumulations - result2 = run_experiment(opt_name=opt_name, bs=50, accum_steps=2, epochs=2, strategy_name=strategy_name) + result2 = run_experiment( + opt_name=opt_name, + bs=50, + accum_steps=2, + epochs=2, + strategy_name=strategy_name, + ) # results should be "identical" (on CPU, can be different on GPU) np.testing.assert_almost_equal(result1, result2, decimal=2) diff --git a/tests/test_optimizer_invariance.py b/tests/test_optimizer_invariance.py index 947b2d9..cc990a6 100644 --- a/tests/test_optimizer_invariance.py +++ b/tests/test_optimizer_invariance.py @@ -1,22 +1,29 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os -from .utils import get_opt, normalize_img, reset import tensorflow_datasets as tfds from tensorflow.keras.models import load_model -from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +from .utils import get_opt +from .utils import normalize_img +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) -def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="model"): +def run_experiment( + bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="model" +): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -35,7 +42,7 @@ def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="mod # create model input = tf.keras.layers.Input(shape=(28, 28)) x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) - x = tf.keras.layers.Dense(128, activation='relu')(x) + x = tf.keras.layers.Dense(128, activation="relu")(x) output = tf.keras.layers.Dense(10)(x) model = tf.keras.models.Model(inputs=input, outputs=output) @@ -45,9 +52,13 @@ def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="mod # wrap model to use gradient accumulation if accum_steps > 1: if wrapper == "model": - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=input, outputs=output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=input, outputs=output + ) elif wrapper == "optimizer": - opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps) + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps + ) else: raise ValueError("Unknown wrapper was chosen:", wrapper) @@ -78,7 +89,7 @@ def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="mod def test_optimizer_invariance(): - # run experiment for different optimizers, to see if GA is consistent + # run experiment for different optimizers, to see if GA is consistent # within an optimizer. Note that it is expected for the results to # differ BETWEEN optimizers, as they behave differently. for wrapper in ["model", "optimizer"]: @@ -88,13 +99,27 @@ def test_optimizer_invariance(): reset() # run once - result1 = run_experiment(bs=100, accum_steps=1, epochs=2, opt_name=opt_name, wrapper=wrapper) + result1 = run_experiment( + bs=100, + accum_steps=1, + epochs=2, + opt_name=opt_name, + wrapper=wrapper, + ) # reset before second run to get identical results reset() # run again with different batch size and number of accumulations - result2 = run_experiment(bs=50, accum_steps=2, epochs=2, opt_name=opt_name, wrapper=wrapper) + result2 = run_experiment( + bs=50, + accum_steps=2, + epochs=2, + opt_name=opt_name, + wrapper=wrapper, + ) # results should be "identical" (on CPU, can be different on GPU) - np.testing.assert_almost_equal(result1, result2, decimal=2) # decimals=3 OK for model wrapper but not optimizer + np.testing.assert_almost_equal( + result1, result2, decimal=2 + ) # decimals=3 OK for model wrapper but not optimizer diff --git a/tests/test_optimizer_wrapper.py b/tests/test_optimizer_wrapper.py index f4536b4..22a538c 100644 --- a/tests/test_optimizer_wrapper.py +++ b/tests/test_optimizer_wrapper.py @@ -1,12 +1,16 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os -from .utils import get_opt, reset, normalize_img import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateOptimizer +from .utils import get_opt +from .utils import normalize_img +from .utils import reset tf_version = int(tf.version.VERSION.split(".")[1]) @@ -14,8 +18,8 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -32,15 +36,19 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): ds_test = ds_test.prefetch(1) # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32, activation='relu'), - tf.keras.layers.Dense(10), - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) # wrap optimizer to add gradient accumulation support opt = get_opt("SGD") - opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps, reduction="MEAN") # MEAN REDUCTION IMPORTANT!!! + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps, reduction="MEAN" + ) # MEAN REDUCTION IMPORTANT!!! # compile model model.compile( @@ -74,7 +82,9 @@ def test_expected_result(): reset() # run once - result1 = run_experiment(bs=500, accum_steps=1, epochs=3) # NOTE: AS TO BE DIVISIBLE BY TRAIN SET SIZE = 50000 (!) + result1 = run_experiment( + bs=500, accum_steps=1, epochs=3 + ) # NOTE: AS TO BE DIVISIBLE BY TRAIN SET SIZE = 50000 (!) # reset before second run to get identical results reset() @@ -95,8 +105,8 @@ def test_expected_result(): # result4 = run_experiment(bs=1, accum_steps=500, epochs=2) # results should be identical (theoretically, even in practice on CPU) - #assert result1 == result2 - #assert result1 == result3 + # assert result1 == result2 + # assert result1 == result3 # reduced constraint for temporarily np.testing.assert_almost_equal(result1, result2, decimal=2) diff --git a/tests/test_param_count.py b/tests/test_param_count.py index 9b0c91e..02ffd5d 100644 --- a/tests/test_param_count.py +++ b/tests/test_param_count.py @@ -1,14 +1,15 @@ import tensorflow as tf -from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense +from tensorflow.keras.models import Sequential + from gradient_accumulator import GradientAccumulateModel def create_model(): input = tf.keras.layers.Input(shape=(10,)) - x = Dense(32, input_shape=(10,), activation='relu')(input) - x = Dense(16, activation='relu')(x) - output = Dense(1, activation='sigmoid')(x) + x = Dense(32, input_shape=(10,), activation="relu")(input) + x = Dense(16, activation="relu")(x) + output = Dense(1, activation="sigmoid")(x) return input, output @@ -19,23 +20,30 @@ def count_params(model): def test_param_count_with_wrapper(): # Create a model - input,output = create_model() + input, output = create_model() original_model = tf.keras.Model(inputs=input, outputs=output) # Count the parameters of the original model original_param_count = count_params(original_model) # Wrap the model with GradientAccumulateModel - wrapped_model = GradientAccumulateModel(accum_steps=2, inputs=input, outputs=output) + wrapped_model = GradientAccumulateModel( + accum_steps=2, inputs=input, outputs=output + ) # Count the parameters of the wrapped model wrapped_param_count = count_params(wrapped_model) # Compile both models - original_model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy') - wrapped_model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy') + original_model.compile( + optimizer=tf.keras.optimizers.Adam(), loss="binary_crossentropy" + ) + wrapped_model.compile( + optimizer=tf.keras.optimizers.Adam(), loss="binary_crossentropy" + ) # Check if the number of parameters in both models is the same - assert original_param_count == wrapped_param_count, \ - f"Parameter count mismatch: Original model has {original_param_count} parameters, " \ + assert original_param_count == wrapped_param_count, ( + f"Parameter count mismatch: Original model has {original_param_count} parameters, " f"wrapped model has {wrapped_param_count} parameters." + ) diff --git a/tests/test_sparse_optimizer.py b/tests/test_sparse_optimizer.py index fe13ec1..087abcb 100644 --- a/tests/test_sparse_optimizer.py +++ b/tests/test_sparse_optimizer.py @@ -1,16 +1,20 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -from tensorflow.keras.preprocessing.text import one_hot -from tensorflow.keras.preprocessing.sequence import pad_sequences import tensorflow_datasets as tfds +from tensorflow.keras.layers import Dense +from tensorflow.keras.layers import Embedding +from tensorflow.keras.layers import Flatten from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Flatten, Embedding, Dense from tensorflow.keras.models import load_model +from tensorflow.keras.preprocessing.sequence import pad_sequences +from tensorflow.keras.preprocessing.text import one_hot + from gradient_accumulator import GradientAccumulateOptimizer -import os -import random as python_random -from .utils import reset +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) @@ -19,10 +23,16 @@ def preprocess_data(ds, vocab_size, max_length): def encode(x, y): x = tf.strings.substr(x, 0, max_length) - x = tf.strings.reduce_join(tf.strings.unicode_split(x, input_encoding="UTF-8"), separator=' ') + x = tf.strings.reduce_join( + tf.strings.unicode_split(x, input_encoding="UTF-8"), separator=" " + ) x = tf.strings.split(x) x_hashed = tf.strings.to_hash_bucket_fast(x, vocab_size) - x_padded = tf.pad(x_hashed, paddings=[[0, max_length - tf.shape(x_hashed)[-1]]], constant_values=0) + x_padded = tf.pad( + x_hashed, + paddings=[[0, max_length - tf.shape(x_hashed)[-1]]], + constant_values=0, + ) return x_padded, y ds = ds.map(encode) @@ -32,8 +42,8 @@ def encode(x, y): def run_experiment(bs=100, accum_steps=1, epochs=2): # Load the IMDb dataset (ds_train, ds_test), ds_info = tfds.load( - 'imdb_reviews', - split=['train', 'test'], + "imdb_reviews", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -51,25 +61,27 @@ def run_experiment(bs=100, accum_steps=1, epochs=2): # define model model = Sequential() - model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=max_length)) + model.add( + Embedding(input_dim=vocab_size, output_dim=8, input_length=max_length) + ) model.add(Flatten()) - model.add(Dense(1, activation='sigmoid')) + model.add(Dense(1, activation="sigmoid")) # wrap optimizer to add gradient accumulation support # need to dynamically handle which Optimizer class to use dependent on tf version if tf_version > 10: opt = tf.keras.optimizers.legacy.SGD(learning_rate=1e-2) else: - opt = tf.keras.optimizers.SGD(learning_rate=1e-2) # IDENTICAL RESULTS WITH SGD!!! - + opt = tf.keras.optimizers.SGD( + learning_rate=1e-2 + ) # IDENTICAL RESULTS WITH SGD!!! + if accum_steps > 1: - opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps, reduction="MEAN") + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps, reduction="MEAN" + ) - model.compile( - optimizer=opt, - loss='binary_crossentropy', - metrics=['acc'] - ) + model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["acc"]) model.fit( ds_train, @@ -92,10 +104,10 @@ def run_experiment(bs=100, accum_steps=1, epochs=2): def test_sparse_expected_results(): # set seed - #reset() + # reset() # run once - #result1 = run_experiment(bs=100, accum_steps=1, epochs=2) + # result1 = run_experiment(bs=100, accum_steps=1, epochs=2) # reset before second run to get identical results reset() @@ -103,6 +115,5 @@ def test_sparse_expected_results(): # run again with different batch size and number of accumulations result2 = run_experiment(bs=50, accum_steps=2, epochs=2) - # results should be identical (theoretically, even in practice on CPU) - #assert result1 == result2 + # assert result1 == result2 diff --git a/tests/utils.py b/tests/utils.py index 1103840..a2c0f39 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,11 +1,13 @@ +import os import random as python_random + +import numpy as np import tensorflow as tf import tensorflow_datasets as tfds -import numpy as np -import os from tensorflow.keras.models import load_model -from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) @@ -13,7 +15,7 @@ def reset(seed=123): # set tf log level - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # disable GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -35,14 +37,16 @@ def reset(seed=123): # in the TensorFlow backend have a well-defined initial state. # For further details, see: # https://www.tensorflow.org/api_docs/python/tf/random/set_seed - tf.random.set_seed(1234) # @TODO: Should this seed be different than for python and numpy? + tf.random.set_seed( + 1234 + ) # @TODO: Should this seed be different than for python and numpy? # https://stackoverflow.com/a/71311207 try: tf.config.experimental.enable_op_determinism() # Exist only for TF > 2.7 except AttributeError as e: print(e) - + # force cpu threading determinism # https://stackoverflow.com/questions/36288235/how-to-get-stable-results-with-tensorflow-setting-random-seed tf.config.threading.set_inter_op_parallelism_threads(1) @@ -80,14 +84,14 @@ def get_opt(opt_name, tf_version=None): def normalize_img(image, label): """Normalizes images: `uint8` -> `float32`.""" - return tf.cast(image, tf.float32) / 255., label + return tf.cast(image, tf.float32) / 255.0, label def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -106,7 +110,7 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): # create model input = tf.keras.layers.Input(shape=(28, 28)) x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) - x = tf.keras.layers.Dense(128, activation='relu')(x) + x = tf.keras.layers.Dense(128, activation="relu")(x) output = tf.keras.layers.Dense(10)(x) opt = get_opt(opt_name="SGD", tf_version=tf_version) @@ -116,14 +120,16 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): else: if modeloropt == "model": # wrap model to use gradient accumulation - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=input, outputs=output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=input, outputs=output + ) else: # wrap optimizer to use gradient accumulation opt = GradientAccumulateOptimizer(opt, accum_steps=accum_steps) # compile model model = tf.keras.Model(inputs=input, outputs=output) - + # compile model model.compile( optimizer=opt, From a3ed2faa87ca7b50274cf775c3fe8ca7f4187d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pedersen?= Date: Thu, 11 May 2023 08:32:52 +0200 Subject: [PATCH 3/5] Only test with model wrapper for tf<2.8 --- .github/workflows/test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2a46c6f..a548c0f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -139,6 +139,4 @@ jobs: run: python -c "from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer" - name: Run tests - run: | - pytest -v tests/test_model_expected_result.py - pytest -v tests/test_optimizer_wrapper.py + run: pytest -v tests/test_model_expected_result.py From edbf6ba1a2b3d1082e15361fec8fa4547986a342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pedersen?= Date: Thu, 11 May 2023 08:34:56 +0200 Subject: [PATCH 4/5] Fixed typo in test script name --- ...st_model_expected_results.py => test_model_expected_result.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_model_expected_results.py => test_model_expected_result.py} (100%) diff --git a/tests/test_model_expected_results.py b/tests/test_model_expected_result.py similarity index 100% rename from tests/test_model_expected_results.py rename to tests/test_model_expected_result.py From 783cf713fffffa8a12e9cd0ed3d56eac9c845dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pedersen?= Date: Thu, 11 May 2023 08:42:29 +0200 Subject: [PATCH 5/5] Use same seed for tests for numpy and tf --- tests/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index a2c0f39..0778068 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -37,9 +37,8 @@ def reset(seed=123): # in the TensorFlow backend have a well-defined initial state. # For further details, see: # https://www.tensorflow.org/api_docs/python/tf/random/set_seed - tf.random.set_seed( - 1234 - ) # @TODO: Should this seed be different than for python and numpy? + # @TODO: Should this seed be different than for python and numpy? + tf.random.set_seed(seed) # https://stackoverflow.com/a/71311207 try: