diff --git a/.coveragerc b/.coveragerc index c304f811bec..5f13f8d3fea 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,7 @@ [report] # Regexes for lines to exclude from consideration exclude_lines = + pragma: no cover os.remove except ImportError # Don't complain if tests don't hit defensive assertion code: diff --git a/.travis.yml b/.travis.yml index d2d2b8d79a2..64978e34bbb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -54,7 +54,7 @@ install: # install PIL for preprocessing tests - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then conda install pil; - elif [[ "$TRAVIS_PYTHON_VERSION" == "3.6" ]]; then + else conda install Pillow; fi @@ -62,11 +62,11 @@ install: # install TensorFlow (CPU version). - pip install tensorflow==1.7 - - # install Apache MXNet (CPU version). + + # install Apache MXNet (CPU version). - pip install mxnet - pip install --upgrade numpy - + # install cntk - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.5.1-cp27-cp27mu-linux_x86_64.whl; diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index afde39cd4b9..c05ccaab763 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -5,6 +5,8 @@ Thank you! - [ ] Check that you are up-to-date with the master branch of Keras. You can update with: pip install git+git://github.com/awslabs/keras-apache-mxnet.git --upgrade --no-deps +- [ ] If running on MXNet, check that you are up-to-date with the latest version. The installation +instructions can be found [here](http://mxnet.incubator.apache.org/install/index.html?platform=Linux&language=Python&processor=CPU) - [ ] If running on TensorFlow, check that you are up-to-date with the latest version. The installation instructions can be found [here](https://www.tensorflow.org/get_started/os_setup). - [ ] If running on Theano, check that you are up-to-date with the master branch of Theano. You can update with: diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000000..c7002afa5cb --- /dev/null +++ b/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,10 @@ +### Summary + +### Related Issues + +### PR Overview + +- [ ] This PR requires new unit tests [y/n] (make sure tests are included) +- [ ] This PR requires to update the documentation [y/n] (make sure the docs are up-to-date) +- [ ] This PR is backwards compatible [y/n] +- [ ] This PR changes the current API [y/n] diff --git a/benchmark/README.md b/benchmark/README.md index fbd2be4b2c6..99ff5c747fe 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -6,6 +6,7 @@ 2. [Library Versions](#library-versions) 3. [CNN Benchmarks](#cnn-benchmarks) 1. [CNN Benchmark Results](#cnn-benchmark-results) + 2. [CNN Inference Benchmark Results](#cnn-inference-benchmark-results) 4. [RNN Benchmarks (Experimental)](#rnn-benchmarks-experimental) 5. [Setup](#setup) 6. [How to Run CNN Benchmarks](#how-to-run-cnn-benchmarks) @@ -89,7 +90,16 @@ NOTE: You can see more benchmark experiments with different instance types, batch_size and other parameters in [detailed CNN results document](benchmark_result/CNN_result.md). - + +### CNN Inference Benchmark Results + +#### ResNet50-Synthetic Data + +| Instance Type | GPUs | Batch Size | Keras-MXNet (img/sec) | Keras-TensorFlow (img/sec) | +|---|---|---|---|---| +| C5.X Large | 0 | 32 | 5.79 | 3.27 | +| C5.8X Large | 0 | 32 | 27.9 | 18.2 | + ``` NOTE: 1. 
Image_data_format for MXNet backend - 'channels_first' @@ -192,6 +202,18 @@ For TensorFlow backend benchmarks: $ sh run_tf_backend.sh 8_gpu_config resnet50 False 20 # For 8 GPU Benchmarks ``` +### ResNet50-Synthetic Inference Benchmarks + +For MXNet backend inference benchmarks: +``` + $ sh run_mxnet_backend.sh cpu_config resnet50 True 20 +``` + +For TensorFlow backend inference benchmarks: +``` + $ sh run_tf_backend.sh cpu_config resnet50 True 20 +``` + The last parameter, 20, in the command is the number of epoch. ## How to Run RNN Benchmarks @@ -260,4 +282,4 @@ For TensorFlow backend benchmarks: ## References * [TensorFlow Keras Benchmarks](https://github.com/tensorflow/benchmarks/tree/keras-benchmarks/scripts/keras_benchmarks) -* [lstm_text_generation.py](https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py) \ No newline at end of file +* [lstm_text_generation.py](https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py) diff --git a/benchmark/benchmark_result/cnn_inference_speed.png b/benchmark/benchmark_result/cnn_inference_speed.png new file mode 100644 index 00000000000..23e7194e171 Binary files /dev/null and b/benchmark/benchmark_result/cnn_inference_speed.png differ diff --git a/benchmark/scripts/run_benchmark.py b/benchmark/scripts/run_benchmark.py index 4e4625ade41..ba415ea2f27 100644 --- a/benchmark/scripts/run_benchmark.py +++ b/benchmark/scripts/run_benchmark.py @@ -66,8 +66,11 @@ def get_backend_version(): use_dataset_tensors = False if args.epochs: - model.run_benchmark(gpus=config['gpus'], inference=inference, use_dataset_tensors=use_dataset_tensors, epochs=int(args.epochs)) + model.run_benchmark(gpus=config['gpus'], inference=inference, + use_dataset_tensors=use_dataset_tensors, + epochs=int(args.epochs)) else: - model.run_benchmark(gpus=config['gpus'], inference=inference, use_dataset_tensors=use_dataset_tensors) + model.run_benchmark(gpus=config['gpus'], inference=inference, + use_dataset_tensors=use_dataset_tensors) if args.dry_run: print("Model :total_time", model.test_name, model.total_time) diff --git a/docs/autogen.py b/docs/autogen.py index a915e9156cf..b70f1d5fef9 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -99,6 +99,7 @@ 'deserialize', 'get', 'set_image_dim_ordering', + 'normalize_data_format', 'image_dim_ordering', 'get_variable_shape', } @@ -114,7 +115,7 @@ PAGES = [ { 'page': 'models/sequential.md', - 'functions': [ + 'methods': [ models.Sequential.compile, models.Sequential.fit, models.Sequential.evaluate, @@ -130,7 +131,7 @@ }, { 'page': 'models/model.md', - 'functions': [ + 'methods': [ models.Model.compile, models.Model.fit, models.Model.evaluate, @@ -341,32 +342,6 @@ ROOT = 'http://keras.io/' -def get_earliest_class_that_defined_member(member, cls): - ancestors = get_classes_ancestors([cls]) - result = None - for ancestor in ancestors: - if member in dir(ancestor): - result = ancestor - if not result: - return cls - return result - - -def get_classes_ancestors(classes): - ancestors = [] - for cls in classes: - ancestors += cls.__bases__ - filtered_ancestors = [] - for ancestor in ancestors: - if ancestor.__name__ in ['object']: - continue - filtered_ancestors.append(ancestor) - if filtered_ancestors: - return filtered_ancestors + get_classes_ancestors(filtered_ancestors) - else: - return filtered_ancestors - - def get_function_signature(function, method=True): wrapped = getattr(function, '_original_function', None) if wrapped is None: @@ -395,10 +370,6 @@ def get_function_signature(function, 
method=True): signature = st[:-2] + ')' else: signature = st + ')' - - if not method: - # Prepend the module name. - signature = clean_module_name(function.__module__) + '.' + signature return post_process_signature(signature) @@ -409,12 +380,15 @@ def get_class_signature(cls): except (TypeError, AttributeError): # in case the class inherits from object and does not # define __init__ - class_signature = clean_module_name(cls.__module__) + '.' + cls.__name__ + '()' + class_signature = "{clean_module_name}.{cls_name}()".format( + clean_module_name=clean_module_name(cls.__module__), + cls_name=cls.__name__ + ) return post_process_signature(class_signature) def post_process_signature(signature): - parts = re.split('\.(?!\d)', signature) + parts = re.split(r'\.(?!\d)', signature) if len(parts) >= 4: if parts[1] == 'layers': signature = 'keras.layers.' + '.'.join(parts[3:]) @@ -459,7 +433,7 @@ def code_snippet(snippet): def count_leading_spaces(s): - ws = re.search('\S', s) + ws = re.search(r'\S', s) if ws: return ws.start() else: @@ -468,7 +442,8 @@ def count_leading_spaces(s): def process_list_block(docstring, starting_point, leading_spaces, marker): ending_point = docstring.find('\n\n', starting_point) - block = docstring[starting_point:None if ending_point == -1 else ending_point - 1] + block = docstring[starting_point:(None if ending_point == -1 else + ending_point - 1)] # Place marker for later reinjection. docstring = docstring.replace(block, marker) lines = block.split('\n') @@ -596,7 +571,6 @@ def process_docstring(docstring): shutil.copy(fpath, new_fpath) -# Take care of index page. def read_file(path): with open(path) as f: return f.read() @@ -616,9 +590,10 @@ def collect_class_methods(cls, methods): def render_function(function, method=True): subblocks = [] signature = get_function_signature(function, method=method) - signature = signature.replace(function.__module__ + '.', '') - level = 3 - subblocks.append('#' * level + ' ' + function.__name__ + '\n') + if method: + signature = signature.replace( + clean_module_name(function.__module__) + '.', '') + subblocks.append('### ' + function.__name__ + '\n') subblocks.append(code_snippet(signature)) docstring = function.__doc__ if docstring: @@ -626,6 +601,26 @@ def render_function(function, method=True): return '\n\n'.join(subblocks) +def read_page_data(page_data, type): + assert type in ['classes', 'functions', 'methods'] + data = page_data.get(type, []) + for module in page_data.get('all_module_{}'.format(type), []): + module_data = [] + for name in dir(module): + if name[0] == '_' or name in EXCLUDE: + continue + module_member = getattr(module, name) + if (inspect.isclass(module_member) and type == 'classes' or + inspect.isfunction(module_member) and type == 'functions'): + instance = module_member + if module.__name__ in instance.__module__: + if instance not in module_data: + module_data.append(instance) + module_data.sort(key=lambda x: id(x)) + data += module_data + return data + + if __name__ == '__main__': readme = read_file('../README.md') index = read_file('templates/index.md') @@ -635,22 +630,9 @@ def render_function(function, method=True): print('Generating docs for Keras %s.' 
% keras.__version__) for page_data in PAGES: - blocks = [] - classes = page_data.get('classes', []) - for module in page_data.get('all_module_classes', []): - module_classes = [] - for name in dir(module): - if name[0] == '_' or name in EXCLUDE: - continue - module_member = getattr(module, name) - if inspect.isclass(module_member): - cls = module_member - if cls.__module__ == module.__name__: - if cls not in module_classes: - module_classes.append(cls) - module_classes.sort(key=lambda x: id(x)) - classes += module_classes + classes = read_page_data(page_data, 'classes') + blocks = [] for element in classes: if not isinstance(element, (list, tuple)): element = (element, []) @@ -675,20 +657,12 @@ def render_function(function, method=True): [render_function(method, method=True) for method in methods])) blocks.append('\n'.join(subblocks)) - functions = page_data.get('functions', []) - for module in page_data.get('all_module_functions', []): - module_functions = [] - for name in dir(module): - if name[0] == '_' or name in EXCLUDE: - continue - module_member = getattr(module, name) - if inspect.isfunction(module_member): - function = module_member - if module.__name__ in function.__module__: - if function not in module_functions: - module_functions.append(function) - module_functions.sort(key=lambda x: id(x)) - functions += module_functions + methods = read_page_data(page_data, 'methods') + + for method in methods: + blocks.append(render_function(method, method=True)) + + functions = read_page_data(page_data, 'functions') for function in functions: blocks.append(render_function(function, method=False)) @@ -706,7 +680,8 @@ def render_function(function, method=True): if os.path.exists(path): template = read_file(path) assert '{{autogenerated}}' in template, ('Template found for ' + path + - ' but missing {{autogenerated}} tag.') + ' but missing {{autogenerated}}' + ' tag.') mkdown = template.replace('{{autogenerated}}', mkdown) print('...inserting autogenerated content into template:', path) else: diff --git a/docs/templates/applications.md b/docs/templates/applications.md index 910db0e4475..84fc176c0a9 100644 --- a/docs/templates/applications.md +++ b/docs/templates/applications.md @@ -533,15 +533,6 @@ MobileNet model, with weights pre-trained on ImageNet. Note that this model only supports the data format `'channels_last'` (height, width, channels). -To load a MobileNet model via `load_model`, import the custom object `relu6` and pass it to the `custom_objects` parameter. - -E.g. - -```python -model = load_model('mobilenet.h5', custom_objects={ - 'relu6': mobilenet.relu6}) -``` - The default input size for this model is 224x224. ### Arguments @@ -732,15 +723,6 @@ MobileNetV2 model, with weights pre-trained on ImageNet. Note that this model only supports the data format `'channels_last'` (height, width, channels). -To load a MobileNetV2 model via `load_model`, import the custom object `relu6` and pass it to the `custom_objects` parameter. - -E.g. - -```python -model = load_model('mobilenet_v2.h5', custom_objects={ - 'relu6': mobilenetv2.relu6}) -``` - The default input size for this model is 224x224. 
### Arguments diff --git a/docs/templates/getting-started/faq.md b/docs/templates/getting-started/faq.md index 99f461cafc8..87dabda1f70 100644 --- a/docs/templates/getting-started/faq.md +++ b/docs/templates/getting-started/faq.md @@ -556,22 +556,36 @@ Likewise, cached dataset files, such as those downloaded with [`get_file()`](/ut ### How can I obtain reproducible results using Keras during development? -During development of a model, sometimes it is useful to be able to obtain reproducible results from run to run in order to determine if a change in performance is due to an actual model or data modification, or merely a result of a new random sample. The below snippet of code provides an example of how to obtain reproducible results - this is geared towards a TensorFlow backend for a Python 3 environment. +During development of a model, sometimes it is useful to be able to obtain reproducible results from run to run in order to determine if a change in performance is due to an actual model or data modification, or merely a result of a new random sample. + +First, you need to set the `PYTHONHASHSEED` environment variable to `0` before the program starts (not within the program itself). This is necessary in Python 3.2.3 onwards to have reproducible behavior for certain hash-based operations (e.g., the item order in a set or a dict, see [Python's documentation](https://docs.python.org/3.7/using/cmdline.html#envvar-PYTHONHASHSEED) or [issue #2280](https://github.com/keras-team/keras/issues/2280#issuecomment-306959926) for further details). One way to set the environment variable is when starting python like this: + +``` +$ cat test_hash.py +print(hash("keras")) +$ python3 test_hash.py # non-reproducible hash (Python 3.2.3+) +-8127205062320133199 +$ python3 test_hash.py # non-reproducible hash (Python 3.2.3+) +3204480642156461591 +$ PYTHONHASHSEED=0 python3 test_hash.py # reproducible hash +4883664951434749476 +$ PYTHONHASHSEED=0 python3 test_hash.py # reproducible hash +4883664951434749476 +``` + +Moreover, when using the TensorFlow backend and running on a GPU, some operations have non-deterministic outputs, in particular `tf.reduce_sum()`. This is due to the fact that GPUs run many operations in parallel, so the order of execution is not always guaranteed. Due to the limited precision of floats, even adding several numbers together may give slightly different results depending on the order in which you add them. You can try to avoid the non-deterministic operations, but some may be created automatically by TensorFlow to compute the gradients, so it is much simpler to just run the code on the CPU. For this, you can set the `CUDA_VISIBLE_DEVICES` environment variable to an empty string, for example: + +``` +$ CUDA_VISIBLE_DEVICES="" PYTHONHASHSEED=0 python your_program.py +``` + +The below snippet of code provides an example of how to obtain reproducible results - this is geared towards a TensorFlow backend for a Python 3 environment: ```python import numpy as np import tensorflow as tf import random as rn -# The below is necessary in Python 3.2.3 onwards to -# have reproducible behavior for certain hash-based operations. -# See these references for further details: -# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED -# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926 - -import os -os.environ['PYTHONHASHSEED'] = '0' - # The below is necessary for starting Numpy generated random numbers # in a well-defined initial state. 
@@ -583,17 +597,18 @@ np.random.seed(42) rn.seed(12345) # Force TensorFlow to use single thread. -# Multiple threads are a potential source of -# non-reproducible results. -# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res +# Multiple threads are a potential source of non-reproducible results. +# For further details, see: https://stackoverflow.com/questions/42022950/ -session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) +session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1) from keras import backend as K # The below tf.set_random_seed() will make random number generation # in the TensorFlow backend have a well-defined initial state. -# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed +# For further details, see: +# https://www.tensorflow.org/api_docs/python/tf/set_random_seed tf.set_random_seed(1234) @@ -624,4 +639,4 @@ import h5py ``` If it imports without error it is installed otherwise you can find detailed -installation instructions here: http://docs.h5py.org/en/latest/build.html \ No newline at end of file +installation instructions here: http://docs.h5py.org/en/latest/build.html diff --git a/docs/templates/models/about-keras-models.md b/docs/templates/models/about-keras-models.md index e4754888b51..bffd583bd48 100644 --- a/docs/templates/models/about-keras-models.md +++ b/docs/templates/models/about-keras-models.md @@ -7,7 +7,6 @@ These models have a number of methods and attributes in common: - `model.layers` is a flattened list of the layers comprising the model. - `model.inputs` is the list of input tensors of the model. - `model.outputs` is the list of output tensors of the model. - - `model.summary()` prints a summary representation of your model. Shortcut for [utils.print_summary](/utils/#print_summary) - `model.get_config()` returns a dictionary containing the configuration of the model. The model can be reinstantiated from its config via: @@ -92,4 +91,4 @@ That means the model's topology cannot be inspected or serialized. As a result, **Key point:** use the right API for the job. The `Model` subclassing API can provide you with greater flexbility for implementing complex models, but it comes at a cost (in addition to these missing features): -it is more verbose, more complex, and has more opportunities for user errors. If possible, prefer using the functional API, which is more user-friendly. \ No newline at end of file +it is more verbose, more complex, and has more opportunities for user errors. If possible, prefer using the functional API, which is more user-friendly. diff --git a/docs/templates/regularizers.md b/docs/templates/regularizers.md index 366f1d15d71..3cbf774f5d9 100644 --- a/docs/templates/regularizers.md +++ b/docs/templates/regularizers.md @@ -25,7 +25,7 @@ model.add(Dense(64, input_dim=64, ```python keras.regularizers.l1(0.) keras.regularizers.l2(0.) -keras.regularizers.l1_l2(0.) +keras.regularizers.l1_l2(l1=0.01, l2=0.01) ``` ## Developing new regularizers @@ -43,4 +43,4 @@ model.add(Dense(64, input_dim=64, ``` Alternatively, you can write your regularizers in an object-oriented way; -see the [keras/regularizers.py](https://github.com/keras-team/keras/blob/master/keras/regularizers.py) module for examples. 
\ No newline at end of file +see the [keras/regularizers.py](https://github.com/keras-team/keras/blob/master/keras/regularizers.py) module for examples. diff --git a/examples/README.md b/examples/README.md index bcd0e20362f..d7ad618b124 100644 --- a/examples/README.md +++ b/examples/README.md @@ -11,6 +11,9 @@ Trains a simple convnet on the MNIST dataset. [cifar10_cnn.py](cifar10_cnn.py) Trains a simple deep CNN on the CIFAR10 small images dataset. +[cifar10_cnn_capsule.py](cifar10_cnn_capsule.py) +Trains a simple CNN-Capsule Network on the CIFAR10 small images dataset. + [cifar10_resnet.py](cifar10_resnet.py) Trains a ResNet on the CIFAR10 small images dataset. @@ -33,7 +36,10 @@ Trains a Siamese multi-layer perceptron on pairs of digits from the MNIST datase Trains a Stacked What-Where AutoEncoder built on residual blocks on the MNIST dataset. [mnist_transfer_cnn.py](mnist_transfer_cnn.py) -Transfer learning toy example. +Transfer learning toy example on the MNIST dataset. + +[mnist_denoising_autoencoder.py](mnist_denoising_autoencoder.py) +Trains a denoising autoencoder on the MNIST dataset. ---- @@ -66,6 +72,12 @@ Trains an LSTM model on the IMDB sentiment classification task. [lstm_stateful.py](lstm_stateful.py) Demonstrates how to use stateful RNNs to model long sequences efficiently. +[lstm_seq2seq.py](lstm_seq2seq.py) +Trains a basic character-level sequence-to-sequence model. + +[lstm_seq2seq_restore.py](lstm_seq2seq_restore.py) +Restores a character-level sequence to sequence model from disk (saved by [lstm_seq2seq.py](lstm_seq2seq.py)) and uses it to generate predictions. + [pretrained_word_embeddings.py](pretrained_word_embeddings.py) Loads pre-trained word embeddings (GloVe embeddings) into a frozen Keras Embedding layer, and uses it to train a text classification model on the 20 Newsgroup dataset. @@ -120,4 +132,10 @@ Compares self-normalizing MLPs with regular MLPs. MNIST dataset with TFRecords, the standard TensorFlow data format. [mnist_dataset_api.py](mnist_dataset_api.py) -MNIST dataset with TensorFlow's Dataset API. \ No newline at end of file +MNIST dataset with TensorFlow's Dataset API. + +[cifar10_cnn_tfaugment2d.py](cifar10_cnn_tfaugment2d.py) +Trains a simple deep CNN on the CIFAR10 small images dataset using Tensorflow internal augmentation APIs. + +[tensorboard_embeddings_mnist.py](tensorboard_embeddings_mnist.py) +Trains a simple convnet on the MNIST dataset and embeds test data which can be later visualized using TensorBoard's Embedding Projector. 
diff --git a/examples/addition_rnn.py b/examples/addition_rnn.py index c6eba825468..f0167d6e55f 100644 --- a/examples/addition_rnn.py +++ b/examples/addition_rnn.py @@ -24,7 +24,7 @@ Five digits reversed: + One layer LSTM (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs -''' +''' # noqa from __future__ import print_function from keras.models import Sequential diff --git a/examples/babi_memnn.py b/examples/babi_memnn.py index 91de434b86b..483784d6f18 100644 --- a/examples/babi_memnn.py +++ b/examples/babi_memnn.py @@ -18,7 +18,8 @@ from keras import backend as K from keras.models import Sequential, Model from keras.layers.embeddings import Embedding -from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate +from keras.layers import Input, Activation, Dense, Permute, Dropout +from keras.layers import add, dot, concatenate from keras.layers import LSTM from keras.utils.data_utils import get_file from keras.preprocessing.sequence import pad_sequences @@ -80,7 +81,8 @@ def get_stories(f, only_supporting=False, max_length=None): ''' data = parse_stories(f.readlines(), only_supporting=only_supporting) flatten = lambda data: reduce(lambda x, y: x + y, data) - data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length] + data = [(flatten(story), q, answer) for story, q, answer in data + if not max_length or len(flatten(story)) < max_length] return data @@ -95,19 +97,24 @@ def vectorize_stories(data): np.array(answers)) try: - path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz') + path = get_file('babi-tasks-v1-2.tar.gz', + origin='https://s3.amazonaws.com/text-datasets/' + 'babi_tasks_1-20_v1-2.tar.gz') except: print('Error downloading dataset, please download it manually:\n' - '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n' + '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2' + '.tar.gz\n' '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz') raise challenges = { # QA1 with 10,000 samples - 'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt', + 'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_' + 'single-supporting-fact_{}.txt', # QA2 with 10,000 samples - 'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt', + 'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_' + 'two-supporting-facts_{}.txt', } challenge_type = 'single_supporting_fact_10k' challenge = challenges[challenge_type] diff --git a/examples/babi_rnn.py b/examples/babi_rnn.py index 8b2afccf4e6..a6396fd5ab7 100644 --- a/examples/babi_rnn.py +++ b/examples/babi_rnn.py @@ -123,7 +123,8 @@ def get_stories(f, only_supporting=False, max_length=None): ''' data = parse_stories(f.readlines(), only_supporting=only_supporting) flatten = lambda data: reduce(lambda x, y: x + y, data) - data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length] + data = [(flatten(story), q, answer) for story, q, answer in data + if not max_length or len(flatten(story)) < max_length] return data @@ -140,7 +141,8 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen): xs.append(x) xqs.append(xq) ys.append(y) - return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys) + return (pad_sequences(xs, maxlen=story_maxlen), + 
pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)) RNN = recurrent.LSTM EMBED_HIDDEN_SIZE = 50 @@ -154,10 +156,13 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen): QUERY_HIDDEN_SIZE)) try: - path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz') + path = get_file('babi-tasks-v1-2.tar.gz', + origin='https://s3.amazonaws.com/text-datasets/' + 'babi_tasks_1-20_v1-2.tar.gz') except: print('Error downloading dataset, please download it manually:\n' - '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n' + '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2' + '.tar.gz\n' '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz') raise diff --git a/examples/cifar10_cnn.py b/examples/cifar10_cnn.py index 1daed4ab6ce..4c3e9e800d4 100644 --- a/examples/cifar10_cnn.py +++ b/examples/cifar10_cnn.py @@ -85,19 +85,26 @@ zca_whitening=False, # apply ZCA whitening zca_epsilon=1e-06, # epsilon for ZCA whitening rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) - width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) - height_shift_range=0.1, # randomly shift images vertically (fraction of total height) + # randomly shift images horizontally (fraction of total width) + width_shift_range=0.1, + # randomly shift images vertically (fraction of total height) + height_shift_range=0.1, shear_range=0., # set range for random shear zoom_range=0., # set range for random zoom channel_shift_range=0., # set range for random channel shifts - fill_mode='nearest', # set mode for filling points outside the input boundaries + # set mode for filling points outside the input boundaries + fill_mode='nearest', cval=0., # value used for fill_mode = "constant" horizontal_flip=True, # randomly flip images vertical_flip=False, # randomly flip images - rescale=None, # set rescaling factor (applied before any other transformation) - preprocessing_function=None, # set function that will be applied on each input - data_format=None, # image data format, either "channels_first" or "channels_last" - validation_split=0.0) # fraction of images reserved for validation (strictly between 0 and 1) + # set rescaling factor (applied before any other transformation) + rescale=None, + # set function that will be applied on each input + preprocessing_function=None, + # image data format, either "channels_first" or "channels_last" + data_format=None, + # fraction of images reserved for validation (strictly between 0 and 1) + validation_split=0.0) # Compute quantities required for feature-wise normalization # (std, mean, and principal components if ZCA whitening is applied). 
diff --git a/examples/cifar10_cnn_capsule.py b/examples/cifar10_cnn_capsule.py index 9c7953b1e05..56b04d91272 100644 --- a/examples/cifar10_cnn_capsule.py +++ b/examples/cifar10_cnn_capsule.py @@ -215,14 +215,19 @@ def compute_output_shape(self, input_shape): shear_range=0., # set range for random shear zoom_range=0., # set range for random zoom channel_shift_range=0., # set range for random channel shifts - fill_mode='nearest', # set mode for filling points outside the input boundaries + # set mode for filling points outside the input boundaries + fill_mode='nearest', cval=0., # value used for fill_mode = "constant" horizontal_flip=True, # randomly flip images vertical_flip=False, # randomly flip images - rescale=None, # set rescaling factor (applied before any other transformation) - preprocessing_function=None, # set function that will be applied on each input - data_format=None, # image data format, either "channels_first" or "channels_last" - validation_split=0.0) # fraction of images reserved for validation (strictly between 0 and 1) + # set rescaling factor (applied before any other transformation) + rescale=None, + # set function that will be applied on each input + preprocessing_function=None, + # image data format, either "channels_first" or "channels_last" + data_format=None, + # fraction of images reserved for validation (strictly between 0 and 1) + validation_split=0.0) # Compute quantities required for feature-wise normalization # (std, mean, and principal components if ZCA whitening is applied). diff --git a/examples/lstm_seq2seq.py b/examples/lstm_seq2seq.py index 3b0d3be3823..f7c60178657 100644 --- a/examples/lstm_seq2seq.py +++ b/examples/lstm_seq2seq.py @@ -10,7 +10,7 @@ # Summary of the algorithm - We start with input sequences from a domain (e.g. English sentences) - and correspding target sequences from another domain + and corresponding target sequences from another domain (e.g. French sentences). - An encoder LSTM turns input sequences to 2 state vectors (we keep the last LSTM state and discard the outputs). 
diff --git a/examples/pretrained_word_embeddings.py b/examples/pretrained_word_embeddings.py index d1284341ad8..c3ac0a5e661 100644 --- a/examples/pretrained_word_embeddings.py +++ b/examples/pretrained_word_embeddings.py @@ -22,6 +22,7 @@ from keras.layers import Dense, Input, GlobalMaxPooling1D from keras.layers import Conv1D, MaxPooling1D, Embedding from keras.models import Model +from keras.initializers import Constant BASE_DIR = '' @@ -116,7 +117,7 @@ # note that we set trainable = False so as to keep the embeddings fixed embedding_layer = Embedding(num_words, EMBEDDING_DIM, - weights=[embedding_matrix], + embeddings_initializer=Constant(embedding_matrix), input_length=MAX_SEQUENCE_LENGTH, trainable=False) diff --git a/examples/reuters_mlp_relu_vs_selu.py b/examples/reuters_mlp_relu_vs_selu.py index 351ca0b02ef..6df14455305 100644 --- a/examples/reuters_mlp_relu_vs_selu.py +++ b/examples/reuters_mlp_relu_vs_selu.py @@ -21,6 +21,9 @@ from keras.layers.noise import AlphaDropout from keras.preprocessing.text import Tokenizer +if K.backend() == 'mxnet': + raise NotImplementedError("MXNet Backend: Alpha Dropout is not supported yet.") + max_words = 1000 batch_size = 16 epochs = 40 diff --git a/keras/__init__.py b/keras/__init__.py index 122cb2e5267..856e0757000 100644 --- a/keras/__init__.py +++ b/keras/__init__.py @@ -23,4 +23,4 @@ from .models import Model from .models import Sequential -__version__ = '2.2.0' +__version__ = '2.2.2' diff --git a/keras/activations.py b/keras/activations.py index ba2450488e6..2cc67f4be83 100644 --- a/keras/activations.py +++ b/keras/activations.py @@ -171,6 +171,17 @@ def deserialize(name, custom_objects=None): def get(identifier): + """Get the `identifier` activation function. + + # Arguments + identifier: None or str, name of the function. + + # Returns + The activation function, `linear` if `identifier` is None. + + # Raises + ValueError if unknown identifier + """ if identifier is None: return linear if isinstance(identifier, six.string_types): diff --git a/keras/applications/__init__.py b/keras/applications/__init__.py index c34b12075f2..a2c2840bde6 100644 --- a/keras/applications/__init__.py +++ b/keras/applications/__init__.py @@ -12,7 +12,6 @@ keras_applications.set_keras_submodules( backend=backend, - engine=engine, layers=layers, models=models, utils=utils) diff --git a/keras/applications/imagenet_utils.py b/keras/applications/imagenet_utils.py index ab59cd8e238..d7594c348ad 100644 --- a/keras/applications/imagenet_utils.py +++ b/keras/applications/imagenet_utils.py @@ -4,319 +4,7 @@ from __future__ import division from __future__ import print_function -import json -import warnings -import numpy as np +from keras_applications import imagenet_utils -from ..utils.data_utils import get_file -from .. import backend as K - -CLASS_INDEX = None -CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json' - -# Global tensor of imagenet mean for preprocessing symbolic inputs -_IMAGENET_MEAN = None - - -def _preprocess_numpy_input(x, data_format, mode): - """Preprocesses a Numpy array encoding a batch of images. - - # Arguments - x: Input array, 3D or 4D. - data_format: Data format of the image array. - mode: One of "caffe", "tf" or "torch". - - caffe: will convert the images from RGB to BGR, - then will zero-center each color channel with - respect to the ImageNet dataset, - without scaling. - - tf: will scale pixels between -1 and 1, - sample-wise. 
- - torch: will scale pixels between 0 and 1 and then - will normalize each channel with respect to the - ImageNet dataset. - - # Returns - Preprocessed Numpy array. - """ - if not issubclass(x.dtype.type, np.floating): - x = x.astype(K.floatx(), copy=False) - - if mode == 'tf': - x /= 127.5 - x -= 1. - return x - - if mode == 'torch': - x /= 255. - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - else: - if data_format == 'channels_first': - # 'RGB'->'BGR' - if x.ndim == 3: - x = x[::-1, ...] - else: - x = x[:, ::-1, ...] - else: - # 'RGB'->'BGR' - x = x[..., ::-1] - mean = [103.939, 116.779, 123.68] - std = None - - # Zero-center by mean pixel - if data_format == 'channels_first': - if x.ndim == 3: - x[0, :, :] -= mean[0] - x[1, :, :] -= mean[1] - x[2, :, :] -= mean[2] - if std is not None: - x[0, :, :] /= std[0] - x[1, :, :] /= std[1] - x[2, :, :] /= std[2] - else: - x[:, 0, :, :] -= mean[0] - x[:, 1, :, :] -= mean[1] - x[:, 2, :, :] -= mean[2] - if std is not None: - x[:, 0, :, :] /= std[0] - x[:, 1, :, :] /= std[1] - x[:, 2, :, :] /= std[2] - else: - x[..., 0] -= mean[0] - x[..., 1] -= mean[1] - x[..., 2] -= mean[2] - if std is not None: - x[..., 0] /= std[0] - x[..., 1] /= std[1] - x[..., 2] /= std[2] - return x - - -def _preprocess_symbolic_input(x, data_format, mode): - """Preprocesses a tensor encoding a batch of images. - - # Arguments - x: Input tensor, 3D or 4D. - data_format: Data format of the image tensor. - mode: One of "caffe", "tf" or "torch". - - caffe: will convert the images from RGB to BGR, - then will zero-center each color channel with - respect to the ImageNet dataset, - without scaling. - - tf: will scale pixels between -1 and 1, - sample-wise. - - torch: will scale pixels between 0 and 1 and then - will normalize each channel with respect to the - ImageNet dataset. - - # Returns - Preprocessed tensor. - """ - global _IMAGENET_MEAN - - if mode == 'tf': - x /= 127.5 - x -= 1. - return x - - if mode == 'torch': - x /= 255. - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - else: - if data_format == 'channels_first': - # 'RGB'->'BGR' - if K.ndim(x) == 3: - x = x[::-1, ...] - else: - x = x[:, ::-1, ...] - else: - # 'RGB'->'BGR' - x = x[..., ::-1] - mean = [103.939, 116.779, 123.68] - std = None - - if _IMAGENET_MEAN is None: - _IMAGENET_MEAN = K.constant(-np.array(mean)) - - # Zero-center by mean pixel - if K.dtype(x) != K.dtype(_IMAGENET_MEAN): - x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format) - else: - x = K.bias_add(x, _IMAGENET_MEAN, data_format) - if std is not None: - x /= std - return x - - -def preprocess_input(x, data_format=None, mode='caffe'): - """Preprocesses a tensor or Numpy array encoding a batch of images. - - # Arguments - x: Input Numpy or symbolic tensor, 3D or 4D. - The preprocessed data is written over the input data - if the data types are compatible. To avoid this - behaviour, `numpy.copy(x)` can be used. - data_format: Data format of the image tensor/array. - mode: One of "caffe", "tf" or "torch". - - caffe: will convert the images from RGB to BGR, - then will zero-center each color channel with - respect to the ImageNet dataset, - without scaling. - - tf: will scale pixels between -1 and 1, - sample-wise. - - torch: will scale pixels between 0 and 1 and then - will normalize each channel with respect to the - ImageNet dataset. - - # Returns - Preprocessed tensor or Numpy array. - - # Raises - ValueError: In case of unknown `data_format` argument. 
- """ - if data_format is None: - data_format = K.image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) - - if isinstance(x, np.ndarray): - return _preprocess_numpy_input(x, data_format=data_format, mode=mode) - else: - return _preprocess_symbolic_input(x, data_format=data_format, - mode=mode) - - -def decode_predictions(preds, top=5): - """Decodes the prediction of an ImageNet model. - - # Arguments - preds: Numpy tensor encoding a batch of predictions. - top: Integer, how many top-guesses to return. - - # Returns - A list of lists of top class prediction tuples - `(class_name, class_description, score)`. - One list of tuples per sample in batch input. - - # Raises - ValueError: In case of invalid shape of the `pred` array - (must be 2D). - """ - global CLASS_INDEX - if len(preds.shape) != 2 or preds.shape[1] != 1000: - raise ValueError('`decode_predictions` expects ' - 'a batch of predictions ' - '(i.e. a 2D array of shape (samples, 1000)). ' - 'Found array with shape: ' + str(preds.shape)) - if CLASS_INDEX is None: - fpath = get_file('imagenet_class_index.json', - CLASS_INDEX_PATH, - cache_subdir='models', - file_hash='c2c37ea517e94d9795004a39431a14cb') - with open(fpath) as f: - CLASS_INDEX = json.load(f) - results = [] - for pred in preds: - top_indices = pred.argsort()[-top:][::-1] - result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices] - result.sort(key=lambda x: x[2], reverse=True) - results.append(result) - return results - - -def _obtain_input_shape(input_shape, - default_size, - min_size, - data_format, - require_flatten, - weights=None): - """Internal utility to compute/validate a model's input shape. - - # Arguments - input_shape: Either None (will return the default network input shape), - or a user-provided shape to be validated. - default_size: Default input width/height for the model. - min_size: Minimum input width/height accepted by the model. - data_format: Image data format to use. - require_flatten: Whether the model is expected to - be linked to a classifier via a Flatten layer. - weights: One of `None` (random initialization) - or 'imagenet' (pre-training on ImageNet). - If weights='imagenet' input channels must be equal to 3. - - # Returns - An integer shape tuple (may include None entries). - - # Raises - ValueError: In case of invalid argument values. - """ - if weights != 'imagenet' and input_shape and len(input_shape) == 3: - if data_format == 'channels_first': - if input_shape[0] not in {1, 3}: - warnings.warn( - 'This model usually expects 1 or 3 input channels. ' - 'However, it was passed an input_shape with ' + - str(input_shape[0]) + ' input channels.') - default_shape = (input_shape[0], default_size, default_size) - else: - if input_shape[-1] not in {1, 3}: - warnings.warn( - 'This model usually expects 1 or 3 input channels. 
' - 'However, it was passed an input_shape with ' + - str(input_shape[-1]) + ' input channels.') - default_shape = (default_size, default_size, input_shape[-1]) - else: - if data_format == 'channels_first': - default_shape = (3, default_size, default_size) - else: - default_shape = (default_size, default_size, 3) - if weights == 'imagenet' and require_flatten: - if input_shape is not None: - if input_shape != default_shape: - raise ValueError('When setting`include_top=True` ' - 'and loading `imagenet` weights, ' - '`input_shape` should be ' + - str(default_shape) + '.') - return default_shape - if input_shape: - if data_format == 'channels_first': - if input_shape is not None: - if len(input_shape) != 3: - raise ValueError( - '`input_shape` must be a tuple of three integers.') - if input_shape[0] != 3 and weights == 'imagenet': - raise ValueError('The input must have 3 channels; got ' - '`input_shape=' + str(input_shape) + '`') - if ((input_shape[1] is not None and input_shape[1] < min_size) or - (input_shape[2] is not None and input_shape[2] < min_size)): - raise ValueError('Input size must be at least ' + - str(min_size) + 'x' + str(min_size) + '; got ' - '`input_shape=' + str(input_shape) + '`') - else: - if input_shape is not None: - if len(input_shape) != 3: - raise ValueError( - '`input_shape` must be a tuple of three integers.') - if input_shape[-1] != 3 and weights == 'imagenet': - raise ValueError('The input must have 3 channels; got ' - '`input_shape=' + str(input_shape) + '`') - if ((input_shape[0] is not None and input_shape[0] < min_size) or - (input_shape[1] is not None and input_shape[1] < min_size)): - raise ValueError('Input size must be at least ' + - str(min_size) + 'x' + str(min_size) + '; got ' - '`input_shape=' + str(input_shape) + '`') - else: - if require_flatten: - input_shape = default_shape - else: - if data_format == 'channels_first': - input_shape = (3, None, None) - else: - input_shape = (None, None, 3) - if require_flatten: - if None in input_shape: - raise ValueError('If `include_top` is True, ' - 'you should specify a static `input_shape`. ' - 'Got `input_shape=' + str(input_shape) + '`') - return input_shape +preprocess_input = imagenet_utils.preprocess_input +decode_predictions = imagenet_utils.decode_predictions diff --git a/keras/applications/nasnet.py b/keras/applications/nasnet.py index b1f2fd00bb7..c9c487cd39b 100644 --- a/keras/applications/nasnet.py +++ b/keras/applications/nasnet.py @@ -4,7 +4,6 @@ from keras_applications import nasnet - NASNetMobile = nasnet.NASNetMobile NASNetLarge = nasnet.NASNetLarge decode_predictions = nasnet.decode_predictions diff --git a/keras/backend/__init__.py b/keras/backend/__init__.py index eaa81a37786..606773a75c2 100644 --- a/keras/backend/__init__.py +++ b/keras/backend/__init__.py @@ -11,6 +11,7 @@ from .common import cast_to_floatx from .common import image_data_format from .common import set_image_data_format +from .common import normalize_data_format # Set Keras base dir path given KERAS_HOME env variable, if applicable. # Otherwise either ~/.keras or /tmp. @@ -73,7 +74,8 @@ # Set backend based on KERAS_BACKEND flag, if applicable. if 'KERAS_BACKEND' in os.environ: _backend = os.environ['KERAS_BACKEND'] - _BACKEND = _backend + if _backend: + _BACKEND = _backend # Import backend functions. 
if _BACKEND == 'cntk': diff --git a/keras/backend/cntk_backend.py b/keras/backend/cntk_backend.py index a738b7ec16c..2eed3b6040e 100644 --- a/keras/backend/cntk_backend.py +++ b/keras/backend/cntk_backend.py @@ -4,7 +4,11 @@ import cntk as C import numpy as np -from .common import floatx, epsilon, image_dim_ordering, image_data_format +from .common import floatx +from .common import epsilon +from .common import image_data_format +from .common import normalize_data_format +from ..utils.generic_utils import transpose_shape from collections import defaultdict from contextlib import contextmanager import warnings @@ -184,10 +188,7 @@ def variable(value, dtype=None, name=None, constraint=None): def bias_add(x, bias, data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) dims = len(x.shape) if dims > 0 and x.shape[0] == C.InferredDimension: @@ -270,7 +271,7 @@ def placeholder( raise ValueError('CNTK backend: creating placeholder with ' '%d dimension is not supported, at least ' '%d dimensions are needed.' - % (len(cntk_shape, dynamic_axis_num))) + % (len(cntk_shape), dynamic_axis_num)) if name is None: name = '' @@ -369,27 +370,21 @@ def constant(value, dtype=None, shape=None, name=None): def random_binomial(shape, p=0.0, dtype=None, seed=None): - # use numpy workaround now if seed is None: # ensure that randomness is conditioned by the Numpy RNG seed = np.random.randint(10e7) - np.random.seed(seed) if dtype is None: dtype = np.float32 else: dtype = _convert_string_dtype(dtype) - size = 1 for _ in shape: if _ is None: raise ValueError('CNTK Backend: randomness op with ' 'dynamic shape is not supported now. ' 'Please provide fixed dimension ' 'instead of `None`.') - size *= _ - - binomial = np.random.binomial(1, p, size).astype(dtype).reshape(shape) - return variable(value=binomial, dtype=dtype) + return C.random.bernoulli(shape=shape, dtype=dtype, mean=p, seed=seed) def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): @@ -400,7 +395,10 @@ def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): 'Please provide fixed dimension ' 'instead of `None`.') - return random_uniform_variable(shape, minval, maxval, dtype, seed) + if seed is None: + # ensure that randomness is conditioned by the Numpy RNG + seed = np.random.randint(10e3) + return C.random.uniform(shape=shape, dtype=dtype, low=minval, high=maxval, seed=seed) def random_uniform_variable(shape, low, high, @@ -450,13 +448,14 @@ def random_normal_variable( if name is None: name = '' - return C.parameter( + p = C.parameter( shape=shape, init=C.initializer.normal( scale=scale, seed=seed), dtype=dtype, name=name) + return variable(value=p.value + mean) def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @@ -468,8 +467,10 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): 'dynamic shape is not supported now. 
' 'Please provide fixed dimension ' 'instead of `None`.') - # how to apply mean and stddev - return random_normal_variable(shape=shape, mean=mean, scale=1.0, seed=seed) + if seed is None: + # ensure that randomness is conditioned by the Numpy RNG + seed = np.random.randint(10e3) + return C.random.normal(shape=shape, mean=mean, scale=stddev, seed=seed, dtype=dtype) def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @@ -1480,10 +1481,7 @@ def hard_sigmoid(x): def conv1d(x, kernel, strides=1, padding='valid', data_format=None, dilation_rate=1): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) if padding == 'causal': # causal (dilated) convolution: @@ -1512,10 +1510,7 @@ def conv1d(x, kernel, strides=1, padding='valid', def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format=None, dilation_rate=(1, 1)): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) x = _preprocess_conv2d_input(x, data_format) kernel = _preprocess_conv2d_kernel(kernel, data_format) @@ -1546,10 +1541,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1, padding='valid', data_format=None, dilation_rate=1): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) if isinstance(strides, int): strides = (strides,) if isinstance(dilation_rate, int): @@ -1599,10 +1591,7 @@ def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1, def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1), padding='valid', data_format=None, dilation_rate=(1, 1)): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) x = _preprocess_conv2d_input(x, data_format) depthwise_kernel = _preprocess_conv2d_kernel(depthwise_kernel, data_format) @@ -1637,10 +1626,7 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1), def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid', data_format=None, dilation_rate=(1, 1)): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) x = _preprocess_conv2d_input(x, data_format) depthwise_kernel = _preprocess_conv2d_kernel(depthwise_kernel, data_format) @@ -1668,10 +1654,7 @@ def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid', def conv3d(x, kernel, strides=(1, 1, 1), padding='valid', data_format=None, dilation_rate=(1, 1, 1)): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) x = _preprocess_conv3d_input(x, data_format) kernel = 
_preprocess_conv3d_kernel(kernel, data_format) @@ -1692,10 +1675,7 @@ def conv3d(x, kernel, strides=(1, 1, 1), padding='valid', def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), padding='valid', data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) x = _preprocess_conv3d_input(x, data_format) kernel = _preprocess_conv3d_kernel(kernel, data_format) @@ -1705,12 +1685,8 @@ def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), output_shape = output_shape[1:] # in keras2, need handle output shape in different format if data_format == 'channels_last': - shape = list(output_shape) - shape[0] = output_shape[3] - shape[1] = output_shape[0] - shape[2] = output_shape[1] - shape[3] = output_shape[2] - output_shape = tuple(shape) + output_shape = transpose_shape(output_shape, 'channels_first', + spatial_axes=(0, 1, 2)) x = C.convolution_transpose( kernel, @@ -1728,10 +1704,7 @@ def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), def pool2d(x, pool_size, strides=(1, 1), padding='valid', data_format=None, pool_mode='max'): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) padding = _preprocess_border_mode(padding) strides = strides @@ -1758,10 +1731,7 @@ def pool2d(x, pool_size, strides=(1, 1), def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', data_format=None, pool_mode='max'): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) padding = _preprocess_border_mode(padding) @@ -2058,7 +2028,7 @@ def temporal_padding(x, padding=(1, 1)): return pad(x, [padding], 'channels_last', num_dynamic_axis) -def _padding(x, pattern, axis): +def _padding(x, pattern, axis): # pragma: no cover base_shape = x.shape if b_any([dim < 0 for dim in base_shape]): raise ValueError('CNTK Backend: padding input tensor with ' @@ -2089,7 +2059,7 @@ def pad(x, pad_info, data_format, num_dynamic_axis): if num_dynamic_axis == 0: pattern = [[0, 0]] + pattern return C.pad(x, pattern=pattern) - else: + else: # pragma: no cover for (a, p) in enumerate(pad_info): x = _padding(x, p, a + (1 if num_dynamic_axis == 0 else 0) + @@ -2101,10 +2071,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): assert len(padding) == 2 assert len(padding[0]) == 2 assert len(padding[1]) == 2 - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) num_dynamic_axis = _get_dynamic_axis_num(x) assert len(x.shape) == 4 - (1 if num_dynamic_axis > 0 else 0) @@ -2116,10 +2083,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): assert len(padding[0]) == 2 assert len(padding[1]) == 2 assert len(padding[2]) == 2 - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = 
normalize_data_format(data_format) num_dynamic_axis = _get_dynamic_axis_num(x) assert len(x.shape) == 5 - (1 if num_dynamic_axis > 0 else 0) @@ -2224,10 +2188,7 @@ def in_top_k(predictions, targets, k): def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), padding='valid', data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) x = _preprocess_conv2d_input(x, data_format) kernel = _preprocess_conv2d_kernel(kernel, data_format) @@ -2237,11 +2198,8 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), output_shape = output_shape[1:] # in keras2, need handle output shape in different format if data_format == 'channels_last': - shape = list(output_shape) - shape[0] = output_shape[2] - shape[1] = output_shape[0] - shape[2] = output_shape[1] - output_shape = tuple(shape) + output_shape = transpose_shape(output_shape, 'channels_first', + spatial_axes=(0, 1)) x = C.convolution_transpose( kernel, @@ -2357,10 +2315,7 @@ def _reshape_sequence(x, time_step): def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) stride = strides[0] kernel_shape = int_shape(kernel) @@ -2389,10 +2344,7 @@ def local_conv2d(inputs, strides, output_shape, data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) stride_row, stride_col = strides output_row, output_col = output_shape diff --git a/keras/backend/common.py b/keras/backend/common.py index db06138dea1..e6f72ba6e9a 100644 --- a/keras/backend/common.py +++ b/keras/backend/common.py @@ -147,6 +147,37 @@ def set_image_data_format(data_format): _IMAGE_DATA_FORMAT = str(data_format) +def normalize_data_format(value): + """Checks that the value correspond to a valid data format. + + # Arguments + value: String or None. `'channels_first'` or `'channels_last'`. + + # Returns + A string, either `'channels_first'` or `'channels_last'` + + # Example + ```python + >>> from keras import backend as K + >>> K.normalize_data_format(None) + 'channels_first' + >>> K.normalize_data_format('channels_last') + 'channels_last' + ``` + + # Raises + ValueError: if `value` or the global `data_format` invalid. + """ + if value is None: + value = image_data_format() + data_format = value.lower() + if data_format not in {'channels_first', 'channels_last'}: + raise ValueError('The `data_format` argument must be one of ' + '"channels_first", "channels_last". Received: ' + + str(value)) + return data_format + + # Legacy methods def set_image_dim_ordering(dim_ordering): diff --git a/keras/backend/mxnet_backend.py b/keras/backend/mxnet_backend.py index cd5e110cd41..35acf92a331 100644 --- a/keras/backend/mxnet_backend.py +++ b/keras/backend/mxnet_backend.py @@ -273,7 +273,14 @@ def constant(value, dtype=None, shape=None, name=None): # Returns A Constant Tensor. 
""" - if shape is None: + if dtype is None: + dtype = floatx() + + if type(value) is np.ndarray: + mx_ndarray = mx.nd.array(value, dtype=dtype) + elif type(value) is list: + mx_ndarray = mx.nd.array(value, dtype=dtype) + elif shape is None: mx_ndarray = mx.nd.array(value, dtype=dtype) else: shape = tuple([0 if dim is None else dim for dim in shape]) @@ -2662,7 +2669,7 @@ def rnn(step_function, inputs, initial_states, warnings.warn('MXNet Backend: `unroll=False` is not supported yet in RNN. Since the input_shape is known, ' 'setting `unroll=True` and continuing the execution.' 'More Details - ' - 'https://github.com/awslabs/keras-apache-mxnet/tree/master/docs/mxnet_backend/using_rnn_with_mxnet_backend.md', + 'https://github.com/awslabs/keras-apache-mxnet/tree/master/docs/mxnet_backend/using_rnn_with_mxnet_backend.md', # nopep8 stacklevel=2) # nopep8 # Split the inputs across time dimension and generate the list of inputs @@ -3306,8 +3313,9 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1), """ """ # mathematical implementation of complete separable conv2d - return _sp_convnd(x, depthwise_kernel, pointwise_kernel, strides=strides, padding_mode=padding, data_format=data_format, - filter_dilation=dilation_rate) + return _sp_convnd(x, depthwise_kernel, pointwise_kernel, strides=strides, + padding_mode=padding, data_format=data_format, + filter_dilation=dilation_rate) """ # depthwise conv2d dw_conv = depthwise_conv2d(x, depthwise_kernel, strides=strides, padding=padding, data_format=data_format, @@ -5102,6 +5110,10 @@ def get_optimizers(): optimizers = importlib.import_module('keras.optimizers') class MXOptimizer(optimizers.Optimizer, mx.optimizer.Optimizer): + """Custom MXNet Optimizer wrapping Keras Optimizer. + This is required because we cannot use Keras optimizer directly as MXNet backend does not + support symbolic optimizers. + """ def __init__(self, lr, decay): super(MXOptimizer, self).__init__() self.lr = variable(lr) @@ -5117,6 +5129,18 @@ def get_config(self): return config class SGD(MXOptimizer, mx.optimizer.SGD): + """Stochastic gradient descent optimizer. + + Includes support for momentum, + learning rate decay, and Nesterov momentum. + + # Arguments + lr: float >= 0. Learning rate. + momentum: float >= 0. Parameter that accelerates SGD + in the relevant direction and dampens oscillations. + decay: float >= 0. Learning rate decay over each update. + nesterov: boolean. Whether to apply Nesterov momentum. + """ def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, clipnorm=None, **kwargs): mx.optimizer.SGD.__init__(self, learning_rate=lr, momentum=momentum, clip_gradient=clipnorm, **kwargs) @@ -5130,6 +5154,24 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) class Adagrad(MXOptimizer, mx.optimizer.AdaGrad): + """Adagrad optimizer. + + Adagrad is an optimizer with parameter-specific learning rates, + which are adapted relative to how frequently a parameter gets + updated during training. The more updates a parameter receives, + the smaller the updates. + + It is recommended to leave the parameters of this optimizer + at their default values. + + # Arguments + lr: float >= 0. Initial learning rate. + epsilon: float >= 0. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. 
+ + # References + - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) # nopep8 + """ def __init__(self, lr=0.01, epsilon=1e-8, decay=0., clipnorm=None, **kwargs): mx.optimizer.AdaGrad.__init__(self, learning_rate=lr, eps=epsilon, clip_gradient=clipnorm, **kwargs) MXOptimizer.__init__(self, lr, decay) @@ -5142,6 +5184,30 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) class Adadelta(MXOptimizer, mx.optimizer.AdaDelta): + """Adadelta optimizer. + + Adadelta is a more robust extension of Adagrad + that adapts learning rates based on a moving window of gradient updates, + instead of accumulating all past gradients. This way, Adadelta continues + learning even when many updates have been done. Compared to Adagrad, in the + original version of Adadelta you don't have to set an initial learning + rate. In this version, initial learning rate and decay factor can + be set, as in most other Keras optimizers. + + It is recommended to leave the parameters of this optimizer + at their default values. + + # Arguments + lr: float >= 0. Initial learning rate, defaults to 1. + It is recommended to leave it at the default value. + rho: float >= 0. Adadelta decay factor, corresponding to fraction of + gradient to keep at each time step. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Initial learning rate decay. + + # References + - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) + """ def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., clipnorm=None, **kwargs): mx.optimizer.AdaDelta.__init__(self, rho=rho, epsilon=epsilon, clip_gradient=clipnorm, **kwargs) MXOptimizer.__init__(self, lr, decay) @@ -5155,6 +5221,24 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) class Adam(MXOptimizer, mx.optimizer.Adam): + """Adam optimizer. + + Default parameters follow those provided in the original paper. + + # Arguments + lr: float >= 0. Learning rate. + beta_1: float, 0 < beta < 1. Generally close to 1. + beta_2: float, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + amsgrad: boolean. Whether to apply the AMSGrad variant of this + algorithm from the paper "On the Convergence of Adam and + Beyond". + + # References + - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ) + """ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0., clipnorm=None, **kwargs): mx.optimizer.Adam.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2, @@ -5171,6 +5255,20 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) class Adamax(MXOptimizer, mx.optimizer.Adamax): + """Adamax optimizer from Adam paper's Section 7. + + It is a variant of Adam based on the infinity norm. + Default parameters follow those provided in the paper. + + # Arguments + lr: float >= 0. Learning rate. + beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. 
+ + # References + - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + """ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, decay=0., clipnorm=None, epsilon=1e-8, **kwargs): mx.optimizer.Adamax.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2, @@ -5188,6 +5286,24 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) class Nadam(MXOptimizer, mx.optimizer.Nadam): + """Nesterov Adam optimizer. + + Much like Adam is essentially RMSprop with momentum, + Nadam is Adam RMSprop with Nesterov momentum. + + Default parameters follow those provided in the paper. + It is recommended to leave the parameters of this optimizer + at their default values. + + # Arguments + lr: float >= 0. Learning rate. + beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + + # References + - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf) + - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf) # nopep8 + """ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0., clipnorm=None, schedule_decay=0.004, **kwargs): mx.optimizer.Nadam.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2, epsilon=epsilon, @@ -5204,6 +5320,24 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) class RMSprop(MXOptimizer, mx.optimizer.RMSProp): + """RMSProp optimizer. + + It is recommended to leave the parameters of this optimizer + at their default values + (except the learning rate, which can be freely tuned). + + This optimizer is usually a good choice for recurrent + neural networks. + + # Arguments + lr: float >= 0. Learning rate. + rho: float >= 0. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. 
+ + # References + - [rmsprop: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) # nopep8 + """ def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., clipnorm=None, **kwargs): mx.optimizer.RMSProp.__init__(self, learning_rate=lr, gamma1=rho, epsilon=epsilon, clip_gradient=clipnorm, **kwargs) diff --git a/keras/backend/tensorflow_backend.py b/keras/backend/tensorflow_backend.py index 346d0b88662..21d15963a36 100644 --- a/keras/backend/tensorflow_backend.py +++ b/keras/backend/tensorflow_backend.py @@ -17,8 +17,10 @@ import numpy as np import os -from .common import floatx, epsilon -from .common import image_data_format +from .common import floatx +from .common import epsilon +from .common import normalize_data_format +from ..utils.generic_utils import transpose_shape from ..utils.generic_utils import has_arg # Legacy functions @@ -1871,18 +1873,28 @@ def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): """ if ndim(x) == 4: # The CPU implementation of FusedBatchNorm only support NHWC - if axis == 1: + if axis == 1 or axis == -3: tf_data_format = 'NCHW' - elif axis == 3: + elif axis == 3 or axis == -1: tf_data_format = 'NHWC' else: tf_data_format = None if tf_data_format == 'NHWC' or tf_data_format == 'NCHW' and _has_nchw_support(): + # The mean / var / beta / gamma may be processed by broadcast + # so it may have extra axes with 1, it is not needed and should be removed + if ndim(mean) > 1: + mean = tf.squeeze(mean) + if ndim(var) > 1: + var = tf.squeeze(var) if beta is None: beta = zeros_like(mean) + elif ndim(beta) > 1: + beta = tf.squeeze(beta) if gamma is None: gamma = ones_like(mean) + elif ndim(gamma) > 1: + gamma = tf.squeeze(gamma) y, _, _ = tf.nn.fused_batch_norm( x, gamma, @@ -2225,20 +2237,13 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): assert len(padding) == 2 assert len(padding[0]) == 2 assert len(padding[1]) == 2 - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) - if data_format == 'channels_first': - pattern = [[0, 0], - [0, 0], - list(padding[0]), - list(padding[1])] - else: - pattern = [[0, 0], - list(padding[0]), list(padding[1]), - [0, 0]] + pattern = [[0, 0], + list(padding[0]), + list(padding[1]), + [0, 0]] + pattern = transpose_shape(pattern, data_format, spatial_axes=(1, 2)) return tf.pad(x, pattern) @@ -2269,27 +2274,17 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): assert len(padding[0]) == 2 assert len(padding[1]) == 2 assert len(padding[2]) == 2 - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) + + pattern = [ + [0, 0], + [padding[0][0], padding[0][1]], + [padding[1][0], padding[1][1]], + [padding[2][0], padding[2][1]], + [0, 0] + ] + pattern = transpose_shape(pattern, data_format, spatial_axes=(1, 2, 3)) - if data_format == 'channels_first': - pattern = [ - [0, 0], - [0, 0], - [padding[0][0], padding[0][1]], - [padding[1][0], padding[1][1]], - [padding[2][0], padding[2][1]] - ] - else: - pattern = [ - [0, 0], - [padding[0][0], padding[0][1]], - [padding[1][0], padding[1][1]], - [padding[2][0], padding[2][1]], - [0, 0] 
- ] return tf.pad(x, pattern) @@ -2611,12 +2606,12 @@ def _call(self, inputs): # `callable_fn` only supports exact matches. array_vals.append( np.asarray(value, - dtype=tensor.dtype.base_dtype.name)) + dtype=tf.as_dtype(tensor.dtype).as_numpy_dtype)) if self.feed_dict: for key in sorted(self.feed_dict.keys()): array_vals.append( np.asarray(self.feed_dict[key], - dtype=key.dtype.base_dtype.name)) + dtype=tf.as_dtype(key.dtype).as_numpy_dtype)) # Refresh callable if anything has changed. if (self._callable_fn is None or @@ -2736,7 +2731,7 @@ def rnn(step_function, inputs, initial_states, states: List of tensors. Returns: outputs: Tensor with shape (samples, ...) (no time dimension), - new_states: Tist of tensors, same length and shapes + new_states: List of tensors, same length and shapes as 'states'. inputs: Tensor of temporal data of shape (samples, time, ...) (at least 3D). @@ -3498,10 +3493,7 @@ def conv1d(x, kernel, strides=1, padding='valid', ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) kernel_shape = kernel.get_shape().as_list() if padding == 'causal': @@ -3549,10 +3541,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) x, tf_data_format = _preprocess_conv2d_input(x, data_format) @@ -3591,10 +3580,7 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) if isinstance(output_shape, (tuple, list)): output_shape = tf.stack(output_shape) @@ -3643,10 +3629,7 @@ def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1, ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) if isinstance(strides, int): strides = (strides,) if isinstance(dilation_rate, int): @@ -3704,10 +3687,7 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1), ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) x, tf_data_format = _preprocess_conv2d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3746,10 +3726,7 @@ def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid', ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. 
""" - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) x, tf_data_format = _preprocess_conv2d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3789,10 +3766,7 @@ def conv3d(x, kernel, strides=(1, 1, 1), padding='valid', ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) x, tf_data_format = _preprocess_conv3d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3829,10 +3803,7 @@ def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) if isinstance(output_shape, (tuple, list)): output_shape = tf.stack(output_shape) @@ -3882,10 +3853,7 @@ def pool2d(x, pool_size, strides=(1, 1), ValueError: if `data_format` is neither `"channels_last"` or `"channels_first"`. ValueError: if `pool_mode` is neither `"max"` or `"avg"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) x, tf_data_format = _preprocess_conv2d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3931,10 +3899,7 @@ def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', ValueError: if `data_format` is neither `"channels_last"` or `"channels_first"`. ValueError: if `pool_mode` is neither `"max"` or `"avg"`. 
""" - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) x, tf_data_format = _preprocess_conv3d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3979,25 +3944,18 @@ def bias_add(x, bias, data_format=None): the bias should be either a vector or a tensor with ndim(x) - 1 dimension """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) bias_shape = int_shape(bias) if len(bias_shape) != 1 and len(bias_shape) != ndim(x) - 1: raise ValueError('Unexpected bias dimensions %d, expect to be 1 or %d dimensions' % (len(bias_shape), ndim(x))) if ndim(x) == 5: - if data_format == 'channels_first': - if len(bias_shape) == 1: - x += reshape(bias, (1, bias_shape[0], 1, 1, 1)) - else: - x += reshape(bias, (1, bias_shape[3]) + bias_shape[:3]) - elif data_format == 'channels_last': - if len(bias_shape) == 1: - x += reshape(bias, (1, 1, 1, bias_shape[0])) - else: - x += reshape(bias, (1,) + bias_shape) + if len(bias_shape) == 1: + new_shape = (1, 1, 1, 1, bias_shape[0]) + else: + new_shape = (1,) + bias_shape + new_shape = transpose_shape(new_shape, data_format, spatial_axes=(1, 2, 3)) + x += reshape(bias, new_shape) elif ndim(x) == 4: if data_format == 'channels_first': if len(bias_shape) == 1: @@ -4015,16 +3973,12 @@ def bias_add(x, bias, data_format=None): else: x += reshape(bias, (1,) + bias_shape) elif ndim(x) == 3: - if data_format == 'channels_first': - if len(bias_shape) == 1: - x += reshape(bias, (1, bias_shape[0], 1)) - else: - x += reshape(bias, (1, bias_shape[1], bias_shape[0])) - elif data_format == 'channels_last': - if len(bias_shape) == 1: - x += reshape(bias, (1, 1, bias_shape[0])) - else: - x += reshape(bias, (1, ) + bias_shape) + if len(bias_shape) == 1: + new_shape = (1, 1, bias_shape[0]) + else: + new_shape = (1,) + bias_shape + new_shape = transpose_shape(new_shape, data_format, spatial_axes=(1,)) + x += reshape(bias, new_shape) else: x = tf.nn.bias_add(x, bias) return x @@ -4311,10 +4265,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): ValueError: If `data_format` is neither `"channels_last"` nor `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) stride = strides[0] kernel_shape = int_shape(kernel) @@ -4363,10 +4314,7 @@ def local_conv2d(inputs, kernel, kernel_size, strides, output_shape, data_format ValueError: if `data_format` is neither `channels_last` or `channels_first`. 
""" - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) + data_format = normalize_data_format(data_format) stride_row, stride_col = strides output_row, output_col = output_shape diff --git a/keras/backend/theano_backend.py b/keras/backend/theano_backend.py index f91555481b1..70f10bb7c48 100644 --- a/keras/backend/theano_backend.py +++ b/keras/backend/theano_backend.py @@ -20,7 +20,10 @@ from theano.sandbox.softsign import softsign as T_softsign import numpy as np -from .common import floatx, epsilon, image_data_format +from .common import floatx +from .common import epsilon +from .common import normalize_data_format +from ..utils.generic_utils import transpose_shape from ..utils.generic_utils import has_arg # Legacy functions from .common import set_image_dim_ordering, image_dim_ordering @@ -780,8 +783,8 @@ def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): # TODO remove this function when Theano without # T.nnet.bn.batch_normalization_train is deprecated -def _old_normalize_batch_in_training(x, gamma, beta, - reduction_axes, epsilon=1e-3): +def _old_normalize_batch_in_training(x, gamma, beta, reduction_axes, + epsilon=1e-3): # pragma: no cover """Computes mean and std for batch then apply batch_normalization on batch. """ if gamma is None: @@ -828,7 +831,8 @@ def _old_normalize_batch_in_training(x, gamma, beta, # TODO remove this if statement when Theano without # T.nnet.bn.batch_normalization_test is deprecated -def _old_batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3): +def _old_batch_normalization(x, mean, var, beta, gamma, + epsilon=1e-3): # pragma: no cover """Apply batch normalization on x given mean, var, beta and gamma. """ if gamma is None: @@ -1115,10 +1119,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): assert len(padding[1]) == 2 top_pad, bottom_pad = padding[0] left_pad, right_pad = padding[1] - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) input_shape = x.shape if data_format == 'channels_first': @@ -1151,10 +1152,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): """Pad the 2nd, 3rd and 4th dimensions of a 5D tensor with "padding[0]", "padding[1]" and "padding[2]" (resp.) zeros left and right. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) input_shape = x.shape if data_format == 'channels_first': @@ -1323,7 +1321,7 @@ def rnn(step_function, inputs, initial_states, states: List of tensors. Returns: outputs: Tensor with shape (samples, ...) (no time dimension), - new_states: Tist of tensors, same length and shapes + new_states: List of tensors, same length and shapes as 'states'. inputs: Tensor of temporal data of shape (samples, time, ...) (at least 3D). 
@@ -1826,8 +1824,8 @@ def int_or_none(value): return None if data_format == 'channels_last': if image_shape: - image_shape = (image_shape[0], image_shape[3], - image_shape[1], image_shape[2]) + image_shape = transpose_shape(image_shape, 'channels_first', + spatial_axes=(1, 2)) if image_shape is not None: image_shape = tuple(int_or_none(v) for v in image_shape) return image_shape @@ -1933,10 +1931,7 @@ def conv1d(x, kernel, strides=1, padding='valid', data_format: string, one of "channels_last", "channels_first" dilation_rate: integer. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ', data_format) + data_format = normalize_data_format(data_format) kernel_shape = int_shape(kernel) if padding == 'causal': @@ -1990,10 +1985,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', Whether to use Theano or TensorFlow data format in inputs/kernels/outputs. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ', data_format) + data_format = normalize_data_format(data_format) image_shape = _preprocess_conv2d_image_shape(int_shape(x), data_format) kernel_shape = int_shape(kernel) @@ -2033,10 +2025,7 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), ValueError: if using an even kernel size with padding 'same'. """ flip_filters = False - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + data_format) + data_format = normalize_data_format(data_format) if data_format == 'channels_last': output_shape = (output_shape[0], @@ -2089,10 +2078,7 @@ def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1, # Raises ValueError: if `data_format` is neither `"channels_last"` or `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ', data_format) + data_format = normalize_data_format(data_format) if isinstance(strides, int): strides = (strides,) if isinstance(dilation_rate, int): @@ -2163,10 +2149,7 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1), # Raises ValueError: if `data_format` is neither `"channels_last"` or `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ', data_format) + data_format = normalize_data_format(data_format) image_shape = _preprocess_conv2d_image_shape(int_shape(x), data_format) depthwise_kernel_shape = int_shape(depthwise_kernel) @@ -2221,10 +2204,7 @@ def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid', # Raises ValueError: if `data_format` is neither `"channels_last"` or `"channels_first"`. """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ', data_format) + data_format = normalize_data_format(data_format) image_shape = _preprocess_conv2d_image_shape(int_shape(x), data_format) depthwise_kernel_shape = int_shape(depthwise_kernel) @@ -2261,10 +2241,7 @@ def conv3d(x, kernel, strides=(1, 1, 1), Whether to use Theano or TensorFlow data format in inputs/kernels/outputs. 
""" - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format:', data_format) + data_format = normalize_data_format(data_format) volume_shape = _preprocess_conv3d_volume_shape(int_shape(x), data_format) kernel_shape = int_shape(kernel) @@ -2304,10 +2281,7 @@ def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), ValueError: if using an even kernel size with padding 'same'. """ flip_filters = False - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + data_format) + data_format = normalize_data_format(data_format) if data_format == 'channels_last': output_shape = (output_shape[0], @@ -2344,10 +2318,7 @@ def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), def pool2d(x, pool_size, strides=(1, 1), padding='valid', data_format=None, pool_mode='max'): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format:', data_format) + data_format = normalize_data_format(data_format) assert pool_size[0] >= 1 and pool_size[1] >= 1 @@ -2389,10 +2360,7 @@ def pool2d(x, pool_size, strides=(1, 1), padding='valid', def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', data_format=None, pool_mode='max'): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format:', data_format) + data_format = normalize_data_format(data_format) if padding == 'same': w_pad = pool_size[0] - 2 if pool_size[0] % 2 == 1 else pool_size[0] - 1 @@ -2436,10 +2404,7 @@ def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', def bias_add(x, bias, data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) if ndim(bias) != 1 and ndim(bias) != ndim(x) - 1: raise ValueError('Unexpected bias dimensions %d, ' 'expect to be 1 or %d dimensions' @@ -2701,10 +2666,7 @@ def foldr(fn, elems, initializer=None, name=None): def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) stride = strides[0] kernel_shape = int_shape(kernel) @@ -2723,10 +2685,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): def local_conv2d(inputs, kernel, kernel_size, strides, output_shape, data_format=None): - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + data_format = normalize_data_format(data_format) stride_row, stride_col = strides output_row, output_col = output_shape diff --git a/keras/callbacks.py b/keras/callbacks.py index cc27f70a648..f151e688033 100644 --- a/keras/callbacks.py +++ b/keras/callbacks.py @@ -637,7 +637,7 @@ def on_epoch_begin(self, epoch, logs=None): 'should be float.') K.set_value(self.model.optimizer.lr, lr) if self.verbose > 0: - print('\nEpoch %05d: LearningRateScheduler reducing learning ' + 
print('\nEpoch %05d: LearningRateScheduler setting learning ' 'rate to %s.' % (epoch + 1, lr)) diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py index 4dc032eb78f..8f6cb4f9516 100644 --- a/keras/datasets/imdb.py +++ b/keras/datasets/imdb.py @@ -103,7 +103,7 @@ def load_data(path='imdb.npz', num_words=None, skip_top=0, def get_word_index(path='imdb_word_index.json'): - """Retrieves the dictionary mapping word indices back to words. + """Retrieves the dictionary mapping words to word indices. # Arguments path: where to cache the data (relative to `~/.keras/dataset`). diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py index 8644edc32d3..44aff6070cc 100644 --- a/keras/datasets/reuters.py +++ b/keras/datasets/reuters.py @@ -89,7 +89,7 @@ def load_data(path='reuters.npz', num_words=None, skip_top=0, def get_word_index(path='reuters_word_index.json'): - """Retrieves the dictionary mapping word indices back to words. + """Retrieves the dictionary mapping words to word indices. # Arguments path: where to cache the data (relative to `~/.keras/dataset`). diff --git a/keras/engine/base_layer.py b/keras/engine/base_layer.py index 9ccccba1a1a..f02662f1a9f 100644 --- a/keras/engine/base_layer.py +++ b/keras/engine/base_layer.py @@ -14,6 +14,7 @@ from ..utils.generic_utils import has_arg from ..utils.generic_utils import object_list_uid from ..utils.generic_utils import to_list +from ..utils.generic_utils import unpack_singleton from ..utils.generic_utils import is_all_none from ..legacy import interfaces @@ -33,7 +34,6 @@ class Layer(object): ill-defined (e.g. a shared layer with multiple input shapes), in which case requesting `input_shape` will raise an Exception. Prefer using - `layer.get_input_shape_for(input_shape)`, or `layer.get_input_shape_at(node_index)`. input_spec: List of InputSpec class instances each entry describes one required input: @@ -428,10 +428,7 @@ def __call__(self, inputs, **kwargs): 'and thus cannot be built. ' 'You can build it manually via: ' '`layer.build(batch_input_shape)`') - if len(input_shapes) == 1: - self.build(input_shapes[0]) - else: - self.build(input_shapes) + self.build(unpack_singleton(input_shapes)) self.built = True # Load weights that were specified at layer instantiation. @@ -469,10 +466,7 @@ def __call__(self, inputs, **kwargs): if x in inputs_ls: x = K.identity(x) output_ls_copy.append(x) - if len(output_ls_copy) == 1: - output = output_ls_copy[0] - else: - output = output_ls_copy + output = unpack_singleton(output_ls_copy) # Inferring the output shape is only relevant for Theano. if all([s is not None @@ -669,10 +663,7 @@ def _get_node_attribute_at_index(self, node_index, attr, attr_name): ', but the layer has only ' + str(len(self._inbound_nodes)) + ' inbound nodes.') values = getattr(self._inbound_nodes[node_index], attr) - if len(values) == 1: - return values[0] - else: - return values + return unpack_singleton(values) def get_input_shape_at(self, node_index): """Retrieves the input shape(s) of a layer at a given node. 
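Several of the `base_layer.py` hunks above collapse the recurring `if len(x) == 1: return x[0]` pattern into a single `unpack_singleton` helper imported from `keras.utils.generic_utils`. The helper's body is not part of this diff, so the version below is only a plausible minimal sketch consistent with how the call sites use it.

```python
def unpack_singleton(x):
    # Sketch only: return the sole element of a length-one list/tuple,
    # otherwise return the container unchanged.
    if len(x) == 1:
        return x[0]
    return x

print(unpack_singleton([(None, 32)]))             # (None, 32)
print(unpack_singleton([(None, 32), (None, 8)]))  # [(None, 32), (None, 8)]
```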
@@ -898,10 +889,7 @@ def input_shape(self): [str(node.input_shapes) for node in self._inbound_nodes]) if len(all_input_shapes) == 1: input_shapes = self._inbound_nodes[0].input_shapes - if len(input_shapes) == 1: - return input_shapes[0] - else: - return input_shapes + return unpack_singleton(input_shapes) else: raise AttributeError('The layer "' + str(self.name) + ' has multiple inbound nodes, ' @@ -933,10 +921,7 @@ def output_shape(self): [str(node.output_shapes) for node in self._inbound_nodes]) if len(all_output_shapes) == 1: output_shapes = self._inbound_nodes[0].output_shapes - if len(output_shapes) == 1: - return output_shapes[0] - else: - return output_shapes + return unpack_singleton(output_shapes) else: raise AttributeError('The layer "' + str(self.name) + ' has multiple inbound nodes, ' @@ -1327,9 +1312,7 @@ def _collect_previous_mask(input_tensors): masks.append(mask) else: masks.append(None) - if len(masks) == 1: - return masks[0] - return masks + return unpack_singleton(masks) def _to_snake_case(name): @@ -1358,6 +1341,4 @@ def _collect_input_shape(input_tensors): shapes.append(K.int_shape(x)) except TypeError: shapes.append(None) - if len(shapes) == 1: - return shapes[0] - return shapes + return unpack_singleton(shapes) diff --git a/keras/engine/input_layer.py b/keras/engine/input_layer.py index 9ab9bca3360..632bf39eaee 100644 --- a/keras/engine/input_layer.py +++ b/keras/engine/input_layer.py @@ -8,6 +8,7 @@ from .base_layer import Node from .. import backend as K from ..legacy import interfaces +from ..utils.generic_utils import unpack_singleton class InputLayer(Layer): @@ -41,6 +42,7 @@ def __init__(self, input_shape=None, batch_size=None, self.trainable = False self.built = True self.sparse = sparse + self.supports_masking = True if input_shape and batch_input_shape: raise ValueError('Only provide the input_shape OR ' @@ -177,7 +179,4 @@ def Input(shape=None, batch_shape=None, # Return tensor including _keras_shape and _keras_history. # Note that in this case train_output and test_output are the same pointer. outputs = input_layer._inbound_nodes[0].output_tensors - if len(outputs) == 1: - return outputs[0] - else: - return outputs + return unpack_singleton(outputs) diff --git a/keras/engine/network.py b/keras/engine/network.py index 3aef5964c79..51ea1d12b95 100644 --- a/keras/engine/network.py +++ b/keras/engine/network.py @@ -23,6 +23,7 @@ from ..utils.generic_utils import has_arg from ..utils.generic_utils import to_list from ..utils.generic_utils import object_list_uid +from ..utils.generic_utils import unpack_singleton from ..legacy import interfaces try: @@ -131,7 +132,7 @@ def _base_init(self, name=None): # Entries are unique. Includes input and output layers. self._layers = [] - # Used only in conjonction with graph-networks + # Used only in conjunction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] @@ -159,36 +160,36 @@ def _init_graph_network(self, inputs, outputs, name=None): if not hasattr(x, '_keras_history'): cls_name = self.__class__.__name__ raise ValueError('Input tensors to a ' + cls_name + ' ' + - 'must come from `tf.layers.Input`. ' + 'must come from `keras.layers.Input`. ' 'Received: ' + str(x) + ' (missing previous layer metadata).') - # Check that x is an input tensor. 
- layer, node_index, tensor_index = x._keras_history - if (len(layer._inbound_nodes) > 1 or - (layer._inbound_nodes and - layer._inbound_nodes[0].inbound_layers)): - cls_name = self.__class__.__name__ - warnings.warn(cls_name + ' inputs must come from ' - '`tf.layers.Input` ' - '(thus holding past layer metadata), ' - 'they cannot be the output of ' - 'a previous non-Input layer. ' - 'Here, a tensor specified as ' - 'input to "' + self.name + - '" was not an Input tensor, ' - 'it was generated by layer ' + - layer.name + '.\n' - 'Note that input tensors are ' - 'instantiated via ' - '`tensor = tf.layers.Input(shape)`.\n' - 'The tensor that caused the issue was: ' + - str(x.name)) + # Check that x is an input tensor. + layer, node_index, tensor_index = x._keras_history + if (len(layer._inbound_nodes) > 1 or + (layer._inbound_nodes and + layer._inbound_nodes[0].inbound_layers)): + cls_name = self.__class__.__name__ + warnings.warn(cls_name + ' inputs must come from ' + '`keras.layers.Input` ' + '(thus holding past layer metadata), ' + 'they cannot be the output of ' + 'a previous non-Input layer. ' + 'Here, a tensor specified as ' + 'input to your model ' + 'was not an Input tensor, ' + 'it was generated by layer ' + + layer.name + '.\n' + 'Note that input tensors are ' + 'instantiated via ' + '`tensor = keras.layers.Input(shape)`.\n' + 'The tensor that caused the issue was: ' + + str(x.name)) for x in self.outputs: if not hasattr(x, '_keras_history'): cls_name = self.__class__.__name__ raise ValueError('Output tensors to a ' + cls_name + ' must be ' - 'the output of a TensorFlow `Layer` ' + 'the output of a Keras `Layer` ' '(thus holding past layer metadata). ' 'Found: ' + str(x)) self._base_init(name=name) @@ -267,10 +268,7 @@ def _init_graph_network(self, inputs, outputs, name=None): node = layer._inbound_nodes[node_index] mask = node.output_masks[tensor_index] masks.append(mask) - if len(masks) == 1: - mask = masks[0] - else: - mask = masks + mask = unpack_singleton(masks) self._output_mask_cache[mask_cache_key] = mask # Build self.input_names and self.output_names. @@ -524,7 +522,7 @@ def input_spec(self): or a single instance if the model has only one input. """ if not self._is_graph_network: - # TODO: support it in subclassd networks after inputs are set. + # TODO: support it in subclassed networks after inputs are set. return None specs = [] @@ -539,9 +537,7 @@ def input_spec(self): 'Found input_spec = ' + str(layer.input_spec)) specs += layer.input_spec - if len(specs) == 1: - return specs[0] - return specs + return unpack_singleton(specs) def call(self, inputs, mask=None): """Calls the model on new inputs. @@ -605,8 +601,8 @@ def compute_output_shape(self, input_shape): cache_key = ', '.join([str(x) for x in input_shapes]) if cache_key in self._output_shape_cache: output_shapes = self._output_shape_cache[cache_key] - if isinstance(output_shapes, list) and len(output_shapes) == 1: - return output_shapes[0] + if isinstance(output_shapes, list): + return unpack_singleton(output_shapes) return output_shapes else: # Bad luck, we have to run the graph manually. 
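The `compute_output_shape` hunks that follow route any cached list of shapes through `unpack_singleton`, preserving the existing contract: a single-output model returns a bare shape tuple while a multi-output model returns a list of tuples. A quick illustration of that contract, assuming a working Keras install (the snippet does not depend on which side of this refactor is installed):

```python
from keras.layers import Dense, Input
from keras.models import Model

inp = Input(shape=(8,))
single = Model(inp, Dense(4)(inp))
multi = Model(inp, [Dense(4)(inp), Dense(2)(inp)])

print(single.compute_output_shape((None, 8)))  # (None, 4) -- plain tuple
print(multi.compute_output_shape((None, 8)))   # [(None, 4), (None, 2)] -- list
```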
@@ -643,10 +639,7 @@ def compute_output_shape(self, input_shape): input_shape = layers_to_output_shapes[shape_key] input_shapes.append(input_shape) - if len(input_shapes) == 1: - output_shape = layer.compute_output_shape(input_shapes[0]) - else: - output_shape = layer.compute_output_shape(input_shapes) + output_shape = layer.compute_output_shape(unpack_singleton(input_shapes)) output_shapes = to_list(output_shape) node_index = layer._inbound_nodes.index(node) @@ -669,8 +662,8 @@ def compute_output_shape(self, input_shape): output_shapes.append(layers_to_output_shapes[key]) # Store in cache. self._output_shape_cache[cache_key] = output_shapes - if isinstance(output_shapes, list) and len(output_shapes) == 1: - return output_shapes[0] + if isinstance(output_shapes, list): + return unpack_singleton(output_shapes) return output_shapes def run_internal_graph(self, inputs, masks=None): @@ -736,6 +729,8 @@ def run_internal_graph(self, inputs, masks=None): else: output_masks = to_list(output_masks) computed_tensors = [computed_tensor] + + # computed_masks might be used in the future. computed_masks = [computed_mask] else: computed_tensors = [x[0] for x in computed_data] @@ -781,12 +776,10 @@ def run_internal_graph(self, inputs, masks=None): # Update _keras_shape. if all([hasattr(x, '_keras_shape') for x in computed_tensors]): - if len(computed_tensors) == 1: - shapes = to_list(layer.compute_output_shape(computed_tensors[0]._keras_shape)) - uses_learning_phase = computed_tensors[0]._uses_learning_phase - else: - shapes = to_list(layer.compute_output_shape([x._keras_shape for x in computed_tensors])) - uses_learning_phase = any([x._uses_learning_phase for x in computed_tensors]) + input_shapes = unpack_singleton([x._keras_shape for x in computed_tensors]) + shapes = to_list(layer.compute_output_shape(input_shapes)) + uses_learning_phase = any([x._uses_learning_phase for x in computed_tensors]) + for x, s in zip(output_tensors, shapes): x._keras_shape = s x._uses_learning_phase = getattr(x, '_uses_learning_phase', False) or uses_learning_phase @@ -814,26 +807,18 @@ def run_internal_graph(self, inputs, masks=None): cache_key = object_list_uid(inputs) cache_key += '_' + object_list_uid(masks) - if len(output_tensors) == 1: - output_tensors = output_tensors[0] - self._output_tensor_cache[cache_key] = output_tensors - else: - self._output_tensor_cache[cache_key] = output_tensors + output_tensors = unpack_singleton(output_tensors) + self._output_tensor_cache[cache_key] = output_tensors - if len(output_masks) == 1: - output_masks = output_masks[0] - self._output_mask_cache[cache_key] = output_masks - else: - self._output_mask_cache[cache_key] = output_masks + output_masks = unpack_singleton(output_masks) + self._output_mask_cache[cache_key] = output_masks if output_shapes is not None: input_shapes = [x._keras_shape for x in inputs] cache_key = ', '.join([str(x) for x in input_shapes]) - if len(output_shapes) == 1: - output_shapes = output_shapes[0] - self._output_shape_cache[cache_key] = output_shapes - else: - self._output_shape_cache[cache_key] = output_shapes + + output_shapes = unpack_singleton(output_shapes) + self._output_shape_cache[cache_key] = output_shapes return output_tensors, output_masks, output_shapes def get_config(self): @@ -1000,10 +985,7 @@ def process_node(layer, node_data): # Call layer on its inputs, thus creating the node # and building the layer if needed. 
if input_tensors: - if len(input_tensors) == 1: - layer(input_tensors[0], **kwargs) - else: - layer(input_tensors, **kwargs) + layer(unpack_singleton(input_tensors), **kwargs) def process_layer(layer_data): """Deserializes a layer, then call it on appropriate inputs. @@ -1213,7 +1195,10 @@ def to_json(self, **kwargs): def get_json_type(obj): # If obj is any numpy type if type(obj).__module__ == np.__name__: - return obj.item() + if isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj.item() # If obj is a python 'type' if type(obj).__name__ == type.__name__: @@ -1261,7 +1246,7 @@ def summary(self, line_length=None, positions=None, print_fn=None): """ if not self.built: raise ValueError( - 'This model has never been called, this its weights ' + 'This model has never been called, thus its weights ' 'have not yet been created, so no summary can be displayed. ' 'Build the model first ' '(e.g. by calling it on some test data).') diff --git a/keras/engine/saving.py b/keras/engine/saving.py index 9a29c7faa98..007fd4249a3 100644 --- a/keras/engine/saving.py +++ b/keras/engine/saving.py @@ -156,8 +156,7 @@ def get_json_type(obj): # if obj is any numpy type if type(obj).__module__ == np.__name__: if isinstance(obj, np.ndarray): - return {'type': type(obj), - 'value': obj.tolist()} + return obj.tolist() else: return obj.item() @@ -910,7 +909,7 @@ def _need_convert_kernel(original_backend): The convolution operation is implemented differently in different backends. While TH implements convolution, TF and CNTK implement the correlation operation. So the channel axis needs to be flipped when we're loading TF weights onto a TH model, - or vice verca. However, there's no conversion required between TF and CNTK. + or vice versa. However, there's no conversion required between TF and CNTK. # Arguments original_backend: Keras backend the weights were trained with, as a string. @@ -923,8 +922,7 @@ def _need_convert_kernel(original_backend): return False uses_correlation = {'tensorflow': True, 'theano': False, - 'cntk': True, - 'mxnet': False} + 'cntk': True} if original_backend not in uses_correlation: # By default, do not convert the kernels if the original backend is unknown return False diff --git a/keras/engine/sequential.py b/keras/engine/sequential.py index 39de3f217c7..e75a68687be 100644 --- a/keras/engine/sequential.py +++ b/keras/engine/sequential.py @@ -149,8 +149,6 @@ def add(self, layer): first_layer = layer.layers[0] while isinstance(first_layer, (Model, Sequential)): first_layer = first_layer.layers[0] - batch_shape = first_layer.batch_input_shape - dtype = first_layer.dtype if hasattr(first_layer, 'batch_input_shape'): batch_shape = first_layer.batch_input_shape @@ -165,11 +163,6 @@ def add(self, layer): # to the input layer we just created. layer(x) set_inputs = True - else: - # The layer doesn't know about its expected shape. - # We will have to - # build the model lazily on `fit`/etc. - batch_shape = None else: # Corner case where the user passes an InputLayer via `add`. assert len(layer._inbound_nodes[-1].output_tensors) == 1 diff --git a/keras/engine/training.py b/keras/engine/training.py index f5eed1cecdd..3d8d14a3966 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -25,6 +25,8 @@ from .. import losses from .. 
import metrics as metrics_module from ..utils.generic_utils import slice_arrays +from ..utils.generic_utils import to_list +from ..utils.generic_utils import unpack_singleton from ..legacy import interfaces @@ -154,8 +156,7 @@ def compile(self, optimizer, masks = self.compute_mask(self.inputs, mask=None) if masks is None: masks = [None for _ in self.outputs] - if not isinstance(masks, list): - masks = [masks] + masks = to_list(masks) # Prepare loss weights. if loss_weights is None: @@ -426,7 +427,6 @@ def handle_metrics(metrics, weights=None): self.stateful_metric_names.append(metric_name) self.stateful_metric_functions.append(metric_fn) self.metrics_updates += metric_fn.updates - with K.name_scope('metrics'): for i in range(len(self.outputs)): if i in skip_target_indices: @@ -437,7 +437,6 @@ def handle_metrics(metrics, weights=None): weights = sample_weights[i] output_metrics = nested_metrics[i] output_weighted_metrics = nested_weighted_metrics[i] - handle_metrics(output_metrics) handle_metrics(output_weighted_metrics, weights=weights) @@ -622,16 +621,10 @@ def _set_inputs(self, inputs, outputs=None, training=None): if outputs is None: # Obtain symbolic outputs by calling the model. - if len(self.inputs) == 1: - if self._expects_training_arg: - outputs = self.call(self.inputs[0], training=training) - else: - outputs = self.call(self.inputs[0]) + if self._expects_training_arg: + outputs = self.call(unpack_singleton(self.inputs), training=training) else: - if self._expects_training_arg: - outputs = self.call(self.inputs, training=training) - else: - outputs = self.call(self.inputs) + outputs = self.call(unpack_singleton(self.inputs)) if isinstance(outputs, (list, tuple)): outputs = list(outputs) else: @@ -1127,7 +1120,7 @@ def predict(self, x, # Arguments x: The input data, as a Numpy array - (or list of Numpy arrays if the model has multiple outputs). + (or list of Numpy arrays if the model has multiple inputs). batch_size: Integer. If unspecified, it will default to 32. verbose: Verbosity mode, 0 or 1. steps: Total number of steps (batches of samples) @@ -1220,9 +1213,7 @@ class indices (integers) to ins = x + y + sample_weights self._make_train_function() outputs = self.train_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + return unpack_singleton(outputs) def test_on_batch(self, x, y, sample_weight=None): """Test the model on a single batch of samples. @@ -1261,9 +1252,7 @@ def test_on_batch(self, x, y, sample_weight=None): ins = x + y + sample_weights self._make_test_function() outputs = self.test_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + return unpack_singleton(outputs) def predict_on_batch(self, x): """Returns predictions for a single batch of samples. @@ -1281,9 +1270,7 @@ def predict_on_batch(self, x): ins = x self._make_predict_function() outputs = self.predict_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + return unpack_singleton(outputs) @interfaces.legacy_generator_methods_support def fit_generator(self, generator, diff --git a/keras/engine/training_arrays.py b/keras/engine/training_arrays.py index 68073a1223d..e74096261b9 100644 --- a/keras/engine/training_arrays.py +++ b/keras/engine/training_arrays.py @@ -14,6 +14,8 @@ from .. 
import callbacks as cbks from ..utils.generic_utils import Progbar from ..utils.generic_utils import slice_arrays +from ..utils.generic_utils import to_list +from ..utils.generic_utils import unpack_singleton def fit_loop(model, f, ins, @@ -151,8 +153,7 @@ def fit_loop(model, f, ins, callbacks.on_batch_begin(step_index, batch_logs) outs = f(ins) - if not isinstance(outs, list): - outs = [outs] + outs = to_list(outs) for l, o in zip(out_labels, outs): batch_logs[l] = o @@ -164,8 +165,7 @@ def fit_loop(model, f, ins, val_outs = test_loop(model, val_f, val_ins, steps=validation_steps, verbose=0) - if not isinstance(val_outs, list): - val_outs = [val_outs] + val_outs = to_list(val_outs) # Same labels assumed. for l, o in zip(out_labels, val_outs): epoch_logs['val_' + l] = o @@ -197,8 +197,7 @@ def fit_loop(model, f, ins, ins_batch[i] = ins_batch[i].toarray() outs = f(ins_batch) - if not isinstance(outs, list): - outs = [outs] + outs = to_list(outs) for l, o in zip(out_labels, outs): batch_logs[l] = o @@ -211,8 +210,7 @@ def fit_loop(model, f, ins, val_outs = test_loop(model, val_f, val_ins, batch_size=batch_size, verbose=0) - if not isinstance(val_outs, list): - val_outs = [val_outs] + val_outs = to_list(val_outs) # Same labels assumed. for l, o in zip(out_labels, val_outs): epoch_logs['val_' + l] = o @@ -266,8 +264,7 @@ def predict_loop(model, f, ins, batch_size=32, verbose=0, steps=None): unconcatenated_outs = [] for step in range(steps): batch_outs = f(ins) - if not isinstance(batch_outs, list): - batch_outs = [batch_outs] + batch_outs = to_list(batch_outs) if step == 0: for batch_out in batch_outs: unconcatenated_outs.append([]) @@ -295,8 +292,7 @@ def predict_loop(model, f, ins, batch_size=32, verbose=0, steps=None): ins_batch[i] = ins_batch[i].toarray() batch_outs = f(ins_batch) - if not isinstance(batch_outs, list): - batch_outs = [batch_outs] + batch_outs = to_list(batch_outs) if batch_index == 0: # Pre-allocate the results arrays. for batch_out in batch_outs: @@ -306,9 +302,7 @@ def predict_loop(model, f, ins, batch_size=32, verbose=0, steps=None): outs[i][batch_start:batch_end] = batch_out if verbose == 1: progbar.update(batch_end) - if len(outs) == 1: - return outs[0] - return outs + return unpack_singleton(outs) def test_loop(model, f, ins, batch_size=None, verbose=0, steps=None): @@ -415,6 +409,4 @@ def test_loop(model, f, ins, batch_size=None, verbose=0, steps=None): for i in range(len(outs)): if i not in stateful_metric_indices: outs[i] /= num_samples - if len(outs) == 1: - return outs[0] - return outs + return unpack_singleton(outs) diff --git a/keras/engine/training_generator.py b/keras/engine/training_generator.py index a17e7c2e274..535f34e0909 100644 --- a/keras/engine/training_generator.py +++ b/keras/engine/training_generator.py @@ -12,6 +12,8 @@ from ..utils.data_utils import GeneratorEnqueuer from ..utils.data_utils import OrderedEnqueuer from ..utils.generic_utils import Progbar +from ..utils.generic_utils import to_list +from ..utils.generic_utils import unpack_singleton from .. 
import callbacks as cbks @@ -102,26 +104,46 @@ def fit_generator(model, val_enqueuer = None try: - if do_validation and not val_gen: - # Prepare data for validation - if len(validation_data) == 2: - val_x, val_y = validation_data - val_sample_weight = None - elif len(validation_data) == 3: - val_x, val_y, val_sample_weight = validation_data + if do_validation: + if val_gen and workers > 0: + # Create an Enqueuer that can be reused + val_data = validation_data + if isinstance(val_data, Sequence): + val_enqueuer = OrderedEnqueuer(val_data, + use_multiprocessing=use_multiprocessing) + validation_steps = len(val_data) + else: + val_enqueuer = GeneratorEnqueuer(val_data, + use_multiprocessing=use_multiprocessing) + val_enqueuer.start(workers=workers, + max_queue_size=max_queue_size) + val_enqueuer_gen = val_enqueuer.get() + elif val_gen: + val_data = validation_data + if isinstance(val_data, Sequence): + val_enqueuer_gen = iter(val_data) + else: + val_enqueuer_gen = val_data else: - raise ValueError('`validation_data` should be a tuple ' - '`(val_x, val_y, val_sample_weight)` ' - 'or `(val_x, val_y)`. Found: ' + - str(validation_data)) - val_x, val_y, val_sample_weights = model._standardize_user_data( - val_x, val_y, val_sample_weight) - val_data = val_x + val_y + val_sample_weights - if model.uses_learning_phase and not isinstance(K.learning_phase(), - int): - val_data += [0.] - for cbk in callbacks: - cbk.validation_data = val_data + # Prepare data for validation + if len(validation_data) == 2: + val_x, val_y = validation_data + val_sample_weight = None + elif len(validation_data) == 3: + val_x, val_y, val_sample_weight = validation_data + else: + raise ValueError('`validation_data` should be a tuple ' + '`(val_x, val_y, val_sample_weight)` ' + 'or `(val_x, val_y)`. Found: ' + + str(validation_data)) + val_x, val_y, val_sample_weights = model._standardize_user_data( + val_x, val_y, val_sample_weight) + val_data = val_x + val_y + val_sample_weights + if model.uses_learning_phase and not isinstance(K.learning_phase(), + int): + val_data += [0.] + for cbk in callbacks: + cbk.validation_data = val_data if workers > 0: if is_sequence: @@ -190,8 +212,7 @@ def fit_generator(model, sample_weight=sample_weight, class_weight=class_weight) - if not isinstance(outs, list): - outs = [outs] + outs = to_list(outs) for l, o in zip(out_labels, outs): batch_logs[l] = o @@ -204,11 +225,9 @@ def fit_generator(model, if steps_done >= steps_per_epoch and do_validation: if val_gen: val_outs = model.evaluate_generator( - validation_data, + val_enqueuer_gen, validation_steps, - workers=workers, - use_multiprocessing=use_multiprocessing, - max_queue_size=max_queue_size) + workers=0) else: # No need for try/except because # data has already been validated. @@ -217,8 +236,7 @@ def fit_generator(model, batch_size=batch_size, sample_weight=val_sample_weights, verbose=0) - if not isinstance(val_outs, list): - val_outs = [val_outs] + val_outs = to_list(val_outs) # Same labels assumed. for l, o in zip(out_labels, val_outs): epoch_logs['val_' + l] = o @@ -252,7 +270,6 @@ def evaluate_generator(model, generator, """See docstring for `Model.evaluate_generator`.""" model._make_test_function() - stateful_metric_indices = [] if hasattr(model, 'metrics'): for m in model.stateful_metric_functions: m.reset_states() @@ -323,8 +340,7 @@ def evaluate_generator(model, generator, 'or (x, y). 
Found: ' + str(generator_output)) outs = model.test_on_batch(x, y, sample_weight=sample_weight) - if not isinstance(outs, list): - outs = [outs] + outs = to_list(outs) outs_per_batch.append(outs) if x is None or len(x) == 0: @@ -356,10 +372,8 @@ def evaluate_generator(model, generator, averages.append(np.average([out[i] for out in outs_per_batch], weights=batch_sizes)) else: - averages.append(float(outs_per_batch[-1][i])) - if len(averages) == 1: - return averages[0] - return averages + averages.append(np.float64(outs_per_batch[-1][i])) + return unpack_singleton(averages) def predict_generator(model, generator, @@ -433,8 +447,7 @@ def predict_generator(model, generator, x = generator_output outs = model.predict_on_batch(x) - if not isinstance(outs, list): - outs = [outs] + outs = to_list(outs) if not all_outs: for out in outs: diff --git a/keras/engine/training_utils.py b/keras/engine/training_utils.py index ee280a9ffe7..bc133c39ecd 100644 --- a/keras/engine/training_utils.py +++ b/keras/engine/training_utils.py @@ -9,6 +9,7 @@ from .. import backend as K from .. import losses +from ..utils.generic_utils import to_list def standardize_single_array(x): @@ -321,8 +322,7 @@ def collect_metrics(metrics, output_names): nested_metrics = [] for name in output_names: output_metrics = metrics.get(name, []) - if not isinstance(output_metrics, list): - output_metrics = [output_metrics] + output_metrics = to_list(output_metrics) nested_metrics.append(output_metrics) return nested_metrics else: diff --git a/keras/layers/convolutional.py b/keras/layers/convolutional.py index bcaa60c52f3..7db563f590b 100644 --- a/keras/layers/convolutional.py +++ b/keras/layers/convolutional.py @@ -13,6 +13,7 @@ from ..engine.base_layer import Layer from ..engine.base_layer import InputSpec from ..utils import conv_utils +from ..utils.generic_utils import transpose_shape from ..legacy import interfaces # imports for backwards namespace compatibility @@ -107,7 +108,7 @@ def __init__(self, rank, self.kernel_size = conv_utils.normalize_tuple(kernel_size, rank, 'kernel_size') self.strides = conv_utils.normalize_tuple(strides, rank, 'strides') self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, rank, 'dilation_rate') self.activation = activations.get(activation) self.use_bias = use_bias @@ -275,10 +276,10 @@ class Conv1D(_Conv): one of `"channels_last"` (default) or `"channels_first"`. The ordering of the dimensions in the inputs. `"channels_last"` corresponds to inputs with shape - `(batch, length, channels)` + `(batch, steps, channels)` (default format for temporal data in Keras) while `"channels_first"` corresponds to inputs - with shape `(batch, channels, length)`. + with shape `(batch, channels, steps)`. dilation_rate: an integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is @@ -306,10 +307,10 @@ class Conv1D(_Conv): (see [constraints](../constraints.md)). # Input shape - 3D tensor with shape: `(batch_size, steps, input_dim)` + 3D tensor with shape: `(batch, steps, channels)` # Output shape - 3D tensor with shape: `(batch_size, new_steps, filters)` + 3D tensor with shape: `(batch, new_steps, filters)` `steps` value might have changed due to padding or strides. 
""" @@ -353,7 +354,6 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.input_spec = InputSpec(ndim=3) def get_config(self): config = super(Conv1D, self).get_config() @@ -434,18 +434,18 @@ class Conv2D(_Conv): # Input shape 4D tensor with shape: - `(samples, channels, rows, cols)` + `(batch, channels, rows, cols)` if `data_format` is `"channels_first"` or 4D tensor with shape: - `(samples, rows, cols, channels)` + `(batch, rows, cols, channels)` if `data_format` is `"channels_last"`. # Output shape 4D tensor with shape: - `(samples, filters, new_rows, new_cols)` + `(batch, filters, new_rows, new_cols)` if `data_format` is `"channels_first"` or 4D tensor with shape: - `(samples, new_rows, new_cols, filters)` + `(batch, new_rows, new_cols, filters)` if `data_format` is `"channels_last"`. `rows` and `cols` values might have changed due to padding. """ @@ -485,7 +485,6 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.input_spec = InputSpec(ndim=4) def get_config(self): config = super(Conv2D, self).get_config() @@ -566,18 +565,18 @@ class Conv3D(_Conv): # Input shape 5D tensor with shape: - `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` + `(batch, channels, conv_dim1, conv_dim2, conv_dim3)` if `data_format` is `"channels_first"` or 5D tensor with shape: - `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` + `(batch, conv_dim1, conv_dim2, conv_dim3, channels)` if `data_format` is `"channels_last"`. # Output shape 5D tensor with shape: - `(samples, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` + `(batch, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if `data_format` is `"channels_first"` or 5D tensor with shape: - `(samples, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` + `(batch, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if `data_format` is `"channels_last"`. `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have changed due to padding. """ @@ -617,7 +616,6 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.input_spec = InputSpec(ndim=5) def get_config(self): config = super(Conv3D, self).get_config() @@ -656,6 +654,14 @@ class Conv2DTranspose(Conv2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + output_padding: An integer or tuple/list of 2 integers, + specifying the amount of padding along the height and width + of the output tensor. + Can be a single integer to specify the same value for all + spatial dimensions. + The amount of output padding along a given dimension must be + lower than the stride along that same dimension. + If set to `None` (default), the output shape is inferred. data_format: A string, one of `"channels_last"` or `"channels_first"`. The ordering of the dimensions in the inputs. @@ -710,6 +716,12 @@ class Conv2DTranspose(Conv2D): `(batch, new_rows, new_cols, filters)` if `data_format` is `"channels_last"`. `rows` and `cols` values might have changed due to padding. 
+ If `output_padding` is specified: + + ``` + new_rows = (rows - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + output_padding[0] + new_cols = (cols - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + output_padding[1] + ``` # References - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) @@ -721,6 +733,7 @@ def __init__(self, filters, kernel_size, strides=(1, 1), padding='valid', + output_padding=None, data_format=None, activation=None, use_bias=True, @@ -748,7 +761,16 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.input_spec = InputSpec(ndim=4) + + self.output_padding = output_padding + if self.output_padding is not None: + self.output_padding = conv_utils.normalize_tuple( + self.output_padding, 2, 'output_padding') + for stride, out_pad in zip(self.strides, self.output_padding): + if out_pad >= stride: + raise ValueError('Stride ' + str(self.strides) + ' must be ' + 'greater than output padding ' + + str(self.output_padding)) def build(self, input_shape): if len(input_shape) != 4: @@ -797,14 +819,20 @@ def call(self, inputs): height, width = input_shape[h_axis], input_shape[w_axis] kernel_h, kernel_w = self.kernel_size stride_h, stride_w = self.strides + if self.output_padding is None: + out_pad_h = out_pad_w = None + else: + out_pad_h, out_pad_w = self.output_padding # Infer the dynamic output shape: out_height = conv_utils.deconv_length(height, stride_h, kernel_h, - self.padding) + self.padding, + out_pad_h) out_width = conv_utils.deconv_length(width, stride_w, kernel_w, - self.padding) + self.padding, + out_pad_w) if self.data_format == 'channels_first': output_shape = (batch_size, self.filters, out_height, out_width) else: @@ -818,7 +846,7 @@ def call(self, inputs): padding=self.padding, data_format=self.data_format) - if self.bias: + if self.use_bias: outputs = K.bias_add( outputs, self.bias, @@ -837,17 +865,28 @@ def compute_output_shape(self, input_shape): kernel_h, kernel_w = self.kernel_size stride_h, stride_w = self.strides + if self.output_padding is None: + out_pad_h = out_pad_w = None + else: + out_pad_h, out_pad_w = self.output_padding output_shape[c_axis] = self.filters - output_shape[h_axis] = conv_utils.deconv_length( - output_shape[h_axis], stride_h, kernel_h, self.padding) - output_shape[w_axis] = conv_utils.deconv_length( - output_shape[w_axis], stride_w, kernel_w, self.padding) + output_shape[h_axis] = conv_utils.deconv_length(output_shape[h_axis], + stride_h, + kernel_h, + self.padding, + out_pad_h) + output_shape[w_axis] = conv_utils.deconv_length(output_shape[w_axis], + stride_w, + kernel_w, + self.padding, + out_pad_w) return tuple(output_shape) def get_config(self): config = super(Conv2DTranspose, self).get_config() config.pop('dilation_rate') + config['output_padding'] = self.output_padding return config @@ -882,6 +921,14 @@ class Conv3DTranspose(Conv3D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + output_padding: An integer or tuple/list of 3 integers, + specifying the amount of padding along the depth, height, and + width. + Can be a single integer to specify the same value for all + spatial dimensions. + The amount of output padding along a given dimension must be + lower than the stride along that same dimension. + If set to `None` (default), the output shape is inferred. 
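The `output_padding` shape formulas added to the transposed-convolution docstrings above can be checked by hand. A minimal sketch, assuming `'same'` padding so the per-side pad is `kernel_size // 2` (matching the updated `deconv_length` later in this diff); the helper name and the concrete numbers are illustrative only:

```python
def transposed_out_dim(dim, stride, kernel, pad, output_padding):
    # new_dim = (dim - 1) * stride + kernel - 2 * pad + output_padding
    return (dim - 1) * stride + kernel - 2 * pad + output_padding


# rows = 7, stride 2, kernel 3, 'same' padding (pad = 3 // 2 = 1):
print(transposed_out_dim(7, 2, 3, pad=1, output_padding=0))  # 13
print(transposed_out_dim(7, 2, 3, pad=1, output_padding=1))  # 14
```

With `output_padding=None` the `'same'` inference rule (`dim * stride`) gives 14 for this case, i.e. it coincides with `output_padding=1`.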
data_format: A string, one of `"channels_last"` or `"channels_first"`. The ordering of the dimensions in the inputs. @@ -936,6 +983,13 @@ class Conv3DTranspose(Conv3D): `(batch, new_depth, new_rows, new_cols, filters)` if `data_format` is `"channels_last"`. `depth` and `rows` and `cols` values might have changed due to padding. + If `output_padding` is specified:: + + ``` + new_depth = (depth - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + output_padding[0] + new_rows = (rows - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + output_padding[1] + new_cols = (cols - 1) * strides[2] + kernel_size[2] - 2 * padding[2] + output_padding[2] + ``` # References - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) @@ -946,6 +1000,7 @@ def __init__(self, filters, kernel_size, strides=(1, 1, 1), padding='valid', + output_padding=None, data_format=None, activation=None, use_bias=True, @@ -973,7 +1028,16 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs) - self.input_spec = InputSpec(ndim=5) + + self.output_padding = output_padding + if self.output_padding is not None: + self.output_padding = conv_utils.normalize_tuple( + self.output_padding, 3, 'output_padding') + for stride, out_pad in zip(self.strides, self.output_padding): + if out_pad >= stride: + raise ValueError('Stride ' + str(self.strides) + ' must be ' + 'greater than output padding ' + + str(self.output_padding)) def build(self, input_shape): if len(input_shape) != 5: @@ -1024,17 +1088,24 @@ def call(self, inputs): kernel_d, kernel_h, kernel_w = self.kernel_size stride_d, stride_h, stride_w = self.strides + if self.output_padding is None: + out_pad_d = out_pad_h = out_pad_w = None + else: + out_pad_d, out_pad_h, out_pad_w = self.output_padding # Infer the dynamic output shape: out_depth = conv_utils.deconv_length(depth, stride_d, kernel_d, - self.padding) + self.padding, + out_pad_d) out_height = conv_utils.deconv_length(height, stride_h, kernel_h, - self.padding) + self.padding, + out_pad_h) out_width = conv_utils.deconv_length(width, stride_w, kernel_w, - self.padding) + self.padding, + out_pad_w) if self.data_format == 'channels_first': output_shape = (batch_size, self.filters, out_depth, out_height, out_width) @@ -1048,7 +1119,7 @@ def call(self, inputs): padding=self.padding, data_format=self.data_format) - if self.bias: + if self.use_bias: outputs = K.bias_add( outputs, self.bias, @@ -1067,26 +1138,34 @@ def compute_output_shape(self, input_shape): kernel_d, kernel_h, kernel_w = self.kernel_size stride_d, stride_h, stride_w = self.strides + if self.output_padding is None: + out_pad_d = out_pad_h = out_pad_w = None + else: + out_pad_d, out_pad_h, out_pad_w = self.output_padding output_shape[c_axis] = self.filters output_shape[d_axis] = conv_utils.deconv_length(output_shape[d_axis], stride_d, kernel_d, - self.padding) + self.padding, + out_pad_d) output_shape[h_axis] = conv_utils.deconv_length(output_shape[h_axis], stride_h, kernel_h, - self.padding) + self.padding, + out_pad_h) output_shape[w_axis] = conv_utils.deconv_length(output_shape[w_axis], stride_w, kernel_w, - self.padding) + self.padding, + out_pad_w) return tuple(output_shape) def get_config(self): config = super(Conv3DTranspose, self).get_config() config.pop('dilation_rate') + config['output_padding'] = self.output_padding return config @@ -1221,6 +1300,7 @@ def __init__(self, rank, dilation_rate=dilation_rate, activation=activation, use_bias=use_bias, + 
bias_initializer=bias_initializer, bias_regularizer=bias_regularizer, activity_regularizer=activity_regularizer, bias_constraint=bias_constraint, @@ -1296,7 +1376,7 @@ def call(self, inputs): padding=self.padding, dilation_rate=self.dilation_rate) - if self.bias: + if self.use_bias: outputs = K.bias_add( outputs, self.bias, @@ -1350,9 +1430,9 @@ class SeparableConv1D(_SeparableConv): one of `"channels_last"` or `"channels_first"`. The ordering of the dimensions in the inputs. `"channels_last"` corresponds to inputs with shape - `(batch, height, width, channels)` while `"channels_first"` + `(batch, steps, channels)` while `"channels_first"` corresponds to inputs with shape - `(batch, channels, height, width)`. + `(batch, channels, steps)`. It defaults to the `image_data_format` value found in your Keras config file at `~/.keras/keras.json`. If you never set it, then it will be "channels_last". @@ -1619,7 +1699,7 @@ class DepthwiseConv2D(Conv2D): all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. - padding: one of `'valid'` or `'same'` (case-insensitive). + padding: one of `"valid"` or `"same"` (case-insensitive). depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output @@ -1659,18 +1739,18 @@ class DepthwiseConv2D(Conv2D): # Input shape 4D tensor with shape: - `[batch, channels, rows, cols]` + `(batch, channels, rows, cols)` if `data_format` is `"channels_first"` or 4D tensor with shape: - `[batch, rows, cols, channels]` + `(batch, rows, cols, channels)` if `data_format` is `"channels_last"`. # Output shape 4D tensor with shape: - `[batch, filters, new_rows, new_cols]` + `(batch, filters, new_rows, new_cols)` if `data_format` is `"channels_first"` or 4D tensor with shape: - `[batch, new_rows, new_cols, filters]` + `(batch, new_rows, new_cols, filters)` if `data_format` is `"channels_last"`. `rows` and `cols` values might have changed due to padding. 
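The `depth_multiplier` argument documented above sets how many output channels are produced per input channel; a minimal illustration (sizes are arbitrary):

```python
from keras.layers import Input, DepthwiseConv2D
from keras.models import Model

x = Input(shape=(32, 32, 3))                      # 3 input channels
y = DepthwiseConv2D((3, 3), depth_multiplier=2,   # 2 filters per channel
                    padding='same')(x)
model = Model(x, y)
print(model.output_shape)                         # (None, 32, 32, 6)
```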
""" @@ -1760,7 +1840,7 @@ def call(self, inputs, training=None): dilation_rate=self.dilation_rate, data_format=self.data_format) - if self.bias: + if self.use_bias: outputs = K.bias_add( outputs, self.bias, @@ -1878,7 +1958,7 @@ class UpSampling2D(Layer): @interfaces.legacy_upsampling2d_support def __init__(self, size=(2, 2), data_format=None, **kwargs): super(UpSampling2D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.size = conv_utils.normalize_tuple(size, 2, 'size') self.input_spec = InputSpec(ndim=4) @@ -1946,7 +2026,7 @@ class UpSampling3D(Layer): @interfaces.legacy_upsampling3d_support def __init__(self, size=(2, 2, 2), data_format=None, **kwargs): - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.size = conv_utils.normalize_tuple(size, 3, 'size') self.input_spec = InputSpec(ndim=5) super(UpSampling3D, self).__init__(**kwargs) @@ -2074,7 +2154,7 @@ def __init__(self, data_format=None, **kwargs): super(ZeroPadding2D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) if isinstance(padding, int): self.padding = ((padding, padding), (padding, padding)) elif hasattr(padding, '__len__'): @@ -2178,7 +2258,7 @@ class ZeroPadding3D(Layer): @interfaces.legacy_zeropadding3d_support def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs): super(ZeroPadding3D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) if isinstance(padding, int): self.padding = ((padding, padding), (padding, padding), (padding, padding)) elif hasattr(padding, '__len__'): @@ -2278,19 +2358,12 @@ def __init__(self, cropping=(1, 1), **kwargs): self.input_spec = InputSpec(ndim=3) def compute_output_shape(self, input_shape): - if input_shape[1] is not None: - length = input_shape[1] - self.cropping[0] - self.cropping[1] - else: - length = None - return (input_shape[0], - length, - input_shape[2]) + return _compute_output_shape_cropping(input_shape, + 'channels_last', + (self.cropping,)) def call(self, inputs): - if self.cropping[1] == 0: - return inputs[:, self.cropping[0]:, :] - else: - return inputs[:, self.cropping[0]: -self.cropping[1], :] + return _call_cropping(inputs, 'channels_last', (self.cropping,)) def get_config(self): config = {'cropping': self.cropping} @@ -2349,7 +2422,7 @@ class Cropping2D(Layer): # now model.output_shape == (None, 24, 20, 3) model.add(Conv2D(64, (3, 3), padding='same')) model.add(Cropping2D(cropping=((2, 2), (2, 2)))) - # now model.output_shape == (None, 20, 16. 
64) + # now model.output_shape == (None, 20, 16, 64) ``` """ @@ -2357,7 +2430,7 @@ class Cropping2D(Layer): def __init__(self, cropping=((0, 0), (0, 0)), data_format=None, **kwargs): super(Cropping2D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) if isinstance(cropping, int): self.cropping = ((cropping, cropping), (cropping, cropping)) elif hasattr(cropping, '__len__'): @@ -2381,58 +2454,12 @@ def __init__(self, cropping=((0, 0), (0, 0)), self.input_spec = InputSpec(ndim=4) def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - return (input_shape[0], - input_shape[1], - input_shape[2] - self.cropping[0][0] - self.cropping[0][1] if input_shape[2] else None, - input_shape[3] - self.cropping[1][0] - self.cropping[1][1] if input_shape[3] else None) - elif self.data_format == 'channels_last': - return (input_shape[0], - input_shape[1] - self.cropping[0][0] - self.cropping[0][1] if input_shape[1] else None, - input_shape[2] - self.cropping[1][0] - self.cropping[1][1] if input_shape[2] else None, - input_shape[3]) + return _compute_output_shape_cropping(input_shape, + self.data_format, + self.cropping) def call(self, inputs): - if self.data_format == 'channels_first': - if self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, - :, - self.cropping[0][0]:, - self.cropping[1][0]:] - elif self.cropping[0][1] == 0: - return inputs[:, - :, - self.cropping[0][0]:, - self.cropping[1][0]: -self.cropping[1][1]] - elif self.cropping[1][1] == 0: - return inputs[:, - :, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]:] - return inputs[:, - :, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]: -self.cropping[1][1]] - elif self.data_format == 'channels_last': - if self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, - self.cropping[0][0]:, - self.cropping[1][0]:, - :] - elif self.cropping[0][1] == 0: - return inputs[:, - self.cropping[0][0]:, - self.cropping[1][0]: -self.cropping[1][1], - :] - elif self.cropping[1][1] == 0: - return inputs[:, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]:, - :] - return inputs[:, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]: -self.cropping[1][1], - :] + return _call_cropping(inputs, self.data_format, self.cropping) def get_config(self): config = {'cropping': self.cropping, @@ -2485,7 +2512,7 @@ class Cropping3D(Layer): def __init__(self, cropping=((1, 1), (1, 1), (1, 1)), data_format=None, **kwargs): super(Cropping3D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) if isinstance(cropping, int): self.cropping = ((cropping, cropping), (cropping, cropping), @@ -2513,141 +2540,12 @@ def __init__(self, cropping=((1, 1), (1, 1), (1, 1)), self.input_spec = InputSpec(ndim=5) def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - if input_shape[2] is not None: - dim1 = input_shape[2] - self.cropping[0][0] - self.cropping[0][1] - else: - dim1 = None - if input_shape[3] is not None: - dim2 = input_shape[3] - self.cropping[1][0] - self.cropping[1][1] - else: - dim2 = None - if input_shape[4] is not None: - dim3 = input_shape[4] - self.cropping[2][0] - self.cropping[2][1] - else: - dim3 = None - return (input_shape[0], - input_shape[1], - dim1, - dim2, - dim3) - elif self.data_format == 'channels_last': - if 
input_shape[1] is not None: - dim1 = input_shape[1] - self.cropping[0][0] - self.cropping[0][1] - else: - dim1 = None - if input_shape[2] is not None: - dim2 = input_shape[2] - self.cropping[1][0] - self.cropping[1][1] - else: - dim2 = None - if input_shape[3] is not None: - dim3 = input_shape[3] - self.cropping[2][0] - self.cropping[2][1] - else: - dim3 = None - return (input_shape[0], - dim1, - dim2, - dim3, - input_shape[4]) + return _compute_output_shape_cropping(input_shape, + self.data_format, + self.cropping) def call(self, inputs): - if self.data_format == 'channels_first': - if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0: - return inputs[:, - :, - self.cropping[0][0]:, - self.cropping[1][0]:, - self.cropping[2][0]:] - elif self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, - :, - self.cropping[0][0]:, - self.cropping[1][0]:, - self.cropping[2][0]: -self.cropping[2][1]] - elif self.cropping[1][1] == self.cropping[2][1] == 0: - return inputs[:, - :, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]:, - self.cropping[2][0]:] - elif self.cropping[0][1] == self.cropping[2][1] == 0: - return inputs[:, - :, - self.cropping[0][0]:, - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]:] - elif self.cropping[0][1] == 0: - return inputs[:, - :, - self.cropping[0][0]:, - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]: -self.cropping[2][1]] - elif self.cropping[1][1] == 0: - return inputs[:, - :, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]:, - self.cropping[2][0]: -self.cropping[2][1]] - elif self.cropping[2][1] == 0: - return inputs[:, - :, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]:] - return inputs[:, - :, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]: -self.cropping[2][1]] - - elif self.data_format == 'channels_last': - if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0: - return inputs[:, - self.cropping[0][0]:, - self.cropping[1][0]:, - self.cropping[2][0]:, - :] - elif self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, - self.cropping[0][0]:, - self.cropping[1][0]:, - self.cropping[2][0]: -self.cropping[2][1], - :] - elif self.cropping[1][1] == self.cropping[2][1] == 0: - return inputs[:, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]:, - self.cropping[2][0]:, - :] - elif self.cropping[0][1] == self.cropping[2][1] == 0: - return inputs[:, - self.cropping[0][0]:, - self.cropping[1][0]:-self.cropping[1][1], - self.cropping[2][0]:, - :] - elif self.cropping[0][1] == 0: - return inputs[:, - self.cropping[0][0]:, - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]: -self.cropping[2][1], - :] - elif self.cropping[1][1] == 0: - return inputs[:, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]:, - self.cropping[2][0]: -self.cropping[2][1], - :] - elif self.cropping[2][1] == 0: - return inputs[:, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]:, - :] - return inputs[:, - self.cropping[0][0]: -self.cropping[0][1], - self.cropping[1][0]: -self.cropping[1][1], - self.cropping[2][0]: -self.cropping[2][1], - :] + return _call_cropping(inputs, self.data_format, self.cropping) def get_config(self): config = {'cropping': self.cropping, @@ -2656,6 +2554,34 @@ def get_config(self): return 
dict(list(base_config.items()) + list(config.items())) +def _call_cropping(inputs, data_format, cropping): + slices_dims = [] + for start, end in cropping: + if end == 0: + end = None + else: + end = -end + slices_dims.append(slice(start, end)) + + slices = [slice(None)] + slices_dims + [slice(None)] + slices = tuple(slices) + spatial_axes = list(range(1, 1 + len(cropping))) + slices = transpose_shape(slices, data_format, spatial_axes) + return inputs[slices] + + +def _compute_output_shape_cropping(input_shape, data_format, cropping): + cropping_all_dims = ((0, 0),) + cropping + ((0, 0),) + spatial_axes = list(range(1, 1 + len(cropping))) + cropping_all_dims = transpose_shape(cropping_all_dims, data_format, spatial_axes) + + output_shape = list(input_shape) + for dim in range(len(output_shape)): + if output_shape[dim] is not None: + output_shape[dim] -= sum(cropping_all_dims[dim]) + return tuple(output_shape) + + # Aliases Convolution1D = Conv1D diff --git a/keras/layers/convolutional_recurrent.py b/keras/layers/convolutional_recurrent.py index 8163cae3db9..dc1f7bbe5a3 100644 --- a/keras/layers/convolutional_recurrent.py +++ b/keras/layers/convolutional_recurrent.py @@ -21,6 +21,7 @@ from ..legacy.layers import Recurrent, ConvRecurrent2D from .recurrent import RNN from ..utils.generic_utils import has_arg +from ..utils.generic_utils import transpose_shape class ConvRNN2D(RNN): @@ -169,22 +170,18 @@ def compute_output_shape(self, input_shape): stride=cell.strides[1], dilation=cell.dilation_rate[1]) - if cell.data_format == 'channels_first': - output_shape = input_shape[:2] + (cell.filters, rows, cols) - elif cell.data_format == 'channels_last': - output_shape = input_shape[:2] + (rows, cols, cell.filters) + output_shape = input_shape[:2] + (rows, cols, cell.filters) + output_shape = transpose_shape(output_shape, cell.data_format, + spatial_axes=(2, 3)) if not self.return_sequences: output_shape = output_shape[:1] + output_shape[2:] if self.return_state: output_shape = [output_shape] - if cell.data_format == 'channels_first': - output_shape += [(input_shape[0], cell.filters, rows, cols) - for _ in range(2)] - elif cell.data_format == 'channels_last': - output_shape += [(input_shape[0], rows, cols, cell.filters) - for _ in range(2)] + base = (input_shape[0], rows, cols, cell.filters) + base = transpose_shape(base, cell.data_format, spatial_axes=(1, 2)) + output_shape += [base[:] for _ in range(2)] return output_shape def build(self, input_shape): @@ -483,10 +480,10 @@ class ConvLSTM2DCell(Layer): any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). data_format: A string, - one of `channels_last` (default) or `channels_first`. + one of `"channels_last"` (default) or `"channels_first"`. It defaults to the `image_data_format` value found in your Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + If you never set it, then it will be `"channels_last"`. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. 
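The `_call_cropping` helper introduced above replaces the long per-case indexing with a single tuple of `slice` objects, permuted by `transpose_shape` so that the batch and channel axes are left untouched in either data format. The same idea in plain NumPy, assuming `channels_last` data and 2D cropping of `((1, 2), (0, 3))`:

```python
import numpy as np

x = np.zeros((4, 10, 12, 3))          # (batch, rows, cols, channels)
cropping = ((1, 2), (0, 3))

# One slice per spatial dimension: start at `left`, stop at `-right`
# (or at the end when `right` is 0).
spatial = [slice(left, -right if right else None) for left, right in cropping]
slices = tuple([slice(None)] + spatial + [slice(None)])

print(x[slices].shape)                # (4, 7, 9, 3)
```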
Currently, specifying any `dilation_rate` value != 1 is @@ -563,7 +560,7 @@ def __init__(self, filters, self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size') self.strides = conv_utils.normalize_tuple(strides, 2, 'strides') self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2, 'dilation_rate') self.activation = activations.get(activation) self.recurrent_activation = activations.get(recurrent_activation) @@ -793,15 +790,15 @@ class ConvLSTM2D(ConvRNN2D): any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). data_format: A string, - one of `channels_last` (default) or `channels_first`. + one of `"channels_last"` (default) or `"channels_first"`. The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape + `"channels_last"` corresponds to inputs with shape `(batch, time, ..., channels)` - while `channels_first` corresponds to + while `"channels_first"` corresponds to inputs with shape `(batch, time, channels, ...)`. It defaults to the `image_data_format` value found in your Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + If you never set it, then it will be `"channels_last"`. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is @@ -877,7 +874,7 @@ class ConvLSTM2D(ConvRNN2D): 5D tensor with shape: `(samples, time, output_row, output_col, filters)` - else - - if data_format ='channels_first' + - if data_format='channels_first' 4D tensor with shape: `(samples, filters, output_row, output_col)` - if data_format='channels_last' diff --git a/keras/layers/core.py b/keras/layers/core.py index 560851e66be..0c55f8db8eb 100644 --- a/keras/layers/core.py +++ b/keras/layers/core.py @@ -209,12 +209,7 @@ class SpatialDropout2D(Dropout): @interfaces.legacy_spatialdropoutNd_support def __init__(self, rate, data_format=None, **kwargs): super(SpatialDropout2D, self).__init__(rate, **kwargs) - if data_format is None: - data_format = K.image_data_format() - if data_format not in {'channels_last', 'channels_first'}: - raise ValueError('`data_format` must be in ' - '{`"channels_last"`, `"channels_first"`}') - self.data_format = data_format + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=4) def _get_noise_shape(self, inputs): @@ -262,12 +257,7 @@ class SpatialDropout3D(Dropout): @interfaces.legacy_spatialdropoutNd_support def __init__(self, rate, data_format=None, **kwargs): super(SpatialDropout3D, self).__init__(rate, **kwargs) - if data_format is None: - data_format = K.image_data_format() - if data_format not in {'channels_last', 'channels_first'}: - raise ValueError('`data_format` must be in ' - '{`"channels_last"`, `"channels_first"`}') - self.data_format = data_format + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=5) def _get_noise_shape(self, inputs): @@ -485,9 +475,8 @@ class Flatten(Layer): ```python model = Sequential() - model.add(Conv2D(64, 3, 3, - border_mode='same', - input_shape=(3, 32, 32))) + model.add(Conv2D(64, (3, 3), + input_shape=(3, 32, 32), padding='same',)) # now: model.output_shape == (None, 64, 32, 32) model.add(Flatten()) @@ -498,7 +487,7 @@ class 
Flatten(Layer): def __init__(self, data_format=None, **kwargs): super(Flatten, self).__init__(**kwargs) self.input_spec = InputSpec(min_ndim=3) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) def compute_output_shape(self, input_shape): if not all(input_shape[1:]): @@ -624,7 +613,7 @@ def antirectifier_output_shape(input_shape): # Output shape Specified by `output_shape` argument - (or auto-inferred when using TensorFlow). + (or auto-inferred when using TensorFlow or CNTK). """ @interfaces.legacy_lambda_support @@ -649,8 +638,8 @@ def __init__(self, function, output_shape=None, def compute_output_shape(self, input_shape): if self._output_shape is None: - # With TensorFlow and MXNet we can infer the output shape directly: - if K.backend() == 'tensorflow' or K.backend() == 'mxnet': + # With TensorFlow or CNTK or MXNet, we can infer the output shape directly: + if K.backend() in ('tensorflow', 'cntk', 'mxnet'): if isinstance(input_shape, list): xs = [K.placeholder(shape=shape) for shape in input_shape] x = self.call(xs) @@ -884,7 +873,7 @@ def build(self, input_shape): def call(self, inputs): output = K.dot(inputs, self.kernel) if self.use_bias: - output = K.bias_add(output, self.bias) + output = K.bias_add(output, self.bias, data_format='channels_last') if self.activation is not None: output = self.activation(output) return output diff --git a/keras/layers/embeddings.py b/keras/layers/embeddings.py index 18d391d042b..678a5e01494 100644 --- a/keras/layers/embeddings.py +++ b/keras/layers/embeddings.py @@ -35,30 +35,30 @@ class Embedding(Layer): ``` # Arguments - input_dim: int > 0. Size of the vocabulary, - i.e. maximum integer index + 1. - output_dim: int >= 0. Dimension of the dense embedding. - embeddings_initializer: Initializer for the `embeddings` matrix - (see [initializers](../initializers.md)). - embeddings_regularizer: Regularizer function applied to - the `embeddings` matrix - (see [regularizer](../regularizers.md)). - embeddings_constraint: Constraint function applied to - the `embeddings` matrix - (see [constraints](../constraints.md)). - mask_zero: Whether or not the input value 0 is a special "padding" - value that should be masked out. - This is useful when using [recurrent layers](recurrent.md) - which may take variable length input. - If this is `True` then all subsequent layers - in the model need to support masking or an exception will be raised. - If mask_zero is set to True, as a consequence, index 0 cannot be - used in the vocabulary (input_dim should equal size of - vocabulary + 1). - input_length: Length of input sequences, when it is constant. - This argument is required if you are going to connect - `Flatten` then `Dense` layers upstream - (without it, the shape of the dense outputs cannot be computed). + input_dim: int > 0. Size of the vocabulary, + i.e. maximum integer index + 1. + output_dim: int >= 0. Dimension of the dense embedding. + embeddings_initializer: Initializer for the `embeddings` matrix + (see [initializers](../initializers.md)). + embeddings_regularizer: Regularizer function applied to + the `embeddings` matrix + (see [regularizer](../regularizers.md)). + embeddings_constraint: Constraint function applied to + the `embeddings` matrix + (see [constraints](../constraints.md)). + mask_zero: Whether or not the input value 0 is a special "padding" + value that should be masked out. 
+ This is useful when using [recurrent layers](recurrent.md) + which may take variable length input. + If this is `True` then all subsequent layers + in the model need to support masking or an exception will be raised. + If mask_zero is set to True, as a consequence, index 0 cannot be + used in the vocabulary (input_dim should equal size of + vocabulary + 1). + input_length: Length of input sequences, when it is constant. + This argument is required if you are going to connect + `Flatten` then `Dense` layers upstream + (without it, the shape of the dense outputs cannot be computed). # Input shape 2D tensor with shape: `(batch_size, sequence_length)`. diff --git a/keras/layers/local.py b/keras/layers/local.py index a49251b7cb1..8833a60495d 100644 --- a/keras/layers/local.py +++ b/keras/layers/local.py @@ -101,7 +101,7 @@ def __init__(self, filters, if self.padding != 'valid': raise ValueError('Invalid border mode for LocallyConnected1D ' '(only "valid" is supported): ' + padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.activation = activations.get(activation) self.use_bias = use_bias self.kernel_initializer = initializers.get(kernel_initializer) @@ -283,7 +283,7 @@ def __init__(self, filters, if self.padding != 'valid': raise ValueError('Invalid border mode for LocallyConnected2D ' '(only "valid" is supported): ' + padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.activation = activations.get(activation) self.use_bias = use_bias self.kernel_initializer = initializers.get(kernel_initializer) diff --git a/keras/layers/pooling.py b/keras/layers/pooling.py index 6346b75489e..6df9428be1e 100644 --- a/keras/layers/pooling.py +++ b/keras/layers/pooling.py @@ -121,13 +121,12 @@ class _Pooling2D(Layer): def __init__(self, pool_size=(2, 2), strides=None, padding='valid', data_format=None, **kwargs): super(_Pooling2D, self).__init__(**kwargs) - data_format = conv_utils.normalize_data_format(data_format) if strides is None: strides = pool_size self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size') self.strides = conv_utils.normalize_tuple(strides, 2, 'strides') self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=4) def compute_output_shape(self, input_shape): @@ -288,7 +287,7 @@ def __init__(self, pool_size=(2, 2, 2), strides=None, padding='valid', self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size') self.strides = conv_utils.normalize_tuple(strides, 3, 'strides') self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=5) def compute_output_shape(self, input_shape): @@ -489,7 +488,7 @@ class _GlobalPooling2D(Layer): @interfaces.legacy_global_pooling_support def __init__(self, data_format=None, **kwargs): super(_GlobalPooling2D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=4) def compute_output_shape(self, input_shape): @@ -584,7 +583,7 @@ class _GlobalPooling3D(Layer): @interfaces.legacy_global_pooling_support def __init__(self, data_format=None, 
**kwargs): super(_GlobalPooling3D, self).__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=5) def compute_output_shape(self, input_shape): diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 2a3f7cd451c..30859a93468 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -551,6 +551,14 @@ def call(self, # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. if isinstance(inputs, list): + # get initial_state from full input spec + # as they could be copied to multiple GPU. + if self._num_constants is None: + initial_state = inputs[1:] + else: + initial_state = inputs[1:-self._num_constants] + if len(initial_state) == 0: + initial_state = None inputs = inputs[0] if initial_state is not None: pass diff --git a/keras/layers/wrappers.py b/keras/layers/wrappers.py index 312c74f6bc4..8ac651ad15e 100644 --- a/keras/layers/wrappers.py +++ b/keras/layers/wrappers.py @@ -496,10 +496,27 @@ def call(self, kwargs['constants'] = constants if initial_state is not None and has_arg(self.layer.call, 'initial_state'): - forward_state = initial_state[:len(initial_state) // 2] - backward_state = initial_state[len(initial_state) // 2:] - y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs) - y_rev = self.backward_layer.call(inputs, initial_state=backward_state, **kwargs) + forward_inputs = [inputs[0]] + backward_inputs = [inputs[0]] + pivot = len(initial_state) // 2 + 1 + # add forward initial state + forward_state = inputs[1:pivot] + forward_inputs += forward_state + if self._num_constants is None: + # add backward initial state + backward_state = inputs[pivot:] + backward_inputs += backward_state + else: + # add backward initial state + backward_state = inputs[pivot:-self._num_constants] + backward_inputs += backward_state + # add constants for forward and backward layers + forward_inputs += inputs[-self._num_constants:] + backward_inputs += inputs[-self._num_constants:] + y = self.forward_layer.call(forward_inputs, + initial_state=forward_state, **kwargs) + y_rev = self.backward_layer.call(backward_inputs, + initial_state=backward_state, **kwargs) else: y = self.forward_layer.call(inputs, **kwargs) y_rev = self.backward_layer.call(inputs, **kwargs) diff --git a/keras/legacy/layers.py b/keras/legacy/layers.py index 76702568082..be869335bca 100644 --- a/keras/legacy/layers.py +++ b/keras/legacy/layers.py @@ -8,6 +8,7 @@ from ..engine import Layer, InputSpec from .. import backend as K from ..utils import conv_utils +from ..utils.generic_utils import to_list from .. import regularizers from .. import constraints from .. 
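The `Bidirectional.call` change above splits a user-provided `initial_state` list evenly between the forward and backward layers (with any constants appended to both halves). A minimal functional-API sketch of the call pattern this supports; the layer sizes are illustrative:

```python
from keras.layers import Input, LSTM, Bidirectional

x = Input(shape=(10, 8))
# Four state tensors for a 16-unit LSTM: [h_fwd, c_fwd, h_bwd, c_bwd].
initial_states = [Input(shape=(16,)) for _ in range(4)]
y = Bidirectional(LSTM(16))(x, initial_state=initial_states)
```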
import activations @@ -521,10 +522,8 @@ def __call__(self, inputs, initial_state=None, **kwargs): # Compute the full input spec, including state input_spec = self.input_spec state_spec = self.state_spec - if not isinstance(input_spec, list): - input_spec = [input_spec] - if not isinstance(state_spec, list): - state_spec = [state_spec] + input_spec = to_list(input_spec) + state_spec = to_list(state_spec) self.input_spec = input_spec + state_spec # Compute the full inputs, including state @@ -748,7 +747,7 @@ def __init__(self, filters, self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size') self.strides = conv_utils.normalize_tuple(strides, 2, 'strides') self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) + self.data_format = K.normalize_data_format(data_format) self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2, 'dilation_rate') self.return_sequences = return_sequences self.go_backwards = go_backwards diff --git a/keras/losses.py b/keras/losses.py index 92aa133a046..b92747b1a35 100644 --- a/keras/losses.py +++ b/keras/losses.py @@ -116,6 +116,17 @@ def deserialize(name, custom_objects=None): def get(identifier): + """Get the `identifier` loss function. + + # Arguments + identifier: None or str, name of the function. + + # Returns + The loss function or None if `identifier` is None. + + # Raises + ValueError if unknown identifier. + """ if identifier is None: return None if isinstance(identifier, six.string_types): diff --git a/keras/optimizers.py b/keras/optimizers.py index 68f9bab9d6d..b783770f403 100644 --- a/keras/optimizers.py +++ b/keras/optimizers.py @@ -18,6 +18,17 @@ def clip_norm(g, c, n): + """Clip the gradient `g` if the L2 norm `n` exceeds `c`. + + # Arguments + g: Tensor, the gradient tensor + c: float >= 0. Gradients will be clipped + when their L2 norm exceeds this value. + n: Tensor, actual norm of `g`. + + # Returns + Tensor, the gradient clipped if required. + """ if c <= 0: # if clipnorm == 0 no need to add ops to the graph return g diff --git a/keras/utils/conv_utils.py b/keras/utils/conv_utils.py index 549c762d815..c4370dbf352 100644 --- a/keras/utils/conv_utils.py +++ b/keras/utils/conv_utils.py @@ -13,11 +13,11 @@ def normalize_tuple(value, n, name): """Transforms a single int or iterable of ints into an int tuple. # Arguments - value: The value to validate and convert. Could an int, or any iterable + value: The value to validate and convert. Could be an int, or any iterable of ints. n: The size of the tuple to be returned. - name: The name of the argument being validated, e.g. "strides" or - "kernel_size". This is only used to format error messages. + name: The name of the argument being validated, e.g. `strides` or + `kernel_size`. This is only used to format error messages. # Returns A tuple of n integers. @@ -43,30 +43,19 @@ def normalize_tuple(value, n, name): except ValueError: raise ValueError('The `' + name + '` argument must be a tuple of ' + str(n) + ' integers. 
Received: ' + str(value) + ' ' - 'including element ' + str(single_value) + ' of type' + - ' ' + str(type(single_value))) + 'including element ' + str(single_value) + ' of ' + 'type ' + str(type(single_value))) return value_tuple -def normalize_data_format(value): - if value is None: - value = K.image_data_format() - data_format = value.lower() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('The `data_format` argument must be one of ' - '"channels_first", "channels_last". Received: ' + - str(value)) - return data_format - - def normalize_padding(value): padding = value.lower() allowed = {'valid', 'same', 'causal'} if K.backend() == 'theano': allowed.add('full') if padding not in allowed: - raise ValueError('The `padding` argument must be one of "valid", "same" (or "causal" for Conv1D). ' - 'Received: ' + str(padding)) + raise ValueError('The `padding` argument must be one of "valid", "same" ' + '(or "causal" for Conv1D). Received: ' + str(padding)) return padding @@ -100,7 +89,7 @@ def conv_output_length(input_length, filter_size, # Arguments input_length: integer. filter_size: integer. - padding: one of "same", "valid", "full". + padding: one of `"same"`, `"valid"`, `"full"`. stride: integer. dilation: dilation rate, integer. @@ -128,7 +117,7 @@ def conv_input_length(output_length, filter_size, padding, stride): # Arguments output_length: integer. filter_size: integer. - padding: one of "same", "valid", "full". + padding: one of `"same"`, `"valid"`, `"full"`. stride: integer. # Returns @@ -146,13 +135,42 @@ def conv_input_length(output_length, filter_size, padding, stride): return (output_length - 1) * stride - 2 * pad + filter_size -def deconv_length(dim_size, stride_size, kernel_size, padding): +def deconv_length(dim_size, stride_size, kernel_size, padding, output_padding): + """Determines output length of a transposed convolution given input length. + + # Arguments + dim_size: Integer, the input length. + stride_size: Integer, the stride along the dimension of `dim_size`. + kernel_size: Integer, the kernel size along the dimension of + `dim_size`. + padding: One of `"same"`, `"valid"`, `"full"`. + output_padding: Integer, amount of padding along the output dimension, + Can be set to `None` in which case the output length is inferred. + + # Returns + The output length (integer). 
+ """ + assert padding in {'same', 'valid', 'full'} if dim_size is None: return None - if padding == 'valid': - dim_size = dim_size * stride_size + max(kernel_size - stride_size, 0) - elif padding == 'full': - dim_size = dim_size * stride_size - (stride_size + kernel_size - 2) - elif padding == 'same': - dim_size = dim_size * stride_size + + # Infer length if output padding is None, else compute the exact length + if output_padding is None: + if padding == 'valid': + dim_size = dim_size * stride_size + max(kernel_size - stride_size, 0) + elif padding == 'full': + dim_size = dim_size * stride_size - (stride_size + kernel_size - 2) + elif padding == 'same': + dim_size = dim_size * stride_size + else: + if padding == 'same': + pad = kernel_size // 2 + elif padding == 'valid': + pad = 0 + elif padding == 'full': + pad = kernel_size - 1 + + dim_size = ((dim_size - 1) * stride_size + kernel_size - 2 * pad + + output_padding) + return dim_size diff --git a/keras/utils/data_utils.py b/keras/utils/data_utils.py index 3f052a204ce..c7ee0238f18 100644 --- a/keras/utils/data_utils.py +++ b/keras/utils/data_utils.py @@ -4,7 +4,7 @@ from __future__ import print_function import hashlib -import multiprocessing +import multiprocessing as mp import os import random import shutil @@ -168,7 +168,7 @@ def get_file(fname, # Returns Path to the downloaded file - """ + """ # noqa if cache_dir is None: cache_dir = os.path.join(os.path.expanduser('~'), '.keras') if md5_hash is not None and file_hash is None: @@ -224,7 +224,7 @@ def dl_progress(count, block_size, total_size): raise Exception(error_msg.format(origin, e.errno, e.reason)) except HTTPError as e: raise Exception(error_msg.format(origin, e.code, e.msg)) - except (Exception, KeyboardInterrupt) as e: + except (Exception, KeyboardInterrupt): if os.path.exists(fpath): os.remove(fpath) raise @@ -303,13 +303,14 @@ class Sequence(object): """Base object for fitting to a sequence of data, such as a dataset. Every `Sequence` must implement the `__getitem__` and the `__len__` methods. - If you want to modify your dataset between epochs you may implement `on_epoch_end`. - The method `__getitem__` should return a complete batch. + If you want to modify your dataset between epochs you may implement + `on_epoch_end`. The method `__getitem__` should return a complete batch. # Notes - `Sequence` are a safer way to do multiprocessing. This structure guarantees that the network will only train once - on each sample per epoch which is not the case with generators. + `Sequence` are a safer way to do multiprocessing. This structure guarantees + that the network will only train once on each sample per epoch which is not + the case with generators. # Examples @@ -482,7 +483,7 @@ def __init__(self, sequence, global _SEQUENCE_COUNTER if _SEQUENCE_COUNTER is None: try: - _SEQUENCE_COUNTER = multiprocessing.Value('i', 0) + _SEQUENCE_COUNTER = mp.Value('i', 0) except OSError: # In this case the OS does not allow us to use # multiprocessing. We resort to an int @@ -517,9 +518,9 @@ def start(self, workers=1, max_queue_size=10): (when full, workers could block on `put()`) """ if self.use_multiprocessing: - self.executor_fn = lambda seqs: multiprocessing.Pool(workers, - initializer=init_pool, - initargs=(seqs,)) + self.executor_fn = lambda seqs: mp.Pool(workers, + initializer=init_pool, + initargs=(seqs,)) else: # We do not need the init since it's threads. 
self.executor_fn = lambda _: ThreadPool(workers) @@ -663,7 +664,8 @@ def _data_generator_task(self): break except Exception as e: # Can't pickle tracebacks. - # As a compromise, print the traceback and pickle None instead. + # As a compromise, print the traceback and + # pickle None instead. if not hasattr(e, '__traceback__'): setattr(e, '__traceback__', sys.exc_info()[2]) self.queue.put((False, e)) @@ -700,9 +702,9 @@ def start(self, workers=1, max_queue_size=10): try: self.max_queue_size = max_queue_size if self._use_multiprocessing: - self._manager = multiprocessing.Manager() + self._manager = mp.Manager() self.queue = self._manager.Queue(maxsize=max_queue_size) - self._stop_event = multiprocessing.Event() + self._stop_event = mp.Event() else: # On all OSes, avoid **SYSTEMATIC** error in multithreading mode: # `ValueError: generator already executing` @@ -716,7 +718,7 @@ def start(self, workers=1, max_queue_size=10): # Reset random seed else all children processes # share the same seed np.random.seed(self.seed) - thread = multiprocessing.Process(target=self._data_generator_task) + thread = mp.Process(target=self._data_generator_task) thread.daemon = True if self.seed is not None: self.seed += 1 @@ -780,7 +782,8 @@ def get(self): if value is not None: yield value else: - all_finished = all([not thread.is_alive() for thread in self._threads]) + all_finished = all([not thread.is_alive() + for thread in self._threads]) if all_finished and self.queue.empty(): raise StopIteration() else: diff --git a/keras/utils/generic_utils.py b/keras/utils/generic_utils.py index 4712844e218..864dbbaba1e 100644 --- a/keras/utils/generic_utils.py +++ b/keras/utils/generic_utils.py @@ -461,6 +461,22 @@ def to_list(x): return [x] +def unpack_singleton(x): + """Gets the first element if the iterable has only one value. + + Otherwise return the iterable. + + # Argument: + x: A list or tuple. + + # Returns: + The same iterable or the first element. + """ + if len(x) == 1: + return x[0] + return x + + def object_list_uid(object_list): object_list = to_list(object_list) return ', '.join([str(abs(id(x))) for x in object_list]) @@ -516,3 +532,52 @@ def slice_arrays(arrays, start=None, stop=None): return arrays[start:stop] else: return [None] + + +def transpose_shape(shape, target_format, spatial_axes): + """Converts a tuple or a list to the correct `data_format`. + + It does so by switching the positions of its elements. + + # Arguments + shape: Tuple or list, often representing shape, + corresponding to `'channels_last'`. + target_format: A string, either `'channels_first'` or `'channels_last'`. + spatial_axes: A tuple of integers. + Correspond to the indexes of the spatial axes. + For example, if you pass a shape + representing (batch_size, timesteps, rows, cols, channels), + then `spatial_axes=(2, 3)`. + + # Returns + A tuple or list, with the elements permuted according + to `target_format`. + + # Example + ```python + >>> from keras.utils.generic_utils import transpose_shape + >>> transpose_shape((16, 128, 128, 32),'channels_first', spatial_axes=(1, 2)) + (16, 32, 128, 128) + >>> transpose_shape((16, 128, 128, 32), 'channels_last', spatial_axes=(1, 2)) + (16, 128, 128, 32) + >>> transpose_shape((128, 128, 32), 'channels_first', spatial_axes=(0, 1)) + (32, 128, 128) + ``` + + # Raises + ValueError: if `value` or the global `data_format` invalid. 
+ """ + if target_format == 'channels_first': + new_values = shape[:spatial_axes[0]] + new_values += (shape[-1],) + new_values += tuple(shape[x] for x in spatial_axes) + + if isinstance(shape, list): + return list(new_values) + return new_values + elif target_format == 'channels_last': + return shape + else: + raise ValueError('The `data_format` argument must be one of ' + '"channels_first", "channels_last". Received: ' + + str(target_format)) diff --git a/keras/utils/io_utils.py b/keras/utils/io_utils.py index c29478adfba..0128b2fd9d6 100644 --- a/keras/utils/io_utils.py +++ b/keras/utils/io_utils.py @@ -58,6 +58,12 @@ def __init__(self, datapath, dataset, start=0, end=None, normalizer=None): else: self.end = end self.normalizer = normalizer + if self.normalizer is not None: + first_val = self.normalizer(self.data[0:1]) + else: + first_val = self.data[0:1] + self._base_shape = first_val.shape[1:] + self._base_dtype = first_val.dtype def __len__(self): return self.end - self.start @@ -101,7 +107,7 @@ def shape(self): # Returns A numpy-style shape tuple. """ - return (self.end - self.start,) + self.data.shape[1:] + return (self.end - self.start,) + self._base_shape @property def dtype(self): @@ -110,7 +116,7 @@ def dtype(self): # Returns A numpy dtype string. """ - return self.data.dtype + return self._base_dtype @property def ndim(self): diff --git a/keras/utils/multi_gpu_utils.py b/keras/utils/multi_gpu_utils.py index 26c7ad2052e..4c2374753eb 100644 --- a/keras/utils/multi_gpu_utils.py +++ b/keras/utils/multi_gpu_utils.py @@ -9,6 +9,7 @@ from ..layers.core import Lambda from ..engine.training import Model from ..models import clone_model +from ..utils.generic_utils import to_list def _get_available_devices(): @@ -234,8 +235,7 @@ def get_slice(data, i, parts): # Apply model on slice # (creating a model replica on the target device). outputs = model(inputs) - if not isinstance(outputs, list): - outputs = [outputs] + outputs = to_list(outputs) # Save the outputs for merging back together later. for o in range(len(outputs)): diff --git a/keras/utils/np_utils.py b/keras/utils/np_utils.py index 888ba60a057..64743cf9223 100644 --- a/keras/utils/np_utils.py +++ b/keras/utils/np_utils.py @@ -8,7 +8,7 @@ import warnings -def to_categorical(y, num_classes=None): +def to_categorical(y, num_classes=None, dtype='float32'): """Converts a class vector (integers) to binary class matrix. E.g. for use with categorical_crossentropy. @@ -17,6 +17,8 @@ def to_categorical(y, num_classes=None): y: class vector to be converted into a matrix (integers from 0 to num_classes). num_classes: total number of classes. + dtype: The data type expected by the input, as a string + (`float32`, `float64`, `int32`...) # Returns A binary matrix representation of the input. The classes axis @@ -30,7 +32,7 @@ def to_categorical(y, num_classes=None): if not num_classes: num_classes = np.max(y) + 1 n = y.shape[0] - categorical = np.zeros((n, num_classes), dtype=np.float32) + categorical = np.zeros((n, num_classes), dtype=dtype) categorical[np.arange(n), y] = 1 output_shape = input_shape + (num_classes,) categorical = np.reshape(categorical, output_shape) @@ -54,8 +56,7 @@ def normalize(x, axis=-1, order=2): def to_channels_first(data): - """ - Transform the image data to `channels_first` format. + """Transform the image data to `channels_first` format. 
# Arguments data: A Numpy data tensor or a list of Numpy data tensor diff --git a/keras/utils/test_utils.py b/keras/utils/test_utils.py index 7b06df50dc3..880b1605ecc 100644 --- a/keras/utils/test_utils.py +++ b/keras/utils/test_utils.py @@ -81,6 +81,32 @@ def layer_test(layer_cls, kwargs={}, input_shape=None, input_dtype=None, kwargs['weights'] = weights layer = layer_cls(**kwargs) + expected_output_shape = layer.compute_output_shape(input_shape) + + def _layer_in_model_test(model): + actual_output = model.predict(input_data) + actual_output_shape = actual_output.shape + for expected_dim, actual_dim in zip(expected_output_shape, + actual_output_shape): + if expected_dim is not None: + assert expected_dim == actual_dim + if expected_output is not None: + assert_allclose(actual_output, expected_output, rtol=1e-3) + + # test serialization, weight setting at model level + model_config = model.get_config() + recovered_model = model.__class__.from_config(model_config) + if model.weights: + weights = model.get_weights() + recovered_model.set_weights(weights) + _output = recovered_model.predict(input_data) + assert_allclose(_output, actual_output, rtol=1e-3) + + # test training mode (e.g. useful for dropout tests) + model.compile('rmsprop', 'mse') + model.train_on_batch(input_data, actual_output) + return actual_output + # test in functional API if fixed_batch_size: x = Input(batch_shape=input_shape, dtype=input_dtype) @@ -89,59 +115,19 @@ def layer_test(layer_cls, kwargs={}, input_shape=None, input_dtype=None, y = layer(x) assert K.dtype(y) == expected_output_dtype - # check shape inference + # check with the functional API model = Model(x, y) - expected_output_shape = layer.compute_output_shape(input_shape) - actual_output = model.predict(input_data) - actual_output_shape = actual_output.shape - for expected_dim, actual_dim in zip(expected_output_shape, - actual_output_shape): - if expected_dim is not None: - assert expected_dim == actual_dim - if expected_output is not None: - assert_allclose(actual_output, expected_output, rtol=1e-3) - - # test serialization, weight setting at model level - model_config = model.get_config() - recovered_model = Model.from_config(model_config) - if model.weights: - weights = model.get_weights() - recovered_model.set_weights(weights) - _output = recovered_model.predict(input_data) - assert_allclose(_output, actual_output, rtol=1e-3) - - # test training mode (e.g. useful for dropout tests) - model.compile('rmsprop', 'mse') - model.train_on_batch(input_data, actual_output) + _layer_in_model_test(model) # test as first layer in Sequential API layer_config = layer.get_config() layer_config['batch_input_shape'] = input_shape layer = layer.__class__.from_config(layer_config) + # check with the sequential API model = Sequential() model.add(layer) - actual_output = model.predict(input_data) - actual_output_shape = actual_output.shape - for expected_dim, actual_dim in zip(expected_output_shape, - actual_output_shape): - if expected_dim is not None: - assert expected_dim == actual_dim - if expected_output is not None: - assert_allclose(actual_output, expected_output, rtol=1e-3) - - # test serialization, weight setting at model level - model_config = model.get_config() - recovered_model = Sequential.from_config(model_config) - if model.weights: - weights = model.get_weights() - recovered_model.set_weights(weights) - _output = recovered_model.predict(input_data) - assert_allclose(_output, actual_output, rtol=1e-3) - - # test training mode (e.g. 
useful for dropout tests) - model.compile('rmsprop', 'mse') - model.train_on_batch(input_data, actual_output) + actual_output = _layer_in_model_test(model) # for further checks in the caller function return actual_output diff --git a/keras/utils/vis_utils.py b/keras/utils/vis_utils.py index c15586845c2..c24b06180a5 100644 --- a/keras/utils/vis_utils.py +++ b/keras/utils/vis_utils.py @@ -108,7 +108,6 @@ def model_to_dot(model, if node_key in model._network_nodes: for inbound_layer in node.inbound_layers: inbound_layer_id = str(id(inbound_layer)) - layer_id = str(id(layer)) dot.add_edge(pydot.Edge(inbound_layer_id, layer_id)) return dot diff --git a/keras/wrappers/scikit_learn.py b/keras/wrappers/scikit_learn.py index c9663c62af4..6ebf6fc8d4d 100644 --- a/keras/wrappers/scikit_learn.py +++ b/keras/wrappers/scikit_learn.py @@ -11,6 +11,7 @@ from ..utils.np_utils import to_categorical from ..utils.generic_utils import has_arg +from ..utils.generic_utils import to_list from ..models import Sequential @@ -291,8 +292,7 @@ def score(self, x, y, **kwargs): y = to_categorical(y) outputs = self.model.evaluate(x, y, **kwargs) - if not isinstance(outputs, list): - outputs = [outputs] + outputs = to_list(outputs) for name, output in zip(self.model.metrics_names, outputs): if name == 'acc': return output diff --git a/keras_mxnet_ci/nightly-buildspec.yml b/keras_mxnet_ci/nightly-buildspec.yml index 4c927eeeffa..aa32b8cdf8c 100644 --- a/keras_mxnet_ci/nightly-buildspec.yml +++ b/keras_mxnet_ci/nightly-buildspec.yml @@ -15,6 +15,8 @@ phases: echo "Installing Theano"; pip install theano; pip install pillow; + pip install graphviz; + pip install pydot; echo "Installing Keras from source"; pip install -e .[visualize,tests]; pip uninstall --yes keras; @@ -23,6 +25,7 @@ phases: build: commands: echo "Running Keras Unit Tests and Integration Tests for all the backends"; - py.test tests/; + py.test tests/ --ignore=tests/keras/utils/; + py.test tests/keras/utils/; echo "Running PEP tests"; py.test --pep8 -m pep8 -n0; \ No newline at end of file diff --git a/keras_mxnet_ci/pr-buildspec.yml b/keras_mxnet_ci/pr-buildspec.yml index 65a03aa4843..bd982f86535 100644 --- a/keras_mxnet_ci/pr-buildspec.yml +++ b/keras_mxnet_ci/pr-buildspec.yml @@ -17,6 +17,8 @@ phases: echo "Installing Theano"; pip install theano; pip install pillow; + pip install graphviz; + pip install pydot; echo "Installing Keras from source"; pip install -e .[visualize,tests]; pip uninstall --yes keras; @@ -24,6 +26,7 @@ phases: build: commands: echo "Running Keras Unit Tests and Integration Tests for all the backends"; - py.test tests/; + py.test tests/ --ignore=tests/keras/utils/; + py.test tests/keras/utils/; echo "Running PEP tests"; py.test --pep8 -m pep8 -n0; \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 0d3aa4818e7..907acc45202 100644 --- a/pytest.ini +++ b/pytest.ini @@ -12,7 +12,71 @@ norecursedirs= build # E402 module level import not at top of file - temporary measure to continue adding ros python packaged in sys.path # E731 do not assign a lambda expression, use a def -pep8ignore=* E501 \ - * E402 \ +pep8ignore=* E402 \ * E731 \ + examples/conv_filter_visualization.py E501 \ + examples/deep_dream.py E501 \ + examples/image_ocr.py E501 \ + examples/imdb_fasttext.py E501 \ + examples/imdb_lstm.py E501 \ + examples/lstm_text_generation.py E501 \ + examples/mnist_hierarchical_rnn.py E501 \ + examples/mnist_net2net.py E501 \ + examples/mnist_siamese.py E501 \ + examples/mnist_tfrecord.py E501 \ + examples/neural_doodle.py 
E501 \ + examples/neural_style_transfer.py E501 \ + keras/callbacks.py E501 \ + keras/constraints.py E501 \ + keras/metrics.py E501 \ + keras/models.py E501 \ + keras/optimizers.py E501 \ + keras/backend/cntk_backend.py E501 \ + keras/backend/common.py E501 \ + keras/backend/tensorflow_backend.py E501 \ + keras/backend/theano_backend.py E501 \ + keras/datasets/boston_housing.py E501 \ + keras/datasets/imdb.py E501 \ + keras/datasets/reuters.py E501 \ + keras/engine/network.py E501 \ + keras/engine/saving.py E501 \ + keras/engine/training.py E501 \ + keras/engine/training_generator.py E501 \ + keras/layers/advanced_activations.py E501 \ + keras/layers/convolutional.py E501 \ + keras/layers/convolutional_recurrent.py E501 \ + keras/layers/core.py E501 \ + keras/layers/cudnn_recurrent.py E501 \ + keras/layers/embeddings.py E501 \ + keras/layers/local.py E501 \ + keras/layers/merge.py E501 \ + keras/layers/noise.py E501 \ + keras/layers/normalization.py E501 \ + keras/layers/recurrent.py E501 \ + keras/layers/wrappers.py E501 \ + keras/legacy/interfaces.py E501 \ + keras/legacy/layers.py E501 \ + tests/test_documentation.py E501 \ + tests/test_loss_weighting.py E501 \ + tests/test_model_saving.py E501 \ + tests/integration_tests/test_temporal_data_tasks.py E501 \ + tests/keras/initializers_test.py E501 \ + tests/keras/metrics_test.py E501 \ + tests/keras/optimizers_test.py E501 \ + tests/keras/test_callbacks.py E501 \ + tests/keras/test_sequential_model.py E501 \ + tests/keras/backend/backend_test.py E501 \ + tests/keras/backend/reference_operations.py E501 \ + tests/keras/engine/test_topology.py E501 \ + tests/keras/engine/test_training.py E501 \ + tests/keras/layers/convolutional_recurrent_test.py E501 \ + tests/keras/layers/convolutional_test.py E501 \ + tests/keras/layers/core_test.py E501 \ + tests/keras/layers/cudnn_recurrent_test.py E501 \ + tests/keras/layers/embeddings_test.py E501 \ + tests/keras/layers/normalization_test.py E501 \ + tests/keras/layers/wrappers_test.py E501 \ + tests/keras/legacy/interface_test.py E501 +# Enable line length testing with maximum line length of 120 +pep8maxlinelength = 120 diff --git a/setup.py b/setup.py index fb80e527bab..7e81fb231b0 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,8 @@ Read the Keras documentation at: https://keras.io/ -Read the Keras-MXNet documentation at: https://github.com/awslabs/keras-apache-mxnet/tree/master/docs/mxnet_backend +Read the Keras-MXNet documentation at: +https://github.com/awslabs/keras-apache-mxnet/tree/master/docs/mxnet_backend For a detailed overview of what makes Keras special, see: https://keras.io/why-use-keras/ @@ -25,8 +26,9 @@ ''' setup(name='keras-mxnet', - version='2.2.0', - description='Deep Learning for humans. Keras with highly scalable, high performance Apache MXNet backend support.', + version='2.2.2', + description='Deep Learning for humans. 
Keras with highly scalable,\ + high performance Apache MXNet backend support.', long_description=long_description, author='Amazon Web Services', url='https://github.com/awslabs/keras-apache-mxnet', @@ -36,7 +38,7 @@ 'six>=1.9.0', 'h5py>=2.7.1', 'pyyaml', - 'keras_applications==1.0.2', + 'keras_applications==1.0.4', 'keras_preprocessing==1.0.1'], extras_require={ 'visualize': ['pydot>=1.2.4'], diff --git a/tests/integration_tests/applications_test.py b/tests/integration_tests/applications_test.py index dcb17fd611d..fa57c4ab619 100644 --- a/tests/integration_tests/applications_test.py +++ b/tests/integration_tests/applications_test.py @@ -3,14 +3,17 @@ import os from multiprocessing import Process, Queue from keras.utils.test_utils import keras_test +from keras.utils.test_utils import layer_test +from keras.models import Sequential from keras import applications from keras import backend as K -pytestmark = pytest.mark.skipif(K.backend() == 'mxnet' or - os.environ.get('CORE_CHANGED', 'True') == 'False' and - os.environ.get('APP_CHANGED', 'True') == 'False', - reason='Runs only when the relevant files have been modified.') +pytestmark = pytest.mark.skipif( + K.backend() == 'mxnet' or + (os.environ.get('CORE_CHANGED', 'True') == 'False' and + os.environ.get('APP_CHANGED', 'True') == 'False'), + reason='Runs only when the relevant files have been modified.') MODEL_LIST = [ diff --git a/tests/integration_tests/imagenet_utils_test.py b/tests/integration_tests/imagenet_utils_test.py index 5e2db293f2e..f46df7be47c 100644 --- a/tests/integration_tests/imagenet_utils_test.py +++ b/tests/integration_tests/imagenet_utils_test.py @@ -111,117 +111,5 @@ def test_decode_predictions(): utils.decode_predictions(np.ones((2, 100))) -def test_obtain_input_shape(): - # input_shape and default_size are not identical. - with pytest.raises(ValueError): - utils._obtain_input_shape( - input_shape=(224, 224, 3), - default_size=299, - min_size=139, - data_format='channels_last', - require_flatten=True, - weights='imagenet') - - # Test invalid use cases - for data_format in ['channels_last', 'channels_first']: - - # test warning - shape = (139, 139) - input_shape = shape + (99,) if data_format == 'channels_last' else (99,) + shape - with pytest.warns(UserWarning): - utils._obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False, - weights='fake_weights') - - # input_shape is smaller than min_size. - shape = (100, 100) - input_shape = shape + (3,) if data_format == 'channels_last' else (3,) + shape - with pytest.raises(ValueError): - utils._obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False) - - # shape is 1D. - shape = (100,) - input_shape = shape + (3,) if data_format == 'channels_last' else (3,) + shape - with pytest.raises(ValueError): - utils._obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False) - - # the number of channels is 5 not 3. - shape = (100, 100) - input_shape = shape + (5,) if data_format == 'channels_last' else (5,) + shape - with pytest.raises(ValueError): - utils._obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False) - - # require_flatten=True with dynamic input shape. 
- with pytest.raises(ValueError): - utils._obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=True) - - # test include top - assert utils._obtain_input_shape( - input_shape=(3, 200, 200), - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=True) == (3, 200, 200) - - assert utils._obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_last', - require_flatten=False) == (None, None, 3) - - assert utils._obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=False) == (3, None, None) - - assert utils._obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_last', - require_flatten=False) == (None, None, 3) - - assert utils._obtain_input_shape( - input_shape=(150, 150, 3), - default_size=None, - min_size=139, - data_format='channels_last', - require_flatten=False) == (150, 150, 3) - - assert utils._obtain_input_shape( - input_shape=(3, None, None), - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=False) == (3, None, None) - - if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/backend/backend_test.py b/tests/keras/backend/backend_test.py index d97a71a5e1a..e1a371df639 100644 --- a/tests/keras/backend/backend_test.py +++ b/tests/keras/backend/backend_test.py @@ -547,6 +547,16 @@ def test_function_tf_feed_dict(self): assert output == [21.] assert KTF.get_session().run(fetches=[x, y]) == [30., 40.] + def test_function_tf_string_input(self): + # Test functions with string inputs. + + x_placeholder = KTF.placeholder(shape=(), dtype="string") + x_identity = KTF.identity(x_placeholder) + + f = KTF.function(inputs=[x_placeholder], outputs=[x_identity]) + output = f([b'test']) + assert output == [b'test'] + def test_rnn(self): # implement a simple RNN num_samples = 4 @@ -1112,69 +1122,20 @@ def test_conv3d(self): BACKENDS_WITHOUT_MXNET, cntk_dynamicity=True, data_format=data_format) - # test in data_format = channels_first - for input_shape in [(2, 3, 4, 5, 4), (2, 3, 5, 4, 6)]: - for kernel_shape in [(2, 2, 2, 3, 4), (3, 2, 4, 3, 4)]: - check_two_tensor_operation('conv3d', input_shape, kernel_shape, - BACKENDS_WITHOUT_MXNET, cntk_dynamicity=True, - data_format='channels_first') - - # test in data_format = channels_last - input_shape = (1, 2, 2, 2, 1) - kernel_shape = (2, 2, 2, 1, 1) - check_two_tensor_operation('conv3d', input_shape, kernel_shape, - BACKENDS, cntk_dynamicity=True, - data_format='channels_last') - - xval = np.random.random(input_shape) - kernel_val = np.random.random(kernel_shape) - 0.5 - # Test invalid use cases - for k in BACKENDS: - with pytest.raises(ValueError): - k.conv3d(k.variable(xval), k.variable(kernel_val), data_format='channels_middle') - - @pytest.mark.parametrize('k', [KTF, KMX], ids=['TensorFlow', 'MXNet']) - def test_depthwise_conv_2d(self, k): - for data_format in ['channels_first', 'channels_last']: - x_shape = (4, 4) - # MXNet only support depth_multiplier=1 - # TODO: fully support depth_multiplier for depthwise_conv2d - depth_multiplier = 1 if k == KMX else 2 - # default kernel shape for TensorFlow backend - kernel_shape = (3, 3, 3, depth_multiplier) - - if data_format == 'channels_first': - input_shape = (2, 3) + x_shape - # MXNet requires different kernel shape for channels first - if k == KMX: - kernel_shape = (3, depth_multiplier, 
3, 3) - elif data_format == 'channels_last': - input_shape = (2,) + x_shape + (3,) - - x_val = np.ones(input_shape) - kernel_val = np.arange(np.prod(kernel_shape)).reshape(kernel_shape) - z = k.eval(k.depthwise_conv2d(k.variable(x_val), k.variable(kernel_val), - data_format=data_format)) - # split to num_in_channels*depth_multiplier - for z_i in np.split(z, 3 * depth_multiplier, axis=1 if data_format == 'channels_first' else -1): - assert_allclose(z_i, z_i[0] * np.ones_like(z_i)) - - # Test invalid use cases - with pytest.raises(ValueError): - k.depthwise_conv2d(k.variable(x_val), k.variable(kernel_val), data_format='channels_middle') - @pytest.mark.skipif(K.backend() == 'theano' or K.backend() == 'mxnet', reason='Not supported.') @pytest.mark.parametrize('op,input_shape,kernel_shape,depth_multiplier,padding,data_format', [ + ('separable_conv1d', (2, 8, 2), (3,), 1, 'same', 'channels_last'), + ('separable_conv1d', (1, 8, 2), (3,), 2, 'valid', 'channels_last'), ('separable_conv2d', (2, 3, 4, 5), (3, 3), 1, 'same', 'channels_first'), ('separable_conv2d', (2, 3, 5, 6), (4, 3), 2, 'valid', 'channels_first'), ('separable_conv2d', (1, 6, 5, 3), (3, 4), 1, 'valid', 'channels_last'), ('separable_conv2d', (1, 7, 6, 3), (3, 3), 2, 'same', 'channels_last'), ]) - def test_separable_conv2d(self, op, input_shape, kernel_shape, depth_multiplier, padding, data_format): + def test_separable_conv(self, op, input_shape, kernel_shape, depth_multiplier, padding, data_format): input_depth = input_shape[1] if data_format == 'channels_first' else input_shape[-1] _, x = parse_shape_or_val(input_shape) _, depthwise = parse_shape_or_val(kernel_shape + (input_depth, depth_multiplier)) - _, pointwise = parse_shape_or_val((1, 1) + (input_depth * depth_multiplier, 7)) + _, pointwise = parse_shape_or_val((1,) * len(kernel_shape) + (input_depth * depth_multiplier, 7)) y1 = reference_operations.separable_conv(x, depthwise, pointwise, padding, data_format) if K.backend() == 'cntk': y2 = cntk_func_three_tensor( @@ -1235,13 +1196,19 @@ def legacy_test_pool3d(self): strides=(1, 1, 1), padding='same', pool_mode='avg') def test_random_normal(self): - mean = 0. - std = 1. + # test standard normal as well as a normal with a different set of parameters for k in BACKENDS: - rand = k.eval(k.random_normal((300, 200), mean=mean, stddev=std, seed=1337)) - assert rand.shape == (300, 200) - assert np.abs(np.mean(rand) - mean) < 0.015 - assert np.abs(np.std(rand) - std) < 0.015 + for mean, std in [(0., 1.), (-10., 5.)]: + rand = k.eval(k.random_normal((300, 200), mean=mean, stddev=std, seed=1337)) + assert rand.shape == (300, 200) + assert np.abs(np.mean(rand) - mean) < std * 0.015 + assert np.abs(np.std(rand) - std) < std * 0.015 + + # test that random_normal also generates different values when used within a function + r = k.random_normal((1,), mean=mean, stddev=std, seed=1337) + samples = [k.eval(r) for _ in range(60000)] + assert np.abs(np.mean(samples) - mean) < std * 0.015 + assert np.abs(np.std(samples) - std) < std * 0.015 def test_random_uniform(self): min_val = -1. 
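
The reworked `test_random_normal` above scales both tolerances by `std` so the same bound covers the `(0., 1.)` and `(-10., 5.)` parameter sets. A back-of-the-envelope numpy sketch (illustrative only, not part of the suite) of why `std * 0.015` is a comfortable margin for the 300 * 200 = 60,000 draws used in the test:

```python
import numpy as np

# Standard error of the sample mean for n draws is std / sqrt(n).
# With n = 60000 that is roughly 0.004 * std, so a tolerance of
# 0.015 * std sits near 4 standard errors and essentially never
# fires for a correctly implemented backend RNG.
n = 300 * 200
for mean, std in [(0., 1.), (-10., 5.)]:
    samples = np.random.normal(loc=mean, scale=std, size=n)
    print(abs(samples.mean() - mean) / std,   # typically around 0.003 or less
          abs(samples.std() - std) / std)     # typically around 0.003 or less
```

The same reasoning applies to the sample-based checks added to `test_random_uniform` and `test_random_binomial` in the next hunks.
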
@@ -1250,8 +1217,14 @@ def test_random_uniform(self): rand = k.eval(k.random_uniform((200, 100), min_val, max_val)) assert rand.shape == (200, 100) assert np.abs(np.mean(rand)) < 0.015 - assert np.max(rand) <= max_val - assert np.min(rand) >= min_val + assert max_val - 0.015 < np.max(rand) <= max_val + assert min_val + 0.015 > np.min(rand) >= min_val + + r = k.random_uniform((1,), minval=min_val, maxval=max_val) + samples = [k.eval(r) for _ in range(20000)] + assert np.abs(np.mean(samples)) < 0.015 + assert max_val - 0.015 < np.max(samples) <= max_val + assert min_val + 0.015 > np.min(samples) >= min_val def test_random_binomial(self): p = 0.5 @@ -1262,6 +1235,12 @@ def test_random_binomial(self): assert np.max(rand) == 1 assert np.min(rand) == 0 + r = k.random_binomial((1,), p) + samples = [k.eval(r) for _ in range(20000)] + assert np.abs(np.mean(samples) - p) < 0.015 + assert np.max(samples) == 1 + assert np.min(samples) == 0 + @pytest.mark.skipif(K.backend() == 'mxnet', reason="MXNet backend does not support truncated normal yet.") def test_truncated_normal(self): diff --git a/tests/keras/engine/test_topology.py b/tests/keras/engine/test_topology.py index 07f6704d867..a15bea5b686 100644 --- a/tests/keras/engine/test_topology.py +++ b/tests/keras/engine/test_topology.py @@ -9,6 +9,17 @@ from keras import backend as K from keras.models import model_from_json, model_from_yaml from keras.utils.test_utils import keras_test +from keras.initializers import Constant + + +skipif_no_tf_gpu = pytest.mark.skipif( + (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), + reason='Requires TensorFlow backend and a GPU') + + +skipif_no_tf_gpu = pytest.mark.skipif( + (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), + reason='Requires TensorFlow backend and a GPU') skipif_no_tf_gpu = pytest.mark.skipif( @@ -811,5 +822,19 @@ def call(self, inputs, **kwargs): assert K.int_shape(z)[1:] == (16, 16, 3) +@keras_test +def test_constant_initializer_with_numpy(): + model = Sequential() + model.add(Dense(2, input_shape=(3,), kernel_initializer=Constant(np.ones((3, 2))))) + model.add(Dense(3)) + model.compile(loss='mse', optimizer='sgd', metrics=['acc']) + + json_str = model.to_json() + model_from_json(json_str).summary() + + yaml_str = model.to_yaml() + model_from_yaml(yaml_str).summary() + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/engine/test_training.py b/tests/keras/engine/test_training.py index a44d1bee14a..fefca0fe547 100644 --- a/tests/keras/engine/test_training.py +++ b/tests/keras/engine/test_training.py @@ -498,22 +498,23 @@ def expected_shape(batch_size, n_batches): if K.backend() != 'mxnet': # Create a model with a single output. single_output_model = Model([a, b], a_2) - single_output_model.compile(optimizer, loss, metrics=[], sample_weight_mode=None) + single_output_model.compile(optimizer, loss, + metrics=[], sample_weight_mode=None) # Single output and one step. batch_size = 5 sequence_length = 1 shape_0, _ = expected_shape(batch_size, sequence_length) - out = single_output_model.predict_generator(RandomSequence(batch_size, - sequence_length=sequence_length)) + out = single_output_model.predict_generator( + RandomSequence(batch_size, sequence_length=sequence_length)) assert np.shape(out) == shape_0 # Single output and multiple steps. 
batch_size = 5 sequence_length = 2 shape_0, _ = expected_shape(batch_size, sequence_length) - out = single_output_model.predict_generator(RandomSequence(batch_size, - sequence_length=sequence_length)) + out = single_output_model.predict_generator( + RandomSequence(batch_size, sequence_length=sequence_length)) assert np.shape(out) == shape_0 @@ -543,7 +544,7 @@ def gen_data(batch_sz): [np.random.random((batch_sz, 4)), np.random.random((batch_sz, 3))]) - with pytest.warns(UserWarning) as w: + with pytest.warns(Warning) as w: out = model.fit_generator(gen_data(4), steps_per_epoch=10, use_multiprocessing=True, diff --git a/tests/keras/layers/convolutional_test.py b/tests/keras/layers/convolutional_test.py index 23e3509df97..71b395e3e04 100644 --- a/tests/keras/layers/convolutional_test.py +++ b/tests/keras/layers/convolutional_test.py @@ -116,7 +116,7 @@ def test_conv_1d(): 'kernel_size': kernel_size, 'padding': padding, 'dilation_rate': 2}, - input_shape=input_shape) + input_shape=(batch_size, steps, input_dim)) # Test channels_first layer_test(convolutional.Conv1D, @@ -212,6 +212,9 @@ def test_convolution_2d(): batch_input_shape=(None, None, 5, None))]) +@pytest.mark.skipif((K.backend() == 'mxnet'), + reason='MXNet backend does not support Conv2D Transpose yet.') +@keras_test def test_conv2d_transpose(): num_samples = 2 filters = 2 @@ -297,6 +300,7 @@ def test_separable_conv_1d(): 'pointwise_constraint': 'unit_norm', 'depthwise_constraint': 'unit_norm', 'strides': 1, + 'use_bias': True, 'depth_multiplier': multiplier}, input_shape=(num_samples, stack_size, num_step)) @@ -308,6 +312,8 @@ def test_separable_conv_1d(): batch_input_shape=(None, 5, None))]) +@pytest.mark.skipif((K.backend() == 'mxnet'), + reason='MXNet backend does not support Separable Conv2D yet.') @keras_test def test_separable_conv_2d(): num_samples = 2 @@ -327,6 +333,8 @@ def test_separable_conv_2d(): continue if dilation_rate != (1, 1) and strides != (1, 1): continue + if dilation_rate != (1, 1) and multiplier == dilation_rate[0]: + continue if dilation_rate != (1, 1) and K.backend() == 'cntk': continue @@ -398,6 +406,7 @@ def test_depthwise_conv_2d(): 'bias_regularizer': 'l2', 'activity_regularizer': 'l2', 'depthwise_constraint': 'unit_norm', + 'use_bias': True, 'strides': strides, 'depth_multiplier': multiplier}, input_shape=(num_samples, stack_size, num_row, num_col)) @@ -532,18 +541,22 @@ def test_conv3d_transpose(): num_col = 6 for padding in _convolution_paddings: - for strides in [(1, 1, 1), (2, 2, 2)]: - for data_format in ['channels_first', 'channels_last']: - if padding == 'same' and strides != (1, 1, 1): - continue - layer_test(convolutional.Conv3DTranspose, - kwargs={'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'strides': strides, - 'data_format': data_format}, - input_shape=(None, num_depth, num_row, num_col, stack_size), - fixed_batch_size=True) + for out_padding in [None, (0, 0, 0), (1, 1, 1)]: + for strides in [(1, 1, 1), (2, 2, 2)]: + for data_format in ['channels_first', 'channels_last']: + if padding == 'same' and strides != (1, 1, 1): + continue + if strides == (1, 1, 1) and out_padding == (1, 1, 1): + continue + layer_test(convolutional.Conv3DTranspose, + kwargs={'filters': filters, + 'kernel_size': 3, + 'padding': padding, + 'output_padding': out_padding, + 'strides': strides, + 'data_format': data_format}, + input_shape=(None, num_depth, num_row, num_col, stack_size), + fixed_batch_size=True) layer_test(convolutional.Conv3DTranspose, kwargs={'filters': filters, @@ -556,16 +569,38 @@ 
def test_conv3d_transpose(): 'activity_regularizer': 'l2', 'kernel_constraint': 'max_norm', 'bias_constraint': 'max_norm', + 'use_bias': True, 'strides': strides}, input_shape=(None, stack_size, num_depth, num_row, num_col), fixed_batch_size=True) # Test invalid use case with pytest.raises(ValueError): - model = Sequential([convolutional.Conv3DTranspose(filters=filters, - kernel_size=3, - padding=padding, - batch_input_shape=(None, None, 5, None, None))]) + model = Sequential([convolutional.Conv3DTranspose( + filters=filters, + kernel_size=3, + padding=padding, + batch_input_shape=(None, None, 5, None, None))]) + + # Test invalid output padding for given stride. Output padding equal + # to stride + with pytest.raises(ValueError): + model = Sequential([convolutional.Conv3DTranspose( + filters=filters, + kernel_size=3, + padding=padding, + output_padding=(0, 3, 3), + strides=(1, 3, 4), + batch_input_shape=(None, num_depth, num_row, num_col, stack_size))]) + # Output padding greater than stride + with pytest.raises(ValueError): + model = Sequential([convolutional.Conv3DTranspose( + filters=filters, + kernel_size=3, + padding=padding, + output_padding=(2, 2, 3), + strides=(1, 3, 4), + batch_input_shape=(None, num_depth, num_row, num_col, stack_size))]) @keras_test diff --git a/tests/keras/layers/core_test.py b/tests/keras/layers/core_test.py index d36f2268de0..c27975ee44d 100644 --- a/tests/keras/layers/core_test.py +++ b/tests/keras/layers/core_test.py @@ -5,6 +5,7 @@ from keras import backend as K from keras import layers from keras.models import Model +from keras.models import Sequential from keras.utils.test_utils import layer_test from keras.utils.test_utils import keras_test from keras import regularizers @@ -281,6 +282,16 @@ def f_shape(s): ld = deserialize_layer({'class_name': 'Lambda', 'config': config}) +@keras_test +@pytest.mark.skipif((K.backend() == 'theano'), + reason="theano cannot compute " + "the output shape automatically.") +def test_lambda_output_shape(): + layer_test(layers.Lambda, + kwargs={'function': lambda x: K.mean(x, axis=-1)}, + input_shape=(3, 2, 4)) + + @keras_test def test_dense(): layer_test(layers.Dense, @@ -336,5 +347,30 @@ def test_activity_regularization(): model.compile('rmsprop', 'mse') +@keras_test +def test_sequential_as_downstream_of_masking_layer(): + + inputs = layers.Input(shape=(3, 4)) + x = layers.Masking(mask_value=0., input_shape=(3, 4))(inputs) + s = Sequential() + s.add(layers.Dense(5, input_shape=(4,))) + s.add(layers.Activation('relu')) + x = layers.wrappers.TimeDistributed(s)(x) + model = Model(inputs=inputs, outputs=x) + model.compile(optimizer='rmsprop', loss='mse') + model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)) + for i in range(4): + model_input[i, i:, :] = 0. 
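
The assertions that complete this test just below compare the `Masking` layer's mask with `np.any(model_input, axis=-1)`. A tiny standalone numpy illustration, with made-up toy values, of why the two agree when `mask_value=0.`:

```python
import numpy as np

# Masking(mask_value=0.) keeps a timestep only if at least one feature
# differs from the mask value; for zeros that is exactly np.any(..., axis=-1).
toy = np.array([[[1., 2.], [0., 0.]],    # sample 0: timestep 1 is all zeros
                [[0., 4.], [0., 0.]]])   # sample 1: timestep 1 is all zeros
print(np.any(toy, axis=-1))              # [[ True False]
                                         #  [ True False]]
```
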
+ model.fit(model_input, + np.random.random((10, 3, 5)), epochs=1, batch_size=6) + + mask_outputs = [model.layers[1].compute_mask(model.layers[1].input)] + mask_outputs += [model.layers[2].compute_mask(model.layers[2].input, mask_outputs[-1])] + func = K.function([model.input], mask_outputs) + mask_outputs_val = func([model_input]) + assert np.array_equal(mask_outputs_val[0], np.any(model_input, axis=-1)) + assert np.array_equal(mask_outputs_val[1], np.any(model_input, axis=-1)) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/wrappers_test.py b/tests/keras/layers/wrappers_test.py index 968fc41cd33..3bfc8241a4b 100644 --- a/tests/keras/layers/wrappers_test.py +++ b/tests/keras/layers/wrappers_test.py @@ -12,10 +12,6 @@ from keras.utils.generic_utils import object_list_uid, to_list -pytestmark = pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not support TimeDistributed and RNN yet') - - pytestmark = pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support TimeDistributed and RNN yet') diff --git a/tests/keras/optimizers_test.py b/tests/keras/optimizers_test.py index a2c96400160..68b3f5b70fd 100644 --- a/tests/keras/optimizers_test.py +++ b/tests/keras/optimizers_test.py @@ -69,12 +69,16 @@ def _test_optimizer(optimizer, target=0.75): 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') @keras_test -def _test_no_grad(optimizer): +@pytest.mark.skipif((K.backend() != 'tensorflow'), + reason="Only Tensorflow raises a " + "ValueError if the gradient is null.") +def test_no_grad(): inp = Input([3]) x = Dense(10)(inp) - x = Lambda(lambda l: 1.0 * K.reshape(K.cast(K.argmax(l), 'float32'), [-1, 1]))(x) + x = Lambda(lambda l: 1.0 * K.reshape(K.cast(K.argmax(l), 'float32'), [-1, 1]), + output_shape=lambda x: [x[0], 1])(x) mod = Model(inp, x) - mod.compile(optimizer, 'mse') + mod.compile('sgd', 'mse') with pytest.raises(ValueError): mod.fit(np.zeros([10, 3]), np.zeros([10, 1], np.float32), batch_size=10, epochs=10) @@ -87,7 +91,6 @@ def _test_no_grad(optimizer): def test_sgd(): sgd = optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True) _test_optimizer(sgd) - _test_no_grad(sgd) @pytest.mark.skipif(K.backend() == 'mxnet', diff --git a/tests/keras/preprocessing/image_test.py b/tests/keras/preprocessing/image_test.py index 0fe83400032..31064408613 100644 --- a/tests/keras/preprocessing/image_test.py +++ b/tests/keras/preprocessing/image_test.py @@ -56,7 +56,8 @@ def test_image_data_generator(self, tmpdir): vertical_flip=True) generator.fit(images, augment=True) - for x, y in generator.flow(images, np.arange(images.shape[0]), + num_samples = images.shape[0] + for x, y in generator.flow(images, np.arange(num_samples), shuffle=False, save_to_dir=str(tmpdir), batch_size=3): assert x.shape == images[:3].shape @@ -64,9 +65,9 @@ def test_image_data_generator(self, tmpdir): break # Test with sample weights - for x, y, w in generator.flow(images, np.arange(images.shape[0]), + for x, y, w in generator.flow(images, np.arange(num_samples), shuffle=False, - sample_weight=np.arange(images.shape[0]) + 1, + sample_weight=np.arange(num_samples) + 1, save_to_dir=str(tmpdir), batch_size=3): assert x.shape == images[:3].shape @@ -75,7 +76,7 @@ def test_image_data_generator(self, tmpdir): break # Test with `shuffle=True` - for x, y in generator.flow(images, np.arange(images.shape[0]), + for x, y in generator.flow(images, np.arange(num_samples), shuffle=True, save_to_dir=str(tmpdir), batch_size=3): assert x.shape == 
images[:3].shape @@ -137,11 +138,13 @@ def test_image_data_generator(self, tmpdir): with pytest.raises(ValueError) as e_info: generator.flow((images, x_misc_err), np.arange(dsize), batch_size=3) - assert str(e_info.value).find('All of the arrays in') != -1 + assert 'All of the arrays in' in str(e_info.value) with pytest.raises(ValueError) as e_info: - generator.flow((images, x_misc1), np.arange(dsize + 1), batch_size=3) - assert str(e_info.value).find('`x` (images tensor) and `y` (labels) ') != -1 + generator.flow((images, x_misc1), + np.arange(dsize + 1), + batch_size=3) + assert '`x` (images tensor) and `y` (labels) ' in str(e_info.value) # Test `flow` behavior as Sequence seq = generator.flow(images, np.arange(images.shape[0]), @@ -272,7 +275,8 @@ def test_directory_iterator(self, tmpdir): im_class = count % num_classes # rotate subfolders classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], 'image-{}.jpg'.format(count)) + filename = os.path.join(classpaths[count % len(classpaths)], + 'image-{}.jpg'.format(count)) filenames.append(filename) im.save(str(tmpdir / filename)) count += 1 @@ -305,7 +309,8 @@ def preprocessing_function(x): return np.zeros_like(x) # Test usage as Sequence - generator = image.ImageDataGenerator(preprocessing_function=preprocessing_function) + generator = image.ImageDataGenerator( + preprocessing_function=preprocessing_function) dir_seq = generator.flow_from_directory(str(tmpdir), target_size=(26, 26), color_mode='rgb', @@ -337,7 +342,8 @@ def test_directory_iterator_class_mode_input(self, tmpdir): # create iterator generator = image.ImageDataGenerator() - dir_iterator = generator.flow_from_directory(str(tmpdir), class_mode='input') + dir_iterator = generator.flow_from_directory(str(tmpdir), + class_mode='input') batch = next(dir_iterator) # check if input and output have the same shape @@ -353,7 +359,8 @@ def test_directory_iterator_class_mode_input(self, tmpdir): (0.40, 10), (0.50, 8), ]) - def test_directory_iterator_with_validation_split(self, validation_split, num_training): + def test_directory_iterator_with_validation_split(self, validation_split, + num_training): num_classes = 2 tmp_folder = tempfile.mkdtemp(prefix='test_images') @@ -380,7 +387,8 @@ def test_directory_iterator_with_validation_split(self, validation_split, num_tr im_class = count % num_classes # rotate subfolders classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], 'image-{}.jpg'.format(count)) + filename = os.path.join(classpaths[count % len(classpaths)], + 'image-{}.jpg'.format(count)) filenames.append(filename) im.save(os.path.join(tmp_folder, filename)) count += 1 @@ -391,10 +399,12 @@ def test_directory_iterator_with_validation_split(self, validation_split, num_tr with pytest.raises(ValueError): generator.flow_from_directory(tmp_folder, subset='foo') - train_iterator = generator.flow_from_directory(tmp_folder, subset='training') + train_iterator = generator.flow_from_directory(tmp_folder, + subset='training') assert train_iterator.samples == num_training - valid_iterator = generator.flow_from_directory(tmp_folder, subset='validation') + valid_iterator = generator.flow_from_directory(tmp_folder, + subset='validation') assert valid_iterator.samples == count - num_training # check number of classes and images @@ -437,17 +447,17 @@ def test_img_utils(self): with pytest.raises(ValueError): x = np.random.random((height, width)) # not 3D img = image.array_to_img(x, data_format='channels_first') - with 
pytest.raises(ValueError): + with pytest.raises(ValueError): # unknown data_format x = np.random.random((height, width, 3)) - img = image.array_to_img(x, data_format='channels') # unknown data_format - with pytest.raises(ValueError): - x = np.random.random((height, width, 5)) # neither RGB nor gray-scale + img = image.array_to_img(x, data_format='channels') + with pytest.raises(ValueError): # neither RGB nor gray-scale + x = np.random.random((height, width, 5)) img = image.array_to_img(x, data_format='channels_last') - with pytest.raises(ValueError): + with pytest.raises(ValueError): # unknown data_format x = np.random.random((height, width, 3)) - img = image.img_to_array(x, data_format='channels') # unknown data_format - with pytest.raises(ValueError): - x = np.random.random((height, width, 5, 3)) # neither RGB nor gray-scale + img = image.img_to_array(x, data_format='channels') + with pytest.raises(ValueError): # neither RGB nor gray-scale + x = np.random.random((height, width, 5, 3)) img = image.img_to_array(x, data_format='channels_last') def test_random_transforms(self): @@ -485,7 +495,8 @@ def test_random_transforms(self): assert transform_dict['zy'] != 0 assert transform_dict['zy'] != transform_dict2['zy'] assert transform_dict['channel_shift_intensity'] != 0 - assert transform_dict['channel_shift_intensity'] != transform_dict2['channel_shift_intensity'] + assert (transform_dict['channel_shift_intensity'] != + transform_dict2['channel_shift_intensity']) assert transform_dict['brightness'] != 0 assert transform_dict['brightness'] != transform_dict2['brightness'] @@ -523,8 +534,8 @@ def test_deterministic_transform(self): [1., 1., 1.]]]) assert np.allclose(generator.apply_transform(x, {'theta': 45}), x_rotated) - assert np.allclose(image.apply_affine_transform(x, theta=45, channel_axis=2, - fill_mode='constant'), x_rotated) + assert np.allclose(image.apply_affine_transform( + x, theta=45, channel_axis=2, fill_mode='constant'), x_rotated) def test_batch_standardize(self): # ImageDataGenerator.standardize should work on batches diff --git a/tests/keras/preprocessing/text_test.py b/tests/keras/preprocessing/text_test.py index 65fbdbb37a8..ac55e5bd310 100644 --- a/tests/keras/preprocessing/text_test.py +++ b/tests/keras/preprocessing/text_test.py @@ -3,7 +3,10 @@ import numpy as np import pytest -from keras.preprocessing.text import Tokenizer, one_hot, hashing_trick, text_to_word_sequence +from keras.preprocessing.text import Tokenizer +from keras.preprocessing.text import one_hot +from keras.preprocessing.text import hashing_trick +from keras.preprocessing.text import text_to_word_sequence def test_one_hot(): @@ -80,16 +83,19 @@ def test_text_to_word_sequence_multichar_split(): def test_text_to_word_sequence_unicode(): text = u'ali! veli? 
kırk dokuz elli' - assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli'] + assert (text_to_word_sequence(text) == + [u'ali', u'veli', u'kırk', u'dokuz', u'elli']) def test_text_to_word_sequence_unicode_multichar_split(): text = u'ali!stopveli?stopkırkstopdokuzstopelli' - assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli'] + assert (text_to_word_sequence(text, split='stop') == + [u'ali', u'veli', u'kırk', u'dokuz', u'elli']) def test_tokenizer_unicode(): - texts = [u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'] + texts = [u'ali veli kırk dokuz elli', + u'ali veli kırk dokuz elli veli kırk dokuz'] tokenizer = Tokenizer(num_words=5) tokenizer.fit_on_texts(texts) diff --git a/tests/keras/test_callbacks.py b/tests/keras/test_callbacks.py index 5cc7707d87f..944a15bf0b6 100644 --- a/tests/keras/test_callbacks.py +++ b/tests/keras/test_callbacks.py @@ -10,7 +10,7 @@ from keras import initializers from keras import callbacks from keras.models import Sequential, Model -from keras.layers import Input, Dense, Dropout, add, dot, Lambda +from keras.layers import Input, Dense, Dropout, add, dot, Lambda, Layer from keras.layers.convolutional import Conv2D from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D, GlobalAveragePooling2D from keras.utils.test_utils import get_test_data @@ -470,6 +470,8 @@ def make_model(): @keras_test +@pytest.mark.skipif((K.backend() == 'mxnet'), + reason='MXNet backend does not support it yet.') def test_TensorBoard(tmpdir): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') @@ -500,6 +502,19 @@ def data_generator(train): i += 1 i = i % max_batch_index + class DummyStatefulMetric(Layer): + + def __init__(self, name='dummy_stateful_metric', **kwargs): + super(DummyStatefulMetric, self).__init__(name=name, **kwargs) + self.stateful = True + self.state = K.variable(value=0, dtype='int32') + + def reset_states(self): + pass + + def __call__(self, y_true, y_pred): + return self.state + inp = Input((input_dim,)) hidden = Dense(num_hidden, activation='relu')(inp) hidden = Dropout(0.1)(hidden) @@ -507,7 +522,7 @@ def data_generator(train): model = Model(inputs=inp, outputs=output) model.compile(loss='categorical_crossentropy', optimizer='sgd', - metrics=['accuracy']) + metrics=['accuracy', DummyStatefulMetric()]) # we must generate new callbacks for each test, as they aren't stateless def callbacks_factory(histogram_freq, embeddings_freq=1): diff --git a/tests/keras/test_sequential_model.py b/tests/keras/test_sequential_model.py index e41f91078be..9626cfa8f05 100644 --- a/tests/keras/test_sequential_model.py +++ b/tests/keras/test_sequential_model.py @@ -429,5 +429,45 @@ def test_sequential_deferred_build(): assert len(new_model.weights) == 4 +@keras_test +def test_nested_sequential_deferred_build(): + inner_model = keras.models.Sequential() + inner_model.add(keras.layers.Dense(3)) + inner_model.add(keras.layers.Dense(3)) + + model = keras.models.Sequential() + model.add(inner_model) + model.add(keras.layers.Dense(5)) + model.compile('sgd', 'mse') + + assert inner_model.built is False + assert len(inner_model.layers) == 2 + assert len(inner_model.weights) == 0 + assert model.built is False + assert len(model.layers) == 2 + assert len(model.weights) == 0 + + model.train_on_batch( + np.random.random((2, 4)), np.random.random((2, 5))) + + assert inner_model.built is True + assert len(inner_model.layers) == 2 + assert len(inner_model.weights) == 4 
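
The weight counts asserted in this test follow from each built `Dense` layer owning exactly two variables, a kernel and a bias: the inner model's two `Dense(3)` layers account for 4, and the outer model's extra `Dense(5)` brings the total to 6. A minimal sketch of that counting rule, with an illustrative input shape (unlike the deferred-build case exercised here, giving the first layer an `input_shape` lets the model build as layers are added):

```python
from keras.models import Sequential
from keras.layers import Dense

# With an input shape on the first layer, the kernel and bias variables
# are created as soon as each layer is added.
model = Sequential()
model.add(Dense(3, input_shape=(4,)))
model.add(Dense(3))
print(len(model.weights))   # 4: two Dense layers * (kernel + bias)
```
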
+ assert model.built is True + assert len(model.layers) == 2 + assert len(model.weights) == 6 + + config = model.get_config() + new_model = keras.models.Sequential.from_config(config) + assert new_model.built is True + assert len(new_model.layers) == 2 + assert len(new_model.weights) == 6 + + new_inner_model = new_model.layers[0] + assert new_inner_model.built is True + assert len(new_inner_model.layers) == 2 + assert len(new_inner_model.weights) == 4 + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/utils/conv_utils_test.py b/tests/keras/utils/conv_utils_test.py index 742c88877b3..1a467512fe7 100644 --- a/tests/keras/utils/conv_utils_test.py +++ b/tests/keras/utils/conv_utils_test.py @@ -1,6 +1,7 @@ import pytest import numpy as np from keras.utils import conv_utils +from keras import backend as K def test_normalize_tuple(): @@ -17,7 +18,7 @@ def test_normalize_tuple(): def test_invalid_data_format(): with pytest.raises(ValueError): - conv_utils.normalize_data_format('channels_middle') + K.normalize_data_format('channels_middle') def test_invalid_padding(): @@ -59,13 +60,22 @@ def test_conv_input_length(): def test_deconv_length(): - assert conv_utils.deconv_length(None, 1, 7, 'same') is None - assert conv_utils.deconv_length(224, 1, 7, 'same') == 224 - assert conv_utils.deconv_length(224, 2, 7, 'same') == 448 - assert conv_utils.deconv_length(32, 1, 5, 'valid') == 36 - assert conv_utils.deconv_length(32, 2, 5, 'valid') == 67 - assert conv_utils.deconv_length(32, 1, 5, 'full') == 28 - assert conv_utils.deconv_length(32, 2, 5, 'full') == 59 + assert conv_utils.deconv_length(None, 1, 7, 'same', None) is None + assert conv_utils.deconv_length(224, 1, 7, 'same', None) == 224 + assert conv_utils.deconv_length(224, 2, 7, 'same', None) == 448 + assert conv_utils.deconv_length(32, 1, 5, 'valid', None) == 36 + assert conv_utils.deconv_length(32, 2, 5, 'valid', None) == 67 + assert conv_utils.deconv_length(32, 1, 5, 'full', None) == 28 + assert conv_utils.deconv_length(32, 2, 5, 'full', None) == 59 + assert conv_utils.deconv_length(224, 1, 7, 'same', 0) == 224 + assert conv_utils.deconv_length(224, 2, 7, 'same', 0) == 447 + assert conv_utils.deconv_length(224, 2, 7, 'same', 1) == 448 + assert conv_utils.deconv_length(32, 1, 5, 'valid', 0) == 36 + assert conv_utils.deconv_length(32, 2, 5, 'valid', 0) == 67 + assert conv_utils.deconv_length(32, 2, 5, 'valid', 1) == 68 + assert conv_utils.deconv_length(6, 1, 3, 'full', 0) == 4 + assert conv_utils.deconv_length(6, 2, 3, 'full', 1) == 10 + assert conv_utils.deconv_length(6, 2, 3, 'full', 2) == 11 if __name__ == '__main__': diff --git a/tests/keras/utils/data_utils_test.py b/tests/keras/utils/data_utils_test.py index 2ed9dc6fd2d..be14cfa8211 100644 --- a/tests/keras/utils/data_utils_test.py +++ b/tests/keras/utils/data_utils_test.py @@ -173,7 +173,8 @@ def test_generator_enqueuer_threads(): """ Not comparing the order since it is not guaranteed. - It may get ordered, but not a lot, one thread can take the GIL before he was supposed to. + It may get ordered, but not a lot, one thread can take + the GIL before he was supposed to. 
""" assert len(set(acc) - set(range(100))) == 0, "Output is not the same" enqueuer.stop() @@ -187,7 +188,8 @@ def test_generator_enqueuer_processes(): acc = [] for i in range(100): acc.append(int(next(gen_output)[0, 0, 0, 0])) - assert acc != list(range(100)), "Order was keep in GeneratorEnqueuer with processes" + assert acc != list(range(100)), ('Order was keep in GeneratorEnqueuer ' + 'with processes') enqueuer.stop() @@ -210,13 +212,15 @@ def test_generator_enqueuer_fail_processes(): def test_ordered_enqueuer_threads(): - enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), use_multiprocessing=False) + enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), + use_multiprocessing=False) enqueuer.start(3, 10) gen_output = enqueuer.get() acc = [] for i in range(100): acc.append(next(gen_output)[0, 0, 0, 0]) - assert acc == list(range(100)), "Order was not keep in GeneratorEnqueuer with threads" + assert acc == list(range(100)), ('Order was not keep in GeneratorEnqueuer ' + 'with threads') enqueuer.stop() @@ -229,19 +233,22 @@ def test_ordered_enqueuer_threads_not_ordered(): acc = [] for i in range(100): acc.append(next(gen_output)[0, 0, 0, 0]) - assert acc != list(range(100)), "Order was not keep in GeneratorEnqueuer with threads" + assert acc != list(range(100)), ('Order was not keep in GeneratorEnqueuer ' + 'with threads') enqueuer.stop() @use_spawn def test_ordered_enqueuer_processes(): - enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), use_multiprocessing=True) + enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), + use_multiprocessing=True) enqueuer.start(3, 10) gen_output = enqueuer.get() acc = [] for i in range(100): acc.append(next(gen_output)[0, 0, 0, 0]) - assert acc == list(range(100)), "Order was not keep in GeneratorEnqueuer with processes" + assert acc == list(range(100)), ('Order was not keep in GeneratorEnqueuer ' + 'with processes') enqueuer.stop() @@ -255,20 +262,24 @@ def test_ordered_enqueuer_fail_threads(): @use_spawn def test_on_epoch_end_processes(): - enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), use_multiprocessing=True) + enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), + use_multiprocessing=True) enqueuer.start(3, 10) gen_output = enqueuer.get() acc = [] for i in range(200): acc.append(next(gen_output)[0, 0, 0, 0]) - assert acc[100:] == list([k * 5 for k in range(100)]), "Order was not keep in GeneratorEnqueuer with processes" + assert acc[100:] == list([k * 5 for k in range(100)]), ( + 'Order was not keep in GeneratorEnqueuer with processes') enqueuer.stop() @use_spawn def test_context_switch(): - enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), use_multiprocessing=True) - enqueuer2 = OrderedEnqueuer(DummySequence([3, 200, 200, 3], value=15), use_multiprocessing=True) + enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), + use_multiprocessing=True) + enqueuer2 = OrderedEnqueuer(DummySequence([3, 200, 200, 3], value=15), + use_multiprocessing=True) enqueuer.start(3, 10) enqueuer2.start(3, 10) gen_output = enqueuer.get() @@ -297,7 +308,8 @@ def test_context_switch(): def test_on_epoch_end_threads(): - enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), use_multiprocessing=False) + enqueuer = OrderedEnqueuer(DummySequence([3, 200, 200, 3]), + use_multiprocessing=False) enqueuer.start(3, 10) gen_output = enqueuer.get() acc = [] @@ -306,7 +318,8 @@ def test_on_epoch_end_threads(): acc = [] for i in range(100): acc.append(next(gen_output)[0, 0, 0, 0]) - assert acc == list([k * 5 for k in 
range(100)]), "Order was not keep in GeneratorEnqueuer with processes" + assert acc == list([k * 5 for k in range(100)]), ( + 'Order was not keep in GeneratorEnqueuer with processes') enqueuer.stop() @@ -350,7 +363,8 @@ def test_finite_generator_enqueuer_processes(): acc = [] for output in gen_output: acc.append(int(output[0, 0, 0, 0])) - assert acc != list(range(100)), "Order was keep in GeneratorEnqueuer with processes" + assert acc != list(range(100)), ('Order was keep in GeneratorEnqueuer ' + 'with processes') enqueuer.stop() diff --git a/tests/keras/utils/generic_utils_test.py b/tests/keras/utils/generic_utils_test.py index 588a4ecb903..96486e1ab03 100644 --- a/tests/keras/utils/generic_utils_test.py +++ b/tests/keras/utils/generic_utils_test.py @@ -70,7 +70,8 @@ def test_has_arg(fn, name, accept_all, expected): if sys.version_info >= (3,): raise pytest.skip('Function is not compatible with Python 2') - context.pop('__builtins__', None) # Sometimes exec adds builtins to the context + # Sometimes exec adds builtins to the context + context.pop('__builtins__', None) fn, = context.values() assert has_arg(fn, name, accept_all) is expected @@ -125,7 +126,8 @@ def test_func_dump_and_load_backwards_compat(test_func): # this test ensures that models serialized prior to version 2.1.2 can still be # deserialized - # see https://github.com/evhub/keras/blob/2.1.1/keras/utils/generic_utils.py#L166 + # see: + # https://github.com/evhub/keras/blob/2.1.1/keras/utils/generic_utils.py#L166 serialized = marshal.dumps(test_func.__code__).decode('raw_unicode_escape') deserialized = func_load(serialized, defaults=test_func.__defaults__) diff --git a/tests/keras/utils/io_utils_test.py b/tests/keras/utils/io_utils_test.py index 6bd80d6ea7f..4da9a547266 100644 --- a/tests/keras/utils/io_utils_test.py +++ b/tests/keras/utils/io_utils_test.py @@ -47,7 +47,8 @@ def test_io_utils(in_tmpdir): h5_path = 'test.h5' create_dataset(h5_path) - # Instantiating HDF5Matrix for the training set, which is a slice of the first 150 elements + # Instantiating HDF5Matrix for the training set, + # which is a slice of the first 150 elements X_train = HDF5Matrix(h5_path, 'my_data', start=0, end=150) y_train = HDF5Matrix(h5_path, 'my_labels', start=0, end=150) @@ -59,7 +60,8 @@ def test_io_utils(in_tmpdir): assert y_train.shape == (150, 1), 'HDF5Matrix shape should match input array' # But they do not support negative indices, so don't try print(X_train[-1]) - assert y_train.dtype == np.dtype('i'), 'HDF5Matrix dtype should match input array' + assert y_train.dtype == np.dtype('i'), ( + 'HDF5Matrix dtype should match input array') assert y_train.ndim == 2, 'HDF5Matrix ndim should match input array' assert y_train.size == 150, 'HDF5Matrix ndim should match input array' @@ -71,13 +73,15 @@ def test_io_utils(in_tmpdir): # Note: you have to use shuffle='batch' or False with HDF5Matrix model.fit(X_train, y_train, batch_size=32, shuffle='batch', verbose=False) - # test that evalutation and prediction don't crash and return reasonable results + # test that evalutation and prediction don't crash and + # return reasonable results out_pred = model.predict(X_test, batch_size=32, verbose=False) out_eval = model.evaluate(X_test, y_test, batch_size=32, verbose=False) assert out_pred.shape == (50, 1), 'Prediction shape does not match' assert out_eval.shape == (), 'Shape of evaluation does not match' - assert out_eval > 0, 'Evaluation value does not meet criteria: {}'.format(out_eval) + assert out_eval > 0, ( + 'Evaluation value does not meet 
criteria: {}'.format(out_eval))
 
     # test slicing for shortened array
     assert len(X_train[0:]) == len(X_train), 'Incorrect shape for sliced data'
 
@@ -101,9 +105,22 @@ def test_io_utils(in_tmpdir):
 
     # test normalizer
     normalizer = lambda x: x + 1
-    normalized_X_train = HDF5Matrix(h5_path, 'my_data', start=0, end=150, normalizer=normalizer)
+    normalized_X_train = HDF5Matrix(h5_path, 'my_data', start=0, end=150,
+                                    normalizer=normalizer)
     assert np.isclose(normalized_X_train[0][0], X_train[0][0] + 1)
 
+    # test resizing normalizer
+    normalizer_rs = lambda x: x[:, ::2]
+    normalized_rs_X_train = HDF5Matrix(h5_path, 'my_data', start=0, end=150,
+                                       normalizer=normalizer_rs)
+    assert (normalized_rs_X_train.shape[1] == 5)
+
+    # test dtype changing normalizer
+    normalizer_dtype = lambda x: x.astype(np.uint8)
+    normalized_dtype_X_train = HDF5Matrix(h5_path, 'my_data', start=0, end=150,
+                                          normalizer=normalizer_dtype)
+    assert (normalized_dtype_X_train.dtype == np.uint8)
+
     os.remove(h5_path)
 
 
diff --git a/tests/keras/utils/layer_utils_test.py b/tests/keras/utils/layer_utils_test.py
index 7228f19f95f..9f7d8ee53aa 100644
--- a/tests/keras/utils/layer_utils_test.py
+++ b/tests/keras/utils/layer_utils_test.py
@@ -56,7 +56,8 @@ def get_model(shape, data_format):
 
     # Test equivalence of convert_dense_weights_data_format
     out1 = model1.predict(x)
-    layer_utils.convert_dense_weights_data_format(model1.layers[2], prev_shape, target_data_format)
+    layer_utils.convert_dense_weights_data_format(
+        model1.layers[2], prev_shape, target_data_format)
     for (src, dst) in zip(model1.layers, model2.layers):
         dst.set_weights(src.get_weights())
     out2 = model2.predict(transpose(x))
diff --git a/tests/keras/utils/multi_gpu_test.py b/tests/keras/utils/multi_gpu_test.py
index 07550b7bd03..1f8f59737d7 100644
--- a/tests/keras/utils/multi_gpu_test.py
+++ b/tests/keras/utils/multi_gpu_test.py
@@ -272,5 +272,19 @@ def multi_gpu_application_folder_generator_benchmark():
         print('%d gpus training:' % i, total_time)
 
 
+@keras_test
+def test_multi_gpu_with_multi_input_layers():
+    inputs = keras.Input((4, 3))
+    init_state = keras.Input((3,))
+    outputs = keras.layers.SimpleRNN(
+        3, return_sequences=True)(inputs, initial_state=init_state)
+    x = [np.random.randn(2, 4, 3), np.random.randn(2, 3)]
+    y = np.random.randn(2, 4, 3)
+    model = keras.models.Model([inputs, init_state], outputs)
+    parallel_model = multi_gpu_model(model, 2)
+    parallel_model.compile(loss='mean_squared_error', optimizer='adam')
+    parallel_model.train_on_batch(x, y)
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
diff --git a/tests/keras/utils/vis_utils_test.py b/tests/keras/utils/vis_utils_test.py
index 0c74344d415..bdcbdf530ef 100644
--- a/tests/keras/utils/vis_utils_test.py
+++ b/tests/keras/utils/vis_utils_test.py
@@ -13,7 +13,7 @@
 
 def test_plot_model():
     model = Sequential()
-    model.add(Conv2D(filters=2, kernel_size=(2, 3), input_shape=(3, 5, 5), name='conv'))
+    model.add(Conv2D(2, kernel_size=(2, 3), input_shape=(3, 5, 5), name='conv'))
     model.add(Flatten(name='flat'))
     model.add(Dense(5, name='dense1'))
     vis_utils.plot_model(model, to_file='model1.png', show_layer_names=False)
diff --git a/tests/keras/wrappers/scikit_learn_test.py b/tests/keras/wrappers/scikit_learn_test.py
index abb4fb4e243..c20548c8998 100644
--- a/tests/keras/wrappers/scikit_learn_test.py
+++ b/tests/keras/wrappers/scikit_learn_test.py
@@ -76,7 +76,8 @@ def __call__(self, hidden_dims):
 
 
 def assert_classification_works(clf):
-    clf.fit(X_train, y_train, sample_weight=np.ones(X_train.shape[0]), batch_size=batch_size, epochs=epochs)
+    clf.fit(X_train, y_train, sample_weight=np.ones(X_train.shape[0]),
+            batch_size=batch_size, epochs=epochs)
 
     score = clf.score(X_train, y_train, batch_size=batch_size)
     assert np.isscalar(score) and np.isfinite(score)
@@ -181,11 +182,15 @@ def assert_regression_works(reg):
 
 # Usage of sklearn's grid_search
 # from sklearn import grid_search
-# parameters = dict(hidden_dims = [20, 30], batch_size=[64, 128], epochs=[2], verbose=[0])
+# parameters = dict(hidden_dims = [20, 30], batch_size=[64, 128],
+#                   epochs=[2], verbose=[0])
 # classifier = Inherit_class_build_fn_clf()
 # clf = grid_search.GridSearchCV(classifier, parameters)
 # clf.fit(X_train, y_train)
-# parameters = dict(hidden_dims = [20, 30], batch_size=[64, 128], epochs=[2], verbose=[0])
+# parameters = dict(hidden_dims = [20, 30], batch_size=[64, 128],
+#                   epochs=[2], verbose=[0])
 # regressor = Inherit_class_build_fn_reg()
-# reg = grid_search.GridSearchCV(regressor, parameters, scoring='mean_squared_error', n_jobs=1, cv=2, verbose=2)
+# reg = grid_search.GridSearchCV(regressor, parameters,
+#                                scoring='mean_squared_error',
+#                                n_jobs=1, cv=2, verbose=2)
 # reg.fit(X_train_reg, y_train_reg)
diff --git a/tests/test_documentation.py b/tests/test_documentation.py
index 4020dab7967..e5d02e109e9 100644
--- a/tests/test_documentation.py
+++ b/tests/test_documentation.py
@@ -6,8 +6,11 @@
 
 import pytest
 
-modules = ['keras.layers', 'keras.models', 'keras', 'keras.backend.tensorflow_backend', 'keras.preprocessing.image',
-           'keras.preprocessing.text']
+modules = ['keras.layers', 'keras.models', 'keras',
+           'keras.backend.tensorflow_backend', 'keras.engine',
+           'keras.wrappers', 'keras.utils',
+           'keras.callbacks', 'keras.activations',
+           'keras.losses', 'keras.optimizers']
 accepted_name = ['from_config']
 accepted_module = ['keras.legacy.layers', 'keras.utils.generic_utils']
 
diff --git a/tests/test_model_saving.py b/tests/test_model_saving.py
index 0a68dbecceb..92555e52b71 100644
--- a/tests/test_model_saving.py
+++ b/tests/test_model_saving.py
@@ -13,6 +13,7 @@
 from keras.layers import Embedding
 from keras.layers import Conv2D, Flatten
 from keras.layers import Input, InputLayer
+from keras.initializers import Constant
 from keras import optimizers
 from keras import losses
 from keras import metrics
@@ -302,7 +303,6 @@ def test_loading_weights_by_name_and_reshape():
     model.load_weights(fname, by_name=False, reshape=False)
     model.load_weights(fname, by_name=False, reshape=True)
     model.load_weights(fname, by_name=True, reshape=True)
-    os.remove(fname)
 
     out2 = model.predict(x)
     assert_allclose(np.squeeze(out), np.squeeze(out2), atol=1e-05)
@@ -313,6 +313,35 @@ def test_loading_weights_by_name_and_reshape():
             if old_weights[i]:
                 assert_allclose(old_weights[i][j], new_weights[j], atol=1e-05)
 
+    # delete and recreate model with `use_bias=False`
+    del(model)
+    model = Sequential()
+    model.add(Conv2D(2, (1, 1), input_shape=(1, 1, 1), use_bias=False, name='rick'))
+    model.add(Flatten())
+    model.add(Dense(3, name='morty'))
+    with pytest.raises(ValueError,
+                       match=r'.* expects [0-9]+ .* but the saved .* [0-9]+ .*'):
+        model.load_weights(fname)
+    with pytest.raises(ValueError,
+                       match=r'.* expects [0-9]+ .* but the saved .* [0-9]+ .*'):
+        model.load_weights(fname, by_name=True)
+    with pytest.warns(UserWarning,
+                      match=r'Skipping loading .* due to mismatch .*'):
+        model.load_weights(fname, by_name=True, skip_mismatch=True)
+
+    # delete and recreate model with `filters=10`
+    del(model)
+    model = Sequential()
+    model.add(Conv2D(10, (1, 1), input_shape=(1, 1, 1), name='rick'))
+    with pytest.raises(ValueError,
+                       match=r'.* has shape .* but the saved .* shape .*'):
+        model.load_weights(fname, by_name=True)
+    with pytest.raises(ValueError,
+                       match=r'.* load .* [0-9]+ layers into .* [0-9]+ layers.'):
+        model.load_weights(fname)
+
+    os.remove(fname)
+
 
 @keras_test
 def test_loading_weights_by_name_2():
@@ -622,6 +651,21 @@ def test_saving_recurrent_layer_without_bias():
     os.remove(fname)
 
 
+@keras_test
+def test_saving_constant_initializer_with_numpy():
+    """Test saving and loading a model whose Constant initializer
+    is built from a numpy ndarray."""
+    model = Sequential()
+    model.add(Dense(2, input_shape=(3,), kernel_initializer=Constant(np.ones((3, 2)))))
+    model.add(Dense(3))
+    model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+
+    _, fname = tempfile.mkstemp('.h5')
+    save_model(model, fname)
+    model = load_model(fname)
+    os.remove(fname)
+
+
 @keras_test
 @pytest.mark.parametrize('implementation', [1, 2], ids=['impl1', 'impl2'])
 @pytest.mark.parametrize('bidirectional', [False, True], ids=['single', 'bidirectional'])