diff --git a/.gitignore b/.gitignore
index 479317a..1d8de95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,7 +61,7 @@ instance/
 .scrapy
 
 # Sphinx documentation
-docs/_build/
+#docs/_build/
 
 # PyBuilder
 target/
diff --git a/data_engine/prepare_data.py b/data_engine/prepare_data.py
index dfb8420..a656cc4 100644
--- a/data_engine/prepare_data.py
+++ b/data_engine/prepare_data.py
@@ -7,7 +7,7 @@
 def update_dataset_from_file(ds,
                              input_text_filename,
                              params,
-                             splits=list('val'),
+                             splits=list(['val']),
                              output_text_filename=None,
                              remove_outputs=False,
                              compute_state_below=False):
@@ -214,9 +214,9 @@ def keep_n_captions(ds, repeat, n=1, set_names=None):
     """
     Keeps only n captions per image and stores the rest in dictionaries for a later evaluation
     :param ds: Dataset object
-    :param repeat:
-    :param n:
-    :param set_names:
+    :param repeat: Number of input samples per output.
+    :param n: Number of outputs (captions) to keep per sample.
+    :param set_names: List of set names to which the reduction is applied.
     :return:
     """
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..ca5a939
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,225 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html        to make standalone HTML files"
+	@echo "  dirhtml     to make HTML files named index.html in directories"
+	@echo "  singlehtml  to make a single large HTML file"
+	@echo "  pickle      to make pickle files"
+	@echo "  json        to make JSON files"
+	@echo "  htmlhelp    to make HTML files and a HTML help project"
+	@echo "  qthelp      to make HTML files and a qthelp project"
+	@echo "  applehelp   to make an Apple Help Book"
+	@echo "  devhelp     to make HTML files and a Devhelp project"
+	@echo "  epub        to make an epub"
+	@echo "  epub3       to make an epub3"
+	@echo "  latex       to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf    to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja  to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text        to make text files"
+	@echo "  man         to make manual pages"
+	@echo "  texinfo     to make Texinfo files"
+	@echo "  info        to make Texinfo files and run them through makeinfo"
+	@echo "  gettext     to make PO message catalogs"
+	@echo "  changes     to make an overview of all changed/added/deprecated items"
+	@echo "  xml         to make Docutils-native XML files"
+	@echo "  pseudoxml   to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck   to check all external links for integrity"
+	@echo "  doctest     to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage    to run coverage check of the documentation (if enabled)"
+	@echo "  dummy       to check syntax errors of document sources"
+
+.PHONY: clean
+clean:
+	rm -rf $(BUILDDIR)/*
+
+.PHONY: html
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+ +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/NMT-Keras.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/NMT-Keras.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/NMT-Keras" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/NMT-Keras" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." 
+ +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..6ceecac --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,343 @@ +# -*- coding: utf-8 -*- + +import os +import sys +sys.path.insert(0, os.path.abspath('../../')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' +import sphinx_rtd_theme + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.mathjax', + 'sphinx.ext.githubpages', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['ntemplates'] + +edit_on_github_project = 'lvapeab/nmt-keras' +edit_on_github_branch = 'master' + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'NMT-Keras' +copyright = u'2017, Álvaro Peris' +author = u'Álvaro Peris' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'0.0.1' +# The full version, including alpha/beta/rc tags. +release = u'0.0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. 
+# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = { + 'collapse_navigation': False, + 'display_version': False, + 'navigation_depth': 3, +} + +html_context = { + 'display_github': True, + 'github_repo': "nmt-keras", + 'github_user': "lvapeab", + 'github_version': "master", + 'conf_py_path': "/docs/source/" +} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The name for this set of Sphinx documents. +# " v documentation" by default. +# +html_title = u'NMT-Keras' + +# A shorter title for the navigation bar. Default is the same as html_title. +# +html_short_title = u'NMT-Keras documentation' + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['nstatic'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. 
+# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +html_domain_indices = True + +# If false, no index is generated. +# +html_use_index = True + +# If true, the index is split into individual pages for each letter. +# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +html_show_sourcelink = False + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'NMT-Kerasdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'NMT-Keras.tex', u'NMT-Keras Documentation', + u'Álvaro Peris', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# It false, will not define \strong, \code, itleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added +# packages. +# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. 
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'nmt-keras', u'NMT-Keras Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'NMT-Keras', u'NMT-Keras Documentation',
+     author, 'NMT-Keras', 'Neural Machine Translation with Keras',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff --git a/docs/source/data_engine.rst b/docs/source/data_engine.rst
new file mode 100644
index 0000000..432bc3c
--- /dev/null
+++ b/docs/source/data_engine.rst
@@ -0,0 +1,23 @@
+data_engine package
+===================
+
+Submodules
+----------
+
+
+prepare_data module
+-------------------
+
+.. automodule:: data_engine.prepare_data
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+Module contents
+---------------
+
+.. automodule:: data_engine
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/source/help.rst b/docs/source/help.rst
new file mode 100644
index 0000000..e8678ef
--- /dev/null
+++ b/docs/source/help.rst
@@ -0,0 +1,24 @@
+Contact
+=======
+
+If you have any trouble using NMT-Keras, please drop an email to: lvapeab@prhlt.upv.es
+
+Acknowledgement
+^^^^^^^^^^^^^^^
+
+Much of this library has been developed together with `Marc Bolaños`_ for other multimodal projects.
+
+Related projects
+^^^^^^^^^^^^^^^^
+
+To see other projects following the philosophy of NMT-Keras, take a look at:
+
+* TMA_: for egocentric captioning based on temporally-linked sequences.
+* VIBIKNet_: for visual question answering.
+* ABiViRNet_: for video description.
+* `Sentence SelectioNN`_: for sentence classification and selection.
+
+.. _Marc Bolaños: https://github.com/MarcBS
+.. _TMA: https://github.com/MarcBS/TMA
+.. _VIBIKNet: https://github.com/MarcBS/VIBIKNet
+.. _ABiViRNet: https://github.com/lvapeab/ABiViRNet
+.. _Sentence SelectioNN: https://github.com/lvapeab/sentence-selectioNN
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..a076075
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,48 @@
+NMT-Keras
+=========
+
+Neural Machine Translation with Keras (+ Theano backend).
+
+.. image:: ../../examples/documentation/attention_nmt_model.png
+   :scale: 80 %
+   :alt: alternate text
+   :align: left
+
+Features
+********
+
+ * Attention model over the input sequence of annotations.
+ * Peeked decoder: the previously generated word is an input of the current timestep.
+ * Beam search decoding.
+ * Ensemble decoding.
+ * Support for GRU/LSTM networks.
+ * Multilayered residual GRU/LSTM networks.
+ * N-best list generation (as a byproduct of the beam search process).
+ * Unknown words replacement.
 * Use of pretrained (Glove_ or Word2Vec_) word embedding vectors.
+ * MLPs for initializing the RNN hidden and memory state.
+ * Spearmint_ wrapper for hyperparameter optimization.
+
+.. _Spearmint: https://github.com/HIPS/Spearmint
+.. _Glove: http://nlp.stanford.edu/projects/glove/
+.. _Word2Vec: https://code.google.com/archive/p/word2vec/
+
+Guide
+=====
+.. toctree::
+   :maxdepth: 2
+
+   requirements
+   usage
+   resources
+   tutorial
+   modules
+   help
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
new file mode 100644
index 0000000..8b3c75d
--- /dev/null
+++ b/docs/source/modules.rst
@@ -0,0 +1,9 @@
+Modules
+=======
+
+.. toctree::
+   :maxdepth: 4
+
+   nmt-keras
+   data_engine
+   utils
\ No newline at end of file
diff --git a/docs/source/nmt-keras.rst b/docs/source/nmt-keras.rst
new file mode 100644
index 0000000..6e6f268
--- /dev/null
+++ b/docs/source/nmt-keras.rst
@@ -0,0 +1,30 @@
+nmt-keras package
+=================
+
+Submodules
+----------
+
+main module
+-----------
+
+.. automodule:: main
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+model_zoo module
+----------------
+
+.. automodule:: model_zoo
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+Module contents
+---------------
+
+.. automodule:: nmt-keras
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/source/requirements.rst b/docs/source/requirements.rst
new file mode 100644
index 0000000..32b7eef
--- /dev/null
+++ b/docs/source/requirements.rst
@@ -0,0 +1,12 @@
+Requirements
+============
+
+ - Our version of Keras_.
+ - `Multimodal Keras Wrapper`_. See the documentation_ and tutorial_.
+ - Coco-caption_ evaluation package (only required to perform evaluation).
+
+.. _Keras: https://github.com/MarcBS/keras
+.. _Multimodal Keras Wrapper: https://github.com/lvapeab/multimodal_keras_wrapper
+.. _documentation: http://marcbs.github.io/staged_keras_wrapper/
+.. _tutorial: http://marcbs.github.io/multimodal_keras_wrapper/tutorial.html
+.. _Coco-caption: https://github.com/lvapeab/coco-caption
\ No newline at end of file
diff --git a/docs/source/resources.rst b/docs/source/resources.rst
new file mode 100644
index 0000000..a6ff0ba
--- /dev/null
+++ b/docs/source/resources.rst
@@ -0,0 +1,24 @@
+Resources
+=========
+
+- Overview_ of an attentional NMT system.
+
+- NMT-Keras step-by-step guide (iPython_ and html_ versions): tutorials for running this library. They are expected to be followed in order:
+
+  1. `Dataset setup`_: Shows how to invoke and configure a Dataset instance for a translation problem.
+  2. `Training tutorial`_: Shows how to call a translation model, link it with the dataset object and construct callbacks for monitoring the training.
+  3. `Decoding tutorial`_: Shows how to call a trained translation model and use it to translate new text.
+  4. `NMT model tutorial`_: Shows how to build a state-of-the-art NMT model with Keras in a few (~50) lines.
+
+.. _Overview: https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/neural_machine_translation.pdf
+.. _iPython: https://github.com/lvapeab/nmt-keras/blob/master/examples
+.. _html: ./tutorial.html
+.. _Dataset setup: https://github.com/lvapeab/nmt-keras/blob/master/examples/1_dataset_tutorial.ipynb
+.. _Training tutorial: https://github.com/lvapeab/nmt-keras/blob/master/examples/2_training_tutorial.ipynb
+.. _Decoding tutorial: https://github.com/lvapeab/nmt-keras/blob/master/examples/3_decoding_tutorial.ipynb
+.. _NMT model tutorial: https://github.com/lvapeab/nmt-keras/blob/master/examples/4_nmt_model_tutorial.ipynb
+
+
+
+
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
new file mode 100644
index 0000000..e720037
--- /dev/null
+++ b/docs/source/tutorial.rst
@@ -0,0 +1,577 @@
+Tutorials
+=========
+
+
+This page contains some examples and tutorials showing how the library works. All tutorials have an `iPython notebook version`_.
+
+.. _iPython notebook version: https://github.com/lvapeab/nmt-keras/blob/master/examples
+
+Almost every variable representing a model hyperparameter has been intentionally hardcoded in the tutorials,
+aiming to facilitate readability. In a real execution, these values are taken from the `config.py` file.
+
+All tutorials have been executed from the root `nmt-keras` folder. These tutorials are basically a split version of the execution pipeline of the library: if you run `python main.py`, you'll execute almost the same steps as tutorials 1, 2 and 4.
+
+The translation task is *EuTrans* (`Amengual et al.`_), a toy task mainly used for debugging purposes.
+
+.. _Amengual et al.: http://link.springer.com/article/10.1023/A:1011116115948
+
+
+Dataset tutorial
+****************
+
+First, we'll create a Dataset_ instance, in order to properly manage the data. The Dataset_ object belongs to the `Multimodal Keras Wrapper`_ library.
+Let's make some imports and create an empty Dataset_ instance::
+
+    from keras_wrapper.dataset import Dataset, saveDataset
+    from data_engine.prepare_data import keep_n_captions
+    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)
+
+.. _Multimodal Keras Wrapper: https://github.com/lvapeab/multimodal_keras_wrapper
+.. _Dataset: http://marcbs.github.io/multimodal_keras_wrapper/tutorial.html#basic-components
+
+
+Now that we have the empty Dataset_, we must indicate its inputs and outputs. In our case, we'll have two different inputs and one single output:
+
+1. Outputs:
+
+   **target_text**: Sentences in the target language.
+
+2. Inputs:
+
+   **source_text**: Sentences in the source language.
+
+   **state_below**: Sentences in the target language, but shifted one position to the right (for teacher-forcing training of the model).
+
+For setting up the outputs, we use the setOutput function with the appropriate parameters. Note that, when we are building the dataset for the training split, we build the vocabulary (up to 30000 words)::
+
+    ds.setOutput('examples/EuTrans/training.en',
+                 'train',
+                 type='text',
+                 id='target_text',
+                 tokenization='tokenize_none',
+                 build_vocabulary=True,
+                 pad_on_batch=True,
+                 sample_weights=True,
+                 max_text_len=30,
+                 max_words=30000,
+                 min_occ=0)
+
+    ds.setOutput('examples/EuTrans/dev.en',
+                 'val',
+                 type='text',
+                 id='target_text',
+                 pad_on_batch=True,
+                 tokenization='tokenize_none',
+                 sample_weights=True,
+                 max_text_len=30,
+                 max_words=0)
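+
+As a quick sanity check, we can inspect the vocabulary we have just built. This is just an
+illustrative sketch: it relies on the ``vocabulary`` and ``vocabulary_len`` attributes of the
+Dataset_ object (which we will use again in the next tutorials); exact key names may vary
+across wrapper versions::
+
+    # Number of unique words indexed for the target language
+    print(ds.vocabulary_len['target_text'])
+    # Mapping from word indices back to words (reused later for decoding predictions)
+    idx2word = ds.vocabulary['target_text']['idx2words']
+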
+Similarly, we introduce the source text data with the setInput function.
+Again, when building the training split, we must construct the vocabulary::
+
+    ds.setInput('examples/EuTrans/training.es',
+                'train',
+                type='text',
+                id='source_text',
+                pad_on_batch=True,
+                tokenization='tokenize_none',
+                build_vocabulary=True,
+                fill='end',
+                max_text_len=30,
+                max_words=30000,
+                min_occ=0)
+    ds.setInput('examples/EuTrans/dev.es',
+                'val',
+                type='text',
+                id='source_text',
+                pad_on_batch=True,
+                tokenization='tokenize_none',
+                fill='end',
+                max_text_len=30,
+                min_occ=0)
+
+
+...and for the `state_below` data. Note that: 1) the offset flag is set to 1, which means that the text will be shifted one position to the right; and 2) at sampling time, we won't have this input, hence we 'hack' the dataset by inserting an artificial input of type 'ghost' for the validation split::
+
+    ds.setInput('examples/EuTrans/training.en',
+                'train',
+                type='text',
+                id='state_below',
+                required=False,
+                tokenization='tokenize_none',
+                pad_on_batch=True,
+                build_vocabulary='target_text',
+                offset=1,
+                fill='end',
+                max_text_len=30,
+                max_words=30000)
+    ds.setInput(None,
+                'val',
+                type='ghost',
+                id='state_below',
+                required=False)
+
+
+Next, we match the references with the inputs, in order to evaluate against the raw references::
+
+    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])
+
+
+Finally, we can save our dataset instance, for using it in other experiments::
+
+    saveDataset(ds, 'datasets')
+
+
+Training tutorial
+*****************
+Now, we'll create and train a Neural Machine Translation (NMT) model.
+We'll build the so-called `GroundHogModel`, which is defined in the `model_zoo.py` file.
+If you followed the prior tutorial, you should have a dataset instance. Otherwise, you should follow that notebook first.
+
+So, let's go! First, we make some imports, load the default parameters and the dataset::
+
+    from config import load_parameters
+    from model_zoo import TranslationModel
+    import utils
+    from keras_wrapper.cnn_model import loadModel
+    from keras_wrapper.dataset import loadDataset
+    params = load_parameters()
+    dataset = loadDataset('datasets/Dataset_tutorial_dataset.pkl')
+
+Since the number of words in the dataset may be unknown beforehand, we must update the params information according to the dataset instance::
+
+    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
+    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']
+
+Now, we create a `TranslationModel` object: an instance of a `Model_Wrapper`_ object from the `Multimodal Keras Wrapper`_.
+We specify the type of the model (`GroundHogModel`) and the vocabularies from the Dataset_::
+
+    nmt_model = TranslationModel(params,
+                                 model_type='GroundHogModel',
+                                 model_name='tutorial_model',
+                                 vocabularies=dataset.vocabulary,
+                                 store_path='trained_models/tutorial_model/',
+                                 verbose=True)
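+
+If we want to double-check the network that has just been built, we can print a layer-by-layer
+summary. This is optional and assumes that the wrapper exposes the underlying Keras model
+through its ``model`` attribute (as the `Multimodal Keras Wrapper`_ does)::
+
+    nmt_model.model.summary()  # Prints each layer, its output shape and its parameter count
+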
+.. _Model_Wrapper: http://marcbs.github.io/multimodal_keras_wrapper/tutorial.html#basic-components
+
+Now, we must define the inputs and outputs mapping from our Dataset instance to our model::
+
+    inputMapping = dict()
+    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
+        pos_source = dataset.ids_inputs.index(id_in)
+        id_dest = nmt_model.ids_inputs[i]
+        inputMapping[id_dest] = pos_source
+    nmt_model.setInputsMapping(inputMapping)
+
+    outputMapping = dict()
+    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
+        pos_target = dataset.ids_outputs.index(id_out)
+        id_dest = nmt_model.ids_outputs[i]
+        outputMapping[id_dest] = pos_target
+    nmt_model.setOutputsMapping(outputMapping)
+
+
+We can add some callbacks for controlling the training (e.g. sampling every N updates, early stopping, learning rate annealing...).
+For instance, let's build a `PrintPerformanceMetricOnEpochEndOrEachNUpdates` callback. Every 2 epochs, it will compute the 'coco' scores on the development set.
+We need to pass some variables to the callback (in the extra_vars dictionary)::
+
+    from keras_wrapper.extra.callbacks import *
+    extra_vars = {'language': 'en',
+                  'n_parallel_loaders': 8,
+                  'tokenize_f': eval('dataset.' + 'tokenize_none'),
+                  'beam_size': 12,
+                  'maxlen': 50,
+                  'model_inputs': ['source_text', 'state_below'],
+                  'model_outputs': ['target_text'],
+                  'dataset_inputs': ['source_text', 'state_below'],
+                  'dataset_outputs': ['target_text'],
+                  'normalize': True,
+                  'alpha_factor': 0.6,
+                  'val': {'references': dataset.extra_variables['val']['target_text']}
+                  }
+    vocab = dataset.vocabulary['target_text']['idx2words']
+    callbacks = []
+    callbacks.append(PrintPerformanceMetricOnEpochEnd(nmt_model,
+                                                      dataset,
+                                                      gt_id='target_text',
+                                                      metric_name=['coco'],
+                                                      set_name=['val'],
+                                                      batch_size=50,
+                                                      each_n_epochs=2,
+                                                      extra_vars=extra_vars,
+                                                      reload_epoch=0,
+                                                      is_text=True,
+                                                      index2word_y=vocab,
+                                                      sampling_type='max_likelihood',
+                                                      beam_search=True,
+                                                      save_path=nmt_model.model_path,
+                                                      start_eval_on_epoch=0,
+                                                      write_samples=True,
+                                                      write_type='list',
+                                                      save_each_evaluation=True,
+                                                      verbose=True))
+
+
+Now we are almost ready to train. We set up some training parameters...::
+
+    training_params = {'n_epochs': 100,
+                       'batch_size': 40,
+                       'maxlen': 30,
+                       'epochs_for_save': 1,
+                       'verbose': 0,
+                       'eval_on_sets': [],
+                       'n_parallel_loaders': 8,
+                       'extra_callbacks': callbacks,
+                       'reload_epoch': 0,
+                       'epoch_offset': 0}
+
+
+And train!::
+
+    nmt_model.trainNet(dataset, training_params)
+
+
+For a description of the training output, refer to the `typical output`_ document.
+
+.. _typical output: https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/typical_output.md
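+
+If training gets interrupted, it can be resumed from a saved checkpoint. A minimal sketch,
+assuming we stopped after epoch 4; it reuses the ``reload_epoch`` and ``epoch_offset`` keys
+shown above and the ``loadModel`` function already imported at the beginning of this tutorial::
+
+    nmt_model = loadModel('trained_models/tutorial_model', 4)  # Reload the epoch-4 checkpoint
+    training_params['reload_epoch'] = 4
+    training_params['epoch_offset'] = 4
+    nmt_model.trainNet(dataset, training_params)  # Continue training from epoch 5 on
+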
+Decoding tutorial
+*****************
+
+
+Now, we'll load a trained Neural Machine Translation (NMT) model from disk and apply it for translating new text. This is done by the sample_ensemble_ script.
+
+This tutorial assumes that you followed both previous tutorials. In this case, we want to translate the 'test' split of our dataset.
+
+As before, let's import some stuff and load the dataset instance::
+
+    from config import load_parameters
+    from data_engine.prepare_data import keep_n_captions
+    from keras_wrapper.cnn_model import loadModel
+    from keras_wrapper.dataset import loadDataset
+    params = load_parameters()
+    dataset = loadDataset('datasets/Dataset_tutorial_dataset.pkl')
+
+
+Since we want to translate a new data split ('test'), we must add it to the dataset instance, just as we did before (in the first tutorial).
+If we also had the references of the test split and wanted to evaluate on it, we could add them to the dataset as well. Note that this is not mandatory and we could just predict without evaluating::
+
+    dataset.setInput('examples/EuTrans/test.es',
+                     'test',
+                     type='text',
+                     id='source_text',
+                     pad_on_batch=True,
+                     tokenization='tokenize_none',
+                     fill='end',
+                     max_text_len=100,
+                     min_occ=0)
+
+    dataset.setInput(None,
+                     'test',
+                     type='ghost',
+                     id='state_below',
+                     required=False)
+
+.. _sample_ensemble: https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/ensembling_tutorial.md
+
+
+Now, let's load the translation model. Suppose we want to load the model saved at the end of epoch 4::
+
+    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
+    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
+    # Load model
+    nmt_model = loadModel('trained_models/tutorial_model', 4)
+    nmt_model.setOptimizer()
+
+
+Once we have loaded the model, we just have to invoke the sampling method (in this case, the beam search algorithm) for the 'test' split::
+
+    params_prediction = {'batch_size': 50,
+                         'n_parallel_loaders': 8,
+                         'predict_on_sets': ['test'],
+                         'beam_size': 12,
+                         'maxlen': 50,
+                         'model_inputs': ['source_text', 'state_below'],
+                         'model_outputs': ['target_text'],
+                         'dataset_inputs': ['source_text', 'state_below'],
+                         'dataset_outputs': ['target_text'],
+                         'normalize': True,
+                         'alpha_factor': 0.6
+                         }
+    predictions = nmt_model.predictBeamSearchNet(dataset, params_prediction)['test']
+
+
+At this point, the variable 'predictions' contains the indices of the words of the hypotheses. We must decode them into words. To do this, we'll use the dictionary stored in the dataset object::
+
+    vocab = dataset.vocabulary['target_text']['idx2words']
+    predictions = nmt_model.decode_predictions_beam_search(predictions,
+                                                           vocab,
+                                                           verbose=params['VERBOSE'])
+
+Finally, we store the system hypotheses::
+
+    filepath = nmt_model.model_path + '/' + 'test' + '_sampling.pred'  # results file
+    from keras_wrapper.extra.read_write import list2file
+    list2file(filepath, predictions)
+
+
+If we have the references for this split, we can also evaluate the performance of our system on it. First, we must add them to the dataset object::
+
+    # In case we had the references of this split, we could also load the split and evaluate on it
+    dataset.setOutput('examples/EuTrans/test.en',
+                      'test',
+                      type='text',
+                      id='target_text',
+                      pad_on_batch=True,
+                      tokenization='tokenize_none',
+                      sample_weights=True,
+                      max_text_len=30,
+                      max_words=0)
+    keep_n_captions(dataset, repeat=1, n=1, set_names=['test'])
+
+
+Next, we call the evaluation system: the Coco-caption_ package. Although its main usage is for multimodal captioning, we can use it in machine translation::
+
+    from keras_wrapper.extra import evaluation
+    metric = 'coco'
+    # Apply sampling
+    extra_vars = dict()
+    extra_vars['tokenize_f'] = eval('dataset.' + 'tokenize_none')
+    extra_vars['test'] = dict()
+    extra_vars['test']['references'] = dataset.extra_variables['test']['target_text']
+    metrics = evaluation.select[metric](pred_list=predictions,
+                                        verbose=1,
+                                        extra_vars=extra_vars,
+                                        split='test')
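+
+The returned ``metrics`` object is a dictionary mapping metric names to values. For instance,
+we can print all of them (illustrative only; the exact keys come from the Coco-caption package
+and typically include 'Bleu_1'...'Bleu_4', 'METEOR', 'ROUGE_L' and 'CIDEr')::
+
+    for metric_name, value in metrics.items():
+        print(metric_name, ':', value)
+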
+.. _Coco-caption: https://github.com/lvapeab/coco-caption
+
+
+NMT model tutorial
+******************
+
+
+
+In this module, we are going to create an encoder-decoder model with:
+
+ * A bidirectional GRU encoder and a GRU decoder.
+ * An attention model.
+ * The previously generated word fed back to the decoder.
+ * MLPs for initializing the initial RNN state.
+ * Skip connections from inputs to outputs.
+ * Beam search.
+
+As usual, first we import the necessary stuff::
+
+    from keras.layers import *
+    from keras.models import model_from_json, Model
+    from keras.optimizers import Adam, RMSprop, Nadam, Adadelta, SGD, Adagrad, Adamax
+    from keras.regularizers import l2
+    from keras_wrapper.cnn_model import Model_Wrapper
+    from keras_wrapper.extra.regularize import Regularize
+
+And define the dimensions of our model: for instance, a word embedding size of 50 and 100 units in the RNNs.
+The inputs/outputs are defined as in previous tutorials::
+
+    ids_inputs = ['source_text', 'state_below']
+    ids_outputs = ['target_text']
+    word_embedding_size = 50
+    hidden_state_size = 100
+    input_vocabulary_size = 686   # Autoset in the library
+    output_vocabulary_size = 513  # Autoset in the library
+
+Now, let's define our encoder. First, we have to create an Input layer to connect the input text to our model.
+Next, we'll apply a word embedding to the sequence of input indices. This word embedding will feed a Bidirectional GRU network, which will produce our sequence of annotations::
+
+    # 1. Source text input
+    src_text = Input(name=ids_inputs[0],
+                     batch_shape=tuple([None, None]),  # Since the input sequences have variable length, we do not restrict the Input shape
+                     dtype='int32')
+    # 2. Encoder
+    # 2.1. Source word embedding
+    src_embedding = Embedding(input_vocabulary_size, word_embedding_size,
+                              name='source_word_embedding',
+                              mask_zero=True  # Zeroes as mask
+                              )(src_text)
+    # 2.2. BRNN encoder (GRU/LSTM)
+    annotations = Bidirectional(GRU(hidden_state_size,
+                                    return_sequences=True  # Return the full sequence
+                                    ),
+                                name='bidirectional_encoder',
+                                merge_mode='concat')(src_embedding)
+
+
+
+Once we have built the encoder, let's build our decoder.
+First, we have an additional input: the previously generated word (the so-called state_below). We introduce it by means of an Input layer and a (target language) word embedding::
+
+    # 3. Decoder
+    # 3.1.1. Previously generated words as inputs for training -> Teacher forcing
+    next_words = Input(name=ids_inputs[1], batch_shape=tuple([None, None]), dtype='int32')
+    # 3.1.2. Target word embedding
+    state_below = Embedding(output_vocabulary_size, word_embedding_size,
+                            name='target_word_embedding',
+                            mask_zero=True)(next_words)
+
+
+
+The initial hidden state of the decoder's GRU is initialized by means of an MLP (in this case, single-layered) from the average of the annotations. We also apply the mask to the annotations::
+
+    ctx_mean = MaskedMean()(annotations)
+    annotations = MaskLayer()(annotations)  # We may want the padded annotations
+    initial_state = Dense(hidden_state_size, name='initial_state',
+                          activation='tanh')(ctx_mean)
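+
+In equation form, denoting the sequence of annotations by :math:`a_1, \dots, a_T` and averaging
+over the non-padded timesteps, this initialization computes:
+
+.. math::
+
+    h_0 = \tanh\left(W \cdot \frac{1}{T}\sum_{t=1}^{T} a_t + b\right)
+
+where :math:`W` and :math:`b` are the weights of the `initial_state` Dense layer defined above.
+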
+So, we have the input of our decoder::
+
+    input_attentional_decoder = [state_below, annotations, initial_state]
+
+
+Note that, for a given sample, the sequence of annotations and the initial state are the same, regardless of the decoding time-step.
+In order to save computation time, we build two models: one for training and another one for sampling.
+They will share weights, but the sampling model will be made up of two different sub-models. One (model_init) will compute the sequence of annotations and the initial_state.
+The other one (model_next) will compute a single recurrent step, given the sequence of annotations, the previous hidden state and the words generated up to this moment.
+
+Therefore, we now slightly change the way of declaring layers: we must share layers between the decoding models.
+
+So, let's start by building the attentional-conditional GRU::
+
+    # Define the AttGRUCond function
+    sharedAttGRUCond = AttGRUCond(hidden_state_size,
+                                  return_sequences=True,
+                                  return_extra_variables=True,  # Return attended input and attention weights
+                                  return_states=True  # Return the sequence of hidden states (see discussion above)
+                                  )
+    [proj_h, x_att, alphas, h_state] = sharedAttGRUCond(input_attentional_decoder)  # Apply sharedAttGRUCond to our input
+
+Now, we set skip connections between the input and the output layer. Note that, since we have a temporal dimension because of the RNN decoder, we must apply the layers in a TimeDistributed way.
+Finally, we merge all skip-connections and apply a 'tanh' non-linearity::
+
+    # Define layer function
+    shared_FC_mlp = TimeDistributed(Dense(word_embedding_size, activation='linear',),
+                                    name='logit_lstm')
+    # Apply layer function
+    out_layer_mlp = shared_FC_mlp(proj_h)
+
+    # Define layer function
+    shared_FC_ctx = TimeDistributed(Dense(word_embedding_size, activation='linear'),
+                                    name='logit_ctx')
+    # Apply layer function
+    out_layer_ctx = shared_FC_ctx(x_att)
+    shared_Lambda_Permute = PermuteGeneral((1, 0, 2))
+    out_layer_ctx = shared_Lambda_Permute(out_layer_ctx)
+
+    # Define layer function
+    shared_FC_emb = TimeDistributed(Dense(word_embedding_size, activation='linear'),
+                                    name='logit_emb')
+    # Apply layer function
+    out_layer_emb = shared_FC_emb(state_below)
+
+    additional_output = merge([out_layer_mlp, out_layer_ctx, out_layer_emb], mode='sum', name='additional_input')
+    shared_activation_tanh = Activation('tanh')
+    out_layer = shared_activation_tanh(additional_output)
+
+Now, we'll apply a deep output layer, with a Maxout activation::
+
+    shared_maxout = TimeDistributed(MaxoutDense(word_embedding_size), name='maxout_layer')
+    out_layer = shared_maxout(out_layer)
+
+
+Finally, we apply a softmax function for obtaining a probability distribution over the target vocabulary words at each timestep::
+
+    shared_FC_soft = TimeDistributed(Dense(output_vocabulary_size,
+                                           activation='softmax',
+                                           name='softmax_layer'),
+                                    name=ids_outputs[0])
+    softout = shared_FC_soft(out_layer)
+
+That's all! We have built an NMT model!
+
+NMT models for decoding
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Now, let's build the models required for sampling. Recall that we are building two models: one for encoding the inputs and another one for advancing steps in the decoding stage.
+
+Let's start with model_init. It will take the usual inputs (src_text and state_below) and will output:
+
+1. The vector of probabilities (for timestep 1).
+2. The sequence of annotations (from the encoder).
+3. The current decoder hidden state.
+
+The only restriction here is that the first output must be the output layer (probabilities) of the model::
+
+    model_init = Model(input=[src_text, next_words], output=[softout, annotations, h_state])
+    # Store inputs and outputs names for model_init
+    ids_inputs_init = ids_inputs
+
+    # first output must be the output probs.
+    ids_outputs_init = ids_outputs + ['preprocessed_input', 'next_state']
+
+
+
+Next, we build model_next. It will have the following inputs:
+
+ * Preprocessed input
+ * Previously generated word
+ * Previous hidden state
+
+And the following outputs:
+
+ * Model probabilities
+ * Current hidden state
+
+First, we define the inputs::
+
+    preprocessed_size = hidden_state_size * 2  # Because we have a bidirectional encoder
+    preprocessed_annotations = Input(name='preprocessed_input', shape=tuple([None, preprocessed_size]))
+    prev_h_state = Input(name='prev_state', shape=tuple([hidden_state_size]))
+    input_attentional_decoder = [state_below, preprocessed_annotations, prev_h_state]
+
+
+And now, we build the model, using the functions stored in the 'shared*' variables declared before::
+
+    # Apply decoder
+    [proj_h, x_att, alphas, h_state] = sharedAttGRUCond(input_attentional_decoder)
+    out_layer_mlp = shared_FC_mlp(proj_h)
+    out_layer_ctx = shared_FC_ctx(x_att)
+    out_layer_ctx = shared_Lambda_Permute(out_layer_ctx)
+    out_layer_emb = shared_FC_emb(state_below)
+    additional_output = merge([out_layer_mlp, out_layer_ctx, out_layer_emb], mode='sum', name='additional_input')
+    out_layer = shared_activation_tanh(additional_output)
+    out_layer = shared_maxout(out_layer)
+    softout = shared_FC_soft(out_layer)
+    model_next = Model(input=[next_words, preprocessed_annotations, prev_h_state],
+                       output=[softout, preprocessed_annotations, h_state])
+
+Finally, we store the inputs/outputs of model_next. In addition, we create a couple of dictionaries, matching inputs/outputs from the different models (model_init->model_next, model_next->model_next)::
+
+    # Store inputs and outputs names for model_next
+    # first input must be previous word
+    ids_inputs_next = [ids_inputs[1]] + ['preprocessed_input', 'prev_state']
+    # first output must be the output probs.
+    ids_outputs_next = ids_outputs + ['preprocessed_input', 'next_state']
+
+    # Input -> Output matchings from model_init to model_next and from model_next to model_next
+    matchings_init_to_next = {'preprocessed_input': 'preprocessed_input', 'next_state': 'prev_state'}
+    matchings_next_to_next = {'preprocessed_input': 'preprocessed_input', 'next_state': 'prev_state'}
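+
+To see how the two models fit together, here is an illustrative greedy decoder built on top of
+model_init and model_next. It is only a sketch of the decomposition (the start- and end-of-sentence
+indices are made-up placeholders; the actual decoding used by the library is the beam search
+implemented in the wrapper)::
+
+    import numpy as np
+
+    def greedy_decode(src_seq, max_len=50, bos_index=1, eos_index=0):
+        """Translates a single encoded source sequence of shape (1, source_length)."""
+        prev_word = np.array([[bos_index]])
+        # Encode once: output probabilities, annotations and initial decoder state
+        probs, annotations, state = model_init.predict([src_seq, prev_word])
+        hypothesis = []
+        for _ in range(max_len):
+            next_index = int(np.argmax(probs[0, -1]))  # Pick the most likely word
+            if next_index == eos_index:
+                break
+            hypothesis.append(next_index)
+            prev_word = np.array([[next_index]])
+            # Advance a single recurrent step, reusing the precomputed annotations
+            probs, annotations, state = model_next.predict([prev_word, annotations, state])
+        return hypothesis
+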
+
+And that's all! For using this model together with the facilities provided by the `Multimodal Keras Wrapper`_ library, we should declare the model as a method of a Model_Wrapper class.
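+For illustration, a minimal sketch of what that wrapping looks like (the class/method naming
+follows the TranslationModel / 'GroundHogModel' convention from the training tutorial;
+'MyTranslationModel' is a hypothetical name and the real implementation builds everything
+from `params`)::
+
+    class MyTranslationModel(Model_Wrapper):
+
+        def GroundHogModel(self, params):
+            # Reuse the graph defined in this tutorial; a real implementation
+            # would build src_text, next_words and softout here from `params`.
+            self.model = Model(input=[src_text, next_words], output=[softout])
+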
+A complete example of this, with additional features, can be found at model_zoo.py_.
+
+
+.. _model_zoo.py: https://github.com/lvapeab/nmt-keras/blob/master/model_zoo.py
\ No newline at end of file
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000..bffdac3
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,60 @@
+Usage
+=====
+
+Training
+********
+
+1) Set a training configuration in the config.py_ script. Each parameter is commented. See the `documentation file`_ for further info about each specific hyperparameter. You can also specify the parameters when calling the `main.py`_ script, following the syntax `Key=Value`.
+
+2) Train!::
+
+    python main.py
+
+Decoding
+********
+Once we have our model trained, we can translate new text using the `sample_ensemble.py`_ script. Please refer to the `ensembling tutorial`_ for more details about this script.
+In short, if we want to use the models from the first two epochs to translate the `examples/EuTrans/test.en` file, just run::
+
+    python sample_ensemble.py --models trained_models/tutorial_model/epoch_1 \
+                                       trained_models/tutorial_model/epoch_2 \
+                              --dataset datasets/Dataset_tutorial_dataset.pkl \
+                              --text examples/EuTrans/test.en
+
+Scoring
+*******
+
+The `score.py`_ script can be used to obtain the (-log)probabilities of a parallel corpus. Its syntax is the following::
+
+    python score.py --help
+    usage: Use several translation models for scoring source--target pairs
+           [-h] -ds DATASET [-src SOURCE] [-trg TARGET] [-s SPLITS [SPLITS ...]]
+           [-d DEST] [-v] [-c CONFIG] --models MODELS [MODELS ...]
+    optional arguments:
+        -h, --help            show this help message and exit
+        -ds DATASET, --dataset DATASET
+                              Dataset instance with data
+        -src SOURCE, --source SOURCE
+                              Text file with source sentences
+        -trg TARGET, --target TARGET
+                              Text file with target sentences
+        -s SPLITS [SPLITS ...], --splits SPLITS [SPLITS ...]
+                              Splits to sample. Should be already included into the
+                              dataset object.
+        -d DEST, --dest DEST  File to save scores in
+        -v, --verbose         Be verbose
+        -c CONFIG, --config CONFIG
+                              Config pkl for loading the model configuration. If not
+                              specified, hyperparameters are read from config.py
+        --models MODELS [MODELS ...]
+                              path to the models
+
+
+
+
+.. _documentation file: https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/config.md
+.. _config.py: https://github.com/lvapeab/nmt-keras/blob/master/config.py
+.. _main.py: https://github.com/lvapeab/nmt-keras/blob/master/main.py
+.. _sample_ensemble.py: https://github.com/lvapeab/nmt-keras/blob/master/sample_ensemble.py
+.. _ensembling tutorial: https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/ensembling_tutorial.md
+.. _score.py: https://github.com/lvapeab/nmt-keras/blob/master/score.py
+
diff --git a/docs/source/utils.rst b/docs/source/utils.rst
new file mode 100644
index 0000000..a7a7588
--- /dev/null
+++ b/docs/source/utils.rst
@@ -0,0 +1,46 @@
+utils package
+=============
+
+Submodules
+----------
+
+evaluate_from_file module
+-------------------------
+
+.. automodule:: utils.evaluate_from_file
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+preprocess_binary_word_vectors module
+-------------------------------------
+
+.. automodule:: utils.preprocess_binary_word_vectors
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+preprocess_text_word_vectors module
+-----------------------------------
+
+.. automodule:: utils.preprocess_text_word_vectors
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+utils module
+------------
+
+.. automodule:: utils.utils
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+Module contents
+---------------
+
+..
automodule:: utils + :members: + :undoc-members: + :show-inheritance: diff --git a/examples/1_dataset_tutorial.ipynb b/examples/1_dataset_tutorial.ipynb index 967e00c..2cdec09 100644 --- a/examples/1_dataset_tutorial.ipynb +++ b/examples/1_dataset_tutorial.ipynb @@ -41,8 +41,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Using Theano backend.\n", - "Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5105)\n" + "Using Theano backend.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using cuDNN version 5105 on context None\nMapped name None to device cuda: GeForce GTX 1080 (0000:01:00.0)\n" ] } ], @@ -77,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "collapsed": true }, @@ -86,16 +92,40 @@ "name": "stderr", "output_type": "stream", "text": [ - "[30/11/2016 18:30:49] Creating vocabulary for data with id 'target_text'.\n", - "[30/11/2016 18:30:49] \t Total: 513 unique words in 9900 sentences with a total of 98304 words.\n", - "[30/11/2016 18:30:49] Creating dictionary of 30000 most common words, covering 100.0% of the text.\n", - "[30/11/2016 18:30:49] Loaded \"train\" set outputs of type \"text\" with id \"target_text\" and length 9900.\n", - "[30/11/2016 18:30:49] Loaded \"val\" set outputs of type \"text\" with id \"target_text\" and length 100.\n" + "[26/04/2017 13:48:48] Creating vocabulary for data with id 'target_text'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:48] \t Total: 513 unique words in 9900 sentences with a total of 98304 words.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:48] Creating dictionary of 30000 most common words, covering 100.0% of the text.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:48] Loaded \"train\" set outputs of type \"text\" with id \"target_text\" and length 9900.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:48] Loaded \"val\" set outputs of type \"text\" with id \"target_text\" and length 100.\n" ] } ], "source": [ - "ds.setOutput('examples/EuTrans/DATA/training.en',\n", + "ds.setOutput('examples/EuTrans/training.en',\n", " 'train',\n", " type='text',\n", " id='target_text',\n", @@ -107,7 +137,7 @@ " max_words=30000,\n", " min_occ=0)\n", "\n", - "ds.setOutput('examples/EuTrans/DATA//dev.en',\n", + "ds.setOutput('examples/EuTrans/dev.en',\n", " 'val',\n", " type='text',\n", " id='target_text',\n", @@ -128,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -137,16 +167,40 @@ "name": "stderr", "output_type": "stream", "text": [ - "[30/11/2016 18:30:51] Creating vocabulary for data with id 'source_text'.\n", - "[30/11/2016 18:30:51] \t Total: 686 unique words in 9900 sentences with a total of 96172 words.\n", - "[30/11/2016 18:30:51] Creating dictionary of 30000 most common words, covering 100.0% of the text.\n", - "[30/11/2016 18:30:51] Loaded \"train\" set inputs of type \"text\" with id \"source_text\" and length 9900.\n", - "[30/11/2016 18:30:51] Loaded \"val\" set inputs of type \"text\" with id \"source_text\" and length 100.\n" + "[26/04/2017 13:48:52] Creating vocabulary for data with id 'source_text'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:52] \t Total: 686 unique words in 9900 sentences with a total of 96172 
words.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:52] Creating dictionary of 30000 most common words, covering 100.0% of the text.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:52] Loaded \"train\" set inputs of type \"text\" with id \"source_text\" and length 9900.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:52] Loaded \"val\" set inputs of type \"text\" with id \"source_text\" and length 100.\n" ] } ], "source": [ - "ds.setInput('examples/EuTrans/DATA//training.es',\n", + "ds.setInput('examples/EuTrans/training.es',\n", " 'train',\n", " type='text',\n", " id='source_text',\n", @@ -157,7 +211,7 @@ " max_text_len=30,\n", " max_words=30000,\n", " min_occ=0)\n", - "ds.setInput('examples/EuTrans/DATA//dev.es',\n", + "ds.setInput('examples/EuTrans/dev.es',\n", " 'val',\n", " type='text',\n", " id='source_text',\n", @@ -177,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "collapsed": true }, @@ -186,14 +240,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "[30/11/2016 18:30:52] \tReusing vocabulary named \"target_text\" for data with id \"state_below\".\n", - "[30/11/2016 18:30:52] Loaded \"train\" set inputs of type \"text\" with id \"state_below\" and length 9900.\n", - "[30/11/2016 18:30:52] Loaded \"val\" set inputs of type \"ghost\" with id \"state_below\" and length 100.\n" + "[26/04/2017 13:48:58] \tReusing vocabulary named \"target_text\" for data with id \"state_below\".\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:58] Loaded \"train\" set inputs of type \"text\" with id \"state_below\" and length 9900.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:58] Loaded \"val\" set inputs of type \"ghost\" with id \"state_below\" and length 100.\n" ] } ], "source": [ - "ds.setInput('examples/EuTrans/DATA//training.en',\n", + "ds.setInput('examples/EuTrans/training.en',\n", " 'train',\n", " type='text',\n", " id='state_below',\n", @@ -221,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -230,8 +296,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "[30/11/2016 18:30:54] Keeping 1 captions per input on the val set.\n", - "[30/11/2016 18:30:54] Samples reduced to 100 in val set.\n" + "[26/04/2017 13:48:59] Keeping 1 captions per input on the val set.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:48:59] Samples reduced to 100 in val set.\n" ] } ], @@ -249,25 +321,29 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 7, "metadata": { "collapsed": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:49:01] <<< Saving Dataset instance to datasets/Dataset_tutorial_dataset.pkl ... 
>>>\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[26/04/2017 13:49:01] <<< Dataset instance saved >>>\n" + ] + } + ], "source": [ "saveDataset(ds, 'datasets')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "" - ] } ], "metadata": { diff --git a/examples/2_training_tutorial.ipynb b/examples/2_training_tutorial.ipynb index e7066be..4b55077 100644 --- a/examples/2_training_tutorial.ipynb +++ b/examples/2_training_tutorial.ipynb @@ -36,21 +36,21 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "30/11/2016_16:33:14: <<< Loading Dataset instance from datasets/Dataset_tutorial_dataset.pkl ... >>>\n" + "[26/04/2017 13:51:24] <<< Loading Dataset instance from datasets/Dataset_tutorial_dataset.pkl ... >>>\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "30/11/2016_16:33:14: <<< Dataset instance loaded >>>\n" + "[26/04/2017 13:51:24] <<< Dataset instance loaded >>>\n" ] } ], @@ -60,6 +60,7 @@ "import utils\n", "from keras_wrapper.cnn_model import loadModel\n", "from keras_wrapper.dataset import loadDataset\n", + "from keras_wrapper.extra.callbacks import PrintPerformanceMetricOnEpochEndOrEachNUpdates\n", "params = load_parameters()\n", "dataset = loadDataset('datasets/Dataset_tutorial_dataset.pkl')" ] @@ -73,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -90,62 +91,76 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "30/11/2016_16:33:18: <<< Building GroundHogModel Translation_Model >>>\n" + "[26/04/2017 13:50:11] <<< Building GroundHogModel Translation_Model >>>\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "-----------------------------------------------------------------------------------\n\t\tTranslationModel instance\n-----------------------------------------------------------------------------------\n_model_type: GroundHogModel\nname: tutorial_model\nmodel_path: trained_models/tutorial_model/\nverbose: True\n\nMODEL PARAMETERS:\n{'SOURCE_GLOVE_VECTORS': None, 'SAMPLE_ON_SETS': ['train', 'val'], 'CLIP_C': 1.0, 'HEURISTIC': 1, 'EVAL_EACH': 1, 'TRG_LAN': 'en', 'SAMPLING': 'max_likelihood', 'SAMPLE_EACH_UPDATES': 2500, 'N_LAYERS_DECODER': 1, 'POS_UNK': False, 'ENCODER_HIDDEN_SIZE': 600, 'REBUILD_DATASET': True, 'METRICS': ['coco'], 'OPTIMIZER': 'Adam', 'SOURCE_TEXT_EMBEDDING_SIZE': 420, 'EVAL_EACH_EPOCHS': True, 'EPOCHS_FOR_SAVE': 1, 'USE_BATCH_NORMALIZATION': True, 'BATCH_SIZE': 50, 'MODEL_NAME': 'ue_GroundHogModel_src_emb_420_bidir_True_enc_600_dec_600_deepout_maxout_trg_emb_420_Adam_0.001', 'BATCH_NORMALIZATION_MODE': 1, 'N_SAMPLES': 5, 'PARALLEL_LOADERS': 8, 'MIN_OCCURRENCES_VOCAB': 0, 'OUTPUTS_IDS_DATASET': ['target_text'], 'INIT_LAYERS': ['tanh'], 'EARLY_STOP': True, 'DATA_AUGMENTATION': False, 'TEXT_FILES': {'test': 'DATA/test.', 'train': 'DATA/training.', 'val': 'DATA/dev.'}, 'START_SAMPLING_ON_EPOCH': 1, 'SAMPLE_WEIGHTS': True, 'SAMPLING_SAVE_MODE': 'list', 'EXTRA_NAME': '', 'LOSS': 'categorical_crossentropy', 'WRITE_VALID_SAMPLES': True, 'INPUTS_IDS_MODEL': ['source_text', 'state_below'], 'MODE': 'training', 'LR_GAMMA': 0.8, 'NOISE_AMOUNT': 0.01, 'STOP_METRIC': 'Bleu_4', 'N_LAYERS_ENCODER': 1, 'WEIGHT_DECAY': 0.0001, 
'BIDIRECTIONAL_ENCODER': True, 'MAX_OUTPUT_TEXT_LEN_TEST': 120, 'FORCE_RELOAD_VOCABULARY': False, 'layer': ('maxout', 210), 'TOKENIZATION_METHOD': 'tokenize_none', 'DEEP_OUTPUT_LAYERS': [('maxout', 210)], 'OUTPUT_VOCABULARY_SIZE': 516, 'START_EVAL_ON_EPOCH': 10, 'BEAM_SEARCH': True, 'TARGET_TEXT_EMBEDDING_SIZE': 420, 'DECODER_HIDDEN_SIZE': 600, 'MODEL_TYPE': 'GroundHogModel', 'MAX_INPUT_TEXT_LEN': 50, 'EVAL_ON_SETS_KERAS': [], 'NORMALIZE_SAMPLING': True, 'MAX_OUTPUT_TEXT_LEN': 50, 'PAD_ON_BATCH': True, 'USE_PRELU': False, 'INPUT_VOCABULARY_SIZE': 689, 'BEAM_SIZE': 12, 'TRAIN_ON_TRAINVAL': False, 'LR': 0.001, 'SRC_LAN': 'es', 'CLASSIFIER_ACTIVATION': 'softmax', 'FILL': 'end', 'ALPHA_FACTOR': 0.6, 'TEMPERATURE': 1, 'STORE_PATH': 'trained_models/ue_GroundHogModel_src_emb_420_bidir_True_enc_600_dec_600_deepout_maxout_trg_emb_420_Adam_0.001/', 'DATASET_STORE_PATH': 'datasets/', 'DROPOUT_P': 0.5, 'DATA_ROOT_PATH': '/media/HDD_2TB/DATASETS/ue/', 'HOMOGENEOUS_BATCHES': False, 'LR_DECAY': 20, 'DATASET_NAME': 'ue', 'USE_DROPOUT': False, 'INPUTS_IDS_DATASET': ['source_text', 'state_below'], 'VERBOSE': 1, 'PATIENCE': 20, 'OUTPUTS_IDS_MODEL': ['target_text'], 'USE_NOISE': True, 'TARGET_GLOVE_VECTORS': None, 'TARGET_GLOVE_VECTORS_TRAINABLE': True, 'RELOAD': 0, 'EVAL_ON_SETS': ['val'], 'MAX_EPOCH': 500, 'SOURCE_GLOVE_VECTORS_TRAINABLE': True, 'USE_L2': False}\n-----------------------------------------------------------------------------------\n____________________________________________________________________________________________________\nLayer (type) Output Shape Param # Connected to \n====================================================================================================\nsource_text (InputLayer) (None, None) 0 \n____________________________________________________________________________________________________\nsource_word_embedding (Embedding)(None, None, 420) 289380 source_text[0][0] \n____________________________________________________________________________________________________\nsrc_embedding_gaussian_noise (Gau(None, None, 420) 0 source_word_embedding[0][0] \n____________________________________________________________________________________________________\nsrc_embedding_batch_normalization(None, None, 420) 840 src_embedding_gaussian_noise[0][0\n____________________________________________________________________________________________________\nbidirectional_encoder (Bidirectio(None, None, 1200) 3675600 src_embedding_batch_normalization\n____________________________________________________________________________________________________\nannotations_gaussian_noise (Gauss(None, None, 1200) 0 bidirectional_encoder[0][0] \n____________________________________________________________________________________________________\nannotations_batch_normalization ((None, None, 1200) 2400 annotations_gaussian_noise[0][0] \n____________________________________________________________________________________________________\nstate_below (InputLayer) (None, None) 0 \n____________________________________________________________________________________________________\nmaskedmean_2 (MaskedMean) (None, 1200) 0 annotations_batch_normalization[0\n____________________________________________________________________________________________________\ntarget_word_embedding (Embedding)(None, None, 420) 216720 state_below[0][0] \n____________________________________________________________________________________________________\ninitial_state (Dense) (None, 600) 720600 maskedmean_2[0][0] 
\n____________________________________________________________________________________________________\nstate_below_gaussian_noise (Gauss(None, None, 420) 0 target_word_embedding[0][0] \n____________________________________________________________________________________________________\ninitial_state_gaussian_noise (Gau(None, 600) 0 initial_state[0][0] \n____________________________________________________________________________________________________\n" + "-----------------------------------------------------------------------------------\n\t\tTranslationModel instance\n-----------------------------------------------------------------------------------\n_model_type: GroundHogModel\nname: tutorial_model\nmodel_path: trained_models/tutorial_model/\nverbose: True\n\nMODEL params:\n{'SAMPLE_ON_SETS': ['train', 'val'], 'CLIP_C': 1.0, 'HEURISTIC': 0, 'SAMPLING_SAVE_MODE': 'list', 'TRG_LAN': 'pe', 'SAMPLING': 'max_likelihood', 'SAMPLE_EACH_UPDATES': 300, 'N_LAYERS_DECODER': 1, 'TRG_PRETRAINED_VECTORS_TRAINABLE': True, 'USE_PRELU': False, 'POS_UNK': False, 'ENCODER_HIDDEN_SIZE': 256, 'REBUILD_DATASET': True, 'METRICS': ['coco'], 'TOKENIZE_REFERENCES': True, 'OPTIMIZER': 'Adadelta', 'SOURCE_TEXT_EMBEDDING_SIZE': 300, 'EVAL_EACH_EPOCHS': True, 'EPOCHS_FOR_SAVE': 1, 'USE_BATCH_NORMALIZATION': True, 'BATCH_SIZE': 50, 'MODEL_NAME': 'APE_mtpe_GroundHogModel_src_emb_300_bidir_True_enc_LSTM_256_dec_LSTM_256_deepout_linear_trg_emb_300_Adadelta_1.0', 'BATCH_NORMALIZATION_MODE': 1, 'EVAL_ON_SETS_KERAS': [], 'N_SAMPLES': 5, 'RECURRENT_DROPOUT_P': 0.5, 'WEIGHT_DECAY': 0.0001, 'OUTPUTS_IDS_DATASET': ['target_text'], 'INIT_LAYERS': ['tanh'], 'EARLY_STOP': True, 'DATA_AUGMENTATION': False, 'TEXT_FILES': {'test': 'test.', 'train': 'training.', 'val': 'dev.'}, 'MAPPING': '/media/HDD_2TB/DATASETS/APE/in-domain/joint_bpe//mapping.mt_pe.pkl', 'DROPOUT_P': 0.5, 'RECURRENT_WEIGHT_DECAY': 0.0, 'ADDITIONAL_OUTPUT_MERGE_MODE': 'sum', 'PARALLEL_LOADERS': 1, 'ALIGN_FROM_RAW': True, 'SAMPLE_WEIGHTS': True, 'EVAL_EACH': 1, 'EXTRA_NAME': '', 'MIN_OCCURRENCES_INPUT_VOCAB': 0, 'INIT_FUNCTION': 'glorot_uniform', 'LOSS': 'categorical_crossentropy', 'WRITE_VALID_SAMPLES': True, 'INPUTS_IDS_MODEL': ['source_text', 'state_below'], 'MODE': 'training', 'LR_GAMMA': 0.8, 'NOISE_AMOUNT': 0.01, 'SRC_PRETRAINED_VECTORS_TRAINABLE': True, 'STOP_METRIC': 'TER', 'N_LAYERS_ENCODER': 1, 'INPUTS_IDS_DATASET': ['source_text', 'state_below'], 'BIDIRECTIONAL_ENCODER': True, 'MAX_OUTPUT_TEXT_LEN_TEST': 150, 'FORCE_RELOAD_VOCABULARY': False, 'layer': ('linear', 300), 'TOKENIZATION_METHOD': 'tokenize_none', 'OUTPUT_VOCABULARY_SIZE': 516, 'SRC_PRETRAINED_VECTORS': None, 'START_EVAL_ON_EPOCH': 1, 'BEAM_SEARCH': True, 'TARGET_TEXT_EMBEDDING_SIZE': 300, 'DECODER_HIDDEN_SIZE': 256, 'MODEL_TYPE': 'GroundHogModel', 'STORE_PATH': '/media/HDD_2TB/MODELS/APE/trained_models/APE_mtpe_GroundHogModel_src_emb_300_bidir_True_enc_LSTM_256_dec_LSTM_256_deepout_linear_trg_emb_300_Adadelta_1.0/', 'TRG_PRETRAINED_VECTORS': None, 'JOINT_BATCHES': 4, 'CLIP_V': 0.0, 'SKIP_VECTORS_HIDDEN_SIZE': 300, 'NORMALIZE_SAMPLING': True, 'MAX_OUTPUT_TEXT_LEN': 50, 'PAD_ON_BATCH': True, 'START_SAMPLING_ON_EPOCH': 1, 'RNN_TYPE': 'LSTM', 'INPUT_VOCABULARY_SIZE': 689, 'BEAM_SIZE': 6, 'TRAIN_ON_TRAINVAL': False, 'LR': 1.0, 'SRC_LAN': 'mt', 'OPTIMIZED_SEARCH': True, 'CLASSIFIER_ACTIVATION': 'softmax', 'FILL': 'end', 'ALPHA_FACTOR': 0.6, 'TEMPERATURE': 1, 'MAX_INPUT_TEXT_LEN': 50, 'USE_RECURRENT_DROPOUT': False, 'DATASET_STORE_PATH': 'datasets/', 'APPLY_DETOKENIZATION': False, 'PATIENCE': 20, 
'SAVE_EACH_EVALUATION': True, 'DATA_ROOT_PATH': '/media/HDD_2TB/DATASETS/APE/in-domain/joint_bpe/', 'HOMOGENEOUS_BATCHES': False, 'LR_DECAY': None, 'DATASET_NAME': 'APE', 'USE_DROPOUT': False, 'TOKENIZE_HYPOTHESES': True, 'VERBOSE': 1, 'MIN_OCCURRENCES_OUTPUT_VOCAB': 0, 'BIDIRECTIONAL_DEEP_ENCODER': True, 'OUTPUTS_IDS_MODEL': ['target_text'], 'USE_NOISE': True, 'DETOKENIZATION_METHOD': 'detokenize_bpe', 'DEEP_OUTPUT_LAYERS': [('linear', 300)], 'RELOAD': 0, 'EVAL_ON_SETS': ['val'], 'MAX_EPOCH': 500, 'USE_L2': False}\n-----------------------------------------------------------------------------------\n____________________________________________________________________________________________________\nLayer (type) Output Shape Param # Connected to \n====================================================================================================\nsource_text (InputLayer) (None, None) 0 \n____________________________________________________________________________________________________\nsource_word_embedding (Embedding (None, None, 300) 206700 source_text[0][0] \n____________________________________________________________________________________________________\nsrc_embedding_gaussian_noise (Ga (None, None, 300) 0 source_word_embedding[0][0] \n____________________________________________________________________________________________________\nsrc_embedding_batch_normalizatio (None, None, 300) 1200 src_embedding_gaussian_noise[0][0\n____________________________________________________________________________________________________\nbidirectional_encoder_LSTM (Bidi (None, None, 512) 1140736 src_embedding_batch_normalization\n____________________________________________________________________________________________________\nannotations_gaussian_noise (Gaus (None, None, 512) 0 bidirectional_encoder_LSTM[0][0] \n____________________________________________________________________________________________________\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "state_below_batch_normalization ((None, None, 420) 840 state_below_gaussian_noise[0][0] \n____________________________________________________________________________________________________\ninitial_state_batch_normalization(None, 600) 1200 initial_state_gaussian_noise[0][0\n____________________________________________________________________________________________________\nattgrucond_2 (AttGRUCond) [(None, None, 600), (N6160201 state_below_batch_normalization[0\n annotations_batch_normalization[0\n initial_state_batch_normalization\n____________________________________________________________________________________________________\nproj_h0_gaussian_noise (GaussianN(None, None, 600) 0 attgrucond_2[0][0] \n____________________________________________________________________________________________________\nproj_h0_batch_normalization (Batc(None, None, 600) 1200 proj_h0_gaussian_noise[0][0] \n____________________________________________________________________________________________________\n" + "annotations_batch_normalization (None, None, 512) 2048 annotations_gaussian_noise[0][0] \n____________________________________________________________________________________________________\nstate_below (InputLayer) (None, None) 0 \n____________________________________________________________________________________________________\nmaskedmean_1 (MaskedMean) (None, 512) 0 
annotations_batch_normalization[0\n____________________________________________________________________________________________________\ntarget_word_embedding (Embedding (None, None, 300) 154800 state_below[0][0] \n____________________________________________________________________________________________________\ninitial_state (Dense) (None, 256) 131328 maskedmean_1[0][0] \n____________________________________________________________________________________________________\ninitial_memory (Dense) (None, 256) 131328 maskedmean_1[0][0] \n____________________________________________________________________________________________________\nstate_below_gaussian_noise (Gaus (None, None, 300) 0 target_word_embedding[0][0] \n____________________________________________________________________________________________________\ninitial_state_gaussian_noise (Ga (None, 256) 0 initial_state[0][0] \n____________________________________________________________________________________________________\ninitial_memory_gaussian_noise (G (None, 256) 0 initial_memory[0][0] \n____________________________________________________________________________________________________\nstate_below_batch_normalization (None, None, 300) 1200 state_below_gaussian_noise[0][0] \n____________________________________________________________________________________________________\nmasklayer_1 (MaskLayer) (None, None, 512) 0 annotations_batch_normalization[0\n____________________________________________________________________________________________________\ninitial_state_batch_normalizatio (None, 256) 1024 initial_state_gaussian_noise[0][0\n____________________________________________________________________________________________________\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "logit_ctx (TimeDistributed) multiple 504420 attgrucond_2[0][1] \n____________________________________________________________________________________________________\nlogit_lstm (TimeDistributed) multiple 252420 proj_h0_batch_normalization[0][0]\n____________________________________________________________________________________________________\nlambda_2 (Lambda) (None, None, 420) 0 logit_ctx[0][0] \n____________________________________________________________________________________________________\nlogit_emb (TimeDistributed) multiple 176820 state_below_batch_normalization[0\n____________________________________________________________________________________________________\nout_layer_mlp_gaussian_noise (Gau(None, None, 420) 0 logit_lstm[0][0] \n____________________________________________________________________________________________________\nout_layer_ctx_gaussian_noise (Gau(None, None, 420) 0 lambda_2[0][0] \n____________________________________________________________________________________________________\nout_layer_emb_gaussian_noise (Gau(None, None, 420) 0 logit_emb[0][0] \n____________________________________________________________________________________________________\nout_layer_mlp_batch_normalization(None, None, 420) 840 out_layer_mlp_gaussian_noise[0][0\n____________________________________________________________________________________________________\nout_layer_ctx_batch_normalization(None, None, 420) 840 out_layer_ctx_gaussian_noise[0][0\n____________________________________________________________________________________________________\nout_layer_emb_batch_normalization(None, None, 420) 840 
out_layer_emb_gaussian_noise[0][0\n____________________________________________________________________________________________________\nadditional_input (Merge) (None, None, 420) 0 out_layer_mlp_batch_normalization\n out_layer_ctx_batch_normalization\n out_layer_emb_batch_normalization\n____________________________________________________________________________________________________\nactivation_2 (Activation) (None, None, 420) 0 additional_input[0][0] \n____________________________________________________________________________________________________\nmaxout_0 (TimeDistributed) multiple 353640 activation_2[0][0] \n____________________________________________________________________________________________________\nout_layermaxout_gaussian_noise (G(None, None, 210) 0 maxout_0[0][0] \n____________________________________________________________________________________________________\nout_layermaxout_batch_normalizati(None, None, 210) 420 out_layermaxout_gaussian_noise[0]\n____________________________________________________________________________________________________\n" + "initial_memory_batch_normalizati (None, 256) 1024 initial_memory_gaussian_noise[0][\n____________________________________________________________________________________________________\ndecoder_AttLSTMCond (AttLSTMCond [(None, None, 256), ( 1488897 state_below_batch_normalization[0\n masklayer_1[0][0] \n initial_state_batch_normalization\n initial_memory_batch_normalizatio\n____________________________________________________________________________________________________\nproj_h0_gaussian_noise (Gaussian (None, None, 256) 0 decoder_AttLSTMCond[0][0] \n____________________________________________________________________________________________________\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "target_text (TimeDistributed) multiple 108876 out_layermaxout_batch_normalizati\n====================================================================================================\n" + "proj_h0_batch_normalization (Bat (None, None, 256) 1024 proj_h0_gaussian_noise[0][0] \n____________________________________________________________________________________________________\nlogit_ctx (TimeDistributed) multiple 153900 decoder_AttLSTMCond[0][1] \n____________________________________________________________________________________________________\nlogit_lstm (TimeDistributed) multiple 77100 proj_h0_batch_normalization[0][0]\n____________________________________________________________________________________________________\npermutegeneral_1 (PermuteGeneral (None, None, 300) 0 logit_ctx[0][0] \n____________________________________________________________________________________________________\nlogit_emb (TimeDistributed) multiple 90300 state_below_batch_normalization[0\n____________________________________________________________________________________________________\nout_layer_mlp_gaussian_noise (Ga (None, None, 300) 0 logit_lstm[0][0] \n____________________________________________________________________________________________________\nout_layer_ctx_gaussian_noise (Ga (None, None, 300) 0 permutegeneral_1[0][0] \n____________________________________________________________________________________________________\nout_layer_emb_gaussian_noise (Ga (None, None, 300) 0 logit_emb[0][0] \n____________________________________________________________________________________________________\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "out_layer_mlp_batch_normalizatio 
(None, None, 300) 1200 out_layer_mlp_gaussian_noise[0][0\n____________________________________________________________________________________________________\nout_layer_ctx_batch_normalizatio (None, None, 300) 1200 out_layer_ctx_gaussian_noise[0][0\n____________________________________________________________________________________________________\nout_layer_emb_batch_normalizatio (None, None, 300) 1200 out_layer_emb_gaussian_noise[0][0\n____________________________________________________________________________________________________\nadditional_input (Merge) (None, None, 300) 0 out_layer_mlp_batch_normalization\n out_layer_ctx_batch_normalization\n out_layer_emb_batch_normalization\n____________________________________________________________________________________________________\nactivation_1 (Activation) (None, None, 300) 0 additional_input[0][0] \n____________________________________________________________________________________________________\nlinear_0 (TimeDistributed) multiple 90300 activation_1[0][0] \n____________________________________________________________________________________________________\nout_layerlinear_gaussian_noise ( (None, None, 300) 0 linear_0[0][0] \n____________________________________________________________________________________________________\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "out_layerlinear_batch_normalizat (None, None, 300) 1200 out_layerlinear_gaussian_noise[0]\n____________________________________________________________________________________________________\ntarget_text (TimeDistributed) multiple 155316 out_layerlinear_batch_normalizati\n====================================================================================================\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "30/11/2016_16:33:21: Preparing optimizer and compiling.\n" + "[26/04/2017 13:50:15] Preparing optimizer: Adadelta [LR: 1.0 - LOSS: categorical_crossentropy] and compiling.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Total params: 12468097\n____________________________________________________________________________________________________\n" + "Total params: 3,833,025\nTrainable params: 3,826,865\nNon-trainable params: 6,160\n____________________________________________________________________________________________________\n" ] } ], "source": [ "nmt_model = TranslationModel(params,\n", - " type='GroundHogModel', \n", + " model_type='GroundHogModel', \n", " model_name='tutorial_model',\n", " vocabularies=dataset.vocabulary,\n", " store_path='trained_models/tutorial_model/',\n", @@ -161,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -189,11 +204,11 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "extra_vars = {'language': 'en', \n", + "extra_vars = {'language': 'en',\n", " 'n_parallel_loaders': 8,\n", " 'tokenize_f': eval('dataset.' 
+ 'tokenize_none'),\n", " 'beam_size': 12,\n", @@ -204,32 +219,29 @@ " 'dataset_outputs': ['target_text'],\n", " 'normalize': True,\n", " 'alpha_factor': 0.6,\n", - " 'val':{'references': dataset.extra_variables['val']['target_text']}\n", + " 'val': {'references': dataset.extra_variables['val']['target_text']}\n", " }\n", "\n", "vocab = dataset.vocabulary['target_text']['idx2words']\n", "callbacks = []\n", - "callbacks.append(utils.callbacks.PrintPerformanceMetricOnEpochEnd(nmt_model,\n", - " dataset,\n", - " gt_id='target_text',\n", - " metric_name=['coco'],\n", - " set_name=['val'],\n", - " batch_size=50,\n", - " each_n_epochs=2,\n", - " extra_vars=extra_vars,\n", - " reload_epoch=0,\n", - " is_text=True,\n", - " index2word_y=vocab,\n", - " sampling_type='max_likelihood',\n", - " beam_search=True,\n", - " save_path=nmt_model.model_path,\n", - " start_eval_on_epoch=0,\n", - " write_samples=True,\n", - " write_type='list',\n", - " early_stop=True,\n", - " patience=5,\n", - " stop_metric='Bleu_4',\n", - " verbose=True))" + "callbacks.append(PrintPerformanceMetricOnEpochEndOrEachNUpdates(nmt_model,\n", + " dataset,\n", + " gt_id='target_text',\n", + " metric_name=['coco'],\n", + " set_name=['val'],\n", + " batch_size=50,\n", + " each_n_epochs=2,\n", + " extra_vars=extra_vars,\n", + " reload_epoch=0,\n", + " is_text=True,\n", + " index2word_y=vocab,\n", + " sampling_type='max_likelihood',\n", + " beam_search=True,\n", + " save_path=nmt_model.model_path,\n", + " start_eval_on_epoch=0,\n", + " write_samples=True,\n", + " write_type='list',\n", + " verbose=True))\n" ] }, { @@ -241,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -272,6 +284,14 @@ "source": [ "nmt_model.trainNet(dataset, training_params)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "" + ] } ], "metadata": { diff --git a/examples/4_nmt_model_tutorial.ipynb b/examples/4_nmt_model_tutorial.ipynb index c22eae0..c63d12e 100644 --- a/examples/4_nmt_model_tutorial.ipynb +++ b/examples/4_nmt_model_tutorial.ipynb @@ -31,33 +31,16 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using Theano backend.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5105)\n" - ] - } - ], - "source": [ - "from keras.engine import Input\n", - "from keras.layers.embeddings import Embedding\n", - "from keras.layers.recurrent import GRU, AttGRUCond\n", - "from keras.layers import TimeDistributed, Bidirectional\n", - "from keras.layers.core import Dense, Activation, Lambda, MaxoutDense, MaskedMean, PermuteGeneral\n", - "from keras import backend as K\n", - "from keras.engine.topology import merge\n", - "from keras.models import Model" + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from keras.layers import *\n", + "from keras.models import model_from_json, Model\n", + "from keras.optimizers import Adam, RMSprop, Nadam, Adadelta, SGD, Adagrad, Adamax\n", + "from keras.regularizers import l2\n", + "from keras_wrapper.cnn_model import Model_Wrapper\n", + "from keras_wrapper.extra.regularize import Regularize" ] }, { @@ -69,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 3, 
+ "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -120,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -142,11 +125,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "ctx_mean = MaskedMean()(annotations)\n", + "annotations = MaskLayer()(annotations) # We may want the padded annotations\n", + "\n", "initial_state = Dense(hidden_state_size, name='initial_state',\n", " activation='tanh')(ctx_mean)" ] @@ -160,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -202,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -240,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -257,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -290,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -326,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -345,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -373,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -394,15 +379,6 @@ "source": [ "And that's all! For using this model together with the facilities provided by the staged_model_wrapper library, we should declare the model as a method of a Model_Wrapper class. A complete example of this can be found at `model_zoo.py`." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "" - ] } ], "metadata": { diff --git a/examples/README.md b/examples/README.md index 53beaa6..c1af017 100644 --- a/examples/README.md +++ b/examples/README.md @@ -26,11 +26,3 @@ In the `documentation` folder you'll find a couple of pdf files: * [staged_keras_wrapper.pdf](https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/staged_keras_wrapper.pdf) contains the autogenerated documentation for the staged_keras_wrapper package (at 30/Nov/2016). * [typical_output.md](https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/typical_output.md) describes the files and output generated during a typical training run. * [ensembling_tutorial.md](https://github.com/lvapeab/nmt-keras/blob/master/examples/documentation/ensembling_tutorial.md) shows the usage of the `sample_ensemble` script. - -## TODO list - -- [x] Add a tutorial for building a NMT model with Keras. - -- [ ] Document (with sphinx) the NMT-Keras library - -- [x] Add a document explaining the typical output of a training. 
diff --git a/examples/documentation/attention_nmt_model.png b/examples/documentation/attention_nmt_model.png new file mode 100644 index 0000000..a34ee3a Binary files /dev/null and b/examples/documentation/attention_nmt_model.png differ diff --git a/main.py b/main.py index 84d4749..e30b582 100644 --- a/main.py +++ b/main.py @@ -545,7 +545,7 @@ def buildCallbacks(params, model, dataset): def check_params(params): """ Checks some typical parameters and warns if something wrong was specified. - :param params: Model instance on which to apply the callback. + :param params: Dictionary of parameters to check. :return: None """ if params['POS_UNK']: diff --git a/meta-optimizers/spearmint/spearmint_opt.py b/meta-optimizers/spearmint/spearmint_opt.py index 4d2a7bd..48282b8 100644 --- a/meta-optimizers/spearmint/spearmint_opt.py +++ b/meta-optimizers/spearmint/spearmint_opt.py @@ -17,6 +17,11 @@ d = dict(os.environ.copy()) d['LC_NUMERIC'] = 'en_US.utf-8' def invoke_model(parameters): + """ + Loads a model, trains it and evaluates it. + :param parameters: Model parameters. + :return: Value of the metric to minimize. + """ model_params = load_parameters() model_name = model_params["MODEL_TYPE"] @@ -51,6 +56,12 @@ def invoke_model(parameters): def main(job_id, params): + """ + Launches the Spearmint job. + :param job_id: Job identifier. + :param params: Model parameters. + :return: Value of the metric to minimize. + """ print params return invoke_model(params) diff --git a/model_zoo.py b/model_zoo.py index 95df155..4086755 100644 --- a/model_zoo.py +++ b/model_zoo.py @@ -12,10 +12,22 @@ class TranslationModel(Model_Wrapper): """ Translation model class. Instance of the Model_Wrapper class (see staged_keras_wrapper). - """ - def resumeTrainNet(self, ds, params, out_name=None): - pass + :param params: all hyperparameters of the model. + :param model_type: network type name (corresponds to any method defined in the section 'MODELS' of this class). + Only valid if 'structure_path' == None. + :param verbose: set to 0 if you don't want the model to output informative messages + :param structure_path: path to a Keras model json file. + If we specify this parameter, then 'model_type' will be only an informative parameter. + :param weights_path: path to the pre-trained weights file (if None, then it will be randomly initialized) + :param model_name: optional name given to the network + (if None, then the current time will be assigned as its name) + :param vocabularies: vocabularies used for word embedding + :param store_path: path to the folder where the temporary model backups will be stored + :param set_optimizer: Compile optimizer or not. + :param clear_dirs: Clean model directories or not. + + """ def __init__(self, params, model_type='Translation_Model', verbose=1, structure_path=None, weights_path=None, model_name=None, vocabularies=None, store_path=None, set_optimizer=True, clear_dirs=True): @@ -33,6 +45,9 @@ def __init__(self, params, model_type='Translation_Model', verbose=1, structure_ (if None, then it will be assigned to current time as its name) :param vocabularies: vocabularies used for word embedding :param store_path: path to the folder where the temporal model packups will be stored + :param set_optimizer: Compile optimizer or not. + :param clear_dirs: Clean model directories or not.
+ """ super(self.__class__, self).__init__(type=model_type, model_name=model_name, silence=verbose == 0, models_path=store_path, inheritance=True) @@ -116,8 +131,7 @@ def setParams(self, params): def setOptimizer(self, **kwargs): """ - Sets a new optimizer for the Translation_Model. - + Sets and compiles a new optimizer for the Translation_Model. :param kwargs: :return: """ @@ -188,6 +202,8 @@ def setOptimizer(self, **kwargs): def __str__(self): """ Plots basic model information. + + :return: String containing model information. """ obj_str = '-----------------------------------------------------------------------------------\n' class_name = self.__class__.__name__ @@ -214,14 +230,16 @@ def __str__(self): def GroundHogModel(self, params): """ Neural machine translation with: - * BLSTM encoder + * BRNN encoder * Attention mechansim on input sequence of annotations - * Conditional LSTM for decoding - * Feed forward layers: - + Context projected to output - + Last word projected to output + * Conditional RNN for decoding + * Deep output layers: + * Context projected to output + * Last word projected to output * Possibly deep encoder/decoder + See https://arxiv.org/abs/1409.0473 for an in-depth review of the model. + :param params: Dictionary of params (see config.py) :return: None """ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7f08875 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +cloud +h5py +-e git+https://github.com/MarcBS/keras.git#egg=keras +-e git+https://github.com/MarcBS/multimodal_keras_wrapper.git#egg=keras_wrapper +-e git+https://github.com/lvapeab/coco-caption.git#egg=coco-caption +matplotlib +numpy +scikit-image +scikit-learn +six +tables +Theano +toolz \ No newline at end of file diff --git a/utils/evaluate_from_file.py b/utils/evaluate_from_file.py index bea977e..2e258e1 100644 --- a/utils/evaluate_from_file.py +++ b/utils/evaluate_from_file.py @@ -1,3 +1,5 @@ +import argparse + """ Scores a file of hypothesis. Usage: @@ -5,15 +7,12 @@ 2. python evaluate_from_file.py -hyp hypothesis -r references """ -import argparse from pycocoevalcap.bleu.bleu import Bleu from pycocoevalcap.cider.cider import Cider from pycocoevalcap.rouge.rouge import Rouge from pycocoevalcap.meteor.meteor import Meteor from pycocoevalcap.ter.ter import Ter -ROOT_PATH = '/media/HDD_2TB/DATASETS/' - parser = argparse.ArgumentParser( description="""This takes two files and a path the references (source, references), @@ -26,6 +25,13 @@ def load_textfiles(references, hypothesis): + """ + Loads the references and hypothesis text files. + + :param references: Path to the references files. + :param hypothesis: Path to the hypotheses file. + :return: + """ print "The number of references is {}".format(len(references)) hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesis)} # take out newlines before creating dictionary @@ -41,9 +47,12 @@ def load_textfiles(references, hypothesis): def CocoScore(ref, hypo, language='en'): """ - ref, dictionary of reference sentences (id, sentence) - hypo, dictionary of hypothesis sentences (id, sentence) - score, dictionary of scores + Obtains the COCO scores from the references and hypotheses. 
+ + :param ref: Dictionary of reference sentences (id, sentence) + :param hypo: Dictionary of hypothesis sentences (id, sentence) + :param language: Language of the sentences (for METEOR) + :return: Dictionary of scores, indexed by metric name. """ scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), diff --git a/utils/preprocess_binary_word_vectors.py b/utils/preprocess_binary_word_vectors.py index 47419b7..cead2af 100644 --- a/utils/preprocess_binary_word_vectors.py +++ b/utils/preprocess_binary_word_vectors.py @@ -15,6 +15,12 @@ def word2vec2npy(v_path, base_path_save, dest_filename): + """ + Preprocesses pretrained binary word vectors and stores them in a suitable format (.npy). + :param v_path: Path to the binary vectors file. + :param base_path_save: Path where the formatted vectors will be stored. + :param dest_filename: Filename of the formatted vectors. + """ word_vecs = dict() print "Loading vectors from %s" % v_path diff --git a/utils/preprocess_text_word_vectors.py b/utils/preprocess_text_word_vectors.py index 4e80654..77d834a 100644 --- a/utils/preprocess_text_word_vectors.py +++ b/utils/preprocess_text_word_vectors.py @@ -15,6 +15,12 @@ def txtvec2npy(v_path, base_path_save, dest_filename): + """ + Preprocesses pretrained text word vectors and stores them in a suitable format (.npy). + :param v_path: Path to the text vectors file. + :param base_path_save: Path where the formatted vectors will be stored. + :param dest_filename: Filename of the formatted vectors. + """ vecs_dict = dict() print "Loading vectors from %s" % v_path
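The two preprocessing scripts above build a word -> vector dictionary and store it with numpy. A minimal consumption sketch, assuming the dict is written with numpy.save (the file path below is illustrative); the resulting .npy file is presumably what the SRC_PRETRAINED_VECTORS / TRG_PRETRAINED_VECTORS entries of config.py should point to:

    # Sketch: read back vectors written by word2vec2npy / txtvec2npy (assumed layout).
    import numpy as np

    word_vecs = np.load('datasets/word_vectors.en.npy').item()  # .item() unwraps the pickled dict
    print "Loaded %d vectors of dimension %d" % (len(word_vecs), len(word_vecs.itervalues().next()))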