Merge branch 'fixes_and_docs_refactor' of https://github.com/analysis…

…center/nbtools into fixes_and_docs_refactor
analysiscenter · May 16, 2024 · 5c50ca6 · 5c50ca6
2 parents 82dce34 + 87501f8
commit 5c50ca6
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 67 deletions.
diff --git a/README.md b/README.md
@@ -52,15 +52,15 @@ pylint_notebook(path_to_ipynb,             # If not provided, use path to the cu
 
 Under the hood, it converts `.ipynb` notebook to `.py` script, creates a custom `.pylintrc` configuration, runs the `pylint` and removes all temporary files. Learn more about its usage in the [tutorial.](tutorials/NBstat.ipynb)
 
-### **run_notebook**
+### **exec_notebook**
 Provides a `eval`-like interface for running Jupyter Notebooks programmatically. We use it for running interactive tests, that are easy to work with: in case of any failures, one can jump right into fixing it with an already set-up environment.
 
 ```python
-from nbtools import run_notebook
-run_notebook(path_to_ipynb,                       # Which notebook to run
-             out_path_ipynb,                      # Where to save result
-             inputs={'learning_rate': 0.05,},     # Pass variables to notebook
-             outputs=['accuracy'])                # Extract variables from notebook
+from nbtools import exec_notebook
+exec_notebook(path_to_ipynb,                       # Which notebook to run
+              out_path_ipynb,                      # Where to save result
+              inputs={'learning_rate': 0.05,},     # Pass variables to notebook
+              outputs=['accuracy'])                # Extract variables from notebook
 ```
 
 

diff --git a/nbtools/__init__.py b/nbtools/__init__.py
@@ -1,7 +1,7 @@
 """ Init file. """
 #pylint: disable=wildcard-import
 from .core import *
-from .run_notebook import run_notebook
+from .exec_notebook import exec_notebook
 from .pylint_notebook import pylint_notebook
 
 __version__ = '0.9.12'
diff --git a/nbtools/run_notebook.py → nbtools/exec_notebook.py b/nbtools/run_notebook.py → nbtools/exec_notebook.py
@@ -10,13 +10,13 @@
 import psutil
 
 
-TMP_DIR = '/tmp/nbtools_run_notebook'
+TMP_DIR = '/tmp/nbtools_exec_notebook'
 os.makedirs(TMP_DIR, exist_ok=True)
 
 
 # Decorator
 def run_in_process(func):
-    """ Decorator to run the `func` in a separated process for terminating all relevant processes properly. """
+    """ Decorator to run the `func` in a separate process for terminating all related processes properly. """
     @wraps(func)
     def _wrapper(*args, **kwargs):
         # pylint: disable=broad-exception-caught, broad-exception-raised
@@ -39,7 +39,7 @@ def _wrapper(*args, **kwargs):
 
             output = _output_queue.get()
             process.join()
-        except (Exception, KeyboardInterrupt) as e:
+        except (KeyboardInterrupt, Exception) as e:
             output = {'failed': True, 'traceback': e}
 
             # Terminate all relevant processes when something went wrong, e.g. Keyboard Interrupt
@@ -59,19 +59,22 @@ def _wrapper(*args, **kwargs):
         return output
     return _wrapper
 
-def get_run_notebook_name(pid):
-    """ Check /tmp/ directory for logs of running `run_notebook` executors and extract name for a given pid. """
+def get_exec_notebook_name(pid):
+    """ Get the notebook name by its pid.
+
+    Under the hood, the function checks the /tmp/ directory for logs of running `exec_notebook` executors and extract the
+    name for the provided pid.
+    """
     json_path = f'{TMP_DIR}/{pid}.json'
     if not os.path.exists(json_path):
-        return 'run_notebook'
+        return 'exec_notebook'
     with open(json_path, 'r', encoding='utf-8') as file:
         path = json.load(file)['path']
     return path.split('/')[-1]
 
 
 
-# Code cells for insertion
-# Code fragments that are inserted in the notebook
+# Code fragments that are inserted as additional cells in a notebook
 CELL_INSERT_COMMENT = "# Cell inserted during automated execution"
 
 # Connect to a shelve database for inputs/outputs providing
@@ -134,68 +137,79 @@ def get_run_notebook_name(pid):
 
 # Main functions
 @run_in_process
-def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_pos=False,
-                 working_dir = './', execute_kwargs=None,
-                 out_path_db=None, out_path_ipynb=None, out_path_html=None, remove_db='always', add_timestamp=True,
-                 hide_code_cells=False, mask_extra_code=False, display_links=True,
-                 raise_exception=False, return_notebook=False, _output_queue=None):
+def exec_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_pos=False,
+                  display_inputs=False, display_outputs=False,
+                  working_dir = './', execute_kwargs=None,
+                  out_path_db=None, out_path_ipynb=None, out_path_html=None, remove_db='always',
+                  add_timestamp=True, hide_code_cells=False, display_links=True,
+                  raise_exception=False, return_notebook=False, _output_queue=None):
     """ Execute a Jupyter Notebook programmatically.
     Heavily inspired by https://github.com/tritemio/nbrun.
 
     Intended to be an analog of `exec`, providing a way to inject / extract variables from the execution.
-    Executed notebook is optionally saved to disk as `.ipynb`/`.html`: we strongly recommend always doing that.
     For a detailed description of how to do that, check the `inputs` and `outputs` parameters.
+    The executed notebook is optionally saved to disk as `.ipynb` / `.html` file: we strongly recommend always doing so.
 
-    Flag `raise_exception` defines behavior if the execution of the notebook is failed due to an exception.
+    The `raise_exception` flag defines the behavior if the execution of the notebook fails due to an exception.
 
     Under the hood, this function does the following:
         - Create an internal database to communicate variables (both `inputs` and `outputs`). Save `inputs` to it.
-        - Add a cell with reading `inputs` from the database, add a cell for saving `outputs` to the database.
+        - Add a cell for reading `inputs` from the internal database, add a cell for saving `outputs` to it.
         - Execute notebook.
         - Handle exceptions.
         - Read `outputs` from the database.
         - Add a timestamp cell to the notebook, if needed.
-        - Save the executed notebook as `.ipynb` and/or `.html`.
+        - Save the executed notebook as `.ipynb` and / or `.html`.
         - Return a dictionary with intermediate results, execution info and values of `outputs` variables.
 
-    If there are no `inputs` nor `outputs`, a database is not created and additional cells are not inserted.
-    If either of them is provided, then one of `out_path_ipynb` or `out_path_db` must be explicitly defined.
+    If there are no `inputs` or `outputs`, a database is not created and additional cells are not inserted.
+    Note, if either of them is provided, then one of `out_path_ipynb` or `out_path_db` must be explicitly defined.
 
     Parameters
     ----------
     path : str
         Path to the notebook to execute.
     inputs : dict, optional
-        Inputs for notebook execution: essentially, its `globals`.
+        Inputs for execution are essentially equivalent to notebook `globals`.
         Must be a dictionary with variable names and their values; therefore, keys must be valid Python identifiers.
-        Saved to a database, loaded in the notebook in a separate cell, that is inserted at `inputs_pos` position.
-        Therefore, values must be serializable.
+        Under the hood, inputs are saved into a database, loaded in the notebook in a separate cell, that is inserted at
+        the `inputs_pos` position. Therefore, values must be serializable.
     outputs : str or iterable of str, optional
-        List of notebook local variable names to return.
-        Extracted from the notebook in a separate cell, that is inserted at the last position.
-        If some of the variables don't exist, no errors are raised.
+        The list of variable names to return from the notebook.
+        Extracted from the notebook in a separate cell, which is inserted at the last position.
+        Note, if some of the variables don't exist, no errors are raised.
     inputs_pos : int, optional
         Position to insert the cell with `inputs` loading into the notebook.
     replace_inputs_pos : int, optional
         Whether to replace `inputs_pos` code cell with `inputs` or insert a new one.
+    display_inputs : bool, optional
+        Whether to display `inputs` or not.
+        Under the hood, inputs are provided using a shelve database. If `display_inputs=True`, variables will be
+        inserted in the cell in the following manner: `input_name = input_value`, instead of importing code from shelve.
+        For more, see the :func:`~._display_inputs_reading` docstring.
+    display_outputs : bool, optional
+        Whether to display `outputs` or not.
+        Under the hood, outputs are saved using a shelve database. If `display_outputs=True`, variables will be shown in
+        the last cell in the following manner: `print(input_name)`, instead of dumping code into the database.
+        For more, see the :func:`~._display_outputs_dumping` docstring.
     working_dir : str
         The working directory of starting the kernel.
     out_path_db : str, optional
-        Path to save shelve database files without file extension.
+        Path to save the internal database files (without file extension).
         If not provided, then it is inferred from `out_path_ipynb`.
     out_path_ipynb : str, optional
         Path to save the output ipynb file.
     out_path_html : str, optional
         Path to save the output html file.
     remove_db : str, optional
-        Whether to remove shelve database after notebook execution.
+        Whether to remove the internal database after notebook execution.
         Possible options are: 'always', 'not_failed_case' or 'never'.
         If 'always', then remove the database after notebook execution.
         If 'not_failed_case', then remove the database if there wasn't any execution failure.
         If 'never', then don't remove the database after notebook execution.
-        Running `:meth:run_notebook` with 'not_failed_case' or 'never' option helps to reproduce failures
-        in the `out_path_ipynb` notebook: it will take passed inputs from the saved shelve database.
-        Note, that database exists only if inputs and/or outputs are provided.
+        Running `:func:exec_notebook` with the 'not_failed_case' or 'never' option helps to reproduce failures
+        in the `out_path_ipynb` notebook: it will take the inputs from the saved shelve database.
+        Note, that the database exists only if inputs and / or outputs are provided.
     execute_kwargs : dict, optional
         Parameters of `:class:ExecutePreprocessor`.
         For example, you can provide timeout, kernel_name, resources (such as metadata)
@@ -205,9 +219,6 @@ def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_p
         Whether to add a cell with execution information at the beginning of the saved notebook.
     hide_code_cells : bool, optional
         Whether to hide the code cells in the saved notebook.
-    mask_extra_code : bool, optional
-        Whether to mask database reading and dumping code.
-        For more, see :func:`~.mask_inputs_reading` and :func`~.mask_outputs_dumping` docstrings.
     display_links : bool, optional
         Whether to display links to the executed notebook and html at execution.
     raise_exception : bool, optional
@@ -221,7 +232,7 @@ def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_p
     -------
     exec_res : dict
         Dictionary with the notebook execution results.
-        It provides next information:
+        It provides the following information:
         - 'failed' : bool
            Whether the notebook execution failed.
         - 'outputs' : dict
@@ -276,17 +287,14 @@ def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_p
     with open(path, encoding='utf-8') as file:
         notebook = nbformat.read(file, as_version=4)
 
-    if hide_code_cells:
-        notebook["metadata"].update({"hide_input": True})
-
     if inputs is not None:
         # Save `inputs` in the shelve database and create a cell in the notebook
         # for parameters extraction
         with shelve.open(out_path_db) as notebook_db:
             notebook_db.update(inputs)
 
         code = CELL_INSERT_COMMENT + DB_CONNECT_CODE_CELL.format(repr(working_dir_out_path_db)) + INPUTS_CODE_CELL
-        if mask_extra_code:
+        if display_inputs:
             code += INPUTS_DISPLAY
 
         if replace_inputs_pos:
@@ -303,7 +311,7 @@ def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_p
                (DB_CONNECT_CODE_CELL.format(repr(working_dir_out_path_db)) if not inputs else "") + \
                OUTPUTS_CODE_CELL.format(outputs)
 
-        if mask_extra_code:
+        if display_outputs:
             code += OUTPUTS_DISPLAY
 
         output_cell = nbformat.v4.new_code_cell(code)
@@ -367,14 +375,18 @@ def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_p
             timestamp_cell = nbformat.v4.new_markdown_cell(timestamp)
             notebook['cells'].insert(0, timestamp_cell)
 
-        if mask_extra_code:
-            if inputs is not None:
-                pos = inputs_pos + 1 if add_timestamp else inputs_pos
-                mask_inputs_reading(notebook=notebook, pos=pos)
+        if display_inputs and inputs is not None:
+            pos = inputs_pos + 1 if add_timestamp else inputs_pos
+            _display_inputs_reading(notebook=notebook, pos=pos)
 
-            if outputs is not None:
-                pos = len(notebook['cells']) - 1
-                mask_outputs_dumping(notebook=notebook, pos=pos)
+        if display_outputs and outputs is not None:
+            pos = len(notebook['cells']) - 1
+            _display_outputs_dumping(notebook=notebook, pos=pos)
+
+        if hide_code_cells:
+            for cell in notebook['cells']:
+                if cell['cell_type'] == 'code':
+                    cell["metadata"].update({"jupyter": {"source_hidden": "true"}})
 
         # Save the executed notebook/HTML to disk
         if out_path_ipynb is not None:
@@ -388,9 +400,9 @@ def run_notebook(path, inputs=None, outputs=None, inputs_pos=1, replace_inputs_p
         _output_queue.put(exec_res) # return for parent process
     return None
 
-# Mask functions for database operations cells
-def mask_inputs_reading(notebook, pos):
-    """ Replace database reading by variables initialization.
+# Functions for database operations cells masking
+def _display_inputs_reading(notebook, pos):
+    """ Replace database reading code by variables initialization mask.
 
     Result is a code cell with the following view:
     .. code-block:: python
@@ -408,8 +420,8 @@ def mask_inputs_reading(notebook, pos):
     cell_mask = nbformat.v4.new_code_cell(source=code_mask, execution_count=execution_count)
     notebook['cells'][pos] = cell_mask
 
-def mask_outputs_dumping(notebook, pos):
-    """Replace database dumping by printing outputs.
+def _display_outputs_dumping(notebook, pos):
+    """ Replace database dumping code by printing outputs.
 
     Result is a code cell with the following view and corresponding output:
     .. code-block:: python
@@ -436,7 +448,9 @@ def mask_outputs_dumping(notebook, pos):
             code_mask += f'print({variable_name})\n'
             text_mask += variable_value
 
-    outputs_mask =  [nbformat.v4.new_output(text=text_mask, name='stdout', output_type='stream')]
+    code_mask = code_mask[:-1] # remove extra '\n'
+
+    outputs_mask =  [nbformat.v4.new_output(text=text_mask[1:], name='stdout', output_type='stream')]
 
     cell_mask = nbformat.v4.new_code_cell(source=code_mask, execution_count=execution_count, outputs=outputs_mask)
     notebook['cells'][pos] = cell_mask

diff --git a/nbtools/nbstat/resource_entry.py b/nbtools/nbstat/resource_entry.py
@@ -62,7 +62,7 @@ def to_format_data(self, resource, terminal, **kwargs):
             if data is not None:
                 if 'zombie' in data or 'containerd' in data:
                     style = terminal.red
-                if data == 'run_notebook':
+                if data == 'exec_notebook':
                     style = terminal.green
 
         elif resource == Resource.CREATE_TIME:

diff --git a/nbtools/nbstat/resource_inspector.py b/nbtools/nbstat/resource_inspector.py
@@ -13,7 +13,7 @@
 from .resource import Resource
 from .resource_table import ResourceTable
 from .utils import format_memory, pid_to_name, pid_to_ngid, FiniteList, true_len, true_rjust, true_center
-from ..run_notebook import get_run_notebook_name
+from ..exec_notebook import get_exec_notebook_name
 
 
 
@@ -253,7 +253,7 @@ def get_process_table(self, formatter=None):
                     kernel_id = KERNEL_ID_SEARCHER(cmdline)
                     vscode_key = VSCODE_KEY_SEARCHER(cmdline)
                     script_name = SCRIPT_NAME_SEARCHER(cmdline)
-                    run_notebook_path = RUN_NOTEBOOK_PATH_SEARCHER(cmdline)
+                    exec_notebook_path = RUN_NOTEBOOK_PATH_SEARCHER(cmdline)
 
                     if kernel_id:
                         # The name will be changed by data from `notebook_table`.
@@ -267,9 +267,9 @@ def get_process_table(self, formatter=None):
                         type_ = 'vscode'
                         name = vscode_key.group(1).split('-')[0] + '.ipynb'
                         path = kernel_id = vscode_key.group(1)
-                    elif run_notebook_path:
-                        type_ = 'run_notebook'
-                        name = get_run_notebook_name(process.ppid())
+                    elif exec_notebook_path:
+                        type_ = 'exec_notebook'
+                        name = get_exec_notebook_name(process.ppid())
                         path = os.path.join(cwd, name)
                         kernel_id = None
                     elif script_name:
@@ -285,8 +285,8 @@ def get_process_table(self, formatter=None):
 
                     # PYTHON_PPID = PPID if parent is Python process else -1
                     ppid = process.ppid()
-                    if type_ == 'run_notebook':
-                        # Spawned by `run_notebook` function of the library
+                    if type_ == 'exec_notebook':
+                        # Spawned by `exec_notebook` function of the library
                         python_ppid = ppid
                     elif ppid in python_pids:
                         # Spawned by one of other Python processes