Merge pull request #21 from big-o/develop

v0.0.5
scikit-learn-contrib · Aug 13, 2022 · 7191a7d · 7191a7d
2 parents d65705f + 7df1a10
commit 7191a7d
Show file tree

Hide file tree

Showing 14 changed files with 224 additions and 117 deletions.
diff --git a/.coveragerc b/.coveragerc
diff --git a/doc/_static/img/cover.png b/doc/_static/img/cover.png
diff --git a/doc/_static/img/dag2.png b/doc/_static/img/dag2.png
diff --git a/doc/_static/img/dag2a.png b/doc/_static/img/dag2a.png
diff --git a/doc/_static/img/dag3.png b/doc/_static/img/dag3.png
diff --git a/doc/_static/img/dag3a.png b/doc/_static/img/dag3a.png
diff --git a/doc/quick_start.rst b/doc/quick_start.rst
@@ -26,23 +26,26 @@ The simplest DAGs are just a chain of singular dependencies. These DAGs may be
 created from the :meth:`skdag.dag.DAG.from_pipeline` method in the same way as a
 DAG:
 
->>> from sklearn.decomposition import PCA
->>> from sklearn.impute import SimpleImputer
->>> from sklearn.linear_model import LogisticRegression
->>> dag = DAG.from_pipeline(
-...     steps=[
-...         ("impute", SimpleImputer()),
-...         ("pca", PCA()),
-...         ("lr", LogisticRegression())
-...     ]
-... )
->>> dag.draw()
-o    impute
-|
-o    pca
-|
-o    lr
-<BLANKLINE>
+.. code-block:: python
+
+    >>> from skdag import DAGBuilder
+    >>> from sklearn.decomposition import PCA
+    >>> from sklearn.impute import SimpleImputer
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> dag = DAGBuilder().from_pipeline(
+    ...     steps=[
+    ...         ("impute", SimpleImputer()),
+    ...         ("pca", PCA()),
+    ...         ("lr", LogisticRegression())
+    ...     ]
+    ... ).make_dag()
+    >>> dag.show()
+    o    impute
+    |
+    o    pca
+    |
+    o    lr
+    <BLANKLINE>
 
 .. image:: _static/img/dag1.png
 
@@ -52,7 +55,6 @@ estimator:
 
 .. code-block:: python
 
-    >>> from skdag import DAGBuilder
     >>> dag = (
     ...     DAGBuilder(infer_dataframe=True)
     ...     .add_step("impute", SimpleImputer())
@@ -61,15 +63,15 @@ estimator:
     ...     .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
     ...     .make_dag()
     ... )
-    >>> dag.draw()
+    >>> dag.show()
     o    impute
     |\
     o o    blood,vitals
     |/
     o    lr
     <BLANKLINE>
 
-.. image:: _static/img/dag2.png
+.. image:: _static/img/dag2a.png
 
 In the above examples we pass the first four columns directly to a regressor, but
 the remaining columns have dimensionality reduction applied first before being
@@ -82,36 +84,36 @@ on how to control this behaviour, see the `User Guide <user_guide.html>`_.
 The DAG may now be used as an estimator in its own right:
 
 >>> from sklearn import datasets
->>> X, y = datasets.load_diabetes(return_X_y=True)
->>> dag.fit_predict(X, y)
-array([...
+>>> X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+>>> type(dag.fit_predict(X, y))
+<class 'pandas.core.series.Series'>
 
 In an extension to the scikit-learn estimator interface, DAGs also support multiple
 inputs and multiple outputs. Let's say we want to compare two different classifiers:
 
 >>> from sklearn.ensemble import RandomForestClassifier
->>> cal = DAG.from_pipeline(
+>>> cal = DAGBuilder(infer_dataframe=True).from_pipeline(
 ...     [("rf", RandomForestClassifier(random_state=0))]
-... )
+... ).make_dag()
 >>> dag2 = dag.join(cal, edges=[("blood", "rf"), ("vitals", "rf")])
->>> dag2.draw()
+>>> dag2.show()
 o    impute
 |\
 o o    blood,vitals
 |x|
 o o    lr,rf
 <BLANKLINE>
 
-.. image:: _static/img/dag3.png
+.. image:: _static/img/dag3a.png
 
 Now our DAG will return two outputs: one from each classifier. Multiple outputs are
 returned as a :class:`Bunch <sklearn.utils.Bunch>`:
 
 >>> y_pred = dag2.fit_predict(X, y)
->>> y_pred.lr
-array([...
->>> y_pred.rf
-array([...
+>>> type(y_pred.lr)
+<class 'pandas.core.series.Series'>
+>>> type(y_pred.rf)
+<class 'pandas.core.series.Series'>
 
 Similarly, multiple inputs are also acceptable and inputs can be provided by
 specifying ``X`` and ``y`` as ``dict``-like objects.
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
@@ -18,17 +18,17 @@ scikit-learn :class:`~sklearn.pipeline.Pipeline`. These DAGs may be created from
 
 .. code-block:: python
 
+    >>> from skdag import DAGBuilder
     >>> from sklearn.decomposition import PCA
     >>> from sklearn.impute import SimpleImputer
     >>> from sklearn.linear_model import LogisticRegression
-    >>> dag = DAG.from_pipeline(
+    >>> dag = DAGBuilder(infer_dataframe=True).from_pipeline(
     ...     steps=[
     ...         ("impute", SimpleImputer()),
     ...         ("pca", PCA()),
     ...         ("lr", LogisticRegression())
-    ...     ],
-    ...     infer_dataframe=True,
-    ... )
+    ...     ]
+    ... ).make_dag()
 
 You may view a diagram of the DAG with the :meth:`~skdag.dag.DAG.show` method. In a
 notebook environment this will display an image, whereas in a terminal it will generate
@@ -97,19 +97,20 @@ The DAG may now be used as an estimator in its own right:
 .. code-block:: python
 
     >>> from sklearn import datasets
-    >>> X, y = datasets.load_diabetes(return_X_y=True)
-    >>> dag.fit_predict(X, y)
-    array([...
+    >>> X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+    >>> y_hat = dag.fit_predict(X, y)
+    >>> type(y_hat)
+    <class 'pandas.core.series.Series'>
 
 In an extension to the scikit-learn estimator interface, DAGs also support multiple
 inputs and multiple outputs. Let's say we want to compare two different classifiers:
 
 .. code-block:: python
 
     >>> from sklearn.ensemble import RandomForestClassifier
-    >>> rf = DAG.from_pipeline(
+    >>> rf = DAGBuilder().from_pipeline(
     ...     [("rf", RandomForestClassifier(random_state=0))]
-    ... )
+    ... ).make_dag()
     >>> dag2 = dag.join(rf, edges=[("blood", "rf"), ("vitals", "rf")])
     >>> dag2.show()
     o    impute
@@ -126,10 +127,14 @@ returned as a :class:`sklearn.utils.Bunch<Bunch>`:
 .. code-block:: python
 
     >>> y_pred = dag2.fit_predict(X, y)
-    >>> y_pred.lr
-    array([...
-    >>> y_pred.rf
-    array([...
+    >>> type(y_pred.lr)
+    <class 'pandas.core.series.Series'>
+    >>> type(y_pred.rf)
+    <class 'numpy.ndarray'>
+
+Note that we have different types of output here because ``LogisticRegression`` natively
+supports dataframe input whereas ``RandomForestClassifier`` does not. We could fix this
+by specifying ``infer_dataframe=True`` when we created our ``rf`` DAG extension.
 
 Similarly, multiple inputs are also acceptable and inputs can be provided by
 specifying ``X`` and ``y`` as ``dict``-like objects.
@@ -174,6 +179,7 @@ the next step(s).
     ...     .make_dag()
     ... )
     >>> stack.fit(X_train, y_train)
+    DAG(...
 
 .. image:: _static/img/stack.png
 
@@ -210,7 +216,7 @@ as a dictionary of step name to column indices instead:
     ...     .add_step("pass", "passthrough")
     ...     .add_step("rf", RandomForestClassifier(), deps=["pass"])
     ...     .add_step("svr", SVC(), deps=["pass"])
-    ...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svc": 1}])
+    ...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svr": 1})
     ...     .make_dag()
     ... )
 

diff --git a/setup.cfg b/setup.cfg
@@ -5,11 +5,38 @@ description-file = README.rst
 test = pytest
 
 [tool:pytest]
+doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS
+testpaths = .
 addopts =
     -s
     --doctest-modules
+    --doctest-glob="*.rst"
     --cov=skdag
     --ignore setup.py
     --ignore doc/_build
     --ignore doc/_templates
     --no-cov-on-fail
+
+[coverage:run]
+branch = True
+source = skdag
+include = */skdag/*
+omit =
+    */tests/*
+    *_test.py
+    test_*.py
+    */setup.py
+
+[coverage:report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    if self.verbose:
+show_missing = True
+
diff --git a/skdag/_version.py b/skdag/_version.py
@@ -1 +1 @@
-__version__ = "0.0.4"
+__version__ = "0.0.5"
diff --git a/skdag/dag/_dag.py b/skdag/dag/_dag.py
@@ -7,6 +7,7 @@
 from copy import deepcopy
 from inspect import signature
 from itertools import chain
+from typing import Iterable
 
 import networkx as nx
 import numpy as np
@@ -32,12 +33,34 @@
 __all__ = ["DAG", "DAGStep"]
 
 
+def _get_columns(X, dep, cols, is_root, axis=1):
+    if callable(cols):
+        # sklearn.compose.make_column_selector
+        cols = cols(X)
+
+    if not is_root:
+        # The DAG will prepend output columns with the step name, so add this in to any
+        # dep columns if missing. This helps keep user-provided deps readable.
+        if isinstance(cols, str):
+            cols = cols if cols.startswith(f"{dep}__") else f"{dep}__{cols}"
+        elif isinstance(cols, Iterable):
+            orig = cols
+            cols = []
+            for col in orig:
+                if isinstance(col, str):
+                    cols.append(col if col.startswith(f"{dep}__") else f"{dep}__{col}")
+                else:
+                    cols.append(col)
+
+    return _safe_indexing(X, cols, axis=axis)
+
+
 def _stack_inputs(dag, X, node):
     # For root nodes, the dependency is just the node name itself.
     deps = {node.name: None} if node.is_root else node.deps
 
     cols = [
-        X[dep][cols(X[dep])] if callable(cols) else _safe_indexing(X[dep], cols, axis=1)
+        _get_columns(X[dep], dep, cols, node.is_root, axis=1)
         for dep, cols in deps.items()
     ]
 
@@ -204,7 +227,7 @@ def _parallel_transform(dag, step, Xin, Xs, transform_fn, **fn_params):
     clsname = type(dag).__name__
     with _print_elapsed_time(clsname, dag._log_message(step)):
         if transformer is None or transformer == "passthrough":
-                Xt = X
+            Xt = X
         else:
             # Fit or load from cache the current transformer
             Xt = transform_fn(

diff --git a/skdag/dag/_render.py b/skdag/dag/_render.py
@@ -1,5 +1,8 @@
-import black
 import html
+from typing import Iterable
+
+import black
+from matplotlib.pyplot import isinteractive
 import networkx as nx
 import stackeddag.core as sd
 from skdag.dag._utils import _is_passthrough
@@ -52,7 +55,7 @@ def to_agraph(self, detailed):
 
         try:
             A = nx.nx_agraph.to_agraph(G)
-        except (ImportError, ModuleNotFoundError) as err:
+        except (ImportError, ModuleNotFoundError) as err:  # pragma: no cover
             raise ImportError(
                 "DAG visualisation requires pygraphviz to be installed. "
                 "See http://pygraphviz.github.io/ for guidance."
@@ -115,6 +118,36 @@ def to_agraph(self, detailed):
                         if key.startswith("edge__")
                     }
                 )
+            cols = G.nodes[v]["step"].deps[u]
+            if cols:
+                if isinstance(cols, Iterable):
+                    cols = list(cols)
+
+                    if len(cols) > 5:
+                        colrepr = f"[{repr(cols[0])}, ..., {repr(cols[-1])}]"
+                    else:
+                        colrepr = repr(cols)
+                elif callable(cols):
+                    selector = cols
+                    cols = {}
+                    for attr in ["pattern", "dtype_include", "dtype_exclude"]:
+                        if hasattr(selector, attr):
+                            val = getattr(selector, attr)
+                            if val is not None:
+                                cols[attr] = val
+                    if cols:
+                        selrepr = ", ".join(
+                            f"{key}={repr(val)}" for key, val in cols.items()
+                        )
+                        colrepr = f"column_selector({selrepr})"
+                    else:
+                        colrepr = f"{selector.__name__}()"
+                else:
+                    colrepr = repr(cols)
+
+                aedge.attr.update(
+                    {"label": colrepr, "fontsize": "8pt", "fontname": "SANS"}
+                )
 
         A.layout()
         return A