
Commit

Merge branch 'alexander-pv-feature/outcomes'
jeongyoonlee committed Jul 8, 2023
2 parents 60cc631 + 3485182 commit a8efeeb
Showing 22 changed files with 197 additions and 141 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/python-test.yaml
@@ -25,9 +25,7 @@ jobs:
run: |
pip install -U pip
pip install -U setuptools
pip install -r requirements.txt
pip install -r requirements-test.txt
python -m pip install .[test]
python setup.py build_ext --inplace
python setup.py install
- name: Test with pytest
run: pytest -vs tests/ --cov causalml/
3 changes: 2 additions & 1 deletion Makefile
@@ -22,4 +22,5 @@ test: build_ext
.PHONY: clean
clean:
python setup.py clean --all
rm -rf ./build ./dist ./causalml.egg-info
rm -rf ./build ./dist ./eggs ./causalml.egg-info
find ./causalml -type f \( -name "*.so" -o -name "*.c" -o -name "*.html" \) -delete
28 changes: 12 additions & 16 deletions README.md
@@ -52,7 +52,7 @@ The package currently supports the following methods

# Installation

Installation with `conda` is recommended. `conda` environment files for Python 3.6, 3.7, 3.8 and 3.9 are available in the repository. To use models under the `inference.tf` module (e.g. `DragonNet`), additional dependency of `tensorflow` is required. For detailed instructions, see below.
Installation with `conda` is recommended. `conda` environment files for Python 3.7, 3.8 and 3.9 are available in the repository. To use models under the `inference.tf` module (e.g. `DragonNet`), additional dependency of `tensorflow` is required. For detailed instructions, see below.

## Install using `conda`:
### Install from `conda-forge`
@@ -65,7 +65,7 @@ $ conda install -c conda-forge causalml
### Install with the `conda` virtual environment
This will create a new `conda` virtual environment named `causalml-[tf-]py3x`, where `x` is in `[6, 7, 8, 9]`. e.g. `causalml-py37` or `causalml-tf-py38`. If you want to change the name of the environment, update the relevant YAML file in `envs/`

```
```bash
$ git clone https://github.com/uber/causalml.git
$ cd causalml/envs/
$ conda env create -f environment-py38.yml # for the virtual environment with Python 3.8 and CausalML
@@ -74,40 +74,36 @@ $ conda activate causalml-py38
```

### Install `causalml` with `tensorflow`
```
```bash
$ git clone https://github.com/uber/causalml.git
$ cd causalml/envs/
$ conda env create -f environment-tf-py38.yml # for the virtual environment with Python 3.8 and CausalML
$ conda activate causalml-tf-py38
(causalml-tf-py38) pip install -U numpy # this step is necessary to fix [#338](https://github.com/uber/causalml/issues/338)
```

## Install using `pip`:
## Install from `PyPI`:

```
$ git clone https://github.com/uber/causalml.git
$ cd causalml
$ pip install -r requirements.txt
```bash
$ pip install causalml
```

### Install `causalml` with `tensorflow`
```
$ git clone https://github.com/uber/causalml.git
$ cd causalml
$ pip install -r requirements-tf.txt
```bash
$ pip install causalml[tf]
$ pip install -U numpy # this step is necessary to fix [#338](https://github.com/uber/causalml/issues/338)
```

## Install from source:

```
```bash
$ git clone https://github.com/uber/causalml.git
$ cd causalml
$ pip install -r requirements.txt
$ python setup.py build_ext --inplace
$ python setup.py install
$ pip install .
```
with `tensorflow`:
```bash
pip install .[tf]
```
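A minimal import check, assuming one of the installs above succeeded; `CausalTreeRegressor` lives in the `causalml.inference.tree` module touched later in this commit:

```python
# Quick sanity check that the package and its tree estimators import cleanly.
import causalml
from causalml.inference.tree import CausalTreeRegressor

print(causalml.__all__)  # sub-modules exposed by the package
```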


2 changes: 0 additions & 2 deletions causalml/__init__.py
@@ -1,5 +1,3 @@
name = "causalml"
__version__ = "0.13.0"
__all__ = [
"dataset",
"features",
6 changes: 3 additions & 3 deletions causalml/inference/iv/drivlearner.py
@@ -749,7 +749,7 @@ def plot_shap_values(
model_tau_feature=None,
features=None,
shap_dict=None,
**kwargs
**kwargs,
):
"""
Plots distribution of shapley values.
@@ -788,7 +788,7 @@ def plot_shap_dependence(
features=None,
shap_dict=None,
interaction_idx="auto",
**kwargs
**kwargs,
):
"""
Plots dependency of shapley values for a specified feature, colored by an interaction feature.
@@ -831,7 +831,7 @@ def plot_shap_dependence(
feature_idx=feature_idx,
shap_dict=shap_dict,
interaction_idx=interaction_idx,
**kwargs
**kwargs,
)


6 changes: 3 additions & 3 deletions causalml/inference/meta/base.py
@@ -249,7 +249,7 @@ def plot_shap_values(
model_tau_feature=None,
features=None,
shap_dict=None,
**kwargs
**kwargs,
):
"""
Plots distribution of shapley values.
@@ -288,7 +288,7 @@ def plot_shap_dependence(
features=None,
shap_dict=None,
interaction_idx="auto",
**kwargs
**kwargs,
):
"""
Plots dependency of shapley values for a specified feature, colored by an interaction feature.
@@ -331,5 +331,5 @@ def plot_shap_dependence(
feature_idx=feature_idx,
shap_dict=shap_dict,
interaction_idx=interaction_idx,
**kwargs
**kwargs,
)
4 changes: 2 additions & 2 deletions causalml/inference/meta/explainer.py
@@ -238,7 +238,7 @@ def plot_shap_dependence(
feature_idx,
shap_dict=None,
interaction_idx="auto",
**kwargs
**kwargs,
):
"""
Plots dependency of shapley values for a specified feature, colored by an interaction feature.
@@ -269,5 +269,5 @@ def plot_shap_dependence(
self.X,
interaction_index=interaction_idx,
feature_names=self.features,
**kwargs
**kwargs,
)
7 changes: 3 additions & 4 deletions causalml/inference/tree/causal/_criterion.pyx
@@ -205,10 +205,9 @@ cdef class CausalRegressionCriterion(RegressionCriterion):
return 0

cdef void node_value(self, double * dest) nogil:
"""Compute the node value of samples[start:end] into dest."""
# Save the average of treatment effects within a node as a value for the node
dest[0] = self.state.node.tr_y_sum / self.state.node.tr_count - \
self.state.node.ct_y_sum / self.state.node.ct_count
"""Compute the node values of samples[start:end] into dest."""
dest[0] = self.state.node.ct_y_sum / self.state.node.ct_count
dest[1] = self.state.node.tr_y_sum / self.state.node.tr_count

cdef double get_groups_penalty(self, double tr_count, double ct_count) nogil:
"""Compute penalty for the sample size difference between groups"""
3 changes: 2 additions & 1 deletion causalml/inference/tree/causal/_tree.py
@@ -74,7 +74,8 @@ def fit(
# [:, np.newaxis] that does not.
y = np.reshape(y, (-1, 1))

self.n_outputs_ = y.shape[1]
# For memory allocation to store control, treatment outcomes
self.n_outputs_ = np.unique(sample_weight).astype(int).size

if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)
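Since `sample_weight` carries the treatment assignment here (see how `fit` passes `w` as `sample_weight` in `causalforest.py` below), a binary treatment yields two value slots per node; a tiny illustration with a hypothetical assignment vector:

```python
import numpy as np

w = np.array([0, 1, 1, 0, 1])              # hypothetical treatment assignment used as sample_weight
n_outputs = np.unique(w).astype(int).size  # unique groups: control (0) and treatment (1)
print(n_outputs)                           # 2
```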
24 changes: 22 additions & 2 deletions causalml/inference/tree/causal/causalforest.py
@@ -249,8 +249,8 @@ def _fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None):
"is necessary for Poisson regression."
)

self.n_outputs_ = y.shape[1]

self.n_outputs_ = np.unique(sample_weight).astype(int).size + 1
self.max_outputs_ = self.n_outputs_
y, expanded_class_weight = self._validate_y_class_weight(y)

if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
@@ -380,6 +380,26 @@ def fit(self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray):
X, y, w = self.base_estimator._prepare_data(X=X, y=y, treatment=treatment)
return self._fit(X=X, y=y, sample_weight=w)

def predict(self, X: np.ndarray, with_outcomes: bool = False) -> np.ndarray:
"""Predict individual treatment effects
Args:
X (np.matrix): a feature matrix
with_outcomes (bool), default=False,
include outcomes Y_hat(X|T=0), Y_hat(X|T=1) along with individual treatment effect
Returns:
(np.matrix): individual treatment effect (ITE), dim=nx1
or ITE with outcomes [Y_hat(X|T=0), Y_hat(X|T=1), ITE], dim=nx3
"""
if with_outcomes:
self.n_outputs_ = self.max_outputs_
for estimator in self.estimators_:
estimator._with_outcomes = True
else:
self.n_outputs_ = 1
y_pred = super().predict(X)
return y_pred
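A usage sketch of the new flag; `crf` stands in for an already fitted `CausalRandomForestRegressor` and `X_test` for a feature matrix, neither of which appears in this diff:

```python
# Assumes `crf` is a fitted causal forest and `X_test` is a feature matrix.
ite = crf.predict(X_test)                        # individual treatment effects only, dim = n x 1
full = crf.predict(X_test, with_outcomes=True)   # [Y_hat(X|T=0), Y_hat(X|T=1), ITE], dim = n x 3
y0_hat, y1_hat, ite = full[:, 0], full[:, 1], full[:, 2]
```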

def calculate_error(
self,
X_train: np.ndarray,
28 changes: 26 additions & 2 deletions causalml/inference/tree/causal/causaltree.py
@@ -132,6 +132,7 @@ def __init__(
self._classes = {}
self.groups_cnt = groups_cnt
self.groups_cnt_mode = groups_cnt_mode
self._with_outcomes = False
self._groups_cnt = {}

super().__init__(
@@ -190,6 +191,30 @@ def fit(
self._groups_cnt = self._count_groups_distribution(X=X, treatment=w)
return self

def predict(
self, X: np.ndarray, with_outcomes: bool = False, check_input=True
) -> np.ndarray:
"""Predict individual treatment effects
Args:
X (np.matrix): a feature matrix
with_outcomes (bool), default=False,
include outcomes Y_hat(X|T=0), Y_hat(X|T=1) along with individual treatment effect
check_input (bool), default=True,
Allow to bypass several input checking.
Returns:
(np.matrix): individual treatment effect (ITE), dim=nx1
or ITE with outcomes [Y_hat(X|T=0), Y_hat(X|T=1), ITE], dim=nx3
"""
if check_input:
X = self._validate_X_predict(X, check_input)
y_outcomes = super().predict(X)
y_pred = y_outcomes[:, 1] - y_outcomes[:, 0]
need_outcomes = with_outcomes or self._with_outcomes
return (
np.hstack([y_outcomes, y_pred.reshape(-1, 1)]) if need_outcomes else y_pred
)
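Given the two-column node values introduced in `_criterion.pyx` above, the stacking step follows the column order in the docstring; a short NumPy sketch with illustrative numbers:

```python
import numpy as np

# super().predict(X) now yields one row per sample: [Y_hat(X|T=0), Y_hat(X|T=1)].
y_outcomes = np.array([[0.50, 0.65],
                       [0.40, 0.70]])            # illustrative values only
y_pred = y_outcomes[:, 1] - y_outcomes[:, 0]     # ITE per sample
stacked = np.hstack([y_outcomes, y_pred.reshape(-1, 1)])
# stacked -> [[0.50, 0.65, 0.15],
#             [0.40, 0.70, 0.30]]
```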

def fit_predict(
self,
X: np.ndarray,
@@ -295,8 +320,7 @@ def _bootstrap(i: int):
)

pool = PPool(nodes=n_jobs)
if "pytest" in sys.modules:
pool.restart(force=True)
pool.restart(force=True)

bootstrap_estimates = np.array(
list(
14 changes: 10 additions & 4 deletions causalml/inference/tree/plot.py
@@ -9,13 +9,12 @@
import numpy as np
import pydotplus
import seaborn as sns

from sklearn.tree import _tree
from sklearn.tree._export import _MPLTreeExporter, _color_brew
from sklearn.utils.validation import check_is_fitted

from .utils import get_tree_leaves_mask
from . import CausalTreeRegressor
from .utils import get_tree_leaves_mask


def uplift_tree_string(decisionTree, x_names):
@@ -309,9 +308,16 @@ def plot_dist_tree_leaves_values(
"""
tree_leaves_mask = get_tree_leaves_mask(tree)
leaves_values = tree.tree_.value.reshape(-1)[tree_leaves_mask]
leaves_values = tree.tree_.value
treatment_effects = leaves_values[:, 1] - leaves_values[:, 0]
treatment_effects = treatment_effects.reshape(
-1,
)[tree_leaves_mask]
fig, ax = plt.subplots(figsize=figsize)
sns.distplot(leaves_values, ax=ax)
sns.distplot(
treatment_effects,
ax=ax,
)
plt.title(title, fontsize=fontsize)
plt.show()

1 change: 1 addition & 0 deletions causalml/inference/tree/uplift.pyx
@@ -1,6 +1,7 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: language_level=3
"""
Forest of trees-based ensemble methods for Uplift modeling on Classification
Problem. Those methods include random forests and extremely randomized trees.
4 changes: 2 additions & 2 deletions causalml/metrics/visualize.py
@@ -782,7 +782,7 @@ def auuc_score(
normalize=True,
tmle=False,
*args,
**kwarg
**kwarg,
):
"""Calculate the AUUC (Area Under the Uplift Curve) score.
@@ -816,7 +816,7 @@ def qini_score(
normalize=True,
tmle=False,
*args,
**kwarg
**kwarg,
):
"""Calculate the Qini score: the area between the Qini curves of a model and random.
2 changes: 1 addition & 1 deletion causalml/optimize/value_optimization.py
@@ -55,7 +55,7 @@ def __init__(
conversion_cost,
impression_cost,
*args,
**kwargs
**kwargs,
):
self.treatment = treatment
self.control_name = control_name
2 changes: 1 addition & 1 deletion causalml/propensity.py
@@ -169,7 +169,7 @@ def predict(self, X):
self.model.predict_proba(X, ntree_limit=self.model.best_ntree_limit)[
:, 1
],
*self.clip_bounds
*self.clip_bounds,
)
else:
return super(GradientBoostedPropensityModel, self).predict(X)
(Diff for the remaining changed files not shown.)
