diff --git a/.github/workflows/python-test.yaml b/.github/workflows/python-test.yaml index 224ef3d1..d207682e 100644 --- a/.github/workflows/python-test.yaml +++ b/.github/workflows/python-test.yaml @@ -25,9 +25,7 @@ jobs: run: | pip install -U pip pip install -U setuptools - pip install -r requirements.txt - pip install -r requirements-test.txt + python -m pip install .[test] python setup.py build_ext --inplace - python setup.py install - name: Test with pytest run: pytest -vs tests/ --cov causalml/ diff --git a/Makefile b/Makefile index 7fe0dbf7..70258628 100644 --- a/Makefile +++ b/Makefile @@ -22,4 +22,5 @@ test: build_ext .PHONY: clean clean: python setup.py clean --all - rm -rf ./build ./dist ./causalml.egg-info \ No newline at end of file + rm -rf ./build ./dist ./eggs ./causalml.egg-info + find ./causalml -type f \( -name "*.so" -o -name "*.c" -o -name "*.html" \) -delete diff --git a/README.md b/README.md index 13d4a5a7..6e2ba0de 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ The package currently supports the following methods # Installation -Installation with `conda` is recommended. `conda` environment files for Python 3.6, 3.7, 3.8 and 3.9 are available in the repository. To use models under the `inference.tf` module (e.g. `DragonNet`), additional dependency of `tensorflow` is required. For detailed instructions, see below. +Installation with `conda` is recommended. `conda` environment files for Python 3.7, 3.8 and 3.9 are available in the repository. To use models under the `inference.tf` module (e.g. `DragonNet`), additional dependency of `tensorflow` is required. For detailed instructions, see below. ## Install using `conda`: ### Install from `conda-forge` @@ -65,7 +65,7 @@ $ conda install -c conda-forge causalml ### Install with the `conda` virtual environment This will create a new `conda` virtual environment named `causalml-[tf-]py3x`, where `x` is in `[6, 7, 8, 9]`. e.g. `causalml-py37` or `causalml-tf-py38`. 
If you want to change the name of the environment, update the relevant YAML file in `envs/` -``` +```bash $ git clone https://github.com/uber/causalml.git $ cd causalml/envs/ $ conda env create -f environment-py38.yml # for the virtual environment with Python 3.8 and CausalML @@ -74,7 +74,7 @@ $ conda activate causalml-py38 ``` ### Install `causalml` with `tensorflow` -``` +```bash $ git clone https://github.com/uber/causalml.git $ cd causalml/envs/ $ conda env create -f environment-tf-py38.yml # for the virtual environment with Python 3.8 and CausalML @@ -82,32 +82,28 @@ $ conda activate causalml-tf-py38 (causalml-tf-py38) pip install -U numpy # this step is necessary to fix [#338](https://github.com/uber/causalml/issues/338) ``` -## Install using `pip`: +## Install from `PyPI`: -``` -$ git clone https://github.com/uber/causalml.git -$ cd causalml -$ pip install -r requirements.txt +```bash $ pip install causalml ``` ### Install `causalml` with `tensorflow` -``` -$ git clone https://github.com/uber/causalml.git -$ cd causalml -$ pip install -r requirements-tf.txt +```bash $ pip install causalml[tf] $ pip install -U numpy # this step is necessary to fix [#338](https://github.com/uber/causalml/issues/338) ``` ## Install from source: -``` +```bash $ git clone https://github.com/uber/causalml.git $ cd causalml -$ pip install -r requirements.txt -$ python setup.py build_ext --inplace -$ python setup.py install +$ pip install . 
+``` +with `tensorflow`: +```bash +pip install .[tf] ``` diff --git a/causalml/__init__.py b/causalml/__init__.py index f1f74177..129cba00 100644 --- a/causalml/__init__.py +++ b/causalml/__init__.py @@ -1,5 +1,3 @@ -name = "causalml" -__version__ = "0.13.0" __all__ = [ "dataset", "features", diff --git a/causalml/inference/iv/drivlearner.py b/causalml/inference/iv/drivlearner.py index 2b6cd962..3f13bdd5 100644 --- a/causalml/inference/iv/drivlearner.py +++ b/causalml/inference/iv/drivlearner.py @@ -749,7 +749,7 @@ def plot_shap_values( model_tau_feature=None, features=None, shap_dict=None, - **kwargs + **kwargs, ): """ Plots distribution of shapley values. @@ -788,7 +788,7 @@ def plot_shap_dependence( features=None, shap_dict=None, interaction_idx="auto", - **kwargs + **kwargs, ): """ Plots dependency of shapley values for a specified feature, colored by an interaction feature. @@ -831,7 +831,7 @@ def plot_shap_dependence( feature_idx=feature_idx, shap_dict=shap_dict, interaction_idx=interaction_idx, - **kwargs + **kwargs, ) diff --git a/causalml/inference/meta/base.py b/causalml/inference/meta/base.py index 507c7ff8..fb2a6660 100644 --- a/causalml/inference/meta/base.py +++ b/causalml/inference/meta/base.py @@ -249,7 +249,7 @@ def plot_shap_values( model_tau_feature=None, features=None, shap_dict=None, - **kwargs + **kwargs, ): """ Plots distribution of shapley values. @@ -288,7 +288,7 @@ def plot_shap_dependence( features=None, shap_dict=None, interaction_idx="auto", - **kwargs + **kwargs, ): """ Plots dependency of shapley values for a specified feature, colored by an interaction feature. 
cdef void node_value(self, double * dest) nogil:
    """Write the per-group mean outcomes of the current node into dest.

    dest[0] receives the control-group mean (ct_y_sum / ct_count) and
    dest[1] receives the treatment-group mean (tr_y_sum / tr_count).
    """
    # NOTE(review): assumes dest has room for at least two doubles — confirm
    # against the n_outputs_ value the tree is built with.
    cdef double control_mean = self.state.node.ct_y_sum / self.state.node.ct_count
    cdef double treatment_mean = self.state.node.tr_y_sum / self.state.node.tr_count
    dest[0] = control_mean
    dest[1] = treatment_mean
def predict(self, X: np.ndarray, with_outcomes: bool = False) -> np.ndarray:
    """Predict individual treatment effects.

    Args:
        X (np.ndarray): a feature matrix
        with_outcomes (bool), default=False,
            include outcomes Y_hat(X|T=0), Y_hat(X|T=1) along with individual treatment effect
    Returns:
        (np.ndarray): individual treatment effect (ITE), shape (n,)
            or ITE with outcomes [Y_hat(X|T=0), Y_hat(X|T=1), ITE], dim=nx3
    """
    # The forest-level output width and the per-tree output mode must always
    # agree, so both are set on every call.
    self.n_outputs_ = self.max_outputs_ if with_outcomes else 1
    for estimator in self.estimators_:
        # Assign the flag in both directions: leaving it True after a
        # with_outcomes=True call would make the trees keep returning three
        # columns while the forest expects a single output.
        estimator._with_outcomes = with_outcomes
    return super().predict(X)
def predict(
    self, X: np.ndarray, with_outcomes: bool = False, check_input: bool = True
) -> np.ndarray:
    """Predict individual treatment effects.

    Args:
        X (np.ndarray): a feature matrix
        with_outcomes (bool), default=False,
            include outcomes Y_hat(X|T=0), Y_hat(X|T=1) along with individual treatment effect
        check_input (bool), default=True,
            Allow to bypass several input checking.
    Returns:
        (np.ndarray): individual treatment effect (ITE), shape (n,)
            or ITE with outcomes [Y_hat(X|T=0), Y_hat(X|T=1), ITE], dim=nx3
    """
    # Forward check_input so validation runs at most once and is really
    # skipped when the caller asks for that.
    # NOTE(review): assumes the parent predict accepts `check_input` like
    # scikit-learn's decision-tree predict — confirm for the base class.
    y_outcomes = super().predict(X, check_input=check_input)

    # Column 0 holds the control outcome, column 1 the treatment outcome.
    y_pred = y_outcomes[:, 1] - y_outcomes[:, 0]

    # `_with_outcomes` lets an owning ensemble request outcomes without
    # passing the keyword through its own prediction machinery.
    if with_outcomes or self._with_outcomes:
        return np.hstack([y_outcomes, y_pred.reshape(-1, 1)])
    return y_pred
import CausalTreeRegressor +from .utils import get_tree_leaves_mask def uplift_tree_string(decisionTree, x_names): @@ -309,9 +308,16 @@ def plot_dist_tree_leaves_values( """ tree_leaves_mask = get_tree_leaves_mask(tree) - leaves_values = tree.tree_.value.reshape(-1)[tree_leaves_mask] + leaves_values = tree.tree_.value + treatment_effects = leaves_values[:, 1] - leaves_values[:, 0] + treatment_effects = treatment_effects.reshape( + -1, + )[tree_leaves_mask] fig, ax = plt.subplots(figsize=figsize) - sns.distplot(leaves_values, ax=ax) + sns.distplot( + treatment_effects, + ax=ax, + ) plt.title(title, fontsize=fontsize) plt.show() diff --git a/causalml/inference/tree/uplift.pyx b/causalml/inference/tree/uplift.pyx index baf8fa9d..6508718e 100644 --- a/causalml/inference/tree/uplift.pyx +++ b/causalml/inference/tree/uplift.pyx @@ -1,6 +1,7 @@ # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False +# cython: language_level=3 """ Forest of trees-based ensemble methods for Uplift modeling on Classification Problem. Those methods include random forests and extremely randomized trees. diff --git a/causalml/metrics/visualize.py b/causalml/metrics/visualize.py index 4c88f563..e6b9b226 100644 --- a/causalml/metrics/visualize.py +++ b/causalml/metrics/visualize.py @@ -782,7 +782,7 @@ def auuc_score( normalize=True, tmle=False, *args, - **kwarg + **kwarg, ): """Calculate the AUUC (Area Under the Uplift Curve) score. @@ -816,7 +816,7 @@ def qini_score( normalize=True, tmle=False, *args, - **kwarg + **kwarg, ): """Calculate the Qini score: the area between the Qini curves of a model and random. 
diff --git a/causalml/optimize/value_optimization.py b/causalml/optimize/value_optimization.py index 72e50634..adba267f 100644 --- a/causalml/optimize/value_optimization.py +++ b/causalml/optimize/value_optimization.py @@ -55,7 +55,7 @@ def __init__( conversion_cost, impression_cost, *args, - **kwargs + **kwargs, ): self.treatment = treatment self.control_name = control_name diff --git a/causalml/propensity.py b/causalml/propensity.py index 9dd7c563..28934c23 100644 --- a/causalml/propensity.py +++ b/causalml/propensity.py @@ -169,7 +169,7 @@ def predict(self, X): self.model.predict_proba(X, ntree_limit=self.model.best_ntree_limit)[ :, 1 ], - *self.clip_bounds + *self.clip_bounds, ) else: return super(GradientBoostedPropensityModel, self).predict(X) diff --git a/pyproject.toml b/pyproject.toml index e2ede155..a4ca7b68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,70 @@ +[project] +name = "causalml" +version = "0.13.0" +description = "Python Package for Uplift Modeling and Causal Inference with Machine Learning Algorithms" +readme = { file = "README.md", content-type = "text/markdown" } + +authors = [ + { "name" = "Huigang Chen" }, + { "name" = "Totte Harinen" }, + { "name" = "Jeong-Yoon Lee" }, + { "name" = "Jing Pan" }, + { "name" = "Mike Yung" }, + { "name" = "Zhenyu Zhao" } +] +maintainers = [ + { name = "Jeong-Yoon Lee" } +] +classifiers = [ + "Programming Language :: Python", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] + +requires-python = ">=3.7" +dependencies = [ + "forestci==0.6", + "pathos==0.2.9", + "pip>=10.0", + "numpy<1.24", + "scipy>=1.4.1", + "matplotlib", + "pandas>=0.24.1,<1.4.0", + "scikit-learn<=1.0.2", + "statsmodels>=0.9.0", + "Cython>=0.28.0", + "seaborn", + "xgboost", + "pydotplus", + "tqdm", + "shap", + "dill", + "lightgbm", + "pygam", + "packaging", + "torch", + "pyro-ppl", + "graphviz", +] + +[project.optional-dependencies] +test = [ + "pytest>=4.6", + "pytest-cov>=4.0" 
+] +tf = [ + "tensorflow>=2.4.0" +] + + [build-system] requires = [ - "setuptools>=18.0", - "wheel", - "cython", - "numpy", - "scikit-learn<=1.0.2", + "setuptools>=18.0", + "wheel", + "Cython>=0.28.0", + "numpy<1.24", + "scikit-learn<=1.0.2", ] + +[project.urls] +homepage = "https://github.com/uber/causalml" \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index c58d200d..00000000 --- a/requirements-test.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytest>=4.6 -pytest-cov diff --git a/requirements-tf.txt b/requirements-tf.txt deleted file mode 100644 index b184f2c5..00000000 --- a/requirements-tf.txt +++ /dev/null @@ -1,22 +0,0 @@ -setuptools>=41.0.0 -pip>=10.0 -numpy>=1.20.0 -scipy==1.10.0 -matplotlib -pandas>=0.24.1 -scikit-learn>=0.22.0 -statsmodels>=0.9.0 -seaborn -Cython>=0.28.0 -xgboost -pydotplus -tqdm -shap -dill -lightgbm -pygam -packaging -torch -pyro-ppl -graphviz -tensorflow>=2.4.0 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 64fbf5cb..00000000 --- a/requirements.txt +++ /dev/null @@ -1,23 +0,0 @@ -setuptools>=41.0.0 -forestci==0.6 -pathos==0.2.9 -pip>=10.0 -numpy>=1.18.5 -scipy>=1.4.1 -matplotlib -pandas>=0.24.1 -scikit-learn<=1.0.2 -statsmodels>=0.9.0 -seaborn -Cython>=0.28.0 -xgboost -pydotplus -tqdm -shap -dill -lightgbm -pygam -packaging -torch -pyro-ppl -graphviz diff --git a/setup.py b/setup.py index 0d06498b..cecacea5 100644 --- a/setup.py +++ b/setup.py @@ -15,17 +15,6 @@ dist.Distribution().fetch_build_eggs(["numpy"]) from numpy import get_include as np_get_include -import causalml - -with open("README.md", "r", encoding="utf-8") as f: - long_description = f.read() - -with open("requirements.txt") as f: - requirements = f.readlines() - -with open("requirements-test.txt") as f: - requirements_test = f.readlines() - extensions = [ Extension( "causalml.inference.tree.causal._criterion", @@ -53,31 +42,7 @@ packages = find_packages(exclude=["tests", "tests.*"]) 
setup( - name="causalml", - version=causalml.__version__, - author="Huigang Chen, Totte Harinen, Jeong-Yoon Lee, Yuchen Luo, Jing Pan, Mike Yung, Zhenyu Zhao", - author_email="", - description="Python Package for Uplift Modeling and Causal Inference with Machine Learning Algorithms", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/uber/causalml", packages=packages, - python_requires=">=3.7", - classifiers=[ - "Programming Language :: Python", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - setup_requires=[ - # Setuptools 18.0 properly handles Cython extensions. - "setuptools>=18.0", - "cython", - "numpy", - "scikit-learn<=1.0.2", - ], - install_requires=requirements, - tests_require=requirements_test, ext_modules=cythonize(extensions, annotate=True), include_dirs=[np_get_include()], - extras_require={"tf": ["tensorflow>=2.4.0"]}, ) diff --git a/tests/test_causal_trees.py b/tests/test_causal_trees.py index de61f3b6..c407faf8 100644 --- a/tests/test_causal_trees.py +++ b/tests/test_causal_trees.py @@ -1,4 +1,5 @@ import multiprocessing as mp +from abc import abstractmethod import pandas as pd import pytest @@ -14,6 +15,18 @@ class CausalTreeBase: test_size: float = 0.2 control_name: int or str = 0 + @abstractmethod + def prepare_model(self, *args, **kwargs): + return + + @abstractmethod + def test_fit(self, *args, **kwargs): + return + + @abstractmethod + def test_predict(self, *args, **kwargs): + return + def prepare_data(self, generate_regression_data) -> tuple: y, X, treatment, tau, b, e = generate_regression_data(mode=2) df = pd.DataFrame(X) @@ -41,14 +54,14 @@ def prepare_data(self, generate_regression_data) -> tuple: class TestCausalTreeRegressor(CausalTreeBase): - def prepare_causal_tree(self) -> CausalTreeRegressor: + def prepare_model(self) -> CausalTreeRegressor: ctree = CausalTreeRegressor( control_name=self.control_name, groups_cnt=True, 
def test_predict(self, generate_regression_data):
    """Check shapes and column semantics of CausalTreeRegressor.predict."""
    y, X, treatment, tau, b, e = generate_regression_data(mode=2)
    ctree = self.prepare_model()
    ctree.fit(X=X, y=y, treatment=treatment)

    y_pred = ctree.predict(X[:1, :])
    y_pred_with_outcomes = ctree.predict(X[:1, :], with_outcomes=True)

    assert y_pred.shape == (1,)
    assert y_pred_with_outcomes.shape == (1, 3)

    # Shapes alone do not pin the contract: the columns must be
    # [Y_hat(T=0), Y_hat(T=1), ITE], and the ITE must match the plain call.
    y_control, y_treated, ite = y_pred_with_outcomes[0]
    assert ite == pytest.approx(y_treated - y_control)
    assert ite == pytest.approx(y_pred[0])
@pytest.mark.parametrize("n_estimators", (5,))
def test_predict(self, generate_regression_data, n_estimators):
    """Check shapes and column semantics of CausalRandomForestRegressor.predict."""
    y, X, treatment, tau, b, e = generate_regression_data(mode=2)
    crforest = self.prepare_model(n_estimators=n_estimators)
    crforest.fit(X=X, y=y, treatment=treatment)

    y_pred = crforest.predict(X[:1, :])
    y_pred_with_outcomes = crforest.predict(X[:1, :], with_outcomes=True)

    assert y_pred.shape == (1,)
    assert y_pred_with_outcomes.shape == (1, 3)

    # The columns are [Y_hat(T=0), Y_hat(T=1), ITE]. The ITE column must equal
    # the outcome difference and agree with the plain call.
    y_control, y_treated, ite = y_pred_with_outcomes[0]
    assert ite == pytest.approx(y_treated - y_control)
    assert ite == pytest.approx(y_pred[0])