Optuna hyperparameter optimization tutorial (#178)

Tianzhang Cai · bcebere · robsdavis · web-flow · commit 46f83ad81b9f · 2023-04-25T09:47:29.000+01:00
* first commit for the addition of the TabDDPM plugin

* Add DDPM test script and update DDPM plugin

* add TabDDPM class and refactor

* handle discrete cols and label generation

* add hparam space and update tests of DDPM

* debug and test DDPM

* update TensorDataLoader and training loop

* clear bugs

* debug for regression tasks

* debug for regression tasks; ALL TESTS PASSED

* remove the official repo of TabDDPM

* passed all pre-commit checks

* convert assert to conditional AssertionErrors

* added an auto annotation tool

* update auto-anno and generate annotations

* remove auto-anno and flake8 noqa

* add python&lt;3.9 compatible annotations

* remove star import

* replace builtin type annos to typing annos

* resolve py38 compatibility issue

* tests/plugins/generic/test_ddpm.py

* change TabDDPM method signatures

* remove Iterator subscription

* update AssertionErrors, add EarlyStop callback, removed additional MLP, update logging

* remove TensorDataLoader, update test_ddpm

* update EarlyStopping

* add TabDDPM tutorial, update TabDDPM plugin and encoders

* add TabDDPM tutorial

* major update of FeatureEncoder and TabularEncoder

* add LogDistribution and LogIntDistribution

* update DDPM to use TabularEncoder

* update test_tabular_encoder and debug

* debug and DDPM tutorial OK

* debug LogDistribution and LogIntDistribution

* change discrete encoding of BinEncoder to passthrough;  passed all tests in test_tabular_encoder

* add tabnet to plugins/core/models

* add factory.py, let DDPM use TabNet, refactor

* update docstrings and refactor

* fix type annotation compatibility

* make SkipConnection serializable

* fix TabularEncoder.activation_layout

* remove unnecessary code

* fix minor bug and add more nn models in factory

* update pandas and torch version requirement

* update pandas and torch version requirement

* update ddpm tutorial

* restore setup.cfg

* restore setup.cfg

* replace LabelEncoder with OrdinalEncoder

* update setup.cfg

* update setup.cfg

* debug datetimeDistribution

* clean

* update setup.cfg and goggle test

* move DDPM tutorial to tutorials/plugins

* update tabnet.py reference

* update tab_ddpm

* update distribution, add optuna utils and tutorial

* update

* Fix plugin type of static_model of fflows

* update intlogdistribution and tutorial

* try fixing goggle

* add more activations

* minor fix

* update

* update

* update

* update

* Update tabular_encoder.py

* Update test_goggle.py

* Update tabular_encoder.py

* update

* update tutorial 8

* update

* default cat nonlin of goggle &lt;- gumbel_softmax

* get_nonlin('softmax') &lt;- GumbelSoftmax()

* remove debug logging

* update

* update

* fix merge

* fix merge

* update pip upgrade commands in workflows

* update pip upgrade commands in workflows

* keep pip version to 23.0.1 in workflows

* keep pip version to 23.0.1 in workflows

* update

* update

* update

* update

* update

* update

* fix distribution

* update

* move upgrading of wheel to prereq.txt

* update

---------

Co-authored-by: Bogdan Cebere &lt;bogdan.cebere@gmail.com&gt;
Co-authored-by: Rob &lt;62107751+robsdavis@users.noreply.github.com&gt;
diff --git a/.github/workflows/test_full.yml b/.github/workflows/test_full.yml
@@ -27,7 +27,7 @@ jobs:
         if: ${{ matrix.os == 'macos-latest' }}
       - name: Install dependencies
         run: |
-            pip install pip==23.0.1
+            python -m pip install -U pip
             pip install -r prereq.txt
       - name: Test Core
         run: |
diff --git a/.github/workflows/test_pr.yml b/.github/workflows/test_pr.yml
@@ -54,7 +54,7 @@ jobs:
         if: ${{ matrix.os == 'macos-latest' }}
       - name: Install dependencies
         run: |
-            pip install pip==23.0.1
+            python -m pip install -U pip
             pip install -r prereq.txt
       - name: Test Core
         run: |
diff --git a/.github/workflows/test_tutorials.yml b/.github/workflows/test_tutorials.yml
@@ -32,7 +32,7 @@ jobs:
         if: ${{ matrix.os == 'macos-latest' }}
       - name: Install dependencies
         run: |
-            pip install pip==23.0.1
+            python -m pip install -U pip
             pip install -r prereq.txt
 
             pip install .[all]
diff --git a/prereq.txt b/prereq.txt
@@ -1,3 +1,4 @@
 numpy
-torch<2.0
+torch>=1.10.0,<2.0
 tsai
+wheel>=0.40
diff --git a/src/synthcity/plugins/core/distribution.py b/src/synthcity/plugins/core/distribution.py
@@ -111,17 +111,25 @@ def as_constraint(self) -> Constraints:
 
     @abstractmethod
     def min(self) -> Any:
-        "Get the min value of the distribution"
+        """Get the min value of the distribution."""
         ...
 
     @abstractmethod
     def max(self) -> Any:
-        "Get the max value of the distribution"
+        """Get the max value of the distribution."""
         ...
 
-    @abstractmethod
     def __eq__(self, other: Any) -> bool:
-        ...
+        return type(self) == type(other) and self.get() == other.get()
+
+    def __contains__(self, item: Any) -> bool:
+        """
+        Example:
+        >>> dist = CategoricalDistribution(name="foo", choices=["a", "b", "c"])
+        >>> "a" in dist
+        True
+        """
+        return self.has(item)
 
     @abstractmethod
     def dtype(self) -> str:
@@ -146,7 +154,7 @@ def _validate_choices(cls: Any, v: List, values: Dict) -> List:
             raise ValueError(
                 "Invalid choices for CategoricalDistribution. Provide data or choices params"
             )
-        return v
+        return sorted(set(v))
 
     def get(self) -> List[Any]:
         return [self.name, self.choices]
@@ -176,12 +184,6 @@ def min(self) -> Any:
     def max(self) -> Any:
         return max(self.choices)
 
-    def __eq__(self, other: Any) -> bool:
-        if not isinstance(other, CategoricalDistribution):
-            return False
-
-        return self.name == other.name and set(self.choices) == set(other.choices)
-
     def dtype(self) -> str:
         types = {
             "object": 0,
@@ -259,33 +261,24 @@ def min(self) -> Any:
     def max(self) -> Any:
         return self.high
 
-    def __eq__(self, other: Any) -> bool:
-        if not isinstance(other, type(self)):
-            return False
-
-        return (
-            self.name == other.name
-            and self.low == other.low
-            and self.high == other.high
-        )
-
     def dtype(self) -> str:
         return "float"
 
 
 class LogDistribution(FloatDistribution):
     low: float = np.finfo(np.float64).tiny
     high: float = np.finfo(np.float64).max
-    base: float = 2.0
+
+    def get(self) -> List[Any]:
+        return [self.name, self.low, self.high]
 
     def sample(self, count: int = 1) -> Any:
         np.random.seed(self.random_state)
         msamples = self.sample_marginal(count)
         if msamples is not None:
             return msamples
-        lo = np.log2(self.low) / np.log2(self.base)
-        hi = np.log2(self.high) / np.log2(self.base)
-        return self.base ** np.random.uniform(lo, hi, count)
+        lo, hi = np.log2(self.low), np.log2(self.high)
+        return 2.0 ** np.random.uniform(lo, hi, count)
 
 
 class IntegerDistribution(Distribution):
@@ -313,6 +306,12 @@ def _validate_high_thresh(cls: Any, v: int, values: Dict) -> int:
             return int(values[mkey].index.max())
         return v
 
+    @validator("step", always=True)
+    def _validate_step(cls: Any, v: int, values: Dict) -> int:
+        if v < 1:
+            raise ValueError("Step must be greater than 0")
+        return v
+
     def get(self) -> List[Any]:
         return [self.name, self.low, self.high, self.step]
 
@@ -322,9 +321,9 @@ def sample(self, count: int = 1) -> Any:
         if msamples is not None:
             return msamples
 
-        high = (self.high + 1 - self.low) // self.step
-        s = np.random.choice(high, count)
-        return s * self.step + self.low
+        steps = (self.high - self.low) // self.step
+        samples = np.random.choice(steps + 1, count)
+        return samples * self.step + self.low
 
     def has(self, val: Any) -> bool:
         return self.low <= val and val <= self.high
@@ -347,34 +346,31 @@ def min(self) -> Any:
     def max(self) -> Any:
         return self.high
 
-    def __eq__(self, other: Any) -> bool:
-        if not isinstance(other, IntegerDistribution):
-            return False
-
-        return (
-            self.name == other.name
-            and self.low == other.low
-            and self.high == other.high
-        )
-
     def dtype(self) -> str:
         return "int"
 
 
-class LogIntDistribution(FloatDistribution):
-    low: float = 1.0
-    high: float = float(np.iinfo(np.int64).max)
-    base: float = 2.0
+class IntLogDistribution(IntegerDistribution):
+    low: int = 1
+    high: int = np.iinfo(np.int64).max
+
+    @validator("step", always=True)
+    def _validate_step(cls: Any, v: int, values: Dict) -> int:
+        if v != 1:
+            raise ValueError("Step must be 1 for IntLogDistribution")
+        return v
+
+    def get(self) -> List[Any]:
+        return [self.name, self.low, self.high]
 
     def sample(self, count: int = 1) -> Any:
         np.random.seed(self.random_state)
         msamples = self.sample_marginal(count)
         if msamples is not None:
             return msamples
-        lo = np.log2(self.low) / np.log2(self.base)
-        hi = np.log2(self.high) / np.log2(self.base)
-        s = self.base ** np.random.uniform(lo, hi, count)
-        return s.astype(int)
+        lo, hi = np.log2(self.low), np.log2(self.high)
+        samples = 2.0 ** np.random.uniform(lo, hi, count)
+        return samples.astype(int)
 
 
 class DatetimeDistribution(Distribution):
@@ -383,49 +379,46 @@ class DatetimeDistribution(Distribution):
         :parts: 1
     """
 
-    offset: int = 120
     low: datetime = datetime.utcfromtimestamp(0)
     high: datetime = datetime.now()
-
-    @validator("offset", always=True)
-    def _validate_offset(cls: Any, v: int) -> int:
-        if v < 0:
-            raise ValueError("offset must be greater than 0")
-        return v
+    step: timedelta = timedelta(microseconds=1)
+    offset: timedelta = timedelta(seconds=120)
 
     @validator("low", always=True)
     def _validate_low_thresh(cls: Any, v: datetime, values: Dict) -> datetime:
         mkey = "marginal_distribution"
         if mkey in values and values[mkey] is not None:
             v = values[mkey].index.min()
-        return v - timedelta(seconds=values["offset"])
+        return v
 
     @validator("high", always=True)
     def _validate_high_thresh(cls: Any, v: datetime, values: Dict) -> datetime:
         mkey = "marginal_distribution"
         if mkey in values and values[mkey] is not None:
             v = values[mkey].index.max()
-        return v + timedelta(seconds=values["offset"])
+        return v
 
     def get(self) -> List[Any]:
-        return [self.name, self.low, self.high]
+        return [self.name, self.low, self.high, self.step, self.offset]
 
     def sample(self, count: int = 1) -> Any:
         np.random.seed(self.random_state)
         msamples = self.sample_marginal(count)
         if msamples is not None:
             return msamples
 
-        delta = self.high - self.low
-        return self.low + delta * np.random.rand(count)
+        n = (self.high - self.low) // self.step + 1
+        samples = np.round(np.random.rand(count) * n - 0.5)
+        return self.low + samples * self.step
 
     def has(self, val: datetime) -> bool:
         return self.low <= val and val <= self.high
 
     def includes(self, other: "Distribution") -> bool:
-        return self.min() - timedelta(
-            seconds=self.offset
-        ) <= other.min() and other.max() <= self.max() + timedelta(seconds=self.offset)
+        return (
+            self.min() - self.offset <= other.min()
+            and other.max() <= self.max() + self.offset
+        )
 
     def as_constraint(self) -> Constraints:
         return Constraints(
@@ -442,16 +435,6 @@ def min(self) -> Any:
     def max(self) -> Any:
         return self.high
 
-    def __eq__(self, other: Any) -> bool:
-        if not isinstance(other, DatetimeDistribution):
-            return False
-
-        return (
-            self.name == other.name
-            and self.low == other.low
-            and self.high == other.high
-        )
-
     def dtype(self) -> str:
         return "datetime"
 
diff --git a/src/synthcity/plugins/core/models/factory.py b/src/synthcity/plugins/core/models/factory.py
@@ -20,6 +20,7 @@
     DatetimeEncoder,
     FeatureEncoder,
     GaussianQuantileTransformer,
+    LabelEncoder,
     MinMaxScaler,
     OneHotEncoder,
     OrdinalEncoder,
@@ -75,6 +76,7 @@
     datetime=DatetimeEncoder,
     onehot=OneHotEncoder,
     ordinal=OrdinalEncoder,
+    label=LabelEncoder,
     standard=StandardScaler,
     minmax=MinMaxScaler,
     robust=RobustScaler,
diff --git a/src/synthcity/plugins/generic/plugin_ddpm.py b/src/synthcity/plugins/generic/plugin_ddpm.py
@@ -18,8 +18,8 @@
 from synthcity.plugins.core.distribution import (
     Distribution,
     IntegerDistribution,
+    IntLogDistribution,
     LogDistribution,
-    LogIntDistribution,
 )
 from synthcity.plugins.core.models.tabular_ddpm import TabDDPM
 from synthcity.plugins.core.models.tabular_encoder import TabularEncoder
@@ -180,11 +180,11 @@ def hyperparameter_space(**kwargs: Any) -> List[Distribution]:
         """
         return [
             LogDistribution(name="lr", low=1e-5, high=1e-1),
-            LogIntDistribution(name="batch_size", low=256, high=4096),
+            IntLogDistribution(name="batch_size", low=256, high=4096),
             IntegerDistribution(name="num_timesteps", low=10, high=1000),
-            LogIntDistribution(name="n_iter", low=1000, high=10000),
+            IntLogDistribution(name="n_iter", low=1000, high=10000),
             # IntegerDistribution(name="n_layers_hidden", low=2, high=8),
-            # LogIntDistribution(name="dim_hidden", low=128, high=1024),
+            # IntLogDistribution(name="dim_hidden", low=128, high=1024),
         ]
 
     def _fit(self, X: DataLoader, *args: Any, **kwargs: Any) -> "TabDDPMPlugin":
diff --git a/src/synthcity/plugins/time_series/plugin_fflows.py b/src/synthcity/plugins/time_series/plugin_fflows.py
@@ -11,6 +11,7 @@
 from fflows import FourierFlow
 
 # synthcity absolute
+from synthcity.plugins import Plugins
 from synthcity.plugins.core.dataloader import DataLoader
 from synthcity.plugins.core.distribution import (
     CategoricalDistribution,
@@ -24,7 +25,6 @@
 from synthcity.plugins.core.models.ts_model import TimeSeriesModel
 from synthcity.plugins.core.plugin import Plugin
 from synthcity.plugins.core.schema import Schema
-from synthcity.plugins.generic import GenericPlugins
 from synthcity.utils.constants import DEVICE
 
 
@@ -134,9 +134,7 @@ def __init__(
             normalize=normalize,
         ).to(device)
 
-        self.static_model = GenericPlugins().get(
-            self.static_model_name, device=self.device
-        )
+        self.static_model = Plugins().get(self.static_model_name, device=self.device)
 
         self.temporal_encoder = TimeSeriesTabularEncoder(
             max_clusters=encoder_max_clusters
diff --git a/src/synthcity/utils/optuna_sample.py b/src/synthcity/utils/optuna_sample.py
@@ -0,0 +1,27 @@
+# stdlib
+from typing import Any, Dict, List
+
+# third party
+import optuna
+
+# synthcity absolute
+import synthcity.plugins.core.distribution as D
+
+
+def suggest(trial: optuna.Trial, dist: D.Distribution) -> Any:
+    if isinstance(dist, D.FloatDistribution):
+        return trial.suggest_float(dist.name, dist.low, dist.high)
+    elif isinstance(dist, D.LogDistribution):
+        return trial.suggest_float(dist.name, dist.low, dist.high, log=True)
+    elif isinstance(dist, D.IntegerDistribution):
+        return trial.suggest_int(dist.name, dist.low, dist.high, dist.step)
+    elif isinstance(dist, D.IntLogDistribution):
+        return trial.suggest_int(dist.name, dist.low, dist.high, log=True)
+    elif isinstance(dist, D.CategoricalDistribution):
+        return trial.suggest_categorical(dist.name, dist.choices)
+    else:
+        raise ValueError(f"Unknown dist: {dist}")
+
+
+def suggest_all(trial: optuna.Trial, distributions: List[D.Distribution]) -> Dict:
+    return {dist.name: suggest(trial, dist) for dist in distributions}
diff --git a/tutorials/tutorial8_hyperparameter_optimization.ipynb b/tutorials/tutorial8_hyperparameter_optimization.ipynb