From e0316d4e869d90cd5e09af2f144be3c34a99041c Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Thu, 29 Aug 2024 15:38:13 +0100
Subject: [PATCH 1/3] feat: remove unnecessary files and update logic
---
examples/regular/models/creditcard_ctgan.py | 8 +-
requirements.txt | 13 +-
src/ydata_synthetic/__init__.py | 5 +-
src/ydata_synthetic/evaluation/__init__.py | 0
.../regular/inverse_preprocesser.py | 45 --
src/ydata_synthetic/preprocessing/__init__.py | 7 -
.../preprocessing/base_processor.py | 127 ----
.../preprocessing/regular/__init__.py | 0
.../preprocessing/regular/ctgan_processor.py | 221 -------
.../preprocessing/regular/processor.py | 121 ----
.../preprocessing/timeseries/__init__.py | 5 -
.../timeseries/doppelganger_processor.py | 266 --------
.../preprocessing/timeseries/stock.py | 18 -
.../timeseries/timeseries_processor.py | 22 -
.../preprocessing/timeseries/utils.py | 36 --
.../streamlit_app/.streamlit/config.toml | 3 -
src/ydata_synthetic/streamlit_app/About.py | 92 ---
src/ydata_synthetic/streamlit_app/__init__.py | 14 +-
.../pages/1_Train_a_synthesizer.py | 148 -----
.../pages/2_Generate_synthetic_data.py | 84 ---
.../streamlit_app/pages/functions/__init__.py | 0
.../streamlit_app/pages/functions/generate.py | 22 -
.../pages/functions/load_data.py | 25 -
.../streamlit_app/pages/functions/train.py | 50 --
src/ydata_synthetic/streamlit_app/run.py | 17 +-
src/ydata_synthetic/synthesizers/__init__.py | 40 +-
src/ydata_synthetic/synthesizers/base.py | 326 +---------
src/ydata_synthetic/synthesizers/loss.py | 72 ---
.../synthesizers/regular/__init__.py | 14 +-
.../synthesizers/regular/cgan/model.py | 244 +------
.../synthesizers/regular/cramergan/model.py | 287 +--------
.../synthesizers/regular/ctgan/__init__.py | 12 +-
.../synthesizers/regular/ctgan/model.py | 328 +---------
.../synthesizers/regular/ctgan/utils.py | 156 -----
.../synthesizers/regular/cwgangp/model.py | 279 +-------
.../synthesizers/regular/dragan/model.py | 278 +-------
.../synthesizers/regular/gmm/model.py | 109 +---
.../synthesizers/regular/model.py | 85 ---
.../synthesizers/regular/vanillagan/model.py | 209 +-----
.../synthesizers/regular/wgan/model.py | 247 +-------
.../synthesizers/regular/wgangp/model.py | 287 +--------
.../synthesizers/saving_keras.py | 23 -
.../synthesizers/timeseries/__init__.py | 14 +-
.../timeseries/doppelganger/doppelganger.py | 594 ------------------
.../timeseries/doppelganger/model.py | 210 +------
.../timeseries/doppelganger/network.py | 436 -------------
.../synthesizers/timeseries/model.py | 57 --
.../synthesizers/timeseries/timegan/model.py | 382 +----------
.../test_gumbel_softmax_activation.py | 72 ---
.../test_gumbel_softmax_layer.py | 54 --
.../test_regular_data_processor.py | 77 ---
src/ydata_synthetic/utils/__init__.py | 0
src/ydata_synthetic/utils/cache.py | 73 ---
src/ydata_synthetic/utils/gumbel_softmax.py | 88 ---
src/ydata_synthetic/utils/logger.py | 23 -
src/ydata_synthetic/utils/misc/colormaps.py | 34 -
src/ydata_synthetic/utils/utils.py | 32 -
57 files changed, 199 insertions(+), 6292 deletions(-)
delete mode 100644 src/ydata_synthetic/evaluation/__init__.py
delete mode 100644 src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py
delete mode 100644 src/ydata_synthetic/preprocessing/__init__.py
delete mode 100644 src/ydata_synthetic/preprocessing/base_processor.py
delete mode 100644 src/ydata_synthetic/preprocessing/regular/__init__.py
delete mode 100644 src/ydata_synthetic/preprocessing/regular/ctgan_processor.py
delete mode 100644 src/ydata_synthetic/preprocessing/regular/processor.py
delete mode 100644 src/ydata_synthetic/preprocessing/timeseries/__init__.py
delete mode 100644 src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py
delete mode 100644 src/ydata_synthetic/preprocessing/timeseries/stock.py
delete mode 100644 src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py
delete mode 100644 src/ydata_synthetic/preprocessing/timeseries/utils.py
delete mode 100644 src/ydata_synthetic/streamlit_app/.streamlit/config.toml
delete mode 100644 src/ydata_synthetic/streamlit_app/About.py
delete mode 100644 src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py
delete mode 100644 src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py
delete mode 100644 src/ydata_synthetic/streamlit_app/pages/functions/__init__.py
delete mode 100644 src/ydata_synthetic/streamlit_app/pages/functions/generate.py
delete mode 100644 src/ydata_synthetic/streamlit_app/pages/functions/load_data.py
delete mode 100644 src/ydata_synthetic/streamlit_app/pages/functions/train.py
delete mode 100644 src/ydata_synthetic/synthesizers/loss.py
delete mode 100644 src/ydata_synthetic/synthesizers/regular/ctgan/utils.py
delete mode 100644 src/ydata_synthetic/synthesizers/regular/model.py
delete mode 100644 src/ydata_synthetic/synthesizers/saving_keras.py
delete mode 100644 src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py
delete mode 100644 src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py
delete mode 100644 src/ydata_synthetic/synthesizers/timeseries/model.py
delete mode 100644 src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py
delete mode 100644 src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py
delete mode 100644 src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py
delete mode 100644 src/ydata_synthetic/utils/__init__.py
delete mode 100644 src/ydata_synthetic/utils/cache.py
delete mode 100644 src/ydata_synthetic/utils/gumbel_softmax.py
delete mode 100644 src/ydata_synthetic/utils/logger.py
delete mode 100644 src/ydata_synthetic/utils/misc/colormaps.py
delete mode 100644 src/ydata_synthetic/utils/utils.py
diff --git a/examples/regular/models/creditcard_ctgan.py b/examples/regular/models/creditcard_ctgan.py
index e79f7a36..07fd5bf6 100644
--- a/examples/regular/models/creditcard_ctgan.py
+++ b/examples/regular/models/creditcard_ctgan.py
@@ -1,5 +1,5 @@
"""
- CTGAN architecture example file
+ ctgan architecture example file
"""
import pandas as pd
from sklearn import cluster
@@ -35,7 +35,7 @@
fraud_w_classes['Class'] = labels
#----------------------------
-# CTGAN Training
+# ctgan Training
#----------------------------
batch_size = 500
@@ -53,10 +53,10 @@
# Create a binning
fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes
-# Init the CTGAN
+# Init the ctgan
synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
-#Training the CTGAN
+#Training the ctgan
synth.fit(data=fraud_w_classes, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)
# Saving the synthesizer
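The hunks above only touch comments and leave the training call unchanged. As a point of reference, a minimal sketch of the flow this example exercises is shown below; it reuses only calls visible in this patch (`ModelParameters`/`TrainParameters`, `RegularSynthesizer(modelname='ctgan', ...)`, `fit`, `save`, `sample`), while the toy dataframe, column split and epoch count are placeholders rather than values from the real example:

    import pandas as pd
    from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
    from ydata_synthetic.synthesizers.regular import RegularSynthesizer

    # hypothetical stand-in for the credit-card dataframe used by the example
    fraud_w_classes = pd.DataFrame({'Amount': [10.0, 250.5, 42.0, 9.9],
                                    'Class': [0, 1, 0, 0]})
    num_cols = ['Amount']   # placeholder numerical columns
    cat_cols = ['Class']    # placeholder categorical columns

    ctgan_args = ModelParameters(batch_size=500)   # batch size taken from the diff context
    train_args = TrainParameters(epochs=5)         # placeholder epoch count

    synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
    synth.fit(data=fraud_w_classes, train_arguments=train_args,
              num_cols=num_cols, cat_cols=cat_cols)
    synth.save('creditcard_ctgan_model.pkl')       # saving the synthesizer
    print(synth.sample(100))                       # sampling, as used by the removed Streamlit pages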
diff --git a/requirements.txt b/requirements.txt
index e7edeeae..4c4a2a22 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1 @@
-requests>=2.28, <2.31
-pandas<3
-numpy<2
-scikit-learn<2
-matplotlib<4
-tensorflow==2.15.*
-tensorflow-probability[tf]
-easydict==1.10
-pmlb==1.0.*
-tqdm<5.0
-typeguard==4.2.*
-pytest==7.4.*
+ydata-sdk
\ No newline at end of file
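The pinned dependency list collapses to a single `ydata-sdk` entry, which is the library the removed Streamlit pages further below already delegate to. A minimal sketch of that SDK path, assuming only the calls those deleted pages use (`RegularSynthesizer().fit(X=..., dtypes=...)`, `sample`, and the `YDATA_TOKEN` environment variable); the token value, dataframe and dtype mapping are placeholders:

    import os
    import pandas as pd
    from ydata.sdk.synthesizers import RegularSynthesizer

    os.environ['YDATA_TOKEN'] = '<your Fabric token>'   # placeholder; the deleted pages read the token from this variable

    # hypothetical training data and dtype mapping, built the same way as in the deleted training page
    df = pd.DataFrame({'Amount': [10.0, 250.5, 42.0], 'Class': [0, 1, 0]})
    dtypes = {'Amount': 'numerical', 'Class': 'categorical'}

    model = RegularSynthesizer()
    model.fit(X=df, dtypes=dtypes)
    print(model.sample(100))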
diff --git a/src/ydata_synthetic/__init__.py b/src/ydata_synthetic/__init__.py
index 9bd1f30e..d5935d63 100644
--- a/src/ydata_synthetic/__init__.py
+++ b/src/ydata_synthetic/__init__.py
@@ -1 +1,4 @@
-#from .version import __version__
\ No newline at end of file
+"""Main module of ydata-synthetic.
+
+.. include:: ../../README.md
+"""
diff --git a/src/ydata_synthetic/evaluation/__init__.py b/src/ydata_synthetic/evaluation/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py b/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py
deleted file mode 100644
index 9b9a0b50..00000000
--- a/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Inverts all preprocessing pipelines provided in the preprocessing examples
-from typing import Union
-
-import pandas as pd
-
-from sklearn.pipeline import Pipeline
-from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler
-
-
-def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]) -> pd.DataFrame:
- """Inverts data transformations taking place in a standard sklearn processor.
- Supported processes are sklearn pipelines, column transformers or base estimators like standard scalers.
-
- Args:
- data (pd.DataFrame): The data object that needs inversion of preprocessing
- processor (Union[Pipeline, ColumnTransformer, BaseEstimator]): The processor applied on the original data
-
- Returns:
- inv_data (pd.DataFrame): The data object after inverting preprocessing"""
- inv_data = data.copy()
- if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, Pipeline)):
- inv_data = pd.DataFrame(processor.inverse_transform(data), columns=processor.feature_names_in_)
- elif isinstance(processor, ColumnTransformer):
- output_indices = processor.output_indices_
- assert isinstance(data, pd.DataFrame), "The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame."
- for t_name, t, t_cols in processor.transformers_[::-1]:
- slice_ = output_indices[t_name]
- t_indices = list(range(slice_.start, slice_.stop, 1 if slice_.step is None else slice_.step))
- if t == 'drop':
- continue
- elif t == 'passthrough':
- inv_cols = pd.DataFrame(data.iloc[:,t_indices].values, columns = t_cols, index = data.index)
- inv_col_names = inv_cols.columns
- else:
- inv_cols = pd.DataFrame(t.inverse_transform(data.iloc[:,t_indices].values), columns = t_cols, index = data.index)
- inv_col_names = inv_cols.columns
- if set(inv_col_names).issubset(set(inv_data.columns)):
- inv_data[inv_col_names] = inv_cols[inv_col_names]
- else:
- inv_data = pd.concat([inv_data, inv_cols], axis=1)
- else:
- print('The provided data processor is not supported and cannot be inverted with this method.')
- return None
- return inv_data[processor.feature_names_in_]
diff --git a/src/ydata_synthetic/preprocessing/__init__.py b/src/ydata_synthetic/preprocessing/__init__.py
deleted file mode 100644
index 4c5e2055..00000000
--- a/src/ydata_synthetic/preprocessing/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from ydata_synthetic.preprocessing.regular.processor import RegularDataProcessor
-from ydata_synthetic.preprocessing.timeseries.timeseries_processor import TimeSeriesDataProcessor
-
-__all__ = [
- "RegularDataProcessor",
- "TimeSeriesDataProcessor"
-]
\ No newline at end of file
diff --git a/src/ydata_synthetic/preprocessing/base_processor.py b/src/ydata_synthetic/preprocessing/base_processor.py
deleted file mode 100644
index 05336917..00000000
--- a/src/ydata_synthetic/preprocessing/base_processor.py
+++ /dev/null
@@ -1,127 +0,0 @@
-"Base class of Data Preprocessors, do not instantiate this class directly."
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-from types import SimpleNamespace
-from typing import List, Optional
-
-from numpy import ndarray
-from pandas import DataFrame, Series
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.exceptions import NotFittedError
-from typeguard import typechecked
-
-
-# pylint: disable=R0902
-@typechecked
-class BaseProcessor(ABC, BaseEstimator, TransformerMixin):
- """
- This data processor works like a scikit learn transformer in with the methods fit, transform and inverse transform.
- Args:
- num_cols (list of strings):
- List of names of numerical columns.
- cat_cols (list of strings):
- List of names of categorical columns.
- """
- def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None):
- self.num_cols = [] if num_cols is None else num_cols
- self.cat_cols = [] if cat_cols is None else cat_cols
-
- self._num_pipeline = None # To be overriden by child processors
- self._cat_pipeline = None # To be overriden by child processors
-
- self._col_transform_info = None # Metadata object mapping inputs/outputs of each pipeline
-
- @property
- def num_pipeline(self) -> BaseEstimator:
- """Returns the pipeline applied to numerical columns."""
- return self._num_pipeline
-
- @property
- def cat_pipeline(self) -> BaseEstimator:
- """Returns the pipeline applied to categorical columns."""
- return self._cat_pipeline
-
- @property
- def types(self) -> Series:
- """Returns a Series with the dtypes of each column in the fitted DataFrame."""
- return self._types
-
- @property
- def col_transform_info(self) -> SimpleNamespace:
- """Returns a ProcessorInfo object specifying input/output feature mappings of this processor's pipelines."""
- self._check_is_fitted()
- if self._col_transform_info is None:
- self._col_transform_info = self.__create_metadata_synth()
- return self._col_transform_info
-
- def __create_metadata_synth(self) -> SimpleNamespace:
- def new_pipeline_info(feat_in, feat_out):
- return SimpleNamespace(feat_names_in = feat_in, feat_names_out = feat_out)
- if self.num_cols:
- num_info = new_pipeline_info(self.num_pipeline.feature_names_in_, self.num_pipeline.get_feature_names_out())
- else:
- num_info = new_pipeline_info([], [])
- if self.cat_cols:
- cat_info = new_pipeline_info(self.cat_pipeline.feature_names_in_, self.cat_pipeline.get_feature_names_out())
- else:
- cat_info = new_pipeline_info([], [])
- return SimpleNamespace(numerical=num_info, categorical=cat_info)
-
- def _check_is_fitted(self):
- """Checks if the processor is fitted by testing the numerical pipeline.
- Raises NotFittedError if not."""
- if self._num_pipeline is None:
- raise NotFittedError("This data processor has not yet been fitted.")
-
- def _validate_cols(self, x_cols):
- """Ensures validity of the passed numerical and categorical columns.
- The following is verified:
- 1) Num cols and cat cols are disjoint sets;
- 2) The union of these sets should equal x_cols;.
- Assertion errors are raised in case any of the tests fails."""
- missing = set(x_cols).difference(set(self.num_cols).union(set(self.cat_cols)))
- intersection = set(self.num_cols).intersection(set(self.cat_cols))
- assert intersection == set(), f"num_cols and cat_cols share columns {intersection} but should be disjoint."
- assert missing == set(), f"The columns {missing} of the provided dataset were not attributed to a pipeline."
-
- # pylint: disable=C0103
- @abstractmethod
- def fit(self, X: DataFrame) -> BaseProcessor:
- """Fits the DataProcessor to a passed DataFrame.
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the num/cat columns defined in initialization.
- Returns:
- self (DataProcessor): The fitted data processor.
- """
- raise NotImplementedError
-
- # pylint: disable=C0103
- @abstractmethod
- def transform(self, X: DataFrame) -> ndarray:
- """Transforms the passed DataFrame with the fit DataProcessor.
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the columns types defined in initialization.
- Returns:
- transformed (ndarray): Processed version of the passed DataFrame.
- """
- raise NotImplementedError
-
- # pylint: disable=C0103
- @abstractmethod
- def inverse_transform(self, X: ndarray) -> DataFrame:
- """Inverts the data transformation pipelines on a passed DataFrame.
- Args:
- X (ndarray):
- Numpy array to be brought back to the original data format.
- Should share the schema of data transformed by this DataProcessor.
- Can be used to revert transformations of training data or for synthetic samples.
- Returns:
- result (DataFrame):
- DataFrame with all performed transformations inverted.
- """
- raise NotImplementedError
diff --git a/src/ydata_synthetic/preprocessing/regular/__init__.py b/src/ydata_synthetic/preprocessing/regular/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py b/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py
deleted file mode 100644
index 158cedf1..00000000
--- a/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py
+++ /dev/null
@@ -1,221 +0,0 @@
-from __future__ import annotations
-
-from typing import List, Optional
-from typeguard import typechecked
-from dataclasses import dataclass
-import pandas as pd
-import numpy as np
-from sklearn.exceptions import NotFittedError, ConvergenceWarning
-from sklearn.utils._testing import ignore_warnings
-from sklearn.mixture import BayesianGaussianMixture
-from sklearn.preprocessing import OneHotEncoder
-
-from ydata_synthetic.preprocessing.base_processor import BaseProcessor
-
-@dataclass
-class ColumnMetadata:
- """
- Dataclass that stores the metadata of each column.
- """
- start_idx: int
- end_idx: int
- discrete: bool
- output_dim: int
- model: any
- components: list
- name: str
-
-
-@typechecked
-class CTGANDataProcessor(BaseProcessor):
- """
- CTGAN data preprocessing class.
- It works like any other transformer in scikit-learn with the methods fit, transform and inverse_transform.
- Args:
- n_clusters (int), default=10:
- Number of clusters.
- epsilon (float), default=0.005:
- Epsilon value.
- num_cols (list of strings):
- List of names of numerical columns.
- cat_cols (list of strings):
- List of names of categorical columns.
- """
- SUPPORTED_MODEL = 'CTGAN'
-
- def __init__(self, n_clusters=10, epsilon=0.005,
- num_cols: Optional[List[str]] = None,
- cat_cols: Optional[List[str]] = None):
- super().__init__(num_cols, cat_cols)
-
- self._n_clusters = n_clusters
- self._epsilon = epsilon
- self._metadata = None
- self._dtypes = None
- self._output_dimensions = None
-
- @property
- def metadata(self) -> list[ColumnMetadata]:
- """
- Returns the metadata for each column.
- """
- return self._metadata
-
- @property
- def output_dimensions(self) -> int:
- """
- Returns the dataset dimensionality after the preprocessing.
- """
- return int(self._output_dimensions)
-
- @ignore_warnings(category=ConvergenceWarning)
- def fit(self, X: pd.DataFrame) -> CTGANDataProcessor:
- """
- Fits the data processor to a passed DataFrame.
-
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the num/cat columns defined in initialization.
- Returns:
- self (CTGANDataProcessor): The fitted data processor.
- """
- self._dtypes = X.infer_objects().dtypes
- self._metadata = []
- cur_idx = 0
- for column in X.columns:
- column_data = X[[column]].values
- if column in self.cat_cols:
- ohe = OneHotEncoder(sparse_output=False)
- ohe.fit(column_data)
- n_categories = len(ohe.categories_[0])
- self._metadata.append(
- ColumnMetadata(
- start_idx=cur_idx,
- end_idx=cur_idx + n_categories,
- discrete=True,
- output_dim=n_categories,
- model=ohe,
- components=None,
- name=column
- )
- )
- cur_idx += n_categories
- else:
- bgm = BayesianGaussianMixture(
- n_components=self._n_clusters,
- weight_concentration_prior_type='dirichlet_process',
- weight_concentration_prior=0.001,
- n_init=1
- )
- bgm.fit(column_data)
- components = bgm.weights_ > self._epsilon
- output_dim = components.sum() + 1
- self._metadata.append(
- ColumnMetadata(
- start_idx=cur_idx,
- end_idx=cur_idx + output_dim,
- discrete=False,
- output_dim=output_dim,
- model=bgm,
- components=components,
- name=column
- )
- )
- cur_idx += output_dim
- self._output_dimensions = cur_idx
- return self
-
- def transform(self, X: pd.DataFrame) -> np.ndarray:
- """
- Transforms the passed DataFrame with the fitted data processor.
-
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the columns types defined in initialization.
- Returns:
- Processed version of the passed DataFrame.
- """
- if self._metadata is None:
- raise NotFittedError("This data processor has not yet been fitted.")
-
- transformed_data = []
- for col_md in self._metadata:
- column_data = X[[col_md.name]].values
- if col_md.discrete:
- ohe = col_md.model
- transformed_data.append(ohe.transform(column_data))
- else:
- bgm = col_md.model
- components = col_md.components
-
- means = bgm.means_.reshape((1, self._n_clusters))
- stds = np.sqrt(bgm.covariances_).reshape((1, self._n_clusters))
- features = (column_data - means) / (4 * stds)
-
- probabilities = bgm.predict_proba(column_data)
- n_opts = components.sum()
- features = features[:, components]
- probabilities = probabilities[:, components]
-
- opt_sel = np.zeros(len(column_data), dtype='int')
- for i in range(len(column_data)):
- norm_probs = probabilities[i] + 1e-6
- norm_probs = norm_probs / norm_probs.sum()
- opt_sel[i] = np.random.choice(np.arange(n_opts), p=norm_probs)
-
- idx = np.arange((len(features)))
- features = features[idx, opt_sel].reshape([-1, 1])
- features = np.clip(features, -.99, .99)
-
- probs_onehot = np.zeros_like(probabilities)
- probs_onehot[np.arange(len(probabilities)), opt_sel] = 1
- transformed_data.append(
- np.concatenate([features, probs_onehot], axis=1).astype(float))
-
- return np.concatenate(transformed_data, axis=1).astype(float)
-
- def inverse_transform(self, X: np.ndarray) -> pd.DataFrame:
- """
- Reverts the data transformations on a passed DataFrame.
-
- Args:
- X (ndarray):
- Numpy array to be brought back to the original data format.
- Should share the schema of data transformed by this data processor.
- Can be used to revert transformations of training data or for synthetic samples.
- Returns:
- DataFrame with all performed transformations reverted.
- """
- if self._metadata is None:
- raise NotFittedError("This data processor has not yet been fitted.")
-
- transformed_data = []
- col_names = []
- for col_md in self._metadata:
- col_data = X[:, col_md.start_idx:col_md.end_idx]
- if col_md.discrete:
- inv_data = col_md.model.inverse_transform(col_data)
- else:
- mean = col_data[:, 0]
- variance = col_data[:, 1:]
- mean = np.clip(mean, -1, 1)
-
- v_t = np.ones((len(col_data), self._n_clusters)) * -100
- v_t[:, col_md.components] = variance
- variance = v_t
- means = col_md.model.means_.reshape([-1])
- stds = np.sqrt(col_md.model.covariances_).reshape([-1])
-
- p_argmax = np.argmax(variance, axis=1)
- std_t = stds[p_argmax]
- mean_t = means[p_argmax]
- inv_data = mean * 4 * std_t + mean_t
-
- transformed_data.append(inv_data)
- col_names.append(col_md.name)
-
- transformed_data = np.column_stack(transformed_data)
- transformed_data = pd.DataFrame(transformed_data, columns=col_names).astype(self._dtypes)
- return transformed_data
diff --git a/src/ydata_synthetic/preprocessing/regular/processor.py b/src/ydata_synthetic/preprocessing/regular/processor.py
deleted file mode 100644
index cf7716a4..00000000
--- a/src/ydata_synthetic/preprocessing/regular/processor.py
+++ /dev/null
@@ -1,121 +0,0 @@
-"Implementation of a Regular DataProcessor."
-from __future__ import annotations
-
-from enum import Enum
-from typing import List, Optional
-
-from numpy import concatenate, ndarray, split, zeros
-from pandas import DataFrame, concat
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
-from typeguard import typechecked
-
-from ydata_synthetic.preprocessing.base_processor import BaseProcessor
-
-
-class RegularModels(Enum):
- "Supported models for the Regular Data Processor."
- CGAN = 'CGAN'
- CRAMERGAN = 'CramerGAN'
- DRAGAN = 'DRAGAN'
- GAN = 'VanillaGAN'
- WGAN = 'WGAN'
- WGAN_GP = 'WGAN_GP'
- CWGAN_GP = 'CWGAN_GP'
-
-
-@typechecked
-class RegularDataProcessor(BaseProcessor):
- """
- Main class for Regular/Tabular Data Preprocessing.
- It works like any other transformer in scikit learn with the methods fit, transform and inverse transform.
- Args:
- num_cols (list of strings):
- List of names of numerical columns.
- cat_cols (list of strings):
- List of names of categorical columns.
- """
- def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None):
- super().__init__(num_cols, cat_cols)
-
- self._col_order_ = None
- self._num_col_idx_ = None
- self._cat_col_idx_ = None
-
- # pylint: disable=W0106
- def fit(self, X: DataFrame) -> RegularDataProcessor:
- """Fits the DataProcessor to a passed DataFrame.
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the num/cat columns defined in initialization.
- Returns:
- self (RegularDataProcessor): The fitted data processor.
- """
- self._validate_cols(X.columns)
-
- self._col_order_ = [c for c in X.columns if c in self.num_cols + self.cat_cols]
-
- self._types = X.dtypes
-
- self._num_pipeline = Pipeline([
- ("scaler", MinMaxScaler()),
- ])
- self._cat_pipeline = Pipeline([
- ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
- ])
-
- self.num_pipeline.fit(X[self.num_cols]) if self.num_cols else zeros([len(X), 0])
- self.cat_pipeline.fit(X[self.cat_cols]) if self.num_cols else zeros([len(X), 0])
-
- self._num_col_idx_ = len(self.num_pipeline.get_feature_names_out())
- self._cat_col_idx_ = self._num_col_idx_ + len(self.cat_pipeline.get_feature_names_out())
-
- return self
-
- def transform(self, X: DataFrame) -> ndarray:
- """Transforms the passed DataFrame with the fit DataProcessor.
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the columns types defined in initialization.
- Returns:
- transformed (ndarray):
- Processed version of the passed DataFrame.
- """
- self._check_is_fitted()
-
- num_data = self.num_pipeline.transform(X[self.num_cols]) if self.num_cols else zeros([len(X), 0])
- cat_data = self.cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0])
-
- transformed = concatenate([num_data, cat_data], axis=1)
-
- return transformed
-
- def inverse_transform(self, X: ndarray) -> DataFrame:
- """Inverts the data transformation pipelines on a passed DataFrame.
- Args:
- X (ndarray):
- Numpy array to be brought back to the original data format.
- Should share the schema of data transformed by this DataProcessor.
- Can be used to revert transformations of training data or for synthetic samples.
- Returns:
- result (DataFrame):
- DataFrame with all performed transformations inverted.
- """
- self._check_is_fitted()
-
- num_data, cat_data, _ = split(X, [self._num_col_idx_, self._cat_col_idx_], axis=1)
-
- num_data = self.num_pipeline.inverse_transform(num_data) if self.num_cols else zeros([len(X), 0])
- cat_data = self.cat_pipeline.inverse_transform(cat_data) if self.cat_cols else zeros([len(X), 0])
-
- result = concat([DataFrame(num_data, columns=self.num_cols),
- DataFrame(cat_data, columns=self.cat_cols)], axis=1)
-
- result = result.loc[:, self._col_order_]
-
- for col in result.columns:
- result[col]=result[col].astype(self._types[col])
-
- return result
diff --git a/src/ydata_synthetic/preprocessing/timeseries/__init__.py b/src/ydata_synthetic/preprocessing/timeseries/__init__.py
deleted file mode 100644
index e8eff6c2..00000000
--- a/src/ydata_synthetic/preprocessing/timeseries/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from ydata_synthetic.preprocessing.timeseries.stock import transformations as processed_stock
-
-__all__ = [
- "processed_stock",
-]
diff --git a/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py b/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py
deleted file mode 100644
index f3f7143b..00000000
--- a/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py
+++ /dev/null
@@ -1,266 +0,0 @@
-from __future__ import annotations
-
-from typing import List, Optional
-from dataclasses import dataclass
-
-from numpy import concatenate, ndarray, zeros, ones, expand_dims, reshape, sum as npsum, repeat, array_split, asarray, amin, amax, stack
-from pandas import DataFrame
-from typeguard import typechecked
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
-
-from ydata_synthetic.preprocessing.base_processor import BaseProcessor
-
-
-@dataclass
-class ColumnMetadata:
- """
- Dataclass that stores the metadata of each column.
- """
- discrete: bool
- output_dim: int
- name: str
- real: bool = True
-
-
-@typechecked
-class DoppelGANgerProcessor(BaseProcessor):
- """
- Main class for class the DoppelGANger preprocessing.
- It works like any other transformer in scikit learn with the methods fit, transform and inverse transform.
- Args:
- num_cols (list of strings):
- List of names of numerical columns.
- measurement_cols (list of strings):
- List of measurement columns.
- sequence_length (int):
- Sequence length.
- """
- SUPPORTED_MODEL = 'DoppelGANger'
-
- def __init__(self, num_cols: Optional[List[str]] = None,
- cat_cols: Optional[List[str]] = None,
- measurement_cols: Optional[List[str]] = None,
- sequence_length: Optional[int] = None,
- sample_length: Optional[int] = None,
- normalize_tanh: Optional[bool] = None):
- super().__init__(num_cols, cat_cols)
-
- if num_cols is None:
- num_cols = []
- if cat_cols is None:
- cat_cols = []
- if measurement_cols is None:
- measurement_cols = []
- if normalize_tanh is None:
- normalize_tanh = False
-
- self._col_order_ = None
- self.sequence_length = sequence_length
- self.sample_length = sample_length
- self.normalize_tanh = normalize_tanh
-
- if self.sequence_length is not None and self.sample_length is not None:
- if self.sequence_length % self.sample_length != 0:
- raise ValueError("The sequence length must be a multiple of the sample length.")
-
- self._measurement_num_cols = [c for c in self.num_cols if c in measurement_cols]
- self._measurement_cat_cols = [c for c in self.cat_cols if c in measurement_cols]
- self._attribute_num_cols = [c for c in self.num_cols if c not in measurement_cols]
- self._attribute_cat_cols = [c for c in self.cat_cols if c not in measurement_cols]
- self._measurement_cols_metadata = None
- self._attribute_cols_metadata = None
- self._measurement_one_hot_cat_cols = None
- self._attribute_one_hot_cat_cols = None
- self._has_attributes = bool(self._attribute_num_cols or self._attribute_cat_cols)
- self._eps = 1e-4
-
- @property
- def measurement_cols_metadata(self):
- return self._measurement_cols_metadata
-
- @property
- def attribute_cols_metadata(self):
- return self._attribute_cols_metadata
-
- def add_gen_flag(self, data_features: ndarray, sample_len: int):
- num_sample = data_features.shape[0]
- length = data_features.shape[1]
- data_gen_flag = ones((num_sample, length))
- data_gen_flag = expand_dims(data_gen_flag, 2)
- shift_gen_flag = concatenate(
- [data_gen_flag[:, 1:, :],
- zeros((data_gen_flag.shape[0], 1, 1))],
- axis=1)
- data_gen_flag_t = reshape(
- data_gen_flag,
- [num_sample, int(length / sample_len), sample_len])
- data_gen_flag_t = npsum(data_gen_flag_t, 2)
- data_gen_flag_t = data_gen_flag_t > 0.5
- data_gen_flag_t = repeat(data_gen_flag_t, sample_len, axis=1)
- data_gen_flag_t = expand_dims(data_gen_flag_t, 2)
- data_features = concatenate(
- [data_features,
- shift_gen_flag,
- (1 - shift_gen_flag) * data_gen_flag_t],
- axis=2)
-
- return data_features
-
- # pylint: disable=W0106
- def fit(self, X: DataFrame) -> DoppelGANgerProcessor:
- """Fits the data processor to a passed DataFrame.
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the num/cat columns defined in initialization.
- Returns:
- self (DoppelGANgerProcessor): The fitted data processor.
- """
- self._validate_cols(X.columns)
-
- measurement_cols = self._measurement_num_cols + self._measurement_cat_cols
- if not measurement_cols:
- raise ValueError("At least one measurement column must be supplied.")
- if not all(c in self.num_cols + self.cat_cols for c in measurement_cols):
- raise ValueError("At least one of the supplied measurement columns does not exist in the dataset.")
- if self.sequence_length is None:
- raise ValueError("The sequence length is mandatory.")
-
- self._col_order_ = [c for c in X.columns if c in self.num_cols + self.cat_cols]
- self._types = X.dtypes
- self._num_pipeline = Pipeline([
- ("scaler", MinMaxScaler()),
- ])
- self._cat_pipeline = Pipeline([
- ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='if_binary')),
- ])
- self._num_pipeline.fit(X[self._attribute_num_cols]) if self._attribute_num_cols else zeros([len(X), 0])
- self._cat_pipeline.fit(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0])
-
- return self
-
- def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]:
- """Transforms the passed DataFrame with the fit DataProcessor.
- Args:
- X (DataFrame):
- DataFrame used to fit the processor parameters.
- Should be aligned with the columns types defined in initialization.
- Returns:
- transformed (ndarray, ndarray):
- Processed version of the passed DataFrame.
- """
- self._check_is_fitted()
-
- one_hot_cat_cols_out = self._cat_pipeline.get_feature_names_out() if self.cat_cols else []
- cat_data = DataFrame(self._cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]), columns=one_hot_cat_cols_out)
-
- self._measurement_one_hot_cat_cols = [c for c in one_hot_cat_cols_out if c.startswith(tuple(self._measurement_cat_cols))] # .split("_")[0]
- self._measurement_cols_metadata = [ColumnMetadata(discrete=False,
- output_dim=1,
- name=c) for c in self._measurement_num_cols]
- measurement_cat_data = cat_data[self._measurement_one_hot_cat_cols].to_numpy() if self._measurement_one_hot_cat_cols else zeros([len(X), 0])
- self._measurement_cols_metadata += [ColumnMetadata(discrete=True,
- output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
- name=c) for c in self._measurement_cat_cols]
- data_features = concatenate([X[self._measurement_num_cols].to_numpy(), measurement_cat_data], axis=1)
-
- if self._has_attributes:
- self._attribute_one_hot_cat_cols = [c for c in one_hot_cat_cols_out if c.startswith(tuple(self._attribute_cat_cols))] # .split("_")[0]
- attribute_num_data = self._num_pipeline.transform(X[self._attribute_num_cols]) if self._attribute_num_cols else zeros([len(X), 0])
- self._attribute_cols_metadata = [ColumnMetadata(discrete=False,
- output_dim=1,
- name=c) for c in self._attribute_num_cols]
- attribute_cat_data = cat_data[self._attribute_one_hot_cat_cols].to_numpy() if self._attribute_one_hot_cat_cols else zeros([len(X), 0])
- self._attribute_cols_metadata += [ColumnMetadata(discrete=True,
- output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
- name=c) for c in self._attribute_cat_cols]
- data_attributes = concatenate([attribute_num_data, attribute_cat_data], axis=1)
- else:
- data_attributes = zeros((data_features.shape[0], 0))
- self._attribute_one_hot_cat_cols = []
- self._attribute_cols_metadata = []
-
- num_samples = int(X.shape[0] / self.sequence_length)
- data_features = asarray(array_split(data_features, num_samples))
-
- additional_attributes = []
- for ix, col_meta in enumerate(self._measurement_cols_metadata):
- if not col_meta.discrete:
- col_data = X[col_meta.name].to_numpy().reshape(num_samples, -1)
- max_col = amax(col_data, axis=1) + self._eps
- min_col = amin(col_data, axis=1) - self._eps
- additional_attributes.append((max_col + min_col) / 2.0)
- additional_attributes.append((max_col - min_col) / 2.0)
- self._attribute_cols_metadata += [ColumnMetadata(discrete=False,
- output_dim=1,
- name=f"addi_{col_meta.name}_{ix}",
- real=False) for ix in range (1, 3)]
- max_col = expand_dims(max_col, axis=1)
- min_col = expand_dims(min_col, axis=1)
- data_features[:, :, ix] = (data_features[:, :, ix] - min_col) / (max_col - min_col)
- if self.normalize_tanh:
- data_features[:, :, ix] = data_features[:, :, ix] * 2.0 - 1.0
-
- data_attributes = asarray(array_split(data_attributes, num_samples))
- data_attributes = data_attributes.mean(axis=1)
-
- if additional_attributes:
- additional_attributes = stack(additional_attributes, axis=1)
- data_attributes = concatenate([data_attributes, additional_attributes], axis=1)
-
- data_features = self.add_gen_flag(data_features, sample_len=self.sample_length)
- self._measurement_cols_metadata += [ColumnMetadata(discrete=True, output_dim=2, name="gen_flags")]
- return data_features, data_attributes
-
- def inverse_transform(self, X_features: ndarray, X_attributes: ndarray, gen_flags: ndarray) -> list[DataFrame]:
- """Inverts the data transformation pipelines on a passed DataFrame.
- Args:
- X_features (ndarray):
- Numpy array with the measurement data to be brought back to the original format.
- X_attributes (ndarray):
- Numpy array with the attribute data to be brought back to the original format.
- gen_flags (ndarray):
- Numpy array with the flags indicating the activation of features.
- Returns:
- result (DataFrame):
- DataFrame with all performed transformations inverted.
- """
- self._check_is_fitted()
-
- addi_cols_idx = addi_cols_idx_start = sum([c.output_dim for c in self._attribute_cols_metadata if c.real])
- for m_col_ix in range(len(self._measurement_num_cols)):
- max_plus_min = X_attributes[:, addi_cols_idx]
- max_minus_min = X_attributes[:, addi_cols_idx + 1]
- max_val = expand_dims(max_plus_min + max_minus_min, axis=1)
- min_val = expand_dims(max_plus_min - max_minus_min, axis=1)
- if self.normalize_tanh:
- X_features[:, :, m_col_ix] = (X_features[:, :, m_col_ix] + 1.0) / 2.0
- X_features[:, :, m_col_ix] = X_features[:, :, m_col_ix] * (max_val - min_val) + min_val
- addi_cols_idx += 2
-
- X_features = X_features * expand_dims(gen_flags, axis=2)
- X_attributes = X_attributes[:, :addi_cols_idx_start]
-
- num_samples = X_attributes.shape[0]
- if self._has_attributes:
- X_attributes = repeat(X_attributes.reshape((num_samples, 1, X_attributes.shape[1])), repeats=X_features.shape[1], axis=1)
- generated_data = concatenate((X_features, X_attributes), axis=2)
- else:
- generated_data = X_features
- output_cols = self._measurement_num_cols + self._measurement_one_hot_cat_cols + self._attribute_num_cols + self._attribute_one_hot_cat_cols
- one_hot_cat_cols = self._measurement_one_hot_cat_cols + self._attribute_one_hot_cat_cols
-
- samples = []
- for i in range(num_samples):
- df = DataFrame(generated_data[i], columns=output_cols)
- df_num_feat = df[self._measurement_num_cols].to_numpy()
- df_num_attr = self._num_pipeline.inverse_transform(df[self._attribute_num_cols]) if self._attribute_num_cols else zeros([len(df), 0])
- df_cat = self._cat_pipeline.inverse_transform(df[one_hot_cat_cols]) if self.cat_cols else zeros([len(df), 0])
- df = DataFrame(concatenate((df_num_feat, df_num_attr, df_cat), axis=1), columns=self._measurement_num_cols+self._attribute_num_cols+self.cat_cols)
- df = df.loc[:, self._col_order_]
- for col in df.columns:
- df[col] = df[col].astype(self._types[col])
- samples.append(df)
-
- return samples
diff --git a/src/ydata_synthetic/preprocessing/timeseries/stock.py b/src/ydata_synthetic/preprocessing/timeseries/stock.py
deleted file mode 100644
index f10367cc..00000000
--- a/src/ydata_synthetic/preprocessing/timeseries/stock.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
- Get the stock data from Yahoo finance data
- Data from the period 01 January 2017 - 24 January 2021
-"""
-import pandas as pd
-
-from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
-
-def transformations(path, seq_len: int):
- stock_df = pd.read_csv(path)
- try:
- stock_df = stock_df.set_index('Date').sort_index()
- except:
- stock_df=stock_df
- #Data transformations to be applied prior to be used with the synthesizer model
- processed_data = real_data_loading(stock_df.values, seq_len=seq_len)
-
- return processed_data
diff --git a/src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py b/src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py
deleted file mode 100644
index b48750d9..00000000
--- a/src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"Implementation of a TimeSeries DataProcessor."
-from enum import Enum
-from typing import List, Optional
-
-from typeguard import typechecked
-
-from ydata_synthetic.preprocessing.base_processor import BaseProcessor
-
-
-class TimeSeriesModels(Enum):
- "Supported models for the TimeSeries Data Processor."
- TIMEGAN = 'TIMEGAN'
- TSCWGAN = 'TSCWGAN'
-
-
-@typechecked
-class TimeSeriesDataProcessor(BaseProcessor):
- """
- Not implemented.
- """
- def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None):
- raise NotImplementedError
diff --git a/src/ydata_synthetic/preprocessing/timeseries/utils.py b/src/ydata_synthetic/preprocessing/timeseries/utils.py
deleted file mode 100644
index c77c67b2..00000000
--- a/src/ydata_synthetic/preprocessing/timeseries/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
- Utility functions to be shared by the time-series preprocessing required to feed the data into the synthesizers
-"""
-import numpy as np
-from sklearn.preprocessing import MinMaxScaler
-
-# Method implemented here: https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py
-# Originally used in TimeGAN research
-def real_data_loading(data: np.array, seq_len):
- """Load and preprocess real-world datasets.
- Args:
- - data_name: Numpy array with the values from a a Dataset
- - seq_len: sequence length
-
- Returns:
- - data: preprocessed data.
- """
- # Flip the data to make chronological data
- ori_data = data[::-1]
- # Normalize the data
- scaler = MinMaxScaler().fit(ori_data)
- ori_data = scaler.transform(ori_data)
-
- # Preprocess the dataset
- temp_data = []
- # Cut data by sequence length
- for i in range(0, len(ori_data) - seq_len):
- _x = ori_data[i:i + seq_len]
- temp_data.append(_x)
-
- # Mix the datasets (to make it similar to i.i.d)
- idx = np.random.permutation(len(temp_data))
- data = []
- for i in range(len(temp_data)):
- data.append(temp_data[idx[i]])
- return data
diff --git a/src/ydata_synthetic/streamlit_app/.streamlit/config.toml b/src/ydata_synthetic/streamlit_app/.streamlit/config.toml
deleted file mode 100644
index 12f51c67..00000000
--- a/src/ydata_synthetic/streamlit_app/.streamlit/config.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-[theme]
-base="light"
-primaryColor="#e32212"
diff --git a/src/ydata_synthetic/streamlit_app/About.py b/src/ydata_synthetic/streamlit_app/About.py
deleted file mode 100644
index cec3669d..00000000
--- a/src/ydata_synthetic/streamlit_app/About.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
- ydata-synthetic streamlit app landing page
-"""
-import streamlit as st
-
-def main():
- st.set_page_config(
- page_title="YData Synthetic - Synthetic data generation streamlit_app",
- page_icon="👋",
- layout="wide"
- )
- col1, col2 = st.columns([2, 4])
-
- with col1:
- st.image("https://assets.ydata.ai/oss/ydata-synthetic-_red.png", width=200)
-
- with col2:
- st.title("Welcome to YData Synthetic!")
- st.text("Your application for synthetic data generation!")
-
- st.markdown('[ydata-synthetic](https://github.com/ydataai/ydata-synthetic) is an open-source library and is used to generate synthetic data mimicking the real world data.')
- st.header('What is synthetic data?')
- st.markdown('Synthetic data is artificially generated data that is not collected from real-world events. It replicates the statistical components of real data containing no identifiable information, ensuring an individual’s privacy.')
- st.header('Why Synthetic Data?')
- st.markdown('''
- Synthetic data can be used for many applications:
- - Privacy
- - Remove bias
- - Balance datasets
- - Augment datasets''')
-
- # read the instructions in x/
- st.markdown('This *streamlit_app* application can generate synthetic data for your dataset. '
- 'Please read all the instructions in the sidebar before you start the process.')
-
- # read the instructions in x/
- st.subheader('Select & train a synthesizer')
- #Add here the example text for the end users
-
- st.markdown('''
- `ydata-synthetic` streamlit app enables the training and generation of synthetic data from generative architectures.
- The current app only provides support for the generation tabular data and for the following architectures:
- - GAN
- - WGAN
- - WGANGP
- - CTGAN
- - **ydata-sdk Synthesizer**
- ''')
-
- st.success('''In particular, **ydata-sdk Synthesizer** uses [`ydata-sdk`](https://docs.sdk.ydata.ai/) to leverage the state-of-the-art synthesizer model developed by YData.''')
- st.info('''
- Using **ydata-sdk Synthesizer** requires a valid token. The token is attached to a Fabric account.
- In case you do not have an account, you can create one at https://ydata.ai/ydata-fabric-free-trial.
- To obtain the token, please, login to https://fabric.ydata.ai.
- The token is available on the homepage once you are connected.
- ''')
-
- #best practives for synthetic data generation
- st.markdown('''
- ##### What you should ensure before training the synthesizer:
- - Make sure your dataset has no missing data.
- - If missing data is a problem, no worries. Check the article and this article.
- - Make sure you choose the right number of epochs and batch_size considering your dataset shape.
- - The choice of these 2 parameters highly affects the results you may get.
- - Make sure that you've the right data types selected.
- - Only numerical and categorical values are supported.
- - In case date , datetime, or text is available in the dataset, the columns should be preprocessed before the model training.''')
-
- st.markdown('The trained synthesizer is saved to `*.trained_synth.pkl*` by default.')
-
- st.subheader('Generate & compare synthetic samples')
-
- st.markdown('''
- The ydata-synthetic app experience allows you to:
- - Generate as many samples as you want based on the provided input
- - Generate a profile for the generated synthetic samples
- - Save the generated samples to a local directory''')
-
- # guidelines for sampling and
- st.markdown('''
- ##### What you should ensure before generating synthetic samples:
- - If no model file path is provided, the default location `.trained_synth.pkl` is assumed.
- - Always choose the correct type of data, that corresponds to the trained model in order to avoid loading errors.''')
-
- st.subheader('Coming soon')
- st.markdown('''
- - Support for time-series models: TimeGAN
- - Integrate more advanced settings for CTGAN
- - Side-by-side comparison real vs synthetic data sample with `ydata-profiling`''')
-
-if __name__ == '__main__':
- main()
\ No newline at end of file
diff --git a/src/ydata_synthetic/streamlit_app/__init__.py b/src/ydata_synthetic/streamlit_app/__init__.py
index aa617462..27e851a4 100644
--- a/src/ydata_synthetic/streamlit_app/__init__.py
+++ b/src/ydata_synthetic/streamlit_app/__init__.py
@@ -1,3 +1,13 @@
-from ydata_synthetic.streamlit_app.run import run
+"""
+ YData synthetic streamlit app for data synthesis
+"""
+from warnings import warn
-##
\ No newline at end of file
+from ydata_synthetic.streamlit_app import run
+
+warn(
+ "`import ydata_synthetic.streamllit_app` is deprecated. Please use **YData Fabric** instead."
+ "For more information check https://docs.fabric.ydata.ai/latest/. To start today go to http://ydata.ai/register.",
+ DeprecationWarning,
+ stacklevel=2,
+)
\ No newline at end of file
diff --git a/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py b/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py
deleted file mode 100644
index a5c573f0..00000000
--- a/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from typing import Union
-import os
-import json
-import logging
-
-import streamlit as st
-
-from ydata.sdk.synthesizers import RegularSynthesizer
-from ydata.sdk.common.client import get_client
-
-from ydata_synthetic.utils.logger import SynthesizersLogger
-from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
-from ydata_synthetic.synthesizers.regular.model import Model
-
-from ydata_synthetic.streamlit_app.pages.functions.load_data import upload_file
-from ydata_synthetic.streamlit_app.pages.functions.train import DataType, __CONDITIONAL_MODELS
-from ydata_synthetic.streamlit_app.pages.functions.train import init_synth, advanced_setttings, training_parameters
-
-logger = SynthesizersLogger(name='streamlitSynthesizer.logger')
-logger.setLevel(logging.INFO)
-
-def get_available_models(type: Union[str, DataType]):
-
- dtype = DataType(type)
- if dtype == DataType.TABULAR:
- models_list = [e.value.upper() for e in Model if e.value not in ['cgan', 'cwgangp']] + ['ydata-sdk Synthesizer']
- else:
- st.warning('Time-Series models are not yet supported .')
- models_list = ([''])
- return models_list
-
-def run():
- model_name= None
-
- df, num_cols, cat_cols = upload_file()
-
- if df is not None:
- st.subheader("2. Select your synthesizer parameters")
-
- col_type, col_model = st.columns(2)
-
- with col_type:
- datatype = st.selectbox('Select your data type', (DataType.TABULAR.value, ))
- with col_model:
- if datatype is not None:
- models_list = get_available_models(type=datatype)
- model_name = st.selectbox('Select your model', models_list)
-
- if model_name not in ['', 'ydata-sdk Synthesizer']:
- st.text("Select your synthesizer model parameters")
- col1, col2 = st.columns(2)
- with col1:
- batch_size = st.number_input('Batch size', 0, 500, 500, 1)
-
- with col2:
- lr = st.number_input('Learning rate', 0.01, 0.1, 0.05, 0.01)
-
- with st.expander('**More settings**'):
- model_path = st.text_input("Saved trained model to path:", value="trained_synth.pkl")
- noise_dim, layer_dim, beta_1, beta_2 = advanced_setttings()
-
- # Create the Train parameters
- gan_args = ModelParameters(batch_size=batch_size,
- lr=lr,
- betas=(beta_1, beta_2),
- noise_dim=noise_dim,
- layers_dim=layer_dim)
-
- model = init_synth(datatype=datatype, modelname=model_name, model_parameters=gan_args)
-
- if model != None:
- st.text("Set your synthesizer training parameters")
- #Get the training parameters
- epochs, label_col = training_parameters(model_name, df.columns)
-
- train_args = TrainParameters(epochs=epochs)
-
- st.subheader("3. Train your synthesizer")
- if st.button('Click here to start the training process'):
- with st.spinner("Please wait while your synthesizer trains..."):
- if label_col is not None:
- model.fit(data=df, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_args, label_cols=label_col)
- else:
- model.fit(data=df, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_args)
-
- st.success('Synthesizer was trained succesfully!')
- st.info(f"The trained model will be saved at {model_path}.")
-
- model.save(model_path)
-
-
-
- if model_name == 'ydata-sdk Synthesizer':
- valid_token = False
- st.text("Model parameters")
- col1, col2 = st.columns(2)
- with col1:
- token = st.text_input("SDK Token", type="password")
- os.environ['YDATA_TOKEN'] = token
-
- with col2:
- st.write("##")
- try:
- get_client()
- st.text('✅ Valid')
- valid_token = True
- except Exception:
- st.text('❌ Invalid')
-
- if not valid_token:
- st.error("""**ydata-sdk Synthesizer requires a valid token.**
- In case you do not have an account, please, create one at https://ydata.ai/ydata-fabric-free-trial.
- To obtain the token, please, login to https://fabric.ydata.ai.
- The token is available on the homepage once you are connected.
- """)
-
-
- with st.expander('**More settings**'):
- model_path = st.text_input("Saved trained model to path:", value="trained_synth.pkl")
-
- st.subheader("3. Train your synthesizer")
- if st.button('Click here to start the training process', disabled=not valid_token):
-
- logger.info_def_report(model='fabric')
- model = RegularSynthesizer()
- with st.spinner("Please wait while your synthesizer trains..."):
- dtypes = {}
- for c in num_cols:
- dtypes[c] = 'numerical'
- for c in cat_cols:
- dtypes[c] = 'categorical'
- model.fit(X=df, dtypes=dtypes)
-
- st.success('Synthesizer was trained succesfully!')
- st.info(f"The trained model will be saved at {model_path}.")
-
- model_data = {
- 'uid': model.uid,
- 'token': os.environ['YDATA_TOKEN']
- }
- with open(model_path, 'w') as outfile:
- json.dump(model_data, outfile)
-
-
-
-
-if __name__ == '__main__':
- run()
\ No newline at end of file
diff --git a/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py b/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py
deleted file mode 100644
index 5eba0df4..00000000
--- a/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import streamlit as st
-import json
-import os
-
-from ydata.sdk.synthesizers import RegularSynthesizer
-from ydata.sdk.common.client import get_client
-
-from ydata_synthetic.streamlit_app.pages.functions.train import DataType
-from ydata_synthetic.streamlit_app.pages.functions.generate import load_model, generate_profile
-
-def run():
- st.subheader("Generate synthetic data from a trained model")
- from_SDK = False
- model_data = {}
- valid_token = False
- col1, col2 = st.columns([4, 2])
- with col1:
- input_path = st.text_input("Provide the path to a trained model", value="trained_synth.pkl")
- # Try to load as a JSON as SDK
- try:
- f = open(input_path)
- model_data = json.load(f)
- from_SDK = True
- except:
- pass
-
- if from_SDK:
- token = st.text_input("SDK Token", type="password", value=model_data.get('token'))
- os.environ['YDATA_TOKEN'] = token
-
-
- with col2:
- datatype = st.selectbox('Select your data type', (DataType.TABULAR.value,))
- datatype=DataType(datatype)
-
- if from_SDK and 'YDATA_TOKEN' in os.environ:
- st.write("##")
- try:
- get_client()
- st.text('✅ Valid')
- valid_token = True
- except Exception:
- st.text('❌ Invalid')
-
- if from_SDK and 'token' in model_data and not valid_token:
- st.warning("The token used during training is not valid anymore. Please, use a new token.")
-
- if from_SDK and not valid_token:
- st.error("""**ydata-sdk Synthesizer requires a valid token.**
- In case you do not have an account, please, create one at https://ydata.ai/ydata-fabric-free-trial.
- To obtain the token, please, login to https://fabric.ydata.ai.
- The token is available on the homepage once you are connected.
- """)
-
- col1, col2 = st.columns([4,2])
- with col1:
- n_samples = st.number_input("Number of samples to generate", min_value=0, value=1000)
- profile = st.checkbox("Generate synthetic data profiling?", value=False)
- with col2:
- sample_path = st.text_input("Synthetic samples file path", value='synthetic.csv')
-
- if st.button('Generate samples'):
- if from_SDK:
- model = RegularSynthesizer.get(uid=model_data.get('uid'))
-
- else:
- model = load_model(input_path=input_path, datatype=datatype)
-
- st.success('The model was properly loaded and is now ready to generate synthetic samples!')
-
-
- #sample synthetic data
- with st.spinner('Generating samples... This might take time.'):
- synth_data = model.sample(n_samples)
- st.write(synth_data)
-
- #save the synthetic data samples to a given path
- synth_data.to_csv(sample_path)
-
- if profile:
- generate_profile(df=synth_data)
-
-if __name__ == '__main__':
- run()
\ No newline at end of file
diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/__init__.py b/src/ydata_synthetic/streamlit_app/pages/functions/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/generate.py b/src/ydata_synthetic/streamlit_app/pages/functions/generate.py
deleted file mode 100644
index cb098119..00000000
--- a/src/ydata_synthetic/streamlit_app/pages/functions/generate.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""
- Auxiliary functions for the synthetic data generation
-"""
-# move the datatype somewhere else??
-import pandas as pd
-from ydata_profiling import ProfileReport
-from streamlit_pandas_profiling import st_profile_report
-
-from ydata_synthetic.streamlit_app.pages.functions.train import DataType
-from ydata_synthetic.synthesizers.regular import RegularSynthesizer
-from ydata_synthetic.synthesizers.timeseries import TimeGAN
-
-def load_model(input_path: str, datatype: DataType):
- if datatype == DataType.TABULAR:
- model = RegularSynthesizer.load(input_path)
- else:
- model = TimeGAN.load(input_path)
- return model
-
-def generate_profile(df: pd.DataFrame):
- report = ProfileReport(df, title='Synthetic data profile', interactions=None)
- st_profile_report(report)
\ No newline at end of file
diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py b/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py
deleted file mode 100644
index 7ff3051f..00000000
--- a/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import streamlit as st
-import pandas as pd
-
-def upload_file():
- df = None
- num_cols = None
- cat_cols = None
-
- st.subheader("1. Select your dataset")
- uploaded_file = st.file_uploader("Choose a file:")
-
- if uploaded_file is not None:
- df = pd.read_csv(uploaded_file)
- st.write(df)
-
- #add here more things for the mainpage
- if df is not None:
- col1, col2 = st.columns(2)
- with col1:
- num_cols = st.multiselect('Choose the numerical columns', df.columns, key=1)
- with col2:
- cat_cols = st.multiselect('Choose categorical columns', [x for x in df.columns if x not in num_cols], key=2)
-
- return df, num_cols, cat_cols
-
diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/train.py b/src/ydata_synthetic/streamlit_app/pages/functions/train.py
deleted file mode 100644
index e6dd35da..00000000
--- a/src/ydata_synthetic/streamlit_app/pages/functions/train.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
- Auxiliary functions for synthetic data training
-"""
-from enum import Enum
-import streamlit as st
-
-from ydata_synthetic.synthesizers.regular import RegularSynthesizer
-from ydata_synthetic.synthesizers.timeseries.timegan.model import TimeGAN
-from ydata_synthetic.synthesizers import ModelParameters
-
-__MODEL_MAPPING = {'tabular': RegularSynthesizer, 'timeseries': TimeGAN}
-__CONDITIONAL_MODELS = ['CGAN', 'CWGANGP']
-
-class DataType(Enum):
- TABULAR = 'tabular'
- TIMESERIES = 'timeseries'
-
-def init_synth(datatype: str, modelname: str, model_parameters: ModelParameters, n_critic: int=1):
- synth = __MODEL_MAPPING[datatype]
- modelname = modelname.lower()
- if modelname in ['wgan', 'cwgangp', 'wgangp']:
- synth = synth(modelname=modelname,
- model_parameters=model_parameters,
- n_critic=n_critic)
- else:
- synth = synth(modelname=modelname,
- model_parameters=model_parameters)
- return synth
-
-def advanced_setttings():
- col1, col2 = st.columns(2)
- with col1:
- noise_dim = st.number_input('Select noise dimension', 0, 200, 128, 1)
- layer_dim = st.number_input('Select the layer dimension', 0, 200, 128, 1)
- with col2:
- beta_1 = st.slider('Select first beta co-efficient', 0.0, 1.0, 0.5)
- beta_2 = st.slider('Select second beta co-efficient', 0.0, 1.0, 0.9)
- return noise_dim, layer_dim, beta_1, beta_2
-
-def training_parameters(model_name:str, df_cols: list):
- col1, col2 = st.columns([2, 4])
- with col1:
- epochs = st.number_input('Epochs', min_value=0, value=100)
-
- if model_name in __CONDITIONAL_MODELS:
- with col2:
- label_col = st.multiselect('Choose the conditional cols:', df_cols)
- else:
- label_col=None
- return epochs, label_col
\ No newline at end of file
diff --git a/src/ydata_synthetic/streamlit_app/run.py b/src/ydata_synthetic/streamlit_app/run.py
index 5e416b4e..f009cfa0 100644
--- a/src/ydata_synthetic/streamlit_app/run.py
+++ b/src/ydata_synthetic/streamlit_app/run.py
@@ -1,15 +1,12 @@
"""
Logic to run streamlit app from python code
"""
-import os
-from streamlit import config as _config
-from streamlit.web import bootstrap
+from warnings import warn
def run():
- dir_path = os.path.dirname(__file__)
- file_path = os.path.join(dir_path, "About.py")
-
- _config.set_option("server.headless", True)
- args = []
-
- bootstrap.run(file_path,'',args, flag_options={})
\ No newline at end of file
+ warn(
+        "`import ydata_synthetic.streamlit_app` is deprecated. Please use **YData Fabric** instead. "
+        "For more information check https://docs.fabric.ydata.ai/latest/. To start today go to http://ydata.ai/register.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/__init__.py b/src/ydata_synthetic/synthesizers/__init__.py
index 65e8da40..8aacc4cc 100644
--- a/src/ydata_synthetic/synthesizers/__init__.py
+++ b/src/ydata_synthetic/synthesizers/__init__.py
@@ -1,6 +1,36 @@
-from ydata_synthetic.synthesizers.base import ModelParameters, TrainParameters
+"""
+ ydata_synthetic.synthesizers init file
+"""
+from warnings import warn
-__all__ = [
- "ModelParameters",
- "TrainParameters"
-]
\ No newline at end of file
+from ydata_synthetic.synthesizers.regular.ctgan.model import CTGAN
+from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN
+from ydata_synthetic.synthesizers.regular.vanillagan.model import VanillaGAN
+from ydata_synthetic.synthesizers.regular.gmm.model import GMM
+from ydata_synthetic.synthesizers.regular.wgan.model import WGAN
+from ydata_synthetic.synthesizers.regular.wgangp.model import WGAN_GP
+from ydata_synthetic.synthesizers.regular.cwgangp.model import CWGANGP
+from ydata_synthetic.synthesizers.regular.cgan.model import CGAN
+from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN
+from ydata_synthetic.synthesizers.timeseries.timegan.model import TimeGAN
+from ydata_synthetic.synthesizers.timeseries.doppelganger.model import DoppelGANgerNetwork
+
+warn(
+    "`import ydata_synthetic.synthesizers` is deprecated. Please use `import ydata.sdk.synthesizers` instead. "
+ "For more information check https://docs.synthetic.ydata.ai/latest and https://docs.fabric.ydata.ai/latest/sdk",
+ DeprecationWarning,
+ stacklevel=2,
+)
+
+__all__ = ['CTGAN',
+ 'CRAMERGAN',
+ 'VanillaGAN',
+ 'WGAN',
+ 'WGAN_GP',
+ 'CWGANGP',
+ 'DRAGAN',
+ 'CGAN',
+ 'GMM',
+ 'TimeGAN',
+ 'DoppelGANgerNetwork'
+ ]
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/base.py b/src/ydata_synthetic/synthesizers/base.py
index 850c2079..5a17fb71 100644
--- a/src/ydata_synthetic/synthesizers/base.py
+++ b/src/ydata_synthetic/synthesizers/base.py
@@ -1,319 +1,43 @@
-"Implements a GAN BaseModel synthesizer, not meant to be directly instantiated."
-from abc import ABC, abstractmethod
-from collections import namedtuple
-from typing import List, Optional, Union
-
-import pandas as pd
-import tqdm
-
-from numpy import array, vstack, ndarray
-from numpy.random import normal
-from pandas.api.types import is_float_dtype, is_integer_dtype
-from pandas import DataFrame
-from pandas import concat
+"""
+ Implements a GAN BaseModel synthesizer, not meant to be directly instantiated
+"""
+from abc import ABC
-from joblib import dump, load
+from typing import List, Optional
+from warnings import warn
-import tensorflow as tf
-
-from tensorflow import config as tfconfig
-from tensorflow import data as tfdata
-from tensorflow import random
-from typeguard import typechecked
+from collections import namedtuple
-from ydata_synthetic.preprocessing.regular.processor import (
- RegularDataProcessor, RegularModels)
-from ydata_synthetic.preprocessing.timeseries.timeseries_processor import (
- TimeSeriesDataProcessor, TimeSeriesModels)
-from ydata_synthetic.preprocessing.regular.ctgan_processor import CTGANDataProcessor
-from ydata_synthetic.preprocessing.timeseries.doppelganger_processor import DoppelGANgerProcessor
-from ydata_synthetic.synthesizers.saving_keras import make_keras_picklable
_model_parameters = ['batch_size', 'lr', 'betas', 'layers_dim', 'noise_dim',
- 'n_cols', 'seq_len', 'condition', 'n_critic', 'n_features',
- 'tau_gs', 'generator_dims', 'critic_dims', 'l2_scale',
+ 'n_cols', 'seq_len', 'condition', 'n_critic', 'n_features',
+ 'tau_gs', 'generator_dims', 'critic_dims', 'l2_scale',
'latent_dim', 'gp_lambda', 'pac', 'gamma', 'tanh']
_model_parameters_df = [128, 1e-4, (None, None), 128, 264,
- None, None, None, 1, None, 0.2, [256, 256],
+ None, None, None, 1, None, 0.2, [256, 256],
[256, 256], 1e-6, 128, 10.0, 10, 1, False]
-_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval',
- 'labels', 'n_clusters', 'epsilon', 'log_frequency',
- 'measurement_cols', 'sequence_length', 'number_sequences',
+_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval',
+ 'labels', 'n_clusters', 'epsilon', 'log_frequency',
+ 'measurement_cols', 'sequence_length', 'number_sequences',
'sample_length', 'rounds']
ModelParameters = namedtuple('ModelParameters', _model_parameters, defaults=_model_parameters_df)
TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None, 10, 0.005, True, None, 1, 1, 1, 1))
-@typechecked
class BaseModel(ABC):
"""
- Abstract class for synthetic data generation nmodels
-
- The main methods are train (for fitting the synthesizer), save/load and sample (generating synthetic records).
-
+ This class is deprecated and should no longer be used.
+    Please use the ydata-sdk RegularSynthesizer or TimeSeriesSynthesizer instead.
"""
- __MODEL__ = None
-
- @abstractmethod
- def fit(self, data: Union[DataFrame, array],
+ def __init__(self,
+ model_parameters: ModelParameters,
num_cols: Optional[List[str]] = None,
- cat_cols: Optional[List[str]] = None):
- """
- ### Description:
- Trains and fit a synthesizer model to a given input dataset.
-
- ### Args:
- `data` (Union[DataFrame, array]): Training data
- `num_cols` (Optional[List[str]]) : List with the names of the categorical columns
- `cat_cols` (Optional[List[str]]): List of names of categorical columns
-
- ### Returns:
- **self:** *object*
- Fitted synthesizer
- """
- ...
- @abstractmethod
- def sample(self, n_samples:int) -> pd.DataFrame:
- assert n_samples>0, "Please insert a value bigger than 0 for n_samples parameter."
- ...
-
- @classmethod
- def load(cls, path: str):
- ...
-
- @abstractmethod
- def save(self, path: str):
- ...
-
-# pylint: disable=R0902
-@typechecked
-class BaseGANModel(BaseModel):
- """
- Base class of GAN synthesizer models.
- The main methods are train (for fitting the synthesizer), save/load and sample (obtain synthetic records).
- Args:
- model_parameters (ModelParameters):
- Set of architectural parameters for model definition.
- """
- def __init__(
- self,
- model_parameters: ModelParameters
- ):
- gpu_devices = tfconfig.list_physical_devices('GPU')
- if len(gpu_devices) > 0:
- try:
- tfconfig.experimental.set_memory_growth(gpu_devices[0], True)
- except (ValueError, RuntimeError):
- # Invalid device or cannot modify virtual devices once initialized.
- pass
- #Validate the provided model parameters
- if model_parameters.betas is not None:
- assert len(model_parameters.betas) == 2, "Please provide the betas information as a tuple."
-
- self.batch_size = model_parameters.batch_size
- self._set_lr(model_parameters.lr)
- self.beta_1 = model_parameters.betas[0]
- self.beta_2 = model_parameters.betas[1]
- self.noise_dim = model_parameters.noise_dim
- self.data_dim = None
- self.layers_dim = model_parameters.layers_dim
-
- # Additional parameters for the CTGAN
- self.generator_dims = model_parameters.generator_dims
- self.critic_dims = model_parameters.critic_dims
- self.l2_scale = model_parameters.l2_scale
- self.latent_dim = model_parameters.latent_dim
- self.gp_lambda = model_parameters.gp_lambda
- self.pac = model_parameters.pac
-
- self.use_tanh = model_parameters.tanh
- self.processor=None
- if self.__MODEL__ in RegularModels.__members__ or \
- self.__MODEL__ == CTGANDataProcessor.SUPPORTED_MODEL:
- self.tau = model_parameters.tau_gs
-
- # pylint: disable=E1101
- def __call__(self, inputs, **kwargs):
- return self.model(inputs=inputs, **kwargs)
-
- # pylint: disable=C0103
- def _set_lr(self, lr):
- if isinstance(lr, float):
- self.g_lr=lr
- self.d_lr=lr
- elif isinstance(lr,(list, tuple)):
- assert len(lr)==2, "Please provide a two values array for the learning rates or a float."
- self.g_lr=lr[0]
- self.d_lr=lr[1]
-
- def define_gan(self):
- """Define the trainable model components.
-
- Optionally validate model structure with mock inputs and initialize optimizers."""
- raise NotImplementedError
-
- @property
- def model_parameters(self):
- "Returns the parameters of the model."
- return self._model_parameters
-
- @property
- def model_name(self):
- "Returns the model (class) name."
- return self.__class__.__name__
-
- def fit(self,
- data: Union[DataFrame, array],
- num_cols: Optional[List[str]] = None,
- cat_cols: Optional[List[str]] = None,
- train_arguments: Optional[TrainParameters] = None) -> Union[DataFrame, array]:
- """
- Trains and fit a synthesizer model to a given input dataset.
-
- Args:
- data (Union[DataFrame, array]): Training data
- num_cols (Optional[List[str]]) : List with the names of the categorical columns
- cat_cols (Optional[List[str]]): List of names of categorical columns
- train_arguments (Optional[TrainParameters]): Training parameters
-
- Returns:
- Fitted synthesizer
- """
- if self.__MODEL__ in RegularModels.__members__:
- self.processor = RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data)
- elif self.__MODEL__ in TimeSeriesModels.__members__:
- self.processor = TimeSeriesDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data)
- elif self.__MODEL__ == CTGANDataProcessor.SUPPORTED_MODEL:
- n_clusters = train_arguments.n_clusters
- epsilon = train_arguments.epsilon
- self.processor = CTGANDataProcessor(n_clusters=n_clusters, epsilon=epsilon,
- num_cols=num_cols, cat_cols=cat_cols).fit(data)
- elif self.__MODEL__ == DoppelGANgerProcessor.SUPPORTED_MODEL:
- measurement_cols = train_arguments.measurement_cols
- sequence_length = train_arguments.sequence_length
- sample_length = train_arguments.sample_length
- self.processor = DoppelGANgerProcessor(num_cols=num_cols, cat_cols=cat_cols,
- measurement_cols=measurement_cols,
- sequence_length=sequence_length,
- sample_length=sample_length,
- normalize_tanh=self.use_tanh).fit(data)
- else:
- print(f'A DataProcessor is not available for the {self.__MODEL__}.')
-
- def sample(self, n_samples: int):
- """
- Generates samples from the trained synthesizer.
-
- Args:
- n_samples (int): Number of rows to generated.
-
- Returns:
- synth_sample (pandas.DataFrame): generated synthetic samples.
- """
- steps = n_samples // self.batch_size + 1
- data = []
- for _ in tqdm.trange(steps, desc='Synthetic data generation'):
- z = random.uniform([self.batch_size, self.noise_dim], dtype=tf.dtypes.float32)
- records = self.generator(z, training=False).numpy()
- data.append(records)
- return self.processor.inverse_transform(array(vstack(data)))
-
- def save(self, path):
- """
- Saves a synthesizer as a pickle.
-
- Args:
- path (str): Path to write the synthesizer as a pickle object.
- """
- #Save only the generator?
- if self.__MODEL__=='WGAN' or self.__MODEL__=='WGAN_GP' or self.__MODEL__=='CWGAN_GP':
- del self.critic
- make_keras_picklable()
- dump(self, path)
-
- @classmethod
- def load(cls, path):
- """
- Loads a saved synthesizer from a pickle.
-
- Args:
- path (str): Path to read the synthesizer pickle from.
- """
- gpu_devices = tfconfig.list_physical_devices('GPU')
- if len(gpu_devices) > 0:
- try:
- tfconfig.experimental.set_memory_growth(gpu_devices[0], True)
- except (ValueError, RuntimeError):
- # Invalid device or cannot modify virtual devices once initialized.
- pass
- synth = load(path)
- return synth
-
-
-class ConditionalModel(BaseModel):
-
- @staticmethod
- def _validate_label_col(data: DataFrame, label_cols: List[str]):
- "Validates the label_col format, raises ValueError if invalid."
- assert all(item in list(data.columns) for item in label_cols), \
- f"The column {label_cols} could not be found on the provided dataset and cannot be used as condition."
- assert all(data[label_cols].isna().sum() == 0), \
- f"The provided {label_cols} contains NaN values, please impute or drop the respective records before proceeding."
- assert all([(is_float_dtype(data[col]) or is_integer_dtype(data[col])) for col in label_cols]), \
- f"The provided {label_cols} are expected to be integers or floats."
- unique_frac = data[label_cols].nunique() / len(data.index)
- assert all(unique_frac < 0.3), \
- f"The provided columns {label_cols} are not valid conditional columns due to high cardinality. Please revise your input."
-
- def _prep_fit(self, data: DataFrame, label_cols: List[str], num_cols: List[str], cat_cols: List[str]):
- """
- Validate and prepare the data for the training of a conditionalGAN architecture
- Args:
- data: training data
- label_cols: label columns
- num_cols: numerical columns
- cat_cols: categorical columns
- Returns:
- data, label: preprocessed data and labels
- """
- # Validating the label columns
- self._validate_label_col(data, label_cols)
- self._col_order = data.columns
- self.label_col = label_cols
-
- # Separating labels from the rest of the data to fit the data processor
- data, label = data[data.columns[~data.columns.isin(label_cols)]], data[label_cols].values
-
- BaseModel.fit(self, data, num_cols, cat_cols)
- return data, label
-
- def _generate_noise(self):
- "Gaussian noise for the generator input."
- while True:
- yield normal(size=self.noise_dim)
-
- def get_batch_noise(self):
- "Create a batch iterator for the generator gaussian noise input."
- return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=tf.dtypes.float32)
- .batch(self.batch_size)
- .repeat())
-
- def sample(self, condition: DataFrame) -> ndarray:
- """
- Method to generate synthetic samples from a conditional synth previsously trained.
- Args:
- condition (pandas.DataFrame): A dataframe with the shape (n_cols, nrows) where n_cols=number of columns used to condition the training
- n_samples (int): Number of synthetic samples to be generated
-
- Returns:
- sample (pandas.DataFrame): A dataframe with the generated synthetic records.
- """
- ##Validate here if the cond_vector=label_dim
- condition = condition.reset_index(drop=True)
- n_samples = len(condition)
- z_dist = random.uniform(shape=(n_samples, self.noise_dim))
- records = self.generator([z_dist, condition], training=False)
- data = self.processor.inverse_transform(array(records))
- data = concat([condition, data], axis=1)
- return data[self._col_order]
+ cat_cols: Optional[List[str]] = None,
+ **kwargs):
+ warn(
+ f"{self.__class__.__name__} is deprecated. Please leverage ydata-sdk RegularSynthesizer or TimeSeriesSynthesizer instead. For more information, "
+ f"check ydata-sdk documentation: https://docs.fabric.ydata.ai/latest/sdk/examples/synthesize_tabular_data/.",
+ DeprecationWarning,
+ stacklevel=2
+ )
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/loss.py b/src/ydata_synthetic/synthesizers/loss.py
deleted file mode 100644
index a4f45671..00000000
--- a/src/ydata_synthetic/synthesizers/loss.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from tensorflow import \
- (random, reshape, shape, GradientTape, reduce_mean,
- norm as tfnorm, tile, constant, int32)
-from tensorflow.math import reduce_std, reduce_euclidean_norm
-from enum import Enum
-
-class Mode(Enum):
- WGANGP = 'wgangp'
- DRAGAN = 'dragan'
- CRAMER = 'cramer'
- CTGAN = 'ctgan'
-
-## Original code loss from
-## https://github.com/LynnHo/DCGAN-LSGAN-WGAN-GP-DRAGAN-Tensorflow-2/blob/master/tf2gan/loss.py
-def gradient_penalty(f, real, fake, mode, pac=None):
- def _gradient_penalty(f, real, fake=None):
- def _interpolate(a, b=None):
- if b is None: # interpolation in DRAGAN
- beta = random.uniform(shape=shape(a), minval=0., maxval=1.)
- b = a + 0.5 * reduce_std(a) * beta
- shape_ = [shape(a)[0]] + [1] * (a.shape.ndims - 1)
- alpha = random.uniform(shape=shape_, minval=0., maxval=1.)
- inter = a + alpha * (b - a)
- inter.set_shape(a.shape)
- return inter
-
- x = _interpolate(real, fake)
- with GradientTape() as t:
- t.watch(x)
- pred = f(x)
- grad = t.gradient(pred, x)
- norm = tfnorm(reshape(grad, [shape(grad)[0], -1]), axis=1)
- gp = reduce_mean((norm - 1.)**2)
- return gp
-
- def _gradient_penalty_cramer(f_crit, real, fake):
- epsilon = random.uniform([real.shape[0], 1], 0.0, 1.0)
- x_hat = epsilon * real + (1 - epsilon) * fake[0]
- with GradientTape() as t:
- t.watch(x_hat)
- f_x_hat = f_crit(x_hat, fake[1])
- gradients = t.gradient(f_x_hat, x_hat)
- c_dx = tfnorm(reshape(gradients, [shape(gradients)[0], -1]), axis=1)
- c_regularizer = (c_dx - 1.0) ** 2
- return c_regularizer
-
- def _gradient_penalty_ctgan(f, real, fake, pac=10):
- alpha = random.uniform([real.shape[0] // pac, 1, 1], 0., 1.)
- alpha = tile(alpha, constant([1, pac, real.shape[1]], int32))
- alpha = reshape(alpha, [-1, real.shape[1]])
- interpolate = alpha * real + ((1 - alpha) * fake)
- with GradientTape() as tape:
- tape.watch(interpolate)
- prediction = f(interpolate)
- gradient = tape.gradient(prediction, [interpolate])[0]
- gradient = reshape(gradient, constant([-1, pac * real.shape[1]], int32))
- slope = reduce_euclidean_norm(gradient, axis=1)
- return reduce_mean((slope - 1.) ** 2)
-
- if mode == Mode.DRAGAN:
- gp = _gradient_penalty(f, real)
- elif mode == Mode.CRAMER:
- gp = _gradient_penalty_cramer(f, real, fake)
- elif mode == Mode.WGANGP:
- gp = _gradient_penalty(f, real, fake)
- elif mode == Mode.CTGAN:
- if pac is not None:
- gp = _gradient_penalty_ctgan(f, real, fake, pac=pac)
- else:
- gp = _gradient_penalty_ctgan(f, real, fake)
-
- return gp
diff --git a/src/ydata_synthetic/synthesizers/regular/__init__.py b/src/ydata_synthetic/synthesizers/regular/__init__.py
index 78ee556c..0602bc74 100644
--- a/src/ydata_synthetic/synthesizers/regular/__init__.py
+++ b/src/ydata_synthetic/synthesizers/regular/__init__.py
@@ -1,5 +1,11 @@
-from ydata_synthetic.synthesizers.regular.model import RegularSynthesizer
+"""
+ ydata_synthetic.synthesizers.regular init file
+"""
+from warnings import warn
-__all__ = [
- "RegularSynthesizer",
-]
+warn(
+    "`import ydata_synthetic.synthesizers.regular` is deprecated. Please use `from ydata.sdk.synthesizers import RegularSynthesizer` instead. "
+ "For more information check https://docs.synthetic.ydata.ai/latest and https://docs.fabric.ydata.ai/latest/sdk",
+ DeprecationWarning,
+ stacklevel=2,
+)
diff --git a/src/ydata_synthetic/synthesizers/regular/cgan/model.py b/src/ydata_synthetic/synthesizers/regular/cgan/model.py
index f48dd7d7..fc0bbf9d 100644
--- a/src/ydata_synthetic/synthesizers/regular/cgan/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/cgan/model.py
@@ -1,240 +1,10 @@
"""
- CGAN architecture implementation file
+ CGAN class file
"""
-import os
-from os import path
-from typing import List, Optional, NamedTuple
+from ydata_synthetic.synthesizers.base import BaseModel
-from tqdm import trange
-
-import numpy as np
-from numpy import hstack
-from pandas import DataFrame
-
-from tensorflow import random
-from tensorflow import data as tfdata
-from tensorflow import dtypes
-from keras import Model
-from keras.layers import (Dense, Dropout, Input, concatenate)
-from keras.optimizers import Adam
-
-#Import ydata synthetic classes
-from ....synthesizers import TrainParameters
-from ....synthesizers.base import ConditionalModel
-
-class CGAN(ConditionalModel):
- "CGAN model for discrete conditions"
-
- __MODEL__='CGAN'
-
- def __init__(self, model_parameters):
- self._col_order = None
- super().__init__(model_parameters)
-
- def define_gan(self, activation_info: Optional[NamedTuple] = None):
- """Define the trainable model components.
-
- Args:
- activation_info (Optional[NamedTuple]): Defaults to None
- """
- self.generator = Generator(self.batch_size). \
- build_model(input_shape=(self.noise_dim,),
- label_shape=(self.label_dim),
- dim=self.layers_dim, data_dim=self.data_dim,
- activation_info = activation_info, tau = self.tau)
-
- self.discriminator = Discriminator(self.batch_size). \
- build_model(input_shape=(self.data_dim,),
- label_shape=(self.label_dim,),
- dim=self.layers_dim)
-
- g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- d_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
-
- # Build and compile the discriminator
- self.discriminator.compile(loss='binary_crossentropy',
- optimizer=d_optimizer,
- metrics=['accuracy'])
-
- # The generator takes noise as input and generates imgs
- noise = Input(shape=(self.noise_dim,))
- label = Input(shape=(1,)) # A label vector is expected
- record = self.generator([noise, label])
-
- # For the combined model we will only train the generator
- self.discriminator.trainable = False
-
- # The discriminator takes generated images as input and determines validity
- validity = self.discriminator([record, label])
-
- # The combined model (stacked generator and discriminator)
- # Trains the generator to fool the discriminator
- self._model = Model([noise, label], validity)
- self._model.compile(loss='binary_crossentropy', optimizer=g_optimizer)
-
- def _generate_noise(self):
- """Gaussian noise for the generator input."""
- while True:
- yield random.uniform(shape=(self.noise_dim,))
-
- def get_batch_noise(self):
- """Create a batch iterator for the generator gaussian noise input."""
- return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=dtypes.float32)
- .batch(self.batch_size)
- .repeat())
-
- def get_data_batch(self, data, batch_size, seed=0):
- """Produce real data batches from the passed data object.
-
- Args:
- data: real data.
- batch_size: batch size.
- seed (int, optional): Defaults to 0.
-
- Returns:
- data batch.
- """
- start_i = (batch_size * seed) % len(data)
- stop_i = start_i + batch_size
- shuffle_seed = (batch_size * seed) // len(data)
- np.random.seed(shuffle_seed)
- data_ix = np.random.choice(data.shape[0], replace=False, size=len(data)) # wasteful to shuffle every time
- return data[data_ix[start_i: stop_i]]
-
- def fit(self,
- data: DataFrame,
- label_cols: List[str],
- train_arguments: TrainParameters,
- num_cols: List[str],
- cat_cols: List[str]):
- """Trains and fit a synthesizer model to a given input dataset.
-
- Args:
- data: A pandas DataFrame with the data to be synthesized
- label_cols: The name of the column to be used as a label and condition for the training
- train_arguments: GAN training arguments.
- num_cols: List of columns of the data object to be handled as numerical
- cat_cols: List of columns of the data object to be handled as categorical
- """
- data, label = self._prep_fit(data,label_cols,num_cols,cat_cols)
-
- processed_data = self.processor.transform(data)
- self.data_dim = processed_data.shape[1]
- self.label_dim = len(label_cols)
-
- # Init the GAN model and optimizers
- self.define_gan(self.processor.col_transform_info)
-
- # Merging labels with processed data
- processed_data = hstack([processed_data, label])
-
- noise_batches = self.get_batch_noise()
-
- iterations = int(abs(processed_data.shape[0] / self.batch_size) + 1)
- # Adversarial ground truths
- valid = np.ones((self.batch_size, 1))
- fake = np.zeros((self.batch_size, 1))
-
- for epoch in trange(train_arguments.epochs):
- for _ in range(iterations):
- # ---------------------
- # Train Discriminator
- # ---------------------
- batch_x = self.get_data_batch(processed_data, self.batch_size) # Batches are retrieved with labels
- batch_x, label = batch_x[:, :-1], batch_x[:, -1] # Separate labels from batch
- noise = next(noise_batches)
-
- # Generate a batch of new records
- gen_records = self.generator([noise, label], training=True)
-
- # Train the discriminator
- d_loss_real = self.discriminator.train_on_batch([batch_x, label], valid) # Separate labels
- d_loss_fake = self.discriminator.train_on_batch([gen_records, label], fake) # Separate labels
- d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
-
- # ---------------------
- # Train Generator
- # ---------------------
- noise = next(noise_batches)
- # Train the generator (to have the discriminator label samples as valid)
- g_loss = self._model.train_on_batch([noise, label], valid)
-
- # Plot the progress
- print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))
-
- # If at save interval => save model state and generated image samples
- if epoch % train_arguments.sample_interval == 0:
- self._run_checkpoint(train_arguments, epoch, label)
-
- def _run_checkpoint(self, train_arguments, epoch, label):
- """Run checkpoint and store model state and generated samples.
-
- Args:
- train_arguments: GAN training arguments.
- epoch: training epoch
- label: deprecated
- """
- if path.exists('./cache') is False:
- os.mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch))
- self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch))
-
-# pylint: disable=R0903
-class Generator():
- "Standard discrete conditional generator."
- def __init__(self, batch_size):
- self.batch_size = batch_size
-
- def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- label_shape: label dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- Returns:
- Generator model
- """
- noise = Input(shape=input_shape, batch_size=self.batch_size)
- label_v = Input(shape=label_shape)
- x = concatenate([noise, label_v])
- x = Dense(dim, activation='relu')(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim)(x)
- #if activation_info:
- # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
- return Model(inputs=[noise, label_v], outputs=x)
-
-
-# pylint: disable=R0903
-class Discriminator():
- "Standard discrete conditional discriminator."
- def __init__(self, batch_size):
- self.batch_size = batch_size
-
- def build_model(self, input_shape, label_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- label_shape: labels dimenstionality.
- dim: hidden layers size.
-
- Returns:
- Discriminator model
- """
- events = Input(shape=input_shape, batch_size=self.batch_size)
- label = Input(shape=label_shape, batch_size=self.batch_size)
- input_ = concatenate([events, label])
- x = Dense(dim * 4, activation='relu')(input_)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation='relu')(x)
- x = Dense(1, activation='sigmoid')(x)
- return Model(inputs=[events, label], outputs=x)
+class CGAN(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+    Please use the ydata-sdk synthesizers instead.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/cramergan/model.py b/src/ydata_synthetic/synthesizers/regular/cramergan/model.py
index 3915d29f..6ab69d39 100644
--- a/src/ydata_synthetic/synthesizers/regular/cramergan/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/cramergan/model.py
@@ -1,283 +1,10 @@
"""
- CramerGAN model file
+ CramerGAN class file
"""
-import os
-from os import path
-from typing import List, Optional, NamedTuple
+from ydata_synthetic.synthesizers.base import BaseModel
-import numpy as np
-
-import tensorflow as tf
-from keras import Model
-from keras.layers import (Dense, Dropout, Input)
-from keras.optimizers import Adam
-from tqdm import trange
-
-#Import ydata synthetic classes
-from ....synthesizers import TrainParameters
-from ....synthesizers.base import BaseGANModel
-from ....synthesizers.loss import Mode, gradient_penalty
-
-class CRAMERGAN(BaseGANModel):
-
- __MODEL__='CRAMERGAN'
-
- def __init__(self, model_parameters, gradient_penalty_weight=10):
- """Create a base CramerGAN.
-
- Based according to the WGAN paper - https://arxiv.org/pdf/1705.10743.pdf
- CramerGAN, a solution to biased Wassertein Gradients https://arxiv.org/abs/1705.10743"""
- self.gradient_penalty_weight = gradient_penalty_weight
- super().__init__(model_parameters)
-
- def define_gan(self, activation_info: Optional[NamedTuple] = None):
- """Define the trainable model components.
-
- Args:
- activation_info (Optional[NamedTuple], optional): Defaults to None.
-
- Returns:
- (generator_optimizer, critic_optimizer): Generator and critic optimizers
- """
- self.generator = Generator(self.batch_size). \
- build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
- activation_info=activation_info, tau = self.tau)
-
- self.critic = Critic(self.batch_size). \
- build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
-
- g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- c_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
-
- # The generator takes noise as input and generates records
- z = Input(shape=(self.noise_dim,), batch_size=self.batch_size)
- fake = self.generator(z)
- logits = self.critic(fake)
-
- return g_optimizer, c_optimizer
-
- def gradient_penalty(self, real, fake):
- """Compute gradient penalty.
-
- Args:
- real: real event.
- fake: fake event.
- Returns:
- gradient_penalty.
- """
- gp = gradient_penalty(self.f_crit, real, fake, mode=Mode.CRAMER)
- return gp
-
- def update_gradients(self, x, g_optimizer, c_optimizer):
- """Compute and apply the gradients for both the Generator and the Critic.
-
- Args:
- x: real data event
- g_optimizer: generator optimizer
- c_optimizer: critic optimizer
- Returns:
- (critic loss, generator loss)
- """
- # Update the gradients of critic for n_critic times (Training the critic)
-
- ##New generator gradient_tape
- noise= tf.random.normal([x.shape[0], self.noise_dim], dtype=tf.dtypes.float32)
- noise2= tf.random.normal([x.shape[0], self.noise_dim], dtype=tf.dtypes.float32)
-
- with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
- fake=self.generator(noise, training=True)
- fake2=self.generator(noise2, training=True)
-
- g_loss = self.g_lossfn(x, fake, fake2)
-
- c_loss = self.c_lossfn(x, fake, fake2)
-
- # Get the gradients of the generator
- g_gradients = g_tape.gradient(g_loss, self.generator.trainable_variables)
-
- # Update the weights of the generator
- g_optimizer.apply_gradients(
- zip(g_gradients, self.generator.trainable_variables)
- )
-
- c_gradient = d_tape.gradient(c_loss, self.critic.trainable_variables)
- # Update the weights of the critic using the optimizer
- c_optimizer.apply_gradients(
- zip(c_gradient, self.critic.trainable_variables)
- )
-
- return c_loss, g_loss
-
- def g_lossfn(self, real, fake, fake2):
- """Compute generator loss function according to the CramerGAN paper.
-
- Args:
- real: A real sample
- fake: A fake sample
- fak2: A second fake sample
-
- Returns:
- Loss of the generator
- """
- g_loss = tf.norm(self.critic(real, training=True) - self.critic(fake, training=True), axis=1) + \
- tf.norm(self.critic(real, training=True) - self.critic(fake2, training=True), axis=1) - \
- tf.norm(self.critic(fake, training=True) - self.critic(fake2, training=True), axis=1)
- return tf.reduce_mean(g_loss)
-
- def f_crit(self, real, fake):
- """
- Computes the critic distance function f between two samples.
-
- Args:
- real: A real sample
- fake: A fake sample
- Returns:
- Loss of the critic
- """
- return tf.norm(self.critic(real, training=True) - self.critic(fake, training=True), axis=1) - tf.norm(self.critic(real, training=True), axis=1)
-
- def c_lossfn(self, real, fake, fake2):
- """Compute the loss of the critic.
-
- Args:
- real: A real sample
- fake: A fake sample
- fake2: A second fake sample
-
- Returns:
- Loss of the critic
- """
- f_real = self.f_crit(real, fake2)
- f_fake = self.f_crit(fake, fake2)
- loss_surrogate = f_real - f_fake
- gp = self.gradient_penalty(real, [fake, fake2])
- return tf.reduce_mean(- loss_surrogate + self.gradient_penalty_weight*gp)
-
- @staticmethod
- def get_data_batch(train, batch_size, seed=0):
- """Get real data batches from the passed data object.
-
- Args:
- train: real data.
- batch_size: batch size.
- seed (int, optional):Defaults to 0.
-
- Returns:
- data batch.
- """
- # np.random.seed(seed)
- # x = train.loc[ np.random.choice(train.index, batch_size) ].values
- # iterate through shuffled indices, so every sample gets covered evenly
- start_i = (batch_size * seed) % len(train)
- stop_i = start_i + batch_size
- shuffle_seed = (batch_size * seed) // len(train)
- np.random.seed(shuffle_seed)
- train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time
- train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set
- return train[train_ix[start_i: stop_i]]
-
- def train_step(self, train_data, optimizers):
- """Perform a training step.
-
- Args:
- train_data: training data
- optimizers: generator and critic optimizers
-
- Returns:
- (critic_loss, generator_loss): Critic and generator loss.
- """
- critic_loss, g_loss = self.update_gradients(train_data, *optimizers)
- return critic_loss, g_loss
-
- def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]):
- """Fit a synthesizer model to a given input dataset.
-
- Args:
- data: A pandas DataFrame or a Numpy array with the data to be synthesized
- train_arguments: GAN training arguments.
- num_cols: List of columns of the data object to be handled as numerical
- cat_cols: List of columns of the data object to be handled as categorical
- """
- super().fit(data, num_cols, cat_cols)
-
- data = self.processor.transform(data)
- self.data_dim = data.shape[1]
- optimizers = self.define_gan(self.processor.col_transform_info)
-
- iterations = int(abs(data.shape[0] / self.batch_size) + 1)
-
- # Create a summary file
- train_summary_writer = tf.summary.create_file_writer(path.join('..\cramergan_test', 'summaries', 'train'))
-
- with train_summary_writer.as_default():
- for epoch in trange(train_arguments.epochs):
- for iteration in range(iterations):
- batch_data = self.get_data_batch(data, self.batch_size)
- c_loss, g_loss = self.train_step(batch_data, optimizers)
-
- if iteration % train_arguments.sample_interval == 0:
- # Test here data generation step
- # save model checkpoints
- if path.exists('./cache') is False:
- os.mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', iteration))
- self.critic.save_weights(model_checkpoint_base_name.format('critic', iteration))
- print(f"Epoch: {epoch} | critic_loss: {c_loss} | gen_loss: {g_loss}")
-
-
-class Generator(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple generator with dense feedforward layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- Returns:
- Generator model
- """
- input_ = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim, activation='relu')(input_)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim, activation='softmax')(x)
- return Model(inputs=input_, outputs=x)
-
-class Critic(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple critic with dense feedforward and dropout layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers size.
-
- Returns:
- Critic model
- """
- input_ = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim * 4, activation='relu')(input_)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation='relu')(x)
- x = Dense(1)(x)
- return Model(inputs=input_, outputs=x)
+class CRAMERGAN(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+    Please use the ydata-sdk synthesizers instead.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py b/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py
index e8b1dd71..0602bc74 100644
--- a/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py
+++ b/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py
@@ -1 +1,11 @@
-from .model import CTGAN
+"""
+    ydata_synthetic.synthesizers.regular.ctgan init file
+"""
+from warnings import warn
+
+warn(
+    "`import ydata_synthetic.synthesizers.regular.ctgan` is deprecated. Please use `from ydata.sdk.synthesizers import RegularSynthesizer` instead. "
+ "For more information check https://docs.synthetic.ydata.ai/latest and https://docs.fabric.ydata.ai/latest/sdk",
+ DeprecationWarning,
+ stacklevel=2,
+)
diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/model.py b/src/ydata_synthetic/synthesizers/regular/ctgan/model.py
index 3599d7fd..5a06b6c0 100644
--- a/src/ydata_synthetic/synthesizers/regular/ctgan/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/ctgan/model.py
@@ -1,322 +1,10 @@
-from functools import partial
-from joblib import dump
-import numpy as np
-from pandas import DataFrame
-import tensorflow as tf
-from keras.layers import \
- (Input, Dense, LeakyReLU, Dropout, BatchNormalization, ReLU, Concatenate)
-from keras import Model
+"""
+ CTGAN synthesizer
+"""
+from ydata_synthetic.synthesizers.base import BaseModel
-import tensorflow_probability as tfp
-from ydata_synthetic.synthesizers.regular.ctgan.utils \
- import ConditionalLoss, RealDataSampler, ConditionalSampler
-
-from ydata_synthetic.synthesizers.loss import gradient_penalty, Mode as ModeGP
-from ydata_synthetic.synthesizers.base import BaseGANModel, ModelParameters, TrainParameters
-from ydata_synthetic.preprocessing.regular.ctgan_processor import CTGANDataProcessor
-
-class CTGAN(BaseGANModel):
- """
- Conditional Tabular GAN model.
- Based on the paper https://arxiv.org/abs/1907.00503.
-
- Args:
- model_parameters: Parameters used to create the CTGAN model.
+class CTGAN(BaseModel):
"""
- __MODEL__ = 'CTGAN'
-
- def __init__(self, model_parameters: ModelParameters):
- super().__init__(model_parameters)
- if self.batch_size % 2 != 0 or self.batch_size % self.pac != 0:
- raise ValueError("The batch size needs to be an even value divisible by the PAC.")
- self._model_parameters = model_parameters
- self._real_data_sampler = None
- self._conditional_sampler = None
- self._generator_model = None
- self._critic_model = None
-
- @staticmethod
- def _create_generator_model(input_dim, generator_dims, data_dim, metadata, tau):
- """
- Creates the generator model.
-
- Args:
- input_dim: Input dimensionality.
- generator_dims: Dimensions of each hidden layer.
- data_dim: Output dimensionality.
- metadata: Dataset columns metadata.
- tau: Gumbel-Softmax non-negative temperature.
- """
- input = Input(shape=(input_dim, ))
- x = input
- dim = input_dim
- for layer_dim in generator_dims:
- layer_input = x
- x = Dense(layer_dim,
- kernel_initializer="random_uniform",
- bias_initializer="random_uniform")(x)
- x = BatchNormalization(epsilon=1e-5, momentum=0.9)(x)
- x = ReLU()(x)
- x = Concatenate(axis=1)([x, layer_input])
- dim += layer_dim
-
- def _gumbel_softmax(logits, tau=1.0):
- """Applies the Gumbel-Softmax function to the given logits."""
- gumbel_dist = tfp.distributions.Gumbel(loc=0, scale=1)
- gumbels = gumbel_dist.sample(tf.shape(logits))
- gumbels = (logits + gumbels) / tau
- return tf.nn.softmax(gumbels, -1)
-
- def _generator_activation(data):
- """Custom activation function for the generator model."""
- data_transformed = []
- for col_md in metadata:
- if col_md.discrete:
- logits = data[:, col_md.start_idx:col_md.end_idx]
- data_transformed.append(_gumbel_softmax(logits, tau=tau))
- else:
- data_transformed.append(tf.math.tanh(data[:, col_md.start_idx:col_md.start_idx+1]))
- logits = data[:, col_md.start_idx+1:col_md.end_idx]
- data_transformed.append(_gumbel_softmax(logits, tau=tau))
- return data, tf.concat(data_transformed, axis=1)
-
- x = Dense(data_dim, kernel_initializer="random_uniform",
- bias_initializer="random_uniform",
- activation=_generator_activation)(x)
- return Model(inputs=input, outputs=x)
-
- @staticmethod
- def _create_critic_model(input_dim, critic_dims, pac):
- """
- Creates the critic model.
-
- Args:
- input_dim: Input dimensionality.
- critic_dims: Dimensions of each hidden layer.
- pac: PAC size.
- """
- input = Input(shape=(input_dim,))
- x = tf.reshape(input, [-1, input_dim * pac])
- for dim in critic_dims:
- x = Dense(dim,
- kernel_initializer="random_uniform",
- bias_initializer="random_uniform")(x)
- x = LeakyReLU(0.2)(x)
- x = Dropout(0.5)(x)
- x = Dense(1, kernel_initializer="random_uniform",
- bias_initializer="random_uniform")(x)
- return Model(inputs=input, outputs=x)
-
- def fit(self, data: DataFrame, train_arguments: TrainParameters, num_cols: list[str], cat_cols: list[str]):
- """
- Fits the CTGAN model.
-
- Args:
- data: A pandas DataFrame with the data to be synthesized.
- train_arguments: CTGAN training arguments.
- num_cols: List of columns to be handled as numerical
- cat_cols: List of columns to be handled as categorical
- """
- super().fit(data=data, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_arguments)
-
- self._generator_optimizer = tf.keras.optimizers.Adam(
- learning_rate=self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- self._critic_optimizer = tf.keras.optimizers.Adam(
- learning_rate=self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
-
- train_data = self.processor.transform(data)
- metadata = self.processor.metadata
- data_dim = self.processor.output_dimensions
-
- self._real_data_sampler = RealDataSampler(train_data, metadata)
- self._conditional_sampler = ConditionalSampler(train_data, metadata, train_arguments.log_frequency)
-
- gen_input_dim = self.latent_dim + self._conditional_sampler.output_dimensions
- self._generator_model = self._create_generator_model(
- gen_input_dim, self.generator_dims, data_dim, metadata, self.tau)
-
- crt_input_dim = data_dim + self._conditional_sampler.output_dimensions
- self._critic_model = self._create_critic_model(crt_input_dim, self.critic_dims, self.pac)
-
- self._generator_model.build((self.batch_size, gen_input_dim))
- self._critic_model.build((self.batch_size, crt_input_dim))
-
- steps_per_epoch = max(len(train_data) // self.batch_size, 1)
- for epoch in range(train_arguments.epochs):
- for _ in range(steps_per_epoch):
- fake_z = tf.random.normal([self.batch_size, self.latent_dim])
- cond_vector = self._conditional_sampler.sample(self.batch_size)
- if cond_vector is None:
- real = self._real_data_sampler.sample(self.batch_size)
- else:
- cond, _, col_idx, opt_idx = cond_vector
- cond = tf.convert_to_tensor(cond)
- fake_z = tf.concat([fake_z, cond], 1)
- perm = np.arange(self.batch_size)
- np.random.shuffle(perm)
- real = self._real_data_sampler.sample_col(col_idx[perm], opt_idx[perm])
- cond_perm = tf.gather(cond, perm)
-
- fake, fake_act = self._generator_model(fake_z, training=True)
- real = tf.convert_to_tensor(real.astype('float32'))
- real_cat = real if cond_vector is None else tf.concat([real, cond_perm], 1)
- fake_cat = fake if cond_vector is None else tf.concat([fake_act, cond], 1)
- critic_loss = self._train_critic_step(real_cat, fake_cat)
-
- fake_z = tf.random.normal([self.batch_size, self.latent_dim])
- cond_vector = self._conditional_sampler.sample(self.batch_size)
- if cond_vector is None:
- generator_loss = self._train_generator_step(fake_z)
- else:
- cond, mask, _, _ = cond_vector
- cond = tf.convert_to_tensor(cond)
- mask = tf.convert_to_tensor(mask)
- fake_z = tf.concat([fake_z, cond], axis=1)
- generator_loss = self._train_generator_step(fake_z, cond, mask, metadata)
-
- print(f"Epoch: {epoch} | critic_loss: {critic_loss} | generator_loss: {generator_loss}")
-
- def _train_critic_step(self, real, fake):
- """
- Single training iteration of the critic model.
-
- Args:
- real: Real data.
- fake: Fake data.
- """
- with tf.GradientTape() as tape:
- y_real = self._critic_model(real, training=True)
- y_fake = self._critic_model(fake, training=True)
- gp = gradient_penalty(
- partial(self._critic_model, training=True), real, fake, ModeGP.CTGAN, self.pac)
- rec_loss = -(tf.reduce_mean(y_real) - tf.reduce_mean(y_fake))
- critic_loss = rec_loss + gp * self.gp_lambda
- gradient = tape.gradient(critic_loss, self._critic_model.trainable_variables)
- self._apply_critic_gradients(gradient, self._critic_model.trainable_variables)
- return critic_loss
-
- @tf.function
- def _apply_critic_gradients(self, gradient, trainable_variables):
- """
- Updates gradients of the critic model.
- This logic is isolated in order to be optimized as a TF function.
-
- Args:
- gradient: Gradient.
- trainable_variables: Variables to be updated.
- """
- self._critic_optimizer.apply_gradients(zip(gradient, trainable_variables))
-
- def _train_generator_step(self, fake_z, cond_vector=None, mask=None, metadata=None):
- """
- Single training iteration of the generator model.
-
- Args:
- real: Real data.
- fake: Fake data.
- cond_vector: Conditional vector.
- mask: Mask vector.
- metadata: Dataset columns metadata.
- """
- with tf.GradientTape() as tape:
- fake, fake_act = self._generator_model(fake_z, training=True)
- if cond_vector is not None:
- y_fake = self._critic_model(
- tf.concat([fake_act, cond_vector], 1), training=True)
- cond_loss = ConditionalLoss.compute(fake, cond_vector, mask, metadata)
- generator_loss = -tf.reduce_mean(y_fake) + cond_loss
- else:
- y_fake = self._critic_model(fake_act, training=True)
- generator_loss = -tf.reduce_mean(y_fake)
- gradient = tape.gradient(generator_loss, self._generator_model.trainable_variables)
- gradient = [gradient[i] + self.l2_scale * self._generator_model.trainable_variables[i] for i in range(len(gradient))]
- self._apply_generator_gradients(gradient, self._generator_model.trainable_variables)
- return generator_loss
-
- @tf.function
- def _apply_generator_gradients(self, gradient, trainable_variables):
- """
- Updates gradients of the generator model.
- This logic is isolated in order to be optimized as a TF function.
-
- Args:
- gradient: Gradient.
- trainable_variables: Variables to be updated.
- """
- self._generator_optimizer.apply_gradients(zip(gradient, trainable_variables))
-
- def sample(self, n_samples: int):
- """
- Samples new data from the CTGAN.
-
- Args:
- n_samples: Number of samples to be generated.
- """
- if n_samples <= 0:
- raise ValueError("Invalid number of samples.")
-
- steps = n_samples // self.batch_size + 1
- data = []
- for _ in tf.range(steps):
- fake_z = tf.random.normal([self.batch_size, self.latent_dim])
- cond_vec = self._conditional_sampler.sample(self.batch_size, from_active_bits=True)
- if cond_vec is not None:
- cond = tf.constant(cond_vec)
- fake_z = tf.concat([fake_z, cond], 1)
-
- fake = self._generator_model(fake_z)[1]
- data.append(fake.numpy())
-
- data = np.concatenate(data, 0)
- data = data[:n_samples]
- return self.processor.inverse_transform(data)
-
- def save(self, path):
- """
- Save the CTGAN model in a pickle file.
- Only the required components to sample new data are saved.
-
- Args:
- path: Path of the pickle file.
- """
- dump({
- "model_parameters": self._model_parameters,
- "data_dim": self.processor.output_dimensions,
- "gen_input_dim": self.latent_dim + self._conditional_sampler.output_dimensions,
- "generator_dims": self.generator_dims,
- "tau": self.tau,
- "metadata": self.processor.metadata,
- "batch_size": self.batch_size,
- "latent_dim": self.latent_dim,
- "conditional_sampler": self._conditional_sampler.__dict__,
- "generator_model_weights": self._generator_model.get_weights(),
- "processor": self.processor.__dict__
- }, path)
-
- @staticmethod
- def load(class_dict):
- """
- Load the CTGAN model from a pickle file.
- Only the required components to sample new data are loaded.
-
- Args:
- class_dict: Class dict loaded from the pickle file.
- """
- new_instance = CTGAN(class_dict["model_parameters"])
- setattr(new_instance, "generator_dims", class_dict["generator_dims"])
- setattr(new_instance, "tau", class_dict["tau"])
- setattr(new_instance, "batch_size", class_dict["batch_size"])
- setattr(new_instance, "latent_dim", class_dict["latent_dim"])
-
- new_instance._conditional_sampler = ConditionalSampler()
- new_instance._conditional_sampler.__dict__ = class_dict["conditional_sampler"]
- new_instance.processor = CTGANDataProcessor()
- new_instance.processor.__dict__ = class_dict["processor"]
-
- new_instance._generator_model = new_instance._create_generator_model(
- class_dict["gen_input_dim"], class_dict["generator_dims"],
- class_dict["data_dim"], class_dict["metadata"], class_dict["tau"])
-
- new_instance._generator_model.build((class_dict["batch_size"], class_dict["gen_input_dim"]))
- new_instance._generator_model.set_weights(class_dict['generator_model_weights'])
- return new_instance
\ No newline at end of file
+ This class is deprecated and should no longer be used.
+    Please use the ydata-sdk synthesizers instead.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py b/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py
deleted file mode 100644
index f204bf3f..00000000
--- a/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import tensorflow as tf
-import numpy as np
-
-
-class RealDataSampler:
- """
- Class used to sample from real data.
-
- Args:
- data: Input data.
- metadata: Dataset columns metadata.
- """
- def __init__(self, data, metadata):
- super(RealDataSampler, self).__init__()
- self._data = data
- self._active_bits = []
- self._n_rows = len(data)
-
- for col_md in metadata:
- if col_md.discrete:
- col_active_bits = []
- for c in range(col_md.output_dim):
- col_active_bits.append(np.nonzero(data[:, col_md.start_idx + c])[0])
- self._active_bits.append(col_active_bits)
-
- def sample(self, num_samples):
- """
- Samples from the entire dataset.
-
- Args:
- num_samples: Number of samples to be returned.
- """
- return self._data[np.random.choice(np.arange(self._n_rows), num_samples)]
-
- def sample_col(self, col_idx, opt_idx):
- """
- Samples a specific discrete column.
-
- Args:
- col_idx: Index of the column to be sampled.
- opt_idx: Index of the category.
- """
- idx = []
- for col, opt in zip(col_idx, opt_idx):
- idx.append(np.random.choice(self._active_bits[col][opt]))
- return self._data[idx]
-
-
-class ConditionalSampler:
- """
- Class used to sample conditional vectors.
-
- Args:
- data: Input data.
- metadata: Dataset columns metadata.
- log_frequency: Whether to apply log frequency or not.
- """
- def __init__(self, data=None, metadata=None, log_frequency=None):
- if data is None:
- return
- self._active_bits = []
- max_interval = 0
- counter = 0
-
- for col_md in metadata:
- if col_md.discrete:
- max_interval = max(max_interval, col_md.end_idx - col_md.start_idx)
- self._active_bits.append(np.argmax(data[:, col_md.start_idx:col_md.end_idx], axis=-1))
- counter += 1
-
- self._interval = []
- self._n_col = 0
- self._n_opt = 0
- self._probabilities = np.zeros((counter, max_interval))
-
- for col_md in metadata:
- if col_md.discrete:
- col_active_bits_sum = np.sum(data[:, col_md.start_idx:col_md.end_idx], axis=0)
- if log_frequency:
- col_active_bits_sum = np.log(col_active_bits_sum + 1)
- col_active_bits_sum = col_active_bits_sum / np.sum(col_active_bits_sum)
- self._probabilities[self._n_col, :col_md.output_dim] = col_active_bits_sum
- self._interval.append((self._n_opt, col_md.output_dim))
- self._n_opt += col_md.output_dim
- self._n_col += 1
-
- self._interval = np.asarray(self._interval)
-
- @property
- def output_dimensions(self):
- """
- Returns the dimensionality of the conditional vectors.
- """
- return self._n_opt
-
- def sample(self, batch_size, from_active_bits=False):
- """
- Samples conditional vectors.
-
- Args:
- batch_size: Batch size.
- from_active_bits: Whether to directly sample from active bits or not.
- """
- if self._n_col == 0:
- return None
-
- col_idx = np.random.choice(np.arange(self._n_col), batch_size)
- cond_vector = np.zeros((batch_size, self._n_opt), dtype='float32')
-
- if from_active_bits:
- for i in range(batch_size):
- pick = int(np.random.choice(self._active_bits[col_idx[i]]))
- cond_vector[i, pick + self._interval[col_idx[i], 0]] = 1
- return cond_vector
-
- mask = np.zeros((batch_size, self._n_col), dtype='float32')
- mask[np.arange(batch_size), col_idx] = 1
- prob = self._probabilities[col_idx]
- rand = np.expand_dims(np.random.rand(prob.shape[0]), axis=1)
- opt_idx = (prob.cumsum(axis=1) > rand).argmax(axis=1)
- opt = self._interval[col_idx, 0] + opt_idx
- cond_vector[np.arange(batch_size), opt] = 1
- return cond_vector, mask, col_idx, opt_idx
-
-class ConditionalLoss:
- """
- Conditional loss utils.
- """
- @staticmethod
- def compute(data, cond_vector, mask, metadata):
- """
- Computes the conditional loss.
-
- Args:
- data: Input data.
- cond_vector: Conditional vector.
- mask: Mask vector.
- metadata: Dataset columns metadata.
- """
- shape = tf.shape(mask)
- cond_loss = tf.zeros(shape)
- start_cat = 0
- counter = 0
- for col_md in metadata:
- if col_md.discrete:
- end_cat = start_cat + col_md.output_dim
- data_log_softmax = data[:, col_md.start_idx:col_md.end_idx]
- cond_vector_am = tf.math.argmax(cond_vector[:, start_cat:end_cat], 1)
- loss = tf.reshape(tf.nn.sparse_softmax_cross_entropy_with_logits(
- cond_vector_am, data_log_softmax), [-1, 1])
- cond_loss = tf.concat(
- [cond_loss[:, :counter], loss, cond_loss[:, counter+1:]], 1)
- start_cat = end_cat
- counter += 1
-
- return tf.reduce_sum(cond_loss * mask) / tf.cast(shape[0], dtype=tf.float32)
diff --git a/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py b/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py
index 101b8901..4e7cdd47 100644
--- a/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py
@@ -1,268 +1,11 @@
-"""CWGANGP implementation."""
-import os
-from os import path
-from typing import List, Optional, NamedTuple
-
-from tqdm import trange
-
-import numpy as np
-from numpy import hstack
-from pandas import DataFrame
-from tensorflow import dtypes, GradientTape, reduce_sum, reduce_mean, sqrt, random
-from keras import Model
-from keras.layers import (Dense, Dropout, Input, concatenate, LeakyReLU)
-from keras.optimizers import Adam
-
-#Import ydata synthetic classes
-from ....synthesizers import TrainParameters
-from ....synthesizers.base import ConditionalModel
-from ....synthesizers.regular.wgangp.model import WGAN_GP
-
-class CWGANGP(ConditionalModel, WGAN_GP):
-
- __MODEL__='CWGAN_GP'
-
- def __init__(self, model_parameters,
- n_generator: Optional[int]=1,
- n_critic: Optional[int]=1,
- gradient_penalty_weight:int=10):
- """
- Adapts the WGAN_GP synthesizer implementation to be conditional.
-
- Several conditional WGAN implementations can be found online, here are a few:
- https://cameronfabbri.github.io/papers/conditionalWGAN.pdf
- https://www.sciencedirect.com/science/article/abs/pii/S0020025519309715
- https://arxiv.org/pdf/2008.09202.pdf
- """
- WGAN_GP.__init__(self, model_parameters,
- n_generator=n_generator,
- n_critic=n_critic,
- gradient_penalty_weight=gradient_penalty_weight)
-
- def define_gan(self, activation_info: Optional[NamedTuple] = None):
- """Define the trainable model components.
-
- Args:
- activation_info (Optional[NamedTuple]): Defaults to None
- """
- self.generator = Generator(self.batch_size). \
- build_model(input_shape=(self.noise_dim,),
- label_shape=(self.label_dim, ),
- dim=self.layers_dim,
- data_dim=self.data_dim,
- activation_info = activation_info,
- tau = self.tau)
-
- self.critic = Critic(self.batch_size). \
- build_model(input_shape=(self.data_dim,),
- label_shape=(self.label_dim,),
- dim=self.layers_dim)
-
- g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- c_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- return g_optimizer, c_optimizer
-
- def gradient_penalty(self, real, fake, label):
- """Compute gradient penalty.
-
- Args:
- real: real event.
- fake: fake event.
- label: ground truth.
- Returns:
- gradient_penalty
- """
- epsilon = random.uniform([real.shape[0], 1], 0.0, 1.0, dtype=dtypes.float32)
- x_hat = epsilon * real + (1 - epsilon) * fake
- with GradientTape() as t:
- t.watch(x_hat)
- d_hat = self.critic([x_hat, label])
- gradients = t.gradient(d_hat, x_hat)
- ddx = sqrt(reduce_sum(gradients ** 2))
- d_regularizer = reduce_mean((ddx - 1.0) ** 2)
- return d_regularizer
-
- @staticmethod
- def get_data_batch(data, batch_size, seed=0):
- """Produce real data batches from the passed data object.
-
- Args:
- train: real data.
- batch_size: batch size.
- seed (int, optional):Defaults to 0.
-
- Returns:
- data batch.
- """
- start_i = (batch_size * seed) % len(data)
- stop_i = start_i + batch_size
- shuffle_seed = (batch_size * seed) // len(data)
- np.random.seed(shuffle_seed)
- data_ix = np.random.choice(data.shape[0], replace=False, size=len(data)) # wasteful to shuffle every time
- return dtypes.cast(data[data_ix[start_i: stop_i]], dtype=dtypes.float32)
-
- def c_lossfn(self, real):
- """Compute the critic loss.
-
- Args:
- real: A real sample
-
- Returns:
- Critic loss
- """
- real, label = real
- # generating noise from a uniform distribution
- noise = random.uniform([real.shape[0], self.noise_dim], minval=0.999, maxval=1.0 , dtype=dtypes.float32)
- # run noise through generator
- fake = self.generator([noise, label])
- # discriminate x and x_gen
- logits_real = self.critic([real, label])
- logits_fake = self.critic([fake, label])
-
- # gradient penalty
- gp = self.gradient_penalty(real, fake, label)
- # getting the loss of the critic.
- c_loss = (reduce_mean(logits_fake)
- - reduce_mean(logits_real)
- + gp * self.gradient_penalty_weight)
- return c_loss
-
- def g_lossfn(self, real):
- """
- Forward pass on the generator and computes the loss.
-
- Args:
- real: Data batch we are analyzing
- Returns:
- Generator loss
- """
- real, label = real
-
- # generating noise from a uniform distribution
- noise = random.uniform([real.shape[0], self.noise_dim], minval=0.0, maxval=0.001 ,dtype=dtypes.float32)
-
- fake = self.generator([noise, label])
- logits_fake = self.critic([fake, label])
- g_loss = -reduce_mean(logits_fake)
- return g_loss
-
- def fit(self, data: DataFrame,
- label_cols: List[str],
- train_arguments: TrainParameters,
- num_cols: List[str],
- cat_cols: List[str]):
- """
- Train the synthesizer on a provided dataset based on a specified condition column.
-
- Args:
- data: A pandas DataFrame with the data to be synthesized
- label: The name of the column to be used as a label and condition for the training
- train_arguments: GAN training arguments.
- num_cols: List of columns of the data object to be handled as numerical
- cat_cols: List of columns of the data object to be handled as categorical
- """
- data, label = self._prep_fit(data, label_cols, num_cols, cat_cols)
-
- processed_data = self.processor.transform(data)
- self.data_dim = processed_data.shape[1]
- self.label_dim = len(label_cols)
-
- #Init the GAN model and optimizers
- optimizers = self.define_gan(self.processor.col_transform_info)
-
- # Merging labels with processed data
- processed_data = hstack([processed_data, label])
-
- iterations = int(abs(processed_data.shape[0] / self.batch_size) + 1)
- print(f'Number of iterations per epoch: {iterations}')
-
- for epoch in trange(train_arguments.epochs):
- for _ in range(iterations):
- # ---------------------
- # Train Discriminator
- # ---------------------
- batch_x = self.get_data_batch(processed_data, self.batch_size) # Batches are retrieved with labels
- batch_x, label = batch_x[:, :-self.label_dim], batch_x[:, -self.label_dim:] # Separate labels from batch
-
- cri_loss, ge_loss = self.train_step((batch_x, label), optimizers)
-
- print(
- "Epoch: {} | critic_loss: {} | gen_loss: {}".format(
- epoch, cri_loss, ge_loss
- ))
-
- # If at save interval => save model state and generated image samples
- if epoch % train_arguments.sample_interval == 0:
- self._run_checkpoint(train_arguments, epoch)
-
- def _run_checkpoint(self, train_arguments, epoch):
- "Run checkpoint and store model state and generated samples."
- if path.exists('./cache') is False:
- os.mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch))
- self.critic.save_weights(model_checkpoint_base_name.format('critic', epoch))
-
-
-act_leakyr = LeakyReLU(alpha=0.2)
-# pylint: disable=R0903,D203
-class Generator():
- "Standard discrete conditional generator."
- def __init__(self, batch_size):
- """Sets the properties of the generator.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- label_shape: label dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- """
- noise = Input(shape=input_shape, batch_size=self.batch_size)
- label_v = Input(shape=label_shape)
- x = concatenate([noise, label_v])
- x = Dense(dim, activation=act_leakyr)(x)
- x = Dense(dim * 2, activation=act_leakyr)(x)
- x = Dense(dim * 4, activation=act_leakyr)(x)
- x = Dense(data_dim)(x)
- #if activation_info:
- # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
- return Model(inputs=[noise, label_v], outputs=x)
-
-# pylint: disable=R0903,D203
-class Critic():
- "Conditional Critic."
- def __init__(self, batch_size):
- "Sets the properties of the critic."
- self.batch_size = batch_size
-
- def build_model(self, input_shape, label_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- label_shape: label dimensionality.
- dim: hidden layers size.
-
- Returns:
- Critic model
- """
- events = Input(shape=input_shape, batch_size=self.batch_size)
- label = Input(shape=label_shape, batch_size=self.batch_size)
- input_ = concatenate([events, label])
- x = Dense(dim * 4, activation=act_leakyr)(input_)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation=act_leakyr)(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation=act_leakyr)(x)
- x = Dense(1)(x)
- return Model(inputs=[events, label], outputs=x)
+"""
+ CWGANGP class file
+"""
+from typing import Optional
+from ydata_synthetic.synthesizers.base import BaseModel
+
+class CWGANGP(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
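Note for downstream users: each model file in this patch keeps its public class name but reduces it to a stub that only subclasses BaseModel, so existing imports still resolve while the old training API is gone. A hypothetical guard (the helper below is illustrative and not part of this patch) that fails fast when such a stub is picked up:

# Hypothetical helper, not part of this patch: detect the deprecation stubs by their docstring.
from ydata_synthetic.synthesizers.regular.cwgangp.model import CWGANGP

def assert_not_deprecated(cls):
    """Raise if a synthesizer class has been reduced to a deprecation stub."""
    if "deprecated" in (cls.__doc__ or "").lower():
        raise RuntimeError(
            f"{cls.__name__} is now a deprecated stub in ydata-synthetic; "
            "use the replacement synthesizers instead."
        )

assert_not_deprecated(CWGANGP)  # raises once this patch is applied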
diff --git a/src/ydata_synthetic/synthesizers/regular/dragan/model.py b/src/ydata_synthetic/synthesizers/regular/dragan/model.py
index cf2cb128..c1b5033a 100644
--- a/src/ydata_synthetic/synthesizers/regular/dragan/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/dragan/model.py
@@ -1,271 +1,11 @@
"""
- DRAGAN model architecture implementation
+ DRAGAN class file
"""
-import os
-from os import path
-
-from typing import Optional, NamedTuple
-import tensorflow as tf
-import tqdm
-from keras import Model, initializers
-from keras.layers import Dense, Dropout, Input
-from keras.optimizers import Adam
-
-#Import ydata synthetic classes
-from ....synthesizers.base import BaseGANModel
-from ....synthesizers.loss import Mode, gradient_penalty
-
-class DRAGAN(BaseGANModel):
-
- __MODEL__='DRAGAN'
-
- def __init__(self, model_parameters, n_discriminator, gradient_penalty_weight=10):
- """DRAGAN model architecture implementation.
-
- Args:
- model_parameters:
- n_discriminator:
- gradient_penalty_weight (int, optional): Defaults to 10.
- """
- # As recommended in DRAGAN paper - https://arxiv.org/abs/1705.07215
- self.n_discriminator = n_discriminator
- self.gradient_penalty_weight = gradient_penalty_weight
- super().__init__(model_parameters)
-
- def define_gan(self, col_transform_info: Optional[NamedTuple] = None):
- """Define the trainable model components.
-
- Args:
- col_transform_info (Optional[NamedTuple], optional): Defaults to None.
-
- Returns:
- (generator_optimizer, discriminator_optimizer): Generator and discriminator optimizers
- """
- # define generator/discriminator
- self.generator = Generator(self.batch_size). \
- build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
- activation_info=col_transform_info, tau = self.tau)
-
- self.discriminator = Discriminator(self.batch_size). \
- build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
-
- g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2, clipvalue=0.001)
- d_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2, clipvalue=0.001)
- return g_optimizer, d_optimizer
-
- def gradient_penalty(self, real, fake):
- """Compute gradient penalty.
-
- Args:
- real: real event.
- fake: fake event.
- Returns:
- gradient_penalty.
- """
- gp = gradient_penalty(self.discriminator, real, fake, mode= Mode.DRAGAN)
- return gp
-
- def update_gradients(self, x, g_optimizer, d_optimizer):
- """Compute the gradients for Generator and Discriminator.
-
- Args:
- x (tf.tensor): real data event
- g_optimizer (tf.OptimizerV2): Optimizer for the generator model
- c_optimizer (tf.OptimizerV2): Optimizer for the discriminator model
- Returns:
- (discriminator loss, generator loss)
- """
- # Update the gradients of critic for n_critic times (Training the critic)
- for _ in range(self.n_discriminator):
- with tf.GradientTape() as d_tape:
- d_loss = self.d_lossfn(x)
- # Get the gradients of the critic
- d_gradient = d_tape.gradient(d_loss, self.discriminator.trainable_variables)
- # Update the weights of the critic using the optimizer
- d_optimizer.apply_gradients(
- zip(d_gradient, self.discriminator.trainable_variables)
- )
-
- # Update the generator
- with tf.GradientTape() as g_tape:
- gen_loss = self.g_lossfn(x)
-
- # Get the gradients of the generator
- gen_gradients = g_tape.gradient(gen_loss, self.generator.trainable_variables)
-
- # Update the weights of the generator
- g_optimizer.apply_gradients(
- zip(gen_gradients, self.generator.trainable_variables)
- )
-
- return d_loss, gen_loss
-
- def d_lossfn(self, real):
- """Calculates the critic losses.
-
- Args:
- real: real data examples.
-
- Returns:
- discriminator loss
- """
- noise = tf.random.normal((self.batch_size, self.noise_dim), dtype=tf.dtypes.float64)
- # run noise through generator
- fake = self.generator(noise)
- # discriminate x and x_gen
- logits_real = self.discriminator(real, training=True)
- logits_fake = self.discriminator(fake, training=True)
-
- # gradient penalty
- gp = self.gradient_penalty(real, fake)
-
- # getting the loss of the discriminator.
- d_loss = (tf.reduce_mean(logits_fake)
- - tf.reduce_mean(logits_real)
- + gp * self.gradient_penalty_weight)
- return d_loss
-
- def g_lossfn(self, real):
- """Calculates the Generator losses.
-
- Args:
- real: real data.
- Returns:
- generator loss
- """
- # generating noise from a uniform distribution
- noise = tf.random.normal((real.shape[0], self.noise_dim), dtype=tf.float64)
-
- fake = self.generator(noise, training=True)
- logits_fake = self.discriminator(fake, training=True)
- g_loss = -tf.reduce_mean(logits_fake)
- return g_loss
-
- def get_data_batch(self, train, batch_size):
- """Get real data batches from the passed data object.
-
- Args:
- train: real data.
- batch_size: batch size.
- seed (int, optional):Defaults to 0.
-
- Returns:
- data batch.
- """
- buffer_size = len(train)
- #tensor_data = pd.concat([x_train, y_train], axis=1)
- train_loader = tf.data.Dataset.from_tensor_slices(train) \
- .batch(batch_size).shuffle(buffer_size)
- return train_loader
-
- def train_step(self, train_data, optimizers):
- """Perform a training step.
-
- Args:
- train_data: training data
- optimizers: generator and critic optimizers
-
- Returns:
- (critic_loss, generator_loss): Critic and generator loss.
- """
- d_loss, g_loss = self.update_gradients(train_data, *optimizers)
- return d_loss, g_loss
-
- def fit(self, data, train_arguments, num_cols, cat_cols):
- """Fit a synthesizer model to a given input dataset.
-
- Args:
- data: A pandas DataFrame or a Numpy array with the data to be synthesized
- train_arguments: GAN training arguments.
- num_cols: List of columns of the data object to be handled as numerical
- cat_cols: List of columns of the data object to be handled as categorical
- """
- super().fit(data, num_cols, cat_cols)
-
- processed_data = self.processor.transform(data)
- self.data_dim = processed_data.shape[1]
- optimizers = self.define_gan(self.processor.col_transform_info)
-
- train_loader = self.get_data_batch(processed_data, self.batch_size)
-
- # Create a summary file
- train_summary_writer = tf.summary.create_file_writer(path.join('..\dragan_test', 'summaries', 'train'))
-
- with train_summary_writer.as_default():
- for epoch in tqdm.trange(train_arguments.epochs):
- for batch_data in train_loader:
- batch_data = tf.cast(batch_data, dtype=tf.float32)
- d_loss, g_loss = self.train_step(batch_data, optimizers)
-
- print(
- "Epoch: {} | disc_loss: {} | gen_loss: {}".format(
- epoch, d_loss, g_loss
- ))
-
- if epoch % train_arguments.sample_interval == 0:
- # Test here data generation step
- # save model checkpoints
- if path.exists('./cache') is False:
- os.mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch))
- self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch))
-
-
-class Discriminator(Model):
- def __init__(self, batch_size):
- """Simple discriminator with dense feedforward layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers size.
-
- Returns:
- Discriminator model
- """
- input = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim * 4, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation='relu')(x)
- x = Dense(1, activation='sigmoid')(x)
- return Model(inputs=input, outputs=x)
-
-class Generator(Model):
- def __init__(self, batch_size):
- """Simple generator with dense feedforward layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim, data_dim, activation_info: NamedTuple = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- Returns:
- Generator model
- """
- input = Input(shape=input_shape, batch_size = self.batch_size)
- x = Dense(dim, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim)(x)
- #if activation_info:
- # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
- return Model(inputs=input, outputs=x)
+from typing import Optional
+from ydata_synthetic.synthesizers.base import BaseModel
+
+class DRAGAN(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/gmm/model.py b/src/ydata_synthetic/synthesizers/regular/gmm/model.py
index 466d1978..3dd49e96 100644
--- a/src/ydata_synthetic/synthesizers/regular/gmm/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/gmm/model.py
@@ -1,111 +1,10 @@
"""
GMM based synthetic data generation model
"""
-from typing import List, Optional, Union
-
-from joblib import dump, load
-from tqdm import tqdm
-
-from pandas import DataFrame
-from numpy import (array, arange)
-
-from sklearn.mixture import GaussianMixture
-from sklearn.metrics import silhouette_score
-
from ydata_synthetic.synthesizers.base import BaseModel
-from ydata_synthetic.preprocessing import RegularDataProcessor
class GMM(BaseModel):
-
- def __init__(self,
- covariance_type:str="full",
- random_state:int=0):
- self.covariance_type = covariance_type
- self.random_state = random_state
- self.__MODEL__ = GaussianMixture(covariance_type=covariance_type,
- random_state=random_state)
- self.processor = RegularDataProcessor
-
- def __optimize(self, prep_data: array):
- """
- Auxiliary method to optimize the number of components to be considered for the Gaussian or Bayesian Mixture
- Returns:
- n_components (int): Optimal number of components calculated based on Silhouette score
- """
- c = arange(2, 40, 5)
- n_components=2
- max_silhouette=0
- for n in tqdm(c, desc="Hyperparameter search"):
- model = GaussianMixture(n, covariance_type=self.covariance_type, random_state=self.random_state)
- labels = model.fit_predict(prep_data)
- s = silhouette_score(prep_data, labels, metric='euclidean')
- if model.converged_:
- if max_silhouette < s:
- n_components = n
- max_silhouette=s
- return n_components
-
- def fit(self, data: Union[DataFrame, array],
- num_cols: Optional[List[str]] = None,
- cat_cols: Optional[List[str]] = None,):
- """
- ### Description:
- Trains and fit a synthesizer model to a given input dataset.
-
- ### Args:
- `data` (Union[DataFrame, array]): Training data
- `num_cols` (Optional[List[str]]) : List with the names of the categorical columns
- `cat_cols` (Optional[List[str]]): List of names of categorical columns
-
- ### Returns:
- **self:** *object*
- Fitted synthesizer
- """
- self.processor = RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data)
- train_data = self.processor.transform(data)
-
- #optimize the n_components selection
- n_components = self.__optimize(train_data)
-
- self.__MODEL__.n_components=n_components
- #Fit the gaussian model
- self.__MODEL__.fit(train_data)
-
- def sample(self, n_samples: int):
- """
- ### Description:
- Generates samples from the trained synthesizer.
-
- ### Args:
- `n_samples` (int): Number of rows to generated.
-
- ### Returns:
- **synth_sample:** pandas.DataFrame, shape (n_samples, n_features)
- Returns the generated synthetic samples.
- """
- sample = self.__MODEL__.sample(n_samples=n_samples)[0]
-
- return self.processor.inverse_transform(sample)
-
- def save(self, path='str'):
- """
- Save a model as a pickle
- Args:
- path (str): The path where the model should be saved as pickle
- """
- try:
- with open(path, 'wb') as f:
- dump(self, f)
- except:
- raise Exception(f'The path {path} provided is not valid. Please validate your inputs')
-
- @classmethod
- def load(cls, path:str):
- """
- Load a trained synthesizer from a given path
- Returns:
- model (GMM): A trained GMM model
- """
- with open(path, 'rb') as f:
- model = load(f)
- return model
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/model.py b/src/ydata_synthetic/synthesizers/regular/model.py
deleted file mode 100644
index b03e81fc..00000000
--- a/src/ydata_synthetic/synthesizers/regular/model.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""
- Main synthesizer class
-"""
-from enum import Enum, unique
-import logging
-
-from joblib import load
-
-from tensorflow import config as tfconfig
-
-from ydata_synthetic.synthesizers.regular.vanillagan.model import VanilllaGAN
-from ydata_synthetic.synthesizers.regular.cgan.model import CGAN
-from ydata_synthetic.synthesizers.regular.wgan.model import WGAN
-from ydata_synthetic.synthesizers.regular.wgangp.model import WGAN_GP
-from ydata_synthetic.synthesizers.regular.cwgangp.model import CWGANGP
-from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN
-from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN
-from ydata_synthetic.synthesizers.regular.ctgan.model import CTGAN
-from ydata_synthetic.synthesizers.regular.gmm.model import GMM
-
-from ydata_synthetic.utils.logger import SynthesizersLogger
-
-logger = SynthesizersLogger(name='regularsynthesizer.logger')
-logger.setLevel(logging.INFO)
-
-@unique
-class Model(Enum):
- VANILLA = 'gan'
- CONDITIONAL = 'cgan'
- WASSERTEIN = 'wgan'
- WASSERTEINGP ='wgangp'
- CWASSERTEINGP = 'cwgangp'
- CRAMER = 'cramer'
- DEEPREGRET = 'dragan'
- CONDITIONALTABULAR = 'ctgan'
- FAST = 'fast'
-
- __MAPPING__ = {
- VANILLA : VanilllaGAN,
- CONDITIONAL: CGAN,
- WASSERTEIN: WGAN,
- WASSERTEINGP: WGAN_GP,
- CWASSERTEINGP: CWGANGP,
- CRAMER: CRAMERGAN,
- DEEPREGRET: DRAGAN,
- CONDITIONALTABULAR: CTGAN,
- FAST: GMM
- }
-
- @property
- def function(self):
- return self.__MAPPING__[self.value]
-
-class RegularSynthesizer():
- "Abstraction class "
- def __new__(cls, modelname: str, model_parameters =None, **kwargs):
- model = None
- if Model(modelname) == Model.FAST:
- model=Model(modelname).function(**kwargs)
- else:
- model=Model(modelname).function(model_parameters, **kwargs)
-
- logger.info_def_report(model=modelname)
- return model
-
- @staticmethod
- def load(path):
- """
- ### Description:
- Loads a saved synthesizer from a pickle.
-
- ### Args:
- `path` (str): Path to read the synthesizer pickle from.
- """
- gpu_devices = tfconfig.list_physical_devices('GPU')
- if len(gpu_devices) > 0:
- try:
- tfconfig.experimental.set_memory_growth(gpu_devices[0], True)
- except (ValueError, RuntimeError):
- # Invalid device or cannot modify virtual devices once initialized.
- pass
- synth = load(path)
- if isinstance(synth, dict):
- return CTGAN.load(synth)
- return synth
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py b/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py
index a6e60580..da107cdc 100644
--- a/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py
@@ -1,206 +1,11 @@
"""
- Vanilla GAN architecture model implementation
+ Vanilla GAN class file
"""
-import os
-from os import path
-from typing import List, Optional, NamedTuple
-import numpy as np
-from tqdm import trange
+from ydata_synthetic.synthesizers.base import BaseModel
-import tensorflow as tf
-from keras.layers import Input, Dense, Dropout
-from keras import Model
-from keras.optimizers import Adam
-
-#Import ydata synthetic classes
-from ....synthesizers.base import BaseGANModel
-from ....synthesizers import TrainParameters
-
-class VanilllaGAN(BaseGANModel):
-
- __MODEL__='GAN'
-
- def __init__(self, model_parameters):
- super().__init__(model_parameters)
-
- def define_gan(self, activation_info: Optional[NamedTuple]):
- """Define the trainable model components.
-
- Args:
- activation_info (Optional[NamedTuple], optional): Defaults to None.
-
- Returns:
- (generator_optimizer, critic_optimizer): Generator and critic optimizers
- """
- self.generator = Generator(self.batch_size).\
- build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,)
-
- self.discriminator = Discriminator(self.batch_size).\
- build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
-
- g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- d_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
-
- # Build and compile the discriminator
- self.discriminator.compile(loss='binary_crossentropy',
- optimizer=d_optimizer,
- metrics=['accuracy'])
-
- # The generator takes noise as input and generates imgs
- z = Input(shape=(self.noise_dim,))
- record = self.generator(z)
-
- # For the combined model we will only train the generator
- self.discriminator.trainable = False
-
- # The discriminator takes generated images as input and determines validity
- validity = self.discriminator(record)
-
- # The combined model (stacked generator and discriminator)
- # Trains the generator to fool the discriminator
- self._model = Model(z, validity)
- self._model.compile(loss='binary_crossentropy', optimizer=g_optimizer)
-
- def get_data_batch(self, train, batch_size, seed=0):
- """Get real data batches from the passed data object.
-
- Args:
- train: real data
- batch_size: batch size
- seed (int, optional):Defaults to 0.
-
- Returns:
- data batch
- """
- # # random sampling - some samples will have excessively low or high sampling, but easy to implement
- # np.random.seed(seed)
- # x = train.loc[ np.random.choice(train.index, batch_size) ].values
- # iterate through shuffled indices, so every sample gets covered evenly
-
- start_i = (batch_size * seed) % len(train)
- stop_i = start_i + batch_size
- shuffle_seed = (batch_size * seed) // len(train)
- np.random.seed(shuffle_seed)
- train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time
- train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set
- return train[train_ix[start_i: stop_i]]
-
- def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]):
- """Fit a synthesizer model to a given input dataset.
-
- Args:
- data: A pandas DataFrame or a Numpy array with the data to be synthesized
- train_arguments: GAN training arguments.
- num_cols (List[str]): List of columns of the data object to be handled as numerical
- cat_cols (List[str]): List of columns of the data object to be handled as categorical
- """
- super().fit(data, num_cols, cat_cols)
-
- processed_data = self.processor.transform(data)
- self.data_dim = processed_data.shape[1]
- self.define_gan(self.processor.col_transform_info)
-
- iterations = int(abs(data.shape[0]/self.batch_size)+1)
-
- # Adversarial ground truths
- valid = np.ones((self.batch_size, 1))
- fake = np.zeros((self.batch_size, 1))
-
- for epoch in trange(train_arguments.epochs):
- for _ in range(iterations):
- # ---------------------
- # Train Discriminator
- # ---------------------
- batch_data = self.get_data_batch(processed_data, self.batch_size)
- noise = tf.random.normal((self.batch_size, self.noise_dim))
-
- # Generate a batch of events
- gen_data = self.generator(noise, training=True)
-
- # Train the discriminator
- d_loss_real = self.discriminator.train_on_batch(batch_data, valid)
- d_loss_fake = self.discriminator.train_on_batch(gen_data, fake)
- d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
-
- # ---------------------
- # Train Generator
- # ---------------------
- noise = tf.random.normal((self.batch_size, self.noise_dim))
- # Train the generator (to have the discriminator label samples as valid)
- g_loss = self._model.train_on_batch(noise, valid)
-
- # Plot the progress
- print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))
-
- # If at save interval => save generated events
- if epoch % train_arguments.sample_interval == 0:
- #Test here data generation step
- # save model checkpoints
- if path.exists('./cache') is False:
- os.mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch))
- self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch))
-
- #Here is generating the data
- z = tf.random.normal((432, self.noise_dim))
- gen_data = self.generator(z)
- print('generated_data')
-
-
-class Generator(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple generator with dense feedforward layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size=batch_size
-
- def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- Returns:
- Generator model
- """
- input= Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim, activation='relu')(input)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim)(x)
- return Model(inputs=input, outputs=x)
-
-class Discriminator(tf.keras.Model):
- def __init__(self,batch_size):
- """Simple discriminator with dense feedforward and dropout layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size=batch_size
-
- def build_model(self, input_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers size.
-
- Returns:
- Discriminator model
- """
- input = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim * 4, activation='relu')(input)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation='relu')(x)
- x = Dense(1, activation='sigmoid')(x)
- return Model(inputs=input, outputs=x)
+class VanillaGAN(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/wgan/model.py b/src/ydata_synthetic/synthesizers/regular/wgan/model.py
index e326507a..78a246ca 100644
--- a/src/ydata_synthetic/synthesizers/regular/wgan/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/wgan/model.py
@@ -1,243 +1,10 @@
"""
- WGAN architecture model implementation
+ WGAN class file
"""
+from ydata_synthetic.synthesizers.base import BaseModel
-from os import mkdir, path
-from typing import List, Optional, NamedTuple
-
-from tqdm import trange
-
-import numpy as np
-
-import tensorflow as tf
-import keras.backend as K
-from keras import Model
-from keras.layers import Dense, Dropout, Input
-from keras.optimizers import Adam
-
-#Import ydata synthetic classes
-from ....synthesizers import TrainParameters
-from ....synthesizers.base import BaseGANModel
-
-#Auxiliary Keras backend class to calculate the Random Weighted average
-#https://stackoverflow.com/questions/58133430/how-to-substitute-keras-layers-merge-merge-in-tensorflow-keras
-class RandomWeightedAverage(tf.keras.layers.Layer):
- def __init__(self, batch_size):
- super().__init__()
- self.batch_size = batch_size
-
- def call(self, inputs, **kwargs):
- alpha = tf.random_uniform((self.batch_size, 1, 1, 1))
- return (alpha * inputs[0]) + ((1 - alpha) * inputs[1])
-
- def compute_output_shape(self, input_shape):
- return input_shape[0]
-
-class WGAN(BaseGANModel):
-
- __MODEL__='WGAN'
-
- def __init__(self, model_parameters, n_critic, clip_value=0.01):
- # As recommended in WGAN paper - https://arxiv.org/abs/1701.07875
- # WGAN-GP - WGAN with Gradient Penalty
- self.n_critic = n_critic
- self.clip_value = clip_value
- super().__init__(model_parameters)
-
- def wasserstein_loss(self, y_true, y_pred):
- """Calculate wasserstein loss.
-
- Args:
- y_true: ground truth.
- y_pred: predictions.
-
- Returns:
- wasserstein loss.
- """
- return K.mean(y_true * y_pred)
-
- def define_gan(self, activation_info: Optional[NamedTuple] = None):
- """Define the trainable model components.
-
- Args:
- activation_info (Optional[NamedTuple], optional): Defaults to None.
-
- Returns:
- (generator_optimizer, critic_optimizer): Generator and critic optimizers.
- """
- self.generator = Generator(self.batch_size). \
- build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
- activation_info=activation_info, tau = self.tau)
-
- self.critic = Critic(self.batch_size). \
- build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
-
- optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- critic_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
-
- # Build and compile the critic
- self.critic.compile(loss=self.wasserstein_loss,
- optimizer=critic_optimizer,
- metrics=['accuracy'])
-
- # The generator takes noise as input and generates imgs
- z = Input(shape=(self.noise_dim,))
- record = self.generator(z)
- # The discriminator takes generated images as input and determines validity
- validity = self.critic(record)
-
- # For the combined model we will only train the generator
- self.critic.trainable = False
-
- # The combined model (stacked generator and discriminator)
- # Trains the generator to fool the discriminator
- #For the WGAN model use the Wassertein loss
- self._model = Model(z, validity)
- self._model.compile(loss='binary_crossentropy', optimizer=optimizer)
-
- def get_data_batch(self, train, batch_size, seed=0):
- """Get real data batches from the passed data object.
-
- Args:
- train: real data.
- batch_size: batch size.
- seed (int, optional):Defaults to 0.
-
- Returns:
- data batch.
- """
- # np.random.seed(seed)
- # x = train.loc[ np.random.choice(train.index, batch_size) ].values
- # iterate through shuffled indices, so every sample gets covered evenly
- start_i = (batch_size * seed) % len(train)
- stop_i = start_i + batch_size
- shuffle_seed = (batch_size * seed) // len(train)
- np.random.seed(shuffle_seed)
- train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time
- train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set
- return train[train_ix[start_i: stop_i]]
-
- def fit(self, data, train_arguments: TrainParameters, num_cols: List[str],
- cat_cols: List[str]):
- """Fit a synthesizer model to a given input dataset.
-
- Args:
- data: A pandas DataFrame or a Numpy array with the data to be synthesized.
- train_arguments: GAN training arguments.
- num_cols (List[str]): List of columns of the data object to be handled as numerical.
- cat_cols (List[str]): List of columns of the data object to be handled as categorical.
- """
- super().fit(data, num_cols, cat_cols)
-
- processed_data = self.processor.transform(data)
- self.data_dim = processed_data.shape[1]
- self.define_gan(self.processor.col_transform_info)
-
- #Create a summary file
- iterations = int(abs(data.shape[0]/self.batch_size)+1)
- train_summary_writer = tf.summary.create_file_writer(path.join('.', 'summaries', 'train'))
-
- # Adversarial ground truths
- valid = np.ones((self.batch_size, 1))
- fake = -np.ones((self.batch_size, 1))
-
- with train_summary_writer.as_default():
- for epoch in trange(train_arguments.epochs, desc='Epoch Iterations'):
- for _ in range(iterations):
- for _ in range(self.n_critic):
- # ---------------------
- # Train the Critic
- # ---------------------
- batch_data = self.get_data_batch(processed_data, self.batch_size)
- noise = tf.random.normal((self.batch_size, self.noise_dim))
-
- # Generate a batch of events
- gen_data = self.generator(noise)
-
- # Train the Critic
- d_loss_real = self.critic.train_on_batch(batch_data, valid)
- d_loss_fake = self.critic.train_on_batch(gen_data, fake)
- d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
-
- for l in self.critic.layers:
- weights = l.get_weights()
- weights = [np.clip(w, -self.clip_value, self.clip_value) for w in weights]
- l.set_weights(weights)
-
- # ---------------------
- # Train Generator
- # ---------------------
- noise = tf.random.normal((self.batch_size, self.noise_dim))
- # Train the generator (to have the critic label samples as valid)
- g_loss = self._model.train_on_batch(noise, valid)
- # Plot the progress
- print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))
-
- #If at save interval => save generated events
- if epoch % train_arguments.sample_interval == 0:
- # Test here data generation step
- # save model checkpoints
- if path.exists('./cache') is False:
- mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch))
- self.critic.save_weights(model_checkpoint_base_name.format('critic', epoch))
-
-
-class Generator(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple generator with dense feedforward layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- Returns:
- Generator model
- """
- input = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim, activation='relu')(input)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim)(x)
- #if activation_info:
- # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
- return Model(inputs=input, outputs=x)
-
-class Critic(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple critic with dense feedforward and dropout layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers size.
-
- Returns:
- Critic model
- """
- input = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim * 4, activation='relu')(input)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation='relu')(x)
- x = Dense(1)(x)
- return Model(inputs=input, outputs=x)
+class WGAN(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/regular/wgangp/model.py b/src/ydata_synthetic/synthesizers/regular/wgangp/model.py
index f6599272..c052dddb 100644
--- a/src/ydata_synthetic/synthesizers/regular/wgangp/model.py
+++ b/src/ydata_synthetic/synthesizers/regular/wgangp/model.py
@@ -1,283 +1,10 @@
"""
- WGANGP architecture model implementation
+ WGAN_GP class file
"""
+from ydata_synthetic.synthesizers.base import BaseModel
-import os
-from os import path
-from typing import List, NamedTuple, Optional
-
-from tqdm import trange
-import numpy as np
-
-import tensorflow as tf
-from keras import Model
-from keras.layers import Dense, Dropout, Input
-from keras.optimizers import Adam
-
-#Import ydata synthetic classes
-from ....synthesizers import TrainParameters
-from ....synthesizers.base import BaseGANModel
-
-class WGAN_GP(BaseGANModel):
-
- __MODEL__='WGAN_GP'
-
- def __init__(self, model_parameters, n_generator:int=1, n_critic:int=1, gradient_penalty_weight:int=10):
- # As recommended in WGAN paper - https://arxiv.org/abs/1701.07875
- # WGAN-GP - WGAN with Gradient Penalty
- self.n_critic = n_critic
- self.n_generator = n_generator
- self.gradient_penalty_weight = gradient_penalty_weight
- super().__init__(model_parameters)
-
- def define_gan(self, activation_info: Optional[NamedTuple] = None):
- """Define the trainable model components.
-
- Args:
- activation_info (Optional[NamedTuple], optional): Defaults to None.
-
- Returns:
- (generator_optimizer, critic_optimizer): Generator and critic optimizers.
- """
- self.generator = Generator(self.batch_size). \
- build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
- activation_info=activation_info, tau = self.tau)
-
- self.critic = Critic(self.batch_size). \
- build_model(input_shape=(self.data_dim,), dim=self.layers_dim)
-
- g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- c_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2)
- return g_optimizer, c_optimizer
-
- def gradient_penalty(self, real, fake):
- """Compute gradient penalty.
-
- Args:
- real: real event.
- fake: fake event.
- Returns:
- gradient_penalty.
- """
- epsilon = tf.random.uniform([real.shape[0], 1], minval=0.0, maxval=1.0, dtype=tf.dtypes.float32)
- x_hat = epsilon * real + (1 - epsilon) * fake
- with tf.GradientTape() as t:
- t.watch(x_hat)
- d_hat = self.critic(x_hat)
- gradients = t.gradient(d_hat, x_hat)
- ddx = tf.sqrt(tf.reduce_sum(gradients ** 2))
- d_regularizer = tf.reduce_mean((ddx - 1.0) ** 2)
- return d_regularizer
-
- @tf.function
- def update_gradients(self, x, g_optimizer, c_optimizer):
- """Compute and apply the gradients for both the Generator and the Critic.
-
- Args:
- x: real data event
- g_optimizer: generator optimizer
- c_optimizer: critic optimizer
- Returns:
- (critic loss, generator loss)
- """
- for _ in range(self.n_critic):
- with tf.GradientTape() as d_tape:
- critic_loss = self.c_lossfn(x)
- # Get the gradients of the critic
- d_gradient = d_tape.gradient(critic_loss, self.critic.trainable_variables)
- # Update the weights of the critic using the optimizer
- c_optimizer.apply_gradients(
- zip(d_gradient, self.critic.trainable_variables)
- )
-
- ##Add here the n_generator
- # Update the generator
- for _ in range(self.n_generator):
- with tf.GradientTape() as g_tape:
- gen_loss = self.g_lossfn(x)
- # Get the gradients of the generator
- gen_gradients = g_tape.gradient(gen_loss, self.generator.trainable_variables)
- # Update the weights of the generator
- g_optimizer.apply_gradients(
- zip(gen_gradients, self.generator.trainable_variables)
- )
-
- return critic_loss, gen_loss
-
- def c_lossfn(self, real):
- """Compute critic loss.
-
- Args:
- real: real data
-
- Returns:
- critic loss
- """
- # generating noise from a uniform distribution
- noise = tf.random.normal([real.shape[0], self.noise_dim], dtype=tf.dtypes.float32)
- # run noise through generator
- fake = self.generator(noise)
- # discriminate x and x_gen
- logits_real = self.critic(real)
- logits_fake = self.critic(fake)
-
- # gradient penalty
- gp = self.gradient_penalty(real, fake)
- # getting the loss of the critic.
- c_loss = (tf.reduce_mean(logits_fake)
- - tf.reduce_mean(logits_real)
- + gp * self.gradient_penalty_weight)
- return c_loss
-
- def g_lossfn(self, real):
- """Compute generator loss.
-
- Args:
- real: A real sample
- fake: A fake sample
- fak2: A second fake sample
-
- Returns:
- Loss of the generator
- """
- # generating noise from a uniform distribution
- noise = tf.random.normal([real.shape[0], self.noise_dim], dtype=tf.dtypes.float32)
-
- fake = self.generator(noise)
- logits_fake = self.critic(fake)
- g_loss = -tf.reduce_mean(logits_fake)
- return g_loss
-
- def get_data_batch(self, train, batch_size, seed=0):
- """Get real data batches from the passed data object.
-
- Args:
- train: real data.
- batch_size: batch size.
- seed (int, optional):Defaults to 0.
-
- Returns:
- data batch.
- """
- # np.random.seed(seed)
- # x = train.loc[ np.random.choice(train.index, batch_size) ].values
- # iterate through shuffled indices, so every sample gets covered evenly
- start_i = (batch_size * seed) % len(train)
- stop_i = start_i + batch_size
- shuffle_seed = (batch_size * seed) // len(train)
- np.random.seed(shuffle_seed)
- train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time
- train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set
- return train[train_ix[start_i: stop_i]]
-
- def train_step(self, train_data, optimizers):
- """Perform a training step.
-
- Args:
- train_data: training data
- optimizers: generator and critic optimizers
-
- Returns:
- (critic_loss, generator_loss): Critic and generator loss.
- """
- cri_loss, ge_loss = self.update_gradients(train_data, *optimizers)
- return cri_loss, ge_loss
-
- def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]):
- """Fit a synthesizer model to a given input dataset.
-
- Args:
- data: A pandas DataFrame or a Numpy array with the data to be synthesized.
- train_arguments: GAN training arguments.
- num_cols (List[str]): List of columns of the data object to be handled as numerical.
- cat_cols (List[str]): List of columns of the data object to be handled as categorical.
- """
- super().fit(data, num_cols, cat_cols)
-
- processed_data = self.processor.transform(data)
- self.data_dim = processed_data.shape[1]
- optimizers = self.define_gan(self.processor.col_transform_info)
-
- iterations = int(abs(data.shape[0]/self.batch_size)+1)
-
- # Create a summary file
- train_summary_writer = tf.summary.create_file_writer(path.join('..\wgan_gp_test', 'summaries', 'train'))
-
- with train_summary_writer.as_default():
- for epoch in trange(train_arguments.epochs):
- for _ in range(iterations):
- batch_data = self.get_data_batch(processed_data, self.batch_size).astype(np.float32)
- cri_loss, ge_loss = self.train_step(batch_data, optimizers)
-
- print(
- "Epoch: {} | disc_loss: {} | gen_loss: {}".format(
- epoch, cri_loss, ge_loss
- ))
-
- if epoch % train_arguments.sample_interval == 0:
- # Test here data generation step
- # save model checkpoints
- if path.exists('./cache') is False:
- os.mkdir('./cache')
- model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5'
- self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch))
- self.critic.save_weights(model_checkpoint_base_name.format('critic', epoch))
-
-
-class Generator(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple generator with dense feedforward layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers dimensions.
- data_dim: Output dimensionality.
- activation_info (Optional[NamedTuple]): Defaults to None
- tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None
- Returns:
- Generator model
- """
- input = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim, activation='relu')(input)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim)(x)
- #if activation_info:
- # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x)
- return Model(inputs=input, outputs=x)
-
-class Critic(tf.keras.Model):
- def __init__(self, batch_size):
- """Simple critic with dense feedforward and dropout layers.
-
- Args:
- batch_size (int): batch size
- """
- self.batch_size = batch_size
-
- def build_model(self, input_shape, dim):
- """Create model components.
-
- Args:
- input_shape: input dimensionality.
- dim: hidden layers size.
-
- Returns:
- Critic model
- """
- input = Input(shape=input_shape, batch_size=self.batch_size)
- x = Dense(dim * 4, activation='relu')(input)
- x = Dropout(0.1)(x)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dropout(0.1)(x)
- x = Dense(dim, activation='relu')(x)
- x = Dense(1)(x)
- return Model(inputs=input, outputs=x)
+class WGAN_GP(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/saving_keras.py b/src/ydata_synthetic/synthesizers/saving_keras.py
deleted file mode 100644
index faf98bcf..00000000
--- a/src/ydata_synthetic/synthesizers/saving_keras.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import tensorflow.python.keras as tf_keras
-from keras import __version__
-tf_keras.__version__ = __version__
-
-from tensorflow.keras import Model
-
-def unpack(model, training_config, weights):
- restored_model = tf_keras.layers.deserialize(model)
- if training_config is not None:
- restored_model.compile(**tf_keras.saving.saving_utils.compile_args_from_training_config(training_config))
- restored_model.set_weights(weights)
- return restored_model
-
-def make_keras_picklable():
- def __reduce__(self):
- model_metadata = tf_keras.saving.saving_utils.model_metadata(self)
- training_config = model_metadata.get("training_config", None)
- model = tf_keras.layers.serialize(self)
- weights = self.get_weights()
- return (unpack, (model, training_config, weights))
-
- cls = Model
- cls.__reduce__=__reduce__
diff --git a/src/ydata_synthetic/synthesizers/timeseries/__init__.py b/src/ydata_synthetic/synthesizers/timeseries/__init__.py
index 0309d113..d45b893d 100644
--- a/src/ydata_synthetic/synthesizers/timeseries/__init__.py
+++ b/src/ydata_synthetic/synthesizers/timeseries/__init__.py
@@ -1,5 +1,11 @@
-from ydata_synthetic.synthesizers.timeseries.model import TimeSeriesSynthesizer
+"""
+ ydata_synthetic.synthesizers.timeseries init file
+"""
+from warnings import warn
-__all__ = [
- 'TimeSeriesSynthesizer'
-]
+warn(
+ "`import ydata_synthetic.synthesizers.timeseries` is deprecated. Please use `from ydata.sdk.synthesizers import TimeSeriesSynthesizer` instead. "
+ "For more information check https://docs.synthetic.ydata.ai/latest and https://docs.fabric.ydata.ai/latest/sdk",
+ DeprecationWarning,
+ stacklevel=2,
+)
\ No newline at end of file
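With the new __init__, importing the old timeseries package only emits a DeprecationWarning. A minimal sketch of surfacing that warning and switching to the import path named in the message (assumes the separate ydata-sdk package is installed):

import warnings

# DeprecationWarning is often filtered out by default; surface it explicitly.
warnings.simplefilter("always", DeprecationWarning)

# Importing the old package now only triggers the warning added in this patch.
import ydata_synthetic.synthesizers.timeseries  # noqa: F401

# Migration path suggested by the warning text (requires ydata-sdk).
from ydata.sdk.synthesizers import TimeSeriesSynthesizer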
diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py
deleted file mode 100644
index 1f18b7f6..00000000
--- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py
+++ /dev/null
@@ -1,594 +0,0 @@
-import tensorflow as tf
-import numpy as np
-from tqdm import tqdm
-import math
-from joblib import dump
-
-
-class DoppelGANgerNetwork(object):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/doppelganger.py.
- """
- def __init__(self,
- sess,
- epoch,
- batch_size,
- data_feature,
- data_attribute,
- attribute_cols_metadata,
- sample_len,
- generator,
- discriminator,
- rounds,
- d_gp_coe,
- num_packing,
- attr_discriminator=None,
- attr_d_gp_coe=None,
- g_attr_d_coe=None,
- attribute_latent_dim=5,
- feature_latent_dim=5,
- fix_feature_network=False,
- g_lr=0.001,
- g_beta1=0.5,
- d_lr=0.001,
- d_beta1=0.5,
- attr_d_lr=0.001,
- attr_d_beta1=0.5):
- """Constructor of DoppelGANger
- Args:
- sess: A tensorflow session
- epoch: Number of training epochs
- batch_size: Training batch size
- data_feature: Training features, in numpy float32 array format.
- The size is [(number of training samples) x (maximum length) x
- (total dimension of features)].
- data_attribute: Training attributes, in numpy float32 array format.
- The size is [(number of training samples) x (total dimension
- of attributes)]
- sample_len: The time series batch size
- generator: An instance of network.DoppelGANgerGenerator
- discriminator: An instance of network.Discriminator
- rounds: Number of steps per batch
- d_gp_coe: Weight of gradient penalty loss in Wasserstein GAN
- num_packing: Packing degree in PacGAN (a method for solving mode
- collapse in NeurIPS 2018, see https://arxiv.org/abs/1712.04086)
- attr_discriminator: An instance of network.AttrDiscriminator. None
- if you do not want to use this auxiliary discriminator
- attr_d_gp_coe: Weight of gradient penalty loss in Wasserstein GAN
- for the auxiliary discriminator
- g_attr_d_coe: Weight of the auxiliary discriminator in the
- generator's loss
- attribute_latent_dim: The dimension of noise for generating
- attributes
- feature_latent_dim: The dimension of noise for generating
- features
- fix_feature_network: Whether to fix the feature network during
- training
- g_lr: The learning rate in Adam for training the generator
- g_beta1: The beta1 in Adam for training the generator
- d_lr: The learning rate in Adam for training the discriminator
- d_beta1: The beta1 in Adam for training the discriminator
- attr_d_lr: The learning rate in Adam for training the auxiliary
- discriminator
- attr_d_beta1: The beta1 in Adam for training the auxiliary
- discriminator
- """
- self.sess = sess
- self.epoch = epoch
- self.batch_size = batch_size
- self.data_feature = data_feature
- self.data_attribute = data_attribute
- self.attribute_cols_metadata = attribute_cols_metadata
- self.sample_len = sample_len
- self.generator = generator
- self.discriminator = discriminator
- self.rounds = rounds
- self.attr_discriminator = attr_discriminator
- self.d_gp_coe = d_gp_coe
- self.attr_d_gp_coe = attr_d_gp_coe
- self.g_attr_d_coe = g_attr_d_coe
- self.num_packing = num_packing
- self.attribute_latent_dim = attribute_latent_dim
- self.feature_latent_dim = feature_latent_dim
- self.fix_feature_network = fix_feature_network
- self.g_lr = g_lr
- self.g_beta1 = g_beta1
- self.d_lr = d_lr
- self.d_beta1 = d_beta1
- self.attr_d_lr = attr_d_lr
- self.attr_d_beta1 = attr_d_beta1
-
- if self.data_feature is not None:
- if self.data_feature.shape[1] % self.sample_len != 0:
- raise Exception("Length must be a multiple of sample_len.")
- self.sample_time = int(self.data_feature.shape[1] / self.sample_len)
- self.sample_feature_dim = self.data_feature.shape[2]
- if self.data_attribute is not None:
- self.sample_attribute_dim = self.data_attribute.shape[1]
- self.sample_real_attribute_dim = sum([c.output_dim for c in self.attribute_cols_metadata if c.real])
-
- self.EPS = 1e-8
-
- def build(self):
- self.build_connection()
- self.build_loss()
-
- def build_connection(self):
- # build connections for train-fake
- self.g_feature_input_noise_train_pl_l = []
- for i in range(self.num_packing):
- self.g_feature_input_noise_train_pl_l.append(
- tf.compat.v1.placeholder(
- tf.float32,
- [None, self.sample_time, self.feature_latent_dim],
- name="g_feature_input_noise_train_{}".format(i)))
- self.g_real_attribute_input_noise_train_pl_l = []
- for i in range(self.num_packing):
- self.g_real_attribute_input_noise_train_pl_l.append(
- tf.compat.v1.placeholder(
- tf.float32,
- [None, self.attribute_latent_dim],
- name="g_real_attribute_input_noise_train_{}".format(i)))
- self.g_addi_attribute_input_noise_train_pl_l = []
- for i in range(self.num_packing):
- self.g_addi_attribute_input_noise_train_pl_l.append(
- tf.compat.v1.placeholder(
- tf.float32,
- [None, self.attribute_latent_dim],
- name=("g_addi_attribute_input_noise_train_{}".format(i))))
- self.g_feature_input_data_train_pl_l = []
- for i in range(self.num_packing):
- self.g_feature_input_data_train_pl_l.append(
- tf.compat.v1.placeholder(
- tf.float32,
- [None, self.sample_len * self.sample_feature_dim],
- name="g_feature_input_data_train_{}".format(i)))
-
- batch_size = tf.shape(input=self.g_feature_input_noise_train_pl_l[0])[0]
- self.real_attribute_mask_tensor = []
- for col_meta in self.attribute_cols_metadata:
- if col_meta.real:
- sub_mask_tensor = tf.ones((batch_size, col_meta.output_dim))
- else:
- sub_mask_tensor = tf.zeros((batch_size, col_meta.output_dim))
- self.real_attribute_mask_tensor.append(sub_mask_tensor)
- self.real_attribute_mask_tensor = tf.concat(self.real_attribute_mask_tensor,axis=1)
-
- self.g_output_feature_train_tf_l = []
- self.g_output_attribute_train_tf_l = []
- self.g_output_gen_flag_train_tf_l = []
- self.g_output_length_train_tf_l = []
- self.g_output_argmax_train_tf_l = []
- for i in range(self.num_packing):
- (g_output_feature_train_tf, g_output_attribute_train_tf,
- g_output_gen_flag_train_tf, g_output_length_train_tf,
- g_output_argmax_train_tf) = \
- self.generator.build(
- self.g_real_attribute_input_noise_train_pl_l[i],
- self.g_addi_attribute_input_noise_train_pl_l[i],
- self.g_feature_input_noise_train_pl_l[i],
- self.g_feature_input_data_train_pl_l[i],
- train=True)
-
- if self.fix_feature_network:
- g_output_feature_train_tf = tf.zeros_like(
- g_output_feature_train_tf)
- g_output_gen_flag_train_tf = tf.zeros_like(
- g_output_gen_flag_train_tf)
- g_output_attribute_train_tf *= self.real_attribute_mask_tensor
-
- self.g_output_feature_train_tf_l.append(
- g_output_feature_train_tf)
- self.g_output_attribute_train_tf_l.append(
- g_output_attribute_train_tf)
- self.g_output_gen_flag_train_tf_l.append(
- g_output_gen_flag_train_tf)
- self.g_output_length_train_tf_l.append(
- g_output_length_train_tf)
- self.g_output_argmax_train_tf_l.append(
- g_output_argmax_train_tf)
- self.g_output_feature_train_tf = tf.concat(
- self.g_output_feature_train_tf_l,
- axis=1)
- self.g_output_attribute_train_tf = tf.concat(
- self.g_output_attribute_train_tf_l,
- axis=1)
-
- self.d_fake_train_tf = self.discriminator.build(
- self.g_output_feature_train_tf,
- self.g_output_attribute_train_tf)
-
- if self.attr_discriminator is not None:
- self.attr_d_fake_train_tf = self.attr_discriminator.build(
- self.g_output_attribute_train_tf)
-
- self.real_feature_pl_l = []
- for i in range(self.num_packing):
- real_feature_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None,
- self.sample_time * self.sample_len,
- self.sample_feature_dim],
- name="real_feature_{}".format(i))
- if self.fix_feature_network:
- real_feature_pl = tf.zeros_like(
- real_feature_pl)
- self.real_feature_pl_l.append(real_feature_pl)
- self.real_attribute_pl_l = []
- for i in range(self.num_packing):
- real_attribute_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, self.sample_attribute_dim],
- name="real_attribute_{}".format(i))
- if self.fix_feature_network:
- real_attribute_pl *= self.real_attribute_mask_tensor
- self.real_attribute_pl_l.append(real_attribute_pl)
- self.real_feature_pl = tf.concat(
- self.real_feature_pl_l,
- axis=1)
- self.real_attribute_pl = tf.concat(
- self.real_attribute_pl_l,
- axis=1)
-
- self.d_real_train_tf = self.discriminator.build(
- self.real_feature_pl,
- self.real_attribute_pl)
- self.d_real_test_tf = self.discriminator.build(
- self.real_feature_pl,
- self.real_attribute_pl)
-
- if self.attr_discriminator is not None:
- self.attr_d_real_train_tf = self.attr_discriminator.build(
- self.real_attribute_pl)
-
- self.g_real_attribute_input_noise_test_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, self.attribute_latent_dim],
- name="g_real_attribute_input_noise_test")
- self.g_addi_attribute_input_noise_test_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, self.attribute_latent_dim],
- name="g_addi_attribute_input_noise_test")
- self.g_feature_input_noise_test_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, None, self.feature_latent_dim],
- name="g_feature_input_noise_test")
-
- self.g_feature_input_data_test_teacher_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, None, self.sample_len * self.sample_feature_dim],
- name="g_feature_input_data_test_teacher")
- (self.g_output_feature_test_teacher_tf,
- self.g_output_attribute_test_teacher_tf,
- self.g_output_gen_flag_test_teacher_tf,
- self.g_output_length_test_teacher_tf, _) = \
- self.generator.build(
- self.g_real_attribute_input_noise_test_pl,
- self.g_addi_attribute_input_noise_test_pl,
- self.g_feature_input_noise_test_pl,
- self.g_feature_input_data_test_teacher_pl,
- train=False)
-
- self.g_feature_input_data_test_free_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, self.sample_len * self.sample_feature_dim],
- name="g_feature_input_data_test_free")
- (self.g_output_feature_test_free_tf,
- self.g_output_attribute_test_free_tf,
- self.g_output_gen_flag_test_free_tf,
- self.g_output_length_test_free_tf, _) = \
- self.generator.build(
- self.g_real_attribute_input_noise_test_pl,
- self.g_addi_attribute_input_noise_test_pl,
- self.g_feature_input_noise_test_pl,
- self.g_feature_input_data_test_free_pl,
- train=False)
-
- self.given_attribute_attribute_pl = tf.compat.v1.placeholder(
- tf.float32,
- [None, self.sample_real_attribute_dim],
- name="given_attribute")
- (self.g_output_feature_given_attribute_test_free_tf,
- self.g_output_attribute_given_attribute_test_free_tf,
- self.g_output_gen_flag_given_attribute_test_free_tf,
- self.g_output_length_given_attribute_test_free_tf, _) = \
- self.generator.build(
- None,
- self.g_addi_attribute_input_noise_test_pl,
- self.g_feature_input_noise_test_pl,
- self.g_feature_input_data_test_free_pl,
- train=False,
- attribute=self.given_attribute_attribute_pl)
-
- def build_loss(self):
- batch_size = tf.shape(input=self.g_feature_input_noise_train_pl_l[0])[0]
-
- self.g_loss_d = -tf.reduce_mean(input_tensor=self.d_fake_train_tf)
- if self.attr_discriminator is not None:
- self.g_loss_attr_d = -tf.reduce_mean(input_tensor=self.attr_d_fake_train_tf)
- self.g_loss = (self.g_loss_d +
- self.g_attr_d_coe * self.g_loss_attr_d)
- else:
- self.g_loss = self.g_loss_d
-
- self.d_loss_fake = tf.reduce_mean(input_tensor=self.d_fake_train_tf)
- self.d_loss_fake_unflattened = self.d_fake_train_tf
- self.d_loss_real = -tf.reduce_mean(input_tensor=self.d_real_train_tf)
- self.d_loss_real_unflattened = -self.d_real_train_tf
- alpha_dim2 = tf.random.uniform(
- shape=[batch_size, 1],
- minval=0.,
- maxval=1.)
- alpha_dim3 = tf.expand_dims(alpha_dim2, 2)
- differences_input_feature = (self.g_output_feature_train_tf -
- self.real_feature_pl)
- interpolates_input_feature = (self.real_feature_pl +
- alpha_dim3 * differences_input_feature)
- differences_input_attribute = (self.g_output_attribute_train_tf -
- self.real_attribute_pl)
- interpolates_input_attribute = (self.real_attribute_pl +
- (alpha_dim2 *
- differences_input_attribute))
- gradients = tf.gradients(
- ys=self.discriminator.build(
- interpolates_input_feature,
- interpolates_input_attribute),
- xs=[interpolates_input_feature, interpolates_input_attribute])
- slopes1 = tf.reduce_sum(input_tensor=tf.square(gradients[0]),
- axis=[1, 2])
- slopes2 = tf.reduce_sum(input_tensor=tf.square(gradients[1]),
- axis=[1])
- slopes = tf.sqrt(slopes1 + slopes2 + self.EPS)
- self.d_loss_gp = tf.reduce_mean(input_tensor=(slopes - 1.)**2)
- self.d_loss_gp_unflattened = (slopes - 1.)**2
-
- self.d_loss = (self.d_loss_fake +
- self.d_loss_real +
- self.d_gp_coe * self.d_loss_gp)
-
- self.d_loss_unflattened = (self.d_loss_fake_unflattened +
- self.d_loss_real_unflattened +
- self.d_gp_coe * self.d_loss_gp_unflattened)
-
- if self.attr_discriminator is not None:
- self.attr_d_loss_fake = tf.reduce_mean(input_tensor=self.attr_d_fake_train_tf)
- self.attr_d_loss_fake_unflattened = self.attr_d_fake_train_tf
- self.attr_d_loss_real = -tf.reduce_mean(input_tensor=self.attr_d_real_train_tf)
- self.attr_d_loss_real_unflattened = -self.attr_d_real_train_tf
- alpha_dim2 = tf.random.uniform(
- shape=[batch_size, 1],
- minval=0.,
- maxval=1.)
- differences_input_attribute = (self.g_output_attribute_train_tf -
- self.real_attribute_pl)
- interpolates_input_attribute = (self.real_attribute_pl +
- (alpha_dim2 *
- differences_input_attribute))
- gradients = tf.gradients(
- ys=self.attr_discriminator.build(
- interpolates_input_attribute),
- xs=[interpolates_input_attribute])
- slopes1 = tf.reduce_sum(input_tensor=tf.square(gradients[0]),
- axis=[1])
- slopes = tf.sqrt(slopes1 + self.EPS)
- self.attr_d_loss_gp = tf.reduce_mean(input_tensor=(slopes - 1.)**2)
- self.attr_d_loss_gp_unflattened = (slopes - 1.)**2
-
- self.attr_d_loss = (self.attr_d_loss_fake +
- self.attr_d_loss_real +
- self.attr_d_gp_coe * self.attr_d_loss_gp)
-
- self.attr_d_loss_unflattened = \
- (self.attr_d_loss_fake_unflattened +
- self.attr_d_loss_real_unflattened +
- self.attr_d_gp_coe * self.attr_d_loss_gp_unflattened)
-
- self.g_op = \
- tf.compat.v1.train.AdamOptimizer(self.g_lr, self.g_beta1)\
- .minimize(
- self.g_loss,
- var_list=self.generator.trainable_vars)
-
- self.d_op = \
- tf.compat.v1.train.AdamOptimizer(self.d_lr, self.d_beta1)\
- .minimize(
- self.d_loss,
- var_list=self.discriminator.trainable_vars)
-
- if self.attr_discriminator is not None:
- self.attr_d_op = \
- tf.compat.v1.train.AdamOptimizer(self.attr_d_lr, self.attr_d_beta1)\
- .minimize(
- self.attr_d_loss,
- var_list=self.attr_discriminator.trainable_vars)
-
- def sample_from(self, real_attribute_input_noise,
- addi_attribute_input_noise, feature_input_noise,
- feature_input_data, given_attribute=None,
- return_gen_flag_feature=False):
- features = []
- attributes = []
- gen_flags = []
- lengths = []
- round_ = int(
- math.ceil(float(feature_input_noise.shape[0]) / self.batch_size))
- for i in range(round_):
- if given_attribute is None:
- if feature_input_data.ndim == 2:
- (sub_features, sub_attributes, sub_gen_flags,
- sub_lengths) = self.sess.run(
- [self.g_output_feature_test_free_tf,
- self.g_output_attribute_test_free_tf,
- self.g_output_gen_flag_test_free_tf,
- self.g_output_length_test_free_tf],
- feed_dict={
- self.g_real_attribute_input_noise_test_pl:
- real_attribute_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_addi_attribute_input_noise_test_pl:
- addi_attribute_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_feature_input_noise_test_pl:
- feature_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_feature_input_data_test_free_pl:
- feature_input_data[
- i * self.batch_size:
- (i + 1) * self.batch_size]})
- else:
- (sub_features, sub_attributes, sub_gen_flags,
- sub_lengths) = self.sess.run(
- [self.g_output_feature_test_teacher_tf,
- self.g_output_attribute_test_teacher_tf,
- self.g_output_gen_flag_test_teacher_tf,
- self.g_output_length_test_teacher_tf],
- feed_dict={
- self.g_real_attribute_input_noise_test_pl:
- real_attribute_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_addi_attribute_input_noise_test_pl:
- addi_attribute_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_feature_input_noise_test_pl:
- feature_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_feature_input_data_test_teacher_pl:
- feature_input_data[
- i * self.batch_size:
- (i + 1) * self.batch_size]})
- else:
- (sub_features, sub_attributes, sub_gen_flags,
- sub_lengths) = self.sess.run(
- [self.g_output_feature_given_attribute_test_free_tf,
- self.g_output_attribute_given_attribute_test_free_tf,
- self.g_output_gen_flag_given_attribute_test_free_tf,
- self.g_output_length_given_attribute_test_free_tf],
- feed_dict={
- self.g_addi_attribute_input_noise_test_pl:
- addi_attribute_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_feature_input_noise_test_pl:
- feature_input_noise[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.g_feature_input_data_test_free_pl:
- feature_input_data[
- i * self.batch_size:
- (i + 1) * self.batch_size],
- self.given_attribute_attribute_pl:
- given_attribute[
- i * self.batch_size:
- (i + 1) * self.batch_size]})
- features.append(sub_features)
- attributes.append(sub_attributes)
- gen_flags.append(sub_gen_flags)
- lengths.append(sub_lengths)
-
- features = np.concatenate(features, axis=0)
- attributes = np.concatenate(attributes, axis=0)
- gen_flags = np.concatenate(gen_flags, axis=0)
- lengths = np.concatenate(lengths, axis=0)
-
- if not return_gen_flag_feature:
- features = np.delete(features, [features.shape[2] - 2, features.shape[2] - 1], axis=2)
-
- assert len(gen_flags.shape) == 3
- assert gen_flags.shape[2] == 1
- gen_flags = gen_flags[:, :, 0]
-
- return features, attributes, gen_flags, lengths
-
- def gen_attribute_input_noise(self, num_sample):
- return np.random.normal(
- size=[num_sample, self.attribute_latent_dim])
-
- def gen_feature_input_noise(self, num_sample, length=1):
- return np.random.normal(
- size=[num_sample, length, self.feature_latent_dim])
-
- def gen_feature_input_data_free(self, num_sample):
- return np.zeros(
- [num_sample, self.sample_len * self.sample_feature_dim],
- dtype=np.float32)
-
- def train(self):
- tf.compat.v1.global_variables_initializer().run()
-
- batch_num = self.data_feature.shape[0] // self.batch_size
-
- for _ in tqdm(range(self.epoch)):
- data_id = np.random.choice(
- self.data_feature.shape[0],
- size=(self.data_feature.shape[0], self.num_packing))
-
- for batch_id in range(batch_num):
- feed_dict = {}
- for i in range(self.num_packing):
- batch_data_id = data_id[batch_id * self.batch_size:
- (batch_id + 1) * self.batch_size,
- i]
- batch_data_feature = self.data_feature[batch_data_id]
- batch_data_attribute = self.data_attribute[batch_data_id]
-
- batch_real_attribute_input_noise = \
- self.gen_attribute_input_noise(self.batch_size)
- batch_addi_attribute_input_noise = \
- self.gen_attribute_input_noise(self.batch_size)
- batch_feature_input_noise = \
- self.gen_feature_input_noise(
- self.batch_size, self.sample_time)
- batch_feature_input_data = \
- self.gen_feature_input_data_free(self.batch_size)
-
- feed_dict[self.real_feature_pl_l[i]] = \
- batch_data_feature
- feed_dict[self.real_attribute_pl_l[i]] = \
- batch_data_attribute
- feed_dict[self.
- g_real_attribute_input_noise_train_pl_l[i]] = \
- batch_real_attribute_input_noise
- feed_dict[self.
- g_addi_attribute_input_noise_train_pl_l[i]] = \
- batch_addi_attribute_input_noise
- feed_dict[self.g_feature_input_noise_train_pl_l[i]] = \
- batch_feature_input_noise
- feed_dict[self.g_feature_input_data_train_pl_l[i]] = \
- batch_feature_input_data
-
- for _ in range(self.rounds):
- self.sess.run(self.d_op, feed_dict=feed_dict)
- if self.attr_discriminator is not None:
- self.sess.run(self.attr_d_op, feed_dict=feed_dict)
- self.sess.run(self.g_op, feed_dict=feed_dict)
-
- def save(self, path):
- dump({
- "epoch": self.epoch,
- "batch_size": self.batch_size,
- "sample_len": self.sample_len,
- "rounds": self.rounds,
- "d_gp_coe": self.d_gp_coe,
- "attr_d_gp_coe": self.attr_d_gp_coe,
- "g_attr_d_coe": self.g_attr_d_coe,
- "num_packing": self.num_packing,
- "attribute_latent_dim": self.attribute_latent_dim,
- "feature_latent_dim": self.feature_latent_dim,
- "fix_feature_network": self.fix_feature_network,
- "g_lr": self.g_lr,
- "g_beta1": self.g_beta1,
- "d_lr": self.d_lr,
- "d_beta1": self.d_beta1,
- "attr_d_lr": self.attr_d_lr,
- "attr_d_beta1": self.attr_d_beta1,
- "sample_time": self.sample_time,
- "sample_feature_dim": self.sample_feature_dim,
- "sample_attribute_dim": self.sample_attribute_dim,
- "sample_real_attribute_dim": self.sample_real_attribute_dim
- }, path)
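For context on the file removed above: the heart of build_loss() is a standard WGAN-GP gradient penalty computed on per-sample interpolations of the real and generated (feature, attribute) pairs. A minimal eager TensorFlow 2 sketch of that term follows; the critic callable and the tensor shapes are assumptions for illustration, not code from this repository.

    import tensorflow as tf

    def gradient_penalty(critic, real_feature, fake_feature,
                         real_attribute, fake_attribute, eps=1e-8):
        """WGAN-GP penalty over interpolated (feature, attribute) pairs (sketch)."""
        batch_size = tf.shape(real_feature)[0]
        alpha = tf.random.uniform([batch_size, 1], 0.0, 1.0)    # one mixing factor per sample
        alpha3 = tf.expand_dims(alpha, 2)                        # broadcast over time and feature axes
        inter_feature = real_feature + alpha3 * (fake_feature - real_feature)
        inter_attribute = real_attribute + alpha * (fake_attribute - real_attribute)
        with tf.GradientTape() as tape:
            tape.watch([inter_feature, inter_attribute])
            score = critic(inter_feature, inter_attribute)
        grads = tape.gradient(score, [inter_feature, inter_attribute])
        slopes = tf.sqrt(
            tf.reduce_sum(tf.square(grads[0]), axis=[1, 2])
            + tf.reduce_sum(tf.square(grads[1]), axis=1)
            + eps)
        return tf.reduce_mean((slopes - 1.0) ** 2)

The graph-mode code above folds this penalty into d_loss with weight d_gp_coe and repeats the same construction for the auxiliary attribute discriminator.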
diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py
index 4f4194cd..4fc22f6b 100644
--- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py
+++ b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py
@@ -1,205 +1,9 @@
-from pandas import DataFrame
-import tensorflow as tf
-import os
-from joblib import dump, load
+"""
+ DoppelGANger implementation file (deprecated stub)
+"""
-from ydata_synthetic.synthesizers.timeseries.doppelganger.network import DoppelGANgerGenerator, AttrDiscriminator, Discriminator
-from ydata_synthetic.synthesizers.timeseries.doppelganger.doppelganger import DoppelGANgerNetwork
-from ydata_synthetic.synthesizers.base import BaseGANModel, ModelParameters, TrainParameters
-from ydata_synthetic.preprocessing.timeseries.doppelganger_processor import DoppelGANgerProcessor
-
-class DoppelGANger(BaseGANModel):
- """
- DoppelGANger model.
- Based on the paper https://dl.acm.org/doi/pdf/10.1145/3419394.3423643.
-
- Args:
- model_parameters: Parameters used to create the DoppelGANger model.
+class DoppelGANgerNetwork(BaseModel):
"""
- __MODEL__ = 'DoppelGANger'
-
- def __init__(self, model_parameters: ModelParameters):
- super().__init__(model_parameters)
- self._model_parameters = model_parameters
- self._gan_model = None
- self._tf_session = None
- self._sequence_length = None
- tf.compat.v1.disable_eager_execution()
-
- def fit(self, data: DataFrame,
- train_arguments: TrainParameters,
- num_cols: list[str] | None = None,
- cat_cols: list[str] | None = None):
- """
- Fits the DoppelGANger model.
-
- Args:
- data: A pandas DataFrame with the data to be synthesized.
- train_arguments: DoppelGANger training arguments.
- num_cols: List of columns to be handled as numerical
- cat_cols: List of columns to be handled as categorical
- """
- super().fit(data=data, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_arguments)
-
- self._sequence_length = train_arguments.sequence_length
- self._sample_length = train_arguments.sample_length
- self._rounds = train_arguments.rounds
-
- if data.shape[0] % self._sequence_length != 0:
- raise ValueError("The number of samples must be a multiple of the sequence length.")
-
- if self._sequence_length % self._sample_length != 0:
- raise ValueError("The sequence length must be a multiple of the sample length.")
-
- data_features, data_attributes = self.processor.transform(data)
- measurement_cols_metadata = self.processor.measurement_cols_metadata
- attribute_cols_metadata = self.processor.attribute_cols_metadata
-
- generator = DoppelGANgerGenerator(
- feed_back=False,
- noise=True,
- use_tanh=self.use_tanh,
- measurement_cols_metadata=measurement_cols_metadata,
- attribute_cols_metadata=attribute_cols_metadata,
- sample_len=self._sample_length)
- discriminator = Discriminator()
- attr_discriminator = AttrDiscriminator()
-
- self._tf_session = tf.compat.v1.Session()
- with self._tf_session.as_default() as sess:
- self._gan_model = DoppelGANgerNetwork(
- sess=sess,
- epoch=train_arguments.epochs,
- batch_size=self.batch_size,
- data_feature=data_features,
- data_attribute=data_attributes,
- attribute_cols_metadata=attribute_cols_metadata,
- sample_len=self._sample_length,
- generator=generator,
- discriminator=discriminator,
- rounds=self._rounds,
- attr_discriminator=attr_discriminator,
- d_gp_coe=self.gp_lambda,
- attr_d_gp_coe=self.gp_lambda,
- g_attr_d_coe=self.gp_lambda,
- num_packing=self.pac,
- attribute_latent_dim=self.latent_dim,
- feature_latent_dim=self.latent_dim,
- fix_feature_network=False,
- g_lr=self.g_lr,
- g_beta1=self.beta_1,
- d_lr=self.d_lr,
- d_beta1=self.beta_1,
- attr_d_lr=self.d_lr,
- attr_d_beta1=self.beta_1)
- self._gan_model.build()
- self._gan_model.train()
-
- def sample(self, n_samples: int):
- """
- Samples new data from the DoppelGANger.
-
- Args:
- n_samples: Number of samples to be generated.
- """
- if n_samples <= 0:
- raise ValueError("Invalid number of samples.")
-
- real_attribute_input_noise = self._gan_model.gen_attribute_input_noise(n_samples)
- addi_attribute_input_noise = self._gan_model.gen_attribute_input_noise(n_samples)
- length = int(self._sequence_length / self._sample_length)
- feature_input_noise = self._gan_model.gen_feature_input_noise(n_samples, length=length)
- input_data = self._gan_model.gen_feature_input_data_free(n_samples)
-
- with self._tf_session.as_default() as sess:
- self._gan_model.sess = sess
- data_features, data_attributes, gen_flags, _ = self._gan_model.sample_from(
- real_attribute_input_noise, addi_attribute_input_noise,
- feature_input_noise, input_data)
-
- return self.processor.inverse_transform(data_features, data_attributes, gen_flags)
-
- def save(self, path):
- """
- Save the DoppelGANger model in a directory.
-
- Args:
- path: Path of the directory where the files will be saved.
- """
- saver = tf.compat.v1.train.Saver()
- with self._tf_session.as_default() as sess:
- saver.save(sess, os.path.join(path, "doppelganger"), write_meta_graph=False)
- self._gan_model.save(os.path.join(path, "doppelganger_network.pkl"))
- dump({
- "processor": self.processor.__dict__,
- "measurement_cols_metadata": self.processor.measurement_cols_metadata,
- "attribute_cols_metadata": self.processor.attribute_cols_metadata,
- "_sequence_length": self._sequence_length,
- "_sample_length": self._sample_length
- }, os.path.join(path, "doppelganger_metadata.pkl"))
-
- @staticmethod
- def load(path):
- """
- Load the DoppelGANger model from a directory.
- Only the required components to sample new data are loaded.
-
- Args:
- class_dict: Path of the directory where the files were saved.
- """
- dp_model = DoppelGANger(ModelParameters())
- dp_network_parms = load(os.path.join(path, "doppelganger_network.pkl"))
- dp_metadata = load(os.path.join(path, "doppelganger_metadata.pkl"))
-
- dp_model.processor = DoppelGANgerProcessor()
- dp_model.processor.__dict__ = dp_metadata["processor"]
- dp_model._sequence_length = dp_metadata["_sequence_length"]
- dp_model._sample_length = dp_metadata["_sample_length"]
-
- generator = DoppelGANgerGenerator(
- feed_back=False,
- noise=True,
- measurement_cols_metadata=dp_metadata["measurement_cols_metadata"],
- attribute_cols_metadata=dp_metadata["attribute_cols_metadata"],
- sample_len=dp_network_parms["sample_len"])
- discriminator = Discriminator()
- attr_discriminator = AttrDiscriminator()
-
- with tf.compat.v1.Session().as_default() as sess:
- dp_model._gan_model = DoppelGANgerNetwork(
- sess=sess,
- epoch=dp_network_parms["epoch"],
- batch_size=dp_network_parms["batch_size"],
- data_feature=None,
- data_attribute=None,
- attribute_cols_metadata=dp_metadata["attribute_cols_metadata"],
- sample_len=dp_network_parms["sample_len"],
- generator=generator,
- discriminator=discriminator,
- rounds=dp_network_parms["rounds"],
- attr_discriminator=attr_discriminator,
- d_gp_coe=dp_network_parms["d_gp_coe"],
- attr_d_gp_coe=dp_network_parms["attr_d_gp_coe"],
- g_attr_d_coe=dp_network_parms["g_attr_d_coe"],
- num_packing=dp_network_parms["num_packing"],
- attribute_latent_dim=dp_network_parms["attribute_latent_dim"],
- feature_latent_dim=dp_network_parms["feature_latent_dim"],
- fix_feature_network=dp_network_parms["fix_feature_network"],
- g_lr=dp_network_parms["g_lr"],
- g_beta1=dp_network_parms["g_beta1"],
- d_lr=dp_network_parms["d_lr"],
- d_beta1=dp_network_parms["d_beta1"],
- attr_d_lr=dp_network_parms["attr_d_lr"],
- attr_d_beta1=dp_network_parms["attr_d_beta1"])
-
- dp_model._gan_model.sample_time = dp_network_parms["sample_time"]
- dp_model._gan_model.sample_feature_dim = dp_network_parms["sample_feature_dim"]
- dp_model._gan_model.sample_attribute_dim = dp_network_parms["sample_attribute_dim"]
- dp_model._gan_model.sample_real_attribute_dim = dp_network_parms["sample_real_attribute_dim"]
- dp_model._gan_model.build()
-
- saver = tf.compat.v1.train.Saver()
- saver.restore(sess, tf.compat.v1.train.latest_checkpoint(path))
- dp_model._tf_session = sess
-
- return dp_model
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
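The wrapper deleted above exposed a small fit/sample/save/load surface. A usage sketch of that pre-removal API follows; only the method signatures are taken from the deleted code, while the ModelParameters/TrainParameters keyword arguments, file name and column names are illustrative assumptions.

    from pandas import read_csv
    from ydata_synthetic.synthesizers.base import ModelParameters, TrainParameters
    from ydata_synthetic.synthesizers.timeseries.doppelganger.model import DoppelGANger

    data = read_csv("measurements.csv")                             # hypothetical input file
    synth = DoppelGANger(ModelParameters(batch_size=100, latent_dim=20))
    synth.fit(data,
              TrainParameters(epochs=200, sequence_length=56, sample_length=8),
              num_cols=["traffic"], cat_cols=["isp"])               # hypothetical column names
    synthetic_df = synth.sample(600)
    synth.save("trained_doppelganger")                              # a directory, not a single pickle

Note that fit() required the number of rows to be a multiple of sequence_length, and sequence_length to be a multiple of sample_length.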
diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py
deleted file mode 100644
index b5369b7b..00000000
--- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py
+++ /dev/null
@@ -1,436 +0,0 @@
-import tensorflow as tf
-import numpy as np
-
-
-def linear(input_, output_size, scope_name="linear"):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/op.py.
- """
- with tf.compat.v1.variable_scope(scope_name):
- input_ = tf.reshape(
- input_,
- [-1, np.prod(input_.get_shape().as_list()[1:])])
- output = tf.compat.v1.layers.dense(
- input_,
- output_size)
- return output
-
-
-def flatten(input_, scope_name="flatten"):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/op.py.
- """
- with tf.compat.v1.variable_scope(scope_name):
- output = tf.reshape(
- input_,
- [-1, np.prod(input_.get_shape().as_list()[1:])])
- return output
-
-
-class batch_norm(object):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/op.py.
- """
- def __init__(self, epsilon=1e-5, momentum=0.9, name="batch_norm"):
- with tf.compat.v1.variable_scope(name):
- self.epsilon = epsilon
- self.momentum = momentum
- self.name = name
-
- def __call__(self, x, train=True):
- return tf.keras.layers.BatchNormalization(momentum=self.momentum,
- epsilon=self.epsilon,
- scale=True,
- trainable=train,
- name=self.name)(x)
-
-
-class Network(object):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/network.py.
- """
- def __init__(self, scope_name):
- self.scope_name = scope_name
-
- @property
- def trainable_vars(self):
- return tf.compat.v1.get_collection(
- tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
- scope=self.scope_name)
-
-
-class Discriminator(Network):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/network.py.
- """
- def __init__(self,
- num_layers=5, num_units=200,
- scope_name="discriminator", *args, **kwargs):
- super(Discriminator, self).__init__(
- scope_name=scope_name, *args, **kwargs)
- self.num_layers = num_layers
- self.num_units = num_units
-
- def build(self, input_feature, input_attribute):
- with tf.compat.v1.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE):
- input_feature = flatten(input_feature)
- input_attribute = flatten(input_attribute)
- input_ = tf.concat([input_feature, input_attribute], 1)
- layers = [input_feature, input_attribute, input_]
- for i in range(self.num_layers - 1):
- with tf.compat.v1.variable_scope("layer{}".format(i)):
- layers.append(linear(layers[-1], self.num_units))
- layers.append(tf.nn.relu(layers[-1]))
- with tf.compat.v1.variable_scope("layer{}".format(self.num_layers - 1)):
- layers.append(linear(layers[-1], 1))
- layers.append(tf.squeeze(layers[-1], 1))
- return layers[-1]
-
-
-class AttrDiscriminator(Network):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/network.py.
- """
- def __init__(self,
- num_layers=5, num_units=200,
- scope_name="attrDiscriminator", *args, **kwargs):
- super(AttrDiscriminator, self).__init__(
- scope_name=scope_name, *args, **kwargs)
- self.num_layers = num_layers
- self.num_units = num_units
-
- def build(self, input_attribute):
- with tf.compat.v1.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE):
- input_attribute = flatten(input_attribute)
- layers = [input_attribute]
- for i in range(self.num_layers - 1):
- with tf.compat.v1.variable_scope("layer{}".format(i)):
- layers.append(linear(layers[-1], self.num_units))
- layers.append(tf.nn.relu(layers[-1]))
- with tf.compat.v1.variable_scope("layer{}".format(self.num_layers - 1)):
- layers.append(linear(layers[-1], 1))
- layers.append(tf.squeeze(layers[-1], 1))
- return layers[-1]
-
-
-class DoppelGANgerGenerator(Network):
- """
- Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/network.py.
- """
- def __init__(self, feed_back, noise,
- measurement_cols_metadata, attribute_cols_metadata, sample_len,
- attribute_num_units=100, attribute_num_layers=3,
- feature_num_units=100, feature_num_layers=1, use_tanh=False,
- scope_name="DoppelGANgerGenerator", *args, **kwargs):
- super(DoppelGANgerGenerator, self).__init__(
- scope_name=scope_name, *args, **kwargs)
- self.feed_back = feed_back
- self.noise = noise
- self.attribute_num_units = attribute_num_units
- self.attribute_num_layers = attribute_num_layers
- self.feature_num_units = feature_num_units
- self.measurement_cols_metadata = measurement_cols_metadata
- self.attribute_cols_metadata = attribute_cols_metadata
- self.feature_num_layers = feature_num_layers
- self.use_tanh = use_tanh
- self.sample_len = sample_len
- self.feature_out_dim = (np.sum([t.output_dim for t in measurement_cols_metadata]) *
- self.sample_len)
- self.attribute_out_dim = np.sum([t.output_dim for t in attribute_cols_metadata])
- if not self.noise and not self.feed_back:
- raise Exception("noise and feed_back should have at least one True")
-
- self.real_attribute_outputs = [c for c in self.attribute_cols_metadata if c.real]
- self.addi_attribute_outputs = [c for c in self.attribute_cols_metadata if not c.real]
- self.real_attribute_out_dim = sum([c.output_dim for c in self.attribute_cols_metadata if c.real])
- self.addi_attribute_out_dim = sum([c.output_dim for c in self.attribute_cols_metadata if not c.real])
-
- self.gen_flag_id = len(self.measurement_cols_metadata) - 1
- self.STR_REAL = "real"
- self.STR_ADDI = "addi"
-
- # noqa: MC0001
- def build(self, attribute_input_noise, addi_attribute_input_noise,
- feature_input_noise, feature_input_data, train, attribute=None):
- with tf.compat.v1.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE):
- batch_size = tf.shape(input=feature_input_noise)[0]
-
- if attribute is None:
- all_attribute = []
- all_discrete_attribute = []
- if len(self.addi_attribute_outputs) > 0 and len(self.real_attribute_outputs) > 0:
- all_attribute_input_noise = \
- [attribute_input_noise,
- addi_attribute_input_noise]
- all_attribute_outputs = \
- [self.real_attribute_outputs,
- self.addi_attribute_outputs]
- all_attribute_part_name = \
- [self.STR_REAL, self.STR_ADDI]
- all_attribute_out_dim = \
- [self.real_attribute_out_dim,
- self.addi_attribute_out_dim]
- elif len(self.addi_attribute_outputs) > 0:
- all_attribute_input_noise = [addi_attribute_input_noise]
- all_attribute_outputs = [self.addi_attribute_outputs]
- all_attribute_part_name = [self.STR_ADDI]
- all_attribute_out_dim = [self.addi_attribute_out_dim]
- else:
- all_attribute_input_noise = [attribute_input_noise]
- all_attribute_outputs = [self.real_attribute_outputs]
- all_attribute_part_name = [self.STR_REAL]
- all_attribute_out_dim = [self.real_attribute_out_dim]
- else:
- all_attribute = [attribute]
- all_discrete_attribute = [attribute]
- if len(self.addi_attribute_outputs) > 0:
- all_attribute_input_noise = \
- [addi_attribute_input_noise]
- all_attribute_outputs = \
- [self.addi_attribute_outputs]
- all_attribute_part_name = \
- [self.STR_ADDI]
- all_attribute_out_dim = [self.addi_attribute_out_dim]
- else:
- all_attribute_input_noise = []
- all_attribute_outputs = []
- all_attribute_part_name = []
- all_attribute_out_dim = []
-
- for part_i, _ in enumerate(all_attribute_input_noise):
- with tf.compat.v1.variable_scope(
- "attribute_{}".format(all_attribute_part_name[part_i]),
- reuse=tf.compat.v1.AUTO_REUSE):
-
- if len(all_discrete_attribute) > 0:
- layers = [tf.concat(
- [all_attribute_input_noise[part_i]] +
- all_discrete_attribute,
- axis=1)]
- else:
- layers = [all_attribute_input_noise[part_i]]
-
- for i in range(self.attribute_num_layers - 1):
- with tf.compat.v1.variable_scope("layer{}".format(i)):
- layers.append(linear(layers[-1], self.attribute_num_units))
- layers.append(tf.nn.relu(layers[-1]))
- layers.append(batch_norm()(layers[-1], train=train))
- with tf.compat.v1.variable_scope(
- "layer{}".format(self.attribute_num_layers - 1),
- reuse=tf.compat.v1.AUTO_REUSE):
- part_attribute = []
- part_discrete_attribute = []
- for i in range(len(all_attribute_outputs[part_i])):
- with tf.compat.v1.variable_scope("output{}".format(i),
- reuse=tf.compat.v1.AUTO_REUSE):
- output = all_attribute_outputs[part_i][i]
-
- sub_output_ori = linear(layers[-1], output.output_dim)
- if output.discrete:
- sub_output = tf.nn.softmax(sub_output_ori)
- sub_output_discrete = tf.one_hot(
- tf.argmax(input=sub_output, axis=1),
- output.output_dim)
- else:
- if self.use_tanh:
- sub_output = tf.nn.tanh(sub_output_ori)
- else:
- sub_output = tf.nn.sigmoid(sub_output_ori)
- sub_output_discrete = sub_output
- part_attribute.append(sub_output)
- part_discrete_attribute.append(
- sub_output_discrete)
- part_attribute = tf.concat(part_attribute, axis=1)
- part_discrete_attribute = tf.concat(
- part_discrete_attribute, axis=1)
- part_attribute = tf.reshape(
- part_attribute,
- [batch_size, all_attribute_out_dim[part_i]])
- part_discrete_attribute = tf.reshape(
- part_discrete_attribute,
- [batch_size, all_attribute_out_dim[part_i]])
- # batch_size * dim
-
- part_discrete_attribute = tf.stop_gradient(
- part_discrete_attribute)
-
- all_attribute.append(part_attribute)
- all_discrete_attribute.append(part_discrete_attribute)
-
- all_attribute = tf.concat(all_attribute, axis=1)
- all_discrete_attribute = tf.concat(all_discrete_attribute, axis=1)
- all_attribute = tf.reshape(
- all_attribute,
- [batch_size, self.attribute_out_dim])
- all_discrete_attribute = tf.reshape(
- all_discrete_attribute,
- [batch_size, self.attribute_out_dim])
-
- with tf.compat.v1.variable_scope("feature", reuse=tf.compat.v1.AUTO_REUSE):
- all_cell = []
- for i in range(self.feature_num_layers):
- with tf.compat.v1.variable_scope("unit{}".format(i),
- reuse=tf.compat.v1.AUTO_REUSE):
- cell = tf.compat.v1.nn.rnn_cell.LSTMCell(
- num_units=self.feature_num_units,
- state_is_tuple=True)
- all_cell.append(cell)
- rnn_network = tf.compat.v1.nn.rnn_cell.MultiRNNCell(all_cell)
-
- feature_input_data_dim = \
- len(feature_input_data.get_shape().as_list())
- if feature_input_data_dim == 3:
- feature_input_data_reshape = tf.transpose(
- a=feature_input_data, perm=[1, 0, 2])
- feature_input_noise_reshape = tf.transpose(
- a=feature_input_noise, perm=[1, 0, 2])
-
- initial_state = tf.random.normal(
- shape=(self.feature_num_layers,
- 2,
- batch_size,
- self.feature_num_units),
- mean=0.0, stddev=1.0)
- initial_state = tf.unstack(initial_state, axis=0)
- initial_state = tuple(
- [tf.compat.v1.nn.rnn_cell.LSTMStateTuple(
- initial_state[idx][0], initial_state[idx][1])
- for idx in range(self.feature_num_layers)])
-
- time = feature_input_noise.get_shape().as_list()[1]
- if time is None:
- time = tf.shape(input=feature_input_noise)[1]
-
- def compute(i, state, last_output, all_output,
- gen_flag, all_gen_flag, all_cur_argmax,
- last_cell_output):
- input_all = [all_discrete_attribute]
- if self.noise:
- input_all.append(feature_input_noise_reshape[i])
- if self.feed_back:
- if feature_input_data_dim == 3:
- input_all.append(feature_input_data_reshape[i])
- else:
- input_all.append(last_output)
- input_all = tf.concat(input_all, axis=1)
-
- cell_new_output, new_state = rnn_network(input_all, state)
- new_output_all = []
- id_ = 0
- for j in range(self.sample_len):
- for k, _ in enumerate(self.measurement_cols_metadata):
- with tf.compat.v1.variable_scope("output{}".format(id_),
- reuse=tf.compat.v1.AUTO_REUSE):
- output = self.measurement_cols_metadata[k]
- sub_output = linear(cell_new_output, output.output_dim)
- if output.discrete:
- sub_output = tf.nn.softmax(sub_output)
- else:
- if self.use_tanh:
- sub_output = tf.nn.tanh(sub_output)
- else:
- sub_output = tf.nn.sigmoid(sub_output)
- new_output_all.append(sub_output)
- id_ += 1
- new_output = tf.concat(new_output_all, axis=1)
-
- for j in range(self.sample_len):
- all_gen_flag = all_gen_flag.write(
- i * self.sample_len + j, gen_flag)
- cur_gen_flag = tf.cast(tf.equal(tf.argmax(
- input=new_output_all[(j * len(self.measurement_cols_metadata) +
- self.gen_flag_id)],
- axis=1), 0), dtype=tf.float32)
- cur_gen_flag = tf.reshape(cur_gen_flag, [-1, 1])
- all_cur_argmax = all_cur_argmax.write(
- i * self.sample_len + j,
- tf.argmax(
- input=new_output_all[(j * len(self.measurement_cols_metadata) +
- self.gen_flag_id)],
- axis=1))
- gen_flag = gen_flag * cur_gen_flag
-
- return (i + 1,
- new_state,
- new_output,
- all_output.write(i, new_output),
- gen_flag,
- all_gen_flag,
- all_cur_argmax,
- cell_new_output)
-
- (i, _, _, feature, _, gen_flag, cur_argmax, _) = \
- tf.while_loop(
- cond=lambda a, b, c, d, e, f, g, h:
- tf.logical_and(a < time,
- tf.equal(tf.reduce_max(input_tensor=e), 1)),
- body=compute,
- loop_vars=(0,
- initial_state,
- feature_input_data if feature_input_data_dim == 2
- else feature_input_data_reshape[0],
- tf.TensorArray(tf.float32, time),
- tf.ones((batch_size, 1)),
- tf.TensorArray(tf.float32, time * self.sample_len),
- tf.TensorArray(tf.int64, time * self.sample_len),
- tf.zeros((batch_size, self.feature_num_units))))
-
- def fill_rest(i, all_output, all_gen_flag, all_cur_argmax):
- all_output = all_output.write(
- i, tf.zeros((batch_size, self.feature_out_dim)))
-
- for j in range(self.sample_len):
- all_gen_flag = all_gen_flag.write(
- i * self.sample_len + j,
- tf.zeros((batch_size, 1)))
- all_cur_argmax = all_cur_argmax.write(
- i * self.sample_len + j,
- tf.zeros((batch_size,), dtype=tf.int64))
- return (i + 1,
- all_output,
- all_gen_flag,
- all_cur_argmax)
-
- _, feature, gen_flag, cur_argmax = tf.while_loop(
- cond=lambda a, b, c, d: a < time,
- body=fill_rest,
- loop_vars=(i, feature, gen_flag, cur_argmax))
-
- feature = feature.stack()
- # time * batch_size * (dim * sample_len)
- gen_flag = gen_flag.stack()
- # (time * sample_len) * batch_size * 1
- cur_argmax = cur_argmax.stack()
-
- gen_flag = tf.transpose(a=gen_flag, perm=[1, 0, 2])
- # batch_size * (time * sample_len) * 1
- cur_argmax = tf.transpose(a=cur_argmax, perm=[1, 0])
- # batch_size * (time * sample_len)
- length = tf.reduce_sum(input_tensor=gen_flag, axis=[1, 2])
- # batch_size
-
- feature = tf.transpose(a=feature, perm=[1, 0, 2])
- # batch_size * time * (dim * sample_len)
- gen_flag_t = tf.reshape(
- gen_flag,
- [batch_size, time, self.sample_len])
- # batch_size * time * sample_len
- gen_flag_t = tf.reduce_sum(input_tensor=gen_flag_t, axis=[2])
- # batch_size * time
- gen_flag_t = tf.cast(gen_flag_t > 0.5, dtype=tf.float32)
- gen_flag_t = tf.expand_dims(gen_flag_t, 2)
- # batch_size * time * 1
- gen_flag_t = tf.tile(
- gen_flag_t,
- [1, 1, self.feature_out_dim])
- # batch_size * time * (dim * sample_len)
- # zero out the parts after sequence ends
- feature = feature * gen_flag_t
- feature = tf.reshape(
- feature,
- [batch_size,
- time * self.sample_len,
- self.feature_out_dim / self.sample_len])
- # batch_size * (time * sample_len) * dim
-
- return feature, all_attribute, gen_flag, length, cur_argmax
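Both discriminators removed above are plain MLP critics: flatten the inputs, stack ReLU layers, and end with a single unbounded linear score. An equivalent eager Keras formulation could look like the sketch below (it mirrors the joint Discriminator); the flat input dimensions are assumptions, not values from this repository.

    import tensorflow as tf
    from tensorflow.keras import layers

    def make_critic(feature_dim, attribute_dim, num_layers=5, num_units=200):
        """Functional-API re-expression of the removed 5-layer MLP critic (sketch)."""
        feature_in = tf.keras.Input(shape=(feature_dim,), name="flat_feature")
        attribute_in = tf.keras.Input(shape=(attribute_dim,), name="attribute")
        x = layers.Concatenate()([feature_in, attribute_in])
        for _ in range(num_layers - 1):
            x = layers.Dense(num_units, activation="relu")(x)
        score = layers.Dense(1, name="wasserstein_score")(x)        # unbounded Wasserstein score
        return tf.keras.Model(inputs=[feature_in, attribute_in], outputs=score)

The generator is the more involved part: it first samples attributes with an MLP, then unrolls an LSTM that emits sample_len measurement steps per cell call and stops once the generation flag drops to zero.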
diff --git a/src/ydata_synthetic/synthesizers/timeseries/model.py b/src/ydata_synthetic/synthesizers/timeseries/model.py
deleted file mode 100644
index 436055e9..00000000
--- a/src/ydata_synthetic/synthesizers/timeseries/model.py
+++ /dev/null
@@ -1,57 +0,0 @@
-"""
- Main time-series synthesizer class
-"""
-from enum import Enum, unique
-import os
-import logging
-from joblib import load
-
-from tensorflow import config as tfconfig
-
-from ydata_synthetic.synthesizers.timeseries.timegan.model import TimeGAN
-from ydata_synthetic.synthesizers.timeseries.doppelganger.model import DoppelGANger
-
-from ydata_synthetic.utils.logger import SynthesizersLogger
-
-logger = SynthesizersLogger(name='timseriesSynthesizer.logger')
-logger.setLevel(logging.INFO)
-
-@unique
-class Model(Enum):
- TIMEGAN = 'timegan'
- DOPPELGANGER = 'doppelganger'
-
- __MAPPING__ = {
- TIMEGAN : TimeGAN,
- DOPPELGANGER: DoppelGANger
- }
-
- @property
- def function(self):
- return self.__MAPPING__[self.value]
-
-class TimeSeriesSynthesizer():
- "Abstraction class "
- def __new__(cls, modelname: str, model_parameters=None, **kwargs):
- logger.info_def_report(model=modelname)
- return Model(modelname).function(model_parameters, **kwargs)
-
- @staticmethod
- def load(path):
- """
- ### Description:
- Loads a saved synthesizer from a pickle.
-
- ### Args:
- `path` (str): Path to read the synthesizer pickle from.
- """
- gpu_devices = tfconfig.list_physical_devices('GPU')
- if len(gpu_devices) > 0:
- try:
- tfconfig.experimental.set_memory_growth(gpu_devices[0], True)
- except (ValueError, RuntimeError):
- # Invalid device or cannot modify virtual devices once initialized.
- pass
- if os.path.isdir(path):
- return DoppelGANger.load(path)
- return load(path)
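The removed dispatcher let callers pick a time-series model by name without importing the concrete classes. A usage sketch of that pre-removal behaviour, with placeholder parameter values:

    from ydata_synthetic.synthesizers.base import ModelParameters
    from ydata_synthetic.synthesizers.timeseries.model import TimeSeriesSynthesizer

    # __new__ returned an instance of the class mapped to the model name
    synth = TimeSeriesSynthesizer(modelname="doppelganger",
                                  model_parameters=ModelParameters(batch_size=100))
    # ...fit and sample through the underlying DoppelGANger instance...

    # load() dispatched on the path type: a directory meant DoppelGANger.load,
    # anything else was treated as a joblib pickle
    restored = TimeSeriesSynthesizer.load("trained_doppelganger")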
diff --git a/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py b/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py
index 4648b479..cf5adead 100644
--- a/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py
+++ b/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py
@@ -1,379 +1,9 @@
"""
- TimeGAN class implemented accordingly with:
- Original code can be found here: https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/
+ TimeGAN implementation file (deprecated stub)
"""
-from tqdm import tqdm
-import numpy as np
-from pandas import DataFrame
-from tensorflow import function, GradientTape, sqrt, abs, reduce_mean, ones_like, zeros_like, convert_to_tensor, float32
-from tensorflow import data as tfdata
-from tensorflow import nn
-from keras import (Model, Sequential, Input)
-from keras.layers import (GRU, LSTM, Dense)
-from keras.optimizers import Adam
-from keras.losses import (BinaryCrossentropy, MeanSquaredError)
-
-from ydata_synthetic.synthesizers.base import BaseGANModel, ModelParameters, TrainParameters
-from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
-
-def make_net(model, n_layers, hidden_units, output_units, net_type='GRU'):
- if net_type=='GRU':
- for i in range(n_layers):
- model.add(GRU(units=hidden_units,
- return_sequences=True,
- name=f'GRU_{i + 1}'))
- else:
- for i in range(n_layers):
- model.add(LSTM(units=hidden_units,
- return_sequences=True,
- name=f'LSTM_{i + 1}'))
-
- model.add(Dense(units=output_units,
- activation='sigmoid',
- name='OUT'))
- return model
-
-
-class TimeGAN(BaseGANModel):
-
- __MODEL__ = 'TimeGAN'
-
- def __init__(self, model_parameters: ModelParameters):
- super().__init__(model_parameters)
- self.seq_len = None
- self.n_seq = None
- self.hidden_dim = model_parameters.latent_dim
- self.gamma = model_parameters.gamma
- self.num_cols = None
-
- def fit(self, data: DataFrame,
- train_arguments: TrainParameters,
- num_cols: list[str] | None = None,
- cat_cols: list[str] | None = None):
- """
- Fits the TimeGAN model.
-
- Args:
- data: A pandas DataFrame with the data to be synthesized.
- train_arguments: TimeGAN training arguments.
- num_cols: List of columns to be handled as numerical
- cat_cols: List of columns to be handled as categorical
- """
- super().fit(data=data, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_arguments)
- if cat_cols:
- raise NotImplementedError("TimeGAN does not support categorical features.")
- self.num_cols = num_cols
- self.seq_len = train_arguments.sequence_length
- self.n_seq = train_arguments.number_sequences
- processed_data = real_data_loading(data[self.num_cols].values, seq_len=self.seq_len)
- self.train(data=processed_data, train_steps=train_arguments.epochs)
-
- def sample(self, n_samples: int):
- """
- Samples new data from the TimeGAN.
-
- Args:
- n_samples: Number of samples to be generated.
- """
- Z_ = next(self.get_batch_noise(size=n_samples))
- records = self.generator(Z_)
- data = []
- for i in range(records.shape[0]):
- data.append(DataFrame(records[i], columns=self.num_cols))
- return data
-
- def define_gan(self):
- self.generator_aux=Generator(self.hidden_dim).build()
- self.supervisor=Supervisor(self.hidden_dim).build()
- self.discriminator=Discriminator(self.hidden_dim).build()
- self.recovery = Recovery(self.hidden_dim, self.n_seq).build()
- self.embedder = Embedder(self.hidden_dim).build()
-
- X = Input(shape=[self.seq_len, self.n_seq], batch_size=self.batch_size, name='RealData')
- Z = Input(shape=[self.seq_len, self.n_seq], batch_size=self.batch_size, name='RandomNoise')
-
- #--------------------------------
- # Building the AutoEncoder
- #--------------------------------
- H = self.embedder(X)
- X_tilde = self.recovery(H)
-
- self.autoencoder = Model(inputs=X, outputs=X_tilde)
-
- #---------------------------------
- # Adversarial Supervise Architecture
- #---------------------------------
- E_Hat = self.generator_aux(Z)
- H_hat = self.supervisor(E_Hat)
- Y_fake = self.discriminator(H_hat)
-
- self.adversarial_supervised = Model(inputs=Z,
- outputs=Y_fake,
- name='AdversarialSupervised')
-
- #---------------------------------
- # Adversarial architecture in latent space
- #---------------------------------
- Y_fake_e = self.discriminator(E_Hat)
-
- self.adversarial_embedded = Model(inputs=Z,
- outputs=Y_fake_e,
- name='AdversarialEmbedded')
- # ---------------------------------
- # Synthetic data generation
- # ---------------------------------
- X_hat = self.recovery(H_hat)
- self.generator = Model(inputs=Z,
- outputs=X_hat,
- name='FinalGenerator')
-
- # --------------------------------
- # Final discriminator model
- # --------------------------------
- Y_real = self.discriminator(H)
- self.discriminator_model = Model(inputs=X,
- outputs=Y_real,
- name="RealDiscriminator")
-
- # ----------------------------
- # Define the loss functions
- # ----------------------------
- self._mse=MeanSquaredError()
- self._bce=BinaryCrossentropy()
-
-
- @function
- def train_autoencoder(self, x, opt):
- with GradientTape() as tape:
- x_tilde = self.autoencoder(x)
- embedding_loss_t0 = self._mse(x, x_tilde)
- e_loss_0 = 10 * sqrt(embedding_loss_t0)
-
- var_list = self.embedder.trainable_variables + self.recovery.trainable_variables
- gradients = tape.gradient(e_loss_0, var_list)
- opt.apply_gradients(zip(gradients, var_list))
- return sqrt(embedding_loss_t0)
-
- @function
- def train_supervisor(self, x, opt):
- with GradientTape() as tape:
- h = self.embedder(x)
- h_hat_supervised = self.supervisor(h)
- generator_loss_supervised = self._mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])
-
- var_list = self.supervisor.trainable_variables + self.generator.trainable_variables
- gradients = tape.gradient(generator_loss_supervised, var_list)
- apply_grads = [(grad, var) for (grad, var) in zip(gradients, var_list) if grad is not None]
- opt.apply_gradients(apply_grads)
- return generator_loss_supervised
-
- @function
- def train_embedder(self,x, opt):
- with GradientTape() as tape:
- # Supervised Loss
- h = self.embedder(x)
- h_hat_supervised = self.supervisor(h)
- generator_loss_supervised = self._mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])
-
- # Reconstruction Loss
- x_tilde = self.autoencoder(x)
- embedding_loss_t0 = self._mse(x, x_tilde)
- e_loss = 10 * sqrt(embedding_loss_t0) + 0.1 * generator_loss_supervised
-
- var_list = self.embedder.trainable_variables + self.recovery.trainable_variables
- gradients = tape.gradient(e_loss, var_list)
- opt.apply_gradients(zip(gradients, var_list))
- return sqrt(embedding_loss_t0)
-
- def discriminator_loss(self, x, z):
- # Loss on false negatives
- y_real = self.discriminator_model(x)
- discriminator_loss_real = self._bce(y_true=ones_like(y_real),
- y_pred=y_real)
-
- # Loss on false positives
- y_fake = self.adversarial_supervised(z)
- discriminator_loss_fake = self._bce(y_true=zeros_like(y_fake),
- y_pred=y_fake)
-
- y_fake_e = self.adversarial_embedded(z)
- discriminator_loss_fake_e = self._bce(y_true=zeros_like(y_fake_e),
- y_pred=y_fake_e)
- return (discriminator_loss_real +
- discriminator_loss_fake +
- self.gamma * discriminator_loss_fake_e)
-
- @staticmethod
- def calc_generator_moments_loss(y_true, y_pred):
- y_true_mean, y_true_var = nn.moments(x=y_true, axes=[0])
- y_pred_mean, y_pred_var = nn.moments(x=y_pred, axes=[0])
- g_loss_mean = reduce_mean(abs(y_true_mean - y_pred_mean))
- g_loss_var = reduce_mean(abs(sqrt(y_true_var + 1e-6) - sqrt(y_pred_var + 1e-6)))
- return g_loss_mean + g_loss_var
-
- @function
- def train_generator(self, x, z, opt):
- with GradientTape() as tape:
- y_fake = self.adversarial_supervised(z)
- generator_loss_unsupervised = self._bce(y_true=ones_like(y_fake),
- y_pred=y_fake)
-
- y_fake_e = self.adversarial_embedded(z)
- generator_loss_unsupervised_e = self._bce(y_true=ones_like(y_fake_e),
- y_pred=y_fake_e)
- h = self.embedder(x)
- h_hat_supervised = self.supervisor(h)
- generator_loss_supervised = self._mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])
-
- x_hat = self.generator(z)
- generator_moment_loss = self.calc_generator_moments_loss(x, x_hat)
-
- generator_loss = (generator_loss_unsupervised +
- generator_loss_unsupervised_e +
- 100 * sqrt(generator_loss_supervised) +
- 100 * generator_moment_loss)
-
- var_list = self.generator_aux.trainable_variables + self.supervisor.trainable_variables
- gradients = tape.gradient(generator_loss, var_list)
- opt.apply_gradients(zip(gradients, var_list))
- return generator_loss_unsupervised, generator_loss_supervised, generator_moment_loss
-
- @function
- def train_discriminator(self, x, z, opt):
- with GradientTape() as tape:
- discriminator_loss = self.discriminator_loss(x, z)
-
- var_list = self.discriminator.trainable_variables
- gradients = tape.gradient(discriminator_loss, var_list)
- opt.apply_gradients(zip(gradients, var_list))
- return discriminator_loss
-
- def get_batch_data(self, data, n_windows):
- data = convert_to_tensor(data, dtype=float32)
- return iter(tfdata.Dataset.from_tensor_slices(data)
- .shuffle(buffer_size=n_windows)
- .batch(self.batch_size).repeat())
-
- def _generate_noise(self):
- while True:
- yield np.random.uniform(low=0, high=1, size=(self.seq_len, self.n_seq))
-
- def get_batch_noise(self, size=None):
- return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=float32)
- .batch(self.batch_size if size is None else size)
- .repeat())
-
- def train(self, data, train_steps):
- # Assemble the model
- self.define_gan()
-
- ## Embedding network training
- autoencoder_opt = Adam(learning_rate=self.g_lr)
- for _ in tqdm(range(train_steps), desc='Emddeding network training'):
- X_ = next(self.get_batch_data(data, n_windows=len(data)))
- step_e_loss_t0 = self.train_autoencoder(X_, autoencoder_opt)
-
- ## Supervised Network training
- supervisor_opt = Adam(learning_rate=self.g_lr)
- for _ in tqdm(range(train_steps), desc='Supervised network training'):
- X_ = next(self.get_batch_data(data, n_windows=len(data)))
- step_g_loss_s = self.train_supervisor(X_, supervisor_opt)
-
- ## Joint training
- generator_opt = Adam(learning_rate=self.g_lr)
- embedder_opt = Adam(learning_rate=self.g_lr)
- discriminator_opt = Adam(learning_rate=self.d_lr)
-
- step_g_loss_u = step_g_loss_s = step_g_loss_v = step_e_loss_t0 = step_d_loss = 0
- for _ in tqdm(range(train_steps), desc='Joint networks training'):
-
- #Train the generator (k times as often as the discriminator)
- # Here k=2
- for _ in range(2):
- X_ = next(self.get_batch_data(data, n_windows=len(data)))
- Z_ = next(self.get_batch_noise())
- # --------------------------
- # Train the generator
- # --------------------------
- step_g_loss_u, step_g_loss_s, step_g_loss_v = self.train_generator(X_, Z_, generator_opt)
-
- # --------------------------
- # Train the embedder
- # --------------------------
- step_e_loss_t0 = self.train_embedder(X_, embedder_opt)
-
- X_ = next(self.get_batch_data(data, n_windows=len(data)))
- Z_ = next(self.get_batch_noise())
- step_d_loss = self.discriminator_loss(X_, Z_)
- if step_d_loss > 0.15:
- step_d_loss = self.train_discriminator(X_, Z_, discriminator_opt)
-
-
-class Generator(Model):
- def __init__(self, hidden_dim, net_type='GRU'):
- self.hidden_dim = hidden_dim
- self.net_type = net_type
-
- def build(self):
- model = Sequential(name='Generator')
- model = make_net(model,
- n_layers=3,
- hidden_units=self.hidden_dim,
- output_units=self.hidden_dim,
- net_type=self.net_type)
- return model
-
-class Discriminator(Model):
- def __init__(self, hidden_dim, net_type='GRU'):
- self.hidden_dim = hidden_dim
- self.net_type=net_type
-
- def build(self):
- model = Sequential(name='Discriminator')
- model = make_net(model,
- n_layers=3,
- hidden_units=self.hidden_dim,
- output_units=1,
- net_type=self.net_type)
- return model
-
-class Recovery(Model):
- def __init__(self, hidden_dim, n_seq):
- self.hidden_dim=hidden_dim
- self.n_seq=n_seq
- return
-
- def build(self):
- recovery = Sequential(name='Recovery')
- recovery = make_net(recovery,
- n_layers=3,
- hidden_units=self.hidden_dim,
- output_units=self.n_seq)
- return recovery
-
-class Embedder(Model):
-
- def __init__(self, hidden_dim):
- self.hidden_dim=hidden_dim
- return
-
- def build(self):
- embedder = Sequential(name='Embedder')
- embedder = make_net(embedder,
- n_layers=3,
- hidden_units=self.hidden_dim,
- output_units=self.hidden_dim)
- return embedder
-
-class Supervisor(Model):
- def __init__(self, hidden_dim):
- self.hidden_dim=hidden_dim
-
- def build(self):
- model = Sequential(name='Supervisor')
- model = make_net(model,
- n_layers=2,
- hidden_units=self.hidden_dim,
- output_units=self.hidden_dim)
- return model
+class TimeGAN(BaseModel):
+ """
+ This class is deprecated and should no longer be used.
+ Please refer to the new implementation.
+ """
\ No newline at end of file
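As with DoppelGANger, the deleted TimeGAN wrapper was driven through fit() and sample(). A short usage sketch of the pre-removal API follows; the signatures come from the deleted code, while the hyperparameter values, file name and column list are illustrative assumptions.

    from pandas import read_csv
    from ydata_synthetic.synthesizers.base import ModelParameters, TrainParameters
    from ydata_synthetic.synthesizers.timeseries.timegan.model import TimeGAN

    stock = read_csv("stock_data.csv")                              # hypothetical numerical-only data
    synth = TimeGAN(ModelParameters(batch_size=128, latent_dim=24))
    synth.fit(stock,
              TrainParameters(epochs=500, sequence_length=24, number_sequences=6),
              num_cols=list(stock.columns))                         # passing cat_cols raises NotImplementedError
    windows = synth.sample(100)       # a list of DataFrames, one per generated window

Training internally runs three phases on the windowed data: autoencoder pretraining, supervisor pretraining, and a joint adversarial stage in which the generator and embedder are updated twice per step while the discriminator is only updated when its loss exceeds 0.15.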
diff --git a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py
deleted file mode 100644
index ce4366c7..00000000
--- a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"GumbelSoftmaxActivation layer test suite."
-from itertools import cycle, islice
-from re import search
-
-from numpy import array, cumsum, isin, split
-from numpy import sum as npsum
-from numpy.random import normal
-from pandas import DataFrame, concat
-from pytest import fixture
-from tensorflow.keras import Model
-from tensorflow.keras.layers import Dense, Input
-
-from ydata_synthetic.preprocessing.regular.processor import \
- RegularDataProcessor
-from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation
-
-BATCH_SIZE = 10
-
-@fixture(name='noise_batch')
-def fixture_noise_batch():
- "Sample noise for mock output generation."
- return normal(size=(BATCH_SIZE, 16))
-
-@fixture(name='mock_data')
-def fixture_mock_data():
- "Creates mock data for the tests."
- num_block = DataFrame(normal(size=(BATCH_SIZE, 6)), columns = [f'num_{i}' for i in range(6)])
- cat_block_1 = DataFrame(array(list(islice(cycle(range(2)), BATCH_SIZE))), columns = ['cat_0'])
- cat_block_2 = DataFrame(array(list(islice(cycle(range(4)), BATCH_SIZE))), columns = ['cat_1'])
- return concat([num_block, cat_block_1, cat_block_2], axis = 1)
-
-@fixture(name='mock_processor')
-def fixture_mock_processor(mock_data):
- "Creates a mock data processor for the mock data."
- num_cols = [col for col in mock_data.columns if col.startswith('num')]
- cat_cols = [col for col in mock_data.columns if col.startswith('cat')]
- return RegularDataProcessor(num_cols, cat_cols).fit(mock_data)
-
-# pylint: disable=C0103
-@fixture(name='mock_generator')
-def fixture_mock_generator(noise_batch, mock_processor):
- "A mock generator with the Activation Interface as final layer."
- input_ = Input(shape=noise_batch.shape[1], batch_size = BATCH_SIZE)
- dim = 15
- data_dim = 12
- x = Dense(dim, activation='relu')(input_)
- x = Dense(dim * 2, activation='relu')(x)
- x = Dense(dim * 4, activation='relu')(x)
- x = Dense(data_dim)(x)
- x = GumbelSoftmaxActivation(activation_info=mock_processor.col_transform_info, name='act_itf')(x)
- return Model(inputs=input_, outputs=x)
-
-@fixture(name='mock_output')
-def fixture_mock_output(noise_batch, mock_generator):
- "Returns mock output of the model as a numpy object."
- return mock_generator(noise_batch).numpy()
-
-# pylint: disable=W0632
-def test_io(mock_processor, mock_output):
- "Tests the output format of the activation interface for a known input."
- num_lens = len(mock_processor.col_transform_info.numerical.feat_names_out)
- cat_lens = len(mock_processor.col_transform_info.categorical.feat_names_out)
- assert mock_output.shape == (BATCH_SIZE, num_lens + cat_lens), "The output has wrong shape."
- num_part, cat_part = split(mock_output, [num_lens], 1)
- assert not isin(num_part, [0, 1]).all(), "The numerical block is not expected to contain 0 or 1."
- assert isin(cat_part, [0, 1]).all(), "The categorical block is expected to contain only 0 or 1."
- cat_i, cat_o = mock_processor.col_transform_info.categorical
- cat_blocks = cumsum([len([col for col in cat_o if col.startswith(feat) and search('_[0-9]*$', col)]) \
- for feat in cat_i])
- cat_blocks = split(cat_part, cat_blocks[:-1], 1)
- assert all(npsum(abs(block)) == BATCH_SIZE for block in cat_blocks), "There are non one-hot encoded \
- categorical blocks."
diff --git a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py
deleted file mode 100644
index dd52c71d..00000000
--- a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"Test suite for the Gumbel-Softmax layer implementation."
-import tensorflow as tf
-from numpy import amax, amin, isclose, ones
-from numpy import sum as npsum
-from pytest import fixture
-from tensorflow.keras import layers
-
-from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxLayer
-
-
-# pylint:disable=W0613
-def custom_initializer(shape_list, dtype):
-    "A constant weight initializer to ensure test reproducibility."
- return tf.constant(ones((5, 5)), dtype=tf.dtypes.float32)
-
-@fixture(name='rand_input')
-def fixture_rand_input():
- "A random, reproducible, input for the mock model."
- return tf.constant(tf.random.normal([4, 5], seed=42))
-
-def test_hard_sample_output_format(rand_input):
- """Tests that the hard output samples are in the expected formats.
- The hard sample should be returned as a one-hot tensor."""
- affined = layers.Dense(5, use_bias = False, kernel_initializer=custom_initializer)(rand_input)
- hard_sample, _ = GumbelSoftmaxLayer()(affined)
-    assert npsum(hard_sample) == hard_sample.shape[0], "The sum of the hard samples should equal the number of records."
-    assert all(npsum(hard_sample == 0, 1) == hard_sample.shape[1] - 1), "The hard sample is not a one-hot tensor."
-
-def test_soft_sample_output_format(rand_input):
- """Tests that the soft output samples are in the expected formats.
- The soft sample should be returned as a probabilities tensor."""
- affined = layers.Dense(5, use_bias = False, kernel_initializer=custom_initializer)(rand_input)
- _, soft_sample = GumbelSoftmaxLayer(tau=0.5)(affined)
- assert isclose(npsum(soft_sample), soft_sample.shape[0]), "The sum of the soft samples should be close to \
- the number of records."
- assert amax(soft_sample) <= 1, "Invalid probability values found."
- assert amin(soft_sample) >= 0, "Invalid probability values found."
-
-def test_gradients(rand_input):
-    "Performs basic numerical assertions on the gradients of the soft/hard samples."
- def mock(i):
- return GumbelSoftmaxLayer()(layers.Dense(5, use_bias=False, kernel_initializer=custom_initializer)(i))
- with tf.GradientTape() as hard_tape:
- hard_tape.watch(rand_input)
- hard_sample, _ = mock(rand_input)
- with tf.GradientTape() as soft_tape:
- soft_tape.watch(rand_input)
- _, soft_sample = mock(rand_input)
- hard_grads = hard_tape.gradient(hard_sample, rand_input)
- soft_grads = soft_tape.gradient(soft_sample, rand_input)
-
- assert hard_grads is None, "The hard sample must not compute gradients."
- assert soft_grads is not None, "The soft sample is expected to compute gradients."
- assert npsum(abs(soft_grads)) != 0, "The soft sample is expected to have non-zero gradients."
diff --git a/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py b/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py
deleted file mode 100644
index 561319a0..00000000
--- a/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py
+++ /dev/null
@@ -1,77 +0,0 @@
-"""
-Test suite for the RegularProcessor.
-"""
-from numpy import isclose, ndarray
-from pmlb import fetch_data
-from pytest import fixture, raises
-from sklearn.exceptions import NotFittedError
-
-from ydata_synthetic.preprocessing.regular.processor import \
- RegularDataProcessor
-
-
-@fixture
-def regular_data_example():
- return fetch_data('adult')
-
-@fixture
-def regular_data_processor_args(regular_data_example):
- num_cols = ['fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
- cat_cols = list(set(regular_data_example.columns).difference(set(num_cols)))
- return num_cols, cat_cols
-
-@fixture
-def overlapped_column_lists(regular_data_processor_args):
- num_cols, cat_cols = regular_data_processor_args
- cat_cols.append(num_cols[0])
- return num_cols, cat_cols
-
-@fixture
-def incomplete_column_lists(regular_data_processor_args):
- num_cols, cat_cols = regular_data_processor_args
- num_cols.pop()
- return num_cols, cat_cols
-
-@fixture
-def regular_data_processor(regular_data_processor_args):
- num_cols, cat_cols = regular_data_processor_args
- return RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols)
-
-def test_is_fitted(regular_data_processor, regular_data_example):
- "Tests raising NotFittedError in attempting to transform with a non fitted processor."
- with raises(NotFittedError):
- regular_data_processor.transform(regular_data_example)
-
-def test_column_validations(regular_data_example, overlapped_column_lists, incomplete_column_lists):
- "Tests the column lists validation method."
- processor = RegularDataProcessor
- with raises(AssertionError):
- processor(*overlapped_column_lists).fit(regular_data_example)
- with raises(AssertionError):
- processor(*incomplete_column_lists).fit(regular_data_example)
-
-def test_fit(regular_data_processor, regular_data_example):
- "Tests fit method and _check_is_fitted method before and after fitting."
- with raises(NotFittedError):
- regular_data_processor._check_is_fitted()
- processor = regular_data_processor.fit(regular_data_example)
- assert processor._check_is_fitted() is None
-
-def test_fit_transform(regular_data_processor, regular_data_example):
- "Tests fit transform method, _check_is_fitted method and storing of attributes required for inverse_transform."
- transformed = regular_data_processor.fit_transform(regular_data_example)
- assert regular_data_processor._check_is_fitted() is None
- assert transformed.shape[0] == regular_data_example.shape[0]
- assert transformed.shape[1] != regular_data_example.shape[1]
- assert all([isinstance(idx, int) for idx in [regular_data_processor._num_col_idx_, regular_data_processor._cat_col_idx_]])
- assert isinstance(transformed, ndarray)
-
-def test_inverse_transform(regular_data_processor, regular_data_example):
- "Tests inverse_transform and its output by comparing to the original data example."
- transformed = regular_data_processor.fit_transform(regular_data_example)
- inverted = regular_data_processor.inverse_transform(transformed)
- assert isinstance(inverted, type(regular_data_example))
- assert inverted.shape == regular_data_example.shape
- assert (inverted.columns == regular_data_example.columns).all()
- assert (inverted.dtypes == regular_data_processor._types).all()
- assert isclose(inverted, regular_data_example).all()
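
For context on this removal, here is a minimal sketch of how the processor exercised by the deleted tests above was used prior to this patch (column names follow the `adult` example in the tests; since the processor is removed here, this only runs against older releases):

```python
# Sketch of the pre-removal RegularDataProcessor API, mirroring the deleted tests above.
from pmlb import fetch_data
from ydata_synthetic.preprocessing.regular.processor import RegularDataProcessor

data = fetch_data('adult')
num_cols = ['fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_cols = [col for col in data.columns if col not in num_cols]

processor = RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols)
transformed = processor.fit_transform(data)           # ndarray: scaled numericals + one-hot categoricals
recovered = processor.inverse_transform(transformed)  # DataFrame with the original schema and dtypes
assert recovered.shape == data.shape
```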
diff --git a/src/ydata_synthetic/utils/__init__.py b/src/ydata_synthetic/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/ydata_synthetic/utils/cache.py b/src/ydata_synthetic/utils/cache.py
deleted file mode 100644
index 2baba801..00000000
--- a/src/ydata_synthetic/utils/cache.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
- Dataset cache utility functions
- Original code can be found at https://github.com/ydataai/pandas-profiling/blob/master/src/pandas_profiling/utils/
-"""
-import zipfile
-from pathlib import Path
-
-import requests
-
-def get_project_root() -> Path:
- """Returns the path to the project root folder.
- Returns:
- The path to the project root folder.
- """
- return Path(__file__).parent.parent.parent.parent
-
-def get_data_path() -> Path:
- """Returns the path to the dataset cache ([root] / data)
- Returns:
- The path to the dataset cache
- """
- return get_project_root() / "data"
-
-def cache_file(file_name: str, url: str) -> Path:
- """Check if file_name already is in the data path, otherwise download it from url.
- Args:
- file_name: the file name
- url: the URL of the dataset
- Returns:
- The relative path to the dataset
- """
-
- data_path = get_data_path()
- data_path.mkdir(exist_ok=True)
-
- file_path = data_path / file_name
-
- # If not exists, download and create file
- if not file_path.exists():
- response = requests.get(url)
- file_path.write_bytes(response.content)
-
- return file_path
-
-def cache_zipped_file(file_name: str, url: str) -> Path:
- """Check if file_name already is in the data path, otherwise download it from url.
- Args:
- file_name: the file name
- url: the URL of the dataset
- Returns:
- The relative path to the dataset
- """
-
- data_path = get_data_path()
- data_path.mkdir(exist_ok=True)
-
- file_path = data_path / file_name
-
- # If not exists, download and create file
- if not file_path.exists():
- response = requests.get(url)
- if response.status_code != 200:
- raise FileNotFoundError("Could not download resource")
-
- tmp_path = data_path / "tmp.zip"
- tmp_path.write_bytes(response.content)
-
- with zipfile.ZipFile(tmp_path, "r") as zip_file:
- zip_file.extract(file_path.name, data_path)
-
- tmp_path.unlink()
-
- return file_path
\ No newline at end of file
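
The deleted helpers above implement a simple download-once cache under `[project root]/data`. An illustrative call, with a placeholder file name and URL, valid only on releases that still ship `ydata_synthetic.utils.cache`:

```python
# Hypothetical usage of the removed cache helper; the URL is a placeholder, not a real dataset link.
from ydata_synthetic.utils.cache import cache_file

csv_path = cache_file("adult.csv", "https://example.com/datasets/adult.csv")
print(csv_path)  # resolves to <project root>/data/adult.csv, downloaded only on the first call
```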
diff --git a/src/ydata_synthetic/utils/gumbel_softmax.py b/src/ydata_synthetic/utils/gumbel_softmax.py
deleted file mode 100644
index 59c5ba0a..00000000
--- a/src/ydata_synthetic/utils/gumbel_softmax.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""Gumbel-Softmax layer implementation.
-Reference: https://arxiv.org/pdf/1611.04051.pdf"""
-from re import search
-from typing import NamedTuple, Optional
-
-# pylint: disable=E0401
-import tensorflow as tf
-from tensorflow import (Tensor, TensorShape, concat, one_hot, split, squeeze,
- stop_gradient)
-from keras.layers import Activation, Layer
-
-TOL = 1e-20
-
-def gumbel_noise(shape: TensorShape) -> Tensor:
- """Create a single sample from the standard (loc = 0, scale = 1) Gumbel distribution."""
- uniform_sample = tf.random.uniform(shape, seed=0)
- return -tf.math.log(-tf.math.log(uniform_sample + TOL) + TOL)
-
-@tf.keras.utils.register_keras_serializable(package='Custom', name='GumbelSoftmaxLayer')
-class GumbelSoftmaxLayer(Layer):
- """A Gumbel-Softmax layer implementation that should be stacked on top of a categorical feature logits.
-
- Arguments:
- tau (float): Temperature parameter of the GS layer
- name (Optional[str]): Name for a single categorical block
- """
-
- def __init__(self, tau: float, name: Optional[str] = None, **kwargs):
- super().__init__(name=name, **kwargs)
- self.tau = tau
-
- # pylint: disable=W0221, E1120
- def call(self, _input):
- """Computes Gumbel-Softmax for the logits output of a particular categorical feature."""
- noised_input = _input + gumbel_noise(_input.shape)
- soft_sample = tf.nn.softmax(noised_input/self.tau, -1)
- hard_sample = stop_gradient(squeeze(one_hot(tf.random.categorical(tf.math.log(soft_sample), 1), _input.shape[-1]), 1))
- return hard_sample, soft_sample
-
- def get_config(self):
- config = super().get_config().copy()
- config.update({'tau': self.tau})
- return config
-
-@tf.keras.utils.register_keras_serializable(package='Custom', name='GumbelSoftmaxActivation')
-class GumbelSoftmaxActivation(Layer):
- """An interface layer connecting different parts of an incoming tensor to adequate activation functions.
- The tensor parts are qualified according to the passed processor object.
- Processed categorical features are sent to specific Gumbel-Softmax layers.
- Processed features of different kind are sent to a TanH activation.
- Finally all output parts are concatenated and returned in the same order.
-
- The parts of an incoming tensor are qualified by leveraging a namedtuple pointing to each of the used data \
- processor's pipelines in/out feature maps. For simplicity this object can be taken directly from the data \
- processor col_transform_info."""
-
- def __init__(self, activation_info: NamedTuple, name: Optional[str] = None, tau: Optional[float] = None, **kwargs):
- """Arguments:
- col_map (NamedTuple): Defines each of the processor pipelines input/output features.
- name (Optional[str]): Name of the GumbelSoftmaxActivation layer
- tau (Optional[float]): Temperature parameter of the GS layer, must be a float bigger than 0"""
- super().__init__(name=name, **kwargs)
- self.tau = 0.2 if not tau else tau # Defaults to the default value proposed in the original article
- assert isinstance(self.tau, (int, float)) and self.tau > 0, "Optional argument tau must be numerical and \
-bigger than 0."
-
- self._activation_info = activation_info
-
- self.cat_feats = activation_info.categorical
- self.num_feats = activation_info.numerical
-
- self._cat_lens = [len([col for col in self.cat_feats.feat_names_out if search(f'^{cat_feat}_.*$', col)]) \
- for cat_feat in self.cat_feats.feat_names_in]
- self._num_lens = len(self.num_feats.feat_names_out)
-
- def call(self, _input): # pylint: disable=W0221
- num_cols, cat_cols = split(_input, [self._num_lens, -1], 1, name='split_num_cats')
- cat_cols = split(cat_cols, self._cat_lens if self._cat_lens else [0], 1, name='split_cats')
-
- num_cols = [Activation('tanh', name='num_cols_activation')(num_cols)]
- cat_cols = [GumbelSoftmaxLayer(tau=self.tau, name=name)(col)[0] for name, col in \
- zip(self.cat_feats.feat_names_in, cat_cols)]
- return concat(num_cols+cat_cols, 1)
-
- def get_config(self):
- config = super().get_config().copy()
- config.update({'activation_info': self._activation_info})
- return config
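
As background for this removal, here is a minimal, self-contained sketch of the Gumbel-Softmax sampling trick the deleted layer implemented; the temperature default mirrors the deleted code, and the function name is illustrative rather than part of the package API:

```python
# Standalone Gumbel-Softmax sampling: a differentiable "soft" sample plus a one-hot "hard" sample.
import tensorflow as tf

TOL = 1e-20  # numerical guard against log(0), as in the deleted layer

def gumbel_softmax_sample(logits: tf.Tensor, tau: float = 0.2):
    uniform = tf.random.uniform(tf.shape(logits))
    gumbel = -tf.math.log(-tf.math.log(uniform + TOL) + TOL)        # standard Gumbel noise
    soft = tf.nn.softmax((logits + gumbel) / tau, axis=-1)          # differentiable probabilities
    hard = tf.stop_gradient(                                        # one-hot, gradients blocked
        tf.one_hot(tf.argmax(soft, axis=-1), tf.shape(logits)[-1]))
    return hard, soft

hard, soft = gumbel_softmax_sample(tf.random.normal([4, 5]))
```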
diff --git a/src/ydata_synthetic/utils/logger.py b/src/ydata_synthetic/utils/logger.py
deleted file mode 100644
index 33d6431c..00000000
--- a/src/ydata_synthetic/utils/logger.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""
- ydata-synthetic logger
-"""
-from __future__ import absolute_import, division, print_function
-
-import logging
-
-from ydata_synthetic.utils.utils import analytics_features
-
-class SynthesizersLogger(logging.Logger):
- def __init__(self, name, level=logging.INFO):
- super().__init__(name, level)
-
- def info(
- self,
- msg: object,
- ) -> None:
- super().info(f'[SYNTHESIZER] - {msg}.')
-
- def info_def_report(self, model: str):
- analytics_features(model=model)
-
- super().info(f'[SYNTHESIZER] Creating a synthetic data generator with the following model - {model}.')
\ No newline at end of file
diff --git a/src/ydata_synthetic/utils/misc/colormaps.py b/src/ydata_synthetic/utils/misc/colormaps.py
deleted file mode 100644
index 98951c85..00000000
--- a/src/ydata_synthetic/utils/misc/colormaps.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from matplotlib.colors import ListedColormap
-import matplotlib.pyplot as plt
-import numpy as np
-from matplotlib import cm
-
-viridis = cm.get_cmap('viridis', 256)
-newcolors = viridis(np.linspace(0, 1, 256))
-pink = np.array([248/256, 24/256, 148/256, 1])
-newcolors[:25, :] = pink
-newcmp = ListedColormap(newcolors)
-
-def ydata_colormap(n: int = None):
- """Returns a colormap with the YData colors and a discrete boundary norm.
- Pass n to define a truncated color map (use less colors)"""
- colors = ["#830000", "#040404", "#FFFFFF", "#E32212"]
- if n and n>len(colors):
- n=len(colors)
- return ListedColormap(colors[:n])
-
-if __name__ == '__main__':
- def plot_examples(cms):
- """
- helper function to plot colormaps
- """
- np.random.seed(19680801)
- data = np.random.randn(30, 30)
-
- fig, axs = plt.subplots(1, len(cms), figsize=(6, 3), constrained_layout=True)
- for [ax, cmap] in zip(axs, cms):
- psm = ax.pcolormesh(data, cmap=cmap, rasterized=True, vmin=-4, vmax=4)
- fig.colorbar(psm, ax=ax)
- plt.show()
-
- plot_examples([viridis, ydata_colormap()])
diff --git a/src/ydata_synthetic/utils/utils.py b/src/ydata_synthetic/utils/utils.py
deleted file mode 100644
index 8ea2e213..00000000
--- a/src/ydata_synthetic/utils/utils.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
- Utility functions that are common to ydata-synthetic project
-"""
-import os
-import subprocess
-import platform
-import requests
-
-from ydata_synthetic.version import __version__
-def analytics_features(model: str):
- endpoint= "https://packages.ydata.ai/ydata-synthetic?"
-
- if bool(os.getenv("YDATA_SYNTHETIC_NO_ANALYTICS"))!= True:
- package_version = __version__
- try:
- subprocess.check_output("nvidia-smi")
- gpu_present = True
- except Exception:
- gpu_present = False
-
- python_version = ".".join(platform.python_version().split(".")[:2])
-
- try:
- request_message = f"{endpoint}version={package_version}" \
- f"&python_version={python_version}" \
- f"&model={model}" \
- f"&os={platform.system()}" \
- f"&gpu={str(gpu_present)}"
-
- requests.get(request_message)
- except Exception:
- pass
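
The deleted `analytics_features` helper only fires when the `YDATA_SYNTHETIC_NO_ANALYTICS` environment variable is unset, so opting out on older releases is a one-liner set before importing the package (variable name taken from the code above; any non-empty value disables the ping):

```python
# Disable the usage ping checked by analytics_features() before importing ydata_synthetic.
import os

os.environ["YDATA_SYNTHETIC_NO_ANALYTICS"] = "1"

import ydata_synthetic  # imported only after opting out
```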
From dc24389bdb5a10d983bacc4e1ee8dfe4aa26a56b Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Thu, 29 Aug 2024 15:57:19 +0100
Subject: [PATCH 2/3] fix: add import BaseModel to timeseries models
---
src/ydata_synthetic/synthesizers/base.py | 4 ++--
.../synthesizers/timeseries/doppelganger/model.py | 1 +
src/ydata_synthetic/synthesizers/timeseries/timegan/model.py | 1 +
3 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/ydata_synthetic/synthesizers/base.py b/src/ydata_synthetic/synthesizers/base.py
index 5a17fb71..bc564afe 100644
--- a/src/ydata_synthetic/synthesizers/base.py
+++ b/src/ydata_synthetic/synthesizers/base.py
@@ -36,8 +36,8 @@ def __init__(self,
cat_cols: Optional[List[str]] = None,
**kwargs):
warn(
- f"{self.__class__.__name__} is deprecated. Please leverage ydata-sdk RegularSynthesizer or TimeSeriesSynthesizer instead. For more information, "
- f"check ydata-sdk documentation: https://docs.fabric.ydata.ai/latest/sdk/examples/synthesize_tabular_data/.",
+ f"{self.__class__.__name__} is deprecated. Please leverage ydata-sdk **RegularSynthesizer** or **TimeSeriesSynthesizer** instead. For more information, "
+ f"check ydata-sdk documentation: https://docs.fabric.ydata.ai/latest/sdk/examples/.",
DeprecationWarning,
stacklevel=2
)
\ No newline at end of file
diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py
index 4fc22f6b..b5331a5a 100644
--- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py
+++ b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py
@@ -1,6 +1,7 @@
"""
Doppelganger implementation file
"""
+from ydata_synthetic.synthesizers.base import BaseModel
class DoppelGANgerNetwork(BaseModel):
"""
diff --git a/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py b/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py
index cf5adead..75461e27 100644
--- a/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py
+++ b/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py
@@ -1,6 +1,7 @@
"""
TimeGAN file definition
"""
+from ydata_synthetic.synthesizers.base import BaseModel
class TimeGAN(BaseModel):
"""
From 64e3972fe80a7a4e3d6504b8e54088a8cf884aef Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Mon, 9 Sep 2024 18:43:43 +0100
Subject: [PATCH 3/3] chore: Update ydata-synthetic docs
Add more information regarding the change to ydata-sdk.
---
README.md | 104 +++++++-----------
docs/getting-started/installation.md | 27 ++---
docs/getting-started/quickstart.md | 78 -------------
docs/index.md | 56 +++++++---
docs/integrations/gx_integration.md | 3 +
docs/reference/api/index.md | 0
docs/reference/api/preprocessing/base.md | 2 -
.../regular/ctgan_preprocessor.md | 2 -
.../api/preprocessing/regular/preprocessor.md | 2 -
docs/reference/api/synthesizers/cgan.md | 2 -
docs/reference/api/synthesizers/gan.md | 4 -
.../api/synthesizers/regular/cgan.md | 2 -
.../api/synthesizers/regular/cramergan.md | 2 -
.../api/synthesizers/regular/ctgan.md | 2 -
.../api/synthesizers/regular/cwgangp.md | 2 -
.../api/synthesizers/regular/dragan.md | 2 -
.../reference/api/synthesizers/regular/gan.md | 5 -
.../api/synthesizers/regular/vanilllagan.md | 2 -
.../api/synthesizers/regular/wgan.md | 2 -
.../api/synthesizers/regular/wgan_gp.md | 2 -
.../synthesizers/timeseries/doppelganger.md | 2 -
.../api/synthesizers/timeseries/timegan.md | 2 -
docs/reference/changelog.md | 0
docs/support/analytics.md | 44 --------
docs/support/contribute.md | 22 ----
docs/support/help-troubleshooting.md | 4 +-
docs/synthetic_data/faqs.md | 28 +----
docs/synthetic_data/index.md | 15 +--
.../single_table/cgan_example.md | 5 +
.../single_table/cramer_gan_example.md | 5 +
.../single_table/ctgan_example.md | 5 +
.../single_table/cwgangp_example.md | 5 +
.../single_table/dragan_example.md | 5 +
.../single_table/gmm_example.md | 5 +
.../single_table/wgan_example.md | 5 +
.../single_table/wgangp_example.md | 5 +
docs/synthetic_data/streamlit_app.md | 46 --------
.../time_series/doppelganger_example.md | 6 +
.../time_series/timegan_example.md | 16 ++-
docs/synthetic_data/ydata_fabric_app.md | 28 +++++
mkdocs.yml | 30 +----
setup.py | 11 +-
42 files changed, 193 insertions(+), 402 deletions(-)
delete mode 100644 docs/getting-started/quickstart.md
delete mode 100644 docs/reference/api/index.md
delete mode 100644 docs/reference/api/preprocessing/base.md
delete mode 100644 docs/reference/api/preprocessing/regular/ctgan_preprocessor.md
delete mode 100644 docs/reference/api/preprocessing/regular/preprocessor.md
delete mode 100644 docs/reference/api/synthesizers/cgan.md
delete mode 100644 docs/reference/api/synthesizers/gan.md
delete mode 100644 docs/reference/api/synthesizers/regular/cgan.md
delete mode 100644 docs/reference/api/synthesizers/regular/cramergan.md
delete mode 100644 docs/reference/api/synthesizers/regular/ctgan.md
delete mode 100644 docs/reference/api/synthesizers/regular/cwgangp.md
delete mode 100644 docs/reference/api/synthesizers/regular/dragan.md
delete mode 100644 docs/reference/api/synthesizers/regular/gan.md
delete mode 100644 docs/reference/api/synthesizers/regular/vanilllagan.md
delete mode 100644 docs/reference/api/synthesizers/regular/wgan.md
delete mode 100644 docs/reference/api/synthesizers/regular/wgan_gp.md
delete mode 100644 docs/reference/api/synthesizers/timeseries/doppelganger.md
delete mode 100644 docs/reference/api/synthesizers/timeseries/timegan.md
delete mode 100644 docs/reference/changelog.md
delete mode 100644 docs/support/analytics.md
delete mode 100644 docs/support/contribute.md
delete mode 100644 docs/synthetic_data/streamlit_app.md
create mode 100644 docs/synthetic_data/ydata_fabric_app.md
diff --git a/README.md b/README.md
index c022af6c..025f5de3 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
![](https://img.shields.io/github/workflow/status/ydataai/ydata-synthetic/prerelease)
![](https://img.shields.io/pypi/status/ydata-synthetic)
[![](https://pepy.tech/badge/ydata-synthetic)](https://pypi.org/project/ydata-synthetic/)
-![](https://img.shields.io/badge/python-3.9%20%7C%203.10-blue)
+![](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
[![](https://img.shields.io/pypi/v/ydata-synthetic)](https://pypi.org/project/ydata-synthetic/)
![](https://img.shields.io/github/license/ydataai/ydata-synthetic)
@@ -11,14 +11,15 @@
Join us on [![Discord](https://img.shields.io/badge/Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://tiny.ydata.ai/dcai-ydata-synthetic)
# YData Synthetic
-A package to generate synthetic tabular and time-series data leveraging the state of the art generative models.
-
-## 🎊 The exciting features:
-> These are must try features when it comes to synthetic data generation:
- > - A new streamlit app that delivers the synthetic data generation experience with a UI interface. A low code experience for the quick generation of synthetic data
- > - A new fast synthetic data generation model based on Gaussian Mixture. So you can quickstart in the world of synthetic data generation without the need for a GPU.
- > - A conditional architecture for tabular data: CTGAN, which will make the process of synthetic data generation easier and with higher quality!
-
+`YData-Synthetic` is an open-source package developed in 2020 with the primary goal of educating users about generative models for synthetic data generation.
+Designed as a collection of models, it was intended for exploratory studies and educational purposes.
+However, it was not optimized for the quality, performance, and scalability needs typically required by organizations.
+
+!!! note "Update"
+    Even though the journey was fun and we have learned a lot from the community, it is now time to upgrade `ydata-synthetic`.
+    Heading towards the future of synthetic data generation, we recommend that users transition to `ydata-sdk`, which provides a superior experience with enhanced performance,
+ precision, and ease of use, making it the preferred tool for synthetic data generation and a perfect introduction to Generative AI.
+
## Synthetic data
### What is synthetic data?
Synthetic data is artificially generated data that is not collected from real world events. It replicates the statistical components of real data without containing any identifiable information, ensuring individuals' privacy.
@@ -32,68 +33,50 @@ Synthetic data can be used for many applications:
> **Looking for an end-to-end solution to Synthetic Data Generation?**
> [YData Fabric](https://ydata.ai/products/synthetic_data) enables the generation of high-quality datasets within a full UI experience, from data preparation to synthetic data generation and evaluation.
-> Check out the [Community Version](https://ydata.ai/ydata-fabric-free-trial).
-
-
-# ydata-synthetic
-This repository contains material related with architectures and models for synthetic data, from Generative Adversarial Networks (GANs) to Gaussian Mixtures.
-The repo includes a full ecosystem for synthetic data generation, that includes different models for the generation of synthetic structure data and time-series.
-All the Deep Learning models are implemented leveraging Tensorflow 2.0.
-Several example Jupyter Notebooks and Python scripts are included, to show how to use the different architectures.
-
-Are you ready to learn more about synthetic data and the bext-practices for synthetic data generation?
-
-## Quickstart
-The source code is currently hosted on GitHub at: https://github.com/ydataai/ydata-synthetic
-
-Binary installers for the latest released version are available at the [Python Package Index (PyPI).](https://pypi.org/project/ydata-synthetic/)
-```commandline
-pip install ydata-synthetic
-```
+> Check out the [Community Version](https://ydata.ai/register).
-### The UI guide for synthetic data generation
-YData synthetic has now a UI interface to guide you through the steps and inputs to generate structure tabular data.
-The streamlit app is available form *v1.0.0* onwards, and supports the following flows:
-- Train a synthesizer model
-- Generate & profile synthetic data samples
+## ydata-synthetic to ydata-sdk
+With the upcoming update of `ydata-synthetic` to `ydata-sdk`, users will now have access to a single API that automatically selects and optimizes
+the best generative model for their data. This streamlined approach eliminates the need to choose between
+various models manually, as the API intelligently identifies the optimal model based on the specific dataset and use case.
-#### Installation
+Instead of having to manually select from models such as:
-```commandline
-pip install ydata-synthetic[streamlit]
-```
-#### Quickstart
-Use the code snippet below in a python file (Jupyter Notebooks are not supported):
-```python
-from ydata_synthetic import streamlit_app
+- [GAN](https://arxiv.org/abs/1406.2661)
+- [CGAN](https://arxiv.org/abs/1411.1784) (Conditional GAN)
+- [WGAN](https://arxiv.org/abs/1701.07875) (Wasserstein GAN)
+- [WGAN-GP](https://arxiv.org/abs/1704.00028) (Wasserstein GAN with Gradient Penalty)
+- [DRAGAN](https://arxiv.org/pdf/1705.07215.pdf) (Deep Regret Analytic GAN)
+- [Cramer GAN](https://arxiv.org/abs/1705.10743) (Cramer Distance Solution to Biased Wasserstein Gradients)
+- [CWGAN-GP](https://cameronfabbri.github.io/papers/conditionalWGAN.pdf) (Conditional Wasserstein GAN with Gradient Penalty)
+- [CTGAN](https://arxiv.org/pdf/1907.00503.pdf) (Conditional Tabular GAN)
+- [TimeGAN](https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf) (specifically for *time-series* data)
+- [DoppelGANger](https://dl.acm.org/doi/pdf/10.1145/3419394.3423643) (specifically for *time-series* data)
-streamlit_app.run()
-```
+The new API handles model selection automatically, optimizing for the best performance in fidelity, utility, and privacy.
+This significantly simplifies the synthetic data generation process, ensuring that users get the highest quality output without
+the need for manual intervention and tedious hyperparameter tuning.
-Or use the file streamlit_app.py that can be found in the [examples folder](https://github.com/ydataai/ydata-synthetic/tree/master/examples/streamlit_app.py).
+Are you ready to learn more about synthetic data and the best practices for synthetic data generation?
+For more materials on [synthetic data generation with Python see the documentation](https://docs.fabric.ydata.ai/latest/sdk/).
+## Quickstart
+Binary installers for the latest released version are available at the [Python Package Index (PyPI).](https://pypi.org/project/ydata-sdk/)
```commandline
-python -m streamlit_app
+pip install ydata-sdk
```
-The below models are supported:
- - CGAN
- - WGAN
- - WGANGP
- - DRAGAN
- - CRAMER
- - CTGAN
+### The UI guide for synthetic data generation
-[![Watch the video](assets/streamlit_app.png)](https://youtu.be/ep0PhwsFx0A)
+YData Fabric offers a guided UI to walk you through the steps and inputs needed to generate structured data.
+You can experiment today with [YData Fabric by registering the Community version](https://ydata.ai/register).
### Examples
Here you can find usage examples of the package and models to synthesize tabular data.
- - Fast tabular data synthesis on adult census income dataset [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ydataai/ydata-synthetic/blob/master/examples/regular/models/Fast_Adult_Census_Income_Data.ipynb)
- - Tabular synthetic data generation with CTGAN on adult census income dataset [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ydataai/ydata-synthetic/blob/master/examples/regular/models/CTGAN_Adult_Census_Income_Data.ipynb)
- - Time Series synthetic data generation with TimeGAN on stock dataset [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ydataai/ydata-synthetic/blob/master/examples/timeseries/TimeGAN_Synthetic_stock_data.ipynb)
- - Time Series synthetic data generation with DoppelGANger on FCC MBA dataset [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ydataai/ydata-synthetic/blob/master/examples/timeseries/DoppelGANger_FCC_MBA_Dataset.ipynb)
- - More examples are continuously added and can be found in `/examples` directory.
+ - Tabular [synthetic data generation on Titanic Kaggle dataset](https://github.com/ydataai/ydata-sdk/blob/main/examples/synthesizers/regular_quickstart.py)
+ - Time Series [synthetic data generation](https://github.com/ydataai/ydata-sdk/blob/main/examples/synthesizers/time_series_quickstart.py)
+ - More examples are continuously added and can be found in the [examples directory](https://github.com/ydataai/ydata-sdk/tree/main/examples).
### Datasets for you to experiment
Here are some example datasets for you to try with the synthesizers:
@@ -108,7 +91,7 @@ Here are some example datasets for you to try with the synthesizers:
## Project Resources
-In this repository you can find the several GAN architectures that are used to create synthesizers:
+Find below useful literature on how to generate synthetic data and the available generative models:
### Tabular data
- [GAN](https://arxiv.org/abs/1406.2661)
@@ -125,11 +108,6 @@ In this repository you can find the several GAN architectures that are used to c
- [TimeGAN](https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf)
- [DoppelGANger](https://dl.acm.org/doi/pdf/10.1145/3419394.3423643)
-## Contributing
-We are open to collaboration! If you want to start contributing you only need to:
- 1. Search for an issue in which you would like to work. Issues for newcomers are labeled with good first issue.
- 2. Create a PR solving the issue.
- 3. We would review every PRs and either accept or ask for revisions.
## Support
For support in using this library, please join our Discord server. Our Discord community is very friendly and great about quickly answering questions about the use and development of the library. [Click here to join our Discord community!](https://tiny.ydata.ai/dcai-ydata-synthetic)
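
To complement the Quickstart above, here is a minimal sketch of generating tabular synthetic data with `ydata-sdk`; the import path and method names follow the ydata-sdk examples linked in the README and should be treated as assumptions, with the ydata-sdk documentation as the authoritative reference (account token setup is also described there):

```python
# Hedged quickstart sketch for ydata-sdk's RegularSynthesizer (names assumed from the linked examples).
import pandas as pd
from ydata.sdk.synthesizers import RegularSynthesizer

data = pd.read_csv("census.csv")              # any tabular dataset

synth = RegularSynthesizer()
synth.fit(data)                               # the SDK selects and tunes the generative model
synthetic_sample = synth.sample(n_samples=1000)
print(synthetic_sample.head())
```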
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index d633f51d..bcead052 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -1,22 +1,22 @@
-`ydata-synthetic` is available through PyPi, allowing an easy process of installation and integration with the data science programing environments (Google Colab, Jupyter Notebooks, Visual Studio Code, PyCharm) and stack (`pandas`, `numpy`, `scikit-learn`).
+`ydata-sdk` is available through PyPi, allowing an easy process of installation and integration with data science programming environments (Google Colab, Jupyter Notebooks, Visual Studio Code, PyCharm) and stack (`pandas`, `numpy`, `scikit-learn`).
##Installing the package
-Currently, the package supports **python versions over 3.9**, and can be installed in Windows, Linux or MacOS operating systems.
+Currently, the package supports **Python versions from 3.9 up to 3.12**, and can be installed in Windows, Linux or MacOS operating systems.
Prior to the package installation, it is recommended the creation of a virtual or `conda` environment:
=== "conda"
``` commandline
- conda create -n synth-env python=3.10
+ conda create -n synth-env python=3.12
conda activate synth-env
```
-The above command creates and activates a new environment called "synth-env" with Python version 3.10.X. In the new environment, you can then install `ydata-synthetic`:
+The above command creates and activates a new environment called "synth-env" with Python version 3.12.X. In the new environment, you can then install `ydata-sdk`:
=== "pypi"
``` commandline
- pip install ydata-synthetic==1.1.0
+ pip install ydata-sdk
```
:fontawesome-brands-youtube:{ style="color: #EE0F0F" }
@@ -27,20 +27,7 @@ The above command creates and activates a new environment called "synth-env" wit
To install inside a Google Colab notebook, you can use the following:
``` commandline
-!pip install ydata-synthetic==1.1.0
+!pip install ydata-sdk
```
-Make sure your Google Colab is running Python versions `>=3.9, <3.11`. Learn how to configure Python versions on Google Colab [here](https://stackoverflow.com/questions/68657341/how-can-i-update-google-colabs-python-version/68658479#68658479).
-
-
-## Installing the Streamlit App
-Since version 1.0.0, the `ydata-synthetic` includes a GUI experience provided by a Streamlit app. The UI supports the data synthesization process from reading the data to profiling the synthetic data generation, and can be installed as follows:
-
-``` commandline
-pip install "ydata-synthetic[streamlit]"
-```
-
-Note that Jupyter or Colab Notebooks are not yet supported, so use it in your Python environment.
-
-
-
+Make sure your Google Colab is running Python versions `>=3.9, <=3.12`. Learn how to configure Python versions on Google Colab [here](https://stackoverflow.com/questions/68657341/how-can-i-update-google-colabs-python-version/68658479#68658479).
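
A quick way to confirm the environment picked up the package after installation; this assumes the `ydata-sdk` distribution exposes the `ydata.sdk` namespace, as its examples suggest:

```commandline
python -c "from ydata.sdk.synthesizers import RegularSynthesizer; print('ydata-sdk is ready')"
```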
diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md
deleted file mode 100644
index 3814bb39..00000000
--- a/docs/getting-started/quickstart.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Quickstart
-
-`ydata-synthetic` is equipped to handle both **tabular** (comprising numeric and categorical features) and sequential, **time-series** data. In this section we explain how you can **quickstart the synthesization** of tabular and time-series datasets.
-
-## Synthesizing a Tabular Dataset
-The following example showcases how to synthesize the [Adult Census Income](https://www.kaggle.com/datasets/uciml/adult-census-income) dataset with CTGAN:
-=== "Tabular Data"
- ```python
- # Import the necessary modules
- from pmlb import fetch_data
- from ydata_synthetic.synthesizers.regular import RegularSynthesizer
- from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
-
- # Load data
- data = fetch_data('adult')
- num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
- cat_cols = ['workclass','education', 'education-num', 'marital-status',
- 'occupation', 'relationship', 'race', 'sex', 'native-country', 'target']
-
- # Define model and training parameters
- ctgan_args = ModelParameters(batch_size=500, lr=2e-4, betas=(0.5, 0.9))
- train_args = TrainParameters(epochs=501)
-
- # Train the generator model
- synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
- synth.fit(data=data, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)
-
- # Generate 1000 new synthetic samples
- synth_data = synth.sample(1000)
- ```
-
-## Synthesizing a Time-Series Dataset
-The following example showcases how to synthesize the [Yahoo Stock Price](https://www.kaggle.com/datasets/arashnic/time-series-forecasting-with-yahoo-stock-price) dataset with TimeGAN:
-=== "Time-Series Data"
- ```python
- # Import the necessary modules
- import pandas as pd
- from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer
- from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
-
- # Define model parameters
- gan_args = ModelParameters(batch_size=128,
- lr=5e-4,
- noise_dim=32,
- layers_dim=128,
- latent_dim=24,
- gamma=1)
-
- train_args = TrainParameters(epochs=50000,
- sequence_length=24,
- number_sequences=6)
-
- # Read the data
- stock_data = pd.read_csv("stock_data.csv")
-
- # Training the TimeGAN synthesizer
- synth = TimeSeriesSynthesizer(modelname='timegan', model_parameters=gan_args)
- synth.fit(stock_data, train_args, num_cols=list(stock_data.columns))
-
- # Generating new synthetic samples
- synth_data = synth.sample(n_samples=500)
- ```
-
-## Running the Streamlit App
-Once the package is [installed](installation.md) with the "streamlit" extra, the app can be launched as:
-
-=== "Streamlit App"
- ```python
- from ydata_synthetic import streamlit_app
-
- streamlit_app.run()
- ```
-
-The console will then output the URL from which the app can be accessed.
-
-:fontawesome-brands-youtube:{ style="color: #EE0F0F" } Here's a [quick example](https://www.youtube.com/watch?v=6Lzi26szKNo&t=4s) of how to synthesize data with the Streamlit App – :octicons-clock-24: 5min
-
-
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index a95ff9ac..57ebd27b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -3,7 +3,7 @@
-
-
-## Installation
-
-pip install ydata-synthetic[streamlit]
-
-## Quickstart
-
-Use the code snippet below in a python file:
-
-!!! warning "Use python scripts"
-
-    I know you probably love Jupyter Notebooks or Google Colab, but make sure that you start your
-    synthetic data generation streamlit app from a python script as notebooks are not supported!
-
-``` py
-    from ydata_synthetic import streamlit_app
-    streamlit_app.run()
-```
-
-Or use the file streamlit_app.py that can be found in the [examples folder]().
-
-``` py
-    python -m streamlit_app
-```
-
-The below models are supported:
-
-- [ydata-sdk Synthetic Data generator](https://docs.sdk.ydata.ai/0.6/examples/synthesize_tabular_data/)
-- CGAN
-- WGAN
-- WGANGP
-- DRAGAN
-- CRAMER
-- CTGAN
-
diff --git a/docs/synthetic_data/time_series/doppelganger_example.md b/docs/synthetic_data/time_series/doppelganger_example.md
index b418879b..2cd125fa 100644
--- a/docs/synthetic_data/time_series/doppelganger_example.md
+++ b/docs/synthetic_data/time_series/doppelganger_example.md
@@ -1,5 +1,11 @@
 # Synthesize time-series data
 
+!!! note "Outdated"
+    Note that this example won't work with the latest version of `ydata-synthetic`.
+
+    Please check `ydata-sdk` to see [how to generate synthetic time-series data](https://docs.fabric.ydata.ai/latest/sdk/examples/synthesize_timeseries_data/).
+
+
 **Using *DoppelGANger* to generate synthetic time-series data:**
 
 Although tabular data may be the most frequently discussed type of data, a great number of real-world domains — from traffic and daily trajectories to stock prices and energy consumption patterns — produce **time-series data** which introduces several aspects of complexity to synthetic data generation.
diff --git a/docs/synthetic_data/time_series/timegan_example.md b/docs/synthetic_data/time_series/timegan_example.md
index b4c36c3f..ec2fbe87 100644
--- a/docs/synthetic_data/time_series/timegan_example.md
+++ b/docs/synthetic_data/time_series/timegan_example.md
@@ -1,6 +1,20 @@
 # Synthesize time-series data
 
-**Using *TimeGAN* to generate synthetic time-series data:**
+!!! note "Outdated"
+    Note that this example won't work with the latest version of `ydata-synthetic`.
+
+    Please check `ydata-sdk` to see [how to generate synthetic time-series data](https://docs.fabric.ydata.ai/latest/sdk/examples/synthesize_timeseries_data/).
+
+## Why YData Fabric vs TimeGAN for time-series data
+YData Fabric offers advanced capabilities for time-series synthetic data generation, surpassing TimeGAN in terms of flexibility,
+scalability, and ease of use. With YData Fabric, users can generate high-quality synthetic time-series data while benefiting from built-in data profiling tools
+that ensure the integrity and consistency of the data. Unlike TimeGAN, which is a single model for time-series, YData Fabric offers a solution that is suitable for different types of datasets and behaviours.
+Additionally, YData Fabric is designed for scalability, enabling seamless handling of large, complex time-series datasets. Its guided UI makes it easy to adapt to different time-series scenarios,
+from healthcare to financial data, making it a more comprehensive and flexible solution for time-series data generation.
+
+For more on [YData Fabric vs Synthetic data generation with TimeGAN read this blogpost](https://ydata.ai/resources/the-best-generative-ai-model-for-time-series-synthetic-data-generation).
+
+## Using *TimeGAN* to generate synthetic time-series data
 
 Although tabular data may be the most frequently discussed type of data, a great number of real-world domains — from traffic and daily trajectories to stock prices and energy consumption patterns — produce **time-series data** which introduces several aspects of complexity to synthetic data generation.
diff --git a/docs/synthetic_data/ydata_fabric_app.md b/docs/synthetic_data/ydata_fabric_app.md
new file mode 100644
index 00000000..ddf2ea44
--- /dev/null
+++ b/docs/synthetic_data/ydata_fabric_app.md
@@ -0,0 +1,28 @@
+# The UI guided experience for Synthetic Data generation
+
+[YData Fabric provides a robust, guided user interface (UI) specifically designed to streamline synthetic data generation](https://ydata.ai/products/fabric).
+This interface is tailored to support users at every level, ensuring that both novice users and experienced data scientists can efficiently generate
+synthetic datasets while adhering to best practices.
+
+## Step-by-Step Workflow
+The YData Fabric UI organizes the synthetic data generation process into a structured, step-by-step workflow.
+Each stage of the process is clearly defined and supported by guidance within the interface, helping users navigate tasks like data profiling,
+metadata and synthesizer configuration and synthetic data quality evaluation.
+
+- **Data Upload and Profiling:** Users start by uploading their datasets directly into the platform. YData Fabric’s profiling tool automatically scans
+the data, generating insights into key attributes such as data distributions, correlations, and missing values.
+These insights are presented in an intuitive, visual format, ensuring users can quickly assess the quality and structure of their data.
+- **Alerts for Data Issues:** The UI will alert users to potential issues such as data imbalances, outliers, or incomplete fields that may affect the
+quality of the synthetic data.
+- **Synthetic Data Generation Model Configuration:** Once the data is profiled, the UI supports metadata configuration (categorical, numerical, dates, etc),
+anonymization integration.
+- **Model Performance Insights:** During the model training phase, YData Fabric monitors key performance indicators (KPIs) like fidelity, utility and privacy.
+These KPIs, such as data fidelity and privacy scores, are displayed on the dashboard, allowing users to evaluate how closely the synthetic data aligns with the original dataset.
+- **Customization and Advanced Controls:** For more experienced users, YData Fabric provides customization options within the guided UI.
+Users have access to advanced settings, such as [conditional synthetic data generation](https://ydata.ai/resources/conditional-synthetic-data-generation-for-robust-machine-learning-applications) or business rules.
+- **Preserving Data Integrity:** For datasets requiring strict adherence to structural patterns (e.g., time-series data, healthcare records or databases).
+
+### Getting started with YData Fabric (Community version)
+YData Fabric’s Community Version offers users a free, accessible entry point to explore synthetic data generation.
+To get started, users can [sign up for the Community Version and access the guided UI directly](https://ydata.ai/register).
+Once registered, users are provided with a range of features, including data profiling, synthetic data generation, pipelines and access to YData’s proprietary models for data quality!
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index c8c15442..f5db0ca8 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -9,10 +9,9 @@ nav:
     - 'index.md'
     - Overview: 'index.md'
     - Installation: 'getting-started/installation.md'
-    - Quickstart: 'getting-started/quickstart.md'
-  - Synthetic Data Generation: 
+  - Synthetic Data Generation:
     - "synthetic_data/index.md"
-    - UI interface - Streamlit app: "synthetic_data/streamlit_app.md"
+    - UI interface - YData Fabric: "synthetic_data/ydata_fabric_app.md"
     - Generate Single Table:
       - GMM: "synthetic_data/single_table/gmm_example.md"
       - CGAN: "synthetic_data/single_table/cgan_example.md"
@@ -32,31 +31,6 @@ nav:
     - Great Expectations: "integrations/gx_integration.md"
   - Support:
     - Help & Troubleshooting: 'support/help-troubleshooting.md'
-    - Contribution Guidelines: 'support/contribute.md'
-    - Contribution Guidelines: 'support/contribute.md'
-    - Analytics: 'support/analytics.md'
-  - Reference:
-    - Changelog: 'reference/changelog.md'
-    - API:
-      - Synthesizers:
-        - GAN: 'reference/api/synthesizers/regular/gan.md'
-        - Regular:
-          - CGAN: 'reference/api/synthesizers/regular/cgan.md'
-          - CRAMERGAN: 'reference/api/synthesizers/regular/cramergan.md'
-          - CTGAN: 'reference/api/synthesizers/regular/ctgan.md'
-          - CWGANGP: 'reference/api/synthesizers/regular/cwgangp.md'
-          - DRAGAN: 'reference/api/synthesizers/regular/dragan.md'
-          - VANILLAGAN: 'reference/api/synthesizers/regular/vanillagan.md'
-          - WGAN_GP: 'reference/api/synthesizers/regular/wgan_gp.md'
-          - WGAN: 'reference/api/synthesizers/regular/wgan.md'
-        - Timeseries:
-          - TimeGAN: 'reference/api/synthesizers/timeseries/timegan.md'
-          - DoppelGANger: 'reference/api/synthesizers/timeseries/doppelganger.md'
-      - Preprocessing:
-        - BaseProcessor: 'reference/api/preprocessing/base.md'
-        - Regular:
-          - RegularDataProcessor: 'reference/api/preprocessing/regular/preprocessor.md'
-          - CTGANDataProcessor: 'reference/api/preprocessing/regular/ctgan_preprocessor.md'
 theme:
   name: material
   language: en
diff --git a/setup.py b/setup.py
index 05c2b92b..baa72e29 100644
--- a/setup.py
+++ b/setup.py
@@ -42,19 +42,10 @@
       keywords='data science ydata',
       url='https://github.com/ydataai/ydata-synthetic',
       license="https://github.com/ydataai/ydata-synthetic/blob/master/LICENSE",
-      python_requires=">=3.9, <3.12",
+      python_requires=">=3.9, <3.13",
       packages=find_namespace_packages('src'),
       package_dir={'':'src'},
       include_package_data=True,
       options={"bdist_wheel": {"universal": True}},
       install_requires=requirements,
-      extras_require={
-          "streamlit": [
-              "streamlit==1.29.0",
-              "typing-extensions>=3.10.0",
-              "streamlit_pandas_profiling==0.1.3",
-              "ydata-profiling<5",
-              "ydata-sdk>=0.2.1",
-          ],
-      },
       )