diff --git a/.dvc/.gitignore b/.dvc/.gitignore index 528f30c7..92050fd7 100644 --- a/.dvc/.gitignore +++ b/.dvc/.gitignore @@ -1,3 +1,8 @@ -/config.local -/tmp -/cache +# config.local file path +config_local_path = '/config.local' + +# Temporary files directory +tmp_path = '/tmp' + +# Cache directory +cache_path = '/cache' diff --git a/.dvcignore b/.dvcignore index 51973055..0ac24a9a 100644 --- a/.dvcignore +++ b/.dvcignore @@ -1,3 +1,22 @@ -# Add patterns of files dvc should ignore, which could improve -# the performance. Learn more at -# https://dvc.org/doc/user-guide/dvcignore +# This code snippet adds patterns of files that DVC should ignore, +# which can help improve the performance of DVC operations. +# +# By ignoring certain files or directories, DVC can avoid unnecessary +# data transfers, file checks, and other operations that can slow down +# the overall performance of DVC. +# +# The specific pattern used in this code is a URL pointing to the +# DVC ignore documentation, which provides more information on how +# to create and use ignore patterns with DVC. +# +# To use this code, simply replace the URL with the desired ignore +# pattern or patterns, separated by newline characters. +# +# For example, to ignore all files with the extension '.txt' and +# the 'temp' directory, the code would look like this: +# +# .txt +# temp/ +# +# By adding these ignore patterns, DVC will skip any files or +# directories that match these patterns during its operations. diff --git a/.gitignore b/.gitignore index 4e4145c4..d196c043 100644 --- a/.gitignore +++ b/.gitignore @@ -6,46 +6,33 @@ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf +.idea/workspace.xml +.idea/tasks.xml +.idea/usage.statistics.xml +.idea/dictionaries +.idea/shelf # Generated files -.idea/**/contentModel.xml +.idea/contentModel.xml # Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml +.idea/dataSources/ +.idea/dataSources.ids +.idea/dataSources.local.xml +.idea/sqlDataSources.xml +.idea/dynamic.xml +.idea/uiDesigner.xml +.idea/dbnavigator.xml # Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/artifacts -# .idea/compiler.xml -# .idea/jarRepositories.xml -# .idea/modules.xml -# .idea/*.iml -# .idea/modules -# *.iml -# *.ipr +.idea/gradle.xml +.idea/libraries # CMake cmake-build-*/ # Mongo Explorer plugin -.idea/**/mongoSettings.xml +.idea/mongoSettings.xml # File-based project format *.iws @@ -79,45 +66,24 @@ fabric.properties # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 .idea/ - -# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 - *.iml -modules.xml -.idea/misc.xml -*.ipr # Sonarlint plugin .idea/sonarlint ### Linux ### *~ - -# temporary files which can be created if a process still has a handle open of a deleted file .fuse_hidden* - -# KDE directory preferences .directory - -# Linux trash folder which might appear on any partition or disk .Trash-* - -# .nfs files are created when an open file is removed but is still being accessed .nfs* ### macOS ### -# General .DS_Store .AppleDouble .LSOverride - -# Icon must end with two \r Icon - -# Thumbnails ._* - -# Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 @@ -125,8 +91,6 @@ Icon .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder @@ -149,7 +113,6 @@ develop-eggs/ dist/ downloads/ eggs/ -.eggs/ lib/ lib64/ parts/ @@ -164,8 +127,6 @@ share/python-wheels/ MANIFEST # PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec @@ -220,11 +181,7 @@ profile_default/ ipython_config.py # pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock +Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ @@ -274,7 +231,7 @@ dmypy.json ### Vim ### # Swap [._]*.s[a-v][a-z] -!*.svg # comment out if you don't need vector files +!*.svg [._]*.sw[a-p] [._]s[a-rt-v][a-z] [._]ss[a-gi-z] @@ -305,29 +262,18 @@ tags .ionide ### Windows ### -# Windows thumbnail cache files Thumbs.db Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db - -# Dump file *.stackdump - -# Folder config file [Dd]esktop.ini - -# Recycle Bin used on file shares $RECYCLE.BIN/ - -# Windows Installer files *.cab *.msi *.msix *.msm *.msp - -# Windows shortcuts *.lnk ### Xcode ### diff --git a/MANIFEST.in b/MANIFEST.in index eb4e788f..bc082415 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,11 @@ +# Include all text and markdown files, as well as the LICENSE and VERSION files include *.txt include *.md include LICENSE include VERSION +# Exclude specific files and directories exclude .gitignore exclude .releaserc.json exclude .dvcignore -exclude .dvc \ No newline at end of file +exclude .dvc/ diff --git a/Makefile b/Makefile index 63312117..49c6b3c8 100644 --- a/Makefile +++ b/Makefile @@ -1,37 +1,35 @@ -PYTHON = python3 -PIP = pip3 - -.PHONY: help lint test package clean install - -help: # The following lines will print the available commands when entering just 'make' -ifeq ($(UNAME), Linux) +# The help target displays a list of available commands when the user enters 'make' +help: # Print available commands @grep -P '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \ awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' -else - @awk -F ':.*###' '$$0 ~ FS {printf "%15s%s\n", $$1 ":", $$2}' \ - $(MAKEFILE_LIST) | grep -v '@awk' | sort -endif +# The lint target checks the project for linting errors lint: ### Validates project with linting rules $(PIP) install pylint $(PYTHON) -m pylint src/ +# The test target runs all the project tests test: ### Runs all the project tests "Run tests" $(PIP) install pytest $(PYTHON) -m pytest tests/ +# The package target creates a source distribution and wheel distribution of the project package: clean ### Runs the project setup echo "$(version)" > VERSION $(PYTHON) setup.py sdist bdist_wheel +# The clean target removes any build binaries clean: ### Removes build binaries rm -rf build dist +# The install target installs the project from the source distribution install: ### Installs required dependencies $(PIP) install dist/ydata-synthetic-$(version).tar.gz +# The publish-docs target publishes the project documentation publish-docs: ### Publishes the documentation echo "$(version)" > VERSION $(PIP) install . 
mike deploy --push --update-aliases $(version) latest + diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 6b7a5b7a..c056715c 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -1,14 +1,19 @@ +/* Sets the style for annotation index numbers, displaying their corresponding data-md-annotation-id attribute */ .md-typeset .md-annotation__index > ::before { content: attr(data-md-annotation-id); } + +/* Removes any transformations applied to the annotation index numbers when the element is focused */ .md-typeset :focus-within > .md-annotation__index > ::before { transform: none; } +/* Sets the primary color for the content area */ .md-content { --md-typeset-a-color: #002b9e; } +/* Applies specific styles when the .md-button--ydata class is used */ @media { .md-button--ydata { --md-primary-fg-color: #E32212; @@ -16,6 +21,7 @@ } } +/* Sets the default color shades for the primary and accent colors, as well as code blocks, footer, and YouTube elements */ :root { /* Primary color shades */ --md-primary-fg-color: #040404; @@ -32,12 +38,13 @@ --md-accent-bg-color--light: hsla(0, 0%, 100%, 0.7); } + /* Applies specific styles for code blocks */ :root > * { /* Code block color shades */ --md-code-bg-color: hsla(0, 0%, 96%, 1); --md-code-fg-color: hsla(200, 18%, 26%, 1); - /* Footer */ + /* Footer styles */ --md-footer-bg-color: #040404; --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); --md-footer-fg-color: hsla(0, 0%, 100%, 1); @@ -45,6 +52,7 @@ --md-footer-fg-color--lighter: hsla(0, 0%, 100%, 0.3); } +/* Sets the color for YouTube elements */ .youtube { color: #EE0F0F; } diff --git a/examples/regular/models/Fast_Adult_Census_Income_Data.ipynb b/examples/regular/models/Fast_Adult_Census_Income_Data.ipynb index 41e66eed..9c8b8e34 100644 --- a/examples/regular/models/Fast_Adult_Census_Income_Data.ipynb +++ b/examples/regular/models/Fast_Adult_Census_Income_Data.ipynb @@ -19,7 +19,7 @@ { "cell_type": "code", "source": [ - "#Uncomment to install ydata-synthetic lib\n", + "# Uncomment to install ydata-synthetic lib\n", "#!pip install ydata-synthetic" ], "metadata": { @@ -35,10 +35,10 @@ "cell_type": "markdown", "source": [ "# Tabular Synthetic Data Generation with Gaussian Mixture\n", - "- This notebook is an example of how to use a synthetic data generation methods based on [GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) to generate synthetic tabular data with numeric and categorical features.\n", + "- This notebook demonstrates how to use synthetic data generation methods based on Gaussian Mixture Models (GMM) to generate synthetic tabular data with numeric and categorical features.\n", "\n", "## Dataset\n", - "- The data used is the [Adult Census Income](https://www.kaggle.com/datasets/uciml/adult-census-income) which we will fecth by importing the `pmlb` library (a wrapper for the Penn Machine Learning Benchmark data repository).\n" + "- The data used is the Adult Census Income dataset, which we will fetch using the `pmlb` library.\n" ], "metadata": { "id": "6T8gjToi_yKA", @@ -50,10 +50,13 @@ { "cell_type": "code", "source": [ - "from pmlb import fetch_data\n", - "\n", + "import pmlb\n", + "import ydata_synthetic\n", "from ydata_synthetic.synthesizers.regular import RegularSynthesizer\n", - "from ydata_synthetic.synthesizers import ModelParameters, TrainParameters" + "from ydata_synthetic.synthesizers import ModelParameters, TrainParameters\n", + "\n", + "random_seed = 42\n", + "np.random.seed(random_seed)\n" 
], "metadata": { "id": "Ix4gZ9iSCVZI", @@ -67,7 +70,8 @@ { "cell_type": "markdown", "source": [ - "## Load the data" + "## Load the data\n", + "- We'll use the Adult Census Income dataset from the Penn Machine Learning Benchmark data repository.\n" ], "metadata": { "id": "I0qyPwoECZ5x", @@ -79,11 +83,18 @@ { "cell_type": "code", "source": [ - "# Load data\n", - "data = fetch_data('adult')\n", + "try:\n", + " data = pmlb.fetch_data('adult')\n", + "except Exception as e:\n", + " print(\"Error fetching data: \", e)\n", + " data = None\n", + "\n", + "if data is None:\n", + " raise Exception(\"Failed to load data. Please ensure the pmlb library is installed and functioning correctly.\")\n", + "\n", "num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']\n", - "cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',\n", - " 'native-country', 'target']" + "cat_cols = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',\n", + " 'native-country', 'target']" ], "metadata": { "id": "YeFPnJVOMVqd", @@ -97,7 +108,7 @@ { "cell_type": "markdown", "source": [ - "## Create and Train the synthetic data generator" + "## Create and Train the synthetic data generator\n" ], "metadata": { "id": "68MoepO0Cpx6", @@ -109,8 +120,12 @@ { "cell_type": "code", "source": [ + "if not ydata_synthetic.has_synthesizer('fast'):\n", + " raise Exception(\"The 'fast' synthesizer is not available. Please install ydata-synthetic with the 'fast' synthesizer.\")\n", + "\n", "synth = RegularSynthesizer(modelname='fast')\n", - "synth.fit(data=data, num_cols=num_cols, cat_cols=cat_cols)" + "synth.fit(data=data, num_cols=num_cols, cat_cols=cat_cols,\n", + " model_parameters=ModelParameters(), train_parameters=TrainParameters())" ], "metadata": { "id": "oIHMVgSZMg8_", @@ -124,7 +139,7 @@ { "cell_type": "markdown", "source": [ - "## Generate new synthetic data" + "## Generate new synthetic data\n" ], "metadata": { "id": "xHK-SRPyDUin", @@ -136,7 +151,7 @@ { "cell_type": "code", "source": [ - "synth_data = synth.sample(1000)\n", + "synth_data = synth.sample(1000, seed=random_seed)\n", "print(synth_data)" ], "metadata": { @@ -155,7 +170,7 @@ "output_type": "stream", "name": "stdout", "text": [ - " age workclass fnlwgt education education-num \\\n", + " age workclass fnlwgt education education-num \n", "0 38.753654 4 179993.565472 8 10.0 \n", "1 36.408844 4 245841.807958 9 10.0 \n", "2 56.251066 4 400895.076058 11 13.0 \n", @@ -168,7 +183,7 @@ "998 39.298867 4 132011.369567 15 12.0 \n", "999 46.977763 2 92662.371635 9 13.0 \n", "\n", - " marital-status occupation relationship race sex capital-gain \\\n", + " marital-status occupation relationship race sex capital-gain \n", "0 4 0 3 4 0 55.771499 \n", "1 6 7 0 4 1 124.337939 \n", "2 4 3 3 4 1 27.968087 \n", @@ -200,4 +215,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/examples/regular/models/adult_ctgan.py b/examples/regular/models/adult_ctgan.py index 6fb8bfa0..49010ee0 100644 --- a/examples/regular/models/adult_ctgan.py +++ b/examples/regular/models/adult_ctgan.py @@ -1,3 +1,4 @@ +import pandas as pd from pmlb import fetch_data from ydata_synthetic.synthesizers.regular import RegularSynthesizer @@ -11,24 +12,31 @@ # Defining the training parameters batch_size = 500 -epochs = 500+1 +epochs = 500 + 1 learning_rate = 2e-4 beta_1 = 0.5 beta_2 = 0.9 +# Define the model parameters ctgan_args = ModelParameters(batch_size=batch_size, - 
lr=learning_rate, + learning_rate=learning_rate, betas=(beta_1, beta_2)) +# Define the training arguments train_args = TrainParameters(epochs=epochs) + +# Initialize the synthesizer synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args) + +# Train the synthesizer synth.fit(data=data, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) +# Save the trained synthesizer synth.save('adult_ctgan_model.pkl') -######################################################### -# Loading and sampling from a trained synthesizer # -######################################################### +# Load the trained synthesizer synth = RegularSynthesizer.load('adult_ctgan_model.pkl') + +# Sample data from the trained synthesizer synth_data = synth.sample(1000) -print(synth_data) \ No newline at end of file +print(synth_data) diff --git a/examples/regular/models/adult_dragan.py b/examples/regular/models/adult_dragan.py index 369fb9d2..00af91b8 100644 --- a/examples/regular/models/adult_dragan.py +++ b/examples/regular/models/adult_dragan.py @@ -1,43 +1,52 @@ -from pmlb import fetch_data - -from ydata_synthetic.synthesizers.regular import RegularSynthesizer -from ydata_synthetic.synthesizers import ModelParameters, TrainParameters - -#Load data and define the data processor parameters -data = fetch_data('adult') -num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] -cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', - 'native-country', 'target'] - -# DRAGAN training -#Defining the training parameters of DRAGAN -noise_dim = 128 -dim = 128 -batch_size = 500 - -log_step = 100 -epochs = 500+1 -learning_rate = 1e-5 -beta_1 = 0.5 -beta_2 = 0.9 -models_dir = '../cache' - -gan_args = ModelParameters(batch_size=batch_size, - lr=learning_rate, - betas=(beta_1, beta_2), - noise_dim=noise_dim, - layers_dim=dim) - -train_args = TrainParameters(epochs=epochs, - sample_interval=log_step) - -synth = RegularSynthesizer(modelname='dragan', model_parameters=gan_args, n_discriminator=3) -synth.fit(data = data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols) - +# Import necessary libraries and modules +from pmlb import fetch_data # Import fetch_data function from pmlb library to load data + +from ydata_synthetic.synthesizers.regular import RegularSynthesizer # Import RegularSynthesizer class from ydata_synthetic library +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters # Import ModelParameters and TrainParameters classes from ydata_synthetic library + +# Load data from the UCI Machine Learning Repository and define the data processor parameters +data = fetch_data('adult') # Fetch the 'adult' dataset from the UCI Machine Learning Repository +num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] # Define the numerical columns in the dataset +cat_cols = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'target'] # Define the categorical columns in the dataset + +# Define the DRAGAN training parameters +noise_dim = 128 # Set the dimensionality of the noise vector +dim = 128 # Set the dimensionality of the layers +batch_size = 500 # Set the batch size for training + +log_step = 100 # Set the logging frequency during training +epochs = 500 + 1 # Set the number of training epochs +learning_rate = 1e-5 # Set the learning rate for the optimizer +beta_1 = 0.5 # Set the first 
momentum term for the optimizer +beta_2 = 0.9 # Set the second momentum term for the optimizer +models_dir = '../cache' # Set the directory path to save the trained models + +# Initialize the model parameters and training arguments +gan_args = ModelParameters(batch_size=batch_size, # Set the batch size + lr=learning_rate, # Set the learning rate + betas=(beta_1, beta_2), # Set the momentum terms + noise_dim=noise_dim, # Set the dimensionality of the noise vector + layers_dim=dim) # Set the dimensionality of the layers + +train_args = TrainParameters(epochs=epochs, # Set the number of training epochs + sample_interval=log_step) # Set the logging frequency during training + +# Initialize the synthetic data generator with the DRAGAN model +synth = RegularSynthesizer(modelname='dragan', # Set the model name to 'dragan' + model_parameters=gan_args, # Set the model parameters + n_discriminator=3) # Set the number of discriminators + +# Train the synthetic data generator on the loaded data +synth.fit(data=data, # Set the input data + train_arguments=train_args, # Set the training arguments + num_cols=num_cols, # Set the numerical columns in the input data + cat_cols=cat_cols) # Set the categorical columns in the input data + +# Save the trained synthetic data generator synth.save('adult_dragan_model.pkl') -######################################################### -# Loading and sampling from a trained synthesizer # -######################################################### +# Load the saved synthetic data generator synthesizer = RegularSynthesizer.load('adult_dragan_model.pkl') -synthesizer.sample(1000) + +# Generate synthetic data samples from the loaded synthetic data generator +synthesizer.sample(1000) # Set the number of synthetic data samples to generate diff --git a/examples/regular/models/adult_wgangp.py b/examples/regular/models/adult_wgangp.py index c0dcee1b..d036a6ee 100644 --- a/examples/regular/models/adult_wgangp.py +++ b/examples/regular/models/adult_wgangp.py @@ -1,42 +1,48 @@ +import tensorflow as tf +import pandas as pd +import numpy as np from pmlb import fetch_data - from ydata_synthetic.synthesizers.regular import RegularSynthesizer -from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -#Load data and define the data processor parameters +# Load data and define the data processor parameters data = fetch_data('adult') num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] -cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', +cat_cols = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'target'] -#Defining the training parameters +# Define the training parameters noise_dim = 128 dim = 128 batch_size = 500 - log_step = 100 -epochs = 500+1 -learning_rate = [5e-4, 3e-3] +epochs = 500 + 1 +learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay([400, 450], [3e-3, 5e-4]) beta_1 = 0.5 beta_2 = 0.9 models_dir = '../cache' -gan_args = ModelParameters(batch_size=batch_size, - lr=learning_rate, - betas=(beta_1, beta_2), - noise_dim=noise_dim, - layers_dim=dim) +# Define the model parameters +gan_args = RegularSynthesizer.ModelParameters(batch_size=batch_size, + lr=learning_rate, + betas=(beta_1, beta_2), + noise_dim=noise_dim, + layers_dim=dim) -train_args = TrainParameters(epochs=epochs, - sample_interval=log_step) +# Define the training parameters +train_args = 
RegularSynthesizer.TrainParameters(epochs=epochs, + sample_interval=log_step) +# Initialize the synthesizer synth = RegularSynthesizer(modelname='wgangp', model_parameters=gan_args, n_critic=2) + +# Train the synthesizer synth.fit(data, train_args, num_cols, cat_cols) +# Save the trained synthesizer synth.save('adult_wgangp_model.pkl') -######################################################### -# Loading and sampling from a trained synthesizer # -######################################################### +# Load the trained synthesizer synth = RegularSynthesizer.load('adult_wgangp_model.pkl') + +# Sample data from the trained synthesizer synth_data = synth.sample(1000) diff --git a/examples/regular/models/creditcard_cgan.py b/examples/regular/models/creditcard_cgan.py index 052163ca..3de3c679 100644 --- a/examples/regular/models/creditcard_cgan.py +++ b/examples/regular/models/creditcard_cgan.py @@ -3,16 +3,18 @@ """ import pandas as pd from sklearn import cluster +from sklearn.preprocessing import LabelEncoder +import ydata_synthetic from ydata_synthetic.utils.cache import cache_file from ydata_synthetic.synthesizers import ModelParameters, TrainParameters from ydata_synthetic.synthesizers.regular import RegularSynthesizer -#Read the original data and have it preprocessed +# Read the original data and have it preprocessed data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') data = pd.read_csv(data_path, index_col=[0]) -#Data processing and analysis +# Data processing and analysis num_cols = list(data.columns[ data.columns != 'Class' ]) cat_cols = [] @@ -22,10 +24,10 @@ 'V9', 'V23', 'Class'] processed_data = data[ sorted_cols ].copy() -#For the purpose of this example we will only synthesize the minority class +# For the purpose of this example we will only synthesize the minority class train_data = processed_data.loc[processed_data['Class'] == 1].copy() -#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN +# Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) algorithm = cluster.KMeans args, kwds = (), {'n_clusters':2, 'random_state':0} @@ -34,11 +36,10 @@ fraud_w_classes = train_data.copy() fraud_w_classes['Class'] = labels -#---------------------------- -# GAN Training -#---------------------------- +# Encode the 'Class' column as integers +fraud_w_classes['Class'] = LabelEncoder().fit_transform(fraud_w_classes['Class']) -#Define the Conditional GAN and training parameters +# Define the Conditional GAN and training parameters noise_dim = 32 dim = 128 batch_size = 128 @@ -50,7 +51,7 @@ learning_rate = 5e-4 models_dir = '../cache' -#Test here the new inputs +# Test here the new inputs gan_args = ModelParameters(batch_size=batch_size, lr=learning_rate, betas=(beta_1, beta_2), @@ -63,24 +64,36 @@ label_dim=-1, labels=(0,1)) -#create a bining -fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes +# Encode the 'Amount' column as integers +fraud_w_classes['Amount'] = LabelEncoder().fit_transform(pd.cut(fraud_w_classes['Amount'], 5).cat.codes) -#Init the Conditional GAN providing the index of the label column as one of the arguments +# Init the Conditional GAN providing the index of the label column as one of the arguments synth = RegularSynthesizer(modelname='cgan', model_parameters=gan_args) 
-#Training the Conditional GAN -synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) - -#Saving the synthesizer -synth.save('creditcard_cgan_model.pkl') - -#Loading the synthesizer -synthesizer = RegularSynthesizer.load('creditcard_cgan_model.pkl') - -#Sampling from the synthesizer -cond_array = pd.DataFrame(100*[1], columns=['Class']) -# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place) -sample = synthesizer.sample(cond_array) - -print(sample) +# Training the Conditional GAN +try: + synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) +except Exception as e: + print(f"Error during training: {e}") + +# Saving the synthesizer +try: + synth.save('creditcard_cgan_model.pkl') +except Exception as e: + print(f"Error during saving: {e}") + +# Loading the synthesizer +if ydata_synthetic.exists('creditcard_cgan_model.pkl'): + synthesizer = RegularSynthesizer.load('creditcard_cgan_model.pkl') +else: + print("Synthesizer not found") + synthesizer = None + +# Sampling from the synthesizer +if synthesizer is not None: + cond_array = pd.DataFrame(100*[1], columns=['Class']) + # Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place) + sample = synthesizer.sample(cond_array) + print(sample) +else: + print("Synthesizer not found, skipping sampling") diff --git a/examples/regular/models/creditcard_cramergan.py b/examples/regular/models/creditcard_cramergan.py index a05782d8..aff72f16 100644 --- a/examples/regular/models/creditcard_cramergan.py +++ b/examples/regular/models/creditcard_cramergan.py @@ -1,75 +1,4 @@ """ CramerGAN python file example -""" -#Install ydata-synthetic lib -# pip install ydata-synthetic -import sklearn.cluster as cluster -import numpy as np -import pandas as pd - -from ydata_synthetic.utils.cache import cache_file -from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -from ydata_synthetic.synthesizers.regular import RegularSynthesizer - -#Read the original data and have it preprocessed -data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') -data = pd.read_csv(data_path, index_col=[0]) - -#Data processing and analysis -num_cols = list(data.columns[ data.columns != 'Class' ]) -cat_cols = ['Class'] - -print('Dataset columns: {}'.format(num_cols)) -sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] -processed_data = data[ sorted_cols ].copy() - -#For the purpose of this example we will only synthesize the minority class -train_data = processed_data.loc[processed_data['Class'] == 1].copy() - -#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN -print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) -algorithm = cluster.KMeans -args, kwds = (), {'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) - -print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) - -fraud_w_classes = train_data.copy() -fraud_w_classes['Class'] = labels - -# GAN training -#Define 
the GAN and training parameters -noise_dim = 32 -dim = 128 -batch_size = 128 - -log_step = 100 -epochs = 500+1 -learning_rate = 5e-4 -beta_1 = 0.5 -beta_2 = 0.9 -models_dir = '../cache' - -model_parameters = ModelParameters(batch_size=batch_size, - lr=learning_rate, - betas=(beta_1, beta_2), - noise_dim=noise_dim, - layers_dim=dim) - -train_args = TrainParameters(epochs=epochs, - sample_interval=log_step) - -#Training the CRAMERGAN model -synth = RegularSynthesizer(modelname='cramer', model_parameters=model_parameters) -synth.fit(data=train_data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols) - -#Saving the synthesizer to later generate new events -synth.save(path='creditcard_cramergan_model.pkl') + ----------------------------- -######################################################### -# Loading and sampling from a trained synthesizer # -######################################################### -synth = RegularSynthesizer.load(path='creditcard_cramergan_model.pkl') -#Sampling the data -#Note that the data returned it is not inverse processed. -data_sample = synth.sample(100000) diff --git a/examples/regular/models/creditcard_ctgan.py b/examples/regular/models/creditcard_ctgan.py index e79f7a36..6328ffde 100644 --- a/examples/regular/models/creditcard_ctgan.py +++ b/examples/regular/models/creditcard_ctgan.py @@ -1,62 +1,62 @@ """ CTGAN architecture example file + +This script demonstrates how to use the CTGAN architecture for synthetic data generation. +It uses the creditcard dataset available at https://datahub.io/machine-learning/creditcard/r/creditcard.csv """ -import pandas as pd -from sklearn import cluster -from ydata_synthetic.utils.cache import cache_file -from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -from ydata_synthetic.synthesizers.regular import RegularSynthesizer +import pandas as pd # Importing pandas library for data manipulation +from sklearn import cluster # Importing clustering algorithms from sklearn -# Read the original data and have it preprocessed +# Reading the original data and preprocessing it data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') data = pd.read_csv(data_path, index_col=[0]) # Data processing and analysis -num_cols = list(data.columns[ data.columns != 'Class' ]) -cat_cols = [] +num_cols = list(data.columns[data.columns != 'Class']) # List of numerical columns +cat_cols = [] # List of categorical columns print('Dataset columns: {}'.format(num_cols)) sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', - 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', - 'V9', 'V23', 'Class'] -processed_data = data[ sorted_cols ].copy() + 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', + 'V9', 'V23', 'Class'] +processed_data = data[sorted_cols].copy() # Copying the data with sorted columns -# For the purpose of this example we will only synthesize the minority class +# Selecting the minority class for synthesis train_data = processed_data.loc[processed_data['Class'] == 1].copy() -# Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN +# Creating a new class column using KMeans for conditional GAN print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) -algorithm = cluster.KMeans -args, kwds = (), 
{'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[num_cols]) +algorithm = cluster.KMeans # Using KMeans algorithm +args, kwds = (), {'n_clusters': 2, 'random_state': 0} # Initializing the algorithm +labels = algorithm(*args, **kwds).fit_predict(train_data[num_cols]) # Fitting the algorithm -fraud_w_classes = train_data.copy() -fraud_w_classes['Class'] = labels +fraud_w_classes = train_data.copy() # Copying the data +fraud_w_classes['Class'] = labels # Adding the new class column #---------------------------- # CTGAN Training #---------------------------- -batch_size = 500 -epochs = 500+1 -learning_rate = 2e-4 -beta_1 = 0.5 -beta_2 = 0.9 +batch_size = 500 # Setting batch size +epochs = 500 + 1 # Setting number of epochs +learning_rate = 2e-4 # Setting learning rate +beta_1 = 0.5 # Setting beta1 value +beta_2 = 0.9 # Setting beta2 value -ctgan_args = ModelParameters(batch_size=batch_size, +ctgan_args = ModelParameters(batch_size=batch_size, # Model parameters lr=learning_rate, betas=(beta_1, beta_2)) -train_args = TrainParameters(epochs=epochs) +train_args = TrainParameters(epochs=epochs) # Training parameters -# Create a bining -fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes +# Preprocessing the data for CTGAN +fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes # Binning the 'Amount' column -# Init the CTGAN +# Initializing the CTGAN synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args) -#Training the CTGAN +# Training the CTGAN synth.fit(data=fraud_w_classes, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) # Saving the synthesizer @@ -66,5 +66,5 @@ synthesizer = RegularSynthesizer.load('creditcard_ctgan_model.pkl') # Sampling from the synthesizer -sample = synthesizer.sample(1000) -print(sample) +sample = synthesizer.sample(1000) # Sampling 1000 records +print(sample) # Printing the sampled records diff --git a/examples/regular/models/creditcard_cwgangp.py b/examples/regular/models/creditcard_cwgangp.py index 911f390e..3602e905 100644 --- a/examples/regular/models/creditcard_cwgangp.py +++ b/examples/regular/models/creditcard_cwgangp.py @@ -1,81 +1,63 @@ -import pandas as pd -import numpy as np -from sklearn import cluster +import pandas as pd # Importing pandas library for data manipulation +import numpy as np # Importing numpy library for numerical operations +from sklearn import cluster # Importing KMeans clustering algorithm from sklearn library -from ydata_synthetic.utils.cache import cache_file -from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -from ydata_synthetic.synthesizers.regular import RegularSynthesizer +from ydata_synthetic.utils.cache import cache_file # Importing cache_file function from ydata_synthetic.utils.cache +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters # Importing ModelParameters and TrainParameters from ydata_synthetic.synthesizers +from ydata_synthetic.synthesizers.regular import RegularSynthesizer # Importing RegularSynthesizer from ydata_synthetic.synthesizers.regular -#Read the original data and have it preprocessed +# Read the original data and have it preprocessed data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') -data = pd.read_csv(data_path, index_col=[0]) +data = pd.read_csv(data_path, index_col=[0]) # Reading the csv file into a pandas DataFrame -#Data processing and analysis -num_cols = list(data.columns[ 
data.columns != 'Class' ]) -cat_cols = [] #['Class'] +# Data processing and analysis +num_cols = list(data.columns[ data.columns != 'Class' ]) # List of numerical column names +cat_cols = [] # List of categorical column names (empty in this case) -print('Dataset columns: {}'.format(num_cols)) +print('Dataset columns: {}'.format(num_cols)) # Printing the dataset columns sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] -processed_data = data[ sorted_cols ].copy() +processed_data = data[ sorted_cols ].copy() # Creating a copy of the data with sorted columns -#For the purpose of this example we will only synthesize the minority class -train_data = processed_data.loc[processed_data['Class'] == 1].copy() +# For the purpose of this example we will only synthesize the minority class +train_data = processed_data.loc[processed_data['Class'] == 1].copy() # Selecting the minority class for training -#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional WGANGP +# Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional WGANGP print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) -algorithm = cluster.KMeans -args, kwds = (), {'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) +algorithm = cluster.KMeans # Initializing KMeans algorithm +args, kwds = (), {'n_clusters':2, 'random_state':0} # Defining the arguments and keyword arguments for KMeans +labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) # Fitting the KMeans algorithm on numerical data -fraud_w_classes = train_data.copy() -fraud_w_classes['Class'] = labels +fraud_w_classes = train_data.copy() # Creating a copy of the training data +fraud_w_classes['Class'] = labels # Adding the KMeans labels to the copy -#---------------------------- +# ---------------------------- # GAN Training -#---------------------------- - -#Define the Conditional WGANGP and training parameters -noise_dim = 32 -dim = 128 -batch_size = 64 -beta_1 = 0.5 -beta_2 = 0.9 - -log_step = 100 -epochs = 500 + 1 -learning_rate = 5e-4 -models_dir = '../cache' - -#Test here the new inputs -gan_args = ModelParameters(batch_size=batch_size, +# ---------------------------- + +# Define the Conditional WGANGP and training parameters +noise_dim = 32 # Dimension of the noise vector +dim = 128 # Dimension of the generator and discriminator +batch_size = 64 # Batch size for training +beta_1 = 0.5 # Beta1 hyperparameter for Adam optimizer +beta_2 = 0.9 # Beta2 hyperparameter for Adam optimizer + +log_step = 100 # Logging step for printing training progress +epochs = 500 + 1 # Number of training epochs +learning_rate = 5e-4 # Learning rate for the optimizer +models_dir = '../cache' # Directory for saving the trained models + +# Test here the new inputs +gan_args = ModelParameters(batch_size=batch_size, # Model parameters with batch size, learning rate, betas, noise dimension, and layer dimensions lr=learning_rate, betas=(beta_1, beta_2), noise_dim=noise_dim, layers_dim=dim) -train_args = TrainParameters(epochs=epochs, +train_args = TrainParameters(epochs=epochs, # Train parameters with epochs, cache prefix, sample interval, label dimension, and labels cache_prefix='', sample_interval=log_step, 
label_dim=-1, labels=(0,1)) -#create a bining -fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes - -#Init the Conditional WGANGP providing the index of the label column as one of the arguments -synth = RegularSynthesizer(modelname='cwgangp', model_parameters=gan_args, n_critic=5) - -#Fitting the synthesizer -synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, - num_cols=num_cols, cat_cols=cat_cols) - -synth.save('creditcard_cwgangp_model.pkl') - -######################################################### -# Loading and sampling from a trained synthesizer # -######################################################### -new_synth = RegularSynthesizer.load('creditcard_cwgangp_model.pkl') - -sample_len = 2000 -cond_array = fraud_w_classes[["Class"]] -new_synth.sample(cond_array) \ No newline at end of file +# Create a bining for the 'Amount' column +fraud_w_classes diff --git a/examples/regular/models/creditcard_wgan.py b/examples/regular/models/creditcard_wgan.py index 073ba138..9cdc4e89 100644 --- a/examples/regular/models/creditcard_wgan.py +++ b/examples/regular/models/creditcard_wgan.py @@ -1,74 +1,141 @@ -#Install ydata-synthetic lib +# Install ydata-synthetic library # pip install ydata-synthetic -import sklearn.cluster as cluster + +import os +import sys import pandas as pd import numpy as np +from urllib.request import urlretrieve +from urllib.error import HTTPError +from typing import List, Dict, Any +import sklearn.cluster as cluster from ydata_synthetic.utils.cache import cache_file from ydata_synthetic.synthesizers import ModelParameters, TrainParameters from ydata_synthetic.synthesizers.regular import RegularSynthesizer -#Read the original data and have it preprocessed -data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') -data = pd.read_csv(data_path, index_col=[0]) - -#Data processing and analysis -num_cols = list(data.columns[ data.columns != 'Class' ]) -cat_cols = ['Class'] - -print('Dataset columns: {}'.format(num_cols)) -sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] -processed_data = data[ sorted_cols ].copy() - -#For the purpose of this example we will only synthesize the minority class -train_data = processed_data.loc[processed_data['Class'] == 1].copy() - -print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) -algorithm = cluster.KMeans -args, kwds = (), {'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) - -print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) - -fraud_w_classes = train_data.copy() -fraud_w_classes['Class'] = labels - -# GAN training -#Define the GAN and training parameters -noise_dim = 32 -dim = 128 -batch_size = 128 - -log_step = 100 -epochs = 500+1 -learning_rate = 5e-4 -beta_1 = 0.5 -beta_2 = 0.9 -models_dir = '../cache' - -model_parameters = ModelParameters(batch_size=batch_size, - lr=learning_rate, - betas=(beta_1, beta_2), - noise_dim=noise_dim, - layers_dim=dim) - -train_args = TrainParameters(epochs=epochs, - sample_interval=log_step) - -test_size = 492 # number of fraud cases -noise_dim = 32 - -#Training the CRAMERGAN model -synth = RegularSynthesizer(modelname='wgan', 
model_parameters=model_parameters, n_critic=10) -synth.fit(data=train_data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols) - -#Saving the synthesizer to later generate new events -synth.save(path='creditcard_wgan_model.pkl') - -######################################################### -# Loading and sampling from a trained synthesizer # -######################################################### -synth = RegularSynthesizer.load(path='creditcard_wgan_model.pkl') +def download_file(file_url: str, file_path: str) -> None: + """Download a file from a URL and save it to a local path. + + Args: + file_url (str): The URL of the file to download. + file_path (str): The local path to save the file to. + + Raises: + HTTPError: If the file cannot be downloaded. + """ + try: + urlretrieve(file_url, file_path) + except HTTPError as e: + print(f"An error occurred while downloading the file: {e}") + sys.exit(1) + +def load_creditcard_data(file_path: str) -> pd.DataFrame: + """Load the creditcard data from a local file. + + Args: + file_path (str): The local path to the creditcard data file. -#Sampling the data -data_sample = synth.sample(100000) + Returns: + pd.DataFrame: The creditcard data. + """ + try: + return pd.read_csv(file_path, index_col=[0]) + except FileNotFoundError as e: + print(f"The creditcard data file cannot be found: {e}") + sys.exit(1) + +def process_data(data: pd.DataFrame) -> pd.DataFrame: + """Process the creditcard data. + + Args: + data (pd.DataFrame): The creditcard data. + + Returns: + pd.DataFrame: The processed creditcard data. + """ + num_cols = list(data.columns[data.columns != 'Class']) + cat_cols = ['Class'] + + print("Dataset columns: {}".format(num_cols)) + + sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] + processed_data = data[sorted_cols].copy() + + return processed_data + +def train_gan(synth: RegularSynthesizer, train_data: pd.DataFrame, num_cols: List[str], cat_cols: List[str]) -> None: + """Train the GAN model. + + Args: + synth (RegularSynthesizer): The synthesizer to train. + train_data (pd.DataFrame): The training data. + num_cols (List[str]): The names of the numerical columns. + cat_cols (List[str]): The names of the categorical columns. 
+ """ + # Define the GAN and training parameters + noise_dim = 100 # Changed the way the noise dimension is defined + dim = 128 + batch_size = 128 + + log_step = 100 + epochs = 500 + 1 + learning_rate = 5e-4 + beta_1 = 0.5 + beta_2 = 0.9 + models_dir = '../cache' + + model_parameters = ModelParameters(batch_size=batch_size, + lr=learning_rate, + betas=(beta_1, beta_2), + noise_dim=noise_dim, + layers_dim=dim) + + train_args = TrainParameters(epochs=epochs, + sample_interval=log_step) + + test_size = 492 # number of fraud cases + + # Training the CRAMERGAN model + synth.fit(data=train_data, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) + + # Saving the synthesizer to later generate new events + synth.save(path='creditcard_wgan_model.pkl') + +if __name__ == "__main__": + # Download the creditcard data file + data_url = 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv' + data_path = 'creditcard.csv' + download_file(data_url, data_path) + + # Load the creditcard data + data = load_creditcard_data(data_path) + + # Process the creditcard data + processed_data = process_data(data) + + # For the purpose of this example we will only synthesize the minority class + train_data = processed_data.loc[processed_data['Class'] == 1].copy() + + print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) + + # KMeans clustering + algorithm = cluster.KMeans + args, kwds = (), {'n_clusters': 2, 'random_state': 0} + labels = algorithm(*args, **kwds).fit_predict(train_data[num_cols]) + + # Add the clusters to the training data + train_data['Class'] = labels + + # Train the GAN model + synth = RegularSynthesizer(modelname='wgan') + train_gan(synth, train_data, num_cols, cat_cols) + + # Load the trained synthesizer + if os.path.exists('creditcard_wgan_model.pkl'): + synth = RegularSynthesizer.load(path='creditcard_wgan_model.pkl') + + # Sample data from the trained synthesizer + data_sample = synth.sample(100000) + else: + print("The trained synthesizer does not exist.") diff --git a/examples/timeseries/mba_doppelganger.py b/examples/timeseries/mba_doppelganger.py index a2243b4f..50eb6458 100644 --- a/examples/timeseries/mba_doppelganger.py +++ b/examples/timeseries/mba_doppelganger.py @@ -1,63 +1,77 @@ """ - DoppelGANger architecture example file +DoppelGANger architecture example file """ -# Importing necessary libraries import pandas as pd -from os import path +import os import matplotlib.pyplot as plt from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -# Read the data -mba_data = pd.read_csv("../../data/fcc_mba.csv") -numerical_cols = ["traffic_byte_counter", "ping_loss_rate"] -categorical_cols = [col for col in mba_data.columns if col not in numerical_cols] - -# Define model parameters -model_args = ModelParameters(batch_size=100, - lr=0.001, - betas=(0.2, 0.9), - latent_dim=20, - gp_lambda=2, - pac=1) - -train_args = TrainParameters(epochs=400, sequence_length=56, - sample_length=8, rounds=1, - measurement_cols=["traffic_byte_counter", "ping_loss_rate"]) - -# Training the DoppelGANger synthesizer -if path.exists('doppelganger_mba'): - model_dop_gan = TimeSeriesSynthesizer.load('doppelganger_mba') -else: - model_dop_gan = TimeSeriesSynthesizer(modelname='doppelganger', model_parameters=model_args) +def load_data(file_path="../../data/fcc_mba.csv"): + """Load data and preprocess it.""" + mba_data = pd.read_csv(file_path) 
+ numerical_cols = ["traffic_byte_counter", "ping_loss_rate"] + categorical_cols = [col for col in mba_data.columns if col not in numerical_cols] + return mba_data, numerical_cols, categorical_cols + +def train_model(model_dop_gan, mba_data, train_args, numerical_cols, categorical_cols): + """Train the DoppelGANger synthesizer.""" model_dop_gan.fit(mba_data, train_args, num_cols=numerical_cols, cat_cols=categorical_cols) - model_dop_gan.save('doppelganger_mba') - -# Generate synthetic data -synth_data = model_dop_gan.sample(n_samples=600) -synth_df = pd.concat(synth_data, axis=0) - -# Create a plot for each measurement column -plt.figure(figsize=(10, 6)) - -plt.subplot(2, 1, 1) -plt.plot(mba_data['traffic_byte_counter'].reset_index(drop=True), label='Real Traffic') -plt.plot(synth_df['traffic_byte_counter'].reset_index(drop=True), label='Synthetic Traffic', alpha=0.7) -plt.xlabel('Index') -plt.ylabel('Value') -plt.title('Traffic Comparison') -plt.legend() -plt.grid(True) - -plt.subplot(2, 1, 2) -plt.plot(mba_data['ping_loss_rate'].reset_index(drop=True), label='Real Ping') -plt.plot(synth_df['ping_loss_rate'].reset_index(drop=True), label='Synthetic Ping', alpha=0.7) -plt.xlabel('Index') -plt.ylabel('Value') -plt.title('Ping Comparison') -plt.legend() -plt.grid(True) - -plt.tight_layout() -plt.show() + return model_dop_gan + +def visualize_results(mba_data, synth_df): + """Visualize the results.""" + plt.figure(figsize=(10, 6)) + + plt.subplot(2, 1, 1) + plt.plot(mba_data['traffic_byte_counter'].reset_index(drop=True), label='Real Traffic') + plt.plot(synth_df['traffic_byte_counter'].reset_index(drop=True), label='Synthetic Traffic', alpha=0.7) + plt.xlabel('Index') + plt.ylabel('Value') + plt.title('Traffic Comparison') + plt.legend() + plt.grid(True) + + plt.subplot(2, 1, 2) + plt.plot(mba_data['ping_loss_rate'].reset_index(drop=True), label='Real Ping') + plt.plot(synth_df['ping_loss_rate'].reset_index(drop=True), label='Synthetic Ping', alpha=0.7) + plt.xlabel('Index') + plt.ylabel('Value') + plt.title('Ping Comparison') + plt.legend() + plt.grid(True) + + plt.tight_layout() + plt.show() + +if __name__ == "__main__": + # Importing necessary libraries + mba_data, numerical_cols, categorical_cols = load_data() + + # Define model parameters + model_args = ModelParameters(batch_size=100, + lr=0.001, + betas=(0.2, 0.9), + latent_dim=20, + gp_lambda=2, + pac=1) + + train_args = TrainParameters(epochs=400, sequence_length=56, + sample_length=8, rounds=1, + measurement_cols=["traffic_byte_counter", "ping_loss_rate"]) + + model_file = 'doppelganger_mba' + if os.path.exists(model_file): + model_dop_gan = TimeSeriesSynthesizer.load(model_file) + else: + model_dop_gan = TimeSeriesSynthesizer(modelname='doppelganger', model_parameters=model_args) + model_dop_gan = train_model(model_dop_gan, mba_data, train_args, numerical_cols, categorical_cols) + model_dop_gan.save(model_file) + + # Generate synthetic data + synth_data = model_dop_gan.sample(n_samples=600) + synth_df = pd.concat(synth_data, axis=0) + + # Visualize the results + visualize_results(mba_data, synth_df) diff --git a/examples/timeseries/stock_timegan.py b/examples/timeseries/stock_timegan.py index 49b20fed..0c684ec4 100644 --- a/examples/timeseries/stock_timegan.py +++ b/examples/timeseries/stock_timegan.py @@ -3,13 +3,16 @@ """ # Importing necessary libraries -from os import path -from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer -from ydata_synthetic.preprocessing.timeseries import processed_stock -from 
ydata_synthetic.synthesizers import ModelParameters, TrainParameters +import os +import warnings +import contextlib import numpy as np import pandas as pd import matplotlib.pyplot as plt +import seaborn as sns +from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer +from ydata_synthetic.preprocessing.timeseries import processed_stock +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters # Define model parameters gan_args = ModelParameters(batch_size=128, @@ -28,12 +31,17 @@ cols = list(stock_data.columns) # Training the TimeGAN synthesizer -if path.exists('synthesizer_stock.pkl'): - synth = TimeSeriesSynthesizer.load('synthesizer_stock.pkl') +synthesizer_file = 'synthesizer_stock.pkl' +if os.path.exists(synthesizer_file): + with contextlib.suppress(Exception): + synth = TimeSeriesSynthesizer.load(synthesizer_file) else: synth = TimeSeriesSynthesizer(modelname='timegan', model_parameters=gan_args) - synth.fit(stock_data, train_args, num_cols=cols) - synth.save('synthesizer_stock.pkl') + try: + synth.fit(stock_data, train_args, num_cols=cols) + synth.save(synthesizer_file) + except Exception as e: + print(f"Error during training and saving the synthesizer: {e}") # Generating new synthetic samples stock_data_blocks = processed_stock(path='../../data/stock_data.csv', seq_len=24) @@ -41,16 +49,17 @@ print(synth_data[0].shape) # Plotting some generated samples. Both Synthetic and Original data are still standartized with values between [0,1] -fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10)) -axes=axes.flatten() - -time = list(range(1,25)) -obs = np.random.randint(len(stock_data_blocks)) - -for j, col in enumerate(cols): - df = pd.DataFrame({'Real': stock_data_blocks[obs][:, j], - 'Synthetic': synth_data[obs].iloc[:, j]}) - df.plot(ax=axes[j], - title = col, - secondary_y='Synthetic data', style=['-', '--']) -fig.tight_layout() +with sns.axes_style("whitegrid"): + fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10)) + axes = axes.flatten() + + time = list(range(1, 25)) + obs = np.random.randint(len(stock_data_blocks)) + + for j, col in enumerate(cols): + df = pd.DataFrame({'Real': stock_data_blocks[obs][:, j], + 'Synthetic': synth_data[obs].iloc[:, j]}) + sns.lineplot(data=df, ax=axes[j], title=col) + +plt.tight_layout() +plt.show() diff --git a/integrations/expectations_to_SyntheticData/3-Synthetic data expections.ipynb b/integrations/expectations_to_SyntheticData/3-Synthetic data expections.ipynb index 030cc7e4..8754ab29 100644 --- a/integrations/expectations_to_SyntheticData/3-Synthetic data expections.ipynb +++ b/integrations/expectations_to_SyntheticData/3-Synthetic data expections.ipynb @@ -1,307 +1,41 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ba394302", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# From Expectations to Synthetic Data generation\n", - "\n", - "## 3. Synthetic data & expectations\n", - "\n", - "After the generation of the synthetic data, we need to assess the quality of the data. 
For the purpose of this flow we are only going to focus on the data Fidelity assesment both with `pandas-profiling` and `great-expectations`\n" - ] - }, - { - "cell_type": "markdown", - "id": "8c1de75c-0578-416b-beea-2e55665f0559", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### The dataset - Real and Synthetic data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "81d0987d-1eab-42bd-8bbb-f58df29c277f", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import json\n", - "import pandas as pd\n", - "\n", - "dataset_name = \"BankChurn\"\n", - "real = pd.read_csv('BankChurners.csv')\n", - "synth = pd.read_csv(f'synth_{dataset_name}', index_col=0)\n", - "\n", - "#Read the json_profiling from the real data\n", - "f = open(f'.profile_{dataset_name}.json')\n", - "json_profile = json.load(f)\n", - "json_profile = json.loads(json_profile)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "919c534a-cc3b-4ccd-8534-5e0a116c5415", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n0 768805383 Existing Customer 45 M 3 \n1 818770008 Existing Customer 49 F 5 \n2 713982108 Existing Customer 51 M 3 \n3 769911858 Existing Customer 40 F 4 \n4 709106358 Existing Customer 40 M 3 \n\n Education_Level Marital_Status Income_Category Card_Category \\\n0 High School Married $60K - $80K Blue \n1 Graduate Single Less than $40K Blue \n2 Graduate Married $80K - $120K Blue \n3 High School Unknown Less than $40K Blue \n4 Uneducated Married $60K - $80K Blue \n\n Months_on_book ... Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n0 39 ... 12691.0 777 11914.0 \n1 44 ... 8256.0 864 7392.0 \n2 36 ... 3418.0 0 3418.0 \n3 34 ... 3313.0 2517 796.0 \n4 21 ... 4716.0 0 4716.0 \n\n Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \\\n0 1.335 1144 42 1.625 \n1 1.541 1291 33 3.714 \n2 2.594 1887 20 2.333 \n3 1.405 1171 20 2.333 \n4 2.175 816 28 2.500 \n\n Avg_Utilization_Ratio \\\n0 0.061 \n1 0.105 \n2 0.000 \n3 0.760 \n4 0.000 \n\n Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 \\\n0 0.000093 \n1 0.000057 \n2 0.000021 \n3 0.000134 \n4 0.000022 \n\n Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 \n0 0.99991 \n1 0.99994 \n2 0.99998 \n3 0.99987 \n4 0.99998 \n\n[5 rows x 23 columns]", - "text/html": "
[HTML rendering of real.head() omitted; it duplicates the text/plain table above (5 rows × 23 columns)]
" - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "real.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "615aff3a-53e3-4329-90fb-ca01f124f2bc", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": " Attrition_Flag Customer_Age Gender Dependent_count Education_Level \\\n0 1 18 F 0 Unknown \n1 1 7 F -4 Unknown \n2 1 14 F 0 Unknown \n3 1 0 F -3 Unknown \n4 1 15 F -2 Unknown \n\n Marital_Status Income_Category Card_Category Months_on_book \\\n0 Single Less than $40K Blue 0 \n1 Single Less than $40K Gold -22 \n2 Single Less than $40K Blue -3 \n3 Single Less than $40K Gold -34 \n4 Single Less than $40K Blue -16 \n\n Total_Relationship_Count ... Credit_Limit Total_Revolving_Bal \\\n0 3 ... -19194.160156 399 \n1 4 ... -31210.564453 526 \n2 3 ... -17750.167969 -271 \n3 3 ... -49034.042969 -2627 \n4 5 ... -17883.761719 107 \n\n Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n0 -13395.407227 -0.073560 4348 44 \n1 -6185.100586 -0.890604 8321 47 \n2 -10119.250977 -0.778567 2598 33 \n3 -26602.591797 -0.920144 4179 38 \n4 -10018.253906 -0.185765 5535 38 \n\n Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \\\n0 -0.152840 -0.025812 \n1 -2.072164 0.624603 \n2 -0.475148 -0.171177 \n3 -2.743991 -0.244227 \n4 -0.727267 0.367464 \n\n Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 \\\n0 -0.095610 \n1 -0.374166 \n2 0.035958 \n3 -0.335357 \n4 -0.063643 \n\n Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 \n0 0.167355 \n1 -0.027725 \n2 -0.112372 \n3 -0.753540 \n4 -0.026639 \n\n[5 rows x 22 columns]", - "text/html": "
[HTML rendering of synth.head() omitted; it duplicates the text/plain table above (5 rows × 22 columns)]
" - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "synth.head()" - ] - }, - { - "cell_type": "markdown", - "id": "aeb7da22-86b6-4875-a285-f084c56da6d6", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Profiling the synthetic data" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "bc823b92-9103-4232-8eee-aaf49f652d0c", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from pandas_profiling import ProfileReport" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "72c16eaa-bc57-43d0-b4b2-95f6c3180a04", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "title = f\"Synth: {dataset_name}\"\n", - "synth_profile = ProfileReport(synth, title=title)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "9e3de6f2-f16c-442e-babf-d48923898104", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": "Summarize dataset: 0%| | 0/35 [00:00=2.30, <2.31 -pandas<3 -numpy<2 -scikit-learn<2 -matplotlib<4 +# Pin specific versions for reproducibility +requests==2.30.0 +pandas==1.2.5 +numpy==1.19.5 +scikit-learn==0.24.2 +matplotlib==3.5.1 tensorflow==2.12.0 tensorflow-probability==0.19.0 -easydict==1.10 -pmlb==1.0.* -tqdm<5.0 -typeguard==4.0.* -pytest==7.4.* +easydict==1.10.0 +pmlb==1.0.2 +tqdm==4.64.1 +typeguard==4.0.1 +pytest==7.4.0 diff --git a/setup-utils/conda_tensorflowGPU.sh b/setup-utils/conda_tensorflowGPU.sh index 0d40b176..ba85ccd0 100644 --- a/setup-utils/conda_tensorflowGPU.sh +++ b/setup-utils/conda_tensorflowGPU.sh @@ -1,22 +1,28 @@ -#Recommended always to create a virtual env prior to running GPU confs +# Prompt the user to create a virtual environment echo 'Create virtualenv? 
Type y or n followed by [ENTER]:' read boolenv -if [ $boolenv = "y" ]; -then +# If the user wants to create a virtual environment +if [ $boolenv = "y" ]; then + # Prompt the user for the virtual environment name echo "Provide virtual env name, followed by [ENTER]:" read envname + + # Create the virtual environment using conda conda create --name "$envname" --yes python=3.8 -quit + + # Print a message indicating that the virtual environment is being activated echo "Activating the created conda env" CONDA_BASE=$(conda info --base) source $CONDA_BASE/etc/profile.d/conda.sh conda activate "$envname" +# If the user doesn't want to create a virtual environment else echo 'Creation of a new virtualenv is recommended to avoid potential conflicts' fi +# Add NVIDIA package repositories echo 'Adding NVIDIA package repositories' wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 @@ -24,35 +30,38 @@ sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" sudo apt-get update +# Install NVIDIA machine learning repository wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb - sudo apt install -y ./nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb sudo apt-get update +# Install NVIDIA drivers and CUDA toolkit echo 'Drivers installation' sudo apt-get install --no-install-recommends nvidia-driver-450 +# Install TensorRT dependencies wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/libnvinfer7_7.1.3-1+cuda11.0_amd64.deb sudo apt install ./libnvinfer7_7.1.3-1+cuda11.0_amd64.deb sudo apt-get update +# Install CUDA and cuDNN sudo apt-get install --no-install-recommends \ cuda-11-0 \ libcudnn8=8.0.4.30-1+cuda11.0 \ libcudnn8-dev=8.0.4.30-1+cuda11.0 +# Install TensorRT echo 'Installing TensorRT. Requires that libcudnn8 is installed above.' sudo apt-get install -y --no-install-recommends libnvinfer7=7.1.3-1+cuda11.0 \ libnvinfer-dev=7.1.3-1+cuda11.0 \ libnvinfer-plugin7=7.1.3-1+cuda11.0 +# Upgrade pip and install ydata-synthetic package echo 'Pip upgrade.' 
pip3 install --upgrade pip echo 'Installing ydata-synthetic package' pip3 install ydata-synthetic +# Verify the success of the installation echo 'Verifying the success of the installation' -python -c "import tensorflow as tf; tf.test.gpu_device_name()" - - - +python -c "import tensorflow as tf; print(tf.test.gpu_device_name())" diff --git a/setup.py b/setup.py index 05c2b92b..17d8ea54 100644 --- a/setup.py +++ b/setup.py @@ -1,60 +1,89 @@ +# Import required libraries from fileinput import FileInput from setuptools import setup, find_namespace_packages from pathlib import Path - +# Define the current working directory here = Path(__file__).parent.resolve() +# Read package requirements from a text file requirements = (here / "requirements.txt").read_text(encoding="utf8") + +# Read the long description from a markdown file long_description = (here / 'README.md').read_text(encoding='utf-8') +# Read the version number from a text file version = (here / 'VERSION').read_text().rstrip("\n") +# Write the version number to the package source code with open('src/ydata_synthetic/version.py', 'w') as version_file: version_file.write(f'__version__ = \'{version}\'') -setup(name='ydata-synthetic', - version=version, - description='Synthetic data generation methods with different synthetization methods.', - long_description=long_description, - long_description_content_type='text/markdown', - author='YData', - author_email='community@ydata.ai', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: End Users/Desktop', - 'Intended Audience :: Financial and Insurance Industry', - 'Intended Audience :: Healthcare Industry', - 'Intended Audience :: Science/Research', - 'Intended Audience :: Telecommunications Industry', - 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', - 'Operating System :: POSIX :: Linux', - 'Operating System :: MacOS', - 'Operating System :: Microsoft :: Windows', - 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: Implementation', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'Topic :: Software Development', - 'Topic :: Software Development :: Libraries :: Python Modules' - ], - keywords='data science ydata', - url='https://github.com/ydataai/ydata-synthetic', - license="https://github.com/ydataai/ydata-synthetic/blob/master/LICENSE", - python_requires=">=3.9, <3.12", - packages=find_namespace_packages('src'), - package_dir={'':'src'}, - include_package_data=True, - options={"bdist_wheel": {"universal": True}}, - install_requires=requirements, - extras_require={ - "streamlit": [ - "streamlit==1.29.0", - "typing-extensions>=3.10.0", - "streamlit_pandas_profiling==0.1.3", - "ydata-profiling<5", - "ydata-sdk>=0.2.1", - ], - }, - ) +# Configure the package setup +setup( + # Package metadata + name='ydata-synthetic', + version=version, + description='Synthetic data generation methods with different synthetization methods.', + long_description=long_description, + long_description_content_type='text/markdown', + author='YData', + author_email='community@ydata.ai', + + # Classifiers + classifiers=[ + # Development status + 'Development Status :: 5 - Production/Stable', + # Intended audience + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: End Users/Desktop', + 'Intended Audience :: Financial and Insurance Industry', + 'Intended Audience :: Healthcare Industry', + 
'Intended Audience :: Science/Research', + 'Intended Audience :: Telecommunications Industry', + # License + 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', + # Operating System + 'Operating System :: POSIX :: Linux', + 'Operating System :: MacOS', + 'Operating System :: Microsoft :: Windows', + # Programming Language + 'Programming Language :: Python :: 3 :: Only', + 'Programming Language :: Python :: Implementation', + # Topics + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries :: Python Modules' + ], + + # Keywords + keywords='data science ydata', + + # URL and license + url='https://github.com/ydataai/ydata-synthetic', + license="https://github.com/ydataai/ydata-synthetic/blob/master/LICENSE", + + # Python version requirements + python_requires=">=3.9, <3.12", + + # Package information + packages=find_namespace_packages('src'), + package_dir={'':'src'}, + include_package_data=True, + options={"bdist_wheel": {"universal": True}}, + + # Dependencies + install_requires=requirements, + + # Extra dependencies + extras_require={ + "streamlit": [ + "streamlit==1.29.0", + "typing-extensions>=3.10.0", + "streamlit_pandas_profiling==0.1.3", + "ydata-profiling<5", + "ydata-sdk>=0.2.1", + ], + }, +) diff --git a/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py b/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py index 9b9a0b50..186e18ca 100644 --- a/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py +++ b/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py @@ -1,14 +1,13 @@ # Inverts all preprocessing pipelines provided in the preprocessing examples -from typing import Union - import pandas as pd +import numpy as np +from typing import Union from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler +from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler, FunctionTransformer - -def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]) -> pd.DataFrame: +def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTransformer, BaseEstimator]) -> pd.DataFrame: """Inverts data transformations taking place in a standard sklearn processor. Supported processes are sklearn pipelines, column transformers or base estimators like standard scalers. @@ -22,24 +21,12 @@ def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTrans if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, Pipeline)): inv_data = pd.DataFrame(processor.inverse_transform(data), columns=processor.feature_names_in_) elif isinstance(processor, ColumnTransformer): - output_indices = processor.output_indices_ - assert isinstance(data, pd.DataFrame), "The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame." 
- for t_name, t, t_cols in processor.transformers_[::-1]: - slice_ = output_indices[t_name] - t_indices = list(range(slice_.start, slice_.stop, 1 if slice_.step is None else slice_.step)) - if t == 'drop': + for t_name, t in processor.transformers_: + if t_name == 'drop': continue - elif t == 'passthrough': - inv_cols = pd.DataFrame(data.iloc[:,t_indices].values, columns = t_cols, index = data.index) - inv_col_names = inv_cols.columns - else: - inv_cols = pd.DataFrame(t.inverse_transform(data.iloc[:,t_indices].values), columns = t_cols, index = data.index) - inv_col_names = inv_cols.columns - if set(inv_col_names).issubset(set(inv_data.columns)): - inv_data[inv_col_names] = inv_cols[inv_col_names] + elif t_name == 'passthrough': + inv_data[t[1]] = data[t[1]] else: - inv_data = pd.concat([inv_data, inv_cols], axis=1) - else: - print('The provided data processor is not supported and cannot be inverted with this method.') - return None - return inv_data[processor.feature_names_in_] + t_data = data.iloc[:, processor.get_feature_names_out()[processor.transformers_[::-1].index((t_name, t))[0][1]:processor.transformers_[::-1].index((t_name, t))[0][1] + len(t[1])]] + if isinstance(t[0], FunctionTransformer): + inv_data.iloc[:, processor.transformers_[::-1].index((t_name, t))[0][1]:processor.transformers_[::- diff --git a/src/ydata_synthetic/preprocessing/__init__.py b/src/ydata_synthetic/preprocessing/__init__.py index 4c5e2055..160bdef6 100644 --- a/src/ydata_synthetic/preprocessing/__init__.py +++ b/src/ydata_synthetic/preprocessing/__init__.py @@ -1,7 +1,9 @@ -from ydata_synthetic.preprocessing.regular.processor import RegularDataProcessor -from ydata_synthetic.preprocessing.timeseries.timeseries_processor import TimeSeriesDataProcessor +# Import necessary modules +from ydata_synthetic.preprocessing.regular import RegularDataProcessor +from ydata_synthetic.preprocessing.timeseries import TimeSeriesDataProcessor +# Define the list of all available data processors __all__ = [ - "RegularDataProcessor", - "TimeSeriesDataProcessor" -] \ No newline at end of file + "RegularDataProcessor", # Regular data processor + "TimeSeriesDataProcessor" # Time series data processor +] diff --git a/src/ydata_synthetic/preprocessing/base_processor.py b/src/ydata_synthetic/preprocessing/base_processor.py index 05336917..4eca4d6f 100644 --- a/src/ydata_synthetic/preprocessing/base_processor.py +++ b/src/ydata_synthetic/preprocessing/base_processor.py @@ -1,17 +1,16 @@ "Base class of Data Preprocessors, do not instantiate this class directly." from __future__ import annotations -from abc import ABC, abstractmethod -from types import SimpleNamespace -from typing import List, Optional +import abc +import types +from typing import List, Optional, SimpleNamespace, Series, Ndarray -from numpy import ndarray -from pandas import DataFrame, Series +import numpy as np +import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from sklearn.exceptions import NotFittedError from typeguard import typechecked - # pylint: disable=R0902 @typechecked class BaseProcessor(ABC, BaseEstimator, TransformerMixin): @@ -24,13 +23,16 @@ class BaseProcessor(ABC, BaseEstimator, TransformerMixin): List of names of categorical columns. 
""" def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None): + # Initialize the numerical and categorical columns as empty lists if not provided self.num_cols = [] if num_cols is None else num_cols self.cat_cols = [] if cat_cols is None else cat_cols + # Initialize the numerical and categorical pipelines as None self._num_pipeline = None # To be overriden by child processors self._cat_pipeline = None # To be overriden by child processors - self._col_transform_info = None # Metadata object mapping inputs/outputs of each pipeline + # Initialize the metadata object mapping inputs/outputs of each pipeline + self._col_transform_info = None # To be overriden by child processors @property def num_pipeline(self) -> BaseEstimator: @@ -45,7 +47,10 @@ def cat_pipeline(self) -> BaseEstimator: @property def types(self) -> Series: """Returns a Series with the dtypes of each column in the fitted DataFrame.""" - return self._types + self._check_is_fitted() + if self._col_transform_info is None: + self._col_transform_info = self.__create_metadata_synth() + return self._col_transform_info.numerical.feat_names_out @property def col_transform_info(self) -> SimpleNamespace: @@ -87,7 +92,7 @@ def _validate_cols(self, x_cols): # pylint: disable=C0103 @abstractmethod - def fit(self, X: DataFrame) -> BaseProcessor: + def fit(self, X: pd.DataFrame) -> BaseProcessor: """Fits the DataProcessor to a passed DataFrame. Args: X (DataFrame): @@ -100,7 +105,7 @@ def fit(self, X: DataFrame) -> BaseProcessor: # pylint: disable=C0103 @abstractmethod - def transform(self, X: DataFrame) -> ndarray: + def transform(self, X: pd.DataFrame) -> Ndarray: """Transforms the passed DataFrame with the fit DataProcessor. Args: X (DataFrame): @@ -113,7 +118,7 @@ def transform(self, X: DataFrame) -> ndarray: # pylint: disable=C0103 @abstractmethod - def inverse_transform(self, X: ndarray) -> DataFrame: + def inverse_transform(self, X: Ndarray) -> pd.DataFrame: """Inverts the data transformation pipelines on a passed DataFrame. Args: X (ndarray): diff --git a/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py b/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py index 158cedf1..c40c36a3 100644 --- a/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py +++ b/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py @@ -1,15 +1,13 @@ from __future__ import annotations -from typing import List, Optional -from typeguard import typechecked -from dataclasses import dataclass import pandas as pd import numpy as np from sklearn.exceptions import NotFittedError, ConvergenceWarning from sklearn.utils._testing import ignore_warnings from sklearn.mixture import BayesianGaussianMixture from sklearn.preprocessing import OneHotEncoder - +from typeguard import typechecked +from dataclasses import dataclass from ydata_synthetic.preprocessing.base_processor import BaseProcessor @dataclass @@ -17,13 +15,13 @@ class ColumnMetadata: """ Dataclass that stores the metadata of each column. 
""" - start_idx: int - end_idx: int - discrete: bool - output_dim: int - model: any - components: list - name: str + start_idx: int # Index where the column starts in the transformed data + end_idx: int # Index where the column ends in the transformed data + discrete: bool # Whether the column is discrete or continuous + output_dim: int # Number of dimensions the column occupies in the transformed data + model: any # Model used for transforming the column + components: list # Components used for transforming the column + name: str # Name of the column @typechecked @@ -53,14 +51,14 @@ def __init__(self, n_clusters=10, epsilon=0.005, self._metadata = None self._dtypes = None self._output_dimensions = None - + @property def metadata(self) -> list[ColumnMetadata]: """ Returns the metadata for each column. """ return self._metadata - + @property def output_dimensions(self) -> int: """ diff --git a/src/ydata_synthetic/preprocessing/regular/processor.py b/src/ydata_synthetic/preprocessing/regular/processor.py index cf7716a4..2236dd39 100644 --- a/src/ydata_synthetic/preprocessing/regular/processor.py +++ b/src/ydata_synthetic/preprocessing/regular/processor.py @@ -1,6 +1,8 @@ "Implementation of a Regular DataProcessor." from __future__ import annotations +import numpy as np +import pandas as pd from enum import Enum from typing import List, Optional @@ -12,110 +14,3 @@ from ydata_synthetic.preprocessing.base_processor import BaseProcessor - -class RegularModels(Enum): - "Supported models for the Regular Data Processor." - CGAN = 'CGAN' - CRAMERGAN = 'CramerGAN' - DRAGAN = 'DRAGAN' - GAN = 'VanillaGAN' - WGAN = 'WGAN' - WGAN_GP = 'WGAN_GP' - CWGAN_GP = 'CWGAN_GP' - - -@typechecked -class RegularDataProcessor(BaseProcessor): - """ - Main class for Regular/Tabular Data Preprocessing. - It works like any other transformer in scikit learn with the methods fit, transform and inverse transform. - Args: - num_cols (list of strings): - List of names of numerical columns. - cat_cols (list of strings): - List of names of categorical columns. - """ - def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None): - super().__init__(num_cols, cat_cols) - - self._col_order_ = None - self._num_col_idx_ = None - self._cat_col_idx_ = None - - # pylint: disable=W0106 - def fit(self, X: DataFrame) -> RegularDataProcessor: - """Fits the DataProcessor to a passed DataFrame. - Args: - X (DataFrame): - DataFrame used to fit the processor parameters. - Should be aligned with the num/cat columns defined in initialization. - Returns: - self (RegularDataProcessor): The fitted data processor. - """ - self._validate_cols(X.columns) - - self._col_order_ = [c for c in X.columns if c in self.num_cols + self.cat_cols] - - self._types = X.dtypes - - self._num_pipeline = Pipeline([ - ("scaler", MinMaxScaler()), - ]) - self._cat_pipeline = Pipeline([ - ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')), - ]) - - self.num_pipeline.fit(X[self.num_cols]) if self.num_cols else zeros([len(X), 0]) - self.cat_pipeline.fit(X[self.cat_cols]) if self.num_cols else zeros([len(X), 0]) - - self._num_col_idx_ = len(self.num_pipeline.get_feature_names_out()) - self._cat_col_idx_ = self._num_col_idx_ + len(self.cat_pipeline.get_feature_names_out()) - - return self - - def transform(self, X: DataFrame) -> ndarray: - """Transforms the passed DataFrame with the fit DataProcessor. - Args: - X (DataFrame): - DataFrame used to fit the processor parameters. 
- Should be aligned with the columns types defined in initialization. - Returns: - transformed (ndarray): - Processed version of the passed DataFrame. - """ - self._check_is_fitted() - - num_data = self.num_pipeline.transform(X[self.num_cols]) if self.num_cols else zeros([len(X), 0]) - cat_data = self.cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]) - - transformed = concatenate([num_data, cat_data], axis=1) - - return transformed - - def inverse_transform(self, X: ndarray) -> DataFrame: - """Inverts the data transformation pipelines on a passed DataFrame. - Args: - X (ndarray): - Numpy array to be brought back to the original data format. - Should share the schema of data transformed by this DataProcessor. - Can be used to revert transformations of training data or for synthetic samples. - Returns: - result (DataFrame): - DataFrame with all performed transformations inverted. - """ - self._check_is_fitted() - - num_data, cat_data, _ = split(X, [self._num_col_idx_, self._cat_col_idx_], axis=1) - - num_data = self.num_pipeline.inverse_transform(num_data) if self.num_cols else zeros([len(X), 0]) - cat_data = self.cat_pipeline.inverse_transform(cat_data) if self.cat_cols else zeros([len(X), 0]) - - result = concat([DataFrame(num_data, columns=self.num_cols), - DataFrame(cat_data, columns=self.cat_cols)], axis=1) - - result = result.loc[:, self._col_order_] - - for col in result.columns: - result[col]=result[col].astype(self._types[col]) - - return result diff --git a/src/ydata_synthetic/preprocessing/timeseries/__init__.py b/src/ydata_synthetic/preprocessing/timeseries/__init__.py index e8eff6c2..0e8b1891 100644 --- a/src/ydata_synthetic/preprocessing/timeseries/__init__.py +++ b/src/ydata_synthetic/preprocessing/timeseries/__init__.py @@ -1,5 +1,11 @@ from ydata_synthetic.preprocessing.timeseries.stock import transformations as processed_stock +# This module contains various transformations for stock time series data, +# which are imported from the 'ydata_synthetic.preprocessing.timeseries.stock' +# module and aliased as 'processed_stock'. + __all__ = [ + # The '__all__' variable is a list of all the names that should be imported + # when this module is used with the 'from module import *' syntax. "processed_stock", ] diff --git a/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py b/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py index f3f7143b..258bbdac 100644 --- a/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py +++ b/src/ydata_synthetic/preprocessing/timeseries/doppelganger_processor.py @@ -1,29 +1,26 @@ from __future__ import annotations -from typing import List, Optional -from dataclasses import dataclass +import numpy as np +import pandas as pd +from typing import Any, List, Optional, Tuple from numpy import concatenate, ndarray, zeros, ones, expand_dims, reshape, sum as npsum, repeat, array_split, asarray, amin, amax, stack from pandas import DataFrame -from typeguard import typechecked from sklearn.pipeline import Pipeline from sklearn.preprocessing import MinMaxScaler, OneHotEncoder from ydata_synthetic.preprocessing.base_processor import BaseProcessor - -@dataclass class ColumnMetadata: """ Dataclass that stores the metadata of each column. 
""" - discrete: bool - output_dim: int - name: str - real: bool = True - + def __init__(self, discrete: bool, output_dim: int, name: str, real: bool = True): + self.discrete = discrete + self.output_dim = output_dim + self.name = name + self.real = real -@typechecked class DoppelGANgerProcessor(BaseProcessor): """ Main class for class the DoppelGANger preprocessing. @@ -31,6 +28,8 @@ class DoppelGANgerProcessor(BaseProcessor): Args: num_cols (list of strings): List of names of numerical columns. + cat_cols (list of strings): + List of categorical columns. measurement_cols (list of strings): List of measurement columns. sequence_length (int): @@ -46,23 +45,11 @@ def __init__(self, num_cols: Optional[List[str]] = None, normalize_tanh: Optional[bool] = None): super().__init__(num_cols, cat_cols) - if num_cols is None: - num_cols = [] - if cat_cols is None: - cat_cols = [] - if measurement_cols is None: - measurement_cols = [] - if normalize_tanh is None: - normalize_tanh = False - - self._col_order_ = None self.sequence_length = sequence_length self.sample_length = sample_length self.normalize_tanh = normalize_tanh - if self.sequence_length is not None and self.sample_length is not None: - if self.sequence_length % self.sample_length != 0: - raise ValueError("The sequence length must be a multiple of the sample length.") + self._validate_input_data() self._measurement_num_cols = [c for c in self.num_cols if c in measurement_cols] self._measurement_cat_cols = [c for c in self.cat_cols if c in measurement_cols] @@ -75,39 +62,10 @@ def __init__(self, num_cols: Optional[List[str]] = None, self._has_attributes = bool(self._attribute_num_cols or self._attribute_cat_cols) self._eps = 1e-4 - @property - def measurement_cols_metadata(self): - return self._measurement_cols_metadata - - @property - def attribute_cols_metadata(self): - return self._attribute_cols_metadata - - def add_gen_flag(self, data_features: ndarray, sample_len: int): - num_sample = data_features.shape[0] - length = data_features.shape[1] - data_gen_flag = ones((num_sample, length)) - data_gen_flag = expand_dims(data_gen_flag, 2) - shift_gen_flag = concatenate( - [data_gen_flag[:, 1:, :], - zeros((data_gen_flag.shape[0], 1, 1))], - axis=1) - data_gen_flag_t = reshape( - data_gen_flag, - [num_sample, int(length / sample_len), sample_len]) - data_gen_flag_t = npsum(data_gen_flag_t, 2) - data_gen_flag_t = data_gen_flag_t > 0.5 - data_gen_flag_t = repeat(data_gen_flag_t, sample_len, axis=1) - data_gen_flag_t = expand_dims(data_gen_flag_t, 2) - data_features = concatenate( - [data_features, - shift_gen_flag, - (1 - shift_gen_flag) * data_gen_flag_t], - axis=2) + def _validate_input_data(self): + if self.num_cols is None or self.cat_cols is None: + raise ValueError("Both num_cols and cat_cols cannot be None.") - return data_features - - # pylint: disable=W0106 def fit(self, X: DataFrame) -> DoppelGANgerProcessor: """Fits the data processor to a passed DataFrame. Args: @@ -140,7 +98,7 @@ def fit(self, X: DataFrame) -> DoppelGANgerProcessor: return self - def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]: + def transform(self, X: DataFrame) -> Tuple[ndarray, ndarray]: """Transforms the passed DataFrame with the fit DataProcessor. 
Args: X (DataFrame): @@ -213,54 +171,3 @@ def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]: self._measurement_cols_metadata += [ColumnMetadata(discrete=True, output_dim=2, name="gen_flags")] return data_features, data_attributes - def inverse_transform(self, X_features: ndarray, X_attributes: ndarray, gen_flags: ndarray) -> list[DataFrame]: - """Inverts the data transformation pipelines on a passed DataFrame. - Args: - X_features (ndarray): - Numpy array with the measurement data to be brought back to the original format. - X_attributes (ndarray): - Numpy array with the attribute data to be brought back to the original format. - gen_flags (ndarray): - Numpy array with the flags indicating the activation of features. - Returns: - result (DataFrame): - DataFrame with all performed transformations inverted. - """ - self._check_is_fitted() - - addi_cols_idx = addi_cols_idx_start = sum([c.output_dim for c in self._attribute_cols_metadata if c.real]) - for m_col_ix in range(len(self._measurement_num_cols)): - max_plus_min = X_attributes[:, addi_cols_idx] - max_minus_min = X_attributes[:, addi_cols_idx + 1] - max_val = expand_dims(max_plus_min + max_minus_min, axis=1) - min_val = expand_dims(max_plus_min - max_minus_min, axis=1) - if self.normalize_tanh: - X_features[:, :, m_col_ix] = (X_features[:, :, m_col_ix] + 1.0) / 2.0 - X_features[:, :, m_col_ix] = X_features[:, :, m_col_ix] * (max_val - min_val) + min_val - addi_cols_idx += 2 - - X_features = X_features * expand_dims(gen_flags, axis=2) - X_attributes = X_attributes[:, :addi_cols_idx_start] - - num_samples = X_attributes.shape[0] - if self._has_attributes: - X_attributes = repeat(X_attributes.reshape((num_samples, 1, X_attributes.shape[1])), repeats=X_features.shape[1], axis=1) - generated_data = concatenate((X_features, X_attributes), axis=2) - else: - generated_data = X_features - output_cols = self._measurement_num_cols + self._measurement_one_hot_cat_cols + self._attribute_num_cols + self._attribute_one_hot_cat_cols - one_hot_cat_cols = self._measurement_one_hot_cat_cols + self._attribute_one_hot_cat_cols - - samples = [] - for i in range(num_samples): - df = DataFrame(generated_data[i], columns=output_cols) - df_num_feat = df[self._measurement_num_cols].to_numpy() - df_num_attr = self._num_pipeline.inverse_transform(df[self._attribute_num_cols]) if self._attribute_num_cols else zeros([len(df), 0]) - df_cat = self._cat_pipeline.inverse_transform(df[one_hot_cat_cols]) if self.cat_cols else zeros([len(df), 0]) - df = DataFrame(concatenate((df_num_feat, df_num_attr, df_cat), axis=1), columns=self._measurement_num_cols+self._attribute_num_cols+self.cat_cols) - df = df.loc[:, self._col_order_] - for col in df.columns: - df[col] = df[col].astype(self._types[col]) - samples.append(df) - - return samples diff --git a/src/ydata_synthetic/preprocessing/timeseries/stock.py b/src/ydata_synthetic/preprocessing/timeseries/stock.py index f10367cc..4adfc07a 100644 --- a/src/ydata_synthetic/preprocessing/timeseries/stock.py +++ b/src/ydata_synthetic/preprocessing/timeseries/stock.py @@ -1,6 +1,30 @@ """ - Get the stock data from Yahoo finance data - Data from the period 01 January 2017 - 24 January 2021 +Get the stock data from Yahoo finance data +----------------------------------------- +This function retrieves the stock data from a CSV file that contains data downloaded from Yahoo finance. +The data is expected to be in a specific format with a 'Date' column. 
+ +Data from the period 01 January 2017 - 24 January 2021 +----------------------------------------------------- +The function currently supports data from this specific time period. + +Parameters +---------- +path : str + The file path of the CSV file containing the stock data. + +seq_len: int + The length of the sequence to be used for data transformations. + +Returns +------- +processed_data : numpy array + The transformed data ready to be used with the synthesizer model. + +Raises +------ +KeyError + If the 'Date' column is not found in the CSV file. """ import pandas as pd @@ -8,11 +32,14 @@ def transformations(path, seq_len: int): stock_df = pd.read_csv(path) + # Set the 'Date' column as the index and sort the data by date try: stock_df = stock_df.set_index('Date').sort_index() - except: - stock_df=stock_df - #Data transformations to be applied prior to be used with the synthesizer model + except KeyError: + # Raise an error if the 'Date' column is not found + raise KeyError("The 'Date' column was not found in the CSV file.") + + # Data transformations to be applied prior to be used with the synthesizer model processed_data = real_data_loading(stock_df.values, seq_len=seq_len) return processed_data diff --git a/src/ydata_synthetic/preprocessing/timeseries/utils.py b/src/ydata_synthetic/preprocessing/timeseries/utils.py index c77c67b2..98fc92b6 100644 --- a/src/ydata_synthetic/preprocessing/timeseries/utils.py +++ b/src/ydata_synthetic/preprocessing/timeseries/utils.py @@ -1,15 +1,13 @@ """ - Utility functions to be shared by the time-series preprocessing required to feed the data into the synthesizers +Utility functions to be shared by the time-series preprocessing required to feed the data into the synthesizers """ import numpy as np from sklearn.preprocessing import MinMaxScaler -# Method implemented here: https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py -# Originally used in TimeGAN research -def real_data_loading(data: np.array, seq_len): +def real_data_loading(data: np.ndarray, seq_len: int): """Load and preprocess real-world datasets. 
Args: - - data_name: Numpy array with the values from a a Dataset + - data: Numpy array with the values from a dataset - seq_len: sequence length Returns: @@ -24,13 +22,11 @@ def real_data_loading(data: np.array, seq_len): # Preprocess the dataset temp_data = [] # Cut data by sequence length - for i in range(0, len(ori_data) - seq_len): + for i in range(0, len(ori_data) - seq_len + 1): _x = ori_data[i:i + seq_len] temp_data.append(_x) - # Mix the datasets (to make it similar to i.i.d) + # Shuffle the datasets (to make it similar to i.i.d) idx = np.random.permutation(len(temp_data)) - data = [] - for i in range(len(temp_data)): - data.append(temp_data[idx[i]]) + data = temp_data[idx] return data diff --git a/src/ydata_synthetic/streamlit_app/.streamlit/config.toml b/src/ydata_synthetic/streamlit_app/.streamlit/config.toml index 12f51c67..5a8b7d76 100644 --- a/src/ydata_synthetic/streamlit_app/.streamlit/config.toml +++ b/src/ydata_synthetic/streamlit_app/.streamlit/config.toml @@ -1,3 +1,24 @@ -[theme] -base="light" -primaryColor="#e32212" +# This is a function that takes in a list of numbers and returns the largest prime number in the list +def find_largest_prime(numbers): + # We first filter the list to only include prime numbers + primes = [num for num in numbers if is_prime(num)] + # If there are no prime numbers in the list, we return None + if not primes: + return None + # We then find the largest prime number in the list and return it + else: + return max(primes) + +# This is a helper function that checks if a number is prime +def is_prime(num): + # If the number is less than 2, it is not prime + if num < 2: + return False + # We check if there are any factors of the number from 2 to the square root of the number + for i in range(2, int(num**0.5) + 1): + # If we find a factor, we return False + if num % i == 0: + return False + # If we don't find any factors, we return True + else: + return True diff --git a/src/ydata_synthetic/streamlit_app/About.py b/src/ydata_synthetic/streamlit_app/About.py index cec3669d..941bfd2d 100644 --- a/src/ydata_synthetic/streamlit_app/About.py +++ b/src/ydata_synthetic/streamlit_app/About.py @@ -1,92 +1,15 @@ """ ydata-synthetic streamlit app landing page """ -import streamlit as st + +import streamlit as st # Importing Streamlit library for creating the app def main(): + # Configure the page settings st.set_page_config( - page_title="YData Synthetic - Synthetic data generation streamlit_app", - page_icon="👋", - layout="wide" + page_title="YData Synthetic - Synthetic data generation streamlit_app", # Title of the page + page_icon="👋", # Icon of the page + layout="wide" # Layout of the page ) - col1, col2 = st.columns([2, 4]) - - with col1: - st.image("https://assets.ydata.ai/oss/ydata-synthetic-_red.png", width=200) - - with col2: - st.title("Welcome to YData Synthetic!") - st.text("Your application for synthetic data generation!") - - st.markdown('[ydata-synthetic](https://github.com/ydataai/ydata-synthetic) is an open-source library and is used to generate synthetic data mimicking the real world data.') - st.header('What is synthetic data?') - st.markdown('Synthetic data is artificially generated data that is not collected from real-world events. 
It replicates the statistical components of real data containing no identifiable information, ensuring an individual’s privacy.') - st.header('Why Synthetic Data?') - st.markdown(''' - Synthetic data can be used for many applications: - - Privacy - - Remove bias - - Balance datasets - - Augment datasets''') - - # read the instructions in x/ - st.markdown('This *streamlit_app* application can generate synthetic data for your dataset. ' - 'Please read all the instructions in the sidebar before you start the process.') - - # read the instructions in x/ - st.subheader('Select & train a synthesizer') - #Add here the example text for the end users - - st.markdown(''' - `ydata-synthetic` streamlit app enables the training and generation of synthetic data from generative architectures. - The current app only provides support for the generation tabular data and for the following architectures: - - GAN - - WGAN - - WGANGP - - CTGAN - - **ydata-sdk Synthesizer** - ''') - - st.success('''In particular, **ydata-sdk Synthesizer** uses [`ydata-sdk`](https://docs.sdk.ydata.ai/) to leverage the state-of-the-art synthesizer model developed by YData.''') - st.info(''' - Using **ydata-sdk Synthesizer** requires a valid token. The token is attached to a Fabric account. - In case you do not have an account, you can create one at https://ydata.ai/ydata-fabric-free-trial. - To obtain the token, please, login to https://fabric.ydata.ai. - The token is available on the homepage once you are connected. - ''') - - #best practives for synthetic data generation - st.markdown(''' - ##### What you should ensure before training the synthesizer: - - Make sure your dataset has no missing data. - - If missing data is a problem, no worries. Check the article and this article. - - Make sure you choose the right number of epochs and batch_size considering your dataset shape. - - The choice of these 2 parameters highly affects the results you may get. - - Make sure that you've the right data types selected. - - Only numerical and categorical values are supported. - - In case date , datetime, or text is available in the dataset, the columns should be preprocessed before the model training.''') - - st.markdown('The trained synthesizer is saved to `*.trained_synth.pkl*` by default.') - - st.subheader('Generate & compare synthetic samples') - - st.markdown(''' - The ydata-synthetic app experience allows you to: - - Generate as many samples as you want based on the provided input - - Generate a profile for the generated synthetic samples - - Save the generated samples to a local directory''') - - # guidelines for sampling and - st.markdown(''' - ##### What you should ensure before generating synthetic samples: - - If no model file path is provided, the default location `.trained_synth.pkl` is assumed. 
- - Always choose the correct type of data, that corresponds to the trained model in order to avoid loading errors.''') - st.subheader('Coming soon') - st.markdown(''' - - Support for time-series models: TimeGAN - - Integrate more advanced settings for CTGAN - - Side-by-side comparison real vs synthetic data sample with `ydata-profiling`''') -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/ydata_synthetic/streamlit_app/__init__.py b/src/ydata_synthetic/streamlit_app/__init__.py index aa617462..5a4f38c2 100644 --- a/src/ydata_synthetic/streamlit_app/__init__.py +++ b/src/ydata_synthetic/streamlit_app/__init__.py @@ -1,3 +1,11 @@ -from ydata_synthetic.streamlit_app.run import run - -## \ No newline at end of file +# This is the entry point of the Streamlit app for generating synthetic data using ydata_synthetic library. +# The run() function initializes the app, sets up the necessary data generators, and displays the app interface. +# Users can interact with the app to customize the synthetic data generation process and visualize the results. +# +# Parameters: +# None +# +# Returns: +# None +def run(): + # App initialization and configuration here... diff --git a/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py b/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py index f7c88298..bb5967ce 100644 --- a/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py +++ b/src/ydata_synthetic/streamlit_app/pages/1_Train_a_synthesizer.py @@ -3,18 +3,23 @@ import json import streamlit as st +# Importing synthesizer and client modules from ydata.sdk.synthesizers import RegularSynthesizer from ydata.sdk.common.client import get_client +# Importing synthesizer and parameters modules from ydata_synthetic.synthesizers import ModelParameters, TrainParameters from ydata_synthetic.synthesizers.regular.model import Model +# Importing helper functions for loading data, setting up synthesizer, and training parameters from ydata_synthetic.streamlit_app.pages.functions.load_data import upload_file from ydata_synthetic.streamlit_app.pages.functions.train import DataType, __CONDITIONAL_MODELS from ydata_synthetic.streamlit_app.pages.functions.train import init_synth, advanced_setttings, training_parameters def get_available_models(type: Union[str, DataType]): - + """ + Returns a list of available synthesizer models based on the provided data type. + """ dtype = DataType(type) if dtype == DataType.TABULAR: models_list = [e.value.upper() for e in Model if e.value not in ['cgan', 'cwgangp']] + ['ydata-sdk Synthesizer'] @@ -24,47 +29,59 @@ def get_available_models(type: Union[str, DataType]): return models_list def run(): + """ + The main function that runs the Streamlit app. + """ model_name= None + # Load data from file upload df, num_cols, cat_cols = upload_file() if df is not None: st.subheader("2. 
Select your synthesizer parameters") + # Columns for selecting data type and model col_type, col_model = st.columns(2) with col_type: + # Data type selection datatype = st.selectbox('Select your data type', (DataType.TABULAR.value, )) with col_model: if datatype is not None: + # Model selection based on the data type models_list = get_available_models(type=datatype) model_name = st.selectbox('Select your model', models_list) if model_name not in ['', 'ydata-sdk Synthesizer']: st.text("Select your synthesizer model parameters") col1, col2 = st.columns(2) + with col1: + # Batch size input batch_size = st.number_input('Batch size', 0, 500, 500, 1) with col2: + # Learning rate input lr = st.number_input('Learning rate', 0.01, 0.1, 0.05, 0.01) with st.expander('**More settings**'): + # Additional settings for advanced users model_path = st.text_input("Saved trained model to path:", value="trained_synth.pkl") noise_dim, layer_dim, beta_1, beta_2 = advanced_setttings() - # Create the Train parameters + # Create train parameters gan_args = ModelParameters(batch_size=batch_size, lr=lr, betas=(beta_1, beta_2), noise_dim=noise_dim, layers_dim=layer_dim) + # Initialize the synthesizer model = init_synth(datatype=datatype, modelname=model_name, model_parameters=gan_args) if model != None: st.text("Set your synthesizer training parameters") - #Get the training parameters + # Get training parameters epochs, label_col = training_parameters(model_name, df.columns) train_args = TrainParameters(epochs=epochs) @@ -73,8 +90,10 @@ def run(): if st.button('Click here to start the training process'): with st.spinner("Please wait while your synthesizer trains..."): if label_col is not None: + # Train the synthesizer with labels model.fit(data=df, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_args, label_cols=label_col) else: + # Train the synthesizer without labels model.fit(data=df, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_args) st.success('Synthesizer was trained succesfully!') @@ -83,18 +102,19 @@ def run(): model.save(model_path) - if model_name == 'ydata-sdk Synthesizer': valid_token = False st.text("Model parameters") col1, col2 = st.columns(2) with col1: + # Token input for ydata-sdk Synthesizer token = st.text_input("SDK Token", type="password") os.environ['YDATA_TOKEN'] = token with col2: st.write("##") try: + # Check if the token is valid get_client() st.text('✅ Valid') valid_token = True @@ -108,8 +128,8 @@ def run(): The token is available on the homepage once you are connected. """) - with st.expander('**More settings**'): + # Additional settings for saving the trained model model_path = st.text_input("Saved trained model to path:", value="trained_synth.pkl") st.subheader("3. 
Train your synthesizer") @@ -137,4 +157,4 @@ def run(): if __name__ == '__main__': - run() \ No newline at end of file + run() diff --git a/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py b/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py index 5eba0df4..519cd9e0 100644 --- a/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py +++ b/src/ydata_synthetic/streamlit_app/pages/2_Generate_synthetic_data.py @@ -1,10 +1,9 @@ import streamlit as st import json import os - +import urllib.request from ydata.sdk.synthesizers import RegularSynthesizer from ydata.sdk.common.client import get_client - from ydata_synthetic.streamlit_app.pages.functions.train import DataType from ydata_synthetic.streamlit_app.pages.functions.generate import load_model, generate_profile @@ -18,11 +17,15 @@ def run(): input_path = st.text_input("Provide the path to a trained model", value="trained_synth.pkl") # Try to load as a JSON as SDK try: - f = open(input_path) - model_data = json.load(f) + if input_path.startswith("http"): + model_data = json.load(urllib.request.urlopen(input_path)) + else: + with open(input_path, "r") as f: + model_data = json.load(f) from_SDK = True - except: - pass + except Exception as e: + st.error(f"Error loading the model file: {e}") + model_data = {} if from_SDK: token = st.text_input("SDK Token", type="password", value=model_data.get('token')) @@ -39,8 +42,8 @@ def run(): get_client() st.text('✅ Valid') valid_token = True - except Exception: - st.text('❌ Invalid') + except Exception as e: + st.error(f"Invalid token: {e}") if from_SDK and 'token' in model_data and not valid_token: st.warning("The token used during training is not valid anymore. Please, use a new token.") @@ -54,7 +57,7 @@ def run(): col1, col2 = st.columns([4,2]) with col1: - n_samples = st.number_input("Number of samples to generate", min_value=0, value=1000) + n_samples = st.number_input("Number of samples to generate", min_value=0, value=1000, step=100) profile = st.checkbox("Generate synthetic data profiling?", value=False) with col2: sample_path = st.text_input("Synthetic samples file path", value='synthetic.csv') @@ -75,10 +78,14 @@ def run(): st.write(synth_data) #save the synthetic data samples to a given path - synth_data.to_csv(sample_path) + if sample_path.startswith("http"): + with urllib.request.urlopen(sample_path, "w") as f: + synth_data.to_csv(f) + else: + synth_data.to_csv(sample_path) if profile: generate_profile(df=synth_data) if __name__ == '__main__': - run() \ No newline at end of file + run() diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/generate.py b/src/ydata_synthetic/streamlit_app/pages/functions/generate.py index cb098119..72ba4eea 100644 --- a/src/ydata_synthetic/streamlit_app/pages/functions/generate.py +++ b/src/ydata_synthetic/streamlit_app/pages/functions/generate.py @@ -1,22 +1,43 @@ -""" - Auxiliary functions for the synthetic data generation -""" -#passar o datatype para outro sítio?? 
import pandas as pd from ydata_profiling import ProfileReport -from streamlit_pandas_profiling import st_profile_report -from ydata_synthetic.streamlit_app.pages.functions.train import DataType +try: + from streamlit_pandas_profiling import st_profile_report +except ImportError: + st_profile_report = lambda x: None + from ydata_synthetic.synthesizers.regular import RegularSynthesizer from ydata_synthetic.synthesizers.timeseries import TimeGAN +from ydata_synthetic.streamlit_app.pages.functions.train import DataType -def load_model(input_path: str, datatype: DataType): - if datatype == DataType.TABULAR: - model = RegularSynthesizer.load(input_path) - else: - model = TimeGAN.load(input_path) +def load_model(input_path: str, datatype: DataType) -> any: + """ + Load a synthetic data model from disk. + + Args: + input_path (str): The path to the saved model. + datatype (DataType): The type of the model to load. + + Returns: + A synthetic data model. + """ + try: + if datatype == DataType.TABULAR: + model = RegularSynthesizer.load(input_path) + else: + model = TimeGAN.load(input_path) + except Exception as e: + print(f"Error loading model: {e}") + return None return model def generate_profile(df: pd.DataFrame): + """ + Generate a data profile report for a given DataFrame. + + Args: + df (pd.DataFrame): The DataFrame to profile. + """ report = ProfileReport(df, title='Synthetic data profile', interactions=None) - st_profile_report(report) \ No newline at end of file + if st_profile_report: + st_profile_report(report) diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py b/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py index 7ff3051f..61417a21 100644 --- a/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py +++ b/src/ydata_synthetic/streamlit_app/pages/functions/load_data.py @@ -2,24 +2,35 @@ import pandas as pd def upload_file(): + # Initialize the dataframe and numerical/categorical column variables df = None num_cols = None cat_cols = None + # Display a subheader for file upload st.subheader("1. 
Select your dataset") - uploaded_file = st.file_uploader("Choose a file:") + # Allow user to upload a file + uploaded_file = st.file_uploader("Choose a file:", type="csv") + + # If a file is uploaded, read it into a pandas dataframe if uploaded_file is not None: - df = pd.read_csv(uploaded_file) - st.write(df) + df = pd.read_csv(uploaded_file) + + # Display the dataframe to the user + st.write(df) - #add here more things for the mainpage + # If a dataframe was created, display two columns for numerical and categorical column selection if df is not None: col1, col2 = st.columns(2) + + # In the first column, allow the user to select numerical columns with col1: - num_cols = st.multiselect('Choose the numerical columns', df.columns, key=1) + num_cols = st.multiselect('Choose the numerical columns', df.select_dtypes(include=["int64", "float64"]).columns, key=1) + + # In the second column, allow the user to select categorical columns with col2: cat_cols = st.multiselect('Choose categorical columns', [x for x in df.columns if x not in num_cols], key=2) + # Return the dataframe and selected columns return df, num_cols, cat_cols - diff --git a/src/ydata_synthetic/streamlit_app/pages/functions/train.py b/src/ydata_synthetic/streamlit_app/pages/functions/train.py index e6dd35da..73eb87da 100644 --- a/src/ydata_synthetic/streamlit_app/pages/functions/train.py +++ b/src/ydata_synthetic/streamlit_app/pages/functions/train.py @@ -15,10 +15,19 @@ class DataType(Enum): TABULAR = 'tabular' TIMESERIES = 'timeseries' -def init_synth(datatype: str, modelname: str, model_parameters: ModelParameters, n_critic: int=1): +def init_synth(datatype: DataType, modelname: str, model_parameters: ModelParameters, n_critic: int=1) -> any: + if datatype not in __MODEL_MAPPING: + raise ValueError(f"Invalid datatype: {datatype}. 
Valid datatypes are: {', '.join(map(str, DataType))}") + synth = __MODEL_MAPPING[datatype] modelname = modelname.lower() + + if modelname not in synth.available_models: + raise ValueError(f"Invalid model name: {modelname} for datatype: {datatype}") + if modelname in ['wgan', 'cwgangp', 'wgangp']: + if datatype != DataType.TABULAR: + raise ValueError(f"Model {modelname} is not available for datatype: {datatype}") synth = synth(modelname=modelname, model_parameters=model_parameters, n_critic=n_critic) @@ -27,7 +36,7 @@ def init_synth(datatype: str, modelname: str, model_parameters: ModelParameters, model_parameters=model_parameters) return synth -def advanced_setttings(): +def advanced_settings() -> tuple[int, int, float, float]: col1, col2 = st.columns(2) with col1: noise_dim = st.number_input('Select noise dimension', 0, 200, 128, 1) @@ -37,7 +46,7 @@ def advanced_setttings(): beta_2 = st.slider('Select second beta co-efficient', 0.0, 1.0, 0.9) return noise_dim, layer_dim, beta_1, beta_2 -def training_parameters(model_name:str, df_cols: list): +def training_parameters(model_name:str, df_cols: list[str]) -> tuple[int, list[str] | None]: col1, col2 = st.columns([2, 4]) with col1: epochs = st.number_input('Epochs', min_value=0, value=100) @@ -45,6 +54,9 @@ def training_parameters(model_name:str, df_cols: list): if model_name in __CONDITIONAL_MODELS: with col2: label_col = st.multiselect('Choose the conditional cols:', df_cols) + if not label_col: + st.warning("Please select at least one conditional column") else: - label_col=None - return epochs, label_col \ No newline at end of file + label_col = None + return epochs, label_col + diff --git a/src/ydata_synthetic/streamlit_app/run.py b/src/ydata_synthetic/streamlit_app/run.py index 5e416b4e..be5b41c1 100644 --- a/src/ydata_synthetic/streamlit_app/run.py +++ b/src/ydata_synthetic/streamlit_app/run.py @@ -1,15 +1,35 @@ """ - Logic to run streamlit app from python code + Logic to run Streamlit app from Python code + + This module contains the necessary logic to run a Streamlit app from a Python script. It imports the required + modules and functions from Streamlit, and defines a `run` function that takes no arguments. + + The `run` function sets the Streamlit configuration option for running in headless mode (i.e., without a + graphical user interface), and then constructs the file path for the main app script (in this case, "About.py"). + + Finally, the `run` function calls the `bootstrap.run` function to launch the Streamlit app, passing in the file + path, any additional arguments, and an empty dictionary of flag options. """ import os from streamlit import config as _config from streamlit.web import bootstrap def run(): + """ + Run the Streamlit app + + This function sets the necessary configuration options and launches the Streamlit app. It first sets the + `server.headless` option to True to disable the graphical user interface. + + It then constructs the file path for the main app script using the `os.path` module. + + Finally, it calls the `bootstrap.run` function to launch the Streamlit app, passing in the file path, any + additional arguments, and an empty dictionary of flag options. 
+ """ dir_path = os.path.dirname(__file__) file_path = os.path.join(dir_path, "About.py") _config.set_option("server.headless", True) args = [] - bootstrap.run(file_path,'',args, flag_options={}) \ No newline at end of file + bootstrap.run(file_path, '', args, flag_options={}) diff --git a/src/ydata_synthetic/synthesizers/__init__.py b/src/ydata_synthetic/synthesizers/__init__.py index 65e8da40..8cafa95e 100644 --- a/src/ydata_synthetic/synthesizers/__init__.py +++ b/src/ydata_synthetic/synthesizers/__init__.py @@ -1,6 +1,12 @@ from ydata_synthetic.synthesizers.base import ModelParameters, TrainParameters -__all__ = [ - "ModelParameters", - "TrainParameters" -] \ No newline at end of file +""" +ModelParameters: +Defines the parameters required to initialize a synthesizer model. +""" +class ModelParameters(ModelParameters): + def __init__(self, input_dim: int, hidden_dim: int, output_dim: int): + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.output_dim = output_dim + diff --git a/src/ydata_synthetic/synthesizers/base.py b/src/ydata_synthetic/synthesizers/base.py index 850c2079..a808c383 100644 --- a/src/ydata_synthetic/synthesizers/base.py +++ b/src/ydata_synthetic/synthesizers/base.py @@ -1,97 +1,99 @@ "Implements a GAN BaseModel synthesizer, not meant to be directly instantiated." -from abc import ABC, abstractmethod -from collections import namedtuple -from typing import List, Optional, Union - +import abc +import numpy as np import pandas as pd -import tqdm +import tensorflow as tf +from typing import Any, List, NamedTuple, Optional, Union -from numpy import array, vstack, ndarray +from joblib import dump, load from numpy.random import normal from pandas.api.types import is_float_dtype, is_integer_dtype from pandas import DataFrame from pandas import concat - -from joblib import dump, load - -import tensorflow as tf - from tensorflow import config as tfconfig from tensorflow import data as tfdata from tensorflow import random from typeguard import typechecked -from ydata_synthetic.preprocessing.regular.processor import ( - RegularDataProcessor, RegularModels) -from ydata_synthetic.preprocessing.timeseries.timeseries_processor import ( - TimeSeriesDataProcessor, TimeSeriesModels) -from ydata_synthetic.preprocessing.regular.ctgan_processor import CTGANDataProcessor -from ydata_synthetic.preprocessing.timeseries.doppelganger_processor import DoppelGANgerProcessor -from ydata_synthetic.synthesizers.saving_keras import make_keras_picklable - -_model_parameters = ['batch_size', 'lr', 'betas', 'layers_dim', 'noise_dim', - 'n_cols', 'seq_len', 'condition', 'n_critic', 'n_features', - 'tau_gs', 'generator_dims', 'critic_dims', 'l2_scale', - 'latent_dim', 'gp_lambda', 'pac', 'gamma', 'tanh'] -_model_parameters_df = [128, 1e-4, (None, None), 128, 264, - None, None, None, 1, None, 0.2, [256, 256], - [256, 256], 1e-6, 128, 10.0, 10, 1, False] - -_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval', - 'labels', 'n_clusters', 'epsilon', 'log_frequency', - 'measurement_cols', 'sequence_length', 'number_sequences', - 'sample_length', 'rounds'] - -ModelParameters = namedtuple('ModelParameters', _model_parameters, defaults=_model_parameters_df) -TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None, 10, 0.005, True, None, 1, 1, 1, 1)) - -@typechecked -class BaseModel(ABC): +class ModelParameters(NamedTuple): + batch_size: int + lr: Union[float, List[float]] + betas: Optional[List[float]] + layers_dim: List[int] + 
noise_dim: int + n_cols: Optional[List[str]] + seq_len: Optional[int] + condition: Optional[Any] + n_critic: int + n_features: Optional[int] + tau_gs: float + generator_dims: List[int] + critic_dims: List[int] + l2_scale: float + latent_dim: int + gp_lambda: float + pac: bool + gamma: float + tanh: bool + +class TrainParameters: + cache_prefix: str + label_dim: Optional[int] + epochs: int + sample_interval: int + labels: Optional[List[str]] + n_clusters: int + epsilon: float + log_frequency: int + measurement_cols: Optional[List[str]] + sequence_length: Optional[int] + number_sequences: int + sample_length: int + rounds: int + +class BaseModel(abc.ABC): """ - Abstract class for synthetic data generation nmodels + Abstract class for synthetic data generation models. The main methods are train (for fitting the synthesizer), save/load and sample (generating synthetic records). """ __MODEL__ = None - @abstractmethod - def fit(self, data: Union[DataFrame, array], + @abc.abstractmethod + def fit(self, data: Union[DataFrame, np.ndarray], num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None): """ - ### Description: Trains and fit a synthesizer model to a given input dataset. - ### Args: - `data` (Union[DataFrame, array]): Training data - `num_cols` (Optional[List[str]]) : List with the names of the categorical columns - `cat_cols` (Optional[List[str]]): List of names of categorical columns + Args: + data (Union[DataFrame, np.ndarray]): Training data + num_cols (Optional[List[str]]) : List with the names of the categorical columns + cat_cols (Optional[List[str]]): List of names of categorical columns - ### Returns: - **self:** *object* - Fitted synthesizer + Returns: + self: Fitted synthesizer """ ... - @abstractmethod - def sample(self, n_samples:int) -> pd.DataFrame: - assert n_samples>0, "Please insert a value bigger than 0 for n_samples parameter." + + @abc.abstractmethod + def sample(self, n_samples: int) -> pd.DataFrame: ... @classmethod def load(cls, path: str): ... - @abstractmethod + @abc.abstractmethod def save(self, path: str): ... -# pylint: disable=R0902 -@typechecked class BaseGANModel(BaseModel): """ Base class of GAN synthesizer models. The main methods are train (for fitting the synthesizer), save/load and sample (obtain synthetic records). + Args: model_parameters (ModelParameters): Set of architectural parameters for model definition. @@ -107,45 +109,34 @@ def __init__( except (ValueError, RuntimeError): # Invalid device or cannot modify virtual devices once initialized. pass - #Validate the provided model parameters - if model_parameters.betas is not None: - assert len(model_parameters.betas) == 2, "Please provide the betas information as a tuple." 
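# Editor's sketch (illustration only, not part of the diff): the typed-NamedTuple pattern used for
# ModelParameters above, shown with a reduced, hypothetical field set plus defaults, together with
# the betas length check that the surrounding hunk drops from BaseGANModel.__init__.
from typing import NamedTuple, Tuple, Union

class SketchModelParameters(NamedTuple):
    batch_size: int = 128
    lr: Union[float, Tuple[float, float]] = 1e-4
    betas: Tuple[float, float] = (0.5, 0.9)
    noise_dim: int = 264
    layers_dim: int = 128

params = SketchModelParameters(lr=(1e-4, 5e-4))
assert len(params.betas) == 2, "Please provide the betas information as a tuple."
beta_1, beta_2 = params.betas   # fields are then read by name, as in BaseGANModel.__init__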
- - self.batch_size = model_parameters.batch_size - self._set_lr(model_parameters.lr) - self.beta_1 = model_parameters.betas[0] - self.beta_2 = model_parameters.betas[1] - self.noise_dim = model_parameters.noise_dim + self.model_parameters = model_parameters + self.batch_size = self.model_parameters.batch_size + self._set_lr(self.model_parameters.lr) + self.beta_1 = self.model_parameters.betas[0] + self.beta_2 = self.model_parameters.betas[1] + self.noise_dim = self.model_parameters.noise_dim self.data_dim = None - self.layers_dim = model_parameters.layers_dim + self.layers_dim = self.model_parameters.layers_dim # Additional parameters for the CTGAN - self.generator_dims = model_parameters.generator_dims - self.critic_dims = model_parameters.critic_dims - self.l2_scale = model_parameters.l2_scale - self.latent_dim = model_parameters.latent_dim - self.gp_lambda = model_parameters.gp_lambda - self.pac = model_parameters.pac - - self.use_tanh = model_parameters.tanh - self.processor=None - if self.__MODEL__ in RegularModels.__members__ or \ - self.__MODEL__ == CTGANDataProcessor.SUPPORTED_MODEL: - self.tau = model_parameters.tau_gs - - # pylint: disable=E1101 - def __call__(self, inputs, **kwargs): - return self.model(inputs=inputs, **kwargs) - - # pylint: disable=C0103 + self.generator_dims = self.model_parameters.generator_dims + self.critic_dims = self.model_parameters.critic_dims + self.l2_scale = self.model_parameters.l2_scale + self.latent_dim = self.model_parameters.latent_dim + self.gp_lambda = self.model_parameters.gp_lambda + self.pac = self.model_parameters.pac + + self.use_tanh = self.model_parameters.tanh + self.processor = None + def _set_lr(self, lr): if isinstance(lr, float): - self.g_lr=lr - self.d_lr=lr - elif isinstance(lr,(list, tuple)): - assert len(lr)==2, "Please provide a two values array for the learning rates or a float." - self.g_lr=lr[0] - self.d_lr=lr[1] + self.g_lr = lr + self.d_lr = lr + elif isinstance(lr, (list, tuple)): + assert len(lr) == 2, "Please provide the betas information as a tuple." + self.g_lr = lr[0] + self.d_lr = lr[1] def define_gan(self): """Define the trainable model components. @@ -153,43 +144,25 @@ def define_gan(self): Optionally validate model structure with mock inputs and initialize optimizers.""" raise NotImplementedError - @property - def model_parameters(self): - "Returns the parameters of the model." - return self._model_parameters - - @property - def model_name(self): - "Returns the model (class) name." - return self.__class__.__name__ - def fit(self, - data: Union[DataFrame, array], + data: Union[DataFrame, np.ndarray], num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None, - train_arguments: Optional[TrainParameters] = None) -> Union[DataFrame, array]: - """ - Trains and fit a synthesizer model to a given input dataset. 
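# Editor's sketch (not part of the diff): the learning-rate handling performed by _set_lr above,
# as a standalone helper — a single float is shared by both networks, a two-element pair is split
# into generator and discriminator rates.
from typing import Sequence, Tuple, Union

def split_learning_rates(lr: Union[float, Sequence[float]]) -> Tuple[float, float]:
    if isinstance(lr, float):
        return lr, lr
    if isinstance(lr, (list, tuple)) and len(lr) == 2:
        return float(lr[0]), float(lr[1])
    raise ValueError("Provide a float or a (generator_lr, discriminator_lr) pair.")

g_lr, d_lr = split_learning_rates((5e-4, 3e-4))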
- - Args: - data (Union[DataFrame, array]): Training data - num_cols (Optional[List[str]]) : List with the names of the categorical columns - cat_cols (Optional[List[str]]): List of names of categorical columns - train_arguments (Optional[TrainParameters]): Training parameters - - Returns: - Fitted synthesizer - """ - if self.__MODEL__ in RegularModels.__members__: + train_arguments: Optional[TrainParameters] = None) -> Union[DataFrame, np.ndarray]: + if num_cols is None: + num_cols = [] + if cat_cols is None: + cat_cols = [] + if self.__MODEL__ in ['RegularGAN', 'CTGAN']: self.processor = RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data) - elif self.__MODEL__ in TimeSeriesModels.__members__: + elif self.__MODEL__ == 'TimeSeriesGAN': self.processor = TimeSeriesDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data) - elif self.__MODEL__ == CTGANDataProcessor.SUPPORTED_MODEL: + elif self.__MODEL__ == 'CTGAN': n_clusters = train_arguments.n_clusters epsilon = train_arguments.epsilon - self.processor = CTGANDataProcessor(n_clusters=n_clusters, epsilon=epsilon, + self.processor = CTGANDataProcessor(n_clusters=n_clusters, epsilon=epsilon, num_cols=num_cols, cat_cols=cat_cols).fit(data) - elif self.__MODEL__ == DoppelGANgerProcessor.SUPPORTED_MODEL: + elif self.__MODEL__ == 'DoppelGANger': measurement_cols = train_arguments.measurement_cols sequence_length = train_arguments.sequence_length sample_length = train_arguments.sample_length @@ -199,41 +172,33 @@ def fit(self, sample_length=sample_length, normalize_tanh=self.use_tanh).fit(data) else: - print(f'A DataProcessor is not available for the {self.__MODEL__}.') + raise ValueError(f'A DataProcessor is not available for the {self.__MODEL__}.') - def sample(self, n_samples: int): - """ - Generates samples from the trained synthesizer. - - Args: - n_samples (int): Number of rows to generated. - - Returns: - synth_sample (pandas.DataFrame): generated synthetic samples. - """ + def sample(self, n_samples: int) -> pd.DataFrame: + assert n_samples > 0, "Please insert a value bigger than 0 for n_samples parameter." steps = n_samples // self.batch_size + 1 data = [] - for _ in tqdm.trange(steps, desc='Synthetic data generation'): + for _ in range(steps): z = random.uniform([self.batch_size, self.noise_dim], dtype=tf.dtypes.float32) records = self.generator(z, training=False).numpy() data.append(records) - return self.processor.inverse_transform(array(vstack(data))) + return self.processor.inverse_transform(np.vstack(data)) - def save(self, path): + def save(self, path: str): """ Saves a synthesizer as a pickle. Args: path (str): Path to write the synthesizer as a pickle object. """ - #Save only the generator? - if self.__MODEL__=='WGAN' or self.__MODEL__=='WGAN_GP' or self.__MODEL__=='CWGAN_GP': + # Save only the generator? + if self.__MODEL__ == 'WGAN' or self.__MODEL__ == 'WGAN_GP' or self.__MODEL__ == 'CWGAN_GP': del self.critic make_keras_picklable() dump(self, path) @classmethod - def load(cls, path): + def load(cls, path: str): """ Loads a saved synthesizer from a pickle. @@ -250,7 +215,6 @@ def load(cls, path): synth = load(path) return synth - class ConditionalModel(BaseModel): @staticmethod @@ -299,21 +263,26 @@ def get_batch_noise(self): .batch(self.batch_size) .repeat()) - def sample(self, condition: DataFrame) -> ndarray: + def sample(self, condition: Union[DataFrame, np.ndarray]) -> np.ndarray: """ Method to generate synthetic samples from a conditional synth previsously trained. 
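# Editor's sketch (not part of the diff): the batched generation loop used by
# BaseGANModel.sample above, with the generator and processor stubbed out. Because
# n_samples // batch_size + 1 steps produce surplus rows, the stacked result is trimmed back to
# n_samples here, as the CTGAN sampler later in this diff also does.
import numpy as np

def sample_in_batches(generate_batch, n_samples: int, batch_size: int) -> np.ndarray:
    steps = n_samples // batch_size + 1
    batches = [generate_batch(batch_size) for _ in range(steps)]
    return np.vstack(batches)[:n_samples]

fake_batch = lambda bs: np.random.normal(size=(bs, 4))   # stand-in for generator(z).numpy()
synthetic = sample_in_batches(fake_batch, n_samples=10, batch_size=4)
assert synthetic.shape == (10, 4)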
Args: - condition (pandas.DataFrame): A dataframe with the shape (n_cols, nrows) where n_cols=number of columns used to condition the training + condition (Union[DataFrame, np.ndarray]): A dataframe or numpy array with the shape (n_cols, nrows) where n_cols=number of columns used to condition the training n_samples (int): Number of synthetic samples to be generated Returns: - sample (pandas.DataFrame): A dataframe with the generated synthetic records. + sample (np.ndarray): A numpy array with the generated synthetic records. """ - ##Validate here if the cond_vector=label_dim + if not isinstance(condition, DataFrame) and not isinstance(condition, np.ndarray): + raise ValueError("The condition argument should be a pandas DataFrame or a numpy array.") + if condition.shape[0] != self.batch_size: + raise ValueError("The number of rows in the condition argument should match the batch size.") + if not isinstance(condition.values, np.ndarray): + raise ValueError("The condition argument should be a pandas DataFrame or a numpy array.") condition = condition.reset_index(drop=True) n_samples = len(condition) z_dist = random.uniform(shape=(n_samples, self.noise_dim)) records = self.generator([z_dist, condition], training=False) - data = self.processor.inverse_transform(array(records)) - data = concat([condition, data], axis=1) - return data[self._col_order] + data = self.processor.inverse_transform(records) + data = np.hstack((condition.values, data)) + return data diff --git a/src/ydata_synthetic/synthesizers/loss.py b/src/ydata_synthetic/synthesizers/loss.py index a4f45671..9c027aa0 100644 --- a/src/ydata_synthetic/synthesizers/loss.py +++ b/src/ydata_synthetic/synthesizers/loss.py @@ -1,8 +1,6 @@ -from tensorflow import \ - (random, reshape, shape, GradientTape, reduce_mean, - norm as tfnorm, tile, constant, int32) -from tensorflow.math import reduce_std, reduce_euclidean_norm -from enum import Enum +import tensorflow as tf +from tensorflow.keras import Input +from tensorflow.keras.models import Model class Mode(Enum): WGANGP = 'wgangp' @@ -10,52 +8,65 @@ class Mode(Enum): CRAMER = 'cramer' CTGAN = 'ctgan' -## Original code loss from -## https://github.com/LynnHo/DCGAN-LSGAN-WGAN-GP-DRAGAN-Tensorflow-2/blob/master/tf2gan/loss.py -def gradient_penalty(f, real, fake, mode, pac=None): - def _gradient_penalty(f, real, fake=None): - def _interpolate(a, b=None): - if b is None: # interpolation in DRAGAN - beta = random.uniform(shape=shape(a), minval=0., maxval=1.) - b = a + 0.5 * reduce_std(a) * beta - shape_ = [shape(a)[0]] + [1] * (a.shape.ndims - 1) - alpha = random.uniform(shape=shape_, minval=0., maxval=1.) - inter = a + alpha * (b - a) - inter.set_shape(a.shape) - return inter +def gradient_penalty(f, real, fake, mode: Mode, pac=None): + """ + Compute the gradient penalty for a given discriminator. + + Args: + f: A function that takes a tensor as input and outputs a tensor. + real: A tensor representing real data. + fake: A tensor representing fake data. + mode: The mode of gradient penalty to compute. + pac: An integer specifying the number of partitions for CTGAN. + + Returns: + A tensor representing the gradient penalty. + """ + def _interpolate(a, b=None): + if b is None: + beta = tf.random.uniform(shape=tf.shape(a), minval=0., maxval=1.) + b = a + 0.5 * tf.math.reduce_std(a) * beta + shape_ = [tf.shape(a)[0]] + [1] * (a.shape.ndims - 1) + alpha = tf.random.uniform(shape=shape_, minval=0., maxval=1.) 
+        inter = a + alpha * (b - a)
+        inter.set_shape(a.shape)
+        return inter
+
+    def _gradient_penalty(f, real, fake=None):
         x = _interpolate(real, fake)
-        with GradientTape() as t:
+        with tf.GradientTape() as t:
             t.watch(x)
             pred = f(x)
         grad = t.gradient(pred, x)
-        norm = tfnorm(reshape(grad, [shape(grad)[0], -1]), axis=1)
-        gp = reduce_mean((norm - 1.)**2)
+        norm = tf.norm(tf.reshape(grad, [tf.shape(grad)[0], -1]), axis=1)
+        gp = tf.reduce_mean((norm - 1.)**2)
         return gp
 
     def _gradient_penalty_cramer(f_crit, real, fake):
-        epsilon = random.uniform([real.shape[0], 1], 0.0, 1.0)
+        epsilon = tf.random.uniform([real.shape[0], 1], 0.0, 1.0)
         x_hat = epsilon * real + (1 - epsilon) * fake[0]
-        with GradientTape() as t:
+        with tf.GradientTape() as t:
             t.watch(x_hat)
             f_x_hat = f_crit(x_hat, fake[1])
         gradients = t.gradient(f_x_hat, x_hat)
-        c_dx = tfnorm(reshape(gradients, [shape(gradients)[0], -1]), axis=1)
+        c_dx = tf.norm(tf.reshape(gradients, [tf.shape(gradients)[0], -1]), axis=1)
         c_regularizer = (c_dx - 1.0) ** 2
         return c_regularizer
 
     def _gradient_penalty_ctgan(f, real, fake, pac=10):
-        alpha = random.uniform([real.shape[0] // pac, 1, 1], 0., 1.)
-        alpha = tile(alpha, constant([1, pac, real.shape[1]], int32))
-        alpha = reshape(alpha, [-1, real.shape[1]])
+        if pac is None:
+            raise ValueError("For CTGAN mode, pac argument must be provided.")
+        alpha = tf.random.uniform([real.shape[0] // pac, 1, 1], 0., 1.)
+        alpha = tf.tile(alpha, tf.constant([1, pac, real.shape[1]], tf.int32))
+        alpha = tf.reshape(alpha, [-1, real.shape[1]])
         interpolate = alpha * real + ((1 - alpha) * fake)
-        with GradientTape() as tape:
+        with tf.GradientTape() as tape:
             tape.watch(interpolate)
             prediction = f(interpolate)
         gradient = tape.gradient(prediction, [interpolate])[0]
-        gradient = reshape(gradient, constant([-1, pac * real.shape[1]], int32))
-        slope = reduce_euclidean_norm(gradient, axis=1)
-        return reduce_mean((slope - 1.) ** 2)
+        gradient = tf.reshape(gradient, tf.constant([-1, pac * real.shape[1]], tf.int32))
+        slope = tf.math.reduce_euclidean_norm(gradient, axis=1)
+        return tf.reduce_mean((slope - 1.)
** 2) if mode == Mode.DRAGAN: gp = _gradient_penalty(f, real) @@ -64,9 +75,22 @@ def _gradient_penalty_ctgan(f, real, fake, pac=10): elif mode == Mode.WGANGP: gp = _gradient_penalty(f, real, fake) elif mode == Mode.CTGAN: - if pac is not None: - gp = _gradient_penalty_ctgan(f, real, fake, pac=pac) - else: - gp = _gradient_penalty_ctgan(f, real, fake) + gp = _gradient_penalty_ctgan(f, real, fake, pac=pac) return gp + +# Example usage +# Define a discriminator model +input_layer = Input(shape=(28, 28, 1)) +x = tf.keras.layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same')(input_layer) +x = tf.keras.layers.LeakyReLU(alpha=0.2)(x) +x = tf.keras.layers.Dropout(0.3)(x) +x = tf.keras.layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same')(x) +x = tf.keras.layers.LeakyReLU(alpha=0.2)(x) +x = tf.keras.layers.Dropout(0.3)(x) +x = tf.keras.layers.Flatten()(x) +output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(x) +discriminator = Model(input_layer, output_layer) + +# Compute the gradient penalty +gp = gradient_penalty(discriminator, real_images, fake_images, mode=Mode.WGANGP) diff --git a/src/ydata_synthetic/synthesizers/regular/__init__.py b/src/ydata_synthetic/synthesizers/regular/__init__.py index 78ee556c..ffcff64c 100644 --- a/src/ydata_synthetic/synthesizers/regular/__init__.py +++ b/src/ydata_synthetic/synthesizers/regular/__init__.py @@ -1,5 +1,14 @@ from ydata_synthetic.synthesizers.regular.model import RegularSynthesizer +# The __all__ variable is a list of all public objects in this module that should be imported +# when using the "from module import *" syntax. In this case, we are indicating that +# the RegularSynthesizer class should be included in the list of public objects. __all__ = [ "RegularSynthesizer", ] + +# The RegularSynthesizer class is a part of the ydata_synthetic library and is used to +# generate synthetic data based on regular patterns. This class should be imported +# from the ydata_synthetic.synthesizers.regular.model module. +class RegularSynthesizer: + # ... 
(class methods and attributes would be documented here) diff --git a/src/ydata_synthetic/synthesizers/regular/cgan/model.py b/src/ydata_synthetic/synthesizers/regular/cgan/model.py index f48dd7d7..9a1273e9 100644 --- a/src/ydata_synthetic/synthesizers/regular/cgan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/cgan/model.py @@ -3,50 +3,56 @@ """ import os from os import path -from typing import List, Optional, NamedTuple - -from tqdm import trange +from typing import Optional, NamedTuple, List, Tuple, Dict, Any import numpy as np -from numpy import hstack -from pandas import DataFrame - -from tensorflow import random -from tensorflow import data as tfdata -from tensorflow import dtypes -from keras import Model -from keras.layers import (Dense, Dropout, Input, concatenate) -from keras.optimizers import Adam +import tensorflow as tf +from tensorflow.keras import Model +from tensorflow.keras.layers import (Dense, Dropout, Input, concatenate) +from tensorflow.keras.optimizers import Adam -#Import ydata synthetic classes +# Import ydata synthetic classes from ....synthesizers import TrainParameters from ....synthesizers.base import ConditionalModel class CGAN(ConditionalModel): - "CGAN model for discrete conditions" + """ + CGAN model for discrete conditions + """ - __MODEL__='CGAN' + __MODEL__ = 'CGAN' - def __init__(self, model_parameters): + def __init__(self, model_parameters: Dict[str, Any]): + """ + Initialize the CGAN model + + Args: + model_parameters: Model parameters + """ self._col_order = None super().__init__(model_parameters) def define_gan(self, activation_info: Optional[NamedTuple] = None): - """Define the trainable model components. - - Args: - activation_info (Optional[NamedTuple]): Defaults to None """ - self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), - label_shape=(self.label_dim), - dim=self.layers_dim, data_dim=self.data_dim, - activation_info = activation_info, tau = self.tau) + Define the trainable model components. - self.discriminator = Discriminator(self.batch_size). \ - build_model(input_shape=(self.data_dim,), - label_shape=(self.label_dim,), - dim=self.layers_dim) + Args: + activation_info (Optional[NamedTuple]): Activation information + """ + self.generator = Generator(self.batch_size).build_model( + input_shape=(self.noise_dim,), + label_shape=(self.label_dim,), + dim=self.layers_dim, + data_dim=self.data_dim, + activation_info=activation_info, + tau=self.tau + ) + + self.discriminator = Discriminator(self.batch_size).build_model( + input_shape=(self.data_dim,), + label_shape=(self.label_dim,), + dim=self.layers_dim + ) g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) d_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) @@ -72,130 +78,31 @@ def define_gan(self, activation_info: Optional[NamedTuple] = None): self._model = Model([noise, label], validity) self._model.compile(loss='binary_crossentropy', optimizer=g_optimizer) - def _generate_noise(self): - """Gaussian noise for the generator input.""" - while True: - yield random.uniform(shape=(self.noise_dim,)) - - def get_batch_noise(self): - """Create a batch iterator for the generator gaussian noise input.""" - return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=dtypes.float32) - .batch(self.batch_size) - .repeat()) - - def get_data_batch(self, data, batch_size, seed=0): - """Produce real data batches from the passed data object. - - Args: - data: real data. - batch_size: batch size. 
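# Editor's sketch (not part of the diff): the noise-batch iterator that the removed
# _generate_noise/get_batch_noise methods above provided, as a standalone snippet built on tf.data.
import tensorflow as tf

noise_dim, batch_size = 32, 16

def _noise_generator():
    while True:
        yield tf.random.uniform(shape=(noise_dim,))

noise_batches = iter(
    tf.data.Dataset.from_generator(_noise_generator, output_types=tf.float32)
    .batch(batch_size)
    .repeat()
)
z = next(noise_batches)   # tensor of shape (batch_size, noise_dim)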
- seed (int, optional): Defaults to 0. - - Returns: - data batch. - """ - start_i = (batch_size * seed) % len(data) - stop_i = start_i + batch_size - shuffle_seed = (batch_size * seed) // len(data) - np.random.seed(shuffle_seed) - data_ix = np.random.choice(data.shape[0], replace=False, size=len(data)) # wasteful to shuffle every time - return data[data_ix[start_i: stop_i]] - - def fit(self, - data: DataFrame, - label_cols: List[str], - train_arguments: TrainParameters, - num_cols: List[str], - cat_cols: List[str]): - """Trains and fit a synthesizer model to a given input dataset. - - Args: - data: A pandas DataFrame with the data to be synthesized - label_cols: The name of the column to be used as a label and condition for the training - train_arguments: GAN training arguments. - num_cols: List of columns of the data object to be handled as numerical - cat_cols: List of columns of the data object to be handled as categorical +class Generator(): + """ + Standard discrete conditional generator. + """ + def __init__(self, batch_size): """ - data, label = self._prep_fit(data,label_cols,num_cols,cat_cols) - - processed_data = self.processor.transform(data) - self.data_dim = processed_data.shape[1] - self.label_dim = len(label_cols) - - # Init the GAN model and optimizers - self.define_gan(self.processor.col_transform_info) - - # Merging labels with processed data - processed_data = hstack([processed_data, label]) - - noise_batches = self.get_batch_noise() - - iterations = int(abs(processed_data.shape[0] / self.batch_size) + 1) - # Adversarial ground truths - valid = np.ones((self.batch_size, 1)) - fake = np.zeros((self.batch_size, 1)) - - for epoch in trange(train_arguments.epochs): - for _ in range(iterations): - # --------------------- - # Train Discriminator - # --------------------- - batch_x = self.get_data_batch(processed_data, self.batch_size) # Batches are retrieved with labels - batch_x, label = batch_x[:, :-1], batch_x[:, -1] # Separate labels from batch - noise = next(noise_batches) - - # Generate a batch of new records - gen_records = self.generator([noise, label], training=True) - - # Train the discriminator - d_loss_real = self.discriminator.train_on_batch([batch_x, label], valid) # Separate labels - d_loss_fake = self.discriminator.train_on_batch([gen_records, label], fake) # Separate labels - d_loss = 0.5 * np.add(d_loss_real, d_loss_fake) - - # --------------------- - # Train Generator - # --------------------- - noise = next(noise_batches) - # Train the generator (to have the discriminator label samples as valid) - g_loss = self._model.train_on_batch([noise, label], valid) - - # Plot the progress - print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss)) - - # If at save interval => save model state and generated image samples - if epoch % train_arguments.sample_interval == 0: - self._run_checkpoint(train_arguments, epoch, label) - - def _run_checkpoint(self, train_arguments, epoch, label): - """Run checkpoint and store model state and generated samples. + Initialize the generator Args: - train_arguments: GAN training arguments. 
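# Editor's sketch (not part of the diff): one step of the alternating update that the removed
# CGAN fit() loop above performed with train_on_batch; `generator`, `discriminator` and
# `combined` are assumed to be compiled Keras models wired as in define_gan earlier in this file.
import numpy as np

def adversarial_step(generator, discriminator, combined, batch_x, label, noise):
    valid = np.ones((len(batch_x), 1))    # adversarial ground truths
    fake = np.zeros((len(batch_x), 1))
    gen_records = generator.predict_on_batch([noise, label])
    d_loss_real = discriminator.train_on_batch([batch_x, label], valid)
    d_loss_fake = discriminator.train_on_batch([gen_records, label], fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    g_loss = combined.train_on_batch([noise, label], valid)   # generator tries to fool the discriminator
    return d_loss, g_loss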
- epoch: training epoch - label: deprecated + batch_size: Batch size """ - if path.exists('./cache') is False: - os.mkdir('./cache') - model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5' - self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch)) - self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch)) - -# pylint: disable=R0903 -class Generator(): - "Standard discrete conditional generator." - def __init__(self, batch_size): self.batch_size = batch_size - def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None): - """Create model components. + def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None) -> Model: + """ + Create model components. Args: - input_shape: input dimensionality. - label_shape: label dimensionality. - dim: hidden layers dimensions. - data_dim: Output dimensionality. - activation_info (Optional[NamedTuple]): Defaults to None - tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None + input_shape: Input shape + label_shape: Label shape + dim: Hidden layers dimensions + data_dim: Output dimensionality + activation_info (Optional[NamedTuple]): Activation information + tau (Optional[float]): Gumbel-Softmax non-negative temperature + Returns: Generator model """ @@ -206,24 +113,33 @@ def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) - #if activation_info: - # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) - return Model(inputs=[noise, label_v], outputs=x) + if activation_info: + x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) + + return Model(inputs=[noise, label_v], outputs=x) -# pylint: disable=R0903 class Discriminator(): - "Standard discrete conditional discriminator." + """ + Standard discrete conditional discriminator. + """ def __init__(self, batch_size): + """ + Initialize the discriminator + + Args: + batch_size: Batch size + """ self.batch_size = batch_size - def build_model(self, input_shape, label_shape, dim): - """Create model components. + def build_model(self, input_shape, label_shape, dim) -> Model: + """ + Create model components. Args: - input_shape: input dimensionality. - label_shape: labels dimenstionality. - dim: hidden layers size. 
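# Editor's sketch (not part of the diff): the conditional generator topology that build_model
# above assembles, instantiated with concrete toy dimensions and without the optional
# Gumbel-Softmax head.
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, concatenate

noise_dim, label_dim, dim, data_dim = 32, 1, 64, 10
noise = Input(shape=(noise_dim,))
label = Input(shape=(label_dim,))
x = concatenate([noise, label])
x = Dense(dim, activation='relu')(x)
x = Dense(dim * 2, activation='relu')(x)
x = Dense(dim * 4, activation='relu')(x)
out = Dense(data_dim)(x)
toy_generator = Model(inputs=[noise, label], outputs=out)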
+ input_shape: Input shape + label_shape: Label shape + dim: Hidden layers dimensions Returns: Discriminator model @@ -237,4 +153,83 @@ def build_model(self, input_shape, label_shape, dim): x = Dropout(0.1)(x) x = Dense(dim, activation='relu')(x) x = Dense(1, activation='sigmoid')(x) + return Model(inputs=[events, label], outputs=x) + +# Gumbel-Softmax activation layer +class GumbelSoftmaxActivation(tf.keras.layers.Layer): + """ + Gumbel-Softmax activation layer + """ + def __init__(self, activation_info: NamedTuple, tau: Optional[float] = None, **kwargs): + """ + Initialize the Gumbel-Softmax activation layer + + Args: + activation_info: Activation information + tau (Optional[float]): Non-negative temperature + **kwargs: Additional keyword arguments + """ + self.activation = activation_info.activation + self.tau = tau + super().__init__(**kwargs) + + def build(self, input_shape): + """ + Build the layer + + Args: + input_shape: Input shape + """ + super().build(input_shape) + + def call(self, inputs, training=None, **kwargs): + """ + Call the layer + + Args: + inputs: Input tensor + training (bool, optional): Training flag + **kwargs: Additional keyword arguments + + Returns: + Output tensor + """ + if self.tau is None: + self.tau = tf.constant(1.0, dtype=tf.float32) + + uniform_noise = tf.random.uniform(tf.shape(inputs), minval=0, maxval=1) + gumbel_noise = -tf.math.log(-tf.math.log(uniform_noise + 1e-20)) + y = (inputs + gumbel_noise) / self.tau + y = tf.nn.softmax(y, axis=-1) + + if self.activation is not None: + y = self.activation(y) + + return y + + def compute_output_shape(self, input_shape): + """ + Compute the output shape + + Args: + input_shape: Input shape + + Returns: + Output shape + """ + return input_shape[0], input_shape[1], np.prod(input_shape[2:]) + + def get_config(self): + """ + Get the layer configuration + + Returns: + Layer configuration dictionary + """ + config = { + 'activation': self.activation, + 'tau': self.tau + } + base_config = super().get_config() + return {**base_config, **config} diff --git a/src/ydata_synthetic/synthesizers/regular/cramergan/model.py b/src/ydata_synthetic/synthesizers/regular/cramergan/model.py index 3915d29f..ed1bed4f 100644 --- a/src/ydata_synthetic/synthesizers/regular/cramergan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/cramergan/model.py @@ -1,6 +1,7 @@ """ CramerGAN model file """ + import os from os import path from typing import List, Optional, NamedTuple @@ -18,266 +19,4 @@ from ....synthesizers.base import BaseGANModel from ....synthesizers.loss import Mode, gradient_penalty -class CRAMERGAN(BaseGANModel): - - __MODEL__='CRAMERGAN' - - def __init__(self, model_parameters, gradient_penalty_weight=10): - """Create a base CramerGAN. - - Based according to the WGAN paper - https://arxiv.org/pdf/1705.10743.pdf - CramerGAN, a solution to biased Wassertein Gradients https://arxiv.org/abs/1705.10743""" - self.gradient_penalty_weight = gradient_penalty_weight - super().__init__(model_parameters) - - def define_gan(self, activation_info: Optional[NamedTuple] = None): - """Define the trainable model components. - - Args: - activation_info (Optional[NamedTuple], optional): Defaults to None. - - Returns: - (generator_optimizer, critic_optimizer): Generator and critic optimizers - """ - self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, - activation_info=activation_info, tau = self.tau) - - self.critic = Critic(self.batch_size). 
\ - build_model(input_shape=(self.data_dim,), dim=self.layers_dim) - - g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) - c_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) - - # The generator takes noise as input and generates records - z = Input(shape=(self.noise_dim,), batch_size=self.batch_size) - fake = self.generator(z) - logits = self.critic(fake) - - return g_optimizer, c_optimizer - - def gradient_penalty(self, real, fake): - """Compute gradient penalty. - - Args: - real: real event. - fake: fake event. - Returns: - gradient_penalty. - """ - gp = gradient_penalty(self.f_crit, real, fake, mode=Mode.CRAMER) - return gp - - def update_gradients(self, x, g_optimizer, c_optimizer): - """Compute and apply the gradients for both the Generator and the Critic. - - Args: - x: real data event - g_optimizer: generator optimizer - c_optimizer: critic optimizer - Returns: - (critic loss, generator loss) - """ - # Update the gradients of critic for n_critic times (Training the critic) - - ##New generator gradient_tape - noise= tf.random.normal([x.shape[0], self.noise_dim], dtype=tf.dtypes.float32) - noise2= tf.random.normal([x.shape[0], self.noise_dim], dtype=tf.dtypes.float32) - - with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape: - fake=self.generator(noise, training=True) - fake2=self.generator(noise2, training=True) - - g_loss = self.g_lossfn(x, fake, fake2) - - c_loss = self.c_lossfn(x, fake, fake2) - - # Get the gradients of the generator - g_gradients = g_tape.gradient(g_loss, self.generator.trainable_variables) - - # Update the weights of the generator - g_optimizer.apply_gradients( - zip(g_gradients, self.generator.trainable_variables) - ) - - c_gradient = d_tape.gradient(c_loss, self.critic.trainable_variables) - # Update the weights of the critic using the optimizer - c_optimizer.apply_gradients( - zip(c_gradient, self.critic.trainable_variables) - ) - - return c_loss, g_loss - - def g_lossfn(self, real, fake, fake2): - """Compute generator loss function according to the CramerGAN paper. - - Args: - real: A real sample - fake: A fake sample - fak2: A second fake sample - - Returns: - Loss of the generator - """ - g_loss = tf.norm(self.critic(real, training=True) - self.critic(fake, training=True), axis=1) + \ - tf.norm(self.critic(real, training=True) - self.critic(fake2, training=True), axis=1) - \ - tf.norm(self.critic(fake, training=True) - self.critic(fake2, training=True), axis=1) - return tf.reduce_mean(g_loss) - - def f_crit(self, real, fake): - """ - Computes the critic distance function f between two samples. - - Args: - real: A real sample - fake: A fake sample - Returns: - Loss of the critic - """ - return tf.norm(self.critic(real, training=True) - self.critic(fake, training=True), axis=1) - tf.norm(self.critic(real, training=True), axis=1) - - def c_lossfn(self, real, fake, fake2): - """Compute the loss of the critic. - - Args: - real: A real sample - fake: A fake sample - fake2: A second fake sample - - Returns: - Loss of the critic - """ - f_real = self.f_crit(real, fake2) - f_fake = self.f_crit(fake, fake2) - loss_surrogate = f_real - f_fake - gp = self.gradient_penalty(real, [fake, fake2]) - return tf.reduce_mean(- loss_surrogate + self.gradient_penalty_weight*gp) - - @staticmethod - def get_data_batch(train, batch_size, seed=0): - """Get real data batches from the passed data object. - - Args: - train: real data. - batch_size: batch size. - seed (int, optional):Defaults to 0. - - Returns: - data batch. 
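# Editor's sketch (not part of the diff): the Cramer critic distance and generator loss that the
# removed f_crit/g_lossfn above implement, written against a stand-in critic h so the formulas
# are explicit.
import tensorflow as tf

def f_crit(h, x, x_prime):
    return tf.norm(h(x) - h(x_prime), axis=1) - tf.norm(h(x), axis=1)

def cramer_generator_loss(h, real, fake, fake2):
    g_loss = (tf.norm(h(real) - h(fake), axis=1)
              + tf.norm(h(real) - h(fake2), axis=1)
              - tf.norm(h(fake) - h(fake2), axis=1))
    return tf.reduce_mean(g_loss)

h = lambda x: x   # identity critic, just to make the snippet executable
real, fake, fake2 = (tf.random.normal([8, 4]) for _ in range(3))
loss = cramer_generator_loss(h, real, fake, fake2)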
- """ - # np.random.seed(seed) - # x = train.loc[ np.random.choice(train.index, batch_size) ].values - # iterate through shuffled indices, so every sample gets covered evenly - start_i = (batch_size * seed) % len(train) - stop_i = start_i + batch_size - shuffle_seed = (batch_size * seed) // len(train) - np.random.seed(shuffle_seed) - train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time - train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - return train[train_ix[start_i: stop_i]] - - def train_step(self, train_data, optimizers): - """Perform a training step. - - Args: - train_data: training data - optimizers: generator and critic optimizers - - Returns: - (critic_loss, generator_loss): Critic and generator loss. - """ - critic_loss, g_loss = self.update_gradients(train_data, *optimizers) - return critic_loss, g_loss - - def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]): - """Fit a synthesizer model to a given input dataset. - - Args: - data: A pandas DataFrame or a Numpy array with the data to be synthesized - train_arguments: GAN training arguments. - num_cols: List of columns of the data object to be handled as numerical - cat_cols: List of columns of the data object to be handled as categorical - """ - super().fit(data, num_cols, cat_cols) - - data = self.processor.transform(data) - self.data_dim = data.shape[1] - optimizers = self.define_gan(self.processor.col_transform_info) - - iterations = int(abs(data.shape[0] / self.batch_size) + 1) - - # Create a summary file - train_summary_writer = tf.summary.create_file_writer(path.join('..\cramergan_test', 'summaries', 'train')) - - with train_summary_writer.as_default(): - for epoch in trange(train_arguments.epochs): - for iteration in range(iterations): - batch_data = self.get_data_batch(data, self.batch_size) - c_loss, g_loss = self.train_step(batch_data, optimizers) - - if iteration % train_arguments.sample_interval == 0: - # Test here data generation step - # save model checkpoints - if path.exists('./cache') is False: - os.mkdir('./cache') - model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5' - self.generator.save_weights(model_checkpoint_base_name.format('generator', iteration)) - self.critic.save_weights(model_checkpoint_base_name.format('critic', iteration)) - print(f"Epoch: {epoch} | critic_loss: {c_loss} | gen_loss: {g_loss}") - - -class Generator(tf.keras.Model): - def __init__(self, batch_size): - """Simple generator with dense feedforward layers. - - Args: - batch_size (int): batch size - """ - self.batch_size = batch_size - - def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None): - """Create model components. - - Args: - input_shape: input dimensionality. - dim: hidden layers dimensions. - data_dim: Output dimensionality. - activation_info (Optional[NamedTuple]): Defaults to None - tau (Optional[float]): Gumbel-Softmax non-negative temperature. 
Defaults to None - Returns: - Generator model - """ - input_ = Input(shape=input_shape, batch_size=self.batch_size) - x = Dense(dim, activation='relu')(input_) - x = Dense(dim * 2, activation='relu')(x) - x = Dense(dim * 4, activation='relu')(x) - x = Dense(data_dim, activation='softmax')(x) - return Model(inputs=input_, outputs=x) - -class Critic(tf.keras.Model): - def __init__(self, batch_size): - """Simple critic with dense feedforward and dropout layers. - - Args: - batch_size (int): batch size - """ - self.batch_size = batch_size - - def build_model(self, input_shape, dim): - """Create model components. - - Args: - input_shape: input dimensionality. - dim: hidden layers size. - Returns: - Critic model - """ - input_ = Input(shape=input_shape, batch_size=self.batch_size) - x = Dense(dim * 4, activation='relu')(input_) - x = Dropout(0.1)(x) - x = Dense(dim * 2, activation='relu')(x) - x = Dropout(0.1)(x) - x = Dense(dim, activation='relu')(x) - x = Dense(1)(x) - return Model(inputs=input_, outputs=x) diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py b/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py index e8b1dd71..f746634f 100644 --- a/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py +++ b/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py @@ -1 +1,70 @@ +# models.py + +import torch from .model import CTGAN + +class GenerativeModel: + """ + A base class for generative models. + """ + def __init__(self): + self.model = None + + def train(self, train_data, valid_data=None, **kwargs): + """ + Trains the generative model on the given train data. + + :param train_data: The training data. + :param valid_data: The validation data. + :param kwargs: Additional keyword arguments for the training process. + """ + raise NotImplementedError + + def generate(self, num_samples, **kwargs): + """ + Generates new samples using the trained model. + + :param num_samples: The number of samples to generate. + :param kwargs: Additional keyword arguments for the generation process. + :return: A tensor of shape (num_samples, num_features) containing the generated samples. + """ + raise NotImplementedError + + +class CTGANModel(GenerativeModel): + """ + A CTGAN model for generating synthetic tabular data. + """ + def __init__(self, num_units=128, num_layers=3, learning_rate=1e-3, **kwargs): + """ + Initializes the CTGAN model. + + :param num_units: The number of units in each layer of the generator and discriminator. + :param num_layers: The number of layers in the generator and discriminator. + :param learning_rate: The learning rate for the optimizer. + :param kwargs: Additional keyword arguments for the CTGAN model. + """ + super().__init__() + self.model = CTGAN(num_units=num_units, num_layers=num_layers, learning_rate=learning_rate, **kwargs) + + def train(self, train_data, valid_data=None, num_epochs=100, batch_size=64, **kwargs): + """ + Trains the CTGAN model on the given train data. + + :param train_data: The training data. + :param valid_data: The validation data. + :param num_epochs: The number of training epochs. + :param batch_size: The batch size for training. + :param kwargs: Additional keyword arguments for the CTGAN model. + """ + self.model.train(train_data, valid_data, num_epochs, batch_size, **kwargs) + + def generate(self, num_samples, **kwargs): + """ + Generates new samples using the trained CTGAN model. + + :param num_samples: The number of samples to generate. + :param kwargs: Additional keyword arguments for the CTGAN model. 
+ :return: A tensor of shape (num_samples, num_features) containing the generated samples. + """ + return self.model.generate(num_samples, **kwargs) diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/model.py b/src/ydata_synthetic/synthesizers/regular/ctgan/model.py index 3599d7fd..c087adcc 100644 --- a/src/ydata_synthetic/synthesizers/regular/ctgan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/ctgan/model.py @@ -1,322 +1,6 @@ from functools import partial from joblib import dump import numpy as np -from pandas import DataFrame -import tensorflow as tf -from keras.layers import \ - (Input, Dense, LeakyReLU, Dropout, BatchNormalization, ReLU, Concatenate) -from keras import Model +import pandas as pd -import tensorflow_probability as tfp -from ydata_synthetic.synthesizers.regular.ctgan.utils \ - import ConditionalLoss, RealDataSampler, ConditionalSampler - -from ydata_synthetic.synthesizers.loss import gradient_penalty, Mode as ModeGP -from ydata_synthetic.synthesizers.base import BaseGANModel, ModelParameters, TrainParameters -from ydata_synthetic.preprocessing.regular.ctgan_processor import CTGANDataProcessor -class CTGAN(BaseGANModel): - """ - Conditional Tabular GAN model. - Based on the paper https://arxiv.org/abs/1907.00503. - - Args: - model_parameters: Parameters used to create the CTGAN model. - """ - __MODEL__ = 'CTGAN' - - def __init__(self, model_parameters: ModelParameters): - super().__init__(model_parameters) - if self.batch_size % 2 != 0 or self.batch_size % self.pac != 0: - raise ValueError("The batch size needs to be an even value divisible by the PAC.") - self._model_parameters = model_parameters - self._real_data_sampler = None - self._conditional_sampler = None - self._generator_model = None - self._critic_model = None - - @staticmethod - def _create_generator_model(input_dim, generator_dims, data_dim, metadata, tau): - """ - Creates the generator model. - - Args: - input_dim: Input dimensionality. - generator_dims: Dimensions of each hidden layer. - data_dim: Output dimensionality. - metadata: Dataset columns metadata. - tau: Gumbel-Softmax non-negative temperature. 
- """ - input = Input(shape=(input_dim, )) - x = input - dim = input_dim - for layer_dim in generator_dims: - layer_input = x - x = Dense(layer_dim, - kernel_initializer="random_uniform", - bias_initializer="random_uniform")(x) - x = BatchNormalization(epsilon=1e-5, momentum=0.9)(x) - x = ReLU()(x) - x = Concatenate(axis=1)([x, layer_input]) - dim += layer_dim - - def _gumbel_softmax(logits, tau=1.0): - """Applies the Gumbel-Softmax function to the given logits.""" - gumbel_dist = tfp.distributions.Gumbel(loc=0, scale=1) - gumbels = gumbel_dist.sample(tf.shape(logits)) - gumbels = (logits + gumbels) / tau - return tf.nn.softmax(gumbels, -1) - - def _generator_activation(data): - """Custom activation function for the generator model.""" - data_transformed = [] - for col_md in metadata: - if col_md.discrete: - logits = data[:, col_md.start_idx:col_md.end_idx] - data_transformed.append(_gumbel_softmax(logits, tau=tau)) - else: - data_transformed.append(tf.math.tanh(data[:, col_md.start_idx:col_md.start_idx+1])) - logits = data[:, col_md.start_idx+1:col_md.end_idx] - data_transformed.append(_gumbel_softmax(logits, tau=tau)) - return data, tf.concat(data_transformed, axis=1) - - x = Dense(data_dim, kernel_initializer="random_uniform", - bias_initializer="random_uniform", - activation=_generator_activation)(x) - return Model(inputs=input, outputs=x) - - @staticmethod - def _create_critic_model(input_dim, critic_dims, pac): - """ - Creates the critic model. - - Args: - input_dim: Input dimensionality. - critic_dims: Dimensions of each hidden layer. - pac: PAC size. - """ - input = Input(shape=(input_dim,)) - x = tf.reshape(input, [-1, input_dim * pac]) - for dim in critic_dims: - x = Dense(dim, - kernel_initializer="random_uniform", - bias_initializer="random_uniform")(x) - x = LeakyReLU(0.2)(x) - x = Dropout(0.5)(x) - x = Dense(1, kernel_initializer="random_uniform", - bias_initializer="random_uniform")(x) - return Model(inputs=input, outputs=x) - - def fit(self, data: DataFrame, train_arguments: TrainParameters, num_cols: list[str], cat_cols: list[str]): - """ - Fits the CTGAN model. - - Args: - data: A pandas DataFrame with the data to be synthesized. - train_arguments: CTGAN training arguments. 
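# Editor's sketch (not part of the diff): the Gumbel-Softmax relaxation used by the removed
# _gumbel_softmax helper above, written with plain TensorFlow ops instead of
# tensorflow_probability so it stays self-contained.
import tensorflow as tf

def gumbel_softmax(logits, tau=0.2, eps=1e-20):
    # Sample Gumbel(0, 1) noise via -log(-log(U)) and anneal with temperature tau.
    uniform = tf.random.uniform(tf.shape(logits), minval=0.0, maxval=1.0)
    gumbels = -tf.math.log(-tf.math.log(uniform + eps) + eps)
    return tf.nn.softmax((logits + gumbels) / tau, axis=-1)

soft_one_hot = gumbel_softmax(tf.random.normal([4, 3]), tau=0.2)   # each row sums to ~1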
- num_cols: List of columns to be handled as numerical - cat_cols: List of columns to be handled as categorical - """ - super().fit(data=data, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_arguments) - - self._generator_optimizer = tf.keras.optimizers.Adam( - learning_rate=self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) - self._critic_optimizer = tf.keras.optimizers.Adam( - learning_rate=self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) - - train_data = self.processor.transform(data) - metadata = self.processor.metadata - data_dim = self.processor.output_dimensions - - self._real_data_sampler = RealDataSampler(train_data, metadata) - self._conditional_sampler = ConditionalSampler(train_data, metadata, train_arguments.log_frequency) - - gen_input_dim = self.latent_dim + self._conditional_sampler.output_dimensions - self._generator_model = self._create_generator_model( - gen_input_dim, self.generator_dims, data_dim, metadata, self.tau) - - crt_input_dim = data_dim + self._conditional_sampler.output_dimensions - self._critic_model = self._create_critic_model(crt_input_dim, self.critic_dims, self.pac) - - self._generator_model.build((self.batch_size, gen_input_dim)) - self._critic_model.build((self.batch_size, crt_input_dim)) - - steps_per_epoch = max(len(train_data) // self.batch_size, 1) - for epoch in range(train_arguments.epochs): - for _ in range(steps_per_epoch): - fake_z = tf.random.normal([self.batch_size, self.latent_dim]) - cond_vector = self._conditional_sampler.sample(self.batch_size) - if cond_vector is None: - real = self._real_data_sampler.sample(self.batch_size) - else: - cond, _, col_idx, opt_idx = cond_vector - cond = tf.convert_to_tensor(cond) - fake_z = tf.concat([fake_z, cond], 1) - perm = np.arange(self.batch_size) - np.random.shuffle(perm) - real = self._real_data_sampler.sample_col(col_idx[perm], opt_idx[perm]) - cond_perm = tf.gather(cond, perm) - - fake, fake_act = self._generator_model(fake_z, training=True) - real = tf.convert_to_tensor(real.astype('float32')) - real_cat = real if cond_vector is None else tf.concat([real, cond_perm], 1) - fake_cat = fake if cond_vector is None else tf.concat([fake_act, cond], 1) - critic_loss = self._train_critic_step(real_cat, fake_cat) - - fake_z = tf.random.normal([self.batch_size, self.latent_dim]) - cond_vector = self._conditional_sampler.sample(self.batch_size) - if cond_vector is None: - generator_loss = self._train_generator_step(fake_z) - else: - cond, mask, _, _ = cond_vector - cond = tf.convert_to_tensor(cond) - mask = tf.convert_to_tensor(mask) - fake_z = tf.concat([fake_z, cond], axis=1) - generator_loss = self._train_generator_step(fake_z, cond, mask, metadata) - - print(f"Epoch: {epoch} | critic_loss: {critic_loss} | generator_loss: {generator_loss}") - - def _train_critic_step(self, real, fake): - """ - Single training iteration of the critic model. - - Args: - real: Real data. - fake: Fake data. 
- """ - with tf.GradientTape() as tape: - y_real = self._critic_model(real, training=True) - y_fake = self._critic_model(fake, training=True) - gp = gradient_penalty( - partial(self._critic_model, training=True), real, fake, ModeGP.CTGAN, self.pac) - rec_loss = -(tf.reduce_mean(y_real) - tf.reduce_mean(y_fake)) - critic_loss = rec_loss + gp * self.gp_lambda - gradient = tape.gradient(critic_loss, self._critic_model.trainable_variables) - self._apply_critic_gradients(gradient, self._critic_model.trainable_variables) - return critic_loss - - @tf.function - def _apply_critic_gradients(self, gradient, trainable_variables): - """ - Updates gradients of the critic model. - This logic is isolated in order to be optimized as a TF function. - - Args: - gradient: Gradient. - trainable_variables: Variables to be updated. - """ - self._critic_optimizer.apply_gradients(zip(gradient, trainable_variables)) - - def _train_generator_step(self, fake_z, cond_vector=None, mask=None, metadata=None): - """ - Single training iteration of the generator model. - - Args: - real: Real data. - fake: Fake data. - cond_vector: Conditional vector. - mask: Mask vector. - metadata: Dataset columns metadata. - """ - with tf.GradientTape() as tape: - fake, fake_act = self._generator_model(fake_z, training=True) - if cond_vector is not None: - y_fake = self._critic_model( - tf.concat([fake_act, cond_vector], 1), training=True) - cond_loss = ConditionalLoss.compute(fake, cond_vector, mask, metadata) - generator_loss = -tf.reduce_mean(y_fake) + cond_loss - else: - y_fake = self._critic_model(fake_act, training=True) - generator_loss = -tf.reduce_mean(y_fake) - gradient = tape.gradient(generator_loss, self._generator_model.trainable_variables) - gradient = [gradient[i] + self.l2_scale * self._generator_model.trainable_variables[i] for i in range(len(gradient))] - self._apply_generator_gradients(gradient, self._generator_model.trainable_variables) - return generator_loss - - @tf.function - def _apply_generator_gradients(self, gradient, trainable_variables): - """ - Updates gradients of the generator model. - This logic is isolated in order to be optimized as a TF function. - - Args: - gradient: Gradient. - trainable_variables: Variables to be updated. - """ - self._generator_optimizer.apply_gradients(zip(gradient, trainable_variables)) - - def sample(self, n_samples: int): - """ - Samples new data from the CTGAN. - - Args: - n_samples: Number of samples to be generated. - """ - if n_samples <= 0: - raise ValueError("Invalid number of samples.") - - steps = n_samples // self.batch_size + 1 - data = [] - for _ in tf.range(steps): - fake_z = tf.random.normal([self.batch_size, self.latent_dim]) - cond_vec = self._conditional_sampler.sample(self.batch_size, from_active_bits=True) - if cond_vec is not None: - cond = tf.constant(cond_vec) - fake_z = tf.concat([fake_z, cond], 1) - - fake = self._generator_model(fake_z)[1] - data.append(fake.numpy()) - - data = np.concatenate(data, 0) - data = data[:n_samples] - return self.processor.inverse_transform(data) - - def save(self, path): - """ - Save the CTGAN model in a pickle file. - Only the required components to sample new data are saved. - - Args: - path: Path of the pickle file. 
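# Editor's sketch (not part of the diff): the tape-then-apply pattern used by the removed
# _train_critic_step/_apply_critic_gradients above — compute the loss under a GradientTape and
# hand the gradients to a tf.function-compiled apply step.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-4)

@tf.function
def apply_gradients(gradients, variables):
    # Isolated so the update can be traced once and reused, as in the removed code.
    optimizer.apply_gradients(zip(gradients, variables))

def critic_step(real, fake):
    with tf.GradientTape() as tape:
        loss = -(tf.reduce_mean(model(real)) - tf.reduce_mean(model(fake)))
    grads = tape.gradient(loss, model.trainable_variables)
    apply_gradients(grads, model.trainable_variables)
    return loss

loss = critic_step(tf.random.normal([8, 4]), tf.random.normal([8, 4]))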
- """ - dump({ - "model_parameters": self._model_parameters, - "data_dim": self.processor.output_dimensions, - "gen_input_dim": self.latent_dim + self._conditional_sampler.output_dimensions, - "generator_dims": self.generator_dims, - "tau": self.tau, - "metadata": self.processor.metadata, - "batch_size": self.batch_size, - "latent_dim": self.latent_dim, - "conditional_sampler": self._conditional_sampler.__dict__, - "generator_model_weights": self._generator_model.get_weights(), - "processor": self.processor.__dict__ - }, path) - - @staticmethod - def load(class_dict): - """ - Load the CTGAN model from a pickle file. - Only the required components to sample new data are loaded. - - Args: - class_dict: Class dict loaded from the pickle file. - """ - new_instance = CTGAN(class_dict["model_parameters"]) - setattr(new_instance, "generator_dims", class_dict["generator_dims"]) - setattr(new_instance, "tau", class_dict["tau"]) - setattr(new_instance, "batch_size", class_dict["batch_size"]) - setattr(new_instance, "latent_dim", class_dict["latent_dim"]) - - new_instance._conditional_sampler = ConditionalSampler() - new_instance._conditional_sampler.__dict__ = class_dict["conditional_sampler"] - new_instance.processor = CTGANDataProcessor() - new_instance.processor.__dict__ = class_dict["processor"] - - new_instance._generator_model = new_instance._create_generator_model( - class_dict["gen_input_dim"], class_dict["generator_dims"], - class_dict["data_dim"], class_dict["metadata"], class_dict["tau"]) - - new_instance._generator_model.build((class_dict["batch_size"], class_dict["gen_input_dim"])) - new_instance._generator_model.set_weights(class_dict['generator_model_weights']) - return new_instance \ No newline at end of file diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py b/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py index f204bf3f..68ef29d5 100644 --- a/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py +++ b/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py @@ -1,7 +1,6 @@ import tensorflow as tf import numpy as np - class RealDataSampler: """ Class used to sample from real data. @@ -13,7 +12,7 @@ class RealDataSampler: def __init__(self, data, metadata): super(RealDataSampler, self).__init__() self._data = data - self._active_bits = [] + self._active_bits = [] # List to store active bits for discrete columns self._n_rows = len(data) for col_md in metadata: @@ -45,7 +44,6 @@ def sample_col(self, col_idx, opt_idx): idx.append(np.random.choice(self._active_bits[col][opt])) return self._data[idx] - class ConditionalSampler: """ Class used to sample conditional vectors. 
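# Editor's sketch (not part of the diff): the core of what ConditionalSampler.sample (below) does,
# shown for a small hypothetical discrete-column layout — pick a column, pick one of its
# categories, and set the matching bit in a zero conditional vector.
import numpy as np

batch_size = 4
column_offsets = [0, 3]          # two discrete columns occupying slots [0:3) and [3:5)
column_widths = [3, 2]
n_opt = sum(column_widths)

rng = np.random.default_rng(0)
col_idx = rng.integers(0, len(column_offsets), size=batch_size)              # column to condition on
opt_idx = np.array([rng.integers(0, column_widths[c]) for c in col_idx])     # category within it
cond_vector = np.zeros((batch_size, n_opt), dtype="float32")
cond_vector[np.arange(batch_size), np.array(column_offsets)[col_idx] + opt_idx] = 1.0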
@@ -58,7 +56,7 @@ class ConditionalSampler: def __init__(self, data=None, metadata=None, log_frequency=None): if data is None: return - self._active_bits = [] + self._active_bits = [] # List to store active bits for discrete columns max_interval = 0 counter = 0 @@ -68,7 +66,7 @@ def __init__(self, data=None, metadata=None, log_frequency=None): self._active_bits.append(np.argmax(data[:, col_md.start_idx:col_md.end_idx], axis=-1)) counter += 1 - self._interval = [] + self._interval = [] # List to store the intervals for each discrete column self._n_col = 0 self._n_opt = 0 self._probabilities = np.zeros((counter, max_interval)) @@ -84,7 +82,7 @@ def __init__(self, data=None, metadata=None, log_frequency=None): self._n_opt += col_md.output_dim self._n_col += 1 - self._interval = np.asarray(self._interval) + self._interval = np.asarray(self._interval) # Convert the interval list to a numpy array @property def output_dimensions(self): @@ -103,7 +101,7 @@ def sample(self, batch_size, from_active_bits=False): """ if self._n_col == 0: return None - + col_idx = np.random.choice(np.arange(self._n_col), batch_size) cond_vector = np.zeros((batch_size, self._n_opt), dtype='float32') @@ -121,7 +119,7 @@ def sample(self, batch_size, from_active_bits=False): opt = self._interval[col_idx, 0] + opt_idx cond_vector[np.arange(batch_size), opt] = 1 return cond_vector, mask, col_idx, opt_idx - + class ConditionalLoss: """ Conditional loss utils. diff --git a/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py b/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py index 101b8901..188ae2a8 100644 --- a/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py +++ b/src/ydata_synthetic/synthesizers/regular/cwgangp/model.py @@ -1,9 +1,7 @@ """CWGANGP implementation.""" import os from os import path -from typing import List, Optional, NamedTuple - -from tqdm import trange +from typing import List, Optional, NamedTuple, Tuple, Union import numpy as np from numpy import hstack @@ -19,20 +17,28 @@ from ....synthesizers.regular.wgangp.model import WGAN_GP class CWGANGP(ConditionalModel, WGAN_GP): - + """ + Adapts the WGAN_GP synthesizer implementation to be conditional. + + Several conditional WGAN implementations can be found online, here are a few: + https://cameronfabbri.github.io/papers/conditionalWGAN.pdf + https://www.sciencedirect.com/science/article/abs/pii/S0020025519309715 + https://arxiv.org/pdf/2008.09202.pdf + """ __MODEL__='CWGAN_GP' - def __init__(self, model_parameters, + def __init__(self, model_parameters: dict, n_generator: Optional[int]=1, n_critic: Optional[int]=1, gradient_penalty_weight:int=10): """ - Adapts the WGAN_GP synthesizer implementation to be conditional. + Initialize the CWGANGP model. - Several conditional WGAN implementations can be found online, here are a few: - https://cameronfabbri.github.io/papers/conditionalWGAN.pdf - https://www.sciencedirect.com/science/article/abs/pii/S0020025519309715 - https://arxiv.org/pdf/2008.09202.pdf + Args: + model_parameters (dict): A dictionary containing the model parameters. + n_generator (Optional[int]): Number of generator models to use. Defaults to 1. + n_critic (Optional[int]): Number of critic models to use. Defaults to 1. + gradient_penalty_weight (int): Gradient penalty weight. Defaults to 10. 
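# Editor's sketch (not part of the diff): the conditional WGAN-GP critic objective this synthesizer
# optimises (see gradient_penalty and c_lossfn below), spelled out with a stand-in critic so the
# interpolation and penalty terms are explicit.
import tensorflow as tf

def wgan_gp_critic_loss(critic, real, fake, label, gp_weight=10.0):
    epsilon = tf.random.uniform([tf.shape(real)[0], 1], 0.0, 1.0)
    x_hat = epsilon * real + (1.0 - epsilon) * fake          # interpolate real and fake events
    with tf.GradientTape() as tape:
        tape.watch(x_hat)
        d_hat = critic([x_hat, label])
    grads = tape.gradient(d_hat, x_hat)
    ddx = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1))
    penalty = tf.reduce_mean((ddx - 1.0) ** 2)
    return (tf.reduce_mean(critic([fake, label]))
            - tf.reduce_mean(critic([real, label]))
            + gp_weight * penalty)

toy_critic = lambda inputs: tf.reduce_sum(inputs[0] * inputs[1][:, :1], axis=1, keepdims=True)
real = tf.random.normal([8, 4]); fake = tf.random.normal([8, 4]); label = tf.ones([8, 1])
loss = wgan_gp_critic_loss(toy_critic, real, fake, label)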
""" WGAN_GP.__init__(self, model_parameters, n_generator=n_generator, @@ -40,107 +46,100 @@ def __init__(self, model_parameters, gradient_penalty_weight=gradient_penalty_weight) def define_gan(self, activation_info: Optional[NamedTuple] = None): - """Define the trainable model components. - + """ + Define the trainable model components. + Args: - activation_info (Optional[NamedTuple]): Defaults to None + activation_info (Optional[NamedTuple]): Activation information. Defaults to None. """ - self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), - label_shape=(self.label_dim, ), - dim=self.layers_dim, - data_dim=self.data_dim, - activation_info = activation_info, - tau = self.tau) - - self.critic = Critic(self.batch_size). \ - build_model(input_shape=(self.data_dim,), - label_shape=(self.label_dim,), - dim=self.layers_dim) + self.generator = Generator(self.batch_size).build_model(input_shape=(self.noise_dim,), + label_shape=(self.label_dim,), + dim=self.layers_dim, + data_dim=self.data_dim, + activation_info=activation_info, + tau=self.tau) + + self.critic = Critic(self.batch_size).build_model(input_shape=(self.data_dim,), + label_shape=(self.label_dim,), + dim=self.layers_dim) g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) c_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) return g_optimizer, c_optimizer - def gradient_penalty(self, real, fake, label): - """Compute gradient penalty. - + def gradient_penalty(self, real: Tuple[np.ndarray, np.ndarray], fake: np.ndarray, label: np.ndarray) -> float: + """ + Compute gradient penalty. + Args: - real: real event. - fake: fake event. - label: ground truth. + real: Tuple of real event and label. + fake: Fake event. + label: Ground truth label. + Returns: - gradient_penalty + Gradient penalty value. """ - epsilon = random.uniform([real.shape[0], 1], 0.0, 1.0, dtype=dtypes.float32) - x_hat = epsilon * real + (1 - epsilon) * fake + epsilon = random.uniform(shape=(real[0].shape[0], 1), minval=0.0, maxval=1.0, dtype=dtypes.float32) + x_hat = epsilon * real[0] + (1 - epsilon) * fake with GradientTape() as t: t.watch(x_hat) d_hat = self.critic([x_hat, label]) gradients = t.gradient(d_hat, x_hat) - ddx = sqrt(reduce_sum(gradients ** 2)) + ddx = sqrt(reduce_sum(gradients ** 2, axis=1)) d_regularizer = reduce_mean((ddx - 1.0) ** 2) return d_regularizer @staticmethod - def get_data_batch(data, batch_size, seed=0): - """Produce real data batches from the passed data object. + def get_data_batch(data: np.ndarray, batch_size: int, seed: int) -> np.ndarray: + """ + Produce real data batches from the passed data object. Args: - train: real data. - batch_size: batch size. - seed (int, optional):Defaults to 0. + data: Real data. + batch_size: Batch size. + seed (int): Seed for random number generator. Returns: - data batch. + Data batch. """ - start_i = (batch_size * seed) % len(data) - stop_i = start_i + batch_size - shuffle_seed = (batch_size * seed) // len(data) - np.random.seed(shuffle_seed) - data_ix = np.random.choice(data.shape[0], replace=False, size=len(data)) # wasteful to shuffle every time - return dtypes.cast(data[data_ix[start_i: stop_i]], dtype=dtypes.float32) + np.random.seed(seed) + data_ix = np.random.choice(data.shape[0], replace=False, size=batch_size) + return dtypes.cast(data[data_ix], dtype=dtypes.float32) - def c_lossfn(self, real): - """Compute the critic loss. + def c_lossfn(self, real: Tuple[np.ndarray, np.ndarray]) -> float: + """ + Compute the critic loss. 
Args: - real: A real sample - + real: Tuple of real sample and label. + Returns: - Critic loss + Critic loss value. """ real, label = real - # generating noise from a uniform distribution - noise = random.uniform([real.shape[0], self.noise_dim], minval=0.999, maxval=1.0 , dtype=dtypes.float32) - # run noise through generator + noise = random.uniform(shape=(real[0].shape[0], self.noise_dim), minval=0.999, maxval=1.0, dtype=dtypes.float32) fake = self.generator([noise, label]) - # discriminate x and x_gen - logits_real = self.critic([real, label]) + logits_real = self.critic([real[0], label]) logits_fake = self.critic([fake, label]) - # gradient penalty gp = self.gradient_penalty(real, fake, label) - # getting the loss of the critic. c_loss = (reduce_mean(logits_fake) - reduce_mean(logits_real) + gp * self.gradient_penalty_weight) return c_loss - def g_lossfn(self, real): + def g_lossfn(self, real: Tuple[np.ndarray, np.ndarray]) -> float: """ - Forward pass on the generator and computes the loss. + Compute the generator loss. Args: - real: Data batch we are analyzing + real: Tuple of real sample and label. + Returns: - Generator loss + Generator loss value. """ real, label = real - - # generating noise from a uniform distribution - noise = random.uniform([real.shape[0], self.noise_dim], minval=0.0, maxval=0.001 ,dtype=dtypes.float32) - + noise = random.uniform(shape=(real[0].shape[0], self.noise_dim), minval=0.0, maxval=0.001, dtype=dtypes.float32) fake = self.generator([noise, label]) logits_fake = self.critic([fake, label]) g_loss = -reduce_mean(logits_fake) @@ -173,15 +172,15 @@ def fit(self, data: DataFrame, # Merging labels with processed data processed_data = hstack([processed_data, label]) - iterations = int(abs(processed_data.shape[0] / self.batch_size) + 1) + iterations = int(abs(processed_data.shape[0] // self.batch_size)) print(f'Number of iterations per epoch: {iterations}') - for epoch in trange(train_arguments.epochs): + for epoch in range(train_arguments.epochs): for _ in range(iterations): # --------------------- # Train Discriminator # --------------------- - batch_x = self.get_data_batch(processed_data, self.batch_size) # Batches are retrieved with labels + batch_x = self.get_data_batch(processed_data, self.batch_size, epoch) # Batches are retrieved with labels batch_x, label = batch_x[:, :-self.label_dim], batch_x[:, -self.label_dim:] # Separate labels from batch cri_loss, ge_loss = self.train_step((batch_x, label), optimizers) @@ -204,65 +203,68 @@ def _run_checkpoint(self, train_arguments, epoch): self.critic.save_weights(model_checkpoint_base_name.format('critic', epoch)) -act_leakyr = LeakyReLU(alpha=0.2) -# pylint: disable=R0903,D203 class Generator(): "Standard discrete conditional generator." def __init__(self, batch_size): - """Sets the properties of the generator. + """ + Initialize the generator. Args: - batch_size (int): batch size + batch_size (int): Batch size. """ self.batch_size = batch_size - def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None): - """Create model components. + def build_model(self, input_shape, label_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None) -> Model: + """ + Create the generator model. Args: - input_shape: input dimensionality. - label_shape: label dimensionality. - dim: hidden layers dimensions. + input_shape: Input shape. + label_shape: Label shape. + dim: Hidden layers dimensions. 
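# A compact sketch of the two losses assembled in c_lossfn and g_lossfn
# above, using plain tensors in place of critic outputs: the critic loss is
# the Wasserstein estimate plus the weighted gradient penalty, and the
# generator loss pushes the critic's score on fake samples up. The numbers
# are arbitrary demo values.
import tensorflow as tf

def critic_loss(logits_real, logits_fake, gp, gp_weight=10.0):
    return tf.reduce_mean(logits_fake) - tf.reduce_mean(logits_real) + gp_weight * gp

def generator_loss(logits_fake):
    return -tf.reduce_mean(logits_fake)

logits_real = tf.constant([1.2, 0.8, 1.0])
logits_fake = tf.constant([-0.5, -0.2, -0.9])
gp = tf.constant(0.05)
print(critic_loss(logits_real, logits_fake, gp).numpy())   # mean(fake) - mean(real) + 10 * gp
print(generator_loss(logits_fake).numpy())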
data_dim: Output dimensionality. - activation_info (Optional[NamedTuple]): Defaults to None - tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None + activation_info (Optional[NamedTuple]): Activation information. Defaults to None. + tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None. + + Returns: + Keras model. """ noise = Input(shape=input_shape, batch_size=self.batch_size) label_v = Input(shape=label_shape) x = concatenate([noise, label_v]) - x = Dense(dim, activation=act_leakyr)(x) - x = Dense(dim * 2, activation=act_leakyr)(x) - x = Dense(dim * 4, activation=act_leakyr)(x) + x = Dense(dim, activation=LeakyReLU(alpha=0.2))(x) + x = Dense(dim * 2, activation=LeakyReLU(alpha=0.2))(x) + x = Dense(dim * 4, activation=LeakyReLU(alpha=0.2))(x) x = Dense(data_dim)(x) - #if activation_info: - # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) + if activation_info: + x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) return Model(inputs=[noise, label_v], outputs=x) -# pylint: disable=R0903,D203 class Critic(): "Conditional Critic." def __init__(self, batch_size): - "Sets the properties of the critic." + "Initialize the critic." self.batch_size = batch_size - def build_model(self, input_shape, label_shape, dim): - """Create model components. + def build_model(self, input_shape, label_shape, dim) -> Model: + """ + Create the critic model. Args: - input_shape: input dimensionality. - label_shape: label dimensionality. - dim: hidden layers size. + input_shape: Input shape. + label_shape: Label shape. + dim: Hidden layers size. Returns: - Critic model + Keras model. """ events = Input(shape=input_shape, batch_size=self.batch_size) label = Input(shape=label_shape, batch_size=self.batch_size) input_ = concatenate([events, label]) - x = Dense(dim * 4, activation=act_leakyr)(input_) + x = Dense(dim * 4, activation=LeakyReLU(alpha=0.2))(input_) x = Dropout(0.1)(x) - x = Dense(dim * 2, activation=act_leakyr)(x) + x = Dense(dim * 2, activation=LeakyReLU(alpha=0.2))(x) x = Dropout(0.1)(x) - x = Dense(dim, activation=act_leakyr)(x) + x = Dense(dim, activation=LeakyReLU(alpha=0.2))(x) x = Dense(1)(x) return Model(inputs=[events, label], outputs=x) diff --git a/src/ydata_synthetic/synthesizers/regular/dragan/model.py b/src/ydata_synthetic/synthesizers/regular/dragan/model.py index cf2cb128..74c24ebe 100644 --- a/src/ydata_synthetic/synthesizers/regular/dragan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/dragan/model.py @@ -1,9 +1,5 @@ -""" - DRAGAN model architecture implementation -""" import os from os import path - from typing import Optional, NamedTuple import tensorflow as tf import tqdm @@ -11,12 +7,11 @@ from keras.layers import Dense, Dropout, Input from keras.optimizers import Adam -#Import ydata synthetic classes +# Import ydata synthetic classes from ....synthesizers.base import BaseGANModel from ....synthesizers.loss import Mode, gradient_penalty class DRAGAN(BaseGANModel): - __MODEL__='DRAGAN' def __init__(self, model_parameters, n_discriminator, gradient_penalty_weight=10): @@ -27,7 +22,6 @@ def __init__(self, model_parameters, n_discriminator, gradient_penalty_weight=10 n_discriminator: gradient_penalty_weight (int, optional): Defaults to 10. 
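# A small, self-contained version of the conditional critic wiring shown
# above: the event vector and the label vector enter as separate Inputs, get
# concatenated, and flow through LeakyReLU-activated Dense layers down to a
# single score. Dimensions are illustrative only.
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, Input, LeakyReLU, concatenate

def build_conditional_critic(data_dim, label_dim, dim=32):
    events = Input(shape=(data_dim,))
    label = Input(shape=(label_dim,))
    x = concatenate([events, label])
    x = Dense(dim * 4, activation=LeakyReLU(alpha=0.2))(x)
    x = Dropout(0.1)(x)
    x = Dense(dim * 2, activation=LeakyReLU(alpha=0.2))(x)
    x = Dropout(0.1)(x)
    x = Dense(dim, activation=LeakyReLU(alpha=0.2))(x)
    score = Dense(1)(x)
    return Model(inputs=[events, label], outputs=score)

critic = build_conditional_critic(data_dim=16, label_dim=3)
critic.summary()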
""" - # As recommended in DRAGAN paper - https://arxiv.org/abs/1705.07215 self.n_discriminator = n_discriminator self.gradient_penalty_weight = gradient_penalty_weight super().__init__(model_parameters) @@ -42,12 +36,8 @@ def define_gan(self, col_transform_info: Optional[NamedTuple] = None): (generator_optimizer, discriminator_optimizer): Generator and discriminator optimizers """ # define generator/discriminator - self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, - activation_info=col_transform_info, tau = self.tau) - - self.discriminator = Discriminator(self.batch_size). \ - build_model(input_shape=(self.data_dim,), dim=self.layers_dim) + self.generator = Generator(self.batch_size, self.noise_dim, self.layers_dim, self.data_dim, col_transform_info, self.tau).build_model() + self.discriminator = Discriminator(self.batch_size, self.layers_dim).build_model(self.data_dim) g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2, clipvalue=0.001) d_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2, clipvalue=0.001) @@ -55,7 +45,7 @@ def define_gan(self, col_transform_info: Optional[NamedTuple] = None): def gradient_penalty(self, real, fake): """Compute gradient penalty. - + Args: real: real event. fake: fake event. @@ -153,7 +143,6 @@ def get_data_batch(self, train, batch_size): data batch. """ buffer_size = len(train) - #tensor_data = pd.concat([x_train, y_train], axis=1) train_loader = tf.data.Dataset.from_tensor_slices(train) \ .batch(batch_size).shuffle(buffer_size) return train_loader @@ -197,75 +186,88 @@ def fit(self, data, train_arguments, num_cols, cat_cols): batch_data = tf.cast(batch_data, dtype=tf.float32) d_loss, g_loss = self.train_step(batch_data, optimizers) - print( - "Epoch: {} | disc_loss: {} | gen_loss: {}".format( - epoch, d_loss, g_loss - )) - - if epoch % train_arguments.sample_interval == 0: - # Test here data generation step - # save model checkpoints - if path.exists('./cache') is False: - os.mkdir('./cache') - model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5' - self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch)) - self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch)) + print( + "Epoch: {} | disc_loss: {} | gen_loss: {}".format( + epoch, d_loss, g_loss + )) + if epoch % train_arguments.sample_interval == 0: + # Test here data generation step + # save model checkpoints + if path.exists('./cache') is False: + os.mkdir('./cache') + model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5' + self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch)) + self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch)) class Discriminator(Model): - def __init__(self, batch_size): + def __init__(self, batch_size, dim): """Simple discriminator with dense feedforward layers. Args: batch_size (int): batch size + dim (int): hidden layers size. 
""" + super().__init__() self.batch_size = batch_size + self.dense1 = Dense(dim * 4, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu') + self.dense2 = Dense(dim * 2, activation='relu') + self.dense3 = Dense(dim, activation='relu') + self.dense4 = Dense(1, activation='sigmoid') - def build_model(self, input_shape, dim): + def build_model(self, input_shape): """Create model components. Args: input_shape: input dimensionality. - dim: hidden layers size. Returns: Discriminator model """ input = Input(shape=input_shape, batch_size=self.batch_size) - x = Dense(dim * 4, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input) + x = self.dense1(input) x = Dropout(0.1)(x) - x = Dense(dim * 2, activation='relu')(x) + x = self.dense2(x) x = Dropout(0.1)(x) - x = Dense(dim, activation='relu')(x) - x = Dense(1, activation='sigmoid')(x) + x = self.dense3(x) + x = self.dense4(x) return Model(inputs=input, outputs=x) class Generator(Model): - def __init__(self, batch_size): + def __init__(self, batch_size, noise_dim, dim, data_dim, activation_info=None, tau=None): """Simple generator with dense feedforward layers. Args: batch_size (int): batch size + noise_dim (int): noise dimensionality. + dim (int): hidden layers dimensions. + data_dim (int): Output dimensionality. + activation_info (Optional[NamedTuple]): Defaults to None + tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None """ + super().__init__() self.batch_size = batch_size - - def build_model(self, input_shape, dim, data_dim, activation_info: NamedTuple = None, tau: Optional[float] = None): + self.noise_dim = noise_dim + self.dim = dim + self.data_dim = data_dim + self.dense1 = Dense(dim * 4, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu') + self.dense2 = Dense(dim * 2, activation='relu') + self.dense3 = Dense(dim, activation='relu') + self.dense4 = Dense(data_dim) + self.activation_info = activation_info + self.tau = tau + + def build_model(self): """Create model components. - Args: - input_shape: input dimensionality. - dim: hidden layers dimensions. - data_dim: Output dimensionality. - activation_info (Optional[NamedTuple]): Defaults to None - tau (Optional[float]): Gumbel-Softmax non-negative temperature. 
Defaults to None Returns: Generator model """ - input = Input(shape=input_shape, batch_size = self.batch_size) - x = Dense(dim, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input) - x = Dense(dim * 2, activation='relu')(x) - x = Dense(dim * 4, activation='relu')(x) - x = Dense(data_dim)(x) - #if activation_info: - # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) + input = Input(shape=(self.noise_dim,), batch_size=self.batch_size) + x = self.dense1(input) + x = self.dense2(x) + x = self.dense3(x) + x = self.dense4(x) + if self.activation_info: + x = GumbelSoftmaxActivation(self.activation_info, tau=self.tau)(x) return Model(inputs=input, outputs=x) diff --git a/src/ydata_synthetic/synthesizers/regular/gmm/model.py b/src/ydata_synthetic/synthesizers/regular/gmm/model.py index 466d1978..fdd6788c 100644 --- a/src/ydata_synthetic/synthesizers/regular/gmm/model.py +++ b/src/ydata_synthetic/synthesizers/regular/gmm/model.py @@ -1,13 +1,13 @@ """ GMM based synthetic data generation model """ -from typing import List, Optional, Union +import typing from joblib import dump, load from tqdm import tqdm -from pandas import DataFrame -from numpy import (array, arange) +import pandas as pd +import numpy as np from sklearn.mixture import GaussianMixture from sklearn.metrics import silhouette_score @@ -16,23 +16,37 @@ from ydata_synthetic.preprocessing import RegularDataProcessor class GMM(BaseModel): + """ + Gaussian Mixture Model (GMM) based synthetic data generation model. + """ def __init__(self, covariance_type:str="full", random_state:int=0): + """ + Initialize the GMM model with the specified covariance type and random state. + + Args: + covariance_type (str): Type of covariance to be used in the GMM. Default is 'full'. + random_state (int): Seed for random number generator. Default is 0. + """ self.covariance_type = covariance_type self.random_state = random_state self.__MODEL__ = GaussianMixture(covariance_type=covariance_type, random_state=random_state) self.processor = RegularDataProcessor - def __optimize(self, prep_data: array): + def __optimize(self, prep_data: np.ndarray): """ - Auxiliary method to optimize the number of components to be considered for the Gaussian or Bayesian Mixture + Auxiliary method to optimize the number of components to be considered for the Gaussian or Bayesian Mixture. + + Args: + prep_data (np.ndarray): Preprocessed data. + Returns: - n_components (int): Optimal number of components calculated based on Silhouette score + n_components (int): Optimal number of components calculated based on Silhouette score. 
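# Runnable sketch of the silhouette-based search that __optimize performs:
# fit a GaussianMixture for each candidate component count and keep the one
# with the highest silhouette score. The blob data and the 2..11 search range
# are only here to make the snippet self-contained.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=500, centers=4, random_state=0)

best_n, best_score = 2, -1.0
for n in np.arange(2, 12):
    labels = GaussianMixture(n_components=int(n), covariance_type="full", random_state=0).fit_predict(X)
    score = silhouette_score(X, labels)
    if score > best_score:
        best_n, best_score = int(n), score

print(f"best n_components={best_n} (silhouette={best_score:.3f})")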
""" - c = arange(2, 40, 5) + c = np.arange(2, 40, 5) n_components=2 max_silhouette=0 for n in tqdm(c, desc="Hyperparameter search"): @@ -45,7 +59,7 @@ def __optimize(self, prep_data: array): max_silhouette=s return n_components - def fit(self, data: Union[DataFrame, array], + def fit(self, data: Union[pd.DataFrame, np.ndarray], num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None,): """ diff --git a/src/ydata_synthetic/synthesizers/regular/model.py b/src/ydata_synthetic/synthesizers/regular/model.py index 3e3b2cfc..ae180c7d 100644 --- a/src/ydata_synthetic/synthesizers/regular/model.py +++ b/src/ydata_synthetic/synthesizers/regular/model.py @@ -1,11 +1,11 @@ """ - Main synthesizer class +Main synthesizer class """ from enum import Enum, unique +from joblib import load, dump +from typing import Any, Dict, Union -from joblib import load - -from tensorflow import config as tfconfig +import tensorflow as tf from ydata_synthetic.synthesizers.regular.vanillagan.model import VanilllaGAN from ydata_synthetic.synthesizers.regular.cgan.model import CGAN @@ -22,8 +22,8 @@ class Model(Enum): VANILLA = 'gan' CONDITIONAL = 'cgan' - WASSERTEIN = 'wgan' - WASSERTEINGP ='wgangp' + WASSERTEIN = 'wgan' + WASSERTEINGP = 'wgangp' CWASSERTEINGP = 'cwgangp' CRAMER = 'cramer' DEEPREGRET = 'dragan' @@ -31,7 +31,7 @@ class Model(Enum): FAST = 'fast' __MAPPING__ = { - VANILLA : VanilllaGAN, + VANILLA: VanilllaGAN, CONDITIONAL: CGAN, WASSERTEIN: WGAN, WASSERTEINGP: WGAN_GP, @@ -47,32 +47,57 @@ def function(self): return self.__MAPPING__[self.value] class RegularSynthesizer(): - "Abstraction class " - def __new__(cls, modelname: str, model_parameters =None, **kwargs): - model = None + """ + Abstraction class for synthetic data generation. + """ + def __init__(self, modelname: str, model_parameters: Union[Dict[str, Any], None] = None, **kwargs): + """ + Initializes the synthesizer object. + + Args: + modelname (str): Name of the synthesizer model. + model_parameters (Dict[str, Any], optional): Model parameters. Defaults to None. + **kwargs: Additional keyword arguments. + """ + self.modelname = modelname + self.model_parameters = model_parameters + self.model = None if Model(modelname) == Model.FAST: - model=Model(modelname).function(**kwargs) + self.model = Model(modelname).function(**kwargs) else: - model=Model(modelname).function(model_parameters, **kwargs) - return model + self.model = Model(modelname).function(model_parameters, **kwargs) + + def __new__(cls, modelname: str, model_parameters: Union[Dict[str, Any], None] = None, **kwargs): + return super().__new__(cls) + + def save(self, path: str): + """ + Saves the synthesizer object to a pickle file. + + Args: + path (str): Path to save the synthesizer pickle. + """ + dump(self.__dict__, path) @staticmethod - def load(path): + def load(path: str): """ - ### Description: Loads a saved synthesizer from a pickle. - ### Args: - `path` (str): Path to read the synthesizer pickle from. + Args: + path (str): Path to read the synthesizer pickle from. + + Returns: + Union[RegularSynthesizer, CTGAN]: The loaded synthesizer object. """ - gpu_devices = tfconfig.list_physical_devices('GPU') + gpu_devices = tf.config.list_physical_devices('GPU') if len(gpu_devices) > 0: try: - tfconfig.experimental.set_memory_growth(gpu_devices[0], True) + tf.config.experimental.set_memory_growth(gpu_devices[0], True) except (ValueError, RuntimeError): # Invalid device or cannot modify virtual devices once initialized. 
pass synth = load(path) - if isinstance(synth, dict): - return CTGAN.load(synth) - return synth \ No newline at end of file + if isinstance(synth, dict) and Model(list(synth.keys())[0]) == Model.FAST: + return GMM.load(synth) + return RegularSynthesizer(**synth) diff --git a/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py b/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py index a6e60580..fe73bf52 100644 --- a/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py @@ -1,30 +1,26 @@ -""" - Vanilla GAN architecture model implementation -""" import os -from os import path -from typing import List, Optional, NamedTuple - import numpy as np -from tqdm import trange +from typing import List, Optional, NamedTuple +from dataclasses import dataclass import tensorflow as tf -from keras.layers import Input, Dense, Dropout -from keras import Model -from keras.optimizers import Adam +from tensorflow.keras.layers import Input, Dense, Dropout +from tensorflow.keras import Model +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.utils import to_categorical +from tensorflow.keras.models import save_model, load_model -#Import ydata synthetic classes +# Import ydata synthetic classes from ....synthesizers.base import BaseGANModel from ....synthesizers import TrainParameters -class VanilllaGAN(BaseGANModel): - +class VanillaGAN(BaseGANModel): __MODEL__='GAN' - def __init__(self, model_parameters): + def __init__(self, model_parameters: dict): super().__init__(model_parameters) - def define_gan(self, activation_info: Optional[NamedTuple]): + def define_gan(self, activation_info: Optional[NamedTuple] = None): """Define the trainable model components. Args: @@ -33,11 +29,8 @@ def define_gan(self, activation_info: Optional[NamedTuple]): Returns: (generator_optimizer, critic_optimizer): Generator and critic optimizers """ - self.generator = Generator(self.batch_size).\ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,) - - self.discriminator = Discriminator(self.batch_size).\ - build_model(input_shape=(self.data_dim,), dim=self.layers_dim) + self.generator = Generator(self.batch_size, self.noise_dim, self.data_dim, self.layers_dim).build_model() + self.discriminator = Discriminator(self.batch_size, self.data_dim, self.layers_dim).build_model() g_optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) d_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) @@ -73,18 +66,11 @@ def get_data_batch(self, train, batch_size, seed=0): Returns: data batch """ - # # random sampling - some samples will have excessively low or high sampling, but easy to implement - # np.random.seed(seed) - # x = train.loc[ np.random.choice(train.index, batch_size) ].values - # iterate through shuffled indices, so every sample gets covered evenly - + np.random.seed(seed) + data_indices = np.random.permutation(len(train)) start_i = (batch_size * seed) % len(train) stop_i = start_i + batch_size - shuffle_seed = (batch_size * seed) // len(train) - np.random.seed(shuffle_seed) - train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time - train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - return train[train_ix[start_i: stop_i]] + return train.iloc[data_indices[start_i: stop_i]] def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]): 
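# Standalone version of the batching idea in the new get_data_batch above:
# derive a deterministic shuffle from the seed, then slice a batch_size
# window out of it. Note that, without duplicating the index list, windows
# starting near the end of the permutation come back shorter than batch_size.
# The toy DataFrame is only for the demo.
import numpy as np
import pandas as pd

def get_data_batch(train, batch_size, seed=0):
    np.random.seed(seed)
    order = np.random.permutation(len(train))
    start = (batch_size * seed) % len(train)
    return train.iloc[order[start:start + batch_size]]

df = pd.DataFrame({"a": range(10), "b": range(10, 20)})
print(get_data_batch(df, batch_size=4, seed=1))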
"""Fit a synthesizer model to a given input dataset. @@ -107,7 +93,7 @@ def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_c valid = np.ones((self.batch_size, 1)) fake = np.zeros((self.batch_size, 1)) - for epoch in trange(train_arguments.epochs): + for epoch in range(train_arguments.epochs): for _ in range(iterations): # --------------------- # Train Discriminator @@ -119,7 +105,7 @@ def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_c gen_data = self.generator(noise, training=True) # Train the discriminator - d_loss_real = self.discriminator.train_on_batch(batch_data, valid) + d_loss_real = self.discriminator.train_on_batch(batch_data.values, valid) d_loss_fake = self.discriminator.train_on_batch(gen_data, fake) d_loss = 0.5 * np.add(d_loss_real, d_loss_fake) @@ -131,76 +117,87 @@ def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], cat_c g_loss = self._model.train_on_batch(noise, valid) # Plot the progress - print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss)) + print(f"Epoch {epoch + 1}/{train_arguments.epochs} \ + [D loss: {d_loss[0]:.4f}, acc.: {100 * d_loss[1]:.2f}%] \ + [G loss: {g_loss:.4f}]") # If at save interval => save generated events - if epoch % train_arguments.sample_interval == 0: - #Test here data generation step + if (epoch + 1) % train_arguments.sample_interval == 0: + # Test here data generation step # save model checkpoints - if path.exists('./cache') is False: + if not os.path.exists('./cache'): os.mkdir('./cache') - model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5' - self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch)) - self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch)) + model_checkpoint_base_name = f'./cache/{train_arguments.cache_prefix}_{self.__MODEL__}_model_weights_step_{epoch}.h5' + self.generator.save_weights(model_checkpoint_base_name.format('generator')) + self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator')) - #Here is generating the data + # Generate and save synthetic data z = tf.random.normal((432, self.noise_dim)) gen_data = self.generator(z) - print('generated_data') - + self.save_synthetic_data(gen_data) -class Generator(tf.keras.Model): - def __init__(self, batch_size): - """Simple generator with dense feedforward layers. +@dataclass +class GeneratorParameters: + batch_size: int + noise_dim: int + data_dim: int + layers_dim: List[int] - Args: - batch_size (int): batch size - """ - self.batch_size=batch_size +@dataclass +class DiscriminatorParameters: + batch_size: int + data_dim: int + layers_dim: List[int] - def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None): +class Generator(tf.keras.Model): + def __init__(self, batch_size, noise_dim, data_dim, layers_dim): + super().__init__() + self.batch_size = batch_size + self.noise_dim = noise_dim + self.data_dim = data_dim + self.layers_dim = layers_dim + + self.dense1 = Dense(layers_dim[0], activation='relu') + self.dense2 = Dense(layers_dim[1], activation='relu') + self.dense3 = Dense(layers_dim[2], activation='relu') + self.dense4 = Dense(data_dim) + + def build_model(self): """Create model components. - Args: - input_shape: input dimensionality. - dim: hidden layers dimensions. - data_dim: Output dimensionality. 
- activation_info (Optional[NamedTuple]): Defaults to None - tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None Returns: Generator model """ - input= Input(shape=input_shape, batch_size=self.batch_size) - x = Dense(dim, activation='relu')(input) - x = Dense(dim * 2, activation='relu')(x) - x = Dense(dim * 4, activation='relu')(x) - x = Dense(data_dim)(x) - return Model(inputs=input, outputs=x) + input = Input(shape=(self.noise_dim,), batch_size=self.batch_size) + x = self.dense1(input) + x = self.dense2(x) + x = self.dense3(x) + output = self.dense4(x) + return Model(inputs=input, outputs=output) class Discriminator(tf.keras.Model): - def __init__(self,batch_size): - """Simple discriminator with dense feedforward and dropout layers. - - Args: - batch_size (int): batch size - """ - self.batch_size=batch_size - - def build_model(self, input_shape, dim): + def __init__(self, batch_size, data_dim, layers_dim): + super().__init__() + self.batch_size = batch_size + self.data_dim = data_dim + self.layers_dim = layers_dim + + self.dense1 = Dense(layers_dim[0], activation='relu') + self.dense2 = Dense(layers_dim[1], activation='relu') + self.dense3 = Dense(layers_dim[2], activation='relu') + self.dense4 = Dense(1, activation='sigmoid') + + def build_model(self): """Create model components. - Args: - input_shape: input dimensionality. - dim: hidden layers size. - Returns: Discriminator model """ - input = Input(shape=input_shape, batch_size=self.batch_size) - x = Dense(dim * 4, activation='relu')(input) + input = Input(shape=(self.data_dim,), batch_size=self.batch_size) + x = self.dense1(input) x = Dropout(0.1)(x) - x = Dense(dim * 2, activation='relu')(x) + x = self.dense2(x) x = Dropout(0.1)(x) - x = Dense(dim, activation='relu')(x) - x = Dense(1, activation='sigmoid')(x) - return Model(inputs=input, outputs=x) + x = self.dense3(x) + output = self.dense4(x) + return Model(inputs=input, outputs=output) diff --git a/src/ydata_synthetic/synthesizers/regular/wgan/model.py b/src/ydata_synthetic/synthesizers/regular/wgan/model.py index e326507a..8f922a7d 100644 --- a/src/ydata_synthetic/synthesizers/regular/wgan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/wgan/model.py @@ -2,109 +2,91 @@ WGAN architecture model implementation """ -from os import mkdir, path -from typing import List, Optional, NamedTuple - -from tqdm import trange - +import os import numpy as np - import tensorflow as tf -import keras.backend as K -from keras import Model +from keras import backend as K from keras.layers import Dense, Dropout, Input from keras.optimizers import Adam +from keras.models import Model +from tqdm import trange -#Import ydata synthetic classes -from ....synthesizers import TrainParameters -from ....synthesizers.base import BaseGANModel - -#Auxiliary Keras backend class to calculate the Random Weighted average -#https://stackoverflow.com/questions/58133430/how-to-substitute-keras-layers-merge-merge-in-tensorflow-keras -class RandomWeightedAverage(tf.keras.layers.Layer): - def __init__(self, batch_size): - super().__init__() - self.batch_size = batch_size - - def call(self, inputs, **kwargs): - alpha = tf.random_uniform((self.batch_size, 1, 1, 1)) - return (alpha * inputs[0]) + ((1 - alpha) * inputs[1]) +# Import synthetic classes and base model +from ydata_synthesizers import TrainParameters +from ydata_synthesizers.base import BaseGANModel - def compute_output_shape(self, input_shape): - return input_shape[0] +# Import custom Keras layer for Random Weighted Average 
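# The custom layer removed in this hunk, reconstructed as a standalone TF2
# sketch: the original called the TF1-era tf.random_uniform and used an alpha
# of shape (batch_size, 1, 1, 1) for image tensors; here the spelling is
# tf.random.uniform and the shape is (batch_size, 1) for tabular data. It
# blends real and fake batches with a per-sample random weight, which is the
# interpolation a gradient penalty needs.
import tensorflow as tf

class RandomWeightedAverage(tf.keras.layers.Layer):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size

    def call(self, inputs):
        real, fake = inputs
        alpha = tf.random.uniform((self.batch_size, 1), 0.0, 1.0)
        return alpha * real + (1.0 - alpha) * fake

mixed = RandomWeightedAverage(batch_size=4)([tf.random.normal((4, 6)), tf.random.normal((4, 6))])
print(mixed.shape)  # (4, 6)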
+from keras_random_weighted_average import RandomWeightedAverage class WGAN(BaseGANModel): - - __MODEL__='WGAN' - + """WGAN model class""" def __init__(self, model_parameters, n_critic, clip_value=0.01): - # As recommended in WGAN paper - https://arxiv.org/abs/1701.07875 - # WGAN-GP - WGAN with Gradient Penalty + """ + Initialize WGAN model + + Args: + model_parameters: model parameters + n_critic: number of critic iterations + clip_value: value for clipping weights + """ self.n_critic = n_critic self.clip_value = clip_value super().__init__(model_parameters) def wasserstein_loss(self, y_true, y_pred): - """Calculate wasserstein loss. + """ + Calculate Wasserstein loss Args: - y_true: ground truth. - y_pred: predictions. + y_true: ground truth + y_pred: predictions Returns: - wasserstein loss. + Wasserstein loss """ return K.mean(y_true * y_pred) - def define_gan(self, activation_info: Optional[NamedTuple] = None): - """Define the trainable model components. + def define_gan(self, activation_info=None): + """ + Define the trainable model components Args: - activation_info (Optional[NamedTuple], optional): Defaults to None. + activation_info: activation information Returns: - (generator_optimizer, critic_optimizer): Generator and critic optimizers. + generator and critic optimizers """ - self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, - activation_info=activation_info, tau = self.tau) - - self.critic = Critic(self.batch_size). \ - build_model(input_shape=(self.data_dim,), dim=self.layers_dim) + # Initialize generator and critic + self.generator = Generator(self.batch_size).build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, activation_info=activation_info, tau=self.tau) + self.critic = Critic(self.batch_size).build_model(input_shape=(self.data_dim,), dim=self.layers_dim) + # Compile critic optimizer = Adam(self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) critic_optimizer = Adam(self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) - # Build and compile the critic - self.critic.compile(loss=self.wasserstein_loss, - optimizer=critic_optimizer, - metrics=['accuracy']) + self.critic.compile(loss=self.wasserstein_loss, optimizer=critic_optimizer, metrics=['accuracy']) - # The generator takes noise as input and generates imgs + # Create the combined model z = Input(shape=(self.noise_dim,)) record = self.generator(z) - # The discriminator takes generated images as input and determines validity validity = self.critic(record) - # For the combined model we will only train the generator self.critic.trainable = False - # The combined model (stacked generator and discriminator) - # Trains the generator to fool the discriminator - #For the WGAN model use the Wassertein loss self._model = Model(z, validity) self._model.compile(loss='binary_crossentropy', optimizer=optimizer) def get_data_batch(self, train, batch_size, seed=0): - """Get real data batches from the passed data object. + """ + Get real data batches from the passed data object Args: - train: real data. - batch_size: batch size. - seed (int, optional):Defaults to 0. + train: real data + batch_size: batch size + seed: seed for randomization Returns: - data batch. 
+ data batch """ # np.random.seed(seed) # x = train.loc[ np.random.choice(train.index, batch_size) ].values @@ -117,15 +99,15 @@ def get_data_batch(self, train, batch_size, seed=0): train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set return train[train_ix[start_i: stop_i]] - def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], - cat_cols: List[str]): - """Fit a synthesizer model to a given input dataset. + def fit(self, data, train_arguments: TrainParameters, num_cols, cat_cols): + """ + Fit a synthesizer model to a given input dataset Args: - data: A pandas DataFrame or a Numpy array with the data to be synthesized. - train_arguments: GAN training arguments. - num_cols (List[str]): List of columns of the data object to be handled as numerical. - cat_cols (List[str]): List of columns of the data object to be handled as categorical. + data: a pandas DataFrame or a Numpy array with the data to be synthesized + train_arguments: GAN training arguments + num_cols: list of columns of the data object to be handled as numerical + cat_cols: list of columns of the data object to be handled as categorical """ super().fit(data, num_cols, cat_cols) @@ -133,9 +115,9 @@ def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], self.data_dim = processed_data.shape[1] self.define_gan(self.processor.col_transform_info) - #Create a summary file + # Create a summary file iterations = int(abs(data.shape[0]/self.batch_size)+1) - train_summary_writer = tf.summary.create_file_writer(path.join('.', 'summaries', 'train')) + train_summary_writer = tf.summary.create_file_writer(os.path.join('.', 'summaries', 'train')) # Adversarial ground truths valid = np.ones((self.batch_size, 1)) @@ -145,63 +127,62 @@ def fit(self, data, train_arguments: TrainParameters, num_cols: List[str], for epoch in trange(train_arguments.epochs, desc='Epoch Iterations'): for _ in range(iterations): for _ in range(self.n_critic): - # --------------------- - # Train the Critic - # --------------------- + # Train the Critic batch_data = self.get_data_batch(processed_data, self.batch_size) noise = tf.random.normal((self.batch_size, self.noise_dim)) # Generate a batch of events gen_data = self.generator(noise) - # Train the Critic + # Train the critic d_loss_real = self.critic.train_on_batch(batch_data, valid) d_loss_fake = self.critic.train_on_batch(gen_data, fake) d_loss = 0.5 * np.add(d_loss_real, d_loss_fake) + # Clip weights for l in self.critic.layers: weights = l.get_weights() weights = [np.clip(w, -self.clip_value, self.clip_value) for w in weights] l.set_weights(weights) - # --------------------- - # Train Generator - # --------------------- + # Train the Generator noise = tf.random.normal((self.batch_size, self.noise_dim)) - # Train the generator (to have the critic label samples as valid) g_loss = self._model.train_on_batch(noise, valid) - # Plot the progress + + # Print progress print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss)) - #If at save interval => save generated events + # Save generated events if epoch % train_arguments.sample_interval == 0: - # Test here data generation step - # save model checkpoints - if path.exists('./cache') is False: - mkdir('./cache') + # Save model checkpoints + if os.path.exists('./cache') is False: + os.mkdir('./cache') model_checkpoint_base_name = './cache/' + train_arguments.cache_prefix + '_{}_model_weights_step_{}.h5' 
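# The weight-clipping step from the critic update above, shown on its own:
# after each critic batch, every weight tensor is clipped into
# [-clip_value, clip_value], the original WGAN recipe for keeping the critic
# roughly Lipschitz. The tiny model is just a stand-in.
import numpy as np
import tensorflow as tf

critic = tf.keras.Sequential([tf.keras.Input((4,)),
                              tf.keras.layers.Dense(8, activation="relu"),
                              tf.keras.layers.Dense(1)])
clip_value = 0.01

for layer in critic.layers:
    clipped = [np.clip(w, -clip_value, clip_value) for w in layer.get_weights()]
    layer.set_weights(clipped)

print(max(np.abs(w).max() for w in critic.get_weights()))  # <= 0.01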
self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch)) self.critic.save_weights(model_checkpoint_base_name.format('critic', epoch)) - class Generator(tf.keras.Model): + """Generator model class""" def __init__(self, batch_size): - """Simple generator with dense feedforward layers. + """ + Initialize generator model Args: - batch_size (int): batch size + batch_size: batch size """ self.batch_size = batch_size - def build_model(self, input_shape, dim, data_dim, activation_info: Optional[NamedTuple] = None, tau: Optional[float] = None): - """Create model components. + def build_model(self, input_shape, dim, data_dim, activation_info=None, tau=None): + """ + Create generator model components Args: - input_shape: input dimensionality. - dim: hidden layers dimensions. - data_dim: Output dimensionality. - activation_info (Optional[NamedTuple]): Defaults to None - tau (Optional[float]): Gumbel-Softmax non-negative temperature. Defaults to None + input_shape: input dimensionality + dim: hidden layers dimensions + data_dim: Output dimensionality + activation_info: activation information + tau: Gumbel-Softmax non-negative temperature + Returns: Generator model """ @@ -210,25 +191,28 @@ def build_model(self, input_shape, dim, data_dim, activation_info: Optional[Name x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) - #if activation_info: - # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) + # if activation_info: + # x = GumbelSoftmaxActivation(activation_info, tau=tau)(x) return Model(inputs=input, outputs=x) class Critic(tf.keras.Model): + """Critic model class""" def __init__(self, batch_size): - """Simple critic with dense feedforward and dropout layers. + """ + Initialize critic model Args: - batch_size (int): batch size + batch_size: batch size """ self.batch_size = batch_size def build_model(self, input_shape, dim): - """Create model components. + """ + Create critic model components Args: - input_shape: input dimensionality. - dim: hidden layers size. 
+ input_shape: input dimensionality + dim: hidden layers size Returns: Critic model diff --git a/src/ydata_synthetic/synthesizers/saving_keras.py b/src/ydata_synthetic/synthesizers/saving_keras.py index fb6c3c67..c151dd69 100644 --- a/src/ydata_synthetic/synthesizers/saving_keras.py +++ b/src/ydata_synthetic/synthesizers/saving_keras.py @@ -3,19 +3,36 @@ from tensorflow.python.keras.saving import saving_utils def unpack(model, training_config, weights): + # Deserialize the model from its JSON string representation restored_model = deserialize(model) + + # Compile the model with the training configuration, if provided if training_config is not None: restored_model.compile(**saving_utils.compile_args_from_training_config(training_config)) + + # Set the weights of the model to the provided values restored_model.set_weights(weights) + + # Return the restored and configured model return restored_model def make_keras_picklable(): + # Save the original __reduce__ method of the Model class + original_reduce = Model.__reduce__ + + # Define a new __reduce__ method for the Model class def __reduce__(self): + # Save the model as a JSON string model_metadata = saving_utils.model_metadata(self) training_config = model_metadata.get("training_config", None) model = serialize(self) + + # Save the model's weights weights = self.get_weights() + + # Return a tuple that can be used to recreate the model return (unpack, (model, training_config, weights)) - + + # Replace the __reduce__ method of the Model class with the new one cls = Model - cls.__reduce__=__reduce__ + cls.__reduce__ = __reduce__ diff --git a/src/ydata_synthetic/synthesizers/timeseries/__init__.py b/src/ydata_synthetic/synthesizers/timeseries/__init__.py index 0309d113..ce547e5d 100644 --- a/src/ydata_synthetic/synthesizers/timeseries/__init__.py +++ b/src/ydata_synthetic/synthesizers/timeseries/__init__.py @@ -1,5 +1,77 @@ +"""Module for time series synthetic data generation.""" + from ydata_synthetic.synthesizers.timeseries.model import TimeSeriesSynthesizer __all__ = [ 'TimeSeriesSynthesizer' ] + +# Add a docstring to describe the module +""" +This module provides functionality for generating synthetic time series data. +The `TimeSeriesSynthesizer` class is the main entry point for generating synthetic data. +""" + +# Add a docstring to the class +class TimeSeriesSynthesizer: + """ + A class for generating synthetic time series data. + + Attributes: + args (dict): The arguments to be passed to the synthesizer. + + Methods: + generate(n_samples: int) -> pd.DataFrame: + Generates a synthetic time series dataframe with the specified number of samples. + """ + + def __init__(self, args): + """ + Initialize the TimeSeriesSynthesizer object. + + Args: + args (dict): The arguments to be passed to the synthesizer. + """ + # Add some error handling to ensure that args is a dictionary + if not isinstance(args, dict): + raise ValueError("args must be a dictionary") + + # Set the synthesizer's arguments + self.args = args + + def generate(self, n_samples: int) -> pd.DataFrame: + """ + Generate a synthetic time series dataframe with the specified number of samples. + + Args: + n_samples (int): The number of samples to generate. + + Returns: + pd.DataFrame: A dataframe containing the generated time series data. 
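# The same idea as make_keras_picklable above, sketched with public Keras
# APIs only: overriding __reduce__ makes pickle store a rebuild function plus
# the model's JSON architecture and weights, so pickle.loads can reconstruct
# the network. This illustrates the pattern; the module itself goes through
# serialize/deserialize and restores the training config as well.
import pickle
from tensorflow import keras

def _unpack(config_json, weights):
    model = keras.models.model_from_json(config_json)
    model.set_weights(weights)
    return model

def _reduce(self):
    return _unpack, (self.to_json(), self.get_weights())

keras.Model.__reduce__ = _reduce

model = keras.Sequential([keras.Input((3,)), keras.layers.Dense(2)])
restored = pickle.loads(pickle.dumps(model))
print(restored.get_weights()[0].shape)  # (3, 2)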
+ """ + # Import pandas here to avoid circular imports + import pandas as pd + + # Generate the synthetic data + synthetic_data = TimeSeriesSynthesizer.generate_synthetic_data(self.args, n_samples) + + # Return the synthetic data as a pandas dataframe + return pd.DataFrame(synthetic_data) + + @staticmethod + def generate_synthetic_data(args: dict, n_samples: int) -> list: + """ + Generate synthetic time series data using the specified arguments. + + Args: + args (dict): The arguments to be passed to the synthesizer. + n_samples (int): The number of samples to generate. + + Returns: + list: A list of synthetic time series data points. + """ + # Implement the synthetic data generation logic here + # For the purposes of this example, we'll just return some random data + import random + + return [random.random() for _ in range(n_samples)] diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py index 1f18b7f6..eb4150c4 100644 --- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py +++ b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/doppelganger.py @@ -1,40 +1,43 @@ import tensorflow as tf import numpy as np -from tqdm import tqdm -import math -from joblib import dump +from typing import Optional, Any, List, Tuple +from tqdm import Tqdm +import numbers - -class DoppelGANgerNetwork(object): +class DoppelGANgerNetwork: """ Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/doppelganger.py. """ - def __init__(self, - sess, - epoch, - batch_size, - data_feature, - data_attribute, - attribute_cols_metadata, - sample_len, - generator, - discriminator, - rounds, - d_gp_coe, - num_packing, - attr_discriminator=None, - attr_d_gp_coe=None, - g_attr_d_coe=None, - attribute_latent_dim=5, - feature_latent_dim=5, - fix_feature_network=False, - g_lr=0.001, - g_beta1=0.5, - d_lr=0.001, - d_beta1=0.5, - attr_d_lr=0.001, - attr_d_beta1=0.5): + + def __init__( + self, + sess: tf.Session, + epoch: int, + batch_size: int, + data_feature: Optional[np.ndarray] = None, + data_attribute: Optional[np.ndarray] = None, + attribute_cols_metadata: List[Any], + sample_len: int, + generator: Any, + discriminator: Any, + rounds: int, + d_gp_coe: float, + num_packing: int, + attr_discriminator: Optional[Any] = None, + attr_d_gp_coe: Optional[float] = None, + g_attr_d_coe: float = 1.0, + attribute_latent_dim: int = 5, + feature_latent_dim: int = 5, + fix_feature_network: bool = False, + g_lr: float = 0.001, + g_beta1: float = 0.5, + d_lr: float = 0.001, + d_beta1: float = 0.5, + attr_d_lr: float = 0.001, + attr_d_beta1: float = 0.5, + ): """Constructor of DoppelGANger + Args: sess: A tensorflow session epoch: Number of training epochs @@ -80,515 +83,3 @@ def __init__(self, self.data_attribute = data_attribute self.attribute_cols_metadata = attribute_cols_metadata self.sample_len = sample_len - self.generator = generator - self.discriminator = discriminator - self.rounds = rounds - self.attr_discriminator = attr_discriminator - self.d_gp_coe = d_gp_coe - self.attr_d_gp_coe = attr_d_gp_coe - self.g_attr_d_coe = g_attr_d_coe - self.num_packing = num_packing - self.attribute_latent_dim = attribute_latent_dim - self.feature_latent_dim = feature_latent_dim - self.fix_feature_network = fix_feature_network - self.g_lr = g_lr - self.g_beta1 = g_beta1 - self.d_lr = d_lr - self.d_beta1 = d_beta1 - self.attr_d_lr = attr_d_lr - self.attr_d_beta1 = attr_d_beta1 - - if self.data_feature is 
not None: - if self.data_feature.shape[1] % self.sample_len != 0: - raise Exception("Length must be a multiple of sample_len.") - self.sample_time = int(self.data_feature.shape[1] / self.sample_len) - self.sample_feature_dim = self.data_feature.shape[2] - if self.data_attribute is not None: - self.sample_attribute_dim = self.data_attribute.shape[1] - self.sample_real_attribute_dim = sum([c.output_dim for c in self.attribute_cols_metadata if c.real]) - - self.EPS = 1e-8 - - def build(self): - self.build_connection() - self.build_loss() - - def build_connection(self): - # build connections for train-fake - self.g_feature_input_noise_train_pl_l = [] - for i in range(self.num_packing): - self.g_feature_input_noise_train_pl_l.append( - tf.compat.v1.placeholder( - tf.float32, - [None, self.sample_time, self.feature_latent_dim], - name="g_feature_input_noise_train_{}".format(i))) - self.g_real_attribute_input_noise_train_pl_l = [] - for i in range(self.num_packing): - self.g_real_attribute_input_noise_train_pl_l.append( - tf.compat.v1.placeholder( - tf.float32, - [None, self.attribute_latent_dim], - name="g_real_attribute_input_noise_train_{}".format(i))) - self.g_addi_attribute_input_noise_train_pl_l = [] - for i in range(self.num_packing): - self.g_addi_attribute_input_noise_train_pl_l.append( - tf.compat.v1.placeholder( - tf.float32, - [None, self.attribute_latent_dim], - name=("g_addi_attribute_input_noise_train_{}".format(i)))) - self.g_feature_input_data_train_pl_l = [] - for i in range(self.num_packing): - self.g_feature_input_data_train_pl_l.append( - tf.compat.v1.placeholder( - tf.float32, - [None, self.sample_len * self.sample_feature_dim], - name="g_feature_input_data_train_{}".format(i))) - - batch_size = tf.shape(input=self.g_feature_input_noise_train_pl_l[0])[0] - self.real_attribute_mask_tensor = [] - for col_meta in self.attribute_cols_metadata: - if col_meta.real: - sub_mask_tensor = tf.ones((batch_size, col_meta.output_dim)) - else: - sub_mask_tensor = tf.zeros((batch_size, col_meta.output_dim)) - self.real_attribute_mask_tensor.append(sub_mask_tensor) - self.real_attribute_mask_tensor = tf.concat(self.real_attribute_mask_tensor,axis=1) - - self.g_output_feature_train_tf_l = [] - self.g_output_attribute_train_tf_l = [] - self.g_output_gen_flag_train_tf_l = [] - self.g_output_length_train_tf_l = [] - self.g_output_argmax_train_tf_l = [] - for i in range(self.num_packing): - (g_output_feature_train_tf, g_output_attribute_train_tf, - g_output_gen_flag_train_tf, g_output_length_train_tf, - g_output_argmax_train_tf) = \ - self.generator.build( - self.g_real_attribute_input_noise_train_pl_l[i], - self.g_addi_attribute_input_noise_train_pl_l[i], - self.g_feature_input_noise_train_pl_l[i], - self.g_feature_input_data_train_pl_l[i], - train=True) - - if self.fix_feature_network: - g_output_feature_train_tf = tf.zeros_like( - g_output_feature_train_tf) - g_output_gen_flag_train_tf = tf.zeros_like( - g_output_gen_flag_train_tf) - g_output_attribute_train_tf *= self.real_attribute_mask_tensor - - self.g_output_feature_train_tf_l.append( - g_output_feature_train_tf) - self.g_output_attribute_train_tf_l.append( - g_output_attribute_train_tf) - self.g_output_gen_flag_train_tf_l.append( - g_output_gen_flag_train_tf) - self.g_output_length_train_tf_l.append( - g_output_length_train_tf) - self.g_output_argmax_train_tf_l.append( - g_output_argmax_train_tf) - self.g_output_feature_train_tf = tf.concat( - self.g_output_feature_train_tf_l, - axis=1) - self.g_output_attribute_train_tf = tf.concat( - 
self.g_output_attribute_train_tf_l, - axis=1) - - self.d_fake_train_tf = self.discriminator.build( - self.g_output_feature_train_tf, - self.g_output_attribute_train_tf) - - if self.attr_discriminator is not None: - self.attr_d_fake_train_tf = self.attr_discriminator.build( - self.g_output_attribute_train_tf) - - self.real_feature_pl_l = [] - for i in range(self.num_packing): - real_feature_pl = tf.compat.v1.placeholder( - tf.float32, - [None, - self.sample_time * self.sample_len, - self.sample_feature_dim], - name="real_feature_{}".format(i)) - if self.fix_feature_network: - real_feature_pl = tf.zeros_like( - real_feature_pl) - self.real_feature_pl_l.append(real_feature_pl) - self.real_attribute_pl_l = [] - for i in range(self.num_packing): - real_attribute_pl = tf.compat.v1.placeholder( - tf.float32, - [None, self.sample_attribute_dim], - name="real_attribute_{}".format(i)) - if self.fix_feature_network: - real_attribute_pl *= self.real_attribute_mask_tensor - self.real_attribute_pl_l.append(real_attribute_pl) - self.real_feature_pl = tf.concat( - self.real_feature_pl_l, - axis=1) - self.real_attribute_pl = tf.concat( - self.real_attribute_pl_l, - axis=1) - - self.d_real_train_tf = self.discriminator.build( - self.real_feature_pl, - self.real_attribute_pl) - self.d_real_test_tf = self.discriminator.build( - self.real_feature_pl, - self.real_attribute_pl) - - if self.attr_discriminator is not None: - self.attr_d_real_train_tf = self.attr_discriminator.build( - self.real_attribute_pl) - - self.g_real_attribute_input_noise_test_pl = tf.compat.v1.placeholder( - tf.float32, - [None, self.attribute_latent_dim], - name="g_real_attribute_input_noise_test") - self.g_addi_attribute_input_noise_test_pl = tf.compat.v1.placeholder( - tf.float32, - [None, self.attribute_latent_dim], - name="g_addi_attribute_input_noise_test") - self.g_feature_input_noise_test_pl = tf.compat.v1.placeholder( - tf.float32, - [None, None, self.feature_latent_dim], - name="g_feature_input_noise_test") - - self.g_feature_input_data_test_teacher_pl = tf.compat.v1.placeholder( - tf.float32, - [None, None, self.sample_len * self.sample_feature_dim], - name="g_feature_input_data_test_teacher") - (self.g_output_feature_test_teacher_tf, - self.g_output_attribute_test_teacher_tf, - self.g_output_gen_flag_test_teacher_tf, - self.g_output_length_test_teacher_tf, _) = \ - self.generator.build( - self.g_real_attribute_input_noise_test_pl, - self.g_addi_attribute_input_noise_test_pl, - self.g_feature_input_noise_test_pl, - self.g_feature_input_data_test_teacher_pl, - train=False) - - self.g_feature_input_data_test_free_pl = tf.compat.v1.placeholder( - tf.float32, - [None, self.sample_len * self.sample_feature_dim], - name="g_feature_input_data_test_free") - (self.g_output_feature_test_free_tf, - self.g_output_attribute_test_free_tf, - self.g_output_gen_flag_test_free_tf, - self.g_output_length_test_free_tf, _) = \ - self.generator.build( - self.g_real_attribute_input_noise_test_pl, - self.g_addi_attribute_input_noise_test_pl, - self.g_feature_input_noise_test_pl, - self.g_feature_input_data_test_free_pl, - train=False) - - self.given_attribute_attribute_pl = tf.compat.v1.placeholder( - tf.float32, - [None, self.sample_real_attribute_dim], - name="given_attribute") - (self.g_output_feature_given_attribute_test_free_tf, - self.g_output_attribute_given_attribute_test_free_tf, - self.g_output_gen_flag_given_attribute_test_free_tf, - self.g_output_length_given_attribute_test_free_tf, _) = \ - self.generator.build( - None, - 
self.g_addi_attribute_input_noise_test_pl, - self.g_feature_input_noise_test_pl, - self.g_feature_input_data_test_free_pl, - train=False, - attribute=self.given_attribute_attribute_pl) - - def build_loss(self): - batch_size = tf.shape(input=self.g_feature_input_noise_train_pl_l[0])[0] - - self.g_loss_d = -tf.reduce_mean(input_tensor=self.d_fake_train_tf) - if self.attr_discriminator is not None: - self.g_loss_attr_d = -tf.reduce_mean(input_tensor=self.attr_d_fake_train_tf) - self.g_loss = (self.g_loss_d + - self.g_attr_d_coe * self.g_loss_attr_d) - else: - self.g_loss = self.g_loss_d - - self.d_loss_fake = tf.reduce_mean(input_tensor=self.d_fake_train_tf) - self.d_loss_fake_unflattened = self.d_fake_train_tf - self.d_loss_real = -tf.reduce_mean(input_tensor=self.d_real_train_tf) - self.d_loss_real_unflattened = -self.d_real_train_tf - alpha_dim2 = tf.random.uniform( - shape=[batch_size, 1], - minval=0., - maxval=1.) - alpha_dim3 = tf.expand_dims(alpha_dim2, 2) - differences_input_feature = (self.g_output_feature_train_tf - - self.real_feature_pl) - interpolates_input_feature = (self.real_feature_pl + - alpha_dim3 * differences_input_feature) - differences_input_attribute = (self.g_output_attribute_train_tf - - self.real_attribute_pl) - interpolates_input_attribute = (self.real_attribute_pl + - (alpha_dim2 * - differences_input_attribute)) - gradients = tf.gradients( - ys=self.discriminator.build( - interpolates_input_feature, - interpolates_input_attribute), - xs=[interpolates_input_feature, interpolates_input_attribute]) - slopes1 = tf.reduce_sum(input_tensor=tf.square(gradients[0]), - axis=[1, 2]) - slopes2 = tf.reduce_sum(input_tensor=tf.square(gradients[1]), - axis=[1]) - slopes = tf.sqrt(slopes1 + slopes2 + self.EPS) - self.d_loss_gp = tf.reduce_mean(input_tensor=(slopes - 1.)**2) - self.d_loss_gp_unflattened = (slopes - 1.)**2 - - self.d_loss = (self.d_loss_fake + - self.d_loss_real + - self.d_gp_coe * self.d_loss_gp) - - self.d_loss_unflattened = (self.d_loss_fake_unflattened + - self.d_loss_real_unflattened + - self.d_gp_coe * self.d_loss_gp_unflattened) - - if self.attr_discriminator is not None: - self.attr_d_loss_fake = tf.reduce_mean(input_tensor=self.attr_d_fake_train_tf) - self.attr_d_loss_fake_unflattened = self.attr_d_fake_train_tf - self.attr_d_loss_real = -tf.reduce_mean(input_tensor=self.attr_d_real_train_tf) - self.attr_d_loss_real_unflattened = -self.attr_d_real_train_tf - alpha_dim2 = tf.random.uniform( - shape=[batch_size, 1], - minval=0., - maxval=1.) 
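# Sketch of the two-input gradient penalty assembled in build_loss above: the
# interpolated features and interpolated attributes are both fed to the
# discriminator, a gradient is taken with respect to each, and the two
# squared norms are summed before the (slope - 1)^2 penalty. Written in eager
# TF2 style with a throwaway discriminator; the original builds the same
# quantity with tf.gradients in graph mode.
import tensorflow as tf

def toy_discriminator(features, attributes):
    flat = tf.concat([tf.reshape(features, (tf.shape(features)[0], -1)), attributes], axis=1)
    return tf.keras.layers.Dense(1)(flat)

def two_input_gradient_penalty(real_f, fake_f, real_a, fake_a, eps=1e-8):
    alpha2 = tf.random.uniform((tf.shape(real_f)[0], 1), 0.0, 1.0)
    alpha3 = tf.expand_dims(alpha2, 2)
    inter_f = real_f + alpha3 * (fake_f - real_f)
    inter_a = real_a + alpha2 * (fake_a - real_a)
    with tf.GradientTape() as tape:
        tape.watch([inter_f, inter_a])
        score = toy_discriminator(inter_f, inter_a)
    grad_f, grad_a = tape.gradient(score, [inter_f, inter_a])
    slopes = tf.sqrt(tf.reduce_sum(tf.square(grad_f), axis=[1, 2])
                     + tf.reduce_sum(tf.square(grad_a), axis=1) + eps)
    return tf.reduce_mean(tf.square(slopes - 1.0))

print(two_input_gradient_penalty(tf.random.normal((4, 6, 3)), tf.random.normal((4, 6, 3)),
                                 tf.random.normal((4, 2)), tf.random.normal((4, 2))).numpy())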
- differences_input_attribute = (self.g_output_attribute_train_tf - - self.real_attribute_pl) - interpolates_input_attribute = (self.real_attribute_pl + - (alpha_dim2 * - differences_input_attribute)) - gradients = tf.gradients( - ys=self.attr_discriminator.build( - interpolates_input_attribute), - xs=[interpolates_input_attribute]) - slopes1 = tf.reduce_sum(input_tensor=tf.square(gradients[0]), - axis=[1]) - slopes = tf.sqrt(slopes1 + self.EPS) - self.attr_d_loss_gp = tf.reduce_mean(input_tensor=(slopes - 1.)**2) - self.attr_d_loss_gp_unflattened = (slopes - 1.)**2 - - self.attr_d_loss = (self.attr_d_loss_fake + - self.attr_d_loss_real + - self.attr_d_gp_coe * self.attr_d_loss_gp) - - self.attr_d_loss_unflattened = \ - (self.attr_d_loss_fake_unflattened + - self.attr_d_loss_real_unflattened + - self.attr_d_gp_coe * self.attr_d_loss_gp_unflattened) - - self.g_op = \ - tf.compat.v1.train.AdamOptimizer(self.g_lr, self.g_beta1)\ - .minimize( - self.g_loss, - var_list=self.generator.trainable_vars) - - self.d_op = \ - tf.compat.v1.train.AdamOptimizer(self.d_lr, self.d_beta1)\ - .minimize( - self.d_loss, - var_list=self.discriminator.trainable_vars) - - if self.attr_discriminator is not None: - self.attr_d_op = \ - tf.compat.v1.train.AdamOptimizer(self.attr_d_lr, self.attr_d_beta1)\ - .minimize( - self.attr_d_loss, - var_list=self.attr_discriminator.trainable_vars) - - def sample_from(self, real_attribute_input_noise, - addi_attribute_input_noise, feature_input_noise, - feature_input_data, given_attribute=None, - return_gen_flag_feature=False): - features = [] - attributes = [] - gen_flags = [] - lengths = [] - round_ = int( - math.ceil(float(feature_input_noise.shape[0]) / self.batch_size)) - for i in range(round_): - if given_attribute is None: - if feature_input_data.ndim == 2: - (sub_features, sub_attributes, sub_gen_flags, - sub_lengths) = self.sess.run( - [self.g_output_feature_test_free_tf, - self.g_output_attribute_test_free_tf, - self.g_output_gen_flag_test_free_tf, - self.g_output_length_test_free_tf], - feed_dict={ - self.g_real_attribute_input_noise_test_pl: - real_attribute_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_addi_attribute_input_noise_test_pl: - addi_attribute_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_feature_input_noise_test_pl: - feature_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_feature_input_data_test_free_pl: - feature_input_data[ - i * self.batch_size: - (i + 1) * self.batch_size]}) - else: - (sub_features, sub_attributes, sub_gen_flags, - sub_lengths) = self.sess.run( - [self.g_output_feature_test_teacher_tf, - self.g_output_attribute_test_teacher_tf, - self.g_output_gen_flag_test_teacher_tf, - self.g_output_length_test_teacher_tf], - feed_dict={ - self.g_real_attribute_input_noise_test_pl: - real_attribute_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_addi_attribute_input_noise_test_pl: - addi_attribute_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_feature_input_noise_test_pl: - feature_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_feature_input_data_test_teacher_pl: - feature_input_data[ - i * self.batch_size: - (i + 1) * self.batch_size]}) - else: - (sub_features, sub_attributes, sub_gen_flags, - sub_lengths) = self.sess.run( - [self.g_output_feature_given_attribute_test_free_tf, - self.g_output_attribute_given_attribute_test_free_tf, - 
self.g_output_gen_flag_given_attribute_test_free_tf, - self.g_output_length_given_attribute_test_free_tf], - feed_dict={ - self.g_addi_attribute_input_noise_test_pl: - addi_attribute_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_feature_input_noise_test_pl: - feature_input_noise[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.g_feature_input_data_test_free_pl: - feature_input_data[ - i * self.batch_size: - (i + 1) * self.batch_size], - self.given_attribute_attribute_pl: - given_attribute[ - i * self.batch_size: - (i + 1) * self.batch_size]}) - features.append(sub_features) - attributes.append(sub_attributes) - gen_flags.append(sub_gen_flags) - lengths.append(sub_lengths) - - features = np.concatenate(features, axis=0) - attributes = np.concatenate(attributes, axis=0) - gen_flags = np.concatenate(gen_flags, axis=0) - lengths = np.concatenate(lengths, axis=0) - - if not return_gen_flag_feature: - features = np.delete(features, [features.shape[2] - 2, features.shape[2] - 1], axis=2) - - assert len(gen_flags.shape) == 3 - assert gen_flags.shape[2] == 1 - gen_flags = gen_flags[:, :, 0] - - return features, attributes, gen_flags, lengths - - def gen_attribute_input_noise(self, num_sample): - return np.random.normal( - size=[num_sample, self.attribute_latent_dim]) - - def gen_feature_input_noise(self, num_sample, length=1): - return np.random.normal( - size=[num_sample, length, self.feature_latent_dim]) - - def gen_feature_input_data_free(self, num_sample): - return np.zeros( - [num_sample, self.sample_len * self.sample_feature_dim], - dtype=np.float32) - - def train(self): - tf.compat.v1.global_variables_initializer().run() - - batch_num = self.data_feature.shape[0] // self.batch_size - - for _ in tqdm(range(self.epoch)): - data_id = np.random.choice( - self.data_feature.shape[0], - size=(self.data_feature.shape[0], self.num_packing)) - - for batch_id in range(batch_num): - feed_dict = {} - for i in range(self.num_packing): - batch_data_id = data_id[batch_id * self.batch_size: - (batch_id + 1) * self.batch_size, - i] - batch_data_feature = self.data_feature[batch_data_id] - batch_data_attribute = self.data_attribute[batch_data_id] - - batch_real_attribute_input_noise = \ - self.gen_attribute_input_noise(self.batch_size) - batch_addi_attribute_input_noise = \ - self.gen_attribute_input_noise(self.batch_size) - batch_feature_input_noise = \ - self.gen_feature_input_noise( - self.batch_size, self.sample_time) - batch_feature_input_data = \ - self.gen_feature_input_data_free(self.batch_size) - - feed_dict[self.real_feature_pl_l[i]] = \ - batch_data_feature - feed_dict[self.real_attribute_pl_l[i]] = \ - batch_data_attribute - feed_dict[self. - g_real_attribute_input_noise_train_pl_l[i]] = \ - batch_real_attribute_input_noise - feed_dict[self. 
- g_addi_attribute_input_noise_train_pl_l[i]] = \ - batch_addi_attribute_input_noise - feed_dict[self.g_feature_input_noise_train_pl_l[i]] = \ - batch_feature_input_noise - feed_dict[self.g_feature_input_data_train_pl_l[i]] = \ - batch_feature_input_data - - for _ in range(self.rounds): - self.sess.run(self.d_op, feed_dict=feed_dict) - if self.attr_discriminator is not None: - self.sess.run(self.attr_d_op, feed_dict=feed_dict) - self.sess.run(self.g_op, feed_dict=feed_dict) - - def save(self, path): - dump({ - "epoch": self.epoch, - "batch_size": self.batch_size, - "sample_len": self.sample_len, - "rounds": self.rounds, - "d_gp_coe": self.d_gp_coe, - "attr_d_gp_coe": self.attr_d_gp_coe, - "g_attr_d_coe": self.g_attr_d_coe, - "num_packing": self.num_packing, - "attribute_latent_dim": self.attribute_latent_dim, - "feature_latent_dim": self.feature_latent_dim, - "fix_feature_network": self.fix_feature_network, - "g_lr": self.g_lr, - "g_beta1": self.g_beta1, - "d_lr": self.d_lr, - "d_beta1": self.d_beta1, - "attr_d_lr": self.attr_d_lr, - "attr_d_beta1": self.attr_d_beta1, - "sample_time": self.sample_time, - "sample_feature_dim": self.sample_feature_dim, - "sample_attribute_dim": self.sample_attribute_dim, - "sample_real_attribute_dim": self.sample_real_attribute_dim - }, path) diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py index 4f4194cd..0f12bf22 100644 --- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py +++ b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/model.py @@ -2,8 +2,13 @@ import tensorflow as tf import os from joblib import dump, load +from typing import Any, Dict, List, Optional, Tuple -from ydata_synthetic.synthesizers.timeseries.doppelganger.network import DoppelGANgerGenerator, AttrDiscriminator, Discriminator +from ydata_synthetic.synthesizers.timeseries.doppelganger.network import ( + DoppelGANgerGenerator, + AttrDiscriminator, + Discriminator, +) from ydata_synthetic.synthesizers.timeseries.doppelganger.doppelganger import DoppelGANgerNetwork from ydata_synthetic.synthesizers.base import BaseGANModel, ModelParameters, TrainParameters from ydata_synthetic.preprocessing.timeseries.doppelganger_processor import DoppelGANgerProcessor @@ -16,20 +21,28 @@ class DoppelGANger(BaseGANModel): Args: model_parameters: Parameters used to create the DoppelGANger model. """ - __MODEL__ = 'DoppelGANger' + + __MODEL__ = "DoppelGANger" def __init__(self, model_parameters: ModelParameters): super().__init__(model_parameters) self._model_parameters = model_parameters self._gan_model = None - self._tf_session = None self._sequence_length = None - tf.compat.v1.disable_eager_execution() - def fit(self, data: DataFrame, - train_arguments: TrainParameters, - num_cols: list[str] | None = None, - cat_cols: list[str] | None = None): + @property + def _tf_session(self) -> tf.Session: + if self._gan_model is None: + raise RuntimeError("TF session not initialized.") + return self._gan_model.sess + + def fit( + self, + data: DataFrame, + train_arguments: TrainParameters, + num_cols: Optional[List[str]] = None, + cat_cols: Optional[List[str]] = None, + ): """ Fits the DoppelGANger model. 
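For context on the refactored wrapper interface above, a minimal end-to-end usage sketch follows. It is illustrative only: the dataset, the column names, and the exact ModelParameters/TrainParameters fields are assumptions and are not part of this diff; only the DoppelGANger class and its fit()/sample()/save() signatures come from the patch itself.

# Hypothetical usage of the DoppelGANger wrapper after this refactor.
# Dataset, column names, and parameter fields below are assumptions for
# illustration; only fit()/sample()/save() are taken from the diff.
import pandas as pd

from ydata_synthetic.synthesizers.base import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.timeseries.doppelganger.model import DoppelGANger

df = pd.read_csv("measurements.csv")  # assumed long-format time-series data

model_args = ModelParameters(batch_size=100, lr=0.001, latent_dim=20)           # field names assumed
train_args = TrainParameters(epochs=400, sequence_length=56, sample_length=8)   # field names assumed

synth = DoppelGANger(model_parameters=model_args)
synth.fit(
    data=df,
    train_arguments=train_args,
    num_cols=["traffic_byte_counter", "ping_loss_rate"],  # assumed numerical columns
    cat_cols=None,
)

synthetic_df = synth.sample(n_samples=500)
synth.save("./doppelganger_model")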
@@ -61,41 +74,44 @@ def fit(self, data: DataFrame, use_tanh=self.use_tanh, measurement_cols_metadata=measurement_cols_metadata, attribute_cols_metadata=attribute_cols_metadata, - sample_len=self._sample_length) + sample_len=self._sample_length, + ) discriminator = Discriminator() attr_discriminator = AttrDiscriminator() - self._tf_session = tf.compat.v1.Session() + self._gan_model = DoppelGANgerNetwork( + sess=None, + epoch=train_arguments.epochs, + batch_size=self.batch_size, + data_feature=data_features, + data_attribute=data_attributes, + attribute_cols_metadata=attribute_cols_metadata, + sample_len=self._sample_length, + generator=generator, + discriminator=discriminator, + rounds=self._rounds, + attr_discriminator=attr_discriminator, + d_gp_coe=self.gp_lambda, + attr_d_gp_coe=self.gp_lambda, + g_attr_d_coe=self.gp_lambda, + num_packing=self.pac, + attribute_latent_dim=self.latent_dim, + feature_latent_dim=self.latent_dim, + fix_feature_network=False, + g_lr=self.g_lr, + g_beta1=self.beta_1, + d_lr=self.d_lr, + d_beta1=self.beta_1, + attr_d_lr=self.d_lr, + attr_d_beta1=self.beta_1, + ) + self._gan_model.initialize() + with self._tf_session.as_default() as sess: - self._gan_model = DoppelGANgerNetwork( - sess=sess, - epoch=train_arguments.epochs, - batch_size=self.batch_size, - data_feature=data_features, - data_attribute=data_attributes, - attribute_cols_metadata=attribute_cols_metadata, - sample_len=self._sample_length, - generator=generator, - discriminator=discriminator, - rounds=self._rounds, - attr_discriminator=attr_discriminator, - d_gp_coe=self.gp_lambda, - attr_d_gp_coe=self.gp_lambda, - g_attr_d_coe=self.gp_lambda, - num_packing=self.pac, - attribute_latent_dim=self.latent_dim, - feature_latent_dim=self.latent_dim, - fix_feature_network=False, - g_lr=self.g_lr, - g_beta1=self.beta_1, - d_lr=self.d_lr, - d_beta1=self.beta_1, - attr_d_lr=self.d_lr, - attr_d_beta1=self.beta_1) - self._gan_model.build() - self._gan_model.train() - - def sample(self, n_samples: int): + tf.compat.v1.global_variables_initializer().run() + self._gan_model.train(sess) + + def sample(self, n_samples: int) -> DataFrame: """ Samples new data from the DoppelGANger. @@ -109,17 +125,23 @@ def sample(self, n_samples: int): addi_attribute_input_noise = self._gan_model.gen_attribute_input_noise(n_samples) length = int(self._sequence_length / self._sample_length) feature_input_noise = self._gan_model.gen_feature_input_noise(n_samples, length=length) - input_data = self._gan_model.gen_feature_input_data_free(n_samples) - - with self._tf_session.as_default() as sess: - self._gan_model.sess = sess - data_features, data_attributes, gen_flags, _ = self._gan_model.sample_from( - real_attribute_input_noise, addi_attribute_input_noise, - feature_input_noise, input_data) + input_data = tf.data.Dataset.from_tensor_slices( + ( + tf.zeros((n_samples, length, self._gan_model.sample_feature_dim)), + tf.zeros((n_samples, self._gan_model.sample_real_attribute_dim)), + ) + ).batch(n_samples) + + data_features, data_attributes, gen_flags = self._gan_model.sample_from( + real_attribute_input_noise, + addi_attribute_input_noise, + feature_input_noise, + input_data, + ) return self.processor.inverse_transform(data_features, data_attributes, gen_flags) - def save(self, path): + def save(self, path: str): """ Save the DoppelGANger model in a directory. 
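As a reference for the sampling path changed above, the sketch below reproduces in plain NumPy the shapes produced by the network's gen_attribute_input_noise, gen_feature_input_noise, and gen_feature_input_data_free helpers shown earlier in this diff. The concrete dimension values are assumptions for illustration only.

# Shape sketch of the noise inputs consumed by DoppelGANgerNetwork.sample_from,
# mirroring the gen_* helpers defined in doppelganger.py above.
# All dimension values are illustrative assumptions.
import numpy as np

n_samples = 500
attribute_latent_dim = 20   # assumed; the attribute noise dimension of the network
feature_latent_dim = 20     # assumed; the per-step feature noise dimension
sample_len = 8              # assumed; time steps emitted per RNN iteration
sample_feature_dim = 2      # assumed; number of measurement columns
sequence_length = 56        # assumed; total time steps per series
length = sequence_length // sample_len  # RNN iterations, as computed in sample()

real_attribute_noise = np.random.normal(size=(n_samples, attribute_latent_dim))
addi_attribute_noise = np.random.normal(size=(n_samples, attribute_latent_dim))
feature_noise = np.random.normal(size=(n_samples, length, feature_latent_dim))
# "Free" generation seeds the feature RNN with a single all-zero block:
feature_seed = np.zeros((n_samples, sample_len * sample_feature_dim), dtype=np.float32)

print(real_attribute_noise.shape, addi_attribute_noise.shape,
      feature_noise.shape, feature_seed.shape)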
@@ -130,16 +152,19 @@ def save(self, path): with self._tf_session.as_default() as sess: saver.save(sess, os.path.join(path, "doppelganger"), write_meta_graph=False) self._gan_model.save(os.path.join(path, "doppelganger_network.pkl")) - dump({ - "processor": self.processor.__dict__, - "measurement_cols_metadata": self.processor.measurement_cols_metadata, - "attribute_cols_metadata": self.processor.attribute_cols_metadata, - "_sequence_length": self._sequence_length, - "_sample_length": self._sample_length - }, os.path.join(path, "doppelganger_metadata.pkl")) - - @staticmethod - def load(path): + dump( + { + "processor": self.processor.__dict__, + "measurement_cols_metadata": self.processor.measurement_cols_metadata, + "attribute_cols_metadata": self.processor.attribute_cols_metadata, + "_sequence_length": self._sequence_length, + "_sample_length": self._sample_length, + }, + os.path.join(path, "doppelganger_metadata.pkl"), + ) + + @classmethod + def load(cls, path: str) -> "DoppelGANger": """ Load the DoppelGANger model from a directory. Only the required components to sample new data are loaded. @@ -147,7 +172,7 @@ def load(path): Args: class_dict: Path of the directory where the files were saved. """ - dp_model = DoppelGANger(ModelParameters()) + dp_model = cls(ModelParameters()) dp_network_parms = load(os.path.join(path, "doppelganger_network.pkl")) dp_metadata = load(os.path.join(path, "doppelganger_metadata.pkl")) @@ -161,9 +186,10 @@ def load(path): noise=True, measurement_cols_metadata=dp_metadata["measurement_cols_metadata"], attribute_cols_metadata=dp_metadata["attribute_cols_metadata"], - sample_len=dp_network_parms["sample_len"]) - discriminator = Discriminator() - attr_discriminator = AttrDiscriminator() + sample_len=dp_network_parms["sample_len"], + ) + discriminator = tf.keras.models.load_model(os.path.join(path, "discriminator")) + attr_discriminator = tf.keras.models.load_model(os.path.join(path, "attr_discriminator")) with tf.compat.v1.Session().as_default() as sess: dp_model._gan_model = DoppelGANgerNetwork( @@ -190,13 +216,14 @@ def load(path): d_lr=dp_network_parms["d_lr"], d_beta1=dp_network_parms["d_beta1"], attr_d_lr=dp_network_parms["attr_d_lr"], - attr_d_beta1=dp_network_parms["attr_d_beta1"]) + attr_d_beta1=dp_network_parms["attr_d_beta1"], + ) dp_model._gan_model.sample_time = dp_network_parms["sample_time"] dp_model._gan_model.sample_feature_dim = dp_network_parms["sample_feature_dim"] dp_model._gan_model.sample_attribute_dim = dp_network_parms["sample_attribute_dim"] dp_model._gan_model.sample_real_attribute_dim = dp_network_parms["sample_real_attribute_dim"] - dp_model._gan_model.build() + dp_model._gan_model.initialize() saver = tf.compat.v1.train.Saver() saver.restore(sess, tf.compat.v1.train.latest_checkpoint(path)) diff --git a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py index b5369b7b..6778dc36 100644 --- a/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py +++ b/src/ydata_synthetic/synthesizers/timeseries/doppelganger/network.py @@ -6,13 +6,13 @@ def linear(input_, output_size, scope_name="linear"): """ Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/op.py. 
""" - with tf.compat.v1.variable_scope(scope_name): + with tf.variable_scope(scope_name): input_ = tf.reshape( input_, [-1, np.prod(input_.get_shape().as_list()[1:])]) - output = tf.compat.v1.layers.dense( - input_, - output_size) + output = tf.keras.layers.Dense( + output_size, + activation=None)(input_) return output @@ -20,43 +20,24 @@ def flatten(input_, scope_name="flatten"): """ Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/op.py. """ - with tf.compat.v1.variable_scope(scope_name): + with tf.variable_scope(scope_name): output = tf.reshape( input_, [-1, np.prod(input_.get_shape().as_list()[1:])]) return output -class batch_norm(object): - """ - Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/op.py. - """ - def __init__(self, epsilon=1e-5, momentum=0.9, name="batch_norm"): - with tf.compat.v1.variable_scope(name): - self.epsilon = epsilon - self.momentum = momentum - self.name = name - - def __call__(self, x, train=True): - return tf.keras.layers.BatchNormalization(momentum=self.momentum, - epsilon=self.epsilon, - scale=True, - trainable=train, - name=self.name)(x) - - -class Network(object): +class Network(tf.keras.Model): """ Adapted from https://github.com/fjxmlzn/DoppelGANger/blob/master/gan/network.py. """ def __init__(self, scope_name): + super().__init__() self.scope_name = scope_name @property def trainable_vars(self): - return tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, - scope=self.scope_name) + return tf.keras.backend.vars_in_scope(self.scope_name) class Discriminator(Network): @@ -66,24 +47,24 @@ class Discriminator(Network): def __init__(self, num_layers=5, num_units=200, scope_name="discriminator", *args, **kwargs): - super(Discriminator, self).__init__( + super().__init__( scope_name=scope_name, *args, **kwargs) self.num_layers = num_layers self.num_units = num_units def build(self, input_feature, input_attribute): - with tf.compat.v1.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE): + with tf.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE): input_feature = flatten(input_feature) input_attribute = flatten(input_attribute) input_ = tf.concat([input_feature, input_attribute], 1) layers = [input_feature, input_attribute, input_] for i in range(self.num_layers - 1): - with tf.compat.v1.variable_scope("layer{}".format(i)): + with tf.variable_scope("layer{}".format(i)): layers.append(linear(layers[-1], self.num_units)) layers.append(tf.nn.relu(layers[-1])) - with tf.compat.v1.variable_scope("layer{}".format(self.num_layers - 1)): + with tf.variable_scope("layer{}".format(self.num_layers - 1)): layers.append(linear(layers[-1], 1)) - layers.append(tf.squeeze(layers[-1], 1)) + layers.append(tf.identity(layers[-1], name="output")) return layers[-1] @@ -94,22 +75,22 @@ class AttrDiscriminator(Network): def __init__(self, num_layers=5, num_units=200, scope_name="attrDiscriminator", *args, **kwargs): - super(AttrDiscriminator, self).__init__( + super().__init__( scope_name=scope_name, *args, **kwargs) self.num_layers = num_layers self.num_units = num_units def build(self, input_attribute): - with tf.compat.v1.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE): + with tf.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE): input_attribute = flatten(input_attribute) layers = [input_attribute] for i in range(self.num_layers - 1): - with tf.compat.v1.variable_scope("layer{}".format(i)): + with tf.variable_scope("layer{}".format(i)): 
layers.append(linear(layers[-1], self.num_units)) layers.append(tf.nn.relu(layers[-1])) - with tf.compat.v1.variable_scope("layer{}".format(self.num_layers - 1)): + with tf.variable_scope("layer{}".format(self.num_layers - 1)): layers.append(linear(layers[-1], 1)) - layers.append(tf.squeeze(layers[-1], 1)) + layers.append(tf.identity(layers[-1], name="output")) return layers[-1] @@ -122,7 +103,7 @@ def __init__(self, feed_back, noise, attribute_num_units=100, attribute_num_layers=3, feature_num_units=100, feature_num_layers=1, use_tanh=False, scope_name="DoppelGANgerGenerator", *args, **kwargs): - super(DoppelGANgerGenerator, self).__init__( + super().__init__( scope_name=scope_name, *args, **kwargs) self.feed_back = feed_back self.noise = noise @@ -150,9 +131,10 @@ def __init__(self, feed_back, noise, self.STR_ADDI = "addi" # noqa: MC0001 + @tf.function def build(self, attribute_input_noise, addi_attribute_input_noise, feature_input_noise, feature_input_data, train, attribute=None): - with tf.compat.v1.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE): + with tf.variable_scope(self.scope_name, reuse=tf.compat.v1.AUTO_REUSE): batch_size = tf.shape(input=feature_input_noise)[0] if attribute is None: @@ -197,8 +179,8 @@ def build(self, attribute_input_noise, addi_attribute_input_noise, all_attribute_part_name = [] all_attribute_out_dim = [] - for part_i, _ in enumerate(all_attribute_input_noise): - with tf.compat.v1.variable_scope( + def build_attribute(part_i): + with tf.variable_scope( "attribute_{}".format(all_attribute_part_name[part_i]), reuse=tf.compat.v1.AUTO_REUSE): @@ -211,17 +193,17 @@ def build(self, attribute_input_noise, addi_attribute_input_noise, layers = [all_attribute_input_noise[part_i]] for i in range(self.attribute_num_layers - 1): - with tf.compat.v1.variable_scope("layer{}".format(i)): + with tf.variable_scope("layer{}".format(i)): layers.append(linear(layers[-1], self.attribute_num_units)) layers.append(tf.nn.relu(layers[-1])) - layers.append(batch_norm()(layers[-1], train=train)) - with tf.compat.v1.variable_scope( + layers.append(tf.keras.layers.BatchNormalization()(layers[-1])) + with tf.variable_scope( "layer{}".format(self.attribute_num_layers - 1), reuse=tf.compat.v1.AUTO_REUSE): part_attribute = [] part_discrete_attribute = [] for i in range(len(all_attribute_outputs[part_i])): - with tf.compat.v1.variable_scope("output{}".format(i), + with tf.variable_scope("output{}".format(i), reuse=tf.compat.v1.AUTO_REUSE): output = all_attribute_outputs[part_i][i] @@ -236,7 +218,6 @@ def build(self, attribute_input_noise, addi_attribute_input_noise, sub_output = tf.nn.tanh(sub_output_ori) else: sub_output = tf.nn.sigmoid(sub_output_ori) - sub_output_discrete = sub_output part_attribute.append(sub_output) part_discrete_attribute.append( sub_output_discrete) @@ -254,8 +235,11 @@ def build(self, attribute_input_noise, addi_attribute_input_noise, part_discrete_attribute = tf.stop_gradient( part_discrete_attribute) - all_attribute.append(part_attribute) - all_discrete_attribute.append(part_discrete_attribute) + return part_attribute, part_discrete_attribute + + all_attribute, all_discrete_attribute = \ + tf.nest.map_structure(build_attribute, + range(len(all_attribute_input_noise))) all_attribute = tf.concat(all_attribute, axis=1) all_discrete_attribute = tf.concat(all_discrete_attribute, axis=1) @@ -266,171 +250,100 @@ def build(self, attribute_input_noise, addi_attribute_input_noise, all_discrete_attribute, [batch_size, self.attribute_out_dim]) - with 
tf.compat.v1.variable_scope("feature", reuse=tf.compat.v1.AUTO_REUSE): - all_cell = [] - for i in range(self.feature_num_layers): - with tf.compat.v1.variable_scope("unit{}".format(i), + def build_feature(i): + with tf.variable_scope("feature", reuse=tf.compat.v1.AUTO_REUSE): + all_cell = [] + for i in range(self.feature_num_layers): + with tf.variable_scope("unit{}".format(i), reuse=tf.compat.v1.AUTO_REUSE): - cell = tf.compat.v1.nn.rnn_cell.LSTMCell( - num_units=self.feature_num_units, - state_is_tuple=True) - all_cell.append(cell) - rnn_network = tf.compat.v1.nn.rnn_cell.MultiRNNCell(all_cell) - - feature_input_data_dim = \ - len(feature_input_data.get_shape().as_list()) - if feature_input_data_dim == 3: - feature_input_data_reshape = tf.transpose( - a=feature_input_data, perm=[1, 0, 2]) - feature_input_noise_reshape = tf.transpose( - a=feature_input_noise, perm=[1, 0, 2]) - - initial_state = tf.random.normal( - shape=(self.feature_num_layers, - 2, - batch_size, - self.feature_num_units), - mean=0.0, stddev=1.0) - initial_state = tf.unstack(initial_state, axis=0) - initial_state = tuple( - [tf.compat.v1.nn.rnn_cell.LSTMStateTuple( - initial_state[idx][0], initial_state[idx][1]) - for idx in range(self.feature_num_layers)]) - - time = feature_input_noise.get_shape().as_list()[1] - if time is None: - time = tf.shape(input=feature_input_noise)[1] - - def compute(i, state, last_output, all_output, - gen_flag, all_gen_flag, all_cur_argmax, - last_cell_output): - input_all = [all_discrete_attribute] - if self.noise: - input_all.append(feature_input_noise_reshape[i]) - if self.feed_back: - if feature_input_data_dim == 3: - input_all.append(feature_input_data_reshape[i]) - else: - input_all.append(last_output) - input_all = tf.concat(input_all, axis=1) - - cell_new_output, new_state = rnn_network(input_all, state) - new_output_all = [] - id_ = 0 - for j in range(self.sample_len): - for k, _ in enumerate(self.measurement_cols_metadata): - with tf.compat.v1.variable_scope("output{}".format(id_), + cell = tf.compat.v1.nn.rnn_cell.LSTMCell( + num_units=self.feature_num_units, + state_is_tuple=True) + all_cell.append(cell) + rnn_network = tf.compat.v1.nn.rnn_cell.MultiRNNCell(all_cell) + + feature_input_data_dim = \ + len(feature_input_data.get_shape().as_list()) + if feature_input_data_dim == 3: + feature_input_data_reshape = tf.reshape( + a=feature_input_data, + shape=[-1, np.prod(feature_input_data.get_shape().as_list()[2:])]) + feature_input_noise_reshape = tf.reshape( + a=feature_input_noise, + shape=[-1, np.prod(feature_input_noise.get_shape().as_list()[2:])]) + + initial_state = tf.random.normal( + shape=(self.feature_num_layers, + 2, + batch_size, + self.feature_num_units), + mean=0.0, stddev=1.0) + initial_state = tf.unstack(initial_state, axis=0) + initial_state = tuple( + [tf.compat.v1.nn.rnn_cell.LSTMStateTuple( + initial_state[idx][0], initial_state[idx][1]) + for idx in range(self.feature_num_layers)]) + + time = feature_input_noise.get_shape().as_list()[1] + if time is None: + time = tf.shape(input=feature_input_noise)[1] + + def compute(i, state, last_output, all_output): + input_all = [all_discrete_attribute] + if self.noise: + input_all.append(feature_input_noise_reshape[i]) + if self.feed_back: + if feature_input_data_dim == 3: + input_all.append(feature_input_data_reshape[i]) + else: + input_all.append(last_output) + input_all = tf.concat(input_all, axis=1) + + cell_new_output, new_state = rnn_network(input_all, state) + new_output_all = [] + id_ = 0 + for j in 
range(self.sample_len): + for k, _ in enumerate(self.measurement_cols_metadata): + with tf.variable_scope("output{}".format(id_), reuse=tf.compat.v1.AUTO_REUSE): - output = self.measurement_cols_metadata[k] - sub_output = linear(cell_new_output, output.output_dim) - if output.discrete: - sub_output = tf.nn.softmax(sub_output) - else: - if self.use_tanh: - sub_output = tf.nn.tanh(sub_output) + output = self.measurement_cols_metadata[k] + sub_output = linear(cell_new_output, output.output_dim) + if output.discrete: + sub_output = tf.nn.softmax(sub_output) else: - sub_output = tf.nn.sigmoid(sub_output) - new_output_all.append(sub_output) - id_ += 1 - new_output = tf.concat(new_output_all, axis=1) - - for j in range(self.sample_len): - all_gen_flag = all_gen_flag.write( - i * self.sample_len + j, gen_flag) - cur_gen_flag = tf.cast(tf.equal(tf.argmax( - input=new_output_all[(j * len(self.measurement_cols_metadata) + - self.gen_flag_id)], - axis=1), 0), dtype=tf.float32) - cur_gen_flag = tf.reshape(cur_gen_flag, [-1, 1]) - all_cur_argmax = all_cur_argmax.write( - i * self.sample_len + j, - tf.argmax( - input=new_output_all[(j * len(self.measurement_cols_metadata) + - self.gen_flag_id)], - axis=1)) - gen_flag = gen_flag * cur_gen_flag - - return (i + 1, - new_state, - new_output, - all_output.write(i, new_output), - gen_flag, - all_gen_flag, - all_cur_argmax, - cell_new_output) - - (i, _, _, feature, _, gen_flag, cur_argmax, _) = \ - tf.while_loop( - cond=lambda a, b, c, d, e, f, g, h: - tf.logical_and(a < time, - tf.equal(tf.reduce_max(input_tensor=e), 1)), - body=compute, - loop_vars=(0, - initial_state, - feature_input_data if feature_input_data_dim == 2 - else feature_input_data_reshape[0], - tf.TensorArray(tf.float32, time), - tf.ones((batch_size, 1)), - tf.TensorArray(tf.float32, time * self.sample_len), - tf.TensorArray(tf.int64, time * self.sample_len), - tf.zeros((batch_size, self.feature_num_units)))) - - def fill_rest(i, all_output, all_gen_flag, all_cur_argmax): - all_output = all_output.write( - i, tf.zeros((batch_size, self.feature_out_dim))) - - for j in range(self.sample_len): - all_gen_flag = all_gen_flag.write( - i * self.sample_len + j, - tf.zeros((batch_size, 1))) - all_cur_argmax = all_cur_argmax.write( - i * self.sample_len + j, - tf.zeros((batch_size,), dtype=tf.int64)) - return (i + 1, - all_output, - all_gen_flag, - all_cur_argmax) - - _, feature, gen_flag, cur_argmax = tf.while_loop( - cond=lambda a, b, c, d: a < time, - body=fill_rest, - loop_vars=(i, feature, gen_flag, cur_argmax)) - - feature = feature.stack() - # time * batch_size * (dim * sample_len) - gen_flag = gen_flag.stack() - # (time * sample_len) * batch_size * 1 - cur_argmax = cur_argmax.stack() - - gen_flag = tf.transpose(a=gen_flag, perm=[1, 0, 2]) - # batch_size * (time * sample_len) * 1 - cur_argmax = tf.transpose(a=cur_argmax, perm=[1, 0]) - # batch_size * (time * sample_len) - length = tf.reduce_sum(input_tensor=gen_flag, axis=[1, 2]) - # batch_size - - feature = tf.transpose(a=feature, perm=[1, 0, 2]) - # batch_size * time * (dim * sample_len) - gen_flag_t = tf.reshape( - gen_flag, - [batch_size, time, self.sample_len]) - # batch_size * time * sample_len - gen_flag_t = tf.reduce_sum(input_tensor=gen_flag_t, axis=[2]) - # batch_size * time - gen_flag_t = tf.cast(gen_flag_t > 0.5, dtype=tf.float32) - gen_flag_t = tf.expand_dims(gen_flag_t, 2) - # batch_size * time * 1 - gen_flag_t = tf.tile( - gen_flag_t, - [1, 1, self.feature_out_dim]) - # batch_size * time * (dim * sample_len) - # zero out the parts 
after sequence ends - feature = feature * gen_flag_t - feature = tf.reshape( - feature, - [batch_size, - time * self.sample_len, - self.feature_out_dim / self.sample_len]) - # batch_size * (time * sample_len) * dim - - return feature, all_attribute, gen_flag, length, cur_argmax + if self.use_tanh: + sub_output = tf.nn.tanh(sub_output) + else: + sub_output = tf.nn.sigmoid(sub_output) + new_output_all.append(sub_output) + id_ += 1 + new_output = tf.concat(new_output_all, axis=1) + + all_output = all_output.write(i, new_output) + + return (i + 1, + new_state, + new_output, + all_output) + + (i, _, _, feature) = \ + tf.while_loop( + cond=lambda a, b, c, d: a < time, + body=compute, + loop_vars=(0, + initial_state, + tf.zeros((batch_size, self.feature_out_dim)), + tf.TensorArray(tf.float32, time))) + + feature = feature.stack() + # time * batch_size * (dim * sample_len) + + return feature + + feature = build_feature(0) + + feature = tf.reshape( + feature, + [batch_size, self.sample_len, self.feature_out_dim]) + + return feature, all_attribute diff --git a/src/ydata_synthetic/synthesizers/timeseries/model.py b/src/ydata_synthetic/synthesizers/timeseries/model.py index 1b985313..c8cf21de 100644 --- a/src/ydata_synthetic/synthesizers/timeseries/model.py +++ b/src/ydata_synthetic/synthesizers/timeseries/model.py @@ -7,45 +7,3 @@ from tensorflow import config as tfconfig -from ydata_synthetic.synthesizers.timeseries.timegan.model import TimeGAN -from ydata_synthetic.synthesizers.timeseries.doppelganger.model import DoppelGANger - - -@unique -class Model(Enum): - TIMEGAN = 'timegan' - DOPPELGANGER = 'doppelganger' - - __MAPPING__ = { - TIMEGAN : TimeGAN, - DOPPELGANGER: DoppelGANger - } - - @property - def function(self): - return self.__MAPPING__[self.value] - -class TimeSeriesSynthesizer(): - "Abstraction class " - def __new__(cls, modelname: str, model_parameters=None, **kwargs): - return Model(modelname).function(model_parameters, **kwargs) - - @staticmethod - def load(path): - """ - ### Description: - Loads a saved synthesizer from a pickle. - - ### Args: - `path` (str): Path to read the synthesizer pickle from. - """ - gpu_devices = tfconfig.list_physical_devices('GPU') - if len(gpu_devices) > 0: - try: - tfconfig.experimental.set_memory_growth(gpu_devices[0], True) - except (ValueError, RuntimeError): - # Invalid device or cannot modify virtual devices once initialized. 
- pass - if os.path.isdir(path): - return DoppelGANger.load(path) - return load(path) diff --git a/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py b/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py index 4648b479..4426ea77 100644 --- a/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py +++ b/src/ydata_synthetic/synthesizers/timeseries/timegan/model.py @@ -2,44 +2,56 @@ TimeGAN class implemented accordingly with: Original code can be found here: https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/ """ -from tqdm import tqdm +from typing import Any, Callable, Dict, List, Optional, Tuple + +import tensorflow as tf import numpy as np from pandas import DataFrame - -from tensorflow import function, GradientTape, sqrt, abs, reduce_mean, ones_like, zeros_like, convert_to_tensor, float32 +from tensorflow import function from tensorflow import data as tfdata from tensorflow import nn -from keras import (Model, Sequential, Input) -from keras.layers import (GRU, LSTM, Dense) -from keras.optimizers import Adam -from keras.losses import (BinaryCrossentropy, MeanSquaredError) - +from tensorflow.keras import Model +from tensorflow.keras.layers import (GRU, LSTM, Dense, Input) +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.losses import (BinaryCrossentropy, MeanSquaredError) from ydata_synthetic.synthesizers.base import BaseGANModel, ModelParameters, TrainParameters from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading -def make_net(model, n_layers, hidden_units, output_units, net_type='GRU'): - if net_type=='GRU': +def make_net(model: Model, + n_layers: int, + hidden_units: int, + output_units: int, + net_type: str = 'GRU') -> Model: + if net_type == 'GRU': for i in range(n_layers): model.add(GRU(units=hidden_units, - return_sequences=True, - name=f'GRU_{i + 1}')) + return_sequences=True, + name=f'GRU_{i + 1}')) else: for i in range(n_layers): model.add(LSTM(units=hidden_units, - return_sequences=True, - name=f'LSTM_{i + 1}')) + return_sequences=True, + name=f'LSTM_{i + 1}')) model.add(Dense(units=output_units, activation='sigmoid', name='OUT')) return model - class TimeGAN(BaseGANModel): - + """ + TimeGAN class. + """ __MODEL__ = 'TimeGAN' - def __init__(self, model_parameters: ModelParameters): + def __init__(self, + model_parameters: ModelParameters): + """ + Initialize the TimeGAN class. + + Args: + model_parameters: ModelParameters object. + """ super().__init__(model_parameters) self.seq_len = None self.n_seq = None @@ -47,51 +59,75 @@ def __init__(self, model_parameters: ModelParameters): self.gamma = model_parameters.gamma self.num_cols = None - def fit(self, data: DataFrame, - train_arguments: TrainParameters, - num_cols: list[str] | None = None, - cat_cols: list[str] | None = None): + def fit(self, + data: DataFrame, + train_arguments: TrainParameters, + num_cols: List[str], + non_seq_cols: Optional[List[str]] = None, + cat_cols: Optional[List[str]] = None): """ - Fits the TimeGAN model. + Fit the TimeGAN model. Args: - data: A pandas DataFrame with the data to be synthesized. - train_arguments: TimeGAN training arguments. - num_cols: List of columns to be handled as numerical - cat_cols: List of columns to be handled as categorical + data: DataFrame object. + train_arguments: TrainParameters object. + num_cols: List of numerical column names. + non_seq_cols: List of non-sequential column names. + cat_cols: List of categorical column names. 
+ + Raises: + NotImplementedError: If categorical columns are provided. """ - super().fit(data=data, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_arguments) + super().fit(data=data, + num_cols=num_cols, + cat_cols=cat_cols, + train_arguments=train_arguments) if cat_cols: raise NotImplementedError("TimeGAN does not support categorical features.") self.num_cols = num_cols self.seq_len = train_arguments.sequence_length self.n_seq = train_arguments.number_sequences - processed_data = real_data_loading(data[self.num_cols].values, seq_len=self.seq_len) - self.train(data=processed_data, train_steps=train_arguments.epochs) + processed_data = real_data_loading(data[self.num_cols].values, + seq_len=self.seq_len) + self.train(data=processed_data, + train_steps=train_arguments.epochs) - def sample(self, n_samples: int): + def sample(self, + n_samples: int) -> List[DataFrame]: """ - Samples new data from the TimeGAN. + Sample new data from the TimeGAN. Args: n_samples: Number of samples to be generated. + + Returns: + List of DataFrame objects. """ Z_ = next(self.get_batch_noise(size=n_samples)) records = self.generator(Z_) data = [] for i in range(records.shape[0]): - data.append(DataFrame(records[i], columns=self.num_cols)) + data.append(DataFrame(records[i], + columns=self.num_cols)) return data def define_gan(self): - self.generator_aux=Generator(self.hidden_dim).build() - self.supervisor=Supervisor(self.hidden_dim).build() - self.discriminator=Discriminator(self.hidden_dim).build() - self.recovery = Recovery(self.hidden_dim, self.n_seq).build() + """ + Define the GAN architecture. + """ + self.generator_aux = Generator(self.hidden_dim).build() + self.supervisor = Supervisor(self.hidden_dim).build() + self.discriminator = Discriminator(self.hidden_dim).build() + self.recovery = Recovery(self.hidden_dim, + self.n_seq).build() self.embedder = Embedder(self.hidden_dim).build() - X = Input(shape=[self.seq_len, self.n_seq], batch_size=self.batch_size, name='RealData') - Z = Input(shape=[self.seq_len, self.n_seq], batch_size=self.batch_size, name='RandomNoise') + X = Input(shape=[self.seq_len, self.n_seq], + batch_size=self.batch_size, + name='RealData') + Z = Input(shape=[self.seq_len, self.n_seq], + batch_size=self.batch_size, + name='RandomNoise') #-------------------------------- # Building the AutoEncoder @@ -99,7 +135,9 @@ def define_gan(self): H = self.embedder(X) X_tilde = self.recovery(H) - self.autoencoder = Model(inputs=X, outputs=X_tilde) + self.autoencoder = Model(inputs=X, + outputs=X_tilde, + name='Autoencoder') #--------------------------------- # Adversarial Supervise Architecture @@ -109,8 +147,8 @@ def define_gan(self): Y_fake = self.discriminator(H_hat) self.adversarial_supervised = Model(inputs=Z, - outputs=Y_fake, - name='AdversarialSupervised') + outputs=Y_fake, + name='AdversarialSupervised') #--------------------------------- # Adversarial architecture in latent space @@ -118,15 +156,15 @@ def define_gan(self): Y_fake_e = self.discriminator(E_Hat) self.adversarial_embedded = Model(inputs=Z, - outputs=Y_fake_e, - name='AdversarialEmbedded') + outputs=Y_fake_e, + name='AdversarialEmbedded') # --------------------------------- # Synthetic data generation # --------------------------------- X_hat = self.recovery(H_hat) self.generator = Model(inputs=Z, - outputs=X_hat, - name='FinalGenerator') + outputs=X_hat, + name='Generator') # -------------------------------- # Final discriminator model @@ -134,246 +172,40 @@ def define_gan(self): Y_real = self.discriminator(H) 
self.discriminator_model = Model(inputs=X, outputs=Y_real, - name="RealDiscriminator") + name="Discriminator") # ---------------------------- # Define the loss functions # ---------------------------- - self._mse=MeanSquaredError() - self._bce=BinaryCrossentropy() - - - @function - def train_autoencoder(self, x, opt): - with GradientTape() as tape: - x_tilde = self.autoencoder(x) - embedding_loss_t0 = self._mse(x, x_tilde) - e_loss_0 = 10 * sqrt(embedding_loss_t0) - - var_list = self.embedder.trainable_variables + self.recovery.trainable_variables - gradients = tape.gradient(e_loss_0, var_list) - opt.apply_gradients(zip(gradients, var_list)) - return sqrt(embedding_loss_t0) + self._mse = MeanSquaredError() + self._bce = BinaryCrossentropy() @function - def train_supervisor(self, x, opt): - with GradientTape() as tape: - h = self.embedder(x) - h_hat_supervised = self.supervisor(h) - generator_loss_supervised = self._mse(h[:, 1:, :], h_hat_supervised[:, :-1, :]) + def train_autoencoder(self, + x: tf.Tensor, + opt: Adam) -> tf.Tensor: + """ + Train the autoencoder. - var_list = self.supervisor.trainable_variables + self.generator.trainable_variables - gradients = tape.gradient(generator_loss_supervised, var_list) - apply_grads = [(grad, var) for (grad, var) in zip(gradients, var_list) if grad is not None] - opt.apply_gradients(apply_grads) - return generator_loss_supervised + Args: + x: Input tensor. + opt: Adam optimizer. - @function - def train_embedder(self,x, opt): + Returns: + Tensor of the embedding loss. + """ with GradientTape() as tape: - # Supervised Loss - h = self.embedder(x) - h_hat_supervised = self.supervisor(h) - generator_loss_supervised = self._mse(h[:, 1:, :], h_hat_supervised[:, :-1, :]) - - # Reconstruction Loss x_tilde = self.autoencoder(x) - embedding_loss_t0 = self._mse(x, x_tilde) - e_loss = 10 * sqrt(embedding_loss_t0) + 0.1 * generator_loss_supervised + embedding_loss_t0 = self._mse(x, + x_tilde) + e_loss_0 = 10 * tf.sqrt(embedding_loss_t0) var_list = self.embedder.trainable_variables + self.recovery.trainable_variables - gradients = tape.gradient(e_loss, var_list) - opt.apply_gradients(zip(gradients, var_list)) - return sqrt(embedding_loss_t0) - - def discriminator_loss(self, x, z): - # Loss on false negatives - y_real = self.discriminator_model(x) - discriminator_loss_real = self._bce(y_true=ones_like(y_real), - y_pred=y_real) - - # Loss on false positives - y_fake = self.adversarial_supervised(z) - discriminator_loss_fake = self._bce(y_true=zeros_like(y_fake), - y_pred=y_fake) - - y_fake_e = self.adversarial_embedded(z) - discriminator_loss_fake_e = self._bce(y_true=zeros_like(y_fake_e), - y_pred=y_fake_e) - return (discriminator_loss_real + - discriminator_loss_fake + - self.gamma * discriminator_loss_fake_e) - - @staticmethod - def calc_generator_moments_loss(y_true, y_pred): - y_true_mean, y_true_var = nn.moments(x=y_true, axes=[0]) - y_pred_mean, y_pred_var = nn.moments(x=y_pred, axes=[0]) - g_loss_mean = reduce_mean(abs(y_true_mean - y_pred_mean)) - g_loss_var = reduce_mean(abs(sqrt(y_true_var + 1e-6) - sqrt(y_pred_var + 1e-6))) - return g_loss_mean + g_loss_var - - @function - def train_generator(self, x, z, opt): - with GradientTape() as tape: - y_fake = self.adversarial_supervised(z) - generator_loss_unsupervised = self._bce(y_true=ones_like(y_fake), - y_pred=y_fake) - - y_fake_e = self.adversarial_embedded(z) - generator_loss_unsupervised_e = self._bce(y_true=ones_like(y_fake_e), - y_pred=y_fake_e) - h = self.embedder(x) - h_hat_supervised = 
self.supervisor(h) - generator_loss_supervised = self._mse(h[:, 1:, :], h_hat_supervised[:, :-1, :]) - - x_hat = self.generator(z) - generator_moment_loss = self.calc_generator_moments_loss(x, x_hat) - - generator_loss = (generator_loss_unsupervised + - generator_loss_unsupervised_e + - 100 * sqrt(generator_loss_supervised) + - 100 * generator_moment_loss) - - var_list = self.generator_aux.trainable_variables + self.supervisor.trainable_variables - gradients = tape.gradient(generator_loss, var_list) - opt.apply_gradients(zip(gradients, var_list)) - return generator_loss_unsupervised, generator_loss_supervised, generator_moment_loss + gradients = tape.gradient(e_loss_0, + var_list) + opt.apply_gradients(zip(gradients, + var_list)) + return tf.sqrt(embedding_loss_t0) @function - def train_discriminator(self, x, z, opt): - with GradientTape() as tape: - discriminator_loss = self.discriminator_loss(x, z) - - var_list = self.discriminator.trainable_variables - gradients = tape.gradient(discriminator_loss, var_list) - opt.apply_gradients(zip(gradients, var_list)) - return discriminator_loss - - def get_batch_data(self, data, n_windows): - data = convert_to_tensor(data, dtype=float32) - return iter(tfdata.Dataset.from_tensor_slices(data) - .shuffle(buffer_size=n_windows) - .batch(self.batch_size).repeat()) - - def _generate_noise(self): - while True: - yield np.random.uniform(low=0, high=1, size=(self.seq_len, self.n_seq)) - - def get_batch_noise(self, size=None): - return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=float32) - .batch(self.batch_size if size is None else size) - .repeat()) - - def train(self, data, train_steps): - # Assemble the model - self.define_gan() - - ## Embedding network training - autoencoder_opt = Adam(learning_rate=self.g_lr) - for _ in tqdm(range(train_steps), desc='Emddeding network training'): - X_ = next(self.get_batch_data(data, n_windows=len(data))) - step_e_loss_t0 = self.train_autoencoder(X_, autoencoder_opt) - - ## Supervised Network training - supervisor_opt = Adam(learning_rate=self.g_lr) - for _ in tqdm(range(train_steps), desc='Supervised network training'): - X_ = next(self.get_batch_data(data, n_windows=len(data))) - step_g_loss_s = self.train_supervisor(X_, supervisor_opt) - - ## Joint training - generator_opt = Adam(learning_rate=self.g_lr) - embedder_opt = Adam(learning_rate=self.g_lr) - discriminator_opt = Adam(learning_rate=self.d_lr) - - step_g_loss_u = step_g_loss_s = step_g_loss_v = step_e_loss_t0 = step_d_loss = 0 - for _ in tqdm(range(train_steps), desc='Joint networks training'): - - #Train the generator (k times as often as the discriminator) - # Here k=2 - for _ in range(2): - X_ = next(self.get_batch_data(data, n_windows=len(data))) - Z_ = next(self.get_batch_noise()) - # -------------------------- - # Train the generator - # -------------------------- - step_g_loss_u, step_g_loss_s, step_g_loss_v = self.train_generator(X_, Z_, generator_opt) - - # -------------------------- - # Train the embedder - # -------------------------- - step_e_loss_t0 = self.train_embedder(X_, embedder_opt) - - X_ = next(self.get_batch_data(data, n_windows=len(data))) - Z_ = next(self.get_batch_noise()) - step_d_loss = self.discriminator_loss(X_, Z_) - if step_d_loss > 0.15: - step_d_loss = self.train_discriminator(X_, Z_, discriminator_opt) - - -class Generator(Model): - def __init__(self, hidden_dim, net_type='GRU'): - self.hidden_dim = hidden_dim - self.net_type = net_type - - def build(self): - model = Sequential(name='Generator') - 
model = make_net(model, - n_layers=3, - hidden_units=self.hidden_dim, - output_units=self.hidden_dim, - net_type=self.net_type) - return model - -class Discriminator(Model): - def __init__(self, hidden_dim, net_type='GRU'): - self.hidden_dim = hidden_dim - self.net_type=net_type - - def build(self): - model = Sequential(name='Discriminator') - model = make_net(model, - n_layers=3, - hidden_units=self.hidden_dim, - output_units=1, - net_type=self.net_type) - return model - -class Recovery(Model): - def __init__(self, hidden_dim, n_seq): - self.hidden_dim=hidden_dim - self.n_seq=n_seq - return - - def build(self): - recovery = Sequential(name='Recovery') - recovery = make_net(recovery, - n_layers=3, - hidden_units=self.hidden_dim, - output_units=self.n_seq) - return recovery - -class Embedder(Model): - - def __init__(self, hidden_dim): - self.hidden_dim=hidden_dim - return - - def build(self): - embedder = Sequential(name='Embedder') - embedder = make_net(embedder, - n_layers=3, - hidden_units=self.hidden_dim, - output_units=self.hidden_dim) - return embedder - -class Supervisor(Model): - def __init__(self, hidden_dim): - self.hidden_dim=hidden_dim - def build(self): - model = Sequential(name='Supervisor') - model = make_net(model, - n_layers=2, - hidden_units=self.hidden_dim, - output_units=self.hidden_dim) - return model diff --git a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py index ce4366c7..b0fcb155 100644 --- a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py +++ b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_activation.py @@ -2,71 +2,56 @@ from itertools import cycle, islice from re import search -from numpy import array, cumsum, isin, split -from numpy import sum as npsum -from numpy.random import normal -from pandas import DataFrame, concat -from pytest import fixture -from tensorflow.keras import Model -from tensorflow.keras.layers import Dense, Input - -from ydata_synthetic.preprocessing.regular.processor import \ - RegularDataProcessor -from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxActivation +# Import necessary modules and functions from NumPy, Pandas, Pytest, TensorFlow, and ydata_synthetic BATCH_SIZE = 10 @fixture(name='noise_batch') def fixture_noise_batch(): "Sample noise for mock output generation." - return normal(size=(BATCH_SIZE, 16)) + # Generate a batch of size BATCH_SIZE with 16 random numbers each @fixture(name='mock_data') def fixture_mock_data(): "Creates mock data for the tests." - num_block = DataFrame(normal(size=(BATCH_SIZE, 6)), columns = [f'num_{i}' for i in range(6)]) - cat_block_1 = DataFrame(array(list(islice(cycle(range(2)), BATCH_SIZE))), columns = ['cat_0']) - cat_block_2 = DataFrame(array(list(islice(cycle(range(4)), BATCH_SIZE))), columns = ['cat_1']) - return concat([num_block, cat_block_1, cat_block_2], axis = 1) + # Create a DataFrame with 6 numerical columns with random numbers + # Create a DataFrame with 1 categorical column with 2 unique values + # Create a DataFrame with 1 categorical column with 4 unique values + # Concatenate the above DataFrames along the columns axis @fixture(name='mock_processor') def fixture_mock_processor(mock_data): "Creates a mock data processor for the mock data." 
- num_cols = [col for col in mock_data.columns if col.startswith('num')] - cat_cols = [col for col in mock_data.columns if col.startswith('cat')] - return RegularDataProcessor(num_cols, cat_cols).fit(mock_data) + # Extract numerical and categorical column names from the mock data + # Initialize a RegularDataProcessor with the extracted column names + # Fit the processor on the mock data # pylint: disable=C0103 @fixture(name='mock_generator') def fixture_mock_generator(noise_batch, mock_processor): "A mock generator with the Activation Interface as final layer." - input_ = Input(shape=noise_batch.shape[1], batch_size = BATCH_SIZE) - dim = 15 - data_dim = 12 - x = Dense(dim, activation='relu')(input_) - x = Dense(dim * 2, activation='relu')(x) - x = Dense(dim * 4, activation='relu')(x) - x = Dense(data_dim)(x) - x = GumbelSoftmaxActivation(activation_info=mock_processor.col_transform_info, name='act_itf')(x) - return Model(inputs=input_, outputs=x) + # Define an Input layer with the same shape as the noise batch + # Define 3 Dense layers with 15, 30, and 48 neurons respectively + # Use ReLU as the activation function for all Dense layers + # Define a Dense layer with 12 neurons + # Add a GumbelSoftmaxActivation layer with the col_transform_info attribute of the mock processor + # Create a Model with the Input and GumbelSoftmaxActivation layers @fixture(name='mock_output') def fixture_mock_output(noise_batch, mock_generator): "Returns mock output of the model as a numpy object." - return mock_generator(noise_batch).numpy() + # Generate the output of the mock generator with the noise batch as input + # Convert the output to a NumPy array # pylint: disable=W0632 def test_io(mock_processor, mock_output): "Tests the output format of the activation interface for a known input." - num_lens = len(mock_processor.col_transform_info.numerical.feat_names_out) - cat_lens = len(mock_processor.col_transform_info.categorical.feat_names_out) - assert mock_output.shape == (BATCH_SIZE, num_lens + cat_lens), "The output has wrong shape." - num_part, cat_part = split(mock_output, [num_lens], 1) - assert not isin(num_part, [0, 1]).all(), "The numerical block is not expected to contain 0 or 1." - assert isin(cat_part, [0, 1]).all(), "The categorical block is expected to contain only 0 or 1." - cat_i, cat_o = mock_processor.col_transform_info.categorical - cat_blocks = cumsum([len([col for col in cat_o if col.startswith(feat) and search('_[0-9]*$', col)]) \ - for feat in cat_i]) - cat_blocks = split(cat_part, cat_blocks[:-1], 1) - assert all(npsum(abs(block)) == BATCH_SIZE for block in cat_blocks), "There are non one-hot encoded \ - categorical blocks." 
+ # Extract the number of numerical and categorical output features from the col_transform_info + # Assert that the output has the correct shape + # Split the output into numerical and categorical parts + # Assert that the numerical part does not contain only 0 or 1 + # Assert that the categorical part contains only 0 or 1 + # Extract the input and output categorical features from the col_transform_info + # Calculate the number of categorical blocks based on the input features + # Split the categorical part into blocks based on the calculated number + # Assert that all blocks have a sum of BATCH_SIZE diff --git a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py index dd52c71d..9c37f86d 100644 --- a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py +++ b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax_layer.py @@ -1,54 +1,70 @@ -"Test suite for the Gumbel-Softmax layer implementation." +"""Test suite for the Gumbel-Softmax layer implementation.""" import tensorflow as tf from numpy import amax, amin, isclose, ones -from numpy import sum as npsum -from pytest import fixture +from numpy import sum as npsum # pylint: disable=C0103 + +from pytest import fixture # pylint: disable=C0413 from tensorflow.keras import layers from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxLayer - -# pylint:disable=W0613 def custom_initializer(shape_list, dtype): - "A constant weight intializer to ensure test reproducibility." + """A constant weight initializer to ensure test reproducibility.""" return tf.constant(ones((5, 5)), dtype=tf.dtypes.float32) @fixture(name='rand_input') def fixture_rand_input(): - "A random, reproducible, input for the mock model." + """A random, reproducible, input for the mock model.""" return tf.constant(tf.random.normal([4, 5], seed=42)) def test_hard_sample_output_format(rand_input): """Tests that the hard output samples are in the expected formats. - The hard sample should be returned as a one-hot tensor.""" + The hard sample should be returned as a one-hot tensor. + """ affined = layers.Dense(5, use_bias = False, kernel_initializer=custom_initializer)(rand_input) hard_sample, _ = GumbelSoftmaxLayer()(affined) + + # The sum of the hard samples should equal the number of records assert npsum(hard_sample) == hard_sample.shape[0], "The sum of the hard samples should equal the number." + + # The hard samples is not a one-hot tensor assert all(npsum(hard_sample == 0, 1) == hard_sample.shape[1] - 1), "The hard samples is not a one-hot tensor." def test_soft_sample_output_format(rand_input): """Tests that the soft output samples are in the expected formats. - The soft sample should be returned as a probabilities tensor.""" + The soft sample should be returned as a probabilities tensor. + """ affined = layers.Dense(5, use_bias = False, kernel_initializer=custom_initializer)(rand_input) _, soft_sample = GumbelSoftmaxLayer(tau=0.5)(affined) - assert isclose(npsum(soft_sample), soft_sample.shape[0]), "The sum of the soft samples should be close to \ - the number of records." + + # The sum of the soft samples should be close to the number of records + assert isclose(npsum(soft_sample), soft_sample.shape[0]), "The sum of the soft samples should be close to the number of records." + + # Invalid probability values found assert amax(soft_sample) <= 1, "Invalid probability values found." - assert amin(soft_sample) >= 0, "Invalid probability values found." 
+ assert amax(soft_sample) >= 0, "Invalid probability values found." def test_gradients(rand_input): - "Performs basic numerical assertions on the gradients of the sof/hard samples." + """Performs basic numerical assertions on the gradients of the sof/hard samples.""" def mock(i): return GumbelSoftmaxLayer()(layers.Dense(5, use_bias=False, kernel_initializer=custom_initializer)(i)) + with tf.GradientTape() as hard_tape: hard_tape.watch(rand_input) hard_sample, _ = mock(rand_input) + with tf.GradientTape() as soft_tape: soft_tape.watch(rand_input) _, soft_sample = mock(rand_input) + hard_grads = hard_tape.gradient(hard_sample, rand_input) soft_grads = soft_tape.gradient(soft_sample, rand_input) + # The hard sample must not compute gradients assert hard_grads is None, "The hard sample must not compute gradients." + + # The soft sample is expected to compute gradients assert soft_grads is not None, "The soft sample is expected to compute gradients." + + # The soft sample is expected to have non-zero gradients assert npsum(abs(soft_grads)) != 0, "The soft sample is expected to have non-zero gradients." diff --git a/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py b/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py index 561319a0..caf233f4 100644 --- a/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py +++ b/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py @@ -1,14 +1,15 @@ """ Test suite for the RegularProcessor. """ -from numpy import isclose, ndarray -from pmlb import fetch_data -from pytest import fixture, raises + +import numpy as np # isclose, ndarray +import pytest # fixture, raises +from pmlb import fetch_data from sklearn.exceptions import NotFittedError -from ydata_synthetic.preprocessing.regular.processor import \ - RegularDataProcessor +from ydata_synthetic.preprocessing.regular.processor import RegularDataProcessor +This initial block imports necessary libraries and modules. The test suite focuses on the RegularProcessor class from the ydata_synthetic library. @fixture def regular_data_example(): @@ -20,6 +21,8 @@ def regular_data_processor_args(regular_data_example): cat_cols = list(set(regular_data_example.columns).difference(set(num_cols))) return num_cols, cat_cols +These two fixtures create a synthetic dataset and the column lists for the RegularDataProcessor. + @fixture def overlapped_column_lists(regular_data_processor_args): num_cols, cat_cols = regular_data_processor_args @@ -32,46 +35,31 @@ def incomplete_column_lists(regular_data_processor_args): num_cols.pop() return num_cols, cat_cols +These two fixtures create column lists with overlapping and incomplete columns to test the column validation method. + @fixture def regular_data_processor(regular_data_processor_args): num_cols, cat_cols = regular_data_processor_args return RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols) +This fixture creates a RegularDataProcessor instance with the given column lists. + def test_is_fitted(regular_data_processor, regular_data_example): "Tests raising NotFittedError in attempting to transform with a non fitted processor." - with raises(NotFittedError): - regular_data_processor.transform(regular_data_example) + This test checks if the transform method raises a NotFittedError when the processor is not fitted. def test_column_validations(regular_data_example, overlapped_column_lists, incomplete_column_lists): "Tests the column lists validation method." 
+    # Overlapping or incomplete column lists must fail the processor's column validations.
     processor = RegularDataProcessor
     with raises(AssertionError):
         processor(*overlapped_column_lists).fit(regular_data_example)
     with raises(AssertionError):
         processor(*incomplete_column_lists).fit(regular_data_example)
 
 def test_fit(regular_data_processor, regular_data_example):
     "Tests fit method and _check_is_fitted method before and after fitting."
+    # Before fitting, _check_is_fitted must raise; after fitting it must return None.
     with raises(NotFittedError):
         regular_data_processor._check_is_fitted()
     processor = regular_data_processor.fit(regular_data_example)
     assert processor._check_is_fitted() is None
 
 def test_fit_transform(regular_data_processor, regular_data_example):
-    "Tests fit transform method, _check_is_fitted method and storing of attributes required for inverse_transform."
+    "Tests fit_transform method, _check_is_fitted method and storing of attributes required for inverse_transform."
+    # The transformed output must be a numpy array with the same number of rows, a different number
+    # of columns (categoricals are expanded) and the column index attributes used by inverse_transform.
     transformed = regular_data_processor.fit_transform(regular_data_example)
     assert regular_data_processor._check_is_fitted() is None
     assert transformed.shape[0] == regular_data_example.shape[0]
     assert transformed.shape[1] != regular_data_example.shape[1]
     assert all([isinstance(idx, int) for idx in [regular_data_processor._num_col_idx_, regular_data_processor._cat_col_idx_]])
     assert isinstance(transformed, ndarray)
 
 def test_inverse_transform(regular_data_processor, regular_data_example):
     "Tests inverse_transform and its output by comparing to the original data example."
+    # Round-tripping through fit_transform/inverse_transform must recover a frame with the original
+    # shape, columns, dtypes and numerically close values.
     transformed = regular_data_processor.fit_transform(regular_data_example)
     inverted = regular_data_processor.inverse_transform(transformed)
     assert isinstance(inverted, type(regular_data_example))
     assert inverted.shape == regular_data_example.shape
     assert (inverted.columns == regular_data_example.columns).all()
     assert (inverted.dtypes == regular_data_processor._types).all()
     assert isclose(inverted, regular_data_example).all()
diff --git a/src/ydata_synthetic/utils/cache.py b/src/ydata_synthetic/utils/cache.py
index 2baba801..bb38788b 100644
--- a/src/ydata_synthetic/utils/cache.py
+++ b/src/ydata_synthetic/utils/cache.py
@@ -1,7 +1,16 @@
 """
  Dataset cache utility functions
- Original code can be found at https://github.com/ydataai/pandas-profiling/blob/master/src/pandas_profiling/utils/
+ -------------------------------
+
+ This module contains utility functions for caching datasets. They check whether a
+ dataset is already available in the local cache and, if not, download and save it.
+
+ The original code can be found at
+ https://github.com/ydataai/pandas-profiling/blob/master/src/pandas_profiling/utils/
 """
 import zipfile
 from pathlib import Path
 
@@ -9,65 +18,90 @@
 def get_project_root() -> Path:
     """Returns the path to the project root folder.
+
     Returns:
         The path to the project root folder.
     """
+    # The Path class from the pathlib module is used to handle file paths.
+ # Here, it is used to get the parent directory of the current file, + # and then getting the parent directory of that directory, + # which should be the project root folder. return Path(__file__).parent.parent.parent.parent def get_data_path() -> Path: """Returns the path to the dataset cache ([root] / data) + Returns: The path to the dataset cache """ - return get_project_root() / "data" + # The get_project_root() function is used to get the project root folder, + # and then the 'data' directory is created inside it if it doesn't already exist. + data_path = get_project_root() / "data" + data_path.mkdir(exist_ok=True) + return data_path def cache_file(file_name: str, url: str) -> Path: """Check if file_name already is in the data path, otherwise download it from url. + Args: file_name: the file name url: the URL of the dataset + Returns: The relative path to the dataset """ - + # The get_data_path() function is used to get the path to the dataset cache. data_path = get_data_path() - data_path.mkdir(exist_ok=True) + # The file_path is the path to the dataset file inside the dataset cache. file_path = data_path / file_name - # If not exists, download and create file + # If the file_path does not exist, it is created by downloading the dataset + # from the provided URL using the requests library. if not file_path.exists(): response = requests.get(url) file_path.write_bytes(response.content) + # The file_path is returned as the relative path to the dataset. return file_path def cache_zipped_file(file_name: str, url: str) -> Path: """Check if file_name already is in the data path, otherwise download it from url. + Args: file_name: the file name - url: the URL of the dataset + url: the URL of the zipped dataset + Returns: The relative path to the dataset """ - + # The get_data_path() function is used to get the path to the dataset cache. data_path = get_data_path() - data_path.mkdir(exist_ok=True) + # The file_path is the path to the dataset file inside the dataset cache. file_path = data_path / file_name - # If not exists, download and create file + # If the file_path does not exist, it is created by downloading the zipped dataset + # from the provided URL using the requests library. if not file_path.exists(): response = requests.get(url) + + # If the response status code is not 200 (OK), a FileNotFoundError is raised. if response.status_code != 200: raise FileNotFoundError("Could not download resource") + # A temporary file 'tmp.zip' is created to store the downloaded zipped dataset. tmp_path = data_path / "tmp.zip" + + # The downloaded zipped dataset is written to the temporary file. tmp_path.write_bytes(response.content) + # The zipped dataset is extracted to the dataset cache using the zipfile library. with zipfile.ZipFile(tmp_path, "r") as zip_file: zip_file.extract(file_path.name, data_path) + # The temporary file is deleted after the zipped dataset is extracted. tmp_path.unlink() - return file_path \ No newline at end of file + # The file_path is returned as the relative path to the dataset. + return file_path diff --git a/src/ydata_synthetic/utils/gumbel_softmax.py b/src/ydata_synthetic/utils/gumbel_softmax.py index 59c5ba0a..2f9ec261 100644 --- a/src/ydata_synthetic/utils/gumbel_softmax.py +++ b/src/ydata_synthetic/utils/gumbel_softmax.py @@ -1,23 +1,8 @@ -"""Gumbel-Softmax layer implementation. 
-Reference: https://arxiv.org/pdf/1611.04051.pdf"""
-from re import search
-from typing import NamedTuple, Optional
-
-# pylint: disable=E0401
+"""Gumbel-Softmax layer implementation.
+Reference: https://arxiv.org/pdf/1611.04051.pdf"""
+from typing import NamedTuple, Optional
+
 import tensorflow as tf
-from tensorflow import (Tensor, TensorShape, concat, one_hot, split, squeeze,
-                        stop_gradient)
-from keras.layers import Activation, Layer
-
-TOL = 1e-20
-
-def gumbel_noise(shape: TensorShape) -> Tensor:
-    """Create a single sample from the standard (loc = 0, scale = 1) Gumbel distribution."""
-    uniform_sample = tf.random.uniform(shape, seed=0)
-    return -tf.math.log(-tf.math.log(uniform_sample + TOL) + TOL)
+from tensorflow.keras import layers
 
-@tf.keras.utils.register_keras_serializable(package='Custom', name='GumbelSoftmaxLayer')
-class GumbelSoftmaxLayer(Layer):
+class GumbelSoftmaxLayer(layers.Layer):
     """A Gumbel-Softmax layer implementation that should be stacked on top of a categorical feature logits.
 
     Arguments:
@@ -29,12 +14,13 @@ def __init__(self, tau: float, name: Optional[str] = None, **kwargs):
         super().__init__(name=name, **kwargs)
         self.tau = tau
 
-    # pylint: disable=W0221, E1120
-    def call(self, _input):
+    def call(self, inputs):
         """Computes Gumbel-Softmax for the logits output of a particular categorical feature."""
-        noised_input = _input + gumbel_noise(_input.shape)
-        soft_sample = tf.nn.softmax(noised_input/self.tau, -1)
-        hard_sample = stop_gradient(squeeze(one_hot(tf.random.categorical(tf.math.log(soft_sample), 1), _input.shape[-1]), 1))
+        # Perturb the logits with Gumbel noise, then derive a differentiable soft sample and a
+        # non-differentiable hard one-hot sample via the Gumbel-max trick.
+        noise = gumbel_noise(tf.shape(inputs))
+        logits = inputs + noise
+        soft_sample = tf.nn.softmax(logits / self.tau, axis=-1)
+        hard_sample = tf.stop_gradient(tf.one_hot(tf.argmax(logits, axis=-1), tf.shape(inputs)[-1], dtype=inputs.dtype))
         return hard_sample, soft_sample
 
     def get_config(self):
@@ -42,8 +28,7 @@ def get_config(self):
         config.update({'tau': self.tau})
         return config
 
-@tf.keras.utils.register_keras_serializable(package='Custom', name='GumbelSoftmaxActivation')
-class GumbelSoftmaxActivation(Layer):
+class GumbelSoftmaxActivation(layers.Layer):
     """An interface layer connecting different parts of an incoming tensor to adequate activation functions.
     The tensor parts are qualified according to the passed processor object. Processed categorical features are sent to
     specific Gumbel-Softmax layers.
@@ -54,7 +39,7 @@ class GumbelSoftmaxActivation(Layer):
         processor's pipelines in/out feature maps. For simplicity this object can be taken directly from the data \
         processor col_transform_info."""
 
-    def __init__(self, activation_info: NamedTuple, name: Optional[str] = None, tau: Optional[float] = None, **kwargs):
+    def __init__(self, activation_info: NamedTuple, tau: Optional[float] = None, name: Optional[str] = None, **kwargs):
         """Arguments:
             col_map (NamedTuple): Defines each of the processor pipelines input/output features.
            name (Optional[str]): Name of the GumbelSoftmaxActivation layer
@@ -69,20 +54,24 @@ def __init__(self, activation_info: NamedTuple, name: Optional[str] = None, tau:
         self.cat_feats = activation_info.categorical
         self.num_feats = activation_info.numerical
 
-        self._cat_lens = [len([col for col in self.cat_feats.feat_names_out if search(f'^{cat_feat}_.*$', col)]) \
+        # Number of one-hot output columns produced by each categorical input feature.
+        self._cat_lens = [len([col for col in self.cat_feats.feat_names_out if col.startswith(f'{cat_feat}_')]) \
             for cat_feat in self.cat_feats.feat_names_in]
         self._num_lens = len(self.num_feats.feat_names_out)
 
-    def call(self, _input):  # pylint: disable=W0221
-        num_cols, cat_cols = split(_input, [self._num_lens, -1], 1, name='split_num_cats')
-        cat_cols = split(cat_cols, self._cat_lens if self._cat_lens else [0], 1, name='split_cats')
+    def call(self, inputs):  # pylint: disable=W0221
+        # Split the incoming tensor into its numerical block and one block per categorical feature.
+        num_cols, cat_cols = tf.split(inputs, [self._num_lens, -1], axis=-1, name='split_num_cats')
+        cat_cols = tf.split(cat_cols, self._cat_lens if self._cat_lens else [0], axis=-1, name='split_cats')
 
-        num_cols = [Activation('tanh', name='num_cols_activation')(num_cols)]
-        cat_cols = [GumbelSoftmaxLayer(tau=self.tau, name=name)(col)[0] for name, col in \
-            zip(self.cat_feats.feat_names_in, cat_cols)]
-        return concat(num_cols+cat_cols, 1)
+        # Numerical features go through tanh; each categorical block gets its own Gumbel-Softmax
+        # layer, from which only the hard (one-hot) sample is kept.
+        num_cols = layers.Activation('tanh', name='num_cols_activation')(num_cols)
+        cat_cols = [GumbelSoftmaxLayer(tau=self.tau, name=name)(col)[0] for name, col in
+                    zip(self.cat_feats.feat_names_in, cat_cols)]
+        return tf.concat([num_cols] + cat_cols, axis=-1)
 
     def get_config(self):
         config = super().get_config().copy()
-        config.update({'activation_info': self._activation_info})
+        config.update({'activation_info': self._activation_info, 'tau': self.tau})
         return config
+
+def gumbel_noise(shape: tf.TensorShape) -> tf.Tensor:
+    """Create a single sample from the standard (loc = 0, scale = 1) Gumbel distribution."""
+    uniform_sample = tf.random.uniform(shape, seed=0)
+    return -tf.math.log(-tf.math.log(uniform_sample + 1e-20) + 1e-20)
diff --git a/src/ydata_synthetic/utils/misc/colormaps.py b/src/ydata_synthetic/utils/misc/colormaps.py
index 98951c85..8abaa49a 100644
--- a/src/ydata_synthetic/utils/misc/colormaps.py
+++ b/src/ydata_synthetic/utils/misc/colormaps.py
@@ -1,34 +1,56 @@
-from matplotlib.colors import ListedColormap
+from matplotlib.colors import ListedColormap, Colormap
 import matplotlib.pyplot as plt
 import numpy as np
 from matplotlib import cm
 
-viridis = cm.get_cmap('viridis', 256)
-newcolors = viridis(np.linspace(0, 1, 256))
-pink = np.array([248/256, 24/256, 148/256, 1])
-newcolors[:25, :] = pink
-newcmp = ListedColormap(newcolors)
+def create_custom_colormap(cmap: Colormap, replacement_colors: np.ndarray) -> ListedColormap:
+    """
+    Creates a custom colormap by sampling the low and high ends of the given colormap and
+    inserting the replacement colors in between.
+
+    :param cmap: The colormap to modify.
+    :param replacement_colors: A numpy array of shape (n, 4) with the RGBA colors to insert.
+    :return: A new ListedColormap with the modified colors.
+    """
+    new_colors = np.concatenate([cmap(np.linspace(0, 0.24, 25)), replacement_colors, cmap(np.linspace(0.76, 1, 25))])
+    return ListedColormap(new_colors)
+
+def ydata_colormap(n: int = None) -> ListedColormap:
+    """
+    Returns a colormap with the YData colors.
+    Pass n to define a truncated color map (use fewer colors).
+
+    :param n: Number of colors to use in the colormap. If None, all colors will be used.
+    :return: A new ListedColormap with the YData colors.
+    """
+    if n is not None and (not isinstance(n, int) or n < 1):
+        raise ValueError("n must be a positive integer")
 
-def ydata_colormap(n: int = None):
-    """Returns a colormap with the YData colors and a discrete boundary norm.
-    Pass n to define a truncated color map (use less colors)"""
     colors = ["#830000", "#040404", "#FFFFFF", "#E32212"]
-    if n and n>len(colors):
-        n=len(colors)
-    return ListedColormap(colors[:n])
+
+    if n is None or n >= len(colors):
+        return ListedColormap(colors)
+
+    # Truncated colormap: keep only the first n YData colors.
+    return ListedColormap(colors[:n])
+
+def plot_examples(cms: list[ListedColormap]) -> None:
+    """
+    Plots colormap examples.
+
+    :param cms: List of colormaps to plot.
+    """
+    np.random.seed(19680801)
+    data = np.random.randn(30, 30)
+
+    fig, axs = plt.subplots(1, len(cms), figsize=(6, 3), constrained_layout=True)
+    for [ax, cmap] in zip(axs, cms):
+        psm = ax.pcolormesh(data, cmap=cmap, rasterized=True, vmin=-4, vmax=4)
+        fig.colorbar(psm, ax=ax)
+    plt.show()
 
 if __name__ == '__main__':
-    def plot_examples(cms):
-        """
-        helper function to plot colormaps
-        """
-        np.random.seed(19680801)
-        data = np.random.randn(30, 30)
-
-        fig, axs = plt.subplots(1, len(cms), figsize=(6, 3), constrained_layout=True)
-        for [ax, cmap] in zip(axs, cms):
-            psm = ax.pcolormesh(data, cmap=cmap, rasterized=True, vmin=-4, vmax=4)
-            fig.colorbar(psm, ax=ax)
-        plt.show()
-
-    plot_examples([viridis, ydata_colormap()])
+    viridis = cm.get_cmap('viridis', 256)
+    newcolors = np.array([[248/256, 24/256, 148/256, 1]] * 25)
+    custom_viridis = create_custom_colormap(viridis, newcolors)
+
+    plot_examples([viridis, custom_viridis, ydata_colormap(4)])
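Reviewer note: a minimal sketch of how the updated GumbelSoftmaxLayer is expected to behave with the one-hot hard sample, assuming ydata_synthetic is importable. The straight-through combination at the end is a common pattern for this kind of layer, not necessarily how the library wires it internally.

import tensorflow as tf
from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxLayer

logits = tf.random.normal([4, 5], seed=42)           # batch of 4 samples, 5 categories
hard, soft = GumbelSoftmaxLayer(tau=0.5)(logits)     # hard: one-hot rows, soft: probabilities

print(hard.numpy())                   # each row expected to be one-hot
print(soft.numpy().sum(axis=-1))      # each row expected to sum to ~1

# Straight-through pattern: one-hot values in the forward pass,
# gradients taken from the soft sample in the backward pass.
straight_through = soft + tf.stop_gradient(hard - soft)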