diff --git a/examples/regular/adult_dragan.py b/examples/regular/adult_dragan.py index b19417fb..03e45c72 100644 --- a/examples/regular/adult_dragan.py +++ b/examples/regular/adult_dragan.py @@ -1,12 +1,18 @@ -from ydata_synthetic.preprocessing.regular.adult import transformations +from pmlb import fetch_data + from ydata_synthetic.synthesizers.regular import DRAGAN from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -#Load and process the data -data, processed_data, preprocessor = transformations() +model = DRAGAN + +#Load data and define the data processor parameters +data = fetch_data('adult') +num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] +cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country', 'target'] -# WGAN_GP training -#Defininf the training parameters of WGAN_GP +# DRAGAN training +#Defining the training parameters of DRAGAN noise_dim = 128 dim = 128 @@ -23,12 +29,14 @@ lr=learning_rate, betas=(beta_1, beta_2), noise_dim=noise_dim, - n_cols=processed_data.shape[1], layers_dim=dim) train_args = TrainParameters(epochs=epochs, sample_interval=log_step) -synthesizer = DRAGAN(gan_args, n_discriminator=3) -synthesizer.train(processed_data, train_args) +synthesizer = model(gan_args, n_discriminator=3) +synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True) synthesizer.save('adult_synth.pkl') + +synthesizer = model.load('adult_synth.pkl') +synth_data = synthesizer.sample(1000) diff --git a/examples/regular/adult_wgangp.py b/examples/regular/adult_wgangp.py index b62c3f17..5af6c878 100644 --- a/examples/regular/adult_wgangp.py +++ b/examples/regular/adult_wgangp.py @@ -1,16 +1,21 @@ -from ydata_synthetic.preprocessing.regular.adult import transformations +from pmlb import fetch_data + from ydata_synthetic.synthesizers.regular import WGAN_GP from ydata_synthetic.synthesizers import ModelParameters, TrainParameters -#Load and process the data -data, processed_data, preprocessor = transformations() +model = WGAN_GP + +#Load data and define the data processor parameters +data = fetch_data('adult') +num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] +cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country', 'target'] -# WGAN_GP training -#Defining the training parameters of WGAN_GP +#Defining the training parameters -noise_dim = 32 +noise_dim = 128 dim = 128 -batch_size = 128 +batch_size = 500 log_step = 100 epochs = 300+1 @@ -23,14 +28,15 @@ lr=learning_rate, betas=(beta_1, beta_2), noise_dim=noise_dim, - n_cols=processed_data.shape[1], layers_dim=dim) train_args = TrainParameters(epochs=epochs, sample_interval=log_step) -synthesizer = WGAN_GP(gan_args, n_critic=2) -synthesizer.train(processed_data, train_args) +synthesizer = model(gan_args, n_critic=2) +synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True) -synth_data = synthesizer.sample(1000) synthesizer.save('test.pkl') + +synthesizer = model.load('test.pkl') +synth_data = synthesizer.sample(1000) diff --git a/examples/regular/cgan_example.py b/examples/regular/cgan_example.py index ceed6335..5e290fc4 100644 --- a/examples/regular/cgan_example.py +++ b/examples/regular/cgan_example.py @@ -1,25 +1,22 @@ from ydata_synthetic.synthesizers.regular import CGAN -from ydata_synthetic.preprocessing.regular.credit_fraud import transformations from ydata_synthetic.synthesizers
import ModelParameters, TrainParameters import pandas as pd import numpy as np from sklearn import cluster +model = CGAN + #Read the original data and have it preprocessed data = pd.read_csv('data/creditcard.csv', index_col=[0]) #List of columns different from the Class column -data_cols = list(data.columns[ data.columns != 'Class' ]) -label_cols = ['Class'] +num_cols = list(data.columns[ data.columns != 'Class' ]) +cat_cols = [] # Condition features are not preprocessed and therefore not listed here -print('Dataset columns: {}'.format(data_cols)) +print('Dataset columns: {}'.format(num_cols)) sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] -processed_data = data[ sorted_cols ].copy() - -#Before training the GAN do not forget to apply the required data transformations -#To ease here we've applied a PowerTransformation -_, data, _ = transformations(data) +data = data[ sorted_cols ].copy() #For the purpose of this example we will only synthesize the minority class train_data = data.loc[ data['Class']==1 ].copy() @@ -28,7 +25,7 @@ print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) algorithm = cluster.KMeans args, kwds = (), {'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ]) +labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) @@ -51,19 +48,11 @@ learning_rate = 5e-4 models_dir = './cache' -train_sample = fraud_w_classes.copy().reset_index(drop=True) -train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True) -label_cols = [list(train_sample.columns).index(i) for i in train_sample.columns if 'Class' in i ] -data_cols = [ i for i in train_sample.columns if i not in label_cols ] -train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn -train_no_label = train_sample[ data_cols ] - #Test here the new inputs gan_args = ModelParameters(batch_size=batch_size, lr=learning_rate, betas=(beta_1, beta_2), noise_dim=noise_dim, - n_cols=train_sample.shape[1] - len(label_cols), # Don't count the label columns here layers_dim=dim) train_args = TrainParameters(epochs=epochs, @@ -73,10 +62,19 @@ labels=(0,1)) #Init the Conditional GAN providing the number of classes as one of the arguments -synthesizer = CGAN(model_parameters=gan_args, num_classes=2) +synthesizer = model(model_parameters=gan_args, num_classes=2) #Training the Conditional GAN -synthesizer.train(data=train_sample, label="Class",train_arguments=train_args) +synthesizer.train(data=fraud_w_classes, label_col="Class", train_arguments=train_args, + num_cols=num_cols, cat_cols=cat_cols) #Saving the synthesizer synthesizer.save('cgan_synthtrained.pkl') + +#Loading the synthesizer +synthesizer = model.load('cgan_synthtrained.pkl') + +#Sampling from the synthesizer +cond_array = np.array([0]) +# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place) +synth_data = synthesizer.sample(cond_array, 1000) diff --git a/examples/regular/cramergan_example.py index 3797ef73..f9970aae 100644 --- a/examples/regular/cramergan_example.py
+++ b/examples/regular/cramergan_example.py @@ -6,34 +6,28 @@ from ydata_synthetic.synthesizers import ModelParameters, TrainParameters from ydata_synthetic.synthesizers.regular import CRAMERGAN -from ydata_synthetic.preprocessing.regular.credit_fraud import transformations model = CRAMERGAN #Read the original data and have it preprocessed data = pd.read_csv('data/creditcard.csv', index_col=[0]) -#Data processing and analysis -data_cols = list(data.columns[ data.columns != 'Class' ]) -label_cols = ['Class'] +#List of columns different from the Class column +num_cols = list(data.columns[ data.columns != 'Class' ]) +cat_cols = ['Class'] -print('Dataset columns: {}'.format(data_cols)) +print('Dataset columns: {}'.format(num_cols)) sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] -processed_data = data[ sorted_cols ].copy() - -#Before training the GAN do not forget to apply the required data transformations -#To ease here we've applied a PowerTransformation -_, data, _ = transformations(data) - +data = data[ sorted_cols ].copy() #For the purpose of this example we will only synthesize the minority class train_data = data.loc[ data['Class']==1 ].copy() +#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) - algorithm = cluster.KMeans args, kwds = (), {'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ]) +labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) @@ -53,29 +47,18 @@ beta_2 = 0.9 models_dir = './cache' -train_sample = fraud_w_classes.copy().reset_index(drop=True) -train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True) -label_cols = [ i for i in train_sample.columns if 'Class' in i ] -data_cols = [ i for i in train_sample.columns if i not in label_cols ] -train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn -train_no_label = train_sample[ data_cols ] - model_parameters = ModelParameters(batch_size=batch_size, lr=learning_rate, betas=(beta_1, beta_2), noise_dim=noise_dim, - n_cols=train_sample.shape[1], layers_dim=dim) train_args = TrainParameters(epochs=epochs, sample_interval=log_step) -test_size = 492 # number of fraud cases -noise_dim = 32 - #Training the CRAMERGAN model synthesizer = model(model_parameters, gradient_penalty_weight=10) -synthesizer.train(train_sample, train_args) +synthesizer.train(data=fraud_w_classes, train_arguments=train_args, num_cols = num_cols, cat_cols = cat_cols) #Saving the synthesizer to later generate new events synthesizer.save(path='models/cramergan_creditcard.pkl') diff --git a/examples/regular/gan_example.ipynb b/examples/regular/gan_example.ipynb index c54997db..86a060b4 100644 --- a/examples/regular/gan_example.ipynb +++ b/examples/regular/gan_example.ipynb @@ -1,32 +1,14 @@ { - "nbformat": 4, - "nbformat_minor": 2, - "metadata": { - "kernelspec": { - "name": "python3", - "language": "python", - "display_name": "Python 3 (ipykernel)" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, - "colab": { - "name": "gan_example.ipynb", - "provenance": [] - } - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "collapsed": true, + "id": "AnCU8-Mal4fV", + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "# The Credit Card Fraud Dataset - Synthesizing the Minority Class\n", "\n", @@ -34,134 +16,137 @@ "GANs to synthesize tabular data.\n", "For the purpose of this exercise, dataset of credit card fraud from Kaggle is used, that can be found here:\n", "https://www.kaggle.com/mlg-ulb/creditcardfraud" - ], - "metadata": { - "collapsed": true, - "pycharm": { - "name": "#%% md\n" - }, - "id": "AnCU8-Mal4fV" - } + ] }, { "cell_type": "code", "execution_count": 1, - "source": [ - "# Note: You can select between running the Notebook on \"CPU\" or \"GPU\"\n", - "# Click \"Runtime > Change Runtime time\" and set \"GPU\"" - ], - "outputs": [], "metadata": { + "id": "j0CX0r65l4fY", "pycharm": { "name": "#%%\n" - }, - "id": "j0CX0r65l4fY" - } + } + }, + "outputs": [], + "source": [ + "# Note: You can select between running the Notebook on \"CPU\" or \"GPU\"\n", + "# Click \"Runtime > Change Runtime time\" and set \"GPU\"" + ] }, { "cell_type": "code", "execution_count": 2, - "source": [ - "# Install ydata-synthetic lib\n", - "# ! pip install ydata-synthetic" - ], - "outputs": [], "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "x0u2qegKl4fY", - "outputId": "51b00474-09de-4e9a-dbf9-9535f056fbd0", "colab": { "base_uri": "https://localhost:8080/", "height": 247 + }, + "id": "x0u2qegKl4fY", + "outputId": "51b00474-09de-4e9a-dbf9-9535f056fbd0", + "pycharm": { + "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "# Install ydata-synthetic lib\n", + "# ! 
pip install ydata-synthetic" + ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, + "metadata": { + "id": "oX2OK2fbl4fZ", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], "source": [ "import os\n", "\n", + "import matplotlib.pyplot as plt\n", "import sklearn.cluster as cluster\n", + "from numpy import array, random, sum, unique\n", + "from pandas import DataFrame, read_csv\n", "\n", - "from ydata_synthetic.synthesizers.regular import VanilllaGAN\n", "from ydata_synthetic.synthesizers import ModelParameters, TrainParameters\n", - "from ydata_synthetic.preprocessing.regular.credit_fraud import *\n", - "\n", - "model = VanilllaGAN" - ], - "outputs": [], + "from ydata_synthetic.synthesizers.regular import VanilllaGAN" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": { + "id": "P1Rcz4RPl4fZ", "pycharm": { "name": "#%%\n" - }, - "id": "oX2OK2fbl4fZ" - } + } + }, + "outputs": [], + "source": [ + "model = VanilllaGAN\n", + "\n", + "# Read the original data and have it preprocessed\n", + "data = read_csv('../../data/creditcard.csv', index_col=[0])" + ] }, { "cell_type": "code", - "execution_count": 4, - "source": [ - "# Read the original data and have it preprocessed\n", - "data = pd.read_csv('./data/creditcard.csv', index_col=[0])" - ], + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ceNe1Ofbl4fZ", + "outputId": "f8f9fece-e7d3-454f-d4c9-d6cd116ca68a" + }, "outputs": [ { - "output_type": "error", - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './data/creditcard.csv'", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Read the original data and have it preprocessed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./data/creditcard.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 609\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 610\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 611\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 612\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 460\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 462\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 463\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 464\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1048\u001b[0m )\n\u001b[1;32m 1049\u001b[0m \u001b[0;31m# error: Too many arguments for \"ParserBase\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1050\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmapping\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[call-arg]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1051\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1052\u001b[0m 
\u001b[0;32mdef\u001b[0m \u001b[0m_failover_to_python\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1865\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1866\u001b[0m \u001b[0;31m# open handles\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1867\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open_handles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1868\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1869\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"storage_options\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"encoding\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"memory_map\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"compression\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_open_handles\u001b[0;34m(self, src, kwds)\u001b[0m\n\u001b[1;32m 1360\u001b[0m \u001b[0mLet\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mreaders\u001b[0m \u001b[0mopen\u001b[0m \u001b[0mIOHanldes\u001b[0m \u001b[0mafter\u001b[0m \u001b[0mthey\u001b[0m \u001b[0mare\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtheir\u001b[0m \u001b[0mpotential\u001b[0m \u001b[0mraises\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1361\u001b[0m \"\"\"\n\u001b[0;32m-> 1362\u001b[0;31m self.handles = get_handle(\n\u001b[0m\u001b[1;32m 1363\u001b[0m \u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1364\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/ydata_synth/lib/python3.8/site-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"replace\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 647\u001b[0;31m handle = open(\n\u001b[0m\u001b[1;32m 648\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './data/creditcard.csv'" + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset columns: ['V1', 
'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']\n" ] } ], - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "P1Rcz4RPl4fZ" - } - }, - { - "cell_type": "code", - "execution_count": null, "source": [ - "# Extract list of columns\n", - "data_cols = list(data.columns[ data.columns != 'Class' ])\n", + "#List of columns different from the Class column\n", + "num_cols = list(data.columns[ data.columns != 'Class' ])\n", + "cat_cols = ['Class']\n", "\n", - "print('Dataset columns: {}'.format(data_cols))\n", + "print('Dataset columns: {}'.format(num_cols))\n", "sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']\n", "processed_data = data[ sorted_cols ].copy()" - ], - "outputs": [], + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": { - "id": "ceNe1Ofbl4fZ", - "outputId": "f8f9fece-e7d3-454f-d4c9-d6cd116ca68a", "colab": { "base_uri": "https://localhost:8080/" + }, + "id": "3o4V8-ypl4fa", + "outputId": "39fabdb7-b3e4-492f-85f0-cd6232b45609", + "pycharm": { + "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset info: Number of records - 492 Number of variables - 30\n", + " count\n", + "0 455\n", + "1 37\n" + ] + } + ], "source": [ - "# Before training the GAN do not forget to apply the required data transformations\n", - "# To ease here we've applied a PowerTransformation - make data distribution more Gaussian-like.\n", - "_, data, _ = transformations(data)\n", - "\n", "# For the purpose of this example we will only synthesize the minority class\n", "# train_data contains 492 rows which had 'Class' value as 1 (which were very few)\n", "train_data = data.loc[ data['Class']==1 ].copy()\n", @@ -173,46 +158,42 @@ "# We essentially get an array of 492 rows ('labels') having values either 0 or 1 for the 2 clustered classes.\n", "algorithm = cluster.KMeans\n", "args, kwds = (), {'n_clusters':2, 'random_state':0}\n", - "labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])\n", + "labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])\n", "\n", "# Get the count of both classes\n", - "print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )\n", + "print( DataFrame( [ [sum(labels==i)] for i in unique(labels) ], columns=['count'], index=unique(labels) ) )\n", "\n", "# Assign the k-means clustered classes' labels to the a seperate copy of train data 'fraud_w_classes'\n", "fraud_w_classes = train_data.copy()\n", "fraud_w_classes['Class'] = labels" - ], - "outputs": [], - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "3o4V8-ypl4fa", - "outputId": "39fabdb7-b3e4-492f-85f0-cd6232b45609", - "colab": { - "base_uri": "https://localhost:8080/" - } - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "3ezlIjKbl4fb", + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "# GAN training\n", "\n", "Below you can try to train your own generators using the available GANs architectures. You can train it either with labels (created using KMeans) or with no labels at all. 
\n", "\n", "Remember that for this exercise in particular we've decided to synthesize only the minority class from the Credit Fraud dataset." - ], - "metadata": { - "pycharm": { - "name": "#%% md\n" - }, - "id": "3ezlIjKbl4fb" - } + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "metadata": { + "id": "7FMDs5eql4fb", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], "source": [ "# Define the GAN and training parameters\n", "noise_dim = 32\n", @@ -226,162 +207,1079 @@ "beta_2 = 0.9\n", "models_dir = './cache'\n", "\n", - "train_sample = fraud_w_classes.copy().reset_index(drop=True)\n", - "print(\"train_sample.columns:\")\n", - "print(train_sample.columns)\n", - "\n", - "# There's only 1 class, so essentially rename the 'Class' to 'Class_1',\n", - "# which tells weather a sample data is of class 1 or not.\n", - "train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)\n", - "\n", - "# 'Class_1' label\n", - "label_cols = [ i for i in train_sample.columns if 'Class' in i ]\n", - "\n", - "# All columns except 'Class_1'\n", - "data_cols = [ i for i in train_sample.columns if i not in label_cols ]\n", - "\n", - "# Scale down the data, and rename it to 'train_no_label'\n", - "train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn\n", - "train_no_label = train_sample[ data_cols ]" - ], - "outputs": [], - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "7FMDs5eql4fb" - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ "#Setting the GAN model parameters and the training step parameters\n", "gan_args = ModelParameters(batch_size=batch_size,\n", " lr=learning_rate,\n", " betas=(beta_1, beta_2),\n", " noise_dim=noise_dim,\n", - " n_cols=train_sample.shape[1],\n", " layers_dim=dim)\n", "\n", "train_args = TrainParameters(epochs=epochs,\n", " sample_interval=log_step)" - ], - "outputs": [], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "code", - "execution_count": null, - "source": [ - "# Training the GAN model chosen: Vanilla GAN, CGAN, DCGAN, etc.\n", - "synthesizer = model(gan_args)\n", - "synthesizer.train(train_sample, train_args)" - ], - "outputs": [], + "execution_count": 6, "metadata": { - "pycharm": { - "name": "#%%\n" + "colab": { + "base_uri": "https://localhost:8080/" }, "id": "qgMDmyall4fc", "outputId": "ae669bdf-01b6-49d9-a254-cc0776508f7b", - "colab": { - "base_uri": "https://localhost:8080/" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "# Generator description\n", - "synthesizer.generator.summary()" - ], - "outputs": [], - "metadata": { - "id": "tDjYWJPyl4fc", - "outputId": "8a5c7afb-74ee-44ee-8902-048250d04061", - "colab": { - "base_uri": "https://localhost:8080/" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "# Discriminator description\n", - "synthesizer.discriminator.summary()" - ], - "outputs": [], - "metadata": { "pycharm": { "name": "#%%\n" - }, - "id": "9zyfNK8Gl4fd", - "outputId": "634297a1-dbeb-4fd0-fe52-24b181711336", - "colab": { - "base_uri": "https://localhost:8080/" } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "# You can easily save the trained generator and loaded it afterwards\n", - "if not os.path.exists(\"./saved/gan\"):\n", - " os.makedirs(\"./saved/gan\")\n", - "synthesizer.save(path=\"./saved/gan/generator_fraud.pkl\")" - ], - 
"outputs": [], - "metadata": { - "pycharm": { - "name": "#%%\n" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|▏ | 3/201 [00:00<00:40, 4.84it/s]" + ] }, - "id": "C3cs_LKEl4fd", - "outputId": "bdb0af49-7e29-480e-cb83-56ad2f192ae0", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 185 - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "models = {'GAN': ['GAN', False, synthesizer.generator]}" - ], - "outputs": [], - "metadata": { - "id": "5mvCYNH5l4fd" - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "# Setup parameters visualization parameters\n", - "seed = 17\n", - "test_size = 492 # number of fraud cases\n", - "noise_dim = 32\n", - "\n", - "np.random.seed(seed)\n", - "z = np.random.normal(size=(test_size, noise_dim))\n", - "real = synthesizer.get_data_batch(train=train_sample, batch_size=test_size, seed=seed)\n", - "real_samples = pd.DataFrame(real, columns=data_cols+label_cols)\n", - "labels = fraud_w_classes['Class']\n", - "\n", - "model_names = ['GAN']\n", + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 [D loss: 0.550036, acc.: 50.00%] [G loss: 0.627987]\n", + "generated_data\n", + "1 [D loss: 0.724137, acc.: 50.00%] [G loss: 0.482935]\n", + "2 [D loss: 0.773580, acc.: 44.14%] [G loss: 0.655450]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 3%|▎ | 7/201 [00:01<00:20, 9.55it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3 [D loss: 0.723554, acc.: 32.03%] [G loss: 0.887138]\n", + "4 [D loss: 0.686885, acc.: 50.00%] [G loss: 0.951322]\n", + "5 [D loss: 0.730200, acc.: 39.84%] [G loss: 0.795471]\n", + "6 [D loss: 0.689810, acc.: 51.56%] [G loss: 0.808690]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 11/201 [00:01<00:15, 12.43it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7 [D loss: 0.707817, acc.: 35.16%] [G loss: 0.718341]\n", + "8 [D loss: 0.690032, acc.: 49.22%] [G loss: 0.724760]\n", + "9 [D loss: 0.690305, acc.: 42.97%] [G loss: 0.725031]\n", + "10 [D loss: 0.699105, acc.: 42.58%] [G loss: 0.725312]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 7%|▋ | 15/201 [00:01<00:13, 13.97it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11 [D loss: 0.631659, acc.: 78.91%] [G loss: 0.848520]\n", + "12 [D loss: 0.742673, acc.: 34.77%] [G loss: 0.677510]\n", + "13 [D loss: 0.724618, acc.: 33.20%] [G loss: 0.740743]\n", + "14 [D loss: 0.598308, acc.: 85.94%] [G loss: 0.950827]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|▊ | 17/201 [00:01<00:12, 14.42it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 [D loss: 0.748309, acc.: 35.55%] [G loss: 0.643942]\n", + "16 [D loss: 0.821484, acc.: 28.12%] [G loss: 0.584143]\n", + "17 [D loss: 0.676677, acc.: 59.77%] [G loss: 0.840448]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 10%|█ | 21/201 [00:01<00:12, 14.21it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "18 [D loss: 0.610853, acc.: 83.59%] [G loss: 0.938941]\n", + "19 [D loss: 0.650022, acc.: 63.28%] [G loss: 0.887446]\n", + "20 [D loss: 0.756313, acc.: 29.30%] [G loss: 0.759600]\n", + "21 [D loss: 0.728475, acc.: 33.98%] [G loss: 0.778566]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 12%|█▏ | 
25/201 [00:02<00:11, 14.75it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22 [D loss: 0.636214, acc.: 75.39%] [G loss: 0.886067]\n", + "23 [D loss: 0.701726, acc.: 48.05%] [G loss: 0.729937]\n", + "24 [D loss: 0.692845, acc.: 39.06%] [G loss: 0.725504]\n", + "25 [D loss: 0.641558, acc.: 67.97%] [G loss: 0.878462]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 14%|█▍ | 29/201 [00:02<00:11, 15.10it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "26 [D loss: 0.724304, acc.: 32.81%] [G loss: 0.764726]\n", + "27 [D loss: 0.604908, acc.: 83.98%] [G loss: 0.912407]\n", + "28 [D loss: 0.685659, acc.: 40.23%] [G loss: 0.705084]\n", + "29 [D loss: 0.784759, acc.: 26.56%] [G loss: 0.616831]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 16%|█▋ | 33/201 [00:02<00:10, 15.39it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "30 [D loss: 0.680897, acc.: 48.44%] [G loss: 0.784109]\n", + "31 [D loss: 0.488568, acc.: 88.28%] [G loss: 1.022120]\n", + "32 [D loss: 0.517986, acc.: 71.88%] [G loss: 0.984776]\n", + "33 [D loss: 0.773452, acc.: 37.89%] [G loss: 0.677296]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 18%|█▊ | 37/201 [00:02<00:10, 15.60it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "34 [D loss: 0.845428, acc.: 28.91%] [G loss: 0.568926]\n", + "35 [D loss: 0.688814, acc.: 51.56%] [G loss: 0.843009]\n", + "36 [D loss: 0.638633, acc.: 62.89%] [G loss: 0.920435]\n", + "37 [D loss: 0.684682, acc.: 46.88%] [G loss: 0.878682]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|██ | 41/201 [00:03<00:10, 15.65it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "38 [D loss: 0.742228, acc.: 33.20%] [G loss: 0.801381]\n", + "39 [D loss: 0.747093, acc.: 33.98%] [G loss: 0.878402]\n", + "40 [D loss: 0.605564, acc.: 76.56%] [G loss: 1.110654]\n", + "41 [D loss: 0.517402, acc.: 88.67%] [G loss: 1.188590]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 22%|██▏ | 45/201 [00:03<00:09, 15.70it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42 [D loss: 0.622090, acc.: 64.06%] [G loss: 0.839116]\n", + "43 [D loss: 0.687085, acc.: 52.34%] [G loss: 0.777982]\n", + "44 [D loss: 0.699964, acc.: 39.45%] [G loss: 0.749893]\n", + "45 [D loss: 0.646823, acc.: 59.38%] [G loss: 0.811863]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 24%|██▍ | 49/201 [00:03<00:09, 15.67it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "46 [D loss: 0.545613, acc.: 80.08%] [G loss: 1.006777]\n", + "47 [D loss: 0.562644, acc.: 74.61%] [G loss: 1.045896]\n", + "48 [D loss: 0.748366, acc.: 50.39%] [G loss: 0.884654]\n", + "49 [D loss: 0.733166, acc.: 38.28%] [G loss: 0.893640]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 26%|██▋ | 53/201 [00:03<00:09, 15.48it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50 [D loss: 0.684397, acc.: 48.05%] [G loss: 0.911950]\n", + "51 [D loss: 0.673821, acc.: 49.22%] [G loss: 0.949331]\n", + "52 [D loss: 0.682803, acc.: 50.00%] [G loss: 0.892081]\n", + "53 [D loss: 0.633063, acc.: 64.84%] [G loss: 1.007781]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 28%|██▊ | 57/201 [00:04<00:10, 14.24it/s]" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "54 [D loss: 0.597956, acc.: 72.27%] [G loss: 0.971366]\n", + "55 [D loss: 0.689976, acc.: 60.94%] [G loss: 0.909638]\n", + "56 [D loss: 0.755158, acc.: 47.66%] [G loss: 0.840833]\n", + "57 [D loss: 0.721160, acc.: 41.41%] [G loss: 0.816230]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 30%|███ | 61/201 [00:04<00:09, 14.95it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "58 [D loss: 0.710852, acc.: 41.02%] [G loss: 0.876071]\n", + "59 [D loss: 0.722174, acc.: 37.89%] [G loss: 0.795401]\n", + "60 [D loss: 0.678059, acc.: 51.17%] [G loss: 0.851614]\n", + "61 [D loss: 0.700930, acc.: 44.14%] [G loss: 0.809518]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 32%|███▏ | 65/201 [00:04<00:09, 15.11it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "62 [D loss: 0.697455, acc.: 52.73%] [G loss: 0.880072]\n", + "63 [D loss: 0.670013, acc.: 54.30%] [G loss: 0.881521]\n", + "64 [D loss: 0.672341, acc.: 53.52%] [G loss: 0.852372]\n", + "65 [D loss: 0.688349, acc.: 42.19%] [G loss: 0.767715]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 34%|███▍ | 69/201 [00:05<00:08, 15.28it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "66 [D loss: 0.673236, acc.: 50.39%] [G loss: 0.785096]\n", + "67 [D loss: 0.662382, acc.: 58.59%] [G loss: 0.802408]\n", + "68 [D loss: 0.671653, acc.: 53.12%] [G loss: 0.758970]\n", + "69 [D loss: 0.678161, acc.: 50.00%] [G loss: 0.772674]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 36%|███▋ | 73/201 [00:05<00:08, 15.42it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "70 [D loss: 0.670603, acc.: 55.47%] [G loss: 0.800789]\n", + "71 [D loss: 0.671375, acc.: 45.31%] [G loss: 0.751903]\n", + "72 [D loss: 0.681129, acc.: 45.70%] [G loss: 0.755616]\n", + "73 [D loss: 0.666789, acc.: 52.34%] [G loss: 0.776031]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 38%|███▊ | 77/201 [00:05<00:08, 15.24it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "74 [D loss: 0.662781, acc.: 54.30%] [G loss: 0.777716]\n", + "75 [D loss: 0.673832, acc.: 46.09%] [G loss: 0.755191]\n", + "76 [D loss: 0.664220, acc.: 57.81%] [G loss: 0.774768]\n", + "77 [D loss: 0.660048, acc.: 52.34%] [G loss: 0.773231]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 40%|████ | 81/201 [00:05<00:07, 15.06it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "78 [D loss: 0.675459, acc.: 53.52%] [G loss: 0.759155]\n", + "79 [D loss: 0.677063, acc.: 54.69%] [G loss: 0.761785]\n", + "80 [D loss: 0.666177, acc.: 54.30%] [G loss: 0.791037]\n", + "81 [D loss: 0.648826, acc.: 56.25%] [G loss: 0.802469]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 42%|████▏ | 85/201 [00:06<00:07, 15.17it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "82 [D loss: 0.667839, acc.: 45.70%] [G loss: 0.752686]\n", + "83 [D loss: 0.677281, acc.: 55.86%] [G loss: 0.792059]\n", + "84 [D loss: 0.645253, acc.: 60.94%] [G loss: 0.804177]\n", + "85 [D loss: 0.679732, acc.: 54.30%] [G loss: 0.760794]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 44%|████▍ | 89/201 [00:06<00:07, 15.29it/s]" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "86 [D loss: 0.684020, acc.: 51.56%] [G loss: 0.811780]\n", + "87 [D loss: 0.641267, acc.: 59.77%] [G loss: 0.870217]\n", + "88 [D loss: 0.656589, acc.: 56.64%] [G loss: 0.793731]\n", + "89 [D loss: 0.655231, acc.: 61.33%] [G loss: 0.778803]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 46%|████▋ | 93/201 [00:06<00:07, 15.38it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "90 [D loss: 0.648248, acc.: 59.77%] [G loss: 0.800200]\n", + "91 [D loss: 0.635786, acc.: 64.45%] [G loss: 0.797188]\n", + "92 [D loss: 0.663567, acc.: 51.56%] [G loss: 0.784666]\n", + "93 [D loss: 0.663212, acc.: 56.25%] [G loss: 0.866105]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 48%|████▊ | 97/201 [00:06<00:06, 15.20it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "94 [D loss: 0.636874, acc.: 62.11%] [G loss: 0.884161]\n", + "95 [D loss: 0.631512, acc.: 69.53%] [G loss: 0.898002]\n", + "96 [D loss: 0.658940, acc.: 55.47%] [G loss: 0.810010]\n", + "97 [D loss: 0.661417, acc.: 53.12%] [G loss: 0.852796]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 50%|█████ | 101/201 [00:07<00:06, 14.93it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98 [D loss: 0.638799, acc.: 66.02%] [G loss: 0.853503]\n", + "99 [D loss: 0.615267, acc.: 71.88%] [G loss: 0.891819]\n", + "100 [D loss: 0.653688, acc.: 56.64%] [G loss: 0.866009]\n", + "generated_data\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 52%|█████▏ | 105/201 [00:07<00:06, 15.11it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "101 [D loss: 0.666099, acc.: 50.39%] [G loss: 0.842007]\n", + "102 [D loss: 0.651683, acc.: 58.98%] [G loss: 0.923288]\n", + "103 [D loss: 0.646693, acc.: 62.50%] [G loss: 0.858193]\n", + "104 [D loss: 0.659455, acc.: 58.98%] [G loss: 0.865092]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 54%|█████▍ | 109/201 [00:07<00:05, 15.38it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "105 [D loss: 0.645711, acc.: 59.38%] [G loss: 0.880491]\n", + "106 [D loss: 0.653809, acc.: 55.08%] [G loss: 0.818559]\n", + "107 [D loss: 0.678842, acc.: 43.75%] [G loss: 0.767874]\n", + "108 [D loss: 0.646079, acc.: 58.20%] [G loss: 0.876584]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 56%|█████▌ | 113/201 [00:07<00:05, 15.48it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "109 [D loss: 0.666158, acc.: 51.56%] [G loss: 0.835043]\n", + "110 [D loss: 0.640382, acc.: 59.38%] [G loss: 0.832070]\n", + "111 [D loss: 0.643277, acc.: 60.94%] [G loss: 0.845614]\n", + "112 [D loss: 0.669900, acc.: 57.03%] [G loss: 0.820596]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 58%|█████▊ | 117/201 [00:08<00:05, 15.34it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "113 [D loss: 0.656420, acc.: 57.03%] [G loss: 0.836345]\n", + "114 [D loss: 0.641564, acc.: 57.42%] [G loss: 0.876468]\n", + "115 [D loss: 0.646709, acc.: 59.38%] [G loss: 0.825197]\n", + "116 [D loss: 0.651036, acc.: 54.69%] [G loss: 0.777496]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 60%|██████ | 121/201 [00:08<00:05, 15.05it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "117 [D loss: 
0.652030, acc.: 58.59%] [G loss: 0.793402]\n", + "118 [D loss: 0.649176, acc.: 59.38%] [G loss: 0.811079]\n", + "119 [D loss: 0.649228, acc.: 54.69%] [G loss: 0.807060]\n", + "120 [D loss: 0.631465, acc.: 57.81%] [G loss: 0.873953]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 62%|██████▏ | 125/201 [00:08<00:04, 15.27it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "121 [D loss: 0.642788, acc.: 54.30%] [G loss: 0.842716]\n", + "122 [D loss: 0.633139, acc.: 61.72%] [G loss: 0.860812]\n", + "123 [D loss: 0.666036, acc.: 56.25%] [G loss: 0.833639]\n", + "124 [D loss: 0.637599, acc.: 57.42%] [G loss: 0.924634]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 64%|██████▍ | 129/201 [00:09<00:04, 15.19it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "125 [D loss: 0.643670, acc.: 58.59%] [G loss: 0.840004]\n", + "126 [D loss: 0.666960, acc.: 49.22%] [G loss: 0.773053]\n", + "127 [D loss: 0.638831, acc.: 59.77%] [G loss: 0.855623]\n", + "128 [D loss: 0.630438, acc.: 62.11%] [G loss: 0.862262]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 65%|██████▌ | 131/201 [00:09<00:04, 15.05it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "129 [D loss: 0.641727, acc.: 61.33%] [G loss: 0.807410]\n", + "130 [D loss: 0.647421, acc.: 56.64%] [G loss: 0.860150]\n", + "131 [D loss: 0.637578, acc.: 62.11%] [G loss: 0.880380]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 67%|██████▋ | 135/201 [00:09<00:04, 15.14it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "132 [D loss: 0.635053, acc.: 63.67%] [G loss: 0.867916]\n", + "133 [D loss: 0.642439, acc.: 56.25%] [G loss: 0.792550]\n", + "134 [D loss: 0.638722, acc.: 62.11%] [G loss: 0.829996]\n", + "135 [D loss: 0.650501, acc.: 54.30%] [G loss: 0.810637]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 69%|██████▉ | 139/201 [00:09<00:04, 15.30it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "136 [D loss: 0.642949, acc.: 60.55%] [G loss: 0.825194]\n", + "137 [D loss: 0.633457, acc.: 60.16%] [G loss: 0.918275]\n", + "138 [D loss: 0.637209, acc.: 61.72%] [G loss: 0.876791]\n", + "139 [D loss: 0.633082, acc.: 61.72%] [G loss: 0.847758]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 71%|███████ | 143/201 [00:09<00:03, 14.96it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "140 [D loss: 0.673803, acc.: 50.39%] [G loss: 0.753332]\n", + "141 [D loss: 0.652552, acc.: 55.08%] [G loss: 0.834769]\n", + "142 [D loss: 0.609212, acc.: 71.09%] [G loss: 0.917820]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████▎ | 147/201 [00:10<00:03, 15.00it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "143 [D loss: 0.585832, acc.: 72.27%] [G loss: 0.918271]\n", + "144 [D loss: 0.700710, acc.: 44.14%] [G loss: 0.732573]\n", + "145 [D loss: 0.648752, acc.: 57.03%] [G loss: 0.886211]\n", + "146 [D loss: 0.631603, acc.: 61.33%] [G loss: 0.965210]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 75%|███████▌ | 151/201 [00:10<00:03, 15.22it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "147 [D loss: 0.644220, acc.: 61.72%] [G loss: 0.881742]\n", + "148 [D loss: 0.620618, acc.: 63.28%] [G loss: 
0.864807]\n", + "149 [D loss: 0.630044, acc.: 61.72%] [G loss: 0.887240]\n", + "150 [D loss: 0.627703, acc.: 63.28%] [G loss: 0.853771]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 77%|███████▋ | 155/201 [00:10<00:02, 15.39it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "151 [D loss: 0.668791, acc.: 53.52%] [G loss: 0.834410]\n", + "152 [D loss: 0.611434, acc.: 61.33%] [G loss: 0.921129]\n", + "153 [D loss: 0.634519, acc.: 59.77%] [G loss: 0.854780]\n", + "154 [D loss: 0.642557, acc.: 53.91%] [G loss: 0.862049]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 79%|███████▉ | 159/201 [00:10<00:02, 15.36it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "155 [D loss: 0.655550, acc.: 58.20%] [G loss: 0.857191]\n", + "156 [D loss: 0.622134, acc.: 64.45%] [G loss: 0.904369]\n", + "157 [D loss: 0.619334, acc.: 66.02%] [G loss: 0.884082]\n", + "158 [D loss: 0.628977, acc.: 63.28%] [G loss: 0.854065]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 81%|████████ | 163/201 [00:11<00:02, 15.33it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "159 [D loss: 0.625271, acc.: 60.55%] [G loss: 0.915155]\n", + "160 [D loss: 0.629663, acc.: 59.77%] [G loss: 0.901985]\n", + "161 [D loss: 0.654066, acc.: 49.22%] [G loss: 0.845741]\n", + "162 [D loss: 0.617537, acc.: 65.23%] [G loss: 0.928255]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 83%|████████▎ | 167/201 [00:11<00:02, 15.52it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "163 [D loss: 0.627255, acc.: 65.62%] [G loss: 0.837423]\n", + "164 [D loss: 0.619008, acc.: 59.77%] [G loss: 0.889656]\n", + "165 [D loss: 0.624287, acc.: 62.50%] [G loss: 0.928081]\n", + "166 [D loss: 0.621571, acc.: 60.94%] [G loss: 0.963335]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 85%|████████▌ | 171/201 [00:11<00:01, 15.38it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "167 [D loss: 0.614337, acc.: 67.97%] [G loss: 0.873778]\n", + "168 [D loss: 0.631943, acc.: 61.33%] [G loss: 0.901347]\n", + "169 [D loss: 0.630981, acc.: 60.94%] [G loss: 0.866399]\n", + "170 [D loss: 0.608251, acc.: 66.02%] [G loss: 0.855200]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 87%|████████▋ | 175/201 [00:12<00:01, 15.29it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "171 [D loss: 0.616137, acc.: 64.06%] [G loss: 0.882089]\n", + "172 [D loss: 0.638764, acc.: 59.38%] [G loss: 0.906505]\n", + "173 [D loss: 0.634071, acc.: 60.94%] [G loss: 0.883332]\n", + "174 [D loss: 0.613683, acc.: 61.72%] [G loss: 0.965842]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 88%|████████▊ | 177/201 [00:12<00:01, 15.13it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "175 [D loss: 0.619862, acc.: 59.38%] [G loss: 0.901156]\n", + "176 [D loss: 0.625921, acc.: 62.11%] [G loss: 0.861191]\n", + "177 [D loss: 0.634482, acc.: 60.55%] [G loss: 0.858311]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 90%|█████████ | 181/201 [00:12<00:01, 14.95it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "178 [D loss: 0.610101, acc.: 64.06%] [G loss: 0.872502]\n", + "179 [D loss: 0.625674, acc.: 60.55%] [G loss: 0.905137]\n", + "180 [D 
loss: 0.626871, acc.: 62.89%] [G loss: 0.881574]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 92%|█████████▏| 185/201 [00:12<00:01, 14.93it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "181 [D loss: 0.621699, acc.: 63.28%] [G loss: 0.852086]\n", + "182 [D loss: 0.620817, acc.: 63.67%] [G loss: 0.896311]\n", + "183 [D loss: 0.585868, acc.: 64.84%] [G loss: 0.935561]\n", + "184 [D loss: 0.610238, acc.: 65.23%] [G loss: 0.875845]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 94%|█████████▍| 189/201 [00:12<00:00, 15.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "185 [D loss: 0.620626, acc.: 62.89%] [G loss: 0.904199]\n", + "186 [D loss: 0.621455, acc.: 62.89%] [G loss: 0.867477]\n", + "187 [D loss: 0.607304, acc.: 63.67%] [G loss: 0.905859]\n", + "188 [D loss: 0.597267, acc.: 64.45%] [G loss: 0.951865]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 96%|█████████▌| 193/201 [00:13<00:00, 15.40it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "189 [D loss: 0.622227, acc.: 62.11%] [G loss: 0.895168]\n", + "190 [D loss: 0.621200, acc.: 62.11%] [G loss: 0.871956]\n", + "191 [D loss: 0.604076, acc.: 64.45%] [G loss: 0.847970]\n", + "192 [D loss: 0.601647, acc.: 64.45%] [G loss: 0.908888]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 98%|█████████▊| 197/201 [00:13<00:00, 15.47it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "193 [D loss: 0.599213, acc.: 64.45%] [G loss: 0.939804]\n", + "194 [D loss: 0.595883, acc.: 63.28%] [G loss: 0.898727]\n", + "195 [D loss: 0.596841, acc.: 65.23%] [G loss: 0.925748]\n", + "196 [D loss: 0.633918, acc.: 58.20%] [G loss: 0.908294]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 201/201 [00:13<00:00, 14.62it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "197 [D loss: 0.610397, acc.: 60.94%] [G loss: 0.952700]\n", + "198 [D loss: 0.601393, acc.: 64.06%] [G loss: 0.904418]\n", + "199 [D loss: 0.613972, acc.: 64.06%] [G loss: 0.877581]\n", + "200 [D loss: 0.604303, acc.: 63.67%] [G loss: 0.965180]\n", + "generated_data\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Training the GAN model chosen: Vanilla GAN, CGAN, DCGAN, etc.\n", + "synthesizer = model(gan_args)\n", + "synthesizer.train(data = fraud_w_classes, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tDjYWJPyl4fc", + "outputId": "8a5c7afb-74ee-44ee-8902-048250d04061" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"model\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "input_1 (InputLayer) [(128, 32)] 0 \n", + "_________________________________________________________________\n", + "dense (Dense) (128, 128) 4224 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (128, 256) 33024 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (128, 512) 131584 \n", + 
"_________________________________________________________________\n", + "dense_3 (Dense) (128, 31) 15903 \n", + "=================================================================\n", + "Total params: 184,735\n", + "Trainable params: 184,735\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "# Generator description\n", + "synthesizer.generator.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9zyfNK8Gl4fd", + "outputId": "634297a1-dbeb-4fd0-fe52-24b181711336", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"model_1\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "input_2 (InputLayer) [(128, 31)] 0 \n", + "_________________________________________________________________\n", + "dense_4 (Dense) (128, 512) 16384 \n", + "_________________________________________________________________\n", + "dropout (Dropout) (128, 512) 0 \n", + "_________________________________________________________________\n", + "dense_5 (Dense) (128, 256) 131328 \n", + "_________________________________________________________________\n", + "dropout_1 (Dropout) (128, 256) 0 \n", + "_________________________________________________________________\n", + "dense_6 (Dense) (128, 128) 32896 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (128, 1) 129 \n", + "=================================================================\n", + "Total params: 180,737\n", + "Trainable params: 0\n", + "Non-trainable params: 180,737\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "# Discriminator description\n", + "synthesizer.discriminator.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 185 + }, + "id": "C3cs_LKEl4fd", + "outputId": "bdb0af49-7e29-480e-cb83-56ad2f192ae0", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# You can easily save the trained generator and loaded it afterwards\n", + "if not os.path.exists(\"./saved/gan\"):\n", + " os.makedirs(\"./saved/gan\")\n", + "synthesizer.save(path=\"./saved/gan/generator_fraud.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "5mvCYNH5l4fd" + }, + "outputs": [], + "source": [ + "models = {'GAN': ['GAN', False, synthesizer.generator]}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "5wl54IVkl4fe", + "outputId": "7f131092-2e97-4a95-eb93-d97b2f991321", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Setup parameters visualization parameters\n", + "seed = 17\n", + "test_size = 492 # number of fraud cases\n", + "noise_dim = 32\n", + "\n", + "random.seed(seed)\n", + "z = random.normal(size=(test_size, noise_dim))\n", + "real_processed = synthesizer.processor.transform(fraud_w_classes)\n", + "real_samples = synthesizer.get_data_batch(real_processed, batch_size)\n", + "class_labels = ['Class_1','Class_2']\n", + "real_samples = DataFrame(real_samples, 
columns=num_cols+class_labels)\n", + "labels = fraud_w_classes['Class']\n", + "\n", + "model_names = ['GAN']\n", "colors = ['deepskyblue','blue']\n", "markers = ['o','^']\n", - "class_labels = ['Class 1','Class 2']\n", "\n", "col1, col2 = 'V17', 'V10'\n", "\n", @@ -397,20 +1295,20 @@ "fig = plt.figure(figsize=(14,rows*3))\n", "\n", "# Go through each of the 3 model_step values -> 0, 100, 200\n", - "for model_step_ix, model_step in enumerate(model_steps): \n", + "for model_step_ix, model_step in enumerate(model_steps):\n", " axarr[model_step_ix] = plt.subplot(rows, columns, model_step_ix*columns + 1)\n", "\n", " # Plot 'Class 1' and 'Class 2' samples taken from the original data, in a random shuffled fashion\n", " for group, color, marker, label in zip(real_samples.groupby('Class_1'), colors, markers, class_labels ):\n", - " plt.scatter( group[1][[col1]], group[1][[col2]], \n", + " plt.scatter( group[1][[col1]], group[1][[col2]],\n", " label=label, marker=marker, edgecolors=color, facecolors='none' )\n", - " \n", + "\n", " plt.title('Actual Fraud Data')\n", " plt.ylabel(col2) # Only add y label to left plot\n", " plt.xlabel(col1)\n", " xlims, ylims = axarr[model_step_ix].get_xlim(), axarr[model_step_ix].get_ylim()\n", - " \n", - " if model_step_ix == 0: \n", + "\n", + " if model_step_ix == 0:\n", " legend = plt.legend()\n", " legend.get_frame().set_facecolor('white')\n", "\n", @@ -425,17 +1323,17 @@ "\n", " if with_class:\n", " g_z = generator_model.predict([z, labels])\n", - " gen_samples = pd.DataFrame(g_z, columns=data_cols+label_cols)\n", + " gen_samples = DataFrame(g_z, columns=num_cols+class_labels)\n", " for group, color, marker, label in zip( gen_samples.groupby('Class_1'), colors, markers, class_labels ):\n", - " plt.scatter( group[1][[col1]], group[1][[col2]], \n", + " plt.scatter( group[1][[col1]], group[1][[col2]],\n", " label=label, marker=marker, edgecolors=color, facecolors='none' )\n", " else:\n", " g_z = generator_model.predict(z)\n", - " gen_samples = pd.DataFrame(g_z, columns=data_cols+['label'])\n", + " gen_samples = DataFrame(g_z, columns=num_cols+class_labels)\n", " gen_samples.to_csv('../../data/Generated_sample.csv')\n", " plt.scatter( gen_samples[[col1]], gen_samples[[col2]],\n", " label=class_labels[0], marker=markers[0], edgecolors=colors[0], facecolors='none' )\n", - " plt.title(model_name) \n", + " plt.title(model_name)\n", " plt.xlabel(col1)\n", " ax.set_xlim(xlims), ax.set_ylim(ylims)\n", "\n", @@ -443,7 +1341,7 @@ "plt.tight_layout(rect=[0.075,0,1,0.95])\n", "\n", "# Adding text labels for training steps\n", - "vpositions = np.array([ i._position.bounds[1] for i in axarr ])\n", + "vpositions = array([ i._position.bounds[1] for i in axarr ])\n", "vpositions += ((vpositions[0] - vpositions[1]) * 0.35 )\n", "for model_step_ix, model_step in enumerate( model_steps ):\n", " fig.text( 0.05, vpositions[model_step_ix], 'training\\nstep\\n'+str(model_step), ha='center', va='center', size=12)\n", @@ -451,31 +1349,34 @@ "if not os.path.exists(\"./img\"):\n", " os.makedirs(\"./img\")\n", "plt.savefig('img/Comparison_of_GAN_outputs.png', dpi=100)" - ], - "outputs": [], - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "5wl54IVkl4fe", - "outputId": "7f131092-2e97-4a95-eb93-d97b2f991321", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - } - } + ] + } + ], + "metadata": { + "colab": { + "name": "gan_example.ipynb", + "provenance": [] }, - { - "cell_type": "code", - "execution_count": null, - "source": [], - "outputs": [], - "metadata": { - 
"collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "interpreter": { + "hash": "e9b94595181d602aedee98e3f77f0c817ffdebb9c945d905c1f334bc0562225d" + }, + "kernelspec": { + "display_name": "Python 3.8.11 64-bit ('ydata_synth': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/regular/wgan_example.py b/examples/regular/wgan_example.py index 620b4176..e5733b4f 100644 --- a/examples/regular/wgan_example.py +++ b/examples/regular/wgan_example.py @@ -1,10 +1,11 @@ #Install ydata-synthetic lib # pip install ydata-synthetic import sklearn.cluster as cluster +import pandas as pd +import numpy as np from ydata_synthetic.synthesizers import ModelParameters, TrainParameters from ydata_synthetic.synthesizers.regular import WGAN -from ydata_synthetic.preprocessing.regular.credit_fraud import * model = WGAN @@ -12,25 +13,20 @@ data = pd.read_csv('data/creditcard.csv', index_col=[0]) #Data processing and analysis -data_cols = list(data.columns[ data.columns != 'Class' ]) -label_cols = ['Class'] +num_cols = list(data.columns[ data.columns != 'Class' ]) +cat_cols = ['Class'] -print('Dataset columns: {}'.format(data_cols)) +print('Dataset columns: {}'.format(num_cols)) sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] processed_data = data[ sorted_cols ].copy() -#Before training the GAN do not forget to apply the required data transformations -#To ease here we've applied a PowerTransformation -_, data, _ = transformations(data) - #For the purpose of this example we will only synthesize the minority class train_data = data.loc[ data['Class']==1 ].copy() print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) - algorithm = cluster.KMeans args, kwds = (), {'n_clusters':2, 'random_state':0} -labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ]) +labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) @@ -50,18 +46,10 @@ beta_2 = 0.9 models_dir = './cache' -train_sample = fraud_w_classes.copy().reset_index(drop=True) -train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True) -label_cols = [ i for i in train_sample.columns if 'Class' in i ] -data_cols = [ i for i in train_sample.columns if i not in label_cols ] -train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn -train_no_label = train_sample[ data_cols ] - gan_args = ModelParameters(batch_size=batch_size, lr=learning_rate, betas=(beta_1, beta_2), noise_dim=noise_dim, - n_cols=train_sample.shape[1], layers_dim=dim) train_args = TrainParameters(epochs=epochs, @@ -72,7 +60,7 @@ #Training the WGAN_GP model synthesizer = model(gan_args, n_critic=2) -synthesizer.train(train_sample, train_args) +synthesizer.train(data = fraud_w_classes, train_arguments=train_args, num_cols = num_cols, cat_cols = cat_cols) #Saving the synthesizer to later generate 
new events synthesizer.save(path='models/wgan_creditcard.pkl') @@ -83,6 +71,3 @@ #Sampling the data #Note that the data returned is not inverse processed. data_sample = synth.sample(100000) - - - diff --git a/requirements.txt b/requirements.txt index 99f1b38e..b140d5ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,5 @@ easydict==1.9 pmlb==1.0.* tqdm<5.0 typeguard==2.13.* +pytest==6.2.* +tensorflow_probability==0.12.* diff --git a/src/ydata_synthetic/preprocessing/base_processor.py b/src/ydata_synthetic/preprocessing/base_processor.py index 26cc4ca2..7e02d983 100644 --- a/src/ydata_synthetic/preprocessing/base_processor.py +++ b/src/ydata_synthetic/preprocessing/base_processor.py @@ -1,99 +1,127 @@ -from typing import List, Union +"Base class of Data Preprocessors, do not instantiate this class directly." +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections import namedtuple +from typing import List, Optional from numpy import concatenate, ndarray, split, zeros -from pandas import concat, DataFrame +from pandas import DataFrame, Series, concat from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import NotFittedError from typeguard import typechecked +ProcessorInfo = namedtuple("ProcessorInfo", ["numerical", "categorical"]) +PipelineInfo = namedtuple("PipelineInfo", ["feat_names_in", "feat_names_out"]) + +# pylint: disable=R0902 @typechecked -class BaseProcessor(BaseEstimator, TransformerMixin): +class BaseProcessor(ABC, BaseEstimator, TransformerMixin): """ - Base class for Data Preprocessing. It is a base version and should not be instantiated directly. - It works like any other transformer in scikit learn with the methods fit, transform and inverse transform. + This data processor works like a scikit learn transformer, with the methods fit, transform and inverse transform. Args: - num_cols (list of strings/list of ints): - List of names of numerical columns or positional indexes (if pos_idx was set to True). - cat_cols (list of strings/list of ints): - List of names of categorical columns or positional indexes (if pos_idx was set to True). - pos_idx (bool): - Specifies if the passed col IDs are names or positional indexes (column numbers). + num_cols (list of strings): + List of names of numerical columns. + cat_cols (list of strings): + List of names of categorical columns. 
""" - def __init__(self, *, num_cols: Union[List[str], List[int]] = None, cat_cols: Union[List[str], List[int]] = None, - pos_idx: bool = False): + def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None): self.num_cols = [] if num_cols is None else num_cols self.cat_cols = [] if cat_cols is None else cat_cols - self.num_col_idx_ = None - self.cat_col_idx_ = None - - self.num_pipeline = None # To be overriden by child processors - - self.cat_pipeline = None # To be overriden by child processors - - self._types = None - self.col_order_ = None - self.pos_idx = pos_idx - - def fit(self, X: DataFrame): + self._num_pipeline = None # To be overriden by child processors + self._cat_pipeline = None # To be overriden by child processors + + self._col_transform_info = None # Metadata object mapping inputs/outputs of each pipeline + + @property + def num_pipeline(self) -> BaseEstimator: + """Returns the pipeline applied to numerical columns.""" + return self._num_pipeline + + @property + def cat_pipeline(self) -> BaseEstimator: + """Returns the pipeline applied to categorical columns.""" + return self._cat_pipeline + + @property + def types(self) -> Series: + """Returns a Series with the dtypes of each column in the fitted DataFrame.""" + return self._types + + @property + def col_transform_info(self) -> ProcessorInfo: + """Returns a ProcessorInfo object specifying input/output feature mappings of this processor's pipelines.""" + self._check_is_fitted() + if self._col_transform_info is None: + self._col_transform_info = self.__create_metadata_synth() + return self._col_transform_info + + def __create_metadata_synth(self): + num_info = PipelineInfo([], []) + cat_info = PipelineInfo([], []) + # Numerical ls named tuple + if self.num_cols: + num_info = PipelineInfo(self.num_pipeline.feature_names_in_, self.num_pipeline.get_feature_names_out()) + # Categorical ls named tuple + if self.cat_cols: + cat_info = PipelineInfo(self.cat_pipeline.feature_names_in_, self.cat_pipeline.get_feature_names_out()) + return ProcessorInfo(num_info, cat_info) + + def _check_is_fitted(self): + """Checks if the processor is fitted by testing the numerical pipeline. + Raises NotFittedError if not.""" + if self._num_pipeline is None: + raise NotFittedError("This data processor has not yet been fitted.") + + def _validate_cols(self, x_cols): + """Ensures validity of the passed numerical and categorical columns. + The following is verified: + 1) Num cols and cat cols are disjoint sets; + 2) The union of these sets should equal x_cols;. + Assertion errors are raised in case any of the tests fails.""" + missing = set(x_cols).difference(set(self.num_cols).union(set(self.cat_cols))) + intersection = set(self.num_cols).intersection(set(self.cat_cols)) + assert intersection == set(), f"num_cols and cat_cols share columns {intersection} but should be disjoint." + assert missing == set(), f"The columns {missing} of the provided dataset were not attributed to a pipeline." + + # pylint: disable=C0103 + @abstractmethod + def fit(self, X: DataFrame) -> BaseProcessor: """Fits the DataProcessor to a passed DataFrame. Args: X (DataFrame): DataFrame used to fit the processor parameters. Should be aligned with the num/cat columns defined in initialization. + Returns: + self (DataProcessor): The fitted data processor. 
""" - if self.pos_idx: - self.num_cols = list(X.columns[self.num_cols]) - self.cat_cols = list(X.columns[self.cat_cols]) - self.col_order_ = [c for c in X.columns if c in self.num_cols + self.cat_cols] - self._types = X.dtypes - - self.num_pipeline.fit(X[self.num_cols]) if self.num_cols else zeros([len(X), 0]) - self.cat_pipeline.fit(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]) - - return self + raise NotImplementedError + # pylint: disable=C0103 + @abstractmethod def transform(self, X: DataFrame) -> ndarray: """Transforms the passed DataFrame with the fit DataProcessor. Args: X (DataFrame): DataFrame used to fit the processor parameters. - Should be aligned with the num/cat columns defined in initialization. + Should be aligned with the columns types defined in initialization. Returns: - transformed (ndarray): - Processed version of the passed DataFrame. + transformed (ndarray): Processed version of the passed DataFrame. """ - num_data = self.num_pipeline.transform(X[self.num_cols]) if self.num_cols else zeros([len(X), 0]) - cat_data = self.cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]) - - transformed = concatenate([num_data, cat_data], axis=1) - - self.num_col_idx_ = num_data.shape[1] - self.cat_col_idx_ = self.num_col_idx_ + cat_data.shape[1] - - return transformed + raise NotImplementedError + # pylint: disable=C0103 + @abstractmethod def inverse_transform(self, X: ndarray) -> DataFrame: """Inverts the data transformation pipelines on a passed DataFrame. Args: X (ndarray): Numpy array to be brought back to the original data format. Should share the schema of data transformed by this DataProcessor. - Can be used to revert transformations of training data or for + Can be used to revert transformations of training data or for synthetic samples. Returns: result (DataFrame): - DataFrame with inverted + DataFrame with all performed transformations inverted. 
""" - num_data, cat_data, _ = split(X, [self.num_col_idx_, self.cat_col_idx_], axis=1) - - num_data = self.num_pipeline.inverse_transform(num_data) if self.num_cols else zeros([len(X), 0]) - cat_data = self.cat_pipeline.inverse_transform(cat_data) if self.cat_cols else zeros([len(X), 0]) - - result = concat([DataFrame(num_data, columns=self.num_cols), - DataFrame(cat_data, columns=self.cat_cols),], axis=1) - - result = result.loc[:, self.col_order_] - - for col in result.columns: - result[col]=result[col].astype(self._types[col]) - - return result + raise NotImplementedError diff --git a/src/ydata_synthetic/preprocessing/regular/adult.py b/src/ydata_synthetic/preprocessing/regular/adult.py deleted file mode 100644 index 441341c8..00000000 --- a/src/ydata_synthetic/preprocessing/regular/adult.py +++ /dev/null @@ -1,36 +0,0 @@ -import pandas as pd - -from sklearn.preprocessing import OneHotEncoder, StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer - -from pmlb import fetch_data - -def transformations(): - data = fetch_data('adult') - - numerical_features = ['age', 'fnlwgt', - 'capital-gain', 'capital-loss', - 'hours-per-week'] - numerical_transformer = Pipeline(steps=[ - ('scaler', StandardScaler())]) - - categorical_features = ['workclass','education', 'marital-status', - 'occupation', 'relationship', - 'race', 'sex'] - categorical_transformer = Pipeline(steps=[ - ('onehot', OneHotEncoder(handle_unknown='ignore'))]) - - remaining_features = ['education-num', 'native-country','target'] - remaining_transformer = 'passthrough' - preprocessor = ColumnTransformer( - transformers=[ - ('num', numerical_transformer, numerical_features), - ('cat', categorical_transformer, categorical_features), - ('remaining', remaining_transformer, remaining_features)]) - - processed_data = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(data)) - - return data, processed_data, preprocessor - - diff --git a/src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py b/src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py deleted file mode 100644 index 5f409fb6..00000000 --- a/src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd - -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer - -from pmlb import fetch_data - -def transformations(): - data = fetch_data('breast_cancer_wisconsin') - - scaler = StandardScaler() - processed_data = scaler.fit_transform(data) - processed_data = pd.DataFrame(processed_data) - - return data, processed_data, scaler - - -if __name__ == '__main__': - - data = transformations() - - print(data) diff --git a/src/ydata_synthetic/preprocessing/regular/cardiovascular.py b/src/ydata_synthetic/preprocessing/regular/cardiovascular.py deleted file mode 100644 index 7fcccff9..00000000 --- a/src/ydata_synthetic/preprocessing/regular/cardiovascular.py +++ /dev/null @@ -1,25 +0,0 @@ -import pandas as pd - -from sklearn.preprocessing import OneHotEncoder, StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer - -def transformations(data): - categorical_features = ['gender', 'cardio', 'active', 'alco', 'smoke', 'gluc', - 'cholesterol'] - numerical_features = [ 'height', 'weight', 'ap_hi', 'ap_lo'] - - numerical_transformer = Pipeline(steps=[ - ('onehot', StandardScaler())]) - - categorical_transformer = Pipeline(steps=[ - 
('onehot', OneHotEncoder(handle_unknown='ignore'))]) - - preprocessor = ColumnTransformer( - transformers=[ - ('num', numerical_transformer, numerical_features), - ('cat', categorical_transformer, categorical_features)]) - - processed_data = preprocessor.fit_transform(data) - processed_data = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(processed_data)) - return data, processed_data, preprocessor diff --git a/src/ydata_synthetic/preprocessing/regular/credit_fraud.py b/src/ydata_synthetic/preprocessing/regular/credit_fraud.py deleted file mode 100644 index e5bc23ef..00000000 --- a/src/ydata_synthetic/preprocessing/regular/credit_fraud.py +++ /dev/null @@ -1,23 +0,0 @@ -#Data transformations to be applied -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import math - -from sklearn.preprocessing import PowerTransformer -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer - -def transformations(data): - #Log transformation to Amount variable - processed_data = data.copy() - data_cols = list(data.columns[data.columns != 'Class']) - - data_transformer = Pipeline(steps=[ - ('PowerTransformer', PowerTransformer(method='yeo-johnson', standardize=True, copy=True))]) - - preprocessor = ColumnTransformer( - transformers = [('power', data_transformer, data_cols)]) - processed_data[data_cols] = preprocessor.fit_transform(data[data_cols]) - - return data, processed_data, preprocessor diff --git a/src/ydata_synthetic/preprocessing/regular/processor.py b/src/ydata_synthetic/preprocessing/regular/processor.py index d369da2b..0278d482 100644 --- a/src/ydata_synthetic/preprocessing/regular/processor.py +++ b/src/ydata_synthetic/preprocessing/regular/processor.py @@ -1,5 +1,10 @@ -from typing import List, Union +"Implementation of a Regular DataProcessor." +from __future__ import annotations +from typing import List, Optional + +from numpy import concatenate, ndarray, split, zeros +from pandas import DataFrame, concat from sklearn.pipeline import Pipeline from sklearn.preprocessing import MinMaxScaler, OneHotEncoder from typeguard import typechecked @@ -12,21 +17,92 @@ class RegularDataProcessor(BaseProcessor): Main class for Regular/Tabular Data Preprocessing. It works like any other transformer in scikit learn with the methods fit, transform and inverse transform. Args: - num_cols (list of strings/list of ints): - List of names of numerical columns or positional indexes (if pos_idx was set to True). - cat_cols (list of strings/list of ints): - List of names of categorical columns or positional indexes (if pos_idx was set to True). - pos_idx (bool): - Specifies if the passed col IDs are names or positional indexes (column numbers). + num_cols (list of strings): + List of names of numerical columns. + cat_cols (list of strings): + List of names of categorical columns. """ - def __init__(self, *, num_cols: Union[List[str], List[int]] = None, cat_cols: Union[List[str], List[int]] = None, - pos_idx: bool = False): - super().__init__(num_cols = num_cols, cat_cols = cat_cols, pos_idx = pos_idx) + def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None): + super().__init__(num_cols, cat_cols) + + self._col_order_ = None + self._num_col_idx_ = None + self._cat_col_idx_ = None + + # pylint: disable=W0106 + def fit(self, X: DataFrame) -> RegularDataProcessor: + """Fits the DataProcessor to a passed DataFrame. + Args: + X (DataFrame): + DataFrame used to fit the processor parameters. 
+ Should be aligned with the num/cat columns defined in initialization. + Returns: + self (RegularDataProcessor): The fitted data processor. + """ + self._validate_cols(X.columns) + + self._col_order_ = [c for c in X.columns if c in self.num_cols + self.cat_cols] + + self._types = X.dtypes - self.num_pipeline = Pipeline([ + self._num_pipeline = Pipeline([ ("scaler", MinMaxScaler()), ]) - - self.cat_pipeline = Pipeline([ - ("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')) + self._cat_pipeline = Pipeline([ + ("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')), ]) + + self.num_pipeline.fit(X[self.num_cols]) if self.num_cols else zeros([len(X), 0]) + self.cat_pipeline.fit(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]) + + self._num_col_idx_ = len(self.num_pipeline.get_feature_names_out()) + self._cat_col_idx_ = self._num_col_idx_ + len(self.cat_pipeline.get_feature_names_out()) + + return self + + def transform(self, X: DataFrame) -> ndarray: + """Transforms the passed DataFrame with the fit DataProcessor. + Args: + X (DataFrame): + DataFrame used to fit the processor parameters. + Should be aligned with the columns types defined in initialization. + Returns: + transformed (ndarray): + Processed version of the passed DataFrame. + """ + self._check_is_fitted() + + num_data = self.num_pipeline.transform(X[self.num_cols]) if self.num_cols else zeros([len(X), 0]) + cat_data = self.cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]) + + transformed = concatenate([num_data, cat_data], axis=1) + + return transformed + + def inverse_transform(self, X: ndarray) -> DataFrame: + """Inverts the data transformation pipelines on a passed DataFrame. + Args: + X (ndarray): + Numpy array to be brought back to the original data format. + Should share the schema of data transformed by this DataProcessor. + Can be used to revert transformations of training data or for synthetic samples. + Returns: + result (DataFrame): + DataFrame with all performed transformations inverted. + """ + self._check_is_fitted() + + num_data, cat_data, _ = split(X, [self._num_col_idx_, self._cat_col_idx_], axis=1) + + num_data = self.num_pipeline.inverse_transform(num_data) if self.num_cols else zeros([len(X), 0]) + cat_data = self.cat_pipeline.inverse_transform(cat_data) if self.cat_cols else zeros([len(X), 0]) + + result = concat([DataFrame(num_data, columns=self.num_cols), + DataFrame(cat_data, columns=self.cat_cols)], axis=1) + + result = result.loc[:, self._col_order_] + + for col in result.columns: + result[col]=result[col].astype(self._types[col]) + + return result diff --git a/src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py b/src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py new file mode 100644 index 00000000..3a91983e --- /dev/null +++ b/src/ydata_synthetic/preprocessing/timeseries/timeseries_processor.py @@ -0,0 +1,15 @@ +"Implementation of a TimeSeries DataProcessor." +from typing import List, Optional + +from typeguard import typechecked + +from ydata_synthetic.preprocessing.base_processor import BaseProcessor + + +@typechecked +class TimeSeriesDataProcessor(BaseProcessor): + """ + Not implemented. 
+ """ + def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None): + raise NotImplementedError diff --git a/src/ydata_synthetic/synthesizers/gan.py b/src/ydata_synthetic/synthesizers/gan.py index ee4a20ca..d7e952c4 100644 --- a/src/ydata_synthetic/synthesizers/gan.py +++ b/src/ydata_synthetic/synthesizers/gan.py @@ -1,14 +1,20 @@ +"Implements a GAN BaseModel synthesizer, not meant to be directly instantiated." from collections import namedtuple -from typing import Union - -from pandas import DataFrame, concat -from numpy import array +from enum import Enum +from typing import List, Optional, Union +import tensorflow as tf import tqdm from joblib import dump, load -import tensorflow as tf +from numpy import array, vstack +from pandas import DataFrame from tensorflow import config as tfconfig +from typeguard import typechecked +from ydata_synthetic.preprocessing.regular.processor import \ + RegularDataProcessor +from ydata_synthetic.preprocessing.timeseries.timeseries_processor import \ + TimeSeriesDataProcessor from ydata_synthetic.synthesizers.saving_keras import make_keras_picklable _model_parameters = ['batch_size', 'lr', 'betas', 'layers_dim', 'noise_dim', @@ -21,7 +27,34 @@ ModelParameters = namedtuple('ModelParameters', _model_parameters, defaults=_model_parameters_df) TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None)) + +class RegularModels(Enum): + "Supported models for the Regular Data Processor." + CGAN = 'CGAN' + CRAMERGAN = 'CramerGAN' + DRAGAN = 'DRAGAN' + GAN = 'VanillaGAN' + WGAN = 'WGAN' + WGAN_GP = 'WGAN_GP' + + +class TimeSeriesModels(Enum): + "Supported models for the TimeSeries Data Processor." + TIMEGAN = 'TIMEGAN' + TSCWGAN = 'TSCWGAN' + +# pylint: disable=R0902 +@typechecked class BaseModel(): + """ + Base class of GAN synthesizer models. + The main methods are train (for fitting the synthesizer), save/load and sample (obtain synthetic records). + Args: + model_parameters (ModelParameters): + Set of architectural parameters for model definition. + """ + __MODEL__ = None + def __init__( self, model_parameters: ModelParameters @@ -30,11 +63,11 @@ def __init__( if len(gpu_devices) > 0: try: tfconfig.experimental.set_memory_growth(gpu_devices[0], True) - except: + except (ValueError, RuntimeError): # Invalid device or cannot modify virtual devices once initialized. pass #Validate the provided model parameters - if model_parameters.betas!=None: + if model_parameters.betas is not None: assert len(model_parameters.betas) == 2, "Please provide the betas information as a tuple." self.batch_size = model_parameters.batch_size @@ -42,65 +75,92 @@ def __init__( self.beta_1 = model_parameters.betas[0] self.beta_2 = model_parameters.betas[1] self.noise_dim = model_parameters.noise_dim - self.data_dim = model_parameters.n_cols + self.data_dim = None self.layers_dim = model_parameters.layers_dim - self.define_gan() + self.processor = None + # pylint: disable=E1101 def __call__(self, inputs, **kwargs): return self.model(inputs=inputs, **kwargs) + # pylint: disable=C0103 def _set_lr(self, lr): if isinstance(lr, float): self.g_lr=lr self.d_lr=lr - elif isinstance(lr,list) or isinstance(lr, tuple): + elif isinstance(lr,(list, tuple)): assert len(lr)==2, "Please provide a tow values array for the learning rates or a float." self.g_lr=lr[0] self.d_lr=lr[1] def define_gan(self): + """Define the trainable model components. 
+ Optionally validate model structure with mock inputs and initialize optimizers.""" raise NotImplementedError - @property - def trainable_variables(self, network): - return network.trainable_variables - @property def model_parameters(self): + "Returns the parameters of the model." return self._model_parameters @property def model_name(self): + "Returns the model (class) name." return self.__class__.__name__ def train(self, data: Union[DataFrame, array], - train_arguments: TrainParameters): - raise NotImplementedError - - def sample(self, n_samples): + num_cols: Optional[List[str]] = None, + cat_cols: Optional[List[str]] = None, + preprocess: bool = True) -> Union[DataFrame, array]: + """Sets up the train session by instantiating an appropriate processor, fitting and storing it as an attribute. + Args: + data (Union[DataFrame, array]): Raw data object. + num_cols (Optional[List[str]]): List of names of numerical columns. + cat_cols (Optional[List[str]]): List of names of categorical columns. + preprocess (bool): Determines if the preprocessor is to be run on the data or not (e.g. pass False when the data is already preprocessed). + """ + if preprocess: + if self.__MODEL__ in RegularModels.__members__: + self.processor = RegularDataProcessor + elif self.__MODEL__ in TimeSeriesModels.__members__: + self.processor = TimeSeriesDataProcessor + else: + print(f'A DataProcessor is not available for the {self.__MODEL__}.') + self.processor = self.processor(num_cols = num_cols, cat_cols = cat_cols).fit(data) + + def sample(self, n_samples: int): + """Generate n_samples synthetic records from the synthesizer. + The number of records returned is always a multiple of batch_size (it can return an excess of up to batch_size records). + The samples are returned in the original data format, with any internal preprocessing inverted. + + Args: + n_samples (int): Intended size of the synthetic sample. + """ steps = n_samples // self.batch_size + 1 data = [] for _ in tqdm.trange(steps, desc='Synthetic data generation'): - z = tf.random.uniform([self.batch_size, self.noise_dim]) - records = tf.make_ndarray(tf.make_tensor_proto(self.generator(z, training=False))) - data.append(DataFrame(records)) - return concat(data) + z = tf.random.uniform([self.batch_size, self.noise_dim], dtype=tf.dtypes.float32) + records = self.generator(z, training=False).numpy() + data.append(records) + return self.processor.inverse_transform(array(vstack(data))) def save(self, path): + "Saves the pickled synthesizer instance in the given path." #Save only the generator? if self.__MODEL__=='WGAN' or self.__MODEL__=='WGAN_GP': - self.critic=None + del self.critic make_keras_picklable() dump(self, path) @staticmethod def load(path): + "Loads a pickled synthesizer from the given path." gpu_devices = tf.config.list_physical_devices('GPU') if len(gpu_devices) > 0: try: tfconfig.experimental.set_memory_growth(gpu_devices[0], True) - except: + except (ValueError, RuntimeError): # Invalid device or cannot modify virtual devices once initialized. 
pass synth = load(path) diff --git a/src/ydata_synthetic/synthesizers/regular/__init__.py b/src/ydata_synthetic/synthesizers/regular/__init__.py index 9f0464da..435274cb 100644 --- a/src/ydata_synthetic/synthesizers/regular/__init__.py +++ b/src/ydata_synthetic/synthesizers/regular/__init__.py @@ -4,6 +4,7 @@ from ydata_synthetic.synthesizers.regular.wgangp.model import WGAN_GP from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN +from ydata_synthetic.synthesizers.regular.pategan.model import PATEGAN __all__ = [ "VanilllaGAN", @@ -11,5 +12,6 @@ "WGAN", "WGAN_GP", "DRAGAN", - "CRAMERGAN" + "CRAMERGAN", + "PATEGAN" ] diff --git a/src/ydata_synthetic/synthesizers/regular/cgan/model.py b/src/ydata_synthetic/synthesizers/regular/cgan/model.py index 1f810c35..0bc041b9 100644 --- a/src/ydata_synthetic/synthesizers/regular/cgan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/cgan/model.py @@ -1,20 +1,23 @@ +"""CGAN implementation""" import os from os import path -from typing import Union -from tqdm import trange +from typing import List, Union, Optional, NamedTuple import numpy as np -from numpy import array +from numpy import array, vstack, empty, hstack, ndarray +from numpy.random import normal from pandas import DataFrame +from tensorflow import convert_to_tensor, dtypes, expand_dims, tile +from tensorflow import data as tfdata +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Dropout, Input, Flatten, Embedding, multiply +from tensorflow.keras.optimizers import Adam +from tqdm import trange -from ydata_synthetic.synthesizers.gan import BaseModel from ydata_synthetic.synthesizers import TrainParameters +from ydata_synthetic.synthesizers.gan import BaseModel +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface -import tensorflow as tf -from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Embedding, multiply -from tensorflow.keras import Model - -from tensorflow.keras.optimizers import Adam class CGAN(BaseModel): @@ -22,11 +25,13 @@ class CGAN(BaseModel): def __init__(self, model_parameters, num_classes): self.num_classes = num_classes + self.label_col = None super().__init__(model_parameters) - def define_gan(self): + def define_gan(self, processor_info: Optional[NamedTuple] = None): self.generator = Generator(self.batch_size, self.num_classes). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim) + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info = processor_info) self.discriminator = Discriminator(self.batch_size, self.num_classes). \ build_model(input_shape=(self.data_dim,), dim=self.layers_dim) @@ -41,7 +46,7 @@ def define_gan(self): # The generator takes noise as input and generates imgs z = Input(shape=(self.noise_dim,)) - label = Input(shape=(1,)) + label = Input(shape=(1,)) # A label vector is expected record = self.generator([z, label]) # For the combined model we will only train the generator @@ -55,33 +60,56 @@ def define_gan(self): self._model = Model([z, label], validity) self._model.compile(loss='binary_crossentropy', optimizer=g_optimizer) + def _generate_noise(self): + "Gaussian noise for the generator input." + while True: + yield normal(size=self.noise_dim) + + def get_batch_noise(self): + "Create a batch iterator for the generator gaussian noise input." 
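+        # _generate_noise yields one Gaussian vector of length noise_dim at a time; batching and repeating
+        # the resulting Dataset makes next() on this iterator return an endless stream of
+        # (batch_size, noise_dim) float32 noise tensors for the training loop.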
+ return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=dtypes.float32) + .batch(self.batch_size) + .repeat()) + def get_data_batch(self, train, batch_size, seed=0): # # random sampling - some samples will have excessively low or high sampling, but easy to implement # np.random.seed(seed) - # x = train.loc[ np.random.choice(train.index, batch_size) ].values # iterate through shuffled indices, so every sample gets covered evenly start_i = (batch_size * seed) % len(train) stop_i = start_i + batch_size shuffle_seed = (batch_size * seed) // len(train) np.random.seed(shuffle_seed) - train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) # wasteful to shuffle every time - train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - x = train.loc[train_ix[start_i: stop_i]].values - return np.reshape(x, (batch_size, -1)) - - def train(self, data: Union[DataFrame, array], - label:str, - train_arguments:TrainParameters): + train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time + return train[train_ix[start_i: stop_i]] + + def train(self, data: Union[DataFrame, array], label_col: str, train_arguments: TrainParameters, num_cols: List[str], + cat_cols: List[str], preprocess: bool = True): """ Args: data: A pandas DataFrame or a Numpy array with the data to be synthesized label: The name of the column to be used as a label and condition for the training - train_arguments: Gan training arguments. - Returns: - A CGAN model fitted to the provided data + train_arguments: GAN training arguments. + num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + preprocess: If True preprocess the data before using in train session """ - iterations = int(abs(data.shape[0] / self.batch_size) + 1) + # Separating labels from the rest of the data to fit the data processor + data, label = data.loc[:, data.columns != label_col], expand_dims(data[label_col], 1) + self.label_col = label_col + + super().train(data, num_cols, cat_cols, preprocess) + + processed_data = self.processor.transform(data) + self.data_dim = processed_data.shape[1] + self.define_gan(self.processor.col_transform_info if preprocess else None) + + # Merging labels with processed data + processed_data = hstack([processed_data, label]) + + noise_batches = self.get_batch_noise() + + iterations = int(abs(processed_data.shape[0] / self.batch_size) + 1) # Adversarial ground truths valid = np.ones((self.batch_size, 1)) fake = np.zeros((self.batch_size, 1)) @@ -91,29 +119,29 @@ def train(self, data: Union[DataFrame, array], # --------------------- # Train Discriminator # --------------------- - batch_x = self.get_data_batch(data, self.batch_size) - label = batch_x[:, train_arguments.label_dim] - data_cols = [i for i in range(batch_x.shape[1] - 1)] # All data without the label columns - noise = tf.random.normal((self.batch_size, self.noise_dim)) + batch_x = self.get_data_batch(processed_data, self.batch_size) # Batches are retrieved with labels + batch_x, label = batch_x[:, :-1], batch_x[:, -1] # Separate labels from batch + noise = next(noise_batches) # Generate a batch of new records gen_records = self.generator([noise, label], training=True) # Train the discriminator - d_loss_real = self.discriminator.train_on_batch([batch_x[:, data_cols], label], valid) # Separate labels + d_loss_real = self.discriminator.train_on_batch([batch_x, 
label], valid) # Separate labels d_loss_fake = self.discriminator.train_on_batch([gen_records, label], fake) # Separate labels d_loss = 0.5 * np.add(d_loss_real, d_loss_fake) # --------------------- # Train Generator # --------------------- - noise = tf.random.normal((self.batch_size, self.noise_dim)) + noise = next(noise_batches) # Train the generator (to have the discriminator label samples as valid) g_loss = self._model.train_on_batch([noise, label], valid) # Plot the progress print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss)) + # If at save interval => save generated image samples if epoch % train_arguments.sample_interval == 0: # Test here data generation step @@ -124,17 +152,34 @@ def train(self, data: Union[DataFrame, array], self.generator.save_weights(model_checkpoint_base_name.format('generator', epoch)) self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator', epoch)) - #Here is generating synthetic data - z = tf.random.normal((432, self.noise_dim)) - label_z = tf.random.uniform((432,), minval=min(train_arguments.labels), maxval=max(train_arguments.labels)+1, dtype=tf.dtypes.int32) - gen_data = self.generator([z, label_z]) + #Here is generating synthetic data from an arbitrary condition + gen_data = self.sample(array([label[0]]), 1000) + + + def sample(self, condition: ndarray, n_samples: int,): + """Produce n_samples by conditioning the generator with condition.""" + assert condition.shape[0] == 1, \ + "A condition with cardinality one is expected." + steps = n_samples // self.batch_size + 1 + data = [] + z_dist = self.get_batch_noise() + condition = expand_dims(convert_to_tensor(condition, dtypes.float32), axis=0) + cond_seq = tile(condition, multiples=[self.batch_size, 1]) + for step in trange(steps, desc=f'Synthetic data generation'): + records = empty(shape=(self.batch_size, self.data_dim)) + records = self.generator([next(z_dist), cond_seq], training=False) + data.append(records) + data = self.processor.inverse_transform(array(vstack(data))) + data[self.label_col] = tile(condition, multiples=[data.shape[0], 1]) + return data + class Generator(): def __init__(self, batch_size, num_classes): self.batch_size = batch_size self.num_classes = num_classes - def build_model(self, input_shape, dim, data_dim): + def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None): noise = Input(shape=input_shape, batch_size=self.batch_size) label = Input(shape=(1,), batch_size=self.batch_size, dtype='int32') label_embedding = Flatten()(Embedding(self.num_classes, 1)(label)) @@ -144,8 +189,11 @@ def build_model(self, input_shape, dim, data_dim): x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info).call(x) return Model(inputs=[noise, label], outputs=x) + class Discriminator(): def __init__(self, batch_size, num_classes): self.batch_size = batch_size diff --git a/src/ydata_synthetic/synthesizers/regular/cramergan/model.py b/src/ydata_synthetic/synthesizers/regular/cramergan/model.py index c5b95974..f1a24260 100644 --- a/src/ydata_synthetic/synthesizers/regular/cramergan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/cramergan/model.py @@ -1,16 +1,19 @@ import os from os import path +from typing import List, Optional, NamedTuple + import numpy as np +import tensorflow as tf +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Dropout, Input 
+from tensorflow.keras.optimizers import Adam from tqdm import trange +from ydata_synthetic.synthesizers import TrainParameters from ydata_synthetic.synthesizers.gan import BaseModel from ydata_synthetic.synthesizers.loss import Mode, gradient_penalty -from ydata_synthetic.synthesizers import TrainParameters +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface -import tensorflow as tf -from tensorflow.keras.layers import Input, Dense, Dropout -from tensorflow.keras import Model -from tensorflow.keras.optimizers import Adam class CRAMERGAN(BaseModel): @@ -24,9 +27,10 @@ def __init__(self, model_parameters, gradient_penalty_weight=10): self.gradient_penalty_weight = gradient_penalty_weight super().__init__(model_parameters) - def define_gan(self): + def define_gan(self, processor_info: Optional[NamedTuple] = None): self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim) + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info=processor_info) self.critic = Critic(self.batch_size). \ build_model(input_shape=(self.data_dim,), dim=self.layers_dim) @@ -123,16 +127,30 @@ def get_data_batch(train, batch_size, seed=0): stop_i = start_i + batch_size shuffle_seed = (batch_size * seed) // len(train) np.random.seed(shuffle_seed) - train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) # wasteful to shuffle every time + train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - x = train.loc[train_ix[start_i: stop_i]].values - return np.reshape(x, (batch_size, -1)) + return train[train_ix[start_i: stop_i]] def train_step(self, train_data): critic_loss, g_loss = self.update_gradients(train_data) return critic_loss, g_loss - def train(self, data, train_arguments: TrainParameters): + def train(self, data, train_arguments: TrainParameters, num_cols: List[str], + cat_cols: List[str], preprocess: bool = True): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + train_arguments: GAN training arguments. 
+ num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + preprocess: If True preprocess the data before using in train session + """ + super().train(data, num_cols, cat_cols, preprocess) + + data = self.processor.transform(data) + self.data_dim = data.shape[1] + self.define_gan(self.processor.col_transform_info if preprocess else None) + iterations = int(abs(data.shape[0] / self.batch_size) + 1) # Create a summary file @@ -176,12 +194,14 @@ def __init__(self, batch_size): """Simple generator with dense feedforward layers.""" self.batch_size = batch_size - def build_model(self, input_shape, dim, data_dim): + def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None): input_ = Input(shape=input_shape, batch_size=self.batch_size) x = Dense(dim, activation='relu')(input_) x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info)(x) return Model(inputs=input_, outputs=x) class Critic(tf.keras.Model): diff --git a/src/ydata_synthetic/synthesizers/regular/dragan/model.py b/src/ydata_synthetic/synthesizers/regular/dragan/model.py index 22c5ce46..490131b5 100644 --- a/src/ydata_synthetic/synthesizers/regular/dragan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/dragan/model.py @@ -1,15 +1,17 @@ import os from os import path -import tqdm - +from typing import Optional, NamedTuple import tensorflow as tf -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.layers import Input, Dense, Dropout +import tqdm from tensorflow.keras import Model, initializers +from tensorflow.keras.layers import Dense, Dropout, Input +from tensorflow.keras.optimizers import Adam from ydata_synthetic.synthesizers.gan import BaseModel from ydata_synthetic.synthesizers.loss import Mode, gradient_penalty +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface + class DRAGAN(BaseModel): @@ -21,10 +23,11 @@ def __init__(self, model_parameters, n_discriminator, gradient_penalty_weight=10 self.gradient_penalty_weight = gradient_penalty_weight super().__init__(model_parameters) - def define_gan(self): + def define_gan(self, col_transform_info: Optional[NamedTuple] = None): # define generator/discriminator self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim) + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info=col_transform_info) self.discriminator = Discriminator(self.batch_size). 
\ build_model(input_shape=(self.data_dim,), dim=self.layers_dim) @@ -113,8 +116,14 @@ def train_step(self, train_data): d_loss, g_loss = self.update_gradients(train_data) return d_loss, g_loss - def train(self, data, train_arguments): - train_loader = self.get_data_batch(data, self.batch_size) + def train(self, data, train_arguments, num_cols, cat_cols, preprocess: bool = True): + super().train(data, num_cols, cat_cols, preprocess) + + processed_data = self.processor.transform(data) + self.data_dim = processed_data.shape[1] + self.define_gan(self.processor.col_transform_info if preprocess else None) + + train_loader = self.get_data_batch(processed_data, self.batch_size) # Create a summary file train_summary_writer = tf.summary.create_file_writer(path.join('..\dragan_test', 'summaries', 'train')) @@ -144,28 +153,29 @@ def train(self, data, train_arguments): class Discriminator(Model): - def __init__(self, batch_size): - self.batch_size = batch_size - - def build_model(self, input_shape, dim): - input = Input(shape=input_shape, batch_size=self.batch_size) - x = Dense(dim * 4, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input) - x = Dropout(0.1)(x) - x = Dense(dim * 2, activation='relu')(x) - x = Dropout(0.1)(x) - x = Dense(dim, activation='relu')(x) - x = Dense(1, activation='sigmoid')(x) - return Model(inputs=input, outputs=x) + def __init__(self, batch_size): + self.batch_size = batch_size + + def build_model(self, input_shape, dim): + input = Input(shape=input_shape, batch_size=self.batch_size) + x = Dense(dim * 4, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input) + x = Dropout(0.1)(x) + x = Dense(dim * 2, activation='relu')(x) + x = Dropout(0.1)(x) + x = Dense(dim, activation='relu')(x) + x = Dense(1, activation='sigmoid')(x) + return Model(inputs=input, outputs=x) class Generator(Model): - def __init__(self, batch_size): - self.batch_size = batch_size - - def build_model(self, input_shape, dim, data_dim): - input = Input(shape=input_shape, batch_size = self.batch_size) - x = Dense(dim, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input) - x = Dense(dim * 2, activation='relu')(x) - x = Dense(dim * 4, activation='relu')(x) - x = Dense(data_dim)(x) - return Model(inputs=input, outputs=x) - + def __init__(self, batch_size): + self.batch_size = batch_size + + def build_model(self, input_shape, dim, data_dim, processor_info: NamedTuple = None): + input = Input(shape=input_shape, batch_size = self.batch_size) + x = Dense(dim, kernel_initializer=initializers.TruncatedNormal(mean=0., stddev=0.5), activation='relu')(input) + x = Dense(dim * 2, activation='relu')(x) + x = Dense(dim * 4, activation='relu')(x) + x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info)(x) + return Model(inputs=input, outputs=x) diff --git a/src/ydata_synthetic/synthesizers/regular/pategan/__init__.py b/src/ydata_synthetic/synthesizers/regular/pategan/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ydata_synthetic/synthesizers/regular/pategan/model.py b/src/ydata_synthetic/synthesizers/regular/pategan/model.py new file mode 100644 index 00000000..650e67e8 --- /dev/null +++ b/src/ydata_synthetic/synthesizers/regular/pategan/model.py @@ -0,0 +1,256 @@ +"PATEGAN implementation supporting Differential Privacy budget specification." 
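+# The training scheme below follows the PATE idea: the data is split into n_teachers disjoint partitions
+# (see get_data_loader), one teacher discriminator is fitted per partition, and the student discriminator
+# only ever learns from teacher votes aggregated with Laplace noise (see _pate_voting), while _moments_acc
+# accumulates the privacy cost until target_epsilon is reached.
+# Rough usage sketch (hypothetical values, not part of this module; it assumes the TrainParameters instance
+# carries the fields referenced below: lr, num_teacher_iters, num_student_iters, num_moments and lap_scale):
+#     synth = PATEGAN(model_parameters, n_teachers=10, target_delta=1e-5, target_epsilon=1.0)
+#     synth.train(data, class_ratios, train_args, num_cols=num_cols, cat_cols=cat_cols)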
+# pylint: disable = W0622, E0401 +from math import log +from typing import List, NamedTuple, Optional + +import tqdm +from tensorflow import (GradientTape, clip_by_value, concat, constant, + expand_dims, ones_like, tensor_scatter_nd_update, + transpose, zeros, zeros_like) +from tensorflow.data import Dataset +from tensorflow.dtypes import cast, float64, int64 +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Input, ReLU +from tensorflow.keras.losses import BinaryCrossentropy +from tensorflow.keras.optimizers import Adam +from tensorflow.math import abs, exp, pow, reduce_sum, square +from tensorflow.random import uniform +from tensorflow_probability import distributions + +from ydata_synthetic.synthesizers import TrainParameters +from ydata_synthetic.synthesizers.gan import BaseModel +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface + + +# pylint: disable=R0902 +class PATEGAN(BaseModel): + "A basic PATEGAN synthesizer implementation with configurable differential privacy budget." + + __MODEL__='PATEGAN' + + def __init__(self, model_parameters, n_teachers: int, target_delta: float, target_epsilon: float): + super().__init__(model_parameters) + self.n_teachers = n_teachers + self.target_epsilon = target_epsilon + self.target_delta = target_delta + + # pylint: disable=W0201 + def define_gan(self, processor_info: Optional[NamedTuple] = None): + def discriminator(): + return Discriminator(self.batch_size).build_model((self.data_dim,), self.layers_dim) + + self.generator = Generator(self.batch_size). \ + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info=processor_info) + self.s_discriminator = discriminator() + self.t_discriminators = [discriminator() for i in range(self.n_teachers)] + + generator_optimizer = Adam(learning_rate=self.g_lr) + discriminator_optimizer = Adam(learning_rate=self.d_lr) + + loss_fn = BinaryCrossentropy(from_logits=True) + self.generator.compile(loss=loss_fn, optimizer=generator_optimizer) + self.s_discriminator.compile(loss=loss_fn, optimizer=discriminator_optimizer) + for teacher in self.t_discriminators: + teacher.compile(loss=loss_fn, optimizer=discriminator_optimizer) + + # pylint: disable = C0103 + @staticmethod + def _moments_acc(n_teachers, votes, lap_scale, l_list): + q = (2 + lap_scale * abs(2 * votes - n_teachers))/(4 * exp(lap_scale * abs(2 * votes - n_teachers))) + + update = [] + for l in l_list: + clip = 2 * square(lap_scale) * l * (l + 1) + t = (1 - q) * pow((1 - q) / (1 - exp(2*lap_scale) * q), l) + q * exp(2 * lap_scale * l) + update.append(reduce_sum(clip_by_value(t, clip_value_min=-clip, clip_value_max=clip))) + return cast(update, dtype=float64) + + def get_data_loader(self, data) -> List[Dataset]: + "Obtain a List of TF Datasets corresponding to partitions for each teacher in n_teachers." 
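+ # Each teacher receives a contiguous, non-overlapping slice of the data (the last teacher also takes any
+ # remainder), so no record is seen by more than one teacher discriminator.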
+ loader = [] + SHUFFLE_BUFFER_SIZE = 100 + + for teacher_id in range(self.n_teachers): + start_id = int(teacher_id * len(data) / self.n_teachers) + end_id = int((teacher_id + 1) * len(data) / self.n_teachers if \ + teacher_id != (self.n_teachers - 1) else len(data)) + loader.append(Dataset.from_tensor_slices(data[start_id:end_id:])\ + .batch(self.batch_size).shuffle(SHUFFLE_BUFFER_SIZE)) + return loader + + # pylint:disable=R0913 + def train(self, data, class_ratios, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + class_ratios: Distribution of the classes, used to sample the conditioning labels + train_arguments: GAN training arguments. + num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + """ + super().train(data, num_cols, cat_cols) + + data = self.processor.transform(data) + self.data_dim = data.shape[1] + self.define_gan(self.processor.col_transform_info) + + self.class_ratios = class_ratios + + alpha = cast([0.0 for _ in range(train_arguments.num_moments)], float64) + l_list = 1 + cast(range(train_arguments.num_moments), float64) + + # print("initial alpha", l_list.shape) + + cross_entropy = BinaryCrossentropy(from_logits=True) + + generator_optimizer = Adam(learning_rate=train_arguments.lr) + disc_opt_stu = Adam(learning_rate=train_arguments.lr) + disc_opt_t = [Adam(learning_rate=train_arguments.lr) for i in range(self.n_teachers)] + + train_loader = self.get_data_loader(data) + + steps = 0 + epsilon = 0 + + category_samples = distributions.Categorical(probs=self.class_ratios, dtype=float64) + + while epsilon < self.target_epsilon: + # train the teacher discriminators + for t_2 in range(train_arguments.num_teacher_iters): + for i in range(self.n_teachers): + inputs, categories = None, None + for b, data_ in enumerate(train_loader[i]): + inputs, categories = data_, b + #categories will give zero value in each loop as the loop breaks after running the first time + #inputs will have only the first batch of data + break + + with GradientTape() as disc_tape: + # train with real + dis_data = concat([inputs, zeros([inputs.shape[0], 1], dtype=float64)], 1) + # print("1st batch data", dis_data.shape) + real_output = self.t_discriminators[i](dis_data, training=True) + # print(real_output.shape, tf.ones.shape) + + # train with fake + z = uniform([inputs.shape[0], self.z_dim], dtype=float64) + # print("uniformly distributed noise", z.shape) + + sample = expand_dims(category_samples.sample(inputs.shape[0]), axis=1) + # print("category", sample.shape) + + fake = self.generator(concat([z, sample], 1)) + # print('fake', fake.shape) + + fake_output = self.t_discriminators[i](concat([fake, sample], 1), training=True) + # print('fake_output_dis', fake_output.shape) + + # print("watch", disc_tape.watch(self.teacher_disc[i].trainable_variables) + real_loss_disc = cross_entropy(ones_like(real_output), real_output) + fake_loss_disc = cross_entropy(zeros_like(fake_output), fake_output) + + disc_loss = real_loss_disc + fake_loss_disc + # print(disc_loss, real_loss_disc, fake_loss_disc) + + gradients_of_discriminator = disc_tape.gradient(disc_loss, self.t_discriminators[i].trainable_variables) + # print(gradients_of_discriminator) + + disc_opt_t[i].apply_gradients(zip(gradients_of_discriminator, self.t_discriminators[i].trainable_variables)) + + # train the student discriminator + for t_3 in range(train_arguments.num_student_iters): + z = uniform([inputs.shape[0], self.z_dim], dtype=float64) + + sample = expand_dims(category_samples.sample(inputs.shape[0]), axis=1) + # print("category_stu", sample.shape) + + with GradientTape() as stu_tape: + fake = self.generator(concat([z, sample], 1)) + # print('fake_stu', fake.shape) + + predictions, clean_votes = self._pate_voting( + concat([fake, sample], 1), self.t_discriminators, train_arguments.lap_scale) + # print("noisy_labels", predictions.shape, "clean_votes", clean_votes.shape) + outputs = self.s_discriminator(concat([fake, sample], 1)) + + # update the moments + alpha = alpha + self._moments_acc(self.n_teachers, clean_votes, train_arguments.lap_scale, l_list) + # print("final_alpha", alpha) + + stu_loss = cross_entropy(predictions, outputs) + gradients_of_stu = stu_tape.gradient(stu_loss, self.s_discriminator.trainable_variables) + # print(gradients_of_stu) + + disc_opt_stu.apply_gradients(zip(gradients_of_stu, self.s_discriminator.trainable_variables)) + + # train the generator + z = uniform([inputs.shape[0], self.z_dim], dtype=float64) + + sample_g = expand_dims(category_samples.sample(inputs.shape[0]), axis=1) + + with GradientTape() as gen_tape: + fake = self.generator(concat([z, sample_g], 1)) + output = self.s_discriminator(concat([fake, sample_g], 1)) + + loss_gen = cross_entropy(ones_like(output), output) + gradients_of_generator = gen_tape.gradient(loss_gen, self.generator.trainable_variables) + generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables)) + + # Calculate the current privacy cost + epsilon = min((alpha - log(self.target_delta)) / l_list) + if steps % 1 == 0: + print("Step : ", steps, "Loss SD : ", stu_loss, "Loss G : ", loss_gen, "Epsilon : ", epsilon) + + steps += 1 + # self.generator.summary() + + def _pate_voting(self, data, netTD, lap_scale): + # TODO: Validate the logic against original article + ## Collect the teachers' votes (1/0) from netTD for each record in data and store them in results + results = zeros([len(netTD), data.shape[0]], dtype=int64) + # print(results) + for i in range(len(netTD)): + output = netTD[i](data, training=True) + pred = transpose(cast((output > 0.5), int64)) + # print(pred) + results = tensor_scatter_nd_update(results, constant([[i]]), pred) + # print(results) + + # Store the sum of the votes assigned by each discriminator to each record (values between 0 and len(netTD)) + clean_votes = expand_dims(cast(reduce_sum(results, 0), dtype=float64), 1) + # print("clean_votes",clean_votes) + noise_sample = distributions.Laplace(loc=0, scale=1/lap_scale).sample(clean_votes.shape) + # print("noise_sample", noise_sample) + noisy_results = clean_votes + cast(noise_sample, float64) + noisy_labels = cast((noisy_results > len(netTD)/2), float64) + + return noisy_labels, clean_votes + + +class Discriminator(Model): + def __init__(self, batch_size): + self.batch_size = batch_size + + def build_model(self, input_shape, dim): + input = Input(shape=input_shape, batch_size=self.batch_size) + x = Dense(dim * 4)(input) + x = ReLU()(x) + x = Dense(dim * 2)(x) + x = Dense(1)(x) + return Model(inputs=input, outputs=x) + + +class Generator(Model): + def __init__(self, batch_size): + self.batch_size = batch_size + + def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None): + input = Input(shape=input_shape, batch_size = self.batch_size) + x = Dense(dim)(input) + x = ReLU()(x) + x = Dense(dim * 2)(x) + x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info, 
'ActivationInterface')(x) + return Model(inputs=input, outputs=x) diff --git a/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py b/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py index a0689cbc..a813f518 100644 --- a/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/vanillagan/model.py @@ -1,10 +1,12 @@ import os from os import path import numpy as np +from typing import List, Optional, NamedTuple from tqdm import trange from ydata_synthetic.synthesizers.gan import BaseModel from ydata_synthetic.synthesizers import TrainParameters +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface import tensorflow as tf from tensorflow.keras.layers import Input, Dense, Dropout @@ -18,9 +20,10 @@ class VanilllaGAN(BaseModel): def __init__(self, model_parameters): super().__init__(model_parameters) - def define_gan(self): + def define_gan(self, processor_info: Optional[NamedTuple]): self.generator = Generator(self.batch_size).\ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim) + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info = processor_info) self.discriminator = Discriminator(self.batch_size).\ build_model(input_shape=(self.data_dim,), dim=self.layers_dim) @@ -58,14 +61,26 @@ def get_data_batch(self, train, batch_size, seed=0): stop_i = start_i + batch_size shuffle_seed = (batch_size * seed) // len(train) np.random.seed(shuffle_seed) - train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) # wasteful to shuffle every time + train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - x = train.loc[train_ix[start_i: stop_i]].values - return np.reshape(x, (batch_size, -1)) + return train[train_ix[start_i: stop_i]] + + def train(self, data, train_arguments: TrainParameters, num_cols: List[str], + cat_cols: List[str], preprocess: bool = True): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + train_arguments: GAN training arguments. 
+ num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + preprocess: If True preprocess the data before using in train session + """ + super().train(data, num_cols, cat_cols, preprocess) + + processed_data = self.processor.transform(data) + self.data_dim = processed_data.shape[1] + self.define_gan(self.processor.col_transform_info if preprocess else None) - def train(self, - data, - train_arguments: TrainParameters): iterations = int(abs(data.shape[0]/self.batch_size)+1) # Adversarial ground truths @@ -77,7 +92,7 @@ def train(self, # --------------------- # Train Discriminator # --------------------- - batch_data = self.get_data_batch(data, self.batch_size) + batch_data = self.get_data_batch(processed_data, self.batch_size) noise = tf.random.normal((self.batch_size, self.noise_dim)) # Generate a batch of events @@ -118,12 +133,14 @@ class Generator(tf.keras.Model): def __init__(self, batch_size): self.batch_size=batch_size - def build_model(self, input_shape, dim, data_dim): + def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None): input= Input(shape=input_shape, batch_size=self.batch_size) x = Dense(dim, activation='relu')(input) x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info)(x) return Model(inputs=input, outputs=x) class Discriminator(tf.keras.Model): @@ -138,4 +155,4 @@ def build_model(self, input_shape, dim): x = Dropout(0.1)(x) x = Dense(dim, activation='relu')(x) x = Dense(1, activation='sigmoid')(x) - return Model(inputs=input, outputs=x) \ No newline at end of file + return Model(inputs=input, outputs=x) diff --git a/src/ydata_synthetic/synthesizers/regular/wgan/model.py b/src/ydata_synthetic/synthesizers/regular/wgan/model.py index fead3720..f58fc67b 100644 --- a/src/ydata_synthetic/synthesizers/regular/wgan/model.py +++ b/src/ydata_synthetic/synthesizers/regular/wgan/model.py @@ -1,15 +1,18 @@ -from os import path, mkdir -import numpy as np -from tqdm import trange - -from ydata_synthetic.synthesizers.gan import BaseModel -from ydata_synthetic.synthesizers import TrainParameters +from os import mkdir, path +from typing import List, Optional, NamedTuple +import numpy as np import tensorflow as tf -from tensorflow.keras.layers import Input, Dense, Dropout import tensorflow.keras.backend as K from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Dropout, Input from tensorflow.keras.optimizers import Adam +from tqdm import trange + +from ydata_synthetic.synthesizers import TrainParameters +from ydata_synthetic.synthesizers.gan import BaseModel +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface + #Auxiliary Keras backend class to calculate the Random Weighted average #https://stackoverflow.com/questions/58133430/how-to-substitute-keras-layers-merge-merge-in-tensorflow-keras @@ -39,9 +42,10 @@ def __init__(self, model_parameters, n_critic, clip_value=0.01): def wasserstein_loss(self, y_true, y_pred): return K.mean(y_true * y_pred) - def define_gan(self): + def define_gan(self, processor_info: Optional[NamedTuple] = None): self.generator = Generator(self.batch_size). 
\ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim) + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info=processor_info) self.critic = Critic(self.batch_size). \ build_model(input_shape=(self.data_dim,), dim=self.layers_dim) @@ -77,14 +81,26 @@ def get_data_batch(self, train, batch_size, seed=0): stop_i = start_i + batch_size shuffle_seed = (batch_size * seed) // len(train) np.random.seed(shuffle_seed) - train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) # wasteful to shuffle every time + train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - x = train.loc[train_ix[start_i: stop_i]].values - return np.reshape(x, (batch_size, -1)) + return train[train_ix[start_i: stop_i]] + + def train(self, data, train_arguments: TrainParameters, num_cols: List[str], + cat_cols: List[str], preprocess: bool = True): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + train_arguments: GAN training arguments. + num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + preprocess: If True preprocess the data before using in train session + """ + super().train(data, num_cols, cat_cols, preprocess) + + processed_data = self.processor.transform(data) + self.data_dim = processed_data.shape[1] + self.define_gan(self.processor.col_transform_info if preprocess else None) - def train(self, - data, - train_arguments: TrainParameters): #Create a summary file iterations = int(abs(data.shape[0]/self.batch_size)+1) train_summary_writer = tf.summary.create_file_writer(path.join('.', 'summaries', 'train')) @@ -100,7 +116,7 @@ def train(self, # --------------------- # Train the Critic # --------------------- - batch_data = self.get_data_batch(data, self.batch_size) + batch_data = self.get_data_batch(processed_data, self.batch_size) noise = tf.random.normal((self.batch_size, self.noise_dim)) # Generate a batch of events @@ -140,12 +156,14 @@ class Generator(tf.keras.Model): def __init__(self, batch_size): self.batch_size = batch_size - def build_model(self, input_shape, dim, data_dim): + def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None): input = Input(shape=input_shape, batch_size=self.batch_size) x = Dense(dim, activation='relu')(input) x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info)(x) return Model(inputs=input, outputs=x) class Critic(tf.keras.Model): diff --git a/src/ydata_synthetic/synthesizers/regular/wgangp/model.py b/src/ydata_synthetic/synthesizers/regular/wgangp/model.py index 4454e7a9..253b54e4 100644 --- a/src/ydata_synthetic/synthesizers/regular/wgangp/model.py +++ b/src/ydata_synthetic/synthesizers/regular/wgangp/model.py @@ -1,15 +1,18 @@ import os from os import path +from typing import List, NamedTuple, Optional + import numpy as np +import tensorflow as tf +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Dropout, Input +from tensorflow.keras.optimizers import Adam from tqdm import trange -from ydata_synthetic.synthesizers.gan import BaseModel from ydata_synthetic.synthesizers import TrainParameters +from 
ydata_synthetic.synthesizers.gan import BaseModel +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface -import tensorflow as tf -from tensorflow.keras.layers import Input, Dense, Dropout -from tensorflow.keras import Model -from tensorflow.keras.optimizers import Adam class WGAN_GP(BaseModel): @@ -22,9 +25,10 @@ def __init__(self, model_parameters, n_critic, gradient_penalty_weight=10): self.gradient_penalty_weight = gradient_penalty_weight super().__init__(model_parameters) - def define_gan(self): + def define_gan(self, processor_info: Optional[NamedTuple] = None): self.generator = Generator(self.batch_size). \ - build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim) + build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim, + processor_info=processor_info) self.critic = Critic(self.batch_size). \ build_model(input_shape=(self.data_dim,), dim=self.layers_dim) @@ -116,19 +120,30 @@ def get_data_batch(self, train, batch_size, seed=0): stop_i = start_i + batch_size shuffle_seed = (batch_size * seed) // len(train) np.random.seed(shuffle_seed) - train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) # wasteful to shuffle every time + train_ix = np.random.choice(train.shape[0], replace=False, size=len(train)) # wasteful to shuffle every time train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set - x = train.loc[train_ix[start_i: stop_i]].values - return np.reshape(x, (batch_size, -1)) + return train[train_ix[start_i: stop_i]] @tf.function def train_step(self, train_data): cri_loss, ge_loss = self.update_gradients(train_data) return cri_loss, ge_loss - def train(self, - data, - train_arguments: TrainParameters): + def train(self, data, train_arguments: TrainParameters, num_cols: List[str], + cat_cols: List[str], preprocess: bool = True): + """ + Args: + data: A pandas DataFrame or a Numpy array with the data to be synthesized + train_arguments: GAN training arguments. 
+ num_cols: List of columns of the data object to be handled as numerical + cat_cols: List of columns of the data object to be handled as categorical + preprocess: If True preprocess the data before using in train session + """ + super().train(data, num_cols, cat_cols, preprocess) + + processed_data = self.processor.transform(data) + self.data_dim = processed_data.shape[1] + self.define_gan(self.processor.col_transform_info if preprocess else None) iterations = int(abs(data.shape[0]/self.batch_size)+1) @@ -138,7 +153,7 @@ def train(self, with train_summary_writer.as_default(): for epoch in trange(train_arguments.epochs): for _ in range(iterations): - batch_data = self.get_data_batch(data, self.batch_size).astype(np.float32) + batch_data = self.get_data_batch(processed_data, self.batch_size).astype(np.float32) cri_loss, ge_loss = self.train_step(batch_data) print( @@ -163,12 +178,14 @@ class Generator(tf.keras.Model): def __init__(self, batch_size): self.batch_size = batch_size - def build_model(self, input_shape, dim, data_dim): + def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None): input = Input(shape=input_shape, batch_size=self.batch_size) x = Dense(dim, activation='relu')(input) x = Dense(dim * 2, activation='relu')(x) x = Dense(dim * 4, activation='relu')(x) x = Dense(data_dim)(x) + if processor_info: + x = ActivationInterface(processor_info)(x) return Model(inputs=input, outputs=x) class Critic(tf.keras.Model): diff --git a/src/ydata_synthetic/tests/custom_layers/test_activation_interface.py b/src/ydata_synthetic/tests/custom_layers/test_activation_interface.py new file mode 100644 index 00000000..b6bbec63 --- /dev/null +++ b/src/ydata_synthetic/tests/custom_layers/test_activation_interface.py @@ -0,0 +1,72 @@ +"Activation Interface layer test suite." +from itertools import cycle, islice +from re import search + +from numpy import array, cumsum, isin, split +from numpy import sum as npsum +from numpy.random import normal +from pandas import DataFrame, concat +from pytest import fixture +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Input + +from ydata_synthetic.preprocessing.regular.processor import \ + RegularDataProcessor +from ydata_synthetic.utils.gumbel_softmax import ActivationInterface + +BATCH_SIZE = 10 + +@fixture(name='noise_batch') +def fixture_noise_batch(): + "Sample noise for mock output generation." + return normal(size=(BATCH_SIZE, 16)) + +@fixture(name='mock_data') +def fixture_mock_data(): + "Creates mock data for the tests." + num_block = DataFrame(normal(size=(BATCH_SIZE, 6)), columns = [f'num_{i}' for i in range(6)]) + cat_block_1 = DataFrame(array(list(islice(cycle(range(2)), BATCH_SIZE))), columns = ['cat_0']) + cat_block_2 = DataFrame(array(list(islice(cycle(range(4)), BATCH_SIZE))), columns = ['cat_1']) + return concat([num_block, cat_block_1, cat_block_2], axis = 1) + +@fixture(name='mock_processor') +def fixture_mock_processor(mock_data): + "Creates a mock data processor for the mock data." + num_cols = [col for col in mock_data.columns if col.startswith('num')] + cat_cols = [col for col in mock_data.columns if col.startswith('cat')] + return RegularDataProcessor(num_cols, cat_cols).fit(mock_data) + +# pylint: disable=C0103 +@fixture(name='mock_generator') +def fixture_mock_generator(noise_batch, mock_processor): + "A mock generator with the Activation Interface as final layer." 
+    input_ = Input(shape=noise_batch.shape[1], batch_size = BATCH_SIZE)
+    dim = 15
+    data_dim = 12
+    x = Dense(dim, activation='relu')(input_)
+    x = Dense(dim * 2, activation='relu')(x)
+    x = Dense(dim * 4, activation='relu')(x)
+    x = Dense(data_dim)(x)
+    x = ActivationInterface(processor_info=mock_processor.col_transform_info, name='act_itf')(x)
+    return Model(inputs=input_, outputs=x)
+
+@fixture(name='mock_output')
+def fixture_mock_output(noise_batch, mock_generator):
+    "Returns mock output of the model as a numpy object."
+    return mock_generator(noise_batch).numpy()
+
+# pylint: disable=W0632
+def test_io(mock_processor, mock_output):
+    "Tests the output format of the activation interface for a known input."
+    num_lens = len(mock_processor.col_transform_info.numerical.feat_names_out)
+    cat_lens = len(mock_processor.col_transform_info.categorical.feat_names_out)
+    assert mock_output.shape == (BATCH_SIZE, num_lens + cat_lens), "The output has the wrong shape."
+    num_part, cat_part = split(mock_output, [num_lens], 1)
+    assert not isin(num_part, [0, 1]).all(), "The numerical block is not expected to contain only 0 or 1."
+    assert isin(cat_part, [0, 1]).all(), "The categorical block is expected to contain only 0 or 1."
+    cat_i, cat_o = mock_processor.col_transform_info.categorical
+    cat_blocks = cumsum([len([col for col in cat_o if col.startswith(feat) and search('_[0-9]*$', col)]) \
+        for feat in cat_i])
+    cat_blocks = split(cat_part, cat_blocks[:-1], 1)
+    assert all(npsum(abs(block)) == BATCH_SIZE for block in cat_blocks), "There are non one-hot encoded \
+        categorical blocks."
diff --git a/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax.py b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax.py
new file mode 100644
index 00000000..dd52c71d
--- /dev/null
+++ b/src/ydata_synthetic/tests/custom_layers/test_gumbel_softmax.py
@@ -0,0 +1,54 @@
+"Test suite for the Gumbel-Softmax layer implementation."
+import tensorflow as tf
+from numpy import amax, amin, isclose, ones
+from numpy import sum as npsum
+from pytest import fixture
+from tensorflow.keras import layers
+
+from ydata_synthetic.utils.gumbel_softmax import GumbelSoftmaxLayer
+
+
+# pylint:disable=W0613
+def custom_initializer(shape_list, dtype):
+    "A constant weight initializer to ensure test reproducibility."
+    return tf.constant(ones((5, 5)), dtype=tf.dtypes.float32)
+
+@fixture(name='rand_input')
+def fixture_rand_input():
+    "A random, reproducible input for the mock model."
+    return tf.constant(tf.random.normal([4, 5], seed=42))
+
+def test_hard_sample_output_format(rand_input):
+    """Tests that the hard output samples are in the expected formats.
+    The hard sample should be returned as a one-hot tensor."""
+    affined = layers.Dense(5, use_bias = False, kernel_initializer=custom_initializer)(rand_input)
+    hard_sample, _ = GumbelSoftmaxLayer()(affined)
+    assert npsum(hard_sample) == hard_sample.shape[0], "The sum of the hard samples should equal the number of records."
+    assert all(npsum(hard_sample == 0, 1) == hard_sample.shape[1] - 1), "The hard sample is not a one-hot tensor."
+
+def test_soft_sample_output_format(rand_input):
+    """Tests that the soft output samples are in the expected formats.
+    The soft sample should be returned as a probabilities tensor."""
+    affined = layers.Dense(5, use_bias = False, kernel_initializer=custom_initializer)(rand_input)
+    _, soft_sample = GumbelSoftmaxLayer(tau=0.5)(affined)
+    assert isclose(npsum(soft_sample), soft_sample.shape[0]), "The sum of the soft samples should be close to \
+        the number of records."
+    assert amax(soft_sample) <= 1, "Invalid probability values found."
+    assert amin(soft_sample) >= 0, "Invalid probability values found."
+
+def test_gradients(rand_input):
+    "Performs basic numerical assertions on the gradients of the soft/hard samples."
+    def mock(i):
+        return GumbelSoftmaxLayer()(layers.Dense(5, use_bias=False, kernel_initializer=custom_initializer)(i))
+    with tf.GradientTape() as hard_tape:
+        hard_tape.watch(rand_input)
+        hard_sample, _ = mock(rand_input)
+    with tf.GradientTape() as soft_tape:
+        soft_tape.watch(rand_input)
+        _, soft_sample = mock(rand_input)
+    hard_grads = hard_tape.gradient(hard_sample, rand_input)
+    soft_grads = soft_tape.gradient(soft_sample, rand_input)
+
+    assert hard_grads is None, "The hard sample must not compute gradients."
+    assert soft_grads is not None, "The soft sample is expected to compute gradients."
+    assert npsum(abs(soft_grads)) != 0, "The soft sample is expected to have non-zero gradients."
diff --git a/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py b/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py
new file mode 100644
index 00000000..561319a0
--- /dev/null
+++ b/src/ydata_synthetic/tests/preprocessing/test_regular_data_processor.py
@@ -0,0 +1,77 @@
+"""
+Test suite for the RegularProcessor.
+"""
+from numpy import isclose, ndarray
+from pmlb import fetch_data
+from pytest import fixture, raises
+from sklearn.exceptions import NotFittedError
+
+from ydata_synthetic.preprocessing.regular.processor import \
+    RegularDataProcessor
+
+
+@fixture
+def regular_data_example():
+    return fetch_data('adult')
+
+@fixture
+def regular_data_processor_args(regular_data_example):
+    num_cols = ['fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
+    cat_cols = list(set(regular_data_example.columns).difference(set(num_cols)))
+    return num_cols, cat_cols
+
+@fixture
+def overlapped_column_lists(regular_data_processor_args):
+    num_cols, cat_cols = regular_data_processor_args
+    cat_cols.append(num_cols[0])
+    return num_cols, cat_cols
+
+@fixture
+def incomplete_column_lists(regular_data_processor_args):
+    num_cols, cat_cols = regular_data_processor_args
+    num_cols.pop()
+    return num_cols, cat_cols
+
+@fixture
+def regular_data_processor(regular_data_processor_args):
+    num_cols, cat_cols = regular_data_processor_args
+    return RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols)
+
+def test_is_fitted(regular_data_processor, regular_data_example):
+    "Tests that NotFittedError is raised when attempting to transform with a non-fitted processor."
+    with raises(NotFittedError):
+        regular_data_processor.transform(regular_data_example)
+
+def test_column_validations(regular_data_example, overlapped_column_lists, incomplete_column_lists):
+    "Tests the column lists validation method."
+    processor = RegularDataProcessor
+    with raises(AssertionError):
+        processor(*overlapped_column_lists).fit(regular_data_example)
+    with raises(AssertionError):
+        processor(*incomplete_column_lists).fit(regular_data_example)
+
+def test_fit(regular_data_processor, regular_data_example):
+    "Tests the fit method and the _check_is_fitted method before and after fitting."
+    with raises(NotFittedError):
+        regular_data_processor._check_is_fitted()
+    processor = regular_data_processor.fit(regular_data_example)
+    assert processor._check_is_fitted() is None
+
+def test_fit_transform(regular_data_processor, regular_data_example):
+    "Tests the fit_transform method, the _check_is_fitted method and the storing of attributes required for inverse_transform."
+    transformed = regular_data_processor.fit_transform(regular_data_example)
+    assert regular_data_processor._check_is_fitted() is None
+    assert transformed.shape[0] == regular_data_example.shape[0]
+    assert transformed.shape[1] != regular_data_example.shape[1]
+    assert all([isinstance(idx, int) for idx in [regular_data_processor._num_col_idx_, regular_data_processor._cat_col_idx_]])
+    assert isinstance(transformed, ndarray)
+
+def test_inverse_transform(regular_data_processor, regular_data_example):
+    "Tests inverse_transform and its output by comparing to the original data example."
+    transformed = regular_data_processor.fit_transform(regular_data_example)
+    inverted = regular_data_processor.inverse_transform(transformed)
+    assert isinstance(inverted, type(regular_data_example))
+    assert inverted.shape == regular_data_example.shape
+    assert (inverted.columns == regular_data_example.columns).all()
+    assert (inverted.dtypes == regular_data_processor._types).all()
+    assert isclose(inverted, regular_data_example).all()
diff --git a/src/ydata_synthetic/utils/gumbel_softmax.py b/src/ydata_synthetic/utils/gumbel_softmax.py
new file mode 100644
index 00000000..affe48b6
--- /dev/null
+++ b/src/ydata_synthetic/utils/gumbel_softmax.py
@@ -0,0 +1,84 @@
+"""Gumbel-Softmax layer implementation.
+Reference: https://arxiv.org/pdf/1611.04051.pdf"""
+from re import search
+from typing import NamedTuple, Optional
+
+# pylint: disable=E0401
+from tensorflow import (Tensor, TensorShape, concat, one_hot, split, squeeze,
+                        stop_gradient)
+from tensorflow.keras.layers import Activation, Layer
+from tensorflow.keras.utils import register_keras_serializable
+from tensorflow.math import log
+from tensorflow.nn import softmax
+from tensorflow.random import categorical, uniform
+
+TOL = 1e-20
+
+
+def gumbel_noise(shape: TensorShape) -> Tensor:
+    """Create a sample of the given shape from the standard (loc = 0, scale = 1) Gumbel distribution."""
+    uniform_sample = uniform(shape, seed=0)
+    return -log(-log(uniform_sample + TOL) + TOL)
+
+@register_keras_serializable(package='Synthetic Data', name='GumbelSoftmaxLayer')
+class GumbelSoftmaxLayer(Layer):
+    "A Gumbel-Softmax layer implementation meant to be stacked on top of the logits of a categorical feature."
+
+    def __init__(self, tau: float = 0.2, name: Optional[str] = None, **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.tau = tau
+
+    # pylint: disable=W0221, E1120
+    def call(self, _input):
+        """Computes the Gumbel-Softmax for the logits output of a particular categorical feature."""
+        noised_input = _input + gumbel_noise(_input.shape)
+        soft_sample = softmax(noised_input/self.tau, -1)
+        hard_sample = stop_gradient(squeeze(one_hot(categorical(log(soft_sample), 1), _input.shape[-1]), 1))
+        return hard_sample, soft_sample
+
+    def get_config(self):
+        config = super().get_config().copy()
+        config.update({'tau': self.tau})
+        return config
+
+
+@register_keras_serializable(package='Synthetic Data', name='ActivationInterface')
+class ActivationInterface(Layer):
+    """An interface layer connecting different parts of an incoming tensor to adequate activation functions.
+    The tensor parts are qualified according to the passed processor object.
+    Processed categorical features are sent to feature-specific Gumbel-Softmax layers.
+    All remaining (numerical) features are sent to a TanH activation.
+    Finally, all output parts are concatenated and returned in the same order.
+
+    The parts of an incoming tensor are qualified by leveraging a namedtuple that points to the in/out feature \
+    maps of each of the data processor's pipelines. For simplicity this object can be taken directly from the data \
+    processor's col_transform_info attribute."""
+
+    def __init__(self, processor_info: NamedTuple, name: Optional[str] = None, **kwargs):
+        """Arguments:
+            processor_info (NamedTuple): Defines the input/output features of each of the processor's pipelines.
+            name (Optional[str]): Name of the layer"""
+        super().__init__(name=name, **kwargs)
+
+        self._processor_info = processor_info
+
+        self.cat_feats = processor_info.categorical
+        self.num_feats = processor_info.numerical
+
+        self._cat_lens = [len([col for col in self.cat_feats.feat_names_out if search(f'^{cat_feat}_.*$', col)]) \
+            for cat_feat in self.cat_feats.feat_names_in]
+        self._num_lens = len(self.num_feats.feat_names_out)
+
+    def call(self, _input):  # pylint: disable=W0221
+        num_cols, cat_cols = split(_input, [self._num_lens, -1], 1, name='split_num_cats')
+        cat_cols = split(cat_cols, self._cat_lens if self._cat_lens else [0], 1, name='split_cats')
+
+        num_cols = [Activation('tanh', name='num_cols_activation')(num_cols)]
+        cat_cols = [GumbelSoftmaxLayer(name=name)(col)[0] for name, col in \
+            zip(self.cat_feats.feat_names_in, cat_cols)]
+        return concat(num_cols+cat_cols, 1)
+
+    def get_config(self):
+        config = super().get_config().copy()
+        config.update({'processor_info': self._processor_info})
+        return config
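For reviewers who want to try the new pieces end to end, below is a minimal usage sketch (not part of the patch). It mirrors the fixtures in `test_activation_interface.py`: a `RegularDataProcessor` is fitted on a toy frame, its `col_transform_info` drives the `ActivationInterface` on top of a plain generator head, and `inverse_transform` maps generated rows back to the original schema. The toy column names, layer sizes and noise dimension are illustrative assumptions only.

```python
# Illustrative sketch only -- column names, dimensions and noise size are assumptions.
import numpy as np
import pandas as pd
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input

from ydata_synthetic.preprocessing.regular.processor import RegularDataProcessor
from ydata_synthetic.utils.gumbel_softmax import ActivationInterface

# Toy frame with two numerical columns and one categorical column
df = pd.DataFrame({'num_0': np.random.normal(size=32),
                   'num_1': np.random.normal(size=32),
                   'cat_0': np.random.choice(['a', 'b', 'c'], size=32)})

processor = RegularDataProcessor(num_cols=['num_0', 'num_1'], cat_cols=['cat_0']).fit(df)
data_dim = processor.transform(df).shape[1]  # width of the processed (one-hot expanded) data

# Generator head: raw Dense logits, then per-feature activations via the ActivationInterface
noise = Input(shape=(16,))
x = Dense(64, activation='relu')(noise)
x = Dense(data_dim)(x)
out = ActivationInterface(processor_info=processor.col_transform_info, name='act_itf')(x)
generator = Model(inputs=noise, outputs=out)

# Samples come out in processed space; the processor maps them back to the original schema
synth = generator(np.random.normal(size=(8, 16)).astype('float32')).numpy()
synth_df = processor.inverse_transform(synth)
print(synth_df.dtypes)
```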