Skip to content

Commit

Permalink
PATEGAN base implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Francisco Santos committed Dec 14, 2021
1 parent 08d3cae commit 1a57bb9
Show file tree
Hide file tree
Showing 28 changed files with 2,347 additions and 717 deletions.
24 changes: 16 additions & 8 deletions examples/regular/adult_dragan.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from ydata_synthetic.preprocessing.regular.adult import transformations
from pmlb import fetch_data

from ydata_synthetic.synthesizers.regular import DRAGAN
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

#Load and process the data
data, processed_data, preprocessor = transformations()
model = DRAGAN

#Load data and define the data processor parameters
data = fetch_data('adult')
num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country', 'target']

# WGAN_GP training
#Defining the training parameters of WGAN_GP
# DRAGAN training
#Defining the training parameters of DRAGAN

noise_dim = 128
dim = 128
Expand All @@ -23,12 +29,14 @@
lr=learning_rate,
betas=(beta_1, beta_2),
noise_dim=noise_dim,
n_cols=processed_data.shape[1],
layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
sample_interval=log_step)

synthesizer = DRAGAN(gan_args, n_discriminator=3)
synthesizer.train(processed_data, train_args)
synthesizer = model(gan_args, n_discriminator=3)
synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True)
synthesizer.save('adult_synth.pkl')

synthesizer = model.load('adult_synth.pkl')
synthesizer.sample(1000)
28 changes: 17 additions & 11 deletions examples/regular/adult_wgangp.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
from ydata_synthetic.preprocessing.regular.adult import transformations
from pmlb import fetch_data

from ydata_synthetic.synthesizers.regular import WGAN_GP
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

#Load and process the data
data, processed_data, preprocessor = transformations()
model = WGAN_GP

#Load data and define the data processor parameters
data = fetch_data('adult')
num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country', 'target']

# WGAN_GP training
#Defining the training parameters of WGAN_GP
#Defining the training parameters

noise_dim = 32
noise_dim = 128
dim = 128
batch_size = 128
batch_size = 500

log_step = 100
epochs = 300+1
Expand All @@ -23,14 +28,15 @@
lr=learning_rate,
betas=(beta_1, beta_2),
noise_dim=noise_dim,
n_cols=processed_data.shape[1],
layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
sample_interval=log_step)

synthesizer = WGAN_GP(gan_args, n_critic=2)
synthesizer.train(processed_data, train_args)
synthesizer = model(gan_args, n_critic=2)
synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True)

synth_data = synthesizer.sample(1000)
synthesizer.save('test.pkl')

synthesizer = model.load('test.pkl')
synth_data = synthesizer.sample(1000)
38 changes: 18 additions & 20 deletions examples/regular/cgan_example.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
from ydata_synthetic.synthesizers.regular import CGAN
from ydata_synthetic.preprocessing.regular.credit_fraud import transformations
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

import pandas as pd
import numpy as np
from sklearn import cluster

model = CGAN

#Read the original data and have it preprocessed
data = pd.read_csv('data/creditcard.csv', index_col=[0])

#List of columns different from the Class column
data_cols = list(data.columns[ data.columns != 'Class' ])
label_cols = ['Class']
num_cols = list(data.columns[ data.columns != 'Class' ])
cat_cols = [] # Condition features are not preprocessed and therefore not listed here

print('Dataset columns: {}'.format(data_cols))
print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
processed_data = data[ sorted_cols ].copy()

#Before training the GAN do not forget to apply the required data transformations
#To ease here we've applied a PowerTransformation
_, data, _ = transformations(data)
data = data[ sorted_cols ].copy()

#For the purpose of this example we will only synthesize the minority class
train_data = data.loc[ data['Class']==1 ].copy()
Expand All @@ -28,7 +25,7 @@
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])

print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )

Expand All @@ -51,19 +48,11 @@
learning_rate = 5e-4
models_dir = './cache'

train_sample = fraud_w_classes.copy().reset_index(drop=True)
train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)
label_cols = [list(train_sample.columns).index(i) for i in train_sample.columns if 'Class' in i ]
data_cols = [ i for i in train_sample.columns if i not in label_cols ]
train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn
train_no_label = train_sample[ data_cols ]

#Test here the new inputs
gan_args = ModelParameters(batch_size=batch_size,
lr=learning_rate,
betas=(beta_1, beta_2),
noise_dim=noise_dim,
n_cols=train_sample.shape[1] - len(label_cols), # Don't count the label columns here
layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
Expand All @@ -73,10 +62,19 @@
labels=(0,1))

#Init the Conditional GAN providing the number of classes as one of the arguments
synthesizer = CGAN(model_parameters=gan_args, num_classes=2)
synthesizer = model(model_parameters=gan_args, num_classes=2)

#Training the Conditional GAN
synthesizer.train(data=train_sample, label="Class",train_arguments=train_args)
synthesizer.train(data=fraud_w_classes, label_col="Class", train_arguments=train_args,
num_cols=num_cols, cat_cols=cat_cols)

#Saving the synthesizer
synthesizer.save('cgan_synthtrained.pkl')

#Loading the synthesizer
synthesizer = model.load('cgan_synthtrained.pkl')

#Sampling from the synthesizer
cond_array = np.array([0])
# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place)
synthesizer = synthesizer.sample(cond_array, 1000)
33 changes: 8 additions & 25 deletions examples/regular/cramergan_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,28 @@

from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import CRAMERGAN
from ydata_synthetic.preprocessing.regular.credit_fraud import transformations

model = CRAMERGAN

#Read the original data and have it preprocessed
data = pd.read_csv('data/creditcard.csv', index_col=[0])

#Data processing and analysis
data_cols = list(data.columns[ data.columns != 'Class' ])
label_cols = ['Class']
#List of columns different from the Class column
num_cols = list(data.columns[ data.columns != 'Class' ])
cat_cols = ['Class']

print('Dataset columns: {}'.format(data_cols))
print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
processed_data = data[ sorted_cols ].copy()

#Before training the GAN do not forget to apply the required data transformations
#To ease here we've applied a PowerTransformation
_, data, _ = transformations(data)

data = data[ sorted_cols ].copy()

#For the purpose of this example we will only synthesize the minority class
train_data = data.loc[ data['Class']==1 ].copy()

#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))

algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])

print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )

Expand All @@ -53,29 +47,18 @@
beta_2 = 0.9
models_dir = './cache'

train_sample = fraud_w_classes.copy().reset_index(drop=True)
train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)
label_cols = [ i for i in train_sample.columns if 'Class' in i ]
data_cols = [ i for i in train_sample.columns if i not in label_cols ]
train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn
train_no_label = train_sample[ data_cols ]

model_parameters = ModelParameters(batch_size=batch_size,
lr=learning_rate,
betas=(beta_1, beta_2),
noise_dim=noise_dim,
n_cols=train_sample.shape[1],
layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
sample_interval=log_step)

test_size = 492 # number of fraud cases
noise_dim = 32

#Training the CRAMERGAN model
synthesizer = model(model_parameters, gradient_penalty_weight=10)
synthesizer.train(train_sample, train_args)
synthesizer.train(data=fraud_w_classes, train_arguments=train_args, num_cols = num_cols, cat_cols = cat_cols)

#Saving the synthesizer to later generate new events
synthesizer.save(path='models/cramergan_creditcard.pkl')
Expand Down
Loading

0 comments on commit 1a57bb9

Please sign in to comment.