PATEGAN base implementation

ydataai · Dec 14, 2021 · 1a57bb9
1 parent 08d3cae
commit 1a57bb9
Show file tree

Hide file tree

Showing 28 changed files with 2,347 additions and 717 deletions.
diff --git a/examples/regular/adult_dragan.py b/examples/regular/adult_dragan.py
@@ -1,12 +1,18 @@
-from ydata_synthetic.preprocessing.regular.adult import transformations
+from pmlb import fetch_data
+
 from ydata_synthetic.synthesizers.regular import DRAGAN
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 
-#Load and process the data
-data, processed_data, preprocessor = transformations()
+model = DRAGAN
+
+#Load data and define the data processor parameters
+data = fetch_data('adult')
+num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
+cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
+            'native-country', 'target']
 
-# WGAN_GP training
-#Defininf the training parameters of WGAN_GP
+# DRAGAN training
+#Defining the training parameters of DRAGAN
 
 noise_dim = 128
 dim = 128
@@ -23,12 +29,14 @@
                            lr=learning_rate,
                            betas=(beta_1, beta_2),
                            noise_dim=noise_dim,
-                           n_cols=processed_data.shape[1],
                            layers_dim=dim)
 
 train_args = TrainParameters(epochs=epochs,
                              sample_interval=log_step)
 
-synthesizer = DRAGAN(gan_args, n_discriminator=3)
-synthesizer.train(processed_data, train_args)
+synthesizer = model(gan_args, n_discriminator=3)
+synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True)
 synthesizer.save('adult_synth.pkl')
+
+synthesizer = model.load('adult_synth.pkl')
+synth_data = synthesizer.sample(1000)
diff --git a/examples/regular/adult_wgangp.py b/examples/regular/adult_wgangp.py
@@ -1,16 +1,21 @@
-from ydata_synthetic.preprocessing.regular.adult import transformations
+from pmlb import fetch_data
+
 from ydata_synthetic.synthesizers.regular import WGAN_GP
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 
-#Load and process the data
-data, processed_data, preprocessor = transformations()
+model = WGAN_GP
+
+#Load data and define the data processor parameters
+data = fetch_data('adult')
+num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
+cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
+            'native-country', 'target']
 
-# WGAN_GP training
-#Defining the training parameters of WGAN_GP
+#Defining the training parameters
 
-noise_dim = 32
+noise_dim = 128
 dim = 128
-batch_size = 128
+batch_size = 500
 
 log_step = 100
 epochs = 300+1
@@ -23,14 +28,15 @@
                            lr=learning_rate,
                            betas=(beta_1, beta_2),
                            noise_dim=noise_dim,
-                           n_cols=processed_data.shape[1],
                            layers_dim=dim)
 
 train_args = TrainParameters(epochs=epochs,
                              sample_interval=log_step)
 
-synthesizer = WGAN_GP(gan_args, n_critic=2)
-synthesizer.train(processed_data, train_args)
+synthesizer = model(gan_args, n_critic=2)
+synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True)
 
-synth_data = synthesizer.sample(1000)
 synthesizer.save('test.pkl')
+
+synthesizer = model.load('test.pkl')
+synth_data = synthesizer.sample(1000)
diff --git a/examples/regular/cgan_example.py b/examples/regular/cgan_example.py
@@ -1,25 +1,22 @@
 from ydata_synthetic.synthesizers.regular import CGAN
-from ydata_synthetic.preprocessing.regular.credit_fraud import transformations
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 
 import pandas as pd
 import numpy as np
 from sklearn import cluster
 
+model = CGAN
+
 #Read the original data and have it preprocessed
 data = pd.read_csv('data/creditcard.csv', index_col=[0])
 
 #List of columns different from the Class column
-data_cols = list(data.columns[ data.columns != 'Class' ])
-label_cols = ['Class']
+num_cols = list(data.columns[ data.columns != 'Class' ])
+cat_cols = []  # Condition features are not preprocessed and therefore not listed here
 
-print('Dataset columns: {}'.format(data_cols))
+print('Dataset columns: {}'.format(num_cols))
 sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
-processed_data = data[ sorted_cols ].copy()
-
-#Before training the GAN do not forget to apply the required data transformations
-#To ease here we've applied a PowerTransformation
-_, data, _ = transformations(data)
+data = data[ sorted_cols ].copy()
 
 #For the purpose of this example we will only synthesize the minority class
 train_data = data.loc[ data['Class']==1 ].copy()
@@ -28,7 +25,7 @@
 print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
 algorithm = cluster.KMeans
 args, kwds = (), {'n_clusters':2, 'random_state':0}
-labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])
+labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
 
 print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
 
@@ -51,19 +48,11 @@
 learning_rate = 5e-4
 models_dir = './cache'
 
-train_sample = fraud_w_classes.copy().reset_index(drop=True)
-train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)
-label_cols = [list(train_sample.columns).index(i) for i in train_sample.columns if 'Class' in i ]
-data_cols = [ i for i in train_sample.columns if i not in label_cols ]
-train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn
-train_no_label = train_sample[ data_cols ]
-
 #Test here the new inputs
 gan_args = ModelParameters(batch_size=batch_size,
                            lr=learning_rate,
                            betas=(beta_1, beta_2),
                            noise_dim=noise_dim,
-                           n_cols=train_sample.shape[1] - len(label_cols),  # Don't count the label columns here
                            layers_dim=dim)
 
 train_args = TrainParameters(epochs=epochs,
@@ -73,10 +62,19 @@
                              labels=(0,1))
 
 #Init the Conditional GAN providing the index of the label column as one of the arguments
-synthesizer = CGAN(model_parameters=gan_args, num_classes=2)
+synthesizer = model(model_parameters=gan_args, num_classes=2)
 
 #Training the Conditional GAN
-synthesizer.train(data=train_sample, label="Class",train_arguments=train_args)
+synthesizer.train(data=fraud_w_classes, label_col="Class", train_arguments=train_args,
+                  num_cols=num_cols, cat_cols=cat_cols)
 
 #Saving the synthesizer
 synthesizer.save('cgan_synthtrained.pkl')
+
+#Loading the synthesizer
+synthesizer = model.load('cgan_synthtrained.pkl')
+
+#Sampling from the synthesizer
+cond_array = np.array([0])
+# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place)
+synth_data = synthesizer.sample(cond_array, 1000)
diff --git a/examples/regular/cramergan_example.py b/examples/regular/cramergan_example.py
@@ -6,34 +6,28 @@
 
 from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
 from ydata_synthetic.synthesizers.regular import CRAMERGAN
-from ydata_synthetic.preprocessing.regular.credit_fraud import transformations
 
 model = CRAMERGAN
 
 #Read the original data and have it preprocessed
 data = pd.read_csv('data/creditcard.csv', index_col=[0])
 
-#Data processing and analysis
-data_cols = list(data.columns[ data.columns != 'Class' ])
-label_cols = ['Class']
+#List of columns different from the Class column
+num_cols = list(data.columns[ data.columns != 'Class' ])
+cat_cols = ['Class']
 
-print('Dataset columns: {}'.format(data_cols))
+print('Dataset columns: {}'.format(num_cols))
 sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
-processed_data = data[ sorted_cols ].copy()
-
-#Before training the GAN do not forget to apply the required data transformations
-#To ease here we've applied a PowerTransformation
-_, data, _ = transformations(data)
-
+data = data[ sorted_cols ].copy()
 
 #For the purpose of this example we will only synthesize the minority class
 train_data = data.loc[ data['Class']==1 ].copy()
 
+#Create a new class column using KMeans - this is mainly useful if we want to leverage a conditional GAN
 print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
-
 algorithm = cluster.KMeans
 args, kwds = (), {'n_clusters':2, 'random_state':0}
-labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])
+labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
 
 print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
 
@@ -53,29 +47,18 @@
 beta_2 = 0.9
 models_dir = './cache'
 
-train_sample = fraud_w_classes.copy().reset_index(drop=True)
-train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)
-label_cols = [ i for i in train_sample.columns if 'Class' in i ]
-data_cols = [ i for i in train_sample.columns if i not in label_cols ]
-train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn
-train_no_label = train_sample[ data_cols ]
-
 model_parameters = ModelParameters(batch_size=batch_size,
                            lr=learning_rate,
                            betas=(beta_1, beta_2),
                            noise_dim=noise_dim,
-                           n_cols=train_sample.shape[1],
                            layers_dim=dim)
 
 train_args = TrainParameters(epochs=epochs,
                              sample_interval=log_step)
 
-test_size = 492 # number of fraud cases
-noise_dim = 32
-
 #Training the CRAMERGAN model
 synthesizer = model(model_parameters, gradient_penalty_weight=10)
-synthesizer.train(train_sample, train_args)
+synthesizer.train(data=fraud_w_classes, train_arguments=train_args, num_cols = num_cols, cat_cols = cat_cols)
 
 #Saving the synthesizer to later generate new events
 synthesizer.save(path='models/cramergan_creditcard.pkl')