Skip to content

Commit

Permalink
feat: update to python 3.10, update examples (#223)
Browse files Browse the repository at this point in the history
* feat: update to python 3.10, update examples

* feat: add CWGANGP example

* chore: remove unused imports

* chore: remove trailing whitespace
  • Loading branch information
aquemy authored Jan 16, 2023
1 parent 6c79c89 commit bd20953
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 58 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
![](https://img.shields.io/github/workflow/status/ydataai/ydata-synthetic/prerelease)
![](https://img.shields.io/pypi/status/ydata-synthetic)
[![](https://pepy.tech/badge/ydata-synthetic)](https://pypi.org/project/ydata-synthetic/)
![](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue)
![](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)
[![](https://img.shields.io/pypi/v/ydata-synthetic)](https://pypi.org/project/ydata-synthetic/)
![](https://img.shields.io/github/license/ydataai/ydata-synthetic)

Expand Down
5 changes: 2 additions & 3 deletions examples/regular/models/adult_dragan.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

# DRAGAN training
#Defining the training parameters of DRAGAN

noise_dim = 128
dim = 128
batch_size = 500
Expand All @@ -35,10 +34,10 @@
synth = RegularSynthesizer(modelname='dragan', model_parameters=gan_args, n_discriminator=3)
synth.fit(data = data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)

synth.save('adult_synth.pkl')
synth.save('adult_dragan_model.pkl')

#########################################################
# Loading and sampling from a trained synthesizer #
#########################################################
synthesizer = RegularSynthesizer.load('adult_synth.pkl')
synthesizer = RegularSynthesizer.load('adult_dragan_model.pkl')
synthesizer.sample(1000)
18 changes: 9 additions & 9 deletions examples/regular/models/adult_wgangp.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
from pmlb import fetch_data

from ydata_synthetic.synthesizers.regular import WGAN_GP
from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

model = WGAN_GP

#Load data and define the data processor parameters
data = fetch_data('adult')
num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country', 'target']

#Defining the training parameters

noise_dim = 128
dim = 128
batch_size = 500
Expand All @@ -33,10 +30,13 @@
train_args = TrainParameters(epochs=epochs,
sample_interval=log_step)

synthesizer = model(gan_args, n_critic=2)
synthesizer.train(data, train_args, num_cols, cat_cols)
synth = RegularSynthesizer(modelname='wgangp', model_parameters=gan_args, n_critic=2)
synth.fit(data, train_args, num_cols, cat_cols)

synthesizer.save('test.pkl')
synth.save('adult_wgangp_model.pkl')

synthesizer = model.load('test.pkl')
synth_data = synthesizer.sample(1000)
#########################################################
# Loading and sampling from a trained synthesizer #
#########################################################
synth = RegularSynthesizer.load('adult_wgangp_model.pkl')
synth_data = synth.sample(1000)
Original file line number Diff line number Diff line change
@@ -1,35 +1,37 @@
"""
CGAN architecture example file
"""
from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

import pandas as pd
import numpy as np
from sklearn import cluster

from ydata_synthetic.utils.cache import cache_file
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

#Read the original data and have it preprocessed
data = pd.read_csv('../../data/creditcard.csv', index_col=[0])
data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
data = pd.read_csv(data_path, index_col=[0])

#List of columns different from the Class column
#Data processing and analysis
num_cols = list(data.columns[ data.columns != 'Class' ])
cat_cols = [] # Condition features are not preprocessed and therefore not listed here
cat_cols = []

print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
data = data[ sorted_cols ].copy()
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19',
'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15',
'V9', 'V23', 'Class']
processed_data = data[ sorted_cols ].copy()
processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)

#For the purpose of this example we will only synthesize the minority class
train_data = data.loc[ data['Class']==1 ].copy()
train_data = processed_data.loc[processed_data['Class'] == 1].copy()

#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])

print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )

fraud_w_classes = train_data.copy()
fraud_w_classes['Class'] = labels

Expand Down Expand Up @@ -72,10 +74,10 @@
synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)

#Saving the synthesizer
synth.save('cgan_synthtrained.pkl')
synth.save('creditcard_cgan_model.pkl')

#Loading the synthesizer
synthesizer = RegularSynthesizer.load('cgan_synthtrained.pkl')
synthesizer = RegularSynthesizer.load('creditcard_cgan_model.pkl')

#Sampling from the synthesizer
cond_array = pd.DataFrame(100*[1], columns=['Class'])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,25 @@
import numpy as np
import pandas as pd

from ydata_synthetic.utils.cache import cache_file
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

#Read the original data and have it preprocessed
data = pd.read_csv('../../../data/creditcard.csv', index_col=[0])
data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
data = pd.read_csv(data_path, index_col=[0])

#List of columns different from the Class column
#Data processing and analysis
num_cols = list(data.columns[ data.columns != 'Class' ])
cat_cols = ['Class']

print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
data = data[ sorted_cols ].copy()
processed_data = data[ sorted_cols ].copy()
processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)

#For the purpose of this example we will only synthesize the minority class
train_data = data.loc[ data['Class']==1 ].copy()
train_data = processed_data.loc[processed_data['Class'] == 1].copy()

#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
Expand Down Expand Up @@ -62,12 +65,12 @@
synth.fit(data=train_data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)

#Saving the synthesizer to later generate new events
synth.save(path='cramergan_creditcard.pkl')
synth.save(path='creditcard_cramergan_model.pkl')

#########################################################
# Loading and sampling from a trained synthesizer #
#########################################################
synth = RegularSynthesizer.load(path='cramergan_creditcard.pkl')
synth = RegularSynthesizer.load(path='creditcard_cramergan_model.pkl')
#Sampling the data
#Note that the returned data is not inverse processed.
data_sample = synth.sample(100000)
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

import pandas as pd
import numpy as np
from sklearn import cluster

from ydata_synthetic.utils.cache import cache_file
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

#Read the original data and have it preprocessed
data = pd.read_csv('../../data/creditcard.csv', index_col=[0])
data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
data = pd.read_csv(data_path, index_col=[0])

#List of columns different from the Class column
num_cols = list(data.columns[~data.columns.isin(['Class', 'Amount'])])
cat_cols = [] # Condition features are not preprocessed and therefore not listed here
#Data processing and analysis
num_cols = list(data.columns[ data.columns != 'Class' ])
cat_cols = [] #['Class']

print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
data = data[ sorted_cols ].copy()
processed_data = data[ sorted_cols ].copy()
processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)

#For the purpose of this example we will only synthesize the minority class
train_data = data.loc[ data['Class']==1 ].copy()
train_data = processed_data.loc[processed_data['Class'] == 1].copy()

#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional WGANGP
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])

print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )

fraud_w_classes = train_data.copy()
fraud_w_classes['Class'] = labels

Expand Down Expand Up @@ -66,16 +67,16 @@
synth = RegularSynthesizer(modelname='cwgangp', model_parameters=gan_args, n_critic=5)

#Fitting the synthesizer
synth.fit(data=fraud_w_classes, label_cols=["Class", "Amount"], train_arguments=train_args,
synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args,
num_cols=num_cols, cat_cols=cat_cols)

synth.save('.model.pkl')
synth.save('creditcard_cwgangp_model.pkl')

#########################################################
# Loading and sampling from a trained synthesizer #
#########################################################
new_synth = RegularSynthesizer.load('.model.pkl')
new_synth = RegularSynthesizer.load('creditcard_cwgangp_model.pkl')

sample_len = 2000
cond_array = fraud_w_classes[["Class", "Amount"]]
cond_array = fraud_w_classes[["Class"]]
new_synth.sample(cond_array)
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import pandas as pd
import numpy as np

from ydata_synthetic.utils.cache import cache_file
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

#Read the original data and have it preprocessed
data = pd.read_csv('../../../data/creditcard.csv', index_col=[0])
data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
data = pd.read_csv(data_path, index_col=[0])

#Data processing and analysis
num_cols = list(data.columns[ data.columns != 'Class' ])
Expand All @@ -17,9 +19,10 @@
print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
processed_data = data[ sorted_cols ].copy()
processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0)

#For the purpose of this example we will only synthesize the minority class
train_data = data.loc[ data['Class']==1 ].copy()
train_data = processed_data.loc[processed_data['Class'] == 1].copy()

print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
algorithm = cluster.KMeans
Expand Down Expand Up @@ -61,12 +64,12 @@
synth.fit(data=train_data, train_arguments = train_args, num_cols = num_cols, cat_cols = cat_cols)

#Saving the synthesizer to later generate new events
synth.save(path='models/wgan_creditcard.pkl')
synth.save(path='creditcard_wgan_model.pkl')

#########################################################
# Loading and sampling from a trained synthesizer #
#########################################################
synth = RegularSynthesizer.load(path='models/wgan_creditcard.pkl')
synth = RegularSynthesizer.load(path='creditcard_wgan_model.pkl')

#Sampling the data
data_sample = synth.sample(100000)
10 changes: 5 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
requests>=2.24.0, <2.29
pandas==1.4.*
pandas==1.5.*
numpy==1.23.*
scikit-learn==1.1.*
matplotlib==3.5.*
tensorflow==2.9.0
easydict==1.9
scikit-learn==1.2.*
matplotlib==3.6.*
tensorflow==2.11.0
easydict==1.10
pmlb==1.0.*
tqdm<5.0
typeguard==2.13.*
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
keywords='data science ydata',
url='https://github.com/ydataai/ydata-synthetic',
license="https://github.com/ydataai/ydata-synthetic/blob/master/LICENSE",
python_requires=">=3.6, <3.9",
python_requires=">=3.6, <3.11",
packages=find_namespace_packages('src'),
package_dir={'':'src'},
include_package_data=True,
Expand Down

0 comments on commit bd20953

Please sign in to comment.