PATEGAN base implementation
Francisco Santos committed Dec 14, 2021
1 parent 7dab8a6 commit b9dffa6
Showing 3 changed files with 192 additions and 19 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ pmlb==1.0.*
tqdm<5.0
typeguard==2.13.*
pytest==6.2.*
tensorflow_probability==0.12.*
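
The new tensorflow_probability pin backs the two samplers the PATEGAN training loop below relies on: a Categorical distribution over class_ratios for conditional label sampling, and a Laplace distribution for the noisy PATE vote aggregation. A minimal sketch of those two calls, with illustrative values only:

    import tensorflow as tf
    from tensorflow_probability import distributions

    # Conditional class sampler: probs are the prior class ratios (illustrative values)
    category_samples = distributions.Categorical(probs=[0.7, 0.3], dtype=tf.float64)
    labels = category_samples.sample(32)  # one class id per generated record

    # Laplace noise with scale 1/lap_scale is added to the aggregated teacher votes
    lap_scale = 1e-4  # illustrative; the real value comes from the training arguments
    noise = distributions.Laplace(loc=0, scale=1 / lap_scale).sample([32, 1])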
4 changes: 3 additions & 1 deletion src/ydata_synthetic/synthesizers/regular/__init__.py
@@ -4,12 +4,14 @@
from ydata_synthetic.synthesizers.regular.wgangp.model import WGAN_GP
from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN
from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN
from ydata_synthetic.synthesizers.regular.pategan.model import PATEGAN

__all__ = [
"VanilllaGAN",
"CGAN",
"WGAN",
"WGAN_GP",
"DRAGAN",
"CRAMERGAN"
"CRAMERGAN",
"PATEGAN"
]
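
With the new export in place, the synthesizer can be imported from the regular package namespace like the other models (a usage sketch, assuming the package layout shown in this diff):

    from ydata_synthetic.synthesizers.regular import PATEGAN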
206 changes: 188 additions & 18 deletions src/ydata_synthetic/synthesizers/regular/pategan/model.py
@@ -1,37 +1,47 @@
"PATEGAN implementation supporting Differential Privacy budget specification."
from typing import List

# pylint: disable = W0622, E0401
from math import log
from typing import List, NamedTuple, Optional

import tqdm
from tensorflow import clip_by_value
from tensorflow.dtypes import cast, float64
from tensorflow import (GradientTape, clip_by_value, concat, constant,
expand_dims, ones_like, tensor_scatter_nd_update,
transpose, zeros, zeros_like)
from tensorflow.data import Dataset
from tensorflow.dtypes import cast, float64, int64
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.math import abs, exp, pow, reduce_sum, square
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.random import uniform
from tensorflow_probability import distributions

from ydata_synthetic.synthesizers import TrainParameters
from ydata_synthetic.synthesizers.gan import BaseModel
from ydata_synthetic.utils.gumbel_softmax import ActivationInterface


# pylint: disable=R0902
class PATEGAN(BaseModel):
"A basic PATEGAN synthesizer implementation with configurable differential privacy budget."

__MODEL__='PATEGAN'

def __init__(self, model_parameters, n_teachers: int, delta: float, epsilon: float):
def __init__(self, model_parameters, n_teachers: int, target_delta: float, target_epsilon: float):
super().__init__(model_parameters)
self.n_teachers = n_teachers
self.delta = delta
self.epsilon = epsilon
self.target_epsilon = target_epsilon
self.target_delta = target_delta

def define_gan(self):
# pylint: disable=W0201
def define_gan(self, processor_info: Optional[NamedTuple] = None):
def discriminator():
discriminator = Discriminator(self.batch_size)
return discriminator.build_model((self.data_dim,), self.layers_dim)
return Discriminator(self.batch_size).build_model((self.data_dim,), self.layers_dim)

self.generator = Generator(self.batch_size). \
build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim)
build_model(input_shape=(self.noise_dim,), dim=self.layers_dim, data_dim=self.data_dim,
processor_info=processor_info)
self.s_discriminator = discriminator()
self.t_discriminators = [discriminator() for i in range(self.n_teachers)]

@@ -45,8 +55,9 @@ def discriminator():
teacher.compile(loss=loss_fn, optimizer=discriminator_optimizer)

# pylint: disable = C0103
def _moments_acc(self, votes, lap_scale, l_list):
q = (2 + lap_scale * abs(2 * votes - self.n_teachers))/(4 * exp(lap_scale * abs(2 * votes - self.n_teachers)))
@staticmethod
def _moments_acc(n_teachers, votes, lap_scale, l_list):
q = (2 + lap_scale * abs(2 * votes - n_teachers))/(4 * exp(lap_scale * abs(2 * votes - n_teachers)))
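# q is the PATE per-query bound: an upper bound on the probability that the Laplace-noised
# vote aggregation disagrees with the clean majority, given the margin |2 * votes - n_teachers|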

update = []
for l in l_list:
@@ -55,9 +66,166 @@ def _moments_acc(self, votes, lap_scale, l_list):
update.append(reduce_sum(clip_by_value(t, clip_value_min=-clip, clip_value_max=clip)))
return cast(update, dtype=float64)

def train(self, data, train_arguments, num_cols: List[str], cat_cols: List[str],
preprocess: bool = True):
return None
def get_data_loader(self, data) -> List[Dataset]:
"Obtain a List of TF Datasets corresponding to partitions for each teacher in n_teachers."
loader = []
SHUFFLE_BUFFER_SIZE = 100

for teacher_id in range(self.n_teachers):
start_id = int(teacher_id * len(data) / self.n_teachers)
end_id = int((teacher_id + 1) * len(data) / self.n_teachers if \
teacher_id != (self.n_teachers - 1) else len(data))
loader.append(Dataset.from_tensor_slices(data[start_id:end_id])\
.batch(self.batch_size).shuffle(SHUFFLE_BUFFER_SIZE))
return loader

# pylint:disable=R0913
def train(self, data, class_ratios, train_arguments: TrainParameters, num_cols: List[str], cat_cols: List[str]):
"""
Args:
data: A pandas DataFrame or a Numpy array with the data to be synthesized
class_ratios: Prior probability of each target class; used to sample the conditional class labels concatenated to the generator noise
train_arguments: GAN training arguments.
num_cols: List of columns of the data object to be handled as numerical
cat_cols: List of columns of the data object to be handled as categorical
"""
super().train(data, num_cols, cat_cols)

data = self.processor.transform(data)
self.data_dim = data.shape[1]
self.define_gan(self.processor.col_transform_info)

self.class_ratios = class_ratios

alpha = cast([0.0 for _ in range(train_arguments.num_moments)], float64)
l_list = 1 + cast(range(train_arguments.num_moments), float64)

# print("initial alpha", l_list.shape)

cross_entropy = BinaryCrossentropy(from_logits=True)

generator_optimizer = Adam(learning_rate=train_arguments.lr)
disc_opt_stu = Adam(learning_rate=train_arguments.lr)
disc_opt_t = [Adam(learning_rate=train_arguments.lr) for i in range(self.n_teachers)]

train_loader = self.get_data_loader(data)

steps = 0
epsilon = 0

category_samples = distributions.Categorical(probs=self.class_ratios, dtype=float64)

while epsilon < self.target_epsilon:
# train the teacher discriminators
for t_2 in range(train_arguments.num_teacher_iters):
for i in range(self.n_teachers):
inputs, categories = None, None
for b, data_ in enumerate(train_loader[i]):
inputs, categories = data_, b
#categories will always be zero because the loop breaks after the first iteration
#inputs will only contain the first batch of data
break

with GradientTape() as disc_tape:
# train with real
dis_data = concat([inputs, zeros([inputs.shape[0], 1], dtype=float64)], 1)
# print("1st batch data", dis_data.shape)
real_output = self.t_discriminators[i](dis_data, training=True)
# print(real_output.shape, tf.ones.shape)

# train with fake
z = uniform([inputs.shape[0], self.z_dim], dtype=float64)
# print("uniformly distributed noise", z.shape)

sample = expand_dims(category_samples.sample(inputs.shape[0]), axis=1)
# print("category", sample.shape)

fake = self.generator(concat([z, sample], 1))
# print('fake', fake.shape)

fake_output = self.t_discriminators[i](concat([fake, sample], 1), training=True)
# print('fake_output_dis', fake_output.shape)

# print("watch", disc_tape.watch(self.teacher_disc[i].trainable_variables)
real_loss_disc = cross_entropy(ones_like(real_output), real_output)
fake_loss_disc = cross_entropy(zeros_like(fake_output), fake_output)

disc_loss = real_loss_disc + fake_loss_disc
# print(disc_loss, real_loss_disc, fake_loss_disc)

gradients_of_discriminator = disc_tape.gradient(disc_loss, self.t_discriminators[i].trainable_variables)
# print(gradients_of_discriminator)

disc_opt_t[i].apply_gradients(zip(gradients_of_discriminator, self.t_discriminators[i].trainable_variables))

# train the student discriminator
for t_3 in range(train_arguments.num_student_iters):
z = uniform([inputs.shape[0], self.z_dim], dtype=float64)

sample = expand_dims(category_samples.sample(inputs.shape[0]), axis=1)
# print("category_stu", sample.shape)

with GradientTape() as stu_tape:
fake = self.generator(concat([z, sample], 1))
# print('fake_stu', fake.shape)

predictions, clean_votes = self._pate_voting(
concat([fake, sample], 1), self.t_discriminators, train_arguments.lap_scale)
# print("noisy_labels", predictions.shape, "clean_votes", clean_votes.shape)
outputs = self.s_discriminator(concat([fake, sample], 1))

# update the moments
alpha = alpha + self._moments_acc(self.n_teachers, clean_votes, train_arguments.lap_scale, l_list)
# print("final_alpha", alpha)

stu_loss = cross_entropy(predictions, outputs)
gradients_of_stu = stu_tape.gradient(stu_loss, self.s_discriminator.trainable_variables)
# print(gradients_of_stu)

disc_opt_stu.apply_gradients(zip(gradients_of_stu, self.s_discriminator.trainable_variables))

# train the generator
z = uniform([inputs.shape[0], self.z_dim], dtype=float64)

sample_g = expand_dims(category_samples.sample(inputs.shape[0]), axis=1)

with GradientTape() as gen_tape:
fake = self.generator(concat([z, sample_g], 1))
output = self.s_discriminator(concat([fake, sample_g], 1))

loss_gen = cross_entropy(ones_like(output), output)
gradients_of_generator = gen_tape.gradient(loss_gen, self.generator.trainable_variables)
generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))

# Calculate the current privacy cost
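# Moments-accountant conversion to an (epsilon, delta)-DP guarantee: epsilon is the tightest
# bound (alpha_l - log(delta)) / l over the tracked moment orders l; training stops once this
# estimate exceeds target_epsilon (see the while-loop condition above)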
epsilon = min((alpha - log(self.target_delta)) / l_list)
if steps % 1 == 0:
print("Step : ", steps, "Loss SD : ", stu_loss, "Loss G : ", loss_gen, "Epsilon : ", epsilon)

steps += 1
# self.generator.summary()

def _pate_voting(self, data, netTD, lap_scale):
# TODO: Validate the logic against original article
## Collect each teacher's (1/0) vote from netTD for every record in data and store it in results
results = zeros([len(netTD), data.shape[0]], dtype=int64)
# print(results)
for i in range(len(netTD)):
output = netTD[i](data, training=True)
pred = transpose(cast((output > 0.5), int64))
# print(pred)
results = tensor_scatter_nd_update(results, constant([[i]]), pred)
# print(results)

#store the sum of the (0/1) votes assigned by each discriminator to each record (values between 0 and len(netTD))
clean_votes = expand_dims(cast(reduce_sum(results, 0), dtype=float64), 1)
# print("clean_votes",clean_votes)
noise_sample = distributions.Laplace(loc=0, scale=1/lap_scale).sample(clean_votes.shape)
# print("noise_sample", noise_sample)
noisy_results = clean_votes + cast(noise_sample, float64)
noisy_labels = cast((noisy_results > len(netTD)/2), float64)

return noisy_labels, clean_votes


class Discriminator(Model):
@@ -77,10 +245,12 @@ class Generator(Model):
def __init__(self, batch_size):
self.batch_size = batch_size

def build_model(self, input_shape, dim, data_dim):
def build_model(self, input_shape, dim, data_dim, processor_info: Optional[NamedTuple] = None):
input = Input(shape=input_shape, batch_size = self.batch_size)
x = Dense(dim)(input)
x = ReLU()(x)
x = Dense(dim * 2)(x)
x = Dense(data_dim)(x)
if processor_info:
x = ActivationInterface(processor_info, 'ActivationInterface')(x)
return Model(inputs=input, outputs=x)
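
For reference, a minimal end-to-end usage sketch of the class added in this commit. It is not part of the diff: the ModelParameters import, its constructor fields, and the TrainParameters fields (lr, lap_scale, num_moments, num_teacher_iters, num_student_iters) are assumptions based on the attributes the training loop reads, and the dataset, column lists, and numeric values are placeholders.

    import pandas as pd

    from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
    from ydata_synthetic.synthesizers.regular import PATEGAN

    df = pd.read_csv("some_tabular_dataset.csv")  # placeholder dataset
    num_cols = ["age", "amount"]                  # placeholder numerical columns
    cat_cols = ["label"]                          # placeholder categorical columns

    gan_args = ModelParameters(batch_size=128, noise_dim=32, layers_dim=128)
    train_args = TrainParameters(lr=1e-4, lap_scale=1e-4, num_moments=100,
                                 num_teacher_iters=5, num_student_iters=5)

    synth = PATEGAN(model_parameters=gan_args, n_teachers=10,
                    target_delta=1e-5, target_epsilon=1.0)

    # class_ratios: prior probability of each target class, e.g.
    # df["label"].value_counts(normalize=True).values
    synth.train(df, class_ratios=[0.7, 0.3], train_arguments=train_args,
                num_cols=num_cols, cat_cols=cat_cols)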
