fix: improvements to the doppelganger model
ricardodcpereira committed Sep 3, 2023
1 parent 8a6ad65 commit cf5712c
Showing 4 changed files with 23 additions and 18 deletions.
File 1 of 4
@@ -44,7 +44,7 @@ def __init__(self, num_cols: Optional[List[str]] = None,
sequence_length: Optional[int] = None,
sample_length: Optional[int] = None):
super().__init__(num_cols, cat_cols)

if num_cols is None:
num_cols = []
if cat_cols is None:
@@ -152,30 +152,30 @@ def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]:
cat_data = DataFrame(self._cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]), columns=one_hot_cat_cols)

self._measurement_one_hot_cat_cols = [c for c in one_hot_cat_cols if c.split("_")[0] in self._measurement_cat_cols]
- self._measurement_cols_metadata = [ColumnMetadata(discrete=False,
- output_dim=1,
+ self._measurement_cols_metadata = [ColumnMetadata(discrete=False,
+ output_dim=1,
name=c) for c in self._measurement_num_cols]
measurement_cat_data = cat_data[self._measurement_one_hot_cat_cols].to_numpy() if self._measurement_one_hot_cat_cols else zeros([len(X), 0])
- self._measurement_cols_metadata += [ColumnMetadata(discrete=True,
- output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
+ self._measurement_cols_metadata += [ColumnMetadata(discrete=True,
+ output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
name=c) for c in self._measurement_cat_cols]
data_features = concatenate([X[self._measurement_num_cols].to_numpy(), measurement_cat_data], axis=1)

if self._has_attributes:
self._attribute_one_hot_cat_cols = [c for c in one_hot_cat_cols if c.split("_")[0] in self._attribute_cat_cols]
attribute_num_data = self._num_pipeline.transform(X[self._attribute_num_cols]) if self._attribute_num_cols else zeros([len(X), 0])
- self._attribute_cols_metadata = [ColumnMetadata(discrete=False,
- output_dim=1,
+ self._attribute_cols_metadata = [ColumnMetadata(discrete=False,
+ output_dim=1,
name=c) for c in self._attribute_num_cols]
attribute_cat_data = cat_data[self._attribute_one_hot_cat_cols].to_numpy() if self._attribute_one_hot_cat_cols else zeros([len(X), 0])
- self._attribute_cols_metadata += [ColumnMetadata(discrete=True,
- output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
+ self._attribute_cols_metadata += [ColumnMetadata(discrete=True,
+ output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
name=c) for c in self._attribute_cat_cols]
data_attributes = concatenate([attribute_num_data, attribute_cat_data], axis=1)
else:
- data_attributes = zeros((data_features.shape[0], 1))
+ data_attributes = zeros((data_features.shape[0], 0))
self._attribute_one_hot_cat_cols = []
- self._attribute_cols_metadata = [ColumnMetadata(discrete=False, output_dim=1, name="zeros_attribute")]
+ self._attribute_cols_metadata = []

num_samples = int(X.shape[0] / self.sequence_length)
data_features = asarray(array_split(data_features, num_samples))
@@ -188,9 +188,9 @@ def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]:
min_col = amin(col_data, axis=1) - self._eps
additional_attributes.append((max_col + min_col) / 2.0)
additional_attributes.append((max_col - min_col) / 2.0)
- self._attribute_cols_metadata += [ColumnMetadata(discrete=False,
- output_dim=1,
- name=f"addi_{col_meta.name}_{ix}",
+ self._attribute_cols_metadata += [ColumnMetadata(discrete=False,
+ output_dim=1,
+ name=f"addi_{col_meta.name}_{ix}",
real=False) for ix in range (1, 3)]
max_col = expand_dims(max_col, axis=1)
min_col = expand_dims(min_col, axis=1)
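The change above means a dataset with no real attribute columns no longer carries a dummy `zeros_attribute` column: `data_attributes` now starts as a zero-width array and only the derived min/max "additional attributes" from the later hunk get appended. A minimal sketch (mine, not part of the commit) of why the zero-width placeholder composes cleanly under concatenation:

```python
# Sketch only: effect of zeros((n, 0)) vs zeros((n, 1)) as the starting point
# for the attribute array when the dataset has no real attribute columns.
import numpy as np

n_sequences = 4
additional = np.random.rand(n_sequences, 2)   # stand-in for the (max+min)/2 and (max-min)/2 attributes

old_start = np.zeros((n_sequences, 1))        # previous behaviour: dummy "zeros_attribute" column
new_start = np.zeros((n_sequences, 0))        # new behaviour: no placeholder column at all

print(np.concatenate([old_start, additional], axis=1).shape)  # (4, 3) -> model sees a meaningless column
print(np.concatenate([new_start, additional], axis=1).shape)  # (4, 2) -> only the derived attributes remain
```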
File 2 of 4
@@ -506,11 +506,11 @@ def sample_from(self, real_attribute_input_noise,
return features, attributes, gen_flags, lengths

def gen_attribute_input_noise(self, num_sample):
- return np.random.normal(
+ return np.random.uniform(low=0, high=1,
size=[num_sample, self.attribute_latent_dim])

def gen_feature_input_noise(self, num_sample, length=1):
- return np.random.normal(
+ return np.random.uniform(low=0, high=1,
size=[num_sample, length, self.feature_latent_dim])

def gen_feature_input_data_free(self, num_sample):
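In the second file, the latent noise fed to the attribute and feature generators switches from a standard normal to a uniform distribution on [0, 1). A short before/after sketch, using the same shapes as the diff (not the library's code):

```python
# Sketch only: old vs. new latent-noise sampling for the attribute generator.
import numpy as np

num_sample, attribute_latent_dim = 3, 5

old_noise = np.random.normal(size=[num_sample, attribute_latent_dim])                  # N(0, 1), unbounded
new_noise = np.random.uniform(low=0, high=1, size=[num_sample, attribute_latent_dim])  # U[0, 1), bounded

print(old_noise.min(), old_noise.max())   # may fall well outside [0, 1]
print(new_noise.min(), new_noise.max())   # always within [0, 1)
```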
File 3 of 4
@@ -50,7 +50,7 @@ def fit(self, data: DataFrame,

if self._sequence_length % self._sample_length != 0:
raise ValueError("The sequence length must be a multiple of the sample length.")

data_features, data_attributes = self.processor.transform(data)
measurement_cols_metadata = self.processor.measurement_cols_metadata
attribute_cols_metadata = self.processor.attribute_cols_metadata
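The third file's hunk is mostly context around the existing guard in `fit`: each sequence is generated in blocks of `sample_length` time steps, so the sequence length has to divide evenly by the sample length before the processor's `transform` runs. A toy illustration of the check, with hypothetical values:

```python
# Toy illustration (hypothetical values) of the divisibility guard in fit.
sequence_length, sample_length = 56, 8

if sequence_length % sample_length != 0:
    raise ValueError("The sequence length must be a multiple of the sample length.")

samples_per_sequence = sequence_length // sample_length
print(samples_per_sequence)  # 7 blocks of 8 time steps per sequence
```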
File 4 of 4
@@ -157,7 +157,7 @@ def build(self, attribute_input_noise, addi_attribute_input_noise,
if attribute is None:
all_attribute = []
all_discrete_attribute = []
- if len(self.addi_attribute_outputs) > 0:
+ if len(self.addi_attribute_outputs) > 0 and len(self.real_attribute_outputs) > 0:
all_attribute_input_noise = \
[attribute_input_noise,
addi_attribute_input_noise]
@@ -169,6 +169,11 @@ def build(self, attribute_input_noise, addi_attribute_input_noise,
all_attribute_out_dim = \
[self.real_attribute_out_dim,
self.addi_attribute_out_dim]
+ elif len(self.addi_attribute_outputs) > 0:
+ all_attribute_input_noise = [addi_attribute_input_noise]
+ all_attribute_outputs = [self.addi_attribute_outputs]
+ all_attribute_part_name = [self.STR_ADDI]
+ all_attribute_out_dim = [self.addi_attribute_out_dim]
else:
all_attribute_input_noise = [attribute_input_noise]
all_attribute_outputs = [self.real_attribute_outputs]
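The last file adjusts the generator's `build` method so it handles the case where only the derived additional attributes exist, which can now happen because the processor no longer injects a dummy real attribute. A condensed sketch of the resulting three-way branch (simplified names, no TensorFlow plumbing, not the library's actual code):

```python
# Sketch only: the three cases for assembling attribute-generation inputs.
def select_attribute_parts(real_attribute_outputs, addi_attribute_outputs,
                           attribute_input_noise, addi_attribute_input_noise):
    if len(addi_attribute_outputs) > 0 and len(real_attribute_outputs) > 0:
        # Both real attributes and the derived min/max ("additional") attributes exist.
        noises = [attribute_input_noise, addi_attribute_input_noise]
        outputs = [real_attribute_outputs, addi_attribute_outputs]
    elif len(addi_attribute_outputs) > 0:
        # No real attributes: generate only the additional attributes.
        noises = [addi_attribute_input_noise]
        outputs = [addi_attribute_outputs]
    else:
        # Only real attributes, no additional ones.
        noises = [attribute_input_noise]
        outputs = [real_attribute_outputs]
    return noises, outputs
```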
