fix: improvements to the doppelganger model
ricardodcpereira committed Sep 3, 2023
1 parent 8a6ad65 commit cf5712c
Showing 4 changed files with 23 additions and 18 deletions.
File 1 of 4
@@ -44,7 +44,7 @@ def __init__(self, num_cols: Optional[List[str]] = None,
sequence_length: Optional[int] = None,
sample_length: Optional[int] = None):
super().__init__(num_cols, cat_cols)

if num_cols is None:
num_cols = []
if cat_cols is None:
@@ -152,30 +152,30 @@ def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]:
cat_data = DataFrame(self._cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0]), columns=one_hot_cat_cols)

self._measurement_one_hot_cat_cols = [c for c in one_hot_cat_cols if c.split("_")[0] in self._measurement_cat_cols]
- self._measurement_cols_metadata = [ColumnMetadata(discrete=False,
- output_dim=1,
+ self._measurement_cols_metadata = [ColumnMetadata(discrete=False,
+ output_dim=1,
name=c) for c in self._measurement_num_cols]
measurement_cat_data = cat_data[self._measurement_one_hot_cat_cols].to_numpy() if self._measurement_one_hot_cat_cols else zeros([len(X), 0])
- self._measurement_cols_metadata += [ColumnMetadata(discrete=True,
- output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
+ self._measurement_cols_metadata += [ColumnMetadata(discrete=True,
+ output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
name=c) for c in self._measurement_cat_cols]
data_features = concatenate([X[self._measurement_num_cols].to_numpy(), measurement_cat_data], axis=1)

if self._has_attributes:
self._attribute_one_hot_cat_cols = [c for c in one_hot_cat_cols if c.split("_")[0] in self._attribute_cat_cols]
attribute_num_data = self._num_pipeline.transform(X[self._attribute_num_cols]) if self._attribute_num_cols else zeros([len(X), 0])
- self._attribute_cols_metadata = [ColumnMetadata(discrete=False,
- output_dim=1,
+ self._attribute_cols_metadata = [ColumnMetadata(discrete=False,
+ output_dim=1,
name=c) for c in self._attribute_num_cols]
attribute_cat_data = cat_data[self._attribute_one_hot_cat_cols].to_numpy() if self._attribute_one_hot_cat_cols else zeros([len(X), 0])
- self._attribute_cols_metadata += [ColumnMetadata(discrete=True,
- output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
+ self._attribute_cols_metadata += [ColumnMetadata(discrete=True,
+ output_dim=X[c].nunique() if X[c].nunique() != 2 else 1,
name=c) for c in self._attribute_cat_cols]
data_attributes = concatenate([attribute_num_data, attribute_cat_data], axis=1)
else:
- data_attributes = zeros((data_features.shape[0], 1))
+ data_attributes = zeros((data_features.shape[0], 0))
self._attribute_one_hot_cat_cols = []
- self._attribute_cols_metadata = [ColumnMetadata(discrete=False, output_dim=1, name="zeros_attribute")]
+ self._attribute_cols_metadata = []

num_samples = int(X.shape[0] / self.sequence_length)
data_features = asarray(array_split(data_features, num_samples))
@@ -188,9 +188,9 @@ def transform(self, X: DataFrame) -> tuple[ndarray, ndarray]:
min_col = amin(col_data, axis=1) - self._eps
additional_attributes.append((max_col + min_col) / 2.0)
additional_attributes.append((max_col - min_col) / 2.0)
- self._attribute_cols_metadata += [ColumnMetadata(discrete=False,
- output_dim=1,
- name=f"addi_{col_meta.name}_{ix}",
+ self._attribute_cols_metadata += [ColumnMetadata(discrete=False,
+ output_dim=1,
+ name=f"addi_{col_meta.name}_{ix}",
real=False) for ix in range (1, 3)]
max_col = expand_dims(max_col, axis=1)
min_col = expand_dims(min_col, axis=1)
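The change above means a dataset with no real attribute columns no longer carries a dummy `zeros_attribute` column: `data_attributes` now starts as a zero-width array and only the derived min/max "additional attributes" from the later hunk get appended. A minimal sketch (mine, not part of the commit) of why the zero-width placeholder composes cleanly under concatenation:

```python
# Sketch only: effect of zeros((n, 0)) vs zeros((n, 1)) as the starting point
# for the attribute array when the dataset has no real attribute columns.
import numpy as np

n_sequences = 4
additional = np.random.rand(n_sequences, 2)   # stand-in for the (max+min)/2 and (max-min)/2 attributes

old_start = np.zeros((n_sequences, 1))        # previous behaviour: dummy "zeros_attribute" column
new_start = np.zeros((n_sequences, 0))        # new behaviour: no placeholder column at all

print(np.concatenate([old_start, additional], axis=1).shape)  # (4, 3) -> model sees a meaningless column
print(np.concatenate([new_start, additional], axis=1).shape)  # (4, 2) -> only the derived attributes remain
```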
File 2 of 4
@@ -506,11 +506,11 @@ def sample_from(self, real_attribute_input_noise,
return features, attributes, gen_flags, lengths

def gen_attribute_input_noise(self, num_sample):
- return np.random.normal(
+ return np.random.uniform(low=0, high=1,
size=[num_sample, self.attribute_latent_dim])

def gen_feature_input_noise(self, num_sample, length=1):
- return np.random.normal(
+ return np.random.uniform(low=0, high=1,
size=[num_sample, length, self.feature_latent_dim])

def gen_feature_input_data_free(self, num_sample):
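In the second file, the latent noise fed to the attribute and feature generators switches from a standard normal to a uniform distribution on [0, 1). A short before/after sketch, using the same shapes as the diff (not the library's code):

```python
# Sketch only: old vs. new latent-noise sampling for the attribute generator.
import numpy as np

num_sample, attribute_latent_dim = 3, 5

old_noise = np.random.normal(size=[num_sample, attribute_latent_dim])                  # N(0, 1), unbounded
new_noise = np.random.uniform(low=0, high=1, size=[num_sample, attribute_latent_dim])  # U[0, 1), bounded

print(old_noise.min(), old_noise.max())   # may fall well outside [0, 1]
print(new_noise.min(), new_noise.max())   # always within [0, 1)
```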
File 3 of 4
@@ -50,7 +50,7 @@ def fit(self, data: DataFrame,

if self._sequence_length % self._sample_length != 0:
raise ValueError("The sequence length must be a multiple of the sample length.")

data_features, data_attributes = self.processor.transform(data)
measurement_cols_metadata = self.processor.measurement_cols_metadata
attribute_cols_metadata = self.processor.attribute_cols_metadata
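The third file's hunk is mostly context around the existing guard in `fit`: each sequence is generated in blocks of `sample_length` time steps, so the sequence length has to divide evenly by the sample length before the processor's `transform` runs. A toy illustration of the check, with hypothetical values:

```python
# Toy illustration (hypothetical values) of the divisibility guard in fit.
sequence_length, sample_length = 56, 8

if sequence_length % sample_length != 0:
    raise ValueError("The sequence length must be a multiple of the sample length.")

samples_per_sequence = sequence_length // sample_length
print(samples_per_sequence)  # 7 blocks of 8 time steps per sequence
```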
File 4 of 4
@@ -157,7 +157,7 @@ def build(self, attribute_input_noise, addi_attribute_input_noise,
if attribute is None:
all_attribute = []
all_discrete_attribute = []
- if len(self.addi_attribute_outputs) > 0:
+ if len(self.addi_attribute_outputs) > 0 and len(self.real_attribute_outputs) > 0:
all_attribute_input_noise = \
[attribute_input_noise,
addi_attribute_input_noise]
@@ -169,6 +169,11 @@ def build(self, attribute_input_noise, addi_attribute_input_noise,
all_attribute_out_dim = \
[self.real_attribute_out_dim,
self.addi_attribute_out_dim]
+ elif len(self.addi_attribute_outputs) > 0:
+ all_attribute_input_noise = [addi_attribute_input_noise]
+ all_attribute_outputs = [self.addi_attribute_outputs]
+ all_attribute_part_name = [self.STR_ADDI]
+ all_attribute_out_dim = [self.addi_attribute_out_dim]
else:
all_attribute_input_noise = [attribute_input_noise]
all_attribute_outputs = [self.real_attribute_outputs]
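The last file adjusts the generator's `build` method so it handles the case where only the derived additional attributes exist, which can now happen because the processor no longer injects a dummy real attribute. A condensed sketch of the resulting three-way branch (simplified names, no TensorFlow plumbing, not the library's actual code):

```python
# Sketch only: the three cases for assembling attribute-generation inputs.
def select_attribute_parts(real_attribute_outputs, addi_attribute_outputs,
                           attribute_input_noise, addi_attribute_input_noise):
    if len(addi_attribute_outputs) > 0 and len(real_attribute_outputs) > 0:
        # Both real attributes and the derived min/max ("additional") attributes exist.
        noises = [attribute_input_noise, addi_attribute_input_noise]
        outputs = [real_attribute_outputs, addi_attribute_outputs]
    elif len(addi_attribute_outputs) > 0:
        # No real attributes: generate only the additional attributes.
        noises = [addi_attribute_input_noise]
        outputs = [addi_attribute_outputs]
    else:
        # Only real attributes, no additional ones.
        noises = [attribute_input_noise]
        outputs = [real_attribute_outputs]
    return noises, outputs
```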
