docs: fix examples for time-series synthetic data generation (#137)

fabclmnt · azory-ydata · web-flow · commit 611bd22ea2ca · 2024-11-18T17:38:53.000Z
* docs: fix examples for time-series synthetic data generation & update docs

* fix(linting): code formatting

---------

Co-authored-by: Azory YData Bot <azory@ydata.ai>
diff --git a/docs/sdk/examples/synthesize_tabular_data.md b/docs/sdk/examples/synthesize_tabular_data.md
@@ -2,6 +2,8 @@
 
 **Use YData's *RegularSynthesizer* to generate tabular synthetic data**
 
+For a more detailed tutorial, please check the [YData Fabric Academy ydata-sdk notebooks](https://github.com/ydataai/academy/tree/master).
+
 ```python
 --8<-- "examples/synthesizer_from_pandas.py"
 ```
diff --git a/docs/sdk/examples/synthesize_timeseries_data.md b/docs/sdk/examples/synthesize_timeseries_data.md
@@ -14,6 +14,8 @@ Dissecting any time-series dataset, we see differences in variables' behavior th
 - Variables that refer to entities (single or multiple entities)
 - Variables that are attributes (those that don't depend on time but rather on the entity)
 
+For a more detailed tutorial, please check the [YData Fabric Academy ydata-sdk notebooks](https://github.com/ydataai/academy/tree/master).
+
 Below find an example:
 
 ```python
diff --git a/examples/synthesizers/time_series_quickstart.py b/examples/synthesizers/time_series_quickstart.py
@@ -1,21 +1,52 @@
+# -*- coding: utf-8 -*-
+
+# Authentication
 import os
 
 from ydata.sdk.dataset import get_dataset
 from ydata.sdk.synthesizers import TimeSeriesSynthesizer
 
 # Do not forget to add your token as env variable
-os.environ["YDATA_TOKEN"] = '<TOKEN>'
+os.environ["YDATA_TOKEN"] = '{insert-token}'
+
+
+# Sampling an example dataset for a multi-entity & multivariate time-series dataset
+
+# Generate the dataset
+time_series_data = get_dataset('timeseries')
 
-X = get_dataset('occupancy')
+# Print the first few rows of the dataset
+print(time_series_data.head())
+
+# Train a Synthetic data generator
+
+# From a pandas dataframe
 
 # We initialize a time series synthesizer
 # As long as the synthesizer does not call `fit`, it exists only locally
-synth = TimeSeriesSynthesizer()
+synth = TimeSeriesSynthesizer(name='Time-series synth')
 
 # We train the synthesizer on our dataset
 # sortbykey -> variable that define the time order for the sequence
-synth.fit(X, sortbykey='date')
+synth.fit(time_series_data, sortbykey='time', entities='entity_id')
+
+# Generate samples from an already trained synthesizer
+# From the synthesizer in context in the notebook
+
+
+# Generate a sample with x number of entities
+# In this example the objective is to generate a dataset with the same size as the original. For that reason, 5 entities will be generated.
+sample = synth.sample(n_entities=5)
+
+sample.head()
+
+# From a previously trained synthetic data generation model
+# List the trained synthetic data generators to get the uid of the synthesizer
+TimeSeriesSynthesizer.list()
+
+synth = TimeSeriesSynthesizer(uid='{insert-synth-id}').get()
+
+# Generate a new synthetic dataset with the sample method
+sample = synth.sample(n_entities=5)
 
-# By default it is requested a synthetic sample with the same length as the original data
-# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected
-sample = synth.sample(n_entities=1)
+sample.head()
diff --git a/src/ydata/sdk/dataset/dataset.py b/src/ydata/sdk/dataset/dataset.py
@@ -1,10 +1,55 @@
+import numpy as np
 from numpy import int64
 from pandas import DataFrame as pdDataFrame
 from pandas import read_csv, to_datetime
 
 from ydata.sdk.utils.cache import cache_file
 
 
+def get_timeseries() -> pdDataFrame:
+    def generate_multivariate_multientity_timeseries(num_rows=1000, num_entities=5, num_timesteps=10):
+        """Generates a multivariate, multi-entity time series dataset.
+
+        Args:
+            num_rows: Minimum number of rows; the entity/time grid is padded with random rows up to this count. Defaults to 1000.
+            num_entities: The number of distinct entity ids (0 to num_entities - 1). Defaults to 5.
+            num_timesteps: The number of time steps (0 to num_timesteps - 1) generated for each entity. Defaults to 10.
+
+        Returns:
+            A pandas DataFrame with columns entity_id, time, feature_0, feature_1 and feature_2.
+        """
+
+        data = []
+        for entity in range(num_entities):
+            for t in range(num_timesteps):
+                row = {
+                    'entity_id': entity,
+                    'time': t
+                }
+                for feature in range(3):
+                    # Each feature is an independent draw from np.random.rand()
+                    row[f'feature_{feature}'] = np.random.rand()
+                data.append(row)
+
+        # Pad up to num_rows with rows at randomly chosen (entity, time) pairs; these can repeat pairs already present
+        additional_rows = max(0, num_rows - len(data))
+        for _ in range(additional_rows):
+            entity = np.random.randint(0, num_entities)
+            t = np.random.randint(0, num_timesteps)
+            row = {
+                'entity_id': entity,
+                'time': t
+            }
+            for feature in range(3):
+                row[f'feature_{feature}'] = np.random.rand()
+            data.append(row)
+        df = pdDataFrame(data)
+
+        return df
+
+    return generate_multivariate_multientity_timeseries()
+
+
 def get_census() -> pdDataFrame:
     file_name = cache_file(
         "census_train.csv",
@@ -75,7 +120,7 @@ def get_dataset(name: str):
         'census': get_census,
         'titanic': get_titanic,
         'airquality': get_airquality,
-        'occupancy': get_occupancy
+        'timeseries': get_timeseries
     }
 
     if name not in DATASETS: