Skip to content

Commit 611bd22

Browse files
docs: fix examples for time-series synthetic data generation (#137)
* docs: fix examples for time-series synthetic data generation & update docs * fix(linting): code formatting --------- Co-authored-by: Azory YData Bot <[email protected]>
1 parent 233459f commit 611bd22

File tree

4 files changed

+88
-8
lines changed

4 files changed

+88
-8
lines changed

docs/sdk/examples/synthesize_tabular_data.md

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
**Use YData's *RegularSynthesizer* to generate tabular synthetic data**
44

5+
For a more detailed tutorial please check [YData Fabric Academy ydata-sdk notebooks](https://github.com/ydataai/academy/tree/master).
6+
57
```python
68
--8<-- "examples/synthesizer_from_pandas.py"
79
```

docs/sdk/examples/synthesize_timeseries_data.md

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ Dissecting any time-series dataset, we see differences in variables' behavior th
1414
- Variables that refer to entities (single or multiple entities)
1515
- Variables that are attributes (those that don't depend on time but rather on the entity)
1616

17+
For a more detailed tutorial please check [YData Fabric Academy ydata-sdk notebooks](https://github.com/ydataai/academy/tree/master).
18+
1719
Below find an example:
1820

1921
```python
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,52 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Authentication
14
import os
25

36
from ydata.sdk.dataset import get_dataset
47
from ydata.sdk.synthesizers import TimeSeriesSynthesizer
58

69
# Do not forget to add your token as env variable
7-
os.environ["YDATA_TOKEN"] = '<TOKEN>'
10+
os.environ["YDATA_TOKEN"] = '{insert-token}'
11+
12+
13+
# Sample an example multi-entity, multivariate time-series dataset
14+
15+
# Generate the dataset
16+
time_series_data = get_dataset('timeseries')
817

9-
X = get_dataset('occupancy')
18+
# Print the first few rows of the dataset
19+
print(time_series_data.head())
20+
21+
# Train a Synthetic data generator
22+
23+
# From a pandas dataframe
1024

1125
# We initialize a time series synthesizer
1226
# As long as the synthesizer does not call `fit`, it exists only locally
13-
synth = TimeSeriesSynthesizer()
27+
synth = TimeSeriesSynthesizer(name='Time-series synth')
1428

1529
# We train the synthesizer on our dataset
1630
# sortbykey -> variable that defines the time order of the sequence
17-
synth.fit(X, sortbykey='date')
31+
synth.fit(time_series_data, sortbykey='time', entities='entity_id')
32+
33+
# Generate samples from an already trained synthesizer
34+
# From the synthesizer in context in the notebook
35+
36+
37+
# Generate a sample with x number of entities
38+
# In this example the objective is to generate a dataset with the same size as the original. For that reason, 5 entities will be generated.
39+
sample = synth.sample(n_entities=5)
40+
41+
sample.head()
42+
43+
# From a previously trained synthetic data generation model
44+
# List the trained synthetic data generators to get the synthesizer's uid
45+
TimeSeriesSynthesizer.list()
46+
47+
synth = TimeSeriesSynthesizer(uid='{insert-synth-id}').get()
48+
49+
# Generate a new synthetic dataset with the sample method
50+
sample = synth.sample(n_entities=5)
1851

19-
# By default it is requested a synthetic sample with the same length as the original data
20-
# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected
21-
sample = synth.sample(n_entities=1)
52+
sample.head()

src/ydata/sdk/dataset/dataset.py

+46-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,55 @@
1+
import numpy as np
12
from numpy import int64
23
from pandas import DataFrame as pdDataFrame
34
from pandas import read_csv, to_datetime
45

56
from ydata.sdk.utils.cache import cache_file
67

78

9+
def get_timeseries() -> pdDataFrame:
    """Build a synthetic multivariate, multi-entity time-series example dataset.

    The data is generated in memory from NumPy's global random state, so the
    values differ between calls unless the caller seeds ``np.random`` first.

    Returns:
        A pandas DataFrame with the columns ``entity_id``, ``time``,
        ``feature_0``, ``feature_1`` and ``feature_2``.
    """
    def generate_multivariate_multientity_timeseries(num_rows=1000, num_entities=5, num_timesteps=10):
        """Generates a multivariate, multi-entity time series dataset.

        The first ``num_entities * num_timesteps`` rows form a regular grid
        (one row per entity and time step, ordered by entity then time). If
        that is fewer than ``num_rows``, extra rows with randomly drawn
        ``entity_id`` and ``time`` values are appended until ``num_rows`` is
        reached.

        Args:
            num_rows: The minimum number of rows in the dataset. Defaults to
                1000. The grid is never truncated, so the result has
                ``max(num_rows, num_entities * num_timesteps)`` rows.
            num_entities: The number of entities in the dataset. Defaults to 5.
            num_timesteps: The number of time steps for each entity. Defaults to 10.

        Returns:
            A pandas DataFrame representing the time-series dataset.

        Raises:
            ValueError: If a size is negative, or if padding rows are needed
                but there is no entity or time step to draw them from.
        """
        if num_entities < 0 or num_timesteps < 0:
            raise ValueError(
                "num_entities and num_timesteps must be non-negative")

        # Regular grid: entity 0 at t=0..T-1, then entity 1 at t=0..T-1, ...
        entity_ids = np.repeat(
            np.arange(num_entities, dtype=np.int64), num_timesteps)
        times = np.tile(
            np.arange(num_timesteps, dtype=np.int64), num_entities)

        # Adding more rows to meet the desired number of rows
        additional_rows = max(0, num_rows - entity_ids.size)
        if additional_rows:
            if num_entities == 0 or num_timesteps == 0:
                raise ValueError(
                    "cannot pad the dataset to num_rows without at least one "
                    "entity and one time step")
            # NOTE: the padding is drawn with replacement from the same
            # (entity_id, time) grid, so these rows repeat existing key pairs
            # and are appended in random order.
            entity_ids = np.concatenate([
                entity_ids,
                np.random.randint(
                    0, num_entities, size=additional_rows, dtype=np.int64),
            ])
            times = np.concatenate([
                times,
                np.random.randint(
                    0, num_timesteps, size=additional_rows, dtype=np.int64),
            ])

        columns = {'entity_id': entity_ids, 'time': times}
        for feature in range(3):
            # Simulate some random data, uniform in [0, 1)
            columns[f'feature_{feature}'] = np.random.rand(entity_ids.size)

        return pdDataFrame(columns)

    return generate_multivariate_multientity_timeseries()
51+
52+
853
def get_census() -> pdDataFrame:
954
file_name = cache_file(
1055
"census_train.csv",
@@ -75,7 +120,7 @@ def get_dataset(name: str):
75120
'census': get_census,
76121
'titanic': get_titanic,
77122
'airquality': get_airquality,
78-
'occupancy': get_occupancy
123+
'timeseries': get_timeseries
79124
}
80125

81126
if name not in DATASETS:

0 commit comments

Comments
 (0)