Add push_to_hub with multiple configs docs (#6226)

add push_to_hub with multiple configs docs
huggingface · Sep 8, 2023 · d058d6e · d058d6e
1 parent 0a068db
commit d058d6e
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 2 deletions.
diff --git a/docs/source/upload_dataset.mdx b/docs/source/upload_dataset.mdx
@@ -103,6 +103,8 @@ To set your dataset as private, set the `private` parameter to `True`. This para
 >>> dataset.push_to_hub("stevhliu/private_processed_demo", private=True)
 ```
 
+To add a new configuration (or subset) to a dataset or to add a new split (train/validation/test), please refer to the [`Dataset.push_to_hub`] documentation.
+
 ### Privacy
 
 A private dataset is only accessible by you. Similarly, if you share a dataset within your organization, then members of the organization can also access the dataset.

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -5376,7 +5376,7 @@ def push_to_hub(
  `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace
  of the logged-in user.
  config_name (`str`, defaults to "default"):
- The configuration name of a dataset. Defaults to "default"
+ The configuration name (or subset) of a dataset. Defaults to "default".
  split (`str`, *optional*):
  The name of the split that will be given to that dataset. Defaults to `self.split`.
  private (`bool`, *optional*, defaults to `False`):
@@ -5405,10 +5405,31 @@ def push_to_hub(
 
  ```python
  >>> dataset.push_to_hub("<organization>/<dataset_id>")
- >>> dataset.push_to_hub("<organization>/<dataset_id>", split="validation")
+ >>> dataset.push_to_hub("<organization>/<dataset_id>", private=True)
  >>> dataset.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
  >>> dataset.push_to_hub("<organization>/<dataset_id>", num_shards=1024)
  ```
+
+ If your dataset has multiple splits (e.g. train/validation/test):
+
+ ```python
+ >>> train_dataset.push_to_hub("<organization>/<dataset_id>", split="train")
+ >>> val_dataset.push_to_hub("<organization>/<dataset_id>", split="validation")
+ >>> # later
+ >>> dataset = load_dataset("<organization>/<dataset_id>")
+ >>> train_dataset = dataset["train"]
+ >>> val_dataset = dataset["validation"]
+ ```
+
+ If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages):
+
+ ```python
+ >>> english_dataset.push_to_hub("<organization>/<dataset_id>", "en")
+ >>> french_dataset.push_to_hub("<organization>/<dataset_id>", "fr")
+ >>> # later
+ >>> english_dataset = load_dataset("<organization>/<dataset_id>", "en")
+ >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
+ ```
  """
  if config_name == "data":
  raise ValueError("`config_name` cannot be 'data'. Please, choose another name for configuration.")

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -1613,6 +1613,16 @@ def push_to_hub(
  >>> dataset_dict.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
  >>> dataset_dict.push_to_hub("<organization>/<dataset_id>", num_shards={"train": 1024, "test": 8})
  ```
+
+ If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages):
+
+ ```python
+ >>> english_dataset.push_to_hub("<organization>/<dataset_id>", "en")
+ >>> french_dataset.push_to_hub("<organization>/<dataset_id>", "fr")
+ >>> # later
+ >>> english_dataset = load_dataset("<organization>/<dataset_id>", "en")
+ >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
+ ```
  """
 
  if num_shards is None: