Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ def update_metadata_with_features(table: Table, features: Features):


def _check_table(table) -> Table:
"""We check the table type to make sure it's an instance of :class:`datasets.table.Table`"""
"""We check the table type to make sure it's an instance of `datasets.table.Table`"""
if isinstance(table, pa.Table):
# for a pyarrow table, we can just consider it as an in-memory table
# this is here for backward compatibility
Expand Down Expand Up @@ -1760,7 +1760,7 @@ def _build_local_temp_path(uri_or_path: str) -> Path:
`"s3://my-bucket/dataset/train"`) to concatenate.

Returns:
:class:`Path`: the concatenated path (temp dir + path)
`Path`: the concatenated path (temp dir + path)
"""
src_dataset_path = Path(uri_or_path)
tmp_dir = get_temporary_cache_files_directory()
Expand Down Expand Up @@ -2566,8 +2566,8 @@ def iter(self, batch_size: int, drop_last_batch: bool = False):
selected format.

Args:
batch_size (:obj:`int`): size of each batch to yield.
drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
batch_size (`int`): size of each batch to yield.
drop_last_batch (`bool`, default `False`): Whether a last batch smaller than the batch_size should be
dropped
"""
if self._indices is None:
Expand Down Expand Up @@ -6535,13 +6535,13 @@ def _concatenate_map_style_datasets(
axis: int = 0,
):
"""
Converts a list of :class:`Dataset` with the same schema into a single :class:`Dataset`.
Converts a list of `Dataset` with the same schema into a single `Dataset`.
When you concatenate on axis 0, missing data are filled with None values.

Args:
dsets (`List[datasets.Dataset]`): List of Datasets to concatenate.
info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
split (:class:`NamedSplit`, optional): Name of the dataset split.
info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
split (`NamedSplit`, optional): Name of the dataset split.
axis (``{0, 1}``, default ``0``, meaning over rows):
Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` means over columns
(horizontally).
Expand Down Expand Up @@ -6664,8 +6664,8 @@ def _interleave_map_style_datasets(
probabilities (`List[float]`, optional, default None): If specified, the new dataset is constructed by sampling
examples from one source at a time according to these probabilities.
seed (`int`, optional, default None): The random seed used to choose a source for each example.
info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
split (:class:`NamedSplit`, optional): Name of the dataset split.
info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
split (`NamedSplit`, optional): Name of the dataset split.
stopping_strategy (`str`, defaults to `first_exhausted`):
Two strategies are proposed right now.
By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.
Expand All @@ -6674,10 +6674,10 @@ def _interleave_map_style_datasets(
Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
- with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
- with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
**kwargs (additional keyword arguments): Keyword arguments to be passed to :meth:`datasets.Dataset.select` when selecting the indices used to interleave the datasets.
**kwargs (additional keyword arguments): Keyword arguments to be passed to `datasets.Dataset.select` when selecting the indices used to interleave the datasets.

Output:
:class:`datasets.Dataset`
`datasets.Dataset`
"""
if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
raise ValueError(
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ class TypedSequence:
in order to get an extension array.
- Support for ``try_type`` parameter that can be used instead of ``type``:
When an array is transformed, we like to keep the same type as before if possible.
For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
For example when calling `datasets.Dataset.map`, we don't want to change the type
of each column by default.
- Better error message when a pyarrow array overflows.

Expand Down
4 changes: 2 additions & 2 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,10 +663,10 @@ def set_transform(
):
"""Set ``__getitem__`` return format using this transform. The transform is applied on-the-fly on batches when ``__getitem__`` is called.
The transform is set for every dataset in the dataset dictionary
As :func:`datasets.Dataset.set_format`, this can be reset using :func:`datasets.Dataset.reset_format`
As `datasets.Dataset.set_format`, this can be reset using `datasets.Dataset.reset_format`

Args:
transform (`Callable`, optional): user-defined formatting transform, replaces the format defined by :func:`datasets.Dataset.set_format`
transform (`Callable`, optional): user-defined formatting transform, replaces the format defined by `datasets.Dataset.set_format`
A formatting function is a callable that takes a batch (as a dict) as input and returns a batch.
This function is applied right before returning the objects in ``__getitem__``.
columns (`list[str]`, optional): columns to format in the output
Expand Down
20 changes: 10 additions & 10 deletions src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -1461,10 +1461,10 @@ def generate_from_dict(obj: Any):

generate_from_dict is the recursive helper for Features.from_dict, and allows for a convenient constructor syntax
to define features from deserialized JSON dictionaries. This function is used in particular when deserializing
a :class:`DatasetInfo` that was dumped to a JSON object. This acts as an analogue to
:meth:`Features.from_arrow_schema` and handles the recursive field-by-field instantiation, but doesn't require any
a `DatasetInfo` that was dumped to a JSON object. This acts as an analogue to
`Features.from_arrow_schema` and handles the recursive field-by-field instantiation, but doesn't require any
Comment on lines +1464 to +1465
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it's a dataset class or method, you can use brackets to render it as a link :)

Suggested change
a `DatasetInfo` that was dumped to a JSON object. This acts as an analogue to
`Features.from_arrow_schema` and handles the recursive field-by-field instantiation, but doesn't require any
a [`DatasetInfo`] that was dumped to a JSON object. This acts as an analogue to
[`Features.from_arrow_schema`] and handles the recursive field-by-field instantiation, but doesn't require any

mapping to/from pyarrow, except for the fact that it takes advantage of the mapping of pyarrow primitive dtypes
that :class:`Value` automatically performs.
that `Value` automatically performs.
"""
# Nested structures: we allow dict, list/tuples, sequences
if isinstance(obj, list):
Expand Down Expand Up @@ -1665,10 +1665,10 @@ def require_decoding(feature: FeatureType, ignore_decode_attribute: bool = False

Args:
feature (FeatureType): the feature type to be checked
ignore_decode_attribute (:obj:`bool`, default ``False``): Whether to ignore the current value
ignore_decode_attribute (`bool`, default ``False``): Whether to ignore the current value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ignore_decode_attribute (`bool`, default ``False``): Whether to ignore the current value
ignore_decode_attribute (`bool`, default `False`): Whether to ignore the current value

of the `decode` attribute of the decodable feature types.
Returns:
:obj:`bool`
`bool`
"""
if isinstance(feature, dict):
return any(require_decoding(f) for f in feature.values())
Expand All @@ -1690,7 +1690,7 @@ def require_storage_cast(feature: FeatureType) -> bool:
Args:
feature (FeatureType): the feature type to be checked
Returns:
:obj:`bool`
`bool`
"""
if isinstance(feature, dict):
return any(require_storage_cast(f) for f in feature.values())
Expand All @@ -1708,7 +1708,7 @@ def require_storage_embed(feature: FeatureType) -> bool:
Args:
feature (FeatureType): the feature type to be checked
Returns:
:obj:`bool`
`bool`
"""
if isinstance(feature, dict):
return any(require_storage_cast(f) for f in feature.values())
Expand All @@ -1722,7 +1722,7 @@ def require_storage_embed(feature: FeatureType) -> bool:

def keep_features_dicts_synced(func):
"""
Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the :class:`datasets.Features` object
Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the `datasets.Features` object
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use the ~ to hide the "datasets" part from rendering in the link

Suggested change
Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the `datasets.Features` object
Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the [`~datasets.Features`] object

in sync with the main dictionary.
"""

Expand Down Expand Up @@ -1812,7 +1812,7 @@ def type(self):
Features field types.

Returns:
:obj:`pyarrow.DataType`
`pyarrow.DataType`
"""
return get_nested_type(self)

Expand All @@ -1822,7 +1822,7 @@ def arrow_schema(self):
Features schema.

Returns:
:obj:`pyarrow.Schema`
`pyarrow.Schema`
"""
hf_metadata = {"info": {"features": self.to_dict()}}
return pa.schema(self.type).with_metadata({"huggingface": json.dumps(hf_metadata)})
Expand Down
8 changes: 4 additions & 4 deletions src/datasets/filesystems/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ def __init__(
without the compression extension at the end of the filename.

Args:
fo (:obj:``str``): Path to compressed file. Will fetch file using ``fsspec.open()``
mode (:obj:``str``): Currently, only 'rb' accepted
target_protocol (:obj:``str``, optional): To override the FS protocol inferred from a URL.
target_options (:obj:``dict``, optional): Kwargs passed when instantiating the target FS.
fo (``str``): Path to compressed file. Will fetch file using ``fsspec.open()``
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets remove all the double backticks here

Suggested change
fo (``str``): Path to compressed file. Will fetch file using ``fsspec.open()``
fo (`str`): Path to compressed file. Will fetch file using ``fsspec.open()``

mode (``str``): Currently, only 'rb' accepted
target_protocol (``str``, optional): To override the FS protocol inferred from a URL.
target_options (``dict``, optional): Kwargs passed when instantiating the target FS.
"""
super().__init__(self, **kwargs)
self.fo = fo.__fspath__() if hasattr(fo, "__fspath__") else fo
Expand Down
12 changes: 6 additions & 6 deletions src/datasets/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,24 +386,24 @@ def fingerprint_transform(
"""
Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``
Args:
inplace (:obj:`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
inplace (`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
setting the fingerprint of the returned Dataset.
use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
use_kwargs (`List[str]`, optional): optional white list of argument names to take into account
to update the fingerprint to the wrapped method that should take care of
setting the fingerprint of the returned Dataset. By default all the arguments are used.
ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to take into account
ignore_kwargs (`List[str]`, optional): optional black list of argument names to take into account
to update the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
fingerprint_names (`List[str]`, optional, defaults to ["new_fingerprint"]):
If the dataset transforms is not inplace and returns a DatasetDict, then it can require
several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
one fingerprint named after each element of fingerprint_names is going to be passed.
randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
randomized_function (`bool`, defaults to False): If the dataset transform is random and has
optional parameters "seed" and "generator", then you can set randomized_function to True.
This way, even if users set "seed" and "generator" to None, then the fingerprint is
going to be randomly generated depending on numpy's current state. In this case, the
generator is set to np.random.default_rng(np.random.get_state()[1][0]).
version (:obj:`str`, optional): version of the transform. The version is taken into account when
version (`str`, optional): version of the transform. The version is taken into account when
computing the fingerprint. If a dataset transform changes (or at least if the output data
that are cached changes), then one should increase the version. If the version stays the
same, then old cached data could be reused that are not compatible with the new transform.
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/formatting/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _query_table_with_indices_mapping(
) -> pa.Table:
"""
Query a pyarrow Table to extract the subtable that correspond to the given key.
The :obj:`indices` parameter corresponds to the indices mapping in case we want to take into
The `indices` parameter corresponds to the indices mapping in case we want to take into
account a shuffling or an indices selection for example.
The indices table must contain one column named "indices" of type uint64.
"""
Expand Down Expand Up @@ -634,9 +634,9 @@ def format_table(
the table as either a row, a column or a batch.
formatter (``datasets.formatting.formatting.Formatter``): Any subclass of a Formatter such as
PythonFormatter, NumpyFormatter, etc.
format_columns (:obj:`List[str]`, optional): if not None, it defines the columns that will be formatted using the
format_columns (`List[str]`, optional): if not None, it defines the columns that will be formatted using the
given formatter. Other columns are discarded (unless ``output_all_columns`` is True)
output_all_columns (:obj:`bool`, defaults to False): If True, the formatted output is completed using the columns
output_all_columns (`bool`, defaults to False): If True, the formatted output is completed using the columns
that are not in the ``format_columns`` list. For these columns, the PythonFormatter is used.
Expand Down
12 changes: 6 additions & 6 deletions src/datasets/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,14 +253,14 @@ def get_dataset_config_info(
e.g. `'./dataset/squad'`
- a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
config_name (`str`, optional): Defining the name of the dataset configuration.
data_files (`str` or `Sequence` or `Mapping`, optional): Path(s) to source data file(s).
download_config (`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (`DownloadMode` or `str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
revision (`Version` or `str`, optional): Version of the dataset to load.
As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
token (``str`` or `bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, or not specified, will get token from `"~/.huggingface"`.
**config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.

Expand Down
4 changes: 2 additions & 2 deletions src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2572,8 +2572,8 @@ def iter(self, batch_size: int, drop_last_batch: bool = False):
"""Iterate through the batches of size `batch_size`.

Args:
batch_size (:obj:`int`): size of each batch to yield.
drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
batch_size (`int`): size of each batch to yield.
drop_last_batch (`bool`, default `False`): Whether a last batch smaller than the batch_size should be
dropped
"""

Expand Down
10 changes: 5 additions & 5 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,14 +855,14 @@ def dataset_module_factory(
-> load a generic dataset builder (csv, text etc.) based on the content of the repository
e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.

revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
revision (`Version` or `str`, optional): Version of the dataset to load.
As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
download_config (`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (`DownloadMode` or `str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
data_dir (`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`.
data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
data_files (`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
cache_dir (`str`, *optional*):
Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.

Expand Down
4 changes: 2 additions & 2 deletions src/datasets/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def extend_module_for_streaming(module_path, download_config: Optional[DownloadC
- `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)

The patched functions are replaced with custom functions defined to work with the
:class:`~download.streaming_download_manager.StreamingDownloadManager`.
`StreamingDownloadManager`.

Args:
module_path: Path to the module to be extended.
Expand Down Expand Up @@ -111,7 +111,7 @@ def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
"""Extend the dataset builder module and the modules imported by it to support streaming.

Args:
builder (:class:`DatasetBuilder`): Dataset builder instance.
builder (`DatasetBuilder`): Dataset builder instance.
"""
# this extends the open and os.path.join functions for data streaming
download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)
Expand Down
Loading