huggingface · Pchambet · Feb 22, 2026 · stevhliu · Feb 23, 2026 · stevhliu
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -600,7 +600,7 @@ def update_metadata_with_features(table: Table, features: Features):
 
 
 def _check_table(table) -> Table:
-    """We check the table type to make sure it's an instance of :class:`datasets.table.Table`"""
+    """We check the table type to make sure it's an instance of `datasets.table.Table`"""
     if isinstance(table, pa.Table):
         # for a pyarrow table, we can just consider it as a in-memory table
         # this is here for backward compatibility
@@ -1760,7 +1760,7 @@ def _build_local_temp_path(uri_or_path: str) -> Path:
                 `"s3://my-bucket/dataset/train"`) to concatenate.
 
         Returns:
-            :class:`Path`: the concatenated path (temp dir + path)
+            `Path`: the concatenated path (temp dir + path)
         """
         src_dataset_path = Path(uri_or_path)
         tmp_dir = get_temporary_cache_files_directory()
@@ -2566,8 +2566,8 @@ def iter(self, batch_size: int, drop_last_batch: bool = False):
         selected format.
 
         Args:
-            batch_size (:obj:`int`): size of each batch to yield.
-            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
+            batch_size (`int`): size of each batch to yield.
+            drop_last_batch (`bool`, default `False`): Whether a last batch smaller than the batch_size should be
                 dropped
         """
         if self._indices is None:
@@ -6535,13 +6535,13 @@ def _concatenate_map_style_datasets(
     axis: int = 0,
 ):
     """
-    Converts a list of :class:`Dataset` with the same schema into a single :class:`Dataset`.
+    Converts a list of `Dataset` with the same schema into a single `Dataset`.
     When you concatenate on axis 0, missing data are filled with None values.
 
     Args:
         dsets (`List[datasets.Dataset]`): List of Datasets to concatenate.
-        info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
-        split (:class:`NamedSplit`, optional): Name of the dataset split.
+        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
+        split (`NamedSplit`, optional): Name of the dataset split.
         axis (``{0, 1}``, default ``0``, meaning over rows):
             Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` means over columns
             (horizontally).
@@ -6664,8 +6664,8 @@ def _interleave_map_style_datasets(
         probabilities (`List[float]`, optional, default None): If specified, the new dataset is constructed by sampling
             examples from one source at a time according to these probabilities.
         seed (`int`, optional, default None): The random seed used to choose a source for each example.
-        info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
-        split (:class:`NamedSplit`, optional): Name of the dataset split.
+        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
+        split (`NamedSplit`, optional): Name of the dataset split.
         stopping_strategy (`str`, defaults to `first_exhausted`):
             Two strategies are proposed right now.
             By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
@@ -6674,10 +6674,10 @@ def _interleave_map_style_datasets(
             Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
             - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
             - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
-        **kwargs (additional keyword arguments): Keyword arguments to be passed to :meth:`datasets.Datasets.select` when selecting the indices used to interleave the datasets.
+        **kwargs (additional keyword arguments): Keyword arguments to be passed to `datasets.Dataset.select` when selecting the indices used to interleave the datasets.
 
     Output:
-        :class:`datasets.Dataset`
+        `datasets.Dataset`
     """
     if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
         raise ValueError(

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
@@ -169,7 +169,7 @@ class TypedSequence:
         in order to get an extension array.
     - Support for ``try_type`` parameter that can be used instead of ``type``:
         When an array is transformed, we like to keep the same type as before if possible.
-        For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
+        For example when calling `datasets.Dataset.map`, we don't want to change the type
         of each column by default.
     - Better error message when a pyarrow array overflows.
 

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -663,10 +663,10 @@ def set_transform(
     ):
         """Set ``__getitem__`` return format using this transform. The transform is applied on-the-fly on batches when ``__getitem__`` is called.
         The transform is set for every dataset in the dataset dictionary
-        As :func:`datasets.Dataset.set_format`, this can be reset using :func:`datasets.Dataset.reset_format`
+        As `datasets.Dataset.set_format`, this can be reset using `datasets.Dataset.reset_format`
 
         Args:
-            transform (`Callable`, optional): user-defined formatting transform, replaces the format defined by :func:`datasets.Dataset.set_format`
+            transform (`Callable`, optional): user-defined formatting transform, replaces the format defined by `datasets.Dataset.set_format`
                 A formatting function is a callable that takes a batch (as a dict) as input and returns a batch.
                 This function is applied right before returning the objects in ``__getitem__``.
             columns (`list[str]`, optional): columns to format in the output

diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
@@ -1461,10 +1461,10 @@ def generate_from_dict(obj: Any):
 
     generate_from_dict is the recursive helper for Features.from_dict, and allows for a convenient constructor syntax
     to define features from deserialized JSON dictionaries. This function is used in particular when deserializing
-    a :class:`DatasetInfo` that was dumped to a JSON object. This acts as an analogue to
-    :meth:`Features.from_arrow_schema` and handles the recursive field-by-field instantiation, but doesn't require any
+    a [`DatasetInfo`] that was dumped to a JSON object. This acts as an analogue to
+    [`Features.from_arrow_schema`] and handles the recursive field-by-field instantiation, but doesn't require any
     mapping to/from pyarrow, except for the fact that it takes advantage of the mapping of pyarrow primitive dtypes
-    that :class:`Value` automatically performs.
+    that `Value` automatically performs.
     """
     # Nested structures: we allow dict, list/tuples, sequences
     if isinstance(obj, list):
@@ -1665,10 +1665,10 @@ def require_decoding(feature: FeatureType, ignore_decode_attribute: bool = False
 
     Args:
         feature (FeatureType): the feature type to be checked
-        ignore_decode_attribute (:obj:`bool`, default ``False``): Whether to ignore the current value
+        ignore_decode_attribute (`bool`, default `False`): Whether to ignore the current value
             of the `decode` attribute of the decodable feature types.
     Returns:
-        :obj:`bool`
+        `bool`
     """
     if isinstance(feature, dict):
         return any(require_decoding(f) for f in feature.values())
@@ -1690,7 +1690,7 @@ def require_storage_cast(feature: FeatureType) -> bool:
     Args:
         feature (FeatureType): the feature type to be checked
     Returns:
-        :obj:`bool`
+        `bool`
     """
     if isinstance(feature, dict):
         return any(require_storage_cast(f) for f in feature.values())
@@ -1708,7 +1708,7 @@ def require_storage_embed(feature: FeatureType) -> bool:
     Args:
         feature (FeatureType): the feature type to be checked
     Returns:
-        :obj:`bool`
+        `bool`
     """
     if isinstance(feature, dict):
         return any(require_storage_cast(f) for f in feature.values())
@@ -1722,7 +1722,7 @@ def require_storage_embed(feature: FeatureType) -> bool:
 
 def keep_features_dicts_synced(func):
     """
-    Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the :class:`datasets.Features` object
+    Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the [`~datasets.Features`] object
     in sync with the main dictionary.
     """
 
@@ -1812,7 +1812,7 @@ def type(self):
         Features field types.
 
         Returns:
-            :obj:`pyarrow.DataType`
+            `pyarrow.DataType`
         """
         return get_nested_type(self)
 
@@ -1822,7 +1822,7 @@ def arrow_schema(self):
         Features schema.
 
         Returns:
-            :obj:`pyarrow.Schema`
+            `pyarrow.Schema`
         """
         hf_metadata = {"info": {"features": self.to_dict()}}
         return pa.schema(self.type).with_metadata({"huggingface": json.dumps(hf_metadata)})

diff --git a/src/datasets/filesystems/compression.py b/src/datasets/filesystems/compression.py
@@ -27,10 +27,10 @@ def __init__(
         without the compression extension at the end of the filename.
 
         Args:
-            fo (:obj:``str``): Path to compressed file. Will fetch file using ``fsspec.open()``
-            mode (:obj:``str``): Currently, only 'rb' accepted
-            target_protocol(:obj:``str``, optional): To override the FS protocol inferred from a URL.
-            target_options (:obj:``dict``, optional): Kwargs passed when instantiating the target FS.
+            fo (`str`): Path to compressed file. Will fetch file using ``fsspec.open()``
+            mode (`str`): Currently, only 'rb' accepted
+            target_protocol (`str`, optional): To override the FS protocol inferred from a URL.
+            target_options (`dict`, optional): Kwargs passed when instantiating the target FS.
         """
         super().__init__(self, **kwargs)
         self.fo = fo.__fspath__() if hasattr(fo, "__fspath__") else fo

diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py
@@ -386,24 +386,24 @@ def fingerprint_transform(
     """
     Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``
     Args:
-        inplace (:obj:`bool`):  If inplace is True, the fingerprint of the dataset is updated inplace.
+        inplace (`bool`):  If inplace is True, the fingerprint of the dataset is updated inplace.
             Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
             setting the fingerprint of the returned Dataset.
-        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
+        use_kwargs (`List[str]`, optional): optional white list of argument names to take into account
             to update the fingerprint to the wrapped method that should take care of
             setting the fingerprint of the returned Dataset. By default all the arguments are used.
-        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to take into account
+        ignore_kwargs (`List[str]`, optional): optional black list of argument names to take into account
             to update the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
-        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
+        fingerprint_names (`List[str]`, optional, defaults to ["new_fingerprint"]):
             If the dataset transforms is not inplace and returns a DatasetDict, then it can require
             several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
             one fingerprint named after each element of fingerprint_names is going to be passed.
-        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
+        randomized_function (`bool`, defaults to False): If the dataset transform is random and has
             optional parameters "seed" and "generator", then you can set randomized_function to True.
             This way, even if users set "seed" and "generator" to None, then the fingerprint is
             going to be randomly generated depending on numpy's current state. In this case, the
             generator is set to np.random.default_rng(np.random.get_state()[1][0]).
-        version (:obj:`str`, optional): version of the transform. The version is taken into account when
+        version (`str`, optional): version of the transform. The version is taken into account when
             computing the fingerprint. If a datase transform changes (or at least if the output data
             that are cached changes), then one should increase the version. If the version stays the
             same, then old cached data could be reused that are not compatible with the new transform.

diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py
@@ -52,7 +52,7 @@ def _query_table_with_indices_mapping(
 ) -> pa.Table:
     """
     Query a pyarrow Table to extract the subtable that correspond to the given key.
-    The :obj:`indices` parameter corresponds to the indices mapping in case we cant to take into
+    The `indices` parameter corresponds to the indices mapping in case we want to take into
     account a shuffling or an indices selection for example.
     The indices table must contain one column named "indices" of type uint64.
     """
@@ -634,9 +634,9 @@ def format_table(
             the table as either a row, a column or a batch.
         formatter (``datasets.formatting.formatting.Formatter``): Any subclass of a Formatter such as
             PythonFormatter, NumpyFormatter, etc.
-        format_columns (:obj:`List[str]`, optional): if not None, it defines the columns that will be formatted using the
+        format_columns (`List[str]`, optional): if not None, it defines the columns that will be formatted using the
             given formatter. Other columns are discarded (unless ``output_all_columns`` is True)
-        output_all_columns (:obj:`bool`, defaults to False). If True, the formatted output is completed using the columns
+        output_all_columns (`bool`, defaults to False): If True, the formatted output is completed using the columns
             that are not in the ``format_columns`` list. For these columns, the PythonFormatter is used.
 
 

diff --git a/src/datasets/inspect.py b/src/datasets/inspect.py
@@ -253,14 +253,14 @@ def get_dataset_config_info(
                 e.g. `'./dataset/squad'`
             - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                 e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
-        config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
-        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
-        download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.
-        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
-        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
+        config_name (`str`, optional): Defining the name of the dataset configuration.
+        data_files (`str` or `Sequence` or `Mapping`, optional): Path(s) to source data file(s).
+        download_config (`DownloadConfig`, optional): Specific download configuration parameters.
+        download_mode (`DownloadMode` or `str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
+        revision (`Version` or `str`, optional): Version of the dataset to load.
             As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
             You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
-        token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
+        token (`str` or `bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
             If True, or not specified, will get token from `"~/.huggingface"`.
         **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.
 

diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
@@ -2572,8 +2572,8 @@ def iter(self, batch_size: int, drop_last_batch: bool = False):
         """Iterate through the batches of size `batch_size`.
 
         Args:
-            batch_size (:obj:`int`): size of each batch to yield.
-            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
+            batch_size (`int`): size of each batch to yield.
+            drop_last_batch (`bool`, default `False`): Whether a last batch smaller than the batch_size should be
                 dropped
         """
 

diff --git a/src/datasets/load.py b/src/datasets/load.py
@@ -855,14 +855,14 @@ def dataset_module_factory(
               -> load a generic dataset builder (csv, text etc.) based on the content of the repository
               e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.
 
-        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
+        revision (`Version` or `str`, optional): Version of the dataset to load.
             As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
             You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
-        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
-        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
-        data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
+        download_config (`DownloadConfig`, optional): Specific download configuration parameters.
+        download_mode (`DownloadMode` or `str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
+        data_dir (`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
             in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`.
-        data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
+        data_files (`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
         cache_dir (`str`, *optional*):
             Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
 

diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py
@@ -51,7 +51,7 @@ def extend_module_for_streaming(module_path, download_config: Optional[DownloadC
       - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)
 
     The patched functions are replaced with custom functions defined to work with the
-    :class:`~download.streaming_download_manager.StreamingDownloadManager`.
+    `StreamingDownloadManager`.
 
     Args:
         module_path: Path to the module to be extended.
@@ -111,7 +111,7 @@ def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
     """Extend the dataset builder module and the modules imported by it to support streaming.
 
     Args:
-        builder (:class:`DatasetBuilder`): Dataset builder instance.
+        builder (`DatasetBuilder`): Dataset builder instance.
     """
     # this extends the open and os.path.join functions for data streaming
     download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)