diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index fd1eb739c99..77c3ae18284 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -682,17 +682,6 @@ def from_patterns( ) return out - def __reduce__(self): - """ - To make sure the order of the keys doesn't matter when pickling and hashing: - - >>> from datasets.data_files import DataFilesDict - >>> from datasets.fingerprint import Hasher - >>> assert Hasher.hash(DataFilesDict(a=[], b=[])) == Hasher.hash(DataFilesDict(b=[], a=[])) - - """ - return DataFilesDict, (dict(sorted(self.items())),) - def filter_extensions(self, extensions: List[str]) -> "DataFilesDict": out = type(self)() for key, data_files_list in self.items(): diff --git a/tests/test_builder.py b/tests/test_builder.py index 45b7574ffc7..54bae47ae08 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -759,10 +759,6 @@ def test_cache_dir_for_data_files(self): cache_dir=tmp_dir, data_files={"train": [dummy_data1], "test": dummy_data2} ) self.assertEqual(builder.cache_dir, other_builder.cache_dir) - other_builder = DummyGeneratorBasedBuilder( - cache_dir=tmp_dir, data_files={"test": dummy_data2, "train": dummy_data1} - ) - self.assertEqual(builder.cache_dir, other_builder.cache_dir) other_builder = DummyGeneratorBasedBuilder( cache_dir=tmp_dir, data_files={"train": dummy_data1, "validation": dummy_data2} ) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 34bfb26332e..d617a9c7c19 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -1,3 +1,4 @@ +import copy import os from pathlib import Path, PurePath from typing import List @@ -385,6 +386,13 @@ def test_DataFilesList_from_patterns_raises_FileNotFoundError(complex_data_dir): DataFilesList.from_patterns(["file_that_doesnt_exist.txt"], complex_data_dir) +class TestDataFilesDict: + def test_key_order_after_copy(self): + data_files = DataFilesDict({"train": "train.csv", "test": "test.csv"}) + copied_data_files = copy.deepcopy(data_files) + assert list(copied_data_files.keys()) == list(data_files.keys()) # test split order with list() + + @pytest.mark.parametrize("pattern", _TEST_PATTERNS) def test_DataFilesDict_from_patterns_in_dataset_repository( hub_dataset_repo_path, hub_dataset_repo_patterns_results, pattern