|
63 | 63 | "metadata": {}, |
64 | 64 | "outputs": [ |
65 | 65 | { |
66 | | - "name": "stderr", |
| 66 | + "name": "stdout", |
67 | 67 | "output_type": "stream", |
68 | 68 | "text": [ |
69 | | - "2022-08-30 15:30:36,678\tINFO worker.py:1510 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", |
70 | | - "2022-08-30 15:30:37,791\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", |
71 | | - "\u001b[2m\u001b[36m(_get_read_tasks pid=3958)\u001b[0m 2022-08-30 15:30:37,789\tWARNING torch_datasource.py:55 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n" |
| 69 | + "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz\n" |
72 | 70 | ] |
73 | 71 | }, |
74 | 72 | { |
75 | | - "name": "stdout", |
| 73 | + "name": "stderr", |
76 | 74 | "output_type": "stream", |
77 | 75 | "text": [ |
78 | | - "\u001b[2m\u001b[36m(_execute_read_task pid=3958)\u001b[0m Using downloaded and verified file: ./data/cifar-10-python.tar.gz\n", |
79 | | - "\u001b[2m\u001b[36m(_execute_read_task pid=3958)\u001b[0m Extracting ./data/cifar-10-python.tar.gz to ./data\n" |
| 76 | + "100%|██████████| 170498071/170498071 [00:21<00:00, 7792736.24it/s]\n" |
80 | 77 | ] |
81 | 78 | }, |
82 | 79 | { |
83 | | - "name": "stderr", |
| 80 | + "name": "stdout", |
84 | 81 | "output_type": "stream", |
85 | 82 | "text": [ |
86 | | - "2022-08-30 15:30:44,508\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", |
87 | | - "\u001b[2m\u001b[36m(_get_read_tasks pid=3958)\u001b[0m 2022-08-30 15:30:44,507\tWARNING torch_datasource.py:55 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n" |
| 83 | + "Extracting data/cifar-10-python.tar.gz to data\n", |
| 84 | + "Files already downloaded and verified\n" |
88 | 85 | ] |
89 | 86 | }, |
90 | 87 | { |
91 | | - "name": "stdout", |
| 88 | + "name": "stderr", |
92 | 89 | "output_type": "stream", |
93 | 90 | "text": [ |
94 | | - "\u001b[2m\u001b[36m(_execute_read_task pid=3958)\u001b[0m Files already downloaded and verified\n" |
| 91 | + "2022-10-23 10:33:48,403\tINFO worker.py:1518 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" |
95 | 92 | ] |
96 | 93 | } |
97 | 94 | ], |
98 | 95 | "source": [ |
99 | 96 | "import ray\n", |
100 | | - "from ray.data.datasource import SimpleTorchDatasource\n", |
101 | 97 | "import torchvision\n", |
102 | 98 | "import torchvision.transforms as transforms\n", |
103 | 99 | "\n", |
104 | 100 | "transform = transforms.Compose(\n", |
105 | 101 | " [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]\n", |
106 | 102 | ")\n", |
107 | 103 | "\n", |
| 104 | + "train_dataset = torchvision.datasets.CIFAR10(\"data\", download=True, train=True, transform=transform)\n", |
| 105 | + "test_dataset = torchvision.datasets.CIFAR10(\"data\", download=True, train=False, transform=transform)\n", |
108 | 106 | "\n", |
109 | | - "def train_dataset_factory():\n", |
110 | | - " return torchvision.datasets.CIFAR10(\n", |
111 | | - " root=\"./data\", download=True, train=True, transform=transform\n", |
112 | | - " )\n", |
113 | | - "\n", |
114 | | - "\n", |
115 | | - "def test_dataset_factory():\n", |
116 | | - " return torchvision.datasets.CIFAR10(\n", |
117 | | - " root=\"./data\", download=True, train=False, transform=transform\n", |
118 | | - " )\n", |
119 | | - "\n", |
120 | | - "\n", |
121 | | - "train_dataset: ray.data.Dataset = ray.data.read_datasource(\n", |
122 | | - " SimpleTorchDatasource(), dataset_factory=train_dataset_factory\n", |
123 | | - ")\n", |
124 | | - "test_dataset: ray.data.Dataset = ray.data.read_datasource(\n", |
125 | | - " SimpleTorchDatasource(), dataset_factory=test_dataset_factory\n", |
126 | | - ")" |
| 107 | + "train_dataset: ray.data.Dataset = ray.data.from_torch(train_dataset)\n", |
| 108 | + "test_dataset: ray.data.Dataset = ray.data.from_torch(test_dataset)" |
127 | 109 | ] |
128 | 110 | }, |
129 | 111 | { |
|
156 | 138 | "id": "a89b59e8", |
157 | 139 | "metadata": {}, |
158 | 140 | "source": [ |
159 | | - "{py:class}`SimpleTorchDatasource <ray.data.datasource.SimpleTorchDatasource>` doesn't parallelize reads, so you shouldn't use it with larger datasets.\n", |
| 141 | + "{py:func}`from_torch <ray.data.from_torch>` doesn't parallelize reads, so you shouldn't use it with larger datasets.\n", |
160 | 142 | "\n", |
161 | 143 | "Next, let's represent our data using a dictionary of ndarrays instead of tuples. This lets us call {py:meth}`Dataset.iter_torch_batches <ray.data.Dataset.iter_torch_batches>` later in the tutorial." |
162 | 144 | ] |
|
828 | 810 | ], |
829 | 811 | "metadata": { |
830 | 812 | "kernelspec": { |
831 | | - "display_name": "Python 3.9.12 ('.venv': venv)", |
| 813 | + "display_name": "Python 3.10.8 ('.venv': venv)", |
832 | 814 | "language": "python", |
833 | 815 | "name": "python3" |
834 | 816 | }, |
|
842 | 824 | "name": "python", |
843 | 825 | "nbconvert_exporter": "python", |
844 | 826 | "pygments_lexer": "ipython3", |
845 | | - "version": "3.9.12" |
| 827 | + "version": "3.10.8" |
846 | 828 | }, |
847 | 829 | "vscode": { |
848 | 830 | "interpreter": { |
849 | | - "hash": "a658351b4133f922c5967ed6133cfc05c9f16c53a5161e5843ace3f528fccaf5" |
| 831 | + "hash": "c704e19737f24b51bc631dadcac7a7e356bb35d1c5cd7766248d8a6946059909" |
850 | 832 | } |
851 | 833 | } |
852 | 834 | }, |
|
0 commit comments