Skip to content

Commit ba33727

Browse files
committed
parallel download
1 parent 1b3bb74 commit ba33727

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

benchmark/cloc/data_generation.py

+16 -1
Original file line number | Diff line number | Diff line change
@@ -206,6 +206,8 @@ def download_directory_from_gcloud(self, prefix):
206206
bucket = storage_client.bucket(bucket_name=bucket_name)
207207
blobs = bucket.list_blobs(prefix=prefix) # Get list of files
208208
first_zip_downloaded = False
209+
blobs_to_download = []
210+
209211
for blob in blobs:
210212
print(blob.name)
211213
if blob.name.endswith("/"):
@@ -222,10 +224,23 @@ def download_directory_from_gcloud(self, prefix):
222224
target = dl_dir / blob.name
223225

224226
if not target.exists():
225-
blob.download_to_filename(dl_dir / blob.name)
227+
blobs_to_download.append((dl_dir / blob.name, blob))
226228
else:
227229
print(f"Skipping {target} as it already exists")
228230

231+
with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(blobs_to_download)) as pbar:
232+
futures_list = []
233+
download_blob = lambda target, blob: blob.download_to_filename(target)
234+
235+
for blob in blobs_to_download:
236+
future = executor.submit(download_blob, *blob)
237+
future.add_done_callback(lambda p: pbar.update(1))
238+
futures_list.append(future)
239+
240+
# Wait for all tasks to complete
241+
for future in futures_list:
242+
future.result()
243+
229244
def download_dataset(self):
230245
"""
231246
Download the order files from Google Cloud Storage.

0 commit comments

Comments
 (0)