From 0acb4633407a0d1abcf184012a82900c80b636e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Fri, 13 Oct 2023 18:00:07 +0200 Subject: [PATCH 01/20] start working on CLOC benchmark --- benchmark/cloc/.gitignore | 1 + benchmark/cloc/data_generation.py | 220 ++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 benchmark/cloc/.gitignore create mode 100644 benchmark/cloc/data_generation.py diff --git a/benchmark/cloc/.gitignore b/benchmark/cloc/.gitignore new file mode 100644 index 000000000..fe30281ee --- /dev/null +++ b/benchmark/cloc/.gitignore @@ -0,0 +1 @@ +cloc_data/ \ No newline at end of file diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py new file mode 100644 index 000000000..f1d581825 --- /dev/null +++ b/benchmark/cloc/data_generation.py @@ -0,0 +1,220 @@ +# This code is partially adapted from https://github.com/hammoudhasan/CLDatasets/blob/main/src/downloader.py which is provided license-free +# We thank the authors for their work and hosting the data. + +import argparse +import pathlib +from shutil import which +import os +import time +import zipfile +import logging +import torch +from concurrent.futures import ThreadPoolExecutor +from google.cloud import storage + +from tqdm import tqdm + +logging.basicConfig( + level=logging.NOTSET, + format="[%(asctime)s] [%(filename)15s:%(lineno)4d] %(levelname)-8s %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", +) +logger = logging.getLogger(__name__) + + +def setup_argparser() -> argparse.ArgumentParser: + parser_ = argparse.ArgumentParser(description=f"CLOC Benchmark Storage Script") + parser_.add_argument( + "dir", type=pathlib.Path, action="store", help="Path to data directory" + ) + parser_.add_argument( + "--dummyyear", action="store_true", help="Add a final dummy year to train also on the last trigger in Modyn" + ) + parser_.add_argument( + "--all", action="store_true", help="Store all the available data, including the validation and test sets." + ) + parser_.add_argument( + "--test", action="store_true", help="Enable test mode (just download one zip file)" + ) + parser_.add_argument( + "--skip_download", action="store_true", help="Skips the download and only (re)creates labels and timestamps." + ) + return parser_ + + +def main(): + parser = setup_argparser() + args = parser.parse_args() + + logger.info(f"Destination is {args.dir}") + + downloader = CLDatasets(str(args.dir), test_mode = args.test) + if not args.skip_download: + logger.info("Starting download and extraction.") + downloader.download_and_extract() + + downloader.convert_labels() + downloader.update_timestamps() + + if args.dummyyear: + downloader.add_dummy_year() + + logger.info("Cleaning up") + downloader.cleanup() + + +def is_tool(name): + """Check whether `name` is on PATH and marked as executable.""" + return which(name) is not None + + +class CLDatasets: + """ + A class for downloading datasets from Google Cloud Storage. + """ + + def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): + """ + Initialize the CLDatasets object. + + Args: + directory (str): The directory where the dataset will be saved. 
+ """ + + self.dataset = "CLOC" + self.directory = directory + self.unzip = unzip + self.test_mode = test_mode + + if not os.path.exists(self.directory): + os.makedirs(self.directory) + + def download_and_extract(self): + self.download_dataset() + + if self.unzip: + self.unzip_data_files(self.directory + "/CLOC/data") + + def convert_labels(self): + logger.info("Loading labels and timestamps.") + store_loc = torch.load(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave") + labels = torch.load(self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave") + timestamps = torch.load(self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") + + warned_once = False + + logger.info("Labels and timestamps loaded, applying") + for store_location, label, timestamp in tqdm(zip(store_loc, labels, timestamps), total=len(store_loc)): + + path = pathlib.Path(self.directory + "/CLOC/data/" + store_location.strip().replace("\n", "")) + + if not path.exists(): + if not self.test_mode: + raise FileExistsError(f"Cannot find file {store_location}") + if not warned_once: + logger.warning(f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning.") + warned_once = True + continue + + + label_path = pathlib.Path(path.parent / f"{path.stem}.label") + with open(label_path, "w+", encoding="utf-8") as file: + file.write(str(int(label))) + + # Note: The timestamps obtained in the hd5 file are (very likely) seconds since 2004 (1072911600 GMT timestamp) + actual_timestamp = timestamp + 1072911600 + os.utime(path, (actual_timestamp, actual_timestamp)) + + + + + def update_timestamps(self): + + pass + + def add_dummy_year(self): + pass + + def cleanup(self): + pass + + def download_directory_from_gcloud(self, prefix): + bucket_name = 'cl-datasets' + dl_dir = pathlib.Path(self.directory) + storage_client = storage.Client.create_anonymous_client() + bucket = storage_client.bucket(bucket_name=bucket_name) + blobs = bucket.list_blobs(prefix=prefix) # Get list of files + first_zip_downloaded = False + for blob in blobs: + print(blob.name) + if blob.name.endswith("/"): + continue + if blob.name.endswith("zip"): + if first_zip_downloaded and self.test_mode: + continue + else: + first_zip_downloaded = True + + file_split = blob.name.split("/") + directory = "/".join(file_split[0:-1]) + pathlib.Path(dl_dir / directory).mkdir(parents=True, exist_ok=True) + target = dl_dir / blob.name + + if not target.exists(): + blob.download_to_filename(dl_dir / blob.name) + else: + print(f"Skipping {target} as it already exists") + + def download_dataset(self): + """ + Download the order files from Google Cloud Storage. + """ + print("Order files are being downloaded...") + start_time = time.time() + + self.download_directory_from_gcloud(self.dataset) + + elapsed_time = time.time() - start_time + print("Elapsed time:", elapsed_time) + + def unzip_data_files(self, directory: str) -> None: + """ + Extracts the contents of zip files in a directory into nested folders. + + Args: + directory: The path to the directory containing the zip files. 
+ + Returns: + None + """ + + zip_files = [file for file in os.listdir(directory) if file.endswith('.zip')] + + def extract_single_zip(zip_file: str) -> None: + + zip_path = os.path.join(directory, zip_file) + output_dir = os.path.join( + directory, os.path.splitext(zip_file)[0]) + + os.makedirs(output_dir, exist_ok=True) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(output_dir) + + with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar: + futures_list = [] + for zip_file in zip_files: + future = executor.submit(extract_single_zip, zip_file) + future.add_done_callback(lambda p: pbar.update(1)) + futures_list.append(future) + + # Wait for all tasks to complete + for future in futures_list: + future.result() + + # Remove zip files + remove_command = f"rm {self.directory}/{self.dataset}/data/*.zip" + os.system(remove_command) + +if __name__ == "__main__": + main() From bd51ed85585601089570762551d47d4449d78df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 20:50:47 +0200 Subject: [PATCH 02/20] progress --- benchmark/cloc/data_generation.py | 46 +++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index f1d581825..76d3fcf57 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -9,6 +9,7 @@ import zipfile import logging import torch +import glob from concurrent.futures import ThreadPoolExecutor from google.cloud import storage @@ -53,8 +54,7 @@ def main(): logger.info("Starting download and extraction.") downloader.download_and_extract() - downloader.convert_labels() - downloader.update_timestamps() + downloader.convert_labels_and_timestamps() if args.dummyyear: downloader.add_dummy_year() @@ -85,6 +85,7 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): self.directory = directory self.unzip = unzip self.test_mode = test_mode + self.max_timestamp = 0 if not os.path.exists(self.directory): os.makedirs(self.directory) @@ -95,17 +96,38 @@ def download_and_extract(self): if self.unzip: self.unzip_data_files(self.directory + "/CLOC/data") - def convert_labels(self): + def convert_labels_and_timestamps(self): + self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") + self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave") + self.remove_images_without_label() + + def remove_images_without_label(self): + print("Removing images without label...") + removed_files = 0 + + for filename in glob.iglob(self.directory + '**/*.jpg', recursive=True): + file_path = pathlib.Path(filename) + label_path = pathlib.Path(file_path.parent / f"{file_path.stem}.label") + + if not label_path.exists(): + removed_files += 1 + file_path.unlink() + + print(f"Removed {removed_files} images that do not have a label.") + + def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timestamps_path): logger.info("Loading labels and timestamps.") - store_loc = torch.load(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave") - labels 
= torch.load(self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave") - timestamps = torch.load(self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") + store_loc = torch.load(store_loc_path) + labels = torch.load(labels_path) + timestamps = torch.load(timestamps_path) + + assert len(store_loc) == len(labels) + assert len(store_loc) == len(timestamps) warned_once = False logger.info("Labels and timestamps loaded, applying") for store_location, label, timestamp in tqdm(zip(store_loc, labels, timestamps), total=len(store_loc)): - path = pathlib.Path(self.directory + "/CLOC/data/" + store_location.strip().replace("\n", "")) if not path.exists(): @@ -116,21 +138,15 @@ def convert_labels(self): warned_once = True continue - label_path = pathlib.Path(path.parent / f"{path.stem}.label") with open(label_path, "w+", encoding="utf-8") as file: file.write(str(int(label))) # Note: The timestamps obtained in the hd5 file are (very likely) seconds since 2004 (1072911600 GMT timestamp) actual_timestamp = timestamp + 1072911600 - os.utime(path, (actual_timestamp, actual_timestamp)) - + self.max_timestamp = max(self.max_timestamp, actual_timestamp) - - - def update_timestamps(self): - - pass + os.utime(path, (actual_timestamp, actual_timestamp)) def add_dummy_year(self): pass From 0cb455b124a8833252422a041b8a2d7bce225945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 21:13:49 +0200 Subject: [PATCH 03/20] work --- benchmark/cloc/data_generation.py | 38 +++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 76d3fcf57..6bb9daac4 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -2,17 +2,17 @@ # We thank the authors for their work and hosting the data. 
import argparse -import pathlib -from shutil import which +import glob +import logging import os +import pathlib import time import zipfile -import logging -import torch -import glob from concurrent.futures import ThreadPoolExecutor -from google.cloud import storage +from shutil import copy, which +import torch +from google.cloud import storage from tqdm import tqdm logging.basicConfig( @@ -22,6 +22,8 @@ ) logger = logging.getLogger(__name__) +DAY_LENGTH_SECONDS = 24 * 60 * 60 + def setup_argparser() -> argparse.ArgumentParser: parser_ = argparse.ArgumentParser(description=f"CLOC Benchmark Storage Script") @@ -59,8 +61,7 @@ def main(): if args.dummyyear: downloader.add_dummy_year() - logger.info("Cleaning up") - downloader.cleanup() + logger.info("Done.") def is_tool(name): @@ -86,6 +87,8 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): self.unzip = unzip self.test_mode = test_mode self.max_timestamp = 0 + self.example_path = "" + self.example_label_path = "" if not os.path.exists(self.directory): os.makedirs(self.directory) @@ -145,14 +148,25 @@ def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timest # Note: The timestamps obtained in the hd5 file are (very likely) seconds since 2004 (1072911600 GMT timestamp) actual_timestamp = timestamp + 1072911600 self.max_timestamp = max(self.max_timestamp, actual_timestamp) - + + self.example_label_path = label_path + self.example_path = path + os.utime(path, (actual_timestamp, actual_timestamp)) def add_dummy_year(self): - pass + dummy_path = pathlib.Path(self.directory + "/CLOC/data/dummy.jpg") + dummy_label_path = pathlib.Path(self.directory + "/CLOC/data/dummy.label") + + assert not dummy_path.exists() and not dummy_label_path.exists() + + copy(self.example_path, dummy_path) + copy(self.example_label_path, dummy_label_path) + # Two years in the future + dummy_timestamp = self.max_timestamp + (DAY_LENGTH_SECONDS * 365 * 2) + + os.utime(dummy_path, (dummy_timestamp, dummy_timestamp)) - def cleanup(self): - pass def download_directory_from_gcloud(self, prefix): bucket_name = 'cl-datasets' From 9ba980db3b73755c1e47bd8a26a7a7e316c583e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 21:14:04 +0200 Subject: [PATCH 04/20] formatting --- benchmark/cloc/data_generation.py | 77 ++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 6bb9daac4..86e2807c6 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -31,16 +31,24 @@ def setup_argparser() -> argparse.ArgumentParser: "dir", type=pathlib.Path, action="store", help="Path to data directory" ) parser_.add_argument( - "--dummyyear", action="store_true", help="Add a final dummy year to train also on the last trigger in Modyn" + "--dummyyear", + action="store_true", + help="Add a final dummy year to train also on the last trigger in Modyn", ) parser_.add_argument( - "--all", action="store_true", help="Store all the available data, including the validation and test sets." 
+ "--all", + action="store_true", + help="Store all the available data, including the validation and test sets.", ) parser_.add_argument( - "--test", action="store_true", help="Enable test mode (just download one zip file)" + "--test", + action="store_true", + help="Enable test mode (just download one zip file)", ) parser_.add_argument( - "--skip_download", action="store_true", help="Skips the download and only (re)creates labels and timestamps." + "--skip_download", + action="store_true", + help="Skips the download and only (re)creates labels and timestamps.", ) return parser_ @@ -51,13 +59,13 @@ def main(): logger.info(f"Destination is {args.dir}") - downloader = CLDatasets(str(args.dir), test_mode = args.test) + downloader = CLDatasets(str(args.dir), test_mode=args.test) if not args.skip_download: logger.info("Starting download and extraction.") downloader.download_and_extract() downloader.convert_labels_and_timestamps() - + if args.dummyyear: downloader.add_dummy_year() @@ -100,15 +108,24 @@ def download_and_extract(self): self.unzip_data_files(self.directory + "/CLOC/data") def convert_labels_and_timestamps(self): - self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") - self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave") + self.convert_labels_and_timestamps_impl( + self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", + self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", + self.directory + "/CLOC_torchsave_order_files/train_time.torchSave", + ) + self.convert_labels_and_timestamps_impl( + self.directory + + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", + self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", + self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave", + ) self.remove_images_without_label() def remove_images_without_label(self): print("Removing images without label...") removed_files = 0 - for filename in glob.iglob(self.directory + '**/*.jpg', recursive=True): + for filename in glob.iglob(self.directory + "**/*.jpg", recursive=True): file_path = pathlib.Path(filename) label_path = pathlib.Path(file_path.parent / f"{file_path.stem}.label") @@ -118,7 +135,9 @@ def remove_images_without_label(self): print(f"Removed {removed_files} images that do not have a label.") - def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timestamps_path): + def convert_labels_and_timestamps_impl( + self, store_loc_path, labels_path, timestamps_path + ): logger.info("Loading labels and timestamps.") store_loc = torch.load(store_loc_path) labels = torch.load(labels_path) @@ -130,14 +149,22 @@ def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timest warned_once = False logger.info("Labels and timestamps loaded, applying") - for store_location, label, timestamp in tqdm(zip(store_loc, labels, timestamps), total=len(store_loc)): - path = pathlib.Path(self.directory + "/CLOC/data/" + store_location.strip().replace("\n", "")) - + for store_location, label, timestamp in tqdm( + zip(store_loc, labels, timestamps), total=len(store_loc) + ): + path = pathlib.Path( 
+ self.directory + + "/CLOC/data/" + + store_location.strip().replace("\n", "") + ) + if not path.exists(): if not self.test_mode: raise FileExistsError(f"Cannot find file {store_location}") if not warned_once: - logger.warning(f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning.") + logger.warning( + f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning." + ) warned_once = True continue @@ -158,18 +185,17 @@ def add_dummy_year(self): dummy_path = pathlib.Path(self.directory + "/CLOC/data/dummy.jpg") dummy_label_path = pathlib.Path(self.directory + "/CLOC/data/dummy.label") - assert not dummy_path.exists() and not dummy_label_path.exists() + assert not dummy_path.exists() and not dummy_label_path.exists() - copy(self.example_path, dummy_path) + copy(self.example_path, dummy_path) copy(self.example_label_path, dummy_label_path) # Two years in the future dummy_timestamp = self.max_timestamp + (DAY_LENGTH_SECONDS * 365 * 2) os.utime(dummy_path, (dummy_timestamp, dummy_timestamp)) - def download_directory_from_gcloud(self, prefix): - bucket_name = 'cl-datasets' + bucket_name = "cl-datasets" dl_dir = pathlib.Path(self.directory) storage_client = storage.Client.create_anonymous_client() bucket = storage_client.bucket(bucket_name=bucket_name) @@ -184,14 +210,14 @@ def download_directory_from_gcloud(self, prefix): continue else: first_zip_downloaded = True - + file_split = blob.name.split("/") directory = "/".join(file_split[0:-1]) pathlib.Path(dl_dir / directory).mkdir(parents=True, exist_ok=True) target = dl_dir / blob.name if not target.exists(): - blob.download_to_filename(dl_dir / blob.name) + blob.download_to_filename(dl_dir / blob.name) else: print(f"Skipping {target} as it already exists") @@ -201,7 +227,7 @@ def download_dataset(self): """ print("Order files are being downloaded...") start_time = time.time() - + self.download_directory_from_gcloud(self.dataset) elapsed_time = time.time() - start_time @@ -218,17 +244,15 @@ def unzip_data_files(self, directory: str) -> None: None """ - zip_files = [file for file in os.listdir(directory) if file.endswith('.zip')] + zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] def extract_single_zip(zip_file: str) -> None: - zip_path = os.path.join(directory, zip_file) - output_dir = os.path.join( - directory, os.path.splitext(zip_file)[0]) + output_dir = os.path.join(directory, os.path.splitext(zip_file)[0]) os.makedirs(output_dir, exist_ok=True) - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(output_dir) with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar: @@ -246,5 +270,6 @@ def extract_single_zip(zip_file: str) -> None: remove_command = f"rm {self.directory}/{self.dataset}/data/*.zip" os.system(remove_command) + if __name__ == "__main__": main() From 416f8f6d822db3a62333635e81b4fb55432191ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 21:17:11 +0200 Subject: [PATCH 05/20] all flag --- benchmark/cloc/data_generation.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 86e2807c6..8ac52c4a2 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -64,7 +64,7 @@ def main(): logger.info("Starting download and extraction.") downloader.download_and_extract() - 
downloader.convert_labels_and_timestamps()
+    downloader.convert_labels_and_timestamps(args.all)
 
     if args.dummyyear:
         downloader.add_dummy_year()
@@ -107,18 +107,22 @@ def download_and_extract(self):
         if self.unzip:
             self.unzip_data_files(self.directory + "/CLOC/data")
 
-    def convert_labels_and_timestamps(self):
+    def convert_labels_and_timestamps(self, all_data: bool):
         self.convert_labels_and_timestamps_impl(
             self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave",
             self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave",
             self.directory + "/CLOC_torchsave_order_files/train_time.torchSave",
         )
-        self.convert_labels_and_timestamps_impl(
-            self.directory
-            + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave",
-            self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave",
-            self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave",
-        )
+
+        if all_data:
+            logger.info("Converting all data")
+            self.convert_labels_and_timestamps_impl(
+                self.directory
+                + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave",
+                self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave",
+                self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave",
+            )
+
         self.remove_images_without_label()
 
     def remove_images_without_label(self):

From 1ff03aa72fd693c112bb1c1c2fd255dd4c7178eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Mon, 16 Oct 2023 21:50:52 +0200
Subject: [PATCH 06/20] add readme

---
 benchmark/cloc/README.md          | 38 +++++++++++++++++++++++++++++++
 benchmark/cloc/data_generation.py |  1 +
 2 files changed, 39 insertions(+)
 create mode 100644 benchmark/cloc/README.md

diff --git a/benchmark/cloc/README.md b/benchmark/cloc/README.md
new file mode 100644
index 000000000..c86139e60
--- /dev/null
+++ b/benchmark/cloc/README.md
@@ -0,0 +1,38 @@
+# CLOC Data
+
+In this directory, you can find the files necessary to run experiments with the CLOC dataset.
+The dataset was introduced [in this paper](https://arxiv.org/pdf/2108.09020.pdf), and we make use of the [CLDatasets](https://github.com/hammoudhasan/CLDatasets) mirror.
+
+## Data Generation
+To run the downloading script, you need to install the `google-cloud-storage` package.
+Then, you can use the `data_generation.py` script to download the data and set the timestamps accordingly.
+Use the `-h` flag to find out more.
+
+## License
+
+The CLOC Dataset comes with the MIT License.
+The CLDatasets repository does not have an explicit license but allows users to "adapt the code to suit your specific requirements".
+
+### CLOC License Copy
+
+MIT License
+
+Copyright (c) 2021 Intel Labs
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 8ac52c4a2..e6ef65750 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -1,5 +1,6 @@ # This code is partially adapted from https://github.com/hammoudhasan/CLDatasets/blob/main/src/downloader.py which is provided license-free # We thank the authors for their work and hosting the data. +# This code requires the pip google-cloud-storage package import argparse import glob From 1b3bb74ceb0d68a17e99255ea9537e8dd57ba732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 17 Oct 2023 13:55:33 +0200 Subject: [PATCH 07/20] add example pipeline (yet to be tested) --- benchmark/cloc/example_pipeline.yml | 50 +++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 benchmark/cloc/example_pipeline.yml diff --git a/benchmark/cloc/example_pipeline.yml b/benchmark/cloc/example_pipeline.yml new file mode 100644 index 000000000..c031dc33b --- /dev/null +++ b/benchmark/cloc/example_pipeline.yml @@ -0,0 +1,50 @@ +pipeline: + name: CLOC Pipeline + version: 1.0.0 +model: + id: ResNet50 + config: + num_classes: 713 +training: + gpus: 1 + device: "cuda:0" + dataloader_workers: 8 + use_previous_model: True + initial_model: random + batch_size: 256 + optimizers: + - name: "default" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model" + config: + lr: 0.025 + weight_decay: 0.0001 + momentum: 0.9 + optimization_criterion: + name: "CrossEntropyLoss" + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 1000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: cloc + transformations: ["transforms.RandomResizedCrop(224)", + "transforms.RandomHorizontalFlip()", + "transforms.ToTensor()", + "transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])"] + bytes_parser_function: | + from PIL import Image + import io + def bytes_parser_function(data: bytes) -> Image: + return Image.open(io.BytesIO(data)).convert("RGB") + +trigger: + id: TimeTrigger + trigger_config: + trigger_every: "1y" \ No newline at end of file From ba33727e9226df1925924dca890f5d1edbfb1c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Wed, 18 Oct 2023 19:43:58 +0200 Subject: [PATCH 08/20] parallel download --- benchmark/cloc/data_generation.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index e6ef65750..753c8f0d1 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -206,6 +206,8 @@ def download_directory_from_gcloud(self, prefix): bucket = storage_client.bucket(bucket_name=bucket_name) blobs = bucket.list_blobs(prefix=prefix) # Get list of files first_zip_downloaded = False + blobs_to_download = [] + for blob in blobs: print(blob.name) if blob.name.endswith("/"): @@ -222,10 +224,23 @@ def download_directory_from_gcloud(self, prefix): target = dl_dir / blob.name if not target.exists(): - blob.download_to_filename(dl_dir / blob.name) + blobs_to_download.append((dl_dir / blob.name, blob)) else: print(f"Skipping 
{target} as it already exists") + with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(blobs_to_download)) as pbar: + futures_list = [] + download_blob = lambda target, blob: blob.download_to_filename(target) + + for blob in blobs_to_download: + future = executor.submit(download_blob, *blob) + future.add_done_callback(lambda p: pbar.update(1)) + futures_list.append(future) + + # Wait for all tasks to complete + for future in futures_list: + future.result() + def download_dataset(self): """ Download the order files from Google Cloud Storage. From e199e8989c6b7ef7a26c407859d845daf3e3a51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Wed, 18 Oct 2023 19:47:14 +0200 Subject: [PATCH 09/20] unnecessary print --- benchmark/cloc/data_generation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 753c8f0d1..79e36dc3b 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -209,7 +209,6 @@ def download_directory_from_gcloud(self, prefix): blobs_to_download = [] for blob in blobs: - print(blob.name) if blob.name.endswith("/"): continue if blob.name.endswith("zip"): From 3338ea0685f201d4c236e7a6983de8764075a112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Thu, 19 Oct 2023 17:49:42 +0200 Subject: [PATCH 10/20] add skip unzip flag --- benchmark/cloc/data_generation.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 79e36dc3b..821a15725 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -51,6 +51,11 @@ def setup_argparser() -> argparse.ArgumentParser: action="store_true", help="Skips the download and only (re)creates labels and timestamps.", ) + parser_.add_argument( + "--skip_unzip", + action="store_true", + help="Skips the unzipping and only (re)creates labels and timestamps.", + ) return parser_ @@ -62,8 +67,12 @@ def main(): downloader = CLDatasets(str(args.dir), test_mode=args.test) if not args.skip_download: - logger.info("Starting download and extraction.") - downloader.download_and_extract() + logger.info("Starting download") + downloader.download_dataset() + + if not args.skip_unzip: + logger.info("Starting extraction") + downloader.extract() downloader.convert_labels_and_timestamps(args.all) @@ -102,9 +111,7 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): if not os.path.exists(self.directory): os.makedirs(self.directory) - def download_and_extract(self): - self.download_dataset() - + def extract(self): if self.unzip: self.unzip_data_files(self.directory + "/CLOC/data") From 45ad57a91904a6192ac1dbfee6697949514c9418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Thu, 19 Oct 2023 17:53:48 +0200 Subject: [PATCH 11/20] use processes for downloading --- benchmark/cloc/data_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 821a15725..c53b1dacb 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -9,7 +9,7 @@ import pathlib import time import zipfile -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from shutil import copy, which import torch @@ -234,7 +234,7 @@ def 
download_directory_from_gcloud(self, prefix):
             else:
                 print(f"Skipping {target} as it already exists")
 
-        with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(blobs_to_download)) as pbar:
+        with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(blobs_to_download)) as pbar:
             futures_list = []
             download_blob = lambda target, blob: blob.download_to_filename(target)
 
@@ -281,7 +281,7 @@ def extract_single_zip(zip_file: str) -> None:
             with zipfile.ZipFile(zip_path, "r") as zip_ref:
                 zip_ref.extractall(output_dir)
 
-        with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar:
+        with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar:
             futures_list = []
             for zip_file in zip_files:
                 future = executor.submit(extract_single_zip, zip_file)

From 8c554f41a37edc4b7e2fd97e3c807c0c06813ca1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Thu, 19 Oct 2023 17:55:11 +0200
Subject: [PATCH 12/20] move function to root

---
 benchmark/cloc/data_generation.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py
index c53b1dacb..c9ef7508a 100644
--- a/benchmark/cloc/data_generation.py
+++ b/benchmark/cloc/data_generation.py
@@ -25,6 +25,14 @@
 
 DAY_LENGTH_SECONDS = 24 * 60 * 60
 
+def extract_single_zip(directory: str, zip_file: str) -> None:
+    zip_path = os.path.join(directory, zip_file)
+    output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(output_dir)
 
 def setup_argparser() -> argparse.ArgumentParser:
@@ -272,19 +280,12 @@ def unzip_data_files(self, directory: str) -> None:
 
         zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")]
 
-        def extract_single_zip(zip_file: str) -> None:
-            zip_path = os.path.join(directory, zip_file)
-            output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])
-
-            os.makedirs(output_dir, exist_ok=True)
-            with zipfile.ZipFile(zip_path, "r") as zip_ref:
-                zip_ref.extractall(output_dir)
 
         with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar:
             futures_list = []
             for zip_file in zip_files:
-                future = executor.submit(extract_single_zip, zip_file)
+                future = executor.submit(extract_single_zip, directory, zip_file)
                 future.add_done_callback(lambda p: pbar.update(1))
                 futures_list.append(future)

From a39bde941771687243e59711a6fcd312d5d1af66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Thu, 19 Oct 2023 17:57:27 +0200
Subject: [PATCH 13/20] add try catch

---
 benchmark/cloc/data_generation.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py
index c9ef7508a..880dead12 100644
--- a/benchmark/cloc/data_generation.py
+++ b/benchmark/cloc/data_generation.py
@@ -31,8 +31,12 @@ def extract_single_zip(directory: str, zip_file: str) -> None:
 
     os.makedirs(output_dir, exist_ok=True)
 
-    with zipfile.ZipFile(zip_path, "r") as zip_ref:
-        zip_ref.extractall(output_dir)
+    try:
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            zip_ref.extractall(output_dir)
+    except Exception as e:
+        logger.error(f"Error while extracting file {zip_path}")
+        logger.error(e)
 
 def setup_argparser() -> argparse.ArgumentParser:
     parser_ = argparse.ArgumentParser(description=f"CLOC Benchmark Storage Script")

From fa52e91b14ade209b3b4f577cc4b745e3f39258c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Thu, 26 Oct 2023 15:25:06 +0200
Subject: [PATCH 14/20] try using tmpdir

---
 benchmark/cloc/data_generation.py | 53 ++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py
index 880dead12..1fd03217a 100644
--- a/benchmark/cloc/data_generation.py
+++ b/benchmark/cloc/data_generation.py
@@ -25,9 +25,9 @@
 
 DAY_LENGTH_SECONDS = 24 * 60 * 60
 
-def extract_single_zip(directory: str, zip_file: str) -> None:
+def extract_single_zip(directory: str, target: str, zip_file: str) -> None:
     zip_path = os.path.join(directory, zip_file)
-    output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])
+    output_dir = os.path.join(target, os.path.splitext(zip_file)[0])
 
     os.makedirs(output_dir, exist_ok=True)
 
@@ -68,6 +68,14 @@ def setup_argparser() -> argparse.ArgumentParser:
         action="store_true",
         help="Skips the unzipping and only (re)creates labels and timestamps.",
     )
+    parser_.add_argument(
+        "--tmpdir", type=pathlib.Path, action="store", help="If given, use a different directory for storing temporary data"
+    )
+    parser_.add_argument(
+        "--keep_zips",
+        action="store_true",
+        help="Keep the downloaded zipfiles.",
+    )
     return parser_
 
 
@@ -75,9 +83,11 @@ def main():
     parser = setup_argparser()
     args = parser.parse_args()
 
-    logger.info(f"Destination is {args.dir}")
+    tmpdir = args.tmpdir if args.tmpdir is not None else args.dir
+
+    logger.info(f"Final destination is {args.dir}; download destination is {tmpdir}")
 
-    downloader = CLDatasets(str(args.dir), test_mode=args.test)
+    downloader = CLDatasets(str(args.dir), str(tmpdir), test_mode=args.test, keep_zips=args.keep_zips)
     if not args.skip_download:
         logger.info("Starting download")
         downloader.download_dataset()
@@ -104,7 +114,7 @@ class CLDatasets:
     A class for downloading datasets from Google Cloud Storage.
     """
 
-    def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True):
+    def __init__(self, directory: str, tmpdir: str, test_mode: bool = False, unzip: bool = True, keep_zips: bool = False):
        """
         Initialize the CLDatasets object. 
@@ -114,8 +124,10 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): self.dataset = "CLOC" self.directory = directory + self.tmpdir = tmpdir self.unzip = unzip self.test_mode = test_mode + self.keep_zips = keep_zips self.max_timestamp = 0 self.example_path = "" self.example_label_path = "" @@ -123,24 +135,28 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): if not os.path.exists(self.directory): os.makedirs(self.directory) + if not os.path.exists(self.tmpdir): + os.makedirs(self.tmpdir) + + def extract(self): if self.unzip: self.unzip_data_files(self.directory + "/CLOC/data") def convert_labels_and_timestamps(self, all_data: bool): self.convert_labels_and_timestamps_impl( - self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", - self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", - self.directory + "/CLOC_torchsave_order_files/train_time.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/train_store_loc.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/train_labels.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/train_time.torchSave", ) if all_data: logger.info("Converting all data") self.convert_labels_and_timestamps_impl( - self.directory + self.tmpdir + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", - self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", - self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/cross_val_time.torchSave", ) self.remove_images_without_label() @@ -220,7 +236,7 @@ def add_dummy_year(self): def download_directory_from_gcloud(self, prefix): bucket_name = "cl-datasets" - dl_dir = pathlib.Path(self.directory) + dl_dir = pathlib.Path(self.tmpdir) storage_client = storage.Client.create_anonymous_client() bucket = storage_client.bucket(bucket_name=bucket_name) blobs = bucket.list_blobs(prefix=prefix) # Get list of files @@ -246,7 +262,7 @@ def download_directory_from_gcloud(self, prefix): else: print(f"Skipping {target} as it already exists") - with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(blobs_to_download)) as pbar: + with ThreadPoolExecutor(max_workers=32) as executor, tqdm(total=len(blobs_to_download)) as pbar: futures_list = [] download_blob = lambda target, blob: blob.download_to_filename(target) @@ -284,12 +300,10 @@ def unzip_data_files(self, directory: str) -> None: zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] - - with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: - future = executor.submit(extract_single_zip, directory, zip_file) + future = executor.submit(extract_single_zip, directory, self.tmpdir, zip_file) future.add_done_callback(lambda p: pbar.update(1)) futures_list.append(future) @@ -297,9 +311,10 @@ def unzip_data_files(self, directory: str) -> None: for future in futures_list: future.result() - # Remove zip files - remove_command = f"rm {self.directory}/{self.dataset}/data/*.zip" - os.system(remove_command) + if not self.keep_zips: + # Remove zip files + remove_command = f"rm {self.tmpdir}/{self.dataset}/data/*.zip" + os.system(remove_command) if __name__ == "__main__": From 222ec9f0d3c271fe24fd651b2f949e2e76db0358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Thu, 26 Oct 2023 15:26:57 
+0200 Subject: [PATCH 15/20] more workers --- benchmark/cloc/data_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 1fd03217a..0141141ae 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -300,7 +300,7 @@ def unzip_data_files(self, directory: str) -> None: zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] - with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar: + with ProcessPoolExecutor(max_workers=96) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: future = executor.submit(extract_single_zip, directory, self.tmpdir, zip_file) From 7658a365dad05c5a6640d66ae320d46d1194c461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 31 Oct 2023 11:22:03 +0100 Subject: [PATCH 16/20] small fix --- benchmark/cloc/data_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 0141141ae..47fbcc0a6 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -141,7 +141,7 @@ def __init__(self, directory: str, tmpdir: str, test_mode: bool = False, unzip: def extract(self): if self.unzip: - self.unzip_data_files(self.directory + "/CLOC/data") + self.unzip_data_files(self.tmpdir + "/CLOC/data") def convert_labels_and_timestamps(self, all_data: bool): self.convert_labels_and_timestamps_impl( From 6db5f225bd7dc87c3e67b3a891b7cba1787c95af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 31 Oct 2023 11:42:26 +0100 Subject: [PATCH 17/20] another fix --- benchmark/cloc/data_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 47fbcc0a6..565601351 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -303,7 +303,7 @@ def unzip_data_files(self, directory: str) -> None: with ProcessPoolExecutor(max_workers=96) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: - future = executor.submit(extract_single_zip, directory, self.tmpdir, zip_file) + future = executor.submit(extract_single_zip, self.tmpdir, directory, zip_file) future.add_done_callback(lambda p: pbar.update(1)) futures_list.append(future) From 49b6081a593d9774930f70e841f8cfa4a64ce7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 31 Oct 2023 11:47:40 +0100 Subject: [PATCH 18/20] fix not so optimal naming --- benchmark/cloc/data_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 565601351..d2a43fa08 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -287,7 +287,7 @@ def download_dataset(self): elapsed_time = time.time() - start_time print("Elapsed time:", elapsed_time) - def unzip_data_files(self, directory: str) -> None: + def unzip_data_files(self, zip_file_directory: str) -> None: """ Extracts the contents of zip files in a directory into nested folders. 
@@ -298,12 +298,12 @@ def unzip_data_files(self, directory: str) -> None: None """ - zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] + zip_files = [file for file in os.listdir(zip_file_directory) if file.endswith(".zip")] with ProcessPoolExecutor(max_workers=96) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: - future = executor.submit(extract_single_zip, self.tmpdir, directory, zip_file) + future = executor.submit(extract_single_zip, zip_file_directory, self.directory, zip_file) future.add_done_callback(lambda p: pbar.update(1)) futures_list.append(future) From 740bd153f0bf485edb8c52b3180c3e4eae88da7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Wed, 15 Nov 2023 22:15:29 +0100 Subject: [PATCH 19/20] port some cloc fixes --- benchmark/cloc/data_generation.py | 32 +++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index d2a43fa08..29493d8ce 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -68,6 +68,11 @@ def setup_argparser() -> argparse.ArgumentParser: action="store_true", help="Skips the unzipping and only (re)creates labels and timestamps.", ) + parser_.add_argument( + "--skip_labels", + action="store_true", + help="Skips the labeling", + ) parser_.add_argument( "--tmpdir", type=pathlib.Path, action="store", help="If given, use a different directory for storing temporary data" ) @@ -96,7 +101,11 @@ def main(): logger.info("Starting extraction") downloader.extract() - downloader.convert_labels_and_timestamps(args.all) + if not args.skip_labels: + logger.info("Starting labeling") + downloader.convert_labels_and_timestamps(args.all) + + downloader.remove_images_without_label() if args.dummyyear: downloader.add_dummy_year() @@ -159,13 +168,13 @@ def convert_labels_and_timestamps(self, all_data: bool): self.tmpdir + "/CLOC_torchsave_order_files/cross_val_time.torchSave", ) - self.remove_images_without_label() def remove_images_without_label(self): print("Removing images without label...") removed_files = 0 - for filename in glob.iglob(self.directory + "**/*.jpg", recursive=True): + image_paths = pathlib.Path(self.directory).glob("**/*.jpg") + for filename in tqdm(image_paths): file_path = pathlib.Path(filename) label_path = pathlib.Path(file_path.parent / f"{file_path.stem}.label") @@ -189,23 +198,24 @@ def convert_labels_and_timestamps_impl( warned_once = False logger.info("Labels and timestamps loaded, applying") + missing_files = 0 for store_location, label, timestamp in tqdm( zip(store_loc, labels, timestamps), total=len(store_loc) ): path = pathlib.Path( - self.directory - + "/CLOC/data/" + self.directory + "/" + store_location.strip().replace("\n", "") ) if not path.exists(): if not self.test_mode: - raise FileExistsError(f"Cannot find file {store_location}") + raise FileExistsError(f"Cannot find file {path}") if not warned_once: logger.warning( - f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning." + f"Cannot find file {path}, but we are in test mode. Will not repeat this warning." 
) warned_once = True + missing_files += 1 continue label_path = pathlib.Path(path.parent / f"{path.stem}.label") @@ -221,9 +231,11 @@ def convert_labels_and_timestamps_impl( os.utime(path, (actual_timestamp, actual_timestamp)) + logger.info(f"missing files for {store_loc_path} = {missing_files}/{len(store_loc)}") + def add_dummy_year(self): - dummy_path = pathlib.Path(self.directory + "/CLOC/data/dummy.jpg") - dummy_label_path = pathlib.Path(self.directory + "/CLOC/data/dummy.label") + dummy_path = pathlib.Path(self.directory + "/dummy.jpg") + dummy_label_path = pathlib.Path(self.directory + "/dummy.label") assert not dummy_path.exists() and not dummy_label_path.exists() @@ -262,7 +274,7 @@ def download_directory_from_gcloud(self, prefix): else: print(f"Skipping {target} as it already exists") - with ThreadPoolExecutor(max_workers=32) as executor, tqdm(total=len(blobs_to_download)) as pbar: + with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(blobs_to_download)) as pbar: futures_list = [] download_blob = lambda target, blob: blob.download_to_filename(target) From 4466938b0eb5c9a921da53f27e21c6c3a47eaedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Fri, 17 Nov 2023 21:03:23 +0100 Subject: [PATCH 20/20] add working cloc pipeline --- benchmark/cloc/example_pipeline.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmark/cloc/example_pipeline.yml b/benchmark/cloc/example_pipeline.yml index c031dc33b..65d103e53 100644 --- a/benchmark/cloc/example_pipeline.yml +++ b/benchmark/cloc/example_pipeline.yml @@ -5,12 +5,19 @@ model: id: ResNet50 config: num_classes: 713 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" dataloader_workers: 8 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 4 use_previous_model: True initial_model: random + initial_pass: + activated: False batch_size: 256 optimizers: - name: "default" @@ -28,8 +35,10 @@ training: activated: False selection_strategy: name: NewDataStrategy + # With that setting, there is most often only 1 partition maximum_keys_in_memory: 1000000 config: + storage_backend: "database" limit: -1 reset_after_trigger: True data: @@ -43,8 +52,7 @@ data: import io def bytes_parser_function(data: bytes) -> Image: return Image.open(io.BytesIO(data)).convert("RGB") - trigger: id: TimeTrigger trigger_config: - trigger_every: "1y" \ No newline at end of file + trigger_every: "52w"
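
As a quick end-to-end check of what this series produces, the sketch below walks the generated data directory and prints each sample's label and decoded capture date. It is a minimal sketch rather than part of the patches: the `/data/cloc` path is a made-up example target directory, while the `.label` sibling files, the `os.utime`-encoded timestamps, and the 1072911600 offset (the 2004 GMT epoch the script assumes) all mirror `data_generation.py` above.

```python
# Minimal sanity-check sketch for the generated CLOC layout.
# Assumption: data_generation.py from this series was run with the
# hypothetical target directory /data/cloc; adjust DATA_DIR to your setup.
import pathlib
import time

DATA_DIR = pathlib.Path("/data/cloc")  # hypothetical target directory
EPOCH_2004 = 1072911600  # offset the script adds to the raw CLOC timestamps

for image in sorted(DATA_DIR.glob("**/*.jpg"))[:5]:  # inspect a few samples
    # The generator writes each label as plain text into a .label sibling file
    label = int((image.parent / f"{image.stem}.label").read_text(encoding="utf-8"))
    # and encodes the capture time as the file's modification time via os.utime
    mtime = int(image.stat().st_mtime)
    taken = time.strftime("%Y-%m-%d", time.gmtime(mtime))
    print(f"{image.name}: label={label}, taken={taken}, raw_cloc_ts={mtime - EPOCH_2004}")
```

If the script was run with `--dummyyear`, one extra `dummy.jpg` should additionally show up roughly two years after the newest real sample; that trailing timestamp is what lets the `TimeTrigger` in the pipeline above fire once more after the last real data point.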