From 0acb4633407a0d1abcf184012a82900c80b636e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Fri, 13 Oct 2023 18:00:07 +0200 Subject: [PATCH 01/20] start working on CLOC benchmark --- benchmark/cloc/.gitignore | 1 + benchmark/cloc/data_generation.py | 220 ++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 benchmark/cloc/.gitignore create mode 100644 benchmark/cloc/data_generation.py diff --git a/benchmark/cloc/.gitignore b/benchmark/cloc/.gitignore new file mode 100644 index 000000000..fe30281ee --- /dev/null +++ b/benchmark/cloc/.gitignore @@ -0,0 +1 @@ +cloc_data/ \ No newline at end of file diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py new file mode 100644 index 000000000..f1d581825 --- /dev/null +++ b/benchmark/cloc/data_generation.py @@ -0,0 +1,220 @@ +# This code is partially adapted from https://github.com/hammoudhasan/CLDatasets/blob/main/src/downloader.py which is provided license-free +# We thank the authors for their work and hosting the data. + +import argparse +import pathlib +from shutil import which +import os +import time +import zipfile +import logging +import torch +from concurrent.futures import ThreadPoolExecutor +from google.cloud import storage + +from tqdm import tqdm + +logging.basicConfig( + level=logging.NOTSET, + format="[%(asctime)s] [%(filename)15s:%(lineno)4d] %(levelname)-8s %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", +) +logger = logging.getLogger(__name__) + + +def setup_argparser() -> argparse.ArgumentParser: + parser_ = argparse.ArgumentParser(description=f"CLOC Benchmark Storage Script") + parser_.add_argument( + "dir", type=pathlib.Path, action="store", help="Path to data directory" + ) + parser_.add_argument( + "--dummyyear", action="store_true", help="Add a final dummy year to train also on the last trigger in Modyn" + ) + parser_.add_argument( + "--all", action="store_true", help="Store all the available data, including the validation and test sets." + ) + parser_.add_argument( + "--test", action="store_true", help="Enable test mode (just download one zip file)" + ) + parser_.add_argument( + "--skip_download", action="store_true", help="Skips the download and only (re)creates labels and timestamps." + ) + return parser_ + + +def main(): + parser = setup_argparser() + args = parser.parse_args() + + logger.info(f"Destination is {args.dir}") + + downloader = CLDatasets(str(args.dir), test_mode = args.test) + if not args.skip_download: + logger.info("Starting download and extraction.") + downloader.download_and_extract() + + downloader.convert_labels() + downloader.update_timestamps() + + if args.dummyyear: + downloader.add_dummy_year() + + logger.info("Cleaning up") + downloader.cleanup() + + +def is_tool(name): + """Check whether `name` is on PATH and marked as executable.""" + return which(name) is not None + + +class CLDatasets: + """ + A class for downloading datasets from Google Cloud Storage. + """ + + def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): + """ + Initialize the CLDatasets object. + + Args: + directory (str): The directory where the dataset will be saved. 
+ """ + + self.dataset = "CLOC" + self.directory = directory + self.unzip = unzip + self.test_mode = test_mode + + if not os.path.exists(self.directory): + os.makedirs(self.directory) + + def download_and_extract(self): + self.download_dataset() + + if self.unzip: + self.unzip_data_files(self.directory + "/CLOC/data") + + def convert_labels(self): + logger.info("Loading labels and timestamps.") + store_loc = torch.load(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave") + labels = torch.load(self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave") + timestamps = torch.load(self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") + + warned_once = False + + logger.info("Labels and timestamps loaded, applying") + for store_location, label, timestamp in tqdm(zip(store_loc, labels, timestamps), total=len(store_loc)): + + path = pathlib.Path(self.directory + "/CLOC/data/" + store_location.strip().replace("\n", "")) + + if not path.exists(): + if not self.test_mode: + raise FileExistsError(f"Cannot find file {store_location}") + if not warned_once: + logger.warning(f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning.") + warned_once = True + continue + + + label_path = pathlib.Path(path.parent / f"{path.stem}.label") + with open(label_path, "w+", encoding="utf-8") as file: + file.write(str(int(label))) + + # Note: The timestamps obtained in the hd5 file are (very likely) seconds since 2004 (1072911600 GMT timestamp) + actual_timestamp = timestamp + 1072911600 + os.utime(path, (actual_timestamp, actual_timestamp)) + + + + + def update_timestamps(self): + + pass + + def add_dummy_year(self): + pass + + def cleanup(self): + pass + + def download_directory_from_gcloud(self, prefix): + bucket_name = 'cl-datasets' + dl_dir = pathlib.Path(self.directory) + storage_client = storage.Client.create_anonymous_client() + bucket = storage_client.bucket(bucket_name=bucket_name) + blobs = bucket.list_blobs(prefix=prefix) # Get list of files + first_zip_downloaded = False + for blob in blobs: + print(blob.name) + if blob.name.endswith("/"): + continue + if blob.name.endswith("zip"): + if first_zip_downloaded and self.test_mode: + continue + else: + first_zip_downloaded = True + + file_split = blob.name.split("/") + directory = "/".join(file_split[0:-1]) + pathlib.Path(dl_dir / directory).mkdir(parents=True, exist_ok=True) + target = dl_dir / blob.name + + if not target.exists(): + blob.download_to_filename(dl_dir / blob.name) + else: + print(f"Skipping {target} as it already exists") + + def download_dataset(self): + """ + Download the order files from Google Cloud Storage. + """ + print("Order files are being downloaded...") + start_time = time.time() + + self.download_directory_from_gcloud(self.dataset) + + elapsed_time = time.time() - start_time + print("Elapsed time:", elapsed_time) + + def unzip_data_files(self, directory: str) -> None: + """ + Extracts the contents of zip files in a directory into nested folders. + + Args: + directory: The path to the directory containing the zip files. 
+ + Returns: + None + """ + + zip_files = [file for file in os.listdir(directory) if file.endswith('.zip')] + + def extract_single_zip(zip_file: str) -> None: + + zip_path = os.path.join(directory, zip_file) + output_dir = os.path.join( + directory, os.path.splitext(zip_file)[0]) + + os.makedirs(output_dir, exist_ok=True) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(output_dir) + + with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar: + futures_list = [] + for zip_file in zip_files: + future = executor.submit(extract_single_zip, zip_file) + future.add_done_callback(lambda p: pbar.update(1)) + futures_list.append(future) + + # Wait for all tasks to complete + for future in futures_list: + future.result() + + # Remove zip files + remove_command = f"rm {self.directory}/{self.dataset}/data/*.zip" + os.system(remove_command) + +if __name__ == "__main__": + main() From bd51ed85585601089570762551d47d4449d78df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 20:50:47 +0200 Subject: [PATCH 02/20] progress --- benchmark/cloc/data_generation.py | 46 +++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index f1d581825..76d3fcf57 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -9,6 +9,7 @@ import zipfile import logging import torch +import glob from concurrent.futures import ThreadPoolExecutor from google.cloud import storage @@ -53,8 +54,7 @@ def main(): logger.info("Starting download and extraction.") downloader.download_and_extract() - downloader.convert_labels() - downloader.update_timestamps() + downloader.convert_labels_and_timestamps() if args.dummyyear: downloader.add_dummy_year() @@ -85,6 +85,7 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): self.directory = directory self.unzip = unzip self.test_mode = test_mode + self.max_timestamp = 0 if not os.path.exists(self.directory): os.makedirs(self.directory) @@ -95,17 +96,38 @@ def download_and_extract(self): if self.unzip: self.unzip_data_files(self.directory + "/CLOC/data") - def convert_labels(self): + def convert_labels_and_timestamps(self): + self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") + self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave") + self.remove_images_without_label() + + def remove_images_without_label(self): + print("Removing images without label...") + removed_files = 0 + + for filename in glob.iglob(self.directory + '**/*.jpg', recursive=True): + file_path = pathlib.Path(filename) + label_path = pathlib.Path(file_path.parent / f"{file_path.stem}.label") + + if not label_path.exists(): + removed_files += 1 + file_path.unlink() + + print(f"Removed {removed_files} images that do not have a label.") + + def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timestamps_path): logger.info("Loading labels and timestamps.") - store_loc = torch.load(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave") - labels 
= torch.load(self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave") - timestamps = torch.load(self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") + store_loc = torch.load(store_loc_path) + labels = torch.load(labels_path) + timestamps = torch.load(timestamps_path) + + assert len(store_loc) == len(labels) + assert len(store_loc) == len(timestamps) warned_once = False logger.info("Labels and timestamps loaded, applying") for store_location, label, timestamp in tqdm(zip(store_loc, labels, timestamps), total=len(store_loc)): - path = pathlib.Path(self.directory + "/CLOC/data/" + store_location.strip().replace("\n", "")) if not path.exists(): @@ -116,21 +138,15 @@ def convert_labels(self): warned_once = True continue - label_path = pathlib.Path(path.parent / f"{path.stem}.label") with open(label_path, "w+", encoding="utf-8") as file: file.write(str(int(label))) # Note: The timestamps obtained in the hd5 file are (very likely) seconds since 2004 (1072911600 GMT timestamp) actual_timestamp = timestamp + 1072911600 - os.utime(path, (actual_timestamp, actual_timestamp)) - + self.max_timestamp = max(self.max_timestamp, actual_timestamp) - - - def update_timestamps(self): - - pass + os.utime(path, (actual_timestamp, actual_timestamp)) def add_dummy_year(self): pass From 0cb455b124a8833252422a041b8a2d7bce225945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 21:13:49 +0200 Subject: [PATCH 03/20] work --- benchmark/cloc/data_generation.py | 38 +++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 76d3fcf57..6bb9daac4 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -2,17 +2,17 @@ # We thank the authors for their work and hosting the data. 
import argparse -import pathlib -from shutil import which +import glob +import logging import os +import pathlib import time import zipfile -import logging -import torch -import glob from concurrent.futures import ThreadPoolExecutor -from google.cloud import storage +from shutil import copy, which +import torch +from google.cloud import storage from tqdm import tqdm logging.basicConfig( @@ -22,6 +22,8 @@ ) logger = logging.getLogger(__name__) +DAY_LENGTH_SECONDS = 24 * 60 * 60 + def setup_argparser() -> argparse.ArgumentParser: parser_ = argparse.ArgumentParser(description=f"CLOC Benchmark Storage Script") @@ -59,8 +61,7 @@ def main(): if args.dummyyear: downloader.add_dummy_year() - logger.info("Cleaning up") - downloader.cleanup() + logger.info("Done.") def is_tool(name): @@ -86,6 +87,8 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): self.unzip = unzip self.test_mode = test_mode self.max_timestamp = 0 + self.example_path = "" + self.example_label_path = "" if not os.path.exists(self.directory): os.makedirs(self.directory) @@ -145,14 +148,25 @@ def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timest # Note: The timestamps obtained in the hd5 file are (very likely) seconds since 2004 (1072911600 GMT timestamp) actual_timestamp = timestamp + 1072911600 self.max_timestamp = max(self.max_timestamp, actual_timestamp) - + + self.example_label_path = label_path + self.example_path = path + os.utime(path, (actual_timestamp, actual_timestamp)) def add_dummy_year(self): - pass + dummy_path = pathlib.Path(self.directory + "/CLOC/data/dummy.jpg") + dummy_label_path = pathlib.Path(self.directory + "/CLOC/data/dummy.label") + + assert not dummy_path.exists() and not dummy_label_path.exists() + + copy(self.example_path, dummy_path) + copy(self.example_label_path, dummy_label_path) + # Two years in the future + dummy_timestamp = self.max_timestamp + (DAY_LENGTH_SECONDS * 365 * 2) + + os.utime(dummy_path, (dummy_timestamp, dummy_timestamp)) - def cleanup(self): - pass def download_directory_from_gcloud(self, prefix): bucket_name = 'cl-datasets' From 9ba980db3b73755c1e47bd8a26a7a7e316c583e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 21:14:04 +0200 Subject: [PATCH 04/20] formatting --- benchmark/cloc/data_generation.py | 77 ++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 6bb9daac4..86e2807c6 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -31,16 +31,24 @@ def setup_argparser() -> argparse.ArgumentParser: "dir", type=pathlib.Path, action="store", help="Path to data directory" ) parser_.add_argument( - "--dummyyear", action="store_true", help="Add a final dummy year to train also on the last trigger in Modyn" + "--dummyyear", + action="store_true", + help="Add a final dummy year to train also on the last trigger in Modyn", ) parser_.add_argument( - "--all", action="store_true", help="Store all the available data, including the validation and test sets." 
+ "--all", + action="store_true", + help="Store all the available data, including the validation and test sets.", ) parser_.add_argument( - "--test", action="store_true", help="Enable test mode (just download one zip file)" + "--test", + action="store_true", + help="Enable test mode (just download one zip file)", ) parser_.add_argument( - "--skip_download", action="store_true", help="Skips the download and only (re)creates labels and timestamps." + "--skip_download", + action="store_true", + help="Skips the download and only (re)creates labels and timestamps.", ) return parser_ @@ -51,13 +59,13 @@ def main(): logger.info(f"Destination is {args.dir}") - downloader = CLDatasets(str(args.dir), test_mode = args.test) + downloader = CLDatasets(str(args.dir), test_mode=args.test) if not args.skip_download: logger.info("Starting download and extraction.") downloader.download_and_extract() downloader.convert_labels_and_timestamps() - + if args.dummyyear: downloader.add_dummy_year() @@ -100,15 +108,24 @@ def download_and_extract(self): self.unzip_data_files(self.directory + "/CLOC/data") def convert_labels_and_timestamps(self): - self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/train_time.torchSave") - self.convert_labels_and_timestamps_impl(self.directory + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave") + self.convert_labels_and_timestamps_impl( + self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", + self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", + self.directory + "/CLOC_torchsave_order_files/train_time.torchSave", + ) + self.convert_labels_and_timestamps_impl( + self.directory + + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", + self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", + self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave", + ) self.remove_images_without_label() def remove_images_without_label(self): print("Removing images without label...") removed_files = 0 - for filename in glob.iglob(self.directory + '**/*.jpg', recursive=True): + for filename in glob.iglob(self.directory + "**/*.jpg", recursive=True): file_path = pathlib.Path(filename) label_path = pathlib.Path(file_path.parent / f"{file_path.stem}.label") @@ -118,7 +135,9 @@ def remove_images_without_label(self): print(f"Removed {removed_files} images that do not have a label.") - def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timestamps_path): + def convert_labels_and_timestamps_impl( + self, store_loc_path, labels_path, timestamps_path + ): logger.info("Loading labels and timestamps.") store_loc = torch.load(store_loc_path) labels = torch.load(labels_path) @@ -130,14 +149,22 @@ def convert_labels_and_timestamps_impl(self, store_loc_path, labels_path, timest warned_once = False logger.info("Labels and timestamps loaded, applying") - for store_location, label, timestamp in tqdm(zip(store_loc, labels, timestamps), total=len(store_loc)): - path = pathlib.Path(self.directory + "/CLOC/data/" + store_location.strip().replace("\n", "")) - + for store_location, label, timestamp in tqdm( + zip(store_loc, labels, timestamps), total=len(store_loc) + ): + path = pathlib.Path( 
+ self.directory + + "/CLOC/data/" + + store_location.strip().replace("\n", "") + ) + if not path.exists(): if not self.test_mode: raise FileExistsError(f"Cannot find file {store_location}") if not warned_once: - logger.warning(f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning.") + logger.warning( + f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning." + ) warned_once = True continue @@ -158,18 +185,17 @@ def add_dummy_year(self): dummy_path = pathlib.Path(self.directory + "/CLOC/data/dummy.jpg") dummy_label_path = pathlib.Path(self.directory + "/CLOC/data/dummy.label") - assert not dummy_path.exists() and not dummy_label_path.exists() + assert not dummy_path.exists() and not dummy_label_path.exists() - copy(self.example_path, dummy_path) + copy(self.example_path, dummy_path) copy(self.example_label_path, dummy_label_path) # Two years in the future dummy_timestamp = self.max_timestamp + (DAY_LENGTH_SECONDS * 365 * 2) os.utime(dummy_path, (dummy_timestamp, dummy_timestamp)) - def download_directory_from_gcloud(self, prefix): - bucket_name = 'cl-datasets' + bucket_name = "cl-datasets" dl_dir = pathlib.Path(self.directory) storage_client = storage.Client.create_anonymous_client() bucket = storage_client.bucket(bucket_name=bucket_name) @@ -184,14 +210,14 @@ def download_directory_from_gcloud(self, prefix): continue else: first_zip_downloaded = True - + file_split = blob.name.split("/") directory = "/".join(file_split[0:-1]) pathlib.Path(dl_dir / directory).mkdir(parents=True, exist_ok=True) target = dl_dir / blob.name if not target.exists(): - blob.download_to_filename(dl_dir / blob.name) + blob.download_to_filename(dl_dir / blob.name) else: print(f"Skipping {target} as it already exists") @@ -201,7 +227,7 @@ def download_dataset(self): """ print("Order files are being downloaded...") start_time = time.time() - + self.download_directory_from_gcloud(self.dataset) elapsed_time = time.time() - start_time @@ -218,17 +244,15 @@ def unzip_data_files(self, directory: str) -> None: None """ - zip_files = [file for file in os.listdir(directory) if file.endswith('.zip')] + zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] def extract_single_zip(zip_file: str) -> None: - zip_path = os.path.join(directory, zip_file) - output_dir = os.path.join( - directory, os.path.splitext(zip_file)[0]) + output_dir = os.path.join(directory, os.path.splitext(zip_file)[0]) os.makedirs(output_dir, exist_ok=True) - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(output_dir) with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar: @@ -246,5 +270,6 @@ def extract_single_zip(zip_file: str) -> None: remove_command = f"rm {self.directory}/{self.dataset}/data/*.zip" os.system(remove_command) + if __name__ == "__main__": main() From 416f8f6d822db3a62333635e81b4fb55432191ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Mon, 16 Oct 2023 21:17:11 +0200 Subject: [PATCH 05/20] all flag --- benchmark/cloc/data_generation.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 86e2807c6..8ac52c4a2 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -64,7 +64,7 @@ def main(): logger.info("Starting download and extraction.") downloader.download_and_extract() - 
downloader.convert_labels_and_timestamps()
+    downloader.convert_labels_and_timestamps(args.all)
 
     if args.dummyyear:
         downloader.add_dummy_year()
@@ -107,18 +107,22 @@ def download_and_extract(self):
         if self.unzip:
             self.unzip_data_files(self.directory + "/CLOC/data")
 
-    def convert_labels_and_timestamps(self):
+    def convert_labels_and_timestamps(self, all_data: bool):
         self.convert_labels_and_timestamps_impl(
             self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave",
             self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave",
             self.directory + "/CLOC_torchsave_order_files/train_time.torchSave",
         )
-        self.convert_labels_and_timestamps_impl(
-            self.directory
-            + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave",
-            self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave",
-            self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave",
-        )
+
+        if all_data:
+            logger.info("Converting all data")
+            self.convert_labels_and_timestamps_impl(
+                self.directory
+                + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave",
+                self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave",
+                self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave",
+            )
+
         self.remove_images_without_label()
 
     def remove_images_without_label(self):

From 1ff03aa72fd693c112bb1c1c2fd255dd4c7178eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Mon, 16 Oct 2023 21:50:52 +0200
Subject: [PATCH 06/20] add readme

---
 benchmark/cloc/README.md          | 38 +++++++++++++++++++++++++++++++
 benchmark/cloc/data_generation.py |  1 +
 2 files changed, 39 insertions(+)
 create mode 100644 benchmark/cloc/README.md

diff --git a/benchmark/cloc/README.md b/benchmark/cloc/README.md
new file mode 100644
index 000000000..c86139e60
--- /dev/null
+++ b/benchmark/cloc/README.md
@@ -0,0 +1,38 @@
+# CLOC Data
+
+In this directory, you can find the files necessary to run experiments with the CLOC dataset.
+The dataset was introduced [in this paper](https://arxiv.org/pdf/2108.09020.pdf), and we make use of the [CLDatasets](https://github.com/hammoudhasan/CLDatasets) mirror.
+
+## Data Generation
+To run the downloading script, you need to install the `google-cloud-storage` package.
+Then, you can use the `data_generation.py` script to download the data and set the timestamps accordingly.
+Use the `-h` flag to find out more.
+
+## License
+
+The CLOC Dataset comes with the MIT License.
+The CLDatasets repository does not have an explicit license but allows users to "adapt the code to suit your specific requirements".
+
+### CLOC License Copy
+
+MIT License
+
+Copyright (c) 2021 Intel Labs
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 8ac52c4a2..e6ef65750 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -1,5 +1,6 @@ # This code is partially adapted from https://github.com/hammoudhasan/CLDatasets/blob/main/src/downloader.py which is provided license-free # We thank the authors for their work and hosting the data. +# This code requires the pip google-cloud-storage package import argparse import glob From 1b3bb74ceb0d68a17e99255ea9537e8dd57ba732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 17 Oct 2023 13:55:33 +0200 Subject: [PATCH 07/20] add example pipeline (yet to be tested) --- benchmark/cloc/example_pipeline.yml | 50 +++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 benchmark/cloc/example_pipeline.yml diff --git a/benchmark/cloc/example_pipeline.yml b/benchmark/cloc/example_pipeline.yml new file mode 100644 index 000000000..c031dc33b --- /dev/null +++ b/benchmark/cloc/example_pipeline.yml @@ -0,0 +1,50 @@ +pipeline: + name: CLOC Pipeline + version: 1.0.0 +model: + id: ResNet50 + config: + num_classes: 713 +training: + gpus: 1 + device: "cuda:0" + dataloader_workers: 8 + use_previous_model: True + initial_model: random + batch_size: 256 + optimizers: + - name: "default" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model" + config: + lr: 0.025 + weight_decay: 0.0001 + momentum: 0.9 + optimization_criterion: + name: "CrossEntropyLoss" + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 1000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: cloc + transformations: ["transforms.RandomResizedCrop(224)", + "transforms.RandomHorizontalFlip()", + "transforms.ToTensor()", + "transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])"] + bytes_parser_function: | + from PIL import Image + import io + def bytes_parser_function(data: bytes) -> Image: + return Image.open(io.BytesIO(data)).convert("RGB") + +trigger: + id: TimeTrigger + trigger_config: + trigger_every: "1y" \ No newline at end of file From ba33727e9226df1925924dca890f5d1edbfb1c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Wed, 18 Oct 2023 19:43:58 +0200 Subject: [PATCH 08/20] parallel download --- benchmark/cloc/data_generation.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index e6ef65750..753c8f0d1 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -206,6 +206,8 @@ def download_directory_from_gcloud(self, prefix): bucket = storage_client.bucket(bucket_name=bucket_name) blobs = bucket.list_blobs(prefix=prefix) # Get list of files first_zip_downloaded = False + blobs_to_download = [] + for blob in blobs: print(blob.name) if blob.name.endswith("/"): @@ -222,10 +224,23 @@ def download_directory_from_gcloud(self, prefix): target = dl_dir / blob.name if not target.exists(): - blob.download_to_filename(dl_dir / blob.name) + blobs_to_download.append((dl_dir / blob.name, blob)) else: print(f"Skipping 
{target} as it already exists") + with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(blobs_to_download)) as pbar: + futures_list = [] + download_blob = lambda target, blob: blob.download_to_filename(target) + + for blob in blobs_to_download: + future = executor.submit(download_blob, *blob) + future.add_done_callback(lambda p: pbar.update(1)) + futures_list.append(future) + + # Wait for all tasks to complete + for future in futures_list: + future.result() + def download_dataset(self): """ Download the order files from Google Cloud Storage. From e199e8989c6b7ef7a26c407859d845daf3e3a51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Wed, 18 Oct 2023 19:47:14 +0200 Subject: [PATCH 09/20] unnecessary print --- benchmark/cloc/data_generation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 753c8f0d1..79e36dc3b 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -209,7 +209,6 @@ def download_directory_from_gcloud(self, prefix): blobs_to_download = [] for blob in blobs: - print(blob.name) if blob.name.endswith("/"): continue if blob.name.endswith("zip"): From 3338ea0685f201d4c236e7a6983de8764075a112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Thu, 19 Oct 2023 17:49:42 +0200 Subject: [PATCH 10/20] add skip unzip flag --- benchmark/cloc/data_generation.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 79e36dc3b..821a15725 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -51,6 +51,11 @@ def setup_argparser() -> argparse.ArgumentParser: action="store_true", help="Skips the download and only (re)creates labels and timestamps.", ) + parser_.add_argument( + "--skip_unzip", + action="store_true", + help="Skips the unzipping and only (re)creates labels and timestamps.", + ) return parser_ @@ -62,8 +67,12 @@ def main(): downloader = CLDatasets(str(args.dir), test_mode=args.test) if not args.skip_download: - logger.info("Starting download and extraction.") - downloader.download_and_extract() + logger.info("Starting download") + downloader.download_dataset() + + if not args.skip_unzip: + logger.info("Starting extraction") + downloader.extract() downloader.convert_labels_and_timestamps(args.all) @@ -102,9 +111,7 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): if not os.path.exists(self.directory): os.makedirs(self.directory) - def download_and_extract(self): - self.download_dataset() - + def extract(self): if self.unzip: self.unzip_data_files(self.directory + "/CLOC/data") From 45ad57a91904a6192ac1dbfee6697949514c9418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Thu, 19 Oct 2023 17:53:48 +0200 Subject: [PATCH 11/20] use processes for downloading --- benchmark/cloc/data_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 821a15725..c53b1dacb 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -9,7 +9,7 @@ import pathlib import time import zipfile -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from shutil import copy, which import torch @@ -234,7 +234,7 @@ def 
download_directory_from_gcloud(self, prefix):
             else:
                 print(f"Skipping {target} as it already exists")
 
-        with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(blobs_to_download)) as pbar:
+        with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(blobs_to_download)) as pbar:
             futures_list = []
             download_blob = lambda target, blob: blob.download_to_filename(target)
 
@@ -281,7 +281,7 @@ def extract_single_zip(zip_file: str) -> None:
             with zipfile.ZipFile(zip_path, "r") as zip_ref:
                 zip_ref.extractall(output_dir)
 
-        with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar:
+        with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar:
             futures_list = []
             for zip_file in zip_files:
                 future = executor.submit(extract_single_zip, zip_file)

From 8c554f41a37edc4b7e2fd97e3c807c0c06813ca1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Thu, 19 Oct 2023 17:55:11 +0200
Subject: [PATCH 12/20] move function to root

---
 benchmark/cloc/data_generation.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py
index c53b1dacb..c9ef7508a 100644
--- a/benchmark/cloc/data_generation.py
+++ b/benchmark/cloc/data_generation.py
@@ -25,6 +25,14 @@
 
 DAY_LENGTH_SECONDS = 24 * 60 * 60
 
+def extract_single_zip(directory: str, zip_file: str) -> None:
+    zip_path = os.path.join(directory, zip_file)
+    output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(output_dir)
 
 def setup_argparser() -> argparse.ArgumentParser:
@@ -272,19 +280,12 @@ def unzip_data_files(self, directory: str) -> None:
 
         zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")]
 
-        def extract_single_zip(zip_file: str) -> None:
-            zip_path = os.path.join(directory, zip_file)
-            output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])
-
-            os.makedirs(output_dir, exist_ok=True)
-            with zipfile.ZipFile(zip_path, "r") as zip_ref:
-                zip_ref.extractall(output_dir)
 
         with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar:
             futures_list = []
             for zip_file in zip_files:
-                future = executor.submit(extract_single_zip, zip_file)
+                future = executor.submit(extract_single_zip, directory, zip_file)
                 future.add_done_callback(lambda p: pbar.update(1))
                 futures_list.append(future)

From a39bde941771687243e59711a6fcd312d5d1af66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Thu, 19 Oct 2023 17:57:27 +0200
Subject: [PATCH 13/20] add try catch

---
 benchmark/cloc/data_generation.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py
index c9ef7508a..880dead12 100644
--- a/benchmark/cloc/data_generation.py
+++ b/benchmark/cloc/data_generation.py
@@ -31,8 +31,12 @@ def extract_single_zip(directory: str, zip_file: str) -> None:
 
     os.makedirs(output_dir, exist_ok=True)
 
-    with zipfile.ZipFile(zip_path, "r") as zip_ref:
-        zip_ref.extractall(output_dir)
+    try:
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            zip_ref.extractall(output_dir)
+    except Exception as e:
+        logger.error(f"Error while extracting file {zip_path}")
+        logger.error(e)
 
 def setup_argparser() -> argparse.ArgumentParser:
     parser_ = argparse.ArgumentParser(description=f"CLOC Benchmark Storage Script")

From fa52e91b14ade209b3b4f577cc4b745e3f39258c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20B=C3=B6ther?=
Date: Thu, 26 Oct 2023 15:25:06 +0200
Subject: [PATCH 14/20] try using tmpdir

---
 benchmark/cloc/data_generation.py | 53 ++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py
index 880dead12..1fd03217a 100644
--- a/benchmark/cloc/data_generation.py
+++ b/benchmark/cloc/data_generation.py
@@ -25,9 +25,9 @@
 
 DAY_LENGTH_SECONDS = 24 * 60 * 60
 
-def extract_single_zip(directory: str, zip_file: str) -> None:
+def extract_single_zip(directory: str, target: str, zip_file: str) -> None:
     zip_path = os.path.join(directory, zip_file)
-    output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])
+    output_dir = os.path.join(target, os.path.splitext(zip_file)[0])
 
     os.makedirs(output_dir, exist_ok=True)
 
@@ -68,6 +68,14 @@ def setup_argparser() -> argparse.ArgumentParser:
         action="store_true",
         help="Skips the unzipping and only (re)creates labels and timestamps.",
     )
+    parser_.add_argument(
+        "--tmpdir", type=pathlib.Path, action="store", help="If given, use a different directory for storing temporary data"
+    )
+    parser_.add_argument(
+        "--keep_zips",
+        action="store_true",
+        help="Keep the downloaded zipfiles.",
+    )
     return parser_
 
 
@@ -75,9 +83,11 @@ def main():
     parser = setup_argparser()
     args = parser.parse_args()
 
-    logger.info(f"Destination is {args.dir}")
+    tmpdir = args.tmpdir if args.tmpdir is not None else args.dir
+
+    logger.info(f"Final destination is {args.dir}; download destination is {tmpdir}")
 
-    downloader = CLDatasets(str(args.dir), test_mode=args.test)
+    downloader = CLDatasets(str(args.dir), str(tmpdir), test_mode=args.test, keep_zips=args.keep_zips)
     if not args.skip_download:
         logger.info("Starting download")
         downloader.download_dataset()
@@ -104,7 +114,7 @@ class CLDatasets:
     A class for downloading datasets from Google Cloud Storage.
     """
 
-    def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True):
+    def __init__(self, directory: str, tmpdir: str, test_mode: bool = False, unzip: bool = True, keep_zips: bool = False):
        """
         Initialize the CLDatasets object. 
@@ -114,8 +124,10 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): self.dataset = "CLOC" self.directory = directory + self.tmpdir = tmpdir self.unzip = unzip self.test_mode = test_mode + self.keep_zips = keep_zips self.max_timestamp = 0 self.example_path = "" self.example_label_path = "" @@ -123,24 +135,28 @@ def __init__(self, directory: str, test_mode: bool = False, unzip: bool = True): if not os.path.exists(self.directory): os.makedirs(self.directory) + if not os.path.exists(self.tmpdir): + os.makedirs(self.tmpdir) + + def extract(self): if self.unzip: self.unzip_data_files(self.directory + "/CLOC/data") def convert_labels_and_timestamps(self, all_data: bool): self.convert_labels_and_timestamps_impl( - self.directory + "/CLOC_torchsave_order_files/train_store_loc.torchSave", - self.directory + "/CLOC_torchsave_order_files/train_labels.torchSave", - self.directory + "/CLOC_torchsave_order_files/train_time.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/train_store_loc.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/train_labels.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/train_time.torchSave", ) if all_data: logger.info("Converting all data") self.convert_labels_and_timestamps_impl( - self.directory + self.tmpdir + "/CLOC_torchsave_order_files/cross_val_store_loc.torchSave", - self.directory + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", - self.directory + "/CLOC_torchsave_order_files/cross_val_time.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/cross_val_labels.torchSave", + self.tmpdir + "/CLOC_torchsave_order_files/cross_val_time.torchSave", ) self.remove_images_without_label() @@ -220,7 +236,7 @@ def add_dummy_year(self): def download_directory_from_gcloud(self, prefix): bucket_name = "cl-datasets" - dl_dir = pathlib.Path(self.directory) + dl_dir = pathlib.Path(self.tmpdir) storage_client = storage.Client.create_anonymous_client() bucket = storage_client.bucket(bucket_name=bucket_name) blobs = bucket.list_blobs(prefix=prefix) # Get list of files @@ -246,7 +262,7 @@ def download_directory_from_gcloud(self, prefix): else: print(f"Skipping {target} as it already exists") - with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(blobs_to_download)) as pbar: + with ThreadPoolExecutor(max_workers=32) as executor, tqdm(total=len(blobs_to_download)) as pbar: futures_list = [] download_blob = lambda target, blob: blob.download_to_filename(target) @@ -284,12 +300,10 @@ def unzip_data_files(self, directory: str) -> None: zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] - - with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: - future = executor.submit(extract_single_zip, directory, zip_file) + future = executor.submit(extract_single_zip, directory, self.tmpdir, zip_file) future.add_done_callback(lambda p: pbar.update(1)) futures_list.append(future) @@ -297,9 +311,10 @@ def unzip_data_files(self, directory: str) -> None: for future in futures_list: future.result() - # Remove zip files - remove_command = f"rm {self.directory}/{self.dataset}/data/*.zip" - os.system(remove_command) + if not self.keep_zips: + # Remove zip files + remove_command = f"rm {self.tmpdir}/{self.dataset}/data/*.zip" + os.system(remove_command) if __name__ == "__main__": From 222ec9f0d3c271fe24fd651b2f949e2e76db0358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Thu, 26 Oct 2023 15:26:57 
+0200 Subject: [PATCH 15/20] more workers --- benchmark/cloc/data_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 1fd03217a..0141141ae 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -300,7 +300,7 @@ def unzip_data_files(self, directory: str) -> None: zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] - with ProcessPoolExecutor(max_workers=32) as executor, tqdm(total=len(zip_files)) as pbar: + with ProcessPoolExecutor(max_workers=96) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: future = executor.submit(extract_single_zip, directory, self.tmpdir, zip_file) From 7658a365dad05c5a6640d66ae320d46d1194c461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 31 Oct 2023 11:22:03 +0100 Subject: [PATCH 16/20] small fix --- benchmark/cloc/data_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 0141141ae..47fbcc0a6 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -141,7 +141,7 @@ def __init__(self, directory: str, tmpdir: str, test_mode: bool = False, unzip: def extract(self): if self.unzip: - self.unzip_data_files(self.directory + "/CLOC/data") + self.unzip_data_files(self.tmpdir + "/CLOC/data") def convert_labels_and_timestamps(self, all_data: bool): self.convert_labels_and_timestamps_impl( From 6db5f225bd7dc87c3e67b3a891b7cba1787c95af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 31 Oct 2023 11:42:26 +0100 Subject: [PATCH 17/20] another fix --- benchmark/cloc/data_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 47fbcc0a6..565601351 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -303,7 +303,7 @@ def unzip_data_files(self, directory: str) -> None: with ProcessPoolExecutor(max_workers=96) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: - future = executor.submit(extract_single_zip, directory, self.tmpdir, zip_file) + future = executor.submit(extract_single_zip, self.tmpdir, directory, zip_file) future.add_done_callback(lambda p: pbar.update(1)) futures_list.append(future) From 49b6081a593d9774930f70e841f8cfa4a64ce7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Tue, 31 Oct 2023 11:47:40 +0100 Subject: [PATCH 18/20] fix not so optimal naming --- benchmark/cloc/data_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index 565601351..d2a43fa08 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -287,7 +287,7 @@ def download_dataset(self): elapsed_time = time.time() - start_time print("Elapsed time:", elapsed_time) - def unzip_data_files(self, directory: str) -> None: + def unzip_data_files(self, zip_file_directory: str) -> None: """ Extracts the contents of zip files in a directory into nested folders. 
@@ -298,12 +298,12 @@ def unzip_data_files(self, directory: str) -> None: None """ - zip_files = [file for file in os.listdir(directory) if file.endswith(".zip")] + zip_files = [file for file in os.listdir(zip_file_directory) if file.endswith(".zip")] with ProcessPoolExecutor(max_workers=96) as executor, tqdm(total=len(zip_files)) as pbar: futures_list = [] for zip_file in zip_files: - future = executor.submit(extract_single_zip, self.tmpdir, directory, zip_file) + future = executor.submit(extract_single_zip, zip_file_directory, self.directory, zip_file) future.add_done_callback(lambda p: pbar.update(1)) futures_list.append(future) From 740bd153f0bf485edb8c52b3180c3e4eae88da7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Wed, 15 Nov 2023 22:15:29 +0100 Subject: [PATCH 19/20] port some cloc fixes --- benchmark/cloc/data_generation.py | 32 +++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/benchmark/cloc/data_generation.py b/benchmark/cloc/data_generation.py index d2a43fa08..29493d8ce 100644 --- a/benchmark/cloc/data_generation.py +++ b/benchmark/cloc/data_generation.py @@ -68,6 +68,11 @@ def setup_argparser() -> argparse.ArgumentParser: action="store_true", help="Skips the unzipping and only (re)creates labels and timestamps.", ) + parser_.add_argument( + "--skip_labels", + action="store_true", + help="Skips the labeling", + ) parser_.add_argument( "--tmpdir", type=pathlib.Path, action="store", help="If given, use a different directory for storing temporary data" ) @@ -96,7 +101,11 @@ def main(): logger.info("Starting extraction") downloader.extract() - downloader.convert_labels_and_timestamps(args.all) + if not args.skip_labels: + logger.info("Starting labeling") + downloader.convert_labels_and_timestamps(args.all) + + downloader.remove_images_without_label() if args.dummyyear: downloader.add_dummy_year() @@ -159,13 +168,13 @@ def convert_labels_and_timestamps(self, all_data: bool): self.tmpdir + "/CLOC_torchsave_order_files/cross_val_time.torchSave", ) - self.remove_images_without_label() def remove_images_without_label(self): print("Removing images without label...") removed_files = 0 - for filename in glob.iglob(self.directory + "**/*.jpg", recursive=True): + image_paths = pathlib.Path(self.directory).glob("**/*.jpg") + for filename in tqdm(image_paths): file_path = pathlib.Path(filename) label_path = pathlib.Path(file_path.parent / f"{file_path.stem}.label") @@ -189,23 +198,24 @@ def convert_labels_and_timestamps_impl( warned_once = False logger.info("Labels and timestamps loaded, applying") + missing_files = 0 for store_location, label, timestamp in tqdm( zip(store_loc, labels, timestamps), total=len(store_loc) ): path = pathlib.Path( - self.directory - + "/CLOC/data/" + self.directory + "/" + store_location.strip().replace("\n", "") ) if not path.exists(): if not self.test_mode: - raise FileExistsError(f"Cannot find file {store_location}") + raise FileExistsError(f"Cannot find file {path}") if not warned_once: logger.warning( - f"Cannot find file {store_location}, but we are in test mode. Will not repeat this warning." + f"Cannot find file {path}, but we are in test mode. Will not repeat this warning." 
) warned_once = True + missing_files += 1 continue label_path = pathlib.Path(path.parent / f"{path.stem}.label") @@ -221,9 +231,11 @@ def convert_labels_and_timestamps_impl( os.utime(path, (actual_timestamp, actual_timestamp)) + logger.info(f"missing files for {store_loc_path} = {missing_files}/{len(store_loc)}") + def add_dummy_year(self): - dummy_path = pathlib.Path(self.directory + "/CLOC/data/dummy.jpg") - dummy_label_path = pathlib.Path(self.directory + "/CLOC/data/dummy.label") + dummy_path = pathlib.Path(self.directory + "/dummy.jpg") + dummy_label_path = pathlib.Path(self.directory + "/dummy.label") assert not dummy_path.exists() and not dummy_label_path.exists() @@ -262,7 +274,7 @@ def download_directory_from_gcloud(self, prefix): else: print(f"Skipping {target} as it already exists") - with ThreadPoolExecutor(max_workers=32) as executor, tqdm(total=len(blobs_to_download)) as pbar: + with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(blobs_to_download)) as pbar: futures_list = [] download_blob = lambda target, blob: blob.download_to_filename(target) From 4466938b0eb5c9a921da53f27e21c6c3a47eaedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= Date: Fri, 17 Nov 2023 21:03:23 +0100 Subject: [PATCH 20/20] add working cloc pipeline --- benchmark/cloc/example_pipeline.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmark/cloc/example_pipeline.yml b/benchmark/cloc/example_pipeline.yml index c031dc33b..65d103e53 100644 --- a/benchmark/cloc/example_pipeline.yml +++ b/benchmark/cloc/example_pipeline.yml @@ -5,12 +5,19 @@ model: id: ResNet50 config: num_classes: 713 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" dataloader_workers: 8 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 4 use_previous_model: True initial_model: random + initial_pass: + activated: False batch_size: 256 optimizers: - name: "default" @@ -28,8 +35,10 @@ training: activated: False selection_strategy: name: NewDataStrategy + # With that setting, there is most often only 1 partition maximum_keys_in_memory: 1000000 config: + storage_backend: "database" limit: -1 reset_after_trigger: True data: @@ -43,8 +52,7 @@ data: import io def bytes_parser_function(data: bytes) -> Image: return Image.open(io.BytesIO(data)).convert("RGB") - trigger: id: TimeTrigger trigger_config: - trigger_every: "1y" \ No newline at end of file + trigger_every: "52w"
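
As a quick end-to-end check of what this series produces, the sketch below walks the generated data directory and prints each sample's label and decoded capture date. It is a minimal sketch rather than part of the patches: the `/data/cloc` path is a made-up example target directory, while the `.label` sibling files, the `os.utime`-encoded timestamps, and the 1072911600 offset (the 2004 GMT epoch the script assumes) all mirror `data_generation.py` above.

```python
# Minimal sanity-check sketch for the generated CLOC layout.
# Assumption: data_generation.py from this series was run with the
# hypothetical target directory /data/cloc; adjust DATA_DIR to your setup.
import pathlib
import time

DATA_DIR = pathlib.Path("/data/cloc")  # hypothetical target directory
EPOCH_2004 = 1072911600  # offset the script adds to the raw CLOC timestamps

for image in sorted(DATA_DIR.glob("**/*.jpg"))[:5]:  # inspect a few samples
    # The generator writes each label as plain text into a .label sibling file
    label = int((image.parent / f"{image.stem}.label").read_text(encoding="utf-8"))
    # and encodes the capture time as the file's modification time via os.utime
    mtime = int(image.stat().st_mtime)
    taken = time.strftime("%Y-%m-%d", time.gmtime(mtime))
    print(f"{image.name}: label={label}, taken={taken}, raw_cloc_ts={mtime - EPOCH_2004}")
```

If the script was run with `--dummyyear`, one extra `dummy.jpg` should additionally show up roughly two years after the newest real sample; that trailing timestamp is what lets the `TimeTrigger` in the pipeline above fire once more after the last real data point.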