Cartographer and PixPlot Image preview #400

Open: wants to merge 84 commits into base: master

Changes from 62 of 84 commits
4f0748e
basic mappings for a pixplot; known bug
dale-wahl Oct 24, 2023
9ca73e0
Merge branch 'master' into cartagrapher
dale-wahl Oct 25, 2023
cae91bc
add in LOTS of necessary steps so it actually works
dale-wahl Oct 26, 2023
6f62b13
add the pixplot template as a base
dale-wahl Oct 26, 2023
d473454
fix up atlas overlapping images & increase res
dale-wahl Oct 26, 2023
7177441
create a "plots" endpoint that uses pixplot_template and theoretical …
dale-wahl Nov 1, 2023
8c38621
fix up some path issues in the js file
dale-wahl Nov 1, 2023
4459394
add plot as preview!
dale-wahl Nov 1, 2023
c0a042e
rename
dale-wahl Nov 2, 2023
97bd8c3
Update .gitignore
dale-wahl Nov 2, 2023
946a167
Update .gitignore
dale-wahl Nov 2, 2023
0ae3382
add pixplot_template images
dale-wahl Nov 2, 2023
372d1ac
Update .gitignore
dale-wahl Nov 2, 2023
fad263d
add additional images for pixplot
dale-wahl Nov 2, 2023
c01c6f4
build paths based on two different sources (assets and data)
dale-wahl Nov 2, 2023
3c8256e
Merge branch 'master' into cartographer
dale-wahl Nov 2, 2023
2cbda77
Merge branch 'master' into cartographer
dale-wahl Nov 7, 2023
3e47291
create preset that auto runs cartographer
dale-wahl Nov 7, 2023
dfae9d9
only run preset image downloader
dale-wahl Nov 7, 2023
9ea862c
remove debug log
dale-wahl Nov 7, 2023
701a480
serve archived files
dale-wahl Nov 29, 2023
1ce0caf
serve archived files via frontend; use those images in cartographer
dale-wahl Nov 30, 2023
d1184b9
fix umap thumbsize when actually grid
dale-wahl Nov 30, 2023
64e5a2c
fix up the cartographer page a bit
dale-wahl Nov 30, 2023
1ca28a2
Merge branch 'master' into cartographer
dale-wahl Nov 30, 2023
e1fd2bf
do NOT change that 128 thumbnail size
dale-wahl Dec 12, 2023
067506b
serve archive files via generator (as opposed to extracting and delet…
dale-wahl Dec 12, 2023
c4ad487
fix adding annotation labels to mapped results
dale-wahl Dec 12, 2023
7ae8315
preview accepts url params to increase number/size of preview
dale-wahl Dec 12, 2023
ab6de26
attach preset to download images (instead of cartographer)
dale-wahl Dec 12, 2023
a6347db
preview zip datasets w/ cartographer if exists
dale-wahl Dec 12, 2023
98c3279
preset does not copy results_file but updates its results_file; clean…
dale-wahl Dec 12, 2023
6cbbf5b
render something for zips when no cartographer exists
dale-wahl Dec 12, 2023
73aa724
Merge branch 'master' into cartographer
dale-wahl Dec 14, 2023
ef0e633
Merge branch 'master' into cartographer
dale-wahl Dec 14, 2023
d07914e
add tiktok and telegram presets to use cartographer
dale-wahl Dec 14, 2023
9cc6e9e
add metadata to plot!
dale-wahl Dec 15, 2023
dd7f26e
modify cartographer to use max amount
dale-wahl Dec 18, 2023
13367be
use collages
dale-wahl Dec 18, 2023
409c3ba
dataset updates: add get_children method, mod get_all_children, allow…
dale-wahl Dec 20, 2023
7b3902f
Merge branch 'master' into cartographer
dale-wahl Dec 20, 2023
9772f53
moved hash_similarity_network.py to video_hasher.py
dale-wahl Dec 20, 2023
562a53b
staticmethod to init a dataset w/o db
dale-wahl Dec 21, 2023
82c8187
prep cartographer to check for coordinate-maps
dale-wahl Dec 21, 2023
adf58ca
create coordinate-map datasets from sigma network preview - disabled
dale-wahl Dec 21, 2023
d7030cc
Merge branch 'master' into cartographer
dale-wahl Jan 9, 2024
89c87e6
allow text on categorical layout only
dale-wahl Jan 12, 2024
b68ed8c
cartographer: enable date layout; and almost categorical (hidden curr…
dale-wahl Jan 12, 2024
e312888
Merge branch 'master' into cartographer
dale-wahl Jan 30, 2024
8e8e0ee
get archived file handle file not found
dale-wahl Jan 30, 2024
8ed5a32
cartographer use archive zip instead of results subfolder
dale-wahl Jan 30, 2024
f8168bd
update to use get_children() dataset method
dale-wahl Jan 31, 2024
ee32662
time some routes in debug mode
dale-wahl Jan 31, 2024
98ede11
If button is hidden (say because you don't want to implement it yet),…
dale-wahl Feb 1, 2024
96c7586
Merge branch 'master' into cartographer
dale-wahl Feb 7, 2024
f903b34
cartographer: fix front sizes on layout change!!!
dale-wahl Feb 7, 2024
6e0a6a2
cartographer: increase character count to display more categories
dale-wahl Feb 7, 2024
0c86246
cartographer: category view works now!
dale-wahl Feb 7, 2024
9d9934f
cartographer: found that stupid floating zero
dale-wahl Feb 8, 2024
98dba8e
pixplot_template: move metadata to left of image view; fix thumbs in …
dale-wahl Feb 8, 2024
dd25db3
cartographer: tested a better categorical point_size
dale-wahl Feb 8, 2024
6229333
fix get_all_children method to allow non instantiated datasets
dale-wahl Feb 20, 2024
c25f8ec
Merge branch 'master' into cartographer
dale-wahl Feb 20, 2024
14fe1f0
remove dataset.get
dale-wahl Feb 20, 2024
4d62dcf
fix typo and remove time_this debug
dale-wahl Feb 20, 2024
5d16065
revert .env change (mistake)
dale-wahl Feb 20, 2024
8970f64
add cartographer for video scenes
dale-wahl Feb 20, 2024
cb370e6
Merge branch 'master' into cartographer
dale-wahl Feb 21, 2024
a04ee29
deactivate video_scene_frames to plot pipeline
dale-wahl Feb 21, 2024
b5ee46b
cartographer handle directories
dale-wahl Feb 21, 2024
e053fee
reenable video-scene-frames preset to plot
dale-wahl Feb 21, 2024
5685d31
remove video-scene-frames preset; breaks other preset's `is_compatible`
dale-wahl Feb 21, 2024
14af493
add ui_only parameter to DataSet.get_available_processors() and Basic…
dale-wahl Feb 29, 2024
77fa3d3
Merge branch 'display_in_ui' into cartographer
dale-wahl Feb 29, 2024
9743254
update image downloaders and presets to use display_in_ui instead of …
dale-wahl Feb 29, 2024
1b6c0c8
don't delete twice
dale-wahl Feb 29, 2024
7720631
preview zip files opens new window as opposed to iframe
dale-wahl Feb 29, 2024
7f68486
Merge branch 'master' into cartographer
dale-wahl May 8, 2024
cb477c9
fix up ui display changes
dale-wahl May 8, 2024
c6bdc04
fix is_hidden from tiktok video to image downloader
dale-wahl May 8, 2024
219f30b
Merge branch 'master' into cartographer
dale-wahl May 28, 2024
ad79e70
fix up max images (if 0, max would always use 0)
dale-wahl May 28, 2024
8bcc5b1
map umap optional!
dale-wahl May 28, 2024
35c1c6d
alphabetic is also optional
dale-wahl May 28, 2024
18 changes: 10 additions & 8 deletions .gitignore
@@ -44,14 +44,6 @@ webtool/venv/
*.ipynb
venv/

# do not ignore interface images
!webtool/static/img/*.png
!webtool/static/img/*.gif
!webtool/static/img/*.jpg
!webtool/static/img/favicon/*.ico
!webtool/static/img/flags/*.png
!common/assets/github-screenshots/*.png

# generated by 4CAT
webtool/static/css/colours.css

@@ -65,3 +57,13 @@ keys/
images/
sphinx-3.3.1/
sphinx/

# do not ignore interface images
!webtool/static/img/*.png
!webtool/static/img/*.gif
!webtool/static/img/*.jpg
!webtool/static/img/favicon/*.ico
!webtool/static/img/flags/*.png
!webtool/static/pixplot_template/assets/images/*
!webtool/static/pixplot_template/assets/images/icons/*
!common/assets/github-screenshots/*.png
4 changes: 3 additions & 1 deletion backend/lib/preset.py
@@ -27,7 +27,9 @@ def process(self):
# also make sure there is always a "parameters" key
pipeline = [{"parameters": {}, **p} for p in pipeline.copy()]

pipeline[-1]["parameters"]["attach_to"] = self.dataset.key
# check that preset has an "attach_to" parameter in one of the processors
if not any("attach_to" in p["parameters"] for p in pipeline):
pipeline[-1]["parameters"]["attach_to"] = self.dataset.key

# map the linear pipeline to a nested processor parameter set
while len(pipeline) > 1:
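For context, `process()` folds a preset's linear pipeline into a single nested job, with each step carrying its successor in a "next" parameter; the fix above stops the preset from overwriting an "attach_to" that a step already declares. A minimal standalone sketch of that folding, with illustrative processor types and dataset key (not 4CAT's exact values):

# sketch of the pipeline folding done by process(); types/keys are illustrative
pipeline = [
    {"type": "image-downloader", "parameters": {"amount": 100}},
    {"type": "pix-plot"},
]

# also make sure there is always a "parameters" key (as in the diff)
pipeline = [{"parameters": {}, **p} for p in pipeline.copy()]

# only claim the preset's key if no step sets "attach_to" itself (the fix above)
if not any("attach_to" in p["parameters"] for p in pipeline):
    pipeline[-1]["parameters"]["attach_to"] = "preset-dataset-key"

# fold right-to-left: each step embeds its successor under "next"
while len(pipeline) > 1:
    last = pipeline.pop()
    pipeline[-1]["parameters"]["next"] = [last]

print(pipeline[0])  # one nested job description, outermost step first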
34 changes: 21 additions & 13 deletions backend/lib/processor.py
@@ -307,8 +307,7 @@ def after_process(self):

if self.dataset.get_results_path().exists():
# Update the surrogate's results file suffix to match this dataset's suffix
surrogate.data["result_file"] = surrogate.get_results_path().with_suffix(self.dataset.get_results_path().suffix)
shutil.copyfile(str(self.dataset.get_results_path()), str(surrogate.get_results_path()))
surrogate.result_file = str(self.dataset.get_results_path().name)

try:
surrogate.finish(self.dataset.data["num_rows"])
@@ -626,7 +625,7 @@ def write_csv_items_and_finish(self, data):
self.dataset.update_status("Finished")
self.dataset.finish(len(data))

def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED):
def write_archive_and_finish(self, filelist_or_folder, num_items=None, compression=zipfile.ZIP_STORED):
"""
Archive a bunch of files into a zip archive and finish processing

@@ -639,21 +638,30 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI
are not compressed, to speed up unarchiving.
"""
is_folder = False
if issubclass(type(files), PurePath):
is_folder = files
if not files.exists() or not files.is_dir():
raise RuntimeError("Folder %s is not a folder that can be archived" % files)
if issubclass(type(filelist_or_folder), PurePath):
# folder with files
is_folder = filelist_or_folder
if not filelist_or_folder.exists() or not filelist_or_folder.is_dir():
raise RuntimeError("Folder %s is not a folder that can be archived" % filelist_or_folder)

files = files.glob("*")
#files = files.glob("*")

# create zip of archive and delete temporary files and folder
self.dataset.update_status("Compressing results into archive")
done = 0
with zipfile.ZipFile(self.dataset.get_results_path(), "w", compression=compression) as zip:
for output_path in files:
zip.write(output_path, output_path.name)
output_path.unlink()
done += 1
with zipfile.ZipFile(self.dataset.get_results_path(), "w", compression=compression) as zipf:
if is_folder:
for root, dirs, files in os.walk(filelist_or_folder):
for file in files:
zipf.write(os.path.join(root, file),
os.path.relpath(os.path.join(root, file), filelist_or_folder))
done += 1
else:
# list of files
for output_path in filelist_or_folder:
zipf.write(output_path, output_path.name)
output_path.unlink()
done += 1

# delete temporary folder
if is_folder:
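The new folder branch is essentially a recursive zip that stores paths relative to the folder root, so nested directories survive the round trip; unlike the file-list branch it does not unlink each file, since the whole temporary folder is deleted afterwards. A self-contained sketch of just that branch (the function name is illustrative):

import os
import zipfile
from pathlib import Path

def zip_folder(folder: Path, archive: Path, compression=zipfile.ZIP_STORED):
    """Zip a folder's contents, keeping paths relative to the folder root."""
    done = 0
    with zipfile.ZipFile(archive, "w", compression=compression) as zipf:
        for root, dirs, files in os.walk(folder):
            for file in files:
                full = os.path.join(root, file)
                # store e.g. "thumbs/0001.jpg" rather than the absolute path
                zipf.write(full, os.path.relpath(full, folder))
                done += 1
    return done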
4 changes: 3 additions & 1 deletion backend/workers/cleanup_tempfiles.py
@@ -51,10 +51,12 @@ def work(self):
# if for whatever reason there are multiple hashes in the filename,
# the key would always be the last one
key = possible_keys.pop()

try:
dataset = DataSet(key=key, db=self.db)
except DataSetException:
if self.db.fetchone(f"select * from datasets where result_file = '{file.name}'") is not None:
# Another dataset is using this file
continue
# the dataset has been deleted since, but the result file still
# exists - should be safe to clean up
self.log.info("No matching dataset with key %s for file %s, deleting file" % (key, str(file)))
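One caveat: the reuse check interpolates `file.name` directly into the SQL string. The database wrapper accepts `%s` placeholders elsewhere in this PR, so a parameterised form would sidestep quoting problems for odd filenames. A self-contained demonstration of the same check, using sqlite3 so it runs anywhere, with the table reduced to the two relevant columns:

import sqlite3

# toy stand-in for the datasets table
db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE datasets (key TEXT, result_file TEXT)")
db.execute("INSERT INTO datasets VALUES ('abc123', 'abc123.csv')")

file_name = "abc123.csv"  # illustrative orphan candidate
# parameterised query: the driver handles quoting
row = db.execute(
    "SELECT 1 FROM datasets WHERE result_file = ?", (file_name,)
).fetchone()
print(row is not None)  # True -> another dataset still references this file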
106 changes: 80 additions & 26 deletions common/lib/dataset.py
@@ -39,7 +39,7 @@ class DataSet(FourcatModule):
data = None
key = ""

children = None
_children = None
available_processors = None
genealogy = None
preset_parent = None
@@ -71,7 +71,6 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare
# Ensure mutable attributes are set in __init__ as they are unique to each DataSet
self.data = {}
self.parameters = {}
self.children = []
self.available_processors = {}
self.genealogy = []
self.staging_areas = []
@@ -148,11 +147,6 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare
# Reserve filename and update data['result_file']
self.reserve_result_file(parameters, extension)

# retrieve analyses and processors that may be run for this dataset
analyses = self.db.fetchall("SELECT * FROM datasets WHERE key_parent = %s ORDER BY timestamp ASC", (self.key,))
self.children = sorted([DataSet(data=analysis, db=self.db) for analysis in analyses],
key=lambda dataset: dataset.is_finished(), reverse=True)

self.refresh_owners()

def check_dataset_finished(self):
@@ -566,16 +560,17 @@ def delete(self, commit=True):
self.db.delete("datasets_owners", where={"key": self.key}, commit=commit)
self.db.delete("users_favourites", where={"key": self.key}, commit=commit)

# delete from drive
try:
self.get_results_path().unlink()
if self.get_results_path().with_suffix(".log").exists():
self.get_results_path().with_suffix(".log").unlink()
if self.get_results_folder_path().exists():
shutil.rmtree(self.get_results_folder_path())
except FileNotFoundError:
# already deleted, apparently
pass
# delete from drive if not used elsewhere
if self.db.fetchone(f"select * from datasets where result_file = '{self.get_results_path().name}' and key != '{self.key}'") is None:
try:
self.get_results_path().unlink()
if self.get_results_path().with_suffix(".log").exists():
self.get_results_path().with_suffix(".log").unlink()
if self.get_results_folder_path().exists():
shutil.rmtree(self.get_results_folder_path())
except FileNotFoundError:
# already deleted, apparently
pass

def update_children(self, **kwargs):
"""
@@ -724,7 +719,7 @@ def add_owner(self, username, role="owner"):
self.refresh_owners()

# make sure children's owners remain in sync
for child in self.children:
for child in self.get_children(instantiate_datasets=True):
child.add_owner(username, role)
# not recursive, since we're calling it from recursive code!
child.copy_ownership_from(self, recursive=False)
@@ -755,7 +750,7 @@ def remove_owner(self, username):
del self.tagged_owners[username]

# make sure children's owners remain in sync
for child in self.children:
for child in self.get_children(instantiate_datasets=True):
child.remove_owner(username)
# not recursive, since we're calling it from recursive code!
child.copy_ownership_from(self, recursive=False)
@@ -800,7 +795,7 @@ def copy_ownership_from(self, dataset, recursive=True):

self.db.commit()
if recursive:
for child in self.children:
for child in self.get_children(instantiate_datasets=True):
child.copy_ownership_from(self, recursive=recursive)

def get_parameters(self):
@@ -1242,7 +1237,29 @@ def get_genealogy(self, inclusive=False):
self.genealogy = genealogy
return self.genealogy

def get_all_children(self, recursive=True):
def get_children(self, instantiate_datasets=True, update=False):
"""
Get children of this dataset

:param bool instantiate_datasets: Instantiate DataSet objects for each child else return ChildDataset objects w/ only key and type attributes
:param bool update: Update the list of children from database if True, else return cached value
:return list: List of child datasets
"""
if self._children and not update:
return self._children

if instantiate_datasets:
analyses = self.db.fetchall("SELECT * FROM datasets WHERE key_parent = %s ORDER BY timestamp ASC",
(self.key,))
self._children = sorted([DataSet(data=analysis, db=self.db) for analysis in analyses],
key=lambda dataset: dataset.is_finished(), reverse=True)
return self._children
else:
# Returns simple ChildDataset objects with only key and type
# Do not update self._children since this is not a list of DataSet objects
return [ChildDataset(key=key, type=dataset_type) for key, dataset_type in self.db.fetchall("SELECT key, type FROM datasets WHERE key_parent = %s ORDER BY timestamp ASC", (self.key,))]

def get_all_children(self, recursive=True, instantiate_datasets=True):
"""
Get all children of this dataset

@@ -1252,11 +1269,20 @@ def get_all_children(self, recursive=True):

:return list: List of DataSets
"""
children = [DataSet(data=record, db=self.db) for record in self.db.fetchall("SELECT * FROM datasets WHERE key_parent = %s", (self.key,))]
children = self.get_children(instantiate_datasets=instantiate_datasets)
results = children.copy()
if recursive:
for child in children:
results += child.get_all_children(recursive)
if instantiate_datasets:
# Can use the DataSet.get_all_children method for each child
for child in children:
results += child.get_all_children(recursive)
else:
# Need to check database directly for children of children
while children:
child = children.pop(0)
new_kids = [ChildDataset(key=key, type=dataset_type) for key, dataset_type in self.db.fetchall("SELECT key, type FROM datasets WHERE key_parent = %s ORDER BY timestamp ASC", (child.key,))]
children += new_kids
results += new_kids

return results

@@ -1374,9 +1400,11 @@ def get_own_processor(self):

:return: Processor class, or `None` if not available.
"""
processor_type = self.parameters.get("type", self.data.get("type"))
processor_type = self.type if hasattr(self, "type") else self.parameters.get("type")
return backend.all_modules.processors.get(processor_type)

def get(self, key):
return self.data.get(key)

def get_available_processors(self, user=None):
"""
@@ -1397,7 +1425,7 @@ def get_available_processors(self, user=None):

processors = self.get_compatible_processors(user=user)

for analysis in self.children:
for analysis in self.get_children(instantiate_datasets=False):
if analysis.type not in processors:
continue

@@ -1591,6 +1619,18 @@ def warn_unmappable_item(self, item_count, processor=None, error_message=None, w
else:
# No other log available
raise DataSetException(f"Unable to map item {item_count} for dataset {closest_dataset.key} and properly warn")
@staticmethod
def get_dataset_by_key(key, db=None):
"""
Get dataset by key

:param str key: Dataset key
:return DataSet: Dataset
"""
if db is None:
config.with_db()
db = config.db
return DataSet(key=key, db=db)

def __getattr__(self, attr):
"""
@@ -1640,3 +1680,17 @@ def __setattr__(self, attr, value):

if attr == "parameters":
self.parameters = json.loads(value)

class ChildDataset:
"""
Allows easy access to some child dataset attributes without instantiating them all
"""
def __init__(self, key, type):
self.key = key
self.type = type

def instantiate(self, db):
"""
Instantiates the dataset
"""
return DataSet(key=self.key, db=db)
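Taken together, `get_children()`, `get_all_children()` and `ChildDataset` let callers walk a dataset tree without paying for a full `DataSet` per node. A hypothetical usage sketch (assumes an existing `DataSet` instance `ds` and a `db` connection; names are illustrative):

# cheap traversal: ChildDataset stubs expose only .key and .type
for child in ds.get_children(instantiate_datasets=False):
    print(child.key, child.type)

# full objects when needed; this result is cached on the dataset
children = ds.get_children(instantiate_datasets=True)

# recursive listing of the whole subtree as lightweight stubs
subtree = ds.get_all_children(recursive=True, instantiate_datasets=False)

# promote a single stub to a full DataSet only when its data is needed
first = subtree[0].instantiate(db) if subtree else None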
27 changes: 26 additions & 1 deletion common/lib/helpers.py
@@ -2,6 +2,8 @@
Miscellaneous helper functions for the 4CAT backend
"""
import subprocess
import zipfile

import requests
import datetime
import smtplib
@@ -99,6 +101,13 @@ def sniff_encoding(file):
return "utf-8-sig" if maybe_bom == b"\xef\xbb\xbf" else "utf-8"


def get_html_redirect_page(url):
"""
Returns a html string to redirect to PixPlot.
"""
return f"<head><meta http-equiv='refresh' charset='utf-8' content='0; URL={url}'></head>"


def get_software_commit():
"""
Get current 4CAT commit hash
@@ -829,4 +838,20 @@ def _sets_to_lists_gen(d):
else:
yield k, v

return dict(_sets_to_lists_gen(d))
return dict(_sets_to_lists_gen(d))

def get_archived_file(archive_path, archived_file, temp_dir):
with zipfile.ZipFile(archive_path, "r") as archive_file:
archive_contents = sorted(archive_file.namelist())

if archived_file in archive_contents:
info = archive_file.getinfo(archived_file)
if info.is_dir():
raise IsADirectoryError("File is a directory")

archive_file.extract(archived_file, temp_dir)

return temp_dir.joinpath(archived_file)

else:
raise FileNotFoundError("File not found in archive")
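A usage sketch for `get_archived_file` (paths and member names are illustrative): it extracts a single archive member into `temp_dir` and returns the extracted path, raising if the member is missing or is a directory.

import tempfile
from pathlib import Path

from common.lib.helpers import get_archived_file

temp_dir = Path(tempfile.mkdtemp())
try:
    image = get_archived_file(Path("data/abc123.zip"), "thumbs/0001.jpg", temp_dir)
    print("extracted to", image)  # temp_dir/thumbs/0001.jpg
except FileNotFoundError:
    print("no such file in the archive")
except IsADirectoryError:
    print("requested member is a directory")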
10 changes: 2 additions & 8 deletions processors/machine-learning/pix-plot.py
@@ -11,7 +11,7 @@

from common.config_manager import config
from common.lib.dmi_service_manager import DmiServiceManager, DsmOutOfMemory, DmiServiceManagerException
from common.lib.helpers import UserInput, convert_to_int
from common.lib.helpers import UserInput, get_html_redirect_page
from backend.lib.processor import BasicProcessor

__author__ = "Dale Wahl"
@@ -227,7 +227,7 @@ def process(self):

# Results HTML file redirects to output_dir/index.html
plot_url = ('https://' if config.get("flask.https") else 'http://') + config.get("flask.server_name") + '/result/' + f"{os.path.relpath(self.dataset.get_results_folder_path(), self.dataset.folder)}/index.html"
html_file = self.get_html_page(plot_url)
html_file = get_html_redirect_page(plot_url)

# Write HTML file
with self.dataset.get_results_path().open("w", encoding="utf-8") as output_file:
@@ -362,12 +362,6 @@ def format_metadata(self, temp_path):
self.dataset.update_status("Metadata.csv created")
return metadata_file_path if rows_written != 0 else False

def get_html_page(self, url):
"""
Returns a html string to redirect to PixPlot.
"""
return f"<head><meta http-equiv='refresh' charset='utf-8' content='0; URL={url}'></head>"

def clean_filename(self, s):
"""
Given a string that points to a filename, return a clean filename
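With `get_html_page` moved to the shared helper, the processor's result file is just a one-line HTML shim that bounces the browser to the generated plot. A sketch of the write with made-up values (the real `plot_url` is assembled from the Flask config as shown above):

from common.lib.helpers import get_html_redirect_page

plot_url = "http://4cat.local/result/abc123/index.html"  # illustrative
with open("abc123.html", "w", encoding="utf-8") as output_file:
    output_file.write(get_html_redirect_page(plot_url))
# written contents: <head><meta http-equiv='refresh' charset='utf-8'
#                   content='0; URL=http://4cat.local/result/abc123/index.html'></head>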