From eb981590ed65e0df0008b3a11bcce552bd2ec8ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?= <joerg.stucke@fkie.fraunhofer.de>
Date: Thu, 2 Nov 2023 17:30:21 +0100
Subject: [PATCH] better sanitiziation of analysis results

---
 src/storage/entry_conversion.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/storage/entry_conversion.py b/src/storage/entry_conversion.py
index 736ff0880..95dc8bb53 100644
--- a/src/storage/entry_conversion.py
+++ b/src/storage/entry_conversion.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import json
 import logging
+import re
 from datetime import datetime
 from time import time
 
@@ -9,6 +11,8 @@
 from objects.firmware import Firmware
 from storage.schema import AnalysisEntry, FileObjectEntry, FirmwareEntry, VirtualFilePath
 
+JSON_UNICODE_REGEX = re.compile(r'\\u[0-9a-f]{4}')
+
 
 def firmware_from_entry(fw_entry: FirmwareEntry, analysis_filter: list[str] | None = None) -> Firmware:
     firmware = Firmware()
@@ -142,17 +146,23 @@ def _sanitize_value(analysis_data: dict, key: str, value):
 
 
 def _sanitize_string(string: str) -> str:
-    string = string.replace('\0', '')
     try:
         string.encode()
     except UnicodeEncodeError:
         string = string.encode(errors='replace').decode()
+    # replace all characters that are converted to unicode characters in JSON, because unicode characters can't be
+    # saved in the PostgreSQL database
+    json_string = json.dumps(string)
+    if JSON_UNICODE_REGEX.search(json_string):
+        logging.warning(f'Sanitizing unicode characters in string {json_string[100:]}')
+        string = json.loads(JSON_UNICODE_REGEX.sub('', json_string))
     return string
 
 
 def _sanitize_key(analysis_data: dict, key: str):
-    if '\0' in key:
-        analysis_data[key.replace('\0', '')] = analysis_data.pop(key)
+    sanitized_key = _sanitize_string(key)
+    if sanitized_key != key:
+        analysis_data[sanitized_key] = analysis_data.pop(key)
 
 
 def _sanitize_list(value: list) -> list: