From fbdb420d1d532f632148c7f2a41a34eb0c70a35d Mon Sep 17 00:00:00 2001 From: Derek Date: Tue, 30 Jan 2024 15:54:53 +0000 Subject: [PATCH] Added in another try/except block to catch a Unicode error that was being thrown when we read LD+JSON structured data from one particular site/page --- extruct/jsonld.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 495ad07b..6cf848cb 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -40,9 +40,13 @@ def _extract_items(self, node): data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub("", script), strict=False) - if isinstance(data, list): - for item in data: - yield item - elif isinstance(data, dict): - yield data + try: + data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub("", script), strict=False) + if isinstance(data, list): + for item in data: + yield item + elif isinstance(data, dict): + yield data + except Exception as e: + print("Exception trying to parse JSON data: "+str(e)+" so skipping this data...") + yield {}