
Commit 160b2f3

fixed base64 encoded image handling in html files
1 parent c7c1ae9 commit 160b2f3

5 files changed: +115 additions, -28 deletions

DATSIMPORT.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+uv run importer/dats_importer.py --input_dir /home/tfischer/Development/interactive-topic-modelling/datasets/20ngtest --backend_url http://localhost:19220/ --project_id 4 --tag_key tags --is_json --doctype text --content_key content --mime_type text/plain
+
+https://docs.nomic.ai/atlas/data-maps/topic-modeling
+
+https://docs.nomic.ai/atlas/data-maps/controls#lasso-and-tagging
+
+https://docs.nomic.ai/atlas/data-maps/guides/collaborative-tagging
+
+https://atlas.nomic.ai/data/nomic/airline-reviews-data
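
Purely as an illustration of the importer invocation above, here is a minimal argparse sketch that would accept the same flags. dats_importer.py is not part of this diff, so the parser below is an assumption reconstructed from the flag names alone, not the importer's actual code.

# Hypothetical CLI sketch inferred only from the flags in the command above;
# the real importer/dats_importer.py may define its arguments differently.
import argparse


def build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Import documents into a DATS project.")
    parser.add_argument("--input_dir", required=True, help="Directory containing the documents to import.")
    parser.add_argument("--backend_url", required=True, help="Base URL of the backend, e.g. http://localhost:19220/.")
    parser.add_argument("--project_id", type=int, required=True, help="ID of the target project.")
    parser.add_argument("--tag_key", help="Key in each JSON record that holds the tags.")
    parser.add_argument("--is_json", action="store_true", help="Treat the input files as JSON records.")
    parser.add_argument("--doctype", default="text", help="Type of document to create.")
    parser.add_argument("--content_key", help="Key in each JSON record that holds the content.")
    parser.add_argument("--mime_type", default="text/plain", help="MIME type to assign to the imported documents.")
    return parser


if __name__ == "__main__":
    print(build_arg_parser().parse_args())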

backend/src/app/preprocessing/pipeline/steps/text/init/add_text_init_steps.py

Lines changed: 11 additions & 3 deletions
@@ -8,11 +8,14 @@ def add_text_init_steps(pipeline: PreprocessingPipeline) -> None:
     from app.preprocessing.pipeline.steps.text.init.create_pptd import (
         create_pptd,
     )
+    from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_html_docs import (
+        extract_content_in_html_from_html_docs,
+    )
     from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_pdf_docs import (
         extract_content_in_html_from_pdf_docs,
     )
-    from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_raw_text_docs import (
-        extract_content_in_html_from_raw_text_docs,
+    from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_text_docs import (
+        extract_content_in_html_from_text_docs,
     )
     from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_word_docs import (
         extract_content_in_html_from_word_docs,
@@ -35,7 +38,12 @@ def add_text_init_steps(pipeline: PreprocessingPipeline) -> None:

     pipeline.register_step(
         required_data=["pptd"],
-        func=extract_content_in_html_from_raw_text_docs,
+        func=extract_content_in_html_from_html_docs,
+    )
+
+    pipeline.register_step(
+        required_data=["pptd"],
+        func=extract_content_in_html_from_text_docs,
     )

     pipeline.register_step(
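
The hunks above register two new init steps with the same required_data but different MIME-type gates. The PreprocessingPipeline class itself is not part of this diff, so the following is only a rough, self-contained sketch of how such required_data/func registrations could be executed; everything except the register_step(required_data=..., func=...) call shape is an assumption.

# Hypothetical stand-in pipeline runner; only the register_step signature is
# taken from the diff above, the rest is invented for illustration.
from dataclasses import dataclass, field
from typing import Any, Callable


@dataclass
class ToyCargo:
    data: dict[str, Any] = field(default_factory=dict)


class ToyPipeline:
    def __init__(self) -> None:
        self._steps: list[tuple[list[str], Callable[[ToyCargo], ToyCargo]]] = []

    def register_step(self, required_data: list[str], func: Callable[[ToyCargo], ToyCargo]) -> None:
        # Remember each step together with the cargo keys it requires.
        self._steps.append((required_data, func))

    def run(self, cargo: ToyCargo) -> ToyCargo:
        # Run steps in registration order; skip a step if its inputs are missing.
        for required, func in self._steps:
            if all(key in cargo.data for key in required):
                cargo = func(cargo)
        return cargo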

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_html_docs.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+import uuid
+from pathlib import Path
+
+from app.core.data.repo.repo_service import RepoService
+from app.core.data.repo.utils import base64_to_image
+from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
+from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+from bs4 import BeautifulSoup, Tag
+from loguru import logger
+
+repo = RepoService()
+
+
+def __extract_base64_images_from_html_docs(
+    filepath: Path, content: str
+) -> tuple[str, list[Path]]:
+    """
+    Extracts base64-encoded images from an HTML document and returns the modified HTML content along with a list of extracted image paths.
+
+    Args:
+        filepath (Path): The path to the HTML file.
+        content (str): The raw HTML content of the document.
+
+    Returns:
+        tuple: A tuple containing the modified HTML content and a list of extracted image paths.
+    """
+    # Parse the HTML content
+    soup = BeautifulSoup(content, "html.parser")
+
+    # Extract base64 encoded images from the HTML content
+    base64_images = {}
+    for img_tag in soup.find_all("img"):
+        src = img_tag.get("src", "")
+        if src.startswith("data:image") and "base64," in src:
+            base64_data = src.split("base64,")[1]
+            unique_filename = f"{uuid.uuid4()}.png"
+            base64_images[unique_filename] = base64_data
+            img_tag["src"] = unique_filename  # Replace src with the filename
+
+    # Store all extracted images in the same directory as the HTML
+    extracted_images: list[Path] = []
+    output_path = filepath.parent
+    for img_fn, b64_img in base64_images.items():
+        img_path = output_path / img_fn
+        try:
+            img = base64_to_image(b64_img)
+        except Exception as e:
+            logger.error(
+                f"Error decoding base64 image {img_fn} from {filepath.name}: {e}"
+            )
+            # delete the image tag entirely from the HTML
+            img_tag = soup.find("img", {"src": img_fn})
+            if img_tag and isinstance(img_tag, Tag):
+                img_tag.decompose()
+            continue
+        img.save(img_path, format="PNG")
+        extracted_images.append(img_path)
+        logger.debug(f"Saved extracted image {img_path} from HTML {filepath.name}.")
+
+    return str(soup), extracted_images
+
+
+def extract_content_in_html_from_html_docs(cargo: PipelineCargo) -> PipelineCargo:
+    pptd: PreProTextDoc = cargo.data["pptd"]
+
+    if pptd.mime_type not in ["text/html"]:
+        return cargo
+
+    content = pptd.filepath.read_text(encoding="utf-8")
+
+    html, extracted_images = __extract_base64_images_from_html_docs(
+        pptd.filepath, content
+    )
+
+    pptd.html = html
+    pptd.extracted_images = extracted_images
+
+    return cargo
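
The new step delegates the actual decoding to base64_to_image from app.core.data.repo.utils, which is not shown in this commit. A plausible Pillow-based sketch of such a helper follows; this is an assumption about its behavior (decode the base64 payload and return a PIL image that supports .save(..., format="PNG")), not the repository's actual implementation.

# Hypothetical sketch of base64_to_image, assuming Pillow; the real helper in
# app.core.data.repo.utils is not part of this diff and may differ.
import base64
from io import BytesIO

from PIL import Image


def base64_to_image(b64_data: str) -> Image.Image:
    # Decode the base64 payload into bytes and open it as a PIL image.
    image_bytes = base64.b64decode(b64_data)
    return Image.open(BytesIO(image_bytes))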

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_raw_text_docs.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_text_docs.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+from app.core.data.repo.repo_service import RepoService
+from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
+from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+
+repo = RepoService()
+
+
+def extract_content_in_html_from_text_docs(cargo: PipelineCargo) -> PipelineCargo:
+    pptd: PreProTextDoc = cargo.data["pptd"]
+
+    if pptd.mime_type not in ["text/plain"]:
+        return cargo
+
+    content = pptd.filepath.read_text(encoding="utf-8")
+    pptd.html = f"<html><body><p>{content}</p></body></html>"
+
+    return cargo
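
For a quick sense of what this step produces, the toy snippet below mirrors its text/plain wrapping on a throwaway file. The FakePPTD dataclass is invented for this sketch; the real PreProTextDoc model in app.preprocessing.pipeline.model has more fields and is not reproduced here.

# Toy demonstration of the text/plain wrapping behavior above, using an
# invented stand-in for PreProTextDoc.
from dataclasses import dataclass
from pathlib import Path


@dataclass
class FakePPTD:
    filepath: Path
    mime_type: str
    html: str = ""


tmp = Path("example.txt")
tmp.write_text("Hello, pipeline!", encoding="utf-8")
pptd = FakePPTD(filepath=tmp, mime_type="text/plain")

# Same wrapping as in the step: read the raw text and embed it in a minimal HTML skeleton.
content = pptd.filepath.read_text(encoding="utf-8")
pptd.html = f"<html><body><p>{content}</p></body></html>"
print(pptd.html)  # -> <html><body><p>Hello, pipeline!</p></body></html>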
