
Commit 160b2f3

fixed base64 encoded image handling in html files
1 parent c7c1ae9 commit 160b2f3

5 files changed: +115 additions, -28 deletions

DATSIMPORT.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+uv run importer/dats_importer.py --input_dir /home/tfischer/Development/interactive-topic-modelling/datasets/20ngtest --backend_url http://localhost:19220/ --project_id 4 --tag_key tags --is_json --doctype text --content_key content --mime_type text/plain
+
+https://docs.nomic.ai/atlas/data-maps/topic-modeling
+
+https://docs.nomic.ai/atlas/data-maps/controls#lasso-and-tagging
+
+https://docs.nomic.ai/atlas/data-maps/guides/collaborative-tagging
+
+https://atlas.nomic.ai/data/nomic/airline-reviews-data
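
Purely as an illustration of the importer invocation above, here is a minimal argparse sketch that would accept the same flags. dats_importer.py is not part of this diff, so the parser below is an assumption reconstructed from the flag names alone, not the importer's actual code.

# Hypothetical CLI sketch inferred only from the flags in the command above;
# the real importer/dats_importer.py may define its arguments differently.
import argparse


def build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Import documents into a DATS project.")
    parser.add_argument("--input_dir", required=True, help="Directory containing the documents to import.")
    parser.add_argument("--backend_url", required=True, help="Base URL of the backend, e.g. http://localhost:19220/.")
    parser.add_argument("--project_id", type=int, required=True, help="ID of the target project.")
    parser.add_argument("--tag_key", help="Key in each JSON record that holds the tags.")
    parser.add_argument("--is_json", action="store_true", help="Treat the input files as JSON records.")
    parser.add_argument("--doctype", default="text", help="Type of document to create.")
    parser.add_argument("--content_key", help="Key in each JSON record that holds the content.")
    parser.add_argument("--mime_type", default="text/plain", help="MIME type to assign to the imported documents.")
    return parser


if __name__ == "__main__":
    print(build_arg_parser().parse_args())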

backend/src/app/preprocessing/pipeline/steps/text/init/add_text_init_steps.py

Lines changed: 11 additions & 3 deletions
@@ -8,11 +8,14 @@ def add_text_init_steps(pipeline: PreprocessingPipeline) -> None:
     from app.preprocessing.pipeline.steps.text.init.create_pptd import (
         create_pptd,
     )
+    from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_html_docs import (
+        extract_content_in_html_from_html_docs,
+    )
     from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_pdf_docs import (
         extract_content_in_html_from_pdf_docs,
     )
-    from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_raw_text_docs import (
-        extract_content_in_html_from_raw_text_docs,
+    from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_text_docs import (
+        extract_content_in_html_from_text_docs,
     )
     from app.preprocessing.pipeline.steps.text.init.extract_content_in_html_from_word_docs import (
         extract_content_in_html_from_word_docs,
@@ -35,7 +38,12 @@ def add_text_init_steps(pipeline: PreprocessingPipeline) -> None:

     pipeline.register_step(
         required_data=["pptd"],
-        func=extract_content_in_html_from_raw_text_docs,
+        func=extract_content_in_html_from_html_docs,
+    )
+
+    pipeline.register_step(
+        required_data=["pptd"],
+        func=extract_content_in_html_from_text_docs,
     )

     pipeline.register_step(
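
The hunks above register two new init steps with the same required_data but different MIME-type gates. The PreprocessingPipeline class itself is not part of this diff, so the following is only a rough, self-contained sketch of how such required_data/func registrations could be executed; everything except the register_step(required_data=..., func=...) call shape is an assumption.

# Hypothetical stand-in pipeline runner; only the register_step signature is
# taken from the diff above, the rest is invented for illustration.
from dataclasses import dataclass, field
from typing import Any, Callable


@dataclass
class ToyCargo:
    data: dict[str, Any] = field(default_factory=dict)


class ToyPipeline:
    def __init__(self) -> None:
        self._steps: list[tuple[list[str], Callable[[ToyCargo], ToyCargo]]] = []

    def register_step(self, required_data: list[str], func: Callable[[ToyCargo], ToyCargo]) -> None:
        # Remember each step together with the cargo keys it requires.
        self._steps.append((required_data, func))

    def run(self, cargo: ToyCargo) -> ToyCargo:
        # Run steps in registration order; skip a step if its inputs are missing.
        for required, func in self._steps:
            if all(key in cargo.data for key in required):
                cargo = func(cargo)
        return cargo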

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_html_docs.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+import uuid
+from pathlib import Path
+
+from app.core.data.repo.repo_service import RepoService
+from app.core.data.repo.utils import base64_to_image
+from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
+from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+from bs4 import BeautifulSoup, Tag
+from loguru import logger
+
+repo = RepoService()
+
+
+def __extract_base64_images_from_html_docs(
+    filepath: Path, content: str
+) -> tuple[str, list[Path]]:
+    """
+    Extracts base64-encoded images from an HTML document and returns the modified HTML content along with a list of extracted image paths.
+
+    Args:
+        filepath (Path): The path to the HTML file.
+        content (str): The raw HTML content of the document.
+
+    Returns:
+        tuple: A tuple containing the modified HTML content and a list of extracted image paths.
+    """
+    # Parse the HTML content
+    soup = BeautifulSoup(content, "html.parser")
+
+    # Extract base64 encoded images from the HTML content
+    base64_images = {}
+    for img_tag in soup.find_all("img"):
+        src = img_tag.get("src", "")
+        if src.startswith("data:image") and "base64," in src:
+            base64_data = src.split("base64,")[1]
+            unique_filename = f"{uuid.uuid4()}.png"
+            base64_images[unique_filename] = base64_data
+            img_tag["src"] = unique_filename  # Replace src with the filename
+
+    # Store all extracted images in the same directory as the HTML
+    extracted_images: list[Path] = []
+    output_path = filepath.parent
+    for img_fn, b64_img in base64_images.items():
+        img_path = output_path / img_fn
+        try:
+            img = base64_to_image(b64_img)
+        except Exception as e:
+            logger.error(
+                f"Error decoding base64 image {img_fn} from {filepath.name}: {e}"
+            )
+            # delete the image tag entirely from the HTML
+            img_tag = soup.find("img", {"src": img_fn})
+            if img_tag and isinstance(img_tag, Tag):
+                img_tag.decompose()
+            continue
+        img.save(img_path, format="PNG")
+        extracted_images.append(img_path)
+        logger.debug(f"Saved extracted image {img_path} from HTML {filepath.name}.")
+
+    return str(soup), extracted_images
+
+
+def extract_content_in_html_from_html_docs(cargo: PipelineCargo) -> PipelineCargo:
+    pptd: PreProTextDoc = cargo.data["pptd"]
+
+    if pptd.mime_type not in ["text/html"]:
+        return cargo
+
+    content = pptd.filepath.read_text(encoding="utf-8")
+
+    html, extracted_images = __extract_base64_images_from_html_docs(
+        pptd.filepath, content
+    )
+
+    pptd.html = html
+    pptd.extracted_images = extracted_images
+
+    return cargo
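
The new step delegates the actual decoding to base64_to_image from app.core.data.repo.utils, which is not shown in this commit. A plausible Pillow-based sketch of such a helper follows; this is an assumption about its behavior (decode the base64 payload and return a PIL image that supports .save(..., format="PNG")), not the repository's actual implementation.

# Hypothetical sketch of base64_to_image, assuming Pillow; the real helper in
# app.core.data.repo.utils is not part of this diff and may differ.
import base64
from io import BytesIO

from PIL import Image


def base64_to_image(b64_data: str) -> Image.Image:
    # Decode the base64 payload into bytes and open it as a PIL image.
    image_bytes = base64.b64decode(b64_data)
    return Image.open(BytesIO(image_bytes))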

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_raw_text_docs.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_text_docs.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+from app.core.data.repo.repo_service import RepoService
+from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
+from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+
+repo = RepoService()
+
+
+def extract_content_in_html_from_text_docs(cargo: PipelineCargo) -> PipelineCargo:
+    pptd: PreProTextDoc = cargo.data["pptd"]
+
+    if pptd.mime_type not in ["text/plain"]:
+        return cargo
+
+    content = pptd.filepath.read_text(encoding="utf-8")
+    pptd.html = f"<html><body><p>{content}</p></body></html>"
+
+    return cargo
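
For a quick sense of what this step produces, the toy snippet below mirrors its text/plain wrapping on a throwaway file. The FakePPTD dataclass is invented for this sketch; the real PreProTextDoc model in app.preprocessing.pipeline.model has more fields and is not reproduced here.

# Toy demonstration of the text/plain wrapping behavior above, using an
# invented stand-in for PreProTextDoc.
from dataclasses import dataclass
from pathlib import Path


@dataclass
class FakePPTD:
    filepath: Path
    mime_type: str
    html: str = ""


tmp = Path("example.txt")
tmp.write_text("Hello, pipeline!", encoding="utf-8")
pptd = FakePPTD(filepath=tmp, mime_type="text/plain")

# Same wrapping as in the step: read the raw text and embed it in a minimal HTML skeleton.
content = pptd.filepath.read_text(encoding="utf-8")
pptd.html = f"<html><body><p>{content}</p></body></html>"
print(pptd.html)  # -> <html><body><p>Hello, pipeline!</p></body></html>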
