|
| 1 | +import uuid |
| 2 | +from pathlib import Path |
| 3 | + |
| 4 | +from app.core.data.repo.repo_service import RepoService |
| 5 | +from app.core.data.repo.utils import base64_to_image |
| 6 | +from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo |
| 7 | +from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc |
| 8 | +from bs4 import BeautifulSoup, Tag |
| 9 | +from loguru import logger |
| 10 | + |
# Module-level RepoService instance, created when this module is imported; none of the code visible here references it.
| 11 | +repo = RepoService() |
| 12 | + |
| 13 | + |
def __extract_base64_images_from_html_docs(
    filepath: Path, content: str
) -> tuple[str, list[Path]]:
    """
    Extracts base64-embedded images from an HTML document into separate PNG files.

    Every ``<img>`` tag whose ``src`` is a base64 ``data:image`` URI is decoded and
    stored as ``<uuid4>.png`` in the same directory as the HTML file, and its ``src``
    is rewritten to that filename. Tags whose image cannot be decoded or saved are
    removed from the HTML, so the returned markup never points at a missing file.

    Args:
        filepath (Path): The path to the HTML file; images are saved to its parent directory.
        content (str): The raw HTML content of the document.

    Returns:
        tuple: A tuple containing the modified HTML content and a list of the
            image paths that were actually saved.
    """
    # Parse the HTML content
    soup = BeautifulSoup(content, "html.parser")

    # Store all extracted images in the same directory as the HTML
    output_path = filepath.parent
    extracted_images: list[Path] = []

    # find_all() returns a materialised result list, so removing a tag from the
    # tree while looping over it is safe.
    for img_tag in soup.find_all("img"):
        if not isinstance(img_tag, Tag):
            continue
        src = img_tag.get("src", "")
        if not isinstance(src, str):
            continue
        if not (src.startswith("data:image") and "base64," in src):
            continue

        base64_data = src.split("base64,")[1]
        img_fn = f"{uuid.uuid4()}.png"
        img_path = output_path / img_fn

        try:
            img = base64_to_image(base64_data)
        except Exception as e:
            logger.error(
                f"Error decoding base64 image {img_fn} from {filepath.name}: {e}"
            )
            # delete the image tag entirely from the HTML
            img_tag.decompose()
            continue

        # NOTE(review): presumably base64_to_image returns a PIL image, whose
        # save() raises OSError/ValueError when the image cannot be written in
        # the requested format -- confirm against the helper.
        try:
            img.save(img_path, format="PNG")
        except (OSError, ValueError) as e:
            logger.error(
                f"Error saving extracted image {img_fn} from {filepath.name}: {e}"
            )
            # Do not leave a partially written file behind, and drop the tag
            # just like an undecodable image so one bad image cannot abort the
            # whole document.
            img_path.unlink(missing_ok=True)
            img_tag.decompose()
            continue

        img_tag["src"] = img_fn  # Replace src with the filename
        extracted_images.append(img_path)
        logger.debug(f"Saved extracted image {img_path} from HTML {filepath.name}.")

    return str(soup), extracted_images
| 61 | + |
| 62 | + |
def extract_content_in_html_from_html_docs(cargo: PipelineCargo) -> PipelineCargo:
    """
    Pipeline step that fills in the HTML and extracted images of an HTML document.

    Reads the ``PreProTextDoc`` stored under ``cargo.data["pptd"]``. If its MIME
    type is ``text/html``, the file is read as UTF-8, embedded base64 images are
    extracted, and the results are stored on ``html`` and ``extracted_images``.
    Documents of any other MIME type are left untouched.

    Args:
        cargo (PipelineCargo): The pipeline cargo carrying the document under ``"pptd"``.

    Returns:
        PipelineCargo: The same cargo object that was passed in.
    """
    doc: PreProTextDoc = cargo.data["pptd"]

    is_html = doc.mime_type in ["text/html"]
    if is_html:
        raw_html = doc.filepath.read_text(encoding="utf-8")
        rewritten_html, image_paths = __extract_base64_images_from_html_docs(
            doc.filepath, raw_html
        )
        doc.html = rewritten_html
        doc.extracted_images = image_paths

    return cargo
0 commit comments