Skip to content

Commit c7c1ae9

Browse files
committed
long pdf docling model fix
1 parent d2edfce commit c7c1ae9

File tree

1 file changed

+45
-44
lines changed
  • backend/src/app/preprocessing/ray_model_worker/models

1 file changed

+45
-44
lines changed

backend/src/app/preprocessing/ray_model_worker/models/docling.py

Lines changed: 45 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -19,49 +19,6 @@
1919
logger = logging.getLogger("ray.serve")
2020

2121

22-
def __read_html_and_replace_absolute_image_paths(
23-
html_filename: Path, rel_to: Path
24-
) -> str:
25-
if (
26-
not html_filename.exists()
27-
or not html_filename.is_file()
28-
or not html_filename.suffix.lower() == ".html"
29-
):
30-
raise ValueError(f"Input file {html_filename} is not a valid HTML file.")
31-
if not rel_to.exists() or not rel_to.is_dir():
32-
raise ValueError(
33-
f"Relative path {rel_to} does not exist or is not a directory."
34-
)
35-
# load html and replace absolute image paths with relative ones
36-
html_content = html_filename.read_text(encoding="utf-8")
37-
soup = BeautifulSoup(html_content, "html.parser")
38-
for img in soup.find_all("img"):
39-
img_src = Path(img["src"]) # type: ignore
40-
if img_src.is_absolute():
41-
img["src"] = str(img_src.relative_to(rel_to)) # type: ignore
42-
html_content = str(soup)
43-
return html_content
44-
45-
46-
def __create_docling_pdf_conversion_output(
47-
html_filename: Path,
48-
out_dir: Path,
49-
) -> DoclingPDF2HTMLOutput:
50-
html_content = __read_html_and_replace_absolute_image_paths(
51-
html_filename,
52-
out_dir,
53-
)
54-
55-
base64_images = {}
56-
for img_path in out_dir.glob("**/*.png"):
57-
base64_images[img_path.name] = image_to_base64(img_path)
58-
59-
return DoclingPDF2HTMLOutput(
60-
html_content=html_content,
61-
base64_images=base64_images,
62-
)
63-
64-
6522
@serve.deployment(**build_ray_model_deployment_config("docling"))
6623
class DoclingModel:
6724
def __init__(
@@ -82,6 +39,50 @@ def __init__(
8239
doc_converter.initialize_pipeline(InputFormat.PDF)
8340
self.doc_converter = doc_converter
8441

42+
def __read_html_and_replace_absolute_image_paths(
43+
self,
44+
html_filename: Path,
45+
rel_to: Path,
46+
) -> str:
47+
if (
48+
not html_filename.exists()
49+
or not html_filename.is_file()
50+
or not html_filename.suffix.lower() == ".html"
51+
):
52+
raise ValueError(f"Input file {html_filename} is not a valid HTML file.")
53+
if not rel_to.exists() or not rel_to.is_dir():
54+
raise ValueError(
55+
f"Relative path {rel_to} does not exist or is not a directory."
56+
)
57+
# load html and replace absolute image paths with relative ones
58+
html_content = html_filename.read_text(encoding="utf-8")
59+
soup = BeautifulSoup(html_content, "html.parser")
60+
for img in soup.find_all("img"):
61+
img_src = Path(img["src"]) # type: ignore
62+
if img_src.is_absolute():
63+
img["src"] = str(img_src.relative_to(rel_to)) # type: ignore
64+
html_content = str(soup)
65+
return html_content
66+
67+
def __create_docling_pdf_conversion_output(
68+
self,
69+
html_filename: Path,
70+
out_dir: Path,
71+
) -> DoclingPDF2HTMLOutput:
72+
html_content = self.__read_html_and_replace_absolute_image_paths(
73+
html_filename,
74+
out_dir,
75+
)
76+
77+
base64_images = {}
78+
for img_path in out_dir.glob("**/*.png"):
79+
base64_images[img_path.name] = image_to_base64(img_path)
80+
81+
return DoclingPDF2HTMLOutput(
82+
html_content=html_content,
83+
base64_images=base64_images,
84+
)
85+
8586
def pdf2html(self, pdf_chunk: Path) -> DoclingPDF2HTMLOutput:
8687
# Here we assume that the pdf_chunk is a valid PDF file chunk
8788
if (
@@ -109,7 +110,7 @@ def pdf2html(self, pdf_chunk: Path) -> DoclingPDF2HTMLOutput:
109110
)
110111

111112
logger.info(f"Creating Docling PDF conversion output for {pdf_chunk} ...")
112-
conversion_output = __create_docling_pdf_conversion_output(
113+
conversion_output = self.__create_docling_pdf_conversion_output(
113114
html_filename=html_filename,
114115
out_dir=out_dir,
115116
)

0 commit comments

Comments
 (0)