1919logger = logging .getLogger ("ray.serve" )
2020
2121
def __read_html_and_replace_absolute_image_paths(
    html_filename: Path, rel_to: Path
) -> str:
    """Load an HTML file and make absolute ``<img src>`` paths relative.

    Args:
        html_filename: Path to an existing file with a ``.html`` suffix
            (case-insensitive).
        rel_to: Existing directory that absolute image paths are rewritten
            relative to.

    Returns:
        The HTML re-serialized by BeautifulSoup, with every absolute ``src``
        located under ``rel_to`` replaced by its path relative to ``rel_to``.

    Raises:
        ValueError: If ``html_filename`` is not an existing ``.html`` file or
            ``rel_to`` is not an existing directory.
    """
    # is_file()/is_dir() are already False for missing paths, so a separate
    # exists() check adds nothing.
    if not html_filename.is_file() or html_filename.suffix.lower() != ".html":
        raise ValueError(f"Input file {html_filename} is not a valid HTML file.")
    if not rel_to.is_dir():
        raise ValueError(
            f"Relative path {rel_to} does not exist or is not a directory."
        )

    soup = BeautifulSoup(html_filename.read_text(encoding="utf-8"), "html.parser")
    for img in soup.find_all("img"):
        # Tag.get avoids a KeyError on <img> tags that carry no src attribute.
        src = img.get("src")  # type: ignore
        if not src:
            continue
        img_src = Path(src)  # type: ignore
        if not img_src.is_absolute():
            continue
        try:
            img["src"] = str(img_src.relative_to(rel_to))  # type: ignore
        except ValueError:
            # The image lives outside rel_to: keep the absolute path instead
            # of aborting the whole conversion on one stray reference.
            logger.warning(
                "Image path %s is not located under %s; leaving it unchanged.",
                img_src,
                rel_to,
            )
    return str(soup)
44-
45-
def __create_docling_pdf_conversion_output(
    html_filename: Path,
    out_dir: Path,
) -> DoclingPDF2HTMLOutput:
    """Bundle the converted HTML and its PNG images into one output object.

    Args:
        html_filename: HTML file to load; its absolute image paths are
            rewritten relative to ``out_dir``.
        out_dir: Directory searched recursively for ``*.png`` files.

    Returns:
        A ``DoclingPDF2HTMLOutput`` holding the rewritten HTML and a mapping
        from PNG file name to the result of ``image_to_base64``.
    """
    # Keys are bare file names, so PNGs sharing a name in different
    # subdirectories overwrite one another (last one globbed wins).
    encoded_pngs = {
        png.name: image_to_base64(png) for png in out_dir.glob("**/*.png")
    }
    return DoclingPDF2HTMLOutput(
        html_content=__read_html_and_replace_absolute_image_paths(
            html_filename,
            out_dir,
        ),
        base64_images=encoded_pngs,
    )
63-
64-
6522@serve .deployment (** build_ray_model_deployment_config ("docling" ))
6623class DoclingModel :
6724 def __init__ (
@@ -82,6 +39,50 @@ def __init__(
8239 doc_converter .initialize_pipeline (InputFormat .PDF )
8340 self .doc_converter = doc_converter
8441
def __read_html_and_replace_absolute_image_paths(
    self,
    html_filename: Path,
    rel_to: Path,
) -> str:
    """Load an HTML file and make absolute ``<img src>`` paths relative.

    Args:
        html_filename: Path to an existing file with a ``.html`` suffix
            (case-insensitive).
        rel_to: Existing directory that absolute image paths are rewritten
            relative to.

    Returns:
        The HTML re-serialized by BeautifulSoup, with every absolute ``src``
        located under ``rel_to`` replaced by its path relative to ``rel_to``.

    Raises:
        ValueError: If ``html_filename`` is not an existing ``.html`` file or
            ``rel_to`` is not an existing directory.
    """
    # is_file()/is_dir() are already False for missing paths, so a separate
    # exists() check adds nothing.
    if not html_filename.is_file() or html_filename.suffix.lower() != ".html":
        raise ValueError(f"Input file {html_filename} is not a valid HTML file.")
    if not rel_to.is_dir():
        raise ValueError(
            f"Relative path {rel_to} does not exist or is not a directory."
        )

    soup = BeautifulSoup(html_filename.read_text(encoding="utf-8"), "html.parser")
    for img in soup.find_all("img"):
        # Tag.get avoids a KeyError on <img> tags that carry no src attribute.
        src = img.get("src")  # type: ignore
        if not src:
            continue
        img_src = Path(src)  # type: ignore
        if not img_src.is_absolute():
            continue
        try:
            img["src"] = str(img_src.relative_to(rel_to))  # type: ignore
        except ValueError:
            # The image lives outside rel_to: keep the absolute path instead
            # of aborting the whole conversion on one stray reference.
            logger.warning(
                "Image path %s is not located under %s; leaving it unchanged.",
                img_src,
                rel_to,
            )
    return str(soup)
66+
def __create_docling_pdf_conversion_output(
    self,
    html_filename: Path,
    out_dir: Path,
) -> DoclingPDF2HTMLOutput:
    """Bundle the converted HTML and its PNG images into one output object.

    Args:
        html_filename: HTML file to load; its absolute image paths are
            rewritten relative to ``out_dir``.
        out_dir: Directory searched recursively for ``*.png`` files.

    Returns:
        A ``DoclingPDF2HTMLOutput`` holding the rewritten HTML and a mapping
        from PNG file name to the result of ``image_to_base64``.
    """
    rewritten_html = self.__read_html_and_replace_absolute_image_paths(
        html_filename,
        out_dir,
    )
    # Keys are bare file names, so PNGs sharing a name in different
    # subdirectories overwrite one another (last one globbed wins).
    encoded_pngs = {
        png.name: image_to_base64(png) for png in out_dir.glob("**/*.png")
    }
    return DoclingPDF2HTMLOutput(
        html_content=rewritten_html,
        base64_images=encoded_pngs,
    )
85+
8586 def pdf2html (self , pdf_chunk : Path ) -> DoclingPDF2HTMLOutput :
8687 # Here we assume that the pdf_chunk is a valid PDF file chunk
8788 if (
@@ -109,7 +110,7 @@ def pdf2html(self, pdf_chunk: Path) -> DoclingPDF2HTMLOutput:
109110 )
110111
111112 logger .info (f"Creating Docling PDF conversion output for { pdf_chunk } ..." )
112- conversion_output = __create_docling_pdf_conversion_output (
113+ conversion_output = self . __create_docling_pdf_conversion_output (
113114 html_filename = html_filename ,
114115 out_dir = out_dir ,
115116 )
0 commit comments