44"""
55
66from abc import ABC , abstractmethod
7- from typing import Any
7+ from enum import Enum
8+ from pathlib import Path
9+ from typing import Any , Optional
810
11+ import typer
912from docling .backend .docling_parse_v2_backend import DoclingParseV2DocumentBackend
1013from docling .datamodel .base_models import InputFormat
1114from docling .datamodel .pipeline_options import (
2427from unstructured .documents .elements import Table
2528from unstructured .partition .pdf import partition_pdf
2629
30+ app = typer .Typer ()
31+
2732
2833class ToolBase (ABC ):
2934 """Abstract base class for PDF table extraction tools.
@@ -37,7 +42,7 @@ def extract_tables(self, pdf_file: str) -> Any:
3742 pass
3843
3944 @abstractmethod
40- def convert_to_html (self , tables : Any ) -> list [str ]:
45+ def convert_to_html (self , tables : Any ) -> Optional [str ]:
4146 """Convert the extracted table data to HTML format."""
4247 pass
4348
@@ -63,7 +68,7 @@ def extract_tables(self, pdf_file: str) -> list[Table]:
6368 tables .append (el )
6469 return tables
6570
66- def convert_to_html (self , tables : Table ) -> Any :
71+ def convert_to_html (self , tables : Table ) -> Optional [ str ] :
6772 """Convert extracted table data using Unstructured Tool to HTML format."""
6873 try :
6974 tables_html = tables .metadata .text_as_html
@@ -98,7 +103,7 @@ def extract_tables(self, pdf_file: str) -> list[CroppedTable]:
98103 """Extract tables from a PDF file using GMFT."""
99104 return self .ingest_pdf (pdf_file )
100105
101- def convert_to_html (self , tables : list [CroppedTable ]) -> Any :
106+ def convert_to_html (self , tables : list [CroppedTable ]) -> Optional [ str ] :
102107 """Convert extracted table data using GMFT Tool to HTML format."""
103108 ft = self .formatter .extract (tables )
104109 try :
@@ -123,7 +128,7 @@ def extract_tables(self, pdf_file: str) -> Any:
123128 )
124129 return extracted_tables [0 ]
125130
126- def convert_to_html (self , tables : Any ) -> Any :
131+ def convert_to_html (self , tables : Any ) -> Optional [ str ] :
127132 """Convert extracted table data using Img2Table Tool to HTML format."""
128133 try :
129134 tables_html = tables .html_repr ()
@@ -172,16 +177,55 @@ def convert_to_html(self, tables: Any) -> Any:
172177 return tables_html
173178
174179
175- def initialize_tools (tools : list [str ] = ["all" ]) -> dict :
class OCRMethod(str, Enum):
    """Names of the table extraction tools selectable from the command line.

    The ``str`` mixin makes each member compare equal to its plain string
    value (``OCRMethod.gmft == "gmft"``), so members can be used anywhere the
    tool name is expected as text.
    """

    unstructured = "unstructured"
    gmft = "gmft"
    img2table = "img2table"
    docling = "docling"

    def __str__(self) -> str:
        """Return the bare tool name (e.g. ``"gmft"``).

        Depending on the Python version, ``str()`` and f-strings render a
        ``str``-mixin enum either as ``OCRMethod.gmft`` or as ``gmft``.
        Overriding ``__str__`` pins the output to the value so messages that
        interpolate a member look the same everywhere.
        """
        return self.value
187+
188+
def initialize_tool(tool: OCRMethod) -> ToolBase:
    """Create the table extraction tool selected by ``tool``.

    Only the requested tool is instantiated; the other tool classes are
    referenced but never constructed.

    Args:
        tool: The extraction method to use.

    Returns:
        A new instance of the matching ``ToolBase`` subclass.

    Raises:
        ValueError: If ``tool`` does not correspond to a known tool.
    """
    # Map each method to its class (not an instance) so construction is
    # deferred until the selection is known.
    tool_classes = {
        OCRMethod.unstructured: UnstructuredTool,
        OCRMethod.gmft: GMFTTool,
        OCRMethod.img2table: Img2TableTool,
        OCRMethod.docling: DoclingTool,
    }

    try:
        tool_class = tool_classes[tool]
    except (KeyError, TypeError):
        # KeyError: unknown method; TypeError: unhashable argument.
        raise ValueError(f"Invalid tool: {tool}") from None

    return tool_class()
203+
204+
@app.command()
def main(method: OCRMethod, input_dir: Path, output_dir: Path) -> None:
    """Extract tables from every PDF in INPUT_DIR and write them as HTML to OUTPUT_DIR.

    One ``<pdf stem>.html`` file is written per PDF. A PDF whose conversion
    yields no HTML is reported on stdout and skipped.

    Raises:
        ValueError: If ``input_dir`` is not an existing directory.
    """
    # Check the input first so a bad path fails before the tool is constructed.
    if not input_dir.is_dir():
        raise ValueError(f"{input_dir} is not a directory")

    tool = initialize_tool(method)

    # parents=True lets nested output paths (e.g. out/run1) be created in one go.
    output_dir.mkdir(parents=True, exist_ok=True)

    # glob() yields files in arbitrary order; sort for reproducible runs and logs.
    for pdf_file in sorted(input_dir.glob("*.pdf")):
        tables = tool.extract_tables(str(pdf_file.absolute()))
        tables_html = tool.convert_to_html(tables)

        if tables_html is None:
            print(f"Error processing {pdf_file.stem}")
            continue

        # Explicit UTF-8 so non-ASCII table text is written identically on
        # every platform instead of depending on the locale's default encoding.
        html_path = output_dir / f"{pdf_file.stem}.html"
        html_path.write_text(tables_html, encoding="utf-8")
228+
186229
187- return {name : tool for name , tool in available_tools .items () if name in tools }
230+ if __name__ == "__main__" :
231+ app ()
0 commit comments