updated ocr tool

bigabig · bigabig · commit 5f7ab40b25b7 · 2025-02-05T16:38:25.000Z
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -49,3 +49,5 @@ repos:
             "--cache-location",
             "frontend/node_modules/.cache/prettier",
           ]
+
+exclude: ^tools/
diff --git a/tools/ocr/README.md b/tools/ocr/README.md
@@ -0,0 +1,24 @@
+# OCR
+
+A command-line tool that runs OCR-based table extraction on PDF documents and writes the results as HTML.
+
+The following methods are available:
+
+- docling
+- unstructured
+- gmft
+- img2table
+
+## Usage
+
+### 1. Install dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+### 2. Run the tool
+
+```bash
+python ocr.py docling /path/to/input/dir /path/to/output/dir
+```
diff --git a/tools/ocr/ocr.py b/tools/ocr/ocr.py
@@ -4,8 +4,11 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Any
+from enum import Enum
+from pathlib import Path
+from typing import Any, Optional
 
+import typer
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
@@ -24,6 +27,8 @@
 from unstructured.documents.elements import Table
 from unstructured.partition.pdf import partition_pdf
 
+app = typer.Typer()
+
 
 class ToolBase(ABC):
     """Abstract base class for PDF table extraction tools.
@@ -37,7 +42,7 @@ def extract_tables(self, pdf_file: str) -> Any:
         pass
 
     @abstractmethod
-    def convert_to_html(self, tables: Any) -> list[str]:
+    def convert_to_html(self, tables: Any) -> Optional[str]:
         """Convert the extracted table data to HTML format."""
         pass
 
@@ -63,7 +68,7 @@ def extract_tables(self, pdf_file: str) -> list[Table]:
                 tables.append(el)
         return tables
 
-    def convert_to_html(self, tables: Table) -> Any:
+    def convert_to_html(self, tables: Table) -> Optional[str]:
         """Convert extracted table data using Unstructured Tool to HTML format."""
         try:
             tables_html = tables.metadata.text_as_html
@@ -98,7 +103,7 @@ def extract_tables(self, pdf_file: str) -> list[CroppedTable]:
         """Extract tables from a PDF file using GMFT."""
         return self.ingest_pdf(pdf_file)
 
-    def convert_to_html(self, tables: list[CroppedTable]) -> Any:
+    def convert_to_html(self, tables: list[CroppedTable]) -> Optional[str]:
         """Convert extracted table data using GMFT Tool to HTML format."""
         ft = self.formatter.extract(tables)
         try:
@@ -123,7 +128,7 @@ def extract_tables(self, pdf_file: str) -> Any:
         )
         return extracted_tables[0]
 
-    def convert_to_html(self, tables: Any) -> Any:
+    def convert_to_html(self, tables: Any) -> Optional[str]:
         """Convert extracted table data using Img2Table Tool to HTML format."""
         try:
             tables_html = tables.html_repr()
@@ -172,16 +177,55 @@ def convert_to_html(self, tables: Any) -> Any:
         return tables_html
 
 
-def initialize_tools(tools: list[str] = ["all"]) -> dict:
+class OCRMethod(str, Enum):
+    """Enum for specifying the OCR method to use for table extraction."""
+
+    unstructured = "unstructured"
+    gmft = "gmft"
+    img2table = "img2table"
+    docling = "docling"
+
+
+def initialize_tool(tool: OCRMethod) -> ToolBase:
-    """Initialize and return selected table extraction tools. Default is all tools."""
+    """Initialize and return the table extraction tool for the given OCR method."""
-    available_tools = {
-        "unstructured": UnstructuredTool(),
-        "gmft": GMFTTool(),
-        "img2table": Img2TableTool(),
-        "docling": DoclingTool(),
-    }
 
-    if tools == ["all"]:
-        return available_tools
+    match tool:
+        case OCRMethod.unstructured:
+            return UnstructuredTool()
+        case OCRMethod.gmft:
+            return GMFTTool()
+        case OCRMethod.img2table:
+            return Img2TableTool()
+        case OCRMethod.docling:
+            return DoclingTool()
+        case _:
+            raise ValueError(f"Invalid tool: {tool}")
+
+
+@app.command()
+def main(method: OCRMethod, input_dir: Path, output_dir: Path):
+    tool = initialize_tool(method)
+
+    # ensure input dir is a directory
+    if not input_dir.is_dir():
+        raise ValueError(f"{input_dir} is not a directory")
+
+    # ensure output dir exists
+    output_dir.mkdir(exist_ok=True)
+
+    # iterate over all pdf files in input_dir
+    for pdf_file in input_dir.glob("*.pdf"):
+        tables = tool.extract_tables(str(pdf_file.absolute()))
+        tables_html = tool.convert_to_html(tables)
+
+        if tables_html is None:
+            print(f"Error processing {pdf_file.stem}")
+            continue
+
+        # write html to file
+        with open(output_dir / f"{pdf_file.stem}.html", "w") as f:
+            f.write(tables_html)
+
 
-    return {name: tool for name, tool in available_tools.items() if name in tools}
+if __name__ == "__main__":
+    app()
diff --git a/tools/ocr/requirements.txt b/tools/ocr/requirements.txt
@@ -1,4 +1,5 @@
 docling>=2.15.1
 gmft>=0.4.0
+gmft_pymupdf @ git+https://github.com/conjuncts/gmft_pymupdf.git
 img2table>=1.4.0
 unstructured[all-docs]>=0.14.10

Original file line number	Diff line number	Diff line change
`@@ -49,3 +49,5 @@ repos:`
`49`	`49`	`"--cache-location",`
`50`	`50`	`"frontend/node_modules/.cache/prettier",`
`51`	`51`	`]`
	`52`	`+`
	`53`	`+exclude: ^tools/`