Skip to content

Commit 5f7ab40

Browse files
committed
updated ocr tool
1 parent 5ce90fb commit 5f7ab40

File tree

4 files changed

+86
-15
lines changed

4 files changed

+86
-15
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,5 @@ repos:
4949
"--cache-location",
5050
"frontend/node_modules/.cache/prettier",
5151
]
52+
53+
exclude: ^tools/

tools/ocr/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# OCR
2+
3+
A command-line tool that uses OCR to extract tables from PDF documents and save them as HTML.
4+
5+
Available methods are:
6+
7+
- docling
8+
- unstructured
9+
- gmft
10+
- img2table
11+
12+
## Usage
13+
14+
### 1. Install dependencies
15+
16+
```
17+
pip install -r requirements.txt
18+
```
19+
20+
### 2. Execute tool
21+
22+
```
23+
python ocr.py docling /path/to/input/dir /path/to/output/dir
24+
```
Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
"""
55

66
from abc import ABC, abstractmethod
7-
from typing import Any
7+
from enum import Enum
8+
from pathlib import Path
9+
from typing import Any, Optional
810

11+
import typer
912
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
1013
from docling.datamodel.base_models import InputFormat
1114
from docling.datamodel.pipeline_options import (
@@ -24,6 +27,8 @@
2427
from unstructured.documents.elements import Table
2528
from unstructured.partition.pdf import partition_pdf
2629

30+
app = typer.Typer()
31+
2732

2833
class ToolBase(ABC):
2934
"""Abstract base class for PDF table extraction tools.
@@ -37,7 +42,7 @@ def extract_tables(self, pdf_file: str) -> Any:
3742
pass
3843

3944
@abstractmethod
40-
def convert_to_html(self, tables: Any) -> list[str]:
45+
def convert_to_html(self, tables: Any) -> Optional[str]:
4146
"""Convert the extracted table data to HTML format."""
4247
pass
4348

@@ -63,7 +68,7 @@ def extract_tables(self, pdf_file: str) -> list[Table]:
6368
tables.append(el)
6469
return tables
6570

66-
def convert_to_html(self, tables: Table) -> Any:
71+
def convert_to_html(self, tables: Table) -> Optional[str]:
6772
"""Convert extracted table data using Unstructured Tool to HTML format."""
6873
try:
6974
tables_html = tables.metadata.text_as_html
@@ -98,7 +103,7 @@ def extract_tables(self, pdf_file: str) -> list[CroppedTable]:
98103
"""Extract tables from a PDF file using GMFT."""
99104
return self.ingest_pdf(pdf_file)
100105

101-
def convert_to_html(self, tables: list[CroppedTable]) -> Any:
106+
def convert_to_html(self, tables: list[CroppedTable]) -> Optional[str]:
102107
"""Convert extracted table data using GMFT Tool to HTML format."""
103108
ft = self.formatter.extract(tables)
104109
try:
@@ -123,7 +128,7 @@ def extract_tables(self, pdf_file: str) -> Any:
123128
)
124129
return extracted_tables[0]
125130

126-
def convert_to_html(self, tables: Any) -> Any:
131+
def convert_to_html(self, tables: Any) -> Optional[str]:
127132
"""Convert extracted table data using Img2Table Tool to HTML format."""
128133
try:
129134
tables_html = tables.html_repr()
@@ -172,16 +177,55 @@ def convert_to_html(self, tables: Any) -> Any:
172177
return tables_html
173178

174179

175-
def initialize_tools(tools: list[str] = ["all"]) -> dict:
180+
class OCRMethod(str, Enum):
    """Names of the OCR methods available for table extraction.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (``OCRMethod.gmft == "gmft"``), and ``OCRMethod("gmft")`` looks a
    member up by that value.
    """

    unstructured = "unstructured"
    gmft = "gmft"
    img2table = "img2table"
    docling = "docling"

    def __str__(self) -> str:
        """Return the bare method name, e.g. ``"gmft"``."""
        # How a str-mixin Enum renders in f-strings / format() differs between
        # Python versions (bare value on older releases, "OCRMethod.gmft" on
        # newer ones). Defining __str__ explicitly keeps messages stable.
        return self.value
187+
188+
189+
def initialize_tool(tool: OCRMethod) -> ToolBase:
    """Create the table extraction tool for the requested OCR method.

    Args:
        tool: The OCR method to use. A plain string holding one of the
            ``OCRMethod`` values is accepted as well.

    Returns:
        A newly constructed instance of the tool class for ``tool``.

    Raises:
        ValueError: If ``tool`` does not name a supported OCR method.
    """
    # Map to classes, not instances, so only the selected tool is constructed.
    tool_classes: dict[OCRMethod, type[ToolBase]] = {
        OCRMethod.unstructured: UnstructuredTool,
        OCRMethod.gmft: GMFTTool,
        OCRMethod.img2table: Img2TableTool,
        OCRMethod.docling: DoclingTool,
    }

    # Normalise first so a raw string such as "gmft" resolves to its member;
    # the lookup is kept outside the constructor call so that errors raised
    # while building a tool are never mistaken for an invalid method name.
    try:
        method = OCRMethod(tool)
    except ValueError:
        raise ValueError(f"Invalid tool: {tool}") from None

    tool_class = tool_classes.get(method)
    if tool_class is None:
        # An enum member without a registered class (e.g. newly added).
        raise ValueError(f"Invalid tool: {tool}")
    return tool_class()
203+
204+
205+
@app.command()
def main(method: OCRMethod, input_dir: Path, output_dir: Path) -> None:
    """Extract tables from every PDF in INPUT_DIR and write them as HTML to OUTPUT_DIR.

    Each ``<name>.pdf`` directly inside ``input_dir`` is run through the
    selected OCR method and the result is saved as ``<name>.html`` in
    ``output_dir``. Files for which the tool returns no HTML are reported on
    stdout and skipped.

    Args:
        method: OCR method used for table extraction.
        input_dir: Directory containing the PDF files to process.
        output_dir: Directory receiving the HTML files; created if missing.

    Raises:
        ValueError: If ``input_dir`` is not an existing directory.
    """
    # Fail fast on a bad path before constructing the extraction tool.
    if not input_dir.is_dir():
        raise ValueError(f"{input_dir} is not a directory")

    tool = initialize_tool(method)

    # parents=True lets a nested output path be created in one go.
    output_dir.mkdir(parents=True, exist_ok=True)

    # glob() yields files in arbitrary order; sort for reproducible runs.
    for pdf_file in sorted(input_dir.glob("*.pdf")):
        tables = tool.extract_tables(str(pdf_file.absolute()))
        tables_html = tool.convert_to_html(tables)

        if tables_html is None:
            print(f"Error processing {pdf_file.stem}")
            continue

        # Write UTF-8 explicitly so non-ASCII table text does not depend on
        # the platform's default encoding.
        html_path = output_dir / f"{pdf_file.stem}.html"
        html_path.write_text(tables_html, encoding="utf-8")
228+
186229

187-
return {name: tool for name, tool in available_tools.items() if name in tools}
230+
if __name__ == "__main__":
231+
app()

tools/ocr/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
docling>=2.15.1
22
gmft>=0.4.0
3+
gmft_pymupdf @ git+https://github.com/conjuncts/gmft_pymupdf.git
34
img2table>=1.4.0
45
unstructured[all-docs]>=0.14.10

0 commit comments

Comments
 (0)