Skip to content

Commit b4bac37

Browse files
Add PDF table extraction tools and requirements (#485)
* Add PDF table extraction tools and requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent e4eb41d commit b4bac37

File tree

3 files changed

+191
-0
lines changed

3 files changed

+191
-0
lines changed

tools/ocr/__init__.py

Whitespace-only changes.

tools/ocr/pdf_tools.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
"""Module for PDF table extraction tools.
2+
3+
This module defines tools for extracting tables from PDFs and converting them to HTML.
4+
"""
5+
6+
from abc import ABC, abstractmethod
7+
from typing import Any
8+
9+
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
10+
from docling.datamodel.base_models import InputFormat
11+
from docling.datamodel.pipeline_options import (
12+
EasyOcrOptions,
13+
PdfPipelineOptions,
14+
TableFormerMode,
15+
)
16+
from docling.document_converter import DocumentConverter, PdfFormatOption
17+
from docling_core.types.doc.document import TableItem
18+
from gmft.auto import AutoFormatConfig, AutoTableFormatter
19+
from gmft.detectors.common import CroppedTable
20+
from gmft.detectors.tatr import TATRDetector
21+
from gmft_pymupdf import PyMuPDFDocument
22+
from img2table.document import PDF
23+
from img2table.ocr import TesseractOCR
24+
from unstructured.documents.elements import Table
25+
from unstructured.partition.pdf import partition_pdf
26+
27+
28+
class ToolBase(ABC):
    """Common interface shared by every PDF table extraction tool.

    Concrete tools subclass this and override both abstract methods below;
    the class cannot be instantiated until both are provided.
    """

    @abstractmethod
    def extract_tables(self, pdf_file: str) -> Any:
        """Pull the tables out of ``pdf_file``.

        Args:
            pdf_file: Path of the PDF to process.

        Returns:
            The extracted tables in a raw, tool-specific representation.
        """
        ...

    @abstractmethod
    def convert_to_html(self, tables: Any) -> list[str]:
        """Render extracted table data as HTML.

        Args:
            tables: Raw table data in the tool-specific representation.

        Returns:
            HTML markup for the supplied table data.

        NOTE(review): the subclasses in this module annotate their return as
        ``Any`` rather than ``list[str]`` -- confirm which contract is intended.
        """
        ...
43+
44+
45+
class UnstructuredTool(ToolBase):
    """Table extraction backed by the Unstructured library."""

    def __init__(self) -> None:
        """Create the tool; it holds no state or configuration."""

    def extract_tables(self, pdf_file: str) -> list[Table]:
        """Partition ``pdf_file`` and keep only its ``Table`` elements.

        The PDF is partitioned with the ``hi_res`` strategy, table structure
        inference enabled, English as the language and the ``yolox`` model.

        Args:
            pdf_file: Path of the PDF to process.

        Returns:
            Every partitioned element that is a ``Table``, in the order the
            partitioner produced them.
        """
        partitioned = partition_pdf(
            filename=pdf_file,
            infer_table_structure=True,
            strategy="hi_res",
            languages=["eng"],
            model_name="yolox",
        )
        return [element for element in partitioned if isinstance(element, Table)]

    def convert_to_html(self, tables: Table) -> Any:
        """Return the HTML stored in a table's metadata.

        Args:
            tables: A single ``Table`` element; its ``metadata.text_as_html``
                attribute is read directly.

        Returns:
            The value of ``metadata.text_as_html``, or ``None`` when reading
            it raises (the error is printed, not propagated).

        NOTE(review): ``extract_tables`` returns a list, so callers presumably
        invoke this once per table -- passing the whole list takes the error
        path and yields ``None``.
        """
        try:
            return tables.metadata.text_as_html
        except Exception as exc:
            print(f"Error processing table: {exc}")
            return None
74+
75+
76+
class GMFTTool(ToolBase):
    """Table extraction backed by the GMFT library."""

    def __init__(self) -> None:
        """Create the TATR detector and an auto formatter.

        The formatter config has semantic spanning cells and multi-header
        support switched on.
        """
        self.detector = TATRDetector()
        formatter_config = AutoFormatConfig()
        formatter_config.semantic_spanning_cells = True
        formatter_config.enable_multi_header = True
        self.formatter = AutoTableFormatter(formatter_config)

    def ingest_pdf(self, pdf_path: str) -> list[CroppedTable]:
        """Run the detector over every page of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path of the PDF to open with ``PyMuPDFDocument``.

        Returns:
            The detector's results for all pages, concatenated in page order.

        NOTE(review): the opened document is never closed here; presumably the
        returned tables still need it open for later formatting -- confirm
        before adding a ``close()``.
        """
        document = PyMuPDFDocument(pdf_path)
        return [
            cropped
            for page in document
            for cropped in self.detector.extract(page)
        ]

    def extract_tables(self, pdf_file: str) -> list[CroppedTable]:
        """Extract tables from ``pdf_file``; delegates to ``ingest_pdf``."""
        return self.ingest_pdf(pdf_file)

    def convert_to_html(self, tables: list[CroppedTable]) -> Any:
        """Format detected table data and render it as HTML.

        The argument is handed to the formatter's ``extract``; the result's
        dataframe has missing values replaced by empty strings before being
        rendered with ``to_html``.

        Args:
            tables: Detected table data to pass to the formatter.

        Returns:
            The HTML string, or ``None`` when building or rendering the
            dataframe raises (the error is printed, not propagated).

        NOTE(review): the formatter call sits outside the ``try``, so its
        exceptions propagate to the caller. Also confirm whether the formatter
        expects a single table rather than the list the annotation suggests.
        """
        formatted = self.formatter.extract(tables)
        try:
            return formatted.df().fillna("").to_html()
        except Exception as exc:
            print(f"Error processing table: {exc}")
            return None
110+
111+
112+
class Img2TableTool(ToolBase):
    """Table extraction backed by the Img2Table library."""

    def __init__(self) -> None:
        """Create a single-threaded, English Tesseract OCR engine."""
        self.ocr = TesseractOCR(n_threads=1, lang="eng")

    def extract_tables(self, pdf_file: str) -> Any:
        """Extract tables from ``pdf_file`` with Img2Table.

        The PDF is loaded without rotation detection and with PDF text
        extraction enabled. Extraction uses the tool's OCR engine, implicit
        rows, borderless-table detection and a minimum confidence of 50.

        Args:
            pdf_file: Path of the PDF to process.

        Returns:
            The entry stored under key/index ``0`` of the extraction result.

        NOTE(review): the result is presumably keyed by page index, in which
        case only the first page's tables are returned -- confirm this is
        intended for multi-page PDFs.
        """
        document = PDF(pdf_file, detect_rotation=False, pdf_text_extraction=True)
        tables_by_key = document.extract_tables(
            ocr=self.ocr,
            implicit_rows=True,
            borderless_tables=True,
            min_confidence=50,
        )
        return tables_by_key[0]

    def convert_to_html(self, tables: Any) -> Any:
        """Render extracted table data as HTML via its ``html_repr`` method.

        Args:
            tables: Object exposing ``html_repr()``.

        Returns:
            Whatever ``html_repr()`` returns, or ``None`` when the call raises
            (the error is printed, not propagated).
        """
        try:
            return tables.html_repr()
        except Exception as exc:
            print(f"Error processing table: {exc}")
            return None
134+
135+
136+
class DoclingTool(ToolBase):
    """Table extraction backed by the Docling library."""

    def __init__(self) -> None:
        """Configure the PDF pipeline and build the document converter.

        The pipeline has table-structure recognition enabled, uses EasyOCR
        options and sets the table-structure mode to
        ``TableFormerMode.ACCURATE``. The converter handles PDF input through
        the ``DoclingParseV2DocumentBackend`` backend.
        """
        options = PdfPipelineOptions(
            do_table_structure=True,
            ocr_options=EasyOcrOptions(),
        )
        options.table_structure_options.mode = TableFormerMode.ACCURATE
        self.pipeline_options = options

        pdf_format = PdfFormatOption(
            pipeline_options=self.pipeline_options,
            backend=DoclingParseV2DocumentBackend,
        )
        self.doc_converter = DocumentConverter(
            format_options={InputFormat.PDF: pdf_format}
        )

    def extract_tables(self, pdf_file: str) -> list[TableItem]:
        """Convert ``pdf_file`` with Docling and return its tables.

        Args:
            pdf_file: Path of the PDF to process.

        Returns:
            The ``tables`` attribute of the converted document.
        """
        return self.doc_converter.convert(pdf_file).document.tables

    def convert_to_html(self, tables: Any) -> Any:
        """Render Docling table data as HTML via ``export_to_html``.

        Args:
            tables: Object exposing ``export_to_html()``.

        Returns:
            Whatever ``export_to_html()`` returns, or an empty list when the
            call raises (the error is printed, not propagated).

        NOTE(review): ``extract_tables`` is annotated as returning a list, so
        callers presumably pass one item at a time -- a plain list has no
        ``export_to_html`` and would take the error path.
        """
        try:
            return tables.export_to_html()
        except Exception as exc:
            print(f"Error processing tables: {exc}")
            return []
173+
174+
175+
def initialize_tools(tools: "list[str] | None" = None) -> dict:
    """Instantiate and return the selected table extraction tools.

    Only the requested tools are constructed, so selecting a subset does not
    build the detector / OCR / converter objects of the other tools.

    Args:
        tools: Names of the tools to create. ``None`` (the default) or
            ``["all"]`` selects every available tool. Names that are not
            registered are ignored.

    Returns:
        A dict mapping each selected tool name to a newly created tool
        instance, ordered as in the registry below (not in request order).
    """
    # The annotation is a string so the ``|`` syntax is never evaluated at
    # runtime; ``None`` replaces the former mutable ``["all"]`` default.
    #
    # Classes, not instances, are registered: construction is deferred until
    # we know which tools were actually asked for.
    tool_classes = {
        "unstructured": UnstructuredTool,
        "gmft": GMFTTool,
        "img2table": Img2TableTool,
        "docling": DoclingTool,
    }

    if tools is None or tools == ["all"]:
        selected = tool_classes
    else:
        selected = {
            name: tool_cls for name, tool_cls in tool_classes.items() if name in tools
        }

    return {name: tool_cls() for name, tool_cls in selected.items()}

tools/ocr/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
docling>=2.15.1
2+
gmft>=0.4.0
3+
img2table>=1.4.0
4+
unstructured[all-docs]>=0.14.10

0 commit comments

Comments
 (0)