|
"""Module for PDF table extraction tools.

This module defines tools that extract tables from PDF files and convert them to HTML.
"""
| 5 | + |
| 6 | +from abc import ABC, abstractmethod |
| 7 | +from typing import Any |
| 8 | + |
| 9 | +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend |
| 10 | +from docling.datamodel.base_models import InputFormat |
| 11 | +from docling.datamodel.pipeline_options import ( |
| 12 | + EasyOcrOptions, |
| 13 | + PdfPipelineOptions, |
| 14 | + TableFormerMode, |
| 15 | +) |
| 16 | +from docling.document_converter import DocumentConverter, PdfFormatOption |
| 17 | +from docling_core.types.doc.document import TableItem |
| 18 | +from gmft.auto import AutoFormatConfig, AutoTableFormatter |
| 19 | +from gmft.detectors.common import CroppedTable |
| 20 | +from gmft.detectors.tatr import TATRDetector |
| 21 | +from gmft_pymupdf import PyMuPDFDocument |
| 22 | +from img2table.document import PDF |
| 23 | +from img2table.ocr import TesseractOCR |
| 24 | +from unstructured.documents.elements import Table |
| 25 | +from unstructured.partition.pdf import partition_pdf |
| 26 | + |
| 27 | + |
class ToolBase(ABC):
    """Common interface shared by every PDF table extraction tool.

    A concrete tool must implement both methods below; until it does, the
    class cannot be instantiated.
    """

    @abstractmethod
    def extract_tables(self, pdf_file: str) -> Any:
        """Extract tables from a PDF file in a raw, tool-specific format.

        Args:
            pdf_file: Path of the PDF document to process.

        Returns:
            The table data in whatever representation the tool produces.
        """

    @abstractmethod
    def convert_to_html(self, tables: Any) -> list[str]:
        """Convert extracted table data to HTML.

        Args:
            tables: Table data obtained from ``extract_tables``.

        Returns:
            The HTML rendering of ``tables``.
        """
        # NOTE(review): the tools defined in this module return a single HTML
        # string (or a fallback value on failure) rather than ``list[str]``;
        # confirm which contract callers expect before tightening this hint.
| 43 | + |
| 44 | + |
class UnstructuredTool(ToolBase):
    """Tool for extracting tables from PDFs using the Unstructured library."""

    def extract_tables(self, pdf_file: str) -> list[Table]:
        """Partition a PDF and return only its table elements.

        Args:
            pdf_file: Path of the PDF document to process.

        Returns:
            Every element produced by ``partition_pdf`` that is a ``Table``,
            in the order the partitioner yielded them.
        """
        elements = partition_pdf(
            filename=pdf_file,
            infer_table_structure=True,
            strategy="hi_res",
            languages=["eng"],
            model_name="yolox",
        )
        return [el for el in elements if isinstance(el, Table)]

    def convert_to_html(self, tables: Table) -> Any:
        """Return the HTML stored in a single table element's metadata.

        Args:
            tables: One ``Table`` element as returned by ``extract_tables``.

        Returns:
            The value of ``tables.metadata.text_as_html``, or ``None`` when
            the element does not expose that attribute chain.
        """
        try:
            return tables.metadata.text_as_html
        except AttributeError as e:
            # Only attribute access happens above, so a missing attribute is
            # the one failure worth absorbing; report it and carry on.
            print(f"Error processing table: {e}")
            return None
| 74 | + |
| 75 | + |
class GMFTTool(ToolBase):
    """Tool for extracting tables using the GMFT library."""

    def __init__(self) -> None:
        """Create the table detector and a formatter with custom settings."""
        self.detector = TATRDetector()
        config = AutoFormatConfig()
        config.semantic_spanning_cells = True
        config.enable_multi_header = True
        self.formatter = AutoTableFormatter(config)

    def ingest_pdf(self, pdf_path: str) -> list[CroppedTable]:
        """Open the PDF and run the detector over each page.

        Args:
            pdf_path: Path of the PDF document to process.

        Returns:
            The tables detected on all pages, in page order.
        """
        doc = PyMuPDFDocument(pdf_path)
        # NOTE(review): the document is deliberately not closed here;
        # presumably the cropped tables still need their page when they are
        # formatted later. Confirm before adding a close().
        tables: list[CroppedTable] = []
        for page in doc:
            tables.extend(self.detector.extract(page))
        return tables

    def extract_tables(self, pdf_file: str) -> list[CroppedTable]:
        """Extract tables from a PDF file using GMFT.

        Args:
            pdf_file: Path of the PDF document to process.

        Returns:
            The cropped tables found by ``ingest_pdf``.
        """
        return self.ingest_pdf(pdf_file)

    def convert_to_html(self, tables: CroppedTable) -> Any:
        """Format one detected table and render it as HTML.

        Args:
            tables: A single cropped table from ``extract_tables``; it is
                handed directly to the formatter's ``extract`` method.

        Returns:
            The HTML of the formatted table's dataframe with missing cells
            replaced by empty strings, or ``None`` if formatting or rendering
            fails.
        """
        try:
            # Formatting runs inside the guard as well, so a failure in the
            # formatter is handled exactly like a failure while rendering.
            ft = self.formatter.extract(tables)
            return ft.df().fillna("").to_html()
        except Exception as e:
            # Best-effort by design: report the problem and let the caller
            # move on to the next table.
            print(f"Error processing table: {e}")
            return None
| 110 | + |
| 111 | + |
class Img2TableTool(ToolBase):
    """Tool for extracting tables from PDFs using the Img2Table library."""

    def __init__(self) -> None:
        """Create the Tesseract OCR engine (one thread, English)."""
        self.ocr = TesseractOCR(n_threads=1, lang="eng")

    def extract_tables(self, pdf_file: str) -> Any:
        """Extract the tables of every page of a PDF file.

        Args:
            pdf_file: Path of the PDF document to process.

        Returns:
            A flat list with the tables of all pages, in page order. For a
            single-page PDF this is the list of that page's tables.
        """
        pdf = PDF(pdf_file, detect_rotation=False, pdf_text_extraction=True)
        tables_by_page = pdf.extract_tables(
            ocr=self.ocr, implicit_rows=True, borderless_tables=True, min_confidence=50
        )
        # The library returns a mapping of page index -> tables; taking only
        # key 0 would silently drop every table after the first page.
        return [
            table for page_tables in tables_by_page.values() for table in page_tables
        ]

    def convert_to_html(self, tables: Any) -> Any:
        """Render one extracted table as HTML.

        Args:
            tables: A table object exposing ``html_repr()``.

        Returns:
            The result of ``tables.html_repr()``, or ``None`` if the call
            fails.
        """
        try:
            return tables.html_repr()
        except Exception as e:
            # Best-effort by design: report the problem and let the caller
            # move on to the next table.
            print(f"Error processing table: {e}")
            return None
| 134 | + |
| 135 | + |
class DoclingTool(ToolBase):
    """Tool for extracting tables from PDFs using the Docling library."""

    def __init__(self) -> None:
        """Build the PDF pipeline options and the document converter."""
        options = PdfPipelineOptions(
            do_table_structure=True, ocr_options=EasyOcrOptions()
        )
        options.table_structure_options.mode = TableFormerMode.ACCURATE
        self.pipeline_options = options

        pdf_format = PdfFormatOption(
            pipeline_options=options,
            backend=DoclingParseV2DocumentBackend,
        )
        self.doc_converter = DocumentConverter(
            format_options={InputFormat.PDF: pdf_format}
        )

    def extract_tables(self, pdf_file: str) -> list[TableItem]:
        """Convert the given PDF with Docling and return its tables.

        Args:
            pdf_file: Path of the PDF document to process.

        Returns:
            The ``tables`` collection of the converted document.
        """
        conversion = self.doc_converter.convert(pdf_file)
        return conversion.document.tables

    def convert_to_html(self, tables: Any) -> Any:
        """Export a Docling table object to HTML.

        Args:
            tables: An object exposing ``export_to_html()``.

        Returns:
            The result of ``tables.export_to_html()``, or an empty list if
            the export fails.
        """
        # NOTE(review): the failure value here is ``[]`` whereas the other
        # tools in this module return ``None``; confirm which one callers
        # expect before unifying.
        try:
            return tables.export_to_html()
        except Exception as err:
            print(f"Error processing tables: {err}")
            return []
| 173 | + |
| 174 | + |
def initialize_tools(tools: "list[str] | None" = None) -> "dict[str, ToolBase]":
    """Initialize and return the selected table extraction tools.

    Only the requested tools are constructed, so a backend that is not
    selected is never instantiated.

    Args:
        tools: Names of the tools to create, chosen from ``"unstructured"``,
            ``"gmft"``, ``"img2table"`` and ``"docling"``. ``None`` (the
            default) or ``["all"]`` selects every tool. Unknown names are
            ignored.

    Returns:
        A dict mapping each selected tool name to a freshly created tool
        instance, in the order listed above.
    """
    # Map names to classes rather than instances so construction is deferred
    # until we know which tools were actually requested.
    registry = {
        "unstructured": UnstructuredTool,
        "gmft": GMFTTool,
        "img2table": Img2TableTool,
        "docling": DoclingTool,
    }

    if tools is None or tools == ["all"]:
        selected = registry
    else:
        selected = {name: cls for name, cls in registry.items() if name in tools}

    return {name: cls() for name, cls in selected.items()}
0 commit comments