diff --git a/src/openparse/__init__.py b/src/openparse/__init__.py index fa61a23..fc91420 100644 --- a/src/openparse/__init__.py +++ b/src/openparse/__init__.py @@ -1,9 +1,9 @@ -from openparse.pdf import Pdf +from openparse import processing, version +from openparse.config import config from openparse.doc_parser import ( DocumentParser, ) -from openparse import processing, version -from openparse.config import config +from openparse.pdf import Pdf from openparse.schemas import ( Bbox, LineElement, diff --git a/src/openparse/_types.py b/src/openparse/_types.py index 2b61bd9..e226aa3 100644 --- a/src/openparse/_types.py +++ b/src/openparse/_types.py @@ -1,6 +1,6 @@ -from typing import Union, Any, TypeVar -from typing_extensions import Literal, override +from typing import TypeVar, Union +from typing_extensions import Literal, override _T = TypeVar("_T") diff --git a/src/openparse/cli.py b/src/openparse/cli.py index 1521c1b..8a88755 100644 --- a/src/openparse/cli.py +++ b/src/openparse/cli.py @@ -1,7 +1,7 @@ +import argparse import os from pathlib import Path from urllib.request import urlretrieve -import argparse def download_weights(weight_url, destination): diff --git a/src/openparse/config.py b/src/openparse/config.py index 3c3b25a..7978d1a 100644 --- a/src/openparse/config.py +++ b/src/openparse/config.py @@ -1,6 +1,5 @@ from typing import Literal - TorchDevice = Literal["cuda", "cpu", "mps"] diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py index 390f549..4b9aa76 100644 --- a/src/openparse/doc_parser.py +++ b/src/openparse/doc_parser.py @@ -1,16 +1,15 @@ from pathlib import Path -from typing import List, Literal, Optional, TypedDict, Union, TypeVar +from typing import List, Literal, TypedDict, TypeVar, Union -from openparse import tables, text, consts -from openparse.pdf import Pdf +from openparse import consts, tables, text from openparse._types import NOT_GIVEN, NotGiven +from openparse.pdf import Pdf from openparse.processing import ( - IngestionPipeline, BasicIngestionPipeline, + IngestionPipeline, NoOpIngestionPipeline, ) -from openparse.schemas import Node, TableElement, TextElement, ParsedDocument - +from openparse.schemas import Node, ParsedDocument, TableElement, TextElement IngestionPipelineType = TypeVar("IngestionPipelineType", bound=IngestionPipeline) diff --git a/src/openparse/pdf.py b/src/openparse/pdf.py index 04d6b5c..5f4fd22 100644 --- a/src/openparse/pdf.py +++ b/src/openparse/pdf.py @@ -1,13 +1,14 @@ -import os -import mimetypes import datetime as dt -import random import io +import mimetypes +import os +import random from pathlib import Path -from typing import Iterator, List, Literal, Optional, Union, Tuple, Any, Dict -from pydantic import BaseModel +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union + from pdfminer.high_level import extract_pages from pdfminer.layout import LTPage +from pydantic import BaseModel from pypdf import PdfReader, PdfWriter from openparse.schemas import Bbox, Node @@ -57,7 +58,7 @@ def _prepare_bboxes_for_drawing( ) ) - text = f"continued ..." + text = "continued ..." return res @@ -91,7 +92,7 @@ class Pdf: def __init__(self, file: Union[str, Path, PdfReader]): self.file_path = None - self.file_metadata = dict() + self.file_metadata = {} if isinstance(file, (str, Path)): self.file_path = str(file) self.file_metadata = file_metadata(file) @@ -111,8 +112,7 @@ def extract_layout_pages(self) -> Iterator[LTPage]: self.file_path is not None ), "PDF file path is required for this method for now." - for page_layout in extract_pages(self.file_path): - yield page_layout + yield from extract_pages(self.file_path) def save(self, output_pdf: Union[str, Path]) -> None: """ diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py index aabb0fe..8375714 100644 --- a/src/openparse/processing/__init__.py +++ b/src/openparse/processing/__init__.py @@ -1,19 +1,19 @@ -from .ingest import ( - IngestionPipeline, - BasicIngestionPipeline, - SemanticIngestionPipeline, - NoOpIngestionPipeline, -) from .basic_transforms import ( - ProcessingStep, - RemoveTextInsideTables, - RemoveFullPageStubs, - RemoveMetadataElements, - RemoveRepeatedElements, CombineBullets, CombineHeadingsWithClosestText, CombineNodesSpatially, + ProcessingStep, + RemoveFullPageStubs, + RemoveMetadataElements, RemoveNodesBelowNTokens, + RemoveRepeatedElements, + RemoveTextInsideTables, +) +from .ingest import ( + BasicIngestionPipeline, + IngestionPipeline, + NoOpIngestionPipeline, + SemanticIngestionPipeline, ) from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py index a055c0f..d78efb1 100644 --- a/src/openparse/processing/basic_transforms.py +++ b/src/openparse/processing/basic_transforms.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from collections import defaultdict -from typing import List, Literal, Dict +from typing import Dict, List, Literal from openparse.schemas import Bbox, Node, TextElement diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py index cef9414..a08f84b 100644 --- a/src/openparse/processing/ingest.py +++ b/src/openparse/processing/ingest.py @@ -1,24 +1,24 @@ -from typing import List, Optional from abc import ABC +from typing import List, Optional -from openparse.schemas import Node from openparse import consts from openparse.processing.basic_transforms import ( + CombineBullets, + CombineHeadingsWithClosestText, + CombineNodesSpatially, ProcessingStep, - RemoveTextInsideTables, RemoveFullPageStubs, - CombineNodesSpatially, - CombineHeadingsWithClosestText, - CombineBullets, RemoveMetadataElements, - RemoveRepeatedElements, RemoveNodesBelowNTokens, + RemoveRepeatedElements, + RemoveTextInsideTables, ) from openparse.processing.semantic_transforms import ( CombineNodesSemantically, - OpenAIEmbeddings, EmbeddingModel, + OpenAIEmbeddings, ) +from openparse.schemas import Node class IngestionPipeline(ABC): diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py index 2a0f28d..8369035 100644 --- a/src/openparse/processing/semantic_transforms.py +++ b/src/openparse/processing/semantic_transforms.py @@ -1,8 +1,9 @@ -from typing import List, Literal, Dict, Union +from typing import List, Literal, Union import numpy as np from openparse.schemas import Node + from .basic_transforms import ProcessingStep EmbeddingModel = Literal[ diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py index 0d0f45f..ba19868 100644 --- a/src/openparse/schemas.py +++ b/src/openparse/schemas.py @@ -1,12 +1,12 @@ +import datetime as dt import re +import uuid from collections import defaultdict, namedtuple from enum import Enum -import datetime as dt -import uuid from functools import cached_property -from typing import Any, List, Literal, Optional, Tuple, Union, Set +from typing import Any, List, Literal, Optional, Set, Tuple, Union -from pydantic import BaseModel, ConfigDict, computed_field, model_validator, Field +from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator from openparse import consts from openparse.utils import num_tokens @@ -386,7 +386,7 @@ def node_id(self) -> str: @computed_field # type: ignore @cached_property def variant(self) -> Set[Literal["text", "table"]]: - return set(e.variant.value for e in self.elements) + return {e.variant.value for e in self.elements} @computed_field # type: ignore @cached_property @@ -493,7 +493,7 @@ def is_large(self) -> bool: @cached_property def num_pages(self) -> int: - return len(set(element.bbox.page for element in self.elements)) + return len({element.bbox.page for element in self.elements}) @cached_property def start_page(self) -> int: diff --git a/src/openparse/tables/parse.py b/src/openparse/tables/parse.py index ccad154..54ffbbb 100644 --- a/src/openparse/tables/parse.py +++ b/src/openparse/tables/parse.py @@ -4,7 +4,7 @@ from openparse.pdf import Pdf from openparse.schemas import Bbox, TableElement -from openparse.tables.utils import crop_img_with_padding, adjust_bbox_with_padding +from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding from . import pymupdf @@ -102,7 +102,7 @@ def _ingest_with_table_transformers( raise ImportError( "Table detection and extraction requires the `torch`, `torchvision` and `transformers` libraries to be installed.", e, - ) + ) from e pdoc = doc.to_pymupdf_doc() # type: ignore pdf_as_imgs = doc_to_imgs(pdoc) @@ -163,6 +163,7 @@ def _ingest_with_unitable( ) -> List[TableElement]: try: from openparse.tables.utils import doc_to_imgs + from .table_transformers.ml import find_table_bboxes from .unitable.core import table_img_to_html @@ -170,7 +171,7 @@ def _ingest_with_unitable( raise ImportError( "Table detection and extraction requires the `torch`, `torchvision` and `transformers` libraries to be installed.", e, - ) + ) from e pdoc = doc.to_pymupdf_doc() # type: ignore pdf_as_imgs = doc_to_imgs(pdoc) diff --git a/src/openparse/tables/schemas.py b/src/openparse/tables/schemas.py index f7c5a68..a9618e6 100644 --- a/src/openparse/tables/schemas.py +++ b/src/openparse/tables/schemas.py @@ -1,7 +1,4 @@ -from typing import List, Literal, Optional, Sequence, Tuple, Union - -import fitz -from pydantic import BaseModel, model_validator +from typing import Tuple Size = Tuple[int, int] BBox = Tuple[float, float, float, float] diff --git a/src/openparse/tables/table_transformers/geometry.py b/src/openparse/tables/table_transformers/geometry.py index caa931c..83e4195 100644 --- a/src/openparse/tables/table_transformers/geometry.py +++ b/src/openparse/tables/table_transformers/geometry.py @@ -1,7 +1,5 @@ from __future__ import annotations -from .schemas import BBox, Size - def calc_bbox_intersection(bbox1, bbox2, safety_margin=5.0): if safety_margin < 0: diff --git a/src/openparse/tables/table_transformers/ml.py b/src/openparse/tables/table_transformers/ml.py index ad4ec48..e33c33a 100644 --- a/src/openparse/tables/table_transformers/ml.py +++ b/src/openparse/tables/table_transformers/ml.py @@ -8,29 +8,31 @@ from transformers import ( AutoModelForObjectDetection, # type: ignore TableTransformerForObjectDetection, # type: ignore -) # type: ignore +) +# type: ignore from openparse.config import config + from ..schemas import ( BBox, Size, ) from ..utils import ( - display_cells_on_img, - crop_img_with_padding, convert_croppped_cords_to_full_img_cords, convert_img_cords_to_pdf_cords, + crop_img_with_padding, + display_cells_on_img, ) from .geometry import ( calc_bbox_intersection, ) from .schemas import ( - _TableCellModelOutput, - _TableModelOutput, _Table, + _TableCellModelOutput, _TableDataCell, _TableHeader, _TableHeaderCell, + _TableModelOutput, _TableRow, ) diff --git a/src/openparse/tables/table_transformers/schemas.py b/src/openparse/tables/table_transformers/schemas.py index fecaf87..e66f71b 100644 --- a/src/openparse/tables/table_transformers/schemas.py +++ b/src/openparse/tables/table_transformers/schemas.py @@ -147,14 +147,14 @@ def _generate_row_str( " {} ".format(cell.content.ljust(width) if cell.content else " " * width) for cell, width in zip(cells, column_widths) ) - return "|{}|".format(row_content) + return f"|{row_content}|" def _generate_horizontal_border_str(self, column_widths: List[int]) -> str: """ Generates the horizontal border string based on the column widths. """ border = "+".join("-" * (width + 2) for width in column_widths) - return "+{}+".format(border) + return f"+{border}+" def sort(self) -> None: self.headers.sort( diff --git a/src/openparse/tables/unitable/config.py b/src/openparse/tables/unitable/config.py index fabb424..41ebbaa 100644 --- a/src/openparse/tables/unitable/config.py +++ b/src/openparse/tables/unitable/config.py @@ -1,6 +1,8 @@ -from pydantic import BaseModel -from pathlib import Path import sys +from pathlib import Path + +from pydantic import BaseModel + from openparse import consts root = Path(consts.__file__).parent diff --git a/src/openparse/tables/unitable/core.py b/src/openparse/tables/unitable/core.py index 4acdfa5..b6d530b 100644 --- a/src/openparse/tables/unitable/core.py +++ b/src/openparse/tables/unitable/core.py @@ -1,32 +1,33 @@ -from typing import Tuple, List, Sequence, Optional, Union import re +from typing import List, Optional, Sequence, Tuple -from PIL import Image # type: ignore import torch # type: ignore +from PIL import Image # type: ignore +from torch import Tensor # type: ignore from torchvision import transforms # type: ignore -from torch import nn, Tensor # type: ignore -from .tokens import VALID_HTML_TOKEN, VALID_BBOX_TOKEN, INVALID_CELL_TOKEN +from openparse.config import config + +from .tokens import INVALID_CELL_TOKEN, VALID_BBOX_TOKEN, VALID_HTML_TOKEN +from .unitable_model import ( + EncoderDecoder, + bbox_model, + bbox_vocab, + cell_model, + cell_vocab, + structure_model, + structure_vocab, +) from .utils import ( - subsequent_mask, - pred_token_within_range, - greedy_sampling, - html_str_to_token_list, bbox_str_to_token_list, - cell_str_to_token_list, # cell-content-detection build_table_from_html_and_cell, # cell-content-detection + cell_str_to_token_list, # cell-content-detection + greedy_sampling, + html_str_to_token_list, html_table_template, # cell-content-detection + pred_token_within_range, + subsequent_mask, ) -from .unitable_model import ( - structure_vocab, - structure_model, - bbox_vocab, - bbox_model, - cell_vocab, - cell_model, - EncoderDecoder, -) -from openparse.config import config Size = Tuple[int, int] BBox = Tuple[int, int, int, int] diff --git a/src/openparse/tables/unitable/schemas.py b/src/openparse/tables/unitable/schemas.py index 0898df0..62efd52 100644 --- a/src/openparse/tables/unitable/schemas.py +++ b/src/openparse/tables/unitable/schemas.py @@ -6,12 +6,9 @@ We could potentially drastically speed up inference if we use tesseract to extract the text from the table instead of unitable. """ -from typing import List, Tuple, Optional, Union -from pydantic import BaseModel, Field -from ..utils import ( - convert_croppped_cords_to_full_img_cords, - convert_img_cords_to_pdf_cords, -) +from typing import List, Optional, Tuple, Union + +from pydantic import BaseModel Size = Tuple[int, int] diff --git a/src/openparse/tables/unitable/tabular_transformer.py b/src/openparse/tables/unitable/tabular_transformer.py index 3299f2f..f2e0951 100644 --- a/src/openparse/tables/unitable/tabular_transformer.py +++ b/src/openparse/tables/unitable/tabular_transformer.py @@ -1,6 +1,7 @@ +from functools import partial + import torch from torch import Tensor, nn -from functools import partial class TokenEmbedding(nn.Module): diff --git a/src/openparse/tables/unitable/unitable_model.py b/src/openparse/tables/unitable/unitable_model.py index 676f637..b8837a9 100644 --- a/src/openparse/tables/unitable/unitable_model.py +++ b/src/openparse/tables/unitable/unitable_model.py @@ -1,20 +1,21 @@ -from typing import Tuple, List, Union, Optional, Sequence -from pathlib import Path -import torch # type: ignore -import tokenizers as tk # type: ignore import warnings +from functools import partial +from pathlib import Path +from typing import Tuple, Union +import tokenizers as tk # type: ignore +import torch # type: ignore from torch import nn -from functools import partial + +from openparse.config import config as global_config from .config import config from .tabular_transformer import ( + Decoder, + Encoder, EncoderDecoder, ImgLinearBackbone, - Encoder, - Decoder, ) -from openparse.config import config as global_config device = global_config.get_device() warnings.filterwarnings("ignore") diff --git a/src/openparse/tables/unitable/utils.py b/src/openparse/tables/unitable/utils.py index 18aacd7..1fe9cac 100644 --- a/src/openparse/tables/unitable/utils.py +++ b/src/openparse/tables/unitable/utils.py @@ -1,8 +1,9 @@ from typing import List, Optional, Tuple + import tokenizers as tk # type: ignore import torch -from torch import Tensor import torch.nn.functional as F +from torch import Tensor from .tokens import TASK_TOKENS @@ -113,7 +114,7 @@ def build_table_from_html_and_cell( structure: List[str], content: Optional[List[str]] = None ) -> List[str]: assert structure is not None - html_code = list() + html_code = [] if content is None: content_copy = ["placeholder"] * len(structure) diff --git a/src/openparse/tables/utils.py b/src/openparse/tables/utils.py index ce72bd4..780d4df 100644 --- a/src/openparse/tables/utils.py +++ b/src/openparse/tables/utils.py @@ -47,7 +47,7 @@ def crop_img_with_padding( return padded_image except Exception as e: - raise ValueError(f"Failed to crop the image: {e}") + raise ValueError(f"Failed to crop the image: {e}") from e def doc_to_imgs(doc) -> List[Image.Image]: diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py index 895bd4a..8da2d34 100644 --- a/src/openparse/text/pdfminer/core.py +++ b/src/openparse/text/pdfminer/core.py @@ -1,4 +1,4 @@ -from typing import Any, Iterable, List, Union, Tuple +from typing import Any, Iterable, List, Tuple, Union from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine from pydantic import BaseModel, model_validator diff --git a/src/openparse/text/pymupdf/core.py b/src/openparse/text/pymupdf/core.py index f71ef09..74abc0c 100644 --- a/src/openparse/text/pymupdf/core.py +++ b/src/openparse/text/pymupdf/core.py @@ -6,22 +6,22 @@ def flags_decomposer(flags: int) -> str: """Make font flags human readable.""" - l = [] + attrs = [] if flags & 2**0: - l.append("superscript") + attrs.append("superscript") if flags & 2**1: - l.append("italic") + attrs.append("italic") if flags & 2**2: - l.append("serifed") + attrs.append("serifed") else: - l.append("sans") + attrs.append("sans") if flags & 2**3: - l.append("monospaced") + attrs.append("monospaced") else: - l.append("proportional") + attrs.append("proportional") if flags & 2**4: - l.append("bold") - return ", ".join(l) + attrs.append("bold") + return ", ".join(attrs) def is_bold(flags) -> bool: