Skip to content

Commit

Permalink
ruff formatting
Browse files: browse the repository at this point in the history
  • Loading branch information
Filimoa committed Jun 13, 2024
1 parent 4e9936f commit 86a256b
Show file tree
Hide file tree
Showing 25 changed files with 114 additions and 114 deletions.
6 changes: 3 additions & 3 deletions src/openparse/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from openparse.pdf import Pdf
from openparse import processing, version
from openparse.config import config
from openparse.doc_parser import (
DocumentParser,
)
from openparse import processing, version
from openparse.config import config
from openparse.pdf import Pdf
from openparse.schemas import (
Bbox,
LineElement,
Expand Down
4 changes: 2 additions & 2 deletions src/openparse/_types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Union, Any, TypeVar
from typing_extensions import Literal, override
from typing import TypeVar, Union

from typing_extensions import Literal, override

_T = TypeVar("_T")

Expand Down
2 changes: 1 addition & 1 deletion src/openparse/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import os
from pathlib import Path
from urllib.request import urlretrieve
import argparse


def download_weights(weight_url, destination):
Expand Down
1 change: 0 additions & 1 deletion src/openparse/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import Literal


TorchDevice = Literal["cuda", "cpu", "mps"]


Expand Down
11 changes: 5 additions & 6 deletions src/openparse/doc_parser.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from pathlib import Path
from typing import List, Literal, Optional, TypedDict, Union, TypeVar
from typing import List, Literal, TypedDict, TypeVar, Union

from openparse import tables, text, consts
from openparse.pdf import Pdf
from openparse import consts, tables, text
from openparse._types import NOT_GIVEN, NotGiven
from openparse.pdf import Pdf
from openparse.processing import (
IngestionPipeline,
BasicIngestionPipeline,
IngestionPipeline,
NoOpIngestionPipeline,
)
from openparse.schemas import Node, TableElement, TextElement, ParsedDocument

from openparse.schemas import Node, ParsedDocument, TableElement, TextElement

IngestionPipelineType = TypeVar("IngestionPipelineType", bound=IngestionPipeline)

Expand Down
18 changes: 9 additions & 9 deletions src/openparse/pdf.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
import mimetypes
import datetime as dt
import random
import io
import mimetypes
import os
import random
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union, Tuple, Any, Dict
from pydantic import BaseModel
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage
from pydantic import BaseModel
from pypdf import PdfReader, PdfWriter

from openparse.schemas import Bbox, Node
Expand Down Expand Up @@ -57,7 +58,7 @@ def _prepare_bboxes_for_drawing(
)
)

text = f"continued ..."
text = "continued ..."
return res


Expand Down Expand Up @@ -91,7 +92,7 @@ class Pdf:

def __init__(self, file: Union[str, Path, PdfReader]):
self.file_path = None
self.file_metadata = dict()
self.file_metadata = {}
if isinstance(file, (str, Path)):
self.file_path = str(file)
self.file_metadata = file_metadata(file)
Expand All @@ -111,8 +112,7 @@ def extract_layout_pages(self) -> Iterator[LTPage]:
self.file_path is not None
), "PDF file path is required for this method for now."

for page_layout in extract_pages(self.file_path):
yield page_layout
yield from extract_pages(self.file_path)

def save(self, output_pdf: Union[str, Path]) -> None:
"""
Expand Down
22 changes: 11 additions & 11 deletions src/openparse/processing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from .ingest import (
IngestionPipeline,
BasicIngestionPipeline,
SemanticIngestionPipeline,
NoOpIngestionPipeline,
)
from .basic_transforms import (
ProcessingStep,
RemoveTextInsideTables,
RemoveFullPageStubs,
RemoveMetadataElements,
RemoveRepeatedElements,
CombineBullets,
CombineHeadingsWithClosestText,
CombineNodesSpatially,
ProcessingStep,
RemoveFullPageStubs,
RemoveMetadataElements,
RemoveNodesBelowNTokens,
RemoveRepeatedElements,
RemoveTextInsideTables,
)
from .ingest import (
BasicIngestionPipeline,
IngestionPipeline,
NoOpIngestionPipeline,
SemanticIngestionPipeline,
)
from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings

Expand Down
2 changes: 1 addition & 1 deletion src/openparse/processing/basic_transforms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import List, Literal, Dict
from typing import Dict, List, Literal

from openparse.schemas import Bbox, Node, TextElement

Expand Down
16 changes: 8 additions & 8 deletions src/openparse/processing/ingest.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
from typing import List, Optional
from abc import ABC
from typing import List, Optional

from openparse.schemas import Node
from openparse import consts
from openparse.processing.basic_transforms import (
CombineBullets,
CombineHeadingsWithClosestText,
CombineNodesSpatially,
ProcessingStep,
RemoveTextInsideTables,
RemoveFullPageStubs,
CombineNodesSpatially,
CombineHeadingsWithClosestText,
CombineBullets,
RemoveMetadataElements,
RemoveRepeatedElements,
RemoveNodesBelowNTokens,
RemoveRepeatedElements,
RemoveTextInsideTables,
)
from openparse.processing.semantic_transforms import (
CombineNodesSemantically,
OpenAIEmbeddings,
EmbeddingModel,
OpenAIEmbeddings,
)
from openparse.schemas import Node


class IngestionPipeline(ABC):
Expand Down
3 changes: 2 additions & 1 deletion src/openparse/processing/semantic_transforms.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import List, Literal, Dict, Union
from typing import List, Literal, Union

import numpy as np

from openparse.schemas import Node

from .basic_transforms import ProcessingStep

EmbeddingModel = Literal[
Expand Down
12 changes: 6 additions & 6 deletions src/openparse/schemas.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import datetime as dt
import re
import uuid
from collections import defaultdict, namedtuple
from enum import Enum
import datetime as dt
import uuid
from functools import cached_property
from typing import Any, List, Literal, Optional, Tuple, Union, Set
from typing import Any, List, Literal, Optional, Set, Tuple, Union

from pydantic import BaseModel, ConfigDict, computed_field, model_validator, Field
from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator

from openparse import consts
from openparse.utils import num_tokens
Expand Down Expand Up @@ -386,7 +386,7 @@ def node_id(self) -> str:
@computed_field # type: ignore
@cached_property
def variant(self) -> Set[Literal["text", "table"]]:
return set(e.variant.value for e in self.elements)
return {e.variant.value for e in self.elements}

@computed_field # type: ignore
@cached_property
Expand Down Expand Up @@ -493,7 +493,7 @@ def is_large(self) -> bool:

@cached_property
def num_pages(self) -> int:
return len(set(element.bbox.page for element in self.elements))
return len({element.bbox.page for element in self.elements})

@cached_property
def start_page(self) -> int:
Expand Down
7 changes: 4 additions & 3 deletions src/openparse/tables/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from openparse.pdf import Pdf
from openparse.schemas import Bbox, TableElement
from openparse.tables.utils import crop_img_with_padding, adjust_bbox_with_padding
from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding

from . import pymupdf

Expand Down Expand Up @@ -102,7 +102,7 @@ def _ingest_with_table_transformers(
raise ImportError(
"Table detection and extraction requires the `torch`, `torchvision` and `transformers` libraries to be installed.",
e,
)
) from e
pdoc = doc.to_pymupdf_doc() # type: ignore
pdf_as_imgs = doc_to_imgs(pdoc)

Expand Down Expand Up @@ -163,14 +163,15 @@ def _ingest_with_unitable(
) -> List[TableElement]:
try:
from openparse.tables.utils import doc_to_imgs

from .table_transformers.ml import find_table_bboxes
from .unitable.core import table_img_to_html

except ImportError as e:
raise ImportError(
"Table detection and extraction requires the `torch`, `torchvision` and `transformers` libraries to be installed.",
e,
)
) from e
pdoc = doc.to_pymupdf_doc() # type: ignore
pdf_as_imgs = doc_to_imgs(pdoc)

Expand Down
5 changes: 1 addition & 4 deletions src/openparse/tables/schemas.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from typing import List, Literal, Optional, Sequence, Tuple, Union

import fitz
from pydantic import BaseModel, model_validator
from typing import Tuple

Size = Tuple[int, int]
BBox = Tuple[float, float, float, float]
Expand Down
2 changes: 0 additions & 2 deletions src/openparse/tables/table_transformers/geometry.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from .schemas import BBox, Size


def calc_bbox_intersection(bbox1, bbox2, safety_margin=5.0):
if safety_margin < 0:
Expand Down
12 changes: 7 additions & 5 deletions src/openparse/tables/table_transformers/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,31 @@
from transformers import (
AutoModelForObjectDetection, # type: ignore
TableTransformerForObjectDetection, # type: ignore
) # type: ignore
)

# type: ignore
from openparse.config import config

from ..schemas import (
BBox,
Size,
)
from ..utils import (
display_cells_on_img,
crop_img_with_padding,
convert_croppped_cords_to_full_img_cords,
convert_img_cords_to_pdf_cords,
crop_img_with_padding,
display_cells_on_img,
)
from .geometry import (
calc_bbox_intersection,
)
from .schemas import (
_TableCellModelOutput,
_TableModelOutput,
_Table,
_TableCellModelOutput,
_TableDataCell,
_TableHeader,
_TableHeaderCell,
_TableModelOutput,
_TableRow,
)

Expand Down
4 changes: 2 additions & 2 deletions src/openparse/tables/table_transformers/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,14 @@ def _generate_row_str(
" {} ".format(cell.content.ljust(width) if cell.content else " " * width)
for cell, width in zip(cells, column_widths)
)
return "|{}|".format(row_content)
return f"|{row_content}|"

def _generate_horizontal_border_str(self, column_widths: List[int]) -> str:
"""
Generates the horizontal border string based on the column widths.
"""
border = "+".join("-" * (width + 2) for width in column_widths)
return "+{}+".format(border)
return f"+{border}+"

def sort(self) -> None:
self.headers.sort(
Expand Down
6 changes: 4 additions & 2 deletions src/openparse/tables/unitable/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from pydantic import BaseModel
from pathlib import Path
import sys
from pathlib import Path

from pydantic import BaseModel

from openparse import consts

root = Path(consts.__file__).parent
Expand Down
39 changes: 20 additions & 19 deletions src/openparse/tables/unitable/core.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
from typing import Tuple, List, Sequence, Optional, Union
import re
from typing import List, Optional, Sequence, Tuple

from PIL import Image # type: ignore
import torch # type: ignore
from PIL import Image # type: ignore
from torch import Tensor # type: ignore
from torchvision import transforms # type: ignore
from torch import nn, Tensor # type: ignore

from .tokens import VALID_HTML_TOKEN, VALID_BBOX_TOKEN, INVALID_CELL_TOKEN
from openparse.config import config

from .tokens import INVALID_CELL_TOKEN, VALID_BBOX_TOKEN, VALID_HTML_TOKEN
from .unitable_model import (
EncoderDecoder,
bbox_model,
bbox_vocab,
cell_model,
cell_vocab,
structure_model,
structure_vocab,
)
from .utils import (
subsequent_mask,
pred_token_within_range,
greedy_sampling,
html_str_to_token_list,
bbox_str_to_token_list,
cell_str_to_token_list, # cell-content-detection
build_table_from_html_and_cell, # cell-content-detection
cell_str_to_token_list, # cell-content-detection
greedy_sampling,
html_str_to_token_list,
html_table_template, # cell-content-detection
pred_token_within_range,
subsequent_mask,
)
from .unitable_model import (
structure_vocab,
structure_model,
bbox_vocab,
bbox_model,
cell_vocab,
cell_model,
EncoderDecoder,
)
from openparse.config import config

Size = Tuple[int, int]
BBox = Tuple[int, int, int, int]
Expand Down
Loading

0 comments on commit 86a256b

Please sign in to comment.