From 2ecdd5d2fee67197cb3eaee90aca90482db6d1cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Alt=C4=B1parmak?= Date: Fri, 29 Nov 2024 16:24:01 +0300 Subject: [PATCH] Update rapid-latex-ocr --- README.md | 2 +- pyproject.toml | 4 ++-- requirements.txt | 2 +- src/extraction_formats/extract_formula_formats.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4d22679..86d3bb3 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Run the service: - With GPU support: ``` -docker run --rm --name pdf-document-layout-analysis --gpus '"device=0"' -p 5060:5060 --entrypoint ./start.sh huridocs/pdf-document-layout-analysis:v0.0.19 +docker run --rm --name pdf-document-layout-analysis --gpus '"device=0"' -p 5060:5060 --entrypoint ./start.sh huridocs/pdf-document-layout-analysis:v0.0.20 ``` - Without GPU support: diff --git a/pyproject.toml b/pyproject.toml index 433031f..e48c8e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pdf-document-layout-analysis" -version = "2024.11.12.1" +version = "2024.11.29.1" description = "This tool is for PDF document layout analysis" license = { file = "LICENSE" } authors = [{ name = "HURIDOCS" }] @@ -29,7 +29,7 @@ dependencies = [ "roman==4.2", "hydra-core==1.3.2", "pypandoc==1.13", - "rapid-latex-ocr==0.0.8", + "rapid-latex-ocr==0.0.9", "struct_eqtable @ git+https://github.com/UniModal4Reasoning/StructEqTable-Deploy.git@fd06078bfa9364849eb39330c075dd63cbed73ff" ] diff --git a/requirements.txt b/requirements.txt index 5a3e2a8..14f7cac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,5 +21,5 @@ setuptools==75.4.0 roman==4.2 hydra-core==1.3.2 pypandoc==1.13 -rapid-latex-ocr==0.0.8 +rapid-latex-ocr==0.0.9 git+https://github.com/UniModal4Reasoning/StructEqTable-Deploy.git@fd06078bfa9364849eb39330c075dd63cbed73ff \ No newline at end of file diff --git a/src/extraction_formats/extract_formula_formats.py b/src/extraction_formats/extract_formula_formats.py index 004ff57..521aaed 100644 --- a/src/extraction_formats/extract_formula_formats.py +++ b/src/extraction_formats/extract_formula_formats.py @@ -1,6 +1,6 @@ import io from PIL.Image import Image -from rapid_latex_ocr import LatexOCR +from rapid_latex_ocr import LaTeXOCR from data_model.PdfImages import PdfImages from fast_trainer.PdfSegment import PdfSegment from pdf_token_type_labels.TokenType import TokenType @@ -10,7 +10,7 @@ def has_arabic(text: str) -> bool: return any("\u0600" <= char <= "\u06FF" or "\u0750" <= char <= "\u077F" for char in text) -def get_latex_format(model: LatexOCR, formula_image: Image): +def get_latex_format(model: LaTeXOCR, formula_image: Image): buffer = io.BytesIO() formula_image.save(buffer, format="jpeg") image_bytes = buffer.getvalue() @@ -25,7 +25,7 @@ def extract_formula_format(pdf_images: PdfImages, predicted_segments: list[PdfSe if not formula_segments: return - model = LatexOCR() + model = LaTeXOCR() for index, formula_segment in formula_segments: if has_arabic(formula_segment.text_content):