Updated requirements.

CogStack · Mar 23, 2023 · fb41569
1 parent f7911ec
commit fb41569
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 12 deletions.
diff --git a/config.py b/config.py
@@ -17,7 +17,7 @@
 # Tesseract model path
 TESSDATA_PREFIX = os.environ.get("TESSDATA_PREFIX", "/usr/local/share/tessdata")
 
-# Integer or Float - duration in seconds for the OCR processing, after which, pytesseract will terminate and raise RuntimeError
+# Integer or Float - duration in seconds for the OCR processing, after which, tesseract will terminate and raise RuntimeError
 TESSERACT_TIMEOUT = os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30)
 
 # Tesseract language code string. Defaults to eng if not specified! Example for multiple languages: lang='eng+fra'
@@ -26,7 +26,7 @@
 # Integer - modifies the processor priority for the Tesseract run. Not supported on Windows. Nice adjusts the niceness of unix-like processes.
 TESSERACT_NICE = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18))
 
-# Any additional custom configuration flags that are not available via the pytesseract function. For example: config='--psm 6'
+# Any additional custom configuration flags that are not available via the tesseract function. For example: config='--psm 6'
 TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
 
 # controls both threads and cpus

diff --git a/ocr_service/api/api.py b/ocr_service/api/api.py
@@ -42,7 +42,7 @@ def process() -> Response:
         log.info("Processing binary as data-binary, generating temporary file name...")
         file_name = uuid.uuid4().hex
         log.info("Generated file name:" + file_name)
-       
+
         stream = request.get_data(cache=False, as_text=False, parse_form_data=False)
 
     output_text, doc_metadata = processor.process_stream(stream=stream, file_name=file_name)

diff --git a/ocr_service/processor/processor.py b/ocr_service/processor/processor.py
@@ -109,6 +109,8 @@ def _preprocess_pdf_to_img(self, stream: bytes) -> List[PILImage]:
         """
 
         self.log.info("pre-processing pdf...")
+
+        pdf_image_pages = []
 
         try:
            pdf_image_pages, doc_metadata = self._pdf_to_img(stream)

diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1,14 @@
-unoserver==1.2
-pytesseract==0.3.10
+unoserver==1.3
 filetype==1.2.0
-opencv-python==4.7.0.68
+opencv-python==4.7.0.72
 Pillow==9.4.0
-html2image==2.0.1
-Flask==2.2.2
-Werkzeug==2.2.2
+html2image==2.0.3
+Flask==2.2.3
+Werkzeug==2.2.3
 injector==0.20.1
 gunicorn==20.1.0
-tesserocr==2.5.2
-pypdfium2==3.18.0
+tesserocr==2.6.0
+pypdfium2==3.21.1
 psutil==5.9.4
-uharfbuzz==0.33.0
+uharfbuzz==0.35.0
 pyxml2pdf==0.3.4

Original file line number | Diff line change
@@ Expand Up @@
             """
             self.log.info("pre-processing pdf...")
+            pdf_image_pages = []
             try:
                pdf_image_pages, doc_metadata = self._pdf_to_img(stream)
@@ Expand Down @@