-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconfig.py
133 lines (99 loc) · 6.45 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import multiprocessing
from sys import platform
# Version string of the OCR service.
OCR_SERVICE_VERSION = "0.2.3"

# Logging verbosity, read from OCR_SERVICE_LOG_LEVEL (defaults to 40 / ERROR).
# 50 - CRITICAL, 40 - ERROR, 30 - WARNING, 20 - INFO, 10 - DEBUG, 0 - NOTSET
LOG_LEVEL = int(os.environ.get("OCR_SERVICE_LOG_LEVEL", 40))

# Environment values are always strings, so a raw "False" or "0" would be truthy.
# Parse the flag explicitly so DEBUG_MODE is always a real bool.
# Truthy spellings (case-insensitive): "1", "true", "yes", "on"; anything else is False.
DEBUG_MODE = os.environ.get("OCR_SERVICE_DEBUG_MODE", "").strip().lower() in ("1", "true", "yes", "on")

# NOTE: these paths are resolved against the current working directory of the process
# at import time, not against the location of this file.
ROOT_DIR = os.path.abspath(os.curdir)
TMP_FILE_DIR = os.path.join(ROOT_DIR, "tmp")
WORKER_PORT_MAP_FILE_PATH = os.path.join(TMP_FILE_DIR, './worker_process_data.txt')
# Should PDFs actually be OCR-ed, or should only their embedded text be extracted?
# NOTE: OCR IS STILL APPLIED TO IMAGES if detected | possible values: "OCR", "NO_OCR"
OPERATION_MODE = os.environ.get("OCR_SERVICE_OPERATION_MODE", "OCR")

# basic app settings
# Cast to int so the port has the same type whether it comes from the
# environment (always a string) or from the default.
OCR_SERVICE_PORT = int(os.environ.get("OCR_SERVICE_PORT", 8090))

# Tesseract model path
TESSDATA_PREFIX = os.environ.get("OCR_TESSDATA_PREFIX", "/usr/local/share/tessdata")

# Integer - duration in seconds for the OCR processing, after which
# tesseract will terminate and raise RuntimeError.
# The value is parsed with int(), so fractional values are not accepted here.
TESSERACT_TIMEOUT = int(os.environ.get("OCR_SERVICE_TESSERACT_TIMEOUT", 30))

# Tesseract language code string. Example for multiple languages: 'eng+fra'.
# This config defaults to "eng+lat" when the variable is not set.
TESSERACT_LANGUAGE = os.environ.get("OCR_SERVICE_TESSERACT_LANG", "eng+lat")

# Integer - modifies the processor priority for the Tesseract run. Not supported on Windows.
# Nice adjusts the niceness of unix-like processes.
TESSERACT_NICE = int(os.environ.get("OCR_SERVICE_TESSERACT_NICE", -18))

# Any additional custom configuration flags that are not available via the tesseract function.
# For example: config='--psm 6'
TESSERACT_CUSTOM_CONFIG_FLAGS = os.environ.get("OCR_SERVICE_TESSERACT_CUSTOM_CONFIG_FLAGS", "")
# Controls both threads and cpus for one WEB SERVICE thread, basically, one request handler.
# This is derived normally from the amount of threads Gunicorn is running with, for example:
#   - if we have OCR_WEB_SERVICE_THREADS = 4, the OCR service can handle at most 4 requests at the same time,
#     this means that you can not use all of your CPUS for OCR-ing for 1 request,
#     because that means the other requests are sitting idle while the first one uses all resources,
#     and so it is recommended to regulate the number of threads per request
OCR_WEB_SERVICE_THREADS = int(os.environ.get("OCR_WEB_SERVICE_THREADS", 1))

# This controls the number of workers the ocr service may have. It is recommended to use this value
# instead of OCR_WEB_SERVICE_THREADS if you want to process multiple requests in parallel.
# WARNING: using more than 1 worker assumes you have set the OCR_WEB_SERVICE_WORKER_CLASS setting to sync !
# Setting OCR_WEB_SERVICE_WORKER_CLASS to sync means that a worker will use 1 THREAD only,
# therefore OCR_WEB_SERVICE_THREADS is disregarded.
OCR_WEB_SERVICE_WORKERS = int(os.environ.get("OCR_WEB_SERVICE_WORKERS", 1))

# Default share of CPUs for one worker. Floor division keeps the value an int, and the
# max(1, ...) guards make sure we never end up with 0 threads (more workers than CPUs)
# or a ZeroDivisionError (worker count configured as 0).
_CPUS_PER_WORKER = max(1, multiprocessing.cpu_count() // max(1, OCR_WEB_SERVICE_WORKERS))

# set this to control the number of threads used for OCR-ing per web request thread (check OCR_WEB_SERVICE_THREADS)
CPU_THREADS = int(os.environ.get("OCR_SERVICE_CPU_THREADS", _CPUS_PER_WORKER))

# conversion thread number for the pdf -> PIL img conversion, per web request thread (check OCR_WEB_SERVICE_THREADS)
CONVERTER_THREAD_NUM = int(os.environ.get("OCR_SERVICE_CONVERTER_THREADS", _CPUS_PER_WORKER))

# should we convert detected images to greyscale before OCR-ing
OCR_CONVERT_GRAYSCALE_IMAGES = True

# dpi used for images in TESSERACT and other stuff
OCR_IMAGE_DPI = int(os.environ.get("OCR_SERVICE_IMAGE_DPI", 200))
# LIBRE OFFICE SECTION

# seconds to wait before terminating LibreOffice processes (defaults to 20)
LIBRE_OFFICE_PROCESS_TIMEOUT = int(os.environ.get("OCR_SERVICE_LIBRE_OFFICE_PROCESS_TIMEOUT", 20))

# This is the port for the background soffice listener service that gets started with the app,
# used internally for LibreOffice doc conversion.
# The service should start multiple libre office servers for doc conversions;
# a libre office server will only use 1 CPU by default (not changeable), thus,
# for handling multiple requests, we will have one service per OCR_WEB_SERVICE_THREAD
DEFAULT_LIBRE_OFFICE_SERVER_PORT = 9900

# Exclusive upper bound of the default port range: one port per web-service thread,
# or one per worker when more than one worker is configured (the worker count wins).
LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + 1

if OCR_WEB_SERVICE_THREADS > 1:
    LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_THREADS
if OCR_WEB_SERVICE_WORKERS > 1:
    LIBRE_OFFICE_PORT_CAP = DEFAULT_LIBRE_OFFICE_SERVER_PORT + OCR_WEB_SERVICE_WORKERS

# The environment can only provide a string, while this setting is a range of ints,
# so the override is parsed instead of being used verbatim.
# Accepted formats: "START-END" (both ends inclusive, e.g. "9900-9903") or a single "PORT".
# A malformed value raises ValueError at import time.
_port_range_override = os.environ.get("OCR_SERVICE_LIBRE_OFFICE_LISTENER_PORT_RANGE", "").strip()
if _port_range_override:
    _first_port, _, _last_port = _port_range_override.partition("-")
    LIBRE_OFFICE_LISTENER_PORT_RANGE = range(int(_first_port), int(_last_port or _first_port) + 1)
else:
    LIBRE_OFFICE_LISTENER_PORT_RANGE = range(DEFAULT_LIBRE_OFFICE_SERVER_PORT, LIBRE_OFFICE_PORT_CAP)

LIBRE_OFFICE_NETWORK_INTERFACE = "localhost"

# seconds to check for possible failure of port
LIBRE_OFFICE_PROCESSES_LISTENER_INTERVAL = 10
# DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY
# Description: this sets the paths to the LibreOffice python binary,
#              it is required by default when using the unoserver package
#
# MacOS X: /Applications/LibreOffice.app/Contents/Resources/python
# Windows: C:/Windows/py.exe
# Linux(Ubuntu): /usr/bin/python3.11 (forcefully uses python3.11,
#                to point to the default python on your system just use /usr/bin/python3)
LIBRE_OFFICE_PYTHON_PATH = "/Applications/LibreOffice.app/Contents/Resources/python"

# DO NOT CHANGE THIS UNLESS YOU ARE DEVELOPING OR RUNNING THIS APP LOCALLY
# Description: this sets the path to the LibreOffice executable,
#              unoserver uses it to start a daemon in the background
#              that listens to any incoming conversion requests
#
# MacOS X: /Applications/LibreOffice.app/Contents/MacOS/soffice
# Windows: %ProgramFiles%/LibreOffice/Program/soffice
# Linux(Ubuntu): /usr/bin/soffice
LIBRE_OFFICE_EXEC_PATH = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

# The MacOS values above are the defaults; override them for the other platforms.
if platform.startswith("linux"):
    LIBRE_OFFICE_EXEC_PATH = "/usr/bin/soffice"
    LIBRE_OFFICE_PYTHON_PATH = "/usr/bin/python3.11"

    # this is the path from the Docker image, Ubuntu Lunar, Noble too.
    # NOTE(review): on Linux this replaces any TESSDATA_PREFIX assigned earlier in this file,
    # including a value supplied through OCR_TESSDATA_PREFIX - confirm that this is intended.
    TESSDATA_PREFIX = "/usr/share/tesseract-ocr/5/tessdata"

    # if not found, then set the path to tesseract 4 data, tested with Ubuntu 22.04 LTS on WSL 2
    if not os.path.exists(TESSDATA_PREFIX):
        TESSDATA_PREFIX = "/usr/share/tesseract-ocr/4.00/tessdata"
elif platform == "win32":
    # Expand %ProgramFiles% here: the placeholder is only substituted by a Windows shell,
    # so the raw string is not a usable path on its own. expandvars leaves the text
    # unchanged when the variable is not defined.
    LIBRE_OFFICE_EXEC_PATH = os.path.expandvars("%ProgramFiles%/LibreOffice/Program/soffice")
    LIBRE_OFFICE_PYTHON_PATH = "C:/Windows/py.exe"