Skip to content

Commit 9685937

Browse files
committed
add fold dashes flag
1 parent c9b1931 commit 9685937

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

prodigy_pdf/__init__.py

+15
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,17 @@ def page_to_cropped_image(pil_page, span, scale):
9898
return cropped, f"data:image/png;base64,{img_str.decode('utf-8')}"
9999

100100

101+
def fold_ocr_dashes(ocr_input):
    """Collapse multi-line OCR text into one line, folding line-end hyphens.

    The input is split on ``"\n"``. A line whose last non-whitespace
    character is ``-`` is treated as a word broken across lines: the hyphen
    (and any whitespace after it) is dropped and the next line is appended
    directly, so ``"exam-\nple"`` becomes ``"example "``. Every other line
    is kept unchanged and followed by a single space.

    Hyphens that are not at the end of a line (``"well-known"``,
    ``"a - b"``) are left untouched; only end-of-line hyphenation is folded.

    Args:
        ocr_input: Text to process, with lines separated by ``"\n"``.

    Returns:
        The folded text as a single string. Because each non-hyphenated
        line contributes a trailing space, the result normally ends with
        a space (an empty input yields ``" "``).
    """
    pieces = []
    for line in ocr_input.split("\n"):
        # Ignore trailing whitespace when deciding whether the line ends in
        # a hyphen, so "foo- " is folded the same way as "foo-".
        stripped = line.rstrip()
        if stripped.endswith("-"):
            # Drop only the final hyphen; no separator is added so the next
            # line continues the same word.
            pieces.append(stripped[:-1])
        else:
            pieces.append(line + " ")
    return "".join(pieces)
110+
111+
101112
@recipe(
102113
"pdf.ocr.correct",
103114
# fmt: off
@@ -106,6 +117,7 @@ def page_to_cropped_image(pil_page, span, scale):
106117
labels=("Labels to consider", "option", "l", str),
107118
scale=("Zoom for higher resolution for OCR algorithm", "option", "s", int),
108119
remove_base64=("Remove base64-encoded image data", "flag", "R", bool),
120+
fold_dashes=("Removes dashes at the end of a textline and folds them with the next term.", "flag", "f", bool),
109121
autofocus=("Autofocus on the transcript UI", "flag", "af", bool)
110122
# fmt: on
111123
)
@@ -115,6 +127,7 @@ def pdf_ocr_correct(
115127
labels: str,
116128
scale: int = 3,
117129
remove_base64:bool=False,
130+
fold_dashes:bool = False,
118131
autofocus: bool = False
119132
) -> ControllerComponentsDict:
120133
"""Applies OCR to annotated segments and gives a textbox for corrections."""
@@ -134,6 +147,8 @@ def new_stream(stream):
134147
cropped, img_str = page_to_cropped_image(pil_page, span=annot, scale=scale)
135148
annot["image"] = img_str
136149
annot["text"] = pytesseract.image_to_string(cropped)
150+
if fold_dashes:
151+
annot["text"] = fold_ocr_dashes(annot["text"])
137152
annot["transcription"] = annot["text"]
138153
text_input_fields = {
139154
"field_rows": 12,

0 commit comments

Comments
 (0)