@@ -98,6 +98,17 @@ def page_to_cropped_image(pil_page, span, scale):
98
98
return cropped , f"data:image/png;base64,{ img_str .decode ('utf-8' )} "
99
99
100
100
101
def fold_ocr_dashes(ocr_input: str) -> str:
    """Join OCR text lines, folding words hyphenated across line breaks.

    The input is split on newlines. A line whose last non-whitespace
    character is a dash is treated as a word broken across two lines: the
    dash (and any whitespace after it) is dropped and the next line is
    appended directly, so ``"exam-\\nple"`` becomes ``"example"``. Every
    other line is kept unchanged and followed by a single space.

    Dashes that are not at the end of a line (e.g. ``"well-known"``) are
    left untouched.

    Args:
        ocr_input: Raw multi-line text, e.g. as produced by an OCR engine.

    Returns:
        The text collapsed onto a single line. Each line that did not end
        in a dash contributes a trailing space, so the result normally ends
        with a space (an empty input yields ``" "``).
    """
    pieces = []
    for line in ocr_input.split("\n"):
        # Ignore trailing whitespace when looking for the hyphen: OCR output
        # frequently leaves stray blanks at the end of a line.
        stripped = line.rstrip()
        if stripped.endswith("-"):
            # Only a *trailing* dash marks a split word; drop it and glue the
            # next line on without a separating space.
            pieces.append(stripped[:-1])
        else:
            pieces.append(line + " ")
    return "".join(pieces)
110
+
111
+
101
112
@recipe (
102
113
"pdf.ocr.correct" ,
103
114
# fmt: off
@@ -106,6 +117,7 @@ def page_to_cropped_image(pil_page, span, scale):
106
117
labels = ("Labels to consider" , "option" , "l" , str ),
107
118
scale = ("Zoom for higher resolution for OCR algorithm" , "option" , "s" , int ),
108
119
remove_base64 = ("Remove base64-encoded image data" , "flag" , "R" , bool ),
120
+ fold_dashes = ("Removes dashes at the end of a textline and folds them with the next term." , "flag" , "f" , bool ),
109
121
autofocus = ("Autofocus on the transcript UI" , "flag" , "af" , bool )
110
122
# fmt: on
111
123
)
@@ -115,6 +127,7 @@ def pdf_ocr_correct(
115
127
labels : str ,
116
128
scale : int = 3 ,
117
129
remove_base64 :bool = False ,
130
+ fold_dashes :bool = False ,
118
131
autofocus : bool = False
119
132
) -> ControllerComponentsDict :
120
133
"""Applies OCR to annotated segments and gives a textbox for corrections."""
@@ -134,6 +147,8 @@ def new_stream(stream):
134
147
cropped , img_str = page_to_cropped_image (pil_page , span = annot , scale = scale )
135
148
annot ["image" ] = img_str
136
149
annot ["text" ] = pytesseract .image_to_string (cropped )
150
+ if fold_dashes :
151
+ annot ["text" ] = fold_ocr_dashes (annot ["text" ])
137
152
annot ["transcription" ] = annot ["text" ]
138
153
text_input_fields = {
139
154
"field_rows" : 12 ,
0 commit comments