diff --git a/ocr_service/processor/processor.py b/ocr_service/processor/processor.py index 87af516..c5cce09 100644 --- a/ocr_service/processor/processor.py +++ b/ocr_service/processor/processor.py @@ -53,7 +53,7 @@ def _preprocess_html_to_img(self, stream: bytes, file_name: str) -> List[PILImag Returns: List[PILImage]: _description_ - """ + """ hti = Html2Image(output_path=TMP_FILE_DIR, temp_path=TMP_FILE_DIR) html_file_path = os.path.join(TMP_FILE_DIR, file_name) png_img_file_path = html_file_path + ".png" @@ -94,7 +94,9 @@ def _pdf_to_text(self, stream: bytes) -> str: for page in pdf: textpage = page.get_textpage() output_text += textpage.get_text_range() - output_text += "\n" + + # this has caused issues before with the output text + # output_text += "\n" return output_text, doc_metadata