Removed extra newlines from the PDF-to-text output.

CogStack · Jan 29, 2025 · f607e64
1 parent b68013d
commit f607e64
Showing 1 changed file with 4 additions and 2 deletions.
diff --git a/ocr_service/processor/processor.py b/ocr_service/processor/processor.py
@@ -53,7 +53,7 @@ def _preprocess_html_to_img(self, stream: bytes, file_name: str) -> List[PILImag
 
         Returns:
             List[PILImage]: _description_
-        """   
+        """
         hti = Html2Image(output_path=TMP_FILE_DIR, temp_path=TMP_FILE_DIR)
         html_file_path = os.path.join(TMP_FILE_DIR, file_name)
         png_img_file_path = html_file_path + ".png"
@@ -94,7 +94,9 @@ def _pdf_to_text(self, stream: bytes) -> str:
             for page in pdf:
                 textpage = page.get_textpage()
                 output_text += textpage.get_text_range()
-                output_text += "\n"
+
+                # disabled: appending a newline after each page added extra newlines to the output text
+                # output_text += "\n"
 
         return output_text, doc_metadata