Skip to content

Commit

Permalink
Removed extra newlines from the PDF-to-text output.
Browse files: browse the repository at this point in the history
  • Loading branch information
vladd-bit committed Jan 29, 2025
1 parent b68013d commit f607e64
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions ocr_service/processor/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def _preprocess_html_to_img(self, stream: bytes, file_name: str) -> List[PILImag
Returns:
List[PILImage]: _description_
"""
"""
hti = Html2Image(output_path=TMP_FILE_DIR, temp_path=TMP_FILE_DIR)
html_file_path = os.path.join(TMP_FILE_DIR, file_name)
png_img_file_path = html_file_path + ".png"
Expand Down Expand Up @@ -94,7 +94,9 @@ def _pdf_to_text(self, stream: bytes) -> str:
for page in pdf:
textpage = page.get_textpage()
output_text += textpage.get_text_range()
output_text += "\n"

# this has caused issues before with the output text
# output_text += "\n"

return output_text, doc_metadata

Expand Down

0 comments on commit f607e64

Please sign in to comment.