Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add password with PDF files #3721

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.16.21-dev2

### Enhancements
- **Add a `password` parameter for PDF partitioning** so that password-protected PDFs can be loaded in all modes

- **use vectorized logic to merge inferred and extracted layouts**. Using the new `LayoutElements` data structure and numpy library to refactor the layout merging logic to improve compute performance as well as making logic more clear

Expand Down
Binary file added example-docs/pdf/password.pdf
Binary file not shown.
42 changes: 41 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def _test(result):
strategy=strategy,
starting_page_number=starting_page_number,
)
_test(result)
_test(result)


@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
Expand Down Expand Up @@ -1545,3 +1545,43 @@ def test_document_to_element_list_sets_category_depth_titles():
assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_with_password(
    file_mode,
    strategy,
    filename=example_doc_path("pdf/password.pdf"),
):
    """Check that `partition_pdf` reads a password-protected PDF when given `password`.

    Every listed strategy is combined with every way of handing the document to
    `partition_pdf`: by path ("filename"), as an open binary file ("rb"), and as a
    `SpooledTemporaryFile` ("spool").
    """

    def _test(result):
        # The test expects exactly one element carrying the document's text.
        assert len(result) == 1
        assert result[0].text == "File with password"

    if file_mode == "filename":
        result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
    elif file_mode == "rb":
        with open(filename, "rb") as f:
            result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
    else:
        with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(test_file.read())
            # Rewind so the partitioner reads the copy from its first byte.
            spooled_temp_file.seek(0)
            result = pdf.partition_pdf(
                file=spooled_temp_file, strategy=strategy, password="password"
            )

    _test(result)
4 changes: 4 additions & 0 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def partition_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
Expand Down Expand Up @@ -91,6 +92,8 @@ def partition_image(
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
password
The password to decrypt the PDF file.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -113,5 +116,6 @@ def partition_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
27 changes: 23 additions & 4 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def partition_pdf(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -224,6 +225,7 @@ def partition_pdf(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)

Expand All @@ -245,6 +247,7 @@ def partition_pdf_or_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -273,6 +276,7 @@ def partition_pdf_or_image(
languages=languages,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)
pdf_text_extractable = any(
Expand Down Expand Up @@ -322,6 +326,7 @@ def partition_pdf_or_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -347,6 +352,7 @@ def partition_pdf_or_image(
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -360,6 +366,7 @@ def extractable_elements(
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
if isinstance(file, bytes):
Expand All @@ -370,6 +377,7 @@ def extractable_elements(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
Expand All @@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -427,14 +438,16 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str],
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs,
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height

Expand Down Expand Up @@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partition using package installed locally"""
Expand Down Expand Up @@ -592,10 +606,11 @@ def _partition_pdf_or_image_local(
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

extracted_layout, layouts_links = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -635,20 +650,22 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)
else:
inferred_document_layout = process_data_with_model(
file,
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

if hasattr(file, "seek"):
file.seek(0)

extracted_layout, layouts_links = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -690,6 +707,7 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)

# vectorization of the data structure ends here
Expand Down Expand Up @@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr(
is_image: bool = False,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
Expand All @@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements)
else:
for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image,
Expand Down
4 changes: 4 additions & 0 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def process_data_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password: Optional[str] = None,
) -> "DocumentLayout":
"""
Process OCR data from a given data and supplement the output DocumentLayout
Expand Down Expand Up @@ -89,6 +90,7 @@ def process_data_with_ocr(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)

return merged_layouts
Expand All @@ -105,6 +107,7 @@ def process_file_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password: Optional[str] = None,
) -> "DocumentLayout":
"""
Process OCR data from a given file and supplement the output DocumentLayout
Expand Down Expand Up @@ -165,6 +168,7 @@ def process_file_with_ocr(
dpi=pdf_image_dpi,
output_folder=temp_dir,
paths_only=True,
userpw=password or "",
)
image_paths = cast(List[str], _image_paths)
for i, image_path in enumerate(image_paths):
Expand Down
11 changes: 9 additions & 2 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def convert_pdf_to_image(
dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""

Expand All @@ -71,6 +72,7 @@ def convert_pdf_to_image(
dpi=dpi,
output_folder=output_folder,
paths_only=path_only,
userpw=password,
)
else:
images = pdf2image.convert_from_path(
Expand Down Expand Up @@ -125,6 +127,7 @@ def save_elements(
is_image: bool = False,
extract_image_block_to_payload: bool = False,
output_dir_path: str | None = None,
password: Optional[str] = None,
):
"""
Saves specific elements from a PDF as images either to a directory or embeds them in the
Expand Down Expand Up @@ -167,6 +170,7 @@ def save_elements(
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
password=password,
)
image_paths = cast(List[str], _image_paths)

Expand Down Expand Up @@ -389,15 +393,16 @@ def convert_pdf_to_images(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
chunk_size: int = 10,
password: Optional[str] = None,
) -> Iterator[Image.Image]:
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
exactly_one(filename=filename, file=file)
if file is not None:
f_bytes = convert_to_bytes(file)
info = pdf2image.pdfinfo_from_bytes(f_bytes)
info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
else:
f_bytes = None
info = pdf2image.pdfinfo_from_path(filename)
info = pdf2image.pdfinfo_from_path(filename, userpw=password)

total_pages = info["Pages"]
for start_page in range(1, total_pages + 1, chunk_size):
Expand All @@ -407,12 +412,14 @@ def convert_pdf_to_images(
f_bytes,
first_page=start_page,
last_page=end_page,
userpw=password,
)
else:
chunk_images = pdf2image.convert_from_path(
filename,
first_page=start_page,
last_page=end_page,
userpw=password,
)

for image in chunk_images:
Expand Down
7 changes: 6 additions & 1 deletion unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@
def process_file_with_pdfminer(
filename: str = "",
dpi: int = 200,
password: Optional[str] = None,
) -> tuple[List[List["TextRegion"]], List[List]]:
with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp)
extracted_layout, layouts_links = process_data_with_pdfminer(
file=fp,
dpi=dpi,
password=password,
)
return extracted_layout, layouts_links

Expand Down Expand Up @@ -432,6 +434,7 @@ def process_page_layout_from_pdfminer(
def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None,
dpi: int = 200,
password: Optional[str] = None,
) -> tuple[List[LayoutElements], List[List]]:
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
pdf pages using pdf2image"""
Expand All @@ -442,7 +445,9 @@ def process_data_with_pdfminer(
layouts_links = []
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)):
for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(file, password=password)
):
width, height = page_layout.width, page_layout.height

annotation_list = []
Expand Down
Loading