Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add password with PDF files #3721

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
62 changes: 34 additions & 28 deletions CHANGELOG.md

Large diffs are not rendered by default.

Binary file added example-docs/pdf/password.pdf
Binary file not shown.
77 changes: 55 additions & 22 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,48 +208,34 @@ def test_partition_pdf_local_raises_with_no_filename():

@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
"strategy",
# fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page
[
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer", "ocr_tesseract"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
PartitionStrategy.FAST,
PartitionStrategy.HI_RES,
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
pprados marked this conversation as resolved.
Show resolved Hide resolved
file_mode,
strategy,
starting_page_number,
expected_page_numbers,
origin,
filename=example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf"),
):
# Test that the partition_pdf function can handle filename
def _test(result):
# validate that the result is a non-empty list of dicts
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected_page_numbers
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
print(
[
(element.metadata.detection_origin, element.category, element.text)
for element in result
]
)
assert {element.metadata.detection_origin for element in result} == origin

if file_mode == "filename":
result = pdf.partition_pdf(
filename=filename, strategy=strategy, starting_page_number=starting_page_number
filename=filename, strategy=strategy,
)
_test(result)
elif file_mode == "rb":
with open(filename, "rb") as f:
result = pdf.partition_pdf(
file=f, strategy=strategy, starting_page_number=starting_page_number
file=f, strategy=strategy,
)
_test(result)
else:
Expand All @@ -260,9 +246,8 @@ def _test(result):
result = pdf.partition_pdf(
file=spooled_temp_file,
strategy=strategy,
starting_page_number=starting_page_number,
)
_test(result)
_test(result)


@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
Expand Down Expand Up @@ -1545,3 +1530,51 @@ def test_document_to_element_list_sets_category_depth_titles():
assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
"strategy",
# fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page
[
PartitionStrategy.FAST,
PartitionStrategy.HI_RES,
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_pdf_with_password(
pprados marked this conversation as resolved.
Show resolved Hide resolved
file_mode,
strategy,
filename=example_doc_path("pdf/password.pdf"),
):
    # Test that partition_pdf can partition a password-protected PDF in every file mode
    def _test(result):
        # The password-protected sample must partition into exactly one element
        # whose text matches the expected string.
        assert len(result) == 1
        assert result[0].text == 'File with password'

if file_mode == "filename":
result = pdf.partition_pdf(
filename=filename, strategy=strategy,
password="password"
)
_test(result)
elif file_mode == "rb":
with open(filename, "rb") as f:
result = pdf.partition_pdf(
file=f, strategy=strategy,
password="password"
)
_test(result)
else:
with open(filename, "rb") as test_file:
with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(
file=spooled_temp_file,
strategy=strategy,
password="password"
)
_test(result)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.20" # pragma: no cover
__version__ = "0.16.20-dev1" # pragma: no cover
4 changes: 4 additions & 0 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def partition_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
Expand Down Expand Up @@ -91,6 +92,8 @@ def partition_image(
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
password
The password to decrypt the PDF file.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -113,5 +116,6 @@ def partition_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
29 changes: 25 additions & 4 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def partition_pdf(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -224,6 +225,7 @@ def partition_pdf(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)

Expand All @@ -245,6 +247,7 @@ def partition_pdf_or_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -273,6 +276,7 @@ def partition_pdf_or_image(
languages=languages,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)
pdf_text_extractable = any(
Expand Down Expand Up @@ -322,6 +326,7 @@ def partition_pdf_or_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -347,6 +352,7 @@ def partition_pdf_or_image(
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -360,6 +366,7 @@ def extractable_elements(
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password:Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
if isinstance(file, bytes):
Expand All @@ -370,6 +377,7 @@ def extractable_elements(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
password:Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
Expand All @@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -427,14 +438,16 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str],
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs,
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height

Expand Down Expand Up @@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None,
password:Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partition using package installed locally"""
Expand Down Expand Up @@ -592,10 +606,12 @@ def _partition_pdf_or_image_local(
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

extracted_layout, layouts_links = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi,
password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -635,20 +651,22 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)
else:
inferred_document_layout = process_data_with_model(
file,
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

if hasattr(file, "seek"):
file.seek(0)

extracted_layout, layouts_links = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -690,6 +708,7 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)

# vectorization of the data structure ends here
Expand Down Expand Up @@ -837,6 +856,7 @@ def _partition_pdf_or_image_with_ocr(
is_image: bool = False,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
Expand All @@ -861,7 +881,8 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements)
else:
for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number
convert_pdf_to_images(filename, file, password=password),
start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image,
Expand Down
4 changes: 4 additions & 0 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def process_data_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password:Optional[str] = None,
) -> "DocumentLayout":
"""
Process OCR data from a given data and supplement the output DocumentLayout
Expand Down Expand Up @@ -89,6 +90,7 @@ def process_data_with_ocr(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)

return merged_layouts
Expand All @@ -105,6 +107,7 @@ def process_file_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password:Optional[str] = None,
) -> "DocumentLayout":
"""
Process OCR data from a given file and supplement the output DocumentLayout
Expand Down Expand Up @@ -165,6 +168,7 @@ def process_file_with_ocr(
dpi=pdf_image_dpi,
output_folder=temp_dir,
paths_only=True,
userpw=password or ""
)
image_paths = cast(List[str], _image_paths)
for i, image_path in enumerate(image_paths):
Expand Down
Loading