Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Very approximate support for hiding text using clipping path #1026

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
- `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
- inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
- text hidden using clipping path isn't hidden ([#414](https://github.com/pdfminer/pdfminer.six/issues/414))

### Removed

Expand Down
11 changes: 10 additions & 1 deletion pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,16 @@ def render_char(
ncs,
graphicstate,
)
self.cur_item.add(item)
x0, y0, x1, y1 = item.bbox
if (
(self.laparams is not None and self.laparams.hidden_texts)
or self.clippath is None
or self.clippath.contains((x0, y0))
and self.clippath.contains((x1, y1))
):
self.cur_item.add(item)
else:
log.debug("Character %r outside clippath %r", item, self.clippath)
return item.adv

def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
Expand Down
14 changes: 12 additions & 2 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ class LAParams:
layout analysis
:param all_texts: If layout analysis should be performed on text in
figures.
:param hidden_texts: If layout analysis should be performed on hidden
text outside the clipping path.
"""

def __init__(
Expand All @@ -89,6 +91,7 @@ def __init__(
boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = False,
all_texts: bool = False,
hidden_texts: bool = False,
) -> None:
self.line_overlap = line_overlap
self.char_margin = char_margin
Expand All @@ -97,6 +100,7 @@ def __init__(
self.boxes_flow = boxes_flow
self.detect_vertical = detect_vertical
self.all_texts = all_texts
self.hidden_texts = hidden_texts

self._validate()

Expand All @@ -115,8 +119,14 @@ def _validate(self) -> None:
def __repr__(self) -> str:
return (
"<LAParams: char_margin=%.1f, line_margin=%.1f, "
"word_margin=%.1f all_texts=%r>"
% (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
"word_margin=%.1f all_texts=%r hidden_texts=%r>"
% (
self.char_margin,
self.line_margin,
self.word_margin,
self.all_texts,
self.hidden_texts,
)
)


Expand Down
5 changes: 5 additions & 0 deletions pdfminer/pdfdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

if TYPE_CHECKING:
from pdfminer.pdfinterp import (
PDFClippingPath,
PDFGraphicState,
PDFResourceManager,
PDFStackT,
Expand All @@ -35,6 +36,7 @@ class PDFDevice:
def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
self.rsrcmgr = rsrcmgr
self.ctm: Optional[Matrix] = None
self.clippath: Optional[PDFClippingPath] = None

def __repr__(self) -> str:
return "<PDFDevice>"
Expand All @@ -51,6 +53,9 @@ def close(self) -> None:
def set_ctm(self, ctm: Matrix) -> None:
self.ctm = ctm

def set_clippath(self, clippath: "PDFClippingPath") -> None:
    """Remember *clippath* as the clipping path currently in effect.

    The object is stored as-is (no copy is made) on ``self.clippath``.
    """
    self.clippath = clippath

def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
pass

Expand Down
121 changes: 112 additions & 9 deletions pdfminer/pdfinterp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import math
import re
from io import BytesIO
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
Expand Down Expand Up @@ -44,7 +45,9 @@
PathSegment,
Point,
Rect,
apply_matrix_pt,
choplist,
get_bound,
mult_matrix,
)

Expand Down Expand Up @@ -176,6 +179,86 @@ def __repr__(self) -> str:
)


class PDFClippingPath:
    """Rather approximate representation of a clipping path.

    Only an axis-aligned bounding box is tracked, never the real outline
    of the clipping region.  The box is expressed in whatever coordinate
    space results from applying the ``ctm`` passed to :meth:`add`.
    """

    # Current clipping rectangle as (x0, y0, x1, y1).
    bbox: Rect

    def __init__(self, mediabox: Rect):
        """Start with a clipping region equal to *mediabox*."""
        self.bbox = mediabox

    def copy(self) -> "PDFClippingPath":
        """Return a new clipping path with the same bounding box."""
        return PDFClippingPath(self.bbox)

    def add(self, path: List[PathSegment], ctm: Matrix, evenodd: bool = False) -> None:
        """Intersect with a path, applying evenodd rule if requested.

        NOTE: in practice, does no such thing at the moment, but
        simply intersects the bounding box with path if it's a
        rectangle (it usually is).

        The instance is modified in place (``self.bbox`` is replaced).
        Any path that cannot be handled leaves ``self.bbox`` untouched
        and logs a warning.

        :param path: path segments; the first element of each segment is
            the operator letter and the last two are taken as its
            end point (``h`` reuses the starting point).
        :param ctm: matrix applied to every point before testing.
        :param evenodd: accepted for interface symmetry but ignored,
            since fill rules cannot be expressed with a bounding box.
        """
        # Find the BBox of the requested path (it's usually a
        # rectangle) using similar method to
        # PDFLayoutAnalyzer.paint_path (FIXME: ideally we would reuse
        # that code)
        shape = "".join(segment[0] for segment in path)
        if shape[:1] != "m":
            # invalid (or empty) path, do nothing
            log.warning("Invalid path for clipping: %r", path)
            return
        if shape.count("m") > 1:
            log.warning("Multiple subpaths for clipping, will not clip: %r", path)
            return

        is_line = shape in {"mlh", "ml"}
        is_quad = shape in {"mlllh", "mllll"}
        if not (is_line or is_quad):
            # Curves, triangles, unclosed "mlll", ...: previously these
            # were ignored without any diagnostic.
            log.warning("Path is not a rectangle, will not clip: %r", path)
            return

        # "h" carries no coordinates of its own: it closes the subpath
        # back to the initial "m" point.
        start = path[0][-2:]
        raw_pts = [cast(Point, seg[-2:] if seg[0] != "h" else start) for seg in path]
        pts = [apply_matrix_pt(ctm, pt) for pt in raw_pts]

        if is_line:
            # A single line segment which has no area. Make bbox
            # empty (NOTE: points on this line are technically not
            # clipped)
            log.warning("Clipping with empty shape (line: %r)", pts[0:2])
            self.bbox = (*pts[0], *pts[0])
            return

        if not self._is_axis_aligned_rectangle(pts):
            log.warning("Path is not a rectangle, will not clip: %r", path)
            return

        # A rectangle. Intersect with bbox.
        log.debug("Clipping with rectangle: %r", pts[0:4])
        # FIXME: not at all supporting evenodd/winding
        # rules (impossible to do since we are just
        # tracking a bbox)
        ax0, ay0, ax1, ay1 = get_bound(pts[0:4])
        bx0, by0, bx1, by1 = self.bbox
        self.bbox = (
            max(ax0, bx0),
            max(ay0, by0),
            min(ax1, bx1),
            min(ay1, by1),
        )
        log.debug("Clipped to: %r", self.bbox)

    @staticmethod
    def _is_axis_aligned_rectangle(pts: List[Point]) -> bool:
        """Tell whether five points trace a closed, axis-aligned rectangle."""
        (x0, y0), (x1, y1), (x2, y2), (x3, y3), last = pts
        if pts[0] != last:
            return False
        # Either the first edge is vertical or it is horizontal; the
        # remaining edges must then alternate accordingly.
        vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
        horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
        return vertical_first or horizontal_first

    def contains(self, point: Point) -> bool:
        """Is the given point inside the clipping path?

        NOTE: Only very approximately supported for the moment: this is
        an inclusive test against the bounding box.
        """
        x, y = point
        bx0, by0, bx1, by1 = self.bbox
        return bx0 <= x <= bx1 and by0 <= y <= by1

    def __repr__(self) -> str:
        return "<PDFClippingPath BBox=%r>" % (self.bbox,)


# Unbounded clipping path, used as the default when no clip is supplied.
# The lower bounds are -inf (not 0) so that points with negative
# coordinates are not treated as lying outside the clip.
# NOTE(review): PDFClippingPath.add() replaces ``bbox`` in place, so code
# that adopts this shared instance without copying it would narrow the
# default for every later user -- confirm callers copy before clipping.
CLIP_INFINITY = PDFClippingPath((-math.inf, -math.inf, math.inf, math.inf))


class PDFResourceManager:
"""Repository of shared resources.

Expand Down Expand Up @@ -412,12 +495,16 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
for xobjid, xobjstrm in dict_value(v).items():
self.xobjmap[xobjid] = xobjstrm

def init_state(self, ctm: Matrix) -> None:
def init_state(self, ctm: Matrix, clippath: PDFClippingPath) -> None:
"""Initialize the text and graphic states for rendering a page."""
# gstack: stack for graphical states.
self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
self.gstack: List[
Tuple[Matrix, PDFTextState, PDFGraphicState, PDFClippingPath]
] = []
self.ctm = ctm
self.device.set_ctm(self.ctm)
self.clippath = clippath
self.device.set_clippath(self.clippath)
self.textstate = PDFTextState()
self.graphicstate = PDFGraphicState()
self.curpath: List[PathSegment] = []
Expand All @@ -439,15 +526,23 @@ def pop(self, n: int) -> List[PDFStackT]:
self.argstack = self.argstack[:-n]
return x

def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
def get_current_state(
self,
) -> Tuple[Matrix, PDFTextState, PDFGraphicState, PDFClippingPath]:
return (
self.ctm,
self.textstate.copy(),
self.graphicstate.copy(),
self.clippath.copy(),
)

def set_current_state(
self,
state: Tuple[Matrix, PDFTextState, PDFGraphicState],
state: Tuple[Matrix, PDFTextState, PDFGraphicState, PDFClippingPath],
) -> None:
(self.ctm, self.textstate, self.graphicstate) = state
(self.ctm, self.textstate, self.graphicstate, self.clippath) = state
self.device.set_ctm(self.ctm)
self.device.set_clippath(self.clippath)

def do_q(self) -> None:
"""Save graphics state"""
Expand Down Expand Up @@ -610,9 +705,13 @@ def do_n(self) -> None:

def do_W(self) -> None:
    """Set clipping path using nonzero winding number rule

    The current path is added to the interpreter's clipping path under
    the current CTM, and the result is forwarded to the device.
    """
    clip = self.clippath
    clip.add(self.curpath, self.ctm, evenodd=False)
    self.device.set_clippath(clip)

def do_W_a(self) -> None:
    """Set clipping path using even-odd rule

    The current path is added to the interpreter's clipping path under
    the current CTM, and the result is forwarded to the device.
    """
    clip = self.clippath
    clip.add(self.curpath, self.ctm, evenodd=True)
    self.device.set_clippath(clip)

def do_CS(self, name: PDFStackT) -> None:
"""Set color space for stroking operations
Expand Down Expand Up @@ -946,6 +1045,7 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None:
resources,
[xobj],
ctm=mult_matrix(matrix, self.ctm),
clippath=self.clippath,
)
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
Expand All @@ -968,27 +1068,30 @@ def process_page(self, page: PDFPage) -> None:
else:
ctm = (1, 0, 0, 1, -x0, -y0)
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
clippath = PDFClippingPath(page.mediabox)
self.render_contents(page.resources, page.contents, ctm=ctm, clippath=clippath)
self.device.end_page(page)

def render_contents(
self,
resources: Dict[object, object],
streams: Sequence[object],
ctm: Matrix = MATRIX_IDENTITY,
clippath: PDFClippingPath = CLIP_INFINITY,
) -> None:
"""Render the content streams.

This method may be called recursively.
"""
log.debug(
"render_contents: resources=%r, streams=%r, ctm=%r",
"render_contents: resources=%r, streams=%r, ctm=%r, clippath=%r",
resources,
streams,
ctm,
clippath,
)
self.init_resources(resources)
self.init_state(ctm)
self.init_state(ctm, clippath)
self.execute(list_value(streams))

def execute(self, streams: Sequence[object]) -> None:
Expand Down
Binary file added samples/contrib/issue-414-hidden-text.pdf
Binary file not shown.
15 changes: 13 additions & 2 deletions tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ def run_with_string(sample_path, laparams=None):
return s


def run_with_file(sample_path):
def run_with_file(sample_path, laparams=None):
if laparams is None:
laparams = {}
absolute_path = absolute_sample_path(sample_path)
with open(absolute_path, "rb") as in_file:
s = extract_text(in_file)
s = extract_text(in_file, laparams=LAParams(**laparams))
return s


Expand Down Expand Up @@ -146,6 +148,15 @@ def test_issue_791_non_unicode_cmap(self):
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])

def test_issue_414_hidden_text(self):
    """Text outside the clipping path is dropped unless hidden_texts is set."""
    test_file = "contrib/issue-414-hidden-text.pdf"
    hidden = "VR-181 (11-03)"

    # Hidden text should be hidden
    s = run_with_file(test_file)
    # assertNotIn/assertIn report both operands on failure, unlike
    # assertFalse/assertTrue applied to an ``in`` expression.
    self.assertNotIn(hidden, s)

    # Unless we say it isn't
    s = run_with_file(test_file, laparams={"hidden_texts": True})
    self.assertIn(hidden, s)


class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self):
Expand Down
9 changes: 9 additions & 0 deletions tools/pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,14 @@ def create_parser() -> argparse.ArgumentParser:
action="store_true",
help="If layout analysis should be performed on text in figures.",
)
la_param_group.add_argument(
"--hidden-texts",
"-H",
default=la_params.hidden_texts,
action="store_true",
help="If layout analysis should be performed on hidden text outside "
"the clipping path.",
)

output_params = parser.add_argument_group(
"Output",
Expand Down Expand Up @@ -296,6 +304,7 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
boxes_flow=parsed_args.boxes_flow,
detect_vertical=parsed_args.detect_vertical,
all_texts=parsed_args.all_texts,
hidden_texts=parsed_args.hidden_texts,
)

if parsed_args.page_numbers:
Expand Down
Loading