pdfminer · dhdaines · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
 - inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
+- text hidden by a clipping path is still extracted ([#414](https://github.com/pdfminer/pdfminer.six/issues/414))
 
 ### Removed
 

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -258,7 +258,16 @@ def render_char(
  ncs,
  graphicstate,
  )
- self.cur_item.add(item)
+ x0, y0, x1, y1 = item.bbox
+ if (
+ (self.laparams is not None and self.laparams.hidden_texts)
+ or self.clippath is None
+ or self.clippath.contains((x0, y0))
+ and self.clippath.contains((x1, y1))
+ ):
+ self.cur_item.add(item)
+ else:
+ log.debug("Character %r outside clippath %r", item, self.clippath)
  return item.adv
 
  def handle_undefined_char(self, font: PDFFont, cid: int) -> str:

diff --git a/pdfminer/layout.py b/pdfminer/layout.py
@@ -78,6 +78,8 @@ class LAParams:
  layout analysis
  :param all_texts: If layout analysis should be performed on text in
  figures.
+ :param hidden_texts: If layout analysis should be performed on hidden
+ text outside the clipping path.
  """
 
  def __init__(
@@ -89,6 +91,7 @@ def __init__(
  boxes_flow: Optional[float] = 0.5,
  detect_vertical: bool = False,
  all_texts: bool = False,
+ hidden_texts: bool = False,
  ) -> None:
  self.line_overlap = line_overlap
  self.char_margin = char_margin
@@ -97,6 +100,7 @@ def __init__(
  self.boxes_flow = boxes_flow
  self.detect_vertical = detect_vertical
  self.all_texts = all_texts
+ self.hidden_texts = hidden_texts
 
  self._validate()
 
@@ -115,8 +119,14 @@ def _validate(self) -> None:
  def __repr__(self) -> str:
+ # Debug summary only: line_overlap, boxes_flow and detect_vertical are
+ # not part of the output.
  return (
  "<LAParams: char_margin=%.1f, line_margin=%.1f, "
- "word_margin=%.1f all_texts=%r>"
- % (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
+ "word_margin=%.1f all_texts=%r hidden_texts=%r>"
+ % (
+ self.char_margin,
+ self.line_margin,
+ self.word_margin,
+ self.all_texts,
+ self.hidden_texts,
+ )
  )
 
 

diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py
@@ -19,6 +19,7 @@
 
 if TYPE_CHECKING:
  from pdfminer.pdfinterp import (
+ PDFClippingPath,
  PDFGraphicState,
  PDFResourceManager,
  PDFStackT,
@@ -35,6 +36,7 @@ class PDFDevice:
  def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
+     """Create a device bound to ``rsrcmgr`` with no CTM or clipping path yet."""
      self.rsrcmgr = rsrcmgr
      self.ctm: Optional[Matrix] = None
+     # Quoted: PDFClippingPath is only imported under TYPE_CHECKING, like
+     # the other pdfinterp names used in this module's annotations.
+     self.clippath: Optional["PDFClippingPath"] = None
 
  def __repr__(self) -> str:
  return "<PDFDevice>"
@@ -51,6 +53,9 @@ def close(self) -> None:
  def set_ctm(self, ctm: Matrix) -> None:
  self.ctm = ctm
 
+ def set_clippath(self, clippath: "PDFClippingPath") -> None:
+ """Store ``clippath`` as the device's current clipping path (by reference)."""
+ self.clippath = clippath
+
  def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
  pass
 

diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
@@ -1,4 +1,5 @@
 import logging
+import math
 import re
 from io import BytesIO
 from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
@@ -44,7 +45,9 @@
  PathSegment,
  Point,
  Rect,
+ apply_matrix_pt,
  choplist,
+ get_bound,
  mult_matrix,
 )
 
@@ -176,6 +179,86 @@ def __repr__(self) -> str:
  )
 
 
+class PDFClippingPath:
+    """Approximate clipping region, tracked as a single bounding box."""
+
+    bbox: Rect
+
+    def __init__(self, mediabox: Rect):
+        self.bbox = mediabox
+
+    def copy(self) -> "PDFClippingPath":
+        """Return a new clipping path holding the same bounding box."""
+        return PDFClippingPath(self.bbox)
+
+    def add(self, path: List[PathSegment], ctm: Matrix, evenodd: bool = False) -> None:
+        """Narrow the clipping region by ``path`` transformed with ``ctm``.
+
+        NOTE: this is an approximation.  ``evenodd`` is accepted but not
+        used.  Only a single closed, axis-aligned rectangle is really
+        intersected with the current bounding box.  A lone line segment
+        collapses the box onto its first point.  Paths that do not start
+        with "m", paths with several subpaths and four-sided paths that
+        are not rectangles only log a warning.  Any other shape leaves
+        the bounding box untouched without logging.
+        """
+        ops = "".join(seg[0] for seg in path)
+        if ops[:1] != "m":
+            # invalid path, do nothing
+            log.warning("Invalid path for clipping: %r", path)
+            return
+        if ops.count("m") > 1:
+            log.warning("Multiple subpaths for clipping, will not clip: %r", path)
+            return
+
+        # Vertex extraction similar to PDFLayoutAnalyzer.paint_path
+        # (FIXME: ideally that code would be reused).  "h" has no
+        # operands, so it takes the coordinates of the initial "m".
+        start = path[0][-2:]
+        pts = [
+            apply_matrix_pt(ctm, cast(Point, start if seg[0] == "h" else seg[-2:]))
+            for seg in path
+        ]
+
+        if ops in ("ml", "mlh"):
+            # A single line segment has no area: make the bbox empty
+            # (NOTE: points on this line are technically not clipped).
+            log.warning("Clipping with empty shape (line: %r)", pts[:2])
+            self.bbox = (*pts[0], *pts[0])
+            return
+        if ops not in ("mllll", "mlllh"):
+            return
+
+        (x0, y0), (x1, y1), (x2, y2), (x3, y3), last = pts
+        closed = pts[0] == last
+        vertical_first = x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
+        horizontal_first = y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0
+        if not (closed and (vertical_first or horizontal_first)):
+            log.warning("Path is not a rectangle, will not clip: %r", path)
+            return
+
+        # A rectangle: intersect it with the current bbox.
+        log.debug("Clipping with rectangle: %r", pts[:4])
+        # FIXME: evenodd/winding rules are not supported at all
+        # (impossible while only a bbox is tracked).
+        new_x0, new_y0, new_x1, new_y1 = get_bound(pts[:4])
+        cur_x0, cur_y0, cur_x1, cur_y1 = self.bbox
+        self.bbox = (
+            max(new_x0, cur_x0),
+            max(new_y0, cur_y0),
+            min(new_x1, cur_x1),
+            min(new_y1, cur_y1),
+        )
+        log.debug("Clipped to: %r", self.bbox)
+
+    def contains(self, point: Point) -> bool:
+        """Tell whether ``point`` lies within the bounding box, edges included.
+
+        NOTE: Only approximate, since just the bounding box is tested."""
+        px, py = point
+        left, bottom, right, top = self.bbox
+        return left <= px <= right and bottom <= py <= top
+
+    def __repr__(self) -> str:
+        return "<PDFClippingPath BBox=%r>" % (self.bbox,)
+
+
+# Unbounded in every direction, so the default never clips anything;
+# a lower bound of 0 would treat points with negative coordinates as clipped.
+CLIP_INFINITY = PDFClippingPath((-math.inf, -math.inf, math.inf, math.inf))
+
+
 class PDFResourceManager:
  """Repository of shared resources.
 
@@ -412,12 +495,16 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
  for xobjid, xobjstrm in dict_value(v).items():
  self.xobjmap[xobjid] = xobjstrm
 
- def init_state(self, ctm: Matrix) -> None:
+ def init_state(self, ctm: Matrix, clippath: PDFClippingPath) -> None:
  """Initialize the text and graphic states for rendering a page."""
  # gstack: stack for graphical states.
- self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
+ self.gstack: List[
+ Tuple[Matrix, PDFTextState, PDFGraphicState, PDFClippingPath]
+ ] = []
  self.ctm = ctm
  self.device.set_ctm(self.ctm)
+ self.clippath = clippath
+ self.device.set_clippath(self.clippath)
  self.textstate = PDFTextState()
  self.graphicstate = PDFGraphicState()
  self.curpath: List[PathSegment] = []
@@ -439,15 +526,23 @@ def pop(self, n: int) -> List[PDFStackT]:
  self.argstack = self.argstack[:-n]
  return x
 
- def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
- return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
+ def get_current_state(
+ self,
+ ) -> Tuple[Matrix, PDFTextState, PDFGraphicState, PDFClippingPath]:
+ """Return a snapshot of the CTM, text state, graphic state and clip path.
+
+ The text state, graphic state and clip path are copied; the CTM is
+ returned as-is.
+ """
+ return (
+ self.ctm,
+ self.textstate.copy(),
+ self.graphicstate.copy(),
+ self.clippath.copy(),
+ )
 
  def set_current_state(
  self,
- state: Tuple[Matrix, PDFTextState, PDFGraphicState],
+ state: Tuple[Matrix, PDFTextState, PDFGraphicState, PDFClippingPath],
  ) -> None:
+ # Adopt every component of ``state``, then push the CTM and the
+ # clipping path to the device so it sees the same values.
- (self.ctm, self.textstate, self.graphicstate) = state
+ (self.ctm, self.textstate, self.graphicstate, self.clippath) = state
  self.device.set_ctm(self.ctm)
+ self.device.set_clippath(self.clippath)
 
  def do_q(self) -> None:
  """Save graphics state"""
@@ -610,9 +705,13 @@ def do_n(self) -> None:
 
  def do_W(self) -> None:
  """Set clipping path using nonzero winding number rule"""
+ # PDFClippingPath.add() narrows self.clippath in place using the
+ # current path and CTM (the winding-rule flag is not used there yet),
+ # then the result is handed to the device.
+ self.clippath.add(self.curpath, self.ctm, False)
+ self.device.set_clippath(self.clippath)
 
  def do_W_a(self) -> None:
  """Set clipping path using even-odd rule"""
+ # Same as do_W except that evenodd=True is passed; PDFClippingPath.add()
+ # does not use that flag yet, so both operators currently clip alike.
+ self.clippath.add(self.curpath, self.ctm, True)
+ self.device.set_clippath(self.clippath)
 
  def do_CS(self, name: PDFStackT) -> None:
  """Set color space for stroking operations
@@ -946,6 +1045,7 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None:
  resources,
  [xobj],
  ctm=mult_matrix(matrix, self.ctm),
+ clippath=self.clippath,
  )
  self.device.end_figure(xobjid)
  elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
@@ -968,27 +1068,30 @@ def process_page(self, page: PDFPage) -> None:
  else:
  ctm = (1, 0, 0, 1, -x0, -y0)
  self.device.begin_page(page, ctm)
- self.render_contents(page.resources, page.contents, ctm=ctm)
+ clippath = PDFClippingPath(page.mediabox)
+ self.render_contents(page.resources, page.contents, ctm=ctm, clippath=clippath)
  self.device.end_page(page)
 
  def render_contents(
      self,
      resources: Dict[object, object],
      streams: Sequence[object],
      ctm: Matrix = MATRIX_IDENTITY,
+     clippath: PDFClippingPath = CLIP_INFINITY,
  ) -> None:
      """Render the content streams.
 
      This method may be called recursively.
+
+     :param clippath: Initial clipping path. It is copied before use, so
+         neither the caller's object nor the shared default is modified.
      """
      log.debug(
-         "render_contents: resources=%r, streams=%r, ctm=%r",
+         "render_contents: resources=%r, streams=%r, ctm=%r, clippath=%r",
          resources,
          streams,
          ctm,
+         clippath,
      )
      self.init_resources(resources)
-     self.init_state(ctm)
+     # do_W/do_W_a narrow the clip path in place and init_state keeps the
+     # object it is given, so hand over a private copy: otherwise the
+     # module-level CLIP_INFINITY default, or the clip path of the
+     # interpreter that called us for a form XObject, would be modified.
+     self.init_state(ctm, clippath.copy())
      self.execute(list_value(streams))
 
  def execute(self, streams: Sequence[object]) -> None:

diff --git a/samples/contrib/issue-414-hidden-text.pdf b/samples/contrib/issue-414-hidden-text.pdf
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -13,10 +13,12 @@ def run_with_string(sample_path, laparams=None):
  return s
 
 
-def run_with_file(sample_path):
+def run_with_file(sample_path, laparams=None):
+ # ``laparams`` is an optional dict of keyword arguments for LAParams;
+ # None means "use the LAParams defaults".
+ if laparams is None:
+ laparams = {}
  absolute_path = absolute_sample_path(sample_path)
  with open(absolute_path, "rb") as in_file:
- s = extract_text(in_file)
+ s = extract_text(in_file, laparams=LAParams(**laparams))
  return s
 
 
@@ -146,6 +148,15 @@ def test_issue_791_non_unicode_cmap(self):
  s = run_with_file(test_file)
  self.assertEqual(s.strip(), test_strings[test_file])
 
+ def test_issue_414_hidden_text(self):
+     """Text outside the clipping path is only extracted when requested."""
+     test_file = "contrib/issue-414-hidden-text.pdf"
+     hidden = "VR-181 (11-03)"
+     # Hidden text should be hidden
+     # (assertNotIn/assertIn report the searched text on failure).
+     self.assertNotIn(hidden, run_with_file(test_file))
+     # Unless we say it isn't
+     self.assertIn(
+         hidden, run_with_file(test_file, laparams={"hidden_texts": True})
+     )
+
 
 class TestExtractPages(unittest.TestCase):
  def _get_test_file_path(self):

diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
@@ -214,6 +214,14 @@ def create_parser() -> argparse.ArgumentParser:
  action="store_true",
  help="If layout analysis should be performed on text in figures.",
  )
+ la_param_group.add_argument(
+ "--hidden-texts",
+ "-H",
+ default=la_params.hidden_texts,
+ action="store_true",
+ help="If layout analysis should be performed on hidden text outside "
+ "the clipping path.",
+ )
 
  output_params = parser.add_argument_group(
  "Output",
@@ -296,6 +304,7 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
  boxes_flow=parsed_args.boxes_flow,
  detect_vertical=parsed_args.detect_vertical,
  all_texts=parsed_args.all_texts,
+ hidden_texts=parsed_args.hidden_texts,
  )
 
  if parsed_args.page_numbers: