From 3d6878d7f0439f35bce727a74bbc49bc4cdf1908 Mon Sep 17 00:00:00 2001
From: OlteanuRares <rares.olteanu@3pillarglobal.com>
Date: Tue, 3 Sep 2024 16:02:54 +0300
Subject: [PATCH] revert on line length error instead of cursor placement

---
 docs/changelog.rst                       |  9 ++-
 pycaption/scc/__init__.py                | 31 +++-------
 pycaption/scc/specialized_collections.py | 38 +++++-------
 pycaption/scc/state_machines.py          | 14 ++---
 setup.py                                 |  2 +-
 tests/fixtures/dfxp.py                   | 42 ++++++-------
 tests/fixtures/scc.py                    |  2 +-
 tests/test_scc.py                        | 79 +++---------------------
 8 files changed, 68 insertions(+), 149 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index e6e0823c..400e9cdd 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,14 +1,19 @@
 Changelog
 ---------
-2.2.12
+2.2.13
 ^^^^^^
 - Mid-row codes only add spaces only if there isn't one before
 - Mid-row codes only add spaces only if affects the text in the same row (not adding if after previous text follows break or breaks)
 - Remove spaces to the end of the lines
-- CaptionLineLengthError is now raised if the cursor goes after column 32 instead of string length over 32
 - Change error message for the 32 character limit.
 - Close italics on receiving another style setting command.
 - Throw an CaptionReadNoCaptions error in case of empty input file are provided
+- Properly add breaks (it was only for jumps to next row). Now it adds as many breaks as the difference between row numbers.
+- Ignore repositioning commands which are not followed by any text before breaks.
+
+2.2.12
+^^^^^^
+- Pinned nltk to 3.8.0
 
 2.2.11
 ^^^^^^
diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
index 0d0a1de9..b28444a0 100644
--- a/pycaption/scc/__init__.py
+++ b/pycaption/scc/__init__.py
@@ -233,11 +233,14 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
 
         # check captions for incorrect lengths
         lines_too_long = defaultdict(list)
-
         for caption in self.caption_stash._collection:
-            line_length = self.get_scc_line_line_length(caption.to_real_caption())
-            if line_length:
-                lines_too_long.update(line_length)
+            caption_start = caption.to_real_caption().format_start()
+            caption_text = "".join(caption.to_real_caption().get_text_nodes())
+            text_too_long = [line for line in caption_text.split("\n") if len(line) > 32]
+            if caption_start in lines_too_long:
+                lines_too_long[caption_start] = text_too_long
+            else:
+                lines_too_long[caption_start].extend(text_too_long)
 
         msg = ""
         if bool(lines_too_long.keys()):
@@ -248,8 +251,8 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
                         msg += line + f" - Length { len(line)}" + "\n"
         if len(msg):
             raise CaptionLineLengthError(
-                f"Cursor goes over column 32 for caption cue in scc file.\n"
-                f"Affected lines:\n"
+                f"32 character limit for caption cue in scc file.\n"
+                f"Lines longer than 32:\n"
                 f"{msg}"
             )
 
@@ -513,22 +516,6 @@ def _pop_on(self, end=0):
         pop_on_cue = self.pop_ons_queue.pop()
         self.caption_stash.create_and_store(pop_on_cue.buffer, pop_on_cue.start, end)
 
-    @staticmethod
-    def get_scc_line_line_length(caption):
-        long_line = defaultdict(list)
-        caption_start = caption.format_start()
-        text_nodes = [node for node in caption.nodes if node.type_ == CaptionNode.TEXT]
-        if not text_nodes:
-            return None
-        start_writing_at = text_nodes[0].position[1]
-        caption_text = "".join(caption.get_text_nodes())
-        if start_writing_at:
-            caption_text = " " * start_writing_at + caption_text
-        long_line[caption_start] = [
-            line for line in caption_text.split("\n") if len(line) > 32
-        ]
-        return long_line if len(long_line[caption_start]) else None
-
 
 class SCCWriter(BaseWriter):
     def __init__(self, *args, **kw):
diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py
index 5fe79eef..85bad6ba 100644
--- a/pycaption/scc/specialized_collections.py
+++ b/pycaption/scc/specialized_collections.py
@@ -310,8 +310,6 @@ def __init__(self, collection=None, position_tracker=None):
             self._collection = collection
 
         self.last_style = None  # can be italic on or italic off as we only support italics
-        self._cursor_position = 0
-
         self._position_tracer = position_tracker
 
     def is_empty(self):
@@ -342,14 +340,17 @@ def add_chars(self, *chars):
 
         # handle a simple line break
         if self._position_tracer.is_linebreak_required():
-            # must insert a line break here
-            self._collection.append(
-                _InstructionNode.create_break(position=current_position)
-            )
+            for _ in range(self._position_tracer._breaks_required):
+                self._collection.append(
+                    _InstructionNode.create_break(position=current_position)
+                )
+            self._position_tracer.acknowledge_linebreak_consumed()
             node = _InstructionNode.create_text(current_position)
             self._collection.append(node)
-            self._cursor_position = self._position_tracer._last_column
-            self._position_tracer.acknowledge_linebreak_consumed()
+            if self._position_tracer.is_repositioning_required():
+                # it means we have a reposition command which was not followed by
+                # any text, so we just ignore it and break
+                self._position_tracer.acknowledge_position_changed()
 
         # handle completely new positioning
         elif self._position_tracer.is_repositioning_required():
@@ -361,7 +362,6 @@ def add_chars(self, *chars):
             self._position_tracer.acknowledge_position_changed()
 
         node.add_chars(*chars)
-        self._cursor_position += len("".join(chars))
 
     @staticmethod
     def get_style_for_command(command):
@@ -385,8 +385,6 @@ def interpret_command(self, command):
         """
         self._update_positioning(command)
 
-        text = COMMANDS.get(command, "")
-
         if command == "94a1":
             self.handle_backspace("94a1")
 
@@ -412,9 +410,10 @@ def interpret_command(self, command):
                     #  it should open italic tag
                     #  if break is required, break then add style tag
                     if self._position_tracer.is_linebreak_required():
-                        self._collection.append(
-                            _InstructionNode.create_break(position=current_position)
-                        )
+                        for _ in range(self._position_tracer._breaks_required):
+                            self._collection.append(
+                                _InstructionNode.create_break(position=current_position)
+                            )
                         self._position_tracer.acknowledge_linebreak_consumed()
                     self._collection.append(
                         _InstructionNode.create_italics_style(current_position)
@@ -433,9 +432,10 @@ def interpret_command(self, command):
                     )
                     self.last_style = "italics off"
                     if self._position_tracer.is_linebreak_required():
-                        self._collection.append(
-                            _InstructionNode.create_break(position=current_position)
-                        )
+                        for _ in range(self._position_tracer._breaks_required):
+                            self._collection.append(
+                                _InstructionNode.create_break(position=current_position)
+                            )
                         self._position_tracer.acknowledge_linebreak_consumed()
 
         #  handle mid-row codes that follows a text node
@@ -459,7 +459,6 @@ def interpret_command(self, command):
                 # need to close italics tag, add a space
                 # to the end of the previous text node
                 prev_text_node.text = prev_text_node.text + " "
-                self._cursor_position += 1
 
     def _update_positioning(self, command):
         """Sets the positioning information to use for the next nodes
@@ -470,7 +469,6 @@ def _update_positioning(self, command):
         if command in PAC_TAB_OFFSET_COMMANDS:
             tab_offset = PAC_TAB_OFFSET_COMMANDS[command]
             positioning = (prev_positioning[0], prev_positioning[1] + tab_offset)
-            self._cursor_position += tab_offset
         else:
             first, second = command[:2], command[2:]
             try:
@@ -479,7 +477,6 @@ def _update_positioning(self, command):
             except KeyError:
                 # if not PAC or OFFSET we're not changing position
                 return
-            self._cursor_position = positioning[1]
         self._position_tracer.update_positioning(positioning)
 
     def __iter__(self):
@@ -533,7 +530,6 @@ def handle_backspace(self, word):
         # only if the previous character in not also extended
         if delete_previous_condition:
             node.text = node.text[:-1]
-        self._cursor_position -= 1
 
     def get_previous_text_node(self):
         for node in self._collection[::-1]:
diff --git a/pycaption/scc/state_machines.py b/pycaption/scc/state_machines.py
index 04fc632b..fed5e508 100644
--- a/pycaption/scc/state_machines.py
+++ b/pycaption/scc/state_machines.py
@@ -11,7 +11,7 @@ def __init__(self, positioning=None):
         :type positioning: tuple[int]
         """
         self._positions = [positioning]
-        self._break_required = False
+        self._breaks_required = 0
         self._repositioning_required = False
         # Since the actual column is not applied when encountering a line break
         # this attribute is used to store it and determine by comparison if the
@@ -35,18 +35,18 @@ def update_positioning(self, positioning):
             return
 
         row, col = current
-        if self._break_required:
+        if self._breaks_required:
             col = self._last_column
         new_row, new_col = positioning
         is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3
 
         # One line below will be treated as line break, not repositioning
-        if new_row == row + 1:
+        if new_row > row:
             self._positions.append((new_row, col))
-            self._break_required = True
+            self._breaks_required = new_row - row
             self._last_column = new_col
         # Tab offsets after line breaks will be ignored to avoid repositioning
-        elif self._break_required and is_tab_offset:
+        elif self._breaks_required and is_tab_offset:
             return
         else:
             # Reset the "current" position altogether.
@@ -86,11 +86,11 @@ def is_linebreak_required(self):
         """If the current position is simply one line below the previous.
         :rtype: bool
         """
-        return self._break_required
+        return self._breaks_required > 0
 
     def acknowledge_linebreak_consumed(self):
         """Call to acknowledge that the line required was consumed"""
-        self._break_required = False
+        self._breaks_required = 0
 
 
 class DefaultProvidingPositionTracker(_PositioningTracker):
diff --git a/setup.py b/setup.py
index dc03856f..3d4bd260 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@
 
 setup(
     name='pycaption',
-    version='2.2.12.dev5',
+    version='2.2.12.dev6',
     description='Closed caption converter',
     long_description=open(README_PATH).read(),
     author='Joe Norton',
diff --git a/tests/fixtures/dfxp.py b/tests/fixtures/dfxp.py
index 33fb6ef5..d0a052d0 100644
--- a/tests/fixtures/dfxp.py
+++ b/tests/fixtures/dfxp.py
@@ -914,16 +914,9 @@ def sample_dfxp_from_scc_output():
    <region tts:displayAlign="after" tts:textAlign="start" xml:id="bottom"/>
    <region tts:displayAlign="before" tts:origin="10% 77%" tts:textAlign="left" xml:id="r0"/>
    <region tts:displayAlign="before" tts:origin="40% 5%" tts:textAlign="left" xml:id="r1"/>
-   <region tts:displayAlign="before" tts:origin="70% 23%" tts:textAlign="left" xml:id="r2"/>
-   <region tts:displayAlign="before" tts:origin="20% 47%" tts:textAlign="left" xml:id="r3"/>
-   <region tts:displayAlign="before" tts:origin="20% 89%" tts:textAlign="left" xml:id="r4"/>
-   <region tts:displayAlign="before" tts:origin="40% 53%" tts:textAlign="left" xml:id="r5"/>
-   <region tts:displayAlign="before" tts:origin="70% 17%" tts:textAlign="left" xml:id="r6"/>
-   <region tts:displayAlign="before" tts:origin="20% 35%" tts:textAlign="left" xml:id="r7"/>
-   <region tts:displayAlign="before" tts:origin="20% 83%" tts:textAlign="left" xml:id="r8"/>
-   <region tts:displayAlign="before" tts:origin="70% 11%" tts:textAlign="left" xml:id="r9"/>
-   <region tts:displayAlign="before" tts:origin="40% 41%" tts:textAlign="left" xml:id="r10"/>
-   <region tts:displayAlign="before" tts:origin="20% 71%" tts:textAlign="left" xml:id="r11"/>
+   <region tts:displayAlign="before" tts:origin="40% 53%" tts:textAlign="left" xml:id="r2"/>
+   <region tts:displayAlign="before" tts:origin="70% 17%" tts:textAlign="left" xml:id="r3"/>
+   <region tts:displayAlign="before" tts:origin="70% 11%" tts:textAlign="left" xml:id="r4"/>
   </layout>
  </head>
  <body>
@@ -932,40 +925,41 @@ def sample_dfxp_from_scc_output():
     abab
    </p>
    <p begin="00:00:01.134" end="00:00:03.136" region="r1" style="default">
-    cdcd
-   </p>
-   <p begin="00:00:01.134" end="00:00:03.136" region="r2" style="default">
+    cdcd<br/><br/><br/>
     efef
    </p>
-   <p begin="00:00:03.136" end="00:00:09.709" region="r3" style="default">
+   <p begin="00:00:03.136" end="00:00:09.709" region="r1" style="default">
+    <br/><br/><br/><br/>
     ghgh<br/>
     ijij<br/>
     klkl
    </p>
-   <p begin="00:00:09.709" end="00:00:11.711" region="r4" style="default">
+   <p begin="00:00:09.709" end="00:00:11.711" region="r1" style="default">
+    <br/><br/><br/><br/><br/>
     mnmn
    </p>
-   <p begin="00:00:09.709" end="00:00:11.711" region="r5" style="default">
+   <p begin="00:00:09.709" end="00:00:11.711" region="r2" style="default">
     opop
    </p>
-   <p begin="00:00:09.709" end="00:00:11.711" region="r6" style="default">
+   <p begin="00:00:09.709" end="00:00:11.711" region="r3" style="default">
     qrqr
    </p>
-   <p begin="00:00:11.711" end="00:00:20.086" region="r7" style="default">
+   <p begin="00:00:11.711" end="00:00:20.086" region="r3" style="default">
+    <br/><br/><br/>
     stst<br/>
     uvuv<br/>
     wxwx
    </p>
-   <p begin="00:00:20.086" end="00:00:22.088" region="r8" style="default">
+   <p begin="00:00:20.086" end="00:00:22.088" region="r3" style="default">
+    <br/><br/><br/><br/><br/><br/>
     yzyz
    </p>
-   <p begin="00:00:20.086" end="00:00:22.088" region="r9" style="default">
-    0101
-   </p>
-   <p begin="00:00:20.086" end="00:00:22.088" region="r10" style="default">
+   <p begin="00:00:20.086" end="00:00:22.088" region="r4" style="default">
+    0101<br/><br/><br/><br/><br/>
     2323
    </p>
-   <p begin="00:00:22.088" end="00:00:26.088" region="r11" style="default">
+   <p begin="00:00:22.088" end="00:00:26.088" region="r4" style="default">
+    <br/><br/><br/><br/><br/>
     4545<br/>
     6767<br/>
     8989
diff --git a/tests/fixtures/scc.py b/tests/fixtures/scc.py
index 96f8c2cc..e09a10d3 100644
--- a/tests/fixtures/scc.py
+++ b/tests/fixtures/scc.py
@@ -81,7 +81,7 @@ def sample_scc_pop_on():
 def sample_scc_multiple_positioning():
     return """Scenarist_SCC V1.0
 
-00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 927c 927c e5e6 e5e6 942c 942c 942f 942f
+00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 92fd 92fd e5e6 e5e6 942c 942c 942f 942f
 
 00:00:02:16 94ae 94ae 9420 9420 16f2 16f2 6768 6768 9752 9752 e9ea e9ea 97f2 97f2 6bec 6bec 942c 942c 942f 942f
 
diff --git a/tests/test_scc.py b/tests/test_scc.py
index 21e52dfd..fd8c7a1e 100644
--- a/tests/test_scc.py
+++ b/tests/test_scc.py
@@ -70,17 +70,16 @@ def test_positioning(self, sample_scc_multiple_positioning):
         expected_positioning = [
             ((10.0, UnitEnum.PERCENT), (77.0, UnitEnum.PERCENT)),
             ((40.0, UnitEnum.PERCENT), (5.0, UnitEnum.PERCENT)),
-            ((70.0, UnitEnum.PERCENT), (23.0, UnitEnum.PERCENT)),
-            ((20.0, UnitEnum.PERCENT), (47.0, UnitEnum.PERCENT)),
-            ((20.0, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
+            ((40.0, UnitEnum.PERCENT), (5.0, UnitEnum.PERCENT)),
+            ((40.0, UnitEnum.PERCENT), (5.0, UnitEnum.PERCENT)),
             ((40.0, UnitEnum.PERCENT), (53.0, UnitEnum.PERCENT)),
             ((70.0, UnitEnum.PERCENT), (17.0, UnitEnum.PERCENT)),
-            ((20.0, UnitEnum.PERCENT), (35.0, UnitEnum.PERCENT)),
-            ((20.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
+            ((70.0, UnitEnum.PERCENT), (17.0, UnitEnum.PERCENT)),
+            ((70.0, UnitEnum.PERCENT), (17.0, UnitEnum.PERCENT)),
             ((70.0, UnitEnum.PERCENT), (11.0, UnitEnum.PERCENT)),
-            ((40.0, UnitEnum.PERCENT), (41.0, UnitEnum.PERCENT)),
-            ((20.0, UnitEnum.PERCENT), (71.0, UnitEnum.PERCENT)),
+            ((70.0, UnitEnum.PERCENT), (11.0, UnitEnum.PERCENT))
         ]
+
         actual_positioning = [
             caption_.layout_info.origin.serialized()
             for caption_ in captions.get_captions("en-US")
@@ -267,11 +266,11 @@ def test_line_too_long(self, sample_scc_with_line_too_long):
             SCCReader().read(sample_scc_with_line_too_long)
 
         assert exc_info.value.args[0].startswith(
-            "Cursor goes over column 32 for caption cue in scc file."
+            "32 character limit for caption cue in scc file."
         )
         str_to_check = (
             "was Cal l l l l l l l l l l l l l l l l l l l l l l l l l l l l "
-            "Denison, a friend - Length 84"
+            "Denison, a friend - Length 81"
         )
         assert "around 00:00:05.900" in exc_info.value.args[0].split("\n")[2]
         assert str_to_check in exc_info.value.args[0].split("\n")[2]
@@ -578,68 +577,6 @@ def test_italics_commands_are_formatted_properly(self):
         assert result[13].is_explicit_break()
         assert result[14].is_text_node()
 
-    @staticmethod
-    def check_cursor_placement(node_creator):
-        node_creator.interpret_command("91df")  # row 01, column 28,
-        assert node_creator._cursor_position == 28
-        # check that it keeps the column of the last pac
-        # before text
-        node_creator.interpret_command("9152")  # row 01, column 04
-        assert node_creator._cursor_position == 4
-        node_creator.add_chars("foo")
-        assert node_creator._cursor_position == 7  # 4 + len(foo)
-        # check on break
-        node_creator.interpret_command("91f2")  # row 02, column 04
-        # it should be 4
-        assert node_creator._cursor_position == 4
-        # check mid-row codes with open italics
-        node_creator.interpret_command("91ae")
-        node_creator.add_chars("bar")
-        # writing "foo" then breaks to row 2, col 4 then writes "bar"
-        # starts at (2, 4) and because it follows a break it should not add space
-        assert node_creator._cursor_position == 7
-        # check mid-row codes with closed italics
-        node_creator.add_chars("baz")
-        node_creator.interpret_command("9120")
-        node_creator.add_chars("spam")
-        # should be 7 + len("baz") + one space + len(spam) = 15
-        assert node_creator._cursor_position == 15
-        node_creator.add_chars("aaa ")
-        node_creator.interpret_command("91ae")
-        node_creator.add_chars("bbb")
-        # 15 + 4 + 3 = 22
-        assert node_creator._cursor_position == 22
-        # break again
-        node_creator.interpret_command("9254")  # row 03, column 08
-        assert node_creator._cursor_position == 8
-        node_creator.interpret_command("91ae")
-        node_creator.add_chars("ccc")
-        # should start at 8, not adding space as it has a break in front
-        # should be 8 + len(ccc) = 11
-        assert node_creator._cursor_position == 11
-
-    def test_cursor_placement(self):
-        #  pop-on caption
-        node_creator = InstructionNodeCreator(
-            position_tracker=(DefaultProvidingPositionTracker())
-        )
-        node_creator.interpret_command("9420")
-        self.check_cursor_placement(node_creator=node_creator)
-
-        #  roll-on caption
-        node_creator = InstructionNodeCreator(
-            position_tracker=(DefaultProvidingPositionTracker())
-        )
-        node_creator.interpret_command("9425")
-        self.check_cursor_placement(node_creator=node_creator)
-
-        #  paint caption
-        node_creator = InstructionNodeCreator(
-            position_tracker=(DefaultProvidingPositionTracker())
-        )
-        node_creator.interpret_command("9429")
-        self.check_cursor_placement(node_creator=node_creator)
-
     @staticmethod
     def check_closing_italics_closing_on_style_change(node_creator):
         node_creator.interpret_command("9140")  # row 01, col 0 plaintext