From 3d6878d7f0439f35bce727a74bbc49bc4cdf1908 Mon Sep 17 00:00:00 2001 From: OlteanuRares Date: Tue, 3 Sep 2024 16:02:54 +0300 Subject: [PATCH] revert on line length error instead of cursor placement --- docs/changelog.rst | 9 ++- pycaption/scc/__init__.py | 31 +++------- pycaption/scc/specialized_collections.py | 38 +++++------- pycaption/scc/state_machines.py | 14 ++--- setup.py | 2 +- tests/fixtures/dfxp.py | 42 ++++++------- tests/fixtures/scc.py | 2 +- tests/test_scc.py | 79 +++--------------------- 8 files changed, 68 insertions(+), 149 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index e6e0823c..400e9cdd 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,14 +1,19 @@ Changelog --------- -2.2.12 +2.2.13 ^^^^^^ - Mid-row codes only add spaces only if there isn't one before - Mid-row codes only add spaces only if affects the text in the same row (not adding if after previous text follows break or breaks) - Remove spaces to the end of the lines -- CaptionLineLengthError is now raised if the cursor goes after column 32 instead of string length over 32 - Change error message for the 32 character limit. - Close italics on receiving another style setting command. - Throw an CaptionReadNoCaptions error in case of empty input file are provided +- Properly add breaks (it was only for jumps to next row). Now it adds as many breaks as the difference between row numbers. +- Ignore repositioning commands which are not followed by any text before breaks. + +2.2.12 +^^^^^^ +- Pinned nltk to 3.8.0 2.2.11 ^^^^^^ diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py index 0d0a1de9..b28444a0 100644 --- a/pycaption/scc/__init__.py +++ b/pycaption/scc/__init__.py @@ -233,11 +233,14 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0): # check captions for incorrect lengths lines_too_long = defaultdict(list) - for caption in self.caption_stash._collection: - line_length = self.get_scc_line_line_length(caption.to_real_caption()) - if line_length: - lines_too_long.update(line_length) + caption_start = caption.to_real_caption().format_start() + caption_text = "".join(caption.to_real_caption().get_text_nodes()) + text_too_long = [line for line in caption_text.split("\n") if len(line) > 32] + if caption_start in lines_too_long: + lines_too_long[caption_start] = text_too_long + else: + lines_too_long[caption_start].extend(text_too_long) msg = "" if bool(lines_too_long.keys()): @@ -248,8 +251,8 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0): msg += line + f" - Length { len(line)}" + "\n" if len(msg): raise CaptionLineLengthError( - f"Cursor goes over column 32 for caption cue in scc file.\n" - f"Affected lines:\n" + f"32 character limit for caption cue in scc file.\n" + f"Lines longer than 32:\n" f"{msg}" ) @@ -513,22 +516,6 @@ def _pop_on(self, end=0): pop_on_cue = self.pop_ons_queue.pop() self.caption_stash.create_and_store(pop_on_cue.buffer, pop_on_cue.start, end) - @staticmethod - def get_scc_line_line_length(caption): - long_line = defaultdict(list) - caption_start = caption.format_start() - text_nodes = [node for node in caption.nodes if node.type_ == CaptionNode.TEXT] - if not text_nodes: - return None - start_writing_at = text_nodes[0].position[1] - caption_text = "".join(caption.get_text_nodes()) - if start_writing_at: - caption_text = " " * start_writing_at + caption_text - long_line[caption_start] = [ - line for line in caption_text.split("\n") if len(line) > 32 - ] - return long_line if len(long_line[caption_start]) else None - class SCCWriter(BaseWriter): def __init__(self, *args, **kw): diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py index 5fe79eef..85bad6ba 100644 --- a/pycaption/scc/specialized_collections.py +++ b/pycaption/scc/specialized_collections.py @@ -310,8 +310,6 @@ def __init__(self, collection=None, position_tracker=None): self._collection = collection self.last_style = None # can be italic on or italic off as we only support italics - self._cursor_position = 0 - self._position_tracer = position_tracker def is_empty(self): @@ -342,14 +340,17 @@ def add_chars(self, *chars): # handle a simple line break if self._position_tracer.is_linebreak_required(): - # must insert a line break here - self._collection.append( - _InstructionNode.create_break(position=current_position) - ) + for _ in range(self._position_tracer._breaks_required): + self._collection.append( + _InstructionNode.create_break(position=current_position) + ) + self._position_tracer.acknowledge_linebreak_consumed() node = _InstructionNode.create_text(current_position) self._collection.append(node) - self._cursor_position = self._position_tracer._last_column - self._position_tracer.acknowledge_linebreak_consumed() + if self._position_tracer.is_repositioning_required(): + # it means we have a reposition command which was not followed by + # any text, so we just ignore it and break + self._position_tracer.acknowledge_position_changed() # handle completely new positioning elif self._position_tracer.is_repositioning_required(): @@ -361,7 +362,6 @@ def add_chars(self, *chars): self._position_tracer.acknowledge_position_changed() node.add_chars(*chars) - self._cursor_position += len("".join(chars)) @staticmethod def get_style_for_command(command): @@ -385,8 +385,6 @@ def interpret_command(self, command): """ self._update_positioning(command) - text = COMMANDS.get(command, "") - if command == "94a1": self.handle_backspace("94a1") @@ -412,9 +410,10 @@ def interpret_command(self, command): # it should open italic tag # if break is required, break then add style tag if self._position_tracer.is_linebreak_required(): - self._collection.append( - _InstructionNode.create_break(position=current_position) - ) + for _ in range(self._position_tracer._breaks_required): + self._collection.append( + _InstructionNode.create_break(position=current_position) + ) self._position_tracer.acknowledge_linebreak_consumed() self._collection.append( _InstructionNode.create_italics_style(current_position) @@ -433,9 +432,10 @@ def interpret_command(self, command): ) self.last_style = "italics off" if self._position_tracer.is_linebreak_required(): - self._collection.append( - _InstructionNode.create_break(position=current_position) - ) + for _ in range(self._position_tracer._breaks_required): + self._collection.append( + _InstructionNode.create_break(position=current_position) + ) self._position_tracer.acknowledge_linebreak_consumed() # handle mid-row codes that follows a text node @@ -459,7 +459,6 @@ def interpret_command(self, command): # need to close italics tag, add a space # to the end of the previous text node prev_text_node.text = prev_text_node.text + " " - self._cursor_position += 1 def _update_positioning(self, command): """Sets the positioning information to use for the next nodes @@ -470,7 +469,6 @@ def _update_positioning(self, command): if command in PAC_TAB_OFFSET_COMMANDS: tab_offset = PAC_TAB_OFFSET_COMMANDS[command] positioning = (prev_positioning[0], prev_positioning[1] + tab_offset) - self._cursor_position += tab_offset else: first, second = command[:2], command[2:] try: @@ -479,7 +477,6 @@ def _update_positioning(self, command): except KeyError: # if not PAC or OFFSET we're not changing position return - self._cursor_position = positioning[1] self._position_tracer.update_positioning(positioning) def __iter__(self): @@ -533,7 +530,6 @@ def handle_backspace(self, word): # only if the previous character in not also extended if delete_previous_condition: node.text = node.text[:-1] - self._cursor_position -= 1 def get_previous_text_node(self): for node in self._collection[::-1]: diff --git a/pycaption/scc/state_machines.py b/pycaption/scc/state_machines.py index 04fc632b..fed5e508 100644 --- a/pycaption/scc/state_machines.py +++ b/pycaption/scc/state_machines.py @@ -11,7 +11,7 @@ def __init__(self, positioning=None): :type positioning: tuple[int] """ self._positions = [positioning] - self._break_required = False + self._breaks_required = 0 self._repositioning_required = False # Since the actual column is not applied when encountering a line break # this attribute is used to store it and determine by comparison if the @@ -35,18 +35,18 @@ def update_positioning(self, positioning): return row, col = current - if self._break_required: + if self._breaks_required: col = self._last_column new_row, new_col = positioning is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3 # One line below will be treated as line break, not repositioning - if new_row == row + 1: + if new_row > row: self._positions.append((new_row, col)) - self._break_required = True + self._breaks_required = new_row - row self._last_column = new_col # Tab offsets after line breaks will be ignored to avoid repositioning - elif self._break_required and is_tab_offset: + elif self._breaks_required and is_tab_offset: return else: # Reset the "current" position altogether. @@ -86,11 +86,11 @@ def is_linebreak_required(self): """If the current position is simply one line below the previous. :rtype: bool """ - return self._break_required + return self._breaks_required > 0 def acknowledge_linebreak_consumed(self): """Call to acknowledge that the line required was consumed""" - self._break_required = False + self._breaks_required = 0 class DefaultProvidingPositionTracker(_PositioningTracker): diff --git a/setup.py b/setup.py index dc03856f..3d4bd260 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ setup( name='pycaption', - version='2.2.12.dev5', + version='2.2.12.dev6', description='Closed caption converter', long_description=open(README_PATH).read(), author='Joe Norton', diff --git a/tests/fixtures/dfxp.py b/tests/fixtures/dfxp.py index 33fb6ef5..d0a052d0 100644 --- a/tests/fixtures/dfxp.py +++ b/tests/fixtures/dfxp.py @@ -914,16 +914,9 @@ def sample_dfxp_from_scc_output(): - - - - - - - - - - + + + @@ -932,40 +925,41 @@ def sample_dfxp_from_scc_output(): abab

- cdcd -

-

+ cdcd


efef

-

+

+



ghgh
ijij
klkl

-

+

+




mnmn

-

+

opop

-

+

qrqr

-

+

+


stst
uvuv
wxwx

-

+

+





yzyz

-

- 0101 -

-

+

+ 0101




2323

-

+

+




4545
6767
8989 diff --git a/tests/fixtures/scc.py b/tests/fixtures/scc.py index 96f8c2cc..e09a10d3 100644 --- a/tests/fixtures/scc.py +++ b/tests/fixtures/scc.py @@ -81,7 +81,7 @@ def sample_scc_pop_on(): def sample_scc_multiple_positioning(): return """Scenarist_SCC V1.0 -00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 927c 927c e5e6 e5e6 942c 942c 942f 942f +00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 92fd 92fd e5e6 e5e6 942c 942c 942f 942f 00:00:02:16 94ae 94ae 9420 9420 16f2 16f2 6768 6768 9752 9752 e9ea e9ea 97f2 97f2 6bec 6bec 942c 942c 942f 942f diff --git a/tests/test_scc.py b/tests/test_scc.py index 21e52dfd..fd8c7a1e 100644 --- a/tests/test_scc.py +++ b/tests/test_scc.py @@ -70,17 +70,16 @@ def test_positioning(self, sample_scc_multiple_positioning): expected_positioning = [ ((10.0, UnitEnum.PERCENT), (77.0, UnitEnum.PERCENT)), ((40.0, UnitEnum.PERCENT), (5.0, UnitEnum.PERCENT)), - ((70.0, UnitEnum.PERCENT), (23.0, UnitEnum.PERCENT)), - ((20.0, UnitEnum.PERCENT), (47.0, UnitEnum.PERCENT)), - ((20.0, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)), + ((40.0, UnitEnum.PERCENT), (5.0, UnitEnum.PERCENT)), + ((40.0, UnitEnum.PERCENT), (5.0, UnitEnum.PERCENT)), ((40.0, UnitEnum.PERCENT), (53.0, UnitEnum.PERCENT)), ((70.0, UnitEnum.PERCENT), (17.0, UnitEnum.PERCENT)), - ((20.0, UnitEnum.PERCENT), (35.0, UnitEnum.PERCENT)), - ((20.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)), + ((70.0, UnitEnum.PERCENT), (17.0, UnitEnum.PERCENT)), + ((70.0, UnitEnum.PERCENT), (17.0, UnitEnum.PERCENT)), ((70.0, UnitEnum.PERCENT), (11.0, UnitEnum.PERCENT)), - ((40.0, UnitEnum.PERCENT), (41.0, UnitEnum.PERCENT)), - ((20.0, UnitEnum.PERCENT), (71.0, UnitEnum.PERCENT)), + ((70.0, UnitEnum.PERCENT), (11.0, UnitEnum.PERCENT)) ] + actual_positioning = [ caption_.layout_info.origin.serialized() for caption_ in captions.get_captions("en-US") @@ -267,11 +266,11 @@ def test_line_too_long(self, sample_scc_with_line_too_long): SCCReader().read(sample_scc_with_line_too_long) assert exc_info.value.args[0].startswith( - "Cursor goes over column 32 for caption cue in scc file." + "32 character limit for caption cue in scc file." ) str_to_check = ( "was Cal l l l l l l l l l l l l l l l l l l l l l l l l l l l l " - "Denison, a friend - Length 84" + "Denison, a friend - Length 81" ) assert "around 00:00:05.900" in exc_info.value.args[0].split("\n")[2] assert str_to_check in exc_info.value.args[0].split("\n")[2] @@ -578,68 +577,6 @@ def test_italics_commands_are_formatted_properly(self): assert result[13].is_explicit_break() assert result[14].is_text_node() - @staticmethod - def check_cursor_placement(node_creator): - node_creator.interpret_command("91df") # row 01, column 28, - assert node_creator._cursor_position == 28 - # check that it keeps the column of the last pac - # before text - node_creator.interpret_command("9152") # row 01, column 04 - assert node_creator._cursor_position == 4 - node_creator.add_chars("foo") - assert node_creator._cursor_position == 7 # 4 + len(foo) - # check on break - node_creator.interpret_command("91f2") # row 02, column 04 - # it should be 4 - assert node_creator._cursor_position == 4 - # check mid-row codes with open italics - node_creator.interpret_command("91ae") - node_creator.add_chars("bar") - # writing "foo" then breaks to row 2, col 4 then writes "bar" - # starts at (2, 4) and because it follows a break it should not add space - assert node_creator._cursor_position == 7 - # check mid-row codes with closed italics - node_creator.add_chars("baz") - node_creator.interpret_command("9120") - node_creator.add_chars("spam") - # should be 7 + len("baz") + one space + len(spam) = 15 - assert node_creator._cursor_position == 15 - node_creator.add_chars("aaa ") - node_creator.interpret_command("91ae") - node_creator.add_chars("bbb") - # 15 + 4 + 3 = 22 - assert node_creator._cursor_position == 22 - # break again - node_creator.interpret_command("9254") # row 03, column 08 - assert node_creator._cursor_position == 8 - node_creator.interpret_command("91ae") - node_creator.add_chars("ccc") - # should start at 8, not adding space as it has a break in front - # should be 8 + len(ccc) = 11 - assert node_creator._cursor_position == 11 - - def test_cursor_placement(self): - # pop-on caption - node_creator = InstructionNodeCreator( - position_tracker=(DefaultProvidingPositionTracker()) - ) - node_creator.interpret_command("9420") - self.check_cursor_placement(node_creator=node_creator) - - # roll-on caption - node_creator = InstructionNodeCreator( - position_tracker=(DefaultProvidingPositionTracker()) - ) - node_creator.interpret_command("9425") - self.check_cursor_placement(node_creator=node_creator) - - # paint caption - node_creator = InstructionNodeCreator( - position_tracker=(DefaultProvidingPositionTracker()) - ) - node_creator.interpret_command("9429") - self.check_cursor_placement(node_creator=node_creator) - @staticmethod def check_closing_italics_closing_on_style_change(node_creator): node_creator.interpret_command("9140") # row 01, col 0 plaintext