From 03571ddd8f2f44de373a8cb08853ce2df6cab99d Mon Sep 17 00:00:00 2001
From: OlteanuRares
Date: Tue, 10 Sep 2024 10:03:54 +0300
Subject: [PATCH] revert handling jumping over multiple rows with
repositioning instead of breaks
---
docs/changelog.rst | 10 +-
pycaption/scc/__init__.py | 56 ++++---
pycaption/scc/specialized_collections.py | 64 +++++---
pycaption/scc/state_machines.py | 20 +--
setup.py | 2 +-
tests/fixtures/dfxp.py | 45 +++---
tests/test_scc.py | 193 ++++++++++-------------
tests/test_scc_conversion.py | 36 +++--
8 files changed, 225 insertions(+), 201 deletions(-)
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 400e9cdd..a058b2d3 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -2,14 +2,14 @@ Changelog
---------
2.2.13
^^^^^^
-- Mid-row codes only add spaces only if there isn't one before
-- Mid-row codes only add spaces only if affects the text in the same row (not adding if after previous text follows break or breaks)
-- Remove spaces to the end of the lines
-- Change error message for the 32 character limit.
+- Mid-row codes add a space only if there isn't one before.
+- Mid-row codes add spaces only if they affect the text in the same row (not added if the code follows a break or PACs).
+- Remove spaces to the end of the lines.
- Close italics on receiving another style setting command.
- Throw an CaptionReadNoCaptions error in case of empty input file are provided
-- Properly add breaks (it was only for jumps to next row). Now it adds as many breaks as the difference between row numbers.
- Ignore repositioning commands which are not followed by any text before breaks.
+- Mid-row codes will not add a space if it would be in front of punctuation.
+- Fix a bug with background codes when the InstructionNodeCreator collection is empty.
2.2.12
^^^^^^
diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
index b28444a0..398745ed 100644
--- a/pycaption/scc/__init__.py
+++ b/pycaption/scc/__init__.py
@@ -85,20 +85,35 @@
from copy import deepcopy
from pycaption.base import BaseReader, BaseWriter, CaptionNode, CaptionSet
-from pycaption.exceptions import (CaptionLineLengthError,
- CaptionReadNoCaptions,
- CaptionReadTimingError, InvalidInputError)
-
-from .constants import (CHARACTER_TO_CODE, CHARACTERS, COMMANDS,
- CUE_STARTING_COMMAND, EXTENDED_CHARS, HEADER,
- MICROSECONDS_PER_CODEWORD,
- PAC_BYTES_TO_POSITIONING_MAP, PAC_HIGH_BYTE_BY_ROW,
- PAC_LOW_BYTE_BY_ROW_RESTRICTED,
- PAC_TAB_OFFSET_COMMANDS, SPECIAL_CHARS,
- SPECIAL_OR_EXTENDED_CHAR_TO_CODE)
+from pycaption.exceptions import (
+ CaptionLineLengthError,
+ CaptionReadNoCaptions,
+ CaptionReadTimingError,
+ InvalidInputError,
+)
+
+from .constants import (
+ CHARACTER_TO_CODE,
+ CHARACTERS,
+ COMMANDS,
+ CUE_STARTING_COMMAND,
+ EXTENDED_CHARS,
+ HEADER,
+ MICROSECONDS_PER_CODEWORD,
+ PAC_BYTES_TO_POSITIONING_MAP,
+ PAC_HIGH_BYTE_BY_ROW,
+ PAC_LOW_BYTE_BY_ROW_RESTRICTED,
+ PAC_TAB_OFFSET_COMMANDS,
+ SPECIAL_CHARS,
+ SPECIAL_OR_EXTENDED_CHAR_TO_CODE,
+)
from .specialized_collections import CaptionCreator # noqa: F401
-from .specialized_collections import (InstructionNodeCreator, NotifyingDict,
- PopOnCue, TimingCorrectingCaptionList)
+from .specialized_collections import (
+ InstructionNodeCreator,
+ NotifyingDict,
+ PopOnCue,
+ TimingCorrectingCaptionList,
+)
from .state_machines import DefaultProvidingPositionTracker
@@ -236,7 +251,9 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
for caption in self.caption_stash._collection:
caption_start = caption.to_real_caption().format_start()
caption_text = "".join(caption.to_real_caption().get_text_nodes())
- text_too_long = [line for line in caption_text.split("\n") if len(line) > 32]
+ text_too_long = [
+ line for line in caption_text.split("\n") if len(line) > 32
+ ]
if caption_start in lines_too_long:
lines_too_long[caption_start] = text_too_long
else:
@@ -313,9 +330,10 @@ def _translate_line(self, line):
for idx, word in enumerate(word_list):
word = word.strip()
if len(word) == 4:
- self._translate_word(word=word)
+ next_command = word_list[idx + 1] if idx + 1 < len(word_list) else None
+ self._translate_word(word=word, next_command=next_command)
- def _translate_word(self, word):
+ def _translate_word(self, word, next_command=None):
if self._handle_double_command(word):
# count frames for timing
self.time_translator.increment_frames()
@@ -324,7 +342,7 @@ def _translate_word(self, word):
# TODO - check that all the positioning commands are here, or use
# some other strategy to determine if the word is a command.
if word in COMMANDS or _is_pac_command(word):
- self._translate_command(word=word)
+ self._translate_command(word=word, next_command=next_command)
# second, check if word is a special character
elif word in SPECIAL_CHARS:
@@ -396,7 +414,7 @@ def _translate_extended_char(self, word):
# add to buffer
self.buffer.add_chars(EXTENDED_CHARS[word])
- def _translate_command(self, word):
+ def _translate_command(self, word, next_command=None):
# if command is pop_up
if word == "9420":
self.buffer_dict.set_active("pop")
@@ -462,7 +480,7 @@ def _translate_command(self, word):
# If command is not one of the aforementioned, add it to buffer
else:
- self.buffer.interpret_command(command=word)
+ self.buffer.interpret_command(command=word, next_command=next_command)
def _translate_characters(self, word):
# split word into the 2 bytes
diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py
index 85bad6ba..a5e132a8 100644
--- a/pycaption/scc/specialized_collections.py
+++ b/pycaption/scc/specialized_collections.py
@@ -14,14 +14,14 @@
BACKGROUND_COLOR_CODES,
COMMANDS,
EXTENDED_CHARS,
+ ITALICS_COMMANDS,
MICROSECONDS_PER_CODEWORD,
MID_ROW_CODES,
PAC_BYTES_TO_POSITIONING_MAP,
PAC_TAB_OFFSET_COMMANDS,
- ITALICS_COMMANDS,
- UNDERLINE_COMMANDS,
PLAIN_TEXT_COMMANDS,
- STYLE_SETTING_COMMANDS
+ STYLE_SETTING_COMMANDS,
+ UNDERLINE_COMMANDS,
)
PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")
@@ -309,7 +309,9 @@ def __init__(self, collection=None, position_tracker=None):
else:
self._collection = collection
- self.last_style = None # can be italic on or italic off as we only support italics
+ self.last_style = (
+ None # can be italic on or italic off as we only support italics
+ )
self._position_tracer = position_tracker
def is_empty(self):
@@ -340,10 +342,9 @@ def add_chars(self, *chars):
# handle a simple line break
if self._position_tracer.is_linebreak_required():
- for _ in range(self._position_tracer._breaks_required):
- self._collection.append(
- _InstructionNode.create_break(position=current_position)
- )
+ self._collection.append(
+ _InstructionNode.create_break(position=current_position)
+ )
self._position_tracer.acknowledge_linebreak_consumed()
node = _InstructionNode.create_text(current_position)
self._collection.append(node)
@@ -374,7 +375,7 @@ def get_style_for_command(command):
# only remaining possibility is plain text
return "plaintext"
- def interpret_command(self, command):
+ def interpret_command(self, command, next_command=None):
"""Given a command determines whether to turn italics on or off,
or to set the positioning
@@ -382,6 +383,7 @@ def interpret_command(self, command):
:type command: str
or a PAC_TAB_OFFSET_COMMANDS
+ :type next_command: the command that follows next
"""
self._update_positioning(command)
@@ -394,7 +396,7 @@ def interpret_command(self, command):
# which will be deleted when the code is applied.
# ex: 2080 97ad 94a1
if (
- len(self._collection) > 1
+ len(self._collection) > 0
and self._collection[-1].is_text_node()
and self._collection[-1].text[-1].isspace()
):
@@ -410,10 +412,9 @@ def interpret_command(self, command):
# it should open italic tag
# if break is required, break then add style tag
if self._position_tracer.is_linebreak_required():
- for _ in range(self._position_tracer._breaks_required):
- self._collection.append(
- _InstructionNode.create_break(position=current_position)
- )
+ self._collection.append(
+ _InstructionNode.create_break(position=current_position)
+ )
self._position_tracer.acknowledge_linebreak_consumed()
self._collection.append(
_InstructionNode.create_italics_style(current_position)
@@ -432,23 +433,28 @@ def interpret_command(self, command):
)
self.last_style = "italics off"
if self._position_tracer.is_linebreak_required():
- for _ in range(self._position_tracer._breaks_required):
- self._collection.append(
- _InstructionNode.create_break(position=current_position)
- )
+ self._collection.append(
+ _InstructionNode.create_break(position=current_position)
+ )
self._position_tracer.acknowledge_linebreak_consumed()
# handle mid-row codes that follows a text node
+ # don't add space if the next command adds one of
+ # ['.', '!', '?', ',']
+ punctuation = ["ae", "a1", "bf", "2c"]
+ next_is_punctuation = next_command and next_command[:2] in punctuation
prev_text_node = self.get_previous_text_node()
prev_node_is_break = prev_text_node is not None and any(
- x.is_explicit_break() for x in self._collection[self._collection.index(prev_text_node):]
+ x.is_explicit_break()
+ for x in self._collection[self._collection.index(prev_text_node) :]
)
if (
- command in MID_ROW_CODES and
- prev_text_node and not
- prev_node_is_break and not
- prev_text_node.text[-1].isspace() and
- command not in PAC_TAB_OFFSET_COMMANDS
+ command in MID_ROW_CODES
+ and prev_text_node
+ and not prev_node_is_break
+ and not prev_text_node.text[-1].isspace()
+ and command not in PAC_TAB_OFFSET_COMMANDS
+ and not next_is_punctuation
):
if self.last_style == "italics off":
# need to open italics tag, add a space
@@ -465,8 +471,8 @@ def _update_positioning(self, command):
:type command: str
"""
- prev_positioning = self._position_tracer.default
if command in PAC_TAB_OFFSET_COMMANDS:
+ prev_positioning = self._position_tracer.default
tab_offset = PAC_TAB_OFFSET_COMMANDS[command]
positioning = (prev_positioning[0], prev_positioning[1] + tab_offset)
else:
@@ -751,6 +757,12 @@ def _format_italics(collection):
new_collection = _remove_noop_italics(new_collection)
# remove spaces to the end of the lines
+ new_collection = _remove_spaces_at_end_of_the_line(new_collection)
+
+ return new_collection
+
+
+def _remove_spaces_at_end_of_the_line(collection: list[_InstructionNode]):
for idx, node in enumerate(collection):
if (
idx > 0
@@ -762,7 +774,7 @@ def _format_italics(collection):
# handle last node
if collection[-1].is_text_node():
collection[-1].text = collection[-1].text.rstrip()
- return new_collection
+ return collection
def _remove_noop_on_off_italics(collection):
diff --git a/pycaption/scc/state_machines.py b/pycaption/scc/state_machines.py
index fed5e508..7353eff8 100644
--- a/pycaption/scc/state_machines.py
+++ b/pycaption/scc/state_machines.py
@@ -5,13 +5,14 @@ class _PositioningTracker:
"""Helps determine the positioning of a node, having kept track of
positioning-related commands.
"""
+
def __init__(self, positioning=None):
"""
:param positioning: positioning information (row, column)
:type positioning: tuple[int]
"""
self._positions = [positioning]
- self._breaks_required = 0
+ self._break_required = False
self._repositioning_required = False
# Since the actual column is not applied when encountering a line break
# this attribute is used to store it and determine by comparison if the
@@ -35,18 +36,18 @@ def update_positioning(self, positioning):
return
row, col = current
- if self._breaks_required:
+ if self._break_required:
col = self._last_column
new_row, new_col = positioning
is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3
# One line below will be treated as line break, not repositioning
- if new_row > row:
+ if new_row == row + 1:
self._positions.append((new_row, col))
- self._breaks_required = new_row - row
+ self._break_required = 1
self._last_column = new_col
# Tab offsets after line breaks will be ignored to avoid repositioning
- elif self._breaks_required and is_tab_offset:
+ elif self._break_required and is_tab_offset:
return
else:
# Reset the "current" position altogether.
@@ -64,9 +65,7 @@ def get_current_position(self):
:raise: CaptionReadSyntaxError
"""
if not any(self._positions):
- raise CaptionReadSyntaxError(
- 'No Preamble Address Code [PAC] was provided'
- )
+ raise CaptionReadSyntaxError("No Preamble Address Code [PAC] was provided")
else:
return self._positions[0]
@@ -86,17 +85,18 @@ def is_linebreak_required(self):
"""If the current position is simply one line below the previous.
:rtype: bool
"""
- return self._breaks_required > 0
+ return self._break_required
def acknowledge_linebreak_consumed(self):
"""Call to acknowledge that the line required was consumed"""
- self._breaks_required = 0
+ self._break_required = False
class DefaultProvidingPositionTracker(_PositioningTracker):
"""A _PositioningTracker that provides if needed a default value (14, 0), or
uses the last positioning value set anywhere in the document
"""
+
default = (14, 0)
def __init__(self, positioning=None, default=None):
diff --git a/setup.py b/setup.py
index 3d4bd260..a139f657 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@
setup(
name='pycaption',
- version='2.2.12.dev6',
+ version='2.2.12.dev7',
description='Closed caption converter',
long_description=open(README_PATH).read(),
author='Joe Norton',
diff --git a/tests/fixtures/dfxp.py b/tests/fixtures/dfxp.py
index d0a052d0..864f3e7a 100644
--- a/tests/fixtures/dfxp.py
+++ b/tests/fixtures/dfxp.py
@@ -729,6 +729,7 @@ def sample_dfxp_to_render_with_only_default_positioning_input():
@@ -925,41 +933,40 @@ def sample_dfxp_from_scc_output():
abab
- cdcd
+ cdcd
+
+
efef
-
-
+
ghgh
ijij
klkl
-
-
+
mnmn
-
+
opop
-
+
qrqr
-
-
+
stst
uvuv
wxwx
-
-
+
yzyz
-
- 0101
+
+ 0101
+
+
2323
-
-
+
4545
6767
8989
@@ -1519,4 +1526,4 @@ def sample_dfxp_default_styling_p_tags():