#! /usr/bin/env python3
# -*- mode: Python; -*-

"""
vrt-fix-sentbreaks

Fix spurious sentence breaks added to VRT input by the UDPipe Finnish
tokenizer.

The logic of the code is taken from Jussi Piitulainen's unbreak.py for
the CEAL corpus, either directly or slightly modified.
"""
import re
import sys

from itertools import groupby, chain

import vrtdatalib
import vrtnamelib

from vrtargsoolib import InputProcessor


class SentenceBreakFixer(InputProcessor):

    """Fix spurious sentence breaks added by the UDPipe Finnish tokenizer."""

    _default_endpunct = '.!?:'
    """End-of-sentence punctuation marks"""

    _default_dashes = '-–—'
    """Dash characters (hyphen, en dash, em dash)"""

    _default_quotes = '"”“\'’‘«»'
    """Quotation marks: straight, typographic and guillemets"""

    DESCRIPTION = """
    Fix spurious sentence breaks in VRT input added by the UDPipe
    Finnish tokenizer.
    """
    ARGSPECS = [
        ('--comments',
         'output XML comments for removed breaks'),
        ('--verbose',
         'log output to stderr'),
        ('--word|w = name -> word_attr',
         'use positional attribute name as the word-form attribute;'
         ' alternatively, name may be a one-based integer denoting the index'
         ' of the word-form attribute, or a combination of the two, separated'
         ' by a "|" (the latter is used if the former is not found)',
         # The default contains a literal "|", so it cannot be specified
         # on the argspec line itself.
         dict(default="word|1")),
        ('--max-tokens=num :int =200',
         'do not remove a sentence break if the combined sentence would have'
         ' more than num tokens (0 = unlimited)'),
        ('--end-punctuation = chars -> endpunct',
         'consider chars as possible sentence-ending punctuation marks',
         dict(default=_default_endpunct)),
        ('--dashes = chars',
         'consider chars as dashes',
         dict(default=_default_dashes)),
        ('--quotes = chars',
         'consider chars as quotation marks',
         dict(default=_default_quotes)),
    ]
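    # For example, the default "word|1" means: use the positional
    # attribute named "word" if the input declares one, and otherwise
    # fall back to the first positional attribute.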

    def __init__(self):
        super().__init__()

    def main(self, args, inf, ouf):
        LESS_THAN = '<'.encode()[0]
        endpunct = args.endpunct
        dashes = args.dashes
        quotes = args.quotes
        max_token_count = args.max_tokens
        word_attrnum = 0
        sent_token_count = 0
        this_token_count = 0
        # Parts for different types of nobreak labels and comments
        comment_parts = {
            # (label prefix, label suffix, comment prefix, comment suffix)
            'noskip': ('', '', 'Removed ', ''),
            'skip': ('SKIP-', '', 'Skipped removing ', ' (long sentence)'),
            'sure': ('', '', '', ''),
            'unsure': ('', '-ALARM', '', ' (unsure)')
        }
        nobreak_labels = {}
        nobreak_comments = {}
        for skip in ['noskip', 'skip']:
            for sure in ['sure', 'unsure']:
                nobreak_labels[(skip, sure)] = (
                    comment_parts[skip][0] + comment_parts[sure][0]
                    + 'NOBREAK'
                    + comment_parts[sure][1] + comment_parts[skip][1])
                nobreak_comments[(skip, sure)] = (
                    nobreak_labels[(skip, sure)] + ': '
                    + comment_parts[skip][2] + comment_parts[sure][2]
                    + 'sentence break'
                    + comment_parts[sure][3] + comment_parts[skip][3])
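        # The loops expand to four label/comment pairs; for example,
        # ('noskip', 'sure') yields
        #     NOBREAK: Removed sentence break
        # and ('skip', 'unsure') yields
        #     SKIP-NOBREAK-ALARM: Skipped removing sentence break
        #     (unsure) (long sentence)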

        def isbreak(line):
            return (line[0] == LESS_THAN
                    and (line.startswith(b'<sentence')
                         or line.startswith(b'</sentence')))

        def get_wordform(line):
            return vrtdatalib.binlineref(line, word_attrnum).decode()

        def isword(line):
            # mainly all-alphabetic word forms, but also hyphenated
            # ones (alpha-alpha) and forms with 's (alpha's), both of
            # which occurred in the data (alpha-alpha's also matches,
            # if any occur)
            return re.fullmatch(R"\w+(-\w+)?('s)?", get_wordform(line))

        def isdash(line):
            return get_wordform(line) in dashes

        def isquote(line):
            return get_wordform(line) in quotes

        def isendpunct(line):
            wf = get_wordform(line)
            # The word form contains an end-of-sentence punctuation
            # mark and it is _not_ an upper-case letter followed by a
            # full stop, since "A." typically does not end a sentence.
            return (any((c in wf) for c in endpunct)
                    and not (len(wf) == 2 and wf[1] == '.' and wf[0].isupper()))
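
        # isendpunct is true e.g. for '.', '!?', 'etc.' and 'A.B.', but
        # false for the initial 'A.' and for forms with no end punctuation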

        def istitle(line):
            return get_wordform(line).istitle()

        def log(before, kind, after):
            # Log to stderr a few lines of context on each side of the
            # break, with the break kind label in between and "##"
            # marking the start or end of the data
            if args.verbose:
                print('#',
                      *chain((w.decode().rstrip('\n')
                              for w in ([b'##'] + before)[-4:]),
                             [kind],
                             (w.decode().rstrip('\n')
                              for w in (after + [b'##'])[:6])),
                      file=sys.stderr)

        def comment(text):
            if args.comments:
                ouf.write(b'<!-- ' + text.encode() + b' -->\n')

        def count_tokens(lines):
            # Token lines are the ones not beginning with "<"
            return sum(int(line[0] != LESS_THAN) for line in lines)

        def output_removed_break(broken, before, this, sure='sure'):
            # Output information for a removed break (possible comment
            # and log lines), or if the combined sentence would be
            # longer than the maximum length, output the break.
            nonlocal sent_token_count
            # comment('token count: ' + str(sent_token_count))
            if (max_token_count > 0
                    and sent_token_count + this_token_count > max_token_count):
                comment(nobreak_comments[('skip', sure)])
                output_lines(broken)
                log(before, nobreak_labels[('skip', sure)], this)
                sent_token_count = 0
            else:
                comment(nobreak_comments[('noskip', sure)])
                log(before, nobreak_labels[('noskip', sure)], this)

        def output_lines(lines):
            for line in lines:
                ouf.write(line)

        before = ['<outside>']
        broken = []
        before_first_sent = True
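        # groupby(inf, isbreak) alternates between groups of sentence
        # start and end tags (kind is true) and groups of all other
        # lines (kind is false); two or more consecutive sentence tags,
        # typically '</sentence>' immediately followed by
        # '<sentence ...>', form a candidate spurious break between the
        # surrounding token groups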
        for kind, group in groupby(inf, isbreak):
            if kind:
                broken = list(group)
                continue
            this = list(group)
            this_token_count = count_tokens(this)
            # comment('this token count: ' + str(this_token_count))
            if before_first_sent:
                # Find out which positional attribute is the word form,
                # based on args.word_attr and the possible positional-
                # attributes comment before the first sentence
                word_attrnum = vrtnamelib.extract_numnameindex(
                    this, args.word_attr)
                before_first_sent = False
            if len(broken) == 0:
                # before the first sentence
                pass
            elif len(broken) == 1:
                # either just '<sentence ...>' or just '</sentence>' at
                # a larger boundary, so not relevant: ship the break
                output_lines(broken)
                sent_token_count = 0
            elif isword(before[-1]) and isword(this[0]):
                # omit the sentence break for sure - grep out the
                # (NO)BREAK comments to get the final valid output
                output_removed_break(broken, before, this, 'sure')
            elif any(isendpunct(w) for w in before[-2:]):
                # appropriate punctuation before the break, so ship the
                # break without logging (no, there is no certainty of
                # anything ever, but one has to draw the line somewhere
                # - nothing to grep)
                output_lines(broken)
                sent_token_count = 0
            elif ((len(before) > 2 and
                   isdash(before[-2]) and isquote(before[-1]) and
                   istitle(this[0]))
                  or
                  (len(this) > 2 and
                   isdash(this[0]) and isquote(this[1]) and
                   istitle(this[2]))):
                # one side is a dash, a quote and something more, and
                # what follows the dash and quote is capitalized - keep
                # the break with ALARM
                comment(
                    'BREAK-ALARM: Kept sentence break near a dash and quote')
                output_lines(broken)
                log(before, 'BREAK-ALARM', this)
                sent_token_count = 0
            else:
                # omit the sentence break with ALARM - grep out the
                # (NO)BREAK comments to get the final valid output
                output_removed_break(broken, before, this, 'unsure')
            output_lines(this)
            sent_token_count += this_token_count
            # comment('token count: ' + str(sent_token_count))
            before = this


if __name__ == '__main__':
    SentenceBreakFixer().run()