vrt-tools/vrt-dehype

#! /usr/bin/env python3
# -*- mode: Python; -*-

# This script attempts to remove line-breaking hyphens (and page numbers)
# in plaintext within VRT markup. The input plaintext itself is not yet
# in the VRT form, only the markup around it is assumed to be.

# An important structural assumption is that tag lines start with < and
# other lines do not start with <. The intended mechanism to achieve that
# is to have < as &lt; in content lines.

# Do not use re.IGNORECASE while we have Python 3 at 3.4 in Taito.

import argparse, os, re, sys
from html import unescape, escape
from itertools import groupby

import signal
# Restore the default signal dispositions so that an interrupt from
# the keyboard or a downstream reader closing the pipe terminates the
# script instead of surfacing as a Python exception with a traceback.
signal.signal(signal.SIGINT, signal.SIG_DFL)
if hasattr(signal, 'SIGPIPE'):
    # SIGPIPE is not defined on every platform; referring to it
    # unconditionally would raise AttributeError at start-up there.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Line-breaking hyphen: two letters, a hyphen immediately followed by
# a newline, and two more letters. The match object has two groups of
# two letters each, one from either side of the hyphen-newline
# combination (which itself is not captured); the match is then
# passed to hyper or debug_hyper to build the replacement.
# Letters are ASCII a-z and A-Z plus ä, ö, Ä, Ö. Because of
# re.VERBOSE the spaces and line breaks in the pattern are layout
# only; the newline in the text is matched by the explicit \n.
hyphen = re.compile(R'''
    ([a-zäöA-ZÄÖ]{2})  -\n
    ([a-zäöA-ZÄÖ]{2})
''', re.VERBOSE)

# Page number between lines. The match object has three groups:
#   1. two letters and an optional sentence-internal punctuation
#      mark (a comma or a hyphen),
#   2. a run of digits, the possible page number, separated from
#      both neighbours by one or more newlines,
#   3. another two letters.
# The match is then passed to number or debug_number, which build
# the replacement from groups 1 and 3 only.
# Because of re.VERBOSE the spaces and line breaks in the pattern
# are layout only.
page = re.compile(R'''
    ([a-zäöA-ZÄÖ]{2} [,\-]?)  \n+
    (\d+)  \n+
    ([a-zäöA-ZÄÖ]{2})
''', re.VERBOSE)

# Vowel letters in both cases. hyper and debug_hyper keep the hyphen
# when the same one of these characters both ends the left part and
# begins the right part of a hyphen match.
vowels = 'aeiouyäöAEIOUYÄÖ'

def hyper(hype):
    '''Return the replacement text for a line-breaking hyphen match.

    hype is a match object whose groups 1 and 2 are the two letters
    before and after the hyphen-newline combination. The newline is
    always dropped. The hyphen is kept only when the last letter
    before it and the first letter after it are the same character
    and that character is in vowels; otherwise the two parts are
    joined directly.
    '''
    before, after = hype.group(1, 2)
    # same vowel on both sides of the break: the hyphen stays
    if before[1] == after[0] and after[0] in vowels:
        return before + '-' + after
    return before + after

def debug_hyper(hype):
    '''Return a visibly marked replacement for a hyphen match.

    Makes the same decision as hyper but wraps the two letter pairs
    as HYP(..)(..) when the hyphen is removed, and as HYP(..)-(..)
    when it is kept because the same vowel (a character in vowels)
    ends the left pair and begins the right pair.
    '''
    before, after = hype.group(1, 2)
    # same vowel on both sides of the break: the hyphen stays
    if before[1] == after[0] and after[0] in vowels:
        return 'HYP({})-({})'.format(before, after)
    return 'HYP({})({})'.format(before, after)

def number(numb):
    '''Return the replacement text for a page-number match.

    numb is a match object whose group 1 is the text before the page
    number (two letters and an optional punctuation mark) and whose
    group 3 is the two letters after it. Group 2, the page number
    itself, is dropped, and the surrounding newlines are replaced by
    a single newline between the two remaining parts.
    '''
    # Only groups 1 and 3 are needed; the page number is not bound to
    # a local so that nothing here shadows the module-level page regex.
    return '{}\n{}'.format(numb.group(1), numb.group(3))

def debug_number(numb):
    '''Return a visibly marked replacement for a page-number match.

    Builds the same text as number, the contents of groups 1 and 3 of
    the match object numb separated by a single newline, but wrapped
    as NUM(..) so that the removal can be spotted in the output.
    Group 2, the page number itself, is dropped.
    '''
    # Only groups 1 and 3 are needed; the page number is not bound to
    # a local so that nothing here shadows the module-level page regex.
    return 'NUM({}\n{})'.format(numb.group(1), numb.group(3))

def dehyphenate(source, target, *, denum = False, debug = False):
    '''Copy source to target, removing line-breaking hyphens.

    source is an iterable of lines and target a writable text
    stream. Consecutive lines that start with < are treated as
    markup and are copied unchanged. Each run of other lines is
    joined into one string and rewritten before being written out.

    denum -- if true, first replace matches of page (page numbers
             between lines) using number
    debug -- if true, use debug_number and debug_hyper instead of
             number and hyper, so that removals are marked
    '''
    def is_tag(line):
        return line.startswith('<')

    for tagged, lines in groupby(source, key = is_tag):
        if tagged:
            # markup passes through untouched
            for line in lines:
                target.write(line)
            continue

        # a hyphen and its continuation are on different lines, so
        # the whole run of content lines is processed as one string
        text = ''.join(lines)
        if denum:
            # page numbers go first: removing one can bring a
            # line-final hyphen next to its continuation line
            text = page.sub(debug_number if debug else number, text)
        text = hyphen.sub(debug_hyper if debug else hyper, text)
        target.write(text)

def main():
    '''Parse the command line and run dehyphenate accordingly.

    Input comes from the optional FILE argument (default stdin) and
    output goes to the --out file (default stdout); files given by
    name are opened as UTF-8. The --denum and --debug options are
    passed on to dehyphenate. With --version a version indicator is
    printed and the script exits with status 0 before any input is
    processed.
    '''
    parser = argparse.ArgumentParser(description = '''
   Attempts to remove hyphens at line ends, and optionally
   page numbers between lines, in plaintext blocks within VRT
   markup. This may or may not be an improvement over the
   input. Errors of omission and of commission are to be
   expected. The question is how many they are.''')
    parser.add_argument('arg', metavar = 'FILE', nargs = '?',
                        type = argparse.FileType('r', encoding = 'UTF-8'),
                        default = sys.stdin,
                        help = 'input file (default stdin)')
    parser.add_argument('--out', '-o', metavar = 'outfile',
                        type = argparse.FileType('w', encoding = 'UTF-8'),
                        default = sys.stdout,
                        help = 'output file (default stdout)')
    parser.add_argument('--denum', action = 'store_true',
                        help = 'first attempt to remove some page numbers'
                        ' that occur within a sentence (even within a word)')
    parser.add_argument('--debug', action = 'store_true',
                        help = 'indicate removals with HYP(..)(..)'
                        ' and NUM(..\\n..)')
    parser.add_argument('--version', action = 'store_true',
                        help = 'print a  version indicator and exit')
    args = parser.parse_args()

    if args.version:
        print('vrt-dehype 0.1a (FIN-CLARIN 2018)')
        # sys.exit rather than the bare exit(): the latter is installed
        # by the site module for interactive use, is not guaranteed to
        # exist, and closes sys.stdin before exiting.
        sys.exit(0)

    # Both streams are closed when the block is left.
    with args.arg as source, args.out as target:
        dehyphenate(source, target, debug = args.debug, denum = args.denum)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()