vrt-tools/hrt-udpipe

#! /usr/bin/env python3
# -*- mode: Python; -*-

'''Use UDPipe to parse near-plaintext within markup above sentence.

'''

import sys

from itertools import groupby
from queue import Empty as EmptyQueue
from subprocess import Popen, PIPE
from threading import Thread

from vrtargslib import trans_args, trans_main
from vrtdatalib import binescape, binasrecord
from vrtnamelib import binmakenames
from hrtlib import tokenize

from outsidelib import UDPIPE, UDPIPEMODEL

def parsearguments():
    '''Build the command-line parser for this tool and parse the
    command line.

    The base parser comes from trans_args; the only option added
    here is --model, which names the UDPipe language model. In the
    returned namespace inplace is forced to False, backup is set to
    None, and prog is the program name as known to the parser.

    '''

    description = '''

    Segment text data between any (non-sentence) meta into sentences
    and tokens, tag and parse, using UDPipe with one of the UD2
    models. WIP! Does not cope with UD m.n tokens.

    '''

    # Models that can be selected; add more to this list as needed.
    models = [
        'finnish-tdt',
        'finnish-ftb',
        'swedish-talbanken',
        'english-ewt',
        'english-gum',
    ]

    parser = trans_args(description = description, inplace = False)
    parser.add_argument('--model',
                        choices = models,
                        default = 'finnish-tdt',
                        help = '''

                        UDPipe language model (default finnish-tdt)

                        ''')

    # Options to set the field names would belong here; there are
    # none so far.

    args = parser.parse_args()

    # This tool never works in place, whatever the parser provides.
    args.inplace, args.backup = False, None
    args.prog = parser.prog
    return args

def main(args, inf, ouf):
    '''Start UDPipe as a subprocess and let tokenize drive it.

    UDPipe is run with the model named by args.model, with all three
    standard streams as pipes. A separate thread relays its stderr.
    The input stream, the process, a combine function made by
    combiner, and the output stream are handed to tokenize (from
    hrtlib), which does the actual work. The stdin of the process is
    closed when tokenize returns or raises.

    '''

    # --output formats (among others):
    # conllu: id, word, ..., misc (SpacesAfter, SpaceAfter)
    # vertical: word only
    # horizontal: input with normalized spacing
    # plaintext: reconstructed input with original spacing
    # matxin: whatever it is, it _hangs_ (TO INVESTIGATE)
    # (still not investigated and udpipe version updated)
    command = [
        UDPIPE, '--immediate',
        '--tokenize', '--tag', '--parse',
        '--output=conllu',
        UDPIPEMODEL.format(args.model),
    ]

    proc = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE)

    # proc.stderr is a pipe, so something has to keep reading it
    # (rather redundant for this particular tool)
    errwatcher = Thread(target = watcherr, args = [args, proc])
    errwatcher.start()

    try:
        tokenize(inf, proc, combiner(args), ouf)
    finally:
        proc.stdin.close()

def combiner(args):
    '''Return a combine function that carries args along, in the
    four-argument form that is passed on to tokenize in main.

    '''

    def combine(proc, meta, sent, ouf):
        '''Run implement_combine on the process output and the meta
        queue, writing to the output stream. Nothing is propagated
        from here: an empty meta queue, or any other exception, is
        only reported on stderr. Open question from the original
        author: should proc.stdout be closed on an exception, when
        proc might not even be there any more?

        '''

        try:
            implement_combine(args, proc, meta, sent, ouf)
        except EmptyQueue:
            # meta was asked for but nothing was queued
            message = '{}: combine thread: empty queue'.format(args.prog)
            print(message, file = sys.stderr)
        except Exception as err:
            label = '{}: combine thread:'.format(args.prog)
            print(label, err, file = sys.stderr)

    return combine

def implement_combine(args, proc, meta, sent, ouf):
    '''Interleave the parsed sentences from proc.stdout with the
    queued meta and write the result to ouf.

    A line of field names is written first. The process output is
    then read as blocks, a block being a maximal run of lines that
    are not all whitespace. A block where every line is either a
    comment (starts with #) or contains the sentinel sent marks a
    place for meta: the next item is taken from the meta queue,
    without waiting, and shipped. Any other block is shipped as a
    sentence. After the output is exhausted, one more item of meta is
    taken and shipped.

    The meta queue is presumably a queue.Queue; the exception from
    get_nowait on an empty queue is not handled here.

    '''

    ouf.write(binmakenames(b'id word lemma upos xpos feat head rel aux misc'))

    def marks_meta(line):
        # Hope that UDPipe considers sentinel a sentence of its own
        # but it may still come with comments
        return line.startswith(b'#') or sent in line

    for blank, run in groupby(proc.stdout, bytes.isspace):
        if blank:
            continue

        # the run must be reified to be examined and then shipped
        block = tuple(run)

        if all(map(marks_meta, block)):
            shipmeta(meta.get_nowait(), ouf)
            meta.task_done()
        else:
            shipdata(block, ouf)

    # the final meta but is this a bit racy?
    shipmeta(meta.get_nowait(), ouf)
    meta.task_done()

def shipmeta(lines, ouf):
    '''Write each of the given meta lines to ouf, unchanged and in
    order.

    '''
    emit = ouf.write
    for metaline in lines:
        emit(metaline)

def shipdata(lines, ouf):
    '''Write one parsed sentence to ouf, between sentence tags.

    The lines are the output lines of UDPipe for one sentence, as
    bytes. Comment lines (starting with #) are dropped. Every other
    line must split into exactly ten fields with binasrecord, or
    else unpacking the record raises ValueError.

    A line with a range id (like 1-2) opens a multitoken element
    that carries the id, misc, and word of that line as attributes.
    The element is closed after the token whose id is the end of
    the range. Any other line is written as a token line of ten
    tab-separated fields, with word and lemma escaped by binescape.

    In the word attribute of a multitoken, double quotes are also
    written as &quot; because the attribute value is delimited with
    double quotes.

    '''
    ouf.write(b'<sentence>\n')
    end = b'0' # aka none: no token has this id
    for line in lines:
        if line.startswith(b'#'): continue
        [
            jd, word, lemma, upos, xpos,
            feat, head, rel, aux, misc
        ] = binasrecord(line)
        if b'-' in jd:
            # 1-2  Ellei  _
            # 1  Ell  _
            # 2  ei  _
            # ...
            # 6-7  miksei  _
            # ...
            # (with finnish-ftb-ud)

            # binescape is assumed to leave double quotes alone (the
            # original code had a TODO to that effect), so they are
            # handled here, after any & have already been escaped;
            # this is a no-op if binescape does handle them
            attr = binescape(word).replace(b'"', b'&quot;')
            ouf.write(b''.join((b'<multitoken id="', jd,
                                b'" misc="', misc,
                                b'" word="', attr,
                                b'">\n')))

            # remember where to close the element
            _, end = jd.split(b'-')
        else:
            ouf.write(b'\t'.join((jd,
                                  binescape(word),
                                  binescape(lemma),
                                  upos, xpos, feat,
                                  head, rel, aux, misc)))
            ouf.write(b'\n')
            if jd == end: ouf.write(b'</multitoken>\n')

    ouf.write(b'</sentence>\n')

def watcherr(args, proc):
    '''Copy each line from proc.stderr to the binary stderr of this
    program, with the prefix "stderr: " written before the line.
    Returns when proc.stderr is exhausted. The args are not used.

    '''
    for errline in proc.stderr:
        sink = sys.stderr.buffer
        sink.write(b'stderr: ')
        sink.write(errline)

if __name__ == '__main__':
    # The functions in this file write bytes to the output stream,
    # so neither stream is requested as text.
    trans_main(
        parsearguments(),
        main,
        in_as_text = False,
        out_as_text = False,
    )