# onmt.tconf

##################################################################################################
# Packages used
##################################################################################################

# sockeye: git checkout of github.com/mjpost/sockeye at HEAD, so the revision is not pinned.
# The body is empty: no build commands are run after checkout.
package sockeye :: .versioner=git .repo="https://github.com/mjpost/sockeye" .ref=HEAD { }
# sacrebleu: handled by the pip versioner, pinned to version 1.2.12.
# The body is empty: no extra build commands are run.
package sacrebleu :: .versioner=pip .package="sacrebleu" .tag="1.2.12" { }
# subword_nmt: handled by the pip versioner (pip package name "subword-nmt"), pinned to version 0.3.5.
# The body is empty: no extra build commands are run.
package subword_nmt :: .versioner=pip .package="subword-nmt" .tag="0.3.5" { }
# mosesdecoder: git checkout of github.com/moses-smt/mosesdecoder at HEAD, so the revision is not pinned.
# The body is empty: nothing is compiled after checkout.
package mosesdecoder :: .versioner=git .repo="https://github.com/moses-smt/mosesdecoder" .ref=HEAD { }

# sentencepiece: git checkout pinned to tag v0.1.5 (v0.1.6 throws a segfault),
# configured with CMake and compiled with make inside a separate build/ directory.
package sentencepiece :: .versioner=git .repo="https://github.com/google/sentencepiece" .ref="tags/v0.1.5" {  # v0.1.6 throws segfault
  # -p keeps this step re-runnable: plain `mkdir` fails if build/ already exists
  # (e.g. when a previously failed build is retried in the same checkout).
  mkdir -p build
  cd build
  cmake ..
  # Compile in parallel, one job per available processing unit.
  make -j "$(nproc)"
}

# tools: git checkout of github.com/shuoyangd/tape4nmt-tools at HEAD, so the revision is not pinned.
# After checkout, the Python dependencies listed in the repository's requirements.txt are installed.
# NOTE(review): this uses whichever `pip` is first on PATH when the package is built --
# confirm that the intended Python environment is active at that point.
package tools
    :: .versioner=git .repo="https://github.com/shuoyangd/tape4nmt-tools" .ref=HEAD {
  pip install -r requirements.txt
}

# onmt: git checkout of the sebastianGehrmann fork of OpenNMT-py at the ref "states_in_translation".
# The body is empty: no install or build commands are run after checkout.
package onmt :: .versioner=git .repo="https://github.com/sebastianGehrmann/OpenNMT-py.git" .ref=states_in_translation { }

# Global variables shared by the workflow: data locations, training and test
# settings, grid-job flags, and preprocessing options. Values written as
# (BranchPoint: ...) are ducttape branch points; the first branch listed is the
# baseline branch.
global {

  ##################################################################################################
  # Data-related settings
  ##################################################################################################

  # Source and target language codes, one branch per training data source.
  SRC=(TrainDataSource:
    iwslt_deen_2014="de"
  )
  TRG=(TrainDataSource:
    iwslt_deen_2014="en"
  )
  trg_lang=en  # FIXME: hard-coded rather than derived from TRG (only used by wrap_xml, in some rare cases)

  # Training corpus files, per data source and per side (src/trg).
  # The "/path/to/iwslt" prefixes here and below are placeholders; replace them with the real data location.
  train_data=(TrainDataSource:
    iwslt_deen_2014=(side:
      src="/path/to/iwslt/train.tags.nourl.de-en.de"
      trg="/path/to/iwslt/train.tags.nourl.de-en.en"
    )
  )

  # Development sets (XML files), per dev data source and per side.
  dev_data=(DevDataSource:
    iwslt_deen_dev2010=(side:
      src="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.en.xml"
    )
    iwslt_deen_dev2012=(side:
      src="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.en.xml"
    )
  )

  # Test sets (XML files), per test data source and per side.
  test_data=(TestDataSource:
    iwslt_deen_test2010=(side:
      src="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.en.xml"
    )
    iwslt_deen_test2011=(side:
      src="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.en.xml"
    )
    iwslt_deen_test2012=(side:
      src="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.en.xml"
    )
  )

  ##################################################################################################
  # General options you should set for your environment
  ##################################################################################################

  # All ducttape files will be written underneath this directory
  ducttape_output="out"

  # TRAINING CONFIGURATIONS
  # All defaults are consistent with nematus.
  # An empty string means the option is left unset.
  # NOTE(review): how empty values are handled depends on the training task, which is not in this file -- confirm.
  train_train_from="" # if there is a previous model to start with
  train_train_from_state_dict="" # if there is a previous dict to start with
  train_start_epoch="" # if trained for a certain number of epochs previously

  train_layers="2"
  train_rnn_size="1024"
  train_word_vec_size="300"
  # train_brnn="true" # deprecated!
  train_batch_size="80"
  train_epochs="20"
  train_optim="adadelta"
  train_dropout="0.2" # implementation may not be consistent with nematus
  train_learning_rate="1.0"
  train_pre_word_vecs_enc=""
  train_pre_word_vecs_dec=""
  train_pre_word_vecs_enc_features_prefix=""
  train_enc_fix=""
  train_encoder_type="brnn"

  # TEST CONFIGURATIONS
  # NOTE(review): the set of valid values for test_model_selection_strategy is defined by the
  # task that consumes it, which is not visible in this file.
  test_model_selection_strategy="acc"
  test_max_sent_length="150"
  test_beam_size="12"
  test_batch_size="1"
  test_replace_unk="true"

  ##################################################################################################
  # Job parameters
  ##################################################################################################

  # SGE: generic job flags
  resource_flags="-l mem_free=2g"

  # SGE: larger job flags
  resource_flags_16g="-l mem_free=16g"

  # SGE: flags for training a model
  resource_flags_train="-q g.q -l gpu=1,mem_free=4g"

  # SGE: flags for decoding
  resource_flags_decode="-q g.q -l gpu=1,mem_free=4g"

  # SGE: flags for notifying about job completion (replace YOUR_EMAIL_HERE with your email address!)
  action_flags="-m ae -M YOUR_EMAIL_HERE"

  # The default submitter: shell (run locally) or sge (run on a grid).
  # Selected by the TestMode branch point: "sge" for TestMode=no, "shell" for TestMode=yes.
  submitter=(TestMode: no="sge" yes="shell")

  # Virtual env location. This variable supports both conda and Python's virtualenv and names the
  # environment that should be loaded before tasks. For conda, use "conda:ENV" as the value, where
  # "ENV" is the name of the conda environment that should be loaded. For virtualenv, supply the
  # path to the script that should be loaded.
  # NOTE(review): the environments are named "sockeye"/"sockeye-cpu" although this config also
  # declares an onmt package -- confirm these are the intended environments.
  pyenv=(TestMode: no="conda:sockeye" yes="conda:sockeye-cpu")

  ##################################################################################################
  # Preprocessing options
  ##################################################################################################

  # sentencepiece options (vocabulary size is a branch point: 8000 or 49500)
  sentencepiece_vocab_size=(SentencePieceOps: 8000 49500)
  sentencepiece_model_type="unigram"

  # Number of BPE merge operations (branch point: 49500 or 9500)
  bpe_operations=(BpeMergeOps: 49500 9500)

  # Options for cleaning training data.
  # NOTE(review): presumably MaxLen is a maximum sentence length and Ratio a source/target length
  # ratio -- confirm against the cleaning task that consumes them.
  MaxLen=80
  Ratio=1

  # flags for moses tokenizer
  tokenizer_flags="-no-escape -a -q"

  # Follows the TestMode branch point: "no" for TestMode=no, "yes" for TestMode=yes.
  use_cpu=(TestMode: no yes)
}