
Commit cbabef6

Add en_ewt config example, upgrade AllenNLP to 0.9.0
1 parent deb8e07 commit cbabef6

File tree

- README.md
- config/ud/en/udify_bert_finetune_en_ewt.json
- requirements.txt
- udify/models/dependency_decoder.py
- udify/models/udify_model.py
- udify/modules/bert_pretrained.py

6 files changed: +171 -12 lines changed

README.md

+27-2

````diff
@@ -48,6 +48,25 @@ be saved under `logs/multilingual`. Note that this process is highly memory inte
 12+ GB of GPU memory (requirements are half if fp16 is enabled in AllenNLP, but this [requires custom changes to the library](https://github.com/allenai/allennlp/issues/2149)).
 The training may take 20 or more days to complete all 80 epochs depending on the type of your GPU.
 
+### Training on Other Datasets
+
+An example config is given for fine-tuning on just English EWT. Just run:
+
+```bash
+python train.py --config config/ud/en/udify_bert_finetune_en_ewt.json --name en_ewt
+```
+
+To train on your own dataset, copy `config/ud/multilingual/udify_bert_finetune_multilingual.json` and modify the following
+JSON parameters:
+
+- `train_data_path`, `validation_data_path`, and `test_data_path` to the paths of the dataset conllu files. These can
+  optionally be `null`.
+- `directory_path` to `data/vocab/<dataset_name>/vocabulary`.
+- `warmup_steps` and `start_step` to be equal to the number of steps in the first epoch. A good initial value is in the
+  range `100-1000`. Alternatively, run the training script first to see the number of steps to the right of the progress
+  bar.
+- If using just one treebank, optionally add `xpos` to the `tasks` list.
+
 ### Viewing Model Performance
 
 One can view how well the models are performing by running TensorBoard
@@ -110,9 +129,15 @@ python train.py --config config/sigmorphon/multilingual/udify_bert_sigmorphon_mu
 
 1. When fine-tuning, my scores/metrics show poor performance.
 
-    It should take about 10 epochs to start seeing good scores coming from all the metrics, and 80 epochs to be competitive with UDPipe Future.
+    It should take about 10 epochs to start seeing good scores coming from all the metrics, and 80 epochs to be competitive
+    with UDPipe Future.
 
-    One caveat is that if you use a subset of treebanks for fine-tuning instead of all 124 UD v2.3 treebanks, *you must modify the configuration file*. Make sure to tune the learning rate scheduler to the number of training steps. Copy the [`udify_bert_finetune_multilingual.json`](https://github.com/Hyperparticle/udify/blob/master/config/ud/multilingual/udify_bert_finetune_multilingual.json) config and modify the `"warmup_steps"` and `"start_step"` values. A good initial choice would be to set both to be equal to the number of training batches of one epoch ( run the training script first to see the batches remaining).
+    One caveat is that if you use a subset of treebanks for fine-tuning instead of all 124 UD v2.3 treebanks,
+    *you must modify the configuration file*. Make sure to tune the learning rate scheduler to the number of
+    training steps. Copy the [`udify_bert_finetune_multilingual.json`](https://github.com/Hyperparticle/udify/blob/master/config/ud/multilingual/udify_bert_finetune_multilingual.json)
+    config and modify the `"warmup_steps"` and `"start_step"` values. A good initial choice would be to set both to be
+    equal to the number of training batches of one epoch (run the training script first to see the batches remaining, to
+    the right of the progress bar).
 
 ## Cite This Paper
 
````
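
The new "Training on Other Datasets" section and the updated FAQ give the same rule of thumb: set `warmup_steps` and `start_step` to roughly the number of training batches in one epoch. A minimal sketch of that arithmetic (the sentence count is the approximate size of the UD English-EWT train split and is only illustrative; the iterator's `maximum_samples_per_batch` cap can lower the real batch count, so the number shown next to the training progress bar remains the authoritative value):

```python
import math

# Rough estimate of warmup_steps / start_step for a treebank.
num_train_sentences = 12_543   # approximate size of the en_ewt-ud-train.conllu split
batch_size = 32                # matches the iterator setting in the config below

steps_per_epoch = math.ceil(num_train_sentences / batch_size)
print(steps_per_epoch)         # ~392, the warmup_steps / start_step value used for en_ewt
```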

config/ud/en/udify_bert_finetune_en_ewt.json

+130-0

New file (130 lines):

```json
{
  "dataset_reader": {
    "lazy": false,
    "token_indexers": {
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": true
      },
      "bert": {
        "type": "udify-bert-pretrained",
        "pretrained_model": "config/archive/bert-base-multilingual-cased/vocab.txt",
        "do_lowercase": false,
        "use_starting_offsets": true
      }
    }
  },
  "train_data_path": "data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-train.conllu",
  "validation_data_path": "data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-dev.conllu",
  "test_data_path": "data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-test.conllu",
  "vocabulary": {
    "directory_path": "data/vocab/en_ewt/vocabulary"
  },
  "model": {
    "word_dropout": 0.2,
    "mix_embedding": 12,
    "layer_dropout": 0.1,
    "tasks": ["upos", "feats", "lemmas", "deps"],
    "text_field_embedder": {
      "type": "udify_embedder",
      "dropout": 0.5,
      "allow_unmatched_keys": true,
      "embedder_to_indexer_map": {
        "bert": ["bert", "bert-offsets"]
      },
      "token_embedders": {
        "bert": {
          "type": "udify-bert-pretrained",
          "pretrained_model": "bert-base-multilingual-cased",
          "requires_grad": true,
          "dropout": 0.15,
          "layer_dropout": 0.1,
          "combine_layers": "all"
        }
      }
    },
    "encoder": {
      "type": "pass_through",
      "input_dim": 768
    },
    "decoders": {
      "upos": {
        "encoder": {
          "type": "pass_through",
          "input_dim": 768
        }
      },
      "feats": {
        "encoder": {
          "type": "pass_through",
          "input_dim": 768
        },
        "adaptive": true
      },
      "lemmas": {
        "encoder": {
          "type": "pass_through",
          "input_dim": 768
        },
        "adaptive": true
      },
      "deps": {
        "tag_representation_dim": 256,
        "arc_representation_dim": 768,
        "encoder": {
          "type": "pass_through",
          "input_dim": 768
        }
      }
    }
  },
  "iterator": {
    "batch_size": 32,
    "maximum_samples_per_batch": ["num_tokens", 32 * 100]
  },
  "trainer": {
    "num_epochs": 80,
    "patience": 80,
    "num_serialized_models_to_keep": 1,
    "should_log_learning_rate": true,
    "summary_interval": 100,
    "optimizer": {
      "type": "bert_adam",
      "b1": 0.9,
      "b2": 0.99,
      "weight_decay": 0.01,
      "lr": 1e-3,
      "parameter_groups": [
        [["^text_field_embedder.*.bert_model.embeddings",
          "^text_field_embedder.*.bert_model.encoder"], {}],
        [["^text_field_embedder.*._scalar_mix",
          "^text_field_embedder.*.pooler",
          "^scalar_mix",
          "^decoders",
          "^shared_encoder"], {}]
      ]
    },
    "learning_rate_scheduler": {
      "type": "ulmfit_sqrt",
      "model_size": 1,
      "warmup_steps": 392,
      "start_step": 392,
      "factor": 5.0,
      "gradual_unfreezing": true,
      "discriminative_fine_tuning": true,
      "decay_factor": 0.04
    }
  },
  "udify_replace": [
    "dataset_reader.token_indexers",
    "model.text_field_embedder",
    "model.encoder",
    "model.decoders.xpos",
    "model.decoders.deps.encoder",
    "model.decoders.upos.encoder",
    "model.decoders.feats.encoder",
    "model.decoders.lemmas.encoder",
    "trainer.learning_rate_scheduler",
    "trainer.optimizer"
  ]
}
```
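
Before committing to an 80-epoch run with a copied config, it can help to confirm that the file parses and that the data paths resolve. A minimal sketch, assuming AllenNLP 0.9.0's `Params.from_file` (which evaluates Jsonnet expressions such as the `32 * 100` in the iterator section, unlike plain `json.load`):

```python
import os

from allennlp.common import Params

# Parse the config the same way AllenNLP does.
params = Params.from_file("config/ud/en/udify_bert_finetune_en_ewt.json")

# The three dataset paths may legitimately be null; otherwise they should exist on disk.
for key in ("train_data_path", "validation_data_path", "test_data_path"):
    path = params.get(key)
    status = "null" if path is None else ("ok" if os.path.exists(path) else "MISSING")
    print(f"{key}: {path} [{status}]")
```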

requirements.txt

+1-1

```diff
@@ -1,4 +1,4 @@
-allennlp==0.8.5
+allennlp==0.9.0
 tensorflow
 pandas
 jupyter
```

udify/models/dependency_decoder.py

+1-1

```diff
@@ -331,7 +331,7 @@ def _greedy_decode(self,
         attended_arcs = attended_arcs + torch.diag(attended_arcs.new(mask.size(1)).fill_(-numpy.inf))
         # Mask padded tokens, because we only want to consider actual words as heads.
         if mask is not None:
-            minus_mask = (1 - mask).byte().unsqueeze(2)
+            minus_mask = (1 - mask).bool().unsqueeze(2)
             attended_arcs.masked_fill_(minus_mask, -numpy.inf)
 
         # Compute the heads greedily.
```
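
This `.byte()` → `.bool()` switch, like the `torch.uint8` → `torch.bool` change in `udify/models/udify_model.py` below, appears to track the newer PyTorch versions pulled in alongside AllenNLP 0.9.0, where masking operations such as `masked_fill_` expect boolean masks and deprecate uint8 ones. A standalone sketch of the masking pattern, with illustrative shapes and values (not the UDify code itself):

```python
import torch

# Arc scores for a batch of 2 sentences with max length 4: (batch, length, length).
attended_arcs = torch.randn(2, 4, 4)
# 1 marks a real token, 0 marks padding.
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])

# Boolean mask of padded positions; (1 - mask).byte() would still run but
# triggers a deprecation warning on recent PyTorch releases.
minus_mask = (1 - mask).bool().unsqueeze(2)   # (batch, length, 1)

# Broadcasts over the last dimension, filling every score in a padded token's row.
attended_arcs.masked_fill_(minus_mask, float("-inf"))
```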

udify/models/udify_model.py

+2-2

```diff
@@ -185,9 +185,9 @@ def token_dropout(tokens: torch.LongTensor,
         device = tokens.device
 
         # This creates a mask that only considers unpadded tokens for mapping to oov
-        padding_mask = torch.ones(tokens.size(), dtype=torch.uint8).to(device)
+        padding_mask = torch.ones(tokens.size(), dtype=torch.bool).to(device)
         for pad in padding_tokens:
-            padding_mask &= tokens != pad
+            padding_mask &= (tokens != pad)
 
         # Create a uniformly random mask selecting either the original words or OOV tokens
         dropout_mask = (torch.empty(tokens.size()).uniform_() < p).to(device)
```

udify/modules/bert_pretrained.py

+10-6

```diff
@@ -253,12 +253,16 @@ def get_padding_lengths(self, token: int) -> Dict[str, int]:  # pylint: disable=
         return {}
 
     @overrides
-    def pad_token_sequence(self,
-                           tokens: Dict[str, List[int]],
-                           desired_num_tokens: Dict[str, int],
-                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
-        return {key: pad_sequence_to_length(val, desired_num_tokens[key])
-                for key, val in tokens.items()}
+    def as_padded_tensor(
+            self,
+            tokens: Dict[str, List[int]],
+            desired_num_tokens: Dict[str, int],
+            padding_lengths: Dict[str, int],
+    ) -> Dict[str, torch.Tensor]:
+        return {
+            key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
+            for key, val in tokens.items()
+        }
 
     @overrides
     def get_keys(self, index_name: str) -> List[str]:
```
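
The renamed method follows the AllenNLP 0.9.0 `TokenIndexer` interface, where `pad_token_sequence` (returning padded lists of ids) was replaced by `as_padded_tensor` (returning padded tensors). A standalone sketch of the same padding behaviour; the wordpiece ids and target lengths are made up for illustration:

```python
import torch
from allennlp.common.util import pad_sequence_to_length

# Per-key id lists as produced by the indexer, and the lengths to pad them to.
tokens = {"bert": [101, 7592, 2088, 102], "bert-offsets": [1, 2]}
desired_num_tokens = {"bert": 6, "bert-offsets": 4}

# Pad each list and wrap it in a LongTensor rather than returning plain lists.
padded = {
    key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
    for key, val in tokens.items()
}
print(padded["bert"])          # ids padded with zeros to length 6
print(padded["bert-offsets"])  # offsets padded with zeros to length 4
```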
