Update README, website, and PARADE reproduction docs (#103)
Update documentation, make PARADE use transformer aggregation by default, and bump to v0.2.5.
andrewyates authored Oct 20, 2020
1 parent ddc1080 commit 9aa1f70
Showing 10 changed files with 50 additions and 23 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -3,21 +3,23 @@
[![Documentation Status](https://readthedocs.org/projects/capreolus/badge/?version=latest)](https://capreolus.readthedocs.io/?badge=latest)
[![PyPI version fury.io](https://badge.fury.io/py/capreolus.svg)](https://pypi.python.org/pypi/capreolus/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)
[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/capreolus-ir/capreolus.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/capreolus-ir/capreolus/context:python)

# Capreolus
[![Capreolus](https://people.mpi-inf.mpg.de/~ayates/capreolus/capreolus-100px.png)](https://capreolus.ai) <br/>
Capreolus is a toolkit for conducting end-to-end ad hoc retrieval experiments. Capreolus provides fine control over the entire experimental pipeline through the use of interchangeable and configurable modules.

[Get started with a Notebook](https://colab.research.google.com/drive/161FnmLt3PgIXG-Z5eNg45z2iSZucVAnr?usp=sharing) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/161FnmLt3PgIXG-Z5eNg45z2iSZucVAnr?usp=sharing)

[Read the documentation for a detailed overview.](http://capreolus.ai/)

## Quick Start
1. Prerequisites: Python 3.6+ and Java 11
1. Prerequisites: Python 3.7+ and Java 11. See the [installation instructions](https://capreolus.ai/en/latest/installation.html)
2. Install the pip package: `pip install capreolus`
3. Train a model: `capreolus rerank.traineval with reranker.name=KNRM reranker.trainer.niters=2`
4. If the `train` command completed successfully, you've trained your first Capreolus reranker on robust04! This command created several outputs, such as run files, a loss plot, and a ranking metric plot on the dev set queries. To learn about these files, [read about running experiments with Capreolus](http://capreolus.ai/en/latest/cli.html).
5. To learn about different configuration options, try: `capreolus rerank.print_config with reranker.name=KNRM`
5. To learn about different modules you can use, such as `reranker.name=DRMM`, try: `capreolus modules`
3. Train a model: `capreolus rerank.traineval with benchmark.name=nf reranker.name=KNRM reranker.trainer.niters=2`
4. If the `train` command completed successfully, you've trained your first Capreolus reranker on [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/)! This command created several outputs, such as model checkpoints and TREC-format run files. To learn about these files, [read about running experiments with Capreolus](http://capreolus.ai/en/latest/cli.html).
5. To learn about different configuration options, try: `capreolus rerank.print_config with benchmark.name=nf reranker.name=KNRM`
6. To learn about different modules you can use, such as `reranker.name=DRMM`, try: `capreolus modules`
7. Learn about [running experiments via the Python API](https://capreolus.ai/en/latest/quick.html)

## Environment Variables
Capreolus uses environment variables to indicate where outputs should be stored and where document inputs can be found. Consult the table below to determine which variables should be set. Set them either on the fly before running Capreolus (`export CAPREOLUS_RESULTS=...`) or by editing your shell's initialization files (e.g., `~/.bashrc` or `~/.zshrc`).
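For instance, the variables can be exported for the current session or persisted in a shell init file. In the sketch below, the paths are placeholders and `CAPREOLUS_CACHE` is included only as an assumed second example of the naming pattern; the table is authoritative.

```bash
# set for the current shell session only (paths are placeholders)
export CAPREOLUS_RESULTS=/data/capreolus/results
export CAPREOLUS_CACHE=/data/capreolus/cache    # assumed variable name; check the table below

# or persist the setting across sessions
echo 'export CAPREOLUS_RESULTS=/data/capreolus/results' >> ~/.bashrc
```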
2 changes: 1 addition & 1 deletion capreolus/__init__.py
@@ -5,7 +5,7 @@
from profane import ConfigOption, Dependency, ModuleBase, constants, config_list_to_dict, module_registry


__version__ = "0.2.4.1"
__version__ = "0.2.5"


### set constants used by capreolus and profane ###
6 changes: 3 additions & 3 deletions capreolus/extractor/embedtext.py
@@ -24,10 +24,10 @@ class EmbedText(Extractor):
Dependency(key="tokenizer", module="tokenizer", name="anserini"),
]
config_spec = [
ConfigOption("embeddings", "glove6b"),
ConfigOption("embeddings", "glove6b", "embeddings to use: fasttext, glove6b, glove6b.50d, or w2vnews"),
ConfigOption("calcidf", True),
ConfigOption("maxqlen", 4),
ConfigOption("maxdoclen", 800),
ConfigOption("maxqlen", 4, "maximum query length (shorter will be truncated)"),
ConfigOption("maxdoclen", 800, "maximum doc length (shorter will be truncated)"),
]

pad_tok = "<pad>"
8 changes: 4 additions & 4 deletions capreolus/extractor/slowembedtext.py
@@ -26,11 +26,11 @@ class SlowEmbedText(Extractor):
Dependency(key="tokenizer", module="tokenizer", name="anserini"),
]
config_spec = [
ConfigOption("embeddings", "glove6b"),
ConfigOption("zerounk", False),
ConfigOption("embeddings", "glove6b", "embeddings to use: fasttext, glove6b, glove6b.50d, or w2vnews"),
ConfigOption("zerounk", False, "use all zeros for unknown terms (True) or generate a random embedding (False)"),
ConfigOption("calcidf", True),
ConfigOption("maxqlen", 4),
ConfigOption("maxdoclen", 800),
ConfigOption("maxqlen", 4, "maximum query length (shorter will be truncated)"),
ConfigOption("maxdoclen", 800, "maximum doc length (shorter will be truncated)"),
ConfigOption("usecache", False),
]

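As a usage sketch, the extractor options documented above can be overridden on the command line like any other config option. The `reranker.extractor.*` path assumes the active reranker depends on one of these extractors, and the KNRM/NFCorpus choices simply mirror the Quick Start command from the README.

```bash
# illustrative override of extractor settings
# (assumes the reranker's extractor resolves to EmbedText or SlowEmbedText)
capreolus rerank.traineval with benchmark.name=nf reranker.name=KNRM \
    reranker.extractor.embeddings=w2vnews reranker.extractor.maxdoclen=400
```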
2 changes: 1 addition & 1 deletion capreolus/reranker/parade.py
@@ -116,7 +116,7 @@ class TFParade(Reranker):
ConfigOption(
"pretrained", "bert-base-uncased", "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
),
ConfigOption("aggregation", "maxp"),
ConfigOption("aggregation", "transformer"),
]

def build_model(self):
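Because this commit flips the default aggregation to `transformer`, the previous `maxp` behavior now has to be requested explicitly. A minimal sketch using the CLI conventions from the README (all other options keep their defaults):

```bash
# run PARADE with the pre-0.2.5 default aggregation
capreolus rerank.traineval with reranker.name=parade reranker.aggregation=maxp
```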
10 changes: 10 additions & 0 deletions docs/index.rst
@@ -2,6 +2,8 @@ Capreolus
=========================================
Capreolus is a toolkit for constructing flexible *ad hoc retrieval pipelines*. Capreolus pipelines can be run via a Python or command line interface.

Want to jump in? `Get started with a Notebook. <https://colab.research.google.com/drive/161FnmLt3PgIXG-Z5eNg45z2iSZucVAnr?usp=sharing>`_ |Colab Badge|

Capreolus is organized around the idea of interchangeable and configurable *modules*, such as a neural ``Reranker`` or a first-stage ``Searcher``. Researchers can implement new module classes, such as a new neural ``Reranker``, to experiment with a new module while controlling for all other variables in the pipeline (e.g., the first-stage ranking method and its parameters, the folds used for cross-validation, the tokenization and embeddings used with the reranker where applicable, and neural training options like the number of iterations, batch size, and loss function).

Since Capreolus v0.2, *pipelines* are instances of the ``Task`` module and can be combined like any other module.
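For example, a new ``Reranker`` follows the same pattern as the built-in modules touched by this commit: a ``dependencies`` list, a ``config_spec``, and a ``build_model`` method. The sketch below is illustrative only; the ``capreolus.reranker`` import path, the ``module_name`` attribute, and the dependency names are assumptions rather than a verbatim template.

```python
from capreolus import ConfigOption, Dependency
from capreolus.reranker import Reranker  # assumed import path for the Reranker base class


class MyReranker(Reranker):
    """Minimal sketch of a custom neural reranker module."""

    module_name = "myreranker"  # assumed: how the module would be selected via reranker.name=myreranker
    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),  # names are assumptions
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("hiddendim", 128, "hidden layer size for the scoring head"),
    ]

    def build_model(self):
        # construct and return the underlying neural model here
        raise NotImplementedError
```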
@@ -27,3 +29,11 @@ Looking for the code? `Find Capreolus on GitHub. <https://github.com/capreolus-i

Looking for the previous "search-then-rerank" pipeline that was presented in the WSDM'20 demo paper?
Check out Capreolus v0.1 and `the corresponding documentation. <https://capreolus.ai/en/v0.1.4/>`_

.. |Colab Badge| image:: https://colab.research.google.com/assets/colab-badge.svg
:alt: Open in Colab
:scale: 100%
:target: https://colab.research.google.com/drive/161FnmLt3PgIXG-Z5eNg45z2iSZucVAnr?usp=sharing



3 changes: 3 additions & 0 deletions docs/quick.md
@@ -1,5 +1,8 @@
# Getting Started

Want to jump in? [Get started with a Notebook.](https://colab.research.google.com/drive/161FnmLt3PgIXG-Z5eNg45z2iSZucVAnr?usp=sharing) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/161FnmLt3PgIXG-Z5eNg45z2iSZucVAnr?usp=sharing)

## Prerequisites
- Requirements: Python 3.7+, a Python environment you can install packages in (e.g., a [Conda environment](https://gist.github.com/andrewyates/970c570411c4a36785f6c0e9362eb1eb)), and Java 11. See the [detailed installation instructions](installation) for help with these.
- Install: `pip install capreolus` (a minimal environment setup is sketched below)
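A minimal environment setup might look like the following; the environment name, Python minor version, and the conda-forge JDK package are illustrative choices rather than requirements.

```bash
# create and activate a fresh environment (name and Python version are illustrative)
conda create -n capreolus python=3.7
conda activate capreolus

# one way to obtain Java 11 (a system-wide JDK works as well)
conda install -c conda-forge openjdk=11

# install Capreolus
pip install capreolus
```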

18 changes: 14 additions & 4 deletions docs/reproduction/PARADE.md
@@ -16,15 +16,25 @@ This section contains instructions for installing Capreolus. **Do not** install
3. Briefly read about [configuring Capreolus](https://capreolus.ai/en/latest/installation.html#configuring-capreolus). The main thing to note is that results will be stored in `~/.capreolus` by default.

## Running PARADE (reduced memory usage)
This section describes how to run PARADE on a GPU with 16GB RAM. This is substantially less than used in the [paper](https://arxiv.org/abs/2008.09093), so we'll reduce the batch size and the size of each passage to make the data to fit.
This section describes how to run PARADE on a GPU with 16GB RAM. This is substantially less than used in the [paper](https://arxiv.org/abs/2008.09093), so we'll train on a single fold and change many hyperparameters to make this run smoothly. However, this won't reach the same effectiveness as the full PARADE model (see instructions below).

1. Make sure you have an available GPU and are in the top-level `capreolus` directory.
2. Train and evaluate PARADE on a single fold: `python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade_small.txt fold=s1`
3. This command takes about 3.5 hours on a Titan Xp GPU. Once it finishes, metrics on the dev and test sets are shown:
> 2020-09-01 15:45:10,053 - INFO - capreolus.task.rerank.evaluate - rerank: fold=s1 dev metrics: P_1=0.750 P_10=0.500 P_20=0.443 P_5=0.554 judged_10=0.992 judged_20=0.989 judged_200=0.947 map=0.267 ndcg_cut_10=0.533 ndcg_cut_20=0.513 ndcg_cut_5=0.562 recall_100=0.453 recall_1000=0.453 recip_rank=0.817
> 2020-10-20 12:39:37,265 - INFO - capreolus.task.rerank.evaluate - rerank: fold=s1 dev metrics: P_1=0.688 P_10=0.529 P_20=0.428 P_5=0.596 judged_10=0.998 judged_20=0.995 judged_200=0.947 map=0.271 ndcg_cut_10=0.545 ndcg_cut_20=0.504 ndcg_cut_5=0.577 recall_100=0.453 recall_1000=0.453 recip_rank=0.787
> 2020-09-01 15:45:10,095 - INFO - capreolus.task.rerank.evaluate - rerank: fold=s1 test metrics: P_1=0.596 P_10=0.487 P_20=0.419 P_5=0.549 judged_10=0.989 judged_20=0.985 judged_200=0.931 map=0.285 ndcg_cut_10=0.491 ndcg_cut_20=0.486 ndcg_cut_5=0.518 recall_100=0.490 recall_1000=0.490 recip_rank=0.727
> 2020-10-20 12:39:37,343 - INFO - capreolus.task.rerank.evaluate - rerank: fold=s1 test metrics: P_1=0.532 P_10=0.472 P_20=0.418 P_5=0.528 judged_10=0.989 judged_20=0.989 judged_200=0.931 map=0.285 ndcg_cut_10=0.470 ndcg_cut_20=0.471 ndcg_cut_5=0.485 recall_100=0.490 recall_1000=0.490 recip_rank=0.672
4. Compare your *fold=s1* results to those shown here. Do they match? If so, we can move on to reproducing the full PARADE model.

## Running PARADE (full model with normal memory usage)
TODO. This requires a 48GB GPU, a TPU, or porting PARADE to Pytorch so we can iterate over passages rather than loading all of them in memory at once (see issue #86). The corresponding config is in `docs/reproduction/config_parade.txt`.
This requires a 48GB GPU, a TPU, or porting PARADE to Pytorch so we can iterate over passages rather than loading all of them in memory at once (see issue #86). It has been tested on NVIDIA Quadro RTX 8000s and Google Cloud TPUs.

1. Make sure you have an available GPU and are in the top-level `capreolus` directory.
2. Train and evaluate PARADE on each of the five robust04 folds (splits *s1-s5*), either one command at a time as shown here or via the shell loop sketched after this list: <br/>
`python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade.txt fold=s1` <br/>
`python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade.txt fold=s2` <br/>
`python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade.txt fold=s3` <br/>
`python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade.txt fold=s4` <br/>
`python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade.txt fold=s5`
3. Each command takes a long time: approximately 36 hours on a Quadro RTX 8000 (much faster on a TPU). As above, per-fold metrics are displayed after each fold completes.
4. When the final fold completes, cross-validated metrics are also displayed.
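The five per-fold commands above can equivalently be run as a small shell loop (a sketch; the fold names follow the commands listed in step 2):

```bash
# train and evaluate the full PARADE configuration on every robust04 fold
for fold in s1 s2 s3 s4 s5; do
    python -m capreolus.run rerank.traineval with file=docs/reproduction/config_parade.txt fold=$fold
done
```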
1 change: 1 addition & 0 deletions docs/reproduction/config_parade.txt
@@ -6,6 +6,7 @@ rank.searcher.name=bm25staticrob04yang19

reranker.name=parade
reranker.aggregation=transformer
reranker.pretrained=electra-base-msmarco

reranker.extractor.usecache=True
reranker.extractor.maxseqlen=256
9 changes: 5 additions & 4 deletions docs/reproduction/config_parade_small.txt
@@ -6,15 +6,16 @@ rank.searcher.name=bm25staticrob04yang19

reranker.name=parade
reranker.aggregation=transformer
reranker.pretrained=electra-base-msmarco

reranker.extractor.usecache=True
reranker.extractor.maxseqlen=150
reranker.extractor.maxseqlen=125
reranker.extractor.numpassages=12
reranker.extractor.passagelen=125
reranker.extractor.stride=25
reranker.extractor.passagelen=100
reranker.extractor.stride=75
reranker.extractor.prob=0.1

reranker.trainer.niters=36
reranker.trainer.niters=4
reranker.trainer.itersize=256
reranker.trainer.validatefreq=2
reranker.trainer.batch=2