From f57c5764e37c9648782c92c55aa7032122bcc30e Mon Sep 17 00:00:00 2001
From: liaoxingyu
Date: Tue, 9 Mar 2021 20:07:28 +0800
Subject: [PATCH] support multi-node training

---
 GETTING_STARTED.md               | 20 ++++++++++++++++++++
 fastreid/engine/defaults.py      | 19 +++++++++++--------
 fastreid/engine/hooks.py         | 15 ++++++++-------
 fastreid/evaluation/evaluator.py | 14 +++++++++-----
 fastreid/evaluation/testing.py   | 18 ++++++++----------
 5 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index a47363157..172006a05 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -32,6 +32,26 @@ If you want to train model with 4 GPUs, you can run:
 python3 tools/train_net.py --config-file ./configs/Market1501/bagtricks_R50.yml --num-gpus 4
 ```
 
+If you want to train a model with multiple machines, you can run:
+
+```
+# machine 1
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 0 --dist-url tcp://ip:port
+
+# machine 2
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 1 --dist-url tcp://ip:port
+```
+
+Make sure the dataset path and the code are the same on all machines, and that the machines can communicate with each other.
+
 To evaluate a model's performance, use
 
 ```bash
diff --git a/fastreid/engine/defaults.py b/fastreid/engine/defaults.py
index 2950775e9..c16a9b612 100644
--- a/fastreid/engine/defaults.py
+++ b/fastreid/engine/defaults.py
@@ -467,15 +467,18 @@ def test(cls, cfg, model):
             results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP_ENABLED)
             results[dataset_name] = results_i
 
-        if comm.is_main_process():
-            assert isinstance(
-                results, dict
-            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
-                results
-            )
-            print_csv_format(results)
+            if comm.is_main_process():
+                assert isinstance(
+                    results, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                results_i['dataset'] = dataset_name
+                print_csv_format(results_i)
 
-        if len(results) == 1: results = list(results.values())[0]
+        if len(results) == 1:
+            results = list(results.values())[0]
 
         return results
diff --git a/fastreid/engine/hooks.py b/fastreid/engine/hooks.py
index 9226315b2..6a63770f6 100644
--- a/fastreid/engine/hooks.py
+++ b/fastreid/engine/hooks.py
@@ -360,19 +360,20 @@ def _do_eval(self):
                     )
             self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
 
-        # Remove extra memory cache of main process due to evaluation
-        torch.cuda.empty_cache()
+        # Evaluation may take different time among workers.
+        # A barrier makes them start the next iteration together.
+        comm.synchronize()
 
     def after_epoch(self):
         next_epoch = self.trainer.epoch + 1
-        is_final = next_epoch == self.trainer.max_epoch
-        if is_final or (self._period > 0 and next_epoch % self._period == 0):
+        if self._period > 0 and next_epoch % self._period == 0:
             self._do_eval()
-        # Evaluation may take different time among workers.
-        # A barrier make them start the next iteration together.
-        comm.synchronize()
 
     def after_train(self):
+        next_epoch = self.trainer.epoch + 1
+        # This condition is to prevent the eval from running after a failed training
+        if next_epoch % self._period != 0 and next_epoch >= self.trainer.max_epoch:
+            self._do_eval()
         # func is likely a closure that holds reference to the trainer
         # therefore we clean it to avoid circular reference in the end
         del self._func
diff --git a/fastreid/evaluation/evaluator.py b/fastreid/evaluation/evaluator.py
index ed84f80d8..7da449705 100644
--- a/fastreid/evaluation/evaluator.py
+++ b/fastreid/evaluation/evaluator.py
@@ -6,6 +6,7 @@
 import torch
 
+from fastreid.utils import comm
 from fastreid.utils.logger import log_every_n_seconds
@@ -96,6 +97,7 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
     Returns:
         The return value of `evaluator.evaluate()`
     """
+    num_devices = comm.get_world_size()
     logger = logging.getLogger(__name__)
     logger.info("Start inference on {} images".format(len(data_loader.dataset)))
@@ -118,10 +120,11 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
                 inputs["images"] = inputs["images"].flip(dims=[3])
                 flip_outputs = model(inputs)
                 outputs = (outputs + flip_outputs) / 2
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
             total_compute_time += time.perf_counter() - start_compute_time
             evaluator.process(inputs, outputs)
 
-            idx += 1
             iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
             seconds_per_batch = total_compute_time / iters_after_start
             if idx >= num_warmup * 2 or seconds_per_batch > 30:
@@ -140,17 +143,18 @@
     total_time_str = str(datetime.timedelta(seconds=total_time))
     # NOTE this format is parsed by grep
     logger.info(
-        "Total inference time: {} ({:.6f} s / batch per device)".format(
-            total_time_str, total_time / (total - num_warmup)
+        "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
         )
     )
     total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
     logger.info(
-        "Total inference pure compute time: {} ({:.6f} s / batch per device)".format(
-            total_compute_time_str, total_compute_time / (total - num_warmup)
+        "Total inference pure compute time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
         )
     )
     results = evaluator.evaluate()
+
     # An evaluator may return None when not in main process.
     # Replace it by an empty dict instead to make it easier for downstream code to handle
     if results is None:
diff --git a/fastreid/evaluation/testing.py b/fastreid/evaluation/testing.py
index 284d5c860..cf4abc3c0 100644
--- a/fastreid/evaluation/testing.py
+++ b/fastreid/evaluation/testing.py
@@ -8,23 +8,21 @@
 from tabulate import tabulate
 from termcolor import colored
 
-logger = logging.getLogger(__name__)
-
 
 def print_csv_format(results):
     """
-    Print main metrics in a format similar to Detectron,
+    Print main metrics in a format similar to Detectron2,
     so that they are easy to copypaste into a spreadsheet.
     Args:
-        results (OrderedDict[dict]): task_name -> {metric -> score}
+        results (OrderedDict): {metric -> score}
     """
-    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
-    task = list(results.keys())[0]
-    metrics = ["Datasets"] + [k for k in results[task]]
+    # unordered results cannot be properly printed
+    assert isinstance(results, OrderedDict) or not len(results), results
+    logger = logging.getLogger(__name__)
 
-    csv_results = []
-    for task, res in results.items():
-        csv_results.append((task, *list(res.values())))
+    dataset_name = results.pop('dataset')
+    metrics = ["Dataset"] + [k for k in results]
+    csv_results = [(dataset_name, *list(results.values()))]
 
     # tabulate it
     table = tabulate(
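
After this patch, `print_csv_format` expects the metrics of a single dataset, with the dataset name stored under a `'dataset'` key that the function pops before tabulating, instead of the old `task_name -> {metric -> score}` mapping. Below is a minimal usage sketch of the new calling convention, assuming fastreid is importable; the metric names and values are hypothetical and only illustrate the expected shape of the input:

```python
# Illustrative sketch only: metric names and values are made up.
import logging
from collections import OrderedDict

from fastreid.evaluation.testing import print_csv_format

logging.basicConfig(level=logging.INFO)  # make logger.info output visible

# Per-dataset results, shaped like those built in the patched test() method above
results_i = OrderedDict([("Rank-1", 94.3), ("Rank-5", 98.0), ("mAP", 85.7)])
results_i["dataset"] = "Market1501"  # popped by print_csv_format to label the row
print_csv_format(results_i)
```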