From f57c5764e37c9648782c92c55aa7032122bcc30e Mon Sep 17 00:00:00 2001
From: liaoxingyu
Date: Tue, 9 Mar 2021 20:07:28 +0800
Subject: [PATCH] support multi-node training

---
 GETTING_STARTED.md               | 20 ++++++++++++++++++++
 fastreid/engine/defaults.py      | 19 +++++++++++--------
 fastreid/engine/hooks.py         | 15 ++++++++-------
 fastreid/evaluation/evaluator.py | 14 +++++++++-----
 fastreid/evaluation/testing.py   | 18 ++++++++----------
 5 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index a47363157..172006a05 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -32,6 +32,26 @@ If you want to train model with 4 GPUs, you can run:
 python3 tools/train_net.py --config-file ./configs/Market1501/bagtricks_R50.yml --num-gpus 4
 ```
 
+If you want to train a model with multiple machines, you can run:
+
+```
+# machine 1
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 0 --dist-url tcp://ip:port
+
+# machine 2
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 1 --dist-url tcp://ip:port
+```
+
+Make sure the dataset path and the code are the same on all machines, and that the machines can communicate with each other.
+
 To evaluate a model's performance, use
 
 ```bash
diff --git a/fastreid/engine/defaults.py b/fastreid/engine/defaults.py
index 2950775e9..c16a9b612 100644
--- a/fastreid/engine/defaults.py
+++ b/fastreid/engine/defaults.py
@@ -467,15 +467,18 @@ def test(cls, cfg, model):
             results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP_ENABLED)
             results[dataset_name] = results_i
 
-        if comm.is_main_process():
-            assert isinstance(
-                results, dict
-            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
-                results
-            )
-            print_csv_format(results)
+            if comm.is_main_process():
+                assert isinstance(
+                    results, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                results_i['dataset'] = dataset_name
+                print_csv_format(results_i)
 
-        if len(results) == 1: results = list(results.values())[0]
+        if len(results) == 1:
+            results = list(results.values())[0]
 
         return results
diff --git a/fastreid/engine/hooks.py b/fastreid/engine/hooks.py
index 9226315b2..6a63770f6 100644
--- a/fastreid/engine/hooks.py
+++ b/fastreid/engine/hooks.py
@@ -360,19 +360,20 @@ def _do_eval(self):
                     )
             self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
 
-        # Remove extra memory cache of main process due to evaluation
-        torch.cuda.empty_cache()
+        # Evaluation may take different time among workers.
+        # A barrier makes them start the next iteration together.
+        comm.synchronize()
 
     def after_epoch(self):
         next_epoch = self.trainer.epoch + 1
-        is_final = next_epoch == self.trainer.max_epoch
-        if is_final or (self._period > 0 and next_epoch % self._period == 0):
+        if self._period > 0 and next_epoch % self._period == 0:
             self._do_eval()
-        # Evaluation may take different time among workers.
-        # A barrier make them start the next iteration together.
-        comm.synchronize()
 
     def after_train(self):
+        next_epoch = self.trainer.epoch + 1
+        # This condition is to prevent the eval from running after a failed training
+        if next_epoch % self._period != 0 and next_epoch >= self.trainer.max_epoch:
+            self._do_eval()
         # func is likely a closure that holds reference to the trainer
         # therefore we clean it to avoid circular reference in the end
         del self._func
diff --git a/fastreid/evaluation/evaluator.py b/fastreid/evaluation/evaluator.py
index ed84f80d8..7da449705 100644
--- a/fastreid/evaluation/evaluator.py
+++ b/fastreid/evaluation/evaluator.py
@@ -6,6 +6,7 @@
 import torch
 
+from fastreid.utils import comm
 from fastreid.utils.logger import log_every_n_seconds
@@ -96,6 +97,7 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
     Returns:
         The return value of `evaluator.evaluate()`
     """
+    num_devices = comm.get_world_size()
     logger = logging.getLogger(__name__)
     logger.info("Start inference on {} images".format(len(data_loader.dataset)))
@@ -118,10 +120,11 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
                 inputs["images"] = inputs["images"].flip(dims=[3])
                 flip_outputs = model(inputs)
                 outputs = (outputs + flip_outputs) / 2
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
             total_compute_time += time.perf_counter() - start_compute_time
             evaluator.process(inputs, outputs)
 
-            idx += 1
             iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
             seconds_per_batch = total_compute_time / iters_after_start
             if idx >= num_warmup * 2 or seconds_per_batch > 30:
@@ -140,17 +143,18 @@
     total_time_str = str(datetime.timedelta(seconds=total_time))
     # NOTE this format is parsed by grep
     logger.info(
-        "Total inference time: {} ({:.6f} s / batch per device)".format(
-            total_time_str, total_time / (total - num_warmup)
+        "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
         )
     )
     total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
     logger.info(
-        "Total inference pure compute time: {} ({:.6f} s / batch per device)".format(
-            total_compute_time_str, total_compute_time / (total - num_warmup)
+        "Total inference pure compute time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
         )
     )
     results = evaluator.evaluate()
+
     # An evaluator may return None when not in main process.
     # Replace it by an empty dict instead to make it easier for downstream code to handle
     if results is None:
diff --git a/fastreid/evaluation/testing.py b/fastreid/evaluation/testing.py
index 284d5c860..cf4abc3c0 100644
--- a/fastreid/evaluation/testing.py
+++ b/fastreid/evaluation/testing.py
@@ -8,23 +8,21 @@
 from tabulate import tabulate
 from termcolor import colored
 
-logger = logging.getLogger(__name__)
-
 
 def print_csv_format(results):
     """
-    Print main metrics in a format similar to Detectron,
+    Print main metrics in a format similar to Detectron2,
     so that they are easy to copypaste into a spreadsheet.
     Args:
-        results (OrderedDict[dict]): task_name -> {metric -> score}
+        results (OrderedDict): {metric -> score}
     """
-    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
-    task = list(results.keys())[0]
-    metrics = ["Datasets"] + [k for k in results[task]]
+    # unordered results cannot be properly printed
+    assert isinstance(results, OrderedDict) or not len(results), results
+    logger = logging.getLogger(__name__)
 
-    csv_results = []
-    for task, res in results.items():
-        csv_results.append((task, *list(res.values())))
+    dataset_name = results.pop('dataset')
+    metrics = ["Dataset"] + [k for k in results]
+    csv_results = [(dataset_name, *list(results.values()))]
 
     # tabulate it
     table = tabulate(
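
After this patch, `print_csv_format` expects the metrics of a single dataset, with the dataset name stored under a `'dataset'` key that the function pops before tabulating, instead of the old `task_name -> {metric -> score}` mapping. Below is a minimal usage sketch of the new calling convention, assuming fastreid is importable; the metric names and values are hypothetical and only illustrate the expected shape of the input:

```python
# Illustrative sketch only: metric names and values are made up.
import logging
from collections import OrderedDict

from fastreid.evaluation.testing import print_csv_format

logging.basicConfig(level=logging.INFO)  # make logger.info output visible

# Per-dataset results, shaped like those built in the patched test() method above
results_i = OrderedDict([("Rank-1", 94.3), ("Rank-5", 98.0), ("mAP", 85.7)])
results_i["dataset"] = "Market1501"  # popped by print_csv_format to label the row
print_csv_format(results_i)
```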