From 3cca8c395d26aa76eb5063841b58ec7fe0624ae5 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 24 Apr 2020 19:07:01 +0300
Subject: [PATCH] Report std as it's more common, clarify

---
 README.rst  |  3 +++
 evaluate.py | 14 +++++++-------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/README.rst b/README.rst
index b8a5cc9..8288170 100644
--- a/README.rst
+++ b/README.rst
@@ -65,6 +65,9 @@ For evaluation, run::
 
     python3 evaluation.py
 
+We report precision, recall, F1, accuracy and their standard deviation estimated with bootstrap.
+Please refer to the technical report for more details.
+
 License
 -------
 
diff --git a/evaluate.py b/evaluate.py
index 9d29634..0e343c2 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -26,10 +26,10 @@ def main():
         name = path.stem
         metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
         print('{name:<20} '
-              'precision={precision:.3f} ± {precision_ci:.3f} '
-              'recall={recall:.3f} ± {recall_ci:.3f} '
-              'F1={f1:.3f} ± {f1_ci:.3f} '
-              'accuracy={accuracy:.3f} ± {accuracy_ci:.3f} '
+              'precision={precision:.3f} ± {precision_std:.3f} '
+              'recall={recall:.3f} ± {recall_std:.3f} '
+              'F1={f1:.3f} ± {f1_std:.3f} '
+              'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
               .format(name=name, **metrics))
         metrics_by_name[name] = metrics
 
@@ -78,7 +78,7 @@ def evaluate(
             b_values.setdefault('accuracy', []).append(
                 statistics.mean([accuracies[i] for i in indices]))
         for key, values in sorted(b_values.items()):
-            metrics[f'{key}_ci'] = 1.96 * statistics.stdev(values)
+            metrics[f'{key}_std'] = statistics.stdev(values)
 
     return metrics
 
@@ -95,8 +95,8 @@ def print_metrics_diff(tp_fp_fns, other_tp_fp_fns, n_bootstrap):
             diffs.setdefault(key, []).append(metrics[key] - other_metrics[key])
     for key, values in sorted(diffs.items()):
         mean = statistics.mean(values)
-        confidence_interval = 1.96 * statistics.stdev(values)
-        print(f'{key:<10} {mean:.3f} ± {confidence_interval:.3f}')
+        std = statistics.stdev(values)
+        print(f'{key:<10} {mean:.3f} ± {std:.3f}')
 
 
 TP_FP_FN = Tuple[float, float, float]