# Patch summary (free text ahead of the first `diff --git` header).
#
# llm_judge/show_result.py
#   In calculate_win_rate, the condition that increments num_tie is rewritten
#   from
#       g1_winner == "tie" or g1_winner != g2_winner
#   to
#       "tie" in {g1_winner, g2_winner} or g1_winner != g2_winner
#   NOTE(review): the old condition was already true whenever either side was
#   "tie" (a tie only in g2 makes g1 != g2 true), so for the string values
#   compared here both forms appear to select the same rows. The new form
#   states the "either game is a tie" intent explicitly. It does build a set,
#   so both winner values must be hashable.
#
# tests/test_show_result.py
#   Imports calculate_win_rate and adds TestCalculateWinRate with three
#   single-row scenarios:
#     * both games won by model_1 -> model_1 1.0 / 1.0, model_2 0.0 / 0.0
#     * games split model_1 / model_2 -> both models 0.0 / 0.5
#     * model_1 win plus a "tie"      -> both models 0.0 / 0.5
#   (values are win_rate / adjusted_win_rate as asserted by the test).
#
# NOTE(review): this patch was received collapsed onto a single line. The line
# breaks and leading whitespace below are reconstructed from Python syntax and
# the hunk line counts, including one blank context line at the start of the
# last hunk. Confirm against the repository before applying.
diff --git a/llm_judge/show_result.py b/llm_judge/show_result.py
index 92a6cd0..df4a660 100644
--- a/llm_judge/show_result.py
+++ b/llm_judge/show_result.py
@@ -41,7 +41,10 @@ def calculate_win_rate(results: list[dict]):
     num_win_2 = 0
     num_tie = 0
     for result in results:
-        if result["g1_winner"] == "tie" or result["g1_winner"] != result["g2_winner"]:
+        if (
+            "tie" in {result["g1_winner"], result["g2_winner"]}
+            or result["g1_winner"] != result["g2_winner"]
+        ):
             num_tie += 1
         elif result["g1_winner"] == "model_1":
             num_win_1 += 1
diff --git a/tests/test_show_result.py b/tests/test_show_result.py
index c42c794..619322d 100644
--- a/tests/test_show_result.py
+++ b/tests/test_show_result.py
@@ -1,6 +1,6 @@
 import unittest
 
-from llm_judge.show_result import calculate_average_score
+from llm_judge.show_result import calculate_average_score, calculate_win_rate
 
 
 class TestCalculateAverageScore(unittest.TestCase):
@@ -10,3 +10,39 @@ def test_calculate_average_score(self):
 
         results = [{"score": 1}, {"score": 2}, {"score": 3}, {"score": 4}]
         self.assertEqual(calculate_average_score(results), 2.5)
+
+
+class TestCalculateWinRate(unittest.TestCase):
+    def test_calculate_win_rate(self):
+        results = [
+            {"g1_winner": "model_1", "g2_winner": "model_1"},
+        ]
+        self.assertEqual(
+            calculate_win_rate(results),
+            {
+                "model_1": {"win_rate": 1.0, "adjusted_win_rate": 1.0},
+                "model_2": {"win_rate": 0.0, "adjusted_win_rate": 0.0},
+            },
+        )
+
+        results = [
+            {"g1_winner": "model_1", "g2_winner": "model_2"},
+        ]
+        self.assertEqual(
+            calculate_win_rate(results),
+            {
+                "model_1": {"win_rate": 0.0, "adjusted_win_rate": 0.5},
+                "model_2": {"win_rate": 0.0, "adjusted_win_rate": 0.5},
+            },
+        )
+
+        results = [
+            {"g1_winner": "model_1", "g2_winner": "tie"},
+        ]
+        self.assertEqual(
+            calculate_win_rate(results),
+            {
+                "model_1": {"win_rate": 0.0, "adjusted_win_rate": 0.5},
+                "model_2": {"win_rate": 0.0, "adjusted_win_rate": 0.5},
+            },
+        )