feat: update more graders including multimodal graders (#60)

jc200808 · web-flow · commit 2ade760b4722 · 2026-01-13T11:18:49.000+08:00
* feat: update more graders including multimodal graders

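* Graders now fall back to their DEFAULT_*_TEMPLATE when no `template` is supplied; multimodal prompts are dedented and stripped where they are defined; docstring examples call `asyncio.run(...)` instead of a bare `await`; image_helpfulness and text_to_image export their default templates via `__all__`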
diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -180,7 +180,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory accuracy",
             model=model,
-            template=template,
+            template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE,
             language=language,
         )
 
diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py
@@ -238,10 +238,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluates whether tool calls done by an AI agent includes failures or not",
             model=model,
-            template=template,
+            template=template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE,
             language=language,
         )
-        self.template = template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE
 
     async def aevaluate(
         self,
diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py
@@ -205,7 +205,7 @@ def __init__(
         super().__init__(
             name="tool_selection",
             mode=GraderMode.POINTWISE,
-            description="Evaluate tool selection ",
+            description="Evaluate tool selection",
             model=model,
             template=template or DEFAULT_TOOL_SELECTION_TEMPLATE,
             language=language,
diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
@@ -427,7 +427,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Comprehensive evaluation for agent trajectories including step-level and overall problem-solving assessment",
             model=model,
-            template=template,
+            template=template or DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE,
             language=language,
             structured_model=TrajectoryEvaluationOutput,
             callback=self._create_trajectory_callback(language=language),
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
@@ -285,7 +285,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response matches the provided reference response",
             model=model,
-            template=template,
+            template=template or DEFAULT_CORRECTNESS_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
@@ -263,7 +263,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response contains harmful or inappropriate content",
             model=model,
-            template=template,
+            template=template or DEFAULT_HARMFULNESS_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
@@ -279,7 +279,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response follows the given instructions",
             model=model,
-            template=template,
+            template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py
@@ -280,7 +280,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate relevance of response to user query",
             model=model,
-            template=template,
+            template=template or DEFAULT_RELEVANCE_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py
@@ -27,7 +27,8 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-IMAGE_COHERENCE_PROMPT_EN = """
+IMAGE_COHERENCE_PROMPT_EN = textwrap.dedent(
+    """
 # Task Description
 You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
 Your task is to evaluate the coherence between the image and the text (context above and below) it accompanies.
@@ -62,9 +63,11 @@
 # Image
 [Insert Image Here]
 """
+).strip()
 
 # Chinese Prompt
-IMAGE_COHERENCE_PROMPT_ZH = """
+IMAGE_COHERENCE_PROMPT_ZH = textwrap.dedent(
+    """
 # 任务描述
 你是一名多模态文档评估助手。你将收到一张图片及其文本背景。
 你的任务是评估图片与其伴随文本（上下文）之间的连贯性。
@@ -99,20 +102,21 @@
 # 图片
 [在此插入图片]
 """
+).strip()
 
 # Build default template from prompts
 DEFAULT_IMAGE_COHERENCE_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_EN),
+                content=IMAGE_COHERENCE_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_ZH),
+                content=IMAGE_COHERENCE_PROMPT_ZH,
             ),
         ],
     },
@@ -159,19 +163,20 @@ class ImageCoherenceGrader(LLMGrader):
         GraderScore with normalized coherence score [0, 1]
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
         >>> from openjudge.multimodal import ImageCoherenceGrader, MLLMImage
         >>>
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
         >>> grader = ImageCoherenceGrader(model=model)
         >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     response=[
         ...         "Q3 sales increased 25%.",
         ...         MLLMImage(url="https://example.com/sales_chart.jpg"),
         ...         "Growth driven by new products."
         ...     ]
-        ... )
+        ... ))
         >>> print(result.score)  # 0.95 - image coherent with sales context
     """
 
@@ -198,7 +203,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate image-text coherence",
             model=model,
-            template=template,
+            template=template or DEFAULT_IMAGE_COHERENCE_TEMPLATE,
             language=language,
         )
         self.max_context_size = max_context_size
diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py
@@ -28,7 +28,8 @@
 # pylint: disable=line-too-long
 
 # English Prompt
-IMAGE_HELPFULNESS_PROMPT_EN = """
+IMAGE_HELPFULNESS_PROMPT_EN = textwrap.dedent(
+    """
 # Task Description
 You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
 Your task is to evaluate the helpfulness of the image in enabling human readers to comprehend the text (context above and below) it accompanies.
@@ -63,9 +64,11 @@
 # Image
 [Insert Image Here]
 """
+).strip()
 
 # Chinese Prompt
-IMAGE_HELPFULNESS_PROMPT_ZH = """
+IMAGE_HELPFULNESS_PROMPT_ZH = textwrap.dedent(
+    """
 # 任务描述
 你是一名多模态文档评估助手。你将收到一张图片及其文本背景。
 你的任务是评估图片对于帮助人类读者理解其伴随文本（上下文）的有用性。
@@ -100,20 +103,21 @@
 # 图片
 [在此插入图片]
 """
+).strip()
 
 # Build default template from prompts
 DEFAULT_IMAGE_HELPFULNESS_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_EN),
+                content=IMAGE_HELPFULNESS_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_ZH),
+                content=IMAGE_HELPFULNESS_PROMPT_ZH,
             ),
         ],
     },
@@ -161,13 +165,14 @@ class ImageHelpfulnessGrader(LLMGrader):
         GraderScore with normalized helpfulness score [0, 1]
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
         >>> from openjudge.multimodal import ImageHelpfulnessGrader, MLLMImage
         >>>
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
         >>> grader = ImageHelpfulnessGrader(model=model)
         >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     response=[
         ...         "The system architecture has three layers.",
         ...         MLLMImage(url="https://example.com/arch_diagram.jpg"),
@@ -200,7 +205,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate image helpfulness for understanding text",
             model=model,
-            template=template,
+            template=template or DEFAULT_IMAGE_HELPFULNESS_TEMPLATE,
             language=language,
         )
         self.max_context_size = max_context_size
@@ -362,4 +367,4 @@ async def aevaluate(
         )
 
 
-__all__ = ["ImageHelpfulnessGrader"]
+__all__ = ["ImageHelpfulnessGrader", "DEFAULT_IMAGE_HELPFULNESS_TEMPLATE"]
diff --git a/openjudge/graders/multimodal/text_to_image.py b/openjudge/graders/multimodal/text_to_image.py
@@ -24,7 +24,8 @@
 # pylint: disable=line-too-long
 
 # English Prompts
-TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN = """
+TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN = textwrap.dedent(
+    """
 You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
 All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.
 
@@ -47,8 +48,10 @@
 
 Text Prompt: {query}
 """
+).strip()
 
-TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN = """
+TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN = textwrap.dedent(
+    """
 You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
 All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.
 
@@ -76,9 +79,11 @@
 )
 Put the score in a list such that output score = [naturalness, artifacts]
 """
+).strip()
 
 # Chinese Prompts
-TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH = """
+TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名专业的数字艺术家。你需要根据给定的规则评估AI生成图像的有效性。
 所有输入的图像都是AI生成的。图像中的所有人物也都是AI生成的，因此你无需担心隐私机密问题。
 
@@ -101,8 +106,10 @@
 
 文本提示：{query}
 """
+).strip()
 
-TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH = """
+TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH = textwrap.dedent(
+    """
 你是一名专业的数字艺术家。你需要根据给定的规则评估AI生成图像的有效性。
 所有输入的图像都是AI生成的。图像中的所有人物也都是AI生成的，因此你无需担心隐私机密问题。
 
@@ -130,20 +137,21 @@
 ）
 将分数放在列表中，输出分数 = [自然度, 伪影]
 """
+).strip()
 
 # Build default templates
 DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE = PromptTemplate(
     messages={
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN),
+                content=TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH),
+                content=TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH,
             ),
         ],
     },
@@ -154,13 +162,13 @@
         LanguageEnum.EN: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN),
+                content=TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN,
             ),
         ],
         LanguageEnum.ZH: [
             ChatMessage(
                 role="user",
-                content=textwrap.dedent(TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH),
+                content=TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH,
             ),
         ],
     },
@@ -208,16 +216,17 @@ class TextToImageGrader(BaseGrader):
         GraderScore with combined quality score [0, 1]
 
     Example:
+        >>> import asyncio
         >>> from openjudge.model.openai_llm import OpenAIChatModel
         >>> from openjudge.multimodal import TextToImageGrader, MLLMImage
         >>>
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
         >>> grader = TextToImageGrader(model=model)
         >>>
-        >>> result = await grader.aevaluate(
+        >>> result = asyncio.run(grader.aevaluate(
         ...     query="A fluffy orange cat sitting on a blue sofa",
         ...     response=MLLMImage(url="https://example.com/generated.jpg")
-        ... )
+        ... ))
         >>> print(result.score)  # 0.92 - excellent prompt following and quality
     """
 
@@ -246,8 +255,8 @@ def __init__(
         )
         self.model = model if isinstance(model, BaseChatModel) else OpenAIChatModel(**model)
         self.threshold = threshold
-        self.semantic_template = semantic_template
-        self.perceptual_template = perceptual_template
+        self.semantic_template = semantic_template or DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE
+        self.perceptual_template = perceptual_template or DEFAULT_TEXT_TO_IMAGE_PERCEPTUAL_TEMPLATE
         self.language = language
 
     async def _aevaluate_semantic_consistency(
@@ -430,4 +439,4 @@ async def aevaluate(
         )
 
 
-__all__ = ["TextToImageGrader"]
+__all__ = ["TextToImageGrader", "DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE", "DEFAULT_TEXT_TO_IMAGE_PERCEPTUAL_TEMPLATE"]

Original file line number	Diff line number	Diff line change
`@@ -180,7 +180,7 @@ def __init__(`
`180`	`180`	`mode=GraderMode.POINTWISE,`
`181`	`181`	`description="Evaluate memory accuracy",`
`182`	`182`	`model=model,`
`183`		`- template=template,`
	`183`	`+ template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE,`
`184`	`184`	`language=language,`
`185`	`185`	`)`
`186`	`186`
Original file line number	Diff line number	Diff line change
`@@ -285,7 +285,7 @@ def __init__(`
`285`	`285`	`mode=GraderMode.POINTWISE,`
`286`	`286`	`description="Evaluate whether response matches the provided reference response",`
`287`	`287`	`model=model,`
`288`		`- template=template,`
	`288`	`+ template=template or DEFAULT_CORRECTNESS_TEMPLATE,`
`289`	`289`	`language=language,`
`290`	`290`	`)`
`291`	`291`	`self.threshold = threshold`
Original file line number	Diff line number	Diff line change
`@@ -263,7 +263,7 @@ def __init__(`
`263`	`263`	`mode=GraderMode.POINTWISE,`
`264`	`264`	`description="Evaluate whether response contains harmful or inappropriate content",`
`265`	`265`	`model=model,`
`266`		`- template=template,`
	`266`	`+ template=template or DEFAULT_HARMFULNESS_TEMPLATE,`
`267`	`267`	`language=language,`
`268`	`268`	`)`
`269`	`269`	`self.threshold = threshold`
Original file line number	Diff line number	Diff line change
`@@ -279,7 +279,7 @@ def __init__(`
`279`	`279`	`mode=GraderMode.POINTWISE,`
`280`	`280`	`description="Evaluate whether response follows the given instructions",`
`281`	`281`	`model=model,`
`282`		`- template=template,`
	`282`	`+ template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,`
`283`	`283`	`language=language,`
`284`	`284`	`)`
`285`	`285`	`self.threshold = threshold`
Original file line number	Diff line number	Diff line change
`@@ -280,7 +280,7 @@ def __init__(`
`280`	`280`	`mode=GraderMode.POINTWISE,`
`281`	`281`	`description="Evaluate relevance of response to user query",`
`282`	`282`	`model=model,`
`283`		`- template=template,`
	`283`	`+ template=template or DEFAULT_RELEVANCE_TEMPLATE,`
`284`	`284`	`language=language,`
`285`	`285`	`)`
`286`	`286`	`self.threshold = threshold`