Skip to content

Commit 2ade760

Browse files
authored
feat: update more graders including multimodal graders (#60)
* feat: update more graders including multimodal graders
* feat: update more graders including multimodal graders
1 parent c14c8fd commit 2ade760

File tree

11 files changed: 54 additions and 36 deletions.

openjudge/graders/agent/memory/memory_accuracy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def __init__(
180180
mode=GraderMode.POINTWISE,
181181
description="Evaluate memory accuracy",
182182
model=model,
183-
template=template,
183+
template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE,
184184
language=language,
185185
)
186186

openjudge/graders/agent/tool/tool_call_success.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,10 +238,9 @@ def __init__(
238238
mode=GraderMode.POINTWISE,
239239
description="Evaluates whether tool calls done by an AI agent includes failures or not",
240240
model=model,
241-
template=template,
241+
template=template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE,
242242
language=language,
243243
)
244-
self.template = template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE
245244

246245
async def aevaluate(
247246
self,

openjudge/graders/agent/tool/tool_selection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def __init__(
205205
super().__init__(
206206
name="tool_selection",
207207
mode=GraderMode.POINTWISE,
208-
description="Evaluate tool selection ",
208+
description="Evaluate tool selection",
209209
model=model,
210210
template=template or DEFAULT_TOOL_SELECTION_TEMPLATE,
211211
language=language,

openjudge/graders/agent/trajectory/trajectory_comprehensive.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ def __init__(
427427
mode=GraderMode.POINTWISE,
428428
description="Comprehensive evaluation for agent trajectories including step-level and overall problem-solving assessment",
429429
model=model,
430-
template=template,
430+
template=template or DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE,
431431
language=language,
432432
structured_model=TrajectoryEvaluationOutput,
433433
callback=self._create_trajectory_callback(language=language),

openjudge/graders/common/correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def __init__(
285285
mode=GraderMode.POINTWISE,
286286
description="Evaluate whether response matches the provided reference response",
287287
model=model,
288-
template=template,
288+
template=template or DEFAULT_CORRECTNESS_TEMPLATE,
289289
language=language,
290290
)
291291
self.threshold = threshold

openjudge/graders/common/harmfulness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def __init__(
263263
mode=GraderMode.POINTWISE,
264264
description="Evaluate whether response contains harmful or inappropriate content",
265265
model=model,
266-
template=template,
266+
template=template or DEFAULT_HARMFULNESS_TEMPLATE,
267267
language=language,
268268
)
269269
self.threshold = threshold

openjudge/graders/common/instruction_following.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ def __init__(
279279
mode=GraderMode.POINTWISE,
280280
description="Evaluate whether response follows the given instructions",
281281
model=model,
282-
template=template,
282+
template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
283283
language=language,
284284
)
285285
self.threshold = threshold

openjudge/graders/common/relevance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def __init__(
280280
mode=GraderMode.POINTWISE,
281281
description="Evaluate relevance of response to user query",
282282
model=model,
283-
template=template,
283+
template=template or DEFAULT_RELEVANCE_TEMPLATE,
284284
language=language,
285285
)
286286
self.threshold = threshold

openjudge/graders/multimodal/image_coherence.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
# pylint: disable=line-too-long
2828

2929
# English Prompt
30-
IMAGE_COHERENCE_PROMPT_EN = """
30+
IMAGE_COHERENCE_PROMPT_EN = textwrap.dedent(
31+
"""
3132
# Task Description
3233
You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
3334
Your task is to evaluate the coherence between the image and the text (context above and below) it accompanies.
@@ -62,9 +63,11 @@
6263
# Image
6364
[Insert Image Here]
6465
"""
66+
).strip()
6567

6668
# Chinese Prompt
67-
IMAGE_COHERENCE_PROMPT_ZH = """
69+
IMAGE_COHERENCE_PROMPT_ZH = textwrap.dedent(
70+
"""
6871
# 任务描述
6972
你是一名多模态文档评估助手。你将收到一张图片及其文本背景。
7073
你的任务是评估图片与其伴随文本(上下文)之间的连贯性。
@@ -99,20 +102,21 @@
99102
# 图片
100103
[在此插入图片]
101104
"""
105+
).strip()
102106

103107
# Build default template from prompts
104108
DEFAULT_IMAGE_COHERENCE_TEMPLATE = PromptTemplate(
105109
messages={
106110
LanguageEnum.EN: [
107111
ChatMessage(
108112
role="user",
109-
content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_EN),
113+
content=IMAGE_COHERENCE_PROMPT_EN,
110114
),
111115
],
112116
LanguageEnum.ZH: [
113117
ChatMessage(
114118
role="user",
115-
content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_ZH),
119+
content=IMAGE_COHERENCE_PROMPT_ZH,
116120
),
117121
],
118122
},
@@ -159,19 +163,20 @@ class ImageCoherenceGrader(LLMGrader):
159163
GraderScore with normalized coherence score [0, 1]
160164
161165
Example:
166+
>>> import asyncio
162167
>>> from openjudge.model.openai_llm import OpenAIChatModel
163168
>>> from openjudge.multimodal import ImageCoherenceGrader, MLLMImage
164169
>>>
165170
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
166171
>>> grader = ImageCoherenceGrader(model=model)
167172
>>>
168-
>>> result = await grader.aevaluate(
173+
>>> result = asyncio.run(grader.aevaluate(
169174
... response=[
170175
... "Q3 sales increased 25%.",
171176
... MLLMImage(url="https://example.com/sales_chart.jpg"),
172177
... "Growth driven by new products."
173178
... ]
174-
... )
179+
... ))
175180
>>> print(result.score) # 0.95 - image coherent with sales context
176181
"""
177182

@@ -198,7 +203,7 @@ def __init__(
198203
mode=GraderMode.POINTWISE,
199204
description="Evaluate image-text coherence",
200205
model=model,
201-
template=template,
206+
template=template or DEFAULT_IMAGE_COHERENCE_TEMPLATE,
202207
language=language,
203208
)
204209
self.max_context_size = max_context_size

openjudge/graders/multimodal/image_helpfulness.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
# pylint: disable=line-too-long
2929

3030
# English Prompt
31-
IMAGE_HELPFULNESS_PROMPT_EN = """
31+
IMAGE_HELPFULNESS_PROMPT_EN = textwrap.dedent(
32+
"""
3233
# Task Description
3334
You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
3435
Your task is to evaluate the helpfulness of the image in enabling human readers to comprehend the text (context above and below) it accompanies.
@@ -63,9 +64,11 @@
6364
# Image
6465
[Insert Image Here]
6566
"""
67+
).strip()
6668

6769
# Chinese Prompt
68-
IMAGE_HELPFULNESS_PROMPT_ZH = """
70+
IMAGE_HELPFULNESS_PROMPT_ZH = textwrap.dedent(
71+
"""
6972
# 任务描述
7073
你是一名多模态文档评估助手。你将收到一张图片及其文本背景。
7174
你的任务是评估图片对于帮助人类读者理解其伴随文本(上下文)的有用性。
@@ -100,20 +103,21 @@
100103
# 图片
101104
[在此插入图片]
102105
"""
106+
).strip()
103107

104108
# Build default template from prompts
105109
DEFAULT_IMAGE_HELPFULNESS_TEMPLATE = PromptTemplate(
106110
messages={
107111
LanguageEnum.EN: [
108112
ChatMessage(
109113
role="user",
110-
content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_EN),
114+
content=IMAGE_HELPFULNESS_PROMPT_EN,
111115
),
112116
],
113117
LanguageEnum.ZH: [
114118
ChatMessage(
115119
role="user",
116-
content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_ZH),
120+
content=IMAGE_HELPFULNESS_PROMPT_ZH,
117121
),
118122
],
119123
},
@@ -161,13 +165,14 @@ class ImageHelpfulnessGrader(LLMGrader):
161165
GraderScore with normalized helpfulness score [0, 1]
162166
163167
Example:
168+
>>> import asyncio
164169
>>> from openjudge.model.openai_llm import OpenAIChatModel
165170
>>> from openjudge.multimodal import ImageHelpfulnessGrader, MLLMImage
166171
>>>
167172
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
168173
>>> grader = ImageHelpfulnessGrader(model=model)
169174
>>>
170-
>>> result = await grader.aevaluate(
175+
>>> result = asyncio.run(grader.aevaluate(
171176
... response=[
172177
... "The system architecture has three layers.",
173178
... MLLMImage(url="https://example.com/arch_diagram.jpg"),
@@ -200,7 +205,7 @@ def __init__(
200205
mode=GraderMode.POINTWISE,
201206
description="Evaluate image helpfulness for understanding text",
202207
model=model,
203-
template=template,
208+
template=template or DEFAULT_IMAGE_HELPFULNESS_TEMPLATE,
204209
language=language,
205210
)
206211
self.max_context_size = max_context_size
@@ -362,4 +367,4 @@ async def aevaluate(
362367
)
363368

364369

365-
__all__ = ["ImageHelpfulnessGrader"]
370+
__all__ = ["ImageHelpfulnessGrader", "DEFAULT_IMAGE_HELPFULNESS_TEMPLATE"]

0 commit comments

Comments
 (0)