Commit b85bbef
refactor(graders): improve parameter validation and streaming support (#62)
* refactor(graders): improve parameter validation and streaming support
  - Add threshold validation in common graders (correctness, harmfulness, etc.)
  - Fix streaming response handling in text_to_image grader
  - Preserve metadata in grader score returns
  - Update template default pattern (None with or fallback)
  - Update model name in examples from qwen3-max to qwen3-32b

* refactor(text_to_image): simplify streaming response handling
  - Extract common score/reason parsing logic
  - Remove unused collected_content variable

* refactor(multimodal): improve error handling and streaming support
  - Move error handling to aevaluate level for consistent GraderError returns
  - Simplify streaming response handling across multimodal graders
  - Refactor image_coherence, image_helpfulness, text_to_image graders

* refactor(multimodal): unify structured response parsing and error handling
  - Add parse_structured_chat_response utility for streaming/non-streaming responses
  - Return GraderError instead of score=0 on exceptions in multimodal graders
  - Update tests to verify GraderError behavior
  - Move exception handling to aevaluate level for cleaner code
1 parent 7c8cca3 commit b85bbef

16 files changed (+214 −151 lines)
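The commit message refers to switching template defaults to `None` with an `or` fallback. The fallback itself is not visible in the hunks below, so the following is only a hedged sketch of what that pattern usually looks like; the exact resolution point inside the graders is an assumption.

```python
from typing import Optional

# Stand-in for a module-level default such as DEFAULT_CORRECTNESS_TEMPLATE.
DEFAULT_TEMPLATE = "Rate the response on a 1-5 scale: {response}"


def resolve_template(template: Optional[str] = None) -> str:
    # `None` (or any falsy value) falls back to the module default, so the
    # signature no longer hard-codes the default template object itself.
    return template or DEFAULT_TEMPLATE


print(resolve_template())          # module default
print(resolve_template("Custom"))  # "Custom"
```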

docs/built_in_graders/general.md

Lines changed: 3 additions & 3 deletions
@@ -56,7 +56,7 @@ Evaluates how well a response addresses the user's query. Measures whether the a
 | `query` | str | Yes | The user's question or request |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Additional context (e.g., conversation history) |
-| `ground_truth` | str | No | Reference answer for comparison |
+| `reference_response` | str | No | Reference answer for comparison |

 **Grading Criteria:**
 - **5**: Comprehensive response with helpful insights
@@ -107,7 +107,7 @@ Detects fabricated information not supported by the provided context or common k
 | `query` | str | Yes | The user's question |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Source documents to verify against |
-| `ground_truth` | str | No | Reference answer |
+| `reference_response` | str | No | Reference answer |

 !!! note
     If no context is provided, evaluation is based on common knowledge and factual consistency.
@@ -175,7 +175,7 @@ Identifies harmful, offensive, or inappropriate content in responses. Critical f
 | `query` | str | Yes | The user's input |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Additional context |
-| `ground_truth` | str | No | Reference response |
+| `reference_response` | str | No | Reference response |

 **What it detects:**
 - Violence, hatred, discrimination
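These tables rename the optional `ground_truth` parameter to `reference_response`. Assuming the table parameters map directly onto `aevaluate` keyword arguments, and with the `OpenAIChatModel` import path assumed, a call now looks roughly like this:

```python
import asyncio

from openjudge.graders.common.hallucination import HallucinationGrader
from openjudge.models.openai_chat_model import OpenAIChatModel  # import path assumed

model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
grader = HallucinationGrader(model=model, threshold=3)

# `reference_response` replaces the old `ground_truth` keyword from the docs tables.
result = asyncio.run(
    grader.aevaluate(
        query="When was the Eiffel Tower completed?",
        response="The Eiffel Tower was completed in 1889.",
        reference_response="1889",
    )
)
print(result.score, result.reason)
```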

openjudge/graders/common/correctness.py

Lines changed: 10 additions & 4 deletions
@@ -244,7 +244,7 @@ class CorrectnessGrader(LLMGrader):
     >>> from openjudge.graders.common.correctness import CorrectnessGrader
     >>>
     >>> # Initialize grader
-    >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+    >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
     >>> grader = CorrectnessGrader(model=model, threshold=3)
     >>>
     >>> # Good match
@@ -268,7 +268,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_CORRECTNESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -279,7 +279,13 @@ def __init__(
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_CORRECTNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="correctness",
             mode=GraderMode.POINTWISE,
@@ -330,11 +336,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating correctness: {e}")
+            logger.exception(f"Error evaluating correctness: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",

openjudge/graders/common/hallucination.py

Lines changed: 10 additions & 4 deletions
@@ -216,7 +216,7 @@ class HallucinationGrader(LLMGrader):
     >>> # Initialize model
     >>> model = OpenAIChatModel(
     ...     api_key="sk-...",
-    ...     model="qwen3-max",
+    ...     model="qwen3-32b",
     ...     temperature=0.1
     ... )
     >>>
@@ -253,7 +253,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_HALLUCINATION_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -264,7 +264,13 @@ def __init__(
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="hallucination",
             mode=GraderMode.POINTWISE,
@@ -322,11 +328,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating hallucination: {e}")
+            logger.exception(f"Error evaluating hallucination: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",

openjudge/graders/common/harmfulness.py

Lines changed: 10 additions & 4 deletions
@@ -223,7 +223,7 @@ class HarmfulnessGrader(LLMGrader):
     >>> from openjudge.graders.common.harmfulness import HarmfulnessGrader
     >>>
     >>> # Initialize grader
-    >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+    >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
     >>> grader = HarmfulnessGrader(model=model, threshold=3)
     >>>
     >>> # Safe output
@@ -246,7 +246,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_HARMFULNESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -257,7 +257,13 @@ def __init__(
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HARMFULNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="harmfulness",
             mode=GraderMode.POINTWISE,
@@ -307,11 +313,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating harmfulness: {e}")
+            logger.exception(f"Error evaluating harmfulness: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",

openjudge/graders/common/instruction_following.py

Lines changed: 10 additions & 4 deletions
@@ -238,7 +238,7 @@ class InstructionFollowingGrader(LLMGrader):
     >>> from openjudge.graders.common.instruction_following import InstructionFollowingGrader
     >>>
     >>> # Initialize grader
-    >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+    >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
     >>> grader = InstructionFollowingGrader(model=model, threshold=3)
     >>>
     >>> # Good adherence
@@ -262,7 +262,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -273,7 +273,13 @@ def __init__(
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="instruction_following",
             mode=GraderMode.POINTWISE,
@@ -318,11 +324,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating instruction following: {e}")
+            logger.exception(f"Error evaluating instruction following: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",

openjudge/graders/common/relevance.py

Lines changed: 13 additions & 7 deletions
@@ -217,7 +217,7 @@ class RelevanceGrader(LLMGrader):

     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_RELEVANCE_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)

@@ -234,7 +234,7 @@ class RelevanceGrader(LLMGrader):
     >>>
     >>> # Initialize grader
     >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
-    >>> grader = RelevanceGrader(model=model, threshold=0.7)
+    >>> grader = RelevanceGrader(model=model, threshold=3)
     >>>
     >>> # Relevant response
     >>> result = asyncio.run(grader.aevaluate(
@@ -262,19 +262,25 @@ class RelevanceGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
-        template: Optional[PromptTemplate] = DEFAULT_RELEVANCE_TEMPLATE,
+        threshold: float = 3,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
         Initialize RelevanceGrader

         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_RELEVANCE_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="relevance",
             mode=GraderMode.POINTWISE,
@@ -323,11 +329,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating relevance: {e}")
+            logger.exception(f"Error evaluating relevance: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",

openjudge/graders/multimodal/_internal/criteria_utils.py

Lines changed: 8 additions & 6 deletions
@@ -87,13 +87,13 @@ def validate_and_sort_rubrics(
     # Sort rubrics by start of range
     sorted_rubrics = sorted(rubrics, key=lambda r: r.score_range[0])

-    # Full overlap check
+    # Full overlap check (adjacent ranges like (0,5) and (5,7) are allowed)
     for i in range(len(sorted_rubrics)):
         a_start, a_end = sorted_rubrics[i].score_range
         for j in range(i + 1, len(sorted_rubrics)):
             b_start, b_end = sorted_rubrics[j].score_range
-            # Check if ranges overlap
-            if a_end >= b_start:
+            # Check if ranges overlap (> allows adjacent ranges to touch)
+            if a_end > b_start:
                 raise ValueError(
                     f"Overlapping score ranges: {sorted_rubrics[i].score_range} and {sorted_rubrics[j].score_range}",
                 )
@@ -147,7 +147,7 @@ def construct_params_string(
     >>> construct_params_string(params)
     'Input and Actual Output'
     """
-    params = [PARAM_DISPLAY_NAMES[param] for param in evaluation_params]
+    params = [PARAM_DISPLAY_NAMES.get(param, param.replace("_", " ").title()) for param in evaluation_params]

     if len(params) == 1:
         params_str = params[0]
@@ -164,7 +164,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
     Get the overall score range from rubrics

     Args:
-        rubric: List of rubric definitions
+        rubric: List of rubric definitions (does not need to be sorted)

     Returns:
         Tuple of (min_score, max_score)
@@ -180,7 +180,9 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
     if not rubric:
         return (0, 10)

-    return rubric[0].score_range[0], rubric[-1].score_range[1]
+    min_score = min(r.score_range[0] for r in rubric)
+    max_score = max(r.score_range[1] for r in rubric)
+    return (min_score, max_score)


 __all__ = [
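Two easy-to-miss consequences of these changes: adjacent ranges such as (0, 5) and (5, 7) no longer count as overlapping, and `get_score_range` now takes the true min/max even when rubrics arrive unsorted. A small sketch using a hypothetical stand-in for the project's Rubric type:

```python
from dataclasses import dataclass
from typing import Tuple


@dataclass
class Rubric:  # hypothetical stand-in; the real type lives elsewhere in openjudge
    score_range: Tuple[int, int]
    description: str = ""


rubrics = [Rubric((5, 7)), Rubric((0, 5)), Rubric((7, 10))]

# With `a_end > b_start`, ranges that merely touch at a boundary are accepted.
# The overall range is the min/max across all rubrics, regardless of input order.
min_score = min(r.score_range[0] for r in rubrics)
max_score = max(r.score_range[1] for r in rubrics)
print((min_score, max_score))  # (0, 10)
```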

openjudge/graders/multimodal/image_coherence.py

Lines changed: 31 additions & 24 deletions
@@ -23,6 +23,7 @@
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response

 # pylint: disable=line-too-long

@@ -222,30 +223,27 @@ async def _aevaluate_single_image(
             context_below=context_below or "",
         )

-        try:
-            # Format image content for OpenAI API
-            content = [{"type": "text", "text": prompt}]
-
-            if image.url:
-                content.append({"type": "image_url", "image_url": {"url": image.url}})
-            elif image.base64:
-                # Format base64 image with data URL scheme
-                image_format = image.format or "jpeg"
-                data_url = f"data:image/{image_format};base64,{image.base64}"
-                content.append({"type": "image_url", "image_url": {"url": data_url}})
-
-            # Call model without structured output
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-            score = chat_response.parsed["score"]
-            reason = chat_response.parsed["reason"]
-            return score, reason
+        # Format image content for OpenAI API
+        content = [{"type": "text", "text": prompt}]

-        except Exception as e:
-            logger.error(f"Error evaluating image coherence: {e}")
-            return 0.0, f"Evaluation error: {str(e)}"
+        if image.url:
+            content.append({"type": "image_url", "image_url": {"url": image.url}})
+        elif image.base64:
+            # Format base64 image with data URL scheme
+            image_format = image.format or "jpeg"
+            data_url = f"data:image/{image_format};base64,{image.base64}"
+            content.append({"type": "image_url", "image_url": {"url": data_url}})
+
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )
+
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _acompute(
         self,
@@ -331,7 +329,16 @@ async def aevaluate(
         ...     ]
         ... )
         """
-        score, details = await self._acompute(response, **kwargs)
+        try:
+            score, details = await self._acompute(response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating image coherence: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         if "error" in details:
             return GraderScore(
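With exception handling moved up to `aevaluate`, multimodal graders now report failures as a `GraderError` rather than a zero score, so callers (and the updated tests) can branch on the result type. A minimal sketch of that check:

```python
from openjudge.graders.base_grader import GraderError


def report(result) -> None:
    # `result` is whatever a multimodal grader's aevaluate returned.
    if isinstance(result, GraderError):
        # API or parsing failures no longer masquerade as score=0.
        print(f"grading failed: {result.error}")
    else:
        print(result.score, result.reason)
```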
