
Commit

Finalize examples
KaQuMiQ committed Jul 19, 2024
1 parent f6f90e7 commit 689e4bd
Showing 10 changed files with 357 additions and 170 deletions.
58 changes: 58 additions & 0 deletions src/draive/evaluators/score.py
@@ -0,0 +1,58 @@
from typing import Any

from draive.evaluation import EvaluationScore
from draive.parameters import DataModel, Field, ParameterValidationContext, ParameterValidationError

__all__ = [
"CommonScoreModel",
]


def _score_validator(
value: Any,
context: ParameterValidationContext,
) -> float:
match value:
case float() as float_value:
return float_value

case int() as int_value:
return float(int_value)

case str() as str_value:
try:
return float(str_value)

except Exception as exc:
raise ParameterValidationError.invalid_type(
context=context,
expected=float,
received=str,
) from exc

case _:
raise ParameterValidationError.invalid_type(
context=context,
expected=float,
received=type(value),
)


class CommonScoreModel(DataModel):
score: float = Field(
description="Decimal score value",
validator=_score_validator,
)
comment: str | None = Field(
description="Explanation of the score",
default=None,
)

def normalized(
self,
divider: float | None = None,
) -> EvaluationScore:
return EvaluationScore(
value=self.score / divider if divider else self.score,
comment=self.comment,
)
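
For orientation, here is a minimal sketch of how the new shared score model is meant to be used downstream. It relies only on what this diff shows (keyword construction, as in the evaluator examples below, plus the normalized helper) and assumes EvaluationScore exposes the value and comment it is constructed with; the concrete numbers are illustrative.

from draive.evaluators.score import CommonScoreModel

# The field validator accepts a float, an int, or a numeric string and
# stores the value as a float; anything else raises a validation error.
score = CommonScoreModel(score=3.0, comment="mostly coherent")

# normalized(divider=4) maps the 0-4 rubric onto the 0-1 range used by
# EvaluationScore: 3.0 / 4 -> 0.75; with no divider the raw value is kept.
normalized = score.normalized(divider=4)
print(normalized.value, normalized.comment)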
56 changes: 35 additions & 21 deletions src/draive/evaluators/text_coherence.py
@@ -1,17 +1,12 @@
from draive.evaluation import evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.score import CommonScoreModel
from draive.generation import generate_model
from draive.parameters import DataModel

__all__ = [
"text_coherence_evaluator",
]


class CoherenceScore(DataModel):
score: float
comment: str | None = None


INSTRUCTION: str = """\
You will be given a reference text and a compared text based on the reference text.
Your task is to rate the compared text using only the Coherence metric, \
@@ -45,10 +40,14 @@ class CoherenceScore(DataModel):
do not exceed this value.
"""

INPUT: str = """
Reference text: {reference}
Compered text: {compared}
"""
INPUT_TEMPLATE: str = """
<REFERENCE_TEXT>
{reference}
</REFERENCE_TEXT>
<COMPARED_TEXT>
{compared}
</COMPARED_TEXT>
"""


@@ -57,14 +56,29 @@ async def text_coherence_evaluator(
compared: str,
/,
reference: str,
) -> float:
model: CoherenceScore = await generate_model(
CoherenceScore,
) -> EvaluationScore:
if not compared:
return EvaluationScore(
value=0,
comment="Input text was empty!",
)

if not reference:
return EvaluationScore(
value=0,
comment="Reference text was empty!",
)

score: CommonScoreModel = await generate_model(
CommonScoreModel,
instruction=INSTRUCTION,
input=INPUT.format(reference=reference, compared=compared),
input=INPUT_TEMPLATE.format(
reference=reference,
compared=compared,
),
examples=[
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Solar energy is a renewable energy source that is gaining popularity. "
"Solar panels convert sunlight into electricity. "
@@ -78,10 +92,10 @@ async def text_coherence_evaluator(
"Technology is developing fast. People like to save money."
),
),
CoherenceScore(score=0.0),
CommonScoreModel(score=0.0),
),
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Coffee is a popular beverage worldwide. "
"It's made from roasted coffee beans. Caffeine in coffee "
@@ -95,10 +109,10 @@ async def text_coherence_evaluator(
"Some people add milk or sugar to their coffee."
),
),
CoherenceScore(score=2.0),
CommonScoreModel(score=2.0),
),
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Honey is a natural sweetener produced by bees. "
"It has antibacterial properties and is rich in antioxidants. "
@@ -113,8 +127,8 @@ async def text_coherence_evaluator(
"honey's high caloric content necessitates mindful consumption."
),
),
CoherenceScore(score=4.0),
CommonScoreModel(score=4.0),
),
],
)
return model.score / 4
return score.normalized(divider=4)
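
A sketch of how the rewritten evaluator could be called. It assumes the @evaluator-decorated coroutine stays awaitable with the signature shown above and that whatever model backend generate_model needs is already configured in the surrounding scope; the texts are hypothetical.

import asyncio

from draive.evaluators.text_coherence import text_coherence_evaluator


async def main() -> None:
    # The evaluator renders both texts into the new tag-delimited
    # INPUT_TEMPLATE and returns an EvaluationScore whose value is the
    # model's 0-4 rubric score divided by 4.
    result = await text_coherence_evaluator(
        "Solar panels turn sunlight into electricity and lower energy bills.",
        reference=(
            "Solar energy is a renewable energy source that is gaining popularity. "
            "Solar panels convert sunlight into electricity and can reduce energy bills."
        ),
    )
    print(result.value, result.comment)


asyncio.run(main())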
57 changes: 36 additions & 21 deletions src/draive/evaluators/text_conciseness.py
@@ -1,17 +1,12 @@
from draive.evaluation import evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.score import CommonScoreModel
from draive.generation import generate_model
from draive.parameters import DataModel

__all__ = [
"text_conciseness_evaluator",
]


class ConcisenessScore(DataModel):
score: float
comment: str | None = None


INSTRUCTION: str = """\
You will be given a reference text and a compared text based on the reference text.
Your task is to rate the compared text using only the Conciseness metric, \
@@ -44,10 +39,15 @@ class ConcisenessScore(DataModel):
do not exceed this value.
"""

INPUT: str = """
Reference text: {reference}

Compered text: {compared}
"""
INPUT_TEMPLATE: str = """
<REFERENCE_TEXT>
{reference}
</REFERENCE_TEXT>
<COMPARED_TEXT>
{compared}
</COMPARED_TEXT>
"""


@@ -56,14 +56,29 @@ async def text_conciseness_evaluator(
compared: str,
/,
reference: str,
) -> float:
model: ConcisenessScore = await generate_model(
ConcisenessScore,
) -> EvaluationScore:
if not compared:
return EvaluationScore(
value=0,
comment="Input text was empty!",
)

if not reference:
return EvaluationScore(
value=0,
comment="Reference text was empty!",
)

score: CommonScoreModel = await generate_model(
CommonScoreModel,
instruction=INSTRUCTION,
input=f"Reference text: {reference}\n\nCompered text: {compared}",
input=INPUT_TEMPLATE.format(
reference=reference,
compared=compared,
),
examples=[
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Solar energy is a renewable energy source that is gaining popularity. "
"Solar panels convert sunlight into electricity. "
@@ -85,10 +100,10 @@ async def text_conciseness_evaluator(
"but then you save on all those coffee shop visits."
),
),
ConcisenessScore(score=0.0),
CommonScoreModel(score=0.0),
),
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Coffee is a popular beverage worldwide. "
"It's made from roasted coffee beans. Caffeine in coffee "
@@ -103,10 +118,10 @@ async def text_conciseness_evaluator(
"important to consume it in moderation."
),
),
ConcisenessScore(score=2.0),
CommonScoreModel(score=2.0),
),
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"The water cycle, also known as the hydrologic cycle, "
"describes the continuous movement of water within the Earth and "
@@ -118,8 +133,8 @@ async def text_conciseness_evaluator(
"It includes evaporation, condensation, precipitation, and runoff."
),
),
ConcisenessScore(score=4.0),
CommonScoreModel(score=4.0),
),
],
)
return model.score / 4
return score.normalized(divider=4)
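
One behavioural detail shared by these rewrites: an empty compared or reference text now short-circuits before any model call. A small sketch under the same assumptions as the previous example:

import asyncio

from draive.evaluators.text_conciseness import text_conciseness_evaluator


async def main() -> None:
    # An empty compared text never reaches generate_model; the guard
    # returns EvaluationScore(value=0, comment="Input text was empty!").
    empty = await text_conciseness_evaluator(
        "",
        reference="Coffee is a popular beverage worldwide.",
    )
    print(empty.value, empty.comment)


asyncio.run(main())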
57 changes: 36 additions & 21 deletions src/draive/evaluators/text_consistency.py
@@ -1,17 +1,12 @@
from draive.evaluation import evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.score import CommonScoreModel
from draive.generation import generate_model
from draive.parameters import DataModel

__all__ = [
"text_consistency_evaluator",
]


class ConsistencyScore(DataModel):
score: float
comment: str | None = None


INSTRUCTION: str = """\
You will be given a reference text and a compared text based on the reference text.
Your task is to rate the compared text using only the Consistency metric, \
@@ -47,10 +42,15 @@ class ConsistencyScore(DataModel):
do not exceed this value.
"""

INPUT: str = """
Reference text: {reference}

Compered text: {compared}
"""
INPUT_TEMPLATE: str = """
<REFERENCE_TEXT>
{reference}
</REFERENCE_TEXT>
<COMPARED_TEXT>
{compared}
</COMPARED_TEXT>
"""


@@ -59,14 +59,29 @@ async def text_consistency_evaluator(
compared: str,
/,
reference: str,
) -> float:
model: ConsistencyScore = await generate_model(
ConsistencyScore,
) -> EvaluationScore:
if not compared:
return EvaluationScore(
value=0,
comment="Input text was empty!",
)

if not reference:
return EvaluationScore(
value=0,
comment="Reference text was empty!",
)

score: CommonScoreModel = await generate_model(
CommonScoreModel,
instruction=INSTRUCTION,
input=f"Reference text: {reference}\n\nCompered text: {compared}",
input=INPUT_TEMPLATE.format(
reference=reference,
compared=compared,
),
examples=[
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Dolphins are intelligent marine mammals. They use echolocation "
"to navigate and hunt. Dolphins live in social groups called pods."
@@ -77,10 +92,10 @@ async def text_consistency_evaluator(
"to learn hunting techniques."
),
),
ConsistencyScore(score=0.0),
CommonScoreModel(score=0.0),
),
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Coffee is a popular beverage worldwide. "
"It's made from roasted coffee beans. Caffeine in coffee "
@@ -95,10 +110,10 @@ async def text_consistency_evaluator(
"the risk of certain diseases."
),
),
ConsistencyScore(score=2.0),
CommonScoreModel(score=2.0),
),
(
INPUT.format(
INPUT_TEMPLATE.format(
reference=(
"Photosynthesis is the process by which plants use sunlight to "
"produce energy. It requires water, carbon dioxide, and chlorophyll. "
@@ -111,8 +126,8 @@ async def text_consistency_evaluator(
"they release oxygen into the environment."
),
),
ConsistencyScore(score=4.0),
CommonScoreModel(score=4.0),
),
],
)
return model.score / 4
return score.normalized(divider=4)
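
Since the three rewritten evaluators now share one signature and one score model, they can be fanned out over a single text pair. This is a sketch only: the evaluator names and call style come from this diff, and the model/scope setup required by generate_model is assumed to be in place.

import asyncio

from draive.evaluators.text_coherence import text_coherence_evaluator
from draive.evaluators.text_conciseness import text_conciseness_evaluator
from draive.evaluators.text_consistency import text_consistency_evaluator


async def evaluate_pair(compared: str, reference: str) -> None:
    # Run the three rubric evaluators concurrently on the same text pair;
    # each one returns an EvaluationScore already normalized to 0-1.
    coherence, conciseness, consistency = await asyncio.gather(
        text_coherence_evaluator(compared, reference=reference),
        text_conciseness_evaluator(compared, reference=reference),
        text_consistency_evaluator(compared, reference=reference),
    )
    for name, score in (
        ("coherence", coherence),
        ("conciseness", conciseness),
        ("consistency", consistency),
    ):
        print(name, score.value, score.comment)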
