From f6f90e766f2452b80db86dc2ac41f38c6de502a0 Mon Sep 17 00:00:00 2001 From: Karolina Holewa Date: Thu, 18 Jul 2024 11:41:24 +0200 Subject: [PATCH] Add examples for evaluators --- src/draive/evaluators/text_coherence.py | 83 ++++++++++++++++++---- src/draive/evaluators/text_conciseness.py | 86 ++++++++++++++++++++--- src/draive/evaluators/text_consistency.py | 76 +++++++++++++++++--- src/draive/evaluators/text_coverage.py | 79 ++++++++++++++++++--- src/draive/evaluators/text_fluency.py | 53 +++++++++++--- src/draive/evaluators/text_readability.py | 56 ++++++++++++--- src/draive/evaluators/text_relevance.py | 71 ++++++++++++++++--- src/draive/evaluators/text_similarity.py | 60 ++++++++++++++-- 8 files changed, 494 insertions(+), 70 deletions(-) diff --git a/src/draive/evaluators/text_coherence.py b/src/draive/evaluators/text_coherence.py index 16827a6..9f300eb 100644 --- a/src/draive/evaluators/text_coherence.py +++ b/src/draive/evaluators/text_coherence.py @@ -20,33 +20,35 @@ class CoherenceScore(DataModel): Keep this document open while reviewing, and refer to it as needed. Evaluation Criteria: -Coherence (1-5) - the collective quality of all sentences. +Coherence (0.0-4.0) - the collective quality of all sentences. We align this dimension with the DUC (Document Understanding Conference) quality question of \ structure and coherence, whereby the text should be well-structured and well-organized. The compared text should not just be a heap of related information, but should build from sentence to sentence into a coherent body of information about a topic. Rating Scale: -1: Very low coherence - the text is chaotic, lacking logical connections between sentences. -2: Low coherence - some connections are visible, but the overall structure is weak. -3: Moderate coherence - the text has a noticeable structure, but with some shortcomings. -4: Good coherence - the text is well-organized with minor imperfections. -5: Excellent coherence - the text is exemplarily structured, with smooth transitions between ideas. +0.0: Very low coherence - the text is chaotic, lacking logical connections between sentences. +1.0: Low coherence - some connections are visible, but the overall structure is weak. +2.0: Moderate coherence - the text has a noticeable structure, but with some shortcomings. +3.0: Good coherence - the text is well-organized with minor imperfections. +4.0: Excellent coherence - the text is exemplarily structured, with smooth transitions \ +between ideas. Evaluation Steps: 1. Read the reference text carefully and identify the main topic and key points. 2. Read the compared text and compare it to the reference text. Check if the compared text covers the main topic and key points of the reference text, \ and if it presents them in a clear and logical order. -3. Assign a coherence score from 1 to 5 based on the provided criteria. +3. Assign a coherence score from 0.0 to 4.0 based on the provided criteria. + +Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ +do not exceed this value. """ -INPUT: str = """\ -Reference text: -{reference} +INPUT: str = """ +Reference text: {reference} -Compered text: -{compared} +Compered text: {compared} """ @@ -60,6 +62,59 @@ async def text_coherence_evaluator( CoherenceScore, instruction=INSTRUCTION, input=INPUT.format(reference=reference, compared=compared), + examples=[ + ( + INPUT.format( + reference=( + "Solar energy is a renewable energy source that is gaining popularity. " + "Solar panels convert sunlight into electricity. 
" + "This technology is environmentally friendly and can reduce electricity " + "bills. However, installing solar panels requires an initial investment " + "and is dependent on weather conditions." + ), + compared=( + "Solar panels are on roofs. Energy is important. " + "The sun shines brightly. Electricity bills can be high. " + "Technology is developing fast. People like to save money." + ), + ), + CoherenceScore(score=0.0), + ), + ( + INPUT.format( + reference=( + "Coffee is a popular beverage worldwide. " + "It's made from roasted coffee beans. Caffeine in coffee " + "can boost energy and alertness. However, excessive consumption may " + "lead to sleep issues." + ), + compared=( + "Coffee is drunk by many people. It comes from beans that are roasted. " + "Caffeine makes you feel more awake. " + "Drinking too much coffee might make it hard to sleep. " + "Some people add milk or sugar to their coffee." + ), + ), + CoherenceScore(score=2.0), + ), + ( + INPUT.format( + reference=( + "Honey is a natural sweetener produced by bees. " + "It has antibacterial properties and is rich in antioxidants. " + "People use honey in cooking, as a spread, and for medicinal " + "purposes. However, it's high in calories and should be consumed " + "in moderation." + ), + compared=( + "Bees create honey, a natural sweetener with multiple benefits. " + "Its antibacterial and antioxidant-rich composition makes it valuable " + "for culinary, nutritional, and medicinal uses. While versatile, " + "honey's high caloric content necessitates mindful consumption." + ), + ), + CoherenceScore(score=4.0), + ), + ], ) - - return model.score / 5 + return model.score / 4 diff --git a/src/draive/evaluators/text_conciseness.py b/src/draive/evaluators/text_conciseness.py index c2b1738..e85bd13 100644 --- a/src/draive/evaluators/text_conciseness.py +++ b/src/draive/evaluators/text_conciseness.py @@ -20,17 +20,17 @@ class ConcisenessScore(DataModel): Keep this document open while reviewing, and refer to it as needed. Evaluation Criteria: -Conciseness (1-5) - the extent to which the compared text is brief and to the point \ +Conciseness (0.0-4.0) - the extent to which the compared text is brief and to the point \ while still covering all key information. A concise compared text avoids unnecessary details and repetition. Annotators should penalize compared texts that are overly verbose or include irrelevant information. Rating Scale: -1: Very low conciseness - the text is excessively verbose with much irrelevant information. -2: Low conciseness - the text contains unnecessary details and some irrelevant information. -3: Moderate conciseness - the text is somewhat concise but could be more focused. -4: Good conciseness - the text is mostly concise with minimal unnecessary information. -5: Excellent conciseness - the text is highly concise, containing only essential information. +0.0: Very low conciseness - the text is excessively verbose with much irrelevant information. +1.0: Low conciseness - the text contains unnecessary details and some irrelevant information. +2.0: Moderate conciseness - the text is somewhat concise but could be more focused. +3.0: Good conciseness - the text is mostly concise with minimal unnecessary information. +4.0: Excellent conciseness - the text is highly concise, containing only essential information. Evaluation Steps: 1. Read the derived text and the reference text carefully. @@ -38,7 +38,16 @@ class ConcisenessScore(DataModel): points of the reference text. 3. 
Assess how well the compared text covers the main points of the reference text, \
 and how much irrelevant or redundant information it contains.
-4. Assign a conciseness score from 1 to 5 based on the provided criteria.
+4. Assign a conciseness score from 0.0 to 4.0 based on the provided criteria.
+
+Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
+do not exceed this value.
+"""
+
+INPUT: str = """
+Reference text: {reference}
+
+Compered text: {compared}
 """
 
@@ -52,6 +61,65 @@ async def text_conciseness_evaluator(
         ConcisenessScore,
         instruction=INSTRUCTION,
         input=f"Reference text: {reference}\n\nCompered text: {compared}",
+        examples=[
+            (
+                INPUT.format(
+                    reference=(
+                        "Solar energy is a renewable energy source that is gaining popularity. "
+                        "Solar panels convert sunlight into electricity. "
+                        "This technology is environmentally friendly and can reduce electricity "
+                        "bills. However, installing solar panels requires an initial investment and "
+                        "is dependent on weather conditions."
+                    ),
+                    compared=(
+                        "Did you know that solar energy is becoming super popular these days? "
+                        "It's this amazing, eco-friendly way to make electricity using "
+                        "the sun's rays. People are getting really excited about it! Basically, "
+                        "you put these special panels on your roof, and they soak up the sunlight "
+                        "like a sponge. Then, through some pretty cool science stuff, "
+                        "they turn that sunlight into electricity you can use in your house. "
+                        "It's pretty neat, right? And get this - it can actually help you save "
+                        "money on your electricity bills in the long run. But here's the thing: "
+                        "you've got to shell out some cash upfront to get those panels installed. "
+                        "It's kind of like buying a fancy coffee machine - costs a bit at first, "
+                        "but then you save on all those coffee shop visits."
+                    ),
+                ),
+                ConcisenessScore(score=0.0),
+            ),
+            (
+                INPUT.format(
+                    reference=(
+                        "Coffee is a popular beverage worldwide. "
+                        "It's made from roasted coffee beans. Caffeine in coffee "
+                        "can boost energy and alertness. However, excessive consumption may "
+                        "lead to sleep issues."
+                    ),
+                    compared=(
+                        "Coffee is a widely consumed beverage made from roasted coffee beans. "
+                        "It contains caffeine, which can enhance energy and alertness. However, "
+                        "drinking too much coffee may cause sleep problems. "
+                        "People enjoy coffee for its taste and stimulating effects, but it's "
+                        "important to consume it in moderation."
+                    ),
+                ),
+                ConcisenessScore(score=2.0),
+            ),
+            (
+                INPUT.format(
+                    reference=(
+                        "The water cycle, also known as the hydrologic cycle, "
+                        "describes the continuous movement of water within the Earth and "
+                        "atmosphere. It involves processes such as evaporation, condensation, "
+                        "precipitation, and runoff."
+                    ),
+                    compared=(
+                        "The water cycle is the continuous movement of water on Earth. "
+                        "It includes evaporation, condensation, precipitation, and runoff."
+                    ),
+                ),
+                ConcisenessScore(score=4.0),
+            ),
+        ],
     )
-
-    return model.score / 5
+    return model.score / 4
diff --git a/src/draive/evaluators/text_consistency.py b/src/draive/evaluators/text_consistency.py
index 55af0a4..36fa33c 100644
--- a/src/draive/evaluators/text_consistency.py
+++ b/src/draive/evaluators/text_consistency.py
@@ -20,19 +20,19 @@ class ConsistencyScore(DataModel):
 Keep this document open while reviewing, and refer to it as needed.
 
 Evaluation Criteria:
-Consistency(1-5) - the factual alignment between the reference text and the compared text.
+Consistency(0.0-4.0) - the factual alignment between the reference text and the compared text. A factually consistent compared text contains only statements that are entailed \ by the reference text. Annotators should penalize compared texts that contain hallucinated facts. Rating Scale: -1: Very low consistency - the text contains multiple hallucinated facts \ +0.0: Very low consistency - the text contains multiple hallucinated facts \ or significant misalignments with the reference text. -2: Low consistency - the text has several instances of information not supported by \ +1.0: Low consistency - the text has several instances of information not supported by \ the reference text. -3: Moderate consistency - the text is mostly consistent but contains a few unsupported statements. -4: Good consistency - the text is largely consistent with minor discrepancies. -5: Excellent consistency - the text is fully consistent with the reference text, \ +2.0: Moderate consistency - the text is mostly consistent but contains a few unsupported statements. +3.0: Good consistency - the text is largely consistent with minor discrepancies. +4.0: Excellent consistency - the text is fully consistent with the reference text, \ containing only supported information. Evaluation Steps: @@ -41,7 +41,16 @@ class ConsistencyScore(DataModel): of the reference text. 3. Assess how well the compared text covers the main points of the reference text \ and how much irrelevant or redundant information it contains. -4. Assign a consistency score from 1 to 5 based on the provided criteria. +4. Assign a consistency score from 0.0 to 4.0 based on the provided criteria. + +Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ +do not exceed this value. +""" + +INPUT: str = """ +Reference text: {reference} + +Compered text: {compared} """ @@ -55,6 +64,55 @@ async def text_consistency_evaluator( ConsistencyScore, instruction=INSTRUCTION, input=f"Reference text: {reference}\n\nCompered text: {compared}", + examples=[ + ( + INPUT.format( + reference=( + "Dolphins are intelligent marine mammals. They use echolocation " + "to navigate and hunt. Dolphins live in social groups called pods." + ), + compared=( + "Dolphins are smart fish that can fly short distances. They use sonar " + "to talk to whales. Dolphins live in families and go to school " + "to learn hunting techniques." + ), + ), + ConsistencyScore(score=0.0), + ), + ( + INPUT.format( + reference=( + "Coffee is a popular beverage worldwide. " + "It's made from roasted coffee beans. Caffeine in coffee " + "can boost energy and alertness. However, excessive consumption may " + "lead to sleep issues." + ), + compared=( + "Coffee is a widely consumed drink around the world. It's produced " + "by roasting coffee beans. The caffeine in coffee can increase energy " + "levels and improve alertness. However, drinking too much coffee might " + "cause sleep problems. Coffee is also known to improve memory and reduce " + "the risk of certain diseases." + ), + ), + ConsistencyScore(score=2.0), + ), + ( + INPUT.format( + reference=( + "Photosynthesis is the process by which plants use sunlight to " + "produce energy. It requires water, carbon dioxide, and chlorophyll. " + "Oxygen is released as a byproduct of photosynthesis." + ), + compared=( + "Plants carry out photosynthesis to create energy from sunlight. " + "This process needs water, carbon dioxide, and the green pigment " + "chlorophyll. 
As plants photosynthesize, " + "they release oxygen into the environment." + ), + ), + ConsistencyScore(score=4.0), + ), + ], ) - - return model.score / 5 + return model.score / 4 diff --git a/src/draive/evaluators/text_coverage.py b/src/draive/evaluators/text_coverage.py index 6cb7fa7..f8cc255 100644 --- a/src/draive/evaluators/text_coverage.py +++ b/src/draive/evaluators/text_coverage.py @@ -20,18 +20,18 @@ class CoverageScore(DataModel): Keep this document open while reviewing, and refer to it as needed. Evaluation Criteria: -Coverage (1-5) - the extent to which the compared text includes all \ +Coverage (0.0-4.0) - the extent to which the compared text includes all \ the key points from the reference text. A compared text with good coverage includes all the important information from \ the reference text without omitting critical points. Annotators should penalize compared texts that miss significant content. Rating Scale: -1: Very low coverage - the text misses most key points from the reference text. -2: Low coverage - the text includes some key points but omits several important ones. -3: Moderate coverage - the text covers most key points but misses a few important details. -4: Good coverage - the text includes nearly all key points with minor omissions. -5: Excellent coverage - the text comprehensively covers all key points from the reference text. +0.0: Very low coverage - the text misses most key points from the reference text. +1.0: Low coverage - the text includes some key points but omits several important ones. +2.0: Moderate coverage - the text covers most key points but misses a few important details. +3.0: Good coverage - the text includes nearly all key points with minor omissions. +4.0: Excellent coverage - the text comprehensively covers all key points from the reference text. Evaluation Steps: 1. Read the reference text carefully and identify all key points and important information. @@ -40,7 +40,16 @@ class CoverageScore(DataModel): from the reference text. 3. Assess how well the compared text covers the reference text, \ and if any critical points are missing. -4. Assign a coverage score from 1 to 5 based on the provided criteria. +4. Assign a coverage score from 0.0 to 4.0 based on the provided criteria. + +Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ +do not exceed this value. +""" + +INPUT: str = """ +Reference text: {reference} + +Compered text: {compared} """ @@ -54,6 +63,60 @@ async def text_coverage_evaluator( CoverageScore, instruction=INSTRUCTION, input=f"Reference text: {reference}\n\nCompered text: {compared}", + examples=[ + ( + INPUT.format( + reference=( + "Smartphones are versatile devices. They can make calls, send messages, " + "access the internet, take photos, and run various apps. " + "Many people use smartphones for work and entertainment. " + "However, excessive use can lead to addiction and sleep problems." + ), + compared=( + "Smartphones can make calls and send messages. They are popular devices." + ), + ), + CoverageScore(score=0.0), + ), + ( + INPUT.format( + reference=( + "Recycling helps protect the environment. It reduces waste in landfills, " + "conserves natural resources, and saves energy. Common recyclable items " + "include paper, plastic, glass, and metal. Many cities have recycling " + "programs, but individual participation is crucial for success." + ), + compared=( + "Recycling is good for the environment. " + "It reduces waste and saves resources. 
" + "People can recycle things like paper and plastic. " + "Many cities have recycling programs." + ), + ), + CoverageScore(score=2.0), + ), + ( + INPUT.format( + reference=( + "Regular exercise is important for health. It strengthens the heart, " + "builds muscle, and improves flexibility. Exercise can also reduce stress " + "and boost mood. Experts recommend at least 30 minutes of moderate " + "activity most days of the week. Walking, swimming, and cycling are " + "good options for many people." + ), + compared=( + "Regular exercise is crucial for maintaining good health. " + "It has many benefits, including strengthening the heart, " + "building muscle, and enhancing flexibility. Exercise also has " + "mental health benefits, such as reducing stress and improving mood. " + "Health experts advise doing at least 30 minutes of moderate exercise " + "on most days. Some popular and accessible forms of exercise " + "include walking, swimming, and cycling." + ), + ), + CoverageScore(score=4.0), + ), + ], ) - return model.score / 5 + return model.score / 4 diff --git a/src/draive/evaluators/text_fluency.py b/src/draive/evaluators/text_fluency.py index 4388712..d8720b0 100644 --- a/src/draive/evaluators/text_fluency.py +++ b/src/draive/evaluators/text_fluency.py @@ -19,18 +19,25 @@ class FluencyScore(DataModel): Keep this document open while reviewing, and refer to it as needed. Evaluation Criteria: -Fluency (1-3) - the quality of the text in terms of grammar, spelling, punctuation, word choice, \ -and sentence structure. +Fluency (0.0-2.0) - the quality of the text in terms of grammar, spelling, punctuation, \ +word choice, and sentence structure. Rating Scale: -1: Poor - the text has many errors that make it hard to understand or sound unnatural. -2: Fair - the text has some errors that affect the clarity or smoothness of the text, \ +0.0: Poor - the text has many errors that make it hard to understand or sound unnatural. +1.0: Fair - the text has some errors that affect the clarity or smoothness of the text, \ but the main points are still comprehensible. -3: Good - the text has few or no errors and is easy to read and follow. +2.0: Good - the text has few or no errors and is easy to read and follow. Evaluation Steps: 1. Read the text and evaluate its fluency based on the given criteria. -2. Assign a fluency score from 1 to 3 based on the provided criteria. +2. Assign a fluency score from 0.0 to 2.0 based on the provided criteria. + +Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \ +do not exceed this value. +""" + +INPUT: str = """ +Text: {text} """ @@ -43,6 +50,36 @@ async def text_fluency_evaluator( FluencyScore, instruction=INSTRUCTION, input=text, + examples=[ + ( + INPUT.format( + text=( + "The cat sitted on mat. It were very comfrotable. " + "The sun shine bright in sky." + ), + ), + FluencyScore(score=0.0), + ), + ( + INPUT.format( + text=( + "The movie was good, but I didn't liked the ending. " + "It left me feeling confuse and unsatisfied." + ), + ), + FluencyScore(score=1.0), + ), + ( + INPUT.format( + text=( + "The concert last night was amazing. " + "The band played all their hit songs, and the crowd was energetic " + "throughout the performance." 
+                    ),
+                ),
+                FluencyScore(score=2.0),
+            ),
+        ],
     )
-
-    return model.score / 3
+    return model.score / 2
diff --git a/src/draive/evaluators/text_readability.py b/src/draive/evaluators/text_readability.py
index b49d061..380bd79 100644
--- a/src/draive/evaluators/text_readability.py
+++ b/src/draive/evaluators/text_readability.py
@@ -19,26 +19,34 @@ class ReadabilityScore(DataModel):
 Keep this document open while reviewing, and refer to it as needed.
 
 Evaluation Criteria:
-Readability (1-5) - the ease with which a reader can understand the text.
+Readability (0.0-4.0) - the ease with which a reader can understand the text.
 A readable text uses clear and concise language, is well-structured,
 and avoids complex or convoluted sentences. Annotators should penalize texts that \
 are difficult to read or understand.
 
 Rating Scale:
-1: Very low readability - the text is extremely difficult to understand, \
+0.0: Very low readability - the text is extremely difficult to understand, \
 with complex language and convoluted structure.
-2: Low readability - the text is challenging to read, with frequent use of \
+1.0: Low readability - the text is challenging to read, with frequent use of \
 complex sentences or unclear language.
-3: Moderate readability - the text is somewhat clear but has some areas \
+2.0: Moderate readability - the text is somewhat clear but has some areas \
 that are difficult to understand.
-4: Good readability - the text is mostly clear and easy to read, with minor instances of complexity.
-5: Excellent readability - the text is highly clear, concise, and easy to understand throughout.
+3.0: Good readability - the text is mostly clear and easy to read, with minor instances \
+of complexity.
+4.0: Excellent readability - the text is highly clear, concise, and easy to understand throughout.
 
 Evaluation Steps:
 1. Read the text carefully and evaluate how easy it is to read and understand.
 2. Consider the language used in the text, including clarity, simplicity, and sentence structure.
 3. Assess whether the text is well-structured and free from complex or convoluted sentences.
-4. Assign a readability score from 1 to 5 based on the provided criteria.
+4. Assign a readability score from 0.0 to 4.0 based on the provided criteria.
+
+Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
+do not exceed this value.
+"""
+
+INPUT: str = """
+Text: {text}
 """
 
 
@@ -51,6 +59,38 @@ async def text_readability_evaluator(
         ReadabilityScore,
         instruction=INSTRUCTION,
         input=text,
+        examples=[
+            (
+                INPUT.format(
+                    text=(
+                        "The canine species, frequently domesticated for companionship purposes, "
+                        "exhibit characteristics of fidelity and ludic propensities that engender "
+                        "their widespread appeal among human populations as domestic "
+                        "animal companions."
+                    ),
+                ),
+                ReadabilityScore(score=0.0),
+            ),
+            (
+                INPUT.format(
+                    text=(
+                        "Pizza, a widely consumed dish, consists of a circular bread foundation "
+                        "adorned with various ingredients. Typically, it includes a layer of "
+                        "tomato-based sauce and cheese, though additional toppings may be "
+                        "incorporated to suit individual preferences."
+                    ),
+                ),
+                ReadabilityScore(score=2.0),
+            ),
+            (
+                INPUT.format(
+                    text=(
+                        "Exercise is good for health. It helps maintain fitness and reduces stress."
+ ), + ), + ReadabilityScore(score=4.0), + ), + ], ) - return model.score / 5 + return model.score / 4 diff --git a/src/draive/evaluators/text_relevance.py b/src/draive/evaluators/text_relevance.py index 466a4cb..06ef370 100644 --- a/src/draive/evaluators/text_relevance.py +++ b/src/draive/evaluators/text_relevance.py @@ -20,19 +20,19 @@ class RelevanceScore(DataModel): Keep this document open while reviewing, and refer to it as needed. Evaluation Criteria: -Relevance (1-5) - selection of important content from the reference text. +Relevance (0.0-4.0) - selection of important content from the reference text. The compared text should include only important information from the reference text. Annotators should penalize compared texts that contain redundancies and excess information. Rating Scale: -1: Very low relevance - the text contains mostly irrelevant or redundant information. -2: Low relevance - the text includes some important points but has \ +0.0: Very low relevance - the text contains mostly irrelevant or redundant information. +1.0: Low relevance - the text includes some important points but has \ significant irrelevant content. -3: Moderate relevance - the text covers most important points but includes \ +2.0: Moderate relevance - the text covers most important points but includes \ some unnecessary information. -4: Good relevance - the text focuses on important information with minor inclusions \ +3.0: Good relevance - the text focuses on important information with minor inclusions \ of less relevant content. -5: Excellent relevance - the text precisely captures only the most important information \ +4.0: Excellent relevance - the text precisely captures only the most important information \ from the reference text. Evaluation Steps: @@ -41,7 +41,16 @@ class RelevanceScore(DataModel): the main points of the reference text. 3. Assess how well the compared text covers the main points of the reference text, \ and note any irrelevant or redundant information it contains. -4. Assign a relevance score from 1 to 5 based on the provided criteria. +4. Assign a relevance score from 0.0 to 4.0 based on the provided criteria. + +Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ +do not exceed this value. +""" + +INPUT: str = """ +Reference text: {reference} + +Compered text: {compared} """ @@ -55,6 +64,52 @@ async def text_relevance_evaluator( RelevanceScore, instruction=INSTRUCTION, input=f"Reference text: {reference}\n\nCompered text: {compared}", + examples=[ + ( + INPUT.format( + reference=( + "The sun is the star at the center of our solar system. " + "It provides light and heat to Earth." + ), + compared=( + "Stars twinkle in the night sky. Some people believe in astrology. " + "The moon orbits the Earth. Astronauts have been to space. " + "Solar panels use energy from the sun." + ), + ), + RelevanceScore(score=0.0), + ), + ( + INPUT.format( + reference=( + "Elephants are the largest land animals. They have long trunks and tusks. " + "Elephants live in herds and are known for their intelligence." + ), + compared=( + "Elephants are very big animals. They use their trunks to grab food " + "and water. Elephants live together in groups. They're smart and have " + "good memories. Some people ride elephants in zoos, " + "but this can be harmful to the animals." + ), + ), + RelevanceScore(score=2.0), + ), + ( + INPUT.format( + reference=( + "Bicycles are a popular mode of transportation. They are eco-friendly " + "and provide exercise. 
However, cyclists need to follow " + "traffic rules for safety." + ), + compared=( + "Bicycles are widely used for travel. " + "They don't pollute and help people stay fit. " + "Cyclists must obey traffic laws to stay safe." + ), + ), + RelevanceScore(score=4.0), + ), + ], ) - return model.score / 5 + return model.score / 4 diff --git a/src/draive/evaluators/text_similarity.py b/src/draive/evaluators/text_similarity.py index 95ab163..16a3877 100644 --- a/src/draive/evaluators/text_similarity.py +++ b/src/draive/evaluators/text_similarity.py @@ -23,19 +23,28 @@ class SimilarityScore(DataModel): Keep this document open while reviewing, and refer to it as needed. Evaluation Criteria: -Similarity (1-3) - the degree of semantic similarity between the reference text \ +Similarity (0.0-2.0) - the degree of semantic similarity between the reference text \ and the compared text. Rating Scale: -1: No similarity - the reference text and compared text are completely unrelated in meaning. -2: Moderate similarity - the reference text and compared text share some common themes or ideas. -3: High similarity - the reference text and compared text are very close in meaning \ +0.0: No similarity - the reference text and compared text are completely unrelated in meaning. +1.0: Moderate similarity - the reference text and compared text share some common themes or ideas. +2.0: High similarity - the reference text and compared text are very close in meaning \ or convey the same information. Evaluation Steps: 1. Read both the reference text and the compared text carefully. 2. Compare the semantic meaning of the reference text and the compared text. -3. Assign a similarity score from 1 to 3 based on the provided criteria. +3. Assign a similarity score from 0.0 to 2.0 based on the provided criteria. + +Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \ +do not exceed this value. +""" + +INPUT: str = """ +Reference text: {reference} + +Compered text: {compared} """ @@ -49,9 +58,48 @@ async def text_similarity_evaluator( SimilarityScore, instruction=INSTRUCTION, input=f"Reference text: {reference}\n\nCompered text: {compared}", + examples=[ + ( + INPUT.format( + reference=( + "Cats are popular pets. They are independent and like to groom themselves." + ), + compared=( + "Bananas are a healthy fruit. They are rich in potassium and easy to peel." + ), + ), + SimilarityScore(score=0.0), + ), + ( + INPUT.format( + reference=( + "The beach is a great place for relaxation. " + "People enjoy swimming and sunbathing." + ), + compared=( + "Many people like to spend time outdoors. " + "Parks are popular for picnics and walking." + ), + ), + SimilarityScore(score=1.0), + ), + ( + INPUT.format( + reference=( + "Coffee is a popular morning drink. It contains caffeine which helps " + "people feel more alert." + ), + compared=( + "Many people start their day with coffee. " + "The caffeine in coffee can increase alertness and energy." + ), + ), + SimilarityScore(score=2.0), + ), + ], ) - return model.score / 3 + return model.score / 2 @evaluator(name="text_vector_similarity")
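
For reviewers who want to exercise the updated evaluators locally, a minimal sketch follows. It is not part of the patch: the import path and the keyword arguments are inferred from the file paths and variable names visible in this diff, the direct await of the evaluator is an assumption (the module registers evaluators through an @evaluator decorator, visible at the end of the diff, so the call shape may need adjustment), and the LLM-backed setup the evaluators rely on is assumed to be configured elsewhere in your draive application.

    import asyncio

    from draive.evaluators.text_coherence import text_coherence_evaluator


    async def main() -> None:
        # Hypothetical inputs; parameter names follow the variables used in this diff,
        # and the texts are borrowed from the patch's own honey example.
        score = await text_coherence_evaluator(
            compared="Bees create honey, a natural sweetener with multiple benefits.",
            reference=(
                "Honey is a natural sweetener produced by bees. "
                "It has antibacterial properties and is rich in antioxidants."
            ),
        )
        # With this patch applied, the raw 0.0-4.0 coherence score is divided by 4,
        # so the printed value is expected to fall between 0.0 and 1.0.
        print(score)


    if __name__ == "__main__":
        asyncio.run(main())

Any model-client or context configuration your draive setup requires must be wired in before the call; the 0.0-1.0 normalization noted in the comments is the only behavior this patch itself defines.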