Skip to content

Commit

Permalink
Evals
Browse files Browse the repository at this point in the history
  • Loading branch information
ddebowczyk committed Oct 12, 2024
1 parent 15c8020 commit 60e6989
Show file tree
Hide file tree
Showing 36 changed files with 918 additions and 116 deletions.
2 changes: 1 addition & 1 deletion evals/ComplexExtraction/report.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Acme Insurance project to implement SalesTech CRM solution is currently
in RED status due to delayed delivery of document production system, led
by 3rd party vendor - Alfatech. Customer (Acme) is discussing the resolution
with the vendor. Due to dependencies it will result in delay of the
ecommerce track by 2 sprints. System integrator (SysCorp) are working
ecommerce track by 2 sprints. System integrator (SysCorp) is working
to absorb some of the delay by deploying extra resources to speed up
development when the doc production is done. Another issue is that the
customer is not able to provide the test data for the ecommerce track.
Expand Down
68 changes: 39 additions & 29 deletions evals/ComplexExtraction/run.php
Original file line number Diff line number Diff line change
@@ -1,60 +1,70 @@
<?php

use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Contracts\CanEvaluateExperiment;
use Cognesy\Instructor\Extras\Evals\Contracts\Metric;
use Cognesy\Instructor\Extras\Evals\Data\Evaluation;
use Cognesy\Instructor\Extras\Evals\Data\Feedback;
use Cognesy\Instructor\Extras\Evals\Data\InferenceCases;
use Cognesy\Instructor\Extras\Evals\Data\InstructorData;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Inference\RunInstructor;
use Cognesy\Instructor\Extras\Evals\Metrics\BooleanCorrectness;
use Cognesy\Instructor\Extras\Evals\ExperimentSuite;
use Cognesy\Instructor\Extras\Evals\Metrics\PercentageCorrectness;
use Cognesy\Instructor\Extras\Sequence\Sequence;
use Cognesy\Instructor\Features\LLM\Data\Usage;

// Bootstrap Composer autoloading and register the library sources.
// __DIR__ never ends with a slash (except for the filesystem root), so the
// relative segment has to start with '/'. The previous '../../src/' suffix
// produced ".../ComplexExtraction../../src/", which is not an existing path.
$loader = require 'vendor/autoload.php';
$loader->add('Cognesy\\Instructor\\', __DIR__ . '/../../src/');

// Inference cases built with empty connection / mode / stream lists.
// NOTE(review): presumably empty lists mean "no restriction" - confirm against InferenceCases::get().
$cases = InferenceCases::get(
    connections: [],
    modes: [],
    stream: [],
);

// Plain data holder passed as `responseModel` to InstructorData in this script.
// Line comments (not docblocks) are used on purpose so that nothing new becomes
// visible through reflection-based doc comment lookups.
class Company {
// Company name; the evaluator in this script compares it with 'ACME'.
public string $name;
// Founding year; the evaluator in this script compares it with 2020.
public int $foundingYear;
}

/**
 * Scores an experiment by the share of expected events present in the response.
 *
 * The expected number of events is read from `$expectations['events']`.
 */
class CompanyEval implements CanEvaluateExperiment
{
    public array $expectations;

    /**
     * @param array $expectations Expected outcome; the 'events' entry holds the expected event count.
     */
    public function __construct(array $expectations) {
        $this->expectations = $expectations;
    }

    /**
     * Builds an Evaluation whose 'found' metric is found / expected, limited to 0.0..1.0.
     *
     * @param Experiment $experiment Experiment whose response value exposes a `list` of events.
     * @return Evaluation Evaluation carrying the 'found' metric, no feedback and no usage.
     */
    public function evaluate(Experiment $experiment) : Evaluation {
        $expectedEvents = $this->expectations['events'] ?? 0;
        /** @var Sequence|null $events */
        $events = $experiment->response->value();
        // A missing response value counts as zero events found.
        $foundEvents = count($events?->list ?? []);

        // The previous formula, (expected - found) / expected, yielded the share of
        // MISSING events (0.0 for a perfect run) although the metric is named 'found',
        // and it divided by zero when no events were expected.
        if ($expectedEvents > 0) {
            // Cap at 1.0 so that over-extraction cannot report more than 100%.
            $result = min(1.0, $foundEvents / $expectedEvents);
        } else {
            // Nothing expected: correct only when nothing was extracted.
            $result = $foundEvents === 0 ? 1.0 : 0.0;
        }

        return new Evaluation(
            metric: new PercentageCorrectness('found', $result),
            feedback: Feedback::none(),
            usage: Usage::none(),
        );
    }
}

// Read the report that sits next to this script.
$reportPath = __DIR__ . '/report.txt';
$report = file_get_contents($reportPath);
if ($report === false) {
    // file_get_contents() returns false on failure; stop here instead of
    // carrying a boolean forward as if it were the report text.
    throw new \RuntimeException("Unable to read report file: {$reportPath}");
}

// Examples are returned by the included PHP file.
$examples = require 'examples.php';

// Prompt text for the extraction; contains the literal <|json_schema|> marker.
$prompt = 'Extract a list of project events with all the details from the provided input in JSON format using schema: <|json_schema|>';

// Response model: a sequence of ProjectEvent items.
$responseModel = Sequence::of(ProjectEvent::class);

// Input for the executor: three user messages (goal, context, question) and
// the Company class as the response model.
// NOTE(review): the commented-out example pair below is inconsistent - the example
// context says 1946 while the example response says 1899, and the response uses the
// key "year" whereas Company declares `foundingYear`. Fix before re-enabling it.
$data = new InstructorData(
messages: [
['role' => 'user', 'content' => 'YOUR GOAL: Use tools to store the information from context based on user questions.'],
['role' => 'user', 'content' => 'CONTEXT: Our company ACME was founded in 2020.'],
//['role' => 'user', 'content' => 'EXAMPLE CONTEXT: Sony was established in 1946 by Akio Morita.'],
//['role' => 'user', 'content' => 'EXAMPLE RESPONSE: ```json{"name":"Sony","year":1899}```'],
['role' => 'user', 'content' => 'What is the name and founding year of our company?'],
],
responseModel: Company::class,
);

/**
 * Evaluator that checks the extracted company against fixed expected values.
 */
class CompanyEval implements CanEvaluateExperiment
{
/**
 * Returns a BooleanCorrectness that is true only when the response value has
 * name 'ACME' and foundingYear 2020 (both compared strictly).
 *
 * @param Experiment $experiment Experiment whose response value is inspected.
 * @return Metric
 */
public function evaluate(Experiment $experiment) : Metric {
// NOTE(review): presumably a Company instance, since Company::class is the response model above - confirm.
/** @var Company $person */
$person = $experiment->response->value();
$result = $person->name === 'ACME'
&& $person->foundingYear === 2020;
return new BooleanCorrectness($result);
}
}

//Debug::enable();

//$report = file_get_contents(__DIR__ . '/report.txt');
//$examples = require 'examples.php';
//$prompt = 'Extract a list of project events with all the details from the provided input in JSON format using schema: <|json_schema|>';
//$responseModel = Sequence::of(ProjectEvent::class);

$runner = new ExperimentSuite(
cases: InferenceCases::only(
connections: ['openai', 'anthropic', 'gemini', 'cohere'],
modes: [Mode::Tools],
stream: [true, false]
),
executor: new RunInstructor($data),
evaluator: new CompanyEval(),
evaluators: new CompanyEval(expectations: ['events' => 12]),
);

$outputs = $runner->execute($cases);
$outputs = $runner->execute();
12 changes: 9 additions & 3 deletions evals/LLMModes/CompanyEval.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Contracts\CanEvaluateExperiment;
use Cognesy\Instructor\Extras\Evals\Data\Evaluation;
use Cognesy\Instructor\Extras\Evals\Data\Feedback;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Metrics\BooleanCorrectness;
use Cognesy\Instructor\Extras\Evals\Contracts\Metric;
use Cognesy\Instructor\Features\LLM\Data\Usage;
use Cognesy\Instructor\Utils\Str;

class CompanyEval implements CanEvaluateExperiment
Expand All @@ -17,13 +19,17 @@ public function __construct(array $expectations) {
$this->expectations = $expectations;
}

public function evaluate(Experiment $experiment) : Metric {
public function evaluate(Experiment $experiment) : Evaluation {
$isCorrect = match ($experiment->mode) {
Mode::Text => $this->validateText($experiment),
Mode::Tools => $this->validateToolsData($experiment),
default => $this->validateDefault($experiment),
};
return new BooleanCorrectness($isCorrect);
return new Evaluation(
metric: new BooleanCorrectness('is_correct', $isCorrect),
feedback: Feedback::none(),
usage: Usage::none(),
);
}

private function validateToolsData(Experiment $experiment) : bool {
Expand Down
21 changes: 11 additions & 10 deletions evals/LLMModes/run.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,15 @@
// __DIR__ has no trailing slash, so each relative segment must start with '/'.
// Without it the paths became ".../LLMModes../../src/" and ".../LLMModes../../evals/",
// which do not point at the intended directories.
$loader->add('Cognesy\\Instructor\\', __DIR__ . '/../../src/');
$loader->add('Cognesy\\Evals\\', __DIR__ . '/../../evals/');

use Cognesy\Instructor\Extras\Evals\Aggregators\FirstMetric;
use Cognesy\Instructor\Extras\Evals\Aggregators\SelectedMetric;
use Cognesy\Instructor\Extras\Evals\Data\InferenceCases;
use Cognesy\Instructor\Extras\Evals\Data\InferenceData;
use Cognesy\Instructor\Extras\Evals\Data\InferenceSchema;
use Cognesy\Instructor\Extras\Evals\Inference\RunInference;
use Cognesy\Instructor\Extras\Evals\ExperimentSuite;
use Cognesy\Evals\LLMModes\CompanyEval;

$cases = InferenceCases::except(
connections: [],
modes: [],
stream: []
);

$data = new InferenceData(
messages: [
['role' => 'user', 'content' => 'YOUR GOAL: Use tools to store the information from context based on user questions.'],
Expand Down Expand Up @@ -46,13 +42,18 @@
),
);

$runner = new ExperimentSuite(
$experiments = new ExperimentSuite(
cases: InferenceCases::except(
connections: [],
modes: [],
stream: [],
),
executor: new RunInference($data),
evaluator: new CompanyEval(expectations: [
evaluators: new CompanyEval(expectations: [
'name' => 'ACME',
'foundingYear' => 2020
]),
cases: $cases,
aggregator: new FirstMetric()
);

$outputs = $runner->execute();
$outputs = $experiments->execute();
12 changes: 9 additions & 3 deletions evals/SimpleExtraction/CompanyEval.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
namespace Cognesy\Evals\SimpleExtraction;

use Cognesy\Instructor\Extras\Evals\Contracts\CanEvaluateExperiment;
use Cognesy\Instructor\Extras\Evals\Data\Evaluation;
use Cognesy\Instructor\Extras\Evals\Data\Feedback;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Metrics\BooleanCorrectness;
use Cognesy\Instructor\Extras\Evals\Contracts\Metric;
use Cognesy\Instructor\Features\LLM\Data\Usage;

class CompanyEval implements CanEvaluateExperiment
{
Expand All @@ -15,10 +17,14 @@ public function __construct(array $expectations) {
$this->expectations = $expectations;
}

public function evaluate(Experiment $experiment) : Metric {
public function evaluate(Experiment $experiment) : Evaluation {
$company = $experiment->response->value();
$isCorrect = $company->name === $this->expectations['name']
&& $company->foundingYear === $this->expectations['foundingYear'];
return new BooleanCorrectness($isCorrect);
return new Evaluation(
metric: new BooleanCorrectness('is_correct', $isCorrect),
feedback: Feedback::none(),
usage: Usage::none(),
);
}
}
18 changes: 8 additions & 10 deletions evals/SimpleExtraction/run.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<?php

use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Aggregators\FirstMetric;
use Cognesy\Instructor\Extras\Evals\Data\InferenceCases;
use Cognesy\Instructor\Extras\Evals\Data\InstructorData;
use Cognesy\Instructor\Extras\Evals\Inference\RunInstructor;
Expand All @@ -11,30 +12,27 @@
// Bootstrap Composer autoloading and register the library sources.
// __DIR__ never ends with a slash (except for the filesystem root), so the
// relative segment has to start with '/'. The previous '../../src/' suffix
// produced ".../SimpleExtraction../../src/", which is not an existing path.
$loader = require 'vendor/autoload.php';
$loader->add('Cognesy\\Instructor\\', __DIR__ . '/../../src/');

// Inference cases built via InferenceCases::except() with Mode::Text listed under modes.
// NOTE(review): presumably this leaves Text mode out of the run - confirm against InferenceCases.
$cases = InferenceCases::except(
    connections: [],
    modes: [Mode::Text],
    stream: [],
);

// Input for the executor: three user messages (goal, context, question) and
// the Company class as the response model.
// NOTE(review): the commented-out example pair below is inconsistent - the example
// context says 1946 while the example response says 1899, and the response uses the
// key "year" whereas the evaluator for this case compares `foundingYear`.
// Fix before re-enabling it.
$data = new InstructorData(
messages: [
['role' => 'user', 'content' => 'YOUR GOAL: Use tools to store the information from context based on user questions.'],
['role' => 'user', 'content' => 'CONTEXT: Our company ACME was founded in 2020.'],
//['role' => 'user', 'content' => 'EXAMPLE CONTEXT: Sony was established in 1946 by Akio Morita.'],
//['role' => 'user', 'content' => 'EXAMPLE RESPONSE: ```json{"name":"Sony","year":1899}```'],
['role' => 'user', 'content' => 'What is the name and founding year of our company?'],
],
responseModel: Company::class,
);

$runner = new ExperimentSuite(
cases: InferenceCases::except(
connections: ['ollama'],
modes: [Mode::Text],
stream: []
),
executor: new RunInstructor($data),
evaluator: new CompanyEval(expectations: [
evaluators: new CompanyEval(expectations: [
'name' => 'ACME',
'foundingYear' => 2020
]),
cases: $cases,
aggregator: new FirstMetric()
);

$outputs = $runner->execute();
16 changes: 16 additions & 0 deletions src/Extras/Evals/Aggregators/FirstMetric.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace Cognesy\Instructor\Extras\Evals\Aggregators;

use Cognesy\Instructor\Extras\Evals\Contracts\CanAggregateValues;
use Cognesy\Instructor\Extras\Evals\Contracts\Metric;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Metrics\NullMetric;

/**
 * Aggregator that reports the metric of an experiment's first evaluation.
 */
class FirstMetric implements CanAggregateValues
{
    /**
     * Picks the metric of the evaluation stored at index 0.
     *
     * @param Experiment $experiment Experiment whose `evaluations` are inspected.
     * @return Metric That evaluation's metric, or a NullMetric when index 0 is absent or falsy.
     */
    public function aggregate(Experiment $experiment): Metric {
        $head = $experiment->evaluations[0] ?? null;
        if (!$head) {
            return new NullMetric();
        }
        return $head->metric;
    }
}
24 changes: 24 additions & 0 deletions src/Extras/Evals/Aggregators/SelectedMetric.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?php

namespace Cognesy\Instructor\Extras\Evals\Aggregators;

use Cognesy\Instructor\Extras\Evals\Contracts\CanAggregateValues;
use Cognesy\Instructor\Extras\Evals\Contracts\Metric;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Metrics\NullMetric;

/**
 * Aggregator that reports the first metric whose name matches the configured one.
 */
class SelectedMetric implements CanAggregateValues
{
    /**
     * @param string $name Metric name to look for.
     */
    public function __construct(
        private string $name
    ) {}

    /**
     * Walks the experiment's evaluations in order and returns the first metric
     * whose name() is strictly equal to the configured name.
     *
     * @param Experiment $experiment Experiment whose `evaluations` are searched.
     * @return Metric The matching metric, or a NullMetric when none matches.
     */
    public function aggregate(Experiment $experiment): Metric {
        foreach ($experiment->evaluations as $candidate) {
            if ($candidate->metric->name() !== $this->name) {
                continue;
            }
            return $candidate->metric;
        }
        return new NullMetric();
    }
}
4 changes: 2 additions & 2 deletions src/Extras/Evals/Console/Display.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public function before(Experiment $experiment) : void {
public function after(Experiment $experiment) : void {
$answer = $experiment->notes;
$answerLine = str_replace("\n", '\n', $answer);
$metric = $experiment->metric;
$value = $experiment->aggregate;
$timeElapsed = $experiment->timeElapsed;
$tokensPerSec = $experiment->outputTps();
$exception = $experiment->exception;
Expand All @@ -45,7 +45,7 @@ public function after(Experiment $experiment) : void {
echo Console::columns([
[9, $this->timeFormat($timeElapsed), STR_PAD_LEFT, [Color::DARK_YELLOW]],
[10, $this->tokensPerSecFormat($tokensPerSec), STR_PAD_LEFT, [Color::CYAN]],
[6, $metric->toString(), STR_PAD_BOTH, $metric->toCliColor()],
[6, $value->toString(), STR_PAD_BOTH, $value->toCliColor()],
[60, $answerLine, STR_PAD_RIGHT, [Color::WHITE, Color::BG_BLACK]],
], 120);
}
Expand Down
16 changes: 16 additions & 0 deletions src/Extras/Evals/Contracts/CanAggregateValues.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace Cognesy\Instructor\Extras\Evals\Contracts;

use Cognesy\Instructor\Extras\Evals\Experiment;

/**
 * Contract for strategies that reduce an experiment to a single Metric.
 */
interface CanAggregateValues
{
/**
 * Produce one metric that represents the given experiment.
 *
 * @param Experiment $experiment Experiment to summarize.
 * @return Metric The resulting metric.
 */
public function aggregate(Experiment $experiment): Metric;
}
3 changes: 2 additions & 1 deletion src/Extras/Evals/Contracts/CanEvaluateExperiment.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

namespace Cognesy\Instructor\Extras\Evals\Contracts;

use Cognesy\Instructor\Extras\Evals\Data\Evaluation;
use Cognesy\Instructor\Extras\Evals\Experiment;

interface CanEvaluateExperiment
{
public function evaluate(Experiment $experiment) : Metric;
public function evaluate(Experiment $experiment) : Evaluation;
}
4 changes: 3 additions & 1 deletion src/Extras/Evals/Contracts/Metric.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

interface Metric
{
public function name() : string;
public function value() : mixed;
public function toLoss() : float;
public function toScore() : float;
}
public function toString () : string;
}
17 changes: 17 additions & 0 deletions src/Extras/Evals/Data/BooleanCorrectnessAnalysis.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?php

namespace Cognesy\Instructor\Extras\Evals\Data;

use Cognesy\Instructor\Features\Schema\Attributes\Description;

// Data holder for a correctness verdict: a textual assessment, a boolean decision
// and a list of feedback items.
// NOTE(review): the Description attribute texts are presumably consumed at runtime
// (e.g. for schema generation) - treat them as behavior, not as comments; confirm.
#[Description("The result of correctness evaluation.")]
class BooleanCorrectnessAnalysis
{
// Free-text reasoning comparing expected and actual results.
#[Description("Step by step assessment of the expected versus actual results.")]
public string $assessment;
// Final verdict.
#[Description("Decision if the actual result is correct.")]
public bool $isCorrect;
// Individual issues; per the description it stays empty when the result is correct.
#[Description("If the result is incorrect - list of individual issues found in the actual result considering the expected values. Otherwise empty.")]
/** @var ParameterFeedback[] */
public array $feedback;
}
Loading

0 comments on commit 60e6989

Please sign in to comment.