Merged
Changes from 12 commits
5 changes: 4 additions & 1 deletion .github/actions/install-internal-pip/action.yml
@@ -30,4 +30,7 @@ runs:
else
URL="git+ssh://git@${{ inputs.host }}/${{ inputs.repo }}.git"
fi
pip install "$URL" ${{ inputs.pip-extra-args }}
echo "Installing from URL: $URL"
pip install --no-cache-dir --force-reinstall "$URL" ${{ inputs.pip-extra-args }}
# Report the installed llmevalkit version and install location
python -c "import importlib.metadata; import sys; print(f'Installed {importlib.metadata.version(\"llmevalkit\")} from {sys.modules.get(\"llmevalkit\").__path__[0] if \"llmevalkit\" in sys.modules else \"not loaded\"}')" || echo "Failed to get LLMEvalKit version info"
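The inline probe above reports the installed llmevalkit version. Unpacked into a standalone script, the same idea is easier to follow; a minimal sketch, assuming the distribution/import name "llmevalkit" used in the action, and resolving the install location via importlib.util.find_spec rather than sys.modules:

# Sketch only: readable equivalent of the inline version probe in the step above.
# Assumes the distribution/import name "llmevalkit"; adjust if the package is named differently.
import importlib.metadata
import importlib.util

try:
    version = importlib.metadata.version("llmevalkit")   # raises PackageNotFoundError if not installed
    spec = importlib.util.find_spec("llmevalkit")         # locate the installed package, if importable
    location = spec.origin if spec is not None else "unknown location"
    print(f"Installed llmevalkit {version} from {location}")
except importlib.metadata.PackageNotFoundError:
    print("Failed to get LLMEvalKit version info")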
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
hooks:
- id: enforce-relative-imports
name: Enforce Relative Imports
entry: python utils/enforce_relative_imports.py
entry: python3 utils/enforce_relative_imports.py
language: system
# Adjust the files pattern to match your needs
files: ^src/.*\.py$
@@ -40,7 +40,7 @@
hooks:
- id: enforce-library-imports
name: Enforce Library Imports
entry: python utils/enforce_library_imports.py
entry: python3 utils/enforce_library_imports.py
language: system
# Adjust the files pattern to match your needs
exclude: (^src/.*\.py$)|utils/enforce_library_imports.py|utils/enforce_relative_imports.py
5 changes: 4 additions & 1 deletion examples/evaluate_tool_calling_with_reflection.py
@@ -64,7 +64,10 @@
test_set=data,
split="test",
format="formats.chat_api",
metrics=["metrics.tool_calling.reflection.syntactic"],
metrics=[
"metrics.tool_calling.reflection.syntactic",
"metrics.tool_calling.reflection",
],
max_test_instances=10,
)

13 changes: 11 additions & 2 deletions prepare/metrics/tool_calling.py
@@ -1,6 +1,7 @@
from unitxt.catalog import add_to_catalog
from unitxt.metrics import (
MultiTurnToolCallingMetric,
ReflectionToolCallingMetric,
ReflectionToolCallingMetricSyntactic,
ToolCallingMetric,
ToolCallKeyValueExtraction,
@@ -48,15 +49,23 @@

add_to_catalog(
MultiTurnToolCallingMetric(
__description__="""Metric that evaluates tool call predictions for the validity with regards to the tools schema."""
__description__="""A metric that assesses tool call predictions for their conformity to the tool schema."""
),
"metrics.tool_calling.multi_turn.validity",
overwrite=True,
)

add_to_catalog(
ReflectionToolCallingMetric(
__description__="""A metric that assesses tool call predictions for both syntactic correctness and semantic validity, using predefined checks combined with LLM-based evaluations. For each instance, it returns a score reflecting its overall validity, as well as a breakdown of the specific checks/metrics that passed or failed, including hallucination check, value format alignment, function selection, and agentic constraints satisfaction. Each check/metric also reports evidence from the input, an explanation describing the reflection decision, a confidence value, and a validity score in the range 1-5 (a higher score means more valid)."""
),
"metrics.tool_calling.reflection",
overwrite=True,
)

add_to_catalog(
ReflectionToolCallingMetricSyntactic(
__description__="""This metric evaluates whether a model's tool call outputs are structurally valid by checking their compliance with the provided tool schema. For each instance, it returns a binary score (True for valid, False for invalid), and aggregates these into a global percentage across all instances. The evaluation covers a wide range of possible issues, including nonexistent functions or parameters, incorrect parameter types, missing required parameters, values outside allowed ranges, JSON schema violations, invalid or empty API specifications, and malformed tool calls. The main reported score, overall_valid (aliased as score), reflects the proportion of calls that are fully valid, making the metric a measure of syntactic and schema-level correctness rather than semantic accuracy."""
__description__="""This metric evaluates whether a model's tool call outputs are structurally valid by checking their compliance with the provided tool schema. For each instance, it returns a binary score (True for valid, False for invalid), and aggregates these into a global percentage across all instances. The evaluation covers a wide range of possible issues, including nonexistent functions or parameters, incorrect parameter types, missing required parameters, values outside allowed ranges, JSON schema violations, invalid or empty API specifications, and malformed tool calls. The main reported score, overall_valid (aliased as score), reflects the proportion of calls that are fully valid, making the metric a measure of syntactic and schema-level correctness rather than semantic accuracy. Each instance result also includes an explanation describing the errors that were detected (if no errors were found, the explanation is None)."""
),
"metrics.tool_calling.reflection.syntactic",
overwrite=True,
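Once these catalog entries are registered, the reflection metrics can be referenced by name from an evaluation, as the updated example script above does. A minimal sketch of that usage, assuming unitxt's create_dataset/evaluate API; the task name and the schema of the data and predictions are placeholders, not part of this change:

# Sketch only: scoring model tool calls with the newly registered reflection metrics.
# The task name and the structure of `data`/`predictions` are assumptions; they are
# defined by the chosen unitxt task, not by this PR.
from unitxt import create_dataset, evaluate


def score_tool_calls(task: str, data: list, predictions: list):
    dataset = create_dataset(
        task=task,                     # e.g. a tool-calling task from the unitxt catalog
        test_set=data,
        split="test",
        format="formats.chat_api",
        metrics=[
            "metrics.tool_calling.reflection.syntactic",  # schema-level validity (binary per instance)
            "metrics.tool_calling.reflection",            # LLM-based checks with evidence and explanations
        ],
        max_test_instances=10,
    )
    # evaluate(predictions=..., data=...) follows the usage shown in unitxt's documentation;
    # adjust to the API of the unitxt version in use.
    return evaluate(predictions=predictions, data=dataset)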
@@ -1,4 +1,4 @@
{
"__type__": "multi_turn_tool_calling_metric",
"__description__": "Metric that evaluates tool call predictions for the validity with regards to the tools schema."
"__description__": "A metric that assesses tool call predictions for their conformity to the tool schema."
}
4 changes: 4 additions & 0 deletions src/unitxt/catalog/metrics/tool_calling/reflection.json
@@ -0,0 +1,4 @@
{
"__type__": "reflection_tool_calling_metric",
"__description__": "A metric that assesses tool call predictions for both syntactic correctness and semantic validity, using predefined checks combined with LLM-based evaluations. For each instance, it returns a score reflecting its overall validity, as well as a breakdown of the specific checks/metrics that passed or failed, including hallucination check, value format alignment, function selection, and agentic constraints satisfaction. Each check/metric also reports evidence from the input, an explanation describing the reflection decision, a confidence value, and a validity score in the range 1-5 (a higher score means more valid)."
}
@@ -1,4 +1,4 @@
{
"__type__": "reflection_tool_calling_metric_syntactic",
"__description__": "This metric evaluates whether a model's tool call outputs are structurally valid by checking their compliance with the provided tool schema. For each instance, it returns a binary score (True for valid, False for invalid), and aggregates these into a global percentage across all instances. The evaluation covers a wide range of possible issues, including nonexistent functions or parameters, incorrect parameter types, missing required parameters, values outside allowed ranges, JSON schema violations, invalid or empty API specifications, and malformed tool calls. The main reported score, overall_valid (aliased as score), reflects the proportion of calls that are fully valid, making the metric a measure of syntactic and schema-level correctness rather than semantic accuracy."
"__description__": "This metric evaluates whether a model's tool call outputs are structurally valid by checking their compliance with the provided tool schema. For each instance, it returns a binary score (True for valid, False for invalid), and aggregates these into a global percentage across all instances. The evaluation covers a wide range of possible issues, including nonexistent functions or parameters, incorrect parameter types, missing required parameters, values outside allowed ranges, JSON schema violations, invalid or empty API specifications, and malformed tool calls. The main reported score, overall_valid (aliased as score), reflects the proportion of calls that are fully valid, making the metric a measure of syntactic and schema-level correctness rather than semantic accuracy. Each instance result also includes an explanation describing the errors that were detected (if no errors were found, the explanation is None)."
}