-
Notifications
You must be signed in to change notification settings - Fork 445
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add benchmark suite for compilation (#542)
- Loading branch information
Showing
6 changed files
with
183 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import pytest | ||
|
||
from outlines.fsm.fsm import RegexFSM | ||
from outlines.models.transformers import TransformerTokenizer | ||
|
||
|
||
@pytest.fixture
def tokenizer():
    """Provide a GPT-2 tokenizer wrapped in outlines' tokenizer interface."""
    gpt2_tokenizer = TransformerTokenizer("gpt2")
    return gpt2_tokenizer
@pytest.fixture
def ensure_numba_compiled(tokenizer):
    """Warm up the numba-jitted FSM machinery before any timed benchmark runs.

    Building one FSM triggers JIT compilation; "a" is the cheapest pattern
    that does so. Returns True so dependent tests can require the fixture.
    """
    RegexFSM("a", tokenizer)
    return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import pytest | ||
|
||
import outlines | ||
|
||
outlines.disable_cache() | ||
|
||
from outlines.fsm.fsm import RegexFSM # noqa: E402 | ||
from outlines.fsm.json_schema import build_regex_from_object # noqa: E402 | ||
|
||
# Small, flat JSON schema (scalar properties plus one enum $ref) — the
# "easy" case for the schema-to-regex benchmarks below.
# NOTE: the original literal had a stray trailing backslash (line
# continuation) after the "strength" property line; removed — it served
# no purpose inside the triple-quoted string and was clearly accidental.
simple_schema = """{
        "$defs": {
            "Armor": {
                "enum": ["leather", "chainmail", "plate"],
                "title": "Armor",
                "type": "string"
            }
        },
        "properties": {
            "name": {"maxLength": 10, "title": "Name", "type": "string"},
            "age": {"title": "Age", "type": "integer"},
            "armor": {"$ref": "#/$defs/Armor"},
            "strength": {"title": "Strength", "type": "integer"}
        },
        "required": ["name", "age", "armor", "strength"],
        "title": "Character",
        "type": "object"
    }"""
||
# Nested JSON schema (draft-04 "definitions" with multiple $ref
# indirections and arrays of objects) — the "hard" case for the
# schema-to-regex benchmarks below.
complex_schema = """{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "title": "Schema for a recording",
  "type": "object",
  "definitions": {
    "artist": {
      "type": "object",
      "properties": {
        "id": {"type": "number"},
        "name": {"type": "string"},
        "functions": {
          "type": "array",
          "items": {"type": "string"}
        }
      },
      "required": ["id", "name", "functions"]
    }
  },
  "properties": {
    "id": {"type": "number"},
    "work": {
      "type": "object",
      "properties": {
        "id": {"type": "number"},
        "name": {"type": "string"},
        "composer": {"$ref": "#/definitions/artist"}
      }
    },
    "recording_artists": {
      "type": "array",
      "items": {"$ref": "#/definitions/artist"}
    }
  },
  "required": ["id", "work", "recording_artists"]
}"""
||
# Benchmark-case name -> raw schema string; keys become the pytest
# parametrize ids for the benchmarks below.
schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema)
||
@pytest.mark.parametrize("schema_name", schemas.keys())
def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name):
    """Time converting a JSON schema string into its guiding regex.

    `ensure_numba_compiled` is requested so JIT warm-up cost is excluded
    from the measurement.
    """
    benchmark.pedantic(
        build_regex_from_object,
        args=(schemas[schema_name],),
        rounds=8,
    )
||
@pytest.mark.parametrize("schema_name", schemas.keys())
def test_benchmark_json_schema_to_fsm(
    benchmark, tokenizer, ensure_numba_compiled, schema_name
):
    """Time compiling a schema-derived regex into a token-level FSM.

    The schema-to-regex conversion happens outside the timed call so only
    RegexFSM construction is measured.
    """
    regex_str = build_regex_from_object(schemas[schema_name])
    benchmark.pedantic(RegexFSM, args=(regex_str, tokenizer), rounds=8)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import importlib | ||
|
||
import interegular | ||
import numba | ||
|
||
import outlines | ||
|
||
outlines.disable_cache() | ||
|
||
|
||
def test_benchmark_compile_numba(benchmark, tokenizer, mocker):
    """Compile a basic regex to benchmark the numba compilation time"""

    def setup():
        # Imported inside setup so the reload below re-executes the module
        # with the patched decorator in place.
        from outlines.fsm import regex

        original_njit = numba.njit

        # Force cache=False so numba genuinely recompiles each run instead
        # of loading a previously cached binary — otherwise the benchmark
        # would mostly measure cache I/O, not compilation.
        def mock_njit(*args, **kwargs):
            kwargs["cache"] = False
            return original_njit(*args, **kwargs)

        mocker.patch("numba.njit", new=mock_njit)
        # Patch BEFORE reloading: reloading re-applies the @njit decorators,
        # which must pick up the uncached variant.
        importlib.reload(regex)

        regex_pattern, _ = regex.make_deterministic_fsm(
            interegular.parse_pattern("a").to_fsm().reduce()
        )
        # pytest-benchmark setup protocol: return (args, kwargs) for the
        # benchmarked callable.
        return (regex, regex_pattern, tokenizer), {}

    # rounds=2 keeps total runtime bounded since each round pays full
    # numba compilation cost.
    benchmark.pedantic(
        lambda r, *args: r.create_fsm_index_tokenizer(*args), rounds=2, setup=setup
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import pytest | ||
|
||
import outlines | ||
|
||
outlines.disable_cache() | ||
|
||
from outlines.fsm.fsm import RegexFSM # noqa: E402 | ||
|
||
# Benchmark-case name -> raw regex pattern; keys become the pytest
# parametrize ids. Patterns range from short (ssn) to pathological
# (complex_span_constrained_relation_extraction) to exercise FSM
# construction at different scales.
regex_samples = {
    "email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
    "complex_phone": "\\+?\\d{1,4}?[-.\\s]?\\(?\\d{1,3}?\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}",
    "simple_phone": "\\+?[1-9][0-9]{7,14}",
    "date": r"([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])|([0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])",
    "time": r"(0?[1-9]|1[0-2]):[0-5]\d\s?(am|pm)?",
    "ip": r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    "url": r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
    "ssn": r"\d{3}-\d{2}-\d{4}",
    "complex_span_constrained_relation_extraction": "(['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?\\s\\|\\s([^|\\(\\)\n]{1,})\\s\\|\\s['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?(\\s\\|\\s\\(([^|\\(\\)\n]{1,})\\s\\|\\s([^|\\(\\)\n]{1,})\\))*\\n)*",
}
||
@pytest.mark.parametrize("regex_name", regex_samples.keys())
def test_benchmark_regex_to_fsm(
    benchmark, tokenizer, ensure_numba_compiled, regex_name
):
    """Time compiling a raw regex pattern into a token-level FSM.

    The pattern lookup happens before the timed call so only RegexFSM
    construction is measured; JIT warm-up is excluded via the
    `ensure_numba_compiled` fixture.
    """
    pattern = regex_samples[regex_name]
    benchmark.pedantic(RegexFSM, args=(pattern, tokenizer), rounds=8)