Skip to content

Commit

Permalink
Add benchmark suite for compilation (#542)
Browse files Browse the repository at this point in the history
  • Loading branch information
lapp0 authored Jan 25, 2024
1 parent 46dc706 commit d534c2f
Show file tree
Hide file tree
Showing 6 changed files with 183 additions and 0 deletions.
10 changes: 10 additions & 0 deletions docs/community/contribute.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@ And run the code style checks:
pre-commit run --all-files
```

#### Performance testing

Run benchmark tests:

```bash
pytest --benchmark-only
```

([other pytest-benchmark command line options](https://pytest-benchmark.readthedocs.io/en/latest/usage.html#commandline-options))

### Open a Pull Request

Create a new branch on your fork, commit and push the changes:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ dynamic = ["version"]
test = [
"pre-commit",
"pytest",
"pytest-benchmark",
"pytest-cov",
"pytest-mock",
"transformers",
Expand Down
15 changes: 15 additions & 0 deletions tests/benchmark/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest

from outlines.fsm.fsm import RegexFSM
from outlines.models.transformers import TransformerTokenizer


@pytest.fixture
def tokenizer():
    """Provide a ``TransformerTokenizer`` created from the name "gpt2"."""
    gpt2_tokenizer = TransformerTokenizer("gpt2")
    return gpt2_tokenizer


@pytest.fixture
def ensure_numba_compiled(tokenizer):
    """Build a throwaway ``RegexFSM`` for the pattern "a" and return ``True``.

    The FSM itself is discarded; it is constructed only for its side effects.
    As the fixture name indicates, the intent is presumably to have numba
    compilation happen here rather than inside a timed benchmark round.
    """
    _ = RegexFSM("a", tokenizer)
    return True
92 changes: 92 additions & 0 deletions tests/benchmark/test_benchmark_json_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import pytest

import outlines

outlines.disable_cache()

from outlines.fsm.fsm import RegexFSM # noqa: E402
from outlines.fsm.json_schema import build_regex_from_object # noqa: E402

# JSON Schema (as a JSON string) describing a flat "Character" object with
# four required properties: a length-limited string, two integers, and an
# "Armor" enum pulled in through a local "$ref".
# NOTE: the backslash that ends the "strength" line is a line continuation
# inside the string literal; it only drops one newline from the text and the
# resulting JSON is still valid.
simple_schema = """{
"$defs": {
"Armor": {
"enum": ["leather", "chainmail", "plate"],
"title": "Armor",
"type": "string"
}
},
"properties": {
"name": {"maxLength": 10, "title": "Name", "type": "string"},
"age": {"title": "Age", "type": "integer"},
"armor": {"$ref": "#/$defs/Armor"},
"strength": {"title": "Strength", "type": "integer"}\
},
"required": ["name", "age", "armor", "strength"],
"title": "Character",
"type": "object"
}"""


# JSON Schema (as a JSON string, declared against draft-04) describing a
# recording. Compared with the simple schema it adds nesting: a "work" object
# and a "recording_artists" array both refer to the shared
# "#/definitions/artist" definition, which itself contains an array of strings.
complex_schema = """{
"$schema": "http://json-schema.org/draft-04/schema#",
"title": "Schema for a recording",
"type": "object",
"definitions": {
"artist": {
"type": "object",
"properties": {
"id": {"type": "number"},
"name": {"type": "string"},
"functions": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["id", "name", "functions"]
}
},
"properties": {
"id": {"type": "number"},
"work": {
"type": "object",
"properties": {
"id": {"type": "number"},
"name": {"type": "string"},
"composer": {"$ref": "#/definitions/artist"}
}
},
"recording_artists": {
"type": "array",
"items": {"$ref": "#/definitions/artist"}
}
},
"required": ["id", "work", "recording_artists"]
}"""


# Benchmark inputs, keyed by the name that the parametrized tests select on.
schemas = {
    "simple_schema": simple_schema,
    "complex_schema": complex_schema,
}


@pytest.mark.parametrize("schema_name", list(schemas))
def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name):
    """Time the conversion of a JSON schema string into a regex.

    ``build_regex_from_object`` is called on the schema selected by
    ``schema_name`` for 8 benchmark rounds.
    """
    pedantic_options = {
        "args": (schemas[schema_name],),
        "rounds": 8,
    }
    benchmark.pedantic(build_regex_from_object, **pedantic_options)


@pytest.mark.parametrize("schema_name", list(schemas))
def test_benchmark_json_schema_to_fsm(
    benchmark, tokenizer, ensure_numba_compiled, schema_name
):
    """Time the construction of a ``RegexFSM`` from a JSON schema's regex.

    The schema-to-regex conversion runs once, outside the timed section; only
    the ``RegexFSM(regex, tokenizer)`` call is benchmarked, for 8 rounds.
    """
    schema_regex = build_regex_from_object(schemas[schema_name])
    pedantic_options = {
        "args": (schema_regex, tokenizer),
        "rounds": 8,
    }
    benchmark.pedantic(RegexFSM, **pedantic_options)
33 changes: 33 additions & 0 deletions tests/benchmark/test_benchmark_numba_compile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import importlib

import interegular
import numba

import outlines

outlines.disable_cache()


def test_benchmark_compile_numba(benchmark, tokenizer, mocker):
    """Benchmark index creation for the regex "a" with numba caching forced off.

    Each of the 2 rounds is preceded by a setup step that patches
    ``numba.njit`` to always pass ``cache=False`` and then reloads
    ``outlines.fsm.regex``. The timed call is
    ``create_fsm_index_tokenizer`` on the freshly reloaded module.
    """

    def prepare_round():
        from outlines.fsm import regex as regex_module

        real_njit = numba.njit

        def njit_without_cache(*njit_args, **njit_kwargs):
            # Whatever the caller requested, caching is overridden to False.
            return real_njit(*njit_args, **{**njit_kwargs, "cache": False})

        # Patch first, reload second, so that any ``numba.njit`` use evaluated
        # while the module is re-executed goes through the wrapper above.
        mocker.patch("numba.njit", new=njit_without_cache)
        importlib.reload(regex_module)

        parsed_fsm = interegular.parse_pattern("a").to_fsm().reduce()
        deterministic_fsm, _ = regex_module.make_deterministic_fsm(parsed_fsm)
        # pedantic() expects setup to return (args, kwargs) for the target.
        return (regex_module, deterministic_fsm, tokenizer), {}

    def build_index(module, *index_args):
        return module.create_fsm_index_tokenizer(*index_args)

    benchmark.pedantic(build_index, rounds=2, setup=prepare_round)
32 changes: 32 additions & 0 deletions tests/benchmark/test_benchmark_regex_fsm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest

import outlines

outlines.disable_cache()

from outlines.fsm.fsm import RegexFSM # noqa: E402

# Named regex patterns used as benchmark inputs; the keys are what the
# parametrized test below selects on. Some patterns are raw strings and others
# use escaped backslashes in ordinary strings, so the two forms must not be
# mixed up when editing: in the last entry "\n" is a literal newline character
# while "\\n" is the two-character regex escape.
regex_samples = {
"email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
"complex_phone": "\\+?\\d{1,4}?[-.\\s]?\\(?\\d{1,3}?\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}",
"simple_phone": "\\+?[1-9][0-9]{7,14}",
"date": r"([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])|([0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])",
"time": r"(0?[1-9]|1[0-2]):[0-5]\d\s?(am|pm)?",
"ip": r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
"url": r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
"ssn": r"\d{3}-\d{2}-\d{4}",
"complex_span_constrained_relation_extraction": "(['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?\\s\\|\\s([^|\\(\\)\n]{1,})\\s\\|\\s['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?(\\s\\|\\s\\(([^|\\(\\)\n]{1,})\\s\\|\\s([^|\\(\\)\n]{1,})\\))*\\n)*",
}


@pytest.mark.parametrize("regex_name", list(regex_samples))
def test_benchmark_regex_to_fsm(
    benchmark, tokenizer, ensure_numba_compiled, regex_name
):
    """Time the construction of a ``RegexFSM`` from a sample regex.

    The pattern selected by ``regex_name`` is passed, together with the
    tokenizer, to ``RegexFSM`` for 8 benchmark rounds.
    """
    pedantic_options = {
        "args": (regex_samples[regex_name], tokenizer),
        "rounds": 8,
    }
    benchmark.pedantic(RegexFSM, **pedantic_options)

0 comments on commit d534c2f

Please sign in to comment.