From d2755059b6794ce02a84124003f6095feed176d2 Mon Sep 17 00:00:00 2001 From: Alex Dixon Date: Wed, 20 Nov 2024 12:45:55 -0800 Subject: [PATCH 1/5] docs(readme): use consistent install command (#388) make consistent with top line of readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62073674..8abfd908 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ To install `ell` and `ell studio`, you can use pip. Follow these steps: 2. Run the following command to install the `ell-ai` package from PyPI: ```bash - pip install ell-ai + pip install ell-ai[all] ``` 3. Verify the installation by checking the version of `ell`: From 1be0e1920dec1ffb7f1967442fad2a738e11d41f Mon Sep 17 00:00:00 2001 From: William Guss Date: Wed, 20 Nov 2024 13:15:29 -0800 Subject: [PATCH 2/5] Fixed docs --- docs/src/core_concepts/configuration.rst | 8 +-- src/ell/configurator.py | 63 +++++++++++++++++++----- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/docs/src/core_concepts/configuration.rst b/docs/src/core_concepts/configuration.rst index eb1afd7e..cc32dc89 100644 --- a/docs/src/core_concepts/configuration.rst +++ b/docs/src/core_concepts/configuration.rst @@ -5,6 +5,7 @@ Configuration ell provides various configuration options to customize its behavior. .. autofunction:: ell.init + :no-index: This ``init`` function is a convenience function that sets up the configuration for ell. It is a thin wrapper around the ``Config`` class, which is a Pydantic model. @@ -12,9 +13,10 @@ You can modify the global configuration using the ``ell.config`` object which is .. autopydantic_model:: ell.Config :members: - :exclude-members: default_client, registry, store + :exclude-members: default_client, registry, store, providers :model-show-json: false :model-show-validator-members: false :model-show-config-summary: false - :model-show-field-summary: false - :model-show-validator-summary: false \ No newline at end of file + :model-show-field-summary: true + :model-show-validator-summary: false + :no-index: \ No newline at end of file diff --git a/src/ell/configurator.py b/src/ell/configurator.py index f0a2d463..42765c52 100644 --- a/src/ell/configurator.py +++ b/src/ell/configurator.py @@ -33,18 +33,57 @@ class _Model: class Config(BaseModel): - model_config = ConfigDict(arbitrary_types_allowed=True) - registry: Dict[str, _Model] = Field(default_factory=dict, description="A dictionary mapping model names to their configurations.") - verbose: bool = Field(default=False, description="If True, enables verbose logging.") - wrapped_logging: bool = Field(default=True, description="If True, enables wrapped logging for better readability.") - override_wrapped_logging_width: Optional[int] = Field(default=None, description="If set, overrides the default width for wrapped logging.") - store: Optional[Store] = Field(default=None, description="An optional Store instance for persistence.") - autocommit: bool = Field(default=False, description="If True, enables automatic committing of changes to the store.") - lazy_versioning: bool = Field(default=True, description="If True, enables lazy versioning for improved performance.") - default_api_params: Dict[str, Any] = Field(default_factory=dict, description="Default parameters for language models.") - default_client: Optional[openai.Client] = Field(default=None, description="The default OpenAI client used when a specific model client is not found.") - autocommit_model: str = Field(default="gpt-4o-mini", description="When 
set, changes the default autocommit model from GPT 4o mini.") - providers: Dict[Type, Provider] = Field(default_factory=dict, description="A dictionary mapping client types to provider classes.") + """Configuration class for ELL.""" + + model_config = ConfigDict( + arbitrary_types_allowed=True, + protected_namespaces=('protect_', ) # Override protected namespaces + ) + + registry: Dict[str, _Model] = Field( + default_factory=dict, + description="A dictionary mapping model names to their configurations." + ) + verbose: bool = Field( + default=False, + description="If True, enables verbose logging." + ) + wrapped_logging: bool = Field( + default=True, + description="If True, enables wrapped logging for better readability." + ) + override_wrapped_logging_width: Optional[int] = Field( + default=None, + description="If set, overrides the default width for wrapped logging." + ) + store: Optional[Store] = Field( + default=None, + description="An optional Store instance for persistence." + ) + autocommit: bool = Field( + default=False, + description="If True, enables automatic committing of changes to the store." + ) + lazy_versioning: bool = Field( + default=True, + description="If True, enables lazy versioning for improved performance." + ) + default_api_params: Dict[str, Any] = Field( + default_factory=dict, + description="Default parameters for language models." + ) + default_client: Optional[openai.Client] = Field( + default=None, + description="The default OpenAI client used when a specific model client is not found." + ) + autocommit_model: str = Field( + default="gpt-4o-mini", + description="When set, changes the default autocommit model from GPT 4o mini." + ) + providers: Dict[Type, Provider] = Field( + default_factory=dict, + description="A dictionary mapping client types to provider classes." 
+ ) def __init__(self, **data): super().__init__(**data) self._lock = threading.Lock() From 6b4e4abae45676f8558687d4b6e98b68cc12bff2 Mon Sep 17 00:00:00 2001 From: William Guss Date: Wed, 20 Nov 2024 13:29:26 -0800 Subject: [PATCH 3/5] OpenAI models Nov 20 --- src/ell/models/openai.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ell/models/openai.py b/src/ell/models/openai.py index a53941fd..37577daa 100644 --- a/src/ell/models/openai.py +++ b/src/ell/models/openai.py @@ -59,6 +59,14 @@ def register(client: openai.Client): 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini', 'gpt-4o-2024-08-06', + 'gpt-4o-2024-11-20', + 'chatgpt-4o-latest', + 'gpt-4o-realtime-preview', + 'gpt-4o-realtime-preview-2024-10-01', + 'gpt-4o-audio-preview', + 'gpt-4o-audio-preview-2024-10-01', + 'o1-preview-2024-09-12', + 'o1-mini-2024-09-12', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'tts-1', @@ -82,8 +90,8 @@ def register(client: openai.Client): config.register_model(model_id, client) #XXX: Deprecation in 0.1.0 - config.register_model('o1-preview', client, supports_streaming=False) - config.register_model('o1-mini', client, supports_streaming=False) + config.register_model('o1-preview', client, supports_streaming=True) + config.register_model('o1-mini', client, supports_streaming=True) default_client = None try: From 56af99a9061436d93021235fd9f2ef5267f88837 Mon Sep 17 00:00:00 2001 From: William Guss Date: Thu, 21 Nov 2024 10:36:44 -0800 Subject: [PATCH 4/5] v0.0.15 --- docs/src/core_concepts/evaluation.rst.partial | 210 ++++++++++++++++++ examples/evals/summaries.py | 8 +- examples/rag/rag.py | 2 +- pyproject.toml | 2 +- src/ell/evaluation/evaluation.py | 9 +- src/ell/stores/sql.py | 51 ++--- tests/.exampleignore | 6 +- 7 files changed, 253 insertions(+), 35 deletions(-) create mode 100644 docs/src/core_concepts/evaluation.rst.partial diff --git a/docs/src/core_concepts/evaluation.rst.partial b/docs/src/core_concepts/evaluation.rst.partial new file mode 100644 index 00000000..f2bda0ed --- /dev/null +++ b/docs/src/core_concepts/evaluation.rst.partial @@ -0,0 +1,210 @@ +Evaluations +=========== + +Evaluations in ELL provide a powerful framework for assessing and analyzing Language Model Programs (LMPs). This guide covers the core concepts and features of the evaluation system. + +Basic Usage +---------- + +Here's a simple example of creating and running an evaluation: + +.. code-block:: python + + import ell + from ell import Evaluation + + @ell.simple(model="gpt-4") + def my_lmp(input_text: str): + return f"Process this: {input_text}" + + # Define a metric function + def accuracy_metric(datapoint, output): + return float(datapoint["expected_output"].lower() in output.lower()) + + # Create an evaluation + eval = Evaluation( + name="basic_evaluation", + n_evals=10, + metrics={"accuracy": accuracy_metric} + ) + + # Run the evaluation + results = eval.run(my_lmp, n_workers=10) + +Core Components +------------- + +Evaluation Configuration +~~~~~~~~~~~~~~~~~~~~~~~ + +The ``Evaluation`` class accepts several key parameters: + +- ``name``: A unique identifier for the evaluation +- ``n_evals``: Number of evaluations to run +- ``metrics``: Dictionary of metric functions +- ``dataset``: Optional dataset for evaluation +- ``samples_per_datapoint``: Number of samples per dataset point (default: 1) + +Metrics +~~~~~~~ + +Metrics are functions that assess the performance of your LMP. They can be: + +1. Simple scalar metrics: + +.. 
code-block:: python + + def length_metric(_, output): + return len(output) + +2. Structured metrics: + +.. code-block:: python + + def detailed_metric(datapoint, output): + return { + "length": len(output), + "contains_keyword": datapoint["keyword"] in output, + "response_time": datapoint["response_time"] + } + +3. Multiple metrics: + +.. code-block:: python + + metrics = { + "accuracy": accuracy_metric, + "length": length_metric, + "detailed": detailed_metric + } + +Dataset Handling +~~~~~~~~~~~~~~ + +Evaluations can use custom datasets: + +.. code-block:: python + + dataset = [ + { + "input": {"question": "What is the capital of France?"}, + "expected_output": "Paris" + }, + { + "input": {"question": "What is the capital of Italy?"}, + "expected_output": "Rome" + } + ] + + eval = Evaluation( + name="geography_quiz", + dataset=dataset, + metrics={"accuracy": accuracy_metric} + ) + +Parallel Execution +~~~~~~~~~~~~~~~~ + +Evaluations support parallel execution for improved performance: + +.. code-block:: python + + # Run with 10 parallel workers + results = eval.run(my_lmp, n_workers=10, verbose=True) + +Results and Analysis +------------------ + +Result Structure +~~~~~~~~~~~~~~ + +Evaluation results include: + +- Metric summaries (mean, std, min, max) +- Individual run details +- Execution metadata +- Error tracking + +Accessing Results +~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Get mean accuracy + mean_accuracy = results.metrics["accuracy"].mean() + + # Get standard deviation + std_accuracy = results.metrics["accuracy"].std() + + # Access individual runs + for run in results.runs: + print(f"Run ID: {run.id}") + print(f"Success: {run.success}") + print(f"Duration: {run.end_time - run.start_time}") + +Advanced Features +--------------- + +Evaluation Types +~~~~~~~~~~~~~~ + +ELL supports different types of evaluations: + +- ``METRIC``: Numerical performance metrics +- ``ANNOTATION``: Human or model annotations +- ``CRITERION``: Pass/fail criteria + +Version Control +~~~~~~~~~~~~~ + +Evaluations support versioning: + +- Version numbers +- Commit messages +- History tracking +- Multiple runs per version + +Error Handling +~~~~~~~~~~~~ + +Robust error handling and reporting: + +- Automatic error capture +- Failed run management +- Success status tracking +- Detailed error messages + +ELL Studio Integration +-------------------- + +The evaluation system integrates with ELL Studio, providing: + +- Visual evaluation management +- Result visualization +- Run comparisons +- Filtering and search +- Metric summaries +- Version control interface + +Best Practices +------------ + +1. **Metric Design** + - Keep metrics focused and specific + - Use appropriate return types + - Handle edge cases + +2. **Dataset Management** + - Use representative data + - Include edge cases + - Maintain dataset versioning + +3. **Performance Optimization** + - Use appropriate worker counts + - Monitor resource usage + - Cache results when possible + +4. **Version Control** + - Use meaningful commit messages + - Track major changes + - Maintain evaluation history diff --git a/examples/evals/summaries.py b/examples/evals/summaries.py index 919c4d46..5e59fa8b 100644 --- a/examples/evals/summaries.py +++ b/examples/evals/summaries.py @@ -9,7 +9,7 @@ import ell.lmp.function -dataset: List[ell.evaluation.Datapoint] = [ +dataset = [ { "input": { # I really don't like this. Forcing "input" without typing feels disgusting. 
"text": "The Industrial Revolution was a period of major industrialization and innovation that took place during the late 1700s and early 1800s. It began in Great Britain and quickly spread throughout Western Europe and North America. This revolution saw a shift from an economy based on agriculture and handicrafts to one dominated by industry and machine manufacturing. Key technological advancements included the steam engine, which revolutionized transportation and manufacturing processes. The textile industry, in particular, saw significant changes with the invention of spinning jennies, water frames, and power looms. These innovations led to increased productivity and the rise of factories. The Industrial Revolution also brought about significant social changes, including urbanization, as people moved from rural areas to cities for factory work. While it led to economic growth and improved living standards for some, it also resulted in poor working conditions, child labor, and environmental pollution. The effects of this period continue to shape our modern world." @@ -126,7 +126,7 @@ def length_criterion(_, output): eval_list = ell.evaluation.Evaluation( name="test_list", dataset=dataset, - criteria=[score_criterion, length_criterion], + metrics=[score_criterion, length_criterion], ) # Example using a dictionary of criteria (as before) @@ -139,8 +139,8 @@ def length_criterion(_, output): # Run evaluation with list-based criteria print("EVAL WITH GPT-4o (list-based criteria)") results = eval_list.run(summarizer, n_workers=4, verbose=False).results -print("Mean critic score:", results.metrics["score"].mean()) -print("Mean length of completions:", results.metrics["length"].mean()) +print("Mean critic score:", results.metrics["score_criterion"].mean()) +print("Mean length of completions:", results.metrics["length_criterion"].mean()) # Run evaluation with dict-based criteria print("EVAL WITH GPT-4o (dict-based criteria)") diff --git a/examples/rag/rag.py b/examples/rag/rag.py index db7b89b7..87b8a658 100644 --- a/examples/rag/rag.py +++ b/examples/rag/rag.py @@ -2,7 +2,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np -from ell import ell +import ell class VectorStore: diff --git a/pyproject.toml b/pyproject.toml index eb9d2366..d79e9255 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ell-ai" -version = "0.0.14" +version = "0.0.15" description = "ell - the language model programming library" authors = ["William Guss "] license = "MIT" diff --git a/src/ell/evaluation/evaluation.py b/src/ell/evaluation/evaluation.py index 7bcbe968..9ecee050 100644 --- a/src/ell/evaluation/evaluation.py +++ b/src/ell/evaluation/evaluation.py @@ -83,12 +83,19 @@ def wrap_callable(value): ) for k, v in value.items() } + elif isinstance(value, list): + return [ + function(type=LMPType.LABELER)(v) + if callable(v) and not hasattr(v, "__ell_track__") + else v + for v in value + ] elif callable(value) and not hasattr(value, "__ell_track__"): return function()(value) elif value is None: return value else: - raise ValueError(f"Expected dict, callable, or None, got {type(value)}") + raise ValueError(f"Expected dict, list, callable, or None, got {type(value)}") # Validate dataset/n_evals if self.dataset is None and self.n_evals is None: diff --git a/src/ell/stores/sql.py b/src/ell/stores/sql.py index b2f25d75..7766c21b 100644 --- a/src/ell/stores/sql.py +++ b/src/ell/stores/sql.py @@ -117,7 
+117,7 @@ def write_invocation( def write_evaluation(self, evaluation: SerializedEvaluation) -> str: with Session(self.engine) as session: - try: + with session.no_autoflush: # Prevent autoflush while we query # Check if the evaluation already exists existing_evaluation = session.exec( select(SerializedEvaluation).where( @@ -136,33 +136,30 @@ def write_evaluation(self, evaluation: SerializedEvaluation) -> str: # Add the new evaluation session.add(evaluation) - # Process labelers - for labeler in evaluation.labelers: - existing_labeler = session.exec( - select(EvaluationLabeler).where( - (EvaluationLabeler.evaluation_id == evaluation.id) - & (EvaluationLabeler.name == labeler.name) - ) - ).first() - - if existing_labeler: - # Update existing labeler - existing_labeler.type = labeler.type - existing_labeler.labeling_lmp_id = labeler.labeling_lmp_id - existing_labeler.labeling_rubric = labeler.labeling_rubric - else: - # Add new labeler - labeler.evaluation_id = evaluation.id - session.add(labeler) + # Process labelers + for labeler in evaluation.labelers: + existing_labeler = session.exec( + select(EvaluationLabeler).where( + and_( + EvaluationLabeler.evaluation_id == evaluation.id, + EvaluationLabeler.name == labeler.name + ) + ) + ).first() + + if existing_labeler: + # Update existing labeler + existing_labeler.type = labeler.type + existing_labeler.labeling_lmp_id = labeler.labeling_lmp_id + existing_labeler.labeling_rubric = labeler.labeling_rubric + else: + # Add new labeler + labeler.evaluation_id = evaluation.id + session.add(labeler) + + session.commit() + return evaluation.id - session.commit() - return evaluation.id - except IntegrityError as e: - session.rollback() - raise ValueError(f"Error writing evaluation: {str(e)}") - except Exception as e: - session.rollback() - raise e def write_evaluation_run(self, evaluation_run: SerializedEvaluationRun) -> int: with Session(self.engine) as session: diff --git a/tests/.exampleignore b/tests/.exampleignore index f77107cb..5a6f6ff9 100644 --- a/tests/.exampleignore +++ b/tests/.exampleignore @@ -13,4 +13,8 @@ azure_ex.py openrouter_ex.py vllm_ex.py *_ex.py -bedrock_hello.py \ No newline at end of file +bedrock_hello.py +hello_postgres.py +exa/exa.py +exa.py +wikipedia_mini_rag.py \ No newline at end of file From 36ca5eea1a5d4c266071d6a50848ad154034c44d Mon Sep 17 00:00:00 2001 From: William Guss Date: Thu, 21 Nov 2024 13:08:53 -0800 Subject: [PATCH 5/5] example of notebooks working --- examples/notebook.ipynb | 130 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 examples/notebook.ipynb diff --git a/examples/notebook.ipynb b/examples/notebook.ipynb new file mode 100644 index 00000000..57c8e520 --- /dev/null +++ b/examples/notebook.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import ell\n", + "ell.init(verbose=True, store='./logdir')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "╔══════════════════════════════════════════════════════════════════════════════╗\n", + "║ hello_world(world)\n", + "╠══════════════════════════════════════════════════════════════════════════════╣\n", + "║ Prompt:\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "│ user: Say hello, world!\n", + 
"╟──────────────────────────────────────────────────────────────────────────────╢\n", + "║ Output:\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "│ assistant: Hello, world! 🌍\n", + "╚══════════════════════════════════════════════════════════════════════════════╝\n" + ] + }, + { + "data": { + "text/plain": [ + "'Hello, world! 🌍'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@ell.simple(model=\"gpt-4o-mini\")\n", + "def hello_world(input_text: str):\n", + " return f\"Say hello, {input_text}!\"\n", + "\n", + "hello_world(\"world\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "╔══════════════════════════════════════════════════════════════════════════════╗\n", + "║ hello_world(world)\n", + "╠══════════════════════════════════════════════════════════════════════════════╣\n", + "║ Prompt:\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "│ user: Say hello, world!\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "║ Output:\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "│ assistant: Hello, world! 😊 How can I assist you today?\n", + "╚══════════════════════════════════════════════════════════════════════════════╝\n", + "╔══════════════════════════════════════════════════════════════════════════════╗\n", + "║ my_other_lmp(world)\n", + "╠══════════════════════════════════════════════════════════════════════════════╣\n", + "║ Prompt:\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "│ user: Summarize Hello, world! 😊 How can I assist you today?\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "║ Output:\n", + "╟──────────────────────────────────────────────────────────────────────────────╢\n", + "│ assistant: \"Hello, world! 😊 How can I assist you today?\" is a\n", + "│ friendly greeting that expresses openness and willingness to\n", + "│ help.\n", + "╚══════════════════════════════════════════════════════════════════════════════╝\n" + ] + }, + { + "data": { + "text/plain": [ + "'\"Hello, world! 😊 How can I assist you today?\" is a friendly greeting that expresses openness and willingness to help.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@ell.simple(model=\"gpt-4o-mini\")\n", + "def sentiment(input_text: str):\n", + " return f\"assess the sentiment {hello_world(input_text)}\"\n", + "\n", + "my_other_lmp(\"world\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ell-ai-lj28Ksf3-py3.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}