diff --git a/docs/_static/imgs/question_types.png b/docs/_static/imgs/question_types.png index e0f989df0..a248ef6e5 100644 Binary files a/docs/_static/imgs/question_types.png and b/docs/_static/imgs/question_types.png differ diff --git a/docs/concepts/testset_generation.md b/docs/concepts/testset_generation.md index 832574e19..6d8c31d31 100644 --- a/docs/concepts/testset_generation.md +++ b/docs/concepts/testset_generation.md @@ -57,40 +57,25 @@ Checkout [llama-index](https://gpt-index.readthedocs.io/en/stable/core_modules/d ```{code-block} python -:caption: Customising test set generation -from ragas.testset import TestsetGenerator -from langchain.embeddings import OpenAIEmbeddings -from langchain.chat_models import ChatOpenAI -from ragas.llms import LangchainLLM +:caption: Customising test data distribution +from ragas.testset.generator import TestsetGenerator +from ragas.testset.evolutions import simple, reasoning, multi_context # documents = load your documents -# Add custom llms and embeddings -generator_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-3.5-turbo")) -critic_llm = LangchainLLM(llm=ChatOpenAI(model="gpt-4")) -embeddings_model = OpenAIEmbeddings() +# generator with openai models +generator = TestsetGenerator.with_openai() # Change resulting question type distribution -testset_distribution = { - "simple": 0.25, - "reasoning": 0.5, - "multi_context": 0.0, - "conditional": 0.25, +distributions = { + simple: 0.5, + multi_context: 0.4, + reasoning: 0.1 } -# percentage of conversational question -chat_qa = 0.2 - - -test_generator = TestsetGenerator( - generator_llm=generator_llm, - critic_llm=critic_llm, - embeddings_model=embeddings_model, - testset_distribution=testset_distribution, - chat_qa=chat_qa, -) - -testset = test_generator.generate(documents, test_size=5) +# use generator.generate_with_llamaindex_docs if you use llama-index as document loader +testset = generator.generate_with_langchain_docs(documents, 10, distributions) +testset.to_pandas() ``` 
@@ -109,16 +94,6 @@ test_df.head() Analyze the frequency of different question types in the created dataset - ```{code-block} python - :caption: bar graph of question types -import seaborn as sns -sns.set(rc={'figure.figsize':(9,6)}) - -test_data_dist = test_df.question_type.value_counts().to_frame().reset_index() -sns.set_theme(style="whitegrid") -g = sns.barplot(y='count',x='question_type', data=test_data_dist) -g.set_title("Question type distribution",fontdict = { 'fontsize': 20}) - ```

test-outputs diff --git a/docs/getstarted/testset_generation.md b/docs/getstarted/testset_generation.md index 06222f8c3..d4652f02d 100644 --- a/docs/getstarted/testset_generation.md +++ b/docs/getstarted/testset_generation.md @@ -11,30 +11,23 @@ os.environ["OPENAI_API_KEY"] = "your-openai-key" ## Documents -To begin, we require a collection of documents to generate synthetic Question/Context/Answer samples. Here, we will employ the llama-index document loaders to retrieve documents. +To begin, we require a collection of documents to generate synthetic Question/Context/Answer samples. Here, we will employ the langchain document loader to load documents. ```{code-block} python -:caption: Load documents from Semantic Scholar -from llama_index import download_loader - -SemanticScholarReader = download_loader("SemanticScholarReader") -loader = SemanticScholarReader() -# Narrow down the search space -query_space = "large language models" -# Increase the limit to obtain more documents -documents = loader.load_data(query=query_space, limit=10) +:caption: Load documents from directory +from langchain.document_loaders import DirectoryLoader +loader = DirectoryLoader("your-directory") +documents = loader.load() ``` :::{note} Each Document object contains a metadata dictionary, which can be used to store additional information about the document which can be accessed with `Document.metadata`. Please ensure that the metadata dictionary contains a key called `file_name` as this will be used in the generation process. The `file_name` attribute in metadata is used to identify chunks belonging to the same document. For example, pages belonging to the same research publication can be identified using filename. -An example of how to do this for `SemanticScholarReader` is shown below. +An example of how to do this is shown below. 
```{code-block} python -for d in documents: - d.metadata["file_name"] = d.metadata["title"] - -documents[0].metadata +for document in documents: + document.metadata['file_name'] = document.metadata['source'] ``` ::: @@ -46,11 +39,15 @@ We will now import and use Ragas' `Testsetgenerator` to promptly generate a synt ```{code-block} python :caption: Create 10 samples using default configuration -from ragas.testset import TestsetGenerator +from ragas.testset.generator import TestsetGenerator +from ragas.testset.evolutions import simple, reasoning, multi_context + +# generator with openai models +generator = TestsetGenerator.with_openai() -testsetgenerator = TestsetGenerator.from_default() -test_size = 10 -testset = testsetgenerator.generate(documents, test_size=test_size) +# generate testset +testset = generator.generate_with_langchain_docs(documents, test_size=10) +testset.to_pandas() ``` Subsequently, we can export the results into a Pandas DataFrame. diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py index 7db63dd79..f0b292268 100644 --- a/src/ragas/testset/generator.py +++ b/src/ragas/testset/generator.py @@ -13,17 +13,25 @@ from ragas.executor import Executor from ragas.llms import BaseRagasLLM, LangchainLLMWrapper from ragas.testset.docstore import Document, DocumentStore, InMemoryDocumentStore -from ragas.testset.evolutions import ComplexEvolution, CurrentNodes, DataRow +from ragas.testset.evolutions import ( + ComplexEvolution, + CurrentNodes, + DataRow, + multi_context, + reasoning, + simple, +) from ragas.testset.filters import EvolutionFilter, NodeFilter, QuestionFilter if t.TYPE_CHECKING: from llama_index.readers.schema import Document as LlamaindexDocument from langchain_core.documents import Document as LCDocument -Distributions = t.Dict[t.Any, float] - logger = logging.getLogger(__name__) +Distributions = t.Dict[t.Any, float] +DEFAULT_DISTRIBUTION = {simple: 0.5, reasoning: 0.25, multi_context: 0.25} + @dataclass class 
TestDataset: @@ -126,7 +134,7 @@ def generate_with_langchain_docs( def generate( self, test_size: int, - distributions: Distributions = {}, + distributions: Distributions = DEFAULT_DISTRIBUTION, with_debugging_logs=False, ): # init filters and evolutions