From 6bf4c4c976f9ed625e4a8ae7e7c3ae3e391dfb9d Mon Sep 17 00:00:00 2001
From: AlexArtrip
Date: Fri, 3 Nov 2023 02:47:11 -0500
Subject: [PATCH 1/4] basic multicorp implementation

---
 integration-tests/corpus/test_multicorpus.py | 41 ++++++++++++++++++++
 memas/corpus/corpus_searching.py             | 39 +++++++++++++++++--
 memas/dataplane.py                           | 25 +++++++++---
 3 files changed, 96 insertions(+), 9 deletions(-)
 create mode 100644 integration-tests/corpus/test_multicorpus.py

diff --git a/integration-tests/corpus/test_multicorpus.py b/integration-tests/corpus/test_multicorpus.py
new file mode 100644
index 0000000..502d2af
--- /dev/null
+++ b/integration-tests/corpus/test_multicorpus.py
@@ -0,0 +1,41 @@
+import numpy as np
+import uuid
+import time
+from memas.corpus import basic_corpus
+from memas.corpus.corpus_searching import mult_corpus_search
+from memas.interface.corpus import Citation, CorpusInfo, CorpusType
+
+corpus_name = "test corpus1"
+
+
+def test_multicorpus_search(ctx, test_client):
+    namespace_id = uuid.uuid4()
+    corpus_id1 = uuid.uuid4()
+    corpus_id2 = uuid.uuid4()
+    corpus_id3 = uuid.uuid4()
+    corpus_info1 = CorpusInfo("test_corpus1", namespace_id, corpus_id1, CorpusType.CONVERSATION)
+    corpus_info2 = CorpusInfo("test_corpus2", namespace_id, corpus_id2, CorpusType.KNOWLEDGE)
+    corpus_info3 = CorpusInfo("test_corpus3", namespace_id, corpus_id3, CorpusType.CONVERSATION)
+    test_corpus1 = basic_corpus.BasicCorpus(corpus_info1, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)
+    test_corpus2 = basic_corpus.BasicCorpus(corpus_info2, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)
+    test_corpus3 = basic_corpus.BasicCorpus(corpus_info3, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)
+
+    text1 = "The sun is high. California sunshine is great. "
+    text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks"
+    text3 = "The weather is great today, but I worry that tomorrow it won't be. My umbrella is in the repair shop."
+
+    assert test_corpus1.store_and_index(text1, Citation("www.docsource1", "SSSdoc1", "", "doc1"))
+    assert test_corpus2.store_and_index(text2, Citation("were.docsource2", "SSSdoc2", "", "doc2"))
+    assert test_corpus3.store_and_index(text3, Citation("docsource3.ai", "SSSdoc3", "", "doc3"))
+
+
+    time.sleep(1)
+
+    corpus_dict = {}
+    corpus_dict[CorpusType.CONVERSATION] = [test_corpus1, test_corpus3]
+    corpus_dict[CorpusType.KNOWLEDGE] = [test_corpus2]
+
+    output = mult_corpus_search(corpus_dict, "It is sunny", ctx, 5)
+    # Check that text was retrieved from all 3 corpuses upon searching
+    assert len(output) == 3
+
diff --git a/memas/corpus/corpus_searching.py b/memas/corpus/corpus_searching.py
index fd2727e..bc772ca 100644
--- a/memas/corpus/corpus_searching.py
+++ b/memas/corpus/corpus_searching.py
@@ -1,17 +1,48 @@
 # from search_redirect import SearchSettings
 from uuid import UUID
 from functools import reduce
-from memas.interface.corpus import Corpus, CorpusFactory
+from memas.interface.corpus import Corpus, CorpusFactory, CorpusType
 from memas.interface.corpus import Citation
 from memas.interface.storage_driver import DocumentEntity
 from memas.interface.exceptions import SentenceLengthOverflowException
 
 
-def corpora_search(corpus_ids: list[UUID], clue: str) -> list[tuple[float, str, Citation]]:
+def mult_corpus_search(corpus_sets : dict[Corpus], clue, ctx, result_limit) -> list[tuple[float, str, Citation]]:
+    results = []
+
+    # Direct each multicorpus search to the right algorithm
+    for corpusType, corpora_list in corpus_sets.items() :
+        # Default basic corpus handling
+        if corpusType == CorpusType.KNOWLEDGE or corpusType == CorpusType.CONVERSATION :
+            corpus_type_results = basic_corpora_search(corpora_list, clue, ctx)
+
+            results.append(corpus_type_results)
+
+    # To combine results for corpora that don't have compareable scoring take equal sized subsets of each Corpus type
+    # TODO : This means that, for example, searching 1 conversation and 100 knowledge corpuses will return half of the
+    # results from the conversation corpus. Is that really the way to go?
+    combined_results = []
+    for j in range(max([len(x) for x in results])) :
+        for i in range(len(results)) :
+            if j >= len(results[i]) or len(combined_results) >= result_limit:
+                break
+            combined_results.append(results[i][j])
+        if len(combined_results) >= result_limit:
+            break
+
+    return combined_results
+
+"""
+All corpora here should be of the same CorpusType implementation (basic_corpus)
+"""
+def basic_corpora_search(corpora: list[Corpus], clue: str, ctx) -> list[tuple[float, str, Citation]]:
+    # Extract information needed for a search
+    corpus_ids = [x.corpus_id for x in corpora]
+
     vector_search_count: int = 10
 
     doc_store_results: list[tuple[float, str, Citation]] = []
-    temp_res = ctx.corpus_doc.multi_corpus_search(corpus_ids, clue)
+    temp_res = ctx.corpus_doc.search_corpora(corpus_ids, clue)
 
     # Search the document store
     for score, doc_entity in temp_res:
         document_text = doc_entity.document
@@ -21,7 +52,7 @@ def corpora_search(corpus_ids: list[UUID], clue: str) -> list[tuple[float, str,
     # Search for the vectors
     vec_store_results: list[tuple[float, str, Citation]] = []
-    temp_res2 = ctx.corpus_vec.multi_corpus_search(corpus_ids, clue)
+    temp_res2 = ctx.corpus_vec.search_corpora(corpus_ids, clue)
 
     for score, doc_entity, start_index, end_index in temp_res2:
 
         # Verify that the text recovered from the vectors fits the maximum sentence criteria
diff --git a/memas/dataplane.py b/memas/dataplane.py
index 893765f..2f7ca8c 100644
--- a/memas/dataplane.py
+++ b/memas/dataplane.py
@@ -1,7 +1,11 @@
 from dataclasses import asdict
 from flask import Blueprint, current_app, request
 from memas.context_manager import ctx
+from memas.corpus.corpus_searching import mult_corpus_search
 from memas.interface.corpus import Citation, Corpus, CorpusType
+from collections import defaultdict
+from memas.interface.namespace import CORPUS_SEPARATOR
+
 
 
 dataplane = Blueprint("dp", __name__, url_prefix="/dp")
@@ -14,16 +18,27 @@ def recall():
 
     current_app.logger.info(f"Recalling [namespace_pathname=\"{namespace_pathname}\"]")
 
+
     corpus_infos = ctx.memas_metadata.get_query_corpora(namespace_pathname)
     current_app.logger.debug(f"Querying corpuses: {corpus_infos}")
 
-    search_results: list[tuple[str, Citation]] = []
-    for corpus_info in corpus_infos:
+    # search_results: list[tuple[str, Citation]] = []
+    # for corpus_info in corpus_infos:
+    #     corpus: Corpus = ctx.corpus_provider.get_corpus_by_info(corpus_info)
+    #     search_results.extend(corpus.search(clue=clue))
+
+    # Group the corpora to search into sets based on their CorpusType
+    corpora_grouped_by_type = defaultdict(list)
+    for corpus_info in corpus_infos :
+        corpus_type = corpus_info.corpus_type
         corpus: Corpus = ctx.corpus_provider.get_corpus_by_info(corpus_info)
-        search_results.extend(corpus.search(clue=clue))
+        corpora_grouped_by_type[corpus_type].append(corpus)
+
+    # Execute a multicorpus search
+    # Need to refactor to remove ctx later and have a cleaner solution, but thats time i dont have right now : (
+    search_results = mult_corpus_search(corpora_grouped_by_type, clue, ctx, 4)
+
     current_app.logger.debug(f"Search Results are: {search_results}")
 
-    # Combine the results and only take the top ones
-    search_results.sort(key=lambda x: x[0], reverse=True)
 
     # TODO : It will improve Query speed significantly to fetch citations after determining which documents to send to user
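Note: the merge step that patch 1 introduces in mult_corpus_search interleaves the per-type result lists round-robin (position 0 from every list, then position 1, and so on) until result_limit is reached. Below is a minimal standalone sketch of that interleaving strategy, using illustrative tuples rather than the project's Corpus objects and Citation values:

    from itertools import chain, zip_longest

    def interleave_with_limit(result_lists, limit):
        # Round-robin across the lists: take element j from each list in turn,
        # skip lists that have run out, and stop once `limit` items are kept.
        sentinel = object()
        merged = [item
                  for column in zip_longest(*result_lists, fillvalue=sentinel)
                  for item in column
                  if item is not sentinel]
        return merged[:limit]

    # Two "corpus types" whose scores live on different, non-comparable scales.
    conversation_hits = [(0.9, "California sunshine is great."), (0.4, "umbrella in the repair shop")]
    knowledge_hits = [(12.3, "phone grip")]
    print(interleave_with_limit([conversation_hits, knowledge_hits], limit=5))
    # [(0.9, 'California sunshine is great.'), (12.3, 'phone grip'), (0.4, 'umbrella in the repair shop')]

This is why the TODO above flags the behaviour: each type contributes roughly equal slices regardless of how many corpora sit behind it.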
From ef9a7643fccf1708364155960b9f117265a63b71 Mon Sep 17 00:00:00 2001
From: AlexArtrip
Date: Sat, 4 Nov 2023 22:19:54 -0500
Subject: [PATCH 2/4] basic multicorpus implemented

---
 integration-tests/corpus/test_basic_corpus.py |  6 ++--
 integration-tests/corpus/test_multicorpus.py  |  3 ++
 memas/corpus/corpus_searching.py              | 28 +++++++++++--------
 memas/dataplane.py                            |  3 +-
 4 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/integration-tests/corpus/test_basic_corpus.py b/integration-tests/corpus/test_basic_corpus.py
index 318eb63..91cd3a0 100644
--- a/integration-tests/corpus/test_basic_corpus.py
+++ b/integration-tests/corpus/test_basic_corpus.py
@@ -25,8 +25,8 @@ def test_save_then_search_one_corpus(ctx):
     output = test_corpus.search("It is sunny")
     # print("OUTPUT IS : ")
     # print(output)
-    assert "sunshine" in output[1][0]
-    assert "weather" in output[0][0]
+    assert "sunshine" in output[1][1]
+    assert "weather" in output[0][1]
 
 
 def test_delete_all_content(ctx):
@@ -45,7 +45,7 @@ def test_delete_all_content(ctx):
     time.sleep(1)
 
     output = test_corpus.search("It is sunny")
-    assert "sunshine" in output[1][0]
+    assert "sunshine" in output[1][1]
 
     test_corpus.delete_all_content()
     time.sleep(1)
diff --git a/integration-tests/corpus/test_multicorpus.py b/integration-tests/corpus/test_multicorpus.py
index 502d2af..947c448 100644
--- a/integration-tests/corpus/test_multicorpus.py
+++ b/integration-tests/corpus/test_multicorpus.py
@@ -38,4 +38,7 @@ def test_multicorpus_search(ctx, test_client):
     output = mult_corpus_search(corpus_dict, "It is sunny", ctx, 5)
     # Check that text was retrieved from all 3 corpuses upon searching
     assert len(output) == 3
+
+    assert "sunshine" in output[1][1]
+    assert "weather" in output[0][1]
 
diff --git a/memas/corpus/corpus_searching.py b/memas/corpus/corpus_searching.py
index bc772ca..8b64dd7 100644
--- a/memas/corpus/corpus_searching.py
+++ b/memas/corpus/corpus_searching.py
@@ -3,30 +3,37 @@ from functools import reduce
 from memas.interface.corpus import Corpus, CorpusFactory, CorpusType
 from memas.interface.corpus import Citation
+from collections import defaultdict
 from memas.interface.storage_driver import DocumentEntity
 from memas.interface.exceptions import SentenceLengthOverflowException
 
 
 def mult_corpus_search(corpus_sets : dict[Corpus], clue, ctx, result_limit) -> list[tuple[float, str, Citation]]:
-    results = []
+    results = defaultdict(list)
 
     # Direct each multicorpus search to the right algorithm
     for corpusType, corpora_list in corpus_sets.items() :
         # Default basic corpus handling
         if corpusType == CorpusType.KNOWLEDGE or corpusType == CorpusType.CONVERSATION :
             corpus_type_results = basic_corpora_search(corpora_list, clue, ctx)
+            results["BASIC_SCORING"].extend(corpus_type_results)
 
-            results.append(corpus_type_results)
+
+    sorted_results_matrix = []
+    # Sort results with compareable scoring schemes
+    for scored_results in results.values() :
+        # Sort by descending scoring so best results come first
+        sorted_scored_results = sorted(scored_results, key=lambda x: x[0], reverse=True)
+        sorted_results_matrix.append(sorted_scored_results)
 
     # To combine results for corpora that don't have compareable scoring take equal sized subsets of each Corpus type
-    # TODO : This means that, for example, searching 1 conversation and 100 knowledge corpuses will return half of the
-    # results from the conversation corpus. Is that really the way to go?
+    # TODO : Consider changing this at some point in the future to have better searching of corpus sets with non-comparable scoring
     combined_results = []
-    for j in range(max([len(x) for x in results])) :
-        for i in range(len(results)) :
-            if j >= len(results[i]) or len(combined_results) >= result_limit:
+    for j in range(max([len(x) for x in sorted_results_matrix])) :
+        for i in range(len(sorted_results_matrix)) :
+            if j >= len(sorted_results_matrix[i]) or len(combined_results) >= result_limit:
                 break
-            combined_results.append(results[i][j])
+            combined_results.append(sorted_results_matrix[i][j])
         if len(combined_results) >= result_limit:
             break
 
     return combined_results
@@ -148,7 +155,4 @@ def normalize_and_combine(doc_results: list, vec_results: list):
 
     doc_results_normalized.extend(unique_vectors)
 
-    # Sort by descending scoring so best results come first
-    doc_results_normalized.sort(key=lambda x: x[0], reverse=True)
-
-    return [(y, z) for [x, y, z] in doc_results_normalized]
+    return doc_results_normalized
diff --git a/memas/dataplane.py b/memas/dataplane.py
index 2f7ca8c..1ffe9be 100644
--- a/memas/dataplane.py
+++ b/memas/dataplane.py
@@ -39,11 +39,10 @@ def recall():
     search_results = mult_corpus_search(corpora_grouped_by_type, clue, ctx, 4)
 
     current_app.logger.debug(f"Search Results are: {search_results}")
-
     # TODO : It will improve Query speed significantly to fetch citations after determining which documents to send to user
 
     # Take only top few scores and remove scoring element before sending
-    return [{"document": doc, "citation": asdict(citation)} for doc, citation in search_results[0:5]]
+    return [{"score" : score, "document": doc, "citation": asdict(citation)} for score, doc, citation in search_results[0:5]]
 
 
 @dataplane.route('/memorize', methods=["POST"])
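Note: only the tail of normalize_and_combine is visible in these hunks, but its surrounding comments state the two assumptions it works from: document-store scores grow with relevance, while vector scores are distances where smaller is better. As a generic illustration of that kind of rescaling, and not the project's actual formula, a min-max normalization that maps both onto a shared 0-to-1, higher-is-better scale could look like this:

    def normalize_scores(scores, higher_is_better=True):
        # Min-max normalize raw scores into [0, 1]; flip the scale for distance-style scores.
        if not scores:
            return []
        lo, hi = min(scores), max(scores)
        if hi == lo:
            return [1.0 for _ in scores]
        normalized = [(s - lo) / (hi - lo) for s in scores]
        return normalized if higher_is_better else [1.0 - n for n in normalized]

    doc_scores = [7.2, 4.1, 3.3]        # e.g. full-text relevance, higher is better
    vec_distances = [0.12, 0.30, 0.55]  # e.g. embedding distance, lower is better
    print(normalize_scores(doc_scores))                             # [1.0, 0.205..., 0.0]
    print(normalize_scores(vec_distances, higher_is_better=False))  # [1.0, 0.581..., 0.0]

With patch 2, the per-scheme descending sort happens inside multi-corpus search ("BASIC_SCORING" bucket), so normalize_and_combine no longer sorts or strips the score itself.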
From 707a17b8ad47f89dfe3d8c338a6cf12726eea9b4 Mon Sep 17 00:00:00 2001
From: AlexArtrip
Date: Mon, 6 Nov 2023 00:59:01 -0600
Subject: [PATCH 3/4] fixed nits & removed sending scores to users

---
 integration-tests/corpus/test_multicorpus.py |  4 ++--
 memas/corpus/corpus_searching.py             | 16 +++-------------
 memas/dataplane.py                           |  8 ++++----
 3 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/integration-tests/corpus/test_multicorpus.py b/integration-tests/corpus/test_multicorpus.py
index 947c448..e9746d3 100644
--- a/integration-tests/corpus/test_multicorpus.py
+++ b/integration-tests/corpus/test_multicorpus.py
@@ -2,7 +2,7 @@
 import uuid
 import time
 from memas.corpus import basic_corpus
-from memas.corpus.corpus_searching import mult_corpus_search
+from memas.corpus.corpus_searching import multi_corpus_search
 from memas.interface.corpus import Citation, CorpusInfo, CorpusType
 
 corpus_name = "test corpus1"
@@ -35,7 +35,7 @@ def test_multicorpus_search(ctx, test_client):
     corpus_dict[CorpusType.CONVERSATION] = [test_corpus1, test_corpus3]
     corpus_dict[CorpusType.KNOWLEDGE] = [test_corpus2]
 
-    output = mult_corpus_search(corpus_dict, "It is sunny", ctx, 5)
+    output = multi_corpus_search(corpus_dict, "It is sunny", ctx, 5)
     # Check that text was retrieved from all 3 corpuses upon searching
     assert len(output) == 3
 
diff --git a/memas/corpus/corpus_searching.py b/memas/corpus/corpus_searching.py
index 8b64dd7..a1fae83 100644
--- a/memas/corpus/corpus_searching.py
+++ b/memas/corpus/corpus_searching.py
@@ -8,13 +8,13 @@ from memas.interface.exceptions import SentenceLengthOverflowException
 
 
-def mult_corpus_search(corpus_sets : dict[Corpus], clue, ctx, result_limit) -> list[tuple[float, str, Citation]]:
+def multi_corpus_search(corpus_sets : dict[CorpusType, list[Corpus]], clue : str, ctx, result_limit : int) -> list[tuple[float, str, Citation]]:
     results = defaultdict(list)
 
     # Direct each multicorpus search to the right algorithm
-    for corpusType, corpora_list in corpus_sets.items() :
+    for corpus_type, corpora_list in corpus_sets.items() :
         # Default basic corpus handling
-        if corpusType == CorpusType.KNOWLEDGE or corpusType == CorpusType.CONVERSATION :
+        if corpus_type == CorpusType.KNOWLEDGE or corpus_type == CorpusType.CONVERSATION :
             corpus_type_results = basic_corpora_search(corpora_list, clue, ctx)
             results["BASIC_SCORING"].extend(corpus_type_results)
 
@@ -46,8 +46,6 @@ def basic_corpora_search(corpora: list[Corpus], clue: str, ctx) -> list[tuple[fl
     # Extract information needed for a search
     corpus_ids = [x.corpus_id for x in corpora]
 
-    vector_search_count: int = 10
-
     doc_store_results: list[tuple[float, str, Citation]] = []
     temp_res = ctx.corpus_doc.search_corpora(corpus_ids, clue)
 
     # Search the document store
@@ -71,10 +69,6 @@ def basic_corpora_search(corpora: list[Corpus], clue: str, ctx) -> list[tuple[fl
 
         vec_store_results.append([score, doc_entity.document, citation])
 
-    # print("Docs then Vecs : ")
-    # print(doc_store_results)
-    # print(vec_store_results)
-
     # If any of the searches returned no results combine and return
     if len(vec_store_results) == 0:
         doc_store_results.sort(key=lambda x: x[0], reverse=True)
@@ -90,10 +84,6 @@ def basic_corpora_search(corpora: list[Corpus], clue: str, ctx) -> list[tuple[fl
 
 
 def normalize_and_combine(doc_results: list, vec_results: list):
-    # print("Docs then Vecs : ")
-    # print(doc_results)
-    # print(vec_results)
-
     # normalization with assumption that top score matches are approximately equal
 
     # Vec scores are based on distance, so smaller is better. Need to inverse the
diff --git a/memas/dataplane.py b/memas/dataplane.py
index 1ffe9be..0bd2d0c 100644
--- a/memas/dataplane.py
+++ b/memas/dataplane.py
@@ -1,7 +1,7 @@
 from dataclasses import asdict
 from flask import Blueprint, current_app, request
 from memas.context_manager import ctx
-from memas.corpus.corpus_searching import mult_corpus_search
+from memas.corpus.corpus_searching import multi_corpus_search
 from memas.interface.corpus import Citation, Corpus, CorpusType
 from collections import defaultdict
 from memas.interface.namespace import CORPUS_SEPARATOR
@@ -35,14 +35,14 @@ def recall():
         corpora_grouped_by_type[corpus_type].append(corpus)
 
     # Execute a multicorpus search
-    # Need to refactor to remove ctx later and have a cleaner solution, but thats time i dont have right now : (
-    search_results = mult_corpus_search(corpora_grouped_by_type, clue, ctx, 4)
+    # TODO : Should look into refactor to remove ctx later and have a cleaner solution
+    search_results = multi_corpus_search(corpora_grouped_by_type, clue, ctx, 4)
 
     current_app.logger.debug(f"Search Results are: {search_results}")
     # TODO : It will improve Query speed significantly to fetch citations after determining which documents to send to user
 
     # Take only top few scores and remove scoring element before sending
-    return [{"score" : score, "document": doc, "citation": asdict(citation)} for score, doc, citation in search_results[0:5]]
+    return [{"document": doc, "citation": asdict(citation)} for score, doc, citation in search_results[0:5]]
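Note: patch 3 reverts /dp/recall to returning only documents and citations, discarding the internal score before the payload is built. A small self-contained sketch of that response shaping follows; the Citation used here is a simplified stand-in for memas.interface.corpus.Citation, which the tests above show takes more fields:

    from dataclasses import dataclass, asdict

    @dataclass
    class Citation:  # simplified stand-in for memas.interface.corpus.Citation
        source_uri: str
        document_name: str

    search_results = [
        (0.92, "California sunshine is great.", Citation("www.docsource1", "doc1")),
        (0.85, "The weather is great today.", Citation("docsource3.ai", "doc3")),
    ]

    # Keep only the top few hits and drop the score before building the response payload.
    payload = [{"document": doc, "citation": asdict(citation)}
               for _score, doc, citation in search_results[0:5]]
    print(payload)
    # [{'document': 'California sunshine is great.', 'citation': {'source_uri': 'www.docsource1', 'document_name': 'doc1'}}, ...]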
From 152b97db59ba0ad40c22c78bca1403fffc5513b5 Mon Sep 17 00:00:00 2001
From: AlexArtrip
Date: Mon, 6 Nov 2023 00:59:55 -0600
Subject: [PATCH 4/4] formatting

---
 integration-tests/corpus/test_multicorpus.py |  4 +---
 memas/corpus/corpus_searching.py             | 18 ++++++++++--------
 memas/dataplane.py                           |  6 ++----
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/integration-tests/corpus/test_multicorpus.py b/integration-tests/corpus/test_multicorpus.py
index e9746d3..8b8860b 100644
--- a/integration-tests/corpus/test_multicorpus.py
+++ b/integration-tests/corpus/test_multicorpus.py
@@ -28,7 +28,6 @@ def test_multicorpus_search(ctx, test_client):
     assert test_corpus2.store_and_index(text2, Citation("were.docsource2", "SSSdoc2", "", "doc2"))
     assert test_corpus3.store_and_index(text3, Citation("docsource3.ai", "SSSdoc3", "", "doc3"))
 
-
     time.sleep(1)
 
     corpus_dict = {}
@@ -38,7 +37,6 @@ def test_multicorpus_search(ctx, test_client):
     output = multi_corpus_search(corpus_dict, "It is sunny", ctx, 5)
     # Check that text was retrieved from all 3 corpuses upon searching
     assert len(output) == 3
-
+
     assert "sunshine" in output[1][1]
     assert "weather" in output[0][1]
-
diff --git a/memas/corpus/corpus_searching.py b/memas/corpus/corpus_searching.py
index a1fae83..044ee2c 100644
--- a/memas/corpus/corpus_searching.py
+++ b/memas/corpus/corpus_searching.py
@@ -3,25 +3,24 @@ from functools import reduce
 from memas.interface.corpus import Corpus, CorpusFactory, CorpusType
 from memas.interface.corpus import Citation
-from collections import defaultdict
+from collections import defaultdict
 from memas.interface.storage_driver import DocumentEntity
 from memas.interface.exceptions import SentenceLengthOverflowException
 
 
-def multi_corpus_search(corpus_sets : dict[CorpusType, list[Corpus]], clue : str, ctx, result_limit : int) -> list[tuple[float, str, Citation]]:
+def multi_corpus_search(corpus_sets: dict[CorpusType, list[Corpus]], clue: str, ctx, result_limit: int) -> list[tuple[float, str, Citation]]:
     results = defaultdict(list)
 
     # Direct each multicorpus search to the right algorithm
-    for corpus_type, corpora_list in corpus_sets.items() :
+    for corpus_type, corpora_list in corpus_sets.items():
         # Default basic corpus handling
-        if corpus_type == CorpusType.KNOWLEDGE or corpus_type == CorpusType.CONVERSATION :
+        if corpus_type == CorpusType.KNOWLEDGE or corpus_type == CorpusType.CONVERSATION:
             corpus_type_results = basic_corpora_search(corpora_list, clue, ctx)
             results["BASIC_SCORING"].extend(corpus_type_results)
 
-
     sorted_results_matrix = []
     # Sort results with compareable scoring schemes
-    for scored_results in results.values() :
+    for scored_results in results.values():
         # Sort by descending scoring so best results come first
         sorted_scored_results = sorted(scored_results, key=lambda x: x[0], reverse=True)
         sorted_results_matrix.append(sorted_scored_results)
@@ -29,8 +28,8 @@ def multi_corpus_search(corpus_sets : dict[CorpusType, list[Corpus]], clue : str
     # To combine results for corpora that don't have compareable scoring take equal sized subsets of each Corpus type
     # TODO : Consider changing this at some point in the future to have better searching of corpus sets with non-comparable scoring
     combined_results = []
-    for j in range(max([len(x) for x in sorted_results_matrix])) :
-        for i in range(len(sorted_results_matrix)) :
+    for j in range(max([len(x) for x in sorted_results_matrix])):
+        for i in range(len(sorted_results_matrix)):
             if j >= len(sorted_results_matrix[i]) or len(combined_results) >= result_limit:
                 break
             combined_results.append(sorted_results_matrix[i][j])
@@ -39,9 +38,12 @@ def multi_corpus_search(corpus_sets : dict[CorpusType, list[Corpus]], clue : str
 
     return combined_results
 
+
 """
 All corpora here should be of the same CorpusType implementation (basic_corpus)
 """
+
+
 def basic_corpora_search(corpora: list[Corpus], clue: str, ctx) -> list[tuple[float, str, Citation]]:
     # Extract information needed for a search
     corpus_ids = [x.corpus_id for x in corpora]
diff --git a/memas/dataplane.py b/memas/dataplane.py
index 0bd2d0c..b1699a8 100644
--- a/memas/dataplane.py
+++ b/memas/dataplane.py
@@ -3,11 +3,10 @@ from memas.context_manager import ctx
 from memas.corpus.corpus_searching import multi_corpus_search
 from memas.interface.corpus import Citation, Corpus, CorpusType
-from collections import defaultdict
+from collections import defaultdict
 from memas.interface.namespace import CORPUS_SEPARATOR
 
-
 dataplane = Blueprint("dp", __name__, url_prefix="/dp")
@@ -18,7 +17,6 @@ def recall():
 
     current_app.logger.info(f"Recalling [namespace_pathname=\"{namespace_pathname}\"]")
 
-
     corpus_infos = ctx.memas_metadata.get_query_corpora(namespace_pathname)
     current_app.logger.debug(f"Querying corpuses: {corpus_infos}")
 
@@ -29,7 +27,7 @@ def recall():
 
     # Group the corpora to search into sets based on their CorpusType
     corpora_grouped_by_type = defaultdict(list)
-    for corpus_info in corpus_infos :
+    for corpus_info in corpus_infos:
         corpus_type = corpus_info.corpus_type
         corpus: Corpus = ctx.corpus_provider.get_corpus_by_info(corpus_info)
         corpora_grouped_by_type[corpus_type].append(corpus)
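Note: with the series applied, a recall request is served by grouping the namespace's corpora by CorpusType and delegating to multi_corpus_search. The request parsing sits outside these hunks, so the JSON keys, host, and port below are assumptions inferred from the variable names in recall(); only the "document" and "citation" response keys are confirmed by the return statement above. A hypothetical client call might look like:

    import requests

    # Assumed request shape: the handler reads `namespace_pathname` and `clue`,
    # but the parsing code is not part of this patch series. Host/port are illustrative.
    resp = requests.post(
        "http://localhost:8010/dp/recall",
        json={"namespace_pathname": "memas.wiki", "clue": "It is sunny"},
    )
    for hit in resp.json():
        print(hit["document"][:60], hit["citation"])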