Skip to content
This repository was archived by the owner on Aug 27, 2024. It is now read-only.

Commit e7846e7

Browse files
authored
Refactor Corpus instantiation flow, as well as API data structure (#9)
1 parent 070d870 commit e7846e7

13 files changed

+65
-66
lines changed

format.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
directories="memas tests integration-tests memas_client memas_sdk benchmarking"
1+
directories="memas tests integration-tests"
22
for dir in $directories; do
33
find $dir -type f -name "*.py" -exec autopep8 --max-line-length 120 -i {} \;
44
done

integration-tests/corpus/test_basic_corpus.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,22 @@
11
import numpy as np
22
import uuid
33
import time
4-
from memas.interface.storage_driver import DocumentEntity
5-
from memas.storage_driver.corpus_vector_store import CorpusVectorStore
6-
from memas.storage_driver import corpus_vector_store, corpus_doc_store, corpus_doc_metadata
4+
from memas.context_manager import ctx
75
from memas.corpus import basic_corpus
86
from memas.interface.corpus import Citation
97

108
corpus_name = "test corpus1"
11-
test_corpus = basic_corpus.BasicCorpus(uuid.uuid4(), corpus_name)
12-
139

1410
def test_save_then_search_one_corpus(es_client):
11+
test_corpus = basic_corpus.BasicCorpus(uuid.uuid4(), corpus_name, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)
12+
1513
text1 = "The sun is high. California sunshine is great. "
1614
text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks"
1715
text3 = "The weather is great today, but I worry that tomorrow it won't be. My umbrella is in the repair shop."
1816

19-
assert test_corpus.store_and_index(text1, "doc1", Citation("www.docsource1", "SSSdoc1", ""))
20-
assert test_corpus.store_and_index(text2, "doc2", Citation("were.docsource2", "SSSdoc2", ""))
21-
assert test_corpus.store_and_index(text3, "doc3", Citation("docsource3.ai", "SSSdoc3", ""))
17+
assert test_corpus.store_and_index(text1, Citation("www.docsource1", "SSSdoc1", "", "doc1"))
18+
assert test_corpus.store_and_index(text2, Citation("were.docsource2", "SSSdoc2", "", "doc2"))
19+
assert test_corpus.store_and_index(text3, Citation("docsource3.ai", "SSSdoc3", "", "doc3"))
2220

2321
time.sleep(1)
2422
output = test_corpus.search("It is sunny")

integration-tests/storage_driver/test_corpus_doc_metadata.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,6 @@ def test_insert_and_get():
1414
corpus_id = uuid.uuid4()
1515
document_id = uuid.uuid4()
1616

17-
citation = Citation("google.com", "test google", "just a simple test")
18-
metadata.insert_document_metadata(corpus_id, document_id, 1, "test", citation)
17+
citation = Citation("google.com", "test google", "just a simple test", "test")
18+
metadata.insert_document_metadata(corpus_id, document_id, 1, citation)
1919
assert metadata.get_document_citation(corpus_id, document_id) == citation

memas/app.py

-6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import yaml
22
from flask import Flask
33
from memas.context_manager import ContextManager
4-
from memas.interface.corpus import CorpusType
5-
from memas.corpus.basic_corpus import BasicCorpusFactory
64

75

86
def create_app(config_filename, *, first_init=False):
@@ -16,10 +14,6 @@ def create_app(config_filename, *, first_init=False):
1614

1715
app.ctx.init()
1816

19-
# TODO : Need a better place to put this
20-
app.ctx.corpus_provider.setCorpusFactory(CorpusType.KNOWLEDGE, BasicCorpusFactory())
21-
app.ctx.corpus_provider.setCorpusFactory(CorpusType.CONVERSATION, BasicCorpusFactory())
22-
2317
from memas.dataplane import dataplane
2418
from memas.controlplane import controlplane
2519
app.register_blueprint(dataplane)

memas/context_manager.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def __init__(self, app_config: Config):
7171
self.es: Elasticsearch
7272

7373
# Corpus provider
74-
self.corpus_provider: CorpusProvider = CorpusProvider()
74+
self.corpus_provider: CorpusProvider
7575

7676
def setup_cassandra_keyspace(self):
7777
"""Set up the Cassandra keyspace. We only want to run this on the very first server launch.
@@ -121,6 +121,8 @@ def init_datastores(self) -> None:
121121
self.corpus_vec.init()
122122
self.corpus_doc.init()
123123

124+
self.corpus_provider = CorpusProvider(self.corpus_metadata, self.corpus_doc, self.corpus_vec)
125+
124126
def init(self) -> None:
125127
self.init_clients()
126128
self.init_datastores()

memas/corpus/basic_corpus.py

+23-19
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
# from search_redirect import SearchSettings
22
import logging
33
import uuid
4-
from functools import reduce
54
from memas.interface.corpus import Corpus, CorpusFactory
65
from memas.interface.corpus import Citation
7-
from memas.interface.storage_driver import DocumentEntity
6+
from memas.interface.storage_driver import CorpusDocumentMetadataStore, CorpusDocumentStore, CorpusVectorStore, DocumentEntity
87
from memas.interface.exceptions import SentenceLengthOverflowException
9-
from memas.context_manager import ctx
108
from memas.text_parsing.text_parsers import segment_document
119
from memas.corpus.corpus_searching import normalize_and_combine
1210

@@ -17,27 +15,30 @@
1715

1816
class BasicCorpus(Corpus):
1917

20-
def __init__(self, corpus_id: uuid.UUID, corpus_name: str):
18+
def __init__(self, corpus_id: uuid.UUID, corpus_name: str, metadata_store: CorpusDocumentMetadataStore, doc_store: CorpusDocumentStore, vec_store: CorpusVectorStore):
2119
super().__init__(corpus_id, corpus_name)
20+
self.metadata_store: CorpusDocumentMetadataStore = metadata_store
21+
self.doc_store: CorpusDocumentStore = doc_store
22+
self.vec_store: CorpusVectorStore = vec_store
2223

2324
"""
2425
Stores a document in the Elasticsearch document store, the vector store, and the document metadata store.
2526
Returns True on Success, False on Failure
2627
"""
2728

28-
def store_and_index(self, document: str, document_name: str, citation: Citation) -> bool:
29+
def store_and_index(self, document: str, citation: Citation) -> bool:
2930
_log.debug(f"Corpus storing and indexing [corpus_id={self.corpus_id}]")
3031

3132
doc_id = uuid.uuid4()
32-
doc_entity = DocumentEntity(self.corpus_id, doc_id, document_name, document)
33+
doc_entity = DocumentEntity(self.corpus_id, doc_id, citation.document_name, document)
3334

3435
document_chunks = segment_document(document, MAX_SEGMENT_LENGTH)
3536

3637
# TODO : Need to investigate how to undo when failures on partial insert
37-
meta_save = ctx.corpus_metadata.insert_document_metadata(
38-
self.corpus_id, doc_id, len(document_chunks), document_name, citation)
38+
meta_save = self.metadata_store.insert_document_metadata(
39+
self.corpus_id, doc_id, len(document_chunks), citation)
3940

40-
vec_save = ctx.corpus_vec.save_documents([doc_entity])
41+
vec_save = self.vec_store.save_documents([doc_entity])
4142

4243
# Divide longer documents for document store
4344
chunk_num = 0
@@ -46,11 +47,11 @@ def store_and_index(self, document: str, document_name: str, citation: Citation)
4647
# Create the new IDs for the document chunk combo
4748
chunk_id = doc_id.hex + '{:032b}'.format(chunk_num)
4849
chunk_num = chunk_num + 1
49-
doc_chunk_entity = DocumentEntity(self.corpus_id, doc_id, document_name, chunk)
50+
doc_chunk_entity = DocumentEntity(self.corpus_id, doc_id, citation.document_name, chunk)
5051
chunk_id_entity_pairs.append((chunk_id, doc_chunk_entity))
5152

5253
# Insert all chunks of document at once
53-
doc_save = ctx.corpus_doc.save_documents(id_doc_pairs=chunk_id_entity_pairs)
54+
doc_save = self.doc_store.save_documents(id_doc_pairs=chunk_id_entity_pairs)
5455

5556
return meta_save and vec_save and doc_save
5657

@@ -67,24 +68,24 @@ def search(self, clue: str) -> list[tuple[float, str, Citation]]:
6768
vector_search_count: int = 10
6869

6970
doc_store_results: list[tuple[float, str, Citation]] = []
70-
temp_res = ctx.corpus_doc.search_corpora([self.corpus_id], clue)
71+
temp_res = self.doc_store.search_corpora([self.corpus_id], clue)
7172
# Search the document store
7273
for score, doc_entity in temp_res:
7374
document_text = doc_entity.document
74-
citation = ctx.corpus_metadata.get_document_citation(self.corpus_id, doc_entity.document_id)
75+
citation = self.metadata_store.get_document_citation(self.corpus_id, doc_entity.document_id)
7576
doc_store_results.append([score, document_text, citation])
7677

7778
# Search for the vectors
7879
vec_store_results: list[tuple[float, str, Citation]] = []
79-
temp_res2 = ctx.corpus_vec.search_corpora([self.corpus_id], clue)
80+
temp_res2 = self.vec_store.search_corpora([self.corpus_id], clue)
8081
for score, doc_entity, start_index, end_index in temp_res2:
8182

8283
# Verify that the text recovered from the vectors fits the maximum sentence criteria
8384
if end_index - start_index != len(doc_entity.document):
8485
_log.error("Index not aligned with actual document", exc_info=True)
8586
raise SentenceLengthOverflowException(end_index - start_index)
8687

87-
citation = ctx.corpus_metadata.get_document_citation(self.corpus_id, doc_entity.document_id)
88+
citation = self.metadata_store.get_document_citation(self.corpus_id, doc_entity.document_id)
8889
vec_store_results.append([score, doc_entity.document, citation])
8990

9091
# If any of the searches returned no results combine and return
@@ -100,11 +101,14 @@ def search(self, clue: str) -> list[tuple[float, str, Citation]]:
100101

101102
return results
102103

103-
def generate_search_instructions(self, clue: str) -> any:
104-
pass
105-
106104

107105
class BasicCorpusFactory(CorpusFactory):
106+
def __init__(self, metadata_store: CorpusDocumentMetadataStore, doc_store: CorpusDocumentStore, vec_store: CorpusVectorStore) -> None:
107+
super().__init__()
108+
self.metadata_store: CorpusDocumentMetadataStore = metadata_store
109+
self.doc_store: CorpusDocumentStore = doc_store
110+
self.vec_store: CorpusVectorStore = vec_store
111+
108112
def produce(self, corpus_id: uuid.UUID):
109113
# TODO: Maybe change the Corpus Name Parameter
110-
return BasicCorpus(corpus_id, "BasicCorpus")
114+
return BasicCorpus(corpus_id, "BasicCorpus", self.metadata_store, self.doc_store, self.vec_store)

memas/corpus/corpus_provider.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
11
import logging
22
from uuid import UUID
3+
from memas.corpus.basic_corpus import BasicCorpusFactory
34
from memas.interface.corpus import Corpus, CorpusFactory, CorpusType
5+
from memas.interface.storage_driver import CorpusDocumentMetadataStore, CorpusDocumentStore, CorpusVectorStore
46

57

68
_log = logging.getLogger(__name__)
79

810

911
class CorpusProvider:
10-
def __init__(self) -> None:
12+
def __init__(self, metadata_store: CorpusDocumentMetadataStore, doc_store: CorpusDocumentStore, vec_store: CorpusVectorStore) -> None:
1113
self.factory_dict: dict[CorpusType, CorpusFactory] = dict()
14+
15+
basic_corpus_factory = BasicCorpusFactory(metadata_store, doc_store, vec_store)
16+
self.factory_dict[CorpusType.CONVERSATION] = basic_corpus_factory
17+
self.factory_dict[CorpusType.KNOWLEDGE] = basic_corpus_factory
1218

13-
def setCorpusFactory(self, corpus_type: CorpusType, corpus_factory: CorpusFactory):
14-
self.factory_dict[corpus_type] = corpus_factory
15-
16-
# TODO : Fix the last parameter that was just removed - what is that supposed to be for? namespace_id
17-
18-
def get_corpus(self, corpus_id: UUID, *, corpus_type: CorpusType) -> Corpus:
19+
def get_corpus(self, corpus_id: UUID, *, corpus_type: CorpusType, namespace_id: UUID=None) -> Corpus:
1920
"""Gets the Corpus class based on the corpus_id
2021
2122
Args:
2223
corpus_id (UUID): corpus_id
2324
corpus_type (CorpusType): type of the corpus, this is necessary unless a namespace_id is provided
24-
namespace_id (UUID): namespace_id of the corpus. This is necessary when
25+
namespace_id (UUID): namespace_id of the corpus, this is necessary when a corpus type is not provided,
26+
since it's needed to find the corpus type.
2527
2628
Returns:
2729
Corpus: _description_

memas/corpus/corpus_searching.py

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from memas.interface.corpus import Citation
66
from memas.interface.storage_driver import DocumentEntity
77
from memas.interface.exceptions import SentenceLengthOverflowException
8-
from memas.context_manager import ctx
98

109

1110
def corpora_search(corpus_ids: list[UUID], clue: str) -> list[tuple[float, str, Citation]]:

memas/dataplane.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
from flask import Blueprint, current_app, request
33
from memas.context_manager import ctx
44
from memas.interface.corpus import Citation, Corpus, CorpusType
5-
from memas.storage_driver.memas_metadata import split_corpus_pathname
6-
from memas.corpus.basic_corpus import BasicCorpusFactory
7-
from memas.corpus.corpus_searching import corpora_search
5+
86

97
dataplane = Blueprint("dp", __name__, url_prefix="/dp")
108

@@ -38,20 +36,21 @@ def recall():
3836
def memorize():
3937
corpus_pathname: str = request.json["corpus_pathname"]
4038
document: str = request.json["document"]
41-
document_name: str = request.json.get("document_name", "")
39+
raw_citation: str = request.json["citation"]
40+
41+
document_name = raw_citation.get("document_name", "")
4242

4343
current_app.logger.info(f"Memorizing [corpus_pathname=\"{corpus_pathname}\"] [document_name=\"{document_name}\"]")
4444

45-
# TODO : need to be able to fetch the corpus name for citation purposes
46-
corpus_name = split_corpus_pathname(corpus_pathname)[1]
47-
raw_citation: str = request.json["citation"]
48-
citation = Citation(raw_citation["source_uri"], raw_citation["source_name"],
49-
raw_citation["description"])
45+
citation = Citation(source_uri=raw_citation.get("source_uri", ""),
46+
source_name=raw_citation.get("source_name", ""),
47+
description=raw_citation.get("description", ""),
48+
document_name=document_name)
5049

5150
corpus_info = ctx.memas_metadata.get_corpus_info(corpus_pathname)
5251

5352
corpus: Corpus = ctx.corpus_provider.get_corpus(corpus_info.corpus_id, corpus_type=corpus_info.corpus_type)
54-
success = corpus.store_and_index(document, document_name, citation)
53+
success = corpus.store_and_index(document, citation)
5554

5655
current_app.logger.info(f"Memorize finished [success={success}]")
5756
return {"success": success}

memas/interface/corpus.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class Citation:
1414
source_uri: str
1515
source_name: str
1616
description: str
17+
document_name: str
1718

1819

1920
@dataclass
@@ -29,11 +30,11 @@ class Corpus(ABC):
2930
"""
3031

3132
def __init__(self, corpus_id: UUID, corpus_name: str):
32-
self.corpus_id = corpus_id
33-
self.corpus_name = corpus_name
33+
self.corpus_id: UUID = corpus_id
34+
self.corpus_name: str = corpus_name
3435

3536
@abstractmethod
36-
def store_and_index(self, document: str, document_name: str, citation: Citation) -> bool:
37+
def store_and_index(self, document: str, citation: Citation) -> bool:
3738
"""Store and index a "document"
3839
3940
Args:

memas/interface/storage_driver.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,12 @@ class CorpusDocumentMetadataStore(StorageDriver):
9292
Metadata store for storing citations and other metadata for documents within the corpus.
9393
"""
9494
@abstractmethod
95-
def insert_document_metadata(self, corpus_id: UUID, document_id: UUID, num_segments: int, document_name: str, citation: Citation) -> bool:
95+
def insert_document_metadata(self, corpus_id: UUID, document_id: UUID, num_segments: int, citation: Citation) -> bool:
9696
"""Inserts document metadata
9797
9898
Args:
9999
corpus_id (UUID): corpus id
100100
document_id (UUID): document id
101-
document_name (str): document name
102101
citation (Citation): citation object
103102
104103
Returns:
@@ -122,6 +121,7 @@ def get_document_citation(self, corpus_id: UUID, document_id: UUID) -> Citation:
122121
class DocumentEntity:
123122
corpus_id: UUID
124123
document_id: UUID
124+
# while strictly speaking this is metadata, this increases data readability
125125
document_name: str
126126
document: str
127127

memas/storage_driver/corpus_doc_metadata.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,12 @@ def init(self):
3131
def first_init(self):
3232
self.init()
3333

34-
def insert_document_metadata(self, corpus_id: UUID, document_id: UUID, num_segments: int, document_name: str, citation: Citation) -> bool:
34+
def insert_document_metadata(self, corpus_id: UUID, document_id: UUID, num_segments: int, citation: Citation) -> bool:
3535
"""Inserts document metadata
3636
3737
Args:
3838
corpus_id (UUID): corpus id
3939
document_id (UUID): document id
40-
document_name (str): document name
4140
citation (Citation): citation object
4241
4342
Returns:
@@ -47,7 +46,7 @@ def insert_document_metadata(self, corpus_id: UUID, document_id: UUID, num_segme
4746

4847
DocumentMetadata.create(corpus_id=corpus_id,
4948
document_id=document_id,
50-
document_name=document_name,
49+
document_name=citation.document_name,
5150
source_name=citation.source_name,
5251
source_uri=citation.source_uri,
5352
description=citation.description,
@@ -70,7 +69,8 @@ def get_document_citation(self, corpus_id: UUID, document_id: UUID) -> Citation:
7069
corpus_id=corpus_id, document_id=document_id)
7170
return Citation(source_uri=result.source_uri,
7271
source_name=result.source_name,
73-
description=result.description)
72+
description=result.description,
73+
document_name=result.document_name)
7474

7575
def get_document_segment_count(self, corpus_id: UUID, document_id: UUID) -> int:
7676
"""Retrieves the number of segments a stored document was split into

memas/storage_driver/corpus_vector_store.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
max_length=32, is_partition_key=True),
4242
FieldSchema(name=DOCUMENT_NAME, dtype=DataType.VARCHAR,
4343
max_length=256),
44-
FieldSchema(name="text_preview", dtype=DataType.VARCHAR, max_length=MAX_TEXT_LENGTH),
44+
FieldSchema(name=TEXT_PREVIEW, dtype=DataType.VARCHAR, max_length=MAX_TEXT_LENGTH),
4545
FieldSchema(name=EMBEDDING_FIELD, dtype=DataType.FLOAT_VECTOR, dim=USE_VECTOR_DIMENSION),
4646
FieldSchema(name=START_FIELD, dtype=DataType.INT64),
4747
FieldSchema(name=END_FIELD, dtype=DataType.INT64),

0 commit comments

Comments
 (0)