Skip to content
This repository has been archived by the owner on Aug 27, 2024. It is now read-only.

Commit

Permalink
Implemented delete corpus using celery. Also made many refactors (#42)
Browse files Browse the repository at this point in the history
  • Loading branch information
maxyu1115 authored Oct 30, 2023
1 parent 8d5affa commit 56d2eb7
Show file tree
Hide file tree
Showing 38 changed files with 690 additions and 133 deletions.
1 change: 1 addition & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Port to run memas
FLASK_RUN_PORT=8010

MEMAS_CONF_FILE=memas-config.yml

# Password for the 'elastic' user (at least 6 characters)
ELASTIC_PASSWORD=abc123
Expand Down
2 changes: 2 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[FORMAT]
max-line-length=120
8 changes: 4 additions & 4 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ To stop docker execution, run Control+C in the terminal you are running `docker

If you want to clean your local docker images, run
```bash
docker compose down --volumes
docker compose --profile dev down --volumes
```

FYI you may need to run `sysctl -w vm.max_map_count=262144` if you get an error when trying to start elasticsearch.
Expand All @@ -46,20 +46,20 @@ docker compose up
If this is your first time initializing the MeMaS server, after `docker compose up` and wait till the dependencies are fully started, run `source setup-env.sh`, then

```bash
flask --app 'memas.app:create_app(config_filename="memas-config.yml", first_init=True)' run
flask --app 'memas.app:create_app(first_init=True)' run
```

This will run for a while then exit. Upon exit, your MeMaS is properly setup. **NOTE: Only run this phase when you are working with a clean set of docker dependencies, aka a fresh start or after `docker compose down --volumes`.**

After MeMaS is properly initialized, run `source setup-env.sh`, then:
```bash
flask --app 'memas.app:create_app(config_filename="memas-config.yml")' run
flask --app 'memas.app:create_app' run
```
to start the memas server.

And to run the app with wsgi server, run
```bash
gunicorn -w 1 -k eventlet 'memas.app:create_app(config_filename="memas-config.yml")'
gunicorn -w 1 -k eventlet 'memas.app:create_app'
```
note `-w` sets the number of worker threads.

Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ COPY --chmod=0755 memas-docker/init.sh ./init.sh

# Copy in the default config
ARG conf_file=memas-config.yml
ENV conf_file=${conf_file}
ENV MEMAS_CONF_FILE=${conf_file}
COPY memas-docker/${conf_file} ./memas/${conf_file}
# TODO: provide way to use custom configs in docker compose

Expand All @@ -34,4 +34,4 @@ ENV PYTHONPATH "$PYTHONPATH:memas"


EXPOSE 8010
CMD gunicorn -b :8010 -w 1 -k eventlet "memas.app:create_app(config_filename=\"${conf_file}\")"
CMD gunicorn -b :8010 -w 1 -k eventlet "memas.app:create_app()"
37 changes: 34 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,29 @@ services:
# command: ./wait-for-it.sh milvus-standalone:19530 -t 300 -- gunicorn -w 1 -k eventlet 'memas.app:create_app(config_filename="memas-config.yml")'
profiles: ["dev"]

memas-worker:
build:
context: .
image: memas:latest
container_name: memas-worker
depends_on:
memas-init:
condition: service_completed_successfully
env_file:
- .env
volumes:
- memas_data:/memas
command: celery --app memas.make_celery worker --loglevel INFO
profiles: ["dev"]

redis:
image: redis
container_name: redis
ports:
- 6379:6379
volumes:
- redis_data:/data

scylla:
image: scylladb/scylla
container_name: scylla
Expand All @@ -63,6 +86,11 @@ services:
volumes:
- etcd_data:/etcd
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
healthcheck:
test: ["CMD", "etcdctl", "endpoint", "health"]
interval: 30s
timeout: 20s
retries: 3

minio:
container_name: milvus-minio
Expand All @@ -71,8 +99,8 @@ services:
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
volumes:
- minio_data:/data
command: minio server /data
- minio_data:/minio_data
command: minio server /minio_data --console-address ":9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
Expand All @@ -81,7 +109,7 @@ services:

milvus:
container_name: milvus-standalone
image: milvusdb/milvus:v2.2.8
image: milvusdb/milvus:v2.3.2
command: ["milvus", "run", "standalone"]
environment:
ETCD_ENDPOINTS: etcd:2379
Expand Down Expand Up @@ -117,6 +145,9 @@ volumes:
memas_data:
driver: local

redis_data:
driver: local

esdata01:
driver: local

Expand Down
16 changes: 11 additions & 5 deletions integration-tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from pymilvus import connections as milvus_connection
from elasticsearch import Elasticsearch
from flask import Flask, Config
from memas import context_manager
from memas.app import create_app
from memas.context_manager import ctx, EnvironmentConstants, read_env
from memas.storage_driver import corpus_doc_store, corpus_vector_store


Expand All @@ -23,7 +23,7 @@
def clean_resources():
config = Config(os.getcwd() + "/memas/")
config.from_file(CONFIG_PATH, load=yaml.safe_load)
constants = EnvironmentConstants(config)
constants = context_manager.EnvironmentConstants(config)

try:
connection.setup([constants.cassandra_ip], "system", protocol_version=4)
Expand Down Expand Up @@ -54,9 +54,9 @@ def create_test_app():
# first init to setup
with pytest.raises(SystemExit):
# Note that first init should exit after initializing. So we need to catch and verify
create_app(CONFIG_PATH, first_init=True)
create_app(config_filename=CONFIG_PATH, first_init=True)

app = create_app(CONFIG_PATH)
app = create_app(config_filename=CONFIG_PATH)

return app

Expand All @@ -72,4 +72,10 @@ def test_client():
@pytest.fixture
def es_client():
with app.app_context():
yield ctx.es
yield context_manager.ctx.es


@pytest.fixture
def ctx():
with app.app_context():
yield context_manager.ctx
35 changes: 30 additions & 5 deletions integration-tests/corpus/test_basic_corpus.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import numpy as np
import uuid
import time
from memas.context_manager import ctx
from memas.corpus import basic_corpus
from memas.interface.corpus import Citation
from memas.interface.corpus import Citation, CorpusInfo, CorpusType

corpus_name = "test corpus1"


def test_save_then_search_one_corpus(es_client):
test_corpus = basic_corpus.BasicCorpus(
uuid.uuid4(), corpus_name, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)
def test_save_then_search_one_corpus(ctx):
namespace_id = uuid.uuid4()
corpus_id = uuid.uuid4()
corpus_info = CorpusInfo("test_corpus_save_search", namespace_id, corpus_id, CorpusType.CONVERSATION)
test_corpus = basic_corpus.BasicCorpus(corpus_info, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)

text1 = "The sun is high. California sunshine is great. "
text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks"
Expand All @@ -26,3 +27,27 @@ def test_save_then_search_one_corpus(es_client):
# print(output)
assert "sunshine" in output[1][0]
assert "weather" in output[0][0]


def test_delete_all_content(ctx):
namespace_id = uuid.uuid4()
corpus_id = uuid.uuid4()
corpus_info = CorpusInfo("test_delete_all_content", namespace_id, corpus_id, CorpusType.CONVERSATION)
test_corpus = basic_corpus.BasicCorpus(corpus_info, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec)

text1 = "The sun is high. California sunshine is great. "
text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks"
text3 = "The weather is great today, but I worry that tomorrow it won't be. My umbrella is in the repair shop."

assert test_corpus.store_and_index(text1, Citation("www.docsource1", "SSSdoc1", "", "doc1"))
assert test_corpus.store_and_index(text2, Citation("were.docsource2", "SSSdoc2", "", "doc2"))
assert test_corpus.store_and_index(text3, Citation("docsource3.ai", "SSSdoc3", "", "doc3"))

time.sleep(1)
output = test_corpus.search("It is sunny")
assert "sunshine" in output[1][0]

test_corpus.delete_all_content()
time.sleep(1)
output = test_corpus.search("It is sunny")
assert output == []
5 changes: 5 additions & 0 deletions integration-tests/integ-test-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@ ELASTICSEARCH:
MILVUS:
ip: "127.0.0.1"
port: 19530

CELERY:
broker_url: "redis://localhost"
result_backend: "redis://localhost"
task_ignore_result: True
14 changes: 14 additions & 0 deletions integration-tests/storage_driver/test_corpus_doc_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest
import uuid
from memas.interface.corpus import Citation
from memas.interface.exceptions import DocumentMetadataNotFound
from memas.storage_driver.corpus_doc_metadata import CorpusDocumentMetadataStoreImpl


Expand All @@ -17,3 +19,15 @@ def test_insert_and_get():
citation = Citation("google.com", "test google", "just a simple test", "test")
metadata.insert_document_metadata(corpus_id, document_id, 1, citation)
assert metadata.get_document_citation(corpus_id, document_id) == citation


def test_delete_corpus():
corpus_id = uuid.uuid4()
document_id = uuid.uuid4()

citation = Citation("google.com", "test google", "just a simple test", "test")
metadata.insert_document_metadata(corpus_id, document_id, 1, citation)
metadata.delete_corpus(corpus_id)

with pytest.raises(DocumentMetadataNotFound):
metadata.get_document_citation(corpus_id, document_id)
8 changes: 4 additions & 4 deletions integration-tests/storage_driver/test_corpus_doc_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
import time


def test_init(es_client: Elasticsearch):
doc_store = corpus_doc_store.ESDocumentStore(es_client)
def test_init(ctx: ContextManager):
doc_store = corpus_doc_store.ESDocumentStore(ctx.es)
doc_store.init()


def test_save_then_search(es_client):
doc_store = corpus_doc_store.ESDocumentStore(es_client)
def test_save_then_search(ctx: ContextManager):
doc_store = corpus_doc_store.ESDocumentStore(ctx.es)
doc_store.init()

corpus_id1 = uuid.uuid1()
Expand Down
26 changes: 25 additions & 1 deletion integration-tests/storage_driver/test_corpus_vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_init():
store.init()


def test_save_then_search2():
def test_save_then_search():
print("before UIDs")
corpus_id1 = uuid.uuid4()
corpus_id2 = uuid.uuid4()
Expand Down Expand Up @@ -46,3 +46,27 @@ def test_save_then_search2():

# Test that the document stored in the other corpus isn't a result
assert document_id3 not in {t[1].document_id for t in result}


def test_delete_corpus():
corpus_id1 = uuid.uuid4()
document_id0 = uuid.uuid4()
document_id1 = uuid.uuid4()
document_id2 = uuid.uuid4()

best_match_str = "The sun is high. California sunshine is great. "

store.save_documents([DocumentEntity(corpus_id1, document_id0, "doc0",
"Before This is a runon sentence meant to test the logic of the splitting capabilites but that is only the start, there is nothing that can break this sentecne up other than some handy logic even in the worst case, too bad I only know how to use commas")])
store.save_documents([DocumentEntity(corpus_id1, document_id1, "doc1",
"The sun is high! California sunshine is great. Did you catch my quest? Oh oh! lol")])
store.save_documents([DocumentEntity(corpus_id1, document_id2, "doc2",
"I picked up my phone and then dropped it again")])
time.sleep(1)

store.delete_corpus(corpus_id1)
time.sleep(1)

result = store.search_corpora([corpus_id1], "How's the weather today?")

assert len(result) == 0
61 changes: 58 additions & 3 deletions integration-tests/storage_driver/test_memas_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
from memas.interface.corpus import CorpusType
from memas.interface.exceptions import NamespaceExistsException
import uuid
from memas.interface.corpus import CorpusInfo, CorpusType
from memas.interface.exceptions import IllegalStateException, NamespaceExistsException, NamespaceDoesNotExistException
from memas.interface.namespace import ROOT_ID
from memas.storage_driver.memas_metadata import MemasMetadataStoreImpl

Expand Down Expand Up @@ -56,4 +57,58 @@ def test_get_query_corpora():
corpus_id2 = metadata.create_conversation_corpus("test_get_query_corpora:convo1")

corpora = metadata.get_query_corpora("test_get_query_corpora")
assert corpora == {corpus_id1, corpus_id2}
assert corpora == {
CorpusInfo("test_get_query_corpora:knowledge1", namespace_id, corpus_id1, CorpusType.CONVERSATION),
CorpusInfo("test_get_query_corpora:convo1", namespace_id, corpus_id2, CorpusType.CONVERSATION)
}


def test_initiate_delete_corpus():
metadata.init()

namespace_id = metadata.create_namespace("test_initiate_delete_corpus")
corpus_pathname = "test_initiate_delete_corpus:convo1"
corpus_id = metadata.create_conversation_corpus(corpus_pathname)

corpus_info = metadata.get_corpus_info(corpus_pathname)
assert corpus_info.corpus_id == corpus_id

metadata.initiate_delete_corpus(namespace_id, corpus_id, corpus_pathname)

# since it's not a full delete, we'll still be able to query metadata given the ids
corpus_info = metadata.get_corpus_info_by_id(namespace_id, corpus_id)
assert corpus_info.corpus_id == corpus_id

with pytest.raises(NamespaceDoesNotExistException):
corpus_info = metadata.get_corpus_info(corpus_pathname)


def test_initiate_delete_wrong_corpus():
metadata.init()

namespace_id = metadata.create_namespace("initiate_delete_wrong_corpus")
corpus_pathname = "initiate_delete_wrong_corpus:convo1"
corpus_id = metadata.create_conversation_corpus(corpus_pathname)

corpus_info = metadata.get_corpus_info(corpus_pathname)
assert corpus_info.corpus_id == corpus_id

with pytest.raises(NamespaceDoesNotExistException):
metadata.initiate_delete_corpus(namespace_id, uuid.uuid4(), corpus_pathname)


def test_finish_delete_corpus():
metadata.init()

namespace_id = metadata.create_namespace("test_finish_delete_corpus")
corpus_pathname = "test_finish_delete_corpus:convo1"
corpus_id = metadata.create_conversation_corpus(corpus_pathname)

corpus_info = metadata.get_corpus_info_by_id(namespace_id, corpus_id)
assert corpus_info.corpus_id == corpus_id

metadata.initiate_delete_corpus(namespace_id, corpus_id, corpus_pathname)
metadata.finish_delete_corpus(namespace_id, corpus_id)

with pytest.raises(IllegalStateException):
metadata.get_corpus_info_by_id(namespace_id, corpus_id)
15 changes: 15 additions & 0 deletions integration-tests/test_celery_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest
from unittest import mock
import memas.celery_worker
from memas.interface.exceptions import NamespaceDoesNotExistException


@mock.patch("memas.celery_worker.time.sleep")
def test_delete_corpus(mock_sleep, ctx):
namespace_id = ctx.memas_metadata.create_namespace("celery_delete_corpus")
corpus_pathname = "celery_delete_corpus:corpus1"
corpus_id = ctx.memas_metadata.create_conversation_corpus(corpus_pathname)
memas.celery_worker.delete_corpus(namespace_id, corpus_id, corpus_pathname)

with pytest.raises(NamespaceDoesNotExistException):
ctx.memas_metadata.get_corpus_ids_by_name(corpus_pathname)
Loading

0 comments on commit 56d2eb7

Please sign in to comment.