From 8ccf77a9eed250bd5923675a933ba46505166ca1 Mon Sep 17 00:00:00 2001 From: Max Yu <18641481+maxyu1115@users.noreply.github.com> Date: Wed, 6 Sep 2023 21:33:55 -0700 Subject: [PATCH] Created docker image and compose configuration for MeMaS (#39) --- .dockerignore | 1 + .github/workflows/python-app.yml | 2 +- CONTRIBUTING.md | 32 +-- Dockerfile | 37 ++++ docker-compose.yml | 56 +++++- integration-tests/corpus/test_basic_corpus.py | 4 +- memas-docker/init.sh | 29 +++ memas-docker/memas-config.yml | 13 ++ memas-docker/wait-for-it.sh | 183 ++++++++++++++++++ memas/app.py | 1 + memas/context_manager.py | 2 +- memas/corpus/basic_corpus.py | 2 +- memas/corpus/corpus_provider.py | 4 +- 13 files changed, 342 insertions(+), 24 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 memas-docker/init.sh create mode 100644 memas-docker/memas-config.yml create mode 100644 memas-docker/wait-for-it.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..20cc2f9 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +memas/memas-config.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index c523d9f..4f98d5f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -34,7 +34,7 @@ jobs: - name: Run integration tests run: | source setup-env.sh - docker compose up --detach --wait --wait-timeout 60 + docker compose up --build --detach --wait --wait-timeout 60 python3 -m pytest integration-tests docker compose down --volumes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index df71631..f630afa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,9 +15,17 @@ If you are working using WSL, follow this guide to [configure Docker](https://do Run `source setup-env.sh`, this will install all the needed development tools, as well as setup the needed environment variables. 
+**And please run `source format.sh` before each commit!** + **NOTE that this command needs to be ran for each new shell instance, since it sets up environment variables.** ### Using Docker -In the top level of this repo, run `docker compose up`, and it will spin up 1 es nodes, 1 scylla nodes, 1 milvus node, and a few more. This is a very basic development setup. +In the top level of this repo, run + +```bash +docker compose --profile dev up --build +``` + +This will spin up 1 MeMaS instance running in gunicorn, 1 es node, 1 scylla node, 1 milvus node, and a few more. This is a very basic development setup. To stop docker execution, run Control+C in the terminal you are running `docker compose up`, or run `docker compose down`. @@ -28,30 +36,28 @@ docker compose down --volumes FYI you may need to run `sysctl -w vm.max_map_count=262144` if you get an error when trying to start elasticsearch. -### First time initializing the MeMaS server -**NOTE: Only run this phase when you are working with a clean set of docker dependencies, aka a fresh start or after `docker compose down --volumes`.** +### Developing with local MeMaS outside of Docker +If you only need the MeMaS dependencies and want to run flask/gunicorn locally outside of docker, run this instead to bring up the dependencies in docker: -Due to the service dependencies, the first time running MeMaS, we need to use a special command to initialize and configure the dependencies. +```bash +docker compose up +``` -After `source setup-env.sh` and `docker compose up`, wait till the services are fully started. +If this is your first time initializing the MeMaS server, run `docker compose up`, wait until the dependencies are fully started, run `source setup-env.sh`, then run -Then run ```bash flask --app 'memas.app:create_app(config_filename="memas-config.yml", first_init=True)' run ``` -This will run for a while then exit. Upon exit, your MeMaS is properly setup. +This will run for a while then exit. 
Upon exit, your MeMaS is properly setup. **NOTE: Only run this phase when you are working with a clean set of docker dependencies, aka a fresh start or after `docker compose down --volumes`.** -### Running the MeMaS server -After `source setup-env.sh` and `docker compose up`, wait till the services are fully started. - -Then run +After MeMaS is properly initialized, run `source setup-env.sh`, then: ```bash flask --app 'memas.app:create_app(config_filename="memas-config.yml")' run ``` -to start the memas server +to start the memas server. -To run the app with wsgi server, run +And to run the app with wsgi server, run ```bash gunicorn -w 1 -k eventlet 'memas.app:create_app(config_filename="memas-config.yml")' ``` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2c3abc6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM python:3.10 + +# Create app directory +WORKDIR /memas + +# Install Universal Sentence Encoder (URL is quoted so the shell never treats `?` as a glob) +RUN wget "https://tfhub.dev/google/universal-sentence-encoder/4?tf-hub-format=compressed" -O use4.tar +RUN mkdir -p encoder/universal-sentence-encoder_4 +RUN tar -xf use4.tar -C encoder/universal-sentence-encoder_4 +RUN rm use4.tar + +# Install app dependencies +COPY requirements.txt ./requirements.txt +RUN pip install -r requirements.txt +RUN python3 -c "import nltk; nltk.download('punkt')" + + +# Bundle app source +COPY logging.ini ./logging.ini +COPY memas ./memas +COPY --chmod=0755 memas-docker/wait-for-it.sh ./wait-for-it.sh +COPY --chmod=0755 memas-docker/init.sh ./init.sh + + +# Copy in the default config +ARG conf_file=memas-config.yml +ENV conf_file=${conf_file} +COPY memas-docker/${conf_file} ./memas/${conf_file} +# TODO: provide way to use custom configs in docker compose + + +# Set the python path to include memas, since memas isn't technically a python package +ENV PYTHONPATH="${PYTHONPATH}:memas" + + +EXPOSE 8010 +CMD exec gunicorn -b :8010 -w 1 -k eventlet "memas.app:create_app(config_filename=\"${conf_file}\")" diff --git 
a/docker-compose.yml b/docker-compose.yml index dde57bd..b7b70a2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,14 +1,56 @@ version: '3' services: + + memas-init: + build: + context: . + image: memas:latest + container_name: memas-init + depends_on: + scylla: + condition: service_healthy + milvus: + condition: service_started + es01: + condition: service_started + env_file: + - .env + volumes: + - memas_data:/memas + command: /memas/init.sh 30 + profiles: ["dev"] + + memas: + build: + context: . + image: memas:latest + container_name: memas + depends_on: + memas-init: + condition: service_completed_successfully + env_file: + - .env + volumes: + - memas_data:/memas + ports: + - 8010:8010 + # command: ./wait-for-it.sh milvus-standalone:19530 -t 300 -- gunicorn -w 1 -k eventlet 'memas.app:create_app(config_filename="memas-config.yml")' + profiles: ["dev"] + scylla: image: scylladb/scylla container_name: scylla command: --smp=2 ports: - - "9042:9042" + - 9042:9042 volumes: - scylla_data:/var/lib/scylla + healthcheck: + test: ["CMD-SHELL", "[ $$(nodetool statusgossip) = running ]"] + interval: 10s + timeout: 5s + retries: 10 etcd: container_name: milvus-etcd @@ -37,7 +79,7 @@ services: timeout: 20s retries: 3 - standalone: + milvus: container_name: milvus-standalone image: milvusdb/milvus:v2.2.8 command: ["milvus", "run", "standalone"] @@ -47,13 +89,14 @@ services: volumes: - milvus_data:/var/lib/milvus ports: - - "19530:19530" - - "9091:9091" + - 19530:19530 + - 9091:9091 depends_on: - "etcd" - "minio" es01: + container_name: memas-es01 image: elasticsearch:${ES_VERSION} volumes: - esdata01:/usr/share/elasticsearch/data @@ -71,6 +114,9 @@ services: volumes: + memas_data: + driver: local + esdata01: driver: local @@ -88,4 +134,4 @@ volumes: networks: default: - name: milvus_dev \ No newline at end of file + name: memas_dev diff --git a/integration-tests/corpus/test_basic_corpus.py b/integration-tests/corpus/test_basic_corpus.py index c7dea18..896766b 
100644 --- a/integration-tests/corpus/test_basic_corpus.py +++ b/integration-tests/corpus/test_basic_corpus.py @@ -7,8 +7,10 @@ corpus_name = "test corpus1" + def test_save_then_search_one_corpus(es_client): - test_corpus = basic_corpus.BasicCorpus(uuid.uuid4(), corpus_name, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec) + test_corpus = basic_corpus.BasicCorpus( + uuid.uuid4(), corpus_name, ctx.corpus_metadata, ctx.corpus_doc, ctx.corpus_vec) text1 = "The sun is high. California sunshine is great. " text2 = "I picked up my phone and then dropped it again. I cant seem to get a good grip on things these days. It persists into my everyday tasks" diff --git a/memas-docker/init.sh b/memas-docker/init.sh new file mode 100644 index 0000000..ae61126 --- /dev/null +++ b/memas-docker/init.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Init script for MeMaS. Sleeps for x seconds to wait for service initialization + +# Check if an argument is provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 <seconds>" >&2 + exit 1 +fi + +num=$1 +version="2023-09-06" + +# Check if the entered value is a valid number +if ! [[ "$num" =~ ^[0-9]+$ ]]; then + echo "Please enter a valid number." >&2 + exit 1 +fi + +# TODO: introduce actual way of waiting for the dependencies reliably, instead of sleeping. (Note even after the current health checks path, scylla is still not ready) +echo "sleeping $num" +sleep "$num" + + +if [ ! -e /memas/first-init.lock ] +then + # Only if initialization succeeded, write our current version to the lock file (the redirect creates it); on failure no lock is left behind and the script exits non-zero + # FIXME: is running flask instead of gunicorn a security concern? 
Gunicorn keeps on trying to restart the worker thread despite we're intentionally exiting + flask --app "memas.app:create_app(config_filename=\"$conf_file\", first_init=True)" run && echo "$version" > /memas/first-init.lock +fi diff --git a/memas-docker/memas-config.yml b/memas-docker/memas-config.yml new file mode 100644 index 0000000..1574691 --- /dev/null +++ b/memas-docker/memas-config.yml @@ -0,0 +1,13 @@ +CASSANDRA: + ip: "scylla" + port: 9042 + keyspace: "memas" + replication_factor: 1 + +ELASTICSEARCH: + ip: "memas-es01" + port: 9200 + +MILVUS: + ip: "milvus-standalone" + port: 19530 diff --git a/memas-docker/wait-for-it.sh b/memas-docker/wait-for-it.sh new file mode 100644 index 0000000..b645310 --- /dev/null +++ b/memas-docker/wait-for-it.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +# COPIED from https://github.com/vishnubob/wait-for-it +# Use this script to test if a given TCP host/port are available + +WAITFORIT_cmdname=${0##*/} + +echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } + +usage() +{ + cat << USAGE >&2 +Usage: + $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] + -h HOST | --host=HOST Host or IP under test + -p PORT | --port=PORT TCP port under test + Alternatively, you specify the host and port as host:port + -s | --strict Only execute subcommand if the test succeeds + -q | --quiet Don't output any status messages + -t TIMEOUT | --timeout=TIMEOUT + Timeout in seconds, zero for no timeout + -- COMMAND ARGS Execute command with args after the test finishes +USAGE + exit 1 +} + +wait_for() +{ + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + else + echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" + fi + WAITFORIT_start_ts=$(date +%s) + while : + do + if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then + nc -z $WAITFORIT_HOST $WAITFORIT_PORT + 
WAITFORIT_result=$? + else + (echo -n > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 + WAITFORIT_result=$? + fi + if [[ $WAITFORIT_result -eq 0 ]]; then + WAITFORIT_end_ts=$(date +%s) + echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" + break + fi + sleep 1 + done + return $WAITFORIT_result +} + +wait_for_wrapper() +{ + # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 + if [[ $WAITFORIT_QUIET -eq 1 ]]; then + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + else + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + fi + WAITFORIT_PID=$! + trap "kill -INT -$WAITFORIT_PID" INT + wait $WAITFORIT_PID + WAITFORIT_RESULT=$? + if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + 
;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# Check to see if timeout is from busybox? +WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) + +WAITFORIT_BUSYTIMEFLAG="" +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + # Check if busybox timeout uses -t flag + # (recent Alpine versions don't support -t anymore) + if timeout &>/dev/stdout | grep -q -e '-t '; then + WAITFORIT_BUSYTIMEFLAG="-t" + fi +else + WAITFORIT_ISBUSY=0 +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? 
+ fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi diff --git a/memas/app.py b/memas/app.py index a053237..78545b8 100644 --- a/memas/app.py +++ b/memas/app.py @@ -10,6 +10,7 @@ def create_app(config_filename, *, first_init=False): app.ctx: ContextManager = ContextManager(app.config) if first_init: app.ctx.first_init() + app.logger.info("Finished first time initialization") exit(0) app.ctx.init() diff --git a/memas/context_manager.py b/memas/context_manager.py index eaf98db..c8a6375 100644 --- a/memas/context_manager.py +++ b/memas/context_manager.py @@ -91,7 +91,7 @@ def setup_cassandra_keyspace(self): def init_clients(self) -> None: # connect to cassandra - c_connection.setup(['127.0.0.1'], self.consts.cassandra_keyspace, protocol_version=4) + c_connection.setup([self.consts.cassandra_ip], self.consts.cassandra_keyspace, protocol_version=4) # TODO: properly support https # connect to elastic search diff --git a/memas/corpus/basic_corpus.py b/memas/corpus/basic_corpus.py index f2515f8..e80d24b 100644 --- a/memas/corpus/basic_corpus.py +++ b/memas/corpus/basic_corpus.py @@ -108,7 +108,7 @@ def __init__(self, metadata_store: CorpusDocumentMetadataStore, doc_store: Corpu self.metadata_store: CorpusDocumentMetadataStore = metadata_store self.doc_store: CorpusDocumentStore = doc_store self.vec_store: CorpusVectorStore = vec_store - + def produce(self, corpus_id: uuid.UUID): # TODO: Maybe change the Corpus Name Parameter return BasicCorpus(corpus_id, "BasicCorpus", self.metadata_store, self.doc_store, self.vec_store) diff --git a/memas/corpus/corpus_provider.py b/memas/corpus/corpus_provider.py index 4e068b0..c8fab86 100644 --- a/memas/corpus/corpus_provider.py +++ b/memas/corpus/corpus_provider.py @@ -11,12 +11,12 @@ class CorpusProvider: 
def __init__(self, metadata_store: CorpusDocumentMetadataStore, doc_store: CorpusDocumentStore, vec_store: CorpusVectorStore) -> None: self.factory_dict: dict[CorpusType, CorpusFactory] = dict() - + basic_corpus_factory = BasicCorpusFactory(metadata_store, doc_store, vec_store) self.factory_dict[CorpusType.CONVERSATION] = basic_corpus_factory self.factory_dict[CorpusType.KNOWLEDGE] = basic_corpus_factory - def get_corpus(self, corpus_id: UUID, *, corpus_type: CorpusType, namespace_id: UUID=None) -> Corpus: + def get_corpus(self, corpus_id: UUID, *, corpus_type: CorpusType, namespace_id: UUID = None) -> Corpus: """Gets the Corpus class based on the corpus_id Args: