marqo-ai · vicilliar · Oct 2, 2024 · Oct 9, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/.dockerignore b/.dockerignore
@@ -139,5 +139,6 @@ local_only/
 tests/cache/
 
 cache/
+src/marqo/cache/
 
 __pycache__/
diff --git a/.github/workflows/arm64_docker_marqo.yml b/.github/workflows/arm64_docker_marqo.yml
@@ -83,7 +83,7 @@ jobs:
  with:
  fetch-depth: 0
 
- - name: Set up Python 3.9 # TODO: Check if 3.9 is okay instead of 3.8. So far, so good
+ - name: Set up Python 3.9
  run: |
  apt-get -y update
  apt-get -y install python3.9

diff --git a/.github/workflows/cpu_docker_marqo.yml b/.github/workflows/cpu_docker_marqo.yml
@@ -91,10 +91,10 @@ jobs:
  with:
  fetch-depth: 0
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
  cache: "pip"
 
  - name: Install Dependencies

diff --git a/.github/workflows/cpu_local_marqo.yml b/.github/workflows/cpu_local_marqo.yml
@@ -91,10 +91,10 @@ jobs:
  with:
  fetch-depth: 0
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
  cache: "pip"
 
  - name: Install Dependencies

diff --git a/.github/workflows/cuda_docker_marqo.yml b/.github/workflows/cuda_docker_marqo.yml
@@ -86,10 +86,10 @@ jobs:
  with:
  fetch-depth: 0
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
  cache: "pip"
 
  - name: Install Dependencies

diff --git a/.github/workflows/largemodel_unit_test_CI.yml b/.github/workflows/largemodel_unit_test_CI.yml
@@ -66,10 +66,10 @@ jobs:
  fetch-depth: 0
  path: marqo
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
  cache: "pip"
 
  - name: Checkout marqo-base for requirements

diff --git a/.github/workflows/locust_perf_test.yml b/.github/workflows/locust_perf_test.yml
@@ -112,10 +112,10 @@ jobs:
  with:
  ref: ${{ github.event.inputs.marqo_ref }}
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
 
  - name: Set up Docker Buildx
  if: github.event.inputs.marqo_host == 'http://localhost:8882' && github.event.inputs.image_to_test == 'marqo_docker_0'

diff --git a/.github/workflows/test_documentation.yml b/.github/workflows/test_documentation.yml
@@ -20,10 +20,10 @@ jobs:
  fetch-depth: 0
  path: marqo
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
  cache: "pip"
 
  - name: Install dependencies

diff --git a/.github/workflows/unit_test_200gb_CI.yml b/.github/workflows/unit_test_200gb_CI.yml
@@ -66,10 +66,10 @@ jobs:
  fetch-depth: 0
  path: marqo
 
- - name: Set up Python 3.8
+ - name: Set up Python 3.9
  uses: actions/setup-python@v3
  with:
- python-version: "3.8"
+ python-version: "3.9"
  cache: "pip"
 
  - name: Checkout marqo-base for requirements

diff --git a/Dockerfile b/Dockerfile
@@ -6,7 +6,7 @@ COPY vespa .
 RUN mvn clean package
 
 # Stage 2: Base image for Python setup
-FROM marqoai/marqo-base:36 as base_image
+FROM marqoai/marqo-base:37 as base_image
 
 # Allow mounting volume containing data and configs for vespa
 VOLUME /opt/vespa/var

diff --git a/tests/s2_inference/embeddings_reference/embeddings_large_e5_python_3_8.json b/tests/s2_inference/embeddings_reference/embeddings_large_e5_python_3_8.json
diff --git a/tests/s2_inference/embeddings_reference/embeddings_large_multilingual_e5_python_3_8.json b/tests/s2_inference/embeddings_reference/embeddings_large_multilingual_e5_python_3_8.json
diff --git a/tests/s2_inference/embeddings_reference/embeddings_large_open_clip_python_3_8.json b/tests/s2_inference/embeddings_reference/embeddings_large_open_clip_python_3_8.json
diff --git a/tests/s2_inference/embeddings_reference/embeddings_open_clip_python_3_8.json b/tests/s2_inference/embeddings_reference/embeddings_open_clip_python_3_8.json
diff --git a/tests/s2_inference/embeddings_reference/embeddings_python_3_8.json b/tests/s2_inference/embeddings_reference/embeddings_python_3_8.json
diff --git a/tests/s2_inference/embeddings_reference/info.txt b/tests/s2_inference/embeddings_reference/info.txt
@@ -0,0 +1,7 @@
+2024-10-16 - All embeddings were generated with:
+- Marqo mainline head: 055237ae6c4a8121b4026650582f3a23bd416564 (2.12.2 release notes)
+- Python 3.8.20
+- open_clip_torch==2.24.0
+- torch==1.12.1
+- Ubuntu 22.04.4 LTS
+- g4dn.xlarge EC2 instance
diff --git a/tests/s2_inference/test_encoding.py b/tests/s2_inference/test_encoding.py
@@ -1,12 +1,15 @@
 import unittest
 import torch
+import json
+import numpy as np
 from unittest.mock import MagicMock, patch
 from marqo.s2_inference.types import FloatTensor
 from marqo.s2_inference.s2_inference import clear_loaded_models, get_model_properties_from_registry
 from marqo.s2_inference.model_registry import load_model_properties, _get_open_clip_properties
 from marqo.s2_inference.s2_inference import _convert_tensor_to_numpy
 import numpy as np
 import functools
+import os
 
 from marqo.s2_inference.s2_inference import (
  _check_output_type, vectorise,
@@ -17,6 +20,13 @@
 
 _load_model = functools.partial(og_load_model, calling_func = "unit_test")
 
+
+def get_absolute_file_path(filename: str) -> str:
+ currentdir = os.path.dirname(os.path.abspath(__file__))
+ abspath = os.path.join(currentdir, filename)
+ return abspath
+
+
 class TestEncoding(unittest.TestCase):
 
  def setUp(self) -> None:
@@ -26,8 +36,12 @@ def tearDown(self) -> None:
  clear_loaded_models()
 
  def test_vectorize(self):
- names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32",
- 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
+ """
+ Ensure that the model.encode output matches both the output of the vectorise function and the
+ hardcoded embeddings from Python 3.8.20 (the hardcoded comparison is only done for string inputs)
+ """
+
+ names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
  "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
  "hf/bge-small-en-v1.5", "onnx/all-MiniLM-L6-v1", "onnx/all_datasets_v4_MiniLM-L6"]
 
@@ -43,21 +57,42 @@ def test_vectorize(self):
  sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
  device = 'cpu'
  eps = 1e-9
+ embeddings_file_name = get_absolute_file_path("embeddings_reference/embeddings_python_3_8.json")
+
+ # Load in hardcoded embeddings json file
+ with open(embeddings_file_name, "r") as f:
+ embeddings_python_3_8 = json.load(f)
 
  for name in names:
- model_properties = get_model_properties_from_registry(name)
- model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
+ with self.subTest(name=name):
+ # Look up the model properties and load the model whose encode output is checked below.
+ model_properties = get_model_properties_from_registry(name)
+ model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
 
- for sentence in sentences:
- output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+ for sentence in sentences:
+ with self.subTest(sentence=sentence):
+ output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+ assert _check_output_type(output_v)
 
- assert _check_output_type(output_v)
+  output_m = model.encode(sentence, normalize=True)
 
- output_m = model.encode(sentence, normalize=True)
+ # Embeddings must match hardcoded python 3.8.20 embeddings
+ if isinstance(sentence, str):
+ with self.subTest("Hardcoded Python 3.8 Embeddings Comparison"):
+ try:
+ self.assertEqual(np.allclose(output_m, embeddings_python_3_8[name][sentence],
+ atol=1e-6),
+ True)
+ except KeyError:
+ raise KeyError(f"Hardcoded Python 3.8 embeddings not found for "
+ f"model: {name}, sentence: {sentence} in JSON file: "
+ f"{embeddings_file_name}")
 
- assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+ with self.subTest("Model encode vs vectorize"):
+ self.assertEqual(np.allclose(output_m, output_v, atol=eps), True)
+
+ clear_loaded_models()
 
- clear_loaded_models()
 
  def test_vectorize_normalise(self):
  open_clip_names = ["open_clip/ViT-B-32/laion2b_s34b_b79k"]
@@ -120,6 +155,7 @@ def test_cpu_encode_type(self):
 
  clear_loaded_models()
 
+
  def test_load_clip_text_model(self):
  names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
  'RN50', "ViT-B/16"]
@@ -313,6 +349,11 @@ def test_open_clip_vectorize(self):
  sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
  device = 'cpu'
  eps = 1e-9
+ embeddings_reference_file = get_absolute_file_path("embeddings_reference/embeddings_open_clip_python_3_8.json")
+
+ # Load in hardcoded embeddings json file
+ with open(embeddings_reference_file, "r") as f:
+ embeddings_python_3_8 = json.load(f)
 
  for name in names:
  model_properties = get_model_properties_from_registry(name)
@@ -327,7 +368,21 @@ def test_open_clip_vectorize(self):
 
  output_m = model.encode(sentence, normalize=normalize_embeddings)
 
- assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+ # Embeddings must match hardcoded python 3.8.20 embeddings
+ if isinstance(sentence, str):
+ with self.subTest("Hardcoded Python 3.8 Embeddings Comparison"):
+ try:
+ self.assertEqual(np.allclose(output_m, embeddings_python_3_8[name][sentence], atol=1e-5),
+ True, f"For model {name} and sentence {sentence}: "
+ f"Calculated embedding is {output_m} but "
+ f"hardcoded embedding is {embeddings_python_3_8[name][sentence]}")
+ except KeyError:
+ raise KeyError(f"Hardcoded Python 3.8 embeddings not found for "
+ f"model: {name}, sentence: {sentence} in JSON file: "
+ f"{embeddings_reference_file}")
+
+ with self.subTest("Model encode vs vectorize"):
+ self.assertEqual(np.allclose(output_m, output_v, atol=eps), True)
 
  clear_loaded_models()
 

diff --git a/tests/s2_inference/test_large_model_encoding.py b/tests/s2_inference/test_large_model_encoding.py
@@ -1,6 +1,7 @@
 import os
 import torch
 import pytest
+import json
 from marqo.s2_inference.types import FloatTensor
 from marqo.s2_inference.s2_inference import clear_loaded_models, get_model_properties_from_registry, _convert_tensor_to_numpy
 from unittest.mock import patch
@@ -34,10 +35,31 @@ def remove_cached_model_files():
  elif os.path.isdir(item_path):
  shutil.rmtree(item_path)
 
-def run_test_vectorize(models):
+
+def get_absolute_file_path(filename: str) -> str:
+ currentdir = os.path.dirname(os.path.abspath(__file__))
+ abspath = os.path.join(currentdir, filename)
+ return abspath
+
+
+def run_test_vectorize(models, model_type):
+
+ # model_type selects the reference file: embeddings_reference/embeddings_{model_type}_python_3_8.json
+ # Checks that the model.encode output matches both the vectorise() output and the hardcoded
+ # Python 3.8 reference embeddings (the reference comparison is done for string inputs only)
+
+
  sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
  device = "cuda"
  eps = 1e-9
+ embeddings_reference_file = get_absolute_file_path(
+ f"embeddings_reference/embeddings_{model_type}_python_3_8.json"
+ )
+
+ # Load in hardcoded embeddings json file
+ with open(embeddings_reference_file, "r") as f:
+ embeddings_python_3_8 = json.load(f)
+
  with patch.dict(os.environ, {"MARQO_MAX_CUDA_MODEL_MEMORY": "10"}):
  def run():
  for name in models:
@@ -55,7 +77,16 @@ def run():
  if type(output_m) == torch.Tensor:
  output_m = output_m.cpu().numpy()
 
- assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+ # Embeddings must match hardcoded python 3.8.20 embeddings
+ if isinstance(sentence, str):
+ try:
+ assert np.allclose(output_m, embeddings_python_3_8[name][sentence], atol=1e-6)
+ except KeyError:
+ raise KeyError(f"Hardcoded Python 3.8 embeddings not found for "
+ f"model: {name}, sentence: {sentence} in JSON file: "
+ f"{embeddings_reference_file}")
+
+ assert np.allclose(output_m, output_v, atol=eps)
 
  clear_loaded_models()
  torch.cuda.empty_cache()
@@ -67,6 +98,7 @@ def run():
 
  assert run()
 
+
 def run_test_model_outputs(models):
  sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
  device = "cuda"
@@ -155,8 +187,7 @@ def tearDownClass(cls) -> None:
 
  def test_vectorize(self):
  # For GPU Memory Optimization, we shouldn't load all models at once
- for model_name in self.models:
- run_test_vectorize(models=[model_name])
+ run_test_vectorize(models=self.models, model_type="large_open_clip")
 
  def test_load_clip_text_model(self):
  device = "cuda"
@@ -224,8 +255,7 @@ def tearDownClass(cls) -> None:
 
  def test_vectorize(self):
  # For GPU Memory Optimization, we shouldn't load all models at once
- for model_name in self.models:
- run_test_vectorize(models=[model_name])
+ run_test_vectorize(models=self.models, model_type="large_e5")
 
  def test_model_outputs(self):
  for model_name in self.models:
@@ -259,8 +289,7 @@ def tearDownClass(cls) -> None:
 
  def test_vectorize(self):
  # For GPU Memory Optimization, we shouldn't load all models at once
- for model_name in self.models:
- run_test_vectorize(models=[model_name])
+ run_test_vectorize(models=self.models, model_type="large_bge")
 
  def test_model_outputs(self):
  for model_name in self.models:
@@ -294,8 +323,7 @@ def tearDownClass(cls) -> None:
 
  def test_vectorize(self):
  # For GPU Memory Optimization, we shouldn't load all models at once
- for model_name in self.models:
- run_test_vectorize(models=[model_name])
+ run_test_vectorize(models=self.models, model_type="large_snowflake")
 
  def test_model_outputs(self):
  for model_name in self.models:
@@ -334,8 +362,7 @@ def tearDownClass(cls) -> None:
 
  def test_vectorize(self):
  # For GPU Memory Optimization, we shouldn't load all models at once
- for model_name in self.models:
- run_test_vectorize(models=[model_name])
+ run_test_vectorize(models=self.models, model_type="large_multilingual_e5")
 
  def test_model_outputs(self):
  for model_name in self.models: