marqo-ai · wanliAlex · Aug 16, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/.github/workflows/largemodel_unit_test_CI.yml b/.github/workflows/largemodel_unit_test_CI.yml
@@ -150,6 +150,10 @@ jobs:
  export MARQO_MAX_CPU_MODEL_MEMORY=15
  export MARQO_MAX_CUDA_MODEL_MEMORY=15
  
+ export PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID=${{ secrets.PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID }}
+ export PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY=${{ secrets.PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY }}
+ export PRIVATE_MODEL_TESTS_HF_TOKEN=${{ secrets.PRIVATE_MODEL_TESTS_HF_TOKEN }}
+ 
  export PYTHONPATH="./marqo/tests:./marqo/src:./marqo"
  pytest marqo/tests --largemodel --ignore=marqo/tests/test_documentation.py
 

diff --git a/.github/workflows/unit_test_200gb_CI.yml b/.github/workflows/unit_test_200gb_CI.yml
@@ -155,6 +155,10 @@ jobs:
  export VESPA_DOCUMENT_URL=http://localhost:8080
  export VESPA_QUERY_URL=http://localhost:8080
  
+ export PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID=${{ secrets.PRIVATE_MODEL_TESTS_AWS_ACCESS_KEY_ID }}
+ export PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY=${{ secrets.PRIVATE_MODEL_TESTS_AWS_SECRET_ACCESS_KEY }}
+ export PRIVATE_MODEL_TESTS_HF_TOKEN=${{ secrets.PRIVATE_MODEL_TESTS_HF_TOKEN }}
+ 
  cd marqo
  export PYTHONPATH="./tests:./src:."
  pytest --ignore=tests/test_documentation.py --durations=100 --cov=src --cov-branch --cov-context=test --cov-report=html:cov_html --cov-report=lcov:lcov.info tests

diff --git a/...s2_inference/model_downloading/from_hf.py → .../core/inference/download_model_from_hf.py b/...s2_inference/model_downloading/from_hf.py → .../core/inference/download_model_from_hf.py
@@ -2,7 +2,7 @@
 from typing import Optional
 from huggingface_hub import hf_hub_download
 from marqo.s2_inference.logger import get_logger
-from huggingface_hub.errors import RepositoryNotFoundError
+from huggingface_hub.utils import RepositoryNotFoundError
 from marqo.s2_inference.errors import ModelDownloadError
 
 logger = get_logger(__name__)

diff --git a/...s2_inference/model_downloading/from_s3.py → .../core/inference/download_model_from_s3.py b/...s2_inference/model_downloading/from_s3.py → .../core/inference/download_model_from_s3.py
diff --git a/src/marqo/core/inference/models/__init__.py → ...re/inference/inference_models/__init__.py b/src/marqo/core/inference/models/__init__.py → ...re/inference/inference_models/__init__.py
diff --git a/...e/inference/models/abstract_clip_model.py → ...e/inference_models/abstract_clip_model.py b/...e/inference/models/abstract_clip_model.py → ...e/inference_models/abstract_clip_model.py
@@ -1,13 +1,15 @@
 from abc import abstractmethod
 
+import numpy as np
+import torch
 from PIL import UnidentifiedImageError
 
-from marqo.core.inference.models.abstract_embedding_model import AbstractEmbeddingModel
-from marqo.s2_inference.types import *
-from marqo.core.inference.image_download import (_is_image, format_and_load_CLIP_images,
- format_and_load_CLIP_image)
+from marqo.core.inference.inference_models.abstract_embedding_model import AbstractEmbeddingModel
+from marqo.core.inference.inference_models.image_download import (_is_image, format_and_load_CLIP_images,
+ format_and_load_CLIP_image)
 from marqo.s2_inference.logger import get_logger
-import torch
+from marqo.s2_inference.types import *
+from marqo.tensor_search.models.private_models import ModelAuth
 
 logger = get_logger(__name__)
 
@@ -25,14 +27,14 @@ class AbstractCLIPModel(AbstractEmbeddingModel):
  """
 
  def __init__(self, device: Optional[str] = None, model_properties: Optional[dict] = None,
- model_auth: Optional[dict] = None):
+ model_auth: Optional[ModelAuth] = None):
  """Instantiate the abstract CLIP model.
 
  Args:
  device (str): The device to load the model on, typically 'cpu' or 'cuda'.
  model_properties (dict): A dictionary containing additional properties or configurations
  specific to the model. Defaults to an empty dictionary if not provided.
- model_auth (dict): The authentication information for the model. Defaults to `None` if not provided
+ model_auth (ModelAuth): The authentication information for the model. Defaults to `None` if not provided
  """
 
  super().__init__(model_properties, device, model_auth)
@@ -42,20 +44,20 @@ def __init__(self, device: Optional[str] = None, model_properties: Optional[dict
  self.preprocess = None
 
  @abstractmethod
- def encode_text(self, inputs: Union[str, List[str]], normalize: bool = True) -> FloatTensor:
+ def encode_text(self, inputs: Union[str, List[str]], normalize: bool = True) -> np.ndarray:
  pass
 
  @abstractmethod
- def encode_image(self, inputs, normalize: bool = True, image_download_headers: dict = None) -> FloatTensor:
+ def encode_image(self, inputs, normalize: bool = True, image_download_headers: dict = None) -> np.ndarray:
  pass
 
  def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]],
- default: str = 'text', normalize=True, **kwargs) -> FloatTensor:
+ default: str = 'text', normalize=True, **kwargs) -> np.ndarray:
  infer = kwargs.pop('infer', True)
-
  if infer and _is_image(inputs):
  is_image = True
  else:
+ is_image = False
  if default == 'text':
  is_image = False
  elif default == 'image':

diff --git a/...erence/models/abstract_embedding_model.py → ...erence_models/abstract_embedding_model.py b/...erence/models/abstract_embedding_model.py → ...erence_models/abstract_embedding_model.py
@@ -1,12 +1,14 @@
 from abc import ABC, abstractmethod
 from typing import Optional
 
+from marqo.tensor_search.models.private_models import ModelAuth
+
 
 class AbstractEmbeddingModel(ABC):
  """This is the abstract base class for all models in Marqo."""
 
  def __init__(self, model_properties: Optional[dict] = None, device: Optional[str] = None,
- model_auth: Optional[dict] = None):
+ model_auth: Optional[ModelAuth] = None):
  """Load the model with the given properties.
 
  Args:
@@ -20,7 +22,6 @@ def __init__(self, model_properties: Optional[dict] = None, device: Optional[str
  if model_properties is None:
  model_properties = dict()
 
- self.model_properties = self._build_model_properties(model_properties)
  self.device = device
  self.model_auth = model_auth
 
@@ -33,11 +34,6 @@ def load(self):
  self._load_necessary_components()
  self._check_loaded_components()
 
- @abstractmethod
- def _build_model_properties(self, model_properties: dict):
- """Parse the model properties from the user input and convert it to a pydantic model."""
- pass
-
  @abstractmethod
  def _load_necessary_components(self):
  """Load the necessary components for the model."""
@@ -54,4 +50,5 @@ def _check_loaded_components(self):
 
  @abstractmethod
  def encode(self):
- pass
+ """Encode the input data."""
+ pass
diff --git a/src/marqo/core/inference/inference_models/hf_tokenizer.py b/src/marqo/core/inference/inference_models/hf_tokenizer.py
@@ -0,0 +1,33 @@
+import html
+from typing import Union, List
+
+import ftfy
+import regex as re
+import torch
+
+
+def whitespace_clean(text):
+    """Collapse each run of whitespace in `text` into one space and trim both ends."""
+    return re.sub(r'\s+', ' ', text).strip()
+
+def basic_clean(text):
+    """Run `ftfy.fix_text` on `text`, HTML-unescape the result twice, and strip it."""
+    repaired = ftfy.fix_text(text)
+    # Two passes so that doubly-escaped entities (e.g. "&amp;lt;") are fully decoded.
+    unescaped_once = html.unescape(repaired)
+    return html.unescape(unescaped_once).strip()
+
+class HFTokenizer:
+    """Callable wrapper around a HuggingFace tokenizer.
+
+    Check https://github.com/mlfoundations/open_clip/blob/16e229c596cafaec46a4defaf27e0e30ffcca12d/src/open_clip/tokenizer.py#L188-L201
+    """
+
+    def __init__(self, tokenizer_name: str):
+        """Load the tokenizer named `tokenizer_name` with `AutoTokenizer.from_pretrained`."""
+        # transformers is imported at construction time rather than at module import.
+        from transformers import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+    def __call__(self, texts: Union[str, List[str]]) -> torch.Tensor:
+        """Clean and tokenize a single text or a list of texts and return the `input_ids`.
+
+        The cleaning is the same as for the default tokenizer except that the text
+        is not lowercased; lowercasing (for case-sensitive tokenizers) would be more
+        robust but less sensitive to nuance.
+        """
+        batch = [texts] if isinstance(texts, str) else texts
+        cleaned = [whitespace_clean(basic_clean(item)) for item in batch]
+        encoded = self.tokenizer(cleaned, return_tensors='pt', padding='max_length', truncation=True)
+        return encoded.input_ids