
Commit 906657b

ML job to recompute & reindex sentence embeddings (#552)
* ML job to recompute & reindex sentence embeddings
1 parent 0174963 commit 906657b

File tree

13 files changed: +246 -27 lines changed

backend/src/app/core/data/crud/source_document_job_status.py

Lines changed: 26 additions & 1 deletion
@@ -1,3 +1,9 @@
+from typing import List
+
+from fastapi.encoders import jsonable_encoder
+from sqlalchemy import and_, false, or_
+from sqlalchemy.orm import Session
+
 from app.core.data.crud.crud_base import CRUDBase
 from app.core.data.dto.source_document_job_status import (
     SourceDocumentJobStatusCreate,
@@ -13,7 +19,26 @@ class CRUDSourceDocumentJobStatus(
         SourceDocumentJobStatusUpdate,
     ]
 ):
-    pass
+    def create_multi(
+        self, db: Session, *, create_dtos: List[SourceDocumentJobStatusCreate]
+    ) -> List[SourceDocumentJobStatusORM]:
+        db_objs = [self.model(**jsonable_encoder(x)) for x in create_dtos]
+        q = db.query(self.model).where(
+            or_(
+                false(),
+                *[
+                    and_(
+                        SourceDocumentJobStatusORM.id == x.id,
+                        SourceDocumentJobStatusORM.type == x.type,
+                    )
+                    for x in create_dtos
+                ],
+            )
+        )
+        q.delete()
+        db.add_all(db_objs)
+        db.commit()
+        return db_objs
 
 
 crud_sdoc_job_status = CRUDSourceDocumentJobStatus(SourceDocumentJobStatusORM)
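Note on create_multi: it behaves like a bulk upsert keyed on (id, type). Matching status rows are deleted first, then the fresh ones are inserted, and the false() seed keeps or_() valid when create_dtos is empty. A minimal usage sketch (session handling and values are illustrative, mirroring the call made by the sentence-embedding job below):

    from datetime import datetime

    from app.core.data.crud.source_document_job_status import crud_sdoc_job_status
    from app.core.data.dto.source_document_job_status import SourceDocumentJobStatusCreate
    from app.core.data.orm.source_document_job_status import JobStatus, JobType

    def mark_sentence_embedding_finished(db, sdoc_ids):
        # Re-running is safe: existing (id, type) rows are removed before insert.
        crud_sdoc_job_status.create_multi(
            db,
            create_dtos=[
                SourceDocumentJobStatusCreate(
                    id=sdoc_id,
                    type=JobType.SENTENCE_EMBEDDING,
                    status=JobStatus.FINISHED,
                    timestamp=datetime.now(),
                )
                for sdoc_id in sdoc_ids
            ],
        )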

backend/src/app/core/data/dto/ml_job.py

Lines changed: 9 additions & 0 deletions
@@ -15,6 +15,7 @@ class MLJobType(StrEnum):
     DOC_TAG_RECOMMENDATION = "DOC_TAG_RECOMMENDATION"
     COREFERENCE_RESOLUTION = "COREFERENCE_RESOLUTION"
     DOCUMENT_EMBEDDING = "DOCUMENT_EMBEDDING"
+    SENTENCE_EMBEDDING = "SENTENCE_EMBEDDING"
 
 
 class QuotationAttributionParams(BaseModel):
@@ -53,6 +54,13 @@ class DocumentEmbeddingParams(BaseModel):
     )
 
 
+class SentenceEmbeddingParams(BaseModel):
+    ml_job_type: Literal[MLJobType.SENTENCE_EMBEDDING]
+    recompute: bool = Field(
+        default=False, description="Whether to recompute already processed documents"
+    )
+
+
 class MLJobParameters(BaseModel):
     ml_job_type: MLJobType = Field(description="The type of the MLJob")
     project_id: int = Field(description="The ID of the Project to analyse")
@@ -61,6 +69,7 @@
         DocTagRecommendationParams,
         CoreferenceResolutionParams,
         DocumentEmbeddingParams,
+        SentenceEmbeddingParams,
         None,
     ] = Field(
         description="Specific parameters for the MLJob w.r.t it's type",

backend/src/app/core/data/orm/source_document_job_status.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ class JobType(IntEnum):
     QUOTATION_ATTRIBUTION = 100
     COREFERENCE_RESOLUTION = 101
     DOCUMENT_EMBEDDING = 102
+    SENTENCE_EMBEDDING = 103
 
 
 class JobStatus(IntEnum):

backend/src/app/core/db/simsearch_service.py

Lines changed: 3 additions & 12 deletions
@@ -1,18 +1,9 @@
-from typing import (
-    Any,
-    Dict,
-    List,
-    Optional,
-    Union,
-)
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 from loguru import logger
 
-from app.core.data.dto.search import (
-    SimSearchImageHit,
-    SimSearchSentenceHit,
-)
+from app.core.data.dto.search import SimSearchImageHit, SimSearchSentenceHit
 from app.core.ml.embedding_service import EmbeddingService
 from app.core.vector.crud.image_embedding import crud_image_embedding
 from app.core.vector.crud.sentence_embedding import crud_sentence_embedding
@@ -44,7 +35,7 @@ def _encode_query(
             query_emb = (
                 self.emb.encode_document(" ".join(text_query))
                 if document_query
-                else self.emb.encode_sentences(sentences=text_query)
+                else self.emb.encode_sentences(sentences=text_query)[0]
             )
         elif image_query_id is not None:
             query_emb = self.emb.encode_image(sdoc_id=image_query_id)
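The new [0] index is needed because encode_sentences no longer squeezes single-sentence batches (see embedding_service.py below); it now always returns one embedding row per input sentence. A shape-only sketch (the dimension 512 is an assumption, not taken from this diff):

    import numpy as np

    # encode_sentences now consistently returns shape (num_sentences, dim)
    batch = np.zeros((1, 512))  # one query sentence, illustrative embedding dim
    query_emb = batch[0]        # shape (dim,), as _encode_query expects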

backend/src/app/core/ml/embedding_service.py

Lines changed: 85 additions & 5 deletions
@@ -18,7 +18,9 @@
 from app.core.data.repo.utils import image_to_base64, load_image
 from app.core.db.sql_service import SQLService
 from app.core.vector.crud.document_embedding import crud_document_embedding
+from app.core.vector.crud.sentence_embedding import crud_sentence_embedding
 from app.core.vector.dto.document_embedding import DocumentObjectIdentifier
+from app.core.vector.dto.sentence_embedding import SentenceObjectIdentifier
 from app.core.vector.weaviate_service import WeaviateService
 from app.preprocessing.ray_model_service import RayModelService
 from app.preprocessing.ray_model_worker.dto.clip import (
@@ -46,10 +48,7 @@ def encode_sentences(self, sentences: List[str]) -> np.ndarray:
         encoded_query = self.rms.clip_text_embedding(
             ClipTextEmbeddingInput(text=sentences)
         )
-        if len(encoded_query.embeddings) == 1:
-            return encoded_query.numpy().squeeze()
-        else:
-            return encoded_query.numpy()
+        return encoded_query.numpy()
 
     def encode_image(self, sdoc_id: int) -> np.ndarray:
         with self.sqls.db_session() as db:
@@ -69,6 +68,87 @@ def encode_image(self, sdoc_id: int) -> np.ndarray:
             )
         return encoded_query.numpy().squeeze()
 
+    def embed_sentences(
+        self, project_id: int, filter_criterion: ColumnElement, recompute=False
+    ) -> int:
+        total_processed = 0
+        num_processed = -1
+
+        with self.weaviate.weaviate_session() as client:
+            if recompute:
+                crud_sentence_embedding.remove_embeddings_by_project(client, project_id)
+
+            while num_processed != 0:
+                num_processed = self._process_sentences_batch(
+                    client,
+                    filter_criterion,
+                    project_id,
+                )
+                total_processed += num_processed
+        return total_processed
+
+    def _process_sentences_batch(
+        self,
+        client: WeaviateClient,
+        filter_criterion: ColumnElement,
+        project_id: int,
+        batch_size=16,
+    ):
+        with self.sqls.db_session() as db:
+            query = (
+                db.query(SourceDocumentDataORM)
+                .outerjoin(
+                    SourceDocumentJobStatusORM,
+                    and_(
+                        SourceDocumentJobStatusORM.id == SourceDocumentDataORM.id,
+                        SourceDocumentJobStatusORM.type == JobType.SENTENCE_EMBEDDING,
+                    ),
+                    full=True,
+                )
+                .filter(filter_criterion)
+                .limit(batch_size)
+            )
+            sdoc_data = query.all()
+            doc_sentences = [doc.sentences for doc in sdoc_data]
+            sdoc_ids = [doc.id for doc in sdoc_data]
+            num_docs = len(doc_sentences)
+
+            if num_docs == 0:
+                return num_docs
+
+            # Embed the sentences for a batch of documents
+            embeddings = self.encode_sentences(
+                [s for sents in doc_sentences for s in sents]
+            ).tolist()
+
+            ids = [
+                SentenceObjectIdentifier(sdoc_id=sdoc_id, sentence_id=i)
+                for sdoc_id, sents in zip(sdoc_ids, doc_sentences)
+                for i in range(len(sents))
+            ]
+
+            # Store the embeddings of a batch of documents
+            crud_sentence_embedding.add_embedding_batch(
+                client,
+                project_id,
+                ids=ids,
+                embeddings=embeddings,
+            )
+
+            crud_sdoc_job_status.create_multi(
+                db,
+                create_dtos=[
+                    SourceDocumentJobStatusCreate(
                        id=id,
+                        type=JobType.SENTENCE_EMBEDDING,
+                        status=JobStatus.FINISHED,
+                        timestamp=datetime.now(),
+                    )
+                    for id in sdoc_ids
+                ],
+            )
+            return num_docs
+
     def embed_documents(
         self, project_id: int, filter_criterion: ColumnElement, recompute=False
     ) -> int:
@@ -88,7 +168,7 @@ def embed_documents(
                     project_id,
                     force_override=(recompute and (total_processed == 0)),
                 )
-                total_processed = +num_processed
+                total_processed += num_processed
         return total_processed
 
     def _process_document_batch(
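embed_sentences mirrors embed_documents: it embeds documents in batches of 16 until the filter criterion matches nothing, and writes a FINISHED SENTENCE_EMBEDDING status row per document so that, with a criterion like the one sketched below, processed documents drop out of later batches; the return value counts documents, not sentences. A call sketch (the filter shown is an assumption; the real criterion comes from MLService._build_filter_criterion):

    from sqlalchemy import or_

    from app.core.data.orm.source_document_job_status import (
        JobStatus,
        SourceDocumentJobStatusORM,
    )
    from app.core.ml.embedding_service import EmbeddingService

    # Assumption: select documents that have no FINISHED sentence-embedding
    # status yet (the outer join in _process_sentences_batch makes these NULL).
    filter_criterion = or_(
        SourceDocumentJobStatusORM.status.is_(None),
        SourceDocumentJobStatusORM.status != JobStatus.FINISHED,
    )

    num_docs = EmbeddingService().embed_sentences(
        project_id=1,  # illustrative
        filter_criterion=filter_criterion,
        recompute=False,
    )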

backend/src/app/core/ml/ml_service.py

Lines changed: 14 additions & 0 deletions
@@ -12,6 +12,7 @@
     MLJobType,
     MLJobUpdate,
     QuotationAttributionParams,
+    SentenceEmbeddingParams,
 )
 from app.core.data.orm.source_document_job_status import (
     JobStatus,
@@ -122,6 +123,19 @@ def start_ml_job_sync(self, ml_job_id: str) -> MLJobRead:
                 EmbeddingService().embed_documents(
                     mlj.parameters.project_id, filter_criterion, recompute
                 )
+            case MLJobType.SENTENCE_EMBEDDING:
+                assert isinstance(
+                    mlj.parameters.specific_ml_job_parameters,
+                    SentenceEmbeddingParams,
+                ), "SentenceEmbeddingParams expected"
+                recompute = mlj.parameters.specific_ml_job_parameters.recompute
+                filter_criterion = self._build_filter_criterion(
+                    start_time, recompute
+                )
+                EmbeddingService().embed_sentences(
+                    mlj.parameters.project_id, filter_criterion, recompute
+                )
+
         mlj = self._update_ml_job(
             ml_job_id, MLJobUpdate(status=BackgroundJobStatus.FINISHED)
         )

backend/src/app/preprocessing/pipeline/steps/common/storage/index_text_document_for_simsearch.py

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@ def index_text_document_for_simsearch(cargo: PipelineCargo) -> PipelineCargo:
     if len(sentences) > 0:
         # embed the sentences
         embeddings = emb.encode_sentences(sentences=sentences).tolist()
-        embeddings = embeddings if len(sentences) > 1 else [embeddings]
 
         # store the embeddings
         logger.debug(

frontend/src/api/openapi/models/MLJobParameters_Input.ts

Lines changed: 8 additions & 1 deletion
@@ -7,6 +7,7 @@ import type { DocTagRecommendationParams } from "./DocTagRecommendationParams";
 import type { DocumentEmbeddingParams } from "./DocumentEmbeddingParams";
 import type { MLJobType } from "./MLJobType";
 import type { QuotationAttributionParams } from "./QuotationAttributionParams";
+import type { SentenceEmbeddingParams } from "./SentenceEmbeddingParams";
 export type MLJobParameters_Input = {
   /**
    * The type of the MLJob
@@ -20,6 +21,12 @@ export type MLJobParameters_Input = {
    * Specific parameters for the MLJob w.r.t it's type
    */
   specific_ml_job_parameters:
-    | (QuotationAttributionParams | DocTagRecommendationParams | CoreferenceResolutionParams | DocumentEmbeddingParams)
+    | (
+        | QuotationAttributionParams
+        | DocTagRecommendationParams
+        | CoreferenceResolutionParams
+        | DocumentEmbeddingParams
+        | SentenceEmbeddingParams
+      )
     | null;
 };

frontend/src/api/openapi/models/MLJobParameters_Output.ts

Lines changed: 8 additions & 1 deletion
@@ -7,6 +7,7 @@ import type { DocTagRecommendationParams } from "./DocTagRecommendationParams";
 import type { DocumentEmbeddingParams } from "./DocumentEmbeddingParams";
 import type { MLJobType } from "./MLJobType";
 import type { QuotationAttributionParams } from "./QuotationAttributionParams";
+import type { SentenceEmbeddingParams } from "./SentenceEmbeddingParams";
 export type MLJobParameters_Output = {
   /**
    * The type of the MLJob
@@ -20,6 +21,12 @@ export type MLJobParameters_Output = {
    * Specific parameters for the MLJob w.r.t it's type
    */
   specific_ml_job_parameters:
-    | (QuotationAttributionParams | DocTagRecommendationParams | CoreferenceResolutionParams | DocumentEmbeddingParams)
+    | (
+        | QuotationAttributionParams
+        | DocTagRecommendationParams
+        | CoreferenceResolutionParams
+        | DocumentEmbeddingParams
+        | SentenceEmbeddingParams
+      )
     | null;
 };

frontend/src/api/openapi/models/MLJobType.ts

Lines changed: 1 addition & 0 deletions
@@ -7,4 +7,5 @@ export enum MLJobType {
   DOC_TAG_RECOMMENDATION = "DOC_TAG_RECOMMENDATION",
   COREFERENCE_RESOLUTION = "COREFERENCE_RESOLUTION",
   DOCUMENT_EMBEDDING = "DOCUMENT_EMBEDDING",
+  SENTENCE_EMBEDDING = "SENTENCE_EMBEDDING",
 }
