Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ dependencies:
- ftfy=6.1
- httpx=0.23.0
- isort=5.12.0
- langdetect=1.0
- loguru=0.6
- matplotlib=3.7.1
- multiprocess=0.70.15
Expand All @@ -44,6 +43,7 @@ dependencies:
- redis-py=4.3
- rope=1.9.0
- scikit-learn=1.3.2
- starlette==0.32.0.post1
- sqlalchemy-utils=0.41.1
- sqlalchemy=2.0.25
- srsly=2.4.8
Expand Down
8 changes: 5 additions & 3 deletions backend/src/app/core/authorization/oauth_service.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import random
import string

from authlib.integrations.starlette_client import OAuth, OAuthError
from fastapi import Request
from loguru import logger
Expand Down Expand Up @@ -68,9 +71,8 @@ async def authenticate_oidc(self, request: Request) -> UserORM:
last_name=userinfo.get("family_name", "Unknown"),
# Set a random password since we'll only use OIDC
password="".join(
__import__("random").choices(
__import__("string").ascii_letters
+ __import__("string").digits,
random.choices(
string.ascii_letters + string.digits,
k=32,
)
),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,38 @@
from langdetect import detect_langs
from loguru import logger

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
from app.preprocessing.ray_model_service import RayModelService
from app.preprocessing.ray_model_worker.dto.glotlid import GlotLIDInput, GlotLIDOutput

rms = RayModelService()


def detect_content_language(cargo: PipelineCargo) -> PipelineCargo:
    """Detect the language of the cargo's text document using GlotLID.

    Stores an ISO 639-1 language code under ``pptd.metadata["language"]``.
    Documents that already carry a "language" entry are left untouched.
    Unsupported languages and any detection error fall back to "en" so this
    best-effort step never fails the preprocessing pipeline.
    """
    pptd: PreProTextDoc = cargo.data["pptd"]
    if "language" not in pptd.metadata:
        try:
            glotlid_input = GlotLIDInput(text=pptd.text)
            glotlid_output: GlotLIDOutput = rms.language_identification(glotlid_input)

            # map the GlotLID language code to the ISO 639-1 language code we
            # support in our spaCy Pipeline
            # TODO: we should set this in a config file or so
            code_map = {
                "eng_Latn": "en",
                "deu_Latn": "de",
                "ita_Latn": "it",
            }

            # dict.get already defaults to None for unknown codes
            lang_code = code_map.get(glotlid_output.best_match.lang_code)
            if lang_code is None:
                logger.warning(
                    f"Unsupported language of {pptd.filename}: {glotlid_output.best_match}"
                )
                lang_code = "en"

            pptd.metadata["language"] = lang_code
        except Exception as e:
            # best effort: log and default to English rather than failing the pipeline
            logger.warning(f"Cannot detect language of {pptd.filename}! {e}")
            pptd.metadata["language"] = "en"
Expand Down
7 changes: 7 additions & 0 deletions backend/src/app/preprocessing/ray_model_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
DETRImageInput,
DETRObjectDetectionOutput,
)
from app.preprocessing.ray_model_worker.dto.glotlid import GlotLIDInput, GlotLIDOutput
from app.preprocessing.ray_model_worker.dto.quote import QuoteJobInput, QuoteJobOutput
from app.preprocessing.ray_model_worker.dto.seqsenttagger import (
SeqSentTaggerJobInput,
Expand Down Expand Up @@ -161,3 +162,9 @@ def quote_prediction(self, input: QuoteJobInput) -> QuoteJobOutput:
"/quote/predict", input.model_dump()
)
return QuoteJobOutput.model_validate(response.json())

def language_identification(self, input: GlotLIDInput) -> GlotLIDOutput:
response = self._make_post_request_with_json_data(
"/glotlid/lid", input.model_dump()
)
return GlotLIDOutput.model_validate(response.json())
7 changes: 6 additions & 1 deletion backend/src/app/preprocessing/ray_model_worker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
RUN sudo apt-get update -q && sudo apt-get install -q -y --no-install-recommends\
ffmpeg curl

# install uv and cache dependencies (this drastically (!) reduces build time)
RUN --mount=type=cache,target=/root/.cache pip install uv

COPY requirements.txt /tmp/requirements.txt

RUN pip install -r /tmp/requirements.txt
# install the dependencies from requirements.txt via uv (cached across builds)
RUN --mount=type=cache,target=/root/.cache uv pip install -r /tmp/requirements.txt --system


# copy source code into the image
WORKDIR /dats_code_ray
Expand Down
28 changes: 28 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/apps/glotlid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import logging

from dto.glotlid import GlotLIDInput, GlotLIDOutput
from fastapi import FastAPI
from models.glotlid import GlotLIDModel
from ray import serve
from ray.serve.handle import DeploymentHandle

# Use Ray Serve's logger so messages appear in the Serve logs.
logger = logging.getLogger("ray.serve")

# FastAPI app that Ray Serve uses as HTTP ingress for the GlotLIDApi deployment.
api = FastAPI()


@serve.deployment(num_replicas=1, route_prefix="/glotlid")
@serve.ingress(api)
class GlotLIDApi:
    """Ray Serve deployment exposing GlotLID language identification over HTTP."""

    def __init__(self, glotlid_model_handle: DeploymentHandle) -> None:
        # Handle to the deployment that performs the actual GlotLID inference.
        self._model = glotlid_model_handle

    @api.post("/lid", response_model=GlotLIDOutput)
    async def language_identification(self, input: GlotLIDInput):
        """Identify the language(s) of the input text and return the predictions."""
        return await self._model.identify_language.remote(input)  # type: ignore


# Ray Serve application entry point: binds the HTTP-facing GlotLIDApi
# deployment to the underlying GlotLIDModel deployment.
app = GlotLIDApi.bind(
    glotlid_model_handle=GlotLIDModel.bind(),
)
11 changes: 11 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/config_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,14 @@ quote:
autoscaling_config:
min_replicas: 1
max_replicas: 1

glotlid:
model: "cis-lmu/glotlid"
version: "latest"
# the device is always CPU!
deployment:
ray_actor_options:
num_gpus: .0
autoscaling_config:
min_replicas: 1
max_replicas: 1
11 changes: 11 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/config_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,14 @@ quote:
autoscaling_config:
min_replicas: 1
max_replicas: 1

glotlid:
model: "cis-lmu/glotlid"
version: "latest"
# the device is always CPU!
deployment:
ray_actor_options:
num_gpus: .0
autoscaling_config:
min_replicas: 1
max_replicas: 1
41 changes: 41 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/dto/glotlid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from typing import List

from pydantic import BaseModel, Field


class DetectedLanguage(BaseModel):
    """A single language prediction produced by the GlotLID model."""

    # GlotLID-style code: language identifier plus script suffix (e.g. "eng_Latn").
    lang_code: str = Field(examples=["eng_Latn", "deu_Latn"])
    # Human-readable English name of the language.
    lang_name: str = Field(examples=["English", "German"])
    # Model confidence score — presumably in [0, 1]; TODO confirm against the model.
    confidence: float = Field(examples=[0.9, 0.8])


class GlotLIDInput(BaseModel):
    """Request payload for the GlotLID language-identification endpoint."""

    text: str = Field(
        examples=["Some random text. E.g., the content of an SDoc or any other text."],
        description="The text for which the language should be detected.",
    )
    top_k: int = Field(
        default=3,
        examples=[3],
        description="The number of top languages to return.",
    )


class GlotLIDOutput(BaseModel):
    """Response payload of the GlotLID language-identification endpoint."""

    # The single most likely language for the input text.
    best_match: DetectedLanguage = Field(
        examples=[
            DetectedLanguage(lang_code="eng_Latn", lang_name="English", confidence=0.9)
        ]
    )
    # All returned predictions — presumably the top_k candidates ordered by
    # confidence; TODO confirm ordering against the model implementation.
    detected_languages: List[DetectedLanguage] = Field(
        examples=[
            [
                DetectedLanguage(
                    lang_code="eng_Latn", lang_name="English", confidence=0.9
                ),
                DetectedLanguage(
                    lang_code="deu_Latn", lang_name="German", confidence=0.1
                ),
            ]
        ]
    )
Loading