Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ dependencies:
- ftfy=6.1
- httpx=0.23.0
- isort=5.12.0
- langdetect=1.0
- loguru=0.6
- matplotlib=3.7.1
- multiprocess=0.70.15
Expand All @@ -44,6 +43,7 @@ dependencies:
- redis-py=4.3
- rope=1.9.0
- scikit-learn=1.3.2
- starlette==0.32.0.post1
- sqlalchemy-utils=0.41.1
- sqlalchemy=2.0.25
- srsly=2.4.8
Expand Down
8 changes: 5 additions & 3 deletions backend/src/app/core/authorization/oauth_service.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import random
import string

from authlib.integrations.starlette_client import OAuth, OAuthError
from fastapi import Request
from loguru import logger
Expand Down Expand Up @@ -68,9 +71,8 @@ async def authenticate_oidc(self, request: Request) -> UserORM:
last_name=userinfo.get("family_name", "Unknown"),
# Set a random password since we'll only use OIDC
password="".join(
__import__("random").choices(
__import__("string").ascii_letters
+ __import__("string").digits,
random.choices(
string.ascii_letters + string.digits,
k=32,
)
),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,38 @@
from langdetect import detect_langs
from loguru import logger

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
from app.preprocessing.ray_model_service import RayModelService
from app.preprocessing.ray_model_worker.dto.glotlid import GlotLIDInput, GlotLIDOutput

rms = RayModelService()


def detect_content_language(cargo: PipelineCargo) -> PipelineCargo:
    """Detect the language of the cargo's text document using GlotLID.

    Stores an ISO 639-1 language code under ``pptd.metadata["language"]``.
    Documents that already carry a "language" entry are left untouched.
    Unsupported languages and any detection error fall back to "en" so this
    best-effort step never fails the preprocessing pipeline.
    """
    pptd: PreProTextDoc = cargo.data["pptd"]
    if "language" not in pptd.metadata:
        try:
            glotlid_input = GlotLIDInput(text=pptd.text)
            glotlid_output: GlotLIDOutput = rms.language_identification(glotlid_input)

            # map the GlotLID language code to the ISO 639-1 language code we
            # support in our spaCy Pipeline
            # TODO: we should set this in a config file or so
            code_map = {
                "eng_Latn": "en",
                "deu_Latn": "de",
                "ita_Latn": "it",
            }

            # dict.get already defaults to None for unknown codes
            lang_code = code_map.get(glotlid_output.best_match.lang_code)
            if lang_code is None:
                logger.warning(
                    f"Unsupported language of {pptd.filename}: {glotlid_output.best_match}"
                )
                lang_code = "en"

            pptd.metadata["language"] = lang_code
        except Exception as e:
            # best effort: log and default to English rather than failing the pipeline
            logger.warning(f"Cannot detect language of {pptd.filename}! {e}")
            pptd.metadata["language"] = "en"
Expand Down
7 changes: 7 additions & 0 deletions backend/src/app/preprocessing/ray_model_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
DETRImageInput,
DETRObjectDetectionOutput,
)
from app.preprocessing.ray_model_worker.dto.glotlid import GlotLIDInput, GlotLIDOutput
from app.preprocessing.ray_model_worker.dto.quote import QuoteJobInput, QuoteJobOutput
from app.preprocessing.ray_model_worker.dto.seqsenttagger import (
SeqSentTaggerJobInput,
Expand Down Expand Up @@ -161,3 +162,9 @@ def quote_prediction(self, input: QuoteJobInput) -> QuoteJobOutput:
"/quote/predict", input.model_dump()
)
return QuoteJobOutput.model_validate(response.json())

def language_identification(self, input: GlotLIDInput) -> GlotLIDOutput:
response = self._make_post_request_with_json_data(
"/glotlid/lid", input.model_dump()
)
return GlotLIDOutput.model_validate(response.json())
7 changes: 6 additions & 1 deletion backend/src/app/preprocessing/ray_model_worker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
RUN sudo apt-get update -q && sudo apt-get install -q -y --no-install-recommends\
ffmpeg curl

# install uv and cache dependencies (this drastically (!) reduces build time)
RUN --mount=type=cache,target=/root/.cache pip install uv

COPY requirements.txt /tmp/requirements.txt

RUN pip install -r /tmp/requirements.txt
# install the dependencies from requirements.txt via uv (cached across builds)
RUN --mount=type=cache,target=/root/.cache uv pip install -r /tmp/requirements.txt --system


# copy source code into the image
WORKDIR /dats_code_ray
Expand Down
28 changes: 28 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/apps/glotlid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import logging

from dto.glotlid import GlotLIDInput, GlotLIDOutput
from fastapi import FastAPI
from models.glotlid import GlotLIDModel
from ray import serve
from ray.serve.handle import DeploymentHandle

# Use Ray Serve's logger so messages appear in the Serve logs.
logger = logging.getLogger("ray.serve")

# FastAPI app that Ray Serve uses as HTTP ingress for the GlotLIDApi deployment.
api = FastAPI()


@serve.deployment(num_replicas=1, route_prefix="/glotlid")
@serve.ingress(api)
class GlotLIDApi:
    """Ray Serve deployment exposing GlotLID language identification over HTTP."""

    def __init__(self, glotlid_model_handle: DeploymentHandle) -> None:
        # Handle to the deployment that performs the actual GlotLID inference.
        self._model = glotlid_model_handle

    @api.post("/lid", response_model=GlotLIDOutput)
    async def language_identification(self, input: GlotLIDInput):
        """Identify the language(s) of the input text and return the predictions."""
        return await self._model.identify_language.remote(input)  # type: ignore


# Ray Serve application entry point: binds the HTTP-facing GlotLIDApi
# deployment to the underlying GlotLIDModel deployment.
app = GlotLIDApi.bind(
    glotlid_model_handle=GlotLIDModel.bind(),
)
11 changes: 11 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/config_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,14 @@ quote:
autoscaling_config:
min_replicas: 1
max_replicas: 1

glotlid:
model: "cis-lmu/glotlid"
version: "latest"
# the device is always CPU!
deployment:
ray_actor_options:
num_gpus: .0
autoscaling_config:
min_replicas: 1
max_replicas: 1
11 changes: 11 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/config_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,14 @@ quote:
autoscaling_config:
min_replicas: 1
max_replicas: 1

glotlid:
model: "cis-lmu/glotlid"
version: "latest"
# the device is always CPU!
deployment:
ray_actor_options:
num_gpus: .0
autoscaling_config:
min_replicas: 1
max_replicas: 1
41 changes: 41 additions & 0 deletions backend/src/app/preprocessing/ray_model_worker/dto/glotlid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from typing import List

from pydantic import BaseModel, Field


class DetectedLanguage(BaseModel):
    """A single language prediction produced by the GlotLID model."""

    # GlotLID-style code: language identifier plus script suffix (e.g. "eng_Latn").
    lang_code: str = Field(examples=["eng_Latn", "deu_Latn"])
    # Human-readable English name of the language.
    lang_name: str = Field(examples=["English", "German"])
    # Model confidence score — presumably in [0, 1]; TODO confirm against the model.
    confidence: float = Field(examples=[0.9, 0.8])


class GlotLIDInput(BaseModel):
    """Request payload for the GlotLID language-identification endpoint."""

    text: str = Field(
        examples=["Some random text. E.g., the content of an SDoc or any other text."],
        description="The text for which the language should be detected.",
    )
    top_k: int = Field(
        default=3,
        examples=[3],
        description="The number of top languages to return.",
    )


class GlotLIDOutput(BaseModel):
    """Response payload of the GlotLID language-identification endpoint."""

    # The single most likely language for the input text.
    best_match: DetectedLanguage = Field(
        examples=[
            DetectedLanguage(lang_code="eng_Latn", lang_name="English", confidence=0.9)
        ]
    )
    # All returned predictions — presumably the top_k candidates ordered by
    # confidence; TODO confirm ordering against the model implementation.
    detected_languages: List[DetectedLanguage] = Field(
        examples=[
            [
                DetectedLanguage(
                    lang_code="eng_Latn", lang_name="English", confidence=0.9
                ),
                DetectedLanguage(
                    lang_code="deu_Latn", lang_name="German", confidence=0.1
                ),
            ]
        ]
    )
Loading