Merged

Commits (50)
0e5c878
export project metadata and user data when exporting project (full)
AhmadHAW Jun 25, 2024
660ec0a
implement import job for user codes.
AhmadHAW Jul 2, 2024
d80d3b9
starting to implement import tags
AhmadHAW Jul 15, 2024
6acb416
export project and certain sdocs now also exports the relevant metada…
AhmadHAW Jul 16, 2024
eea5e4c
make export metadata more user friendly and implement import tags
AhmadHAW Jul 23, 2024
dbfa19c
removing redundant sdoc medatadata export, change name of sdoc annota…
AhmadHAW Jul 23, 2024
e17b681
when importing tags also applied tags to documents if document filena…
AhmadHAW Jul 23, 2024
139258d
import project project metadata
AhmadHAW Jul 23, 2024
bf26938
import project metadata
AhmadHAW Jul 23, 2024
8309108
adjust import service and text cargo pipeline
AhmadHAW Oct 1, 2024
45838d4
fixing merge conflicts
AhmadHAW Oct 1, 2024
ef3837d
fixing minor bugs
AhmadHAW Oct 8, 2024
6c64aa3
make is_init as obsolete as possible
AhmadHAW Oct 8, 2024
ae33dee
fixinf logs and annotation passing to pptd on init
AhmadHAW Oct 8, 2024
e764ff9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 8, 2024
3c3a6f8
Update OpenAPI spec and client
github-actions[bot] Oct 8, 2024
e28292e
Remving all ids and project names from exported dfs where not needed.…
AhmadHAW Oct 8, 2024
fe96bd5
finish text import pipeline and start image pipeline (still buggy)
AhmadHAW Oct 15, 2024
eab69fa
Update OpenAPI spec and client
github-actions[bot] Oct 15, 2024
fbfef1e
Fixing bbox import
AhmadHAW Oct 22, 2024
62696f1
finishing image pipeline
AhmadHAW Nov 19, 2024
9b3a11a
transcript is no longer an sdoc. transcript is linked to the audio do…
AhmadHAW Nov 28, 2024
9d06c6b
fixed for video pipeline as well
AhmadHAW Nov 28, 2024
06734a3
changing audio video and image pipeline, so transcript is not saved s…
AhmadHAW Dec 3, 2024
0a55aa9
fix export fso import could work on image and text
AhmadHAW Dec 3, 2024
1513956
import audio and video pipeline
AhmadHAW Dec 10, 2024
f263bfa
finishing import export and transcript in original sdoc merge
AhmadHAW Jan 9, 2025
fbfc418
Update OpenAPI spec and client
github-actions[bot] Jan 9, 2025
9c3fd78
add new field to sdoc data: repo_url
bigabig Jan 13, 2025
7f7a54e
update api
bigabig Jan 13, 2025
dcdfb60
updated image / video / audio rendering to use the new repo_url
bigabig Jan 13, 2025
9130fa4
Update OpenAPI spec and client
github-actions[bot] Jan 13, 2025
03e9315
added preprocessing concept readme
bigabig Jan 13, 2025
738c1de
fixing bug where orders changed in payload cargo
AhmadHAW Jan 14, 2025
aa321cd
refactored preprocessing pipeline structure and add word level transc…
AhmadHAW Jan 28, 2025
70b2371
updated api
AhmadHAW Jan 28, 2025
75d4b87
added migration todos
AhmadHAW Jan 28, 2025
c1d0742
updated api
AhmadHAW Jan 28, 2025
2495602
added back annotation for audio and video docs
AhmadHAW Jan 28, 2025
2ac3359
added the import endpoints
AhmadHAW Jan 28, 2025
b67de2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2025
c04c3f9
added sentence annotation export / import
bigabig Jan 29, 2025
867bf98
refactored import & export service, splitting into multiple files
bigabig Jan 29, 2025
528eb8f
add sentencea annotator to video and audio
bigabig Jan 29, 2025
0d8ae15
add sentence annotation comparison to audio and video docs
bigabig Jan 29, 2025
4ec06c6
fixed bug in metadata copying
bigabig Jan 29, 2025
087dea7
Update OpenAPI spec and client
github-actions[bot] Jan 29, 2025
8fde822
updated api
bigabig Jan 29, 2025
5612cbe
fixed typo
bigabig Jan 29, 2025
ddb28ef
fixed typo
bigabig Jan 29, 2025
2 changes: 1 addition & 1 deletion backend/.env.example
@@ -16,7 +16,7 @@ JWT_SECRET=

# Where to store uploaded files.
# <path_to_dats_repo>/docker/backend_repo
REPO_ROOT=/insert_path_to_dats_repo/docker/backend_repo
SHARED_REPO_ROOT=/insert_path_to_dats_repo/docker/backend_repo

# The system user is automatically created and owns automatically generated data.
SYSTEM_USER_EMAIL="[email protected]"
@@ -0,0 +1,45 @@
"""Add token time starts and ends to sdoc data
Revision ID: 4eb64db5a67a
Revises: 050f9378a3e1
Create Date: 2025-01-09 17:00:29.037251

"""

from typing import Sequence, Union

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "4eb64db5a67a"
down_revision: Union[str, None] = "050f9378a3e1"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###

# TODO: if anybody on production uses audio or video files
# - read metadata word-level-transcriptions -> write to corresponding new sdoc_data field
# - read .transcript.txt files -> transfer to sdoc_data content & html of corresponding audio / video data
# - if the transcript was annotated -> move all annotations to corresponding audio / video file annotations

op.add_column(
"sourcedocumentdata",
sa.Column("token_time_starts", postgresql.ARRAY(sa.Integer()), nullable=True),
)
op.add_column(
"sourcedocumentdata",
sa.Column("token_time_ends", postgresql.ARRAY(sa.Integer()), nullable=True),
)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("sourcedocumentdata", "token_time_ends")
op.drop_column("sourcedocumentdata", "token_time_starts")
# ### end Alembic commands ###
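The new `token_time_starts` / `token_time_ends` columns are parallel integer arrays: entry i of each holds the start and end timestamp of token i. The following is a minimal sketch of how word-level transcription data could be flattened into that shape; the `WordLevelTranscription` structure is a hypothetical stand-in, not the project's actual metadata model.

```python
# Hypothetical illustration only: flattening word-level transcript timestamps
# into the two parallel integer arrays added by this revision.
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class WordLevelTranscription:  # assumed shape, not the real DATS model
    text: str
    start_ms: int
    end_ms: int


def to_token_time_arrays(
    words: List[WordLevelTranscription],
) -> Tuple[List[int], List[int]]:
    # token_time_starts[i] / token_time_ends[i] describe token i of the transcript
    return [w.start_ms for w in words], [w.end_ms for w in words]


# prints ([0, 450], [440, 900])
print(to_token_time_arrays([
    WordLevelTranscription("hello", 0, 440),
    WordLevelTranscription("world", 450, 900),
]))
```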
@@ -0,0 +1,71 @@
"""add repo_url to sdocdata

Revision ID: 970c55224a39
Revises: 4eb64db5a67a
Create Date: 2025-01-13 10:21:43.457535

"""

from typing import Sequence, Union

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

from alembic import op
from app.core.data.crud.source_document import crud_sdoc
from app.core.data.doc_type import DocType
from app.core.data.dto.source_document import SourceDocumentRead
from app.core.data.repo.repo_service import RepoService

# revision identifiers, used by Alembic.
revision: str = "970c55224a39"
down_revision: Union[str, None] = "4eb64db5a67a"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# add the column, non-nullable
op.add_column(
"sourcedocumentdata", sa.Column("repo_url", sa.String(), nullable=True)
)

conn = op.get_bind()

# 1. Read all existing project ids
projects = conn.execute(sa.text("SELECT id FROM project")).fetchall()
print("Found projects:", len(projects))

# 2. Read all existing Source Documents
db = sessionmaker(bind=conn)()
for row in projects:
proj_id = row.id
print("Processing project:", proj_id)

sdocs = crud_sdoc.read_by_project(db=db, proj_id=proj_id, only_finished=False) # type: ignore

# 3. Use the repo service to get the URL of the Source Document
urls = []
for sdoc in sdocs:
url = RepoService().get_sdoc_url(
sdoc=SourceDocumentRead.model_validate(sdoc),
relative=True,
webp=sdoc.doctype == DocType.image,
thumbnail=False,
)
urls.append(url)

# 4. Update the repo_url field in the Source Document Data table
for sdoc, url in zip(sdocs, urls):
op.execute(
f"UPDATE sourcedocumentdata SET repo_url = '{url}' WHERE id = {sdoc.id}"
)

# change the column to non-nullable
op.alter_column("sourcedocumentdata", "repo_url", nullable=False)


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("sourcedocumentdata", "repo_url")
# ### end Alembic commands ###
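The per-document UPDATE in `upgrade()` interpolates the URL and id directly into the SQL string. As a sketch only, the same step could be written with bound parameters on the migration's connection, reusing the `conn`, `sdocs`, and `urls` variables defined above:

```python
# Sketch: same update loop with bound parameters instead of string interpolation.
for sdoc, url in zip(sdocs, urls):
    conn.execute(
        sa.text("UPDATE sourcedocumentdata SET repo_url = :url WHERE id = :id"),
        {"url": url, "id": sdoc.id},
    )
```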
156 changes: 156 additions & 0 deletions backend/src/api/endpoints/import_.py
@@ -0,0 +1,156 @@
import uuid

from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from sqlalchemy.orm import Session

from api.dependencies import get_current_user, get_db_session
from app.celery.background_jobs import prepare_and_start_import_job_async
from app.core.authorization.authz_user import AuthzUser
from app.core.data.crud.project import crud_project
from app.core.data.dto.import_job import (
ImportJobParameters,
ImportJobRead,
ImportJobType,
)
from app.core.data.dto.project import ProjectCreate
from app.core.data.dto.user import UserRead
from app.core.data.import_.import_service import ImportService
from app.core.data.repo.repo_service import RepoService

router = APIRouter(
prefix="/import", dependencies=[Depends(get_current_user)], tags=["import"]
)

ims: ImportService = ImportService()
repo: RepoService = RepoService()


@router.post(
"/{proj_id}/codes",
response_model=ImportJobRead,
summary="Starts the import codes job on given project id.",
)
def start_import_codes_job(
*,
# Ahmad: Since we're uploading a file we have to use multipart/form-data directly in the router method (see project put)
proj_id: int,
uploaded_file: UploadFile = File(
...,
description=("CSV file of codes that gets uploaded into project"),
),
authz_user: AuthzUser = Depends(),
) -> ImportJobRead:
authz_user.assert_in_project(proj_id)
if not uploaded_file:
raise HTTPException(
status_code=418,
detail="Missing codes file.",
)
if not __is_file_csv(uploaded_file=uploaded_file):
raise HTTPException(
status_code=415,
detail="Codes need to be in csv format.",
)
user_id = authz_user.user.id
filename = f"import_user_code_{user_id}_{proj_id}.csv"
filepath = repo.get_dst_path_for_temp_file(filename)
filepath = repo.store_uploaded_file(
uploaded_file=uploaded_file, filepath=filepath, fn=filename
)

import_job_params = ImportJobParameters(
proj_id=proj_id,
filename=filename,
user_id=user_id,
import_job_type=ImportJobType.CODES,
)
return prepare_and_start_import_job_async(import_job_params=import_job_params)


@router.post(
"/{proj_id}/tags",
response_model=ImportJobRead,
summary="Starts the import tags job on given project.",
)
def start_import_tags_job(
*,
# Ahmad: Since we're uploading a file we have to use multipart/form-data directly in the router method (see project put)
proj_id: int,
uploaded_file: UploadFile = File(
...,
description=("CSV file of codes that gets uploaded into project"),
),
authz_user: AuthzUser = Depends(),
) -> ImportJobRead:
authz_user.assert_in_project(proj_id)
if not __is_file_csv(uploaded_file=uploaded_file):
raise HTTPException(
status_code=415,
detail="Codes need to be in csv format.",
)
user_id = authz_user.user.id
filename = f"import_tags_{user_id}_{proj_id}.csv"
filepath = repo.get_dst_path_for_temp_file(filename)
filepath = repo.store_uploaded_file(
uploaded_file=uploaded_file, filepath=filepath, fn=filename
)

import_job_params = ImportJobParameters(
proj_id=proj_id,
filename=filename,
user_id=user_id,
import_job_type=ImportJobType.TAGS,
)
return prepare_and_start_import_job_async(import_job_params=import_job_params)


@router.post(
"",
response_model=ImportJobRead,
summary="Starts the import project job on given project",
)
def start_import_project_job(
*,
db: Session = Depends(get_db_session),
uploaded_file: UploadFile = File(
...,
description=("Zip file of project metadata that gets uploaded into project"),
),
current_user: UserRead = Depends(get_current_user),
) -> ImportJobRead:
if not __is_file_zip(uploaded_file=uploaded_file):
raise HTTPException(
status_code=415,
detail="Project need to be in zip format.",
)
user_id = current_user.id
random_temp_project_name = str(uuid.uuid4())
filename = f"import_project_{random_temp_project_name}_for_user_{user_id}.zip"
filepath = repo.get_dst_path_for_temp_file(filename)
filepath = repo.store_uploaded_file(
uploaded_file=uploaded_file, filepath=filepath, fn=filename
)
project_create = ProjectCreate(title=random_temp_project_name, description="")
db_obj = crud_project.create(
db=db, create_dto=project_create, creating_user=current_user
)

import_job_params = ImportJobParameters(
proj_id=db_obj.id,
filename=filename,
user_id=user_id,
import_job_type=ImportJobType.PROJECT,
)
return prepare_and_start_import_job_async(import_job_params=import_job_params)


def __is_file_csv(uploaded_file: UploadFile):
return uploaded_file.content_type == "text/csv"


def __is_file_json(uploaded_file: UploadFile):
return uploaded_file.content_type == "application/json"


def __is_file_zip(uploaded_file: UploadFile):
return uploaded_file.content_type == "application/zip"
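For context, here is a hedged client-side sketch of the new import endpoints. The base URL and bearer token are assumptions; the paths, the `uploaded_file` field name, and the content-type checks mirror the router above.

```python
# Hypothetical client sketch for the new import endpoints (base URL and
# auth header are assumptions; paths and content types follow the router).
import requests

BASE_URL = "http://localhost:5500"             # assumed API host/port
HEADERS = {"Authorization": "Bearer <token>"}  # assumed auth scheme

# Import codes into project 1; anything other than text/csv is rejected with 415.
with open("codes.csv", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/import/1/codes",
        headers=HEADERS,
        files={"uploaded_file": ("codes.csv", f, "text/csv")},
    )
resp.raise_for_status()
job = resp.json()  # ImportJobRead payload, usable for polling the job status

# Import a whole project from a zip export; the endpoint first creates a
# fresh project with a temporary UUID title, which the import job then fills.
with open("project_export.zip", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/import",
        headers=HEADERS,
        files={"uploaded_file": ("project_export.zip", f, "application/zip")},
    )
resp.raise_for_status()
```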
22 changes: 1 addition & 21 deletions backend/src/api/endpoints/source_document.py
@@ -19,7 +19,6 @@
from app.core.data.crud.source_document_metadata import crud_sdoc_meta
from app.core.data.crud.span_annotation import crud_span_anno
from app.core.data.crud.span_group import crud_span_group
from app.core.data.doc_type import DocType
from app.core.data.dto.bbox_annotation import (
BBoxAnnotationRead,
BBoxAnnotationReadResolved,
@@ -91,26 +90,7 @@ def get_by_id_with_data(
crud_sdoc.get_status(db=db, sdoc_id=sdoc_id, raise_error_on_unfinished=True)

sdoc_data = crud_sdoc.read_data(db=db, id=sdoc_id)
if sdoc_data is None:
# if data is none, that means the document is not a text document
# instead of returning html, we return the URL to the image / video / audio file
sdoc = SourceDocumentRead.model_validate(crud_sdoc.read(db=db, id=sdoc_id))
url = RepoService().get_sdoc_url(
sdoc=sdoc,
relative=True,
webp=sdoc.doctype == DocType.image,
thumbnail=False,
)
return SourceDocumentDataRead(
id=sdoc_id,
project_id=sdoc.project_id,
token_character_offsets=[],
tokens=[],
sentences=[],
html=url,
)
else:
return SourceDocumentDataRead.model_validate(sdoc_data)
return SourceDocumentDataRead.model_validate(sdoc_data)


@router.delete(
24 changes: 21 additions & 3 deletions backend/src/app/celery/background_jobs/__init__.py
@@ -1,13 +1,16 @@
from pathlib import Path
from typing import Any, List

from celery import Task
from celery import Task, group
from celery.result import GroupResult

from app.core.data.crawler.crawler_service import CrawlerService
from app.core.data.dto.crawler_job import CrawlerJobParameters, CrawlerJobRead
from app.core.data.dto.export_job import ExportJobParameters, ExportJobRead
from app.core.data.dto.import_job import ImportJobParameters, ImportJobRead
from app.core.data.dto.llm_job import LLMJobParameters2, LLMJobRead
from app.core.data.export.export_service import ExportService
from app.core.data.import_.import_service import ImportService
from app.core.data.llm.llm_service import LLMService
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo

@@ -53,10 +56,23 @@ def prepare_and_start_export_job_async(

exs: ExportService = ExportService()
ex_job = exs.prepare_export_job(export_params)
print("-----ex id", ex_job.id)
start_export_job.apply_async(kwargs={"export_job": ex_job})
return ex_job


def prepare_and_start_import_job_async(
import_job_params: ImportJobParameters,
) -> ImportJobRead:
from app.celery.background_jobs.tasks import start_import_job

assert isinstance(start_import_job, Task), "Not a Celery Task"
ims: ImportService = ImportService()
ims_job = ims.prepare_import_job(import_job_params)
start_import_job.apply_async(kwargs={"import_job": ims_job})
return ims_job


def prepare_and_start_crawling_job_async(
crawler_params: CrawlerJobParameters,
) -> CrawlerJobRead:
@@ -98,7 +114,7 @@ def prepare_and_start_llm_job_async(

def execute_text_preprocessing_pipeline_apply_async(
cargos: List[PipelineCargo],
) -> None:
) -> GroupResult:
from app.celery.background_jobs.tasks import (
execute_text_preprocessing_pipeline_task,
)
@@ -107,8 +123,10 @@ def execute_text_preprocessing_pipeline_apply_async(
execute_text_preprocessing_pipeline_task, Task
), "Not a Celery Task"

tasks = []
for cargo in cargos:
execute_text_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
tasks.append(execute_text_preprocessing_pipeline_task.s(cargo=cargo))
return group(tasks).apply_async()


def execute_image_preprocessing_pipeline_apply_async(
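With the switch from fire-and-forget `apply_async` calls to a single Celery `group`, `execute_text_preprocessing_pipeline_apply_async` now returns a `GroupResult` that callers can wait on. A minimal caller-side sketch (not part of this PR) might look like this:

```python
# Hypothetical caller-side sketch: waiting for a whole batch of text
# preprocessing cargos submitted via the new group()-based helper.
from celery.result import GroupResult


def wait_for_text_pipeline(result: GroupResult, timeout: float = 600.0) -> bool:
    # join() blocks until every subtask in the group has finished (or the
    # timeout expires); propagate=False keeps one failed cargo from raising
    # in the caller.
    result.join(timeout=timeout, propagate=False)
    return result.completed_count() == len(result.results)
```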