Merged

Commits (50)
0e5c878
export project metadata and user data when exporting project (full)
AhmadHAW Jun 25, 2024
660ec0a
implement import job for user codes.
AhmadHAW Jul 2, 2024
d80d3b9
starting to implement import tags
AhmadHAW Jul 15, 2024
6acb416
export project and certain sdocs now also exports the relevant metada…
AhmadHAW Jul 16, 2024
eea5e4c
make export metadata more user friendly and implement import tags
AhmadHAW Jul 23, 2024
dbfa19c
removing redundant sdoc medatadata export, change name of sdoc annota…
AhmadHAW Jul 23, 2024
e17b681
when importing tags also applied tags to documents if document filena…
AhmadHAW Jul 23, 2024
139258d
import project project metadata
AhmadHAW Jul 23, 2024
bf26938
import project metadata
AhmadHAW Jul 23, 2024
8309108
adjust import service and text cargo pipeline
AhmadHAW Oct 1, 2024
45838d4
fixing merge conflicts
AhmadHAW Oct 1, 2024
ef3837d
fixing minor bugs
AhmadHAW Oct 8, 2024
6c64aa3
make is_init as obsolete as possible
AhmadHAW Oct 8, 2024
ae33dee
fixinf logs and annotation passing to pptd on init
AhmadHAW Oct 8, 2024
e764ff9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 8, 2024
3c3a6f8
Update OpenAPI spec and client
github-actions[bot] Oct 8, 2024
e28292e
Remving all ids and project names from exported dfs where not needed.…
AhmadHAW Oct 8, 2024
fe96bd5
finish text import pipeline and start image pipeline (still buggy)
AhmadHAW Oct 15, 2024
eab69fa
Update OpenAPI spec and client
github-actions[bot] Oct 15, 2024
fbfef1e
Fixing bbox import
AhmadHAW Oct 22, 2024
62696f1
finishing image pipeline
AhmadHAW Nov 19, 2024
9b3a11a
transcript is no longer an sdoc. transcript is linked to the audio do…
AhmadHAW Nov 28, 2024
9d06c6b
fixed for video pipeline as well
AhmadHAW Nov 28, 2024
06734a3
changing audio video and image pipeline, so transcript is not saved s…
AhmadHAW Dec 3, 2024
0a55aa9
fix export fso import could work on image and text
AhmadHAW Dec 3, 2024
1513956
import audio and video pipeline
AhmadHAW Dec 10, 2024
f263bfa
finishing import export and transcript in original sdoc merge
AhmadHAW Jan 9, 2025
fbfc418
Update OpenAPI spec and client
github-actions[bot] Jan 9, 2025
9c3fd78
add new field to sdoc data: repo_url
bigabig Jan 13, 2025
7f7a54e
update api
bigabig Jan 13, 2025
dcdfb60
updated image / video / audio rendering to use the new repo_url
bigabig Jan 13, 2025
9130fa4
Update OpenAPI spec and client
github-actions[bot] Jan 13, 2025
03e9315
added preprocessing concept readme
bigabig Jan 13, 2025
738c1de
fixing bug where orders changed in payload cargo
AhmadHAW Jan 14, 2025
aa321cd
refactored preprocessing pipeline structure and add word level transc…
AhmadHAW Jan 28, 2025
70b2371
updated api
AhmadHAW Jan 28, 2025
75d4b87
added migration todos
AhmadHAW Jan 28, 2025
c1d0742
updated api
AhmadHAW Jan 28, 2025
2495602
added back annotation for audio and video docs
AhmadHAW Jan 28, 2025
2ac3359
added the import endpoints
AhmadHAW Jan 28, 2025
b67de2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2025
c04c3f9
added sentence annotation export / import
bigabig Jan 29, 2025
867bf98
refactored import & export service, splitting into multiple files
bigabig Jan 29, 2025
528eb8f
add sentencea annotator to video and audio
bigabig Jan 29, 2025
0d8ae15
add sentence annotation comparison to audio and video docs
bigabig Jan 29, 2025
4ec06c6
fixed bug in metadata copying
bigabig Jan 29, 2025
087dea7
Update OpenAPI spec and client
github-actions[bot] Jan 29, 2025
8fde822
updated api
bigabig Jan 29, 2025
5612cbe
fixed typo
bigabig Jan 29, 2025
ddb28ef
fixed typo
bigabig Jan 29, 2025
2 changes: 1 addition & 1 deletion backend/.env.example
@@ -16,7 +16,7 @@ JWT_SECRET=

# Where to store uploaded files.
# <path_to_dats_repo>/docker/backend_repo
REPO_ROOT=/insert_path_to_dats_repo/docker/backend_repo
SHARED_REPO_ROOT=/insert_path_to_dats_repo/docker/backend_repo

# The system user is automatically created and owns automatically generated data.
SYSTEM_USER_EMAIL="[email protected]"
@@ -0,0 +1,45 @@
"""Add token time starts and ends to sdoc data
Revision ID: 4eb64db5a67a
Revises: 050f9378a3e1
Create Date: 2025-01-09 17:00:29.037251

"""

from typing import Sequence, Union

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "4eb64db5a67a"
down_revision: Union[str, None] = "050f9378a3e1"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###

# TODO: if anybody on production uses audio or video files
# - read metadata word-level-transcriptions -> write to corresponding new sdoc_data field
# - read .transcript.txt files -> transfer to sdoc_data content & html of corresponding audio / video data
# - if the transcript was annotated -> move all annotations to corresponding audio / video file annotations

op.add_column(
"sourcedocumentdata",
sa.Column("token_time_starts", postgresql.ARRAY(sa.Integer()), nullable=True),
)
op.add_column(
"sourcedocumentdata",
sa.Column("token_time_ends", postgresql.ARRAY(sa.Integer()), nullable=True),
)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("sourcedocumentdata", "token_time_ends")
op.drop_column("sourcedocumentdata", "token_time_starts")
# ### end Alembic commands ###
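The new `token_time_starts` / `token_time_ends` columns are parallel integer arrays: entry i of each holds the start and end timestamp of token i. The following is a minimal sketch of how word-level transcription data could be flattened into that shape; the `WordLevelTranscription` structure is a hypothetical stand-in, not the project's actual metadata model.

```python
# Hypothetical illustration only: flattening word-level transcript timestamps
# into the two parallel integer arrays added by this revision.
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class WordLevelTranscription:  # assumed shape, not the real DATS model
    text: str
    start_ms: int
    end_ms: int


def to_token_time_arrays(
    words: List[WordLevelTranscription],
) -> Tuple[List[int], List[int]]:
    # token_time_starts[i] / token_time_ends[i] describe token i of the transcript
    return [w.start_ms for w in words], [w.end_ms for w in words]


# prints ([0, 450], [440, 900])
print(to_token_time_arrays([
    WordLevelTranscription("hello", 0, 440),
    WordLevelTranscription("world", 450, 900),
]))
```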
@@ -0,0 +1,71 @@
"""add repo_url to sdocdata

Revision ID: 970c55224a39
Revises: 4eb64db5a67a
Create Date: 2025-01-13 10:21:43.457535

"""

from typing import Sequence, Union

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

from alembic import op
from app.core.data.crud.source_document import crud_sdoc
from app.core.data.doc_type import DocType
from app.core.data.dto.source_document import SourceDocumentRead
from app.core.data.repo.repo_service import RepoService

# revision identifiers, used by Alembic.
revision: str = "970c55224a39"
down_revision: Union[str, None] = "4eb64db5a67a"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# add the column, non-nullable
op.add_column(
"sourcedocumentdata", sa.Column("repo_url", sa.String(), nullable=True)
)

conn = op.get_bind()

# 1. Read all existing project ids
projects = conn.execute(sa.text("SELECT id FROM project")).fetchall()
print("Found projects:", len(projects))

# 2. Read all existing Source Documents
db = sessionmaker(bind=conn)()
for row in projects:
proj_id = row.id
print("Processing project:", proj_id)

sdocs = crud_sdoc.read_by_project(db=db, proj_id=proj_id, only_finished=False) # type: ignore

# 3. Use the repo service to get the URL of the Source Document
urls = []
for sdoc in sdocs:
url = RepoService().get_sdoc_url(
sdoc=SourceDocumentRead.model_validate(sdoc),
relative=True,
webp=sdoc.doctype == DocType.image,
thumbnail=False,
)
urls.append(url)

# 4. Update the repo_url field in the Source Document Data table
for sdoc, url in zip(sdocs, urls):
op.execute(
f"UPDATE sourcedocumentdata SET repo_url = '{url}' WHERE id = {sdoc.id}"
)

# change the column to non-nullable
op.alter_column("sourcedocumentdata", "repo_url", nullable=False)


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("sourcedocumentdata", "repo_url")
# ### end Alembic commands ###
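The per-document UPDATE in `upgrade()` interpolates the URL and id directly into the SQL string. As a sketch only, the same step could be written with bound parameters on the migration's connection, reusing the `conn`, `sdocs`, and `urls` variables defined above:

```python
# Sketch: same update loop with bound parameters instead of string interpolation.
for sdoc, url in zip(sdocs, urls):
    conn.execute(
        sa.text("UPDATE sourcedocumentdata SET repo_url = :url WHERE id = :id"),
        {"url": url, "id": sdoc.id},
    )
```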
156 changes: 156 additions & 0 deletions backend/src/api/endpoints/import_.py
@@ -0,0 +1,156 @@
import uuid

from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from sqlalchemy.orm import Session

from api.dependencies import get_current_user, get_db_session
from app.celery.background_jobs import prepare_and_start_import_job_async
from app.core.authorization.authz_user import AuthzUser
from app.core.data.crud.project import crud_project
from app.core.data.dto.import_job import (
ImportJobParameters,
ImportJobRead,
ImportJobType,
)
from app.core.data.dto.project import ProjectCreate
from app.core.data.dto.user import UserRead
from app.core.data.import_.import_service import ImportService
from app.core.data.repo.repo_service import RepoService

router = APIRouter(
prefix="/import", dependencies=[Depends(get_current_user)], tags=["import"]
)

ims: ImportService = ImportService()
repo: RepoService = RepoService()


@router.post(
"/{proj_id}/codes",
response_model=ImportJobRead,
summary="Starts the import codes job on given project id.",
)
def start_import_codes_job(
*,
# Ahmad: Since we're uploading a file we have to use multipart/form-data directly in the router method (see project put)
proj_id: int,
uploaded_file: UploadFile = File(
...,
description=("CSV file of codes that gets uploaded into project"),
),
authz_user: AuthzUser = Depends(),
) -> ImportJobRead:
authz_user.assert_in_project(proj_id)
if not uploaded_file:
raise HTTPException(
status_code=418,
detail="Missing codes file.",
)
if not __is_file_csv(uploaded_file=uploaded_file):
raise HTTPException(
status_code=415,
detail="Codes need to be in csv format.",
)
user_id = authz_user.user.id
filename = f"import_user_code_{user_id}_{proj_id}.csv"
filepath = repo.get_dst_path_for_temp_file(filename)
filepath = repo.store_uploaded_file(
uploaded_file=uploaded_file, filepath=filepath, fn=filename
)

import_job_params = ImportJobParameters(
proj_id=proj_id,
filename=filename,
user_id=user_id,
import_job_type=ImportJobType.CODES,
)
return prepare_and_start_import_job_async(import_job_params=import_job_params)


@router.post(
"/{proj_id}/tags",
response_model=ImportJobRead,
summary="Starts the import tags job on given project.",
)
def start_import_tags_job(
*,
# Ahmad: Since we're uploading a file we have to use multipart/form-data directly in the router method (see project put)
proj_id: int,
uploaded_file: UploadFile = File(
...,
description=("CSV file of codes that gets uploaded into project"),
),
authz_user: AuthzUser = Depends(),
) -> ImportJobRead:
authz_user.assert_in_project(proj_id)
if not __is_file_csv(uploaded_file=uploaded_file):
raise HTTPException(
status_code=415,
detail="Codes need to be in csv format.",
)
user_id = authz_user.user.id
filename = f"import_tags_{user_id}_{proj_id}.csv"
filepath = repo.get_dst_path_for_temp_file(filename)
filepath = repo.store_uploaded_file(
uploaded_file=uploaded_file, filepath=filepath, fn=filename
)

import_job_params = ImportJobParameters(
proj_id=proj_id,
filename=filename,
user_id=user_id,
import_job_type=ImportJobType.TAGS,
)
return prepare_and_start_import_job_async(import_job_params=import_job_params)


@router.post(
"",
response_model=ImportJobRead,
summary="Starts the import project job on given project",
)
def start_import_project_job(
*,
db: Session = Depends(get_db_session),
uploaded_file: UploadFile = File(
...,
description=("Zip file of project metadata that gets uploaded into project"),
),
current_user: UserRead = Depends(get_current_user),
) -> ImportJobRead:
if not __is_file_zip(uploaded_file=uploaded_file):
raise HTTPException(
status_code=415,
detail="Project need to be in zip format.",
)
user_id = current_user.id
random_temp_project_name = str(uuid.uuid4())
filename = f"import_project_{random_temp_project_name}_for_user_{user_id}.zip"
filepath = repo.get_dst_path_for_temp_file(filename)
filepath = repo.store_uploaded_file(
uploaded_file=uploaded_file, filepath=filepath, fn=filename
)
project_create = ProjectCreate(title=random_temp_project_name, description="")
db_obj = crud_project.create(
db=db, create_dto=project_create, creating_user=current_user
)

import_job_params = ImportJobParameters(
proj_id=db_obj.id,
filename=filename,
user_id=user_id,
import_job_type=ImportJobType.PROJECT,
)
return prepare_and_start_import_job_async(import_job_params=import_job_params)


def __is_file_csv(uploaded_file: UploadFile):
return uploaded_file.content_type == "text/csv"


def __is_file_json(uploaded_file: UploadFile):
return uploaded_file.content_type == "application/json"


def __is_file_zip(uploaded_file: UploadFile):
return uploaded_file.content_type == "application/zip"
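For context, here is a hedged client-side sketch of the new import endpoints. The base URL and bearer token are assumptions; the paths, the `uploaded_file` field name, and the content-type checks mirror the router above.

```python
# Hypothetical client sketch for the new import endpoints (base URL and
# auth header are assumptions; paths and content types follow the router).
import requests

BASE_URL = "http://localhost:5500"             # assumed API host/port
HEADERS = {"Authorization": "Bearer <token>"}  # assumed auth scheme

# Import codes into project 1; anything other than text/csv is rejected with 415.
with open("codes.csv", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/import/1/codes",
        headers=HEADERS,
        files={"uploaded_file": ("codes.csv", f, "text/csv")},
    )
resp.raise_for_status()
job = resp.json()  # ImportJobRead payload, usable for polling the job status

# Import a whole project from a zip export; the endpoint first creates a
# fresh project with a temporary UUID title, which the import job then fills.
with open("project_export.zip", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/import",
        headers=HEADERS,
        files={"uploaded_file": ("project_export.zip", f, "application/zip")},
    )
resp.raise_for_status()
```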
22 changes: 1 addition & 21 deletions backend/src/api/endpoints/source_document.py
@@ -19,7 +19,6 @@
from app.core.data.crud.source_document_metadata import crud_sdoc_meta
from app.core.data.crud.span_annotation import crud_span_anno
from app.core.data.crud.span_group import crud_span_group
from app.core.data.doc_type import DocType
from app.core.data.dto.bbox_annotation import (
BBoxAnnotationRead,
BBoxAnnotationReadResolved,
@@ -91,26 +90,7 @@ def get_by_id_with_data(
crud_sdoc.get_status(db=db, sdoc_id=sdoc_id, raise_error_on_unfinished=True)

sdoc_data = crud_sdoc.read_data(db=db, id=sdoc_id)
if sdoc_data is None:
# if data is none, that means the document is not a text document
# instead of returning html, we return the URL to the image / video / audio file
sdoc = SourceDocumentRead.model_validate(crud_sdoc.read(db=db, id=sdoc_id))
url = RepoService().get_sdoc_url(
sdoc=sdoc,
relative=True,
webp=sdoc.doctype == DocType.image,
thumbnail=False,
)
return SourceDocumentDataRead(
id=sdoc_id,
project_id=sdoc.project_id,
token_character_offsets=[],
tokens=[],
sentences=[],
html=url,
)
else:
return SourceDocumentDataRead.model_validate(sdoc_data)
return SourceDocumentDataRead.model_validate(sdoc_data)


@router.delete(
24 changes: 21 additions & 3 deletions backend/src/app/celery/background_jobs/__init__.py
@@ -1,13 +1,16 @@
from pathlib import Path
from typing import Any, List

from celery import Task
from celery import Task, group
from celery.result import GroupResult

from app.core.data.crawler.crawler_service import CrawlerService
from app.core.data.dto.crawler_job import CrawlerJobParameters, CrawlerJobRead
from app.core.data.dto.export_job import ExportJobParameters, ExportJobRead
from app.core.data.dto.import_job import ImportJobParameters, ImportJobRead
from app.core.data.dto.llm_job import LLMJobParameters2, LLMJobRead
from app.core.data.export.export_service import ExportService
from app.core.data.import_.import_service import ImportService
from app.core.data.llm.llm_service import LLMService
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo

@@ -53,10 +56,23 @@ def prepare_and_start_export_job_async(

exs: ExportService = ExportService()
ex_job = exs.prepare_export_job(export_params)
print("-----ex id", ex_job.id)
start_export_job.apply_async(kwargs={"export_job": ex_job})
return ex_job


def prepare_and_start_import_job_async(
import_job_params: ImportJobParameters,
) -> ImportJobRead:
from app.celery.background_jobs.tasks import start_import_job

assert isinstance(start_import_job, Task), "Not a Celery Task"
ims: ImportService = ImportService()
ims_job = ims.prepare_import_job(import_job_params)
start_import_job.apply_async(kwargs={"import_job": ims_job})
return ims_job


def prepare_and_start_crawling_job_async(
crawler_params: CrawlerJobParameters,
) -> CrawlerJobRead:
@@ -98,7 +114,7 @@ def prepare_and_start_llm_job_async(

def execute_text_preprocessing_pipeline_apply_async(
cargos: List[PipelineCargo],
) -> None:
) -> GroupResult:
from app.celery.background_jobs.tasks import (
execute_text_preprocessing_pipeline_task,
)
@@ -107,8 +123,10 @@ def execute_text_preprocessing_pipeline_apply_async(
execute_text_preprocessing_pipeline_task, Task
), "Not a Celery Task"

tasks = []
for cargo in cargos:
execute_text_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
tasks.append(execute_text_preprocessing_pipeline_task.s(cargo=cargo))
return group(tasks).apply_async()


def execute_image_preprocessing_pipeline_apply_async(
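With the switch from fire-and-forget `apply_async` calls to a single Celery `group`, `execute_text_preprocessing_pipeline_apply_async` now returns a `GroupResult` that callers can wait on. A minimal caller-side sketch (not part of this PR) might look like this:

```python
# Hypothetical caller-side sketch: waiting for a whole batch of text
# preprocessing cargos submitted via the new group()-based helper.
from celery.result import GroupResult


def wait_for_text_pipeline(result: GroupResult, timeout: float = 600.0) -> bool:
    # join() blocks until every subtask in the group has finished (or the
    # timeout expires); propagate=False keeps one failed cargo from raising
    # in the caller.
    result.join(timeout=timeout, propagate=False)
    return result.completed_count() == len(result.results)
```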