Skip to content

Commit 9a28666

Browse files
authored
Merge pull request #550 from uhh-lt/interactive-labeling
Interactive labeling
2 parents 4235f0a + cc886b4 commit 9a28666

File tree

194 files changed

+17570
-3955
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

194 files changed

+17570
-3955
lines changed

.github/workflows/frontend_checks.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ jobs:
3333
run: |
3434
git fetch origin ${{ github.event.pull_request.base.ref }}
3535
base_branch="origin/${{ github.event.pull_request.base.ref }}"
36-
if git diff --name-only $base_branch HEAD | grep -q -e '^backend/requirements.txt' -e '^backend/environment.yml'; then
36+
if git diff --name-only $base_branch HEAD | grep -q -e '^backend/uv.lock'; then
3737
echo "BACKEND_HAS_NEW_REQUIREMENTS=true" >> $GITHUB_ENV
3838
fi
3939
- name: Check for new backend requirements (push)
4040
id: check_backend_requirements_push
4141
if: github.event_name == 'push'
4242
run: |
43-
if git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep -q -e '^backend/requirements.txt' -e '^backend/environment.yml'; then
43+
if git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep -q -e '^backend/uv.lock'; then
4444
echo "BACKEND_HAS_NEW_REQUIREMENTS=true" >> $GITHUB_ENV
4545
fi
4646
- name: Check for changes in ray_model_worker (pull request)

.vscode/settings.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,6 @@
4949
},
5050
"[json]": {
5151
"editor.defaultFormatter": "esbenp.prettier-vscode"
52-
}
52+
},
53+
"python.analysis.exclude": ["**/node_modules", "**/__pycache__", ".git", "./backend/.venv", "./docker"]
5354
}

DATSIMPORT.md

Lines changed: 0 additions & 9 deletions
This file was deleted.

backend/.env.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ JWT_SECRET=
1717
# Use `pwgen` to generate a secret or use any long random string
1818
SESSION_SECRET=
1919

20+
# Use `python -c "import uuid; print(uuid.uuid4())"` to generate a UUID namespace for the backend
21+
UUID_NAMESPACE=
22+
2023
# Where to store uploaded files.
2124
# <path_to_dats_repo>/docker/backend_repo
2225
SHARED_REPO_ROOT=/insert_path_to_dats_repo/docker/backend_repo
@@ -70,6 +73,7 @@ REDIS_PORT=13124
7073
REDIS_PASSWORD=dats123
7174

7275
WEAVIATE_PORT=13132
76+
WEAVIATE_GRPC_PORT=13134
7377

7478
RAY_HOST=localhost
7579
RAY_PORT=13130

backend/pyproject.toml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ dependencies = [
3030
"ffmpeg-python==0.2.0",
3131
"frozendict==2.3",
3232
"ftfy==6.1",
33+
"hdbscan==0.8.40",
3334
"httpx==0.28.1",
3435
"isort==5.12.0",
3536
"itsdangerous==2.2.0",
@@ -56,7 +57,6 @@ dependencies = [
5657
"python-jose==3.3",
5758
"python-magic==0.4.27",
5859
"python-multipart==0.0.20",
59-
"qdrant-client==1.9.1",
6060
"readability-lxml==0.8.1",
6161
"redis==4.3",
6262
"rope==1.9.0",
@@ -71,11 +71,10 @@ dependencies = [
7171
"tenacity==9.1.2",
7272
"tqdm==4.66.3",
7373
"twisted==22.10.0",
74-
"typesense==0.21.0",
7574
"ujson>=5.10.0",
7675
"umap-learn==0.5.5",
7776
"uvicorn==0.23.2",
78-
"weaviate-client==3.24.1",
77+
"weaviate-client==4.14.4",
7978
"webdriver-manager==4.0.1",
8079
"yake==0.4.8",
8180
]
@@ -92,6 +91,8 @@ dev = [
9291
"fasttext==0.9.3",
9392
"ffmpeg-python==0.2.0",
9493
"huggingface-hub==0.29.3",
94+
"ipykernel==6.29.5",
95+
"ipywidgets>=8.1.7",
9596
"lightning==2.5.1",
9697
"maverick-coref-de==1.0.5",
9798
"omegaconf==2.3.0",
@@ -101,11 +102,11 @@ dev = [
101102
"pytorch-crf==0.7.2",
102103
"quotect==1.1.5",
103104
"ray==2.44.1",
104-
"sentence-transformers==4.0.1",
105-
"setfit==1.1.1",
105+
"sentence-transformers==4.1.0",
106+
"setfit==1.1.2",
106107
"spacy==3.8.4",
107108
"spacy-curated-transformers==0.3.0",
108-
"spacy-transformers==1.3.8",
109+
"spacy-transformers==1.3.9",
109110
"timm==1.0.15",
110111
"torch==2.6.0",
111112
"torchaudio==2.6.0",
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""add perspectives tables (aspect, cluster, documentaspect, documentcluster)
2+
3+
Revision ID: 114328a16f17
4+
Revises: 42c759c92f5b
5+
Create Date: 2025-07-03 14:29:46.966812
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "114328a16f17"
16+
down_revision: Union[str, None] = "42c759c92f5b"
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
22+
# ### commands auto generated by Alembic - please adjust! ###
23+
op.create_table(
24+
"aspect",
25+
sa.Column("id", sa.Integer(), nullable=False),
26+
sa.Column("name", sa.String(), nullable=False),
27+
sa.Column("doc_embedding_prompt", sa.Text(), nullable=False),
28+
sa.Column("doc_modification_prompt", sa.Text(), nullable=True),
29+
sa.Column("is_hierarchical", sa.Boolean(), nullable=False),
30+
sa.Column("most_recent_job_id", sa.String(), nullable=True),
31+
sa.Column(
32+
"embedding_model", sa.String(), server_default="default", nullable=False
33+
),
34+
sa.Column("project_id", sa.Integer(), nullable=False),
35+
sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
36+
sa.PrimaryKeyConstraint("id"),
37+
)
38+
op.create_index(op.f("ix_aspect_id"), "aspect", ["id"], unique=False)
39+
op.create_index(
40+
op.f("ix_aspect_project_id"), "aspect", ["project_id"], unique=False
41+
)
42+
op.create_table(
43+
"cluster",
44+
sa.Column("id", sa.Integer(), nullable=False),
45+
sa.Column("is_outlier", sa.Boolean(), nullable=False),
46+
sa.Column("name", sa.String(), nullable=True),
47+
sa.Column("description", sa.Text(), nullable=True),
48+
sa.Column("top_words", sa.ARRAY(sa.String()), nullable=True),
49+
sa.Column("top_word_scores", sa.ARRAY(sa.Float()), nullable=True),
50+
sa.Column("level", sa.Integer(), nullable=False),
51+
sa.Column("top_docs", sa.ARRAY(sa.Integer()), nullable=True),
52+
sa.Column("x", sa.Float(), nullable=True),
53+
sa.Column("y", sa.Float(), nullable=True),
54+
sa.Column("parent_cluster_id", sa.Integer(), nullable=True),
55+
sa.Column("aspect_id", sa.Integer(), nullable=False),
56+
sa.ForeignKeyConstraint(["aspect_id"], ["aspect.id"], ondelete="CASCADE"),
57+
sa.ForeignKeyConstraint(
58+
["parent_cluster_id"], ["cluster.id"], ondelete="SET NULL"
59+
),
60+
sa.PrimaryKeyConstraint("id"),
61+
)
62+
op.create_index(
63+
op.f("ix_cluster_aspect_id"), "cluster", ["aspect_id"], unique=False
64+
)
65+
op.create_index(op.f("ix_cluster_id"), "cluster", ["id"], unique=False)
66+
op.create_index(
67+
op.f("ix_cluster_parent_cluster_id"),
68+
"cluster",
69+
["parent_cluster_id"],
70+
unique=False,
71+
)
72+
op.create_table(
73+
"documentaspect",
74+
sa.Column("sdoc_id", sa.Integer(), nullable=False),
75+
sa.Column("aspect_id", sa.Integer(), nullable=False),
76+
sa.Column("content", sa.Text(), nullable=False),
77+
sa.Column("x", sa.Float(), nullable=True),
78+
sa.Column("y", sa.Float(), nullable=True),
79+
sa.ForeignKeyConstraint(["aspect_id"], ["aspect.id"], ondelete="CASCADE"),
80+
sa.ForeignKeyConstraint(["sdoc_id"], ["sourcedocument.id"], ondelete="CASCADE"),
81+
sa.PrimaryKeyConstraint("sdoc_id", "aspect_id"),
82+
)
83+
op.create_index(
84+
op.f("ix_documentaspect_aspect_id"),
85+
"documentaspect",
86+
["aspect_id"],
87+
unique=False,
88+
)
89+
op.create_index(
90+
op.f("ix_documentaspect_sdoc_id"), "documentaspect", ["sdoc_id"], unique=False
91+
)
92+
op.create_table(
93+
"documentcluster",
94+
sa.Column("sdoc_id", sa.Integer(), nullable=False),
95+
sa.Column("cluster_id", sa.Integer(), nullable=False),
96+
sa.Column("similarity", sa.Float(), nullable=True),
97+
sa.Column("is_accepted", sa.Boolean(), nullable=False),
98+
sa.ForeignKeyConstraint(["cluster_id"], ["cluster.id"], ondelete="CASCADE"),
99+
sa.ForeignKeyConstraint(["sdoc_id"], ["sourcedocument.id"], ondelete="CASCADE"),
100+
sa.PrimaryKeyConstraint("sdoc_id", "cluster_id"),
101+
)
102+
op.create_index(
103+
op.f("ix_documentcluster_cluster_id"),
104+
"documentcluster",
105+
["cluster_id"],
106+
unique=False,
107+
)
108+
op.create_index(
109+
op.f("ix_documentcluster_sdoc_id"), "documentcluster", ["sdoc_id"], unique=False
110+
)
111+
# ### end Alembic commands ###
112+
113+
114+
def downgrade() -> None:
115+
# ### commands auto generated by Alembic - please adjust! ###
116+
op.drop_index(op.f("ix_documentcluster_sdoc_id"), table_name="documentcluster")
117+
op.drop_index(op.f("ix_documentcluster_cluster_id"), table_name="documentcluster")
118+
op.drop_table("documentcluster")
119+
op.drop_index(op.f("ix_documentaspect_sdoc_id"), table_name="documentaspect")
120+
op.drop_index(op.f("ix_documentaspect_aspect_id"), table_name="documentaspect")
121+
op.drop_table("documentaspect")
122+
op.drop_index(op.f("ix_cluster_parent_cluster_id"), table_name="cluster")
123+
op.drop_index(op.f("ix_cluster_id"), table_name="cluster")
124+
op.drop_index(op.f("ix_cluster_aspect_id"), table_name="cluster")
125+
op.drop_table("cluster")
126+
op.drop_index(op.f("ix_aspect_project_id"), table_name="aspect")
127+
op.drop_index(op.f("ix_aspect_id"), table_name="aspect")
128+
op.drop_table("aspect")
129+
# ### end Alembic commands ###

backend/src/api/dependencies.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
from app.core.data.orm.user import UserORM
55
from app.core.db.sql_service import SQLService
66
from app.core.security import decode_jwt
7+
from app.core.vector.weaviate_service import WeaviateService
78
from config import conf
89
from fastapi import Depends, Query
910
from fastapi.security import OAuth2PasswordBearer
1011
from jose import JWTError
1112
from pydantic import ValidationError
1213
from sqlalchemy.orm import Session
14+
from weaviate import WeaviateClient
1315

1416
from api.util import credentials_exception
1517

@@ -51,6 +53,15 @@ async def get_db_session() -> AsyncGenerator[Session, None]:
5153
session.close()
5254

5355

56+
async def get_weaviate_session() -> AsyncGenerator[WeaviateClient, None]:
57+
session = WeaviateService().weaviate_session()
58+
try:
59+
yield session
60+
finally:
61+
if session is not None:
62+
session.close()
63+
64+
5465
def get_current_user(
5566
db: Session = Depends(get_db_session), token: str = Depends(reusable_oauth2_scheme)
5667
) -> UserORM:

backend/src/api/endpoints/chat.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
chat_session,
77
retrieval_augmented_generation_with_session,
88
)
9-
from app.core.search.filtering import Filter
10-
from app.core.search.sdoc_search.sdoc_search_columns import SdocColumns
119
from fastapi import APIRouter, Depends
1210
from sqlalchemy.orm import Session
1311

@@ -28,7 +26,7 @@ def rag_with_session(
2826
query: Union[str, List[str], int],
2927
top_k: int,
3028
threshold: float,
31-
filter: Filter[SdocColumns],
29+
sdoc_ids: Optional[List[int]],
3230
authz_user: AuthzUser = Depends(),
3331
session_id: Optional[str] = None,
3432
db: Session = Depends(get_db_session),
@@ -40,7 +38,7 @@ def rag_with_session(
4038
query=query,
4139
top_k=top_k,
4240
threshold=threshold,
43-
filter=filter,
41+
sdoc_ids=sdoc_ids,
4442
db=db,
4543
session_id=session_id,
4644
)

0 commit comments

Comments
 (0)