Skip to content

Commit 0174963

Browse files
fix document tag recommendation and annoscaling after disrupting vector DB code changes (#553)
* fix document tag recommendation and annoscaling after disrupting vector DB code changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 1c50504 commit 0174963

File tree

3 files changed

+17
-36
lines changed

3 files changed

+17
-36
lines changed

backend/src/app/core/annoscaling/annoscaling_service.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,10 @@
11
from time import perf_counter_ns
2-
from typing import (
3-
Any,
4-
Callable,
5-
Dict,
6-
Iterable,
7-
List,
8-
Tuple,
9-
TypeVar,
10-
)
2+
from typing import Any, Callable, Dict, Iterable, List, Tuple, TypeVar
113

124
import numpy as np
135

146
from app.core.data.crud.span_annotation import crud_span_anno
15-
from app.core.data.dto.search import (
16-
SimSearchSentenceHit,
17-
)
7+
from app.core.data.dto.search import SimSearchSentenceHit
188
from app.core.data.dto.span_annotation import SpanAnnotationCreate
199
from app.core.data.orm.annotation_document import AnnotationDocumentORM
2010
from app.core.data.orm.source_document import SourceDocumentORM
@@ -142,7 +132,7 @@ def __suggest_similar_sentences(
142132
client=client,
143133
project_id=proj_id,
144134
id=SentenceObjectIdentifier(sdoc_id=sdoc_id, sentence_id=sent_id),
145-
k=top_k,
135+
k=1,
146136
threshold=0.0,
147137
)
148138
nearest.extend(

backend/src/app/core/ml/doc_tag_recommendation/doc_tag_recommendation_service.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,6 @@
11
import statistics
22
from collections import defaultdict
3-
from typing import (
4-
Any,
5-
Callable,
6-
Dict,
7-
Iterable,
8-
Iterator,
9-
List,
10-
Sequence,
11-
Set,
12-
TypeVar,
13-
)
3+
from typing import Any, Callable, Dict, Iterable, Iterator, List, Sequence, Set, TypeVar
144

155
from app.core.data.crud.document_tag import crud_document_tag
166
from app.core.data.crud.document_tag_recommendation import (
@@ -21,12 +11,12 @@
2111
DocumentTagRecommendationLinkCreate,
2212
DocumentTagRecommendationMethod,
2313
)
24-
from app.core.data.dto.search import (
25-
SimSearchDocumentHit,
26-
)
14+
from app.core.data.dto.search import SimSearchDocumentHit
2715
from app.core.data.orm.document_tag import DocumentTagORM
2816
from app.core.db.sql_service import SQLService
2917
from app.core.vector.crud.document_embedding import crud_document_embedding
18+
from app.core.vector.dto.document_embedding import DocumentObjectIdentifier
19+
from app.core.vector.dto.search_results import SimSearchResult
3020
from app.core.vector.weaviate_service import WeaviateService
3121
from app.util.singleton_meta import SingletonMeta
3222
from weaviate import WeaviateClient
@@ -109,7 +99,7 @@ def classify_untagged_documents(
10999
client, ml_job_id, project_id, sdoc_ids, sdocs_and_tags
110100
)
111101

112-
dtos = self._deduplicate_document_classifications(dto_iter, multi_class)
102+
dtos = self._deduplicate_document_classifications(dto_iter, multi_class)
113103

114104
# Insert all generated tag recommendation DTOs into the database at once.
115105
crud_document_tag_recommendation_link.create_multi(db=db, create_dtos=dtos)
@@ -162,7 +152,7 @@ def __suggest_similar_documents(
162152
client=client,
163153
project_id=proj_id,
164154
sdoc_id=sdoc_id,
165-
k=top_k,
155+
k=1,
166156
threshold=0.0,
167157
)
168158
nearest.extend(
@@ -257,26 +247,25 @@ def _knn_suggestions(
257247
)
258248
sdoc_ids_to_classify = [sdoc.id for sdoc in sdocs_without_tags]
259249

260-
# TODO: Fix this
261-
# nns = self.sim.knn_documents(project_id, sdoc_ids_to_classify, sdoc_ids, k=5)
262-
nns = []
250+
nns: List[List[SimSearchResult[DocumentObjectIdentifier]]] = []
263251

264252
for sdoc_id in sdoc_ids_to_classify:
265253
# 1. Find k-nearest neighbors for the current sdoc_id
266-
crud_document_embedding.search_near_sdoc(
254+
result = crud_document_embedding.search_near_sdoc(
267255
client=client,
268256
project_id=project_id,
269257
sdoc_id=sdoc_id,
270258
k=5,
271-
threshold=0.5,
259+
threshold=0.0,
272260
sdoc_ids=list(sdoc_ids),
273261
)
262+
nns.append(result)
274263

275264
for nn, sdoc in zip(nns, sdoc_ids_to_classify):
276265
pairs = [
277266
(item.id, items.score)
278267
for items in nn
279-
for item in sdocs_and_tags[items.sdoc_id]
268+
for item in sdocs_and_tags[items.id.sdoc_id]
280269
]
281270
scores = defaultdict[int, list[float]](list)
282271
for id, score in pairs:

frontend/src/views/search/Statistics/SearchStatistics.tsx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,9 @@ function SearchStatistics({
8383
<Tabs value={tab} onChange={handleTabChange} variant="scrollable">
8484
<Tab label="Keywords" value="keywords" />
8585
<Tab label="Tags" value="tags" />
86-
{projectCodes.data?.map((code) => <Tab key={code.id} label={code.name} value={`${code.id}`} />)}
86+
{projectCodes.data?.map((code) => (
87+
<Tab key={code.id} label={code.name} value={`${code.id}`} />
88+
))}
8789
</Tabs>
8890
</Stack>
8991

0 commit comments

Comments
 (0)