Skip to content

Commit c38fbd2

Browse files
authored
Merge pull request #501 from uhh-lt/improve-code-search
Improve code search
2 parents: 777f916 + 3c414d2; commit: c38fbd2

File tree

8 files changed

+154
-72
lines changed

8 files changed

+154
-72
lines changed

backend/src/app/core/analysis/search_statistics/search_statistics.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
from app.core.data.crud.project_metadata import crud_project_meta
77
from app.core.data.dto.search_stats import KeywordStat, SpanEntityStat, TagStat
88
from app.core.data.orm.annotation_document import AnnotationDocumentORM
9-
from app.core.data.orm.code import CodeORM
109
from app.core.data.orm.document_tag import (
1110
DocumentTagORM,
1211
SourceDocumentDocumentTagLinkTable,
@@ -141,10 +140,9 @@ def compute_code_statistics(
141140
)
142141
.join(SpanTextORM.span_annotations)
143142
.join(SpanAnnotationORM.annotation_document)
144-
.join(SpanAnnotationORM.code)
145143
.group_by(SpanTextORM.id)
146144
.filter(
147-
CodeORM.id == code_id,
145+
SpanAnnotationORM.code_id == code_id,
148146
AnnotationDocumentORM.source_document_id.in_(list(sdoc_ids)),
149147
)
150148
.order_by(count.desc())
@@ -169,7 +167,7 @@ def compute_code_statistics(
169167
.join(SpanAnnotationORM.code)
170168
.group_by(SpanTextORM.id)
171169
.filter(
172-
CodeORM.id == code_id,
170+
SpanAnnotationORM.code_id == code_id,
173171
SpanTextORM.id.in_(span_text_ids),
174172
)
175173
.order_by(func.array_position(span_text_ids, SpanTextORM.id))

backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py

Lines changed: 35 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,12 @@
22
from sqlalchemy.dialects.postgresql import ARRAY, array, array_agg
33

44
from app.core.data.orm.annotation_document import AnnotationDocumentORM
5-
from app.core.data.orm.code import CodeORM
65
from app.core.data.orm.document_tag import DocumentTagORM
6+
from app.core.data.orm.sentence_annotation import SentenceAnnotationORM
77
from app.core.data.orm.source_document import SourceDocumentORM
88
from app.core.data.orm.span_annotation import SpanAnnotationORM
99
from app.core.data.orm.span_text import SpanTextORM
10-
from app.core.data.orm.user import UserORM
11-
from app.core.db.sql_utils import aggregate_ids
10+
from app.core.db.sql_utils import aggregate_ids, aggregate_two_ids
1211
from app.core.search.column_info import AbstractColumns
1312
from app.core.search.filtering_operators import FilterOperator, FilterValueType
1413
from app.core.search.search_builder import SearchBuilder
@@ -100,57 +99,72 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder):
10099
)
101100
case TimelineAnalysisColumns.CODE_ID_LIST:
102101
query_builder._add_subquery_column(
103-
aggregate_ids(
104-
CodeORM.id, label=TimelineAnalysisColumns.CODE_ID_LIST.value
102+
aggregate_two_ids(
103+
SpanAnnotationORM.code_id,
104+
SentenceAnnotationORM.code_id,
105+
label=TimelineAnalysisColumns.CODE_ID_LIST.value,
105106
)
106107
)
107108
query_builder._join_subquery(
108-
SourceDocumentORM.annotation_documents,
109+
AnnotationDocumentORM,
110+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
111+
isouter=True,
112+
)
113+
query_builder._join_subquery(
114+
SpanAnnotationORM,
115+
SpanAnnotationORM.annotation_document_id
116+
== AnnotationDocumentORM.id,
109117
isouter=True,
110118
)
111119
query_builder._join_subquery(
112-
SpanAnnotationORM.code,
120+
SentenceAnnotationORM,
121+
SentenceAnnotationORM.annotation_document_id
122+
== AnnotationDocumentORM.id,
113123
isouter=True,
114124
)
125+
115126
case TimelineAnalysisColumns.USER_ID_LIST:
116127
query_builder._add_subquery_column(
117128
aggregate_ids(
118-
UserORM.id, TimelineAnalysisColumns.USER_ID_LIST.value
129+
AnnotationDocumentORM.user_id,
130+
TimelineAnalysisColumns.USER_ID_LIST.value,
119131
)
120132
)
121133
query_builder._join_subquery(
122-
SourceDocumentORM.annotation_documents,
123-
isouter=True,
124-
)
125-
query_builder._join_subquery(
126-
AnnotationDocumentORM.user,
134+
AnnotationDocumentORM,
135+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
127136
isouter=True,
128137
)
129138
case TimelineAnalysisColumns.SPAN_ANNOTATIONS:
130139
query_builder._add_subquery_column(
131140
cast(
132141
array_agg(
133142
func.distinct(
134-
array([cast(CodeORM.id, String), SpanTextORM.text])
143+
array(
144+
[
145+
cast(SpanAnnotationORM.code_id, String),
146+
SpanTextORM.text,
147+
]
148+
)
135149
),
136150
),
137151
ARRAY(String, dimensions=2),
138152
).label(TimelineAnalysisColumns.SPAN_ANNOTATIONS.value)
139153
)
140154
query_builder._join_subquery(
141-
SourceDocumentORM.annotation_documents,
142-
isouter=True,
143-
)
144-
query_builder._join_subquery(
145-
AnnotationDocumentORM.span_annotations,
155+
AnnotationDocumentORM,
156+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
146157
isouter=True,
147158
)
148159
query_builder._join_subquery(
149-
SpanAnnotationORM.span_text,
160+
SpanAnnotationORM,
161+
SpanAnnotationORM.annotation_document_id
162+
== AnnotationDocumentORM.id,
150163
isouter=True,
151164
)
152165
query_builder._join_subquery(
153-
SpanAnnotationORM.code,
166+
SpanTextORM,
167+
SpanTextORM.id == SpanAnnotationORM.span_text_id,
154168
isouter=True,
155169
)
156170

backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py

Lines changed: 37 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,13 @@
22
from sqlalchemy.dialects.postgresql import ARRAY, array, array_agg
33

44
from app.core.data.orm.annotation_document import AnnotationDocumentORM
5-
from app.core.data.orm.code import CodeORM
65
from app.core.data.orm.document_tag import DocumentTagORM
6+
from app.core.data.orm.sentence_annotation import SentenceAnnotationORM
77
from app.core.data.orm.source_document import SourceDocumentORM
88
from app.core.data.orm.span_annotation import SpanAnnotationORM
99
from app.core.data.orm.span_text import SpanTextORM
10-
from app.core.data.orm.user import UserORM
1110
from app.core.data.orm.word_frequency import WordFrequencyORM
12-
from app.core.db.sql_utils import aggregate_ids
11+
from app.core.db.sql_utils import aggregate_ids, aggregate_two_ids
1312
from app.core.search.column_info import AbstractColumns
1413
from app.core.search.filtering_operators import FilterOperator, FilterValueType
1514
from app.core.search.search_builder import SearchBuilder
@@ -156,55 +155,72 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder):
156155
)
157156
case WordFrequencyColumns.CODE_ID_LIST:
158157
query_builder._add_subquery_column(
159-
aggregate_ids(
160-
CodeORM.id, label=WordFrequencyColumns.CODE_ID_LIST.value
158+
aggregate_two_ids(
159+
SpanAnnotationORM.code_id,
160+
SentenceAnnotationORM.code_id,
161+
label=WordFrequencyColumns.CODE_ID_LIST.value,
161162
)
162163
)
163164
query_builder._join_subquery(
164-
SourceDocumentORM.annotation_documents,
165+
AnnotationDocumentORM,
166+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
165167
isouter=True,
166168
)
167169
query_builder._join_subquery(
168-
SpanAnnotationORM.code,
170+
SpanAnnotationORM,
171+
SpanAnnotationORM.annotation_document_id
172+
== AnnotationDocumentORM.id,
169173
isouter=True,
170174
)
171-
case WordFrequencyColumns.USER_ID_LIST:
172-
query_builder._add_subquery_column(
173-
aggregate_ids(UserORM.id, WordFrequencyColumns.USER_ID_LIST.value)
174-
)
175175
query_builder._join_subquery(
176-
SourceDocumentORM.annotation_documents,
176+
SentenceAnnotationORM,
177+
SentenceAnnotationORM.annotation_document_id
178+
== AnnotationDocumentORM.id,
177179
isouter=True,
178180
)
181+
182+
case WordFrequencyColumns.USER_ID_LIST:
183+
query_builder._add_subquery_column(
184+
aggregate_ids(
185+
AnnotationDocumentORM.user_id,
186+
WordFrequencyColumns.USER_ID_LIST.value,
187+
)
188+
)
179189
query_builder._join_subquery(
180-
AnnotationDocumentORM.user,
190+
AnnotationDocumentORM,
191+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
181192
isouter=True,
182193
)
183194
case WordFrequencyColumns.SPAN_ANNOTATIONS:
184195
query_builder._add_subquery_column(
185196
cast(
186197
array_agg(
187198
func.distinct(
188-
array([cast(CodeORM.id, String), SpanTextORM.text])
199+
array(
200+
[
201+
cast(SpanAnnotationORM.code_id, String),
202+
SpanTextORM.text,
203+
]
204+
)
189205
),
190206
),
191207
ARRAY(String, dimensions=2),
192208
).label(WordFrequencyColumns.SPAN_ANNOTATIONS.value)
193209
)
194210
query_builder._join_subquery(
195-
SourceDocumentORM.annotation_documents,
196-
isouter=True,
197-
)
198-
query_builder._join_subquery(
199-
AnnotationDocumentORM.span_annotations,
211+
AnnotationDocumentORM,
212+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
200213
isouter=True,
201214
)
202215
query_builder._join_subquery(
203-
SpanAnnotationORM.span_text,
216+
SpanAnnotationORM,
217+
SpanAnnotationORM.annotation_document_id
218+
== AnnotationDocumentORM.id,
204219
isouter=True,
205220
)
206221
query_builder._join_subquery(
207-
SpanAnnotationORM.code,
222+
SpanTextORM,
223+
SpanTextORM.id == SpanAnnotationORM.span_text_id,
208224
isouter=True,
209225
)
210226

backend/src/app/core/db/sql_utils.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,3 +9,16 @@ def aggregate_ids(column: InstrumentedAttribute, label: str):
99
None,
1010
type_=ARRAY(Integer),
1111
).label(label)
12+
13+
14+
def aggregate_two_ids(
15+
column1: InstrumentedAttribute, column2: InstrumentedAttribute, label: str
16+
):
17+
return func.array_remove(
18+
func.array_cat(
19+
array_agg(func.distinct(column1), type_=ARRAY(Integer)),
20+
array_agg(func.distinct(column2), type_=ARRAY(Integer)),
21+
),
22+
None,
23+
type_=ARRAY(Integer),
24+
).label(label)

backend/src/app/core/search/bbox_anno_search/bbox_anno_search_columns.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -81,8 +81,15 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder):
8181
label=BBoxColumns.DOCUMENT_TAG_ID_LIST.value,
8282
)
8383
)
84-
query_builder._join_subquery(BBoxAnnotationORM.annotation_document)
85-
query_builder._join_subquery(AnnotationDocumentORM.source_document)
84+
query_builder._join_subquery(
85+
AnnotationDocumentORM,
86+
AnnotationDocumentORM.id
87+
== BBoxAnnotationORM.annotation_document_id,
88+
)
89+
query_builder._join_subquery(
90+
SourceDocumentORM,
91+
SourceDocumentORM.id == AnnotationDocumentORM.source_document_id,
92+
)
8693
query_builder._join_subquery(
8794
SourceDocumentORM.document_tags, isouter=True
8895
)

backend/src/app/core/search/sdoc_search/sdoc_search_columns.py

Lines changed: 40 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,15 @@
22
from sqlalchemy.dialects.postgresql import ARRAY, array, array_agg
33

44
from app.core.data.orm.annotation_document import AnnotationDocumentORM
5-
from app.core.data.orm.code import CodeORM
65
from app.core.data.orm.document_tag import DocumentTagORM
6+
from app.core.data.orm.sentence_annotation import SentenceAnnotationORM
77
from app.core.data.orm.source_document import SourceDocumentORM
88
from app.core.data.orm.span_annotation import SpanAnnotationORM
99
from app.core.data.orm.span_text import SpanTextORM
10-
from app.core.data.orm.user import UserORM
11-
from app.core.db.sql_utils import aggregate_ids
10+
from app.core.db.sql_utils import (
11+
aggregate_ids,
12+
aggregate_two_ids,
13+
)
1214
from app.core.search.column_info import AbstractColumns
1315
from app.core.search.filtering_operators import FilterOperator, FilterValueType
1416
from app.core.search.search_builder import SearchBuilder
@@ -111,53 +113,71 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder):
111113
)
112114
case SdocColumns.CODE_ID_LIST:
113115
query_builder._add_subquery_column(
114-
aggregate_ids(CodeORM.id, label=SdocColumns.CODE_ID_LIST.value)
116+
aggregate_two_ids(
117+
SpanAnnotationORM.code_id,
118+
SentenceAnnotationORM.code_id,
119+
label=SdocColumns.CODE_ID_LIST.value,
120+
)
115121
)
116122
query_builder._join_subquery(
117-
SourceDocumentORM.annotation_documents,
123+
AnnotationDocumentORM,
124+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
118125
isouter=True,
119126
)
120127
query_builder._join_subquery(
121-
SpanAnnotationORM.code,
128+
SpanAnnotationORM,
129+
SpanAnnotationORM.annotation_document_id
130+
== AnnotationDocumentORM.id,
122131
isouter=True,
123132
)
124-
case SdocColumns.USER_ID_LIST:
125-
query_builder._add_subquery_column(
126-
aggregate_ids(UserORM.id, SdocColumns.USER_ID_LIST.value)
127-
)
128133
query_builder._join_subquery(
129-
SourceDocumentORM.annotation_documents,
134+
SentenceAnnotationORM,
135+
SentenceAnnotationORM.annotation_document_id
136+
== AnnotationDocumentORM.id,
130137
isouter=True,
131138
)
139+
140+
case SdocColumns.USER_ID_LIST:
141+
query_builder._add_subquery_column(
142+
aggregate_ids(
143+
AnnotationDocumentORM.user_id, SdocColumns.USER_ID_LIST.value
144+
)
145+
)
132146
query_builder._join_subquery(
133-
AnnotationDocumentORM.user,
147+
AnnotationDocumentORM,
148+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
134149
isouter=True,
135150
)
136151
case SdocColumns.SPAN_ANNOTATIONS:
137152
query_builder._add_subquery_column(
138153
cast(
139154
array_agg(
140155
func.distinct(
141-
array([cast(CodeORM.id, String), SpanTextORM.text])
156+
array(
157+
[
158+
cast(SpanAnnotationORM.code_id, String),
159+
SpanTextORM.text,
160+
]
161+
)
142162
),
143163
),
144164
ARRAY(String, dimensions=2),
145165
).label(SdocColumns.SPAN_ANNOTATIONS.value)
146166
)
147167
query_builder._join_subquery(
148-
SourceDocumentORM.annotation_documents,
149-
isouter=True,
150-
)
151-
query_builder._join_subquery(
152-
AnnotationDocumentORM.span_annotations,
168+
AnnotationDocumentORM,
169+
AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
153170
isouter=True,
154171
)
155172
query_builder._join_subquery(
156-
SpanAnnotationORM.span_text,
173+
SpanAnnotationORM,
174+
SpanAnnotationORM.annotation_document_id
175+
== AnnotationDocumentORM.id,
157176
isouter=True,
158177
)
159178
query_builder._join_subquery(
160-
SpanAnnotationORM.code,
179+
SpanTextORM,
180+
SpanTextORM.id == SpanAnnotationORM.span_text_id,
161181
isouter=True,
162182
)
163183

0 commit comments

Comments
 (0)