22from typing import Dict , List , NamedTuple , Tuple
33
44from loguru import logger
5- from sqlalchemy import ColumnElement
5+ from sqlalchemy import ColumnElement , and_
66from sqlalchemy .orm import Session
77
88from app .core .data .crud .annotation_document import crud_adoc
99from app .core .data .crud .code import crud_code
10+ from app .core .data .crud .project_metadata import crud_project_meta
1011from app .core .data .crud .source_document_job_status import crud_sdoc_job_status
1112from app .core .data .crud .span_annotation import crud_span_anno
1213from app .core .data .crud .span_group import crud_span_group
1314from app .core .data .crud .user import SYSTEM_USER_ID
15+ from app .core .data .doc_type import DocType
1416from app .core .data .dto .source_document_job_status import SourceDocumentJobStatusCreate
1517from app .core .data .dto .span_annotation import SpanAnnotationCreateIntern
1618from app .core .data .dto .span_group import SpanGroupCreateIntern
19+ from app .core .data .meta_type import MetaType
1720from app .core .data .orm .annotation_document import AnnotationDocumentORM
1821from app .core .data .orm .source_document_data import SourceDocumentDataORM
1922from app .core .data .orm .source_document_job_status import (
2023 JobStatus ,
2124 JobType ,
2225 SourceDocumentJobStatusORM ,
2326)
27+ from app .core .data .orm .source_document_metadata import SourceDocumentMetadataORM
2428from app .core .data .orm .span_annotation import SpanAnnotationORM
2529from app .core .db .sql_service import SQLService
2630from app .preprocessing .ray_model_service import RayModelService
@@ -68,12 +72,27 @@ def perform_quotation_detection(
6872 addr = self ._get_code_id (db , "ADDRESSEE" , project_id ),
6973 cue = self ._get_code_id (db , "CUE" , project_id ),
7074 )
75+ language_metadata = (
76+ crud_project_meta .read_by_project_and_key_and_metatype_and_doctype (
77+ db ,
78+ project_id ,
79+ "language" ,
80+ MetaType .STRING .value ,
81+ DocType .text .value ,
82+ )
83+ )
84+ if language_metadata is None :
85+ raise ValueError ("error with project, no language metadata available" )
7186
7287 total_processed = 0
7388 num_processed = - 1
7489 while num_processed != 0 :
7590 num_processed = self ._process_batch (
76- filter_criterion , project_id , codes , recompute
91+ filter_criterion ,
92+ project_id ,
93+ codes ,
94+ language_metadata .id ,
95+ recompute ,
7796 )
7897 total_processed = + num_processed
7998 return total_processed
@@ -83,20 +102,36 @@ def _process_batch(
83102 filter_criterion : ColumnElement ,
84103 project_id : int ,
85104 code : _CodeQuoteId ,
105+ language_metadata_id : int ,
86106 recompute : bool = False ,
87107 ):
88108 with self .sqls .db_session () as db :
89109 query = (
90110 db .query (SourceDocumentDataORM )
111+ .join (
112+ SourceDocumentMetadataORM ,
113+ SourceDocumentMetadataORM .source_document_id
114+ == SourceDocumentDataORM .id ,
115+ )
91116 .outerjoin (
92117 SourceDocumentJobStatusORM ,
93- SourceDocumentJobStatusORM .id == SourceDocumentDataORM .id ,
118+ and_ (
119+ SourceDocumentJobStatusORM .id == SourceDocumentDataORM .id ,
120+ SourceDocumentJobStatusORM .type
121+ == JobType .QUOTATION_ATTRIBUTION ,
122+ ),
94123 full = True ,
95124 )
96125 .filter (filter_criterion )
126+ .filter (
127+ SourceDocumentMetadataORM .project_metadata_id
128+ == language_metadata_id ,
129+ SourceDocumentMetadataORM .str_value == "de" ,
130+ )
97131 .limit (10 )
98132 )
99133 sdoc_data = query .all ()
134+ sdoc_data = [doc for doc in sdoc_data if doc is not None ]
100135 num_docs = len (sdoc_data )
101136
102137 if num_docs == 0 :
0 commit comments