5252 ProjectMetadataCreate ,
5353 ProjectMetadataRead ,
5454)
55+ from app .core .data .dto .source_document_data import WordLevelTranscription
5556from app .core .data .dto .source_document_link import SourceDocumentLinkCreate
5657from app .core .data .orm .project import ProjectORM
5758from app .core .data .repo .repo_service import (
7677 (r"user_\d+_logbook.md" , "logbook" , True ),
7778 (r"user_\d+_memo.csv" , "memo" , True ),
7879 (r"\w+.csv" , "sdoc_annotations" , False ),
79- (r"\w+.txt " , "sdoc_transcript" , False ),
80- (r"\w+.json" , "sdoc_metadatas" , False ),
80+ (r"\w+.transcript.json " , "sdoc_transcript" , False ),
81+ (r"\w+.metadata. json" , "sdoc_metadatas" , False ),
8182]
8283
8384SDOC_FILE_TYPES = ["sdoc" , "sdoc_annotations" , "sdoc_metadatas" ]
@@ -672,12 +673,11 @@ def _import_project(
672673 sdoc_filepaths = {
673674 "sdoc_filename":{
674675 "sdoc": filename.html,
675- "sdoc_metadatas": filename.json
676- "sdoc_annotations": filename.csv
676+ "sdoc_metadatas": filename.metadata.json
677+ "sdoc_annotations": filename.annotations.csv
678+ "sdoc_word_level_transcription": filename.transcript.json
677679 }
678680 }
679-
680-
681681 """
682682 expected_file_paths , sdoc_filepaths = self .__read_import_project_files (
683683 temp_proj_path = path_to_temp_import_dir
@@ -776,6 +776,18 @@ def _import_project(
776776 }
777777 logger .info (f"Generate sdoc metadata { metadata } " )
778778
779+ # import (optional) word level transcriptions
780+ sdoc_wlt : Optional [List [WordLevelTranscription ]] = None
781+ if "sdoc_transcript" in sdoc_package :
782+ sdoc_transcript_filepath = sdoc_package ["sdoc_transcript" ]
783+ with open (sdoc_transcript_filepath , "r" ) as f :
784+ sdoc_wlt = [
785+ WordLevelTranscription .model_validate (x )
786+ for x in json .load (f )
787+ ]
788+
789+ logger .info (f"Generate word level transcription { sdoc_wlt } " )
790+
779791 # import sdoc tags
780792 for tag in sdoc_metadata ["tags" ]:
781793 tags .append (tags_id_mapping [tag ])
@@ -785,39 +797,50 @@ def _import_project(
785797 sdoc_annotations_filepath = sdoc_package ["sdoc_annotations" ]
786798 sdoc_annotations_df = pd .read_csv (sdoc_annotations_filepath )
787799 logger .info (f"The doctype is { sdoc_doctype } " )
788- if sdoc_doctype == DocType .text :
789- # create AutoSpans for NER
790- for _ , row in sdoc_annotations_df .iterrows ():
791- if row ["user_email" ] in user_email_id_mapping :
792- email : Optional [str ] = (
793- str (row ["user_email" ])
794- if isinstance (row ["user_email" ], str )
795- else None
800+ # if sdoc_doctype == DocType.text:
801+ # create AutoSpans for NER
802+ for _ , row in sdoc_annotations_df .iterrows ():
803+ if row ["user_email" ] in user_email_id_mapping :
804+ email : Optional [str ] = (
805+ str (row ["user_email" ])
806+ if isinstance (row ["user_email" ], str )
807+ else None
808+ )
809+ if (
810+ email and bool (pd .notna (row ["text" ]))
811+ ): # this should always be true because of if email in email_id_mapping
812+ auto = AutoSpan .model_validate (
813+ {
814+ "code" : row ["code_name" ],
815+ "start" : row ["text_begin_char" ],
816+ "end" : row ["text_end_char" ],
817+ "text" : row ["text" ],
818+ "start_token" : row ["text_begin_token" ],
819+ "end_token" : row ["text_end_token" ],
820+ "user_id" : user_email_id_mapping [email ],
821+ }
796822 )
797- if email : # this should always be true because of if email in email_id_mapping
798- auto = AutoSpan .model_validate (
799- {
800- "code" : row ["code_name" ],
801- "start" : row ["text_begin_char" ],
802- "end" : row ["text_end_char" ],
803- "text" : row ["text" ],
804- "start_token" : row ["text_begin_token" ],
805- "end_token" : row ["text_end_token" ],
806- "user_id" : user_email_id_mapping [email ],
807- }
808- )
809- annotations .add (auto )
810- logger .info (f"Generate sdoc annotations { annotations } " )
811-
812- elif sdoc_doctype == DocType .image :
823+ annotations .add (auto )
824+ logger .info (f"Generate sdoc annotations { annotations } " )
825+
826+ if sdoc_doctype == DocType .image :
813827 # create bounding boxes for object detection
814828 for _ , row in sdoc_annotations_df .iterrows ():
815829 email : Optional [str ] = (
816830 str (row ["user_email" ])
817831 if isinstance (row ["user_email" ], str )
818832 else None
819833 )
820- if email :
834+ if (
835+ email
836+ and bool (pd .notna (row ["bbox_x_min" ]))
837+ and bool (pd .notna (row ["bbox_y_min" ]))
838+ and bool (pd .notna (row ["bbox_x_max" ]))
839+ and bool (pd .notna (row ["bbox_y_max" ]))
840+ ):
841+ logger .info (
842+ f"x_min { row ['bbox_x_min' ]} , y_min: { row ['bbox_y_min' ]} , x_max: { row ['bbox_x_max' ]} , y_max: { row ['bbox_y_max' ]} "
843+ )
821844 bbox = AutoBBox .model_validate (
822845 {
823846 "code" : row ["code_name" ],
@@ -853,6 +876,10 @@ def _import_project(
853876 "tags" : tags ,
854877 "sdoc_link" : sdoc_link ,
855878 }
879+ if sdoc_wlt :
880+ sdoc_specific_payloads [sdoc_filepath .name ][
881+ "word_level_transcriptions"
882+ ] = sdoc_wlt
856883
857884 # 2. Create preprojob
858885 from app .preprocessing .preprocessing_service import PreprocessingService
@@ -968,18 +995,17 @@ def __read_import_project_files(self, temp_proj_path: Path) -> Tuple[Dict, Dict]
968995 "codes": project_codes.csv
969996 "sdoc_links": project_sdoc_links.csv
970997 "tags": project_tags.csv
971- "users": users .csv
998+ "users": project_users .csv
972999 }
973- sdocs = {
1000+ // you definitely need this internally
1001+ sdoc_filepaths = {
9741002 "sdoc_filename":{
9751003 "sdoc": filename.html,
976- "sdoc_metadatas": filename.html .json
977- "sdoc_annotations": filename.html .csv
978- "sdoc_transcript":" filename.html.txt"
1004+ "sdoc_metadatas": filename.metadata .json
1005+ "sdoc_annotations": filename.annotations .csv
1006+ "sdoc_word_level_transcription": filename.transcript.json
9791007 }
9801008 }
981-
982-
9831009 """
9841010
9851011 expected_files : Dict = dict ()
0 commit comments