
Commit b6491f6

refactored preprocessing pipeline structure and added word-level transcription import/export
1 parent 36b80a5 commit b6491f6

File tree: 66 files changed (913 additions, 1595 deletions)

backend/src/app/core/data/crawler/pipelines/clean_html_pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@


 from app.core.data.crawler.crawled_item import CrawledItem
-from app.preprocessing.pipeline.steps.text.clean_html import (
+from app.preprocessing.pipeline.steps.text.init.clean_html import (
     cleaning_with_readability_pipeline,
 )

backend/src/app/core/data/export/export_service.py

Lines changed: 13 additions & 15 deletions
@@ -175,7 +175,7 @@ def __write_exported_txt_to_temp_file(

     def __write_exported_json_to_temp_file(
         self,
-        exported_file: Dict[str, Any],
+        exported_file: Union[List[Dict[str, Any]], Dict[str, Any]],
         fn: Optional[str] = None,
     ) -> Path:
         temp_file = self.repo.create_temp_file(fn=fn)
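
Note: widening the parameter to a Union lets the same helper serialize both the per-document metadata dicts and the new transcript lists. A minimal sketch of such a writer, assuming plain json.dump semantics; the real method delegates temp-file creation to self.repo.create_temp_file, whose internals are not shown in this diff, and the temp_dir parameter here is illustrative:

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

def write_exported_json_to_temp_file(
    exported_file: Union[List[Dict[str, Any]], Dict[str, Any]],
    temp_dir: Path,
    fn: Optional[str] = None,
) -> Path:
    # json.dump accepts dicts and lists of dicts alike, so one
    # Union-typed parameter covers both metadata and transcript exports.
    temp_file = temp_dir / f"{fn or 'export'}.json"
    with temp_file.open("w") as f:
        json.dump(exported_file, f)
    return temp_file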
@@ -270,8 +270,8 @@ def __get_all_raw_sdocs_files_in_project_for_export(
     def __get_all_sdoc_transcripts_in_project_for_export(
         self, db: Session, project_id: int
-    ) -> List[Dict[str, str]]:
-        transcripts: List[Dict[str, Any]] = []
+    ) -> List[Tuple[str, List[Dict[str, Any]]]]:
+        transcripts: List[Tuple[str, List[Dict[str, Any]]]] = []
         sdocs = [
             SourceDocumentRead.model_validate(sdoc)
             for sdoc in crud_sdoc.read_by_project(db=db, proj_id=project_id)
@@ -284,14 +284,12 @@ def __get_all_sdoc_transcripts_in_project_for_export(
            assert (
                sdoc_data
            ), f"Expected sdoc data for id {sdoc.id} to exist, because sdocs exist."
-           if sdoc_data.token_time_starts is not None:
-               logger.info(f"Exporting transcript of file {sdoc.filename}")
-               transcripts.append(
-                   {
-                       "transcript": sdoc_data.content,
-                       "filename": sdoc.filename,
-                   }
+           wlt = sdoc_data.word_level_transcriptions
+           if wlt is not None:
+               logger.info(
+                   f"Exporting word_level_transcript of file {sdoc.filename}"
                )
+               transcripts.append((sdoc.filename, [x.model_dump() for x in wlt]))

        return transcripts

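Note: each list entry is now a (filename, word_dicts) tuple rather than a {"transcript", "filename"} dict, so the caller can write one JSON file per document. A sketch of how model_dump() produces the JSON-ready word list, using an assumed stand-in for WordLevelTranscription; the real model lives in app.core.data.dto.source_document_data and its exact fields are not visible in this diff:

from pydantic import BaseModel

class WordLevelTranscription(BaseModel):
    # assumed fields, for illustration only
    text: str
    start_ms: int
    end_ms: int

wlt = [
    WordLevelTranscription(text="hello", start_ms=0, end_ms=420),
    WordLevelTranscription(text="world", start_ms=430, end_ms=900),
]
# model_dump() turns each pydantic object into a plain dict, so the whole
# transcript serializes as a JSON array of word objects.
print([x.model_dump() for x in wlt])
# [{'text': 'hello', 'start_ms': 0, 'end_ms': 420},
#  {'text': 'world', 'start_ms': 430, 'end_ms': 900}]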
@@ -1068,10 +1066,10 @@ def _export_all_data_from_proj(
         exported_transcripts = self.__get_all_sdoc_transcripts_in_project_for_export(
             db=db, project_id=project_id
         )
-        for exported_transcript in exported_transcripts:
-            exported_file = self.__write_exported_txt_to_temp_file(
-                text=exported_transcript["transcript"],
-                fn=exported_transcript["filename"],
+        for filename, word_level_transcriptions in exported_transcripts:
+            exported_file = self.__write_exported_json_to_temp_file(
+                exported_file=word_level_transcriptions,
+                fn=filename + ".transcript",
             )
             exported_files.append(exported_file)

@@ -1082,7 +1080,7 @@ def _export_all_data_from_proj(
         for exported_sdoc_metadata in exported_sdocs_metadata:
             exported_file = self.__write_exported_json_to_temp_file(
                 exported_file=exported_sdoc_metadata,
-                fn=exported_sdoc_metadata["filename"],
+                fn=exported_sdoc_metadata["filename"] + ".metadata",
             )
             exported_files.append(exported_file)
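
Note: with both suffix changes in place, every per-document artifact in an export archive is distinguishable by a compound suffix instead of a bare extension. Assuming the JSON writer appends ".json" (consistent with the import patterns in import_service.py below), an exported document would yield files like the following; the filenames are illustrative:

interview.html               # the source document (html)
interview.metadata.json      # per-document metadata
interview.annotations.csv    # span / bbox annotations
interview.transcript.json    # word-level transcription (audio/video only)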

backend/src/app/core/data/import_/import_service.py

Lines changed: 64 additions & 38 deletions
@@ -52,6 +52,7 @@
     ProjectMetadataCreate,
     ProjectMetadataRead,
 )
+from app.core.data.dto.source_document_data import WordLevelTranscription
 from app.core.data.dto.source_document_link import SourceDocumentLinkCreate
 from app.core.data.orm.project import ProjectORM
 from app.core.data.repo.repo_service import (
@@ -76,8 +77,8 @@
     (r"user_\d+_logbook.md", "logbook", True),
     (r"user_\d+_memo.csv", "memo", True),
     (r"\w+.csv", "sdoc_annotations", False),
-    (r"\w+.txt", "sdoc_transcript", False),
-    (r"\w+.json", "sdoc_metadatas", False),
+    (r"\w+.transcript.json", "sdoc_transcript", False),
+    (r"\w+.metadata.json", "sdoc_metadatas", False),
 ]

 SDOC_FILE_TYPES = ["sdoc", "sdoc_annotations", "sdoc_metadatas"]
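
Note: the new patterns are more specific than the old bare \w+.txt / \w+.json, so transcript and metadata files no longer collide with other JSON content in the archive. How the service applies these patterns is not shown in this diff; a sketch assuming whole-name matching, with classify as an illustrative helper:

import re

# (pattern, file_type) pairs mirroring the table above (singleton flag omitted)
FILE_TYPE_PATTERNS = [
    (r"\w+.transcript.json", "sdoc_transcript"),
    (r"\w+.metadata.json", "sdoc_metadatas"),
    (r"\w+.csv", "sdoc_annotations"),
]

def classify(filename: str) -> str:
    # the more specific *.transcript.json / *.metadata.json patterns
    # must be tried before the catch-all ones
    for pattern, file_type in FILE_TYPE_PATTERNS:
        if re.fullmatch(pattern, filename):
            return file_type
    return "unknown"

print(classify("interview.transcript.json"))  # sdoc_transcript
print(classify("interview.metadata.json"))    # sdoc_metadatas
print(classify("memo.csv"))                   # sdoc_annotations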
@@ -672,12 +673,11 @@ def _import_project(
         sdoc_filepaths = {
             "sdoc_filename":{
                 "sdoc": filename.html,
-                "sdoc_metadatas": filename.json
-                "sdoc_annotations": filename.csv
+                "sdoc_metadatas": filename.metadata.json
+                "sdoc_annotations": filename.annotations.csv
+                "sdoc_word_level_transcription": filename.transcript.json
             }
         }
-
-
         """
         expected_file_paths, sdoc_filepaths = self.__read_import_project_files(
             temp_proj_path=path_to_temp_import_dir
@@ -776,6 +776,18 @@ def _import_project(
             }
             logger.info(f"Generate sdoc metadata {metadata}")

+            # import (optional) word level transcriptions
+            sdoc_wlt: Optional[List[WordLevelTranscription]] = None
+            if "sdoc_transcript" in sdoc_package:
+                sdoc_transcript_filepath = sdoc_package["sdoc_transcript"]
+                with open(sdoc_transcript_filepath, "r") as f:
+                    sdoc_wlt = [
+                        WordLevelTranscription.model_validate(x)
+                        for x in json.load(f)
+                    ]
+
+                logger.info(f"Generate word level transcription {sdoc_wlt}")
+
             # import sdoc tags
             for tag in sdoc_metadata["tags"]:
                 tags.append(tags_id_mapping[tag])
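
Note: the import side mirrors the export: the JSON array is parsed and every entry is validated back into a WordLevelTranscription. A runnable sketch using the same assumed stand-in model as in the export note above:

import json
from pydantic import BaseModel

class WordLevelTranscription(BaseModel):
    # assumed fields, for illustration only
    text: str
    start_ms: int
    end_ms: int

raw = '[{"text": "hello", "start_ms": 0, "end_ms": 420}]'
# model_validate raises a pydantic ValidationError on malformed entries,
# so a broken filename.transcript.json fails fast during import.
sdoc_wlt = [WordLevelTranscription.model_validate(x) for x in json.loads(raw)]
print(sdoc_wlt[0].text)  # hello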
@@ -785,39 +797,50 @@ def _import_project(
            sdoc_annotations_filepath = sdoc_package["sdoc_annotations"]
            sdoc_annotations_df = pd.read_csv(sdoc_annotations_filepath)
            logger.info(f"The doctype is {sdoc_doctype}")
-           if sdoc_doctype == DocType.text:
-               # create AutoSpans for NER
-               for _, row in sdoc_annotations_df.iterrows():
-                   if row["user_email"] in user_email_id_mapping:
-                       email: Optional[str] = (
-                           str(row["user_email"])
-                           if isinstance(row["user_email"], str)
-                           else None
+           # if sdoc_doctype == DocType.text:
+           # create AutoSpans for NER
+           for _, row in sdoc_annotations_df.iterrows():
+               if row["user_email"] in user_email_id_mapping:
+                   email: Optional[str] = (
+                       str(row["user_email"])
+                       if isinstance(row["user_email"], str)
+                       else None
+                   )
+                   if (
+                       email and bool(pd.notna(row["text"]))
+                   ):  # this should always be true because of if email in email_id_mapping
+                       auto = AutoSpan.model_validate(
+                           {
+                               "code": row["code_name"],
+                               "start": row["text_begin_char"],
+                               "end": row["text_end_char"],
+                               "text": row["text"],
+                               "start_token": row["text_begin_token"],
+                               "end_token": row["text_end_token"],
+                               "user_id": user_email_id_mapping[email],
+                           }
                        )
-                       if email:  # this should always be true because of if email in email_id_mapping
-                           auto = AutoSpan.model_validate(
-                               {
-                                   "code": row["code_name"],
-                                   "start": row["text_begin_char"],
-                                   "end": row["text_end_char"],
-                                   "text": row["text"],
-                                   "start_token": row["text_begin_token"],
-                                   "end_token": row["text_end_token"],
-                                   "user_id": user_email_id_mapping[email],
-                               }
-                           )
-                           annotations.add(auto)
-               logger.info(f"Generate sdoc annotations {annotations}")
-
-           elif sdoc_doctype == DocType.image:
+                       annotations.add(auto)
+           logger.info(f"Generate sdoc annotations {annotations}")
+
+           if sdoc_doctype == DocType.image:
                # create boundig boxes for object detection
                for _, row in sdoc_annotations_df.iterrows():
                    email: Optional[str] = (
                        str(row["user_email"])
                        if isinstance(row["user_email"], str)
                        else None
                    )
-                   if email:
+                   if (
+                       email
+                       and bool(pd.notna(row["bbox_x_min"]))
+                       and bool(pd.notna(row["bbox_y_min"]))
+                       and bool(pd.notna(row["bbox_x_max"]))
+                       and bool(pd.notna(row["bbox_y_max"]))
+                   ):
+                       logger.info(
+                           f"x_min {row['bbox_x_min']}, y_min: {row['bbox_y_min']}, x_max: {row['bbox_x_max']}, y_max: {row['bbox_y_max']}"
+                       )
                        bbox = AutoBBox.model_validate(
                            {
                                "code": row["code_name"],
@@ -853,6 +876,10 @@ def _import_project(
                "tags": tags,
                "sdoc_link": sdoc_link,
            }
+           if sdoc_wlt:
+               sdoc_specific_payloads[sdoc_filepath.name][
+                   "word_level_transcriptions"
+               ] = sdoc_wlt

        # 2. Create preprojob
        from app.preprocessing.preprocessing_service import PreprocessingService
@@ -968,18 +995,17 @@ def __read_import_project_files(self, temp_proj_path: Path) -> Tuple[Dict, Dict]
            "codes": project_codes.csv
            "sdoc_links": project_sdoc_links.csv
            "tags": project_tags.csv
-           "users": users.csv
+           "users": project_users.csv
        }
-       sdocs = {
+       // you definitely need this internally
+       sdoc_filepaths = {
            "sdoc_filename":{
                "sdoc": filename.html,
-               "sdoc_metadatas": filename.html.json
-               "sdoc_annotations": filename.html.csv
-               "sdoc_transcript":"filename.html.txt"
+               "sdoc_metadatas": filename.metadata.json
+               "sdoc_annotations": filename.annotations.csv
+               "sdoc_word_level_transcription": filename.transcript.json
            }
        }
-
-
        """

        expected_files: Dict = dict()
0 commit comments
