
Commit b6491f6

refactored preprocessing pipeline structure and added word-level transcription import/export
1 parent 36b80a5 commit b6491f6

File tree: 66 files changed (913 additions, 1595 deletions)

backend/src/app/core/data/crawler/pipelines/clean_html_pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@


 from app.core.data.crawler.crawled_item import CrawledItem
-from app.preprocessing.pipeline.steps.text.clean_html import (
+from app.preprocessing.pipeline.steps.text.init.clean_html import (
     cleaning_with_readability_pipeline,
 )

backend/src/app/core/data/export/export_service.py

Lines changed: 13 additions & 15 deletions
@@ -175,7 +175,7 @@ def __write_exported_txt_to_temp_file(

     def __write_exported_json_to_temp_file(
         self,
-        exported_file: Dict[str, Any],
+        exported_file: Union[List[Dict[str, Any]], Dict[str, Any]],
         fn: Optional[str] = None,
     ) -> Path:
         temp_file = self.repo.create_temp_file(fn=fn)
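
Note: widening the parameter to a Union lets the same helper serialize both the per-document metadata dicts and the new transcript lists. A minimal sketch of such a writer, assuming plain json.dump semantics; the real method delegates temp-file creation to self.repo.create_temp_file, whose internals are not shown in this diff, and the temp_dir parameter here is illustrative:

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

def write_exported_json_to_temp_file(
    exported_file: Union[List[Dict[str, Any]], Dict[str, Any]],
    temp_dir: Path,
    fn: Optional[str] = None,
) -> Path:
    # json.dump accepts dicts and lists of dicts alike, so one
    # Union-typed parameter covers both metadata and transcript exports.
    temp_file = temp_dir / f"{fn or 'export'}.json"
    with temp_file.open("w") as f:
        json.dump(exported_file, f)
    return temp_file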
@@ -270,8 +270,8 @@ def __get_all_raw_sdocs_files_in_project_for_export(
     def __get_all_sdoc_transcripts_in_project_for_export(
         self, db: Session, project_id: int
-    ) -> List[Dict[str, str]]:
-        transcripts: List[Dict[str, Any]] = []
+    ) -> List[Tuple[str, List[Dict[str, Any]]]]:
+        transcripts: List[Tuple[str, List[Dict[str, Any]]]] = []
         sdocs = [
             SourceDocumentRead.model_validate(sdoc)
             for sdoc in crud_sdoc.read_by_project(db=db, proj_id=project_id)
@@ -284,14 +284,12 @@ def __get_all_sdoc_transcripts_in_project_for_export(
            assert (
                sdoc_data
            ), f"Expected sdoc data for id {sdoc.id} to exist, because sdocs exist."
-           if sdoc_data.token_time_starts is not None:
-               logger.info(f"Exporting transcript of file {sdoc.filename}")
-               transcripts.append(
-                   {
-                       "transcript": sdoc_data.content,
-                       "filename": sdoc.filename,
-                   }
+           wlt = sdoc_data.word_level_transcriptions
+           if wlt is not None:
+               logger.info(
+                   f"Exporting word_level_transcript of file {sdoc.filename}"
                )
+               transcripts.append((sdoc.filename, [x.model_dump() for x in wlt]))

        return transcripts

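Note: each list entry is now a (filename, word_dicts) tuple rather than a {"transcript", "filename"} dict, so the caller can write one JSON file per document. A sketch of how model_dump() produces the JSON-ready word list, using an assumed stand-in for WordLevelTranscription; the real model lives in app.core.data.dto.source_document_data and its exact fields are not visible in this diff:

from pydantic import BaseModel

class WordLevelTranscription(BaseModel):
    # assumed fields, for illustration only
    text: str
    start_ms: int
    end_ms: int

wlt = [
    WordLevelTranscription(text="hello", start_ms=0, end_ms=420),
    WordLevelTranscription(text="world", start_ms=430, end_ms=900),
]
# model_dump() turns each pydantic object into a plain dict, so the whole
# transcript serializes as a JSON array of word objects.
print([x.model_dump() for x in wlt])
# [{'text': 'hello', 'start_ms': 0, 'end_ms': 420},
#  {'text': 'world', 'start_ms': 430, 'end_ms': 900}]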
@@ -1068,10 +1066,10 @@ def _export_all_data_from_proj(
         exported_transcripts = self.__get_all_sdoc_transcripts_in_project_for_export(
             db=db, project_id=project_id
         )
-        for exported_transcript in exported_transcripts:
-            exported_file = self.__write_exported_txt_to_temp_file(
-                text=exported_transcript["transcript"],
-                fn=exported_transcript["filename"],
+        for filename, word_level_transcriptions in exported_transcripts:
+            exported_file = self.__write_exported_json_to_temp_file(
+                exported_file=word_level_transcriptions,
+                fn=filename + ".transcript",
             )
             exported_files.append(exported_file)

@@ -1082,7 +1080,7 @@ def _export_all_data_from_proj(
         for exported_sdoc_metadata in exported_sdocs_metadata:
             exported_file = self.__write_exported_json_to_temp_file(
                 exported_file=exported_sdoc_metadata,
-                fn=exported_sdoc_metadata["filename"],
+                fn=exported_sdoc_metadata["filename"] + ".metadata",
             )
             exported_files.append(exported_file)
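
Note: with both suffix changes in place, every per-document artifact in an export archive is distinguishable by a compound suffix instead of a bare extension. Assuming the JSON writer appends ".json" (consistent with the import patterns in import_service.py below), an exported document would yield files like the following; the filenames are illustrative:

interview.html               # the source document (html)
interview.metadata.json      # per-document metadata
interview.annotations.csv    # span / bbox annotations
interview.transcript.json    # word-level transcription (audio/video only)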

backend/src/app/core/data/import_/import_service.py

Lines changed: 64 additions & 38 deletions
@@ -52,6 +52,7 @@
     ProjectMetadataCreate,
     ProjectMetadataRead,
 )
+from app.core.data.dto.source_document_data import WordLevelTranscription
 from app.core.data.dto.source_document_link import SourceDocumentLinkCreate
 from app.core.data.orm.project import ProjectORM
 from app.core.data.repo.repo_service import (
@@ -76,8 +77,8 @@
     (r"user_\d+_logbook.md", "logbook", True),
     (r"user_\d+_memo.csv", "memo", True),
     (r"\w+.csv", "sdoc_annotations", False),
-    (r"\w+.txt", "sdoc_transcript", False),
-    (r"\w+.json", "sdoc_metadatas", False),
+    (r"\w+.transcript.json", "sdoc_transcript", False),
+    (r"\w+.metadata.json", "sdoc_metadatas", False),
 ]

 SDOC_FILE_TYPES = ["sdoc", "sdoc_annotations", "sdoc_metadatas"]
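
Note: the new patterns are more specific than the old bare \w+.txt / \w+.json, so transcript and metadata files no longer collide with other JSON content in the archive. How the service applies these patterns is not shown in this diff; a sketch assuming whole-name matching, with classify as an illustrative helper:

import re

# (pattern, file_type) pairs mirroring the table above (singleton flag omitted)
FILE_TYPE_PATTERNS = [
    (r"\w+.transcript.json", "sdoc_transcript"),
    (r"\w+.metadata.json", "sdoc_metadatas"),
    (r"\w+.csv", "sdoc_annotations"),
]

def classify(filename: str) -> str:
    # the more specific *.transcript.json / *.metadata.json patterns
    # must be tried before the catch-all ones
    for pattern, file_type in FILE_TYPE_PATTERNS:
        if re.fullmatch(pattern, filename):
            return file_type
    return "unknown"

print(classify("interview.transcript.json"))  # sdoc_transcript
print(classify("interview.metadata.json"))    # sdoc_metadatas
print(classify("memo.csv"))                   # sdoc_annotations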
@@ -672,12 +673,11 @@ def _import_project(
         sdoc_filepaths = {
             "sdoc_filename":{
                 "sdoc": filename.html,
-                "sdoc_metadatas": filename.json
-                "sdoc_annotations": filename.csv
+                "sdoc_metadatas": filename.metadata.json
+                "sdoc_annotations": filename.annotations.csv
+                "sdoc_word_level_transcription": filename.transcript.json
             }
         }
-
-
         """
         expected_file_paths, sdoc_filepaths = self.__read_import_project_files(
             temp_proj_path=path_to_temp_import_dir
@@ -776,6 +776,18 @@ def _import_project(
             }
             logger.info(f"Generate sdoc metadata {metadata}")

+            # import (optional) word level transcriptions
+            sdoc_wlt: Optional[List[WordLevelTranscription]] = None
+            if "sdoc_transcript" in sdoc_package:
+                sdoc_transcript_filepath = sdoc_package["sdoc_transcript"]
+                with open(sdoc_transcript_filepath, "r") as f:
+                    sdoc_wlt = [
+                        WordLevelTranscription.model_validate(x)
+                        for x in json.load(f)
+                    ]
+
+                logger.info(f"Generate word level transcription {sdoc_wlt}")
+
             # import sdoc tags
             for tag in sdoc_metadata["tags"]:
                 tags.append(tags_id_mapping[tag])
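
Note: the import side mirrors the export: the JSON array is parsed and every entry is validated back into a WordLevelTranscription. A runnable sketch using the same assumed stand-in model as in the export note above:

import json
from pydantic import BaseModel

class WordLevelTranscription(BaseModel):
    # assumed fields, for illustration only
    text: str
    start_ms: int
    end_ms: int

raw = '[{"text": "hello", "start_ms": 0, "end_ms": 420}]'
# model_validate raises a pydantic ValidationError on malformed entries,
# so a broken filename.transcript.json fails fast during import.
sdoc_wlt = [WordLevelTranscription.model_validate(x) for x in json.loads(raw)]
print(sdoc_wlt[0].text)  # hello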
@@ -785,39 +797,50 @@ def _import_project(
            sdoc_annotations_filepath = sdoc_package["sdoc_annotations"]
            sdoc_annotations_df = pd.read_csv(sdoc_annotations_filepath)
            logger.info(f"The doctype is {sdoc_doctype}")
-           if sdoc_doctype == DocType.text:
-               # create AutoSpans for NER
-               for _, row in sdoc_annotations_df.iterrows():
-                   if row["user_email"] in user_email_id_mapping:
-                       email: Optional[str] = (
-                           str(row["user_email"])
-                           if isinstance(row["user_email"], str)
-                           else None
+           # if sdoc_doctype == DocType.text:
+           # create AutoSpans for NER
+           for _, row in sdoc_annotations_df.iterrows():
+               if row["user_email"] in user_email_id_mapping:
+                   email: Optional[str] = (
+                       str(row["user_email"])
+                       if isinstance(row["user_email"], str)
+                       else None
+                   )
+                   if (
+                       email and bool(pd.notna(row["text"]))
+                   ):  # this should always be true because of if email in email_id_mapping
+                       auto = AutoSpan.model_validate(
+                           {
+                               "code": row["code_name"],
+                               "start": row["text_begin_char"],
+                               "end": row["text_end_char"],
+                               "text": row["text"],
+                               "start_token": row["text_begin_token"],
+                               "end_token": row["text_end_token"],
+                               "user_id": user_email_id_mapping[email],
+                           }
                        )
-                       if email:  # this should always be true because of if email in email_id_mapping
-                           auto = AutoSpan.model_validate(
-                               {
-                                   "code": row["code_name"],
-                                   "start": row["text_begin_char"],
-                                   "end": row["text_end_char"],
-                                   "text": row["text"],
-                                   "start_token": row["text_begin_token"],
-                                   "end_token": row["text_end_token"],
-                                   "user_id": user_email_id_mapping[email],
-                               }
-                           )
-                           annotations.add(auto)
-               logger.info(f"Generate sdoc annotations {annotations}")
-
-           elif sdoc_doctype == DocType.image:
+                       annotations.add(auto)
+           logger.info(f"Generate sdoc annotations {annotations}")
+
+           if sdoc_doctype == DocType.image:
                # create boundig boxes for object detection
                for _, row in sdoc_annotations_df.iterrows():
                    email: Optional[str] = (
                        str(row["user_email"])
                        if isinstance(row["user_email"], str)
                        else None
                    )
-                   if email:
+                   if (
+                       email
+                       and bool(pd.notna(row["bbox_x_min"]))
+                       and bool(pd.notna(row["bbox_y_min"]))
+                       and bool(pd.notna(row["bbox_x_max"]))
+                       and bool(pd.notna(row["bbox_y_max"]))
+                   ):
+                       logger.info(
+                           f"x_min {row['bbox_x_min']}, y_min: {row['bbox_y_min']}, x_max: {row['bbox_x_max']}, y_max: {row['bbox_y_max']}"
+                       )
                        bbox = AutoBBox.model_validate(
                            {
                                "code": row["code_name"],
@@ -853,6 +876,10 @@ def _import_project(
                "tags": tags,
                "sdoc_link": sdoc_link,
            }
+           if sdoc_wlt:
+               sdoc_specific_payloads[sdoc_filepath.name][
+                   "word_level_transcriptions"
+               ] = sdoc_wlt

        # 2. Create preprojob
        from app.preprocessing.preprocessing_service import PreprocessingService
@@ -968,18 +995,17 @@ def __read_import_project_files(self, temp_proj_path: Path) -> Tuple[Dict, Dict]
            "codes": project_codes.csv
            "sdoc_links": project_sdoc_links.csv
            "tags": project_tags.csv
-           "users": users.csv
+           "users": project_users.csv
        }
-       sdocs = {
+       // you definitely need this internally
+       sdoc_filepaths = {
            "sdoc_filename":{
                "sdoc": filename.html,
-               "sdoc_metadatas": filename.html.json
-               "sdoc_annotations": filename.html.csv
-               "sdoc_transcript":"filename.html.txt"
+               "sdoc_metadatas": filename.metadata.json
+               "sdoc_annotations": filename.annotations.csv
+               "sdoc_word_level_transcription": filename.transcript.json
            }
        }
-
-
        """

        expected_files: Dict = dict()
0 commit comments
