Skip to content

Commit d5bbf7d

Browse files
authored
Merge pull request #478 from uhh-lt/transcript_bound_to_audio_video
Transcript bound to audio video
2 parents 2f37ebf + 6bc701b commit d5bbf7d

20 files changed

+468
-245
lines changed

backend/src/app/core/data/crud/document_tag.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict, List
1+
from typing import Dict, List, Optional
22

33
from sqlalchemy import delete, func, select
44
from sqlalchemy.orm import Session

backend/src/app/core/data/crud/project_metadata.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from app.core.data.crud.crud_base import CRUDBase
77
from app.core.data.crud.source_document_metadata import crud_sdoc_meta
8+
from app.core.data.doc_type import DocType
89
from app.core.data.dto.project_metadata import (
910
ProjectMetadataCreate,
1011
ProjectMetadataUpdate,

backend/src/app/preprocessing/pipeline/__init__.py

Lines changed: 157 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
def build_text_pipeline(
99
is_init: bool = True,
1010
) -> PreprocessingPipeline:
11+
from app.preprocessing.pipeline.steps.common.detect_content_language import (
12+
detect_content_language,
13+
)
1114
from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
1215
remove_erroneous_or_unfinished_sdocs,
1316
)
@@ -27,9 +30,6 @@ def build_text_pipeline(
2730
from app.preprocessing.pipeline.steps.text.create_pptd import (
2831
create_pptd,
2932
)
30-
from app.preprocessing.pipeline.steps.text.detect_content_language import (
31-
detect_content_language,
32-
)
3333
from app.preprocessing.pipeline.steps.text.extract_content_in_html_from_raw_text_docs import (
3434
extract_content_in_html_from_raw_text_docs,
3535
)
@@ -210,7 +210,7 @@ def build_image_pipeline(
210210
run_object_detection,
211211
)
212212
from app.preprocessing.pipeline.steps.image.store_metadata_to_database import (
213-
store_metadata_to_database,
213+
store_metadata_and_data_to_database,
214214
)
215215
from app.preprocessing.pipeline.steps.image.write_ppid_to_database import (
216216
write_ppid_to_database,
@@ -277,7 +277,7 @@ def build_image_pipeline(
277277

278278
pipeline.register_step(
279279
func=create_pptd_from_caption,
280-
required_data=["ppid", "sdoc_id"],
280+
required_data=["ppid"],
281281
)
282282

283283
# run caption through spacy and add to elasticsearch to make it searchable
@@ -307,7 +307,7 @@ def build_image_pipeline(
307307
)
308308

309309
pipeline.register_step(
310-
func=store_metadata_to_database,
310+
func=store_metadata_and_data_to_database,
311311
required_data=[
312312
"pptd",
313313
"ppid",
@@ -334,9 +334,6 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
334334
# we need to import the steps here to avoid loading models at startup
335335
# in the api worker!
336336
from app.preprocessing.pipeline.steps.audio.convert_to_pcm import convert_to_pcm
337-
from app.preprocessing.pipeline.steps.audio.create_and_store_transcript_file import (
338-
create_and_store_transcript_file,
339-
)
340337
from app.preprocessing.pipeline.steps.audio.create_ffmpeg_probe_audio_metadata import (
341338
create_ffmpeg_probe_audio_metadata,
342339
)
@@ -350,9 +347,15 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
350347
from app.preprocessing.pipeline.steps.audio.generate_webp_thumbnail_for_audio import (
351348
generate_webp_thumbnail_for_audio,
352349
)
350+
from app.preprocessing.pipeline.steps.audio.store_metadata_to_database import (
351+
store_metadata_and_data_to_database,
352+
)
353353
from app.preprocessing.pipeline.steps.audio.write_ppad_to_database import (
354354
write_ppad_to_database,
355355
)
356+
from app.preprocessing.pipeline.steps.common.detect_content_language import (
357+
detect_content_language,
358+
)
356359
from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
357360
remove_erroneous_or_unfinished_sdocs,
358361
)
@@ -362,8 +365,22 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
362365
from app.preprocessing.pipeline.steps.common.update_sdoc_status_to_finish import (
363366
update_sdoc_status_to_finish,
364367
)
368+
from app.preprocessing.pipeline.steps.text.generate_keywords import (
369+
generate_keywords,
370+
)
371+
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
372+
generate_sentence_annotations,
373+
)
374+
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
375+
generate_word_frequncies,
376+
)
377+
from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
378+
run_spacy_pipeline,
379+
)
380+
from app.preprocessing.pipeline.steps.text.store_document_in_elasticsearch import (
381+
store_document_in_elasticsearch,
382+
)
365383

366-
text_pipeline = build_text_pipeline()
367384
pipeline = PreprocessingPipeline(doc_type=DocType.audio)
368385

369386
pipeline.register_step(
@@ -392,17 +409,45 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
392409
)
393410

394411
pipeline.register_step(
395-
func=create_and_store_transcript_file,
412+
func=write_ppad_to_database,
396413
required_data=["ppad"],
397414
)
398415

416+
# instead create pptd before and now add it as metadata
399417
pipeline.register_step(
400418
func=create_pptd_from_transcription,
401419
required_data=["ppad"],
402420
)
403-
pipeline.join_pipeline(
404-
pipeline=text_pipeline,
405-
skip_steps_with_name=["create_pptd"],
421+
422+
pipeline.register_step(
423+
func=detect_content_language,
424+
required_data=["pptd"],
425+
)
426+
427+
# run caption through spacy and add to elasticsearch to make it searchable
428+
pipeline.register_step(
429+
func=run_spacy_pipeline,
430+
required_data=["pptd"],
431+
)
432+
433+
pipeline.register_step(
434+
func=generate_word_frequncies,
435+
required_data=["pptd"],
436+
)
437+
438+
pipeline.register_step(
439+
func=generate_keywords,
440+
required_data=["pptd"],
441+
)
442+
443+
pipeline.register_step(
444+
func=generate_sentence_annotations,
445+
required_data=["pptd"],
446+
)
447+
448+
pipeline.register_step(
449+
func=store_document_in_elasticsearch,
450+
required_data=["pptd", "sdoc_id"],
406451
)
407452

408453
pipeline.register_step(
@@ -413,8 +458,12 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
413458
)
414459

415460
pipeline.register_step(
416-
func=write_ppad_to_database,
417-
required_data=["ppad"],
461+
func=store_metadata_and_data_to_database,
462+
required_data=[
463+
"pptd",
464+
"ppad",
465+
"sdoc_id",
466+
],
418467
)
419468

420469
pipeline.register_step(
@@ -433,6 +482,19 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
433482

434483
@lru_cache(maxsize=1)
435484
def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
485+
from app.preprocessing.pipeline.steps.audio.convert_to_pcm import convert_to_pcm
486+
from app.preprocessing.pipeline.steps.audio.create_ffmpeg_probe_audio_metadata import (
487+
create_ffmpeg_probe_audio_metadata,
488+
)
489+
from app.preprocessing.pipeline.steps.audio.create_pptd_from_transcription import (
490+
create_pptd_from_transcription,
491+
)
492+
from app.preprocessing.pipeline.steps.audio.generate_automatic_transcription import (
493+
generate_automatic_transcription,
494+
)
495+
from app.preprocessing.pipeline.steps.common.detect_content_language import (
496+
detect_content_language,
497+
)
436498
from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
437499
remove_erroneous_or_unfinished_sdocs,
438500
)
@@ -442,8 +504,20 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
442504
from app.preprocessing.pipeline.steps.common.update_sdoc_status_to_finish import (
443505
update_sdoc_status_to_finish,
444506
)
445-
from app.preprocessing.pipeline.steps.video.add_word_level_transcriptions_to_ppvd_metadata import (
446-
add_word_level_transcriptions_to_ppvd_metadata,
507+
from app.preprocessing.pipeline.steps.text.generate_keywords import (
508+
generate_keywords,
509+
)
510+
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
511+
generate_sentence_annotations,
512+
)
513+
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
514+
generate_word_frequncies,
515+
)
516+
from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
517+
run_spacy_pipeline,
518+
)
519+
from app.preprocessing.pipeline.steps.text.store_document_in_elasticsearch import (
520+
store_document_in_elasticsearch,
447521
)
448522
from app.preprocessing.pipeline.steps.video.create_and_store_audio_stream_file import (
449523
create_and_store_audio_stream_file,
@@ -458,11 +532,13 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
458532
from app.preprocessing.pipeline.steps.video.generate_webp_thumbnail_for_video import (
459533
generate_webp_thumbnail_for_video,
460534
)
535+
from app.preprocessing.pipeline.steps.video.store_metadata_to_database import (
536+
store_metadata_and_data_to_database,
537+
)
461538
from app.preprocessing.pipeline.steps.video.write_ppvd_to_database import (
462539
write_ppvd_to_database,
463540
)
464541

465-
audio_pipeline = build_audio_pipeline()
466542
pipeline = PreprocessingPipeline(doc_type=DocType.video)
467543

468544
pipeline.register_step(
@@ -490,14 +566,71 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
490566
required_data=["ppvd"],
491567
)
492568

493-
pipeline.join_pipeline(
494-
pipeline=audio_pipeline,
495-
skip_steps_with_name=["create_ppad"],
569+
pipeline.register_step(
570+
func=write_ppvd_to_database,
571+
required_data=["ppvd"],
572+
)
573+
574+
pipeline.register_step(
575+
func=create_ffmpeg_probe_audio_metadata,
576+
required_data=["ppad"],
577+
)
578+
579+
pipeline.register_step(
580+
func=convert_to_pcm,
581+
required_data=["ppad"],
582+
)
583+
584+
pipeline.register_step(
585+
func=generate_automatic_transcription,
586+
required_data=["ppad"],
587+
)
588+
589+
# instead create pptd before and now add it as metadata
590+
pipeline.register_step(
591+
func=create_pptd_from_transcription,
592+
required_data=["ppad"],
593+
)
594+
595+
pipeline.register_step(
596+
func=detect_content_language,
597+
required_data=["pptd"],
598+
)
599+
600+
# run caption through spacy and add to elasticsearch to make it searchable
601+
pipeline.register_step(
602+
func=run_spacy_pipeline,
603+
required_data=["pptd"],
604+
)
605+
606+
pipeline.register_step(
607+
func=generate_word_frequncies,
608+
required_data=["pptd"],
609+
)
610+
611+
pipeline.register_step(
612+
func=generate_keywords,
613+
required_data=["pptd"],
496614
)
497615

498616
pipeline.register_step(
499-
func=add_word_level_transcriptions_to_ppvd_metadata,
500-
required_data=["ppvd", "ppad"],
617+
func=generate_sentence_annotations,
618+
required_data=["pptd"],
619+
)
620+
621+
pipeline.register_step(
622+
func=store_document_in_elasticsearch,
623+
required_data=["pptd", "sdoc_id"],
624+
)
625+
626+
pipeline.register_step(
627+
func=store_metadata_and_data_to_database,
628+
required_data=[
629+
"pptd",
630+
"ppad",
631+
"ppvd",
632+
"sdoc_id",
633+
],
501634
)
502635

503636
pipeline.register_step(
@@ -507,11 +640,6 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
507640
func=remove_erroneous_or_unfinished_sdocs,
508641
)
509642

510-
pipeline.register_step(
511-
func=write_ppvd_to_database,
512-
required_data=["ppvd"],
513-
)
514-
515643
pipeline.register_step(
516644
func=resolve_sdoc_links,
517645
)

backend/src/app/preprocessing/pipeline/steps/audio/create_and_store_transcript_file.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

backend/src/app/preprocessing/pipeline/steps/audio/create_pptd_from_transcription.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
1+
from pathlib import Path
2+
13
from app.preprocessing.pipeline.model.audio.preproaudiodoc import PreProAudioDoc
24
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
35
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
46

57

68
def create_pptd_from_transcription(cargo: PipelineCargo) -> PipelineCargo:
79
ppad: PreProAudioDoc = cargo.data["ppad"]
8-
9-
if not ppad.transcript_filepath.exists():
10-
raise FileNotFoundError(
11-
f"The transcription file {ppad.transcript_filepath} "
12-
f"for {cargo.ppj_payload.filename} does not exist!"
13-
)
14-
10+
ppad.metadata["transcription"] = " ".join(
11+
[a.text for a in ppad.word_level_transcriptions]
12+
)
1513
pptd = PreProTextDoc(
16-
filename=ppad.transcript_filepath.name,
17-
filepath=ppad.transcript_filepath,
18-
project_id=cargo.ppj_payload.project_id,
14+
filepath=Path("/this/is/a/fake_path.txt"),
15+
filename="fake_path.txt",
16+
project_id=ppad.project_id,
17+
text=ppad.metadata["transcription"],
18+
html=f"<html><body><p>{ppad.metadata['transcription']}</p></body></html>",
19+
metadata={"language": "en"},
1920
mime_type="text/plain",
2021
)
2122

0 commit comments

Comments
 (0)