88def build_text_pipeline (
99 is_init : bool = True ,
1010) -> PreprocessingPipeline :
11+ from app .preprocessing .pipeline .steps .common .detect_content_language import (
12+ detect_content_language ,
13+ )
1114 from app .preprocessing .pipeline .steps .common .remove_erroneous_sdoc import (
1215 remove_erroneous_or_unfinished_sdocs ,
1316 )
@@ -27,9 +30,6 @@ def build_text_pipeline(
2730 from app .preprocessing .pipeline .steps .text .create_pptd import (
2831 create_pptd ,
2932 )
30- from app .preprocessing .pipeline .steps .text .detect_content_language import (
31- detect_content_language ,
32- )
3333 from app .preprocessing .pipeline .steps .text .extract_content_in_html_from_raw_text_docs import (
3434 extract_content_in_html_from_raw_text_docs ,
3535 )
@@ -210,7 +210,7 @@ def build_image_pipeline(
210210 run_object_detection ,
211211 )
212212 from app .preprocessing .pipeline .steps .image .store_metadata_to_database import (
213- store_metadata_to_database ,
213+ store_metadata_and_data_to_database ,
214214 )
215215 from app .preprocessing .pipeline .steps .image .write_ppid_to_database import (
216216 write_ppid_to_database ,
@@ -277,7 +277,7 @@ def build_image_pipeline(
277277
278278 pipeline .register_step (
279279 func = create_pptd_from_caption ,
280- required_data = ["ppid" , "sdoc_id" ],
280+ required_data = ["ppid" ],
281281 )
282282
283283 # run caption through spacy and add to elasticsearch to make it searchable
@@ -307,7 +307,7 @@ def build_image_pipeline(
307307 )
308308
309309 pipeline .register_step (
310- func = store_metadata_to_database ,
310+ func = store_metadata_and_data_to_database ,
311311 required_data = [
312312 "pptd" ,
313313 "ppid" ,
@@ -348,11 +348,14 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
348348 generate_webp_thumbnail_for_audio ,
349349 )
350350 from app .preprocessing .pipeline .steps .audio .store_metadata_to_database import (
351- store_metadata_to_database ,
351+ store_metadata_and_data_to_database ,
352352 )
353353 from app .preprocessing .pipeline .steps .audio .write_ppad_to_database import (
354354 write_ppad_to_database ,
355355 )
356+ from app .preprocessing .pipeline .steps .common .detect_content_language import (
357+ detect_content_language ,
358+ )
356359 from app .preprocessing .pipeline .steps .common .remove_erroneous_sdoc import (
357360 remove_erroneous_or_unfinished_sdocs ,
358361 )
@@ -362,9 +365,6 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
362365 from app .preprocessing .pipeline .steps .common .update_sdoc_status_to_finish import (
363366 update_sdoc_status_to_finish ,
364367 )
365- from app .preprocessing .pipeline .steps .text .detect_content_language import (
366- detect_content_language ,
367- )
368368 from app .preprocessing .pipeline .steps .text .generate_keywords import (
369369 generate_keywords ,
370370 )
@@ -458,7 +458,7 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
458458 )
459459
460460 pipeline .register_step (
461- func = store_metadata_to_database ,
461+ func = store_metadata_and_data_to_database ,
462462 required_data = [
463463 "pptd" ,
464464 "ppad" ,
@@ -482,6 +482,19 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
482482
483483@lru_cache (maxsize = 1 )
484484def build_video_pipeline (foo : str = "bar" ) -> PreprocessingPipeline :
485+ from app .preprocessing .pipeline .steps .audio .convert_to_pcm import convert_to_pcm
486+ from app .preprocessing .pipeline .steps .audio .create_ffmpeg_probe_audio_metadata import (
487+ create_ffmpeg_probe_audio_metadata ,
488+ )
489+ from app .preprocessing .pipeline .steps .audio .create_pptd_from_transcription import (
490+ create_pptd_from_transcription ,
491+ )
492+ from app .preprocessing .pipeline .steps .audio .generate_automatic_transcription import (
493+ generate_automatic_transcription ,
494+ )
495+ from app .preprocessing .pipeline .steps .common .detect_content_language import (
496+ detect_content_language ,
497+ )
485498 from app .preprocessing .pipeline .steps .common .remove_erroneous_sdoc import (
486499 remove_erroneous_or_unfinished_sdocs ,
487500 )
@@ -491,8 +504,20 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
491504 from app .preprocessing .pipeline .steps .common .update_sdoc_status_to_finish import (
492505 update_sdoc_status_to_finish ,
493506 )
494- from app .preprocessing .pipeline .steps .video .add_word_level_transcriptions_to_ppvd_metadata import (
495- add_word_level_transcriptions_to_ppvd_metadata ,
507+ from app .preprocessing .pipeline .steps .text .generate_keywords import (
508+ generate_keywords ,
509+ )
510+ from app .preprocessing .pipeline .steps .text .generate_sentence_annotations import (
511+ generate_sentence_annotations ,
512+ )
513+ from app .preprocessing .pipeline .steps .text .generate_word_frequencies import (
514+ generate_word_frequncies ,
515+ )
516+ from app .preprocessing .pipeline .steps .text .run_spacy_pipeline import (
517+ run_spacy_pipeline ,
518+ )
519+ from app .preprocessing .pipeline .steps .text .store_document_in_elasticsearch import (
520+ store_document_in_elasticsearch ,
496521 )
497522 from app .preprocessing .pipeline .steps .video .create_and_store_audio_stream_file import (
498523 create_and_store_audio_stream_file ,
@@ -507,11 +532,13 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
507532 from app .preprocessing .pipeline .steps .video .generate_webp_thumbnail_for_video import (
508533 generate_webp_thumbnail_for_video ,
509534 )
535+ from app .preprocessing .pipeline .steps .video .store_metadata_to_database import (
536+ store_metadata_and_data_to_database ,
537+ )
510538 from app .preprocessing .pipeline .steps .video .write_ppvd_to_database import (
511539 write_ppvd_to_database ,
512540 )
513541
514- audio_pipeline = build_audio_pipeline ()
515542 pipeline = PreprocessingPipeline (doc_type = DocType .video )
516543
517544 pipeline .register_step (
@@ -539,14 +566,71 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
539566 required_data = ["ppvd" ],
540567 )
541568
542- pipeline .join_pipeline (
543- pipeline = audio_pipeline ,
544- skip_steps_with_name = ["create_ppad" ],
569+ pipeline .register_step (
570+ func = write_ppvd_to_database ,
571+ required_data = ["ppvd" ],
572+ )
573+
574+ pipeline .register_step (
575+ func = create_ffmpeg_probe_audio_metadata ,
576+ required_data = ["ppad" ],
577+ )
578+
579+ pipeline .register_step (
580+ func = convert_to_pcm ,
581+ required_data = ["ppad" ],
582+ )
583+
584+ pipeline .register_step (
585+ func = generate_automatic_transcription ,
586+ required_data = ["ppad" ],
587+ )
588+
589+ # instead create pptd before and now add it as metadata
590+ pipeline .register_step (
591+ func = create_pptd_from_transcription ,
592+ required_data = ["ppad" ],
593+ )
594+
595+ pipeline .register_step (
596+ func = detect_content_language ,
597+ required_data = ["pptd" ],
598+ )
599+
600+ # run transcription through spacy and add to elasticsearch to make it searchable
601+ pipeline .register_step (
602+ func = run_spacy_pipeline ,
603+ required_data = ["pptd" ],
545604 )
546605
547606 pipeline .register_step (
548- func = add_word_level_transcriptions_to_ppvd_metadata ,
549- required_data = ["ppvd" , "ppad" ],
607+ func = generate_word_frequncies ,
608+ required_data = ["pptd" ],
609+ )
610+
611+ pipeline .register_step (
612+ func = generate_keywords ,
613+ required_data = ["pptd" ],
614+ )
615+
616+ pipeline .register_step (
617+ func = generate_sentence_annotations ,
618+ required_data = ["pptd" ],
619+ )
620+
621+ pipeline .register_step (
622+ func = store_document_in_elasticsearch ,
623+ required_data = ["pptd" , "sdoc_id" ],
624+ )
625+
626+ pipeline .register_step (
627+ func = store_metadata_and_data_to_database ,
628+ required_data = [
629+ "pptd" ,
630+ "ppad" ,
631+ "ppvd" ,
632+ "sdoc_id" ,
633+ ],
550634 )
551635
552636 pipeline .register_step (
@@ -556,11 +640,6 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
556640 func = remove_erroneous_or_unfinished_sdocs ,
557641 )
558642
559- pipeline .register_step (
560- func = write_ppvd_to_database ,
561- required_data = ["ppvd" ],
562- )
563-
564643 pipeline .register_step (
565644 func = resolve_sdoc_links ,
566645 )
0 commit comments