88def build_text_pipeline (
99 is_init : bool = True ,
1010) -> PreprocessingPipeline :
11+ from app .preprocessing .pipeline .steps .common .detect_content_language import (
12+ detect_content_language ,
13+ )
1114 from app .preprocessing .pipeline .steps .common .remove_erroneous_sdoc import (
1215 remove_erroneous_or_unfinished_sdocs ,
1316 )
@@ -27,9 +30,6 @@ def build_text_pipeline(
2730 from app .preprocessing .pipeline .steps .text .create_pptd import (
2831 create_pptd ,
2932 )
30- from app .preprocessing .pipeline .steps .text .detect_content_language import (
31- detect_content_language ,
32- )
3333 from app .preprocessing .pipeline .steps .text .extract_content_in_html_from_raw_text_docs import (
3434 extract_content_in_html_from_raw_text_docs ,
3535 )
@@ -210,7 +210,7 @@ def build_image_pipeline(
210210 run_object_detection ,
211211 )
212212 from app .preprocessing .pipeline .steps .image .store_metadata_to_database import (
213- store_metadata_to_database ,
213+ store_metadata_and_data_to_database ,
214214 )
215215 from app .preprocessing .pipeline .steps .image .write_ppid_to_database import (
216216 write_ppid_to_database ,
@@ -277,7 +277,7 @@ def build_image_pipeline(
277277
278278 pipeline .register_step (
279279 func = create_pptd_from_caption ,
280- required_data = ["ppid" , "sdoc_id" ],
280+ required_data = ["ppid" ],
281281 )
282282
283283 # run caption through spacy and add to elasticsearch to make it searchable
@@ -307,7 +307,7 @@ def build_image_pipeline(
307307 )
308308
309309 pipeline .register_step (
310- func = store_metadata_to_database ,
310+ func = store_metadata_and_data_to_database ,
311311 required_data = [
312312 "pptd" ,
313313 "ppid" ,
@@ -334,9 +334,6 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
334334 # we need to import the steps here to avoid loading models at startup
335335 # in the api worker!
336336 from app .preprocessing .pipeline .steps .audio .convert_to_pcm import convert_to_pcm
337- from app .preprocessing .pipeline .steps .audio .create_and_store_transcript_file import (
338- create_and_store_transcript_file ,
339- )
340337 from app .preprocessing .pipeline .steps .audio .create_ffmpeg_probe_audio_metadata import (
341338 create_ffmpeg_probe_audio_metadata ,
342339 )
@@ -350,9 +347,15 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
350347 from app .preprocessing .pipeline .steps .audio .generate_webp_thumbnail_for_audio import (
351348 generate_webp_thumbnail_for_audio ,
352349 )
350+ from app .preprocessing .pipeline .steps .audio .store_metadata_to_database import (
351+ store_metadata_and_data_to_database ,
352+ )
353353 from app .preprocessing .pipeline .steps .audio .write_ppad_to_database import (
354354 write_ppad_to_database ,
355355 )
356+ from app .preprocessing .pipeline .steps .common .detect_content_language import (
357+ detect_content_language ,
358+ )
356359 from app .preprocessing .pipeline .steps .common .remove_erroneous_sdoc import (
357360 remove_erroneous_or_unfinished_sdocs ,
358361 )
@@ -362,8 +365,22 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
362365 from app .preprocessing .pipeline .steps .common .update_sdoc_status_to_finish import (
363366 update_sdoc_status_to_finish ,
364367 )
368+ from app .preprocessing .pipeline .steps .text .generate_keywords import (
369+ generate_keywords ,
370+ )
371+ from app .preprocessing .pipeline .steps .text .generate_sentence_annotations import (
372+ generate_sentence_annotations ,
373+ )
374+ from app .preprocessing .pipeline .steps .text .generate_word_frequencies import (
375+ generate_word_frequncies ,
376+ )
377+ from app .preprocessing .pipeline .steps .text .run_spacy_pipeline import (
378+ run_spacy_pipeline ,
379+ )
380+ from app .preprocessing .pipeline .steps .text .store_document_in_elasticsearch import (
381+ store_document_in_elasticsearch ,
382+ )
365383
366- text_pipeline = build_text_pipeline ()
367384 pipeline = PreprocessingPipeline (doc_type = DocType .audio )
368385
369386 pipeline .register_step (
@@ -392,17 +409,45 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
392409 )
393410
394411 pipeline .register_step (
395- func = create_and_store_transcript_file ,
412+ func = write_ppad_to_database ,
396413 required_data = ["ppad" ],
397414 )
398415
416+ # instead create pptd before and now add it as metadata
399417 pipeline .register_step (
400418 func = create_pptd_from_transcription ,
401419 required_data = ["ppad" ],
402420 )
403- pipeline .join_pipeline (
404- pipeline = text_pipeline ,
405- skip_steps_with_name = ["create_pptd" ],
421+
422+ pipeline .register_step (
423+ func = detect_content_language ,
424+ required_data = ["pptd" ],
425+ )
426+
427+ # run caption through spacy and add to elasticsearch to make it searchable
428+ pipeline .register_step (
429+ func = run_spacy_pipeline ,
430+ required_data = ["pptd" ],
431+ )
432+
433+ pipeline .register_step (
434+ func = generate_word_frequncies ,
435+ required_data = ["pptd" ],
436+ )
437+
438+ pipeline .register_step (
439+ func = generate_keywords ,
440+ required_data = ["pptd" ],
441+ )
442+
443+ pipeline .register_step (
444+ func = generate_sentence_annotations ,
445+ required_data = ["pptd" ],
446+ )
447+
448+ pipeline .register_step (
449+ func = store_document_in_elasticsearch ,
450+ required_data = ["pptd" , "sdoc_id" ],
406451 )
407452
408453 pipeline .register_step (
@@ -413,8 +458,12 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
413458 )
414459
415460 pipeline .register_step (
416- func = write_ppad_to_database ,
417- required_data = ["ppad" ],
461+ func = store_metadata_and_data_to_database ,
462+ required_data = [
463+ "pptd" ,
464+ "ppad" ,
465+ "sdoc_id" ,
466+ ],
418467 )
419468
420469 pipeline .register_step (
@@ -433,6 +482,19 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
433482
434483@lru_cache (maxsize = 1 )
435484def build_video_pipeline (foo : str = "bar" ) -> PreprocessingPipeline :
485+ from app .preprocessing .pipeline .steps .audio .convert_to_pcm import convert_to_pcm
486+ from app .preprocessing .pipeline .steps .audio .create_ffmpeg_probe_audio_metadata import (
487+ create_ffmpeg_probe_audio_metadata ,
488+ )
489+ from app .preprocessing .pipeline .steps .audio .create_pptd_from_transcription import (
490+ create_pptd_from_transcription ,
491+ )
492+ from app .preprocessing .pipeline .steps .audio .generate_automatic_transcription import (
493+ generate_automatic_transcription ,
494+ )
495+ from app .preprocessing .pipeline .steps .common .detect_content_language import (
496+ detect_content_language ,
497+ )
436498 from app .preprocessing .pipeline .steps .common .remove_erroneous_sdoc import (
437499 remove_erroneous_or_unfinished_sdocs ,
438500 )
@@ -442,8 +504,20 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
442504 from app .preprocessing .pipeline .steps .common .update_sdoc_status_to_finish import (
443505 update_sdoc_status_to_finish ,
444506 )
445- from app .preprocessing .pipeline .steps .video .add_word_level_transcriptions_to_ppvd_metadata import (
446- add_word_level_transcriptions_to_ppvd_metadata ,
507+ from app .preprocessing .pipeline .steps .text .generate_keywords import (
508+ generate_keywords ,
509+ )
510+ from app .preprocessing .pipeline .steps .text .generate_sentence_annotations import (
511+ generate_sentence_annotations ,
512+ )
513+ from app .preprocessing .pipeline .steps .text .generate_word_frequencies import (
514+ generate_word_frequncies ,
515+ )
516+ from app .preprocessing .pipeline .steps .text .run_spacy_pipeline import (
517+ run_spacy_pipeline ,
518+ )
519+ from app .preprocessing .pipeline .steps .text .store_document_in_elasticsearch import (
520+ store_document_in_elasticsearch ,
447521 )
448522 from app .preprocessing .pipeline .steps .video .create_and_store_audio_stream_file import (
449523 create_and_store_audio_stream_file ,
@@ -458,11 +532,13 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
458532 from app .preprocessing .pipeline .steps .video .generate_webp_thumbnail_for_video import (
459533 generate_webp_thumbnail_for_video ,
460534 )
535+ from app .preprocessing .pipeline .steps .video .store_metadata_to_database import (
536+ store_metadata_and_data_to_database ,
537+ )
461538 from app .preprocessing .pipeline .steps .video .write_ppvd_to_database import (
462539 write_ppvd_to_database ,
463540 )
464541
465- audio_pipeline = build_audio_pipeline ()
466542 pipeline = PreprocessingPipeline (doc_type = DocType .video )
467543
468544 pipeline .register_step (
@@ -490,14 +566,71 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
490566 required_data = ["ppvd" ],
491567 )
492568
493- pipeline .join_pipeline (
494- pipeline = audio_pipeline ,
495- skip_steps_with_name = ["create_ppad" ],
569+ pipeline .register_step (
570+ func = write_ppvd_to_database ,
571+ required_data = ["ppvd" ],
572+ )
573+
574+ pipeline .register_step (
575+ func = create_ffmpeg_probe_audio_metadata ,
576+ required_data = ["ppad" ],
577+ )
578+
579+ pipeline .register_step (
580+ func = convert_to_pcm ,
581+ required_data = ["ppad" ],
582+ )
583+
584+ pipeline .register_step (
585+ func = generate_automatic_transcription ,
586+ required_data = ["ppad" ],
587+ )
588+
589+ # instead create pptd before and now add it as metadata
590+ pipeline .register_step (
591+ func = create_pptd_from_transcription ,
592+ required_data = ["ppad" ],
593+ )
594+
595+ pipeline .register_step (
596+ func = detect_content_language ,
597+ required_data = ["pptd" ],
598+ )
599+
600+ # run caption through spacy and add to elasticsearch to make it searchable
601+ pipeline .register_step (
602+ func = run_spacy_pipeline ,
603+ required_data = ["pptd" ],
604+ )
605+
606+ pipeline .register_step (
607+ func = generate_word_frequncies ,
608+ required_data = ["pptd" ],
609+ )
610+
611+ pipeline .register_step (
612+ func = generate_keywords ,
613+ required_data = ["pptd" ],
496614 )
497615
498616 pipeline .register_step (
499- func = add_word_level_transcriptions_to_ppvd_metadata ,
500- required_data = ["ppvd" , "ppad" ],
617+ func = generate_sentence_annotations ,
618+ required_data = ["pptd" ],
619+ )
620+
621+ pipeline .register_step (
622+ func = store_document_in_elasticsearch ,
623+ required_data = ["pptd" , "sdoc_id" ],
624+ )
625+
626+ pipeline .register_step (
627+ func = store_metadata_and_data_to_database ,
628+ required_data = [
629+ "pptd" ,
630+ "ppad" ,
631+ "ppvd" ,
632+ "sdoc_id" ,
633+ ],
501634 )
502635
503636 pipeline .register_step (
@@ -507,11 +640,6 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
507640 func = remove_erroneous_or_unfinished_sdocs ,
508641 )
509642
510- pipeline .register_step (
511- func = write_ppvd_to_database ,
512- required_data = ["ppvd" ],
513- )
514-
515643 pipeline .register_step (
516644 func = resolve_sdoc_links ,
517645 )
0 commit comments