
Commit f54b588

committed
changing audio, video, and image pipeline, so transcript is not saved separately.
1 parent 326003c commit f54b588

11 files changed: +248 −143 lines changed

backend/src/app/preprocessing/pipeline/__init__.py

Lines changed: 103 additions & 24 deletions
@@ -8,6 +8,9 @@
 def build_text_pipeline(
     is_init: bool = True,
 ) -> PreprocessingPipeline:
+    from app.preprocessing.pipeline.steps.common.detect_content_language import (
+        detect_content_language,
+    )
     from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
         remove_erroneous_or_unfinished_sdocs,
     )
@@ -27,9 +30,6 @@ def build_text_pipeline(
     from app.preprocessing.pipeline.steps.text.create_pptd import (
         create_pptd,
     )
-    from app.preprocessing.pipeline.steps.text.detect_content_language import (
-        detect_content_language,
-    )
     from app.preprocessing.pipeline.steps.text.extract_content_in_html_from_raw_text_docs import (
         extract_content_in_html_from_raw_text_docs,
     )
@@ -210,7 +210,7 @@ def build_image_pipeline(
         run_object_detection,
     )
     from app.preprocessing.pipeline.steps.image.store_metadata_to_database import (
-        store_metadata_to_database,
+        store_metadata_and_data_to_database,
     )
     from app.preprocessing.pipeline.steps.image.write_ppid_to_database import (
         write_ppid_to_database,
@@ -277,7 +277,7 @@ def build_image_pipeline(
 
     pipeline.register_step(
         func=create_pptd_from_caption,
-        required_data=["ppid", "sdoc_id"],
+        required_data=["ppid"],
     )
 
     # run caption through spacy and add to elasticsearch to make it searchable
@@ -307,7 +307,7 @@ def build_image_pipeline(
     )
 
     pipeline.register_step(
-        func=store_metadata_to_database,
+        func=store_metadata_and_data_to_database,
         required_data=[
             "pptd",
             "ppid",
@@ -348,11 +348,14 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
         generate_webp_thumbnail_for_audio,
     )
     from app.preprocessing.pipeline.steps.audio.store_metadata_to_database import (
-        store_metadata_to_database,
+        store_metadata_and_data_to_database,
     )
     from app.preprocessing.pipeline.steps.audio.write_ppad_to_database import (
         write_ppad_to_database,
     )
+    from app.preprocessing.pipeline.steps.common.detect_content_language import (
+        detect_content_language,
+    )
     from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
         remove_erroneous_or_unfinished_sdocs,
     )
@@ -362,9 +365,6 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     from app.preprocessing.pipeline.steps.common.update_sdoc_status_to_finish import (
         update_sdoc_status_to_finish,
     )
-    from app.preprocessing.pipeline.steps.text.detect_content_language import (
-        detect_content_language,
-    )
     from app.preprocessing.pipeline.steps.text.generate_keywords import (
         generate_keywords,
     )
@@ -458,7 +458,7 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     )
 
     pipeline.register_step(
-        func=store_metadata_to_database,
+        func=store_metadata_and_data_to_database,
         required_data=[
             "pptd",
             "ppad",
@@ -482,6 +482,19 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
 
 @lru_cache(maxsize=1)
 def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
+    from app.preprocessing.pipeline.steps.audio.convert_to_pcm import convert_to_pcm
+    from app.preprocessing.pipeline.steps.audio.create_ffmpeg_probe_audio_metadata import (
+        create_ffmpeg_probe_audio_metadata,
+    )
+    from app.preprocessing.pipeline.steps.audio.create_pptd_from_transcription import (
+        create_pptd_from_transcription,
+    )
+    from app.preprocessing.pipeline.steps.audio.generate_automatic_transcription import (
+        generate_automatic_transcription,
+    )
+    from app.preprocessing.pipeline.steps.common.detect_content_language import (
+        detect_content_language,
+    )
     from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
         remove_erroneous_or_unfinished_sdocs,
     )
@@ -491,8 +504,20 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     from app.preprocessing.pipeline.steps.common.update_sdoc_status_to_finish import (
         update_sdoc_status_to_finish,
     )
-    from app.preprocessing.pipeline.steps.video.add_word_level_transcriptions_to_ppvd_metadata import (
-        add_word_level_transcriptions_to_ppvd_metadata,
+    from app.preprocessing.pipeline.steps.text.generate_keywords import (
+        generate_keywords,
+    )
+    from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
+        generate_sentence_annotations,
+    )
+    from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
+        generate_word_frequncies,
+    )
+    from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
+        run_spacy_pipeline,
+    )
+    from app.preprocessing.pipeline.steps.text.store_document_in_elasticsearch import (
+        store_document_in_elasticsearch,
     )
     from app.preprocessing.pipeline.steps.video.create_and_store_audio_stream_file import (
         create_and_store_audio_stream_file,
@@ -507,11 +532,13 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     from app.preprocessing.pipeline.steps.video.generate_webp_thumbnail_for_video import (
         generate_webp_thumbnail_for_video,
     )
+    from app.preprocessing.pipeline.steps.video.store_metadata_to_database import (
+        store_metadata_and_data_to_database,
+    )
     from app.preprocessing.pipeline.steps.video.write_ppvd_to_database import (
         write_ppvd_to_database,
     )
 
-    audio_pipeline = build_audio_pipeline()
     pipeline = PreprocessingPipeline(doc_type=DocType.video)
 
     pipeline.register_step(
@@ -539,14 +566,71 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
         required_data=["ppvd"],
     )
 
-    pipeline.join_pipeline(
-        pipeline=audio_pipeline,
-        skip_steps_with_name=["create_ppad"],
+    pipeline.register_step(
+        func=write_ppvd_to_database,
+        required_data=["ppvd"],
+    )
+
+    pipeline.register_step(
+        func=create_ffmpeg_probe_audio_metadata,
+        required_data=["ppad"],
+    )
+
+    pipeline.register_step(
+        func=convert_to_pcm,
+        required_data=["ppad"],
+    )
+
+    pipeline.register_step(
+        func=generate_automatic_transcription,
+        required_data=["ppad"],
+    )
+
+    # instead create pptd before and now add it as metadata
+    pipeline.register_step(
+        func=create_pptd_from_transcription,
+        required_data=["ppad"],
+    )
+
+    pipeline.register_step(
+        func=detect_content_language,
+        required_data=["pptd"],
+    )
+
+    # run caption through spacy and add to elasticsearch to make it searchable
+    pipeline.register_step(
+        func=run_spacy_pipeline,
+        required_data=["pptd"],
     )
 
     pipeline.register_step(
-        func=add_word_level_transcriptions_to_ppvd_metadata,
-        required_data=["ppvd", "ppad"],
+        func=generate_word_frequncies,
+        required_data=["pptd"],
+    )
+
+    pipeline.register_step(
+        func=generate_keywords,
+        required_data=["pptd"],
+    )
+
+    pipeline.register_step(
+        func=generate_sentence_annotations,
+        required_data=["pptd"],
+    )
+
+    pipeline.register_step(
+        func=store_document_in_elasticsearch,
+        required_data=["pptd", "sdoc_id"],
+    )
+
+    pipeline.register_step(
+        func=store_metadata_and_data_to_database,
+        required_data=[
+            "pptd",
+            "ppad",
+            "ppvd",
+            "sdoc_id",
+        ],
    )
 
     pipeline.register_step(
@@ -556,11 +640,6 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
         func=remove_erroneous_or_unfinished_sdocs,
     )
 
-    pipeline.register_step(
-        func=write_ppvd_to_database,
-        required_data=["ppvd"],
-    )
-
     pipeline.register_step(
         func=resolve_sdoc_links,
     )
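
Taken together, build_video_pipeline no longer builds the cached audio pipeline and splices it in; it registers the audio, transcription, and text steps itself. A minimal usage sketch of the builders this file exposes (nothing here beyond what the diff already shows; the comment recaps the removed join_pipeline call):

from app.preprocessing.pipeline import build_audio_pipeline, build_video_pipeline

# Both builders are memoized with @lru_cache(maxsize=1), so each call returns
# the same PreprocessingPipeline instance per process.
audio_pipeline = build_audio_pipeline()
video_pipeline = build_video_pipeline()

# Previously, build_video_pipeline() called build_audio_pipeline() and spliced
# its steps in via:
#     pipeline.join_pipeline(pipeline=audio_pipeline, skip_steps_with_name=["create_ppad"])
# After this commit the two pipelines are independent, and the video pipeline
# registers its own audio/transcription/text steps.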

backend/src/app/preprocessing/pipeline/steps/audio/generate_automatic_transcription.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,4 @@
+import json
 import os
 
 from loguru import logger
@@ -42,4 +43,6 @@ def generate_automatic_transcription(cargo: PipelineCargo) -> PipelineCargo:
         )
         ppad.word_level_transcriptions.append(wlt)
 
+    wlt = list(map(lambda wlt: wlt.model_dump(), ppad.word_level_transcriptions))
+    ppad.metadata["word_level_transcriptions"] = json.dumps(wlt)
     return cargo
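
With this change the word-level transcript is serialized into ppad.metadata right after transcription, so downstream steps persist it like any other metadata value instead of storing the transcript separately. A small, self-contained sketch of the resulting round trip (the inner field names are placeholders; the real keys come from WordLevelTranscription.model_dump() and are not shown in this diff):

import json

# Placeholder entry standing in for one WordLevelTranscription.model_dump() dict.
metadata = {
    "word_level_transcriptions": json.dumps(
        [{"text": "hello", "start_ms": 0, "end_ms": 420}]
    )
}

# Consumers read the transcript back out of the metadata value:
transcript = json.loads(metadata["word_level_transcriptions"])
print(transcript[0]["text"])  # -> hello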

backend/src/app/preprocessing/pipeline/steps/audio/store_metadata_to_database.py

Lines changed: 6 additions & 6 deletions
@@ -1,5 +1,3 @@
-import json
-
 from loguru import logger
 from sqlalchemy.orm import Session
 
@@ -16,6 +14,7 @@
 from app.preprocessing.pipeline.model.audio.preproaudiodoc import PreProAudioDoc
 from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
 from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+from app.preprocessing.pipeline.steps.common.persist_sdoc_data import persist_sdoc_data
 
 repo: RepoService = RepoService()
 sql: SQLService = SQLService()
@@ -31,10 +30,8 @@ def _persist_sdoc_metadata(
     sdoc_id = sdoc_db_obj.id
     sdoc = SourceDocumentRead.model_validate(sdoc_db_obj)
     ppad.metadata["url"] = str(RepoService().get_sdoc_url(sdoc=sdoc))
-    wlt = list(map(lambda wlt: wlt.model_dump(), ppad.word_level_transcriptions))
-    ppad.metadata["word_level_transcriptions"] = json.dumps(wlt)
     ppad.metadata["language"] = pptd.metadata["language"]
-    ppad.metadata["transcription_keywords"] = pptd.keywords
+    ppad.metadata["transcription_keywords"] = pptd.metadata["keywords"]
 
     project_metadata = [
         ProjectMetadataRead.model_validate(pm)
@@ -68,7 +65,7 @@ def _persist_sdoc_metadata(
     crud_sdoc_meta.create_multi(db=db, create_dtos=metadata_create_dtos)
 
 
-def store_metadata_to_database(cargo: PipelineCargo) -> PipelineCargo:
+def store_metadata_and_data_to_database(cargo: PipelineCargo) -> PipelineCargo:
     ppad: PreProAudioDoc = cargo.data["ppad"]
     pptd: PreProTextDoc = cargo.data["pptd"]
     audio_sdoc_id: int = cargo.data["sdoc_id"]
@@ -80,6 +77,9 @@ def store_metadata_to_database(cargo: PipelineCargo) -> PipelineCargo:
             # persist SourceDocument Metadata
             _persist_sdoc_metadata(db=db, sdoc_db_obj=sdoc_db_obj, ppad=ppad, pptd=pptd)
 
+            # persist SourceDocument Data
+            persist_sdoc_data(db=db, sdoc_db_obj=sdoc_db_obj, pptd=pptd)
+
         except Exception as e:
             logger.error(
                 f"Error while persisting SourceDocument Metadata "

backend/src/app/preprocessing/pipeline/steps/text/detect_content_language.py renamed to backend/src/app/preprocessing/pipeline/steps/common/detect_content_language.py

File renamed without changes.
backend/src/app/preprocessing/pipeline/steps/common/persist_sdoc_data.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+from sqlalchemy.orm import Session
+
+from app.core.data.crud.source_document_data import crud_sdoc_data
+from app.core.data.dto.source_document_data import SourceDocumentDataCreate
+from app.core.data.orm.source_document import SourceDocumentORM
+from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+
+
+def persist_sdoc_data(
+    db: Session, sdoc_db_obj: SourceDocumentORM, pptd: PreProTextDoc
+) -> None:
+    sdoc_data = SourceDocumentDataCreate(
+        id=sdoc_db_obj.id,
+        content=pptd.text,
+        html=pptd.html,
+        token_starts=[s for s, _ in pptd.token_character_offsets],
+        token_ends=[e for _, e in pptd.token_character_offsets],
+        sentence_starts=[s.start for s in pptd.sentences],
+        sentence_ends=[s.end for s in pptd.sentences],
+    )
+    crud_sdoc_data.create(db=db, create_dto=sdoc_data)
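
The new helper flattens the PreProTextDoc span structures into parallel start/end arrays before handing them to crud_sdoc_data. A tiny worked example of that flattening (values invented for illustration):

# Illustrative values only: character offsets for the tokens of "Hello world again".
token_character_offsets = [(0, 5), (6, 11), (12, 17)]

token_starts = [s for s, _ in token_character_offsets]  # [0, 6, 12]
token_ends = [e for _, e in token_character_offsets]    # [5, 11, 17]

print(token_starts, token_ends)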

backend/src/app/preprocessing/pipeline/steps/image/store_metadata_to_database.py

Lines changed: 6 additions & 3 deletions
@@ -14,6 +14,7 @@
 from app.preprocessing.pipeline.model.image.preproimagedoc import PreProImageDoc
 from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
 from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+from app.preprocessing.pipeline.steps.common.persist_sdoc_data import persist_sdoc_data
 
 repo: RepoService = RepoService()
 sql: SQLService = SQLService()
@@ -28,7 +29,7 @@ def _persist_sdoc_metadata(
     sdoc_id = sdoc_db_obj.id
     sdoc = SourceDocumentRead.model_validate(sdoc_db_obj)
     ppid.metadata["url"] = str(RepoService().get_sdoc_url(sdoc=sdoc))
-    ppid.metadata["keywords"] = pptd.keywords
+    ppid.metadata["keywords"] = pptd.metadata["keywords"]
 
     project_metadata = [
         ProjectMetadataRead.model_validate(pm)
@@ -41,7 +42,6 @@ def _persist_sdoc_metadata(
     metadata_create_dtos = []
     for project_metadata_key, project_metadata in project_metadata_map.items():
         if project_metadata_key in ppid.metadata.keys():
-            logger.info(f"test {project_metadata_key}")
             metadata_create_dtos.append(
                 SourceDocumentMetadataCreate.with_metatype(
                     value=ppid.metadata[project_metadata_key],
@@ -62,7 +62,7 @@ def _persist_sdoc_metadata(
     crud_sdoc_meta.create_multi(db=db, create_dtos=metadata_create_dtos)
 
 
-def store_metadata_to_database(cargo: PipelineCargo) -> PipelineCargo:
+def store_metadata_and_data_to_database(cargo: PipelineCargo) -> PipelineCargo:
     ppid: PreProImageDoc = cargo.data["ppid"]
     pptd: PreProTextDoc = cargo.data["pptd"]
     image_sdoc_id: int = cargo.data["sdoc_id"]
@@ -74,6 +74,9 @@ def store_metadata_to_database(cargo: PipelineCargo) -> PipelineCargo:
             # persist SourceDocument Metadata
             _persist_sdoc_metadata(db=db, sdoc_db_obj=sdoc_db_obj, ppid=ppid, pptd=pptd)
 
+            # persist SourceDocument Data
+            persist_sdoc_data(db=db, sdoc_db_obj=sdoc_db_obj, pptd=pptd)
+
         except Exception as e:
             logger.error(
                 f"Error while persisting SourceDocument Metadata "

backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py

Lines changed: 2 additions & 4 deletions
@@ -8,9 +8,7 @@
 
 def generate_keywords(cargo: PipelineCargo) -> PipelineCargo:
     pptd: PreProTextDoc = cargo.data["pptd"]
-    if "keywords" in pptd.metadata:
-        pptd.keywords = pptd.metadata["keywords"]  # type: ignore
-    else:
+    if "keywords" not in pptd.metadata:
         out = pptd.spacy_pipeline_output
         if out is None:
             logger.error(
@@ -69,6 +67,6 @@ def generate_keywords(cargo: PipelineCargo) -> PipelineCargo:
                # if any of the words is not in the pos dict, we skip the keyword
                pass
 
-        pptd.keywords = keywords
+        pptd.metadata["keywords"] = keywords
 
     return cargo
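
After this change, generate_keywords acts as a fallback: it only computes keywords when no upstream step has put them into pptd.metadata, and it stores the result under pptd.metadata["keywords"], which is the key the image and audio store steps above now read. A stubbed sketch of the new control flow (the real step derives keywords from pptd.spacy_pipeline_output, which is stubbed out here):

def generate_keywords_sketch(metadata: dict) -> dict:
    # Only compute keywords if nothing upstream provided them.
    if "keywords" not in metadata:
        metadata["keywords"] = ["stub", "keywords"]  # placeholder for the spaCy-based extraction
    return metadata

print(generate_keywords_sketch({}))                          # computes keywords
print(generate_keywords_sketch({"keywords": ["upstream"]}))  # leaves existing keywords untouched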

0 commit comments
