Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/src/app/core/data/crud/document_tag.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List
from typing import Dict, List, Optional

from sqlalchemy import delete, func, select
from sqlalchemy.orm import Session
Expand Down
1 change: 1 addition & 0 deletions backend/src/app/core/data/crud/project_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from app.core.data.crud.crud_base import CRUDBase
from app.core.data.crud.source_document_metadata import crud_sdoc_meta
from app.core.data.doc_type import DocType
from app.core.data.dto.project_metadata import (
ProjectMetadataCreate,
ProjectMetadataUpdate,
Expand Down
186 changes: 157 additions & 29 deletions backend/src/app/preprocessing/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
def build_text_pipeline(
is_init: bool = True,
) -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.common.detect_content_language import (
detect_content_language,
)
from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
remove_erroneous_or_unfinished_sdocs,
)
Expand All @@ -27,9 +30,6 @@ def build_text_pipeline(
from app.preprocessing.pipeline.steps.text.create_pptd import (
create_pptd,
)
from app.preprocessing.pipeline.steps.text.detect_content_language import (
detect_content_language,
)
from app.preprocessing.pipeline.steps.text.extract_content_in_html_from_raw_text_docs import (
extract_content_in_html_from_raw_text_docs,
)
Expand Down Expand Up @@ -210,7 +210,7 @@ def build_image_pipeline(
run_object_detection,
)
from app.preprocessing.pipeline.steps.image.store_metadata_to_database import (
store_metadata_to_database,
store_metadata_and_data_to_database,
)
from app.preprocessing.pipeline.steps.image.write_ppid_to_database import (
write_ppid_to_database,
Expand Down Expand Up @@ -277,7 +277,7 @@ def build_image_pipeline(

pipeline.register_step(
func=create_pptd_from_caption,
required_data=["ppid", "sdoc_id"],
required_data=["ppid"],
)

# run caption through spacy and add to elasticsearch to make it searchable
Expand Down Expand Up @@ -307,7 +307,7 @@ def build_image_pipeline(
)

pipeline.register_step(
func=store_metadata_to_database,
func=store_metadata_and_data_to_database,
required_data=[
"pptd",
"ppid",
Expand All @@ -334,9 +334,6 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
# we need to import the steps here to avoid loading models at startup
# in the api worker!
from app.preprocessing.pipeline.steps.audio.convert_to_pcm import convert_to_pcm
from app.preprocessing.pipeline.steps.audio.create_and_store_transcript_file import (
create_and_store_transcript_file,
)
from app.preprocessing.pipeline.steps.audio.create_ffmpeg_probe_audio_metadata import (
create_ffmpeg_probe_audio_metadata,
)
Expand All @@ -350,9 +347,15 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.audio.generate_webp_thumbnail_for_audio import (
generate_webp_thumbnail_for_audio,
)
from app.preprocessing.pipeline.steps.audio.store_metadata_to_database import (
store_metadata_and_data_to_database,
)
from app.preprocessing.pipeline.steps.audio.write_ppad_to_database import (
write_ppad_to_database,
)
from app.preprocessing.pipeline.steps.common.detect_content_language import (
detect_content_language,
)
from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
remove_erroneous_or_unfinished_sdocs,
)
Expand All @@ -362,8 +365,22 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.common.update_sdoc_status_to_finish import (
update_sdoc_status_to_finish,
)
from app.preprocessing.pipeline.steps.text.generate_keywords import (
generate_keywords,
)
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
generate_sentence_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
generate_word_frequncies,
)
from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
run_spacy_pipeline,
)
from app.preprocessing.pipeline.steps.text.store_document_in_elasticsearch import (
store_document_in_elasticsearch,
)

text_pipeline = build_text_pipeline()
pipeline = PreprocessingPipeline(doc_type=DocType.audio)

pipeline.register_step(
Expand Down Expand Up @@ -392,17 +409,45 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
)

pipeline.register_step(
func=create_and_store_transcript_file,
func=write_ppad_to_database,
required_data=["ppad"],
)

# build the pptd directly from the transcription instead of writing a transcript file; the transcription text is kept as metadata
pipeline.register_step(
func=create_pptd_from_transcription,
required_data=["ppad"],
)
pipeline.join_pipeline(
pipeline=text_pipeline,
skip_steps_with_name=["create_pptd"],

pipeline.register_step(
func=detect_content_language,
required_data=["pptd"],
)

# run the transcript through spacy and add it to elasticsearch to make it searchable
pipeline.register_step(
func=run_spacy_pipeline,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_word_frequncies,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_keywords,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_sentence_annotations,
required_data=["pptd"],
)

pipeline.register_step(
func=store_document_in_elasticsearch,
required_data=["pptd", "sdoc_id"],
)

pipeline.register_step(
Expand All @@ -413,8 +458,12 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:
)

pipeline.register_step(
func=write_ppad_to_database,
required_data=["ppad"],
func=store_metadata_and_data_to_database,
required_data=[
"pptd",
"ppad",
"sdoc_id",
],
)

pipeline.register_step(
Expand All @@ -433,6 +482,19 @@ def build_audio_pipeline(foo: str = "bar") -> PreprocessingPipeline:

@lru_cache(maxsize=1)
def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.audio.convert_to_pcm import convert_to_pcm
from app.preprocessing.pipeline.steps.audio.create_ffmpeg_probe_audio_metadata import (
create_ffmpeg_probe_audio_metadata,
)
from app.preprocessing.pipeline.steps.audio.create_pptd_from_transcription import (
create_pptd_from_transcription,
)
from app.preprocessing.pipeline.steps.audio.generate_automatic_transcription import (
generate_automatic_transcription,
)
from app.preprocessing.pipeline.steps.common.detect_content_language import (
detect_content_language,
)
from app.preprocessing.pipeline.steps.common.remove_erroneous_sdoc import (
remove_erroneous_or_unfinished_sdocs,
)
Expand All @@ -442,8 +504,20 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.common.update_sdoc_status_to_finish import (
update_sdoc_status_to_finish,
)
from app.preprocessing.pipeline.steps.video.add_word_level_transcriptions_to_ppvd_metadata import (
add_word_level_transcriptions_to_ppvd_metadata,
from app.preprocessing.pipeline.steps.text.generate_keywords import (
generate_keywords,
)
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
generate_sentence_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
generate_word_frequncies,
)
from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
run_spacy_pipeline,
)
from app.preprocessing.pipeline.steps.text.store_document_in_elasticsearch import (
store_document_in_elasticsearch,
)
from app.preprocessing.pipeline.steps.video.create_and_store_audio_stream_file import (
create_and_store_audio_stream_file,
Expand All @@ -458,11 +532,13 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.video.generate_webp_thumbnail_for_video import (
generate_webp_thumbnail_for_video,
)
from app.preprocessing.pipeline.steps.video.store_metadata_to_database import (
store_metadata_and_data_to_database,
)
from app.preprocessing.pipeline.steps.video.write_ppvd_to_database import (
write_ppvd_to_database,
)

audio_pipeline = build_audio_pipeline()
pipeline = PreprocessingPipeline(doc_type=DocType.video)

pipeline.register_step(
Expand Down Expand Up @@ -490,14 +566,71 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
required_data=["ppvd"],
)

pipeline.join_pipeline(
pipeline=audio_pipeline,
skip_steps_with_name=["create_ppad"],
pipeline.register_step(
func=write_ppvd_to_database,
required_data=["ppvd"],
)

pipeline.register_step(
func=create_ffmpeg_probe_audio_metadata,
required_data=["ppad"],
)

pipeline.register_step(
func=convert_to_pcm,
required_data=["ppad"],
)

pipeline.register_step(
func=generate_automatic_transcription,
required_data=["ppad"],
)

# build the pptd directly from the transcription instead of writing a transcript file; the transcription text is kept as metadata
pipeline.register_step(
func=create_pptd_from_transcription,
required_data=["ppad"],
)

pipeline.register_step(
func=detect_content_language,
required_data=["pptd"],
)

# run the transcript through spacy and add it to elasticsearch to make it searchable
pipeline.register_step(
func=run_spacy_pipeline,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_word_frequncies,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_keywords,
required_data=["pptd"],
)

pipeline.register_step(
func=add_word_level_transcriptions_to_ppvd_metadata,
required_data=["ppvd", "ppad"],
func=generate_sentence_annotations,
required_data=["pptd"],
)

pipeline.register_step(
func=store_document_in_elasticsearch,
required_data=["pptd", "sdoc_id"],
)

pipeline.register_step(
func=store_metadata_and_data_to_database,
required_data=[
"pptd",
"ppad",
"ppvd",
"sdoc_id",
],
)

pipeline.register_step(
Expand All @@ -507,11 +640,6 @@ def build_video_pipeline(foo: str = "bar") -> PreprocessingPipeline:
func=remove_erroneous_or_unfinished_sdocs,
)

pipeline.register_step(
func=write_ppvd_to_database,
required_data=["ppvd"],
)

pipeline.register_step(
func=resolve_sdoc_links,
)
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from pathlib import Path

from app.preprocessing.pipeline.model.audio.preproaudiodoc import PreProAudioDoc
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc


def create_pptd_from_transcription(cargo: PipelineCargo) -> PipelineCargo:
ppad: PreProAudioDoc = cargo.data["ppad"]

if not ppad.transcript_filepath.exists():
raise FileNotFoundError(
f"The transcription file {ppad.transcript_filepath} "
f"for {cargo.ppj_payload.filename} does not exist!"
)

ppad.metadata["transcription"] = " ".join(
[a.text for a in ppad.word_level_transcriptions]
)
pptd = PreProTextDoc(
filename=ppad.transcript_filepath.name,
filepath=ppad.transcript_filepath,
project_id=cargo.ppj_payload.project_id,
filepath=Path("/this/is/a/fake_path.txt"),
filename="fake_path.txt",
project_id=ppad.project_id,
text=ppad.metadata["transcription"],
html=f"<html><body><p>{ppad.metadata['transcription']}</p></body></html>",
metadata={"language": "en"},
mime_type="text/plain",
)

Expand Down
Loading