optimize post-transcription process
- refactor Deepgram's post-transcription process to prevent mid-sentence line breaks when incorporating chapters (sketched below)
- one-sentence-per-line output is now the default format
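A rough standalone sketch of the two ideas behind this refactor: split each speaker segment into sentences with a look-behind regex, then snap any chapter timestamp that lands mid-sentence to the nearer sentence boundary. The function names below are illustrative only, not the repository's API.

import re

# Same splitting idea as in the diff below: do not break after abbreviations such as
# "e.g." or "Dr.", and end a sentence at whitespace that follows "." or "?".
SENTENCE_SPLIT = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

def split_into_sentences(transcript: str) -> list[str]:
    return re.split(SENTENCE_SPLIT, transcript)

def snap_to_sentence_boundary(timestamp: float, sentence_start: float, sentence_end: float) -> float:
    # Move a chapter marker that falls inside a sentence to whichever
    # boundary (start or end) is closer to the original timestamp.
    midpoint = (sentence_start + sentence_end) / 2
    return sentence_end if timestamp >= midpoint else sentence_start

if __name__ == "__main__":
    for sentence in split_into_sentences("Welcome back. Today we cover mempools. Let's start."):
        print(sentence)  # one sentence per line
    # A chapter marker at 12.0s inside a sentence spanning 10.0s-15.0s snaps
    # back to the sentence start because it falls before the midpoint.
    print(snap_to_sentence_boundary(12.0, 10.0, 15.0))  # -> 10.0
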
kouloumos committed Jan 14, 2024
1 parent b4e33cf commit 2ad1c25
Showing 2 changed files with 150 additions and 137 deletions.
285 changes: 149 additions & 136 deletions app/services/deepgram.py
@@ -1,5 +1,6 @@
import json
import mimetypes
import re

import deepgram
from dotenv import dotenv_values
@@ -64,124 +65,6 @@ def write_to_json_file(self, transcription_service_output, transcript: Transcrip

return transcription_service_output_file

def process_with_diarization_and_chapters(self, transcription_service_output, chapters):
logger.info(
"(deepgram) Processing diarization with detected chapters...")
try:
para = ""
string = ""
curr_speaker = None
words = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]
words_pointer = 0
chapters_pointer = 0
while chapters_pointer < len(chapters) and words_pointer < len(words):
if chapters[chapters_pointer][1] <= words[words_pointer]["start"]:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = string + f"## {chapters[chapters_pointer][2]}\n\n"
chapters_pointer += 1
else:
if words[words_pointer]["speaker"] != curr_speaker:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = (
string
+ f'Speaker {words[words_pointer]["speaker"]}: '
+ utils.decimal_to_sexagesimal(words[words_pointer]["start"])
)
curr_speaker = words[words_pointer]["speaker"]
string = string + "\n\n"

para = para + " " + words[words_pointer]["punctuated_word"]
words_pointer += 1
while words_pointer < len(words):
if words[words_pointer]["speaker"] != curr_speaker:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = (
string + f'Speaker {words[words_pointer]["speaker"]}:'
f' {utils.decimal_to_sexagesimal(words[words_pointer]["start"])}'
)
curr_speaker = words[words_pointer]["speaker"]
string = string + "\n\n"

para = para + " " + words[words_pointer]["punctuated_word"]
words_pointer += 1
para = para.strip(" ")
string = string + para
return string
except Exception as e:
raise Exception(f"Error combining deepgram chapters: {e}")

def process_with_diarization(self, transcription_service_output):
logger.info(f"(deepgram) Processing diarization...")
para = ""
string = ""
curr_speaker = None
for word in transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]:
if word["speaker"] != curr_speaker:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = (
string + f'Speaker {word["speaker"]}: '
f'{utils.decimal_to_sexagesimal(word["start"])}'
)
curr_speaker = word["speaker"]
string = string + "\n\n"

para = para + " " + word["punctuated_word"]
para = para.strip(" ")
string = string + para
return string

def process_with_chapters(self, transcription_service_output, chapters):
logger.info("(deepgram) Combining transcript with detected chapters...")
try:
chapters_pointer = 0
words_pointer = 0
result = ""
words = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]
# chapters index, start time, name
# transcript start time, end time, text
while chapters_pointer < len(chapters) and words_pointer < len(words):
if chapters[chapters_pointer][1] <= words[words_pointer]["end"]:
result = (
result + "\n\n## " +
chapters[chapters_pointer][2] + "\n\n"
)
chapters_pointer += 1
else:
result = result + \
words[words_pointer]["punctuated_word"] + " "
words_pointer += 1

# Append the final chapter heading and remaining content
while chapters_pointer < len(chapters):
result = result + "\n\n## " + \
chapters[chapters_pointer][2] + "\n\n"
chapters_pointer += 1
while words_pointer < len(words):
result = result + words[words_pointer]["punctuated_word"] + " "
words_pointer += 1

return result
except Exception as e:
raise Exception(f"Error combining deepgram with chapters: {e}")

def process_summary(self, transcript: Transcript):
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)
@@ -197,31 +80,161 @@ def process_summary(self, transcript: Transcript):
except Exception as e:
logger.error(f"Error getting summary: {e}")

def process_segments(self, transcription_service_output, diarization):
try:
words = transcription_service_output["results"]["channels"][0]["alternatives"][0]["words"]
segments = []
current_segment = None

for word in words:
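                # With diarization disabled every word is attributed to one
                # pseudo-speaker, so the whole transcript becomes a single segment.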
speaker_id = word["speaker"] if diarization else "single_speaker"
speaker_text = word["punctuated_word"]
if speaker_id != current_segment:
# change of speaker
current_segment = speaker_id
segments.append({
"speaker": speaker_id,
"start": word["start"],
"end": word["end"],
"transcript": "",
"words": []
})

segments[-1]["transcript"] += f"{speaker_text} "
segments[-1]["words"].append(word)
segments[-1]["end"] = word["end"]

for segment in segments:
segment["transcript"] = segment["transcript"].strip()

return segments
except Exception as e:
raise Exception(
f"(deepgram) Error constructing speaker segments: {e}")

def break_segments_into_sentences(self, segments):
result = []
# Define the sentence splitting pattern
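        # The look-behinds keep abbreviations such as "e.g." or "Dr." from triggering
        # a split; a sentence otherwise ends at whitespace that follows "." or "?".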
abbreviation_pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)'
sentence_end_pattern = r'(?<=\.|\?)\s'
sentence_split_pattern = f'{abbreviation_pattern}{sentence_end_pattern}'

for segment in segments:
# Split the segment into sentences
sentences = re.split(sentence_split_pattern, segment["transcript"])

segment_data = {
"speaker": segment["speaker"],
"transcript": segment["transcript"],
"start": segment["start"],
"end": segment["end"],
"sentences": []
}

word_index = 0

for sentence in sentences:
sentence_words = sentence.split()
sentence_data = {
"transcript": sentence,
"start": segment["words"][word_index]["start"],
"end": segment["words"][word_index + len(sentence_words) - 1]["end"],
"words": segment["words"][word_index:word_index+len(sentence_words)]
}

word_index += len(sentence_words)
segment_data["sentences"].append(sentence_data)

result.append(segment_data)

return result

def adjust_chapter_timestamps(self, transformed_json, chapters):
"""Adjust the given chapter timestamps to prevent mid-sentence line break"""
def find_sentence_for_timestamp(transformed_json, timestamp):
for speaker_data in transformed_json:
for sentence_data in speaker_data["sentences"]:
if sentence_data["start"] <= timestamp <= sentence_data["end"]:
return sentence_data
return None

def adjust_timestamp(original_timestamp, sentence_start, sentence_end):
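            # Snap to whichever sentence boundary is closer to the original
            # timestamp so a chapter heading never starts mid-sentence.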
midpoint = (sentence_start + sentence_end) / 2
return sentence_end if original_timestamp >= midpoint else sentence_start

adjusted_chapters = []

for chapter in chapters:
chapter_start_time = chapter[1]
chapter_sentence = find_sentence_for_timestamp(
transformed_json, chapter_start_time)

if chapter_sentence:
adjusted_start_time = adjust_timestamp(
chapter_start_time, chapter_sentence["start"], chapter_sentence["end"])
adjusted_chapter = [chapter[0],
adjusted_start_time] + chapter[2:]
adjusted_chapters.append(adjusted_chapter)
else:
adjusted_chapters.append(chapter)

return adjusted_chapters

def construct_transcript(self, speaker_segments, chapters):
try:
formatted_transcript = ""
chapter_index = 0 if chapters else None

for speaker_data in speaker_segments:
speaker_id = speaker_data["speaker"]
single_speaker = speaker_id == "single_speaker"

for i, sentence_data in enumerate(speaker_data["sentences"]):
sentence_start = sentence_data["start"]
first_sentence = i == 0

if chapter_index is not None and chapter_index < len(chapters):
chapter_id, chapter_start_time, chapter_title = chapters[chapter_index]

if chapter_start_time <= sentence_start:
# Chapter starts at this sentence
formatted_transcript += "\n" if not first_sentence else ""
formatted_transcript += f"## {chapter_title}\n\n"
if not single_speaker and not first_sentence:
formatted_transcript += f"Speaker {speaker_id}: {utils.decimal_to_sexagesimal(chapter_start_time)}\n\n"
chapter_index += 1

if not single_speaker and first_sentence:
formatted_transcript += f"Speaker {speaker_id}: {utils.decimal_to_sexagesimal(sentence_start)}\n\n"

formatted_transcript += f'{sentence_data["transcript"]}\n'

formatted_transcript += "\n"

return formatted_transcript.strip()
except Exception as e:
raise Exception(f"Error creating output format: {e}")

def finalize_transcript(self, transcript: Transcript):
try:
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)

has_diarization = any(
'speaker' in word for word in transcription_service_output['results']['channels'][0]['alternatives'][0]['words'])
has_chapters = len(transcript.source.chapters) > 0

if has_chapters:
# With chapters
if has_diarization:
# With diarization
return self.process_with_diarization_and_chapters(transcription_service_output, chapters)
else:
# Without diarization
return self.process_with_chapters(transcription_service_output, transcript.source.chapters)
else:
# Without chapters
if has_diarization:
# With diarization
return self.process_with_diarization(transcription_service_output)
else:
# Without diarization
return transcription_service_output["results"]["channels"][0]["alternatives"][0]["transcript"]

            logger.info(
                f"(deepgram) Finalizing transcript [diarization={has_diarization}, chapters={has_chapters}]...")
speaker_segments = self.process_segments(
transcription_service_output, has_diarization)
speaker_segements_with_sentences = self.break_segments_into_sentences(
speaker_segments)
with open("test.json", "w") as json_file:
json.dump(speaker_segements_with_sentences, json_file, indent=4)
adjusted_chapters = self.adjust_chapter_timestamps(
speaker_segements_with_sentences, transcript.source.chapters)
result = self.construct_transcript(
speaker_segements_with_sentences, adjusted_chapters)

return result
except Exception as e:
2 changes: 1 addition & 1 deletion app/transcription.py
@@ -360,7 +360,7 @@ def write_to_markdown_file(self, transcript: Transcript, output_dir):
# Write to file
markdown_file = f"{utils.configure_output_file_path(output_dir, transcript.title, add_timestamp=False)}.md"
with open(markdown_file, "w") as opf:
opf.write(meta_data + "\n")
opf.write(meta_data)
opf.write(transcript.result + "\n")
self.logger.info(f"Markdown file stored at: {markdown_file}")
return os.path.abspath(markdown_file)
