optimize post-transcription process
- refactor Deepgram's post-transcription process to prevent mid-sentence line breaks when incorporating chapters (sketched below)
- one-sentence-per-line output is now the default format
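A rough standalone sketch of the two ideas behind this refactor: split each speaker segment into sentences with a look-behind regex, then snap any chapter timestamp that lands mid-sentence to the nearer sentence boundary. The function names below are illustrative only, not the repository's API.

import re

# Same splitting idea as in the diff below: do not break after abbreviations such as
# "e.g." or "Dr.", and end a sentence at whitespace that follows "." or "?".
SENTENCE_SPLIT = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

def split_into_sentences(transcript: str) -> list[str]:
    return re.split(SENTENCE_SPLIT, transcript)

def snap_to_sentence_boundary(timestamp: float, sentence_start: float, sentence_end: float) -> float:
    # Move a chapter marker that falls inside a sentence to whichever
    # boundary (start or end) is closer to the original timestamp.
    midpoint = (sentence_start + sentence_end) / 2
    return sentence_end if timestamp >= midpoint else sentence_start

if __name__ == "__main__":
    for sentence in split_into_sentences("Welcome back. Today we cover mempools. Let's start."):
        print(sentence)  # one sentence per line
    # A chapter marker at 12.0s inside a sentence spanning 10.0s-15.0s snaps
    # back to the sentence start because it falls before the midpoint.
    print(snap_to_sentence_boundary(12.0, 10.0, 15.0))  # -> 10.0
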
kouloumos committed Jan 14, 2024
1 parent b4e33cf commit 2ad1c25
Showing 2 changed files with 150 additions and 137 deletions.
285 changes: 149 additions & 136 deletions app/services/deepgram.py
@@ -1,5 +1,6 @@
import json
import mimetypes
import re

import deepgram
from dotenv import dotenv_values
@@ -64,124 +65,6 @@ def write_to_json_file(self, transcription_service_output, transcript: Transcrip

return transcription_service_output_file

def process_with_diarization_and_chapters(self, transcription_service_output, chapters):
logger.info(
"(deepgram) Processing diarization with detected chapters...")
try:
para = ""
string = ""
curr_speaker = None
words = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]
words_pointer = 0
chapters_pointer = 0
while chapters_pointer < len(chapters) and words_pointer < len(words):
if chapters[chapters_pointer][1] <= words[words_pointer]["start"]:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = string + f"## {chapters[chapters_pointer][2]}\n\n"
chapters_pointer += 1
else:
if words[words_pointer]["speaker"] != curr_speaker:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = (
string
+ f'Speaker {words[words_pointer]["speaker"]}: '
+ utils.decimal_to_sexagesimal(words[words_pointer]["start"])
)
curr_speaker = words[words_pointer]["speaker"]
string = string + "\n\n"

para = para + " " + words[words_pointer]["punctuated_word"]
words_pointer += 1
while words_pointer < len(words):
if words[words_pointer]["speaker"] != curr_speaker:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = (
string + f'Speaker {words[words_pointer]["speaker"]}:'
f' {utils.decimal_to_sexagesimal(words[words_pointer]["start"])}'
)
curr_speaker = words[words_pointer]["speaker"]
string = string + "\n\n"

para = para + " " + words[words_pointer]["punctuated_word"]
words_pointer += 1
para = para.strip(" ")
string = string + para
return string
except Exception as e:
raise Exception(f"Error combining deepgram chapters: {e}")

def process_with_diarization(self, transcription_service_output):
logger.info(f"(deepgram) Processing diarization...")
para = ""
string = ""
curr_speaker = None
for word in transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]:
if word["speaker"] != curr_speaker:
if para != "":
para = para.strip(" ")
string = string + para + "\n\n"
para = ""
string = (
string + f'Speaker {word["speaker"]}: '
f'{utils.decimal_to_sexagesimal(word["start"])}'
)
curr_speaker = word["speaker"]
string = string + "\n\n"

para = para + " " + word["punctuated_word"]
para = para.strip(" ")
string = string + para
return string

def process_with_chapters(self, transcription_service_output, chapters):
logger.info("(deepgram) Combining transcript with detected chapters...")
try:
chapters_pointer = 0
words_pointer = 0
result = ""
words = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]
# chapters index, start time, name
# transcript start time, end time, text
while chapters_pointer < len(chapters) and words_pointer < len(words):
if chapters[chapters_pointer][1] <= words[words_pointer]["end"]:
result = (
result + "\n\n## " +
chapters[chapters_pointer][2] + "\n\n"
)
chapters_pointer += 1
else:
result = result + \
words[words_pointer]["punctuated_word"] + " "
words_pointer += 1

# Append the final chapter heading and remaining content
while chapters_pointer < len(chapters):
result = result + "\n\n## " + \
chapters[chapters_pointer][2] + "\n\n"
chapters_pointer += 1
while words_pointer < len(words):
result = result + words[words_pointer]["punctuated_word"] + " "
words_pointer += 1

return result
except Exception as e:
raise Exception(f"Error combining deepgram with chapters: {e}")

def process_summary(self, transcript: Transcript):
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)
@@ -197,31 +80,161 @@ def process_summary(self, transcript: Transcript):
except Exception as e:
logger.error(f"Error getting summary: {e}")

def process_segments(self, transcription_service_output, diarization):
try:
words = transcription_service_output["results"]["channels"][0]["alternatives"][0]["words"]
segments = []
current_segment = None

for word in words:
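                # With diarization disabled every word is attributed to one
                # pseudo-speaker, so the whole transcript becomes a single segment.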
speaker_id = word["speaker"] if diarization else "single_speaker"
speaker_text = word["punctuated_word"]
if speaker_id != current_segment:
# change of speaker
current_segment = speaker_id
segments.append({
"speaker": speaker_id,
"start": word["start"],
"end": word["end"],
"transcript": "",
"words": []
})

segments[-1]["transcript"] += f"{speaker_text} "
segments[-1]["words"].append(word)
segments[-1]["end"] = word["end"]

for segment in segments:
segment["transcript"] = segment["transcript"].strip()

return segments
except Exception as e:
raise Exception(
f"(deepgram) Error constructing speaker segments: {e}")

def break_segments_into_sentences(self, segments):
result = []
# Define the sentence splitting pattern
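        # The look-behinds keep abbreviations such as "e.g." or "Dr." from triggering
        # a split; a sentence otherwise ends at whitespace that follows "." or "?".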
abbreviation_pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)'
sentence_end_pattern = r'(?<=\.|\?)\s'
sentence_split_pattern = f'{abbreviation_pattern}{sentence_end_pattern}'

for segment in segments:
# Split the segment into sentences
sentences = re.split(sentence_split_pattern, segment["transcript"])

segment_data = {
"speaker": segment["speaker"],
"transcript": segment["transcript"],
"start": segment["start"],
"end": segment["end"],
"sentences": []
}

word_index = 0

for sentence in sentences:
sentence_words = sentence.split()
sentence_data = {
"transcript": sentence,
"start": segment["words"][word_index]["start"],
"end": segment["words"][word_index + len(sentence_words) - 1]["end"],
"words": segment["words"][word_index:word_index+len(sentence_words)]
}

word_index += len(sentence_words)
segment_data["sentences"].append(sentence_data)

result.append(segment_data)

return result

def adjust_chapter_timestamps(self, transformed_json, chapters):
"""Adjust the given chapter timestamps to prevent mid-sentence line break"""
def find_sentence_for_timestamp(transformed_json, timestamp):
for speaker_data in transformed_json:
for sentence_data in speaker_data["sentences"]:
if sentence_data["start"] <= timestamp <= sentence_data["end"]:
return sentence_data
return None

def adjust_timestamp(original_timestamp, sentence_start, sentence_end):
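            # Snap to whichever sentence boundary is closer to the original
            # timestamp so a chapter heading never starts mid-sentence.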
midpoint = (sentence_start + sentence_end) / 2
return sentence_end if original_timestamp >= midpoint else sentence_start

adjusted_chapters = []

for chapter in chapters:
chapter_start_time = chapter[1]
chapter_sentence = find_sentence_for_timestamp(
transformed_json, chapter_start_time)

if chapter_sentence:
adjusted_start_time = adjust_timestamp(
chapter_start_time, chapter_sentence["start"], chapter_sentence["end"])
adjusted_chapter = [chapter[0],
adjusted_start_time] + chapter[2:]
adjusted_chapters.append(adjusted_chapter)
else:
adjusted_chapters.append(chapter)

return adjusted_chapters

def construct_transcript(self, speaker_segments, chapters):
try:
formatted_transcript = ""
chapter_index = 0 if chapters else None

for speaker_data in speaker_segments:
speaker_id = speaker_data["speaker"]
single_speaker = speaker_id == "single_speaker"

for i, sentence_data in enumerate(speaker_data["sentences"]):
sentence_start = sentence_data["start"]
first_sentence = i == 0

if chapter_index is not None and chapter_index < len(chapters):
chapter_id, chapter_start_time, chapter_title = chapters[chapter_index]

if chapter_start_time <= sentence_start:
# Chapter starts at this sentence
formatted_transcript += "\n" if not first_sentence else ""
formatted_transcript += f"## {chapter_title}\n\n"
if not single_speaker and not first_sentence:
formatted_transcript += f"Speaker {speaker_id}: {utils.decimal_to_sexagesimal(chapter_start_time)}\n\n"
chapter_index += 1

if not single_speaker and first_sentence:
formatted_transcript += f"Speaker {speaker_id}: {utils.decimal_to_sexagesimal(sentence_start)}\n\n"

formatted_transcript += f'{sentence_data["transcript"]}\n'

formatted_transcript += "\n"

return formatted_transcript.strip()
except Exception as e:
raise Exception(f"Error creating output format: {e}")

def finalize_transcript(self, transcript: Transcript):
try:
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)

has_diarization = any(
'speaker' in word for word in transcription_service_output['results']['channels'][0]['alternatives'][0]['words'])
has_chapters = len(transcript.source.chapters) > 0

if has_chapters:
# With chapters
if has_diarization:
# With diarization
return self.process_with_diarization_and_chapters(transcription_service_output, chapters)
else:
# Without diarization
return self.process_with_chapters(transcription_service_output, transcript.source.chapters)
else:
# Without chapters
if has_diarization:
# With diarization
return self.process_with_diarization(transcription_service_output)
else:
# Without diarization
return transcription_service_output["results"]["channels"][0]["alternatives"][0]["transcript"]

            logger.info(
                f"(deepgram) Finalizing transcript [diarization={has_diarization}, chapters={has_chapters}]...")
speaker_segments = self.process_segments(
transcription_service_output, has_diarization)
speaker_segements_with_sentences = self.break_segments_into_sentences(
speaker_segments)
with open("test.json", "w") as json_file:
json.dump(speaker_segements_with_sentences, json_file, indent=4)
adjusted_chapters = self.adjust_chapter_timestamps(
speaker_segements_with_sentences, transcript.source.chapters)
result = self.construct_transcript(
speaker_segements_with_sentences, adjusted_chapters)

return result
except Exception as e:
2 changes: 1 addition & 1 deletion app/transcription.py
@@ -360,7 +360,7 @@ def write_to_markdown_file(self, transcript: Transcript, output_dir):
# Write to file
markdown_file = f"{utils.configure_output_file_path(output_dir, transcript.title, add_timestamp=False)}.md"
with open(markdown_file, "w") as opf:
opf.write(meta_data + "\n")
opf.write(meta_data)
opf.write(transcript.result + "\n")
self.logger.info(f"Markdown file stored at: {markdown_file}")
return os.path.abspath(markdown_file)
