From 9f1a3a4e7684fe22ee33bde1407ef16d99e523a7 Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Mon, 17 Apr 2023 23:39:25 +0530 Subject: [PATCH 01/10] feat: added deepgram for transcription --- app/application.py | 48 ++++++++++++++++++++++++++++++++++++++-------- requirements.txt | 5 +++-- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/app/application.py b/app/application.py index 57b6918..1a80707 100644 --- a/app/application.py +++ b/app/application.py @@ -5,7 +5,6 @@ from clint.textui import progress import pytube from moviepy.editor import VideoFileClip -import whisper import os import static_ffmpeg from app import __version__ @@ -15,6 +14,8 @@ import time from dotenv import dotenv_values import yt_dlp +from deepgram import Deepgram +import mimetypes def download_video(url): @@ -165,6 +166,7 @@ def get_playlist_videos(url): print(e) return + def get_audio_file(url, title): print("URL: " + url) print("downloading audio file") @@ -183,14 +185,42 @@ def get_audio_file(url, title): return +def decimal_to_sexagesimal(dec): + sec = int(dec % 60) + minu = int((dec // 60) % 60) + hrs = int((dec // 60) // 60) + + return f'{hrs}:{minu}:{sec}' + + def process_mp3(filename, model): print("Transcribing audio to text...") try: - mymodel = whisper.load_model(model) - result = mymodel.transcribe(filename[:-4] + ".mp3") - result = result["text"] - print("Removed video and audio files") - return result + config = dotenv_values(".env") + dg_client = Deepgram(config["DEEPGRAM_API_KEY"]) + + with open(filename, "rb") as audio: + mimeType = mimetypes.MimeTypes().guess_type(filename)[0] + source = {'buffer': audio, 'mimetype': mimeType} + response = dg_client.transcription.sync_prerecorded(source, {'punctuate': True, 'speaker_labels': True, + 'diarize': True, 'smart_formatting': True}) + para = "" + string = "" + curr_speaker = None + for word in response["results"]["channels"][0]["alternatives"][0]["words"]: + if word["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = string + f'Speaker {word["speaker"]}: {decimal_to_sexagesimal(word["start"])}' + curr_speaker = word["speaker"] + string = string + '\n\n' + + para = para + " " + word["punctuated_word"] + para = para.strip(" ") + string = string + para + return string except Exception as e: print("Error transcribing audio to text") print(e) @@ -265,7 +295,8 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video print(e) -def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, test, +def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, + test, pr): try: print("writing .md file") @@ -350,7 +381,8 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode result = test else: result = process_mp3(abs_path, model) - absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, tags=tags, + absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, + tags=tags, category=category, speakers=speakers, username=username, local=local, video_title=filename[:-4], test=test, pr=pr) diff --git a/requirements.txt b/requirements.txt index 089bf23..1e0993a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ -openai-whisper==20230314 Click==7.0 clint==0.5.1 +deepgram_sdk==2.4.0 moviepy==1.0.3 +openai_whisper==20230314 pytest==7.2.1 python-dotenv==1.0.0 pytube==12.1.2 requests==2.28.2 -setuptools==45.2.0 +setuptools==67.6.1 static_ffmpeg==2.3 yt_dlp==2023.3.4 From eb288eccde8ed4c163ff8badaf9abdafabd028a1 Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Tue, 18 Apr 2023 00:06:25 +0530 Subject: [PATCH 02/10] fix: removed the option to select different models --- app/application.py | 29 ++++++++++++++--------------- test/test_audio.py | 6 +++--- test/test_video.py | 6 +++--- transcriber.py | 6 +----- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/app/application.py b/app/application.py index 1a80707..19dcd98 100644 --- a/app/application.py +++ b/app/application.py @@ -193,7 +193,7 @@ def decimal_to_sexagesimal(dec): return f'{hrs}:{minu}:{sec}' -def process_mp3(filename, model): +def process_mp3(filename): print("Transcribing audio to text...") try: config = dotenv_values(".env") @@ -352,7 +352,7 @@ def check_source_type(source): return None -def process_audio(source, title, event_date, tags, category, speakers, loc, model, username, local, +def process_audio(source, title, event_date, tags, category, speakers, loc, username, local, created_files, test, pr): try: print("audio file detected") @@ -380,7 +380,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode if test: result = test else: - result = process_mp3(abs_path, model) + result = process_mp3(abs_path) absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, tags=tags, category=category, speakers=speakers, username=username, local=local, @@ -397,7 +397,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode print(e) -def process_videos(source, title, event_date, tags, category, speakers, loc, model, username, created_files, +def process_videos(source, title, event_date, tags, category, speakers, loc, username, created_files, chapters, pr): try: print("Playlist detected") @@ -411,12 +411,11 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, mod print("Playlist is empty") return - selected_model = model + '.en' filename = "" for video in videos: filename = process_video(video=video, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=selected_model, username=username, + speakers=speakers, loc=loc, username=username, pr=pr, created_files=created_files, chapters=chapters, test=False) if filename is None: return None @@ -426,7 +425,7 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, mod print(e) -def process_video(video, title, event_date, tags, category, speakers, loc, model, username, created_files, +def process_video(video, title, event_date, tags, category, speakers, loc, username, created_files, chapters, test, pr, local=False): try: result = "" @@ -472,7 +471,7 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model if file is None: print("File not found") return None - temp_res = process_mp3(filename=temp_filename, model=model) + temp_res = process_mp3(filename=temp_filename) created_files.append(temp_filename[:-4] + ".mp3") else: temp_res = "" @@ -490,7 +489,7 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model if not test: convert_video_to_mp3(abs_path) created_files.append(abs_path[:-4] + '.mp3') - result = process_mp3(abs_path[:-4] + '.mp3', model) + result = process_mp3(abs_path[:-4] + '.mp3') created_files.append(abs_path[:-4] + ".mp3") else: result = "" @@ -511,7 +510,7 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model print(e) -def process_source(source, title, event_date, tags, category, speakers, loc, model, username, source_type, +def process_source(source, title, event_date, tags, category, speakers, loc, username, source_type, created_files, chapters, local=False, test=None, pr=False): try: if not os.path.isdir("tmp"): @@ -522,24 +521,24 @@ def process_source(source, title, event_date, tags, category, speakers, loc, mod if source_type == 'audio': filename = process_audio(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=model, username=username, + speakers=speakers, loc=loc, username=username, local=local, created_files=created_files, test=test, pr=pr) elif source_type == 'audio-local': filename = process_audio(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=model, username=username, + speakers=speakers, loc=loc, username=username, local=True, created_files=created_files, test=test, pr=pr) elif source_type == 'playlist': filename = process_videos(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=model, username=username, + speakers=speakers, loc=loc, username=username, created_files=created_files, chapters=chapters, pr=pr) elif source_type == 'video-local': filename = process_video(video=source, title=title, event_date=event_date, - tags=tags, category=category, speakers=speakers, loc=loc, model=model, + tags=tags, category=category, speakers=speakers, loc=loc, username=username, created_files=created_files, local=True, chapters=chapters, test=test, pr=pr) else: filename = process_video(video=source, title=title, event_date=event_date, - tags=tags, category=category, speakers=speakers, loc=loc, model=model, + tags=tags, category=category, speakers=speakers, loc=loc, username=username, created_files=created_files, local=local, chapters=chapters, test=test, pr=pr) return filename diff --git a/test/test_audio.py b/test/test_audio.py index 731a4ed..24aac52 100644 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -49,7 +49,7 @@ def test_audio_with_title(): username = "username" created_files = [] filename = application.process_source(source=source, title=title, event_date=None, tags=None, category=None, - speakers=None, loc="yada/yada", model="tiny", username=username, + speakers=None, loc="yada/yada", username=username, source_type="audio", local=True, test=result, chapters=False, pr=False, created_files=created_files) assert os.path.isfile(filename) @@ -68,7 +68,7 @@ def test_audio_without_title(): created_files = [] title = None filename = application.process_source(source=source, title=title, event_date=None, tags=None, category=None, - speakers=None, loc="yada/yada", model="tiny", username=username, pr=False, + speakers=None, loc="yada/yada", username=username, pr=False, source_type="audio", local=True, created_files=created_files, test=result, chapters=False) assert filename is None @@ -91,7 +91,7 @@ def test_audio_with_all_data(): date = datetime.strptime(date, '%Y-%m-%d').date() created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", model="tiny", username=username, + speakers=speakers, loc="yada/yada", username=username, source_type="audio", local=True, test=result, chapters=False, created_files=created_files, pr=False) category = [cat.strip() for cat in category.split(",")] diff --git a/test/test_video.py b/test/test_video.py index 2c65a6c..8ec747c 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -81,7 +81,7 @@ def test_video_with_title(): date = None created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", model="tiny", username=username, + speakers=speakers, loc="yada/yada", username=username, source_type="video", local=True, created_files=created_files, test=result, chapters=False) assert os.path.isfile(filename) @@ -107,7 +107,7 @@ def test_video_with_all_options(): date = datetime.strptime(date, '%Y-%m-%d').date() created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", model="tiny", username=username, + speakers=speakers, loc="yada/yada", username=username, source_type="video", local=True, created_files=created_files, test=True, chapters=False) assert os.path.isfile(filename) @@ -137,7 +137,7 @@ def test_video_with_chapters(): date = datetime.strptime(date, '%Y-%m-%d').date() created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", model="tiny", username=username, + speakers=speakers, loc="yada/yada", username=username, source_type="video", local=True, created_files=created_files, test=result, chapters=True, pr=True) chapter_names = [] diff --git a/transcriber.py b/transcriber.py index 5c068ca..5ec110a 100644 --- a/transcriber.py +++ b/transcriber.py @@ -26,9 +26,6 @@ def print_help(ctx, param, value): @click.command() @click.argument('source', nargs=1) @click.argument('loc', nargs=1) -@click.option('-m', '--model', type=click.Choice(['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v2']), default='tiny.en', - help='Options for transcription model' - ) @click.option('-t', '--title', type=str, help="Supply transcribed file title in 'quotes', title is mandatory in case of audio files") @click.option('-d', '--date', type=str, help="Supply the event date in format 'yyyy-mm-dd'") @@ -47,7 +44,6 @@ def print_help(ctx, param, value): def add( source: str, loc: str, - model: str, title: str, date: str, tags: str, @@ -76,7 +72,7 @@ def add( print("Invalid source") return filename = application.process_source(source=source, title=title, event_date=event_date, tags=tags, - category=category, speakers=speakers, loc=loc, model=model, + category=category, speakers=speakers, loc=loc, username=username, chapters=chapters, pr=pr, source_type=source_type, created_files=created_files) if filename: From 9a726e502d31b1aa67b9300f1dc1039a8eddd1e3 Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Thu, 20 Apr 2023 23:04:12 +0530 Subject: [PATCH 03/10] Revert "fix: removed the option to select different models" This reverts commit eb288eccde8ed4c163ff8badaf9abdafabd028a1. --- app/application.py | 29 +++++++++++++++-------------- test/test_audio.py | 6 +++--- test/test_video.py | 6 +++--- transcriber.py | 6 +++++- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/app/application.py b/app/application.py index 19dcd98..1a80707 100644 --- a/app/application.py +++ b/app/application.py @@ -193,7 +193,7 @@ def decimal_to_sexagesimal(dec): return f'{hrs}:{minu}:{sec}' -def process_mp3(filename): +def process_mp3(filename, model): print("Transcribing audio to text...") try: config = dotenv_values(".env") @@ -352,7 +352,7 @@ def check_source_type(source): return None -def process_audio(source, title, event_date, tags, category, speakers, loc, username, local, +def process_audio(source, title, event_date, tags, category, speakers, loc, model, username, local, created_files, test, pr): try: print("audio file detected") @@ -380,7 +380,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, user if test: result = test else: - result = process_mp3(abs_path) + result = process_mp3(abs_path, model) absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, tags=tags, category=category, speakers=speakers, username=username, local=local, @@ -397,7 +397,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, user print(e) -def process_videos(source, title, event_date, tags, category, speakers, loc, username, created_files, +def process_videos(source, title, event_date, tags, category, speakers, loc, model, username, created_files, chapters, pr): try: print("Playlist detected") @@ -411,11 +411,12 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, use print("Playlist is empty") return + selected_model = model + '.en' filename = "" for video in videos: filename = process_video(video=video, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, username=username, + speakers=speakers, loc=loc, model=selected_model, username=username, pr=pr, created_files=created_files, chapters=chapters, test=False) if filename is None: return None @@ -425,7 +426,7 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, use print(e) -def process_video(video, title, event_date, tags, category, speakers, loc, username, created_files, +def process_video(video, title, event_date, tags, category, speakers, loc, model, username, created_files, chapters, test, pr, local=False): try: result = "" @@ -471,7 +472,7 @@ def process_video(video, title, event_date, tags, category, speakers, loc, usern if file is None: print("File not found") return None - temp_res = process_mp3(filename=temp_filename) + temp_res = process_mp3(filename=temp_filename, model=model) created_files.append(temp_filename[:-4] + ".mp3") else: temp_res = "" @@ -489,7 +490,7 @@ def process_video(video, title, event_date, tags, category, speakers, loc, usern if not test: convert_video_to_mp3(abs_path) created_files.append(abs_path[:-4] + '.mp3') - result = process_mp3(abs_path[:-4] + '.mp3') + result = process_mp3(abs_path[:-4] + '.mp3', model) created_files.append(abs_path[:-4] + ".mp3") else: result = "" @@ -510,7 +511,7 @@ def process_video(video, title, event_date, tags, category, speakers, loc, usern print(e) -def process_source(source, title, event_date, tags, category, speakers, loc, username, source_type, +def process_source(source, title, event_date, tags, category, speakers, loc, model, username, source_type, created_files, chapters, local=False, test=None, pr=False): try: if not os.path.isdir("tmp"): @@ -521,24 +522,24 @@ def process_source(source, title, event_date, tags, category, speakers, loc, use if source_type == 'audio': filename = process_audio(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, username=username, + speakers=speakers, loc=loc, model=model, username=username, local=local, created_files=created_files, test=test, pr=pr) elif source_type == 'audio-local': filename = process_audio(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, username=username, + speakers=speakers, loc=loc, model=model, username=username, local=True, created_files=created_files, test=test, pr=pr) elif source_type == 'playlist': filename = process_videos(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, username=username, + speakers=speakers, loc=loc, model=model, username=username, created_files=created_files, chapters=chapters, pr=pr) elif source_type == 'video-local': filename = process_video(video=source, title=title, event_date=event_date, - tags=tags, category=category, speakers=speakers, loc=loc, + tags=tags, category=category, speakers=speakers, loc=loc, model=model, username=username, created_files=created_files, local=True, chapters=chapters, test=test, pr=pr) else: filename = process_video(video=source, title=title, event_date=event_date, - tags=tags, category=category, speakers=speakers, loc=loc, + tags=tags, category=category, speakers=speakers, loc=loc, model=model, username=username, created_files=created_files, local=local, chapters=chapters, test=test, pr=pr) return filename diff --git a/test/test_audio.py b/test/test_audio.py index 24aac52..731a4ed 100644 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -49,7 +49,7 @@ def test_audio_with_title(): username = "username" created_files = [] filename = application.process_source(source=source, title=title, event_date=None, tags=None, category=None, - speakers=None, loc="yada/yada", username=username, + speakers=None, loc="yada/yada", model="tiny", username=username, source_type="audio", local=True, test=result, chapters=False, pr=False, created_files=created_files) assert os.path.isfile(filename) @@ -68,7 +68,7 @@ def test_audio_without_title(): created_files = [] title = None filename = application.process_source(source=source, title=title, event_date=None, tags=None, category=None, - speakers=None, loc="yada/yada", username=username, pr=False, + speakers=None, loc="yada/yada", model="tiny", username=username, pr=False, source_type="audio", local=True, created_files=created_files, test=result, chapters=False) assert filename is None @@ -91,7 +91,7 @@ def test_audio_with_all_data(): date = datetime.strptime(date, '%Y-%m-%d').date() created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", username=username, + speakers=speakers, loc="yada/yada", model="tiny", username=username, source_type="audio", local=True, test=result, chapters=False, created_files=created_files, pr=False) category = [cat.strip() for cat in category.split(",")] diff --git a/test/test_video.py b/test/test_video.py index 8ec747c..2c65a6c 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -81,7 +81,7 @@ def test_video_with_title(): date = None created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", username=username, + speakers=speakers, loc="yada/yada", model="tiny", username=username, source_type="video", local=True, created_files=created_files, test=result, chapters=False) assert os.path.isfile(filename) @@ -107,7 +107,7 @@ def test_video_with_all_options(): date = datetime.strptime(date, '%Y-%m-%d').date() created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", username=username, + speakers=speakers, loc="yada/yada", model="tiny", username=username, source_type="video", local=True, created_files=created_files, test=True, chapters=False) assert os.path.isfile(filename) @@ -137,7 +137,7 @@ def test_video_with_chapters(): date = datetime.strptime(date, '%Y-%m-%d').date() created_files = [] filename = application.process_source(source=source, title=title, event_date=date, tags=tags, category=category, - speakers=speakers, loc="yada/yada", username=username, + speakers=speakers, loc="yada/yada", model="tiny", username=username, source_type="video", local=True, created_files=created_files, test=result, chapters=True, pr=True) chapter_names = [] diff --git a/transcriber.py b/transcriber.py index 5ec110a..5c068ca 100644 --- a/transcriber.py +++ b/transcriber.py @@ -26,6 +26,9 @@ def print_help(ctx, param, value): @click.command() @click.argument('source', nargs=1) @click.argument('loc', nargs=1) +@click.option('-m', '--model', type=click.Choice(['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v2']), default='tiny.en', + help='Options for transcription model' + ) @click.option('-t', '--title', type=str, help="Supply transcribed file title in 'quotes', title is mandatory in case of audio files") @click.option('-d', '--date', type=str, help="Supply the event date in format 'yyyy-mm-dd'") @@ -44,6 +47,7 @@ def print_help(ctx, param, value): def add( source: str, loc: str, + model: str, title: str, date: str, tags: str, @@ -72,7 +76,7 @@ def add( print("Invalid source") return filename = application.process_source(source=source, title=title, event_date=event_date, tags=tags, - category=category, speakers=speakers, loc=loc, + category=category, speakers=speakers, loc=loc, model=model, username=username, chapters=chapters, pr=pr, source_type=source_type, created_files=created_files) if filename: From 8708491c966c2d256ad6d61126060f9c98b03bea Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Thu, 20 Apr 2023 23:04:13 +0530 Subject: [PATCH 04/10] Revert "feat: added deepgram for transcription" This reverts commit 9f1a3a4e7684fe22ee33bde1407ef16d99e523a7. --- app/application.py | 48 ++++++++-------------------------------------- requirements.txt | 5 ++--- 2 files changed, 10 insertions(+), 43 deletions(-) diff --git a/app/application.py b/app/application.py index 1a80707..57b6918 100644 --- a/app/application.py +++ b/app/application.py @@ -5,6 +5,7 @@ from clint.textui import progress import pytube from moviepy.editor import VideoFileClip +import whisper import os import static_ffmpeg from app import __version__ @@ -14,8 +15,6 @@ import time from dotenv import dotenv_values import yt_dlp -from deepgram import Deepgram -import mimetypes def download_video(url): @@ -166,7 +165,6 @@ def get_playlist_videos(url): print(e) return - def get_audio_file(url, title): print("URL: " + url) print("downloading audio file") @@ -185,42 +183,14 @@ def get_audio_file(url, title): return -def decimal_to_sexagesimal(dec): - sec = int(dec % 60) - minu = int((dec // 60) % 60) - hrs = int((dec // 60) // 60) - - return f'{hrs}:{minu}:{sec}' - - def process_mp3(filename, model): print("Transcribing audio to text...") try: - config = dotenv_values(".env") - dg_client = Deepgram(config["DEEPGRAM_API_KEY"]) - - with open(filename, "rb") as audio: - mimeType = mimetypes.MimeTypes().guess_type(filename)[0] - source = {'buffer': audio, 'mimetype': mimeType} - response = dg_client.transcription.sync_prerecorded(source, {'punctuate': True, 'speaker_labels': True, - 'diarize': True, 'smart_formatting': True}) - para = "" - string = "" - curr_speaker = None - for word in response["results"]["channels"][0]["alternatives"][0]["words"]: - if word["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = string + f'Speaker {word["speaker"]}: {decimal_to_sexagesimal(word["start"])}' - curr_speaker = word["speaker"] - string = string + '\n\n' - - para = para + " " + word["punctuated_word"] - para = para.strip(" ") - string = string + para - return string + mymodel = whisper.load_model(model) + result = mymodel.transcribe(filename[:-4] + ".mp3") + result = result["text"] + print("Removed video and audio files") + return result except Exception as e: print("Error transcribing audio to text") print(e) @@ -295,8 +265,7 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video print(e) -def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, - test, +def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, test, pr): try: print("writing .md file") @@ -381,8 +350,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode result = test else: result = process_mp3(abs_path, model) - absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, - tags=tags, + absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, tags=tags, category=category, speakers=speakers, username=username, local=local, video_title=filename[:-4], test=test, pr=pr) diff --git a/requirements.txt b/requirements.txt index 1e0993a..089bf23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,11 @@ +openai-whisper==20230314 Click==7.0 clint==0.5.1 -deepgram_sdk==2.4.0 moviepy==1.0.3 -openai_whisper==20230314 pytest==7.2.1 python-dotenv==1.0.0 pytube==12.1.2 requests==2.28.2 -setuptools==67.6.1 +setuptools==45.2.0 static_ffmpeg==2.3 yt_dlp==2023.3.4 From 6a941dae0590d3ec3e9ef3425c11fa1427e0db27 Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Sat, 22 Apr 2023 23:56:18 +0530 Subject: [PATCH 05/10] fix: added chapters without splitting the original file --- app/application.py | 128 +++++++++++++++++++++------------------------ test/test_cli.py | 11 ---- 2 files changed, 59 insertions(+), 80 deletions(-) diff --git a/app/application.py b/app/application.py index 57b6918..85a14c6 100644 --- a/app/application.py +++ b/app/application.py @@ -55,19 +55,13 @@ def read_description(prefix): return list_of_chapters for index, x in enumerate(info['chapters']): name = x['title'] - start = x['start_time'] - m, s = divmod(start, 60) - h, m = divmod(m, 60) - current_dur = ':'.join([str(int(h)), str(int(m)), str(s)]) - start = current_dur - - list_of_chapters.append((str(index), str(start), str(name))) + list_of_chapters.append((str(index), start, str(name))) return list_of_chapters except Exception as e: print("Error reading description") - return list_of_chapters + return [] def write_chapters_file(chapter_file: str, chapter_list: list) -> None: @@ -85,33 +79,6 @@ def write_chapters_file(chapter_file: str, chapter_list: list) -> None: print(e) -def split_mp4(chapters: list, download_filename: str, download_name: str) -> None: - try: - current_duration_pretext = subprocess.run(['ffprobe', '-i', download_filename, - '-show_entries', 'format=duration', - '-v', 'quiet'], - capture_output=True, encoding='UTF8') - current_duration = float(current_duration_pretext.stdout[18:-13]) - m, s = divmod(current_duration, 60) - h, m = divmod(m, 60) - current_dur = ':'.join([str(int(h)), str(int(m)), str(s)]) - for current_index, current_chapter in enumerate(chapters): - # current_chapter will be a tuple: position, timecode, name - next_index = current_index + 1 - start_time = current_chapter[1] - try: - end_time = chapters[next_index][1] - except: - end_time = current_dur - output_name = f'{download_name} - ({current_index}).mp4' - subprocess.run(["ffmpeg", "-ss", start_time, "-to", end_time, - "-i", download_filename, "-acodec", "copy", - "-vcodec", "copy", output_name, "-loglevel", "quiet"]) - except Exception as e: - print("Error splitting mp4") - print(e) - - def convert_video_to_mp3(filename): try: clip = VideoFileClip(filename) @@ -165,6 +132,7 @@ def get_playlist_videos(url): print(e) return + def get_audio_file(url, title): print("URL: " + url) print("downloading audio file") @@ -186,17 +154,27 @@ def get_audio_file(url, title): def process_mp3(filename, model): print("Transcribing audio to text...") try: - mymodel = whisper.load_model(model) - result = mymodel.transcribe(filename[:-4] + ".mp3") - result = result["text"] + my_model = whisper.load_model(model) + result = my_model.transcribe(filename) + data = [] + for x in result["segments"]: + data.append(tuple((x["start"], x["end"], x["text"]))) print("Removed video and audio files") - return result + return data except Exception as e: print("Error transcribing audio to text") print(e) return +def create_transcript(data): + result = "" + for x in data: + result = result + x[2] + " " + + return result + + def initialize(): try: print(''' @@ -265,7 +243,8 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video print(e) -def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, test, +def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, + test, pr): try: print("writing .md file") @@ -334,7 +313,9 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode # process audio file if not local: filename = get_audio_file(url=source, title=title) - abs_path = os.path.abspath(path=filename) + abs_path = os.path.abspath(path="tmp/" + filename) + print("filename", filename) + print("abs_path", abs_path) created_files.append(abs_path) else: filename = source.split("/")[-1] @@ -344,13 +325,16 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode print("File not found") return if filename.endswith('wav'): + initialize() abs_path = convert_wav_to_mp3(abs_path=abs_path, filename=filename) created_files.append(abs_path) if test: result = test else: result = process_mp3(abs_path, model) - absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, tags=tags, + result = create_transcript(result) + absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, + tags=tags, category=category, speakers=speakers, username=username, local=local, video_title=filename[:-4], test=test, pr=pr) @@ -394,10 +378,34 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, mod print(e) +def combine_chapter(chapters, transcript): + chapters_pointer = 0 + transcript_pointer = 0 + result = "" + # chapters index, start time, name + # transcript start time, end time, text + + while chapters_pointer < len(chapters) and transcript_pointer < len(transcript): + if chapters[chapters_pointer][1] <= transcript[transcript_pointer][0]: + result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" + chapters_pointer += 1 + else: + result = result + transcript[transcript_pointer][2] + transcript_pointer += 1 + + while transcript_pointer < len(transcript): + result = result + transcript[transcript_pointer][2] + transcript_pointer += 1 + + with open("result.md", "w") as file: + file.write(result) + + return result + + def process_video(video, title, event_date, tags, category, speakers, loc, model, username, created_files, chapters, test, pr, local=False): try: - result = "" curr_time = str(round(time.time() * 1000)) if not local: if "watch?v=" in video: @@ -422,49 +430,31 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model print() print() + initialize() if chapters and not test: chapters = read_description("tmp/") elif test: chapters = read_description("test/testAssets/") + convert_video_to_mp3(abs_path[:-4] + '.mp4') + result = process_mp3(abs_path[:-4] + ".mp3", model) + created_files.append(abs_path[:-4] + ".mp3") if chapters and len(chapters) > 0: print("Chapters detected") write_chapters_file(abs_path[:-4] + '.chapters', chapters) created_files.append(abs_path[:-4] + '.chapters') - split_mp4(chapters=chapters, download_filename=abs_path, download_name=abs_path[:-4]) - initialize() - for current_index, chapter in enumerate(chapters): - print(f"Processing chapter {chapter[2]} {current_index + 1} of {len(chapters)}") - temp_filename = f'{abs_path[:-4]} - ({current_index}).mp4' - if not test: - file = convert_video_to_mp3(filename=temp_filename) - if file is None: - print("File not found") - return None - temp_res = process_mp3(filename=temp_filename, model=model) - created_files.append(temp_filename[:-4] + ".mp3") - else: - temp_res = "" - created_files.append(temp_filename) - - if chapter[2].startswith(" Date: Tue, 9 May 2023 02:28:18 +0530 Subject: [PATCH 06/10] feat: added options for diarization and summary using deepgram (diarization doesn't work with chapters) --- app/application.py | 153 +++++++++++++++++++++++++++++++++++---------- requirements.txt | 1 + transcriber.py | 20 ++++-- 3 files changed, 135 insertions(+), 39 deletions(-) diff --git a/app/application.py b/app/application.py index 85a14c6..ded61a9 100644 --- a/app/application.py +++ b/app/application.py @@ -15,6 +15,8 @@ import time from dotenv import dotenv_values import yt_dlp +from deepgram import Deepgram +import mimetypes def download_video(url): @@ -152,7 +154,7 @@ def get_audio_file(url, title): def process_mp3(filename, model): - print("Transcribing audio to text...") + print("Transcribing audio to text using whisper ...") try: my_model = whisper.load_model(model) result = my_model.transcribe(filename) @@ -167,6 +169,69 @@ def process_mp3(filename, model): return +def decimal_to_sexagesimal(dec): + sec = int(dec % 60) + minu = int((dec // 60) % 60) + hrs = int((dec // 60) // 60) + + return f'{hrs}:{minu}:{sec}' + + +def get_deepgram_transcript(deepgram_data, diarize): + if diarize: + para = "" + string = "" + curr_speaker = None + for word in deepgram_data["results"]["channels"][0]["alternatives"][0]["words"]: + if word["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = string + f'Speaker {word["speaker"]}: {decimal_to_sexagesimal(word["start"])}' + curr_speaker = word["speaker"] + string = string + '\n\n' + + para = para + " " + word["punctuated_word"] + para = para.strip(" ") + string = string + para + return string + else: + return deepgram_data["results"]["channels"][0]["alternatives"][0]["transcript"] + + +def get_deepgram_summary(deepgram_data): + try: + summaries = deepgram_data["results"]["channels"][0]["alternatives"][0]["summaries"] + summary = "" + for x in summaries: + summary = summary + " " + x["summary"] + return summary.strip(" ") + except Exception as e: + print("Error getting summary") + print(e) + + +def process_mp3_deepgram(filename, summarize, diarize): + print("Transcribing audio to text using deepgram...") + try: + config = dotenv_values(".env") + dg_client = Deepgram(config["DEEPGRAM_API_KEY"]) + + with open(filename, "rb") as audio: + mimeType = mimetypes.MimeTypes().guess_type(filename)[0] + source = {'buffer': audio, 'mimetype': mimeType} + response = dg_client.transcription.sync_prerecorded(source, {'punctuate': True, 'speaker_labels': True, + 'diarize': diarize, 'smart_formatting': True, + 'summarize': summarize}) + audio.close() + return response + except Exception as e: + print("Error transcribing audio to text") + print(e) + return + + def create_transcript(data): result = "" for x in data: @@ -189,7 +254,8 @@ def initialize(): print(e) -def write_to_file(result, loc, url, title, date, tags, category, speakers, video_title, username, local, test, pr): +def write_to_file(result, loc, url, title, date, tags, category, speakers, video_title, username, local, test, pr, + summary): try: transcribed_text = result if title: @@ -219,6 +285,8 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video for i in range(len(category)): category[i] = category[i].strip() meta_data += f'categories: {category}\n' + if summary: + meta_data += f'summary: {summary}\n' file_name = video_title.replace(' ', '-') file_name_with_ext = "tmp/" + file_name + '.md' @@ -244,12 +312,11 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, - test, - pr): + test, pr, summary=None): try: print("writing .md file") file_name_with_ext = write_to_file(result, loc, video, title, event_date, tags, category, speakers, video_title, - username, local, test, pr) + username, local, test, pr, summary) print("wrote .md file") absolute_path = os.path.abspath(file_name_with_ext) @@ -301,7 +368,7 @@ def check_source_type(source): def process_audio(source, title, event_date, tags, category, speakers, loc, model, username, local, - created_files, test, pr): + created_files, test, pr, deepgram, summarize, diarize): try: print("audio file detected") curr_time = str(round(time.time() * 1000)) @@ -311,6 +378,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode print("Error: Please supply a title for the audio file") return None # process audio file + summary = None if not local: filename = get_audio_file(url=source, title=title) abs_path = os.path.abspath(path="tmp/" + filename) @@ -331,12 +399,17 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode if test: result = test else: - result = process_mp3(abs_path, model) - result = create_transcript(result) + if deepgram or summarize: + deepgram_resp = process_mp3_deepgram(filename=abs_path, summarize=summarize, diarize=diarize) + result = get_deepgram_transcript(deepgram_data=deepgram_resp, diarize=diarize) + if summarize: + summary = get_deepgram_summary(deepgram_data=deepgram_resp) + if not deepgram: + result = process_mp3(abs_path, model) + result = create_transcript(result) absolute_path = get_md_file_path(result=result, loc=loc, video=source, title=title, event_date=event_date, - tags=tags, - category=category, speakers=speakers, username=username, local=local, - video_title=filename[:-4], test=test, pr=pr) + tags=tags, category=category, speakers=speakers, username=username, + local=local, video_title=filename[:-4], test=test, pr=pr, summary=summary) created_files.append(absolute_path) if pr: @@ -350,7 +423,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode def process_videos(source, title, event_date, tags, category, speakers, loc, model, username, created_files, - chapters, pr): + chapters, pr, deepgram, summarize, diarize): try: print("Playlist detected") if source.startswith("http") or source.startswith("www"): @@ -369,7 +442,8 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, mod for video in videos: filename = process_video(video=video, title=title, event_date=event_date, tags=tags, category=category, speakers=speakers, loc=loc, model=selected_model, username=username, - pr=pr, created_files=created_files, chapters=chapters, test=False) + pr=pr, created_files=created_files, chapters=chapters, test=False, diarize=diarize, + deepgram=deepgram, summarize=summarize) if filename is None: return None return filename @@ -404,7 +478,7 @@ def combine_chapter(chapters, transcript): def process_video(video, title, event_date, tags, category, speakers, loc, model, username, created_files, - chapters, test, pr, local=False): + chapters, test, pr, local=False, deepgram=False, summarize=False, diarize=False): try: curr_time = str(round(time.time() * 1000)) if not local: @@ -431,12 +505,21 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model print() initialize() + summary = None if chapters and not test: chapters = read_description("tmp/") elif test: chapters = read_description("test/testAssets/") convert_video_to_mp3(abs_path[:-4] + '.mp4') - result = process_mp3(abs_path[:-4] + ".mp3", model) + if deepgram or summarize: + deepgram_data = process_mp3_deepgram(abs_path[:-4] + ".mp3", summarize=summarize, diarize=diarize) + result = get_deepgram_transcript(deepgram_data=deepgram_data, diarize=diarize) + if summarize: + print("Summarizing") + summary = get_deepgram_summary(deepgram_data=deepgram_data) + print(summary) + if not deepgram: + result = process_mp3(abs_path[:-4] + ".mp3", model) created_files.append(abs_path[:-4] + ".mp3") if chapters and len(chapters) > 0: print("Chapters detected") @@ -447,16 +530,16 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model created_files.append(abs_path) created_files.append("tmp/" + filename[:-4] + '.chapters') else: - if not test: + if not test and not deepgram: result = create_transcript(result) - else: + elif not deepgram: result = "" if not title: title = filename[:-4] + print("Creating markdown file") absolute_path = get_md_file_path(result=result, loc=loc, video=video, title=title, event_date=event_date, - tags=tags, - category=category, speakers=speakers, username=username, - video_title=filename[:-4], local=local, pr=pr, test=test) + tags=tags, summary=summary, category=category, speakers=speakers, + username=username, video_title=filename[:-4], local=local, pr=pr, test=test) created_files.append("tmp/" + filename[:-4] + '.description') if not test: if pr: @@ -470,7 +553,8 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model def process_source(source, title, event_date, tags, category, speakers, loc, model, username, source_type, - created_files, chapters, local=False, test=None, pr=False): + created_files, chapters, local=False, test=None, pr=False, deepgram=False, summarize=False, + diarize=False): try: if not os.path.isdir("tmp"): os.mkdir("tmp") @@ -480,26 +564,29 @@ def process_source(source, title, event_date, tags, category, speakers, loc, mod if source_type == 'audio': filename = process_audio(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=model, username=username, - local=local, created_files=created_files, test=test, pr=pr) + speakers=speakers, loc=loc, model=model, username=username, summarize=summarize, + local=local, created_files=created_files, test=test, pr=pr, deepgram=deepgram, + diarize=diarize) elif source_type == 'audio-local': filename = process_audio(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=model, username=username, - local=True, created_files=created_files, test=test, pr=pr) + speakers=speakers, loc=loc, model=model, username=username, summarize=summarize, + local=True, created_files=created_files, test=test, pr=pr, deepgram=deepgram, + diarize=diarize) elif source_type == 'playlist': filename = process_videos(source=source, title=title, event_date=event_date, tags=tags, category=category, - speakers=speakers, loc=loc, model=model, username=username, - created_files=created_files, chapters=chapters, pr=pr) + speakers=speakers, loc=loc, model=model, username=username, summarize=summarize, + created_files=created_files, chapters=chapters, pr=pr, deepgram=deepgram, + diarize=diarize) elif source_type == 'video-local': - filename = process_video(video=source, title=title, event_date=event_date, + filename = process_video(video=source, title=title, event_date=event_date, summarize=summarize, tags=tags, category=category, speakers=speakers, loc=loc, model=model, - username=username, created_files=created_files, local=True, - chapters=chapters, test=test, pr=pr) + username=username, created_files=created_files, local=True, diarize=diarize, + chapters=chapters, test=test, pr=pr, deepgram=deepgram) else: - filename = process_video(video=source, title=title, event_date=event_date, + filename = process_video(video=source, title=title, event_date=event_date, summarize=summarize, tags=tags, category=category, speakers=speakers, loc=loc, model=model, - username=username, created_files=created_files, local=local, - chapters=chapters, test=test, pr=pr) + username=username, created_files=created_files, local=local, diarize=diarize, + chapters=chapters, test=test, pr=pr, deepgram=deepgram) return filename except Exception as e: print("Error processing source") diff --git a/requirements.txt b/requirements.txt index 089bf23..fb8dafd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ requests==2.28.2 setuptools==45.2.0 static_ffmpeg==2.3 yt_dlp==2023.3.4 +deepgram-sdk==2.4.0 diff --git a/transcriber.py b/transcriber.py index 5c068ca..f8aab5f 100644 --- a/transcriber.py +++ b/transcriber.py @@ -26,9 +26,9 @@ def print_help(ctx, param, value): @click.command() @click.argument('source', nargs=1) @click.argument('loc', nargs=1) -@click.option('-m', '--model', type=click.Choice(['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v2']), default='tiny.en', - help='Options for transcription model' - ) +@click.option('-m', '--model', type=click.Choice( + ['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v2']), default='tiny.en', + help='Options for transcription model') @click.option('-t', '--title', type=str, help="Supply transcribed file title in 'quotes', title is mandatory in case of audio files") @click.option('-d', '--date', type=str, help="Supply the event date in format 'yyyy-mm-dd'") @@ -44,6 +44,10 @@ def print_help(ctx, param, value): @click.option('-h', '--help', is_flag=True, callback=print_help, expose_value=False, is_eager=True, help="Show the application's help and exit.") @click.option('-p', '--PR', is_flag=True, default=False, help="Supply this flag if you want to generate a payload") +@click.option('-D', '--deepgram', is_flag=True, default=False, help="Supply this flag if you want to use deepgram") +@click.option('-S', '--summarize', is_flag=True, default=False, + help="Supply this flag if you want to summarize the content") +@click.option('--diarize', is_flag=True, default=False, help="Supply this flag if you want to diarize the content") def add( source: str, loc: str, @@ -54,7 +58,10 @@ def add( speakers: str, category: str, chapters: bool, - pr: bool + pr: bool, + deepgram: bool, + summarize: bool, + diarize: bool ) -> None: """Supply a YouTube video id and directory for transcription. \n Note: The https links need to be wrapped in quotes when running the command on zsh @@ -77,8 +84,9 @@ def add( return filename = application.process_source(source=source, title=title, event_date=event_date, tags=tags, category=category, speakers=speakers, loc=loc, model=model, - username=username, chapters=chapters, pr=pr, - source_type=source_type, created_files=created_files) + username=username, chapters=chapters, pr=pr, summarize=summarize, + source_type=source_type, created_files=created_files, deepgram=deepgram, + diarize=diarize) if filename: """ INITIALIZE GIT AND OPEN A PR""" print("Transcription complete") From 8662914150e160fa7ea1809bf16ff5d54d1d39b0 Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Wed, 10 May 2023 03:00:46 +0530 Subject: [PATCH 07/10] feat: added chapters support to deepgram --- app/application.py | 72 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/app/application.py b/app/application.py index ded61a9..c0327a7 100644 --- a/app/application.py +++ b/app/application.py @@ -292,7 +292,7 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video file_name_with_ext = "tmp/" + file_name + '.md' if date: - meta_data = meta_data + f'date: {date}\n' + meta_data += f'date: {date}\n' meta_data += '---\n' if test is not None or pr: @@ -312,7 +312,7 @@ def write_to_file(result, loc, url, title, date, tags, category, speakers, video def get_md_file_path(result, loc, video, title, event_date, tags, category, speakers, username, local, video_title, - test, pr, summary=None): + test, pr, summary=""): try: print("writing .md file") file_name_with_ext = write_to_file(result, loc, video, title, event_date, tags, category, speakers, video_title, @@ -379,6 +379,7 @@ def process_audio(source, title, event_date, tags, category, speakers, loc, mode return None # process audio file summary = None + result = None if not local: filename = get_audio_file(url=source, title=title) abs_path = os.path.abspath(path="tmp/" + filename) @@ -452,29 +453,54 @@ def process_videos(source, title, event_date, tags, category, speakers, loc, mod print(e) +def combine_deepgram_with_chapters(deepgram_data, chapters): + try: + chapters_pointer = 0 + words_pointer = 0 + result = "" + words = deepgram_data["results"]["channels"][0]["alternatives"][0]["words"] + # chapters index, start time, name + # transcript start time, end time, text + while chapters_pointer < len(chapters) and words_pointer < len(words): + if chapters[chapters_pointer][1] <= words[words_pointer]["end"]: + result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" + chapters_pointer += 1 + else: + result = result + words[words_pointer]["punctuated_word"] + " " + words_pointer += 1 + return result + except Exception as e: + print("Error combining deepgram with chapters") + print(e) + + def combine_chapter(chapters, transcript): - chapters_pointer = 0 - transcript_pointer = 0 - result = "" - # chapters index, start time, name - # transcript start time, end time, text + try: + chapters_pointer = 0 + transcript_pointer = 0 + result = "" + # chapters index, start time, name + # transcript start time, end time, text + + while chapters_pointer < len(chapters) and transcript_pointer < len(transcript): + if chapters[chapters_pointer][1] <= transcript[transcript_pointer][0]: + result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" + chapters_pointer += 1 + else: + result = result + transcript[transcript_pointer][2] + transcript_pointer += 1 - while chapters_pointer < len(chapters) and transcript_pointer < len(transcript): - if chapters[chapters_pointer][1] <= transcript[transcript_pointer][0]: - result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" - chapters_pointer += 1 - else: + while transcript_pointer < len(transcript): result = result + transcript[transcript_pointer][2] transcript_pointer += 1 - while transcript_pointer < len(transcript): - result = result + transcript[transcript_pointer][2] - transcript_pointer += 1 + with open("result.md", "w") as file: + file.write(result) - with open("result.md", "w") as file: - file.write(result) - - return result + return result + except Exception as e: + print("Error combining chapters") + print(e) def process_video(video, title, event_date, tags, category, speakers, loc, model, username, created_files, @@ -506,6 +532,8 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model initialize() summary = None + result = "" + deepgram_data = None if chapters and not test: chapters = read_description("tmp/") elif test: @@ -517,7 +545,6 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model if summarize: print("Summarizing") summary = get_deepgram_summary(deepgram_data=deepgram_data) - print(summary) if not deepgram: result = process_mp3(abs_path[:-4] + ".mp3", model) created_files.append(abs_path[:-4] + ".mp3") @@ -525,7 +552,10 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model print("Chapters detected") write_chapters_file(abs_path[:-4] + '.chapters', chapters) created_files.append(abs_path[:-4] + '.chapters') - result = combine_chapter(chapters=chapters, transcript=result) + if deepgram: + result = combine_deepgram_with_chapters(deepgram_data=deepgram_data, chapters=chapters) + else: + result = combine_chapter(chapters=chapters, transcript=result) if not local: created_files.append(abs_path) created_files.append("tmp/" + filename[:-4] + '.chapters') From 3d87efea0e606aa14d0d3d31ee1ace3ac0a1b2cd Mon Sep 17 00:00:00 2001 From: Jonas Date: Thu, 11 May 2023 20:48:33 -0400 Subject: [PATCH 08/10] add -M flag for diarize and double-digit timestamps --- app/application.py | 2 +- transcriber.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/application.py b/app/application.py index c0327a7..3d77378 100644 --- a/app/application.py +++ b/app/application.py @@ -174,7 +174,7 @@ def decimal_to_sexagesimal(dec): minu = int((dec // 60) % 60) hrs = int((dec // 60) // 60) - return f'{hrs}:{minu}:{sec}' + return f'{hrs:02d}:{minu:02d}:{sec:02d}' def get_deepgram_transcript(deepgram_data, diarize): diff --git a/transcriber.py b/transcriber.py index f8aab5f..4d165c8 100644 --- a/transcriber.py +++ b/transcriber.py @@ -47,7 +47,7 @@ def print_help(ctx, param, value): @click.option('-D', '--deepgram', is_flag=True, default=False, help="Supply this flag if you want to use deepgram") @click.option('-S', '--summarize', is_flag=True, default=False, help="Supply this flag if you want to summarize the content") -@click.option('--diarize', is_flag=True, default=False, help="Supply this flag if you want to diarize the content") +@click.option('-M', '--diarize', is_flag=True, default=False, help="Supply this flag if you have multiple speakers AKA want to diarize the content") def add( source: str, loc: str, From 025e51979e420f68d76d0c20b54a22bda4e5d2bc Mon Sep 17 00:00:00 2001 From: Jonas Date: Thu, 11 May 2023 21:01:29 -0400 Subject: [PATCH 09/10] add transcription after final chapter header --- app/application.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/app/application.py b/app/application.py index 3d77378..942a969 100644 --- a/app/application.py +++ b/app/application.py @@ -468,6 +468,15 @@ def combine_deepgram_with_chapters(deepgram_data, chapters): else: result = result + words[words_pointer]["punctuated_word"] + " " words_pointer += 1 + + # Append the final chapter heading and remaining content + while chapters_pointer < len(chapters): + result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" + chapters_pointer += 1 + while words_pointer < len(words): + result = result + words[words_pointer]["punctuated_word"] + " " + words_pointer += 1 + return result except Exception as e: print("Error combining deepgram with chapters") From 5d96b2e191ec87eae016cb9eb16747d02c29905f Mon Sep 17 00:00:00 2001 From: masterchief164 <63920595+masterchief164@users.noreply.github.com> Date: Fri, 12 May 2023 16:17:19 +0530 Subject: [PATCH 10/10] feat: added chapters support to deepgram with diarization --- app/application.py | 113 +++++++++++++++++++++++++++++++++------------ transcriber.py | 3 +- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/app/application.py b/app/application.py index 942a969..5f1c6c3 100644 --- a/app/application.py +++ b/app/application.py @@ -177,6 +177,85 @@ def decimal_to_sexagesimal(dec): return f'{hrs:02d}:{minu:02d}:{sec:02d}' +def combine_chapter(chapters, transcript): + try: + chapters_pointer = 0 + transcript_pointer = 0 + result = "" + # chapters index, start time, name + # transcript start time, end time, text + + while chapters_pointer < len(chapters) and transcript_pointer < len(transcript): + if chapters[chapters_pointer][1] <= transcript[transcript_pointer][0]: + result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" + chapters_pointer += 1 + else: + result = result + transcript[transcript_pointer][2] + transcript_pointer += 1 + + while transcript_pointer < len(transcript): + result = result + transcript[transcript_pointer][2] + transcript_pointer += 1 + + with open("result.md", "w") as file: + file.write(result) + + return result + except Exception as e: + print("Error combining chapters") + print(e) + + +def combine_deepgram_chapters_with_diarization(deepgram_data, chapters): + try: + para = "" + string = "" + curr_speaker = None + words = deepgram_data["results"]["channels"][0]["alternatives"][0]["words"] + words_pointer = 0 + chapters_pointer = 0 + while chapters_pointer < len(chapters) and words_pointer < len(words): + if chapters[chapters_pointer][1] <= words[words_pointer]["start"]: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = string + f'## {chapters[chapters_pointer][2]}\n\n' + chapters_pointer += 1 + else: + if words[words_pointer]["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = string + f'Speaker {words[words_pointer]["speaker"]}:' \ + f' {decimal_to_sexagesimal(words[words_pointer]["start"])}' + curr_speaker = words[words_pointer]["speaker"] + string = string + '\n\n' + + para = para + " " + words[words_pointer]["punctuated_word"] + words_pointer += 1 + while words_pointer < len(words): + if words[words_pointer]["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = string + f'Speaker {words[words_pointer]["speaker"]}:' \ + f' {decimal_to_sexagesimal(words[words_pointer]["start"])}' + curr_speaker = words[words_pointer]["speaker"] + string = string + '\n\n' + + para = para + " " + words[words_pointer]["punctuated_word"] + words_pointer += 1 + para = para.strip(" ") + string = string + para + return string + except Exception as e: + print("Error combining deepgram chapters") + print(e) + + def get_deepgram_transcript(deepgram_data, diarize): if diarize: para = "" @@ -483,35 +562,6 @@ def combine_deepgram_with_chapters(deepgram_data, chapters): print(e) -def combine_chapter(chapters, transcript): - try: - chapters_pointer = 0 - transcript_pointer = 0 - result = "" - # chapters index, start time, name - # transcript start time, end time, text - - while chapters_pointer < len(chapters) and transcript_pointer < len(transcript): - if chapters[chapters_pointer][1] <= transcript[transcript_pointer][0]: - result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" - chapters_pointer += 1 - else: - result = result + transcript[transcript_pointer][2] - transcript_pointer += 1 - - while transcript_pointer < len(transcript): - result = result + transcript[transcript_pointer][2] - transcript_pointer += 1 - - with open("result.md", "w") as file: - file.write(result) - - return result - except Exception as e: - print("Error combining chapters") - print(e) - - def process_video(video, title, event_date, tags, category, speakers, loc, model, username, created_files, chapters, test, pr, local=False, deepgram=False, summarize=False, diarize=False): try: @@ -562,7 +612,10 @@ def process_video(video, title, event_date, tags, category, speakers, loc, model write_chapters_file(abs_path[:-4] + '.chapters', chapters) created_files.append(abs_path[:-4] + '.chapters') if deepgram: - result = combine_deepgram_with_chapters(deepgram_data=deepgram_data, chapters=chapters) + if diarize: + result = combine_deepgram_chapters_with_diarization(deepgram_data=deepgram_data, chapters=chapters) + else: + result = combine_deepgram_with_chapters(deepgram_data=deepgram_data, chapters=chapters) else: result = combine_chapter(chapters=chapters, transcript=result) if not local: diff --git a/transcriber.py b/transcriber.py index 4d165c8..1b037d5 100644 --- a/transcriber.py +++ b/transcriber.py @@ -47,7 +47,8 @@ def print_help(ctx, param, value): @click.option('-D', '--deepgram', is_flag=True, default=False, help="Supply this flag if you want to use deepgram") @click.option('-S', '--summarize', is_flag=True, default=False, help="Supply this flag if you want to summarize the content") -@click.option('-M', '--diarize', is_flag=True, default=False, help="Supply this flag if you have multiple speakers AKA want to diarize the content") +@click.option('-M', '--diarize', is_flag=True, default=False, help="Supply this flag if you have multiple speakers AKA " + "want to diarize the content") def add( source: str, loc: str,