From 0c30d155b679c17a899deffe610d846d3b756562 Mon Sep 17 00:00:00 2001
From: kouloumos
Date: Mon, 13 Nov 2023 15:48:25 +0200
Subject: [PATCH] configuration changes for cli options & update README

- `tags`, `speakers` and `category` must now be used once per item
  (tag, speaker, category) that we want to add to the transcript's metadata
- Better wording for the help text of cli options
- Update README
---
 Readme.md                    | 156 ++++++++++++++--------------
 app/transcription.py         |   2 +-
 test/testAssets/payload.json |   2 +-
 test/test_video.py           |   2 +-
 transcriber.py               | 191 +++++++++++++++++++----------------
 5 files changed, 183 insertions(+), 170 deletions(-)

diff --git a/Readme.md b/Readme.md
index 11748f7..35d8be5 100644
--- a/Readme.md
+++ b/Readme.md
@@ -2,36 +2,63 @@
 This cli app transcribes audio and video for submission to the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo.
 
+**Available transcription models and services**
+
+- (local) Whisper `--model xxx [default: tiny.en]`
+- (remote) Deepgram (whisper-large) `--deepgram [default: False]`
+  - summarization `--summarize`
+  - diarization `--diarize`
+
 **Features**:
+
 - Transcription using [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/)
-- Collection of video's metadata when sourcing from YouTube
+- Collection of the video's metadata when sourcing from YouTube.
 - Open Pull Request on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
+- Save the resulting transcript to a markdown format supported by bitcointranscripts.
 - Upload the resulting transcript to an AWS S3 Bucket repo.
-- Push the resulting transcript to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend)
+- Push the resulting transcript to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend), or save the payload in a JSON file for later use.
 
-## Steps:
+## Prerequisites
 
-The step-by-step flow for the scripts are:
+- To use [deepgram](https://deepgram.com/) as a transcription service,
+  you must have a valid `DEEPGRAM_API_KEY` in the `.env` file.
 
-- transcribe given video and generate the output file
+- To push the resulting transcript to a Queuer backend, you must have a
+  valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save
+  the payload in a JSON file using the `--noqueue` flag.
 
-- authenticate the user to GitHub
+- To enable us to fork the bitcointranscripts repo and open a PR, you need to
+  be logged in to your GitHub account. Install the GitHub CLI using the
+  instructions on their repo [here](https://github.com/cli/cli#installation),
+  then log in with `gh auth login` and select the following options when
+  prompted:
 
-- fork the transcript repo/use their existing fork, clone it and branch out
+  - What account do you want to log into? `GitHub.com`
+
+  - What is your preferred protocol for Git operations? `SSH`
 
-- copy the transcript file to the new transcript repo
+  - Upload your SSH public key to your GitHub account? `skip`
 
-- commit new file and push
+  - How would you like to authenticate GitHub CLI? `Login with a web browser`
+
+  - copy the generated one-time code and paste it in the browser to
+    authenticate (needed if you have enabled 2FA)
 
-- then open a PR
+- To enable pushing the model files to an S3 bucket,
+  - [Install](https://aws.amazon.com/cli/) aws-cli on your system.
+  - [Configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
+    aws-cli by first generating IAM credentials (if not already present) and
+    using `aws configure` to set them.
+  - To verify proper configuration, run `aws s3 ls` to show the list of S3
+    buckets. Don't forget to set a valid `S3_BUCKET` in the `.env` file.
 
-- or
+- To be able to convert the intermediate media files to mp3, install `FFmpeg`
 
-- add the backend url to a `.env` file as `QUEUE_ENDPOINT`. Optionally,
-  specify `S3_BUCKET` in `.env` for uploading model files.
+  - for macOS users, run `brew install ffmpeg`
 
-- send the transcript data to the backend queue
+  - for other users, follow the instructions on
+    their [site](https://ffmpeg.org/) to install
 
 ## Install/Uninstall
 
@@ -52,51 +79,55 @@ To check the version:
 
 ## Usage
 
-`tstbtc {video_id} {directory}` create video transcript supplying the id of the
-YouTube video and the associated directory bitcointranscripts destination folder
+`tstbtc {source_file/url} {directory}` transcribes the given source
 
-Note: The https links need to be wrapped in quotes when running the command on
-zsh
+Supported sources:
+  - YouTube videos
+  - YouTube playlists
+  - Local and remote audio files
 
-`tstbtc {audio_url} {directory} --title {title}` create audio transcript
-supplying the url of the audio, the source/year and the title of the audio
+Note:
+- The `directory` is the bitcointranscripts directory that you want to associate the transcript with
+- The https links need to be wrapped in quotes when running the command on zsh
 
 To include optional metadata in your transcript, you can add the following
 parameters:
 
-- `-t` or `--title`: Supply transcribed file title in 'quotes'
-- `-d` or `--date`: Supply the event date in format 'yyyy-mm-dd'
-- `-T` or `--tags`: Supply the tags for the transcript in 'quotes' and separated
-  by commas
-- `-s` or `--speakers`: Supply the speakers for the transcript in 'quotes' and
-  separated by commas
-- `-c` or `--category`: Supply the category for the transcript in 'quotes' and
-  separated by commas
-- `-C` or `--chapters`: Split the transcript into chapters based on the supplied
-  timestamps in the youtube video.
+- `-t` or `--title`: Add the title for the resulting transcript (required for audio files)
+- `-d` or `--date`: Add the event date to the transcript's metadata in the format 'yyyy-mm-dd'
+- The following options can be used multiple times:
+  - `-T` or `--tags`: Add a tag to the transcript's metadata
+  - `-s` or `--speakers`: Add a speaker to the transcript's metadata
+  - `-c` or `--category`: Add a category to the transcript's metadata
+
+To configure the transcription process, you can use the following flags:
+
+- `-m` or `--model`: Select which whisper model to use for the transcription [default: tiny.en]
+- `-D` or `--deepgram`: Use deepgram for transcription instead of a local whisper model [default: False]
+- `-M` or `--diarize`: Diarize the content when there are multiple speakers [only available with deepgram]
+- `-S` or `--summarize`: Summarize the transcript [only available with deepgram]
+- `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.
 - `-p` or `--pr`: Open a PR on the bitcointranscripts repo
-- `-m` or `model`: Supply optional whisper model
-- `-u` or `--upload`: Specify if you want to upload the generated model files in
-  AWS S3.
+- `-u` or `--upload`: Upload processed model files to AWS S3
+- `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts
+- `--noqueue`: Do not push the resulting transcript to the Queuer backend; instead, store the payload in a JSON file
+- `--nocleanup`: Do not remove temp files on exit
 
-#### Examples
+### Examples
 
-To
-transcribe [this podcast episode](https://www.youtube.com/watch?v=Nq6WxJ0PgJ4)
-from Stephan Livera's podcast with the associated metadata, we would run either
+To transcribe [this podcast episode](https://www.youtube.com/watch?v=Nq6WxJ0PgJ4) of Stephan Livera's
+podcast from YouTube and add the associated metadata, we would run either
 of the below commands. The first uses short argument tags, while the second uses
 long argument tags. The result is the same.
 
-- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'op_vault' -s 'Stephan Livera, James O’Beirn' -c ‘podcast’`
-- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'op_vault' --speakers 'Stephan Livera, James O’Beirn' --category ‘podcast’`
+- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c 'podcast'`
+- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category 'podcast'`
 
-You can also transcribe a mp3 link, such as the following from Stephan Livera's
-podcast: https://anchor.fm/s/7d083a4/podcast/play/64348045/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2023-1-1%2Ff7fafb12-9441-7d85-d557-e9e5d18ab788.mp3
-
-For demonstration purposes, let's substitute the link above with the following:
-websitelink.mp3. In this scenario, we would run the below command.
-
-- `tstbtc websitelink.mp3 bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Stephan Livera, Anant Tapadia' --category 'podcast'`
+You can also transcribe a remote audio/mp3 link, such as the following from Stephan Livera's podcast:
+```shell
+mp3_link="https://anchor.fm/s/7d083a4/podcast/play/64348045/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2023-1-1%2Ff7fafb12-9441-7d85-d557-e9e5d18ab788.mp3"
+tstbtc $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
+```
 
 ## Testing
 
@@ -112,39 +143,6 @@ To run the full test suite
 
 `pytest -v -s`
 
-## OTHER REQUIREMENTS
-
-- To enable us fork bitcointranscript repo and open a PR, we require you to
-  login into your GitHub account. Kindly install `GITHUB CLI` using the
-  instructions on their repo [here](https://github.com/cli/cli#installation).
-  Following the prompt, please select the below options from the prompt to
-  login:
-
-  - what account do you want to log into? `Github.com`
-
-  - what is your preferred protocol for Git operations? `SSH`
-
-  - Upload your SSH public key to your GitHub account? `skip`
-
-  - How would you like to authenticate GitHub CLI?
`Login with a web browser` - - - copy the generated one-time pass-code and paste in the browser to - authenticate if you have enabled 2FA - -- To enable pushing the models to a S3 bucket, - - [Install](https://aws.amazon.com/cli/) aws-cli to your system. - - [Configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) - aws-cli by first generating IAM credentials (if not already present) and - using `aws configure` to set them. - - To verify proper configuration, run `aws s3 ls` to show the list of S3 - buckets. Set a valid bucket in the `.env` file. - -- Install `FFmpeg` - - - for Mac Os users, run `brew install ffmpeg` - - - for other users, follow the instruction on - their [site](https://ffmpeg.org/) to install ## License diff --git a/app/transcription.py b/app/transcription.py index 2d0dcba..9c01019 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -111,7 +111,7 @@ def check_if_youtube(source: Source): except Exception as e: raise Exception(f"Error from assigning source: {e}") - def add_transcription_source(self, source_file, title=None, date=None, tags=None, category=None, speakers=None, preprocess=True, youtube_metadata=None, chapters=None): + def add_transcription_source(self, source_file, title=None, date=None, tags=[], category=[], speakers=[], preprocess=True, youtube_metadata=None, chapters=None): """Calculates the type of the source based on its file name """ transcription_sources = {"added": [], "exist": []} diff --git a/test/testAssets/payload.json b/test/testAssets/payload.json index 6b4eee1..c008213 100644 --- a/test/testAssets/payload.json +++ b/test/testAssets/payload.json @@ -3,7 +3,7 @@ "title": "test_title", "transcript_by": "username via TBTBTC v1.0.0", "categories": ["category1", "category2"], - "tags": ["tag1", "tag2"], + "tags": [], "speakers": ["speaker1", "speaker2"], "date": "2020-01-31", "loc": "yada/yada", diff --git a/test/test_video.py b/test/test_video.py index 83c2fb2..2d07576 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -216,7 +216,7 @@ def test_generate_payload(): username = "username" title = "test_title" speakers = ["speaker1", "speaker2"] - tags = ["tag1", "tag2"] + tags = [] category = ["category1", "category2"] date = "2020-01-31" loc = "yada/yada" diff --git a/transcriber.py b/transcriber.py index 500397f..18827f6 100644 --- a/transcriber.py +++ b/transcriber.py @@ -30,10 +30,7 @@ def print_help(ctx, param, value): ctx.exit() -@click.command() -@click.argument("source", nargs=1) -@click.argument("loc", nargs=1) -@click.option( +whisper = click.option( "-m", "--model", type=click.Choice( @@ -50,127 +47,143 @@ def print_help(ctx, param, value): ] ), default="tiny.en", - help="Options for transcription model", -) -@click.option( - "-t", - "--title", - type=str, - help="Supply transcribed file title in 'quotes', title is mandatory in case" - " of audio files", + show_default=True, + help="Select which whisper model to use for the transcription", ) -@click.option( - "-d", - "--date", - type=str, - help="Supply the event date in format 'yyyy-mm-dd'", -) -@click.option( - "-T", - "--tags", - type=str, - help="Supply the tags for the transcript in 'quotes' and separated by " - "commas", -) -@click.option( - "-s", - "--speakers", - type=str, - help="Supply the speakers for the transcript in 'quotes' and separated by " - "commas", +deepgram = click.option( + "-D", + "--deepgram", + is_flag=True, + default=False, + help="Use deepgram for transcription", ) -@click.option( - "-c", - "--category", - 
type=str, - help="Supply the category for the transcript in 'quotes' and separated by " - "commas", +diarize = click.option( + "-M", + "--diarize", + is_flag=True, + default=False, + help="Supply this flag if you have multiple speakers AKA " + "want to diarize the content", ) -@click.option( - "-v", - "--version", +summarize = click.option( + "-S", + "--summarize", is_flag=True, - callback=print_version, - expose_value=False, - is_eager=True, - help="Show the application's version and exit.", + default=False, + help="Summarize the transcript [only available with deepgram]", ) -@click.option( +use_youtube_chapters = click.option( "-C", "--chapters", is_flag=True, default=False, - help="Supply this flag if you want to generate chapters for the transcript", + help="For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.", ) -@click.option( +open_pr = click.option( "-p", "--PR", is_flag=True, default=False, - help="Supply this flag if you want to open a PR at the bitcointranscripts repo", + help="Open a PR on the bitcointranscripts repo", ) -@click.option( - "-D", - "--deepgram", +upload_to_s3 = click.option( + "-u", + "--upload", is_flag=True, default=False, - help="Supply this flag if you want to use deepgram", + help="Upload processed model files to AWS S3", ) -@click.option( - "-S", - "--summarize", +save_to_markdown = click.option( + "--markdown", is_flag=True, default=False, - help="Supply this flag if you want to summarize the content", + help="Save the resulting transcript to a markdown format supported by bitcointranscripts", ) -@click.option( - "-M", - "--diarize", +noqueue = click.option( + "--noqueue", is_flag=True, default=False, - help="Supply this flag if you have multiple speakers AKA " - "want to diarize the content", + help="Do not push the resulting transcript to the Queuer backend", ) -@click.option( +model_output_dir = click.option( + "-o", + "--model_output_dir", + type=str, + default="local_models/", + show_default=True, + help="Set the directory for saving model outputs", +) +nocleanup = click.option( + "--nocleanup", + is_flag=True, + default=False, + help="Do not remove temp files on exit", +) +verbose_logging = click.option( "-V", "--verbose", is_flag=True, default=False, help="Supply this flag to enable verbose logging", ) + + +@cli.command() +@click.argument("source", nargs=1) +@click.argument("loc", nargs=1) +# Available transcription models and services +@whisper +@deepgram +# Options for adding metadata @click.option( - "-o", - "--model_output_dir", + "-t", + "--title", type=str, - default="local_models/", - help="Supply this flag if you want to change the directory for saving " - "model outputs", + help="Add the title for the resulting transcript (required for audio files)", ) @click.option( - "-u", - "--upload", - is_flag=True, - default=False, - help="Supply this flag if you want to upload processed model files to AWS " - "S3", + "-d", + "--date", + type=str, + help="Add the event date to transcript's metadata in format 'yyyy-mm-dd'", ) @click.option( - "--nocleanup", - is_flag=True, - default=False, - help="Do not remove temp files on exit", + "-T", + "--tags", + multiple=True, + help="Add a tag to transcript's metadata (can be used multiple times)", ) @click.option( - "--noqueue", - is_flag=True, - default=False, - help="Do not push the resulting transcript to the Queuer backend", + "-s", + "--speakers", + multiple=True, + help="Add a speaker to the transcript's metadata (can be used multiple times)", ) @click.option( - 
"--markdown", + "-c", + "--category", + multiple=True, + help="Add a category to the transcript's metadata (can be used multiple times)", +) +# Options for configuring the transcription process +@diarize +@summarize +@use_youtube_chapters +@open_pr +@upload_to_s3 +@save_to_markdown +@noqueue +@model_output_dir +@nocleanup +@verbose_logging +@click.option( + "-v", + "--version", is_flag=True, - default=False, - help="Create a markdown file for the resulting transcript", + callback=print_version, + expose_value=False, + is_eager=True, + help="Show the application's version and exit.", ) def add( source: str, @@ -178,9 +191,9 @@ def add( model: str, title: str, date: str, - tags: str, - speakers: str, - category: str, + tags: list, + speakers: list, + category: list, chapters: bool, pr: bool, deepgram: bool, @@ -193,7 +206,9 @@ def add( noqueue: bool, markdown: bool ) -> None: - """Supply a YouTube video id and directory for transcription. \n + """Transcribe the given source. Suported sources: + YouTube videos, YouTube playlists, Local and remote audio files + Note: The https links need to be wrapped in quotes when running the command on zsh """