Skip to content

Commit

Permalink
feat: enhance --github flag to support multiple operation modes
Browse files Browse the repository at this point in the history
Previously, the --github flag was a simple boolean flag, used only to
toggle the functionality of pushing transcripts to a new branch on the
bitcointranscripts repository.

With this update, the --github flag now accepts one of three modes
("remote", "local", or "none"), enabling more versatile control over
GitHub interactions.
  • Loading branch information
kouloumos committed Feb 14, 2024
1 parent 98c8491 commit 8f711fa
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 35 deletions.
3 changes: 1 addition & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,7 @@ To configure the transcription process, you can use the following flags:
- `-D` or `--deepgram`: Use deepgram for transcription, instead of using the whisper model [default: False]
- `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram]
- `-S` or `--summarize`: Summarize the transcript [only available with deepgram]
- `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.
- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo
- `--github`: Specify the GitHub operation mode (`remote`, `local`, or `none`) [default: none]
- `-u` or `--upload`: Upload processed model files to AWS S3
- `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts
- `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file
Expand Down
59 changes: 34 additions & 25 deletions app/transcription.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
import json
import logging
import os
import shutil
import random
import re
import subprocess
import tempfile
import time
from datetime import datetime

from dotenv import dotenv_values
import pytube
from pytube.exceptions import PytubeError
import requests
import yt_dlp

from app.transcript import Transcript, Source, Audio, Video, Playlist, RSS
from app.transcript import (
Transcript,
Source,
Audio,
Video,
Playlist,
RSS
)
from app import (
__app_name__,
__version__,
Expand All @@ -25,14 +24,17 @@
)
from app.logging import get_logger
from app.queuer import Queuer
from app.types import PostprocessOutput
from app.types import (
GitHubMode,
PostprocessOutput
)


class Transcription:
def __init__(
self,
model="tiny",
github=False,
github: GitHubMode = "none",
summarize=False,
deepgram=False,
diarize=False,
Expand Down Expand Up @@ -77,15 +79,15 @@ def _create_subdirectory(self, subdir_name):
os.makedirs(subdir_path)
return subdir_path

def __configure_target_repo(self, github):
if not github:
def __configure_target_repo(self, github: GitHubMode):
if github == "none":
return None
config = dotenv_values(".env")
git_repo_dir = config.get("BITCOINTRANSCRIPTS_DIR")
if not git_repo_dir:
raise Exception(
"To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file")
return None
self.github = github
return git_repo_dir

def __configure_review_flag(self, needs_review):
Expand Down Expand Up @@ -218,7 +220,8 @@ def add_transcription_source(
tags, category, speakers, preprocess, link),
youtube_metadata=youtube_metadata,
chapters=chapters)
self.logger.info(f"Detected source: {source}")
self.logger.debug(f"Detected source: {source}")

if source.type == "playlist":
# add a transcript for each source/video in the playlist
for video in source.videos:
Expand Down Expand Up @@ -306,17 +309,22 @@ def start(self, test_transcript=None):
def push_to_github(self, outputs: list[PostprocessOutput]):
# Change to the directory where your Git repository is located
os.chdir(self.bitcointranscripts_dir)
# Fetch the latest changes from the remote repository
subprocess.run(['git', 'fetch', 'origin', 'master'])
# Create a new branch from the fetched 'origin/master'
branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}"
subprocess.run(['git', 'checkout', '-b', branch_name, 'origin/master'])
if self.github == "remote":
# Fetch the latest changes from the remote repository
subprocess.run(['git', 'fetch', 'origin', 'master'])
# Create a new branch from the fetched 'origin/master'
branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}"
subprocess.run(
['git', 'checkout', '-b', branch_name, 'origin/master'])

# For each output with markdown, create a new commit in the new branch
for output in outputs:
if output.get('markdown'):
markdown_file = output['markdown']
destination_path = os.path.join(
self.bitcointranscripts_dir, output["transcript"].source.loc)
# Create the destination directory if it doesn't exist
os.makedirs(destination_path, exist_ok=True)
# Ensure the markdown file exists before copying
if os.path.exists(markdown_file):
shutil.copy(markdown_file, destination_path)
Expand All @@ -328,11 +336,12 @@ def push_to_github(self, outputs: list[PostprocessOutput]):
else:
print(f"Markdown file {markdown_file} does not exist.")

# Push the branch to the remote repository
subprocess.run(['git', 'push', 'origin', branch_name])
# Delete branch locally
subprocess.run(['git', 'checkout', 'master'])
subprocess.run(['git', 'branch', '-D', branch_name])
if self.github == "remote":
# Push the branch to the remote repository
subprocess.run(['git', 'push', 'origin', branch_name])
# Delete branch locally
subprocess.run(['git', 'checkout', 'master'])
subprocess.run(['git', 'branch', '-D', branch_name])

def write_to_markdown_file(self, transcript: Transcript, output_dir):
"""Writes transcript to a markdown file and returns its absolute path
Expand Down
2 changes: 2 additions & 0 deletions app/types.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import (
Literal,
TypedDict,
Optional
)

from app.transcript import Transcript

GitHubMode = Literal["remote", "local", "none"]

class PostprocessOutput(TypedDict):
transcript: Transcript
Expand Down
22 changes: 14 additions & 8 deletions transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from app import (
__app_name__,
__version__,
application,
utils
)
from app.commands import queue
from app.logging import configure_logger, get_logger
from app.transcript import Transcript
from app.transcription import Transcription
from app.logging import configure_logger, get_logger
from app.types import GitHubMode

logger = get_logger()

Expand Down Expand Up @@ -91,9 +91,13 @@ def print_help(ctx, param, value):
)
github = click.option(
"--github",
is_flag=True,
default=False,
help="Push transcripts to a new branch on the origin bitcointranscripts repo",
type=click.Choice(["remote", "local", "none"]),
default="none",
help=("Specify the GitHub operation mode. "
"'remote': Create a new branch, push changes to it, and push it to the origin bitcointranscripts repo. "
"'local': Commit changes to the current local branch without pushing to the remote repo. "
"'none': Do not perform any GitHub operations."),
show_default=True
)
upload_to_s3 = click.option(
"-u",
Expand Down Expand Up @@ -213,7 +217,7 @@ def transcribe(
tags: list,
speakers: list,
category: list,
github: bool,
github: GitHubMode,
deepgram: bool,
summarize: bool,
diarize: bool,
Expand Down Expand Up @@ -317,6 +321,7 @@ def preprocess(
configure_logger(log_level=logging.INFO)
logger.info(f"Preprocessing sources...")
transcription = Transcription(
queue=False,
batch_preprocessing_output=not no_batched_output)
if source.endswith(".json"):
transcription.add_transcription_source_JSON(source, nocheck=nocheck)
Expand Down Expand Up @@ -361,7 +366,7 @@ def preprocess(
def postprocess(
metadata_json_file,
service,
github: bool,
github: GitHubMode,
upload: bool,
markdown: bool,
noqueue: bool,
Expand Down Expand Up @@ -406,7 +411,8 @@ def postprocess(
f"{service}_output"]
transcript_to_postprocess.result = transcription.service.finalize_transcript(
transcript_to_postprocess)
postprocessed_transcript = transcription.postprocess(transcript_to_postprocess)
postprocessed_transcript = transcription.postprocess(
transcript_to_postprocess)

if transcription.bitcointranscripts_dir:
transcription.push_to_github([postprocessed_transcript])
Expand Down

0 comments on commit 8f711fa

Please sign in to comment.