diff --git a/app/application.py b/app/application.py index c5fe6a3..efe541d 100644 --- a/app/application.py +++ b/app/application.py @@ -27,93 +27,6 @@ from app import __app_name__, __version__ -def download_video(url, working_dir="tmp/"): - logger = logging.getLogger(__app_name__) - try: - logger.info("URL: " + url) - logger.info("Downloading video... Please wait.") - - ydl_opts = { - "format": "18", - "outtmpl": os.path.join(working_dir, "videoFile.%(ext)s"), - "nopart": True, - "writeinfojson": True, - } - with yt_dlp.YoutubeDL(ydl_opts) as ytdl: - ytdl.download([url]) - - with open(os.path.join(working_dir, "videoFile.info.json")) as file: - info = ytdl.sanitize_info(json.load(file)) - name = info["title"].replace("/", "-") - file.close() - - os.rename( - os.path.join(working_dir, "videoFile.mp4"), - os.path.join(working_dir, name + ".mp4"), - ) - - return os.path.abspath(os.path.join(working_dir, name + ".mp4")) - except Exception as e: - logger.error(f"Error downloading video: {e}") - shutil.rmtree(working_dir) - return - - -def read_description(prefix): - logger = logging.getLogger(__app_name__) - try: - list_of_chapters = [] - with open(prefix + "videoFile.info.json", "r") as f: - info = json.load(f) - if "chapters" not in info: - logger.info("No chapters found in description") - return list_of_chapters - for index, x in enumerate(info["chapters"]): - name = x["title"] - start = x["start_time"] - list_of_chapters.append((str(index), start, str(name))) - - return list_of_chapters - except Exception as e: - logger.error(f"Error reading description: {e}") - return [] - - -def write_chapters_file(chapter_file: str, chapter_list: list) -> None: - # Write out the chapter file based on simple MP4 format (OGM) - logger = logging.getLogger(__app_name__) - try: - with open(chapter_file, "w") as fo: - for current_chapter in chapter_list: - fo.write( - f"CHAPTER{current_chapter[0]}=" - f"{current_chapter[1]}\n" - f"CHAPTER{current_chapter[0]}NAME=" - f"{current_chapter[2]}\n" - ) - fo.close() - except Exception as e: - logger.error("Error writing chapter file") - logger.error(e) - - -def convert_video_to_mp3(filename, working_dir="tmp/"): - logger = logging.getLogger(__app_name__) - try: - clip = VideoFileClip(filename) - logger.info("Converting video to mp3... 
Please wait.") - logger.info(filename[:-4] + ".mp3") - clip.audio.write_audiofile( - os.path.join(working_dir, filename.split("/")[-1][:-4] + ".mp3") - ) - clip.close() - logger.info("Converted video to mp3") - return os.path.join(working_dir, filename.split("/")[-1][:-4] + ".mp3") - except Exception as e: - logger.error(f"Error converting video to mp3: {e}") - return None - - def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"): logger = logging.getLogger(__app_name__) op = subprocess.run( @@ -127,90 +40,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"): return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3")) -def check_if_playlist(media): - logger = logging.getLogger(__app_name__) - try: - if ( - media.startswith("PL") - or media.startswith("UU") - or media.startswith("FL") - or media.startswith("RD") - ): - return True - playlists = list(pytube.Playlist(media).video_urls) - if type(playlists) is not list: - return False - return True - except Exception as e: - logger.error(f"Pytube Error: {e}") - return False - - -def check_if_video(media): - logger = logging.getLogger(__app_name__) - if re.search(r"^([\dA-Za-z_-]{11})$", media): - return True - try: - pytube.YouTube(media) - return True - except PytubeError as e: - logger.error(f"Pytube Error: {e}") - return False - - -def get_playlist_videos(url): - logger = logging.getLogger(__app_name__) - try: - videos = pytube.Playlist(url) - return videos - except Exception as e: - logger.error("Error getting playlist videos") - logger.error(e) - return - - -def get_audio_file(url, title, working_dir="tmp/"): - logger = logging.getLogger(__app_name__) - logger.info("URL: " + url) - logger.info("downloading audio file") - try: - audio = requests.get(url, stream=True) - with open(os.path.join(working_dir, title + ".mp3"), "wb") as f: - total_length = int(audio.headers.get("content-length")) - for chunk in progress.bar( - audio.iter_content(chunk_size=1024), - expected_size=(total_length / 1024) + 1, - ): - if chunk: - f.write(chunk) - f.flush() - return os.path.join(working_dir, title + ".mp3") - except Exception as e: - logger.error("Error downloading audio file") - logger.error(e) - return - - -def process_mp3(filename, model, upload, model_output_dir): - logger = logging.getLogger(__app_name__) - logger.info("Transcribing audio to text using whisper ...") - try: - my_model = whisper.load_model(model) - result = my_model.transcribe(filename) - data = [] - for x in result["segments"]: - data.append(tuple((x["start"], x["end"], x["text"]))) - data_path = generate_srt(data, filename, model_output_dir) - if upload: - upload_file_to_s3(data_path) - logger.info("Removed video and audio files") - return data - except Exception as e: - logger.error("Error transcribing audio to text") - logger.error(e) - return - - def decimal_to_sexagesimal(dec): sec = int(dec % 60) minu = int((dec // 60) % 60) @@ -311,42 +140,55 @@ def combine_deepgram_chapters_with_diarization(deepgram_data, chapters): logger.error(e) -def get_deepgram_transcript( - deepgram_data, diarize, title, upload, model_output_dir -): - if diarize: - para = "" - string = "" - curr_speaker = None - data_path = save_local_json(deepgram_data, title, model_output_dir) - if upload: - upload_file_to_s3(data_path) - for word in deepgram_data["results"]["channels"][0]["alternatives"][0][ - "words" - ]: - if word["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string + 
f'Speaker {word["speaker"]}: ' - f'{decimal_to_sexagesimal(word["start"])}' - ) - curr_speaker = word["speaker"] - string = string + "\n\n" +def get_deepgram_transcript(deepgram_data, diarize, title, upload, model_output_dir): + logger = logging.getLogger(__app_name__) - para = para + " " + word["punctuated_word"] - para = para.strip(" ") - string = string + para - return string - else: - data_path = save_local_json(deepgram_data, title, model_output_dir) + def save_local_json(json_data, title, model_output_dir): + logger.info(f"Saving Locally...") + time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + if not os.path.isdir(model_output_dir): + os.makedirs(model_output_dir) + file_path = os.path.join( + model_output_dir, title + "_" + time_in_str + ".json" + ) + with open(file_path, "w") as json_file: + json.dump(json_data, json_file, indent=4) + logger.info(f"Model stored at path {file_path}") + return file_path + try: + data_path = save_local_json( + deepgram_data, title, model_output_dir) if upload: upload_file_to_s3(data_path) - return deepgram_data["results"]["channels"][0]["alternatives"][0][ - "transcript" - ] + if diarize: + para = "" + string = "" + curr_speaker = None + for word in deepgram_data["results"]["channels"][0]["alternatives"][0][ + "words" + ]: + if word["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = ( + string + f'Speaker {word["speaker"]}: ' + f'{decimal_to_sexagesimal(word["start"])}' + ) + curr_speaker = word["speaker"] + string = string + "\n\n" + + para = para + " " + word["punctuated_word"] + para = para.strip(" ") + string = string + para + return string + else: + return deepgram_data["results"]["channels"][0]["alternatives"][0][ + "transcript" + ] + except Exception as e: + raise Exception(f"Error while getting deepgram transcript: {e}") def get_deepgram_summary(deepgram_data): @@ -365,6 +207,7 @@ def get_deepgram_summary(deepgram_data): def process_mp3_deepgram(filename, summarize, diarize): + """using deepgram""" logger = logging.getLogger(__app_name__) logger.info("Transcribing audio to text using deepgram...") try: @@ -388,160 +231,7 @@ def process_mp3_deepgram(filename, summarize, diarize): audio.close() return response except Exception as e: - logger.error("Error transcribing audio to text") - logger.error(e) - return - - -def create_transcript(data): - result = "" - for x in data: - result = result + x[2] + " " - - return result - - -def initialize(): - logger = logging.getLogger(__app_name__) - try: - # FFMPEG installed on first use. 
- logger.debug("Initializing FFMPEG...") - static_ffmpeg.add_paths() - logger.debug("Initialized FFMPEG") - except Exception as e: - logger.error("Error initializing") - logger.error(e) - - -def write_to_file( - result, - loc, - url, - title, - date, - tags, - category, - speakers, - video_title, - username, - local, - test, - pr, - summary, - working_dir="tmp/", -): - logger = logging.getLogger(__app_name__) - try: - transcribed_text = result - if title: - file_title = title - else: - file_title = video_title - meta_data = ( - "---\n" - f"title: {file_title}\n" - f"transcript_by: {username} via TBTBTC v{__version__}\n" - ) - if not local: - meta_data += f"media: {url}\n" - if tags: - tags = tags.strip() - tags = tags.split(",") - for i in range(len(tags)): - tags[i] = tags[i].strip() - meta_data += f"tags: {tags}\n" - if speakers: - speakers = speakers.strip() - speakers = speakers.split(",") - for i in range(len(speakers)): - speakers[i] = speakers[i].strip() - meta_data += f"speakers: {speakers}\n" - if category: - category = category.strip() - category = category.split(",") - for i in range(len(category)): - category[i] = category[i].strip() - meta_data += f"categories: {category}\n" - if summary: - meta_data += f"summary: {summary}\n" - - file_name = video_title.replace(" ", "-") - file_name_with_ext = os.path.join(working_dir, file_name + ".md") - - if date: - meta_data += f"date: {date}\n" - - meta_data += "---\n" - if test is not None or pr: - with open(file_name_with_ext, "a") as opf: - opf.write(meta_data + "\n") - opf.write(transcribed_text + "\n") - opf.close() - if local: - url = None - if not pr: - generate_payload( - loc=loc, - title=file_title, - transcript=transcribed_text, - media=url, - tags=tags, - category=category, - speakers=speakers, - username=username, - event_date=date, - test=test, - ) - return os.path.abspath(file_name_with_ext) - except Exception as e: - logger.error("Error writing to file") - logger.error(e) - - -def get_md_file_path( - result, - loc, - video, - title, - event_date, - tags, - category, - speakers, - username, - local, - video_title, - test, - pr, - summary="", - working_dir="tmp/", -): - logger = logging.getLogger(__app_name__) - try: - logger.info("writing .md file") - file_name_with_ext = write_to_file( - result, - loc, - video, - title, - event_date, - tags, - category, - speakers, - video_title, - username, - local, - test, - pr, - summary, - working_dir=working_dir, - ) - logger.info("wrote .md file") - - absolute_path = os.path.abspath(file_name_with_ext) - return absolute_path - except Exception as e: - logger.error("Error getting markdown file path") - logger.error(e) + raise Exception(f"(deepgram) Error transcribing audio to text: {e}") def create_pr(absolute_path, loc, username, curr_time, title): @@ -564,208 +254,6 @@ def create_pr(absolute_path, loc, username, curr_time, title): logger.info("Please check the PR for the transcription.") -def get_username(): - logger = logging.getLogger(__app_name__) - try: - if os.path.isfile(".username"): - with open(".username", "r") as f: - username = f.read() - f.close() - else: - print("What is your github username?") - username = input() - with open(".username", "w") as f: - f.write(username) - f.close() - return username - except Exception as e: - logger.error("Error getting username") - logger.error(e) - - -def check_source_type(source): - """Returns the type of source based on the file name - """ - source_type = None - local = False - if source.endswith(".mp3") or source.endswith(".wav"): 
- source_type = "audio" - elif check_if_playlist(source): - source_type = "playlist" - elif check_if_video(source): - source_type = "video" - # check if source is a local file - if os.path.isfile(source): - local = True - return (source_type, local) - - -def process_audio( - source, - title, - event_date, - tags, - category, - speakers, - loc, - model, - username, - local, - test, - pr, - deepgram, - summarize, - diarize, - upload=False, - model_output_dir="local_models/", - working_dir="tmp/", -): - logger = logging.getLogger(__app_name__) - try: - logger.info("audio file detected") - curr_time = str(round(time.time() * 1000)) - - # check if title is supplied if not, return None - if title is None: - logger.error("Error: Please supply a title for the audio file") - return None - # process audio file - summary = None - result = None - if not local: - filename = get_audio_file( - url=source, title=title, working_dir=working_dir - ) - abs_path = os.path.abspath(path=filename) - logger.info(f"filename: {filename}") - logger.info(f"abs_path: {abs_path}") - else: - filename = source.split("/")[-1] - abs_path = os.path.abspath(source) - logger.info(f"processing audio file: {abs_path}") - if filename is None: - logger.info("File not found") - return - if filename.endswith("wav"): - initialize() - abs_path = convert_wav_to_mp3( - abs_path=abs_path, filename=filename, working_dir=working_dir - ) - if test: - result = test - else: - if deepgram or summarize: - deepgram_resp = process_mp3_deepgram( - filename=abs_path, summarize=summarize, diarize=diarize - ) - result = get_deepgram_transcript( - deepgram_data=deepgram_resp, - diarize=diarize, - title=title, - model_output_dir=model_output_dir, - upload=upload, - ) - if summarize: - summary = get_deepgram_summary(deepgram_data=deepgram_resp) - if not deepgram: - result = process_mp3(abs_path, model, upload, model_output_dir) - result = create_transcript(result) - absolute_path = get_md_file_path( - result=result, - loc=loc, - video=source, - title=title, - event_date=event_date, - tags=tags, - category=category, - speakers=speakers, - username=username, - local=local, - video_title=filename[:-4], - test=test, - pr=pr, - summary=summary, - working_dir=working_dir, - ) - - if pr: - create_pr( - absolute_path=absolute_path, - loc=loc, - username=username, - curr_time=curr_time, - title=title, - ) - return absolute_path - except Exception as e: - logger.error("Error processing audio file") - logger.error(e) - - -def process_videos( - source, - title, - event_date, - tags, - category, - speakers, - loc, - model, - username, - chapters, - pr, - deepgram, - summarize, - diarize, - upload=False, - model_output_dir="local_models", - working_dir="tmp/", -): - logger = logging.getLogger(__app_name__) - try: - logger.info("Playlist detected") - if source.startswith("http") or source.startswith("www"): - parsed_url = urlparse(source) - source = parse_qs(parsed_url.query)["list"][0] - url = "https://www.youtube.com/playlist?list=" + source - logger.info(url) - videos = get_playlist_videos(url) - if videos is None: - logger.info("Playlist is empty") - return - - selected_model = model + ".en" - filename = "" - - for video in videos: - filename = process_video( - video=video, - title=title, - event_date=event_date, - tags=tags, - category=category, - speakers=speakers, - loc=loc, - model=selected_model, - username=username, - pr=pr, - chapters=chapters, - test=False, - diarize=diarize, - deepgram=deepgram, - summarize=summarize, - upload=upload, - 
working_dir=working_dir, - model_output_dir=model_output_dir, - ) - if filename is None: - return None - return filename - except Exception as e: - logger.error("Error processing playlist") - logger.error(e) - - def combine_deepgram_with_chapters(deepgram_data, chapters): logger = logging.getLogger(__app_name__) try: @@ -801,240 +289,6 @@ def combine_deepgram_with_chapters(deepgram_data, chapters): logger.error(e) -def process_video( - video, - title, - event_date, - tags, - category, - speakers, - loc, - model, - username, - chapters, - test, - pr, - local=False, - deepgram=False, - summarize=False, - diarize=False, - upload=False, - model_output_dir="local_models", - working_dir="tmp/", -): - logger = logging.getLogger(__app_name__) - try: - curr_time = str(round(time.time() * 1000)) - if not local: - if "watch?v=" in video: - parsed_url = urlparse(video) - video = parse_qs(parsed_url.query)["v"][0] - elif "youtu.be" in video or "embed" in video: - video = video.split("/")[-1] - video = "https://www.youtube.com/watch?v=" + video - logger.info("Transcribing video: " + video) - if event_date is None: - event_date = get_date(video) - abs_path = download_video(url=video, working_dir=working_dir) - if abs_path is None: - logger.info("File not found") - return None - filename = abs_path.split("/")[-1] - else: - filename = video.split("/")[-1] - logger.info("Transcribing video: " + filename) - abs_path = os.path.abspath(video) - - if not title: - title = filename[:-4] - initialize() - summary = None - result = "" - deepgram_data = None - if chapters and not test: - chapters = read_description(working_dir) - elif test: - chapters = read_description("test/testAssets/") - mp3_path = convert_video_to_mp3(abs_path, working_dir) - if deepgram or summarize: - deepgram_data = process_mp3_deepgram( - filename=mp3_path, summarize=summarize, diarize=diarize - ) - result = get_deepgram_transcript( - deepgram_data=deepgram_data, - diarize=diarize, - title=title, - model_output_dir=model_output_dir, - upload=upload, - ) - if summarize: - logger.info("Summarizing") - summary = get_deepgram_summary(deepgram_data=deepgram_data) - if not deepgram: - result = process_mp3(mp3_path, model, upload, model_output_dir) - if chapters and len(chapters) > 0: - logger.info("Chapters detected") - write_chapters_file( - os.path.join(working_dir, filename[:-4] + ".chapters"), chapters - ) - if deepgram: - if diarize: - result = combine_deepgram_chapters_with_diarization( - deepgram_data=deepgram_data, chapters=chapters - ) - else: - result = combine_deepgram_with_chapters( - deepgram_data=deepgram_data, chapters=chapters - ) - else: - result = combine_chapter( - chapters=chapters, - transcript=result, - working_dir=working_dir, - ) - else: - if not test and not deepgram: - result = create_transcript(result) - elif not deepgram: - result = "" - logger.info("Creating markdown file") - absolute_path = get_md_file_path( - result=result, - loc=loc, - video=video, - title=title, - event_date=event_date, - tags=tags, - summary=summary, - category=category, - speakers=speakers, - username=username, - video_title=filename[:-4], - local=local, - pr=pr, - test=test, - working_dir=working_dir, - ) - if not test: - if pr: - create_pr( - absolute_path=absolute_path, - loc=loc, - username=username, - curr_time=curr_time, - title=title, - ) - return absolute_path - except Exception as e: - logger.error("Error processing video") - logger.error(e) - - - -def process_source( - source, - title, - event_date, - tags, - category, - speakers, - 
loc, - model, - username, - source_type, - chapters, - local, - test=None, - pr=False, - deepgram=False, - summarize=False, - diarize=False, - upload=False, - model_output_dir=None, - verbose=False, -): - tmp_dir = tempfile.mkdtemp() - model_output_dir = ( - "local_models/" if model_output_dir is None else model_output_dir - ) - - try: - if source_type == "audio": - filename = process_audio( - source=source, - title=title, - event_date=event_date, - tags=tags, - category=category, - speakers=speakers, - loc=loc, - model=model, - username=username, - summarize=summarize, - local=local, - test=test, - pr=pr, - deepgram=deepgram, - diarize=diarize, - upload=upload, - model_output_dir=model_output_dir, - working_dir=tmp_dir, - ) - elif source_type == "playlist": - filename = process_videos( - source=source, - title=title, - event_date=event_date, - tags=tags, - category=category, - speakers=speakers, - loc=loc, - model=model, - username=username, - summarize=summarize, - chapters=chapters, - pr=pr, - deepgram=deepgram, - diarize=diarize, - upload=upload, - model_output_dir=model_output_dir, - working_dir=tmp_dir, - ) - elif source_type == "video": - filename = process_video( - video=source, - title=title, - event_date=event_date, - summarize=summarize, - tags=tags, - category=category, - speakers=speakers, - loc=loc, - model=model, - username=username, - local=local, - diarize=diarize, - chapters=chapters, - test=test, - pr=pr, - deepgram=deepgram, - upload=upload, - model_output_dir=model_output_dir, - working_dir=tmp_dir, - ) - else: - raise Exception(f"{source_type} is not a valid source type") - return filename, tmp_dir - except Exception as e: - logger.error("Error processing source") - logger.error(e) - - -def get_date(url): - video = pytube.YouTube(url) - return str(video.publish_date).split(" ")[0] - - def clean_up(tmp_dir): try: shutil.rmtree(tmp_dir) @@ -1043,32 +297,16 @@ def clean_up(tmp_dir): raise -def save_local_json(json_data, title, model_output_dir): - logger = logging.getLogger(__app_name__) - logger.info(f"Saving Locally...") - time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - if not os.path.isdir(model_output_dir): - os.makedirs(model_output_dir) - file_path = os.path.join( - model_output_dir, title + "_" + time_in_str + ".json" - ) - with open(file_path, "w") as json_file: - json.dump(json_data, json_file, indent=4) - logger.info(f"Model stored at path {file_path}") - return file_path - - def generate_srt(data, filename, model_output_dir): logger = logging.getLogger(__app_name__) logger.info("Saving Locally...") time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - base_filename, _ = os.path.splitext(filename) if not os.path.isdir(model_output_dir): os.makedirs(model_output_dir) output_file = os.path.join( - model_output_dir, base_filename + "_" + time_in_str + ".srt" + model_output_dir, filename + "_" + time_in_str + ".srt" ) - logger.debug(f"writing srt to {output_file}") + logger.debug(f"Writing srt to {output_file}") with open(output_file, "w") as f: for index, segment in enumerate(data): start_time, end_time, text = segment @@ -1099,49 +337,3 @@ def upload_file_to_s3(file_path): logger.info(f"File uploaded to S3 bucket : {bucket}") except Exception as e: logger.error(f"Error uploading file to S3 bucket: {e}") - - -def generate_payload( - loc, - title, - event_date, - tags, - category, - speakers, - username, - media, - transcript, - test, -): - logger = logging.getLogger(__app_name__) - try: - event_date = ( - event_date - if event_date is None - 
else event_date - if type(event_date) is str - else event_date.strftime("%Y-%m-%d") - ) - data = { - "title": title, - "transcript_by": f"{username} via TBTBTC v{__version__}", - "categories": category, - "tags": tags, - "speakers": speakers, - "date": event_date, - "media": media, - "loc": loc, - "body": transcript, - } - content = {"content": data} - if test: - return content - else: - config = dotenv_values(".env") - url = config["QUEUE_ENDPOINT"] + "/api/transcripts" - resp = requests.post(url, json=content) - if resp.status_code == 200: - logger.info("Transcript added to queue") - return resp - except Exception as e: - logger.error(e) diff --git a/test/testAssets/payload.json b/test/testAssets/payload.json index be39d68..6b4eee1 100644 --- a/test/testAssets/payload.json +++ b/test/testAssets/payload.json @@ -6,7 +6,6 @@ "tags": ["tag1", "tag2"], "speakers": ["speaker1", "speaker2"], "date": "2020-01-31", - "media": "test/testAssets/test_video.mp4", "loc": "yada/yada", "body": "Welcome to the Jankoid podcast. I'm here with merch hi there. Today we're gonna jump into the temple and That's a pun if you didn't get it. Welcome to Jankoid decoded the temple The temple an area you are more than familiar with yeah a Nampool whispery in the call. Yeah, let's maybe start with what's the relationship between the Mimpool and fees? We often talk about the Mimpool, but there is no such thing as a global Mimpool Every full-known has its own Mimpool and the Mimpool is basically just the queue of transactions waiting to get Confred where Confred means included in a block. So by default Block template builders will just sort the waiting transactions by the highest effective fee rate Then pick from the top the juicier transaction the quicker gets confirmed now Especially in the last few months we've seen that there was a very large queues because we had a huge run up in the price I haven't checked but I think it's now about a hundred and twenty days that We haven't cleared the Mimpool maybe a hundred and ten and since 15th of December So Mimpools are limited and By default they are limited to 300 megabytes of De-serialized data So that includes all day overhead structure the previous U-tix O's maybe even the whole transaction that created U-tix O's and so forth So roughly at about 80 blocks worth of data the default of 300 megabyte gets exceeded and at that point a full node will automatically start Perching the lowest fee rate transactions data stop them and tell all their neighboring peers Hey, don't send me anything under this period. 
They they start raising up their min fee rate So the problem that gets introduced here is if a parent transaction is no longer in the Mimpool You cannot bump it because if you try to do a CPP and the pair doesn't there the child is gonna be invalid CFF just for the initiated child place for parent Some things that are being done in the context of that is that people are working on Package relay where you can send more than one transaction to appear as a package that they evaluate as a whole together Instead of looking at the parent and saying okay you're out and this child doesn't have a parent Okay, you're out to And maybe you can just talk a little bit more about the mechanics of how CPS fee actually works to get into a block You bid on block space transactions get serialized in an apartment Where inputs are fair we big outputs are smaller there's a little bit of a transaction header that includes like how many inputs there output there and Lock time inversion So we already found out that when miners build blocks days Sort transactions by the highest period so they first considered the transactions that paid the most set-touchies Per byte of serialized data. So what's the mechanic where the mechanics of CPS fee when you try to Get a transaction through sometimes they have a Firit that is to low for it to be considered quickly and you can reprioritize your transaction by Increasing its effective Firit now you cannot edit a transaction after you submitted it to the network because the Transaction itself is immutable But what you can do is you can spend one of their outputs of the transactions with a Another child transaction that has a very high fee and Now the child transaction can only be valid by the parent getting included in the block So miners will look at transaction packages actually they sort the weight list by the M sister fee rate of transactions not just by transactions in the singular So when you have a child that is super juicy it basically pays for the parent to get included at low as well So literally tell pace for parent got every parents dream to have their children pay for You said that when miners evaluate these Fee rates is that built in the Bitcoin core are they writing custom software for that Bitcoin core has a get black template corn which allows you to exactly do that just generate a black template But I believe that most miners are probably running custom code because for example They accept out of band payments to reprioritize transactions or they run their own wallet service on this side and always prioritize their own transactions or They might have some sort of other solver that optimizes block template building further So I think that I haven't looked at this in detail, but I think that at least they're not running default values because By default blocks created by Bitcoin core would leave a little space. I think about six kilo bytes and blocks are full if you look at them. So they must have at least treated a little bit and we're not when we say miners We're talking about pools. Yes, right so most miners as in the People running a six or whatever They just join our pool who does the coordination of the work and They basically the pool operator picks the block template that is being worked on and the miner just gets a separate workspace that they iterate over in order to try to fund the This problem sounds hard. Why is it hard to estimate periods? 
So block discovery is a random process think of like Decay of radio active isotopes What we do there is we can give you a half time It usually takes around this much of time for half of the atoms to Disappade But we can't tell you if we look at a single atom when it's actually gonna Disappade it might be immediately it might be at the half time it might take decades Right with blocks that's the same thing there in average coming in at I think about 9.7 minutes But when the next block is gonna be found is up to this random press on process Actually it is such that since there is no memory to the process It's every draw just has a chance to succeed at every point in time The next block is about 10 minutes away in the average. Yeah, it's really intuitive to think about that Right if even if you're 18 minutes into not finding a block the next block will be found in 10 minutes Yes, exactly you don't know when the next block is gonna be found So you don't know what transactions you will be competing against you might be competing against the transactions that I Translate in the man pool plus the transactions that get added in the next one minute You might be competing against the transactions in the man pool plus 10 minutes or plus 60 minutes Because about once a day There's a block that takes 60 minutes really you have this one shot to pick exactly the right view To slide in at the bottom of the block that you want to be in because if you don't slide in at the bottom of the block You're overpay and if you underestimate you're not gonna get confirmed in the time that you were aiming to be confident And so how do exchanges usually do this are they overpaying? Are they just estimating the the upper end? Maybe like who's paying those fees? Right, there's different scenarios some exchanges have different tiers like low-time preference and high-time preference or whatever and they treat those differently But generally most exchanges by now batch their withdrawals Which gives them a way to leverage their scale So if you're sending to 20 people every minute Making one transaction out of that is a lot cheaper than making 20 separate payments It's also much easier to manage your due-to-to-pull that way and And Then they just tend to very conservatively estimate their fees just Be in the next two blocks and maybe rather overpay slightly because it's so much less work To deal with all the customer compliance over step-transactions than to to pay like sure we're overpaying by 30% to be in the next block But it's not them that's overpaying Is they usually that gives passed on the customer? There's different models. I think in most actually the exchange pays But they take a flat fee for a withdraw or really yeah, so like it's time for a very long time for example I'd like I think a 90 cent 90 Euro cent flat restraw fee But then they'd bet every few minutes only you said that the member who hasn't really been empty for almost four months Yeah, that's correct. Is the ever gonna empty again as we go to the moon does the what happens to the man pool? Yeah, that's a great question. 
I think we'll eventually see a man pool empty again But there should probably be a long tail end to it emptying Because now in this for months a lot of the exchanges that usually would do consolidations to keep their you take so pool sized Manageable they haven't been able to get any of those through so when the fee rates go down now I think that we'll see more people put in their consolidation transactions had like three to five such as per bite And that I think we might not see an empty man pool for multiple months So even if the top fee rates get a lot more relaxed now Generally the competition to be in blocks seems to correlate with volatility and especially price rises when when the market Heats up and and people are more excited to trade There's more transaction volume on the network and Now we've seen in the past four weeks or so the price has been going more sideways There might have been even a small dips here and there and the top fee rates have come down On the on the weekends that's dropped first to seven set of sheet per bite then six and now last weekend Six was clear completely I don't think that getting a one set of super by transaction a true will be possible at any time soon But it'll be very possible to wait to the weekend to get a ten set of super by transactions Maybe from like a more met-of-you know the miners like this don't they like having high fees because One is revenue for them but also As we sort of zoom out we think about the decreasing block reward over time Don't we have to have a high fee environment in order for this this is the work under one hand You have to also consider that the exchange rate 10x in the last year So the same fee rates represent a 10x purchasing value in cost for Getting a sense to the same service a transaction into a block so while the fee rates are similar The cost of getting a transaction through has actually increased there miners do love it because I think he rates make about 17% or so of the block reward right now So I'm not sure yeah, that's that's a nice little tip right But there's definitely a concern that when we continue to reduce the blocks subsidy in the every four year having rewards schedule that eventually the system will have to subside just transaction fees and if the transaction fees are to low it will Basically not be Economic for miners to provide security to the bit-ten system so there's a good argument for not Increasing the block space To our degree where it's always gonna be empty if you want to do that you essentially have to Also switch to an endless block subsidy otherwise there is no economic incentive for miners to continue mining if there's Not enough fees unless unless your minimum fee rate at some point becomes So valuable that even at minimum fee rate any transactions are Some sort of sufficient revenue for miners to continue their business Maybe we can sort of circle back to what happens when transactions are elected from the mumpul and to talk about like what problems Like it introduced especially for fee bumping and and lightning channel closing right When a mumpul fills up as we said earlier the node will start dropping the lowest fee rate transactions And especially for people or services that use Unconfirmed inputs that can be a problem at times because you cannot Spend an input that is unknown to other nodes Right, so if all other nodes on a network have dropped a transaction Your polar option that spends the output from that drop transaction will not be able to relay on in that work So you 
Cannot only not spend your hands, but you can also not Repair or ties the prior transaction One thing that this solves basically is RBF because you can just rewrite a replaceable transaction and submit a transaction with a higher fee rate All right, so we went over CPSP can we go over our BS? Sure So dip one 25 introduces rules by which you are allowed to replace transactions You have to explicitly signal data transaction is replaceable and In that case before a transaction is confirmed the sender may issue an updated version of the transaction Which can completely change the outputs the only restriction is that it has to use one of the same inputs Otherwise it wouldn't be a replacement And wouldn't be so it has to be a conflicting transaction essentially and Additionally, it has to pay enough fees to replace the prior transaction and all the transactions that changed off of them In the mimpul so if you had like three transactions you have to pay more fees and the replacement than those three transactions together All right, so blast double spending It's over site. I do not like to term double spending in that context So the problem with that is a successful double spend means that Either you actually got two transactions that were in conflict confirmed Which could basically only happen if you have two competing blocks where one Block had a prior version and a second block had a netty and then the second block eventually becomes part of the best chain Or when you at least convince somebody that they had been paid But then actually managed to spend the funds somewhere else But here in this case are the f transactions are Explicitly labeling themselves as replaceable basically they're running around with a red lettered sign on front of their chest Do not trust me, right and most wallets are for just doesn't show your rb f transactions until they are confirmed Once confirmed in the blockchain they're exactly the same and same reliability as any other transaction But while tuing they are explicitly saying look, I could be replaced do not consider yourself paid So calling this a double spend is really just saying that well Somebody made extremely unreasonable assumptions about the reliability of a transaction that explicitly warned them that It's not reliable so I like conflicting transactions more in this context and maybe why do we need two ways to bump fees? 
Why do we need RBF and CPFB right so they have slightly different traders CPFP allows any recipient of a transaction to bump it Right that could be a recipient in the sense that the person that got paid or The sender if there was a change I'd put on the transaction it also doesn't change the tx ID because you're just training Advait transactions on it and it it takes more blocks this right because you now have to send a second transaction in order to Increase the effective fee rate of the first so more blocks this Easier to keep track off and more flexibility as in there's more parties than can interact with it RBF on the other hand allows you to completely replace the transaction Which means that it is more flexible But you potentially have to pay more fee use especially if somebody else changed off of your transaction already It changes the tx ID and a lot of wallets and services have been tracking payments by the tx ID Rather than looking at like what address got paid what the amount was or Whatever as in like treating the current addresses as in voices as they should be used they built a whole system around tx IDs so our BF transactions that They changed the inputs are outputs right otherwise they couldn't change the fees and that means that they have a new tx ID And it it is not trivial to keep proper track of that and to update your UX and UI to make that Easily accessible to your users right then also only the sender can run per transaction Think that because they have to reissue the updated variant of the transaction Given that it is a little more difficult to interact with our BF transactions a lot of services Only see them once they're confirmed once they're reliable so if you're trying to get a service To give you something very quickly You might want to choose to not do an RBS transaction the first place though that they can Reasonably assume oh this has a high enough fee read and we know the user We can trust them that they're sending us these three dollars actually and give them Existed it so I don't know what I thought it okay So we asked the question like what problems do does an employee eviction Cause for fee bumping and and also maybe the lightning Channel closing use case we talked a bit generically about how parent transactions being gone Staps you from being able to spend those unconfermed output But does this especially a problem in the context of lightning because when you close a lightning channel It's either the collaborative case where you have no problem because you can really go see it the closing transaction with your partner But where you really needed you're trying to unilatory close because your partner has Channel partner hasn't shown up in all then if you you have to fall back to the transaction that you had negotiated Sometime in the past when you last updated the commitment transaction So let's say that was in a low fee read time and now the fee read suffix loaded and you can't actually even Broadcast the commitment transaction to the network because it's too low fee read Now the problem is the parted that is closing the lightning channel under the LN penalty update mechanism their funds are actually locked with a csv So they can't do cp because The output is only spendable after the transaction is confirmed for multiple blocks So you can't obtain a transaction to a output that is not spendable while it's still in the income Especially for enlightening this introduces the volatility in the the block space martin introduces a headache because You can 
literally come into the situation where you can close your lightning channel due to the fee read So one approach I've heard about is to introduce anchor outputs Which are depending on the proposal either spendable by either side or spendable on certain conditions But they immediately spendable so they can be cp of peat or Another idea is to have package relay right because if the Channel closing transaction has a low fee read and you can then relay it together with a second transaction That'll work except if you're in the naturally closing because the csv issue still Pretends to that but either way if you get package relay you would be able to do away with the the estimate and Commitment transactions altogether because we talked about how fee estimation is hard for regular transactions The estimation for a commitment transactions is even much harder because you have no clue when you will want to use the transaction Yeah, that depends if they seem Very scary right you you have absolutely no clue what the fee rates will be like when when you actually try to use it So having Packetry lay would in combination with anchor outputs would allow you to always have a zero fee on the commitment transaction And then basically always bring your own fee when you broadcast it in the Cp P P Touch Transaction got it okay, so we sort of talked about specific but maybe we can zoom out and you know What are some ways that we could be using our box based more efficiently? What are some things that make us optimistic about the future? We still have Only about I think 40% or so Segwood inputs now about 50% percent of all transactions use Segwood inputs, but the majority of inputs is still non Segwood Once more people start using Segwood or even tap root once tap root comes out The input sizes will be smaller. So naturally there will be more space for more transactions So recently a major service wall service provider and How I'm first of April nonetheless that they would be switching to native Segwood addresses and they they had been a long holdout So blockchain.com has Probably around 33% of all transactions creations among their use of this yeah, I mean that dependency is We're shaking our heads simultaneously Great Segwood activated on 24th of August in 2017 Right, that's three and a half years ago and until recently I think they they weren't even able to send to native Segwood addresses and now they announced that not only They'll actually default native Segwood addresses altogether I think they claimed this month, but I'm hoping that they'll come true with that because we have a huge backlock of all use outputs that they created over the years It has been one of the most popular bit carnwallas for yeah almost a decade And it will take forever for all of these non Segwood outputs to eventually get spent But the observation is that most Inputs are Consuming just very young outputs so funds that got moved are much more likely to move again soon so Seeing that Chanda.com will hopefully switch to native Segwood output soon I would assume that even while the U.T.s O set will have a lot of non Segwood outputs living there for a very long time The transactions that get built very well much quicker become Segwood transactions to a high degree If 33% of all transactions let's say 80% of them become Segwood inputs and Literally more than half their input sides that would be I want to say like 15% of the current blocks based demand Going away overnight. Yeah, yeah That's right. 
I think they calculate more more certainly, but other other holdouts Bit max recently switched to native Segwood I think for deposits there's still Quite a few services that use rap Segwood rather than native Segwood which already gets most of the efficiency but clearly not all it was actually expecting that the high fee rates might get more people moving I think that the tap root roll out might get a huge Block space efficiency gain because Tap root introduces a bunch of new features that are only available fruit Tap root and tap root outputs and inputs are about the size of pay-to-avitness public key hash in total So smaller than a lot of the multi-state constructions these days even in native Segwood and Definitely smaller than everything non Segwood so any any wallet that switched to tap root roll Bring down the blocks based use a lot quickly. Yeah, the multi-state savings are pretty significant yeah And local little bring in a new era of Multi-state being more standard. I think that'd be that's the system work setting thing it'll take quite some time because to do the Public key aggregation that will bring the biggest efficiency gain people will actually have to implement new Segwood or another aggregation algorithm and until that gets into regular wallets Will be a while. I think maybe the first it gets into libraries and and Especially for services with multi Segwood wallets There would be a huge efficiency gain there and they they should have Great incentives to roll it out very quickly Great Thanks for listening to another episode of chain code decoded and we're gonna keep it rolling we'll have another one next week Yeah, let's talk about maybe how the blockchain works go back to basics in a time\n" } diff --git a/test/test_audio.py b/test/test_audio.py index 28aaaed..53fe977 100644 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -4,6 +4,7 @@ import pytest from app import application +from app.transcription import Transcription def rel_path(path): @@ -67,32 +68,21 @@ def test_audio_with_title(): source = rel_path("testAssets/audio.mp3") title = "title" username = "username" - filename, tmp_dir = application.process_source( - source=source, - title=title, - event_date=None, - tags=None, - category=None, - speakers=None, - loc=rel_path("yada/yada"), - model="tiny", + transcription = Transcription( username=username, - source_type="audio", - local=True, - test=result, - chapters=False, - pr=False, + test_mode=True, ) - filename = os.path.join(tmp_dir, filename) - assert os.path.isfile(filename) + transcription.add_transcription_source(source, title) + transcripts = transcription.start() + assert os.path.isfile(transcripts[0]) assert check_md_file( - path=filename, + path=transcripts[0], transcript_by=username, media=source, title=title, local=True, ) - application.clean_up(tmp_dir) + transcription.clean_up() @pytest.mark.feature @@ -102,29 +92,15 @@ def test_audio_without_title(): file.close() source = rel_path("test/testAssets/audio.mp3") - username = "username" title = None - filename, tmp_dir = application.process_source( - source=source, - title=title, - event_date=None, - tags=None, - category=None, - speakers=None, - loc=rel_path("yada/yada"), - model="tiny", - username=username, - pr=False, - source_type="audio", - local=True, - test=result, - chapters=False, + transcription = Transcription( + test_mode=True ) - assert filename is None - assert not check_md_file( - path=filename, transcript_by=username, media=source, title=title - ) - application.clean_up(tmp_dir) + 
transcription.add_transcription_source(source, title) + with pytest.raises(Exception) as error: + transcription.start() + assert "Please supply a title for the audio file" in str(error) + transcription.clean_up() @pytest.mark.feature @@ -139,31 +115,20 @@ def test_audio_with_all_data(): tags = "tag1, tag2" category = "category" date = "2020-01-31" - date = datetime.strptime(date, "%Y-%m-%d").date() - filename, tmp_dir = application.process_source( - source=source, - title=title, - event_date=date, - tags=tags, - category=category, - speakers=speakers, - loc=rel_path("yada/yada"), - model="tiny", + transcription = Transcription( username=username, - source_type="audio", - local=True, - test=result, - chapters=False, - pr=False, + test_mode=True, ) + transcription.add_transcription_source( + source, title, date, tags, category, speakers) + transcripts = transcription.start() + category = [cat.strip() for cat in category.split(",")] tags = [tag.strip() for tag in tags.split(",")] speakers = [speaker.strip() for speaker in speakers.split(",")] - date = date.strftime("%Y-%m-%d") - filename = os.path.join(tmp_dir, filename) - assert os.path.isfile(filename) + assert os.path.isfile(transcripts[0]) assert check_md_file( - path=filename, + path=transcripts[0], transcript_by=username, media=source, title=title, @@ -173,4 +138,4 @@ def test_audio_with_all_data(): speakers=speakers, local=True, ) - application.clean_up(tmp_dir) + transcription.clean_up() diff --git a/test/test_cli.py b/test/test_cli.py index f59a404..00e3403 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -4,6 +4,7 @@ import pytest from app import application +from app.transcription import Transcription def rel_path(path): @@ -21,53 +22,64 @@ def test_initialize_repo(): assert False -@pytest.mark.feature +@pytest.mark.main def test_find_source_type(): - assert application.check_source_type("B0HW_sJ503Y")[0] == "video" - assert application.check_source_type("https://www.youtube.com/watch?v=B0HW_sJ503Y")[0] == "video" - assert application.check_source_type("https://youtu.be/B0HW_sJ503Y")[0] == "video" - assert application.check_source_type("https://youtube.com/embed/B0HW_sJ503Y")[0] == "video" - assert application.check_source_type("youtube.com/watch?v=B0HW_sJ503Y")[0] == "video" - assert application.check_source_type("www.youtube.com/watch?v=B0HW_sJ503Y&list")[0] == "video" - assert application.check_source_type("https://youtube.com/watch?v=B0HW_sJ503Y")[0] == "video" - - assert application.check_source_type("PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj")[0] == "playlist" - assert application.check_source_type("https://www.youtube.com/playlist?list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj")[0] == "playlist" - assert application.check_source_type("www.youtube.com/playlist?list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj")[0] == "playlist" - assert application.check_source_type("https://youtube.com/playlist?list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj")[0] == "playlist" - assert application.check_source_type("https://www.youtube.com/watch?v=B0HW_sJ503Y&list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj")[0] == "playlist" - - assert application.check_source_type("https://anchor.fm/s/12fe0620/podcast/play/32260353/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2021-3-26%2Fdc6f12e7-a547-d872-6ef6-7acfe755a692.mp3")[0] == "audio" - + transcription = Transcription( + test_mode=True, + ) + assert transcription._calculate_source_type("B0HW_sJ503Y") == "video" + assert transcription._calculate_source_type( + "https://www.youtube.com/watch?v=B0HW_sJ503Y") == "video" + 
assert transcription._calculate_source_type( + "https://youtu.be/B0HW_sJ503Y") == "video" + assert transcription._calculate_source_type( + "https://youtube.com/embed/B0HW_sJ503Y") == "video" + assert transcription._calculate_source_type( + "youtube.com/watch?v=B0HW_sJ503Y") == "video" + assert transcription._calculate_source_type( + "www.youtube.com/watch?v=B0HW_sJ503Y&list") == "video" + assert transcription._calculate_source_type( + "https://youtube.com/watch?v=B0HW_sJ503Y") == "video" + + assert transcription._calculate_source_type( + "PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj") == "playlist" + assert transcription._calculate_source_type( + "https://www.youtube.com/playlist?list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj") == "playlist" + assert transcription._calculate_source_type( + "www.youtube.com/playlist?list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj") == "playlist" + assert transcription._calculate_source_type( + "https://youtube.com/playlist?list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj") == "playlist" + assert transcription._calculate_source_type( + "https://www.youtube.com/watch?v=B0HW_sJ503Y&list=PLPQwGV1aLnTuN6kdNWlElfr2tzigB9Nnj") == "playlist" + + assert transcription._calculate_source_type( + "https://anchor.fm/s/12fe0620/podcast/play/32260353/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2021-3-26%2Fdc6f12e7-a547-d872-6ef6-7acfe755a692.mp3") == "audio" + transcription.clean_up() +@pytest.mark.feature def test_download_audio_file(): - if not os.path.isdir("tmp"): - os.mkdir("tmp") - audio = application.get_audio_file( - "https://dcs.megaphone.fm/FPMN6776580946.mp3", "test" + transcription = Transcription( + test_mode=True, ) - print("audio", audio) - assert os.path.isfile(audio) - os.remove(audio) + transcription.add_transcription_source( + "https://dcs.megaphone.fm/FPMN6776580946.mp3", "test") + audio_file, tmp_dir = transcription.transcripts[0].process_source( + transcription.tmp_dir) + assert os.path.isfile(audio_file) + application.clean_up(tmp_dir) +@pytest.mark.feature def test_download_video_file(): - if not os.path.isdir("tmp"): - os.mkdir("tmp") - url = "https://www.youtube.com/watch?v=B0HW_sJ503Y" - video = application.download_video(url) - assert os.path.isfile(video) and os.path.isfile("tmp/videoFile.info.json") - print() - os.remove(video) - os.remove("tmp/videoFile.info.json") - shutil.rmtree("tmp") - - -@pytest.mark.main -def test_convert_video_to_audio(): - if not os.path.isdir("tmp/"): - os.makedirs("tmp/") - application.convert_video_to_mp3(rel_path("testAssets/test_video.mp4")) - assert os.path.isfile("tmp/test_video.mp3") - shutil.rmtree("tmp/") + transcription = Transcription( + test_mode=True, + ) + transcription.add_transcription_source( + "https://www.youtube.com/watch?v=B0HW_sJ503Y") + audio_file, tmp_dir = transcription.transcripts[0].process_source( + transcription.tmp_dir) + assert os.path.isfile(f"{audio_file[:-4]}.mp4") # video download + assert os.path.isfile(audio_file) # mp3 convert + assert os.path.isfile(f"{tmp_dir}/videoFile.info.json") # metadata + application.clean_up(tmp_dir) diff --git a/test/test_video.py b/test/test_video.py index a4b0406..83c2fb2 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -6,6 +6,7 @@ import pytest from app import application +from app.transcription import Transcription def rel_path(path): @@ -90,9 +91,6 @@ def check_md_file( @pytest.mark.feature def test_video_with_title(): - with open(rel_path("testAssets/transcript.txt"), "r") as file: - result = file.read() - file.close() source = 
os.path.abspath(rel_path("testAssets/test_video.mp4")) username = "username" title = "test_video" @@ -100,26 +98,17 @@ def test_video_with_title(): tags = None category = None date = None - filename, tmp_dir = application.process_source( - source=source, - title=title, - event_date=date, - tags=tags, - category=category, - speakers=speakers, - loc="yada/yada", - model="tiny", + transcription = Transcription( username=username, - source_type="video", - local=True, - test=result, - chapters=False, + test_mode=True, ) - assert tmp_dir is not None - filename = os.path.join(tmp_dir, filename) - assert os.path.isfile(filename) + transcription.add_transcription_source( + source, title, date, tags, category, speakers) + transcripts = transcription.start() + + assert os.path.isfile(transcripts[0]) assert check_md_file( - path=filename, + path=transcripts[0], transcript_by=username, media=source, title=title, @@ -129,7 +118,7 @@ def test_video_with_title(): speakers=speakers, local=True, ) - application.clean_up(tmp_dir) + transcription.clean_up() @pytest.mark.feature @@ -141,32 +130,21 @@ def test_video_with_all_options(): tags = "tag1, tag2" category = "category" date = "2020-01-31" - date = datetime.strptime(date, "%Y-%m-%d").date() - filename, tmp_dir = application.process_source( - source=source, - title=title, - event_date=date, - tags=tags, - category=category, - speakers=speakers, - loc="yada/yada", - model="tiny", + + transcription = Transcription( username=username, - source_type="video", - local=True, - test=True, - chapters=False, + test_mode=True, ) - assert tmp_dir is not None - filename = os.path.join(tmp_dir, filename) - assert os.path.isfile(filename) + transcription.add_transcription_source( + source, title, date, tags, category, speakers) + transcripts = transcription.start() + assert os.path.isfile(transcripts[0]) category = [cat.strip() for cat in category.split(",")] tags = [tag.strip() for tag in tags.split(",")] speakers = [speaker.strip() for speaker in speakers.split(",")] - date = date.strftime("%Y-%m-%d") assert check_md_file( - path=filename, + path=transcripts[0], transcript_by=username, media=source, title=title, @@ -176,7 +154,7 @@ def test_video_with_all_options(): speakers=speakers, local=True, ) - application.clean_up(tmp_dir) + transcription.clean_up() @pytest.mark.feature @@ -191,25 +169,16 @@ def test_video_with_chapters(): tags = "tag1, tag2" category = "category" date = "2020-01-31" - date = datetime.strptime(date, "%Y-%m-%d").date() - filename, tmp_dir = application.process_source( - source=source, - title=title, - event_date=date, - tags=tags, - category=category, - speakers=speakers, - loc="yada/yada", - model="tiny", + + transcription = Transcription( username=username, - source_type="video", - local=True, - test=result, chapters=True, - pr=True, + test_mode=True, ) - assert tmp_dir is not None - filename = os.path.join(tmp_dir, filename) + transcription.add_transcription_source( + source, title, date, tags, category, speakers) + transcripts = transcription.start(result) + chapter_names = [] with open(rel_path("testAssets/test_video_chapters.chapters"), "r") as file: result = file.read() @@ -218,14 +187,12 @@ def test_video_with_chapters(): chapter_names.append(x.split("= ")[1].strip()) file.close() - print(filename) - assert os.path.isfile(filename) + assert os.path.isfile(transcripts[0]) category = [cat.strip() for cat in category.split(",")] tags = [tag.strip() for tag in tags.split(",")] speakers = [speaker.strip() for speaker in speakers.split(",")] - 
date = date.strftime("%Y-%m-%d") assert check_md_file( - path=filename, + path=transcripts[0], transcript_by=username, media=source, title=title, @@ -236,28 +203,34 @@ def test_video_with_chapters(): chapters=chapter_names, local=True, ) - application.clean_up(tmp_dir) + transcription.clean_up() @pytest.mark.feature def test_generate_payload(): - date = "2020-01-31" - date = datetime.strptime(date, "%Y-%m-%d").date() with open(rel_path("testAssets/transcript.txt"), "r") as file: transcript = file.read() file.close() - payload = application.generate_payload( - loc=rel_path("yada/yada"), - title="test_title", - event_date=date, - tags=["tag1", "tag2"], - test=True, - category=["category1", "category2"], - speakers=["speaker1", "speaker2"], + + source = rel_path("testAssets/test_video.mp4") + username = "username" + title = "test_title" + speakers = ["speaker1", "speaker2"] + tags = ["tag1", "tag2"] + category = ["category1", "category2"] + date = "2020-01-31" + loc = "yada/yada" + + transcription = Transcription( + loc=loc, username="username", - media=rel_path("testAssets/test_video.mp4"), - transcript=transcript, + test_mode=True, ) + transcription.add_transcription_source( + source, title, date, tags, category, speakers) + transcription.start(transcript) + payload = transcription.push_to_queue(transcription.transcripts[0]) + transcription.clean_up() with open(rel_path("testAssets/payload.json"), "r") as outfile: content = json.load(outfile) outfile.close()
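For reference, the refactored flow that the updated tests exercise looks roughly like the sketch below. It is reconstructed from the test changes only: app/transcription.py itself is not part of this diff, so the constructor options (e.g. test_mode) and the positional order of add_transcription_source are assumptions taken from the tests above, not a definitive description of the new API.

    # Illustrative sketch of the Transcription-based flow (assumptions noted inline).
    from app.transcription import Transcription

    # Constructor arguments mirror those used in the tests; test_mode=True is assumed
    # to keep start() from opening a PR or pushing to the queue.
    transcription = Transcription(username="username", test_mode=True)

    # Sources are queued with optional metadata (date, tags, category, speakers).
    # Per test_audio_without_title, a bare audio file with no title makes start()
    # raise "Please supply a title for the audio file".
    transcription.add_transcription_source(
        "test/testAssets/audio.mp3", "title",
        "2020-01-31", "tag1, tag2", "category", "speaker1, speaker2",
    )

    transcripts = transcription.start()  # returns a list of generated .md file paths
    payload = transcription.push_to_queue(transcription.transcripts[0])  # used where tests previously called application.generate_payload
    transcription.clean_up()  # used in these tests in place of application.clean_up(tmp_dir)

Compared with the removed process_source/process_audio/process_video helpers, the class carries its own state (transcription.transcripts, transcription.tmp_dir), so callers no longer pass filename/tmp_dir pairs between functions.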