Skip to content

Commit

Permalink
Merge pull request #264 from harry0703/dev
Browse files Browse the repository at this point in the history
Support Azure's new speech voices and fix the bug where clips were not closed
  • Loading branch information
harry0703 authored Apr 16, 2024
2 parents b9b9bea + d4eb7bc commit 414bcb0
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 25 deletions.
5 changes: 4 additions & 1 deletion app/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,16 @@ def save_config():
_cfg["app"] = app
_cfg["whisper"] = whisper
_cfg["pexels"] = pexels
_cfg["azure"] = azure
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))


_cfg = load_config()
app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
pexels = _cfg.get("pexels", {})
azure = _cfg.get("azure", {})
ui = _cfg.get("ui", {})

hostname = socket.gethostname()
Expand All @@ -53,7 +56,7 @@ def save_config():
project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
project_description = _cfg.get("project_description",
"<a href='https://github.com/harry0703/MoneyPrinterTurbo'>https://github.com/harry0703/MoneyPrinterTurbo</a>")
project_version = _cfg.get("project_version", "1.1.1")
project_version = _cfg.get("project_version", "1.1.2")
reload_debug = False

imagemagick_path = app.get("imagemagick_path", "")
Expand Down
5 changes: 3 additions & 2 deletions app/controllers/v1/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID

sm.state.delete_task(task_id)
logger.success(f"video deleted: {utils.to_json(task)}")
return utils.get_response(200, task)
return utils.get_response(200)

raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")

Expand Down Expand Up @@ -190,4 +190,5 @@ async def download_video(_: Request, file_path: str):
headers = {
"Content-Disposition": f"attachment; filename={filename}{extension}"
}
return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}", media_type=f'video/{extension[1:]}')
return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}",
media_type=f'video/{extension[1:]}')
9 changes: 5 additions & 4 deletions app/services/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,18 @@ def combine_videos(combined_video_path: str,
clips.append(clip)
video_duration += clip.duration

final_clip = concatenate_videoclips(clips)
final_clip = final_clip.set_fps(30)
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info(f"writing")
# https://github.com/harry0703/MoneyPrinterTurbo/issues/111#issuecomment-2032354030
final_clip.write_videofile(filename=combined_video_path,
video_clip.write_videofile(filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success(f"completed")
return combined_video_path

Expand Down Expand Up @@ -263,7 +264,7 @@ def create_text_clip(subtitle_item):
logger=None,
fps=30,
)

video_clip.close()
logger.success(f"completed")


Expand Down
133 changes: 129 additions & 4 deletions app/services/voice.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import asyncio
import os
import re
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts.submaker import mktimestamp
from loguru import logger
from edge_tts import submaker, SubMaker
import edge_tts
from moviepy.video.tools import subtitles

from app.config import config
from app.utils import utils


def get_all_voices(filter_locals=None) -> list[str]:
def get_all_azure_voices(filter_locals=None) -> list[str]:
if filter_locals is None:
filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
voices_str = """
Expand Down Expand Up @@ -956,6 +958,34 @@ def get_all_voices(filter_locals=None) -> list[str]:
Name: zu-ZA-ThembaNeural
Gender: Male
Name: en-US-AvaMultilingualNeural-V2
Gender: Female
Name: en-US-AndrewMultilingualNeural-V2
Gender: Male
Name: en-US-EmmaMultilingualNeural-V2
Gender: Female
Name: en-US-BrianMultilingualNeural-V2
Gender: Male
Name: de-DE-FlorianMultilingualNeural-V2
Gender: Male
Name: de-DE-SeraphinaMultilingualNeural-V2
Gender: Female
Name: fr-FR-RemyMultilingualNeural-V2
Gender: Male
Name: fr-FR-VivienneMultilingualNeural-V2
Gender: Female
Name: zh-CN-XiaoxiaoMultilingualNeural-V2
Gender: Female
""".strip()
voices = []
name = ''
Expand Down Expand Up @@ -986,11 +1016,26 @@ def get_all_voices(filter_locals=None) -> list[str]:
def parse_voice_name(name: str):
    """Strip the gender marker from a voice display name.

    Example inputs:
        zh-CN-XiaoyiNeural-Female
        zh-CN-YunxiNeural-Male
        zh-CN-XiaoxiaoMultilingualNeural-V2-Female

    Every "-Female" substring and then every "-Male" substring is removed,
    and surrounding whitespace is trimmed from the result.
    """
    for gender_tag in ("-Female", "-Male"):
        name = name.replace(gender_tag, "")
    return name.strip()


def is_azure_v2_voice(voice_name: str):
    """Return the bare voice id if ``voice_name`` is a "-V2" voice, else "".

    The gender marker is removed first via ``parse_voice_name``. If the
    remaining name ends with "-V2", that trailing marker is dropped and the
    rest is returned (a truthy string). For any other voice an empty string
    is returned, so the result can be used directly as a boolean test.

    Example: "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
             -> "zh-CN-XiaoxiaoMultilingualNeural"
    """
    v2_marker = "-V2"
    voice_name = parse_voice_name(voice_name)
    if voice_name.endswith(v2_marker):
        # Remove only the trailing marker; a "-V2" elsewhere in the name is
        # left untouched.
        return voice_name[:-len(v2_marker)].strip()
    return ""


def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
    """Synthesize ``text`` into ``voice_file`` with the backend matching the voice.

    Voices that ``is_azure_v2_voice`` recognises are handled by
    ``azure_tts_v2``; every other voice goes to ``azure_tts_v1``. The chosen
    backend's result is returned unchanged.
    """
    synthesize = azure_tts_v2 if is_azure_v2_voice(voice_name) else azure_tts_v1
    return synthesize(text, voice_name, voice_file)


def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
text = text.strip()
for i in range(3):
try:
Expand Down Expand Up @@ -1019,6 +1064,80 @@ async def _do() -> SubMaker:
return None


def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
    """Synthesize ``text`` with the Azure Speech SDK and collect word timings.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        voice_name: A "-V2" voice name, optionally with a gender suffix
            (e.g. "zh-CN-XiaoxiaoMultilingualNeural-V2-Female").
        voice_file: Path of the audio file the SDK writes to.

    Returns:
        A ``SubMaker`` filled with one entry per word-boundary event when
        synthesis completes, or ``None`` if all three attempts fail.

    Raises:
        ValueError: If ``voice_name`` is not a "-V2" voice.
    """
    # Local import so the block does not depend on a file-level import change.
    from datetime import timedelta

    # Keep the caller's value so the error message can show what was rejected;
    # is_azure_v2_voice() returns "" for non-V2 names.
    requested_voice_name = voice_name
    voice_name = is_azure_v2_voice(requested_voice_name)
    if not voice_name:
        logger.error(f"invalid voice name: {requested_voice_name}")
        raise ValueError(f"invalid voice name: {requested_voice_name}")
    text = text.strip()

    def _format_duration_to_offset(duration) -> int:
        """Convert a duration to 100-nanosecond units (1 ms == 10000 units)."""
        if isinstance(duration, timedelta):
            # Whole microseconds * 10 -> 100 ns units. Avoids the str()/strptime
            # round trip, which breaks when the microsecond part is zero.
            return (duration // timedelta(microseconds=1)) * 10

        if isinstance(duration, str):
            # "H:MM:SS" has no fractional part when microseconds are zero.
            time_format = "%H:%M:%S.%f" if "." in duration else "%H:%M:%S"
            time_obj = datetime.strptime(duration, time_format)
            milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
                    time_obj.microsecond // 1000)
            return milliseconds * 10000

        if isinstance(duration, int):
            # Integers are passed through unchanged (already in target units).
            return duration

        return 0

    for i in range(3):
        try:
            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")

            # Imported lazily inside the try so a missing SDK is logged like
            # any other failure instead of breaking module import.
            import azure.cognitiveservices.speech as speechsdk

            sub_maker = SubMaker()

            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
                # Record the spoken text and its (start, end) offsets.
                # NOTE(review): per the Speech SDK docs evt.duration is a
                # timedelta and evt.audio_offset is an int in 100 ns ticks.
                duration = _format_duration_to_offset(evt.duration)
                offset = _format_duration_to_offset(evt.audio_offset)
                sub_maker.subs.append(evt.text)
                sub_maker.offset.append((offset, offset + duration))

            # Creates an instance of a speech config with specified subscription key and service region.
            speech_key = config.azure.get("speech_key", "")
            service_region = config.azure.get("speech_region", "")
            audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
            speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                                   region=service_region)
            speech_config.speech_synthesis_voice_name = voice_name
            # Word-level boundary events drive the subtitle timings.
            speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
                                       value='true')

            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
            speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
                                                             speech_config=speech_config)
            speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)

            result = speech_synthesizer.speak_text_async(text).get()
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
                return sub_maker
            elif result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
            # Reached only when this attempt did not succeed; the loop retries.
            logger.info(f"completed, output file: {voice_file}")
        except Exception as e:
            logger.error(f"failed, error: {str(e)}")
    return None


def _format_text(text: str) -> str:
# text = text.replace("\n", " ")
text = text.replace("[", " ")
Expand Down Expand Up @@ -1131,15 +1250,20 @@ def get_audio_duration(sub_maker: submaker.SubMaker):


if __name__ == "__main__":
voices = get_all_voices()
print(voices)
voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
voice_name = parse_voice_name(voice_name)
voice_name = is_azure_v2_voice(voice_name)
print(voice_name)

voices = get_all_azure_voices()
print(len(voices))


async def _do():
temp_dir = utils.storage_dir("temp")

voice_names = [
"zh-CN-XiaoxiaoMultilingualNeural",
# 女性
"zh-CN-XiaoxiaoNeural",
"zh-CN-XiaoyiNeural",
Expand Down Expand Up @@ -1174,6 +1298,7 @@ async def _do():
业绩解读
利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
"""
text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"

text = _format_text(text)
lines = utils.split_string_by_punctuations(text)
Expand All @@ -1182,7 +1307,7 @@ async def _do():
for voice_name in voice_names:
voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
subtitle_file = f"{temp_dir}/tts.mp3.srt"
sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
audio_duration = get_audio_duration(sub_maker)
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
Expand Down
2 changes: 1 addition & 1 deletion app/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def split_string_by_punctuations(s):
else:
result.append(txt.strip())
txt = ""

result.append(txt.strip())
# filter empty string
result = list(filter(None, result))
return result
Expand Down
8 changes: 7 additions & 1 deletion config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,10 @@
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
# http = "http://10.10.1.10:3128"
# https = "http://10.10.1.10:1080"
# https = "http://10.10.1.10:1080"

[azure]
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key=""
speech_region=""
8 changes: 7 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,10 @@ g4f~=0.2.5.4
dashscope~=1.15.0
google.generativeai~=0.4.1
python-multipart~=0.0.9
redis==5.0.3
redis==5.0.3
# If you use pillow~=10.3.0, you will get the "'PIL.Image' has no attribute 'ANTIALIAS'" error when resizing video.
# Please install opencv-python to fix the "'PIL.Image' has no attribute 'ANTIALIAS'" error.
opencv-python
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
Loading

0 comments on commit 414bcb0

Please sign in to comment.