Skip to content

Commit

Permalink
new generate_subs_based_on_punc function (#218)
Browse files Browse the repository at this point in the history
* update generate cn subs

* add new example file: src/streaming_with_cn_subtitles.py

* modified:   src/streaming_with_cn_subtitles.py

* update new re clause

* modified:   src/streaming_with_cn_subtitles.py

* update comment and rename file

---------

Co-authored-by: wh1te-moon <[email protected]>
  • Loading branch information
wh1te-moon and wh1te-moon authored May 8, 2024
1 parent 388e6f2 commit d7880c8
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 1 deletion.
70 changes: 69 additions & 1 deletion src/edge_tts/submaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import math
import re
from typing import List, Tuple
from xml.sax.saxutils import escape, unescape

Expand Down Expand Up @@ -98,7 +99,7 @@ def generate_subs(self, words_in_cue: int = 10) -> str:
if sub_state_count == words_in_cue or idx == len(self.offset) - 1:
subs = sub_state_subs
split_subs: List[str] = [
subs[i : i + 79] for i in range(0, len(subs), 79)
subs[i: i + 79] for i in range(0, len(subs), 79)
]
for i in range(len(split_subs) - 1):
sub = split_subs[i]
Expand All @@ -123,3 +124,70 @@ def generate_subs(self, words_in_cue: int = 10) -> str:
sub_state_start = -1
sub_state_subs = ""
return data

def generate_subs_based_on_punc(self, text: str) -> str:
    """
    generate_subs_based_on_punc generates WebVTT subtitles with one cue per
    punctuation-delimited clause of the given text.

    The text is split at clause punctuation (ASCII and fullwidth forms), at
    line breaks, and at a full stop that is followed by a space. Punctuation
    stays attached to the clause it ends, line breaks are dropped, and any
    punctuation found before the first clause is attached to the front of
    that clause. The resulting clauses are also stored on ``self.text_list``.

    Each clause is then timed with the word entries in ``self.subs`` and
    ``self.offset``. A clause always takes the next unused word and keeps
    taking the following words for as long as each one can be found in the
    clause *after* the previously matched word. The cue starts at the first
    taken word's ``offset[...][0]`` and ends at the last taken word's
    ``offset[...][1]``. Clauses left over once the words run out produce no
    cue.

    Args:
        text (str): The full text the recorded words were taken from.

    Returns:
        str: The subtitles in WebVTT format.

    Raises:
        ValueError: If ``self.subs`` and ``self.offset`` differ in length.
    """
    if len(self.subs) != len(self.offset):
        raise ValueError("subs and offset are not of the same length")

    # Single-character clause delimiters. They are escaped before being put
    # into a pattern so that regex metacharacters such as "?" match
    # literally instead of breaking the expression.
    marks = re.escape(
        ",。!?;:“”"
        "\uff0c\uff01\uff1f\uff1b\uff1a"  # fullwidth forms of , ! ? ; :
    )
    # A full stop only ends a clause when a space follows it, which keeps
    # decimals such as "3.14" in one piece. The capture group makes
    # re.split return the delimiters as well as the text between them.
    splitter = re.compile("([" + marks + r"\n]|\. )")
    # A piece made only of delimiters, dots and spaces carries no words.
    delimiter_run = re.compile("[" + marks + r"\n. ]+")

    clauses: List[str] = []
    leading = ""
    for piece in splitter.split(text):
        if not piece or piece == "\n":
            # Empty gaps between adjacent delimiters and line breaks only
            # separate clauses; they never appear in a cue.
            continue
        if delimiter_run.fullmatch(piece):
            if clauses:
                clauses[-1] += piece
            else:
                # Nothing to attach to yet (e.g. an opening quote at the
                # very start); keep it for the first real clause so it
                # does not become a cue of its own and use up a word.
                leading += piece
        else:
            clauses.append(leading + piece)
            leading = ""
    self.text_list = clauses

    data = "WEBVTT\r\n\r\n"
    total_words = len(self.subs)
    j = 0
    for clause in clauses:
        if j >= total_words:
            # No timing information left for the remaining clauses.
            break
        start_time = self.offset[j][0]
        # Search forward only: a word that merely re-occurs earlier in the
        # clause (e.g. "a", "the") must not be mistaken for the next word.
        found = clause.find(self.subs[j])
        cursor = 0 if found == -1 else found + len(self.subs[j])
        while j + 1 < total_words:
            upcoming = self.subs[j + 1]
            found = clause.find(upcoming, cursor)
            if found == -1:
                break
            cursor = found + len(upcoming)
            j += 1
        data += formatter(start_time, self.offset[j][1], clause)
        j += 1
    return data


if __name__ == "__main__":
    # Manual smoke check: record two timed words, then print the
    # punctuation-based subtitles followed by an empty line.
    demo = SubMaker()
    for timing, word in (((0, 15000), " 你好,"), ((15000, 15000), "世界!")):
        demo.create_sub(timing, word)
    print(demo.generate_subs_based_on_punc("你好,世界!"))
    print()
42 changes: 42 additions & 0 deletions src/streaming_with_new_subtitles_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import asyncio
import edge_tts
from edge_tts.communicate import Communicate

# Sample passage passed both to the synthesizer and to the subtitle generator.
# It contains commas, colons, semicolons, sentence-ending periods and decimal
# numbers such as 3.14 and 1.618.
TEXT = """Title: Exploring the Beauty of Mathematics
Mathematics, often regarded as the language of the universe, encompasses a myriad of concepts, from basic arithmetic to complex calculus. It's a discipline that transcends boundaries and delves into the depths of abstraction.
In mathematics, precision is paramount. Numbers dance across the page, guided by symbols such as +, -, ×, and ÷, each punctuation mark playing a crucial role in shaping equations and expressions. These symbols, like punctuation in language, clarify and organize mathematical ideas.
Consider the beauty of decimals, those subtle points that delineate fractions of wholes. They appear unassumingly yet hold profound significance in calculations. Whether it's 3.14, the beloved pi, or the golden ratio 1.618, decimals offer glimpses into the elegant patterns underlying the chaos of numbers.
But mathematics isn't just about numbers and symbols; it's about discovery and exploration. It's about unraveling the mysteries of the universe, from the microscopic world of quantum mechanics to the vast expanse of cosmology. Punctuation marks in mathematics, much like their linguistic counterparts, serve as signposts on this journey, guiding us through the intricate landscapes of mathematical thought.
So let us embrace the beauty of mathematics, where decimals and punctuation marks converge to form the tapestry of our understanding, illuminating the path to new insights and discoveries."""
# Voice selection is currently disabled; no VOICE constant is defined.
# VOICE = "zh-CN-YunxiNeural"
# File that receives the audio chunks of the stream (opened in binary mode).
OUTPUT_FILE = "test.mp3"
# File that receives the generated WebVTT subtitles (written as UTF-8).
WEBVTT_FILE = "test.vtt"

async def amain() -> None:
    """Stream TEXT through Communicate, saving audio and subtitles.

    Audio chunks are written to OUTPUT_FILE as they arrive, every
    "WordBoundary" event is recorded in a SubMaker, and once the stream is
    exhausted the punctuation-based subtitles are written to WEBVTT_FILE.
    """
    tts = Communicate(TEXT)
    sub_maker = edge_tts.SubMaker()

    with open(OUTPUT_FILE, "wb") as audio_out:
        async for event in tts.stream():
            kind = event["type"]
            if kind == "audio":
                audio_out.write(event["data"])
            elif kind == "WordBoundary":
                timing = (event["offset"], event["duration"])
                sub_maker.create_sub(timing, event["text"])

    vtt_content = sub_maker.generate_subs_based_on_punc(TEXT)
    with open(WEBVTT_FILE, "w", encoding="utf-8") as vtt_out:
        vtt_out.write(vtt_content)


# Run the example only when executed as a script, so that importing this
# module (for instance during test collection) does not start a synthesis
# request. asyncio.run creates a fresh event loop, runs amain() to completion
# and closes the loop afterwards, replacing the deprecated
# get_event_loop_policy().get_event_loop() lookup.
if __name__ == "__main__":
    asyncio.run(amain())

0 comments on commit d7880c8

Please sign in to comment.