Skip to content

Commit

Permalink
Merge pull request #286 from TomTom101/feat/regex_endofsentence
Browse files Browse the repository at this point in the history
fix: No longer falsely detect a sentence end on "U.S.A", "3:00 a.m."
  • Loading branch information
aconchillo authored Jul 17, 2024
2 parents 9e3d87e + da2082b commit 029bbc1
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 5 deletions.
22 changes: 19 additions & 3 deletions src/pipecat/services/ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,24 @@
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.audio import calculate_audio_volume
from pipecat.utils.utils import exp_smoothing
import re


# Verbose regex deciding whether the *last* character of a (right-stripped)
# text chunk terminates a sentence. It is applied to partially streamed text,
# so it must not fire on punctuation that may still be the middle of an
# abbreviation, an ordinal or a time of day.
ENDOFSENTENCE_PATTERN_STR = r"""
    (?:
        [?!]                 # "?" and "!" are never part of an abbreviation: always a sentence end
      |
        (?<!\b[A-Z])         # Not preceded by a lone capital letter (e.g., "U.S.A."); "NASA." still ends
        (?<!\d)              # Not preceded by a digit (e.g., "1. Let's start", "3:00")
        (?<!\d\s[ap])        # Not preceded by a time of day (e.g., "3:00 a.m.")
        (?<!\bMr|\bMs|\bDr)  # Not preceded by the titles Mr, Ms, Dr (one group: equal width);
                             # the word boundary keeps "LLMs." or "VMs." a sentence end
        (?<!\bMrs)           # Not preceded by the title "Mrs"
        (?<!\bProf)          # Not preceded by the title "Prof"
        [.:]                 # A period or a colon
    )
    $                        # End of string
"""
ENDOFSENTENCE_PATTERN = re.compile(ENDOFSENTENCE_PATTERN_STR, re.VERBOSE)


def match_endofsentence(text: str) -> bool:
    """Return whether ``text`` ends with sentence-terminating punctuation.

    Trailing whitespace is ignored. A final "?" or "!" always counts. A final
    "." or ":" counts unless it directly follows a lone capital letter
    ("U."), a digit ("3.", "3:"), a time marker ("3:00 a.") or one of the
    titles Mr, Ms, Mrs, Dr, Prof.

    Args:
        text: The text accumulated so far.

    Returns:
        True if the text ends a sentence, False otherwise (including for an
        empty or whitespace-only string).
    """
    return ENDOFSENTENCE_PATTERN.search(text.rstrip()) is not None


class AIService(FrameProcessor):
Expand Down Expand Up @@ -137,9 +155,7 @@ async def _process_text_frame(self, frame: TextFrame):
text = frame.text
else:
self._current_sentence += frame.text
if self._current_sentence.strip().endswith(
(".", "?", "!")) and not self._current_sentence.strip().endswith(
("Mr,", "Mrs.", "Ms.", "Dr.")):
if match_endofsentence(self._current_sentence):
text = self._current_sentence
self._current_sentence = ""

Expand Down
20 changes: 18 additions & 2 deletions tests/test_ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from typing import AsyncGenerator

from pipecat.services.ai_services import AIService
from pipecat.pipeline.frames import EndFrame, Frame, TextFrame
from pipecat.services.ai_services import AIService, match_endofsentence
from pipecat.frames.frames import EndFrame, Frame, TextFrame


class SimpleAIService(AIService):
Expand All @@ -27,6 +27,22 @@ async def test_simple_processing(self):

self.assertEqual(input_frames, output_frames)

async def test_endofsentence(self):
    """Check match_endofsentence() on complete and incomplete text chunks.

    Uses unittest assertions (not bare ``assert``, which is stripped under
    ``python -O``) and one sub-test per input so that a failure names the
    offending text and the remaining inputs are still checked.
    """
    # Chunks whose last non-blank character really ends a sentence.
    complete = (
        "This is a sentence.",
        "This is a sentence! ",
        "This is a sentence?",
        "This is a sentence:",
    )
    # Chunks that must not be flushed yet: no terminator, or a terminator
    # that belongs to a title, an ordinal, an abbreviation or a time.
    incomplete = (
        "This is not a sentence",
        "This is not a sentence,",
        "This is not a sentence, ",
        "Ok, Mr. Smith let's ",
        "Dr. Walker, I presume ",
        "Prof. Walker, I presume ",
        "zweitens, und 3.",
        "Heute ist Dienstag, der 3.",  # 3. Juli 2024
        "America, or the U.",  # U.S.A.
        "It still early, it's 3:00 a.",  # 3:00 a.m.
    )

    for text in complete:
        with self.subTest(text=text):
            self.assertTrue(match_endofsentence(text))

    for text in incomplete:
        with self.subTest(text=text):
            self.assertFalse(match_endofsentence(text))


if __name__ == "__main__":
unittest.main()

0 comments on commit 029bbc1

Please sign in to comment.