interruptor.py
from concurrent.futures import ThreadPoolExecutor
import threading
import time
import numpy
import io
import soundfile as sf
import sounddevice as sd
from fuzzywuzzy import fuzz
from openai import OpenAI
from llama_cpp import Llama
from dotenv import load_dotenv
import os
# whisper(user audio) -> query
# llama(query) -> completion
# loop until k-token convergence
# gpt(completion) -> response
# tts(response) -> response audio
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(
    api_key=openai_api_key,
    base_url="http://oai.hconeai.com/v1",
    default_headers={"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}"},
)
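# The pipeline sketch above begins with "whisper(user audio) -> query", but that
# step is not implemented in this file. Below is a minimal, hypothetical sketch of
# it, assuming the user's audio has already been saved to a file on disk; it is not
# wired into the demo in __main__, which feeds a hard-coded text query instead.
def whisper(audio_path):
    # Transcribe recorded user audio into a text query with OpenAI's Whisper API.
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1", file=audio_file
        )
    return transcript.text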
query_completions = []  # cache of (query, completion) pairs from earlier calls
def find_converged_completion(query):
    # Given an incomplete query, complete it. If a prior completion matches the
    # last k tokens of the user's query, the prediction has converged: return the
    # old query joined with its completion. Otherwise cache a fresh completion.
    for old_query, completion in query_completions[::-1]:
        if check_match(completion, query):
            return " ".join([old_query, completion])
    completion = llm(query)
    if completion.startswith(query):
        # Strip only the echoed prefix; replace() would also drop later occurrences.
        completion = completion[len(query):]
    query_completions.append((query, completion))
    return None
def check_match(completion, query, k=3, fuzz_threshold=60):
    # Check whether the last k tokens of the query fuzzily match the first k
    # tokens of the completion. If so, return True.
    query_toks = query.split(" ")[-k:]
    completion_toks = completion.split(" ")[:k]
    query_str = " ".join(query_toks)
    completion_str = " ".join(completion_toks)
    if fuzz.ratio(query_str, completion_str) > fuzz_threshold:
        return True
    else:
        return False
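# To illustrate the matching direction, assuming the example strings below (they
# are not from the demo): with k=3, the last three words the user has said so far
# are compared against the first three words of a stored completion, e.g.
#   check_match("wind in my hair.", "On a dark desert highway, cool wind in my")
# compares "wind in my" against "wind in my" and returns True.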
# model = Llama("./models/llama-2-7b.Q4_K_M.gguf")
# def llm(query):
#     response = model(
#         query,
#         max_tokens=6,
#         stop=[
#             ".",
#             "\n",
#         ],
#         echo=False,  # Echo the prompt back in the output
#     )
#     completion = response['choices'][0]['text']
#     print("COMPLETION: ", completion, "\n")
#     return completion
def llm(query):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "Please complete the user's text. Do not repeat the user's text. Do not add or say anything else. Do NOT answer their query or respond to them. Simply respond by 'autocompleting' their sentence. If they've completed a sentence, continue with the next sentence.",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=64,
    )
    print("COMPLETION: ", response.choices[0].message.content, "\n")
    return response.choices[0].message.content
def gpt(completion):
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": completion},
        ],
        max_tokens=48,
    )
    print("INTERRUPTION: ", response.choices[0].message.content)
    return response.choices[0].message.content
def tts(response, voice="onyx"):
    spoken_response = client.audio.speech.create(
        model="tts-1", voice=voice, response_format="opus", input=response
    )
    buffer = io.BytesIO()
    for chunk in spoken_response.iter_bytes(chunk_size=4096):
        buffer.write(chunk)
    buffer.seek(0)
    with sf.SoundFile(buffer, "r") as sound_file:
        data = sound_file.read(dtype="int16")
        sd.play(data, sound_file.samplerate)
        sd.wait()
def tts_from_future(future):
    gpt_result = future.result()  # This will block until the gpt function completes
    tts(gpt_result, "onyx")
def thoughtocomplete(query):
    completion = find_converged_completion(query)
    if completion:
        print("THOUGHTOCOMPLETE: ", completion)
        # tts(f"Thought o complete: {completion}", voice="shimmer")
        threading.Thread(
            target=tts,
            args=("interrupting cow! interrupting cow! interrupting cow!", "onyx"),
        ).start()
        with ThreadPoolExecutor() as executor:
            future = executor.submit(gpt, completion)
            threading.Thread(target=tts_from_future, args=(future,)).start()
        return True
    return False
if __name__ == "__main__":
    query = (
        "What is the following song? On a dark desert highway, cool wind in my hair."
    )
    # query = "Who said this quote? To be or not to be, that is the question."
    query_toks = query.split(" ")
    for i in range(1, len(query_toks)):
        curr_query = " ".join(query_toks[:i])
        print(curr_query)
        # tts(query_toks[i-1], voice="nova")
        done = thoughtocomplete(curr_query)
        if done:
            break
# query = "What is the following song? On a dark desert highway"
# completion = llm(query)
# print(completion)
# For AI to be an extension of thought, convos must be fluid.
# When I speak, you predict what I'll say.
# If you understand what I'm saying, you interrupt me.
# LLMs don't do this.