interruptor.py
from concurrent.futures import ThreadPoolExecutor
import threading
import time
import numpy
import io
import soundfile as sf
import sounddevice as sd
from fuzzywuzzy import fuzz
from openai import OpenAI
from llama_cpp import Llama
from dotenv import load_dotenv
import os
# whisper(user audio) -> query
# llama(query) -> completion
# loop until k-token convergence
# gpt(completion) -> response
# tts(response) -> response audio
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(
    api_key=openai_api_key,
    base_url="http://oai.hconeai.com/v1",
    default_headers={"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}"},
)
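# The pipeline sketch above begins with "whisper(user audio) -> query", but that
# step is not implemented in this file. Below is a minimal, hypothetical sketch of
# it, assuming the user's audio has already been saved to a file on disk; it is not
# wired into the demo in __main__, which feeds a hard-coded text query instead.
def whisper(audio_path):
    # Transcribe recorded user audio into a text query with OpenAI's Whisper API.
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1", file=audio_file
        )
    return transcript.text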
query_completions = []  # cache of (query, completion) pairs from earlier calls
def find_converged_completion(query):
    # Given an incomplete query, complete it. If a prior completion matches the
    # last k tokens of the user's query, the prediction has converged: return the
    # old query joined with its completion. Otherwise cache a fresh completion.
    for old_query, completion in query_completions[::-1]:
        if check_match(completion, query):
            return " ".join([old_query, completion])
    completion = llm(query)
    if completion.startswith(query):
        # Strip only the echoed prefix; replace() would also drop later occurrences.
        completion = completion[len(query):]
    query_completions.append((query, completion))
    return None
def check_match(completion, query, k=3, fuzz_threshold=60):
    # Check whether the last k tokens of the query fuzzily match the first k
    # tokens of the completion. If so, return True.
    query_toks = query.split(" ")[-k:]
    completion_toks = completion.split(" ")[:k]
    query_str = " ".join(query_toks)
    completion_str = " ".join(completion_toks)
    if fuzz.ratio(query_str, completion_str) > fuzz_threshold:
        return True
    else:
        return False
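# To illustrate the matching direction, assuming the example strings below (they
# are not from the demo): with k=3, the last three words the user has said so far
# are compared against the first three words of a stored completion, e.g.
#   check_match("wind in my hair.", "On a dark desert highway, cool wind in my")
# compares "wind in my" against "wind in my" and returns True.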
# model = Llama("./models/llama-2-7b.Q4_K_M.gguf")
# def llm(query):
#     response = model(
#         query,
#         max_tokens=6,
#         stop=[
#             ".",
#             "\n",
#         ],
#         echo=False,  # Echo the prompt back in the output
#     )
#     completion = response['choices'][0]['text']
#     print("COMPLETION: ", completion, "\n")
#     return completion
def llm(query):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "Please complete the user's text. Do not repeat the user's text. Do not add or say anything else. Do NOT answer their query or respond to them. Simply respond by 'autocompleting' their sentence. If they've completed a sentence, continue with the next sentence.",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=64,
    )
    print("COMPLETION: ", response.choices[0].message.content, "\n")
    return response.choices[0].message.content
def gpt(completion):
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": completion},
        ],
        max_tokens=48,
    )
    print("INTERRUPTION: ", response.choices[0].message.content)
    return response.choices[0].message.content
def tts(response, voice="onyx"):
    spoken_response = client.audio.speech.create(
        model="tts-1", voice=voice, response_format="opus", input=response
    )
    buffer = io.BytesIO()
    for chunk in spoken_response.iter_bytes(chunk_size=4096):
        buffer.write(chunk)
    buffer.seek(0)
    with sf.SoundFile(buffer, "r") as sound_file:
        data = sound_file.read(dtype="int16")
        sd.play(data, sound_file.samplerate)
        sd.wait()
def tts_from_future(future):
    gpt_result = future.result()  # This will block until the gpt function completes
    tts(gpt_result, "onyx")
def thoughtocomplete(query):
    completion = find_converged_completion(query)
    if completion:
        print("THOUGHTOCOMPLETE: ", completion)
        # tts(f"Thought o complete: {completion}", voice="shimmer")
        threading.Thread(
            target=tts,
            args=("interrupting cow! interrupting cow! interrupting cow!", "onyx"),
        ).start()
        with ThreadPoolExecutor() as executor:
            future = executor.submit(gpt, completion)
            threading.Thread(target=tts_from_future, args=(future,)).start()
        return True
    return False
if __name__ == "__main__":
    query = (
        "What is the following song? On a dark desert highway, cool wind in my hair."
    )
    # query = "Who said this quote? To be or not to be, that is the question."
    query_toks = query.split(" ")
    for i in range(1, len(query_toks)):
        curr_query = " ".join(query_toks[:i])
        print(curr_query)
        # tts(query_toks[i-1], voice="nova")
        done = thoughtocomplete(curr_query)
        if done:
            break
# query = "What is the following song? On a dark desert highway"
# completion = llm(query)
# print(completion)
# For AI to be an extension of thought, convos must be fluid.
# When I speak, you predict what I'll say.
# If you understand what I'm saying, you interrupt me.
# LLMs don't do this.