Add support for local models (OpenAI compatible) #157

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion app/ui/settings/settings_page.py
@@ -61,7 +61,9 @@ def get_text_rendering_settings(self):
     def get_llm_settings(self):
         return {
             'extra_context': self.ui.llm_widgets['extra_context'].toPlainText(),
-            'image_input_enabled': self.ui.llm_widgets['image_input'].isChecked()
+            'image_input_enabled': self.ui.llm_widgets['image_input'].isChecked(),
+            'local_oai_url_input': self.ui.llm_widgets['local_oai_url_input'].text(),
+            'local_oai_model_input': self.ui.llm_widgets['local_oai_model_input'].text()
         }
 
     def get_export_settings(self):
@@ -268,6 +270,8 @@ def load_settings(self):
         settings.beginGroup('llm')
         self.ui.llm_widgets['extra_context'].setPlainText(settings.value('extra_context', ''))
         self.ui.llm_widgets['image_input'].setChecked(settings.value('image_input_enabled', True, type=bool))
+        self.ui.llm_widgets['local_oai_url_input'].setText(settings.value('local_oai_url_input', ''))
+        self.ui.llm_widgets['local_oai_model_input'].setText(settings.value('local_oai_model_input', ''))
         settings.endGroup()
 
         # Load export settings
33 changes: 32 additions & 1 deletion app/ui/settings/settings_ui.py
@@ -38,7 +38,7 @@ def __init__(self, parent=None):
                        self.tr("Claude-3-Opus"), self.tr("Claude-3.5-Sonnet"),
                        self.tr("Claude-3-Haiku"), self.tr("Gemini-1.5-Flash"),
                        self.tr("Gemini-1.5-Pro"), self.tr("Yandex"), self.tr("Google Translate"),
-                       self.tr("Microsoft Translator")]
+                       self.tr("Microsoft Translator"), self.tr("Local OpenAI Server")]
 
         self.languages = ['English', '한국어', 'Français', '日本語',
                           '简体中文', '繁體中文', 'русский', 'Deutsch',

@@ -78,6 +78,7 @@ def __init__(self, parent=None):
self.tr("Yandex"): "Yandex",
self.tr("Google Translate"): "Google Translate",
self.tr("Microsoft Translator"): "Microsoft Translator",
self.tr("Local OpenAI Server"): "Local OpenAI Server",

# OCR mappings
self.tr("Default"): "Default",
Expand Down Expand Up @@ -435,9 +436,39 @@ def _create_llms_layout(self):
         image_checkbox.setChecked(True)
         self.llm_widgets['image_input'] = image_checkbox
 
+        # Local OpenAI
+        local_oai_label = MLabel(self.tr("Local OpenAI Server")).strong()
+
+        local_oai_url_input = MLineEdit()
+        local_oai_url_input.setFixedWidth(400)
+        local_oai_url_input.setPlaceholderText("http://localhost:1337/v1/")
+        local_oai_url_prefix = MLabel(self.tr("Base API URL")).border()
+
+        self.set_label_width(local_oai_url_prefix)
+        local_oai_url_prefix.setAlignment(QtCore.Qt.AlignmentFlag.AlignCenter)
+        local_oai_url_input.set_prefix_widget(local_oai_url_prefix)
+
+        self.llm_widgets["local_oai_url_input"] = local_oai_url_input
+
+        local_oai_model_input = MLineEdit()
+        local_oai_model_input.setFixedWidth(400)
+        local_oai_model_input.setPlaceholderText("llama3.1-8b-instruct")
Author comment on the placeholder above: Using llama3.1-8b-instruct as the default model.
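Reply note (not part of this PR): for anyone testing, a quick way to find out which model IDs a local OpenAI-compatible server actually serves, so the Model ID field matches. A sketch assuming the openai Python package (>=1.0) and the placeholder URL above:

from openai import OpenAI

# Hypothetical helper: list the model IDs the local server exposes.
client = OpenAI(api_key="none", base_url="http://localhost:1337/v1/")
for m in client.models.list():
    print(m.id)  # e.g. "llama3.1-8b-instruct"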

+        local_oai_model_prefix = MLabel(self.tr("Model ID")).border()
+
+        self.set_label_width(local_oai_model_prefix)
+        local_oai_model_prefix.setAlignment(QtCore.Qt.AlignmentFlag.AlignCenter)
+        local_oai_model_input.set_prefix_widget(local_oai_model_prefix)
+
+        self.llm_widgets["local_oai_model_input"] = local_oai_model_input
+
+
         llms_layout.addWidget(prompt_label)
         llms_layout.addWidget(self.llm_widgets['extra_context'])
         llms_layout.addWidget(image_checkbox)
+        llms_layout.addSpacing(20)  # Add 20 pixels of vertical spacing
+        llms_layout.addWidget(local_oai_label)
+        llms_layout.addWidget(local_oai_url_input)
+        llms_layout.addWidget(local_oai_model_input)
         llms_layout.addStretch(1)
 
         return llms_layout
2 changes: 1 addition & 1 deletion modules/ocr/ocr.py
@@ -74,7 +74,7 @@ def process(self, img: np.ndarray, blk_list: List[TextBlock]):
         elif self.gpt_ocr:
             credentials = self.settings.get_credentials(self.settings.ui.tr("Open AI GPT"))
             api_key = credentials['api_key']
-            gpt_client = get_llm_client('GPT', api_key)
+            gpt_client = get_llm_client('GPT', api_key, None)
             return self._ocr_gpt(img, blk_list, gpt_client)
 
         else:
60 changes: 53 additions & 7 deletions modules/translator.py
@@ -21,7 +21,8 @@ def __init__(self, main_page, source_lang: str = "", target_lang: str = ""):
         self.target_lang_en = self.get_english_lang(main_page, self.target_lang)
 
         self.api_key = self.get_api_key(self.translator_key)
-        self.client = get_llm_client(self.translator_key, self.api_key)
+        self.base_url = self.get_base_url(self.translator_key)
+        self.client = get_llm_client(self.translator_key, self.api_key, self.base_url)
 
         self.img_as_llm_input = self.settings.get_llm_settings()['image_input_enabled']

@@ -38,7 +39,8 @@ def get_translator_key(self, localized_translator: str) -> str:
             self.settings.ui.tr("Google Translate"): "Google Translate",
             self.settings.ui.tr("Microsoft Translator"): "Microsoft Translator",
             self.settings.ui.tr("DeepL"): "DeepL",
-            self.settings.ui.tr("Yandex"): "Yandex"
+            self.settings.ui.tr("Yandex"): "Yandex",
+            self.settings.ui.tr("Local OpenAI Server"): "Local OpenAI Server",
         }
         return translator_map.get(localized_translator, localized_translator)

@@ -53,9 +55,16 @@ def get_llm_model(self, translator_key: str):
Expand All @@ -53,9 +55,16 @@ def get_llm_model(self, translator_key: str):
"Claude-3.5-Sonnet": "claude-3-5-sonnet-20240620",
"Claude-3-Haiku": "claude-3-haiku-20240307",
"Gemini-1.5-Flash": "gemini-1.5-flash-latest",
"Gemini-1.5-Pro": "gemini-1.5-pro-latest"
"Gemini-1.5-Pro": "gemini-1.5-pro-latest",
"Local OpenAI Server": self.settings.ui.llm_widgets['local_oai_model_input'].text()
}
return model_map.get(translator_key)

model = model_map.get(translator_key)

if not model and translator_key == "Local OpenAI Server":
raise ValueError(f"Model not found for translator: {translator_key}")

return model

def get_system_prompt(self, source_lang: str, target_lang: str):
return f"""You are an expert translator who translates {source_lang} to {target_lang}. You pay attention to style, formality, idioms, slang etc and try to convey it in the way a {target_lang} speaker would understand.
@@ -89,7 +98,31 @@ def get_gpt_translation(self, user_prompt: str, model: str, system_prompt: str,
 
         translated = response.choices[0].message.content
         return translated
+
+    def get_local_gpt_translation(self, user_prompt: str, model: str, system_prompt: str, image: np.ndarray):
+        encoded_image = encode_image_array(image)
+
+        if self.img_as_llm_input:
+            message = [
+                {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
+                {"role": "user", "content": [{"type": "text", "text": user_prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}]}
+            ]
+        else:
+            message = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ]
Author comment on lines +111 to +114: This may be a Jan-specific issue, but requests in the format {"type": "text", "text": system_prompt} do not work, since only plain-text content is supported (images are not). However, I left the image-input toggle and the request format for it alone, in case other OpenAI-compatible servers do support images.
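Reply note (a sketch, not code in this PR): one possible workaround would be to send plain strings whenever no image is attached, and only switch to the typed-parts format for an actual image, since plain strings are what the widest set of servers accepts. The helper name below is hypothetical:

def build_messages(system_prompt, user_prompt, encoded_image=None):
    # Plain strings: accepted by Jan and most OpenAI-compatible servers.
    if encoded_image is None:
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    # Typed parts: only for servers that actually support image input.
    return [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}},
        ]},
    ]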


+        response = self.client.chat.completions.create(
+            model=model,
+            messages=message,
+            temperature=1,
+            max_tokens=1000,
+        )
+
+        translated = response.choices[0].message.content
+        return translated
 
     def get_claude_translation(self, user_prompt: str, model: str, system_prompt: str, image: np.ndarray):
         encoded_image = encode_image_array(image)
         media_type = "image/png"
@@ -192,6 +225,8 @@ def translate(self, blk_list: List[TextBlock], image: np.ndarray, extra_context:
             elif 'Gemini' in self.translator_key:
                 image = cv2_to_pil(image)
                 entire_translated_text = self.get_gemini_translation(user_prompt, model, system_prompt, image)
+            elif 'Local OpenAI' in self.translator_key:
+                entire_translated_text = self.get_local_gpt_translation(user_prompt, model, system_prompt, image)
 
         set_texts_from_json(blk_list, entire_translated_text)

@@ -216,7 +251,18 @@ def get_api_key(self, translator_key: str):
         }
         api_key = api_key_map.get(translator_key, "")
 
-        if not api_key and translator_key != 'Google Translate':
+        if not api_key and translator_key not in ['Google Translate', 'Local OpenAI Server']:
             raise ValueError(f"API key not found for translator: {translator_key}")
 
-        return api_key
+        return api_key
+
+    def get_base_url(self, translator_key: str):
+        base_url = None
+
+        if 'Local OpenAI' in translator_key:
+            base_url = self.settings.ui.llm_widgets['local_oai_url_input'].text()
+
+        if not base_url and translator_key == "Local OpenAI Server":
+            raise ValueError(f"Base URL not found for translator: {translator_key}")
+
+        return base_url
Author comment on lines +259 to +268: Added logic to pull the base_url in translator.py and pass it to the get_llm_client() helper, rather than having the helper reach into the settings pane itself.
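Reply note: a side benefit of passing base_url in is that the helper can be exercised without the settings UI at all. A minimal sketch, assuming a local server is running at the example URL and serving the example model ID:

client = get_llm_client("Local OpenAI Server", api_key="", base_url="http://localhost:1337/v1/")
response = client.chat.completions.create(
    model="llama3.1-8b-instruct",          # whatever the local server serves
    messages=[{"role": "user", "content": "ping"}],
    max_tokens=5,
)
print(response.choices[0].message.content)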

7 changes: 6 additions & 1 deletion modules/utils/translator_utils.py
@@ -15,14 +15,19 @@ def encode_image_array(img_array: np.ndarray):
     _, img_bytes = cv2.imencode('.png', img_array)
     return base64.b64encode(img_bytes).decode('utf-8')
 
-def get_llm_client(translator: str, api_key: str):
+def get_llm_client(translator: str, api_key: str, base_url: str):
     if 'GPT' in translator:
         client = OpenAI(api_key = api_key)
     elif 'Claude' in translator:
         client = anthropic.Anthropic(api_key = api_key)
     elif 'Gemini' in translator:
         client = genai
         client.configure(api_key = api_key)
+    elif 'Local OpenAI' in translator:
+        client = OpenAI(
+            api_key = "none",  # Must be set, but not checked
+            base_url = base_url
+        )
     else:
         client = None