diff --git a/app/ui/settings/settings_page.py b/app/ui/settings/settings_page.py
index e52c7a5..935cbcd 100644
--- a/app/ui/settings/settings_page.py
+++ b/app/ui/settings/settings_page.py
@@ -61,7 +61,9 @@ def get_text_rendering_settings(self):
     def get_llm_settings(self):
         return {
             'extra_context': self.ui.llm_widgets['extra_context'].toPlainText(),
-            'image_input_enabled': self.ui.llm_widgets['image_input'].isChecked()
+            'image_input_enabled': self.ui.llm_widgets['image_input'].isChecked(),
+            'local_oai_url_input': self.ui.llm_widgets['local_oai_url_input'].text(),
+            'local_oai_model_input': self.ui.llm_widgets['local_oai_model_input'].text()
         }
 
     def get_export_settings(self):
@@ -268,6 +270,8 @@ def load_settings(self):
         settings.beginGroup('llm')
         self.ui.llm_widgets['extra_context'].setPlainText(settings.value('extra_context', ''))
         self.ui.llm_widgets['image_input'].setChecked(settings.value('image_input_enabled', True, type=bool))
+        self.ui.llm_widgets['local_oai_url_input'].setText(settings.value('local_oai_url_input', ''))
+        self.ui.llm_widgets['local_oai_model_input'].setText(settings.value('local_oai_model_input', ''))
         settings.endGroup()
 
         # Load export settings
diff --git a/app/ui/settings/settings_ui.py b/app/ui/settings/settings_ui.py
index 3cb7d02..085b5e2 100644
--- a/app/ui/settings/settings_ui.py
+++ b/app/ui/settings/settings_ui.py
@@ -38,7 +38,7 @@ def __init__(self, parent=None):
                          self.tr("Claude-3-Opus"), self.tr("Claude-3.5-Sonnet"),
                          self.tr("Claude-3-Haiku"), self.tr("Gemini-1.5-Flash"),
                          self.tr("Gemini-1.5-Pro"), self.tr("Yandex"), self.tr("Google Translate"),
-                         self.tr("Microsoft Translator")]
+                         self.tr("Microsoft Translator"), self.tr("Local OpenAI Server")]
 
         self.languages = ['English', '한국어', 'Français', '日本語',
                           '简体中文', '繁體中文', 'русский', 'Deutsch',
@@ -78,6 +78,7 @@ def __init__(self, parent=None):
             self.tr("Yandex"): "Yandex",
             self.tr("Google Translate"): "Google Translate",
             self.tr("Microsoft Translator"): "Microsoft Translator",
+            self.tr("Local OpenAI Server"): "Local OpenAI Server",
 
             # OCR mappings
             self.tr("Default"): "Default",
@@ -435,9 +436,39 @@ def _create_llms_layout(self):
         image_checkbox.setChecked(True)
         self.llm_widgets['image_input'] = image_checkbox
 
+        # Local OpenAI
+        local_oai_label = MLabel(self.tr("Local OpenAI Server")).strong()
+
+        local_oai_url_input = MLineEdit()
+        local_oai_url_input.setFixedWidth(400)
+        local_oai_url_input.setPlaceholderText("http://localhost:1337/v1/")
+        local_oai_url_prefix = MLabel(self.tr("Base API URL")).border()
+
+        self.set_label_width(local_oai_url_prefix)
+        local_oai_url_prefix.setAlignment(QtCore.Qt.AlignmentFlag.AlignCenter)
+        local_oai_url_input.set_prefix_widget(local_oai_url_prefix)
+
+        self.llm_widgets["local_oai_url_input"] = local_oai_url_input
+
+        local_oai_model_input = MLineEdit()
+        local_oai_model_input.setFixedWidth(400)
+        local_oai_model_input.setPlaceholderText("llama3.1-8b-instruct")
+        local_oai_model_prefix = MLabel(self.tr("Model ID")).border()
+
+        self.set_label_width(local_oai_model_prefix)
+        local_oai_model_prefix.setAlignment(QtCore.Qt.AlignmentFlag.AlignCenter)
+        local_oai_model_input.set_prefix_widget(local_oai_model_prefix)
+
+        self.llm_widgets["local_oai_model_input"] = local_oai_model_input
+
+
         llms_layout.addWidget(prompt_label)
         llms_layout.addWidget(self.llm_widgets['extra_context'])
         llms_layout.addWidget(image_checkbox)
+        llms_layout.addSpacing(20)  # Add 20 pixels of vertical spacing
+        llms_layout.addWidget(local_oai_label)
+        llms_layout.addWidget(local_oai_url_input)
+        llms_layout.addWidget(local_oai_model_input)
         llms_layout.addStretch(1)
 
         return llms_layout
diff --git a/modules/ocr/ocr.py b/modules/ocr/ocr.py
index 9050578..baae780 100644
--- a/modules/ocr/ocr.py
+++ b/modules/ocr/ocr.py
@@ -74,7 +74,7 @@ def process(self, img: np.ndarray, blk_list: List[TextBlock]):
         elif self.gpt_ocr:
             credentials = self.settings.get_credentials(self.settings.ui.tr("Open AI GPT"))
             api_key = credentials['api_key']
-            gpt_client = get_llm_client('GPT', api_key)
+            gpt_client = get_llm_client('GPT', api_key, None)
             return self._ocr_gpt(img, blk_list, gpt_client)
 
         else:
diff --git a/modules/translator.py b/modules/translator.py
index 03a92a9..5663721 100644
--- a/modules/translator.py
+++ b/modules/translator.py
@@ -21,7 +21,8 @@ def __init__(self, main_page, source_lang: str = "", target_lang: str = ""):
         self.target_lang_en = self.get_english_lang(main_page, self.target_lang)
 
         self.api_key = self.get_api_key(self.translator_key)
-        self.client = get_llm_client(self.translator_key, self.api_key)
+        self.base_url = self.get_base_url(self.translator_key)
+        self.client = get_llm_client(self.translator_key, self.api_key, self.base_url)
 
         self.img_as_llm_input = self.settings.get_llm_settings()['image_input_enabled']
 
@@ -38,7 +39,8 @@ def get_translator_key(self, localized_translator: str) -> str:
             self.settings.ui.tr("Google Translate"): "Google Translate",
             self.settings.ui.tr("Microsoft Translator"): "Microsoft Translator",
             self.settings.ui.tr("DeepL"): "DeepL",
-            self.settings.ui.tr("Yandex"): "Yandex"
+            self.settings.ui.tr("Yandex"): "Yandex",
+            self.settings.ui.tr("Local OpenAI Server"): "Local OpenAI Server",
         }
         return translator_map.get(localized_translator, localized_translator)
 
@@ -53,9 +55,16 @@ def get_llm_model(self, translator_key: str):
             "Claude-3.5-Sonnet": "claude-3-5-sonnet-20240620",
             "Claude-3-Haiku": "claude-3-haiku-20240307",
             "Gemini-1.5-Flash": "gemini-1.5-flash-latest",
-            "Gemini-1.5-Pro": "gemini-1.5-pro-latest"
+            "Gemini-1.5-Pro": "gemini-1.5-pro-latest",
+            "Local OpenAI Server": self.settings.ui.llm_widgets['local_oai_model_input'].text()
         }
-        return model_map.get(translator_key)
+
+        model = model_map.get(translator_key)
+
+        if not model and translator_key == "Local OpenAI Server":
+            raise ValueError(f"Model not found for translator: {translator_key}")
+
+        return model
 
     def get_system_prompt(self, source_lang: str, target_lang: str):
         return f"""You are an expert translator who translates {source_lang} to {target_lang}. You pay attention to style, formality, idioms, slang etc and try to convey it in the way a {target_lang} speaker would understand.
@@ -89,7 +98,31 @@ def get_gpt_translation(self, user_prompt: str, model: str, system_prompt: str, image: np.ndarray):
 
         translated = response.choices[0].message.content
         return translated
-    
+
+    def get_local_gpt_translation(self, user_prompt: str, model: str, system_prompt: str, image: np.ndarray):
+        encoded_image = encode_image_array(image)
+
+        if self.img_as_llm_input:
+            message = [
+                {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
+                {"role": "user", "content": [{"type": "text", "text": user_prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}]}
+            ]
+        else:
+            message = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ]
+
+        response = self.client.chat.completions.create(
+            model=model,
+            messages=message,
+            temperature=1,
+            max_tokens=1000,
+        )
+
+        translated = response.choices[0].message.content
+        return translated
+
     def get_claude_translation(self, user_prompt: str, model: str, system_prompt: str, image: np.ndarray):
         encoded_image = encode_image_array(image)
         media_type = "image/png"
@@ -192,6 +225,8 @@ def translate(self, blk_list: List[TextBlock], image: np.ndarray, extra_context:
             elif 'Gemini' in self.translator_key:
                 image = cv2_to_pil(image)
                 entire_translated_text = self.get_gemini_translation(user_prompt, model, system_prompt, image)
+            elif 'Local OpenAI' in self.translator_key:
+                entire_translated_text = self.get_local_gpt_translation(user_prompt, model, system_prompt, image)
 
             set_texts_from_json(blk_list, entire_translated_text)
 
@@ -216,7 +251,18 @@ def get_api_key(self, translator_key: str):
         }
 
         api_key = api_key_map.get(translator_key, "")
-        if not api_key and translator_key != 'Google Translate':
+        if not api_key and translator_key not in ['Google Translate', 'Local OpenAI Server']:
             raise ValueError(f"API key not found for translator: {translator_key}")
 
-        return api_key
\ No newline at end of file
+        return api_key
+
+    def get_base_url(self, translator_key: str):
+        base_url = None
+
+        if 'Local OpenAI' in translator_key:
+            base_url = self.settings.ui.llm_widgets['local_oai_url_input'].text()
+
+        if not base_url and translator_key == "Local OpenAI Server":
+            raise ValueError(f"Base URL not found for translator: {translator_key}")
+
+        return base_url
\ No newline at end of file
diff --git a/modules/utils/translator_utils.py b/modules/utils/translator_utils.py
index b204bd2..db086e2 100644
--- a/modules/utils/translator_utils.py
+++ b/modules/utils/translator_utils.py
@@ -15,7 +15,7 @@ def encode_image_array(img_array: np.ndarray):
     _, img_bytes = cv2.imencode('.png', img_array)
     return base64.b64encode(img_bytes).decode('utf-8')
 
-def get_llm_client(translator: str, api_key: str):
+def get_llm_client(translator: str, api_key: str, base_url: str):
    if 'GPT' in translator:
        client = OpenAI(api_key = api_key)
    elif 'Claude' in translator:
@@ -23,6 +23,11 @@ def get_llm_client(translator: str, api_key: str):
    elif 'Gemini' in translator:
        client = genai
        client.configure(api_key = api_key)
+   elif 'Local OpenAI' in translator:
+       client = OpenAI(
+           api_key = "none",  # Must be set, but not checked
+           base_url = base_url
+       )
    else:
        client = None
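
Reviewer note: below is a minimal sketch, not part of the patch, of how the new Local OpenAI path fits together end to end. It assumes an OpenAI-compatible server (e.g. Jan, LM Studio, or llama.cpp's server) is already running; the base URL and model ID are just the placeholder values from the settings UI, not required values.

from modules.utils.translator_utils import get_llm_client

# 'Local OpenAI' in the translator name selects the new branch; the api_key
# argument is unused there, since get_llm_client passes a dummy key to the
# OpenAI client and local servers don't validate it.
client = get_llm_client('Local OpenAI Server', '', 'http://localhost:1337/v1/')

# Same request shape as get_local_gpt_translation's text-only branch.
response = client.chat.completions.create(
    model='llama3.1-8b-instruct',  # whichever model ID your server exposes
    messages=[
        {"role": "system", "content": "You are an expert translator."},
        {"role": "user", "content": "Translate to French: Good morning."},
    ],
    temperature=1,
    max_tokens=1000,
)
print(response.choices[0].message.content)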