Merge pull request #65 from it-at-m/fix-max-tokens
Fix max tokens and version
pilitz authored Sep 11, 2024
2 parents 28d613b + 445bb6a commit 2b96e14
Showing 24 changed files with 172 additions and 101 deletions.
13 changes: 7 additions & 6 deletions app/backend/app.py
@@ -101,12 +101,12 @@ async def chat_stream():
try:
impl = cfg["chat_approaches"]
temperature=request_json['temperature'] or 0.7
-max_tokens=request_json['max_tokens'] or 4096
+max_output_tokens=request_json['max_output_tokens'] or 4096
system_message = request_json['system_message'] or None
model = request_json['model'] or "gpt-4o-mini"
response_generator = impl.run_with_streaming(history= request_json["history"],
temperature=temperature,
-max_tokens=max_tokens,
+max_output_tokens=max_output_tokens,
system_message=system_message,
model=model,
department= department)
@@ -128,13 +128,13 @@ async def chat():
try:
impl = cfg["chat_approaches"]
temperature=request_json['temperature'] or 0.7
-max_tokens=request_json['max_tokens'] or 4096
+max_output_tokens=request_json['max_output_tokens'] or 4096
model_name=request_json['model'] or "gpt-4o-mini"
system_message = request_json['system_message'] or None
history = request_json["history"]
chatResult = impl.run_without_streaming(history= history,
temperature=temperature,
-max_tokens=max_tokens,
+max_output_tokens=max_output_tokens,
system_message=system_message,
department= department,
model_name= model_name)
@@ -150,11 +150,12 @@ async def getConfig():
models= cast(List[ModelsConfig], cfg["configuration_features"]["backend"]["models"])
models_dto_list = []
for model in models:
-dto = ModelsDTO(model_name=model["model_name"], max_tokens=model["max_tokens"], description=model["description"])
+dto = ModelsDTO(model_name=model["model_name"], max_output_tokens=model["max_output_tokens"], max_input_tokens=model["max_input_tokens"], description=model["description"])
models_dto_list.append(dto)
return jsonify({
"frontend": frontend_features,
"models": models_dto_list
"models": models_dto_list,
"version": cfg["configuration_features"]["version"]
})

@bp.route("/statistics", methods=["GET"])
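A side note on the request parsing shown above: the `or` fallbacks only replace values that are present but falsy (null, 0, empty string); a missing key would still raise a `KeyError`. A minimal, self-contained sketch of that behavior — only the key names come from the diff, the payload values are made up for illustration:

```python
# Sketch of the `or`-based defaults used in chat() and chat_stream() above.
# The payload is hypothetical; only the key names appear in the diff.
request_json = {
    "temperature": None,      # client sent null -> default applies
    "max_output_tokens": 0,   # falsy, so this also falls back to 4096
    "model": "",              # empty string falls back to "gpt-4o-mini"
}

temperature = request_json["temperature"] or 0.7
max_output_tokens = request_json["max_output_tokens"] or 4096
model = request_json["model"] or "gpt-4o-mini"

print(temperature, max_output_tokens, model)  # 0.7 4096 gpt-4o-mini
```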
12 changes: 6 additions & 6 deletions app/backend/chat/chat.py
@@ -21,12 +21,12 @@ def __init__(self, llm: RunnableSerializable, config: ApproachConfig, repo: Repo
self.config = config
self.repo = repo

-async def run_with_streaming(self, history: 'list[dict[str, str]]',max_tokens: int, temperature: float, system_message: Optional[str], model: str, department: Optional[str]) -> AsyncGenerator[Chunk, None]:
+async def run_with_streaming(self, history: 'list[dict[str, str]]',max_output_tokens: int, temperature: float, system_message: Optional[str], model: str, department: Optional[str]) -> AsyncGenerator[Chunk, None]:
"""call the llm in streaming mode
Args:
history (list[dict[str, str]]): the history,user and ai messages
-max_tokens (int): max_tokens to generate
+max_output_tokens (int): max_output_tokens to generate
temperature (float): temperature of the llm
system_message (Optional[str]): the system message
department (Optional[str]): from which department comes the call
@@ -40,7 +40,7 @@ async def run_with_streaming(self, history: 'list[dict[str, str]]',max_tokens: i
"""
# configure
config: LlmConfigs = {
"llm_max_tokens": max_tokens,
"llm_max_tokens": max_output_tokens,
"llm_temperature": temperature,
"llm_streaming": True,
"llm": model
@@ -74,12 +74,12 @@ async def run_with_streaming(self, history: 'list[dict[str, str]]',max_tokens: i
info = ChunkInfo(requesttokens=num_tokens_from_messages([msgs[-1]],model), streamedtokens=num_tokens_from_messages([HumanMessage(result)], model))
yield Chunk(type="I", message=info, order=position)

-def run_without_streaming(self, history: "Sequence[dict[str, str]]", max_tokens: int, temperature: float, system_message: Optional[str], department: Optional[str], model_name:str) -> ChatResult:
+def run_without_streaming(self, history: "Sequence[dict[str, str]]", max_output_tokens: int, temperature: float, system_message: Optional[str], department: Optional[str], model_name:str) -> ChatResult:
"""calls the llm in blocking mode, returns the full result
Args:
history (list[dict[str, str]]): the history,user and ai messages
-max_tokens (int): max_tokens to generate
+max_output_tokens (int): max_output_tokens to generate
temperature (float): temperature of the llm
system_message (Optional[str]): the system message
department (Optional[str]): from which department comes the call
@@ -88,7 +88,7 @@ def run_without_streaming(self, history: "Sequence[dict[str, str]]", max_tokens:
ChatResult: the generated text from the llm
"""
config: LlmConfigs = {
"llm_max_tokens": max_tokens,
"llm_max_tokens": max_output_tokens,
"llm_temperature": temperature,
"llm_streaming": False,
}
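For orientation, a hedged sketch of how the renamed streaming path might be consumed. The approach object (`impl`, taken from `cfg["chat_approaches"]` as in app.py) and the history key names are assumptions; only the keyword names come from the new signature above:

```python
# Hypothetical consumer of run_with_streaming; it needs a configured approach
# object, so the final call is left commented out.
import asyncio

async def stream_demo(impl) -> None:
    async for chunk in impl.run_with_streaming(
        history=[{"user": "Hallo", "bot": "Hallo, wie kann ich helfen?"}],  # key names assumed
        max_output_tokens=1024,   # renamed from max_tokens in this PR
        temperature=0.7,
        system_message=None,
        model="gpt-4o-mini",
        department=None,
    ):
        print(chunk)  # Chunk objects as yielded by the generator above

# asyncio.run(stream_demo(impl))
```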
56 changes: 46 additions & 10 deletions app/backend/core/llmhelper.py
@@ -13,7 +13,7 @@ class ModelsConfigurationException(Exception):


def getModel(models: List[ModelsConfig],
-max_tokens: int,
+max_output_tokens: int,
n: int,
temperature: float,
streaming: bool) -> RunnableSerializable:
@@ -31,7 +31,7 @@ def getModel(models: List[ModelsConfig],
openai_api_key=default_model["api_key"],
azure_endpoint=default_model["endpoint"],
openai_api_version=default_model["api_version"],
-max_tokens=max_tokens,
+max_tokens=max_output_tokens,
n=n,
streaming=streaming,
temperature=temperature,
@@ -42,7 +42,7 @@ def getModel(models: List[ModelsConfig],
model=default_model["model_name"],
api_key=default_model["api_key"],
base_url=default_model["endpoint"],
-max_tokens=max_tokens,
+max_tokens=max_output_tokens,
n=n,
streaming=streaming,
temperature=temperature,
@@ -59,20 +59,60 @@ def getModel(models: List[ModelsConfig],
azure_endpoint=model["endpoint"],
openai_api_version=model["api_version"],
openai_api_type="azure",
-max_tokens=max_tokens,
+max_tokens=max_output_tokens,
n=n,
streaming=streaming,
temperature=temperature,
-)
+).configurable_fields(
+temperature=ConfigurableField(
+id="llm_temperature",
+name="LLM Temperature",
+description="The temperature of the LLM",
+),
+max_tokens= ConfigurableField(
+id="llm_max_tokens",
+name="LLM max Tokens",
+description="The token Limit of the LLM",
+),
+streaming = ConfigurableField(
+id="llm_streaming",
+name="Streaming",
+description="Should the LLM Stream"),
+callbacks = ConfigurableField(
+id="llm_callbacks",
+name="Callbacks",
+description="Callbacks for the llm")
+
+)
elif model["type"] == "OPENAI":
alternative = ChatOpenAI(
model=model["model_name"],
api_key=model["api_key"],
base_url=model["endpoint"],
-max_tokens=max_tokens,
+max_tokens=max_output_tokens,
n=n,
streaming=streaming,
temperature=temperature,
+).configurable_fields(
+temperature=ConfigurableField(
+id="llm_temperature",
+name="LLM Temperature",
+description="The temperature of the LLM",
+),
+max_tokens= ConfigurableField(
+id="llm_max_tokens",
+name="LLM max Tokens",
+description="The token Limit of the LLM",
+),
+streaming = ConfigurableField(
+id="llm_streaming",
+name="Streaming",
+description="Should the LLM Stream"),
+callbacks = ConfigurableField(
+id="llm_callbacks",
+name="Callbacks",
+description="Callbacks for the llm")
+
)
alternatives[model["model_name"]] = alternative
llm = llm.configurable_fields(
@@ -86,10 +86,6 @@ def getModel(models: List[ModelsConfig],
name="LLM max Tokens",
description="The token Limit of the LLM",
),
-openai_api_key = ConfigurableField(
-id="llm_api_key",
-name="The api key",
-description="The api key"),
streaming = ConfigurableField(
id="llm_streaming",
name="Streaming",
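The `ConfigurableField` ids registered above (`llm_max_tokens`, `llm_temperature`, `llm_streaming`, `llm_callbacks`) are what the `LlmConfigs` dict in chat.py refers to. A minimal standalone sketch of that LangChain pattern — the model name and placeholder API key are assumptions, and nothing is sent to any API:

```python
# Sketch of the configurable-fields pattern used in llmhelper.py.
from langchain_core.runnables import ConfigurableField
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o-mini",        # assumed model name
    api_key="sk-placeholder",   # placeholder; no request is made here
    max_tokens=256,
    temperature=0.7,
    streaming=False,
).configurable_fields(
    max_tokens=ConfigurableField(
        id="llm_max_tokens",
        name="LLM max Tokens",
        description="The token Limit of the LLM",
    ),
    temperature=ConfigurableField(
        id="llm_temperature",
        name="LLM Temperature",
        description="The temperature of the LLM",
    ),
)

# Per-request overrides address the fields by id, which is what the
# "llm_max_tokens" entry in chat.py's LlmConfigs maps onto:
configured = llm.with_config(configurable={"llm_max_tokens": 1024, "llm_temperature": 0.2})
```

This is also why the constructor calls above keep the keyword `max_tokens=max_output_tokens`: only the helper's own parameter was renamed, not the underlying client argument.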
6 changes: 4 additions & 2 deletions app/backend/core/types/Config.py
@@ -11,11 +11,13 @@ class ModelsConfig(TypedDict):
endpoint: str
api_key: str
api_version: str
-max_tokens: int
+max_output_tokens: int
+max_input_tokens: int

class ModelsDTO(TypedDict):
model_name: str
-max_tokens: int
+max_output_tokens: int
+max_input_tokens: int
description: str

class SSOConfig(TypedDict):
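The split into `max_output_tokens` and `max_input_tokens` is what the frontend changes below rely on: the settings drawer caps its token slider with `max_output_tokens`, while the input components count words against `max_input_tokens`. A self-contained sketch of the per-model DTO shape now served by `getConfig()` — the numbers and description are illustrative only:

```python
# Mirrors ModelsDTO from Config.py after this PR; the values are made up.
from typing import TypedDict

class ModelsDTO(TypedDict):
    model_name: str
    max_output_tokens: int
    max_input_tokens: int
    description: str

dto: ModelsDTO = {
    "model_name": "gpt-4o-mini",
    "max_output_tokens": 4096,
    "max_input_tokens": 128000,
    "description": "illustrative entry",
}
print(dto)
```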
1 change: 0 additions & 1 deletion app/backend/core/types/LlmConfigs.py
@@ -10,6 +10,5 @@ class LlmConfigs(TypedDict, total=False):
llm: NotRequired[str] # one of the SupportedModels
llm_max_tokens: NotRequired[int]
llm_temperature: NotRequired[float]
-llm_api_key: NotRequired[str]
llm_streaming: NotRequired[bool]
llm_callbacks: NotRequired[List]
6 changes: 3 additions & 3 deletions app/backend/init_app.py
@@ -24,19 +24,19 @@ def initApproaches(cfg: BackendConfig, repoHelper: Repository) -> Tuple[Chat, Br
"""
brainstormllm = getModel(
models=cfg["models"],
-max_tokens = 4000,
+max_output_tokens = 4000,
n = 1,
streaming=False,
temperature=0.9)
sumllm = getModel(
models=cfg["models"],
-max_tokens = 2000,
+max_output_tokens = 2000,
n = 1,
streaming=False,
temperature=0)
chatlllm = getModel(
models=cfg["models"],
-max_tokens=4000,
+max_output_tokens=4000,
n = 1,
streaming=True,
temperature=0.7)
2 changes: 1 addition & 1 deletion app/frontend/package.json
@@ -1,7 +1,7 @@
{
"name": "mucgpt",
"private": true,
"version": "1.1.3",
"version": "1.1.4",
"type": "module",
"engines": {
"node": ">=16.0.0"
2 changes: 1 addition & 1 deletion app/frontend/src/api/api.ts
@@ -14,7 +14,7 @@ export async function chatApi(options: ChatRequest): Promise<Response> {
temperature: options.temperature,
language: options.language,
system_message: options.system_message,
-max_tokens: options.max_tokens,
+max_output_tokens: options.max_output_tokens,
model: options.model
})
});
5 changes: 3 additions & 2 deletions app/frontend/src/api/models.ts
@@ -18,7 +18,7 @@ export type ChatRequest = {
history: ChatTurn[];
temperature?: number;
language?: string;
-max_tokens?: number;
+max_output_tokens?: number;
system_message?: string;
shouldStream?: boolean;
model?: string;
@@ -50,7 +50,8 @@ export interface Frontend {
}

export interface Model {
-max_tokens: number;
+max_output_tokens: number;
+max_input_tokens: number;
model_name: string;
description: string;
}
app/frontend/src/components/ChatsettingsDrawer/ChatsettingsDrawer.tsx
@@ -20,14 +20,14 @@ import { LLMContext } from "../LLMSelector/LLMContextProvider";
interface Props {
temperature: number;
setTemperature: (temp: number, id: number) => void;
-max_tokens: number;
+max_output_tokens: number;
setMaxTokens: (maxTokens: number, id: number) => void;
systemPrompt: string;
setSystemPrompt: (systemPrompt: string, id: number) => void;
current_id: number;
}

-export const ChatsettingsDrawer = ({ temperature, setTemperature, max_tokens, setMaxTokens, systemPrompt, setSystemPrompt, current_id }: Props) => {
+export const ChatsettingsDrawer = ({ temperature, setTemperature, max_output_tokens, setMaxTokens, systemPrompt, setSystemPrompt, current_id }: Props) => {
const [isOpen, setIsOpen] = useState<boolean>(false);
const { t, i18n } = useTranslation();
const { LLM } = useContext(LLMContext)
@@ -41,11 +41,10 @@ export const ChatsettingsDrawer = ({ temperature, setTemperature, max_tokens, se
const max_tokensID = useId("input-max_tokens");

const min_max_tokens = 10;
-const max_max_tokens = LLM.max_tokens;
+const max_max_tokens = LLM.max_output_tokens;
const min_temp = 0;
const max_temp = 1;


const isEmptySystemPrompt = systemPrompt.trim() === "";

const onTemperatureChange: SliderProps["onChange"] = (_, data) =>
@@ -144,12 +143,12 @@ export const ChatsettingsDrawer = ({ temperature, setTemperature, max_tokens, se
defaultValue={20}
onChange={onMaxtokensChange}
aria-valuetext={t('components.chattsettingsdrawer.max_lenght') + ` ist ${max_tokensID}`}
-value={max_tokens}
+value={max_output_tokens}
aria-labelledby={max_tokens_headerID}
id={max_tokensID} />
<br></br>
<Label htmlFor={max_tokensID} aria-hidden>
-{max_tokens} Tokens
+{max_output_tokens} Tokens
</Label>
</div>
</div>
app/frontend/src/components/LLMSelector/LLMContextProvider.tsx
@@ -8,10 +8,10 @@ interface ILLMProvider {
}

export const DEFAULTLLM = "gpt-4o-mini";
-export const LLMContext = React.createContext<ILLMProvider>({ LLM: { model_name: DEFAULTLLM, max_tokens: 0, description: "" }, setLLM: () => { } });
+export const LLMContext = React.createContext<ILLMProvider>({ LLM: { model_name: DEFAULTLLM, max_output_tokens: 0, max_input_tokens: 0, description: "" }, setLLM: () => { } });

export const LLMContextProvider = (props: React.PropsWithChildren<{}>) => {
-const [LLM, setLLM] = useState<Model>({ model_name: DEFAULTLLM, max_tokens: 0, description: "" });
+const [LLM, setLLM] = useState<Model>({ model_name: DEFAULTLLM, max_output_tokens: 0, max_input_tokens: 0, description: "" });

return (
<LLMContext.Provider value={{ LLM, setLLM }}>
app/frontend/src/components/QuestionInput/QuestionInput.tsx
@@ -21,7 +21,7 @@ interface Props {
export const QuestionInput = ({ onSend, disabled, placeholder, clearOnSend, tokens_used, token_limit_tracking = true, question, setQuestion }: Props) => {
const { t, i18n } = useTranslation();
const { LLM } = useContext(LLMContext)
-const wordCount = LLM.max_tokens;
+const wordCount = LLM.max_input_tokens;
const getDescription = () => {
let actual = countWords(question) + tokens_used;
let text;
2 changes: 1 addition & 1 deletion app/frontend/src/components/SumInput/SumInput.tsx
@@ -24,7 +24,7 @@ export const SumInput = ({ onSend, disabled, placeholder, clearOnSend, tokens_us
const [dragging, setDragging] = useState(false);
const [file, setFile] = useState<File | undefined>(undefined);
const { LLM } = useContext(LLMContext)
-const wordCount = LLM.max_tokens;
+const wordCount = LLM.max_input_tokens;
const getDescription = () => {
let actual = countWords(question) + tokens_used;
let text;