Merge branch 'main' into feat/use-llama-cpp-server
vansangpfiev authored Jan 13, 2025
2 parents ba7e5af + fde2c39 · commit 3476fba
Showing 6 changed files with 13 additions and 21 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -148,3 +148,4 @@ Table of parameters
|`flash_attn` | Boolean| To enable Flash Attention, default is true|
|`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
|`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|
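The new flag is read from the JSON body of the model-load request (see the `src/llama_engine.cc` change below). As a rough illustration only, a request enabling it might look like the sketch below; apart from `ctx_shift` and `ngl`, which appear in this commit, and the other keys listed in the README table above, the field names and values (notably the model path key) are placeholders, not something this commit defines:

```jsonc
{
  "model_path": "/models/example-model.gguf", // placeholder key and path, not from this commit
  "ngl": 300,          // GPU layers; default is 300 per llama_engine.cc
  "flash_attn": true,  // defaults as listed in the README table
  "cache_type": "f16",
  "use_mmap": true,
  "ctx_shift": true    // new parameter in this commit; default is true
}
```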
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 179 files
10 changes: 5 additions & 5 deletions patches/0001-Add-API-query-buffer-size.patch
@@ -22,11 +22,11 @@ index 7cae1bbe..fdcbf949 100644
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
-diff --git a/src/llama.cpp b/src/llama.cpp
-index c466cd88..15f3102c 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -19561,6 +19561,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index 38a55fb2..80b3532e 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -602,6 +602,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}

1 change: 1 addition & 0 deletions src/llama_engine.cc
@@ -712,6 +712,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
}
}

+params.ctx_shift = json_body->get("ctx_shift", true).asBool();
params.n_gpu_layers =
json_body->get("ngl", 300)
.asInt(); // change from 100 -> 300 since llama 3.1 has 292 gpu layers
18 changes: 3 additions & 15 deletions src/llama_server_context.cc
@@ -177,14 +177,6 @@ bool IsLlava_1_6(const std::string& model) {
} // namespace

LlamaServerContext::~LlamaServerContext() {
-if (ctx) {
-llama_free(ctx);
-ctx = nullptr;
-}
-if (model) {
-llama_free_model(model);
-model = nullptr;
-}
}

bool LlamaServerContext::LoadModel(const common_params& params_) {
@@ -212,9 +204,9 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
}
}

-auto res = common_init_from_params(params);
-model = res.model;
-ctx = res.context;
+llama_init = common_init_from_params(params);
+model = llama_init.model.get();
+ctx = llama_init.context.get();
if (model == nullptr) {
LOG_ERROR_LLAMA("llama.cpp unable to load model",
{{"model", params.model}});
@@ -232,8 +224,6 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
<< n_embd_llm
<< "). Make sure that you use the "
"correct mmproj file.";
-llama_free(ctx);
-llama_free_model(model);
return false;
}
}
@@ -382,8 +372,6 @@ void LlamaServerContext::ReleaseResources() {
bgr_thread.join();
}

-llama_free(ctx);
-llama_free_model(model);
ctx = nullptr;
model = nullptr;
LOG_INFO << "Released llama_server_context resources";
2 changes: 2 additions & 0 deletions src/llama_server_context.h
@@ -106,6 +106,8 @@ static T json_value(const json& body, const std::string& key,
}

struct LlamaServerContext {
+common_init_result llama_init;
+
llama_model* model = nullptr;
llama_context* ctx = nullptr;

