From 8cfadd38997c9b3f0c1778bab87ca40eae2d07fe Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 3 Jan 2025 13:13:17 +0700
Subject: [PATCH 1/5] fix: add ctx_shift parameter (#357)

* fix: add ctx_shift parameter

* chore: readme

---------

Co-authored-by: vansangpfiev
---
 README.md           |  1 +
 src/llama_engine.cc | 25 ++++++++++---------------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index ce67774..284ce52 100644
--- a/README.md
+++ b/README.md
@@ -148,3 +148,4 @@ Table of parameters
 |`flash_attn` | Boolean| To enable Flash Attention, default is true|
 |`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
 |`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 5560645..762d7e7 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -270,24 +270,18 @@ std::string CreateReturnJson(const std::string& id, const std::string& model,
 }
 
 const std::vector<ggml_type> kv_cache_types = {
-    GGML_TYPE_F32,
-    GGML_TYPE_F16,
-    GGML_TYPE_BF16,
-    GGML_TYPE_Q8_0,
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
-    GGML_TYPE_IQ4_NL,
-    GGML_TYPE_Q5_0,
-    GGML_TYPE_Q5_1,
+    GGML_TYPE_F32,    GGML_TYPE_F16,  GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,   GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
 };
 
-ggml_type kv_cache_type_from_str(const std::string & s) {
-    for (const auto & type : kv_cache_types) {
-        if (ggml_type_name(type) == s) {
-            return type;
-        }
+ggml_type kv_cache_type_from_str(const std::string& s) {
+  for (const auto& type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
     }
-    throw std::runtime_error("Unsupported cache type: " + s);
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 }  // namespace
@@ -611,6 +605,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     }
   }
 
+  params.ctx_shift = json_body->get("ctx_shift", true).asBool();
   params.n_gpu_layers =
       json_body->get("ngl", 300)
           .asInt();  // change from 100 -> 300 since llama 3.1 has 292 gpu layers

From 234143024c04aaf76545b41c8850ac695b5e8a77 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 3 Jan 2025 13:22:55 +0700
Subject: [PATCH 2/5] chore: down log level (#358)

Co-authored-by: vansangpfiev
---
 src/llama_engine.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 762d7e7..b967b71 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -287,7 +287,7 @@ ggml_type kv_cache_type_from_str(const std::string& s) {
 }  // namespace
 
 void LlamaEngine::Load(EngineLoadOption opts) {
-  LOG_INFO << "Loading engine..";
+  LOG_DEBUG << "Loading engine..";
 
   LOG_DEBUG << "Is custom engine path: " << opts.is_custom_engine_path;
   LOG_DEBUG << "Engine path: " << opts.engine_path.string();

From 44412ee83a7d017353db41e0baeda03f4226235f Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:12:00 +0700
Subject: [PATCH 3/5] Update submodule to latest release b4406 (#356)

Co-authored-by: github-actions[bot]
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 0827b2c..0da5d86 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0827b2c1da299805288abbd556d869318f2b121e
+Subproject commit 0da5d860266c6928b8c9408efbd264ae59fedda6

From 1d36c597469f4e4f16eefe06b4ecaad3e2caa42d Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:05:08 +0700
Subject: [PATCH 4/5] Update llama.cpp submodule to latest release b4453 (#365)

* Update submodule to latest release b4453

* fix: patch

---------

Co-authored-by: github-actions[bot]
Co-authored-by: vansangpfiev
---
 llama.cpp                                    |  2 +-
 patches/0001-Add-API-query-buffer-size.patch | 10 +++++-----
 src/llama_server_context.cc                  | 18 +++---------------
 src/llama_server_context.h                   |  2 ++
 4 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0da5d86..f8feb4b 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0da5d860266c6928b8c9408efbd264ae59fedda6
+Subproject commit f8feb4b01af374ad2fce302fd5790529c615710b
diff --git a/patches/0001-Add-API-query-buffer-size.patch b/patches/0001-Add-API-query-buffer-size.patch
index 4185885..cfe7bd1 100644
--- a/patches/0001-Add-API-query-buffer-size.patch
+++ b/patches/0001-Add-API-query-buffer-size.patch
@@ -22,11 +22,11 @@ index 7cae1bbe..fdcbf949 100644
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
  LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
  LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
-diff --git a/src/llama.cpp b/src/llama.cpp
-index c466cd88..15f3102c 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -19561,6 +19561,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index 38a55fb2..80b3532e 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -602,6 +602,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
      return &ctx->model;
  }
  
diff --git a/src/llama_server_context.cc b/src/llama_server_context.cc
index 7118df4..f1a73da 100644
--- a/src/llama_server_context.cc
+++ b/src/llama_server_context.cc
@@ -177,14 +177,6 @@ bool IsLlava_1_6(const std::string& model) {
 }  // namespace
 
 LlamaServerContext::~LlamaServerContext() {
-  if (ctx) {
-    llama_free(ctx);
-    ctx = nullptr;
-  }
-  if (model) {
-    llama_free_model(model);
-    model = nullptr;
-  }
 }
 
 bool LlamaServerContext::LoadModel(const common_params& params_) {
@@ -212,9 +204,9 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
     }
   }
 
-  auto res = common_init_from_params(params);
-  model = res.model;
-  ctx = res.context;
+  llama_init = common_init_from_params(params);
+  model = llama_init.model.get();
+  ctx = llama_init.context.get();
   if (model == nullptr) {
     LOG_ERROR_LLAMA("llama.cpp unable to load model",
                     {{"model", params.model}});
@@ -232,8 +224,6 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
                 << n_embd_llm << "). Make sure that you use the "
                    "correct mmproj file.";
-      llama_free(ctx);
-      llama_free_model(model);
       return false;
     }
   }
 
@@ -382,8 +372,6 @@ void LlamaServerContext::ReleaseResources() {
     bgr_thread.join();
   }
 
-  llama_free(ctx);
-  llama_free_model(model);
   ctx = nullptr;
   model = nullptr;
   LOG_INFO << "Released llama_server_context resources";
diff --git a/src/llama_server_context.h b/src/llama_server_context.h
index 3dd512f..0ae63ac 100644
--- a/src/llama_server_context.h
+++ b/src/llama_server_context.h
@@ -106,6 +106,8 @@ static T json_value(const json& body, const std::string& key,
 }
 
 struct LlamaServerContext {
+  common_init_result llama_init;
+
   llama_model* model = nullptr;
   llama_context* ctx = nullptr;
 

From bc50ff7d40841f834ded836538739095db4b1639 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 10 Jan 2025 17:01:00 +0000
Subject: [PATCH 5/5] Update submodule to latest release b4458

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index f8feb4b..c3f9d25 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit f8feb4b01af374ad2fce302fd5790529c615710b
+Subproject commit c3f9d25706ac84297067aeaa662c1f1af42ed443
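
Note on PATCH 4/5: the manual llama_free()/llama_free_model() calls are dropped because, as of the llama.cpp b4453 common library, common_init_from_params() returns a common_init_result whose model and context members are smart pointers, so LlamaServerContext keeps that result and only borrows raw pointers from it. The sketch below illustrates this ownership pattern under that assumption; ServerContextSketch is an illustrative stand-in for the real LlamaServerContext, not part of the patches.

```cpp
// Minimal sketch of the ownership pattern adopted in PATCH 4/5 (assumes the
// llama.cpp common library from release b4453+, where common_init_result
// holds smart pointers that release the context and model automatically).
#include "common.h"

struct ServerContextSketch {       // illustrative stand-in for LlamaServerContext
  common_init_result llama_init;   // owns the model and context (RAII)
  llama_model* model = nullptr;    // non-owning view
  llama_context* ctx = nullptr;    // non-owning view

  // Params are taken by value because common_init_from_params() takes a
  // non-const reference and may adjust them.
  bool LoadModel(common_params params) {
    llama_init = common_init_from_params(params);  // ownership stays in llama_init
    model = llama_init.model.get();
    ctx = llama_init.context.get();
    return model != nullptr && ctx != nullptr;
  }

  void ReleaseResources() {
    // Clearing the raw views and resetting llama_init is enough; the smart
    // pointers free the context and model, so no explicit llama_free() /
    // llama_free_model() calls are needed.
    ctx = nullptr;
    model = nullptr;
    llama_init = common_init_result{};
  }
};
```

This is why the destructor and ReleaseResources() in the patch simply null the raw pointers instead of freeing them.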