From 8cfadd38997c9b3f0c1778bab87ca40eae2d07fe Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 3 Jan 2025 13:13:17 +0700
Subject: [PATCH 1/5] fix: add ctx_shift parameter (#357)

* fix: add ctx_shift parameter

* chore: readme

---------

Co-authored-by: vansangpfiev
---
 README.md           |  1 +
 src/llama_engine.cc | 25 ++++++++++---------------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index ce67774..284ce52 100644
--- a/README.md
+++ b/README.md
@@ -148,3 +148,4 @@ Table of parameters
 |`flash_attn` | Boolean| To enable Flash Attention, default is true|
 |`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
 |`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 5560645..762d7e7 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -270,24 +270,18 @@ std::string CreateReturnJson(const std::string& id, const std::string& model,
 }
 
 const std::vector<ggml_type> kv_cache_types = {
-    GGML_TYPE_F32,
-    GGML_TYPE_F16,
-    GGML_TYPE_BF16,
-    GGML_TYPE_Q8_0,
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
-    GGML_TYPE_IQ4_NL,
-    GGML_TYPE_Q5_0,
-    GGML_TYPE_Q5_1,
+    GGML_TYPE_F32,    GGML_TYPE_F16,  GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,   GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
 };
 
-ggml_type kv_cache_type_from_str(const std::string & s) {
-    for (const auto & type : kv_cache_types) {
-        if (ggml_type_name(type) == s) {
-            return type;
-        }
+ggml_type kv_cache_type_from_str(const std::string& s) {
+  for (const auto& type : kv_cache_types) {
+    if (ggml_type_name(type) == s) {
+      return type;
     }
-    throw std::runtime_error("Unsupported cache type: " + s);
+  }
+  throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 }  // namespace
@@ -611,6 +605,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     }
   }
 
+  params.ctx_shift = json_body->get("ctx_shift", true).asBool();
   params.n_gpu_layers =
       json_body->get("ngl", 300)
           .asInt();  // change from 100 -> 300 since llama 3.1 has 292 gpu layers

From 234143024c04aaf76545b41c8850ac695b5e8a77 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 3 Jan 2025 13:22:55 +0700
Subject: [PATCH 2/5] chore: down log level (#358)

Co-authored-by: vansangpfiev
---
 src/llama_engine.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index 762d7e7..b967b71 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -287,7 +287,7 @@ ggml_type kv_cache_type_from_str(const std::string& s) {
 }  // namespace
 
 void LlamaEngine::Load(EngineLoadOption opts) {
-  LOG_INFO << "Loading engine..";
+  LOG_DEBUG << "Loading engine..";
 
   LOG_DEBUG << "Is custom engine path: " << opts.is_custom_engine_path;
   LOG_DEBUG << "Engine path: " << opts.engine_path.string();

From 44412ee83a7d017353db41e0baeda03f4226235f Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:12:00 +0700
Subject: [PATCH 3/5] Update submodule to latest release b4406 (#356)

Co-authored-by: github-actions[bot]
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 0827b2c..0da5d86 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0827b2c1da299805288abbd556d869318f2b121e
+Subproject commit 0da5d860266c6928b8c9408efbd264ae59fedda6

From 1d36c597469f4e4f16eefe06b4ecaad3e2caa42d Mon Sep 17 00:00:00 2001
From: jan-service-account <136811300+jan-service-account@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:05:08 +0700
Subject: [PATCH 4/5] Update llama.cpp submodule to latest release b4453 (#365)

* Update submodule to latest release b4453

* fix: patch

---------

Co-authored-by: github-actions[bot]
Co-authored-by: vansangpfiev
---
 llama.cpp                                    |  2 +-
 patches/0001-Add-API-query-buffer-size.patch | 10 +++++-----
 src/llama_server_context.cc                  | 18 +++---------------
 src/llama_server_context.h                   |  2 ++
 4 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0da5d86..f8feb4b 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 0da5d860266c6928b8c9408efbd264ae59fedda6
+Subproject commit f8feb4b01af374ad2fce302fd5790529c615710b
diff --git a/patches/0001-Add-API-query-buffer-size.patch b/patches/0001-Add-API-query-buffer-size.patch
index 4185885..cfe7bd1 100644
--- a/patches/0001-Add-API-query-buffer-size.patch
+++ b/patches/0001-Add-API-query-buffer-size.patch
@@ -22,11 +22,11 @@ index 7cae1bbe..fdcbf949 100644
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
  LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
  LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
-diff --git a/src/llama.cpp b/src/llama.cpp
-index c466cd88..15f3102c 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -19561,6 +19561,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index 38a55fb2..80b3532e 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -602,6 +602,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
      return &ctx->model;
  }
  
diff --git a/src/llama_server_context.cc b/src/llama_server_context.cc
index 7118df4..f1a73da 100644
--- a/src/llama_server_context.cc
+++ b/src/llama_server_context.cc
@@ -177,14 +177,6 @@ bool IsLlava_1_6(const std::string& model) {
 }  // namespace
 
 LlamaServerContext::~LlamaServerContext() {
-  if (ctx) {
-    llama_free(ctx);
-    ctx = nullptr;
-  }
-  if (model) {
-    llama_free_model(model);
-    model = nullptr;
-  }
 }
 
 bool LlamaServerContext::LoadModel(const common_params& params_) {
@@ -212,9 +204,9 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
     }
   }
 
-  auto res = common_init_from_params(params);
-  model = res.model;
-  ctx = res.context;
+  llama_init = common_init_from_params(params);
+  model = llama_init.model.get();
+  ctx = llama_init.context.get();
   if (model == nullptr) {
     LOG_ERROR_LLAMA("llama.cpp unable to load model",
                     {{"model", params.model}});
@@ -232,8 +224,6 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
                 << n_embd_llm << "). Make sure that you use the "
                    "correct mmproj file.";
-      llama_free(ctx);
-      llama_free_model(model);
       return false;
     }
   }
 
@@ -382,8 +372,6 @@ void LlamaServerContext::ReleaseResources() {
     bgr_thread.join();
   }
 
-  llama_free(ctx);
-  llama_free_model(model);
   ctx = nullptr;
   model = nullptr;
   LOG_INFO << "Released llama_server_context resources";
diff --git a/src/llama_server_context.h b/src/llama_server_context.h
index 3dd512f..0ae63ac 100644
--- a/src/llama_server_context.h
+++ b/src/llama_server_context.h
@@ -106,6 +106,8 @@ static T json_value(const json& body, const std::string& key,
 }
 
 struct LlamaServerContext {
+  common_init_result llama_init;
+
   llama_model* model = nullptr;
   llama_context* ctx = nullptr;
 

From bc50ff7d40841f834ded836538739095db4b1639 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 10 Jan 2025 17:01:00 +0000
Subject: [PATCH 5/5] Update submodule to latest release b4458

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index f8feb4b..c3f9d25 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit f8feb4b01af374ad2fce302fd5790529c615710b
+Subproject commit c3f9d25706ac84297067aeaa662c1f1af42ed443
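
Note on PATCH 4/5: the manual llama_free()/llama_free_model() calls are dropped because, as of the llama.cpp b4453 common library, common_init_from_params() returns a common_init_result whose model and context members are smart pointers, so LlamaServerContext keeps that result and only borrows raw pointers from it. The sketch below illustrates this ownership pattern under that assumption; ServerContextSketch is an illustrative stand-in for the real LlamaServerContext, not part of the patches.

```cpp
// Minimal sketch of the ownership pattern adopted in PATCH 4/5 (assumes the
// llama.cpp common library from release b4453+, where common_init_result
// holds smart pointers that release the context and model automatically).
#include "common.h"

struct ServerContextSketch {       // illustrative stand-in for LlamaServerContext
  common_init_result llama_init;   // owns the model and context (RAII)
  llama_model* model = nullptr;    // non-owning view
  llama_context* ctx = nullptr;    // non-owning view

  // Params are taken by value because common_init_from_params() takes a
  // non-const reference and may adjust them.
  bool LoadModel(common_params params) {
    llama_init = common_init_from_params(params);  // ownership stays in llama_init
    model = llama_init.model.get();
    ctx = llama_init.context.get();
    return model != nullptr && ctx != nullptr;
  }

  void ReleaseResources() {
    // Clearing the raw views and resetting llama_init is enough; the smart
    // pointers free the context and model, so no explicit llama_free() /
    // llama_free_model() calls are needed.
    ctx = nullptr;
    model = nullptr;
    llama_init = common_init_result{};
  }
};
```

This is why the destructor and ReleaseResources() in the patch simply null the raw pointers instead of freeing them.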