Merge branch 'main' into feat/use-llama-cpp-server
vansangpfiev authored Jan 13, 2025
2 parents ba7e5af + fde2c39 · commit 3476fba
Showing 6 changed files with 13 additions and 21 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -148,3 +148,4 @@ Table of parameters
|`flash_attn` | Boolean| To enable Flash Attention, default is true|
|`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
|`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|
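The new flag is read from the JSON body of the model-load request (see the `src/llama_engine.cc` change below). As a rough illustration only, a request enabling it might look like the sketch below; apart from `ctx_shift` and `ngl`, which appear in this commit, and the other keys listed in the README table above, the field names and values (notably the model path key) are placeholders, not something this commit defines:

```jsonc
{
  "model_path": "/models/example-model.gguf", // placeholder key and path, not from this commit
  "ngl": 300,          // GPU layers; default is 300 per llama_engine.cc
  "flash_attn": true,  // defaults as listed in the README table
  "cache_type": "f16",
  "use_mmap": true,
  "ctx_shift": true    // new parameter in this commit; default is true
}
```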
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 179 files
10 changes: 5 additions & 5 deletions patches/0001-Add-API-query-buffer-size.patch
@@ -22,11 +22,11 @@ index 7cae1bbe..fdcbf949 100644
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
-diff --git a/src/llama.cpp b/src/llama.cpp
-index c466cd88..15f3102c 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -19561,6 +19561,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index 38a55fb2..80b3532e 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -602,6 +602,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}

1 change: 1 addition & 0 deletions src/llama_engine.cc
@@ -712,6 +712,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
}
}

+params.ctx_shift = json_body->get("ctx_shift", true).asBool();
params.n_gpu_layers =
json_body->get("ngl", 300)
.asInt(); // change from 100 -> 300 since llama 3.1 has 292 gpu layers
18 changes: 3 additions & 15 deletions src/llama_server_context.cc
@@ -177,14 +177,6 @@ bool IsLlava_1_6(const std::string& model) {
} // namespace

LlamaServerContext::~LlamaServerContext() {
-if (ctx) {
-llama_free(ctx);
-ctx = nullptr;
-}
-if (model) {
-llama_free_model(model);
-model = nullptr;
-}
}

bool LlamaServerContext::LoadModel(const common_params& params_) {
@@ -212,9 +204,9 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
}
}

-auto res = common_init_from_params(params);
-model = res.model;
-ctx = res.context;
+llama_init = common_init_from_params(params);
+model = llama_init.model.get();
+ctx = llama_init.context.get();
if (model == nullptr) {
LOG_ERROR_LLAMA("llama.cpp unable to load model",
{{"model", params.model}});
@@ -232,8 +224,6 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
<< n_embd_llm
<< "). Make sure that you use the "
"correct mmproj file.";
-llama_free(ctx);
-llama_free_model(model);
return false;
}
}
@@ -382,8 +372,6 @@ void LlamaServerContext::ReleaseResources() {
bgr_thread.join();
}

-llama_free(ctx);
-llama_free_model(model);
ctx = nullptr;
model = nullptr;
LOG_INFO << "Released llama_server_context resources";
2 changes: 2 additions & 0 deletions src/llama_server_context.h
@@ -106,6 +106,8 @@ static T json_value(const json& body, const std::string& key,
}

struct LlamaServerContext {
+common_init_result llama_init;
+
llama_model* model = nullptr;
llama_context* ctx = nullptr;

