From df1d3606f59886e3befd78f6870c3343f49d4b26 Mon Sep 17 00:00:00 2001
From: b4rtaz
Date: Mon, 27 May 2024 23:12:39 +0200
Subject: [PATCH] feat: nSlices <= nKvHeads limit.

---
 README.md           | 3 ++-
 src/transformer.cpp | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d4488cf..1a8e4da 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,8 @@ Python and GCC required. Download this repository and run:
 - [API Server](./src/apps/dllama-api/README.md)
 
 **Known limitations:**
-* You can run Distributed Llama only on 1, 2, 4... 2^n devices.
+* You can run Distributed Llama only on 1, 2, 4... 2^n nodes.
+* The maximum number of nodes is equal to the number of KV heads in the model [#70](https://github.com/b4rtaz/distributed-llama/issues/70).
 * Optimized for (weights format × buffer format):
   * ARM CPUs
     * ✅ F32 × F32
diff --git a/src/transformer.cpp b/src/transformer.cpp
index 2f95c53..9d17d15 100644
--- a/src/transformer.cpp
+++ b/src/transformer.cpp
@@ -251,6 +251,10 @@ TransformerSpec Transformer::loadSpecFromFile(const char* path, const unsigned i
     spec.bufferFloatType = bufferFloatType;
     spec.nSlices = nSlices;
 
+    if (spec.nSlices > spec.nKvHeads) {
+        // TODO: https://github.com/b4rtaz/distributed-llama/issues/70
+        throw std::runtime_error("This version does not support more nodes than the number of KV heads in the model.");
+    }
     if (spec.archType == LLAMA) {
         printf("💡 arch: llama\n");
     } else if (spec.archType == GROK1) {