From 1b6024e170c1ec5733c69cde1ffcd46b013c7338 Mon Sep 17 00:00:00 2001 From: Bart Tadych Date: Wed, 12 Jun 2024 23:22:48 +0200 Subject: [PATCH] feat: accelerator structure. (#90) --- .github/workflows/main.yml | 12 +- Makefile | 26 +- examples/macbeth.sh | 2 +- src/app.cpp | 7 +- src/app.hpp | 8 +- src/apps/dllama-api/dllama-api.cpp | 2 +- src/apps/dllama/dllama.cpp | 7 +- src/commands-test.cpp | 85 +++++ src/commands.cpp | 229 +++++++++++++ src/commands.hpp | 144 ++++++++ src/funcs.cpp | 3 +- src/funcs.hpp | 9 - src/grok1-tasks-test.cpp | 7 +- src/grok1-tasks.cpp | 15 +- src/llama2-tasks-test.cpp | 7 +- src/llama2-tasks.cpp | 40 ++- src/tasks.cpp | 10 +- src/transformer-test.cpp | 85 ----- src/transformer.cpp | 524 ++++++++++------------------- src/transformer.hpp | 140 ++------ src/utils.hpp | 10 +- 21 files changed, 754 insertions(+), 618 deletions(-) create mode 100644 src/commands-test.cpp create mode 100644 src/commands.cpp create mode 100644 src/commands.hpp delete mode 100644 src/transformer-test.cpp diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a8b13f6..cc4b35f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,7 +31,7 @@ jobs: make funcs-test make quants-test make tokenizer-test - make transformer-test + make commands-test make llama2-tasks-test make grok1-tasks-test - name: funcs-test @@ -40,8 +40,8 @@ jobs: run: ./quants-test - name: tokenizer-test run: ./tokenizer-test - - name: transformer-test - run: ./transformer-test + - name: commands-test + run: ./commands-test - name: llama2-tasks-test run: ./llama2-tasks-test - name: grok1-tasks-test @@ -64,7 +64,7 @@ jobs: make funcs-test make quants-test make tokenizer-test - make transformer-test + make commands-test make llama2-tasks-test make grok1-tasks-test - name: funcs-test @@ -73,8 +73,8 @@ jobs: run: ./quants-test - name: tokenizer-test run: ./tokenizer-test - - name: transformer-test - run: ./transformer-test + - name: commands-test + run: ./commands-test - name: llama2-tasks-test run: ./llama2-tasks-test - name: grok1-tasks-test diff --git a/Makefile b/Makefile index d8104f7..2141cd1 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,8 @@ funcs: src/funcs.cpp $(CXX) $(CXXFLAGS) -c src/funcs.cpp -o funcs.o funcs-test: src/funcs-test.cpp funcs $(CXX) $(CXXFLAGS) src/funcs-test.cpp -o funcs-test funcs.o +commands: src/commands.cpp + $(CXX) $(CXXFLAGS) -c src/commands.cpp -o commands.o socket: src/socket.cpp $(CXX) $(CXXFLAGS) -c src/socket.cpp -o socket.o transformer: src/utils.cpp @@ -33,20 +35,20 @@ tokenizer: src/tokenizer.cpp app: src/app.cpp $(CXX) $(CXXFLAGS) -c src/app.cpp -o app.o -dllama: src/apps/dllama/dllama.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app - $(CXX) $(CXXFLAGS) src/apps/dllama/dllama.cpp -o dllama utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o $(LIBS) -dllama-api: src/apps/dllama-api/dllama-api.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app - $(CXX) $(CXXFLAGS) src/apps/dllama-api/dllama-api.cpp -o dllama-api utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o $(LIBS) +dllama: src/apps/dllama/dllama.cpp utils quants funcs commands socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app + $(CXX) $(CXXFLAGS) src/apps/dllama/dllama.cpp -o dllama utils.o quants.o funcs.o commands.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o $(LIBS) +dllama-api: src/apps/dllama-api/dllama-api.cpp utils quants funcs commands socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app + $(CXX) $(CXXFLAGS) src/apps/dllama-api/dllama-api.cpp -o dllama-api utils.o quants.o funcs.o commands.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o $(LIBS) funcs-test: src/funcs-test.cpp funcs utils quants $(CXX) $(CXXFLAGS) src/funcs-test.cpp -o funcs-test funcs.o utils.o quants.o $(LIBS) quants-test: src/quants.cpp utils quants $(CXX) $(CXXFLAGS) src/quants-test.cpp -o quants-test utils.o quants.o $(LIBS) -tokenizer-test: src/tokenizer-test.cpp tokenizer funcs utils quants - $(CXX) $(CXXFLAGS) src/tokenizer-test.cpp -o tokenizer-test tokenizer.o funcs.o utils.o quants.o $(LIBS) -transformer-test: src/transformer-test.cpp funcs utils quants transformer socket - $(CXX) $(CXXFLAGS) src/transformer-test.cpp -o transformer-test funcs.o utils.o quants.o transformer.o socket.o $(LIBS) -llama2-tasks-test: src/llama2-tasks-test.cpp utils quants funcs socket transformer tasks llama2-tasks tokenizer - $(CXX) $(CXXFLAGS) src/llama2-tasks-test.cpp -o llama2-tasks-test utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o tokenizer.o $(LIBS) -grok1-tasks-test: src/grok1-tasks-test.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks tokenizer - $(CXX) $(CXXFLAGS) src/grok1-tasks-test.cpp -o grok1-tasks-test utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o tokenizer.o $(LIBS) \ No newline at end of file +tokenizer-test: src/tokenizer-test.cpp tokenizer funcs commands utils quants + $(CXX) $(CXXFLAGS) src/tokenizer-test.cpp -o tokenizer-test tokenizer.o funcs.o commands.o utils.o quants.o $(LIBS) +commands-test: src/commands-test.cpp funcs commands utils quants transformer socket + $(CXX) $(CXXFLAGS) src/commands-test.cpp -o commands-test funcs.o commands.o utils.o quants.o transformer.o socket.o $(LIBS) +llama2-tasks-test: src/llama2-tasks-test.cpp utils quants funcs commands socket transformer tasks llama2-tasks tokenizer + $(CXX) $(CXXFLAGS) src/llama2-tasks-test.cpp -o llama2-tasks-test utils.o quants.o funcs.o commands.o socket.o transformer.o tasks.o llama2-tasks.o tokenizer.o $(LIBS) +grok1-tasks-test: src/grok1-tasks-test.cpp utils quants funcs commands socket transformer tasks llama2-tasks grok1-tasks tokenizer + $(CXX) $(CXXFLAGS) src/grok1-tasks-test.cpp -o grok1-tasks-test utils.o quants.o funcs.o commands.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o tokenizer.o $(LIBS) \ No newline at end of file diff --git a/examples/macbeth.sh b/examples/macbeth.sh index d3459d1..e549eb1 100644 --- a/examples/macbeth.sh +++ b/examples/macbeth.sh @@ -189,7 +189,7 @@ Macbeth. Thou seest the moon" echo "Generating, it can take a while..." -OUTPUT=$(( ./dllama generate --seed 12345 --temperature 0.9 --topp 0.9 --prompt "$PROMPT" --weights-float-type q40 --buffer-float-type f32 --nthreads 8 --steps 2048 --model converter/dllama_meta-llama-3-8b_q40.bin --tokenizer converter/dllama_meta-llama3-tokenizer.t ) 2>&1) +OUTPUT=$(( ./dllama generate --seed 12345 --temperature 0.9 --topp 0.9 --prompt "$PROMPT" --weights-float-type q40 --buffer-float-type f32 --nthreads 2 --steps 2048 --model models/llama3_8b_q40/dllama_model_llama3_8b_q40.m --tokenizer models/llama3_8b_q40/dllama_tokenizer_llama3_8b_q40.t --workers 127.0.0.1:9999 127.0.0.1:9998 127.0.0.1:9997 ) 2>&1) echo "$OUTPUT" diff --git a/src/app.cpp b/src/app.cpp index 517e325..60211c8 100644 --- a/src/app.cpp +++ b/src/app.cpp @@ -100,7 +100,7 @@ TransformerArch TransformerArchFactory::create(TransformerSpec* spec) { exit(EXIT_FAILURE); } -void App::run(AppArgs* args, void (*program)(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec)) { +void App::run(AppArgs* args, void (*program)(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec, AcceleratorContext* acc)) { if (args->modelPath == NULL) { throw std::runtime_error("Model is required"); } @@ -119,14 +119,15 @@ void App::run(AppArgs* args, void (*program)(Inference* inference, SocketPool* s args->steps = spec.seqLen; } - Transformer transformer = Transformer::loadRootFromFile(args->modelPath, &spec, socketPool); + AcceleratorContext acc(0, 1, NULL); + Transformer transformer = Transformer::loadRootFromFile(args->modelPath, &spec, socketPool, &acc); socketPool->setTurbo(true); Inference inference = Inference(&arch, args->nThreads, &transformer, socketPool); Sampler sampler(spec.vocabSize, args->temperature, args->topp, args->seed); - program(&inference, socketPool, &tokenizer, &sampler, args, &spec); + program(&inference, socketPool, &tokenizer, &sampler, args, &spec, &acc); delete socketPool; } diff --git a/src/app.hpp b/src/app.hpp index e7f0ddc..5073023 100644 --- a/src/app.hpp +++ b/src/app.hpp @@ -1,10 +1,10 @@ -#ifndef FUNCS_HPP -#define FUNCS_HPP +#ifndef APP_HPP +#define APP_HPP #include "quants.hpp" #include "transformer.hpp" #include "utils.hpp" -#include "socket.hpp" +#include "utils.hpp" #include "app.hpp" #include "transformer.hpp" #include "tasks.hpp" @@ -46,7 +46,7 @@ class TransformerArchFactory { class App { public: - static void run(AppArgs* args, void (*program)(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec)); + static void run(AppArgs* args, void (*program)(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec, AcceleratorContext* acc)); }; #endif diff --git a/src/apps/dllama-api/dllama-api.cpp b/src/apps/dllama-api/dllama-api.cpp index 002206f..dc71376 100644 --- a/src/apps/dllama-api/dllama-api.cpp +++ b/src/apps/dllama-api/dllama-api.cpp @@ -392,7 +392,7 @@ void handleModelsRequest(HttpRequest& request) { "] }"); } -void server(Inference* inference, SocketPool* socketPool, Tokenizer *tokenizer, Sampler *sampler, AppArgs* args, TransformerSpec* spec) { +void server(Inference* inference, SocketPool* socketPool, Tokenizer *tokenizer, Sampler *sampler, AppArgs* args, TransformerSpec* spec, AcceleratorContext* acc) { SocketServer* server = new SocketServer(args->port); TokenizerChatStops stops(tokenizer); diff --git a/src/apps/dllama/dllama.cpp b/src/apps/dllama/dllama.cpp index 69d7814..a1cc95f 100644 --- a/src/apps/dllama/dllama.cpp +++ b/src/apps/dllama/dllama.cpp @@ -14,7 +14,7 @@ #include "../../tokenizer.hpp" #include "../../app.hpp" -void generate(Inference* inference, SocketPool* socketPool, Tokenizer *tokenizer, Sampler *sampler, AppArgs* args, TransformerSpec* spec) { +void generate(Inference* inference, SocketPool* socketPool, Tokenizer *tokenizer, Sampler *sampler, AppArgs* args, TransformerSpec* spec, AcceleratorContext* acc) { if (args->prompt == NULL) throw std::runtime_error("Prompt is required"); @@ -193,7 +193,7 @@ class Chat { } }; -void chat(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec) { +void chat(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec, AcceleratorContext* acc) { TokenizerChatStops stops(tokenizer); ChatTemplate chatTemplate(tokenizer->chatTemplate, stops.stops[0]); EosDetector eosDetector(tokenizer->chatEosId, stops.nStops, stops.stops, stops.maxStopLength, stops.maxStopLength); @@ -210,7 +210,8 @@ void worker(AppArgs* args) { SocketServer server(args->port); Socket socket = server.accept(); TransformerSpec spec; - Transformer transformer = Transformer::loadSlice(&spec, &socket); + AcceleratorContext acc(0, 1, NULL); + Transformer transformer = Transformer::loadSlice(&spec, &socket, &acc); TransformerArch arch = TransformerArchFactory::create(&spec); Worker worker = Worker(&arch, args->nThreads, &transformer, &socket); diff --git a/src/commands-test.cpp b/src/commands-test.cpp new file mode 100644 index 0000000..5bc8348 --- /dev/null +++ b/src/commands-test.cpp @@ -0,0 +1,85 @@ +#include "commands.hpp" +#include +#include +#include + +void testRopeSlice(int arch, const int nSliceTests, const int nPosTests, const int nThreadTests) { + int dim = 4096; + int headSize = 128; + int nKvHeads = 8; + int seqLen = 2048; + int nHeads = dim / headSize; + int kvDim = (dim * nKvHeads) / nHeads; + int ropeTheta = 10000.0f; + + float* q = new float[dim]; + float* k = new float[kvDim]; + float* correctQ = new float[dim]; + float* correctK = new float[kvDim]; + + for (int pos = 0; pos < seqLen; pos += seqLen / nPosTests) { + for (int si = 0; si < nSliceTests; si++) { + int nSlices = pow(2, si); + + for (int nThreads = 1; nThreads <= nThreadTests; nThreads++) { + printf("pos=%d nSlices=%d threads=%d\n", pos, nSlices, nThreads); + + for (int j = 0; j < dim; j++) q[j] = 1.0; + for (int j = 0; j < kvDim; j++) k[j] = 1.0; + + for (slice_index_t sliceIndex = 0; sliceIndex < nSlices; sliceIndex++) { + RopeSlice slice(dim, kvDim, nKvHeads, nSlices, seqLen, headSize, ropeTheta, sliceIndex); + RopeCommand* rope; + if (arch == 1) { + rope = new LlamaRopeCommand(&slice); + } else if (arch == 2) { + rope = new FalconRopeCommand(&slice); + } + + for (int threadIndex = 0; threadIndex < nThreads; threadIndex++) { + rope->forward( + true, + &q[(sliceIndex * dim) / nSlices], + pos, nThreads, threadIndex); + rope->forward( + false, + &k[(sliceIndex * kvDim) / nSlices], + pos, nThreads, threadIndex); + } + + delete rope; + } + + if (si == 0 && nThreads == 1) { + memcpy(correctQ, q, dim * sizeof(float)); + memcpy(correctK, k, kvDim * sizeof(float)); + } else { + for (int j = 0; j < dim; j++) { + if (fabs(q[j] - correctQ[j]) > 1e-6) { + printf("q[%d] mismatch: %f != %f (arch=%d)\n", j, q[j], correctQ[j], arch); + exit(EXIT_FAILURE); + } + } + for (int j = 0; j < kvDim; j++) { + if (fabs(k[j] - correctK[j]) > 1e-6) { + printf("k[%d] mismatch: %f != %f (arch=%d)\n", j, k[j], correctK[j], arch); + exit(EXIT_FAILURE); + } + } + } + } + } + } + + delete[] q; + delete[] k; + delete[] correctQ; + delete[] correctK; + printf("✅ ropeSlice (arch=%d)\n", arch); +} + +int main() { + testRopeSlice(2, 4, 6, 3); + testRopeSlice(1, 6, 4, 3); + return 0; +} diff --git a/src/commands.cpp b/src/commands.cpp new file mode 100644 index 0000000..200b816 --- /dev/null +++ b/src/commands.cpp @@ -0,0 +1,229 @@ +#include +#include +#include +#include "utils.hpp" +#include "funcs.hpp" +#include "commands.hpp" + +RowMatmulSlice::RowMatmulSlice(FloatType type, int nSlices, int n, int d) { + assert(d % nSlices == 0); + + this->type = type; + this->nSlices = nSlices; + this->d0 = d / nSlices; + this->n = n; + this->bytes = getBatchBytes(type, this->n, d); + this->sliceBytes = getBatchBytes(type, this->n, this->d0); +} + +size_t RowMatmulSlice::splitWeights(slice_index_t sliceIndex, char* weights, char* weights0) { + int numbersPerBatch = getNumbersPerBatch(this->type); + int batchBytes = getBatchBytes(this->type, numbersPerBatch, 1); + + int n = this->n / numbersPerBatch; + size_t offset = this->d0 * sliceIndex * n * batchBytes; + size_t copiedBytes = 0; + + for (int d = 0; d < this->d0; d++) { + for (int j = 0; j < n; j++) { + long o = (d * n + j) * batchBytes; + + memcpy(weights0 + o, weights + offset + o, batchBytes); + copiedBytes += batchBytes; + } + } + return copiedBytes; +} + +unsigned int RowMatmulSlice::dOffset(slice_index_t sliceIndex) { + return this->d0 * sliceIndex; +} + +ColMatmulSlice::ColMatmulSlice(FloatType type, int nSlices, int n, int d) { + assert(n % nSlices == 0); + + this->type = type; + this->nSlices = nSlices; + this->n = n; + this->n0 = n / nSlices; + this->d = d; + this->bytes = getBatchBytes(type, n, d); + this->sliceBytes = getBatchBytes(type, this->n0, d); +} + +size_t ColMatmulSlice::splitWeights(slice_index_t sliceIndex, char* weights, char* weights0) { + int numbersPerBatch = getNumbersPerBatch(this->type); + int batchBytes = getBatchBytes(this->type, numbersPerBatch, 1); + assert(n0 % numbersPerBatch == 0); + + int n = this->n / numbersPerBatch; + int rowBytes = n * batchBytes; + int row0Bytes = (n0 / numbersPerBatch) * batchBytes; + int rowOffsetBytes = sliceIndex * row0Bytes; + + size_t copiedBytes = 0; + for (int d = 0; d < this->d; d++) { + memcpy(&weights0[row0Bytes * d], &weights[rowBytes * d + rowOffsetBytes], row0Bytes); + copiedBytes += row0Bytes; + } + return copiedBytes; +} + +RopeSlice::RopeSlice(unsigned int dim, unsigned int kvDim, unsigned int nKvHeads, unsigned int nSlices, unsigned int seqLen, unsigned int headSize, float ropeTheta, slice_index_t sliceIndex) { + assert(dim >= kvDim); + assert(dim % nSlices == 0); + assert(kvDim % nSlices == 0); + + qDim0 = dim / nSlices; + kvDim0 = kvDim / nSlices; + assert(qDim0 % 2 == 0); + assert(kvDim0 % 2 == 0); + kvDimStart = kvDim0 * sliceIndex; + qDimStart = qDim0 * sliceIndex; + qDimEnd = qDimStart + qDim0; + qShift = qDimStart - kvDimStart; + sliceDim = qDimEnd - kvDimStart; + this->kvDim = kvDim; + this->nKvHeads = nKvHeads; + this->seqLen = seqLen; + this->headSize = headSize; + this->ropeTheta = ropeTheta; + assert(sliceDim % 2 == 0); +} + +KvCacheSlice::KvCacheSlice(unsigned int kvDim, unsigned int seqLen, unsigned int nSlices) { + assert(kvDim % nSlices == 0); + kvDim0 = kvDim / nSlices; + keyCacheSize = seqLen * kvDim0 * sizeof(float); + valueCacheSize = seqLen * kvDim0 * sizeof(float); +} + +MultiHeadAttSlice::MultiHeadAttSlice(unsigned int nHeads, unsigned int seqLen, unsigned int nSlices, slice_index_t sliceIndex) { + assert(nHeads % nSlices == 0); + nHeads0 = nHeads / nSlices; + attSize = seqLen * nHeads0 * sizeof(float); +} + +AcceleratorContext::AcceleratorContext(unsigned int nominator, unsigned int denominator, Accelerator* accelerator) { + this->nominator = nominator; + this->denominator = denominator; + this->accelerator = accelerator; +} + +unsigned int AcceleratorContext::divCpu(unsigned int value) { + return value - divAcc(value); +} + +unsigned int AcceleratorContext::divAcc(unsigned int value) { + return (nominator * value) / denominator; +} + +MatmulCommand::MatmulCommand(const unsigned int n, const unsigned int d, const FloatType inputFloatType, const FloatType weightsFloatType, AcceleratorContext* acc) { + this->n = n; + this->d = d; + this->inputFloatType = inputFloatType; + this->weightsFloatType = weightsFloatType; + this->acc = acc; + this->accD = acc->divAcc(d); + this->accSize = getBatchBytes(weightsFloatType, n, this->accD); + this->cpuD = acc->divCpu(d); + this->cpuSize = getBatchBytes(weightsFloatType, n, this->cpuD); + this->cpuWeights = newBuffer(this->cpuSize); + + if (this->accD != 0) { + this->accMatmulIndex = acc->accelerator->allocateMatmul(weightsFloatType, n, this->accD); + } +}; + +MatmulCommand::~MatmulCommand() { + freeBuffer(cpuWeights); +} + +size_t MatmulCommand::loadWeights(const void* source) { + memcpy(cpuWeights, source, cpuSize); + if (this->accD != 0) { + acc->accelerator->loadMatmulWeights(this->accMatmulIndex, &((char*)source)[cpuSize]); + } + return cpuSize + accSize; +} + +void MatmulCommand::forward(const void* input, float* output, const unsigned int nThreads, const unsigned int threadIndex) { + if (this->accD != 0 && threadIndex == 0) { + acc->accelerator->beginForwardMatmul(this->accMatmulIndex, input); + } + matmul(weightsFloatType, inputFloatType, output, input, cpuWeights, n, cpuD, nThreads, threadIndex); + if (this->accD != 0 && threadIndex == nThreads - 1) { + acc->accelerator->endForwardMatmul(this->accMatmulIndex, &output[cpuD]); + } +} + +LlamaRopeCommand::LlamaRopeCommand(RopeSlice *slice) { + this->slice = slice; + + size_t cacheBytes = slice->seqLen * slice->sliceDim * sizeof(float); + cache = (float*)newBuffer(cacheBytes); + printf("🕒 ropeCache: %ld kB\n", cacheBytes / 1024); + + for (pos_t pos = 0; pos < slice->seqLen; pos++) { + for (unsigned int i = slice->kvDimStart; i < slice->qDimEnd; i += 2) { + const unsigned int headDim = i % slice->headSize; + const float freq = 1.0f / powf(slice->ropeTheta, headDim / (float)slice->headSize); + const float val = pos * freq; + const float fcr = cosf(val); + const float fci = sinf(val); + cache[pos * slice->sliceDim + (i - slice->kvDimStart)] = fcr; + cache[pos * slice->sliceDim + (i - slice->kvDimStart) + 1] = fci; + } + } +}; + +LlamaRopeCommand::~LlamaRopeCommand() { + freeBuffer(cache); +} + +void LlamaRopeCommand::forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex) { + const unsigned int dim0Half = (isQ ? slice->qDim0 : slice->kvDim0) / 2; + const unsigned int shift = isQ ? slice->qShift : 0; + SPLIT_RANGE_TO_THREADS(s, e, 0, dim0Half, nThreads, threadIndex); + const unsigned int iStart = s * 2; + const unsigned int iEnd = e * 2; + + for (unsigned int i = iStart; i < iEnd; i += 2) { + float fcr = cache[pos * slice->sliceDim + shift + i]; + float fci = cache[pos * slice->sliceDim + shift + i + 1]; + float v0 = qOrK[i]; + float v1 = qOrK[i + 1]; + qOrK[i] = v0 * fcr - v1 * fci; + qOrK[i + 1] = v0 * fci + v1 * fcr; + } +} + +FalconRopeCommand::FalconRopeCommand(RopeSlice *slice) { + this->slice = slice; +} + +FalconRopeCommand::~FalconRopeCommand() {} + +void FalconRopeCommand::forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex) { + // TODO: this implementation allows only a small number of slices (because it requires dim0 % headSize == 0). This could be improved. + unsigned int dimStart = isQ ? slice->qDimStart : slice->kvDimStart; + unsigned int dim0 = isQ ? slice->qDim0 : slice->kvDim0; + unsigned int headSize = isQ ? slice->headSize : slice->kvDim / slice->nKvHeads; + assert(dimStart % headSize == 0); + assert(dim0 % headSize == 0); + unsigned int nHeads0 = dim0 / headSize; + SPLIT_RANGE_TO_THREADS(h0s, h0e, 0, nHeads0, nThreads, threadIndex); + + for (unsigned int h = h0s; h < h0e; h++) { + for (unsigned int j = 0; j < headSize / 2; j++) { + float freq = 1.0f / powf(slice->ropeTheta, 2.0f * (float)j / (float)headSize); + float val = pos * freq; + float fcr = cosf(val); + float fci = sinf(val); + float q0 = qOrK[h * headSize + j]; + float q1 = qOrK[h * headSize + j + headSize / 2]; + qOrK[h * headSize + j] = q0 * fcr - q1 * fci; + qOrK[h * headSize + j + headSize / 2] = q0 * fci + q1 * fcr; + } + } +} \ No newline at end of file diff --git a/src/commands.hpp b/src/commands.hpp new file mode 100644 index 0000000..6122354 --- /dev/null +++ b/src/commands.hpp @@ -0,0 +1,144 @@ +#ifndef COMMANDS_HPP +#define COMMANDS_HPP + +#include +#include "quants.hpp" + +// RESPONSIBILITIES +// +// *Slice - calculates sizes, offsets, slice sizes etc. It is not responsible for memory allocation. It may help in the loading of data. +// *Command - allocates memory for weights, performs calculations. + +typedef unsigned short pos_t; +typedef uint8_t slice_index_t; + +class MatmulSlice { +public: + size_t bytes; + size_t sliceBytes; + virtual size_t splitWeights(slice_index_t sliceIndex, char* weights, char* weights0) = 0; +}; + +class RowMatmulSlice : public MatmulSlice { +public: + FloatType type; + int nSlices; + int n; + int d0; + + RowMatmulSlice(FloatType type, int nSlices, int n, int d); + size_t splitWeights(slice_index_t sliceIndex, char* weights, char* weights0); + unsigned int dOffset(slice_index_t sliceIndex); +}; + +class ColMatmulSlice : public MatmulSlice { +public: + FloatType type; + int nSlices; + int n; + int n0; + int d; + + ColMatmulSlice(FloatType type, int nSlices, int n, int d); + size_t splitWeights(slice_index_t sliceIndex, char* weights, char* weights0); +}; + +class RopeSlice { +public: + unsigned int qDim0; + unsigned int qDimStart; + unsigned int qDimEnd; + unsigned int qShift; + unsigned int kvDim; + unsigned int kvDim0; + unsigned int kvDimStart; + unsigned int sliceDim; + unsigned int seqLen; + unsigned int headSize; + unsigned int nKvHeads; + float ropeTheta; + RopeSlice(unsigned int dim, unsigned int kvDim, unsigned int nKvHeads, unsigned int nSlices, unsigned int seqLen, unsigned int headSize, float ropeTheta, slice_index_t sliceIndex); +}; + +class KvCacheSlice { +public: + unsigned int kvDim0; + size_t keyCacheSize; + size_t valueCacheSize; + KvCacheSlice(unsigned int kvDim, unsigned int seqLen, unsigned int nSlices); +}; + +class MultiHeadAttSlice { +public: + unsigned int nHeads0; + size_t attSize; + MultiHeadAttSlice(unsigned int nHeads, unsigned int seqLen, unsigned int nSlices, slice_index_t sliceIndex); +}; + +class Accelerator { +public: + virtual const unsigned int allocateMatmul(const FloatType floatType, const unsigned int n, const unsigned int d) = 0; + virtual void loadMatmulWeights(const unsigned int matmulIndex, const void* weights) = 0; + virtual void beginForwardMatmul(const unsigned int matmulIndex, const void* input) = 0; + virtual void endForwardMatmul(const unsigned int matmulIndex, float* output) = 0; + virtual void closeMatmul(const unsigned int matmulIndex) = 0; +}; + +class AcceleratorContext { +public: + // ratio + unsigned int nominator; + unsigned int denominator; + Accelerator* accelerator; + + AcceleratorContext(unsigned int nominator, unsigned int denominator, Accelerator* accelerator); + unsigned int divCpu(unsigned int value); + unsigned int divAcc(unsigned int value); +}; + +class MatmulCommand { +private: + FloatType inputFloatType; + FloatType weightsFloatType; + unsigned int n; + unsigned int d; + unsigned int cpuD; + unsigned int accD; + size_t cpuSize; + size_t accSize; + void* cpuWeights; + unsigned int accMatmulIndex; + AcceleratorContext* acc; +public: + MatmulCommand(const unsigned int n, const unsigned int d, const FloatType inputFloatType, const FloatType weightsFloatType, AcceleratorContext* acc); + ~MatmulCommand(); + size_t loadWeights(const void* source); + void forward(const void* input, float* output, const unsigned int nThreads, const unsigned int threadIndex); +}; + +class RopeCommand { +public: + virtual ~RopeCommand() {}; + virtual void forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex) = 0; +}; + +class LlamaRopeCommand : public RopeCommand { +private: + RopeSlice* slice; + float* cache; +public: + LlamaRopeCommand(RopeSlice *slice); + ~LlamaRopeCommand(); + void forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex); +}; + +class FalconRopeCommand : public RopeCommand { +private: + RopeSlice* slice; +public: + FalconRopeCommand(RopeSlice *slice); + ~FalconRopeCommand(); + void forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex); +}; + +#endif diff --git a/src/funcs.cpp b/src/funcs.cpp index 42f7881..0f57e7f 100644 --- a/src/funcs.cpp +++ b/src/funcs.cpp @@ -1,9 +1,10 @@ #include #include #include +#include #include "common/pthread.h" -#include "quants.hpp" #include "funcs.hpp" +#include "utils.hpp" #if defined(__ARM_NEON) #include diff --git a/src/funcs.hpp b/src/funcs.hpp index c858194..8d304b1 100644 --- a/src/funcs.hpp +++ b/src/funcs.hpp @@ -3,15 +3,6 @@ #include "quants.hpp" -#define SPLIT_RANGE_TO_THREADS(varStart, varEnd, rangeStart, rangeEnd, nThreads, threadIndex) \ - const unsigned int rangeLen = (rangeEnd - rangeStart); \ - const unsigned int rangeSlice = rangeLen / nThreads; \ - const unsigned int rangeRest = rangeLen % nThreads; \ - const unsigned int varStart = threadIndex * rangeSlice + (threadIndex < rangeRest ? threadIndex : rangeRest); \ - const unsigned int varEnd = varStart + rangeSlice + (threadIndex < rangeRest ? 1 : 0); - -#define DEBUG_FLOATS(name, v, n) printf("⭕ %s ", name); for (int i = 0; i < n; i++) printf("%f ", v[i]); printf("\n"); - void softmax(float* x, const unsigned int size); float rms(const float* x, const unsigned int size); void rmsnorm(float* o, const float* x, const float ms, const float* weight, const unsigned int size, const unsigned int nThreads, const unsigned int threadIndex); diff --git a/src/grok1-tasks-test.cpp b/src/grok1-tasks-test.cpp index e29321b..d758779 100644 --- a/src/grok1-tasks-test.cpp +++ b/src/grok1-tasks-test.cpp @@ -52,7 +52,7 @@ int main() { size_t afterBlockBytes = (spec.dim + spec.dim * spec.vocabSize) * sizeof(float); spec.fileSize = spec.headerSize + beforeBlockBytes + blockBytes + afterBlockBytes; - char* weights = NEW_BUFFER(beforeBlockBytes + blockBytes + afterBlockBytes); + char* weights = (char*)newBuffer(beforeBlockBytes + blockBytes + afterBlockBytes); long nFloats = blockBytes / sizeof(float); float* block = (float*)&weights[beforeBlockBytes]; @@ -60,7 +60,8 @@ int main() { for (int f = 0; f < nFloats; f++) block[f] = randomF32(&state) / 100.0; SocketPool socketPool(0, NULL); - Transformer transformer = Transformer::loadRoot((char*)weights, &spec, &socketPool); + AcceleratorContext acc(0, 1, NULL); + Transformer transformer = Transformer::loadRoot(weights, &spec, &socketPool, &acc); transformer.pos = 0; float* x = transformer.x; @@ -81,7 +82,7 @@ int main() { loop.run(); long t1 = timeMs(); - FREE_BUFFER(weights); + freeBuffer(weights); compare(&x[0], expectedOutput_0_4, 4); compare(&x[256], expectedOutput_256_260, 4); diff --git a/src/grok1-tasks.cpp b/src/grok1-tasks.cpp index 8c57041..3156a19 100644 --- a/src/grok1-tasks.cpp +++ b/src/grok1-tasks.cpp @@ -56,7 +56,7 @@ void grokMoeRmsNorm(TASK_ARGS) { void grokMoeRouter(TASK_ARGS) { TASK_VARIABLES; float* xb = (float*)transformer->buffer->getUnit(TB_UNIT_XB); - matmul(spec->weightsFloatType, F32, block->moeRouterProbs, xb, block->moeRouter, spec->dim, spec->nExperts, nThreads, threadIndex); + block->moeRouterMm->forward(xb, block->moeRouterProbs, nThreads, threadIndex); } void grokMoeRouterSoftmax(TASK_ARGS) { @@ -137,8 +137,9 @@ void grokMoeBlock0(TASK_ARGS) { float* expertUp = &hb[block->moeUpAndGate0Slice->d0 * ae]; float* expertGate = &block->expertGate[block->moeUpAndGate0Slice->d0 * ae]; - matmul(spec->weightsFloatType, spec->bufferFloatType, expertUp, xb, block->moeUp[e], block->moeUpAndGate0Slice->n, block->moeUpAndGate0Slice->d0, nThreads, threadIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, expertGate, xb, block->moeGate[e], block->moeUpAndGate0Slice->n, block->moeUpAndGate0Slice->d0, nThreads, threadIndex); + + block->moeUpMm[e]->forward(xb, expertUp, nThreads, threadIndex); + block->moeGateMm[e]->forward(xb, expertGate, nThreads, threadIndex); } } @@ -175,7 +176,7 @@ void grokSyncMoeMulRearrange(TASK_ARGS) { TASK_VARIABLES; if (threadIndex == 0 && spec->nSlices > 1) { - char* hbq = transformer->buffer->getUnit(TB_SLICED_HB_QUANTIZED); + char* hbq = (char*)transformer->buffer->getUnit(TB_SLICED_HB_QUANTIZED); size_t bufferBytes = transformer->buffer->getUnitBytes(TB_SLICED_HB_QUANTIZED); size_t bufferSliceBytes = transformer->buffer->getSlicedBytes(TB_SLICED_HB_QUANTIZED); @@ -204,7 +205,7 @@ void grokMoeBlock2(TASK_ARGS) { TASK_VARIABLES; float* xb2 = (float*)transformer->buffer->getSliced(TB_SLICED_XB2, transformer->sliceIndex); - char* hbq = transformer->buffer->getUnit(TB_SLICED_HB_QUANTIZED); + char* hbq = (char*)transformer->buffer->getUnit(TB_SLICED_HB_QUANTIZED); size_t rowBytes = getBatchBytes(spec->bufferFloatType, spec->hiddenDim, 1); uint8_t* indexes = (uint8_t*)transformer->buffer->getUnit(TB_UNIT_MOE_INDEXES); @@ -217,7 +218,7 @@ void grokMoeBlock2(TASK_ARGS) { char* expertUp = &hbq[rowBytes * ae]; float* expertDown = ae == 0 ? xb2 : &block->expertDown[block->moeDown0Slice->d0 * (ae - 1)]; - matmul(spec->weightsFloatType, spec->bufferFloatType, expertDown, expertUp, block->moeDown[e], block->moeDown0Slice->n, block->moeDown0Slice->d0, nThreads, threadIndex); + block->moeDownMm[e]->forward(expertUp, expertDown, nThreads, threadIndex); mulScalar(expertDown, weight, block->moeDown0Slice->d0, nThreads, threadIndex); if (ae > 0) { @@ -263,7 +264,7 @@ void grokMoeAdd(TASK_ARGS) { void grokFinalize(TASK_ARGS) { TASK_VARIABLES; - matmul(spec->weightsFloatType, F32, transformer->logits, transformer->x, transformer->wcls, spec->dim, spec->vocabSize, nThreads, threadIndex); + transformer->wclsMm->forward(transformer->x, transformer->logits, nThreads, threadIndex); } void grokFinalize2(TASK_ARGS) { diff --git a/src/llama2-tasks-test.cpp b/src/llama2-tasks-test.cpp index ae96be4..05049a5 100644 --- a/src/llama2-tasks-test.cpp +++ b/src/llama2-tasks-test.cpp @@ -550,7 +550,7 @@ int main() { size_t afterBlockBytes = /* norm */ 16384 + /* embedding */ 524288000; spec.fileSize = beforeBlockBytes + blockBytes + afterBlockBytes + spec.headerSize; size_t dataSize = beforeBlockBytes + blockBytes + afterBlockBytes; - char* data = NEW_BUFFER(dataSize); + char* data = (char*)newBuffer(dataSize); memset(data, 0, dataSize); unsigned long long state = 800000010L; @@ -562,7 +562,8 @@ int main() { for (int i = 0; i < mm; i++) mmData[i] = randomF32(&state) / 120.0; SocketPool socketPool(0, NULL); - Transformer transformer = Transformer::loadRoot((char*)data, &spec, &socketPool); + AcceleratorContext acc(0, 1, NULL); + Transformer transformer = Transformer::loadRoot((char*)data, &spec, &socketPool, &acc); transformer.pos = 0; float* x = transformer.x; @@ -583,7 +584,7 @@ int main() { loop.run(); long t1 = timeMs(); - FREE_BUFFER(data); + freeBuffer(data); int ix = -1; for (int i = 0; i < spec.dim; i++) { diff --git a/src/llama2-tasks.cpp b/src/llama2-tasks.cpp index 235552f..a84f26b 100644 --- a/src/llama2-tasks.cpp +++ b/src/llama2-tasks.cpp @@ -36,19 +36,19 @@ void llamaQkv(TASK_ARGS) { assert(block->kvCacheSlice->kvDim0 == block->v0Slice->d0); float *xbq = (float*)transformer->buffer->getUnit(TB_UNIT_XB_QUANTIZED); - float *k0 = &block->kvCacheSlice->keyCache[transformer->pos * block->kvCacheSlice->kvDim0]; - float* v0 = &block->kvCacheSlice->valueCache[transformer->pos * block->kvCacheSlice->kvDim0]; + float *k0 = &block->keyCache[transformer->pos * block->kvCacheSlice->kvDim0]; + float* v0 = &block->valueCache[transformer->pos * block->kvCacheSlice->kvDim0]; - matmul(spec->weightsFloatType, spec->bufferFloatType, block->qo0, xbq, block->q0, block->q0Slice->n, block->q0Slice->d0, nThreads, threadIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, k0, xbq, block->k0, block->k0Slice->n, block->k0Slice->d0, nThreads, threadIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, v0, xbq, block->v0, block->v0Slice->n, block->v0Slice->d0, nThreads, threadIndex); + block->q0mm->forward(xbq, block->qo0, nThreads, threadIndex); + block->k0mm->forward(xbq, k0, nThreads, threadIndex); + block->v0mm->forward(xbq, v0, nThreads, threadIndex); } void llamaRope(TASK_ARGS) { TASK_VARIABLES; - float* k0 = &block->kvCacheSlice->keyCache[transformer->pos * block->kvCacheSlice->kvDim0]; - transformer->ropeSlice->forward(true, block->qo0, transformer->pos, nThreads, threadIndex); - transformer->ropeSlice->forward(false, k0, transformer->pos, nThreads, threadIndex); + float* k0 = &block->keyCache[transformer->pos * block->kvCacheSlice->kvDim0]; + transformer->rope->forward(true, block->qo0, transformer->pos, nThreads, threadIndex); + transformer->rope->forward(false, k0, transformer->pos, nThreads, threadIndex); } void llamaMultiheadAtt(TASK_ARGS) { @@ -63,11 +63,11 @@ void llamaMultiheadAtt(TASK_ARGS) { // get the query vector for this head float* _q = block->qo0 + h0 * spec->headSize; // attention scores for this head - float* _att = block->multiHeadAttSlice->att + h0 * spec->seqLen; + float* _att = block->att + h0 * spec->seqLen; // iterate over all timesteps, including the current one for (int t = 0; t <= transformer->pos; t++) { // get the key vector for this head and at this timestep - float* k = block->kvCacheSlice->keyCache + t * block->kvCacheSlice->kvDim0 + (h0 / kvMul) * spec->headSize; + float* k = block->keyCache + t * block->kvCacheSlice->kvDim0 + (h0 / kvMul) * spec->headSize; // calculate the attention score as the dot product of q and k float score = dotProduct(_q, k, spec->headSize) / sqrtf(spec->headSize); _att[t] = score; @@ -81,7 +81,7 @@ void llamaMultiheadAtt(TASK_ARGS) { memset(hxb, 0, spec->headSize * sizeof(float)); for (int t = 0; t <= transformer->pos; t++) { // get the value vector for this head and at this timestep - float* _v = block->kvCacheSlice->valueCache + t * block->kvCacheSlice->kvDim0 + (h0 / kvMul) * spec->headSize; + float* _v = block->valueCache + t * block->kvCacheSlice->kvDim0 + (h0 / kvMul) * spec->headSize; // get the attention weight for this timestep float a = _att[t]; @@ -101,10 +101,10 @@ void llamaQuantizeMultiheadAtt(TASK_ARGS) { void llamaAtt(TASK_ARGS) { TASK_VARIABLES; - char* xbq0 = transformer->buffer->getSliced(TB_UNIT_XB_QUANTIZED, transformer->sliceIndex); + void* xbq0 = transformer->buffer->getSliced(TB_UNIT_XB_QUANTIZED, transformer->sliceIndex); float* xbv0 = (float*)transformer->buffer->getSliced(TB_SLICED_XBV, transformer->sliceIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, xbv0, xbq0, block->wo0, block->wo0Slice->n0, block->wo0Slice->d, nThreads, threadIndex); + block->wo0mm->forward(xbq0, xbv0, nThreads, threadIndex); } void llamaQuantizeAtt(TASK_ARGS) { @@ -124,7 +124,7 @@ void llamaDequantizeAtt(TASK_ARGS) { void llamaMergeAtt(TASK_ARGS) { TASK_VARIABLES; - for (uint8_t sliceIndex = 0; sliceIndex < spec->nSlices; sliceIndex++) { + for (slice_index_t sliceIndex = 0; sliceIndex < spec->nSlices; sliceIndex++) { float* xbv = (float*)transformer->buffer->getSliced(TB_SLICED_XBV, sliceIndex); add(transformer->x, xbv, spec->dim, nThreads, threadIndex); } @@ -161,8 +161,8 @@ void llamaFfn0(TASK_ARGS) { float* xb = (float*)transformer->buffer->getUnit(TB_UNIT_XB_QUANTIZED); float* hb0 = (float*)transformer->buffer->getSliced(TB_SLICED_HB, transformer->sliceIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, hb0, xb, block->w10, block->w10Slice->n, block->w10Slice->d0, nThreads, threadIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, block->hb20, xb, block->w30, block->w30Slice->n, block->w30Slice->d0, nThreads, threadIndex); + block->w10mm->forward(xb, hb0, nThreads, threadIndex); + block->w30mm->forward(xb, block->hb20, nThreads, threadIndex); if (spec->hiddenAct == SILU) { silu(hb0, block->w10Slice->d0, nThreads, threadIndex); @@ -185,7 +185,7 @@ void llamaFfn2(TASK_ARGS) { float *hb = (float*)transformer->buffer->getSliced(TB_SLICED_HB_QUANTIZED, transformer->sliceIndex); float *xbv = (float*)transformer->buffer->getSliced(TB_SLICED_XBV, transformer->sliceIndex); - matmul(spec->weightsFloatType, spec->bufferFloatType, xbv, hb, block->w20, block->w20Slice->n0, block->w20Slice->d, nThreads, threadIndex); + block->w20mm->forward(hb, xbv, nThreads, threadIndex); } void llamaQuantizeFfn2(TASK_ARGS) { @@ -205,7 +205,7 @@ void llamaDequantizeFfn2(TASK_ARGS) { void llamaMergeFfn2(TASK_ARGS) { TASK_VARIABLES; - for (uint8_t sliceIndex = 0; sliceIndex < spec->nSlices; sliceIndex++) { + for (slice_index_t sliceIndex = 0; sliceIndex < spec->nSlices; sliceIndex++) { float* xbv = (float*)transformer->buffer->getSliced(TB_SLICED_XBV, sliceIndex); add(transformer->x, xbv, spec->dim, nThreads, threadIndex); } @@ -235,9 +235,7 @@ void llamaRmsFinalNorm(TASK_ARGS) { void llamaFinalize(TASK_ARGS) { TASK_VARIABLES; - - float* x = transformer->x; - matmul(spec->weightsFloatType, F32, transformer->logits, x, transformer->wcls, spec->dim, spec->vocabSize, nThreads, threadIndex); + transformer->wclsMm->forward(transformer->x, transformer->logits, nThreads, threadIndex); } TransformerArch buildLlamaArch(TransformerSpec* spec) { diff --git a/src/tasks.cpp b/src/tasks.cpp index 5b4b187..17c2322 100644 --- a/src/tasks.cpp +++ b/src/tasks.cpp @@ -42,7 +42,7 @@ void TransformerArch::W(TaskLoopHandler* handler, unsigned int taskType) { } void syncUnitBuffer(unsigned int nThreads, unsigned int threadIndex, TransformerContext* ctx, uint8_t bufferIndex) { - char* buffer = ctx->transformer->buffer->getUnit(bufferIndex); + void* buffer = ctx->transformer->buffer->getUnit(bufferIndex); size_t bufferBytes = ctx->transformer->buffer->getUnitBytes(bufferIndex); if (ctx->socketPool != NULL) { @@ -84,7 +84,7 @@ void syncSliceOfSlicedBuffer(unsigned int nThreads, unsigned int threadIndex, Tr if (threadIndex != 0) return; // worker - char* buffer = ctx->transformer->buffer->getSliced(bufferIndex, ctx->transformer->sliceIndex); + void* buffer = ctx->transformer->buffer->getSliced(bufferIndex, ctx->transformer->sliceIndex); ctx->socket->write(buffer, bufferBytes); } } @@ -101,7 +101,7 @@ void syncMissingSlicesOfSlicedBuffer(unsigned int nThreads, unsigned int threadI for (unsigned int i = 0; i < nSockets; i++) { int socketIndex = threadIndex + i * nThreads; uint8_t workerSliceIndex = socketIndex + 1; - uint8_t sliceIndex = si < workerSliceIndex ? si : si + 1; + slice_index_t sliceIndex = si < workerSliceIndex ? si : si + 1; ios[i].socketIndex = socketIndex; ios[i].data = ctx->transformer->buffer->getSliced(bufferIndex, sliceIndex); ios[i].size = sliceBytes; @@ -112,9 +112,9 @@ void syncMissingSlicesOfSlicedBuffer(unsigned int nThreads, unsigned int threadI if (threadIndex != 0) return; // worker - for (uint8_t sliceIndex = 0; sliceIndex < ctx->transformer->spec->nSlices; sliceIndex++) { + for (slice_index_t sliceIndex = 0; sliceIndex < ctx->transformer->spec->nSlices; sliceIndex++) { if (sliceIndex != ctx->transformer->sliceIndex) { - char* buffer = ctx->transformer->buffer->getSliced(bufferIndex, sliceIndex); + void* buffer = ctx->transformer->buffer->getSliced(bufferIndex, sliceIndex); ctx->socket->read(buffer, sliceBytes); } } diff --git a/src/transformer-test.cpp b/src/transformer-test.cpp deleted file mode 100644 index ef5dc5c..0000000 --- a/src/transformer-test.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include "transformer.hpp" -#include -#include -#include - -void testRopeSlice(const TransformerArchType archType, const int nSliceTests, const int nPosTests, const int nThreadTests) { - TransformerSpec spec; - spec.dim = 4096; - spec.headSize = 128; - spec.nKvHeads = 8; - spec.seqLen = 2048; - spec.nHeads = spec.dim / spec.headSize; - spec.kvDim = (spec.dim * spec.nKvHeads) / spec.nHeads; - spec.ropeTheta = 10000.0f; - - float* q = new float[spec.dim]; - float* k = new float[spec.kvDim]; - float* correctQ = new float[spec.dim]; - float* correctK = new float[spec.kvDim]; - - for (int pos = 0; pos < spec.seqLen; pos += spec.seqLen / nPosTests) { - for (int si = 0; si < nSliceTests; si++) { - spec.nSlices = pow(2, si); - - for (int nThreads = 1; nThreads <= nThreadTests; nThreads++) { - printf("pos=%d nSlices=%d threads=%d\n", pos, spec.nSlices, nThreads); - - for (int j = 0; j < spec.dim; j++) q[j] = 1.0; - for (int j = 0; j < spec.kvDim; j++) k[j] = 1.0; - - for (uint8_t sliceIndex = 0; sliceIndex < spec.nSlices; sliceIndex++) { - RopeSlice* slice; - if (archType == LLAMA) { - slice = new LlamaRopeSlice(&spec, sliceIndex); - } else if (archType == MIXTRAL) { - slice = new FalconRopeSlice(&spec, sliceIndex); - } - - for (int threadIndex = 0; threadIndex < nThreads; threadIndex++) { - slice->forward( - true, - &q[(sliceIndex * spec.dim) / spec.nSlices], - pos, nThreads, threadIndex); - slice->forward( - false, - &k[(sliceIndex * spec.kvDim) / spec.nSlices], - pos, nThreads, threadIndex); - } - - delete slice; - } - - if (si == 0 && nThreads == 1) { - memcpy(correctQ, q, spec.dim * sizeof(float)); - memcpy(correctK, k, spec.kvDim * sizeof(float)); - } else { - for (int j = 0; j < spec.dim; j++) { - if (fabs(q[j] - correctQ[j]) > 1e-6) { - printf("q[%d] mismatch: %f != %f (arch=%d)\n", j, q[j], correctQ[j], archType); - exit(EXIT_FAILURE); - } - } - for (int j = 0; j < spec.kvDim; j++) { - if (fabs(k[j] - correctK[j]) > 1e-6) { - printf("k[%d] mismatch: %f != %f (arch=%d)\n", j, k[j], correctK[j], archType); - exit(EXIT_FAILURE); - } - } - } - } - } - } - - delete[] q; - delete[] k; - delete[] correctQ; - delete[] correctK; - printf("✅ ropeSlice (arch=%d)\n", archType); -} - -int main() { - testRopeSlice(MIXTRAL, 4, 6, 3); - testRopeSlice(LLAMA, 6, 4, 3); - return 0; -} diff --git a/src/transformer.cpp b/src/transformer.cpp index e81fb35..ebae06f 100644 --- a/src/transformer.cpp +++ b/src/transformer.cpp @@ -1,185 +1,14 @@ #include -#include #include #include #include -#include "funcs.hpp" #include "utils.hpp" #include "socket.hpp" +#include "commands.hpp" #include "transformer.hpp" -#define ALLOC_WEIGHTS true #define IS_ROOT_SLICE(sliceIndex) (sliceIndex == 0) -RowMatmulSlice::RowMatmulSlice(FloatType type, int nSlices, int n, int d) { - assert(d % nSlices == 0); - - this->type = type; - this->nSlices = nSlices; - this->d0 = d / nSlices; - this->n = n; - this->bytes = getBatchBytes(type, this->n, d); - this->sliceBytes = getBatchBytes(type, this->n, this->d0); -} - -size_t RowMatmulSlice::splitWeights(uint8_t sliceIndex, char* weights, char* weights0) { - int numbersPerBatch = getNumbersPerBatch(this->type); - int batchBytes = getBatchBytes(this->type, numbersPerBatch, 1); - - int n = this->n / numbersPerBatch; - size_t offset = this->d0 * sliceIndex * n * batchBytes; - size_t copiedBytes = 0; - - for (int d = 0; d < this->d0; d++) { - for (int j = 0; j < n; j++) { - long o = (d * n + j) * batchBytes; - - memcpy(weights0 + o, weights + offset + o, batchBytes); - copiedBytes += batchBytes; - } - } - return copiedBytes; -} - -unsigned int RowMatmulSlice::dOffset(uint8_t sliceIndex) { - return this->d0 * sliceIndex; -} - -ColMatmulSlice::ColMatmulSlice(FloatType type, int nSlices, int n, int d) { - assert(n % nSlices == 0); - - this->type = type; - this->nSlices = nSlices; - this->n = n; - this->n0 = n / nSlices; - this->d = d; - this->bytes = getBatchBytes(type, n, d); - this->sliceBytes = getBatchBytes(type, this->n0, d); -} - -size_t ColMatmulSlice::splitWeights(uint8_t sliceIndex, char* weights, char* weights0) { - int numbersPerBatch = getNumbersPerBatch(this->type); - int batchBytes = getBatchBytes(this->type, numbersPerBatch, 1); - assert(n0 % numbersPerBatch == 0); - - int n = this->n / numbersPerBatch; - int rowBytes = n * batchBytes; - int row0Bytes = (n0 / numbersPerBatch) * batchBytes; - int rowOffsetBytes = sliceIndex * row0Bytes; - - size_t copiedBytes = 0; - for (int d = 0; d < this->d; d++) { - memcpy(&weights0[row0Bytes * d], &weights[rowBytes * d + rowOffsetBytes], row0Bytes); - copiedBytes += row0Bytes; - } - return copiedBytes; -} - -RopeSlice::RopeSlice(TransformerSpec* spec, uint8_t sliceIndex) { - assert(spec->dim >= spec->kvDim); - assert(spec->dim % spec->nSlices == 0); - assert(spec->kvDim % spec->nSlices == 0); - - qDim0 = spec->dim / spec->nSlices; - kvDim0 = spec->kvDim / spec->nSlices; - assert(qDim0 % 2 == 0); - assert(kvDim0 % 2 == 0); - kvDimStart = kvDim0 * sliceIndex; - qDimStart = qDim0 * sliceIndex; - qDimEnd = qDimStart + qDim0; - qShift = qDimStart - kvDimStart; - sliceDim = qDimEnd - kvDimStart; - assert(sliceDim % 2 == 0); - this->spec = spec; -} - -RopeSlice::~RopeSlice() {} - -LlamaRopeSlice::LlamaRopeSlice(TransformerSpec* spec, uint8_t sliceIndex) : RopeSlice(spec, sliceIndex) { - size_t cacheBytes = spec->seqLen * sliceDim * sizeof(float); - cache = (float*)NEW_BUFFER(cacheBytes); - printf("🕒 ropeCache: %ld kB\n", cacheBytes / 1024); - - for (pos_t pos = 0; pos < spec->seqLen; pos++) { - for (unsigned int i = kvDimStart; i < qDimEnd; i += 2) { - const unsigned int headDim = i % spec->headSize; - const float freq = 1.0f / powf(spec->ropeTheta, headDim / (float)spec->headSize); - const float val = pos * freq; - const float fcr = cosf(val); - const float fci = sinf(val); - cache[pos * sliceDim + (i - kvDimStart)] = fcr; - cache[pos * sliceDim + (i - kvDimStart) + 1] = fci; - } - } -} - -LlamaRopeSlice::~LlamaRopeSlice() { - FREE_BUFFER(cache); -} - -void LlamaRopeSlice::forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex) { - const unsigned int dim0Half = (isQ ? qDim0 : kvDim0) / 2; - const unsigned int shift = isQ ? qShift : 0; - SPLIT_RANGE_TO_THREADS(s, e, 0, dim0Half, nThreads, threadIndex); - const unsigned int iStart = s * 2; - const unsigned int iEnd = e * 2; - - for (unsigned int i = iStart; i < iEnd; i += 2) { - float fcr = cache[pos * sliceDim + shift + i]; - float fci = cache[pos * sliceDim + shift + i + 1]; - float v0 = qOrK[i]; - float v1 = qOrK[i + 1]; - qOrK[i] = v0 * fcr - v1 * fci; - qOrK[i + 1] = v0 * fci + v1 * fcr; - } -} - -void FalconRopeSlice::forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex) { - // TODO: this implementation allows only a small number of slices (because it requires dim0 % headSize == 0). This could be improved. - unsigned int dimStart = isQ ? qDimStart : kvDimStart; - unsigned int dim0 = isQ ? qDim0 : kvDim0; - unsigned int headSize = isQ ? spec->headSize : spec->kvDim / spec->nKvHeads; - assert(dimStart % headSize == 0); - assert(dim0 % headSize == 0); - unsigned int nHeads0 = dim0 / headSize; - SPLIT_RANGE_TO_THREADS(h0s, h0e, 0, nHeads0, nThreads, threadIndex); - - for (unsigned int h = h0s; h < h0e; h++) { - for (unsigned int j = 0; j < headSize / 2; j++) { - float freq = 1.0f / powf(spec->ropeTheta, 2.0f * (float)j / (float)headSize); - float val = pos * freq; - float fcr = cosf(val); - float fci = sinf(val); - float q0 = qOrK[h * headSize + j]; - float q1 = qOrK[h * headSize + j + headSize / 2]; - qOrK[h * headSize + j] = q0 * fcr - q1 * fci; - qOrK[h * headSize + j + headSize / 2] = q0 * fci + q1 * fcr; - } - } -} - -KvCacheSlice::KvCacheSlice(unsigned int kvDim, unsigned int seqLen, unsigned int nSlices) { - assert(kvDim % nSlices == 0); - kvDim0 = kvDim / nSlices; - keyCache = (float*)NEW_BUFFER(seqLen * kvDim0 * sizeof(float)); - valueCache = (float*)NEW_BUFFER(seqLen * kvDim0 * sizeof(float)); -} - -KvCacheSlice::~KvCacheSlice() { - FREE_BUFFER(keyCache); - FREE_BUFFER(valueCache); -} - -MultiHeadAttSlice::MultiHeadAttSlice(unsigned int nHeads, unsigned int seqLen, unsigned int nSlices, uint8_t sliceIndex) { - assert(nHeads % nSlices == 0); - nHeads0 = nHeads / nSlices; - att = (float*)NEW_BUFFER(seqLen * nHeads0 * sizeof(float)); -} - -MultiHeadAttSlice::~MultiHeadAttSlice() { - FREE_BUFFER(att); -} - TransformerSpec Transformer::loadSpecFromFile(const char* path, const unsigned int nSlices, FloatType weightsFloatType, FloatType bufferFloatType) { TransformerSpec spec; memset(&spec, 0, sizeof(TransformerSpec)); @@ -243,8 +72,6 @@ TransformerSpec Transformer::loadSpecFromFile(const char* path, const unsigned i throw std::runtime_error("Unsupported header key"); } } - - } else { throw std::runtime_error("Unsupported model file"); } @@ -299,7 +126,7 @@ TransformerSpec Transformer::loadSpecFromFile(const char* path, const unsigned i TransformerBuffer::TransformerBuffer(TransformerSpec* spec) { nSlices = spec->nSlices; - buffers = new char*[TB_LENGTH]; + buffers = new void*[TB_LENGTH]; bufferBytes = new size_t[TB_LENGTH]; bufferBytes[TB_UNIT_XB] = spec->dim * sizeof(float); @@ -319,8 +146,8 @@ TransformerBuffer::TransformerBuffer(TransformerSpec* spec) { bufferBytes[TB_UNIT_MOE_INDEXES] = spec->nActiveExperts * sizeof(uint8_t); bufferBytes[TB_UNIT_MOE_WEIGHTS] = spec->nActiveExperts * sizeof(float); - buffers[TB_UNIT_MOE_INDEXES] = NEW_BUFFER(bufferBytes[TB_UNIT_MOE_INDEXES]); - buffers[TB_UNIT_MOE_WEIGHTS] = NEW_BUFFER(bufferBytes[TB_UNIT_MOE_WEIGHTS]); + buffers[TB_UNIT_MOE_INDEXES] = newBuffer(bufferBytes[TB_UNIT_MOE_INDEXES]); + buffers[TB_UNIT_MOE_WEIGHTS] = newBuffer(bufferBytes[TB_UNIT_MOE_WEIGHTS]); } else { bufferBytes[TB_UNIT_MOE_INDEXES] = 0; bufferBytes[TB_UNIT_MOE_WEIGHTS] = 0; @@ -328,34 +155,34 @@ TransformerBuffer::TransformerBuffer(TransformerSpec* spec) { for (int i = 0; i < TB_LENGTH - TB_NO_PAIRS; i += 2) { int bytes = bufferBytes[i]; - buffers[i] = NEW_BUFFER(bufferBytes[i]); + buffers[i] = newBuffer(bufferBytes[i]); if (spec->bufferFloatType == F32) { buffers[i + 1] = buffers[i]; } else { - buffers[i + 1] = NEW_BUFFER(bufferBytes[i + 1]); + buffers[i + 1] = newBuffer(bufferBytes[i + 1]); } } } TransformerBuffer::~TransformerBuffer() { if (bufferBytes[TB_UNIT_MOE_INDEXES] > 0 && bufferBytes[TB_UNIT_MOE_WEIGHTS] > 0) { - FREE_BUFFER(buffers[TB_UNIT_MOE_INDEXES]); - FREE_BUFFER(buffers[TB_UNIT_MOE_WEIGHTS]); + freeBuffer(buffers[TB_UNIT_MOE_INDEXES]); + freeBuffer(buffers[TB_UNIT_MOE_WEIGHTS]); } for (int i = 0; i < TB_LENGTH - TB_NO_PAIRS; i += 2) { if (bufferBytes[i] > 0) { if (buffers[i] != buffers[i + 1]) { - FREE_BUFFER(buffers[i + 1]); + freeBuffer(buffers[i + 1]); } - FREE_BUFFER(buffers[i]); + freeBuffer(buffers[i]); } } delete[] bufferBytes; delete[] buffers; } -char* TransformerBuffer::getUnit(uint8_t bufferIndex) { +void* TransformerBuffer::getUnit(uint8_t bufferIndex) { return buffers[bufferIndex]; } @@ -363,42 +190,44 @@ size_t TransformerBuffer::getUnitBytes(uint8_t bufferIndex) { return bufferBytes[bufferIndex]; } -char* TransformerBuffer::getSliced(uint8_t bufferIndex, uint8_t sliceIndex) { +void* TransformerBuffer::getSliced(uint8_t bufferIndex, slice_index_t sliceIndex) { size_t sliceBytes = getSlicedBytes(bufferIndex); - return buffers[bufferIndex] + sliceBytes * sliceIndex; + return ((char*)buffers[bufferIndex]) + sliceBytes * sliceIndex; } size_t TransformerBuffer::getSlicedBytes(uint8_t bufferIndex) { return bufferBytes[bufferIndex] / nSlices; } -Transformer::Transformer(TransformerSpec* spec, uint8_t sliceIndex) { +Transformer::Transformer(TransformerSpec* spec, slice_index_t sliceIndex, AcceleratorContext* acc) { this->spec = spec; this->sliceIndex = sliceIndex; + this->acc = acc; buffer = new TransformerBuffer(spec); blocks = new TransformerBlock*[spec->nLayers]; for (int i = 0; i < spec->nLayers; i++) { - blocks[i] = new TransformerBlock(spec, sliceIndex); + blocks[i] = new TransformerBlock(spec, sliceIndex, acc); } if (IS_ROOT_SLICE(sliceIndex)) { tokenEmbeddingTableBytes = spec->vocabSize * spec->dim * sizeof(float); rmsFinalBytes = spec->dim * sizeof(float); - wclsBytes = getBatchBytes(spec->weightsFloatType, spec->vocabSize, spec->dim); -#if ALLOC_WEIGHTS - tokenEmbeddingTable = NEW_BUFFER(tokenEmbeddingTableBytes); - rmsFinal = NEW_BUFFER(rmsFinalBytes); - wcls = NEW_BUFFER(wclsBytes); -#endif - x = (float*)NEW_BUFFER(spec->dim * sizeof(float)); - logits = (float*)NEW_BUFFER(spec->vocabSize * sizeof(float)); + + tokenEmbeddingTable = (float*)newBuffer(tokenEmbeddingTableBytes); + rmsFinal = (float*)newBuffer(rmsFinalBytes); + + wclsMm = new MatmulCommand(spec->dim, spec->vocabSize, F32, spec->weightsFloatType, acc); + + x = (float*)newBuffer(spec->dim * sizeof(float)); + logits = (float*)newBuffer(spec->vocabSize * sizeof(float)); } + ropeSlice = new RopeSlice(spec->dim, spec->kvDim, spec->nKvHeads, spec->nSlices, spec->seqLen, spec->headSize, spec->ropeTheta, sliceIndex); if (spec->archType == GROK1 || spec->archType == MIXTRAL) { - ropeSlice = new FalconRopeSlice(spec, sliceIndex); + rope = new FalconRopeCommand(ropeSlice); } else { - ropeSlice = new LlamaRopeSlice(spec, sliceIndex); + rope = new LlamaRopeCommand(ropeSlice); } TransformerBlock* b = blocks[0]; @@ -417,192 +246,165 @@ Transformer::~Transformer() { delete[] blocks; if (IS_ROOT_SLICE(sliceIndex)) { -#if ALLOC_WEIGHTS - FREE_BUFFER(tokenEmbeddingTable); - FREE_BUFFER(rmsFinal); - FREE_BUFFER(wcls); -#endif - FREE_BUFFER(x); - FREE_BUFFER(logits); + freeBuffer(tokenEmbeddingTable); + freeBuffer(rmsFinal); + delete wclsMm; + + freeBuffer(x); + freeBuffer(logits); } delete ropeSlice; + delete rope; } -TransformerBlock::TransformerBlock(TransformerSpec* spec, uint8_t sliceIndex) { +TransformerBlock::TransformerBlock(TransformerSpec* spec, slice_index_t sliceIndex, AcceleratorContext* acc) { this->sliceIndex = sliceIndex; this->spec = spec; + this->acc = acc; if (IS_ROOT_SLICE(sliceIndex)) { rmsAttBytes = spec->dim * sizeof(float); rmsFfnBytes = spec->dim * sizeof(float); rmsMoeBytes = spec->dim * sizeof(float); rmsFfn2Bytes = spec->dim * sizeof(float); -#if ALLOC_WEIGHTS - rmsAtt = (float*)NEW_BUFFER(rmsAttBytes); - rmsFfn = (float*)NEW_BUFFER(rmsFfnBytes); + + rmsAtt = (float*)newBuffer(rmsAttBytes); + rmsFfn = (float*)newBuffer(rmsFfnBytes); if (spec->archType == GROK1) { - rmsMoe = (float*)NEW_BUFFER(rmsMoeBytes); - rmsFfn2 = (float*)NEW_BUFFER(rmsFfn2Bytes); + rmsMoe = (float*)newBuffer(rmsMoeBytes); + rmsFfn2 = (float*)newBuffer(rmsFfn2Bytes); } -#endif } kvCacheSlice = new KvCacheSlice(spec->kvDim, spec->seqLen, spec->nSlices); + keyCache = (float*)newBuffer(kvCacheSlice->keyCacheSize); + valueCache = (float*)newBuffer(kvCacheSlice->valueCacheSize); + multiHeadAttSlice = new MultiHeadAttSlice(spec->nHeads, spec->seqLen, spec->nSlices, sliceIndex); + att = (float*)newBuffer(multiHeadAttSlice->attSize); q0Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->dim); k0Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->kvDim); v0Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->kvDim); wo0Slice = new ColMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->dim); - qo0 = (float*)NEW_BUFFER(q0Slice->d0 * sizeof(float)); + q0mm = new MatmulCommand(q0Slice->n, q0Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); + k0mm = new MatmulCommand(k0Slice->n, k0Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); + v0mm = new MatmulCommand(v0Slice->n, v0Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); + wo0mm = new MatmulCommand(wo0Slice->n0, wo0Slice->d, spec->bufferFloatType, spec->weightsFloatType, acc); -#if ALLOC_WEIGHTS - q0 = NEW_BUFFER(q0Slice->sliceBytes); - k0 = NEW_BUFFER(k0Slice->sliceBytes); - v0 = NEW_BUFFER(v0Slice->sliceBytes); - wo0 = NEW_BUFFER(wo0Slice->sliceBytes); -#endif + qo0 = (float*)newBuffer(q0Slice->d0 * sizeof(float)); if (spec->nExperts > 0) { moeUpAndGate0Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->hiddenDim); moeDown0Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->hiddenDim, spec->dim); - moeRouterBytes = getBatchBytes(spec->weightsFloatType, spec->dim, spec->nExperts); - moeRouterProbs = (float*)NEW_BUFFER(spec->nExperts * sizeof(float)); + moeRouterProbs = (float*)newBuffer(spec->nExperts * sizeof(float)); - moeUp = new char*[spec->nExperts]; - moeGate = new char*[spec->nExperts]; - moeDown = new char*[spec->nExperts]; - -#if ALLOC_WEIGHTS - moeRouter = NEW_BUFFER(moeRouterBytes); + moeUpMm = new MatmulCommand*[spec->nExperts]; + moeGateMm = new MatmulCommand*[spec->nExperts]; + moeDownMm = new MatmulCommand*[spec->nExperts]; + moeRouterMm = new MatmulCommand(spec->dim, spec->nExperts, F32, spec->weightsFloatType, acc); for (int e = 0; e < spec->nExperts; e++) { - moeUp[e] = NEW_BUFFER(moeUpAndGate0Slice->sliceBytes); - moeGate[e] = NEW_BUFFER(moeUpAndGate0Slice->sliceBytes); - moeDown[e] = NEW_BUFFER(moeDown0Slice->sliceBytes); + moeUpMm[e] = new MatmulCommand(moeUpAndGate0Slice->n, moeUpAndGate0Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); + moeGateMm[e] = new MatmulCommand(moeUpAndGate0Slice->n, moeUpAndGate0Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); + moeDownMm[e] = new MatmulCommand(moeDown0Slice->n, moeDown0Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); } -#endif - expertGate = (float*)NEW_BUFFER(moeUpAndGate0Slice->d0 * spec->nExperts * sizeof(float)); - expertDown = (float*)NEW_BUFFER(moeDown0Slice->d0 * (spec->nExperts - 1) * sizeof(float)); + + expertGate = (float*)newBuffer(moeUpAndGate0Slice->d0 * spec->nExperts * sizeof(float)); + expertDown = (float*)newBuffer(moeDown0Slice->d0 * (spec->nExperts - 1) * sizeof(float)); } else { w10Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->hiddenDim); w20Slice = new ColMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->hiddenDim, spec->dim); w30Slice = new RowMatmulSlice(spec->weightsFloatType, spec->nSlices, spec->dim, spec->hiddenDim); -#if ALLOC_WEIGHTS - w10 = NEW_BUFFER(w10Slice->sliceBytes); - w20 = NEW_BUFFER(w20Slice->sliceBytes); - w30 = NEW_BUFFER(w30Slice->sliceBytes); -#endif + w10mm = new MatmulCommand(w10Slice->n, w10Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); + w20mm = new MatmulCommand(w20Slice->n0, w20Slice->d, spec->bufferFloatType, spec->weightsFloatType, acc); + w30mm = new MatmulCommand(w30Slice->n, w30Slice->d0, spec->bufferFloatType, spec->weightsFloatType, acc); - hb20 = (float*)NEW_BUFFER(w30Slice->d0 * sizeof(float)); + hb20 = (float*)newBuffer(w30Slice->d0 * sizeof(float)); } } TransformerBlock::~TransformerBlock() { if (IS_ROOT_SLICE(sliceIndex)) { -#if ALLOC_WEIGHTS - FREE_BUFFER(rmsAtt); - FREE_BUFFER(rmsFfn); + freeBuffer(rmsAtt); + freeBuffer(rmsFfn); if (spec->archType == GROK1) { - FREE_BUFFER(rmsMoe); - FREE_BUFFER(rmsFfn2); + freeBuffer(rmsMoe); + freeBuffer(rmsFfn2); } -#endif } delete kvCacheSlice; + freeBuffer(keyCache); + freeBuffer(valueCache); delete multiHeadAttSlice; + freeBuffer(att); delete q0Slice; delete k0Slice; delete v0Slice; delete wo0Slice; - FREE_BUFFER(qo0); + freeBuffer(qo0); -#if ALLOC_WEIGHTS - FREE_BUFFER(q0); - FREE_BUFFER(k0); - FREE_BUFFER(v0); - FREE_BUFFER(wo0); -#endif + delete q0mm; + delete k0mm; + delete v0mm; + delete wo0mm; if (spec->nExperts > 0) { delete moeUpAndGate0Slice; delete moeDown0Slice; -#if ALLOC_WEIGHTS for (int e = 0; e < spec->nExperts; e++) { - FREE_BUFFER(moeUp[e]); - FREE_BUFFER(moeGate[e]); - FREE_BUFFER(moeDown[e]); + delete moeUpMm[e]; + delete moeGateMm[e]; + delete moeDownMm[e]; } - FREE_BUFFER(moeRouter); -#endif - delete[] moeUp; - delete[] moeGate; - delete[] moeDown; - FREE_BUFFER(moeRouterProbs); + delete[] moeUpMm; + delete[] moeGateMm; + delete[] moeDownMm; + freeBuffer(moeRouterProbs); - FREE_BUFFER(expertGate); - FREE_BUFFER(expertDown); + freeBuffer(expertGate); + freeBuffer(expertDown); } else { delete w10Slice; delete w20Slice; delete w30Slice; -#if ALLOC_WEIGHTS - FREE_BUFFER(w10); - FREE_BUFFER(w20); - FREE_BUFFER(w30); -#endif + delete w10mm; + delete w20mm; + delete w30mm; - FREE_BUFFER(hb20); + freeBuffer(hb20); } } -static size_t loadSlicedMatmulWeights(uint8_t nSlices, MatmulSlice* slice, char* weights, char** weights0, SocketPool* socketPool) { -#if ALLOC_WEIGHTS - if (nSlices > 1) { - char* temp = NEW_BUFFER(slice->bytes); - memcpy(temp, weights, slice->bytes); - - size_t loadedBytes = 0; - for (uint8_t s = 0; s < nSlices; s++) { - uint8_t sliceIndex = (s + 1) % nSlices; // Root slice must be loaded last because we want keep root weights in the memory. - loadedBytes += slice->splitWeights(sliceIndex, temp, *weights0); - if (sliceIndex > 0) { - unsigned int socketIndex = sliceIndex - 1; - socketPool->write(socketIndex, *weights0, slice->sliceBytes); - } +static size_t loadSlicedMatmulWeights(const uint8_t nSlices, MatmulSlice* slice, char* source, MatmulCommand* mm, SocketPool* socketPool) { + char* buffer = (char*)newBuffer(slice->sliceBytes); + size_t loadedBytes = 0; + for (uint8_t s = 0; s < nSlices; s++) { + slice_index_t sliceIndex = (s + 1) % nSlices; + loadedBytes += slice->splitWeights(sliceIndex, source, buffer); + if (sliceIndex > 0) { + unsigned int socketIndex = sliceIndex - 1; + socketPool->write(socketIndex, buffer, slice->sliceBytes); + } else { + mm->loadWeights(buffer); } - - assert(loadedBytes == slice->bytes); - FREE_BUFFER(temp); - return loadedBytes; - } else { - size_t loadedBytes = slice->splitWeights(0, weights, *weights0); - assert(loadedBytes == slice->bytes); - return loadedBytes; } -#else - assert(nSlices == 1); - *weights0 = weights; - return slice->bytes; -#endif + freeBuffer(buffer); + return loadedBytes; } -static size_t loadRootMatmulWeights(char** target, char* source, size_t bytes) { -#if ALLOC_WEIGHTS +static size_t loadRootWeights(char** target, char* source, size_t bytes) { memcpy(*target, source, bytes); -#else - *target = source; -#endif return bytes; } @@ -611,30 +413,26 @@ static size_t readSlicedMatmulWeights(MatmulSlice* slice, char* weights0, Socket return slice->sliceBytes; } -Transformer Transformer::loadRootFromFile(const char* path, TransformerSpec* spec, SocketPool* socketPool) { +Transformer Transformer::loadRootFromFile(const char* path, TransformerSpec* spec, SocketPool* socketPool, AcceleratorContext* acc) { MmapFile file; openMmapFile(&file, path, spec->fileSize); char* weights = ((char*)file.data) + spec->headerSize; - Transformer transformer = Transformer::loadRoot((char*)weights, spec, socketPool); + Transformer transformer = Transformer::loadRoot((char*)weights, spec, socketPool, acc); -#if ALLOC_WEIGHTS closeMmapFile(&file); -#else - // TODO: handler should be released in destructor -#endif return transformer; } -Transformer Transformer::loadRoot(char* data, TransformerSpec* spec, SocketPool* socketPool) { +Transformer Transformer::loadRoot(char* data, TransformerSpec* spec, SocketPool* socketPool, AcceleratorContext* acc) { assert(socketPool->nSockets == spec->nSlices - 1); - const uint8_t sliceIndex = 0; // Root slice - Transformer transformer(spec, sliceIndex); + const slice_index_t sliceIndex = 0; // Root slice + Transformer transformer(spec, sliceIndex, acc); if (spec->nSlices > 1) { - for (uint8_t sliceIndex = 1; sliceIndex < spec->nSlices; sliceIndex++) { + for (slice_index_t sliceIndex = 1; sliceIndex < spec->nSlices; sliceIndex++) { unsigned int socketIndex = sliceIndex - 1; socketPool->write(socketIndex, (char*)&sliceIndex, sizeof(uint8_t)); socketPool->write(socketIndex, (char*)spec, sizeof(TransformerSpec)); @@ -643,41 +441,40 @@ Transformer Transformer::loadRoot(char* data, TransformerSpec* spec, SocketPool* char* w = data; - w += loadRootMatmulWeights(&transformer.tokenEmbeddingTable, w, transformer.tokenEmbeddingTableBytes); + w += loadRootWeights((char**)&transformer.tokenEmbeddingTable, w, transformer.tokenEmbeddingTableBytes); for (int i = 0; i < spec->nLayers; i++) { TransformerBlock* block = transformer.blocks[i]; - - w += loadSlicedMatmulWeights(spec->nSlices, block->q0Slice, w, &block->q0, socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->k0Slice, w, &block->k0, socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->v0Slice, w, &block->v0, socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->wo0Slice, w, &block->wo0, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->q0Slice, w, block->q0mm, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->k0Slice, w, block->k0mm, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->v0Slice, w, block->v0mm, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->wo0Slice, w, block->wo0mm, socketPool); if (spec->nExperts > 0) { - w += loadRootMatmulWeights(&block->moeRouter, w, block->moeRouterBytes); + w += block->moeRouterMm->loadWeights(w); for (int e = 0; e < spec->nExperts; e++) { - w += loadSlicedMatmulWeights(spec->nSlices, block->moeUpAndGate0Slice, w, &block->moeUp[e], socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->moeUpAndGate0Slice, w, &block->moeGate[e], socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->moeDown0Slice, w, &block->moeDown[e], socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->moeUpAndGate0Slice, w, block->moeUpMm[e], socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->moeUpAndGate0Slice, w, block->moeGateMm[e], socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->moeDown0Slice, w, block->moeDownMm[e], socketPool); } } else { - w += loadSlicedMatmulWeights(spec->nSlices, block->w10Slice, w, &block->w10, socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->w20Slice, w, &block->w20, socketPool); - w += loadSlicedMatmulWeights(spec->nSlices, block->w30Slice, w, &block->w30, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->w10Slice, w, block->w10mm, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->w20Slice, w, block->w20mm, socketPool); + w += loadSlicedMatmulWeights(spec->nSlices, block->w30Slice, w, block->w30mm, socketPool); } - w += loadRootMatmulWeights((char**)&block->rmsAtt, w, block->rmsAttBytes); - w += loadRootMatmulWeights((char**)&block->rmsFfn, w, block->rmsFfnBytes); + w += loadRootWeights((char**)&block->rmsAtt, w, block->rmsAttBytes); + w += loadRootWeights((char**)&block->rmsFfn, w, block->rmsFfnBytes); if (spec->archType == GROK1) { - w += loadRootMatmulWeights((char**)&block->rmsMoe, w, block->rmsMoeBytes); - w += loadRootMatmulWeights((char**)&block->rmsFfn2, w, block->rmsFfn2Bytes); + w += loadRootWeights((char**)&block->rmsMoe, w, block->rmsMoeBytes); + w += loadRootWeights((char**)&block->rmsFfn2, w, block->rmsFfn2Bytes); } } - w += loadRootMatmulWeights(&transformer.rmsFinal, w, transformer.rmsFinalBytes); - w += loadRootMatmulWeights(&transformer.wcls, w, transformer.wclsBytes); + w += loadRootWeights((char**)&transformer.rmsFinal, w, transformer.rmsFinalBytes); + w += transformer.wclsMm->loadWeights(w); long missedBytes = (long)(w - data) - spec->fileSize + spec->headerSize; if (missedBytes != 0) { @@ -689,8 +486,8 @@ Transformer Transformer::loadRoot(char* data, TransformerSpec* spec, SocketPool* return transformer; } -Transformer Transformer::loadSlice(TransformerSpec* spec, Socket* socket) { - uint8_t sliceIndex; +Transformer Transformer::loadSlice(TransformerSpec* spec, Socket* socket, AcceleratorContext* acc) { + slice_index_t sliceIndex; socket->read((char*)&sliceIndex, sizeof(uint8_t)); socket->read((char*)spec, sizeof(TransformerSpec)); @@ -698,31 +495,70 @@ Transformer Transformer::loadSlice(TransformerSpec* spec, Socket* socket) { printf("💡 nSlices: %d\n", spec->nSlices); assert(sliceIndex >= 1); - Transformer transformer(spec, sliceIndex); + Transformer transformer(spec, sliceIndex, acc); + + size_t bufferSize = 0; + // TODO: this is ugly + for (int i = 0; i < spec->nLayers; i++) { + TransformerBlock* block = transformer.blocks[i]; + if (block->k0Slice->sliceBytes > bufferSize) bufferSize = block->k0Slice->sliceBytes; + if (block->q0Slice->sliceBytes > bufferSize) bufferSize = block->q0Slice->sliceBytes; + if (block->wo0Slice->sliceBytes > bufferSize) bufferSize = block->wo0Slice->sliceBytes; + if (spec->nExperts > 0) { + if (block->moeUpAndGate0Slice[0].sliceBytes > bufferSize) bufferSize = block->moeUpAndGate0Slice[0].sliceBytes; + if (block->moeDown0Slice[0].sliceBytes > bufferSize) bufferSize = block->moeDown0Slice[0].sliceBytes; + } else { + if (block->w10Slice->sliceBytes > bufferSize) bufferSize = block->w10Slice->sliceBytes; + if (block->w20Slice->sliceBytes > bufferSize) bufferSize = block->w20Slice->sliceBytes; + if (block->w30Slice->sliceBytes > bufferSize) bufferSize = block->w30Slice->sliceBytes; + } + } + + char* buffer = new char[bufferSize]; for (int i = 0; i < spec->nLayers; i++) { TransformerBlock* block = transformer.blocks[i]; size_t blockBytes = 0; long t0 = timeMs(); - blockBytes += readSlicedMatmulWeights(block->q0Slice, block->q0, socket); - blockBytes += readSlicedMatmulWeights(block->k0Slice, block->k0, socket); - blockBytes += readSlicedMatmulWeights(block->v0Slice, block->v0, socket); - blockBytes += readSlicedMatmulWeights(block->wo0Slice, block->wo0, socket); + + socket->read(buffer, block->q0Slice->sliceBytes); + blockBytes += block->q0mm->loadWeights(buffer); + + socket->read(buffer, block->k0Slice->sliceBytes); + blockBytes += block->k0mm->loadWeights(buffer); + + socket->read(buffer, block->v0Slice->sliceBytes); + blockBytes += block->v0mm->loadWeights(buffer); + + socket->read(buffer, block->wo0Slice->sliceBytes); + blockBytes += block->wo0mm->loadWeights(buffer); if (spec->nExperts > 0) { for (int e = 0; e < spec->nExperts; e++) { - blockBytes += readSlicedMatmulWeights(block->moeUpAndGate0Slice, block->moeUp[e], socket); - blockBytes += readSlicedMatmulWeights(block->moeUpAndGate0Slice, block->moeGate[e], socket); - blockBytes += readSlicedMatmulWeights(block->moeDown0Slice, block->moeDown[e], socket); + socket->read(buffer, block->moeUpAndGate0Slice->sliceBytes); + blockBytes += block->moeUpMm[e]->loadWeights(buffer); + + socket->read(buffer, block->moeUpAndGate0Slice->sliceBytes); + blockBytes += block->moeGateMm[e]->loadWeights(buffer); + + socket->read(buffer, block->moeDown0Slice->sliceBytes); + blockBytes += block->moeDownMm[e]->loadWeights(buffer); } } else { - blockBytes += readSlicedMatmulWeights(block->w10Slice, block->w10, socket); - blockBytes += readSlicedMatmulWeights(block->w20Slice, block->w20, socket); - blockBytes += readSlicedMatmulWeights(block->w30Slice, block->w30, socket); + socket->read(buffer, block->w10Slice->sliceBytes); + blockBytes += block->w10mm->loadWeights(buffer); + + socket->read(buffer, block->w20Slice->sliceBytes); + blockBytes += block->w20mm->loadWeights(buffer); + + socket->read(buffer, block->w30Slice->sliceBytes); + blockBytes += block->w30mm->loadWeights(buffer); } float kbs = blockBytes / (float)(timeMs() - t0); printf("⏩ Received %ld kB for block %d (%.0f kB/s)\n", blockBytes / 1024, i, kbs); } + + delete[] buffer; return transformer; } diff --git a/src/transformer.hpp b/src/transformer.hpp index f195754..1ff8948 100644 --- a/src/transformer.hpp +++ b/src/transformer.hpp @@ -4,41 +4,9 @@ #include #include #include "quants.hpp" +#include "commands.hpp" #include "socket.hpp" -typedef unsigned short pos_t; - -class MatmulSlice { -public: - size_t bytes; - size_t sliceBytes; - virtual size_t splitWeights(uint8_t sliceIndex, char* weights, char* weights0) = 0; -}; - -class RowMatmulSlice : public MatmulSlice { -public: - FloatType type; - int nSlices; - int n; - int d0; - - RowMatmulSlice(FloatType type, int nSlices, int n, int d); - size_t splitWeights(uint8_t sliceIndex, char* weights, char* weights0); - unsigned int dOffset(uint8_t sliceIndex); -}; - -class ColMatmulSlice : public MatmulSlice { -public: - FloatType type; - int nSlices; - int n; - int n0; - int d; - - ColMatmulSlice(FloatType type, int nSlices, int n, int d); - size_t splitWeights(uint8_t sliceIndex, char* weights, char* weights0); -}; - enum TransformerHeaderKey { VERSION = 0, ARCH_TYPE = 1, @@ -103,57 +71,11 @@ struct TransformerSpec { uint8_t nSlices; }; -class RopeSlice { -public: - unsigned int qDim0; - unsigned int qDimStart; - unsigned int qDimEnd; - unsigned int qShift; - unsigned int kvDim0; - unsigned int kvDimStart; - unsigned int sliceDim; - TransformerSpec* spec; - RopeSlice(TransformerSpec* spec, uint8_t sliceIndex); - virtual ~RopeSlice(); - virtual void forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex) = 0; -}; - -class LlamaRopeSlice : public RopeSlice { -private: - float* cache; -public: - LlamaRopeSlice(TransformerSpec* spec, uint8_t sliceIndex); - ~LlamaRopeSlice(); - void forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex); -}; - -class FalconRopeSlice : public RopeSlice { -public: - using RopeSlice::RopeSlice; - void forward(bool isQ, float* qOrK, pos_t pos, unsigned int nThreads, unsigned int threadIndex); -}; - -class KvCacheSlice { -public: - unsigned int kvDim0; - float* keyCache; - float* valueCache; - KvCacheSlice(unsigned int kvDim, unsigned int seqLen, unsigned int nSlices); - ~KvCacheSlice(); -}; - -class MultiHeadAttSlice { -public: - unsigned int nHeads0; - float* att; - MultiHeadAttSlice(unsigned int nHeads, unsigned int seqLen, unsigned int nSlices, uint8_t sliceIndex); - ~MultiHeadAttSlice(); -}; - class TransformerBlock { public: - uint8_t sliceIndex; + slice_index_t sliceIndex; TransformerSpec *spec; + AcceleratorContext* acc; size_t rmsAttBytes; float* rmsAtt; @@ -164,41 +86,42 @@ class TransformerBlock { size_t rmsFfn2Bytes; float* rmsFfn2; - char* q0; + MatmulCommand *q0mm; + MatmulCommand *k0mm; + MatmulCommand *v0mm; + MatmulCommand *wo0mm; RowMatmulSlice* q0Slice; - char* k0; RowMatmulSlice* k0Slice; - char* v0; RowMatmulSlice* v0Slice; - char* wo0; ColMatmulSlice* wo0Slice; - char* w10; + MatmulCommand *w10mm; + MatmulCommand *w20mm; + MatmulCommand *w30mm; RowMatmulSlice* w10Slice; - char* w20; ColMatmulSlice* w20Slice; - char* w30; RowMatmulSlice* w30Slice; - char* moeRouter; - size_t moeRouterBytes; + MatmulCommand* moeRouterMm; RowMatmulSlice* moeUpAndGate0Slice; - char** moeUp; - char** moeGate; RowMatmulSlice* moeDown0Slice; - char** moeDown; - float* moeRouterProbs; + MatmulCommand** moeUpMm; + MatmulCommand** moeGateMm; + MatmulCommand** moeDownMm; + float* moeRouterProbs; float* expertGate; float* expertDown; - float* hb20; KvCacheSlice* kvCacheSlice; + float* keyCache; + float* valueCache; MultiHeadAttSlice* multiHeadAttSlice; + float* att; float* qo0; - TransformerBlock(TransformerSpec* spec, uint8_t sliceIndex); + TransformerBlock(TransformerSpec* spec, slice_index_t sliceIndex, AcceleratorContext* acc); ~TransformerBlock(); }; @@ -219,46 +142,47 @@ class TransformerBlock { class TransformerBuffer { public: uint8_t nSlices; - char** buffers; + void** buffers; size_t* bufferBytes; TransformerBuffer(TransformerSpec* spec); ~TransformerBuffer(); - char* getUnit(uint8_t bufferIndex); + void* getUnit(uint8_t bufferIndex); size_t getUnitBytes(uint8_t bufferIndex); - char* getSliced(uint8_t bufferIndex, uint8_t sliceIndex); + void* getSliced(uint8_t bufferIndex, slice_index_t sliceIndex); size_t getSlicedBytes(uint8_t bufferIndex); }; class Transformer { public: TransformerSpec* spec; + AcceleratorContext* acc; TransformerBlock** blocks; TransformerBuffer* buffer; - uint8_t sliceIndex; + slice_index_t sliceIndex; size_t tokenEmbeddingTableBytes; - char* tokenEmbeddingTable; + float* tokenEmbeddingTable; size_t rmsFinalBytes; - char* rmsFinal; - size_t wclsBytes; - char* wcls; + float* rmsFinal; + MatmulCommand* wclsMm; pos_t pos; float rms; float* x; float* logits; RopeSlice* ropeSlice; + RopeCommand* rope; ~Transformer(); static TransformerSpec loadSpecFromFile(const char* path, const unsigned int nSlices, FloatType weightsFloatType, FloatType bufferFloatType); - static Transformer loadRootFromFile(const char* path, TransformerSpec* spec, SocketPool* socketPool); - static Transformer loadRoot(char* data, TransformerSpec* spec, SocketPool* socketPool); - static Transformer loadSlice(TransformerSpec* spec, Socket* socket); + static Transformer loadRootFromFile(const char* path, TransformerSpec* spec, SocketPool* socketPool, AcceleratorContext* acc); + static Transformer loadRoot(char* data, TransformerSpec* spec, SocketPool* socketPool, AcceleratorContext* acc); + static Transformer loadSlice(TransformerSpec* spec, Socket* socket, AcceleratorContext* acc); private: - Transformer(TransformerSpec* spec, uint8_t sliceIndex); + Transformer(TransformerSpec* spec, slice_index_t sliceIndex, AcceleratorContext* acc); }; #endif diff --git a/src/utils.hpp b/src/utils.hpp index 127e9ea..4323fdc 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -9,8 +9,14 @@ #include #endif -#define NEW_BUFFER(size) (char*)newBuffer(size) -#define FREE_BUFFER(buffer) freeBuffer(buffer) +#define SPLIT_RANGE_TO_THREADS(varStart, varEnd, rangeStart, rangeEnd, nThreads, threadIndex) \ + const unsigned int rangeLen = (rangeEnd - rangeStart); \ + const unsigned int rangeSlice = rangeLen / nThreads; \ + const unsigned int rangeRest = rangeLen % nThreads; \ + const unsigned int varStart = threadIndex * rangeSlice + (threadIndex < rangeRest ? threadIndex : rangeRest); \ + const unsigned int varEnd = varStart + rangeSlice + (threadIndex < rangeRest ? 1 : 0); + +#define DEBUG_FLOATS(name, v, n) printf("⭕ %s ", name); for (int i = 0; i < n; i++) printf("%f ", v[i]); printf("\n"); void* newBuffer(size_t size); void freeBuffer(void* buffer);