b4rtaz · b4rtaz · May 27, 2024 · May 24, 2024 · May 24, 2024 · May 24, 2024
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -12,19 +12,25 @@ jobs:
     runs-on: ${{matrix.os}}
     strategy:
       matrix:
-        os:
-          - ubuntu-latest
-        platforms: 
-          - linux/arm64
-          - linux/amd64
+        os: [windows-latest, ubuntu-latest]
+        include:
+          - os: windows-latest
+            install_dependencies: choco install make
+            shell: cmd
+            platforms: 
+              - windows/amd64
+          - os: ubuntu-latest
+            install_dependencies: sudo apt-get update && sudo apt-get install build-essential
+            shell: bash
+            platforms: 
+              - linux/arm64
+              - linux/amd64
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v3
       - name: Dependencies
         id: dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
+        run: ${{ matrix.install_dependencies }}
       - name: Build
         id: build
         run: |
@@ -44,4 +50,4 @@ jobs:
       - name: llama2-tasks-test
         run: ./llama2-tasks-test
       - name: grok1-tasks-test
-        run: ./grok1-tasks-test
+        run: ./grok1-tasks-test
diff --git a/.github/workflows/scripts/dependencies.cmd b/.github/workflows/scripts/dependencies.cmd
@@ -0,0 +1 @@
+choco install make -y
diff --git a/.github/workflows/scripts/dependencies.sh b/.github/workflows/scripts/dependencies.sh
@@ -0,0 +1,2 @@
+sudo apt-get update
+sudo apt-get install -y build-essential
diff --git a/.gitignore b/.gitignore
@@ -3,12 +3,12 @@
 *.o
 *.dSYM
 *.data
-*.bin
 __pycache__
 
 *-test
 main
-run.sh
+run*.sh
 server
 /dllama
 /dllama-*
+*.exe
diff --git a/Makefile b/Makefile
@@ -1,6 +1,14 @@
 CXX = g++
 CXXFLAGS = -std=c++11 -Werror -O3 -march=native -mtune=native
 
+# Libraries linked into every binary: POSIX threads by default.
+LIBS = -lpthread
+
+# Windows links Winsock instead (or -lpthreadGC2 if a pthread library is needed).
+ifeq ($(OS),Windows_NT)
+    LIBS = -lws2_32
+endif
+
 utils: src/utils.cpp
 	$(CXX) $(CXXFLAGS) -c src/utils.cpp -o utils.o
 quants: src/quants.cpp
@@ -27,16 +35,17 @@ app: src/app.cpp
 	$(CXX) $(CXXFLAGS) -c src/app.cpp -o app.o
 
 dllama: src/apps/dllama/dllama.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app
-	$(CXX) $(CXXFLAGS) src/apps/dllama/dllama.cpp -o dllama utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o -lpthread
+	$(CXX) $(CXXFLAGS) $< -o $@ utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o $(LIBS)
 dllama-api: src/apps/dllama-api/dllama-api.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app
-	$(CXX) $(CXXFLAGS) src/apps/dllama-api/dllama-api.cpp -o dllama-api utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o -lpthread
+	$(CXX) $(CXXFLAGS) $< -o $@ utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o $(LIBS)
+
 funcs-test: src/funcs-test.cpp funcs utils quants
-	$(CXX) $(CXXFLAGS) src/funcs-test.cpp -o funcs-test funcs.o utils.o quants.o -lpthread
+	$(CXX) $(CXXFLAGS) $< -o $@ funcs.o utils.o quants.o $(LIBS)
 quants-test: src/quants.cpp utils quants
-	$(CXX) $(CXXFLAGS) src/quants-test.cpp -o quants-test utils.o quants.o -lpthread
+	$(CXX) $(CXXFLAGS) src/quants-test.cpp -o quants-test utils.o quants.o $(LIBS)
 transformer-test: src/transformer-test.cpp funcs utils quants transformer socket
-	$(CXX) $(CXXFLAGS) src/transformer-test.cpp -o transformer-test funcs.o utils.o quants.o transformer.o socket.o -lpthread
+	$(CXX) $(CXXFLAGS) $< -o $@ funcs.o utils.o quants.o transformer.o socket.o $(LIBS)
 llama2-tasks-test: src/llama2-tasks-test.cpp utils quants funcs socket transformer tasks llama2-tasks tokenizer
-	$(CXX) $(CXXFLAGS) src/llama2-tasks-test.cpp -o llama2-tasks-test utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o tokenizer.o -lpthread
+	$(CXX) $(CXXFLAGS) $< -o $@ utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o tokenizer.o $(LIBS)
 grok1-tasks-test: src/grok1-tasks-test.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks tokenizer
-	$(CXX) $(CXXFLAGS) src/grok1-tasks-test.cpp -o grok1-tasks-test utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o tokenizer.o -lpthread
+	$(CXX) $(CXXFLAGS) $< -o $@ utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o tokenizer.o $(LIBS)
diff --git a/README.md b/README.md
@@ -152,9 +152,11 @@ To add more worker nodes, just add more addresses to the `--workers` argument.
 
 [Share your results](https://github.com/b4rtaz/distributed-llama/discussions)!
 
-## 💻 How to Run on MacOS or Linux
+## 💻 How to Run on macOS, Linux, or Windows
 
-You need to have x86_64 AVX2 CPU or ARM CPU. Different devices may have different CPUs. The below instructions are for Debian-based distributions but you can easily adapt them to your distribution or macOS.
+You need an x86_64 CPU with AVX2 support or an ARM CPU. Different devices may have different CPUs. The instructions below are for Debian-based distributions, but you can easily adapt them to your distribution, macOS, or Windows.
+
+### macOS and Linux
 
 1. Install Git and G++:
 ```sh
@@ -182,6 +184,35 @@ sudo nice -n -20 ./dllama inference --model ../dllama_llama-2-7b_q40.bin --token
 sudo nice -n -20 ./dllama chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
 ```
 
+### Windows
+
+1. Install Git, MinGW, and Make (using Chocolatey):
+  - https://chocolatey.org/install
+```powershell
+choco install git mingw make
+```
+2. Clone this repository:
+```sh
+git clone https://github.com/b4rtaz/distributed-llama.git
+```
+3. Compile Distributed Llama:
+```sh
+make dllama
+```
+4. Transfer weights and the tokenizer file to the root node.
+5. Run worker nodes on worker devices:
+```sh
+./dllama worker --port 9998 --nthreads 4
+```
+6. Run root node on the root device:
+```sh
+./dllama inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
+```
+7. To run the root node in the chat mode:
+```sh
+./dllama chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
+```
+
 [Share your results](https://github.com/b4rtaz/distributed-llama/discussions)!
 
 ## 💡 License

diff --git a/src/app.cpp b/src/app.cpp
@@ -4,6 +4,7 @@
 #include <cstdio>
 #include <cassert>
 #include <stdexcept>
+#include <ctime>
 #include "app.hpp"
 
 FloatType parseFloatType(char* val) {

diff --git a/src/apps/dllama-api/dllama-api.cpp b/src/apps/dllama-api/dllama-api.cpp
@@ -5,10 +5,17 @@
 #include <cassert>
 #include <sstream>
 #include <iostream>
+#include <vector>
+
+#ifdef _WIN32
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#pragma comment(lib, "ws2_32.lib")
+#else
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <unistd.h>
-#include <vector>
+#endif
 
 #include "../../utils.hpp"
 #include "../../socket.hpp"

diff --git a/src/common/pthread.h b/src/common/pthread.h
@@ -0,0 +1,52 @@
+#ifndef PTHREAD_WRAPPER
+#define PTHREAD_WRAPPER
+
+// Minimal thread portability layer: on Windows, pthread_create/pthread_join are
+// emulated on top of Win32 threads; elsewhere the real <pthread.h> is used.
+// dl_thread, thread_ret_t and thread_func_t name the platform's thread handle,
+// thread return type and thread entry-point signature.
+
+#ifdef _WIN32
+#include <errno.h>    // EAGAIN
+#include <windows.h>
+
+typedef HANDLE dl_thread;
+typedef DWORD thread_ret_t;
+typedef DWORD (WINAPI *thread_func_t)(void *);
+
+// Starts `func(arg)` on a new Win32 thread and stores its handle in `*out`.
+// The attribute argument is ignored. Returns 0 on success, or EAGAIN if
+// CreateThread fails (`*out` is left untouched in that case).
+static int pthread_create(dl_thread * out, void * unused, thread_func_t func, void * arg) {
+    (void) unused;
+    dl_thread handle = CreateThread(NULL, 0, func, arg, 0, NULL);
+    if (handle == NULL) {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+// Blocks until `thread` finishes, then closes its handle. The thread's exit
+// value is not reported (the second argument is ignored). Returns 0 on
+// success, or -1 if the wait fails; the handle is not closed in that case.
+static int pthread_join(dl_thread thread, void * unused) {
+    (void) unused;
+    DWORD ret = WaitForSingleObject(thread, INFINITE);
+    if (ret == WAIT_FAILED) {
+        return -1;
+    }
+    CloseHandle(thread);
+    return 0;
+}
+#else
+#include <pthread.h>
+
+typedef pthread_t dl_thread;
+typedef void* thread_ret_t;
+typedef void* (*thread_func_t)(void *);
+
+#endif
+
+#endif  // PTHREAD_WRAPPER
diff --git a/src/funcs.cpp b/src/funcs.cpp
@@ -1,7 +1,7 @@
 #include <cmath>
 #include <cassert>
 #include <cstdio>
-#include <pthread.h>
+#include "common/pthread.h"
 #include "quants.hpp"
 #include "funcs.hpp"
 
@@ -145,7 +145,7 @@ void rmsnorm(float* o, const float* x, const float ms, const float* weight, cons
 }
 
 struct MatmulThreadInfo {
-    pthread_t handler;
+    dl_thread handler;
     float* output;
     const void* input;
     const void* weights;