Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor/online repacking #10446

Merged
merged 9 commits into from
Dec 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
MK_CFLAGS += -march=native -mtune=native
HOST_CXXFLAGS += -march=native -mtune=native

# Usage AMX build test
#MK_CFLAGS += -march=graniterapids -mtune=graniterapids
#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids

# Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx
Expand Down Expand Up @@ -948,17 +952,18 @@ DIR_COMMON = common

OBJ_GGML = \
$(DIR_GGML)/src/ggml.o \
$(DIR_GGML)/src/ggml-aarch64.o \
$(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \
$(DIR_GGML)/src/ggml-opt.o \
$(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
$(OBJ_GGML_EXT)

OBJ_LLAMA = \
Expand Down Expand Up @@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target
all: $(BUILD_TARGETS)

# force c++ build for source file that have same name as c file
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@

# Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
Expand Down
5 changes: 3 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ var sources = [
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend-reg.cpp",
"ggml/src/ggml-cpu/ggml-cpu.c",
"ggml/src/ggml-cpu/ggml-cpu.cpp",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp",
"ggml/src/ggml-cpu/ggml-cpu-hbm.cpp",
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-cpu/ggml-cpu-traits.cpp",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
]
Expand Down
2 changes: 1 addition & 1 deletion docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ cmake --build build --config Release
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.

## BLAS Build

Expand Down
2 changes: 0 additions & 2 deletions examples/quantize/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.

*(outdated)*

| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
Expand Down
3 changes: 0 additions & 3 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
Expand Down
17 changes: 0 additions & 17 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,24 +103,14 @@ extern "C" {

// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);

struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};

GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
Expand All @@ -140,13 +130,6 @@ extern "C" {

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);

#ifdef __cplusplus
}
#endif
29 changes: 17 additions & 12 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,15 +384,15 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// GGML_TYPE_Q4_0_4_8 = 32,
// GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT = 39,
};

// precision
Expand Down Expand Up @@ -433,9 +433,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};

// available tensor operations:
Expand Down Expand Up @@ -2205,11 +2202,19 @@ extern "C" {
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#ifdef __cplusplus
// restrict not standard in C++
# if defined(__GNUC__)
# define GGML_RESTRICT __restrict__
# elif defined(__clang__)
# define GGML_RESTRICT __restrict
# elif defined(_MSC_VER)
# define GGML_RESTRICT __restrict
# else
# define GGML_RESTRICT
# endif
#else
#define GGML_RESTRICT restrict
# define GGML_RESTRICT restrict
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
Expand Down
4 changes: 1 addition & 3 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,7 @@ add_library(ggml-base
ggml-threading.cpp
ggml-threading.h
ggml-quants.c
ggml-quants.h
ggml-aarch64.c
ggml-aarch64.h)
ggml-quants.h)

target_include_directories(ggml-base PRIVATE .)

Expand Down
129 changes: 0 additions & 129 deletions ggml/src/ggml-aarch64.c

This file was deleted.

19 changes: 0 additions & 19 deletions ggml/src/ggml-aarch64.h

This file was deleted.

2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cann_reg_get_device,
/* .get_device = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
};

Expand Down
Loading
Loading