Skip to content

Commit

Permalink
Implement vector distance using Google's highway library (1yefuwang1#22)
Browse files Browse the repository at this point in the history
* Implement vector distance using highway

* Add benchmark

* Add normalize benchmark

* Implement l2 squared

* Rename to ops

* implement hnswlib::SpaceInterface using vectorlite distance functions

* re-organize cmake files

* Fix path issues

* Run ops benchmark when building wheels

* Add ops readme

* Fix UT on macos ARM

* Fix UT on MSVC

* Fix link issue on Linux
  • Loading branch information
1yefuwang1 authored Aug 16, 2024
1 parent b3c2b6d commit 89d6d7f
Show file tree
Hide file tree
Showing 45 changed files with 967 additions and 119 deletions.
6 changes: 5 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}/build/dev"
]
],
"files.associations": {
".fantomasignore": "ignore",
"string_view": "cpp"
}
}
55 changes: 2 additions & 53 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,15 @@ endif(WIN32)

project(vectorlite VERSION 0.1.0 LANGUAGES CXX)

configure_file(src/version.h.in version.h)
message(STATUS "version.h generated to: ${PROJECT_BINARY_DIR}")

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(OPTION_USE_AVX OFF)


find_package(absl CONFIG REQUIRED)
find_package(RapidJSON CONFIG REQUIRED)
find_package(unofficial-sqlite3 CONFIG REQUIRED)
find_package(hnswlib CONFIG REQUIRED)
find_package(GTest CONFIG REQUIRED)
find_package(benchmark CONFIG REQUIRED)

find_package(re2 CONFIG REQUIRED)

Expand All @@ -28,51 +24,4 @@ message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIRS}")
find_path(HNSWLIB_INCLUDE_DIRS hnswlib/hnswlib.h)
message(STATUS "HNSWLIB include dir: ${HNSWLIB_INCLUDE_DIRS}")

message(STATUS "Compiling on ${CMAKE_SYSTEM_PROCESSOR}")

# compile with avx for x86_64 and x86. Though SSE would be a safer default.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set(OPTION_USE_AVX ON)
endif ()

add_library(vectorlite SHARED src/vectorlite.cpp src/virtual_table.cpp src/vector.cpp src/vector_view.cpp src/util.cpp src/vector_space.cpp src/index_options.cpp src/sqlite_functions.cpp src/constraint.cpp)
# remove the lib prefix to make the shared library name consistent on all platforms.
set_target_properties(vectorlite PROPERTIES PREFIX "")
target_include_directories(vectorlite PUBLIC ${RAPIDJSON_INCLUDE_DIRS} ${HNSWLIB_INCLUDE_DIRS} ${PROJECT_BINARY_DIR})
target_link_libraries(vectorlite PRIVATE unofficial::sqlite3::sqlite3 absl::status absl::statusor absl::strings re2::re2)
# copy the shared library to the python package to make running integration tests easier
add_custom_command(TARGET vectorlite POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:vectorlite> ${PROJECT_SOURCE_DIR}/bindings/python/vectorlite_py/$<TARGET_FILE_NAME:vectorlite>)

include(GoogleTest)
enable_testing()
file(GLOB TEST_SOURCES src/*.cpp)
add_executable(unit-test ${TEST_SOURCES})
target_include_directories(unit-test PUBLIC ${PROJECT_BINARY_DIR})
target_link_libraries(unit-test PRIVATE GTest::gtest GTest::gtest_main unofficial::sqlite3::sqlite3 absl::status absl::statusor absl::strings re2::re2)
# target_compile_options(unit-test PRIVATE -Wall -fno-omit-frame-pointer -g -O0)
# target_link_options(unit-test PRIVATE -fsanitize=address)
if(OPTION_USE_AVX)
message(STATUS "AVX enabled")
if (MSVC)
target_compile_options(vectorlite PRIVATE /arch:AVX)
target_compile_options(unit-test PRIVATE /arch:AVX)
else()
target_compile_options(vectorlite PRIVATE -mavx)
target_compile_options(unit-test PRIVATE -mavx)
endif()
endif(OPTION_USE_AVX)

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_libraries(vectorlite PRIVATE absl::log)
target_link_libraries(unit-test PRIVATE absl::log)
endif()

# TODO: For mysterious reason, absl::log symbols are required for even release build on MSVC. Only DLOG are used which should be guarded by NDEBUG and not included in Release build.
if(MSVC)
target_link_libraries(vectorlite PRIVATE absl::log)
target_link_libraries(unit-test PRIVATE absl::log)
endif()

gtest_discover_tests(unit-test)

add_test(NAME unit-test COMMAND unit-test)
add_subdirectory(vectorlite)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ sh build.sh # for debug build
sh build_release.sh # for release build

```
`vectorlite.[so|dll|dylib]` can be found in the `build/release` or `build/dev` folder
`vectorlite.[so|dll|dylib]` can be found in the `build/release/vectorlite` or `build/dev/vectorlite` folder

### Build wheel

Expand Down
11 changes: 7 additions & 4 deletions benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,21 @@ def timeit(func):
end_us = time.perf_counter_ns() / 1000
return end_us - start_us, retval

vectorlite_path = os.environ.get("VECTORLITE_PATH", vectorlite_py.vectorlite_path())

if vectorlite_path != vectorlite_py.vectorlite_path():
print(f"Using local vectorlite: {vectorlite_path}")

conn = apsw.Connection(":memory:")
conn.enable_load_extension(True) # enable extension loading
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite
# conn.load_extension('build/release/vectorlite') # loads vectorlite
conn.load_extension(vectorlite_path) # loads vectorlite

cursor = conn.cursor()

NUM_ELEMENTS = 5000 # number of vectors, higher number
NUM_ELEMENTS = 5000 # number of vectors
NUM_QUERIES = 100 # number of queries

DIMS = [128, 512, 1536]
DIMS = [128, 512, 1536, 3000]
data = {dim: np.float32(np.random.random((NUM_ELEMENTS, dim))) for dim in DIMS}
data_bytes = {dim: [data[dim][i].tobytes() for i in range(NUM_ELEMENTS)] for dim in DIMS}

Expand Down
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cmake --preset dev && cmake --build build/dev -j8 && ctest --test-dir build/dev --output-on-failure && pytest bindings/python/vectorlite_py/test
cmake --preset dev && cmake --build build/dev -j8 && ctest --test-dir build/dev/vectorlite --output-on-failure && pytest bindings/python/vectorlite_py/test
2 changes: 1 addition & 1 deletion build_release.sh
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cmake --preset release && cmake --build build/release -j8 && ctest --test-dir build/release --output-on-failure && pytest bindings/python/vectorlite_py/test
cmake --preset release && cmake --build build/release -j8 && ctest --test-dir build/release/vectorlite --output-on-failure && pytest bindings/python/vectorlite_py/test
9 changes: 8 additions & 1 deletion examples/hnsw_param.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import vectorlite_py
import apsw
import numpy as np
import os
"""
This is an example of setting HNSW parameters in vectorlite.
"""


vectorlite_path = os.environ.get("VECTORLITE_PATH", vectorlite_py.vectorlite_path())

if vectorlite_path != vectorlite_py.vectorlite_path():
print(f"Using local vectorlite: {vectorlite_path}")

conn = apsw.Connection(':memory:')
conn.enable_load_extension(True) # enable extension loading
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite
conn.load_extension(vectorlite_path) # loads vectorlite

cursor = conn.cursor()

Expand Down
12 changes: 7 additions & 5 deletions examples/index_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
import numpy as np
import os
import timeit
import vectorlite_py
"""
This is an example of using vectorlite to search vectors and serialize/deserialize the index.
"""

use_apsw = os.environ.get('USE_BUILTIN_SQLITE3', '0') == '0'
use_local_vectorlite = os.environ.get('USE_LOCAL_VECTORLITE')
vectorlite_path = os.environ.get("VECTORLITE_PATH", vectorlite_py.vectorlite_path())

if vectorlite_path != vectorlite_py.vectorlite_path():
print(f"Using local vectorlite: {vectorlite_path}")

if not use_local_vectorlite:
import vectorlite_py
use_apsw = os.environ.get('USE_BUILTIN_SQLITE3', '0') == '0'

DIM = 1000
NUM_ELEMENTS = 10000
Expand All @@ -24,7 +26,7 @@ def create_connection():
# create connection to in-memory database
conn = apsw.Connection(':memory:') if use_apsw else sqlite3.connect(':memory:')
conn.enable_load_extension(True)
conn.load_extension(use_local_vectorlite if use_local_vectorlite else vectorlite_py.vectorlite_path())
conn.load_extension(vectorlite_path)
return conn

conn = create_connection()
Expand Down
8 changes: 7 additions & 1 deletion examples/metadata_filter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import vectorlite_py
import numpy as np
import apsw
import os

"""
A contrived example of using vectorlite to search vectors with metadata filter.
Expand All @@ -10,9 +11,14 @@
"""

vectorlite_path = os.environ.get("VECTORLITE_PATH", vectorlite_py.vectorlite_path())

if vectorlite_path != vectorlite_py.vectorlite_path():
print(f"Using local vectorlite: {vectorlite_path}")

conn = apsw.Connection(':memory:')
conn.enable_load_extension(True) # enable extension loading
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite
conn.load_extension(vectorlite_path) # loads vectorlite

cursor = conn.cursor()

Expand Down
9 changes: 8 additions & 1 deletion examples/quickstart.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import vectorlite_py
import apsw
import numpy as np
import os
"""
Quick start of using vectorlite extension.
"""


vectorlite_path = os.environ.get("VECTORLITE_PATH", vectorlite_py.vectorlite_path())

if vectorlite_path != vectorlite_py.vectorlite_path():
print(f"Using local vectorlite: {vectorlite_path}")

conn = apsw.Connection(':memory:')
conn.enable_load_extension(True) # enable extension loading
conn.load_extension(vectorlite_py.vectorlite_path()) # load vectorlite
conn.load_extension(vectorlite_path) # load vectorlite

cursor = conn.cursor()
# check if vectorlite is loaded
Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,15 @@ def build_extension(self, ext: CMakeExtension) -> None:

subprocess.run([cmake_path, '--build', os.path.join('build', 'release'), '-j8'], check=True)
print(f'Running unit tests')
subprocess.run([ctest_path, '--test-dir', os.path.join('build', 'release'), '--rerun-failed', '--output-on-failure'], check=True)
subprocess.run([ctest_path, '--test-dir', os.path.join('build', 'release', 'vectorlite'), '--rerun-failed', '--output-on-failure'], check=True)
print(f'Running benchmark')
subprocess.run([os.path.join('build', 'release', 'vectorlite', 'ops', 'ops_benchmark')], check=True)

class CMakeInstallLib(install_lib):
def run(self):
install_to = Path(self.build_dir, PACKAGE_NAME)
print(f'Install lib to {install_to}')
lib = Path('build', 'release', get_lib_name())
lib = Path('build', 'release', 'vectorlite', get_lib_name())
if not lib.exists():
raise FileNotFoundError(f'Build output not found: {lib}')

Expand Down
2 changes: 1 addition & 1 deletion vcpkg
Submodule vcpkg updated 95 files
+7 −0 ports/angle/portfile.cmake
+1 −1 ports/angle/vcpkg.json
+22 −0 ports/brpc/fix-compilation-error.patch
+2 −1 ports/brpc/portfile.cmake
+1 −2 ports/brpc/vcpkg.json
+0 −39 ports/ecm/fix_canberra.patch
+0 −23 ports/ecm/fix_libmount.patch
+5 −6 ports/ecm/portfile.cmake
+2 −2 ports/ecm/usage
+5 −1 ports/ecm/vcpkg.json
+1 −1 ports/fbthrift/portfile.cmake
+1 −1 ports/fbthrift/vcpkg.json
+1 −1 ports/fizz/portfile.cmake
+1 −2 ports/fizz/vcpkg.json
+1 −8 ports/folly/portfile.cmake
+1 −2 ports/folly/vcpkg.json
+2 −2 ports/functionalplus/portfile.cmake
+1 −1 ports/functionalplus/vcpkg.json
+13 −0 ports/hello-imgui/fix-upw.patch
+4 −2 ports/hello-imgui/portfile.cmake
+1 −2 ports/hello-imgui/vcpkg.json
+2 −2 ports/imageinfo/portfile.cmake
+1 −1 ports/imageinfo/vcpkg.json
+3 −3 ports/imgui/portfile.cmake
+1 −2 ports/imgui/vcpkg.json
+6 −1 ports/kddockwidgets/portfile.cmake
+1 −1 ports/kddockwidgets/vcpkg.json
+1 −1 ports/libcoro/portfile.cmake
+1 −1 ports/libcoro/vcpkg.json
+63 −0 ports/libdatachannel/dependencies.diff
+0 −30 ports/libdatachannel/fix-for-vcpkg.patch
+0 −40 ports/libdatachannel/fix_dependency.patch
+0 −13 ports/libdatachannel/fix_srtp.patch
+22 −0 ports/libdatachannel/library-linkage.diff
+9 −30 ports/libdatachannel/portfile.cmake
+10 −9 ports/libdatachannel/uwp-warnings.patch
+1 −1 ports/libdatachannel/vcpkg.json
+0 −21 ports/libdeflate/fix_gcc.patch
+1 −2 ports/libdeflate/portfile.cmake
+1 −2 ports/libdeflate/vcpkg.json
+7 −4 ports/libmysql/install-exports.patch
+6 −3 ports/libmysql/portfile.cmake
+1 −2 ports/libmysql/vcpkg.json
+1 −1 ports/lightgbm/portfile.cmake
+7 −0 ports/lightgbm/vcpkg.json
+7 −10 ports/lunasvg/fix-cmake.patch
+1 −1 ports/lunasvg/portfile.cmake
+1 −1 ports/lunasvg/vcpkg.json
+7 −0 ports/microsoft-signalr/portfile.cmake
+1 −1 ports/microsoft-signalr/vcpkg.json
+1 −1 ports/mvfst/portfile.cmake
+1 −1 ports/mvfst/vcpkg.json
+2 −2 ports/openvino/001-disable-tools.patch
+0 −53 ports/openvino/002-api-validator.patch
+13 −0 ports/openvino/002-fix-onnx-codegen.patch
+12 −11 ports/openvino/portfile.cmake
+1 −2 ports/openvino/vcpkg.json
+1 −1 ports/perfetto/portfile.cmake
+1 −1 ports/perfetto/vcpkg.json
+1 −1 ports/proxygen/portfile.cmake
+1 −1 ports/proxygen/vcpkg.json
+1 −1 ports/uwebsockets/portfile.cmake
+1 −1 ports/uwebsockets/vcpkg.json
+1 −1 ports/wangle/portfile.cmake
+1 −1 ports/wangle/vcpkg.json
+3 −0 ports/zeromq/portfile.cmake
+1 −1 ports/zeromq/vcpkg.json
+10 −1 scripts/ci.baseline.txt
+1 −0 scripts/test_ports/vcpkg-ci-curl/vcpkg.json
+5 −0 versions/a-/angle.json
+5 −0 versions/b-/brpc.json
+33 −33 versions/baseline.json
+5 −0 versions/e-/ecm.json
+5 −0 versions/f-/fbthrift.json
+5 −0 versions/f-/fizz.json
+5 −0 versions/f-/folly.json
+5 −0 versions/f-/functionalplus.json
+5 −0 versions/h-/hello-imgui.json
+5 −0 versions/i-/imageinfo.json
+5 −0 versions/i-/imgui.json
+5 −0 versions/k-/kddockwidgets.json
+5 −0 versions/l-/libcoro.json
+5 −0 versions/l-/libdatachannel.json
+5 −0 versions/l-/libdeflate.json
+5 −0 versions/l-/libmysql.json
+5 −0 versions/l-/lightgbm.json
+5 −0 versions/l-/lunasvg.json
+5 −0 versions/m-/microsoft-signalr.json
+5 −0 versions/m-/mvfst.json
+5 −0 versions/o-/openvino.json
+5 −0 versions/p-/perfetto.json
+5 −0 versions/p-/proxygen.json
+5 −0 versions/u-/uwebsockets.json
+5 −0 versions/w-/wangle.json
+5 −0 versions/z-/zeromq.json
41 changes: 26 additions & 15 deletions vcpkg.json
Original file line number Diff line number Diff line change
@@ -1,36 +1,47 @@
{
"name": "vectorlite",
"version-string": "0.1.0",
"license": "MIT",
"description": "SQLite extension for fast vector search",
"license": "MIT",
"builtin-baseline": "67cc1677c3bf5c23ea14b9d2416c7422fdeac492",
"dependencies": [
{
"name": "sqlite3",
"version>=": "3.43.1"
"name": "abseil",
"features": [
"cxx17"
],
"version>=": "20240116.2"
},
{
"name": "rapidjson",
"version>=": "2023-07-17"
"name": "gtest",
"version>=": "1.14.0"
},
{
"name": "hnswlib",
"version>=": "0.8.0"
},
{
"name": "gtest",
"version>=": "1.14.0"
"name": "rapidjson",
"version>=": "2023-07-17"
},
{
"name": "abseil",
"name": "re2",
"version>=": "2023-07-01"
},
{
"name": "sqlite3",
"version>=": "3.43.1"
},
{
"name": "highway",
"features": [
"cxx17"
"contrib"
],
"version>=": "20240116.2"
"version>=": "1.2.0"
},
{
"name": "re2",
"version>=": "2023-07-01"
"name": "benchmark",
"version>=": "1.8.5"
}
],
"builtin-baseline": "67cc1677c3bf5c23ea14b9d2416c7422fdeac492"
}
]
}
47 changes: 47 additions & 0 deletions vectorlite/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Build rules for the vectorlite shared library (the loadable SQLite extension)
# and for unit_test, the GoogleTest executable built from the same sources.

# configure_file() resolves a relative output path against
# CMAKE_CURRENT_BINARY_DIR, so report the location the header is written to.
configure_file(version.h.in version.h)
message(STATUS "version.h generated to: ${CMAKE_CURRENT_BINARY_DIR}/version.h")

# RapidJSON and hnswlib are consumed through their include directories only.
find_path(RAPIDJSON_INCLUDE_DIRS rapidjson/rapidjson.h)
message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIRS}")
find_path(HNSWLIB_INCLUDE_DIRS hnswlib/hnswlib.h)
message(STATUS "HNSWLIB include dir: ${HNSWLIB_INCLUDE_DIRS}")

message(STATUS "Compiling on ${CMAKE_SYSTEM_PROCESSOR}")

# Defines the `ops` target that both targets below link against.
add_subdirectory(ops)

add_library(vectorlite SHARED
  vectorlite.cpp
  virtual_table.cpp
  vector.cpp
  vector_view.cpp
  util.cpp
  vector_space.cpp
  index_options.cpp
  sqlite_functions.cpp
  constraint.cpp
)
# remove the lib prefix to make the shared library name consistent on all platforms.
set_target_properties(vectorlite PROPERTIES PREFIX "")
# PROJECT_BINARY_DIR is kept for existing include paths; CMAKE_CURRENT_BINARY_DIR
# is the directory that holds the generated version.h.
target_include_directories(vectorlite PUBLIC
  "${RAPIDJSON_INCLUDE_DIRS}"
  "${HNSWLIB_INCLUDE_DIRS}"
  "${PROJECT_BINARY_DIR}"
  "${CMAKE_CURRENT_BINARY_DIR}"
)
target_link_libraries(vectorlite PRIVATE
  unofficial::sqlite3::sqlite3
  absl::status
  absl::statusor
  absl::strings
  re2::re2
  ops
)
# copy the shared library to the python package to make running integration tests easier
add_custom_command(
  TARGET vectorlite POST_BUILD
  COMMAND "${CMAKE_COMMAND}" -E copy_if_different
          "$<TARGET_FILE:vectorlite>"
          "${PROJECT_SOURCE_DIR}/bindings/python/vectorlite_py/$<TARGET_FILE_NAME:vectorlite>"
  VERBATIM
)

include(GoogleTest)
enable_testing()
# The test binary compiles every .cpp file in this directory. CONFIGURE_DEPENDS
# makes the build re-run the glob so newly added files are picked up.
file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
add_executable(unit_test ${TEST_SOURCES})
target_include_directories(unit_test PUBLIC
  "${PROJECT_BINARY_DIR}"
  "${CMAKE_CURRENT_BINARY_DIR}"
)
target_link_libraries(unit_test PRIVATE
  GTest::gtest
  GTest::gtest_main
  unofficial::sqlite3::sqlite3
  absl::status
  absl::statusor
  absl::strings
  re2::re2
  ops
)
# Optional local debugging flags (disabled):
# target_compile_options(unit_test PRIVATE -Wall -fno-omit-frame-pointer -g -O0)
# target_link_options(unit_test PRIVATE -fsanitize=address)

# absl::log is linked for Debug configurations. A generator expression is used
# instead of testing CMAKE_BUILD_TYPE, which is empty under multi-config
# generators such as Visual Studio.
target_link_libraries(vectorlite PRIVATE "$<$<CONFIG:Debug>:absl::log>")
target_link_libraries(unit_test PRIVATE "$<$<CONFIG:Debug>:absl::log>")

if(MSVC)
  # NOTE(review): /arch:AVX is applied to every MSVC build regardless of the
  # target architecture - confirm this is intended for non-x86 targets.
  target_compile_options(vectorlite PRIVATE /arch:AVX)
  target_compile_options(unit_test PRIVATE /arch:AVX)

  # TODO: For some unknown reason, absl::log symbols are required even for
  # release builds on MSVC. Only DLOG is used, which should be guarded by NDEBUG
  # and not included in release builds.
  target_link_libraries(vectorlite PRIVATE absl::log)
  target_link_libraries(unit_test PRIVATE absl::log)
endif()

# Register each GoogleTest case individually with CTest ...
gtest_discover_tests(unit_test)

# ... and also register the whole binary as a single CTest test.
# NOTE(review): together with gtest_discover_tests this runs the suite twice
# per ctest invocation - confirm the aggregate entry is still wanted.
add_test(NAME unit_test COMMAND unit_test)
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 89d6d7f

Please sign in to comment.