Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid copy by using VectorView and update benchmark.py for sqlite-vec #21

Merged
merged 5 commits into from
Aug 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set(OPTION_USE_AVX ON)
endif ()

add_library(vectorlite SHARED src/vectorlite.cpp src/virtual_table.cpp src/vector.cpp src/util.cpp src/vector_space.cpp src/index_options.cpp src/sqlite_functions.cpp src/constraint.cpp)
add_library(vectorlite SHARED src/vectorlite.cpp src/virtual_table.cpp src/vector.cpp src/vector_view.cpp src/util.cpp src/vector_space.cpp src/index_options.cpp src/sqlite_functions.cpp src/constraint.cpp)
# remove the lib prefix to make the shared library name consistent on all platforms.
set_target_properties(vectorlite PROPERTIES PREFIX "")
target_include_directories(vectorlite PUBLIC ${RAPIDJSON_INCLUDE_DIRS} ${HNSWLIB_INCLUDE_DIRS} ${PROJECT_BINARY_DIR})
Expand Down
211 changes: 161 additions & 50 deletions benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import time
from typing import Literal
from typing import Literal, Optional, List
import numpy as np
import vectorlite_py
import apsw
Expand All @@ -15,12 +15,10 @@
"""




# Roll our own timeit function to measure time in us and get return value of the func.
# Why Python's built-in timeit.timeit is not used:
# 1. it includes unnecessary overheads, because compiles the code passed to it
# 2. func's return value is cannot be obtained directly
# 2. func's return value cannot be obtained directly
def timeit(func):
start_us = time.perf_counter_ns() / 1000
retval = func()
Expand All @@ -31,25 +29,28 @@ def timeit(func):
conn = apsw.Connection(":memory:")
conn.enable_load_extension(True) # enable extension loading
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite
# conn.load_extension('build/release/vectorlite') # loads vectorlite

cursor = conn.cursor()

NUM_ELEMENTS = 1000 # number of vectors
NUM_ELEMENTS = 5000 # number of vectors, higher number
NUM_QUERIES = 100 # number of queries

DIMS = [256, 1024]
DIMS = [128, 512, 1536]
data = {dim: np.float32(np.random.random((NUM_ELEMENTS, dim))) for dim in DIMS}
data_bytes = {dim: [data[dim][i].tobytes() for i in range(NUM_ELEMENTS)] for dim in DIMS}

query_data = {dim: np.float32(np.random.random((NUM_QUERIES, dim))) for dim in DIMS}
query_data_bytes = {dim: [query_data[dim][i].tobytes() for i in range(NUM_QUERIES)] for dim in DIMS}

# search for k nearest neighbors in this benchmark
k = 10

# (ef_construction, M)
hnsw_params = [(200, 32), (200, 48), (200, 64)]
hnsw_params = [(100, 30)]

# ef_search
efs = [10, 50, 100, 150]
efs = [10, 50, 100]


# 'ip'(inner product) is not tested as it is not an actual metric that measures the distance between two vectors
Expand All @@ -69,6 +70,7 @@ def timeit(func):
correct_labels[distance_type][dim] = labels
del bf_index

console = Console()

@dataclasses.dataclass
class BenchmarkResult:
Expand All @@ -84,7 +86,7 @@ class BenchmarkResult:

@dataclasses.dataclass
class ResultTable:
results: list[BenchmarkResult]
results: List[BenchmarkResult]

def __rich_console__(
self, console: Console, options: ConsoleOptions
Expand Down Expand Up @@ -126,7 +128,7 @@ def benchmark(distance_type, dim, ef_constructoin, M):
insert_time_us, _ = timeit(
lambda: cursor.executemany(
f"insert into {table_name}(rowid, embedding) values (?, ?)",
[(i, data[dim][i].tobytes()) for i in range(NUM_ELEMENTS)],
[(i, data_bytes[dim][i]) for i in range(NUM_ELEMENTS)],
)
)
result.insert_time_us = insert_time_us / NUM_ELEMENTS
Expand All @@ -139,12 +141,13 @@ def search():
result.append(
cursor.execute(
f"select rowid from {table_name} where knn_search(embedding, knn_param(?, ?, ?))",
(query_data[dim][i].tobytes(), k, ef),
(query_data_bytes[dim][i], k, ef),
).fetchall()
)
return result

search_time_us, results = timeit(search)
# console.log(results)
recall_rate = np.mean(
[
np.intersect1d(results[i], correct_labels[distance_type][dim][i]).size
Expand All @@ -166,52 +169,106 @@ def search():
for ef_construction, M in hnsw_params:
benchmark(distance_type, dim, ef_construction, M)

console = Console()

result_table = ResultTable(benchmark_results)
console.print(result_table)


@dataclasses.dataclass
class BruteForceBenchmarkResult:
    """Metrics from one brute-force (exact search) benchmark run."""

    # vector dimension this run was measured at
    dim: int
    insert_time_us: float  # in micro seconds, per vector
    search_time_us: float  # in micro seconds, per query
    recall_rate: float  # fraction in [0, 1] of true k-NN recovered (not a time)


@dataclasses.dataclass
class BruteForceResultTable:
    """Renders a list of BruteForceBenchmarkResult rows as a rich table."""

    results: List[BruteForceBenchmarkResult]

    def __rich_console__(
        self, console: Console, options: ConsoleOptions
    ) -> RenderResult:
        # Same column set as the HNSW table, minus the index parameters.
        table = Table()
        for heading in (
            "vector dimension",
            "insert_time(per vector)",
            "search_time(per query)",
            "recall_rate",
        ):
            table.add_column(heading)
        for row in self.results:
            table.add_row(
                str(row.dim),
                f"{row.insert_time_us:.2f} us",
                f"{row.search_time_us:.2f} us",
                f"{row.recall_rate * 100:.2f}%",
            )
        yield table


# Accumulates one BruteForceBenchmarkResult per dimension in DIMS.
brute_force_benchmark_results = []

# Fixed typo in the user-facing message ("Bencharmk" -> "Benchmark").
console.print("Benchmark brute force as comparison.")

def benchmark_brute_force(dim: int):
    """Benchmark exact k-NN search over a plain SQLite table for one dimension.

    Inserts NUM_ELEMENTS pre-serialized vectors, runs NUM_QUERIES exact
    searches via vector_distance(), and appends a BruteForceBenchmarkResult
    (per-vector insert time, per-query search time, recall vs. the hnswlib
    ground truth) to brute_force_benchmark_results.
    """
    benchmark_result = BruteForceBenchmarkResult(dim, 0, 0, 0)
    table_name = f"table_vectorlite_bf_{dim}"
    cursor.execute(
        f"create table {table_name}(rowid integer primary key, embedding blob)"
    )

    # Measure total insert time, then amortize per vector below.
    # enumerate() pairs each rowid with its serialized vector directly.
    insert_time_us, _ = timeit(
        lambda: cursor.executemany(
            f"insert into {table_name}(rowid, embedding) values (?, ?)",
            list(enumerate(data_bytes[dim])),
        )
    )
    benchmark_result.insert_time_us = insert_time_us / NUM_ELEMENTS

    def search():
        # Exact k-NN: full scan ordered by L2 distance, keep top k rowids.
        result = []
        for i in range(NUM_QUERIES):
            result.append(
                cursor.execute(
                    f"select rowid from {table_name} order by vector_distance(?, embedding, 'l2') asc limit {k}",
                    [query_data_bytes[dim][i]],
                ).fetchall()
            )
        return result

    search_time_us, results = timeit(search)
    benchmark_result.search_time_us = search_time_us / NUM_QUERIES
    # Recall against the hnswlib brute-force labels computed up front.
    recall_rate = np.mean(
        [
            np.intersect1d(results[i], correct_labels["l2"][dim][i]).size / k
            for i in range(NUM_QUERIES)
        ]
    )
    benchmark_result.recall_rate = recall_rate
    brute_force_benchmark_results.append(benchmark_result)

# Run the brute-force benchmark for every dimension and print the summary table.
for dim in DIMS:
    benchmark_brute_force(dim)
brute_force_table = BruteForceResultTable(brute_force_benchmark_results)
console.print(brute_force_table)


# Benchmark sqlite_vss as comparison.
# pip install sqlite-vss
import platform
benchmark_vss = os.environ.get('BENCHMARK_VSS', '0') != '0'
if benchmark_vss and platform.system().lower() == "linux":

benchmark_vss = os.environ.get("BENCHMARK_VSS", "0") != "0"
if benchmark_vss and (platform.system().lower() == "linux" or platform.system().lower() == "darwin"):
# note sqlite_vss is not self-contained.
# Need to install dependencies manually using: sudo apt-get install -y libgomp1 libatlas-base-dev liblapack-dev
console.print("Bencharmk sqlite_vss as comparison.")
import sqlite_vss
sqlite_vss.load(conn)

@dataclasses.dataclass
class VssBenchmarkResult:
dim: int
insert_time_us: float # in micro seconds, per vector
search_time_us: float # in micro seconds, per query
recall_rate: float # in micro seconds

@dataclasses.dataclass
class VssResultTable:
results: list[VssBenchmarkResult]

def __rich_console__(
self, console: Console, options: ConsoleOptions
) -> RenderResult:
table = Table()
table.add_column("vector dimension")
table.add_column("insert_time(per vector)")
table.add_column("search_time(per query)")
table.add_column("recall_rate")
for result in self.results:
table.add_row(
str(result.dim),
f"{result.insert_time_us:.2f} us",
f"{result.search_time_us:.2f} us",
f"{result.recall_rate * 100:.2f}%",
)
yield table
sqlite_vss.load(conn)

vss_benchmark_results = []

def benchmark_sqlite_vss(dim: int):
benchmark_result = VssBenchmarkResult(dim, 0, 0, 0)
benchmark_result = BruteForceBenchmarkResult(dim, 0, 0, 0)
table_name = f"table_vss_{dim}"
cursor.execute(
f"create virtual table {table_name} using vss0(embedding({dim}))"
Expand All @@ -221,7 +278,7 @@ def benchmark_sqlite_vss(dim: int):
insert_time_us, _ = timeit(
lambda: cursor.executemany(
f"insert into {table_name}(rowid, embedding) values (?, ?)",
[(i, data[dim][i].tobytes()) for i in range(NUM_ELEMENTS)],
[(i, data_bytes[dim][i]) for i in range(NUM_ELEMENTS)],
)
)
benchmark_result.insert_time_us = insert_time_us / NUM_ELEMENTS
Expand All @@ -232,17 +289,16 @@ def search():
result.append(
cursor.execute(
f"select rowid from {table_name} where vss_search(embedding, ?) limit {k}",
(query_data[dim][i].tobytes(),),
(query_data_bytes[dim][i],),
).fetchall()
)
return result

search_time_us, results = timeit(search)
benchmark_result.search_time_us = search_time_us / NUM_QUERIES
recall_rate = np.mean(
[
np.intersect1d(results[i], correct_labels["cosine"][dim][i]).size
/ k
np.intersect1d(results[i], correct_labels["l2"][dim][i]).size / k
for i in range(NUM_QUERIES)
]
)
Expand All @@ -252,5 +308,60 @@ def search():
for dim in DIMS:
benchmark_sqlite_vss(dim)

vss_result_table = VssResultTable(vss_benchmark_results)
console.print(vss_result_table)
vss_result_table = BruteForceResultTable(vss_benchmark_results)
console.print(vss_result_table)

# benchmark sqlite-vec
# pip install sqlite-vec
benchmark_sqlite_vec = os.environ.get("BENCHMARK_SQLITE_VEC", "0") != "0"
if benchmark_sqlite_vec and (platform.system().lower() == "linux" or platform.system().lower() == "darwin"):
# BruteForceBenchmarkResult and BruteForceResultTable are reused for the sqlite-vec results
vec_benchmark_results = []
console.print("Bencharmk sqlite_vec as comparison.")
import sqlite_vec

conn.load_extension(sqlite_vec.loadable_path())

# NOTE(review): this function name shadows the `benchmark_sqlite_vec` bool
# flag read above; harmless here because the flag is only read before this
# definition, but worth renaming eventually.
def benchmark_sqlite_vec(dim: int):
    """Benchmark sqlite-vec's vec0 virtual table for a single dimension.

    Appends a BruteForceBenchmarkResult (insert time per vector, search time
    per query, recall vs. the l2 ground truth) to vec_benchmark_results.
    """
    result = BruteForceBenchmarkResult(dim, 0, 0, 0)
    table_name = f"table_vec_{dim}"
    cursor.execute(
        f"create virtual table {table_name} using vec0(rowid integer primary key, embedding float[{dim}])"
    )

    # measure insert time
    vectors = data_bytes[dim]
    insert_time_us, _ = timeit(
        lambda: cursor.executemany(
            f"insert into {table_name}(rowid, embedding) values (?, ?)",
            [(rowid, blob) for rowid, blob in enumerate(vectors)],
        )
    )
    result.insert_time_us = insert_time_us / NUM_ELEMENTS

    def search():
        # k-NN via sqlite-vec's MATCH syntax with an explicit k constraint.
        found = []
        for query in query_data_bytes[dim][:NUM_QUERIES]:
            rows = cursor.execute(
                f"select rowid from {table_name} where embedding match ? and k = {k}",
                (query,),
            ).fetchall()
            found.append(rows)
        return found

    search_time_us, results = timeit(search)
    result.search_time_us = search_time_us / NUM_QUERIES
    per_query_recall = [
        np.intersect1d(results[i], correct_labels["l2"][dim][i]).size / k
        for i in range(NUM_QUERIES)
    ]
    result.recall_rate = np.mean(per_query_recall)
    vec_benchmark_results.append(result)

# Run the sqlite-vec benchmark for every dimension and print the summary table.
for dim in DIMS:
    benchmark_sqlite_vec(dim)

vec_result_table = BruteForceResultTable(vec_benchmark_results)
console.print(vec_result_table)
6 changes: 6 additions & 0 deletions bindings/nodejs/packages/vectorlite/test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,10 @@ console.log(result);
result = db.prepare('select rowid from test where knn_search(vec, knn_param(?, 2)) and rowid in (1,2,3)')
.all([Buffer.from(Float32Array.from(Array.from({length: 10}, () => Math.random())).buffer)]);

console.log(result);

// a vector query with rowid filter
result = db.prepare('select rowid, vector_distance(vec, ?, \'l2\') from test where rowid in (0,1,2,3)')
.all([Buffer.from(Float32Array.from(Array.from({length: 10}, () => Math.random())).buffer)]);

console.log(result);
Loading
Loading