Add benchmark (#6)
1yefuwang1 authored Jul 8, 2024
1 parent f6175d0 commit 3e792f4
Showing 5 changed files with 324 additions and 3 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/ci.yml
@@ -66,17 +66,23 @@ jobs:
- name: Run python examples
shell: bash
run: |
python -m pip install -r examples/requirements.txt
for wheel in wheelhouse/*.whl; do
echo "Running examples for $wheel"
python -m pip install $wheel --force-reinstall
python -m pip install -r examples/requirements.txt
for example in examples/*.py; do
echo "Running $example"
python $example
done
done
- name: Run benchmark
shell: bash
run: |
python -m pip install -r benchmark/requirements.txt
python benchmark/benchmark.py
upload_wheels:
name: Upload wheels
if: ${{ github.event.inputs.upload_wheel != 'no' && github.event_name != 'pull_request' }}
57 changes: 55 additions & 2 deletions README.md
@@ -6,15 +6,67 @@ For other languages, vectorlite.[so|dll|dylib] can be extracted from the wheel f

Vectorlite is currently in beta. There could be breaking changes.
## Highlights
1. Fast ANN-search backed by hnswlib.
1. Fast ANN search backed by hnswlib. See the benchmark below.
2. Works on Windows, Linux and MacOS.
3. SIMD accelerated vector distance calculation for x86 platform, using `vector_distance()`
4. Supports all vector distance types provided by hnswlib: l2(squared l2), cosine, ip(inner product). For more info please check [hnswlib's doc](https://github.com/nmslib/hnswlib/tree/v0.8.0?tab=readme-ov-file#supported-distances).
4. Supports all vector distance types provided by hnswlib: l2 (squared l2), cosine, and ip (inner product; I don't recommend using it, though). For more info please check [hnswlib's doc](https://github.com/nmslib/hnswlib/tree/v0.8.0?tab=readme-ov-file#supported-distances).
5. Full control over HNSW parameters for performance tuning.
6. Metadata filter support (requires sqlite version >= 3.38).
7. Index serde support. A vectorlite table can be saved to a file and reloaded from it. Index files created by hnswlib can also be loaded by vectorlite.
8. Vector json serde support using `vector_from_json()` and `vector_to_json()`.
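
To make the highlights concrete, below is a minimal usage sketch distilled from `benchmark/benchmark.py` in this commit. The table name `demo` and the parameter values are made up for illustration; the extension is loaded through apsw exactly as the benchmark does.

```python
import apsw
import numpy as np
import vectorlite_py

conn = apsw.Connection(":memory:")
conn.enable_load_extension(True)  # sqlite extension loading must be enabled first
conn.load_extension(vectorlite_py.vectorlite_path())  # load vectorlite
cursor = conn.cursor()

# a table of 16-dimensional float32 vectors with l2 distance and an HNSW index
cursor.execute(
    "create virtual table demo using vectorlite("
    "embedding float32[16] l2, hnsw(max_elements=100, ef_construction=100, M=16))"
)

# vectors are bound as raw float32 bytes, keyed by rowid
vectors = np.float32(np.random.random((3, 16)))
cursor.executemany(
    "insert into demo(rowid, embedding) values (?, ?)",
    [(i, vectors[i].tobytes()) for i in range(len(vectors))],
)

# knn_search with knn_param(query_vector, k, ef_search)
rows = cursor.execute(
    "select rowid from demo where knn_search(embedding, knn_param(?, ?, ?))",
    (vectors[0].tobytes(), 2, 10),
).fetchall()
print(rows)
```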

## Benchmark
Vectorlite is fast. Compared with [sqlite-vss](https://github.com/asg017/sqlite-vss), vectorlite is 10x faster at insertion and 2x-10x faster at searching, with a much better recall rate.
The benchmark works as follows:
1. Insert 10000 randomly-generated vectors into a vectorlite table.
2. Randomly generate 100 vectors and then query the table with them.

The benchmark was run on my PC (i5-12600KF CPU, 16 GB RAM) under WSL.
The benchmark code can be found in the `benchmark` folder; it also serves as an example of how to tune HNSW parameters to improve `recall_rate` for your scenario.
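
For reference, `recall_rate` in the tables below means the fraction of the exact k nearest neighbours that the approximate query returns, averaged over all queries. A condensed, self-contained sketch of that calculation (using hnswlib directly as a stand-in for the vectorlite query, with made-up sizes) looks like this:

```python
import numpy as np
import hnswlib

dim, n, n_queries, k = 64, 1000, 10, 10
data = np.float32(np.random.random((n, dim)))
queries = np.float32(np.random.random((n_queries, dim)))

# ground truth: exact k nearest neighbours from a brute-force index
bf = hnswlib.BFIndex(space="l2", dim=dim)
bf.init_index(max_elements=n)
bf.add_items(data)
true_labels, _ = bf.knn_query(queries, k=k)

# approximate answers from an HNSW index
ann = hnswlib.Index(space="l2", dim=dim)
ann.init_index(max_elements=n, ef_construction=16, M=200)
ann.add_items(data)
ann.set_ef(50)  # ef_search: larger values trade query time for recall
ann_labels, _ = ann.knn_query(queries, k=k)

# recall_rate: average fraction of exact neighbours found by the ANN search
recall_rate = np.mean(
    [np.intersect1d(ann_labels[i], true_labels[i]).size / k for i in range(n_queries)]
)
print(f"recall_rate: {recall_rate:.2%}")
```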

```
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ distance_type ┃ vector dimension ┃ ef_construction ┃ M ┃ ef_search ┃ insert_time(per vector) ┃ search_time(per query) ┃ recall_rate ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ l2 │ 256 │ 16 │ 200 │ 10 │ 310.31 us │ 81.46 us │ 52.40% │
│ l2 │ 256 │ 16 │ 200 │ 50 │ 310.31 us │ 217.17 us │ 88.40% │
│ l2 │ 256 │ 16 │ 200 │ 100 │ 310.31 us │ 314.81 us │ 97.40% │
│ l2 │ 256 │ 32 │ 200 │ 10 │ 327.48 us │ 89.16 us │ 52.40% │
│ l2 │ 256 │ 32 │ 200 │ 50 │ 327.48 us │ 213.95 us │ 88.40% │
│ l2 │ 256 │ 32 │ 200 │ 100 │ 327.48 us │ 349.63 us │ 97.40% │
│ l2 │ 1024 │ 16 │ 200 │ 10 │ 1460.37 us │ 445.21 us │ 42.70% │
│ l2 │ 1024 │ 16 │ 200 │ 50 │ 1460.37 us │ 1362.84 us │ 81.90% │
│ l2 │ 1024 │ 16 │ 200 │ 100 │ 1460.37 us │ 1989.38 us │ 92.90% │
│ l2 │ 1024 │ 32 │ 200 │ 10 │ 1436.74 us │ 415.00 us │ 42.70% │
│ l2 │ 1024 │ 32 │ 200 │ 50 │ 1436.74 us │ 1282.99 us │ 81.90% │
│ l2 │ 1024 │ 32 │ 200 │ 100 │ 1436.74 us │ 1904.94 us │ 92.90% │
│ cosine │ 256 │ 16 │ 200 │ 10 │ 268.53 us │ 63.51 us │ 52.40% │
│ cosine │ 256 │ 16 │ 200 │ 50 │ 268.53 us │ 163.83 us │ 89.40% │
│ cosine │ 256 │ 16 │ 200 │ 100 │ 268.53 us │ 264.20 us │ 96.40% │
│ cosine │ 256 │ 32 │ 200 │ 10 │ 286.86 us │ 64.63 us │ 52.40% │
│ cosine │ 256 │ 32 │ 200 │ 50 │ 286.86 us │ 192.57 us │ 89.40% │
│ cosine │ 256 │ 32 │ 200 │ 100 │ 286.86 us │ 338.05 us │ 96.40% │
│ cosine │ 1024 │ 16 │ 200 │ 10 │ 1235.72 us │ 411.42 us │ 47.30% │
│ cosine │ 1024 │ 16 │ 200 │ 50 │ 1235.72 us │ 1113.31 us │ 85.20% │
│ cosine │ 1024 │ 16 │ 200 │ 100 │ 1235.72 us │ 1652.70 us │ 95.60% │
│ cosine │ 1024 │ 32 │ 200 │ 10 │ 1152.72 us │ 378.64 us │ 47.30% │
│ cosine │ 1024 │ 32 │ 200 │ 50 │ 1152.72 us │ 1142.82 us │ 85.20% │
│ cosine │ 1024 │ 32 │ 200 │ 100 │ 1152.72 us │ 1634.47 us │ 95.60% │
└───────────────┴──────────────────┴─────────────────┴─────┴───────────┴─────────────────────────┴────────────────────────┴─────────────┘
```
The results of the same benchmark for [sqlite-vss](https://github.com/asg017/sqlite-vss) are below:
```
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ vector dimension ┃ insert_time(per vector) ┃ search_time(per query) ┃ recall_rate ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ 256 │ 3694.37 us │ 755.17 us │ 55.40% │
│ 1024 │ 18598.29 us │ 3848.64 us │ 48.60% │
└──────────────────┴─────────────────────────┴────────────────────────┴─────────────┘
```
I believe the performance difference is mainly caused by the underlying vector search library.
Sqlite-vss uses [faiss](https://github.com/facebookresearch/faiss), which is optimized for batched scenarios.
Vectorlite uses [hnswlib](https://github.com/nmslib/hnswlib), which is optimized for online vector search.
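
As an illustration of that batched-vs-online distinction (a toy sketch, not code from either library's internals): an application backed by SQLite typically issues one query at a time, while batch-oriented libraries are at their best when handed many queries in a single call.

```python
import numpy as np
import hnswlib

dim, n, k = 64, 1000, 10
data = np.float32(np.random.random((n, dim)))
queries = np.float32(np.random.random((100, dim)))

index = hnswlib.Index(space="l2", dim=dim)
index.init_index(max_elements=n)
index.add_items(data)

# online pattern: one query per call, like an application issuing
# individual knn_search statements against a vectorlite table
online_labels = [index.knn_query(q, k=k)[0] for q in queries]

# batched pattern: all queries in one call, the shape of workload
# that batch-oriented libraries such as faiss are tuned for
batched_labels, _ = index.knn_query(queries, k=k)
```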

# Quick Start
The quickest way to get started is to install vectorlite using python.
```shell
@@ -121,6 +121,7 @@ vectorlite_py wheel can be found in `dist` folder
- [ ] Support Multi-vector document search and epsilon search
- [ ] Support multi-threaded search
- [ ] Release vectorlite to more package managers.
- [ ] Support more vector types, e.g. float16, int8.

# Known limitations
1. On a single query, a knn_search vector constraint can only be paired with at most one rowid constraint and vice versa.
256 changes: 256 additions & 0 deletions benchmark/benchmark.py
@@ -0,0 +1,256 @@
import time
from typing import Literal
import numpy as np
import vectorlite_py
import apsw
import dataclasses
import hnswlib
from rich.console import Console, ConsoleOptions, RenderResult
from rich.table import Table
import os


"""
Benchmark vectorlite's performance and recall rate using the method described in https://github.com/nmslib/hnswlib/blob/v0.8.0/TESTING_RECALL.md
"""




# Roll our own timeit function to measure time in us and get the return value of func.
# Why Python's built-in timeit.timeit is not used:
# 1. it adds unnecessary overhead, because it compiles the code passed to it
# 2. func's return value cannot be obtained directly
def timeit(func):
    start_us = time.perf_counter_ns() / 1000
    retval = func()
    end_us = time.perf_counter_ns() / 1000
    return end_us - start_us, retval
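

# Example usage (illustrative only, not part of the benchmark):
#   elapsed_us, value = timeit(lambda: sum(range(1000)))
# elapsed_us is the wall-clock time in microseconds; value is the lambda's return value.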


conn = apsw.Connection(":memory:")
conn.enable_load_extension(True) # enable extension loading
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite

cursor = conn.cursor()

NUM_ELEMENTS = 10000 # number of vectors
NUM_QUERIES = 100 # number of queries

DIMS = [256, 1024]
data = {dim: np.float32(np.random.random((NUM_ELEMENTS, dim))) for dim in DIMS}

query_data = {dim: np.float32(np.random.random((NUM_QUERIES, dim))) for dim in DIMS}

# search for k nearest neighbors in this benchmark
k = 10

# (ef_construction, M)
hnsw_params = [(16, 200), (32, 200)]

# ef_search
efs = [10, 50, 100]


# 'ip'(inner product) is not tested as it is not an actual metric that measures the distance between two vectors
distance_types = ["l2", "cosine"]

# Calculate correct results using a brute-force (exact) index
correct_labels = {}
for distance_type in distance_types:
    correct_labels[distance_type] = {}
    for dim in DIMS:
        bf_index = hnswlib.BFIndex(space=distance_type, dim=dim)
        bf_index.init_index(max_elements=NUM_ELEMENTS)
        bf_index.add_items(data[dim])

        labels, distances = bf_index.knn_query(query_data[dim], k=k)
        assert len(labels) == NUM_QUERIES and len(labels[0]) == k
        correct_labels[distance_type][dim] = labels
        del bf_index


@dataclasses.dataclass
class BenchmarkResult:
    distance_type: Literal["l2", "cosine"]
    dim: int
    ef_construction: int
    M: int
    ef_search: int
    insert_time_us: float  # in microseconds, per vector
    search_time_us: float  # in microseconds, per query
    recall_rate: float  # fraction of the exact nearest neighbours found


@dataclasses.dataclass
class ResultTable:
    results: list[BenchmarkResult]

    def __rich_console__(
        self, console: Console, options: ConsoleOptions
    ) -> RenderResult:
        table = Table()
        table.add_column("distance_type")
        table.add_column("vector dimension")
        table.add_column("ef_construction")
        table.add_column("M")
        table.add_column("ef_search")
        table.add_column("insert_time(per vector)")
        table.add_column("search_time(per query)")
        table.add_column("recall_rate")
        for result in self.results:
            table.add_row(
                result.distance_type,
                str(result.dim),
                str(result.ef_construction),
                str(result.M),
                str(result.ef_search),
                f"{result.insert_time_us:.2f} us",
                f"{result.search_time_us:.2f} us",
                f"{result.recall_rate * 100:.2f}%",
            )
        yield table


benchmark_results = []


def benchmark(distance_type, dim, ef_construction, M):
    result = BenchmarkResult(distance_type, dim, ef_construction, M, 0, 0, 0, 0)
    table_name = f"table_{distance_type}_{dim}_{ef_construction}_{M}"
    cursor.execute(
        f"create virtual table {table_name} using vectorlite(embedding float32[{dim}] {distance_type}, hnsw(max_elements={NUM_ELEMENTS}, ef_construction={ef_construction}, M={M}))"
    )

    # measure insert time
    insert_time_us, _ = timeit(
        lambda: cursor.executemany(
            f"insert into {table_name}(rowid, embedding) values (?, ?)",
            [(i, data[dim][i].tobytes()) for i in range(NUM_ELEMENTS)],
        )
    )
    result.insert_time_us = insert_time_us / NUM_ELEMENTS

    for ef in efs:

        def search():
            # collect the rowids returned for each query
            rows = []
            for i in range(NUM_QUERIES):
                rows.append(
                    cursor.execute(
                        f"select rowid from {table_name} where knn_search(embedding, knn_param(?, ?, ?))",
                        (query_data[dim][i].tobytes(), k, ef),
                    ).fetchall()
                )
            return rows

        search_time_us, results = timeit(search)
        recall_rate = np.mean(
            [
                np.intersect1d(results[i], correct_labels[distance_type][dim][i]).size
                / k
                for i in range(NUM_QUERIES)
            ]
        )
        result = dataclasses.replace(
            result,
            ef_search=ef,
            search_time_us=search_time_us / NUM_QUERIES,
            recall_rate=recall_rate,
        )
        benchmark_results.append(result)


for distance_type in distance_types:
    for dim in DIMS:
        for ef_construction, M in hnsw_params:
            benchmark(distance_type, dim, ef_construction, M)

console = Console()

result_table = ResultTable(benchmark_results)
console.print(result_table)


import platform

benchmark_vss = os.environ.get('BENCHMARK_VSS', '0') != '0'
if benchmark_vss and platform.system().lower() == "linux":
    # Note: sqlite_vss is not self-contained.
    # Its dependencies need to be installed manually: sudo apt-get install -y libgomp1 libatlas-base-dev liblapack-dev
    console.print("Benchmark sqlite_vss as comparison.")
    import sqlite_vss
    sqlite_vss.load(conn)

    @dataclasses.dataclass
    class VssBenchmarkResult:
        dim: int
        insert_time_us: float  # in microseconds, per vector
        search_time_us: float  # in microseconds, per query
        recall_rate: float  # fraction of the exact nearest neighbours found

    @dataclasses.dataclass
    class VssResultTable:
        results: list[VssBenchmarkResult]

        def __rich_console__(
            self, console: Console, options: ConsoleOptions
        ) -> RenderResult:
            table = Table()
            table.add_column("vector dimension")
            table.add_column("insert_time(per vector)")
            table.add_column("search_time(per query)")
            table.add_column("recall_rate")
            for result in self.results:
                table.add_row(
                    str(result.dim),
                    f"{result.insert_time_us:.2f} us",
                    f"{result.search_time_us:.2f} us",
                    f"{result.recall_rate * 100:.2f}%",
                )
            yield table

    vss_benchmark_results = []

    def benchmark_sqlite_vss(dim: int):
        benchmark_result = VssBenchmarkResult(dim, 0, 0, 0)
        table_name = f"table_vss_{dim}"
        cursor.execute(
            f"create virtual table {table_name} using vss0(embedding({dim}))"
        )

        # measure insert time
        insert_time_us, _ = timeit(
            lambda: cursor.executemany(
                f"insert into {table_name}(rowid, embedding) values (?, ?)",
                [(i, data[dim][i].tobytes()) for i in range(NUM_ELEMENTS)],
            )
        )
        benchmark_result.insert_time_us = insert_time_us / NUM_ELEMENTS

        def search():
            # collect the rowids returned for each query
            rows = []
            for i in range(NUM_QUERIES):
                rows.append(
                    cursor.execute(
                        f"select rowid from {table_name} where vss_search(embedding, ?) limit {k}",
                        (query_data[dim][i].tobytes(),),
                    ).fetchall()
                )
            return rows

        search_time_us, results = timeit(search)
        benchmark_result.search_time_us = search_time_us / NUM_QUERIES
        recall_rate = np.mean(
            [
                np.intersect1d(results[i], correct_labels["cosine"][dim][i]).size
                / k
                for i in range(NUM_QUERIES)
            ]
        )
        benchmark_result.recall_rate = recall_rate
        vss_benchmark_results.append(benchmark_result)

    for dim in DIMS:
        benchmark_sqlite_vss(dim)

    vss_result_table = VssResultTable(vss_benchmark_results)
    console.print(vss_result_table)
5 changes: 5 additions & 0 deletions benchmark/requirements.txt
@@ -0,0 +1,5 @@
vectorlite_py
numpy>=1.22
apsw>=3.45
rich>=13.7
hnswlib>=0.8
1 change: 1 addition & 0 deletions examples/requirements.txt
@@ -1,2 +1,3 @@
vectorlite_py
numpy>=1.22
apsw>=3.45
