From bdcb7700107327260a1ac96368673873869b978e Mon Sep 17 00:00:00 2001 From: Yefu Wang <1yefuwang1@gmail.com> Date: Sat, 6 Jul 2024 16:53:45 +0800 Subject: [PATCH] Improve examples --- .github/workflows/ci.yml | 14 ++++++-- README.md | 40 +++++++++------------- examples/README.md | 5 ++- examples/hnsw_param.py | 35 +++++++++++++++++++ examples/{knn_search.py => index_serde.py} | 2 +- examples/metadata_filter.py | 40 ++++++++++++++++++++++ examples/quickstart.py | 31 ++++------------- 7 files changed, 112 insertions(+), 55 deletions(-) create mode 100644 examples/hnsw_param.py rename examples/{knn_search.py => index_serde.py} (98%) create mode 100644 examples/metadata_filter.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f87483b..dbcf58a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,11 +63,19 @@ jobs: name: vectorlite-wheel-${{ matrix.os }}-${{ steps.short_sha.outputs.sha }} path: ./wheelhouse/*.whl - - name: Run python example + - name: Run python examples shell: bash run: | - python -m pip install wheelhouse/*.whl - python examples/knn_search.py + python -m pip install -r examples/requirements.txt + + for wheel in wheelhouse/*.whl; do + echo "Running examples for $wheel" + python -m pip install $wheel --force-reinstall + for example in examples/*.py; do + echo "Running $example" + python $example + done + done upload_wheels: name: Upload wheels diff --git a/README.md b/README.md index 3b47e2e..e358259 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,13 @@ Below is a minimal example of using vectorlite. It can also be found in the exam import vectorlite_py import apsw import numpy as np +""" +Quick start of using vectorlite extension. 
+""" conn = apsw.Connection(':memory:') conn.enable_load_extension(True) # enable extension loading -conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite +conn.load_extension(vectorlite_py.vectorlite_path()) # load vectorlite cursor = conn.cursor() # check if vectorlite is loaded @@ -70,34 +73,13 @@ print(f'vector at rowid 1234: {result[0]}') result = cursor.execute('select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10))', [data[0].tobytes()]).fetchall() print(f'10 nearest neighbors of row 0 is {result}') -# Find 10 approximate nearest neighbors of data[0] for rowid from 1000 to 2000 using metadata(rowid) filtering. +# Find 10 approximate nearest neighbors of the first embedding in vectors with rowid within [1000, 2000) using metadata(rowid) filtering. rowids = ','.join([str(rowid) for rowid in range(1000, 2000)]) result = cursor.execute(f'select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10)) and rowid in ({rowids})', [data[0].tobytes()]).fetchall() - -# Insert the test data into the virtual table. Note that the rowid MUST be explicitly set when inserting vectors and cannot be auto-generated. -# The rowid is used to uniquely identify a vector and serve as a "foreign key" to relate to the vector's metadata. -# Vectorlite takes vectors in raw bytes, so a numpy vector need to be converted to bytes before inserting into the table. -cursor.executemany('insert into my_table(rowid, my_embedding) values (?, ?)', [(i, data[i].tobytes()) for i in range(NUM_ELEMENTS)]) - -# Query the virtual table to get the vector at rowid 12345. Note the vector needs to be converted back to json using vector_to_json() to be human-readable. -result = cursor.execute('select vector_to_json(my_embedding) from my_table where rowid = 1234').fetchone() -print(f'vector at rowid 1234: {result[0]}') - -# Find 10 approximate nearest neighbors of data[0] and there distances from data[0]. 
-# knn_search() is used to tell vectorlite to do a vector search. -# knn_param(V, K, ef) is used to pass the query vector V, the number of nearest neighbors K to find and an optional ef parameter to tune the performance of the search. -# If ef is not specified, ef defaults to 10. For more info on ef, please check https://github.com/nmslib/hnswlib/blob/v0.8.0/ALGO_PARAMS.md -result = cursor.execute('select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10))', [data[0].tobytes()]).fetchall() -print(f'10 nearest neighbors of row 0 is {result}') - -# Find 10 approximate nearest neighbors of data[0] for rowid from 1000 to 2000 using metadata(rowid) filtering. -rowids = ','.join([str(rowid) for rowid in range(1000, 2000)]) -result = cursor.execute(f'select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10)) and rowid in ({rowids})', [data[0].tobytes()]).fetchall() -print(f'10 nearest neighbors of row 0 from rowid 1000 to 2000 is {result}') +print(f'10 nearest neighbors of row 0 in vectors with rowid within [1000, 2000) is {result}') conn.close() - ``` More examples can be found in examples and integration_test folder. @@ -129,8 +111,17 @@ sh build_release.sh # for release build ```shell python3 -m build -w + ``` vectorlite_py wheel can be found in `dist` folder + +# Roadmap +- [ ] SIMD support for ARM platform +- [ ] Support user defined metadata/rowid filter +- [ ] Support Multi-vector document search and epsilon search +- [ ] Support multi-threaded search +- [ ] Release vectorlite to more package managers. + # Known limitations 1. On a single query, a knn_search vector constraint can only be paired with at most one rowid constraint and vice versa. For example, The following queries will fail: @@ -158,6 +149,7 @@ select rowid, distance from my_table where knn_search(my_embedding, knn_param(ve 6. Metadata filter(rowid filter) requires sqlite3 >= 3.38. 
Python's built-in `sqlite` module is usually built with old versions. Please use a newer sqlite binding such as `apsw` if you want to use metadata filter. knn_search() without rowid fitler still works for old sqlite3. 7. The vector index is held in memory. 8. Deleting a row only marks the vector as deleted and doesn't free the memory. The vector will not be included in later queries. However, if another vector is inserted with the same rowid, the memory will be reused. +9. A vectorlite table can only have one vector column. # Acknowledgement This project is greatly inspired by following projects diff --git a/examples/README.md b/examples/README.md index 72deb1e..90156ba 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,4 +1,3 @@ # Instructions -1. Please first run `sh build_release.sh` in project root folder. -2. Run `pip3 install -r requirements.txt`. Vectorlite's rowid filter feature requires sqlite3 version >= 3.38. Usually, python's built-in sqlite3 module is not new enough. Apsw provides binding to latest sqlite3 releases. -4. Run `python3 knn_search.py`. If you still want to use the built-in sqlite3 module, run `USE_BUILTIN_SQLITE3=1 python3 knn_search.py`. \ No newline at end of file +1. Run `pip3 install -r requirements.txt`. Vectorlite's metadata(rowid) filter feature requires sqlite3 version >= 3.38. Usually, python's built-in sqlite3 module is not new enough. Apsw provides binding to latest sqlite3 releases. +2. If you still want to use the built-in sqlite3 module, set the environment variable `USE_BUILTIN_SQLITE3=1`. \ No newline at end of file diff --git a/examples/hnsw_param.py b/examples/hnsw_param.py new file mode 100644 index 0000000..35eae76 --- /dev/null +++ b/examples/hnsw_param.py @@ -0,0 +1,35 @@ +import vectorlite_py +import apsw +import numpy as np +""" +This is an example of setting HNSW parameters in vectorlite. 
+""" + +conn = apsw.Connection(':memory:') +conn.enable_load_extension(True) # enable extension loading +conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite + +cursor = conn.cursor() + +DIM = 32 # dimension of the vectors +NUM_ELEMENTS = 1000 # number of vectors + +# Create virtual table with customized HNSW parameters. Please check https://github.com/nmslib/hnswlib/blob/v0.8.0/ALGO_PARAMS.md for more info. +cursor.execute(f'create virtual table vector_table using vectorlite(embedding float32[{DIM}], hnsw(max_elements={NUM_ELEMENTS}, ef_construction=100, M=32, random_seed=1000))') + +data = np.float32(np.random.random((NUM_ELEMENTS, DIM))) # Only float32 vectors are supported by vectorlite for now +embeddings = [(i, data[i].tobytes()) for i in range(NUM_ELEMENTS)] +cursor.executemany('insert into vector_table(rowid, embedding) values (?, ?)', embeddings) + +# Search for the 10 nearest neighbors of the first embedding with customized ef = 32. Increasing ef will increase the search accuracy but also increase the search time. +# knn_param(V, K, ef) is used to pass the query vector V, the number of nearest neighbors K to find and an optional ef parameter to tune the performance of the search. +# If ef is not specified, ef defaults to 10. +# Note: setting ef is an imperative operation. If it is not set in later queries, it will stay 32. +# If index serialization is enabled, ef is not serialized in the index file, so it will be lost when the connection is closed. +# When the table is reloaded from an index file, ef will be set to the default value 10. 
+vector_to_search = data[0].tobytes() +k = 10 +ef = 32 +result = cursor.execute(f'select rowid, distance from vector_table where knn_search(embedding, knn_param(?, ?, ?))', [vector_to_search, k, ef]).fetchall() +print(result) + diff --git a/examples/knn_search.py b/examples/index_serde.py similarity index 98% rename from examples/knn_search.py rename to examples/index_serde.py index 4ddc582..4e86467 100644 --- a/examples/knn_search.py +++ b/examples/index_serde.py @@ -5,7 +5,7 @@ import os import timeit """ -Example of using vectorlite extension to perform KNN search on a table of vectors. +This is an example of using vectorlite to search vectors and serialize/deserialize the index. """ use_apsw = os.environ.get('USE_BUILTIN_SQLITE3', '0') == '0' diff --git a/examples/metadata_filter.py b/examples/metadata_filter.py new file mode 100644 index 0000000..976af25 --- /dev/null +++ b/examples/metadata_filter.py @@ -0,0 +1,40 @@ +import vectorlite_py +import numpy as np +import apsw + +""" +A contrived example of using vectorlite to search vectors with metadata filter. +Metadata filter in vectorlite is achieved by filtering rowid. +Candidate rowids need to be generated first then passed to the `rowid in (..)` constraint. +The rowid constraint is pushed down to the HNSW index when traversing the HNSW graph, so it is efficient and accurate. + +""" + +conn = apsw.Connection(':memory:') +conn.enable_load_extension(True) # enable extension loading +conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite + +cursor = conn.cursor() + +DIM = 32 # dimension of the vectors +NUM_ELEMENTS = 1000 # number of vectors + +# In this example, we have a news table that stores articles. +cursor.execute(f'create table news(rowid integer primary key, article text)') +# For simplicity, the article is just a string of the rowid. 
+cursor.executemany('insert into news(rowid, article) values (?, ?)', [(i, str(i)) for i in range(NUM_ELEMENTS)]) + +# Create a virtual table to store the embeddings +cursor.execute(f'create virtual table vector_table using vectorlite(article_embedding float32[{DIM}], hnsw(max_elements={NUM_ELEMENTS}))') +# For simplicity, embeddings are randomly generated for each article. +# In a real application, you should replace this with your own embeddings. +data = np.float32(np.random.random((NUM_ELEMENTS, DIM))) # Only float32 vectors are supported by vectorlite for now +embeddings = [(i, data[i].tobytes()) for i in range(NUM_ELEMENTS)] +cursor.executemany('insert into vector_table(rowid, article_embedding) values (?, ?)', embeddings) + +# Now let's search for the 10 nearest neighbors of the first article in articles that start with "1" +result = cursor.execute(f'select rowid, distance from vector_table where knn_search(article_embedding, knn_param(?, 10)) and rowid in (select rowid from news where article like "1%")', [data[0].tobytes()]).fetchall() +print(result) + +# Please prefer using rowid in(...) instead of using `join`. The below query will first find 10 neighbors and then filter by rowid, which is not what we wanted. +# result = cursor.execute(f'select a.rowid, a.distance from vector_table a join news b on a.rowid = b.rowid where b.article like "1%" and knn_search(article_embedding, knn_param(?, 10)) ', [data[0].tobytes()]).fetchall() diff --git a/examples/quickstart.py b/examples/quickstart.py index 488412e..49ec222 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -1,10 +1,13 @@ import vectorlite_py import apsw import numpy as np +""" +Quick start of using vectorlite extension. 
+""" conn = apsw.Connection(':memory:') conn.enable_load_extension(True) # enable extension loading -conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite +conn.load_extension(vectorlite_py.vectorlite_path()) # load vectorlite cursor = conn.cursor() # check if vectorlite is loaded @@ -44,29 +47,9 @@ result = cursor.execute('select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10))', [data[0].tobytes()]).fetchall() print(f'10 nearest neighbors of row 0 is {result}') -# Find 10 approximate nearest neighbors of data[0] for rowid from 1000 to 2000 using metadata(rowid) filtering. +# Find 10 approximate nearest neighbors of the first embedding in vectors with rowid within [1000, 2000) using metadata(rowid) filtering. rowids = ','.join([str(rowid) for rowid in range(1000, 2000)]) result = cursor.execute(f'select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10)) and rowid in ({rowids})', [data[0].tobytes()]).fetchall() +print(f'10 nearest neighbors of row 0 in vectors with rowid within [1000, 2000) is {result}') -# Insert the test data into the virtual table. Note that the rowid MUST be explicitly set when inserting vectors and cannot be auto-generated. -# The rowid is used to uniquely identify a vector and serve as a "foreign key" to relate to the vector's metadata. -# Vectorlite takes vectors in raw bytes, so a numpy vector need to be converted to bytes before inserting into the table. -cursor.executemany('insert into my_table(rowid, my_embedding) values (?, ?)', [(i, data[i].tobytes()) for i in range(NUM_ELEMENTS)]) - -# Query the virtual table to get the vector at rowid 12345. Note the vector needs to be converted back to json using vector_to_json() to be human-readable. 
-result = cursor.execute('select vector_to_json(my_embedding) from my_table where rowid = 1234').fetchone() -print(f'vector at rowid 1234: {result[0]}') - -# Find 10 approximate nearest neighbors of data[0] and there distances from data[0]. -# knn_search() is used to tell vectorlite to do a vector search. -# knn_param(V, K, ef) is used to pass the query vector V, the number of nearest neighbors K to find and an optional ef parameter to tune the performance of the search. -# If ef is not specified, ef defaults to 10. For more info on ef, please check https://github.com/nmslib/hnswlib/blob/v0.8.0/ALGO_PARAMS.md -result = cursor.execute('select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10))', [data[0].tobytes()]).fetchall() -print(f'10 nearest neighbors of row 0 is {result}') - -# Find 10 approximate nearest neighbors of data[0] for rowid from 1000 to 2000 using metadata(rowid) filtering. -rowids = ','.join([str(rowid) for rowid in range(1000, 2000)]) -result = cursor.execute(f'select rowid, distance from my_table where knn_search(my_embedding, knn_param(?, 10)) and rowid in ({rowids})', [data[0].tobytes()]).fetchall() -print(f'10 nearest neighbors of row 0 from rowid 1000 to 2000 is {result}') - -conn.close() \ No newline at end of file +conn.close()