-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
74c0096
commit f885ee7
Showing
7 changed files
with
112 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,3 @@ | ||
# Instructions | ||
1. Please first run `sh build_release.sh` in project root folder. | ||
2. Run `pip3 install -r requirements.txt`. Vectorlite's rowid filter feature requires sqlite3 version >= 3.38. Usually, python's built-in sqlite3 module is not new enough. Apsw provides binding to latest sqlite3 releases. | ||
4. Run `python3 knn_search.py`. If you still want to use the built-in sqlite3 module, run `USE_BUILTIN_SQLITE3=1 python3 knn_search.py`. | ||
1. Run `pip3 install -r requirements.txt`. Vectorlite's metadata (rowid) filter feature requires sqlite3 version >= 3.38. Usually, Python's built-in sqlite3 module is not new enough. APSW provides bindings to the latest sqlite3 releases. | ||
2. If you still want to use the built-in sqlite3 module, set the environment variable `USE_BUILTIN_SQLITE3=1`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import vectorlite_py | ||
import apsw | ||
import numpy as np | ||
""" | ||
This is an example of setting HNSW parameters in vectorlite. | ||
""" | ||
|
||
conn = apsw.Connection(':memory:') | ||
conn.enable_load_extension(True) # enable extension loading | ||
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite | ||
|
||
cursor = conn.cursor() | ||
|
||
DIM = 32 # dimension of the vectors | ||
NUM_ELEMENTS = 1000 # number of vectors | ||
|
||
# Create virtual table with customized HNSW parameters. Please check https://github.com/nmslib/hnswlib/blob/v0.8.0/ALGO_PARAMS.md for more info. | ||
cursor.execute(f'create virtual table vector_table using vectorlite(embedding float32[{DIM}], hnsw(max_elements={NUM_ELEMENTS}, ef_construction=100, M=32, random_seed=1000))') | ||
|
||
data = np.float32(np.random.random((NUM_ELEMENTS, DIM))) # Only float32 vectors are supported by vectorlite for now | ||
embeddings = [(i, data[i].tobytes()) for i in range(NUM_ELEMENTS)] | ||
cursor.executemany('insert into vector_table(rowid, embedding) values (?, ?)', embeddings) | ||
|
||
# Search for the 10 nearest neighbors of the first embedding with customized ef = 32. Increasing ef will increase the search accuracy but also increase the search time. | ||
# knn_param(V, K, ef) is used to pass the query vector V, the number of nearest neighbors K to find and an optional ef parameter to tune the performance of the search. | ||
# If ef is not specified, ef defaults to 10. | ||
# Note: setting ef is an imperative operation. If it is not set in later queries, it will stay 32. | ||
# If index serialization is enabled, ef is not serialized in the index file, so it will be lost when the connection is closed. | ||
# When the table is reloaded from an index file, ef will be set to the default value 10. | ||
vector_to_search = data[0].tobytes() | ||
k = 10 | ||
ef = 32 | ||
result = cursor.execute(f'select rowid, distance from vector_table where knn_search(embedding, knn_param(?, ?, ?))', [vector_to_search, k, ef]).fetchall() | ||
print(result) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import vectorlite_py | ||
import numpy as np | ||
import apsw | ||
|
||
""" | ||
A contrived example of using vectorlite to search vectors with metadata filter. | ||
Metadata filter in vectorlite is achieved by filtering rowid. | ||
Candidate rowids need to be generated first then passed to the `rowid in (..)` constraint. | ||
The rowid constraint is pushed down to the HNSW index when traversing the HNSW graph, so it is efficient and accurate. | ||
""" | ||
|
||
conn = apsw.Connection(':memory:') | ||
conn.enable_load_extension(True) # enable extension loading | ||
conn.load_extension(vectorlite_py.vectorlite_path()) # loads vectorlite | ||
|
||
cursor = conn.cursor() | ||
|
||
DIM = 32 # dimension of the vectors | ||
NUM_ELEMENTS = 1000 # number of vectors | ||
|
||
# In this example, we have a news table that stores articles. | ||
cursor.execute(f'create table news(rowid integer primary key, article text)') | ||
# For simplicity, the article is just a string of the rowid. | ||
cursor.executemany('insert into news(rowid, article) values (?, ?)', [(i, str(i)) for i in range(NUM_ELEMENTS)]) | ||
|
||
# Create a virtual table to store the embeddings | ||
cursor.execute(f'create virtual table vector_table using vectorlite(article_embedding float32[{DIM}], hnsw(max_elements={NUM_ELEMENTS}))') | ||
# For simplicity, embeddings are randomly generated for each article. | ||
# In a real application, you should replace this with your own embeddings. | ||
data = np.float32(np.random.random((NUM_ELEMENTS, DIM))) # Only float32 vectors are supported by vectorlite for now | ||
embeddings = [(i, data[i].tobytes()) for i in range(NUM_ELEMENTS)] | ||
cursor.executemany('insert into vector_table(rowid, article_embedding) values (?, ?)', embeddings) | ||
|
||
# Now let's search for the 10 nearest neighbors of the first article in articles that start with "1" | ||
result = cursor.execute(f'select rowid, distance from vector_table where knn_search(article_embedding, knn_param(?, 10)) and rowid in (select rowid from news where article like "1%")', [data[0].tobytes()]).fetchall() | ||
print(result) | ||
|
||
# Please prefer using rowid in(...) instead of using `join`. The below query will first find 10 neighbors and then filter by rowid, which is not what we wanted. | ||
# result = cursor.execute(f'select a.rowid, a.distance from vector_table a join news b on a.rowid = b.rowid where b.article like "1%" and knn_search(article_embedding, knn_param(?, 10)) ', [data[0].tobytes()]).fetchall() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters