Skip to content

Commit

Permalink
sparse: disable refinement by default
Browse files Browse the repository at this point in the history
Because the forward index was removed, the current refinement search is
slower than before. To prevent performance degradation when using a
non-zero value of drop_ratio_search, disable refinement by default.

Signed-off-by: Shawn Wang <[email protected]>
  • Loading branch information
sparknack committed Jan 16, 2025
1 parent 45d757c commit 2b9ee37
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 5 deletions.
5 changes: 3 additions & 2 deletions src/index/sparse/sparse_index_node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class SparseInvertedIndexNode : public IndexNode {
auto nq = dataset->GetRows();
auto queries = static_cast<const sparse::SparseRow<T>*>(dataset->GetTensor());
auto k = cfg.k.value();
auto refine_factor = cfg.refine_factor.value_or(10);
auto refine_factor = cfg.refine_factor.value_or(1);
auto drop_ratio_search = cfg.drop_ratio_search.value_or(0.0f);

auto p_id = std::make_unique<sparse::label_t[]>(nq * k);
Expand Down Expand Up @@ -185,7 +185,8 @@ class SparseInvertedIndexNode : public IndexNode {
auto computer = computer_or.value();
auto drop_ratio_search = cfg.drop_ratio_search.value_or(0.0f);

const bool approximated = drop_ratio_search > 0;
// TODO: approximated is forced to false for now because refinement is too slow since the forward index was removed.
const bool approximated = false;

auto vec = std::vector<std::shared_ptr<IndexNode::iterator>>(nq, nullptr);
try {
Expand Down
14 changes: 11 additions & 3 deletions src/index/sparse/sparse_inverted_index_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,19 @@ class SparseInvertedIndexConfig : public BaseConfig {
.for_search()
.for_range_search()
.for_iterator();
/**
* refine_factor is used for approximate search.
* refine_factor == 1 (the default) means no refinement.
* refine_factor > 1 enables refinement: the larger the value, the more
* accurate the approximate result, but the slower the search.
* Note: since refinement is off by default, if you use a large
* drop_ratio_search you need to raise this value above 1 yourself.
*/
KNOWHERE_CONFIG_DECLARE_FIELD(refine_factor)
.description("refine factor")
.set_default(10)
.for_search()
.for_range_search();
.set_default(1)
.for_search();
/**
* The term frequency part of score of BM25 is:
* tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avgdl)))
Expand Down

0 comments on commit 2b9ee37

Please sign in to comment.