Skip to content

Commit

Permalink
Impact value ranges
Browse files Browse the repository at this point in the history
  • Loading branch information
bpiwowar committed Jul 29, 2024
1 parent 82172d8 commit 2527568
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 11 deletions.
8 changes: 7 additions & 1 deletion src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use super::{
};
use crate::{
base::{BoxResult, DocId, ImpactValue, TermIndex},
index::TermIndexInformation,
index::{SparseIndexInformation, TermIndexInformation},
};
use crate::{
base::{Len, TermImpact},
Expand Down Expand Up @@ -593,6 +593,12 @@ impl SparseIndex for SparseBuilderIndex {
}
}

impl SparseIndexInformation for SparseBuilderIndex {
    /// Returns the `(lower, upper)` impact bounds reported for term `term_ix`.
    ///
    /// The lower bound is the constant `0.`; the upper bound is the
    /// `max_value` stored for the term in `self.terms`. The index into
    /// `self.terms` is not guarded here.
    ///
    /// NOTE(review): a fixed lower bound of `0.` presumes impacts are never
    /// negative — confirm against the builder's invariants.
    fn value_range(&self, term_ix: TermIndex) -> (ImpactValue, ImpactValue) {
        let upper = self.terms[term_ix].max_value;
        (0., upper)
    }
}

impl Len for SparseBuilderIndex {
fn len(&self) -> usize {
return self.terms.len();
Expand Down
14 changes: 9 additions & 5 deletions src/compress/impact.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,20 @@ pub struct GlobalQuantizerFactory {

impl ImpactCompressorFactory for GlobalQuantizerFactory {
/// Builds a `Quantizer` whose bounds cover the impact values of `index`.
///
/// The global bounds are obtained by folding the per-term
/// `value_range` of every term in `0..index.len()`; the resulting
/// `(min, max)` pair is logged and passed, with `self.nbits`, to
/// `Quantizer::new`.
///
/// For an index with no terms the bounds stay at their initial values
/// (`+INFINITY` for the minimum, `-INFINITY` for the maximum).
fn create(&self, index: &dyn crate::index::SparseIndexView) -> Box<dyn ImpactCompressor> {
    // Single start-up message (the shorter duplicate carried no extra
    // information and was logged right before this one).
    log::info!(
        "Computing global minimum and maximum impact (quantizer) over {} terms",
        index.len()
    );
    let mut min = ImpactValue::INFINITY;
    let mut max = -ImpactValue::INFINITY;

    // Combine the per-term ranges instead of walking every posting.
    // NOTE(review): this relies on `value_range` enclosing every posting
    // value of the term; under that contract the former per-posting scan
    // could not change `min` or `max`.
    for term_ix in 0..index.len() {
        let (term_min, term_max) = index.value_range(term_ix);
        min = min.min(term_min);
        max = max.max(term_max);
    }
    log::info!("Quantizer bounds: {}-{}", min, max);
    Box::new(Quantizer::new(self.nbits, min, max))
}

Expand Down
7 changes: 7 additions & 0 deletions src/compress/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use super::{
};
use crate::{
base::{save_index, DocId, ImpactValue, IndexLoader, Len, TermImpact, TermIndex},
index::SparseIndexInformation,
utils::buffer::{Buffer, MemoryBuffer, MmapBuffer, Slice},
};
use log::{debug, info};
Expand Down Expand Up @@ -452,6 +453,12 @@ impl SparseIndex for CompressedIndex {
}
}

impl SparseIndexInformation for CompressedIndex {
    /// Returns the `(lower, upper)` impact bounds reported for term `term_ix`.
    ///
    /// The lower bound is the constant `0.`; the upper bound is the
    /// `max_value` stored for the term in `self.information.terms`. The
    /// index into that collection is not guarded here.
    ///
    /// NOTE(review): a fixed lower bound of `0.` presumes impacts are never
    /// negative — confirm against the index format.
    fn value_range(&self, term_ix: TermIndex) -> (ImpactValue, ImpactValue) {
        let term_information = &self.information.terms[term_ix];
        (0., term_information.max_value)
    }
}

impl Len for CompressedIndex {
fn len(&self) -> usize {
self.information.terms.len()
Expand Down
7 changes: 6 additions & 1 deletion src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,13 @@ impl IndexInformation {
}
}

/// Per-term summary information about a sparse index.
///
/// Implementors must also implement `Len`.
pub trait SparseIndexInformation: Len {
/// Returns the impact value range for term `term_ix` as a
/// `(minimum, maximum)` pair.
///
/// NOTE(review): whether the bounds are tight or merely enclosing is not
/// specified here — confirm per implementation before relying on it.
fn value_range(&self, term_ix: TermIndex) -> (ImpactValue, ImpactValue);
}

/// A very simple
pub trait SparseIndexView: Send + Sync + Len {
pub trait SparseIndexView: Send + Sync + SparseIndexInformation {
/// Basic iterator
fn iterator<'a>(&'a self, term_ix: TermIndex) -> Box<dyn Iterator<Item = TermImpact> + 'a>;
}
Expand Down
43 changes: 39 additions & 4 deletions src/transforms/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use std::sync::Mutex;
use crate::base::{
load_index, save_index, DocId, ImpactValue, IndexLoader, Len, TermImpact, TermIndex,
};
use crate::index::SparseIndexView;
use crate::index::{SparseIndexInformation, SparseIndexView};
use crate::{
index::{BlockTermImpactIterator, SparseIndex},
transforms::IndexTransform,
Expand Down Expand Up @@ -293,6 +293,15 @@ impl Len for SplitIndex {
}
}

impl SparseIndexInformation for SplitIndex {
    /// Returns the impact range of source term `term_ix`.
    ///
    /// The lower bound is taken from the first split of the term in the
    /// inner index and the upper bound from its last split.
    ///
    /// NOTE(review): assumes split `k` of term `t` is stored at inner index
    /// `t * self.splits + k`, which is what the upper-bound index
    /// `(t + 1) * self.splits - 1` implies — confirm against the code that
    /// builds the split index.
    fn value_range(&self, term_ix: TermIndex) -> (ImpactValue, ImpactValue) {
        // First split of this term. The previous `term_ix * splits - 1`
        // pointed at the last split of the *previous* term and underflowed
        // for `term_ix == 0`.
        let first_split = term_ix * self.splits;
        // Last split of this term, i.e. `(term_ix + 1) * splits - 1`.
        let last_split = first_split + self.splits - 1;
        (
            self.inner.value_range(first_split).0,
            self.inner.value_range(last_split).1,
        )
    }
}

/// View on the index
struct SplitIndexView<'a> {
/// Inner index that contains the postings
Expand Down Expand Up @@ -342,11 +351,10 @@ impl<'a> Iterator for SplitIndexViewIterator<'a> {
}
}

impl<'a> SparseIndexView for SplitIndexView<'a> {
fn iterator<'b>(&'b self, term_ix: TermIndex) -> Box<dyn Iterator<Item = TermImpact> + 'b> {
impl<'a> SplitIndexView<'a> {
fn compute_threshold(&self, term_ix: TermIndex) {
// Source term and quantile indices
let source_term_ix = term_ix / (self.quantiles.len() + 1);
let quantile_ix = term_ix % (self.quantiles.len() + 1);

let thresholds = &mut self.thresholds.lock().unwrap();
let term_thresholds = &mut thresholds[source_term_ix];
Expand All @@ -371,8 +379,19 @@ impl<'a> SparseIndexView for SplitIndexView<'a> {
}
term_thresholds.push(ImpactValue::INFINITY);
}
}
}

impl<'a> SparseIndexView for SplitIndexView<'a> {
fn iterator<'b>(&'b self, term_ix: TermIndex) -> Box<dyn Iterator<Item = TermImpact> + 'b> {
// Source term and quantile indices
self.compute_threshold(term_ix);
let source_term_ix = term_ix / (self.quantiles.len() + 1);
let quantile_ix = term_ix % (self.quantiles.len() + 1);

// Returns the iterator
let thresholds = &mut self.thresholds.lock().unwrap();
let term_thresholds = &mut thresholds[source_term_ix];
Box::new(SplitIndexViewIterator {
iterator: self.source.iterator(source_term_ix),
min: term_thresholds[quantile_ix],
Expand All @@ -386,3 +405,19 @@ impl<'a> Len for SplitIndexView<'a> {
self.source.len() * (self.quantiles.len() + 1)
}
}

impl<'a> SparseIndexInformation for SplitIndexView<'a> {
    /// Returns the pair of consecutive thresholds that delimit split
    /// `term_ix`.
    ///
    /// `term_ix` is decomposed into a source term index and a quantile
    /// index using `self.quantiles.len() + 1` splits per source term; the
    /// result is `(thresholds[q], thresholds[q + 1])` for that source term.
    ///
    /// Panics if the `thresholds` mutex is poisoned.
    fn value_range(&self, term_ix: TermIndex) -> (ImpactValue, ImpactValue) {
        // Run the threshold computation for this term before reading the
        // shared table (its lock is released when the call returns).
        self.compute_threshold(term_ix);

        let splits_per_term = self.quantiles.len() + 1;
        let source_term_ix = term_ix / splits_per_term;
        let quantile_ix = term_ix % splits_per_term;

        let table = self.thresholds.lock().unwrap();
        let bounds = &table[source_term_ix];
        (bounds[quantile_ix], bounds[quantile_ix + 1])
    }
}

0 comments on commit 2527568

Please sign in to comment.