Skip to content

Commit

Permalink
Fix BufferBackend soundness issue and add `StringInterner::resolve_…
Browse files Browse the repository at this point in the history
…unchecked` (#68)

* fix BufferBackend::resolve unsoundness

Unfortunately this fix vastly regresses the performance of the method.
Benchmarks show -73% throughput which is massive ...

Still the BufferBackend is a viable choice for memory constrained environments.

* add StringInterner::resolve_unchecked method

We added this because it make a huge difference for the BufferBackend to have this available.
  • Loading branch information
Robbepop authored May 1, 2024
1 parent 52f1139 commit 3dbdfe1
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 10 deletions.
35 changes: 34 additions & 1 deletion benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ use criterion::{
};
use string_interner::backend::Backend;

criterion_group!(bench_resolve, bench_resolve_already_filled);
criterion_group!(
bench_resolve,
bench_resolve_already_filled,
bench_resolve_unchecked_already_filled
);
criterion_group!(bench_get, bench_get_already_filled);
criterion_group!(bench_iter, bench_iter_already_filled);
criterion_group!(
Expand Down Expand Up @@ -184,6 +188,35 @@ fn bench_resolve_already_filled(c: &mut Criterion) {
bench_for_backend::<BenchBuffer>(&mut g);
}

fn bench_resolve_unchecked_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("resolve_unchecked/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
fn bench_for_backend<BB: BackendBenchmark>(g: &mut BenchmarkGroup<WallTime>) {
g.bench_with_input(
BB::NAME,
&(BENCH_LEN_STRINGS, BENCH_STRING_LEN),
|bencher, &(len_words, word_len)| {
let words = generate_test_strings(len_words, word_len);
bencher.iter_batched_ref(
|| BB::setup_filled_with_ids(&words),
|(interner, word_ids)| {
for &word_id in &*word_ids {
black_box(
// SAFETY: We provide only valid symbols to the tested interners.
unsafe { interner.resolve_unchecked(word_id) },
);
}
},
BatchSize::SmallInput,
)
},
);
}
bench_for_backend::<BenchBucket>(&mut g);
bench_for_backend::<BenchString>(&mut g);
bench_for_backend::<BenchBuffer>(&mut g);
}

fn bench_get_already_filled(c: &mut Criterion) {
let mut g = c.benchmark_group("get/already-filled");
g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64));
Expand Down
18 changes: 10 additions & 8 deletions src/backend/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,12 @@ where
///
/// Returns the string from the given index if any as well
/// as the index of the next string in the buffer.
fn resolve_index_to_str(&self, index: usize) -> Option<(&str, usize)> {
fn resolve_index_to_str(&self, index: usize) -> Option<(&[u8], usize)> {
let bytes = self.buffer.get(index..)?;
let (str_len, str_len_bytes) = decode_var_usize(bytes)?;
let index_str = index + str_len_bytes;
let str_bytes = self.buffer.get(index_str..index_str + str_len)?;
// SAFETY: It is guaranteed by the backend that only valid strings
// are stored in this portion of the buffer.
let string = unsafe { str::from_utf8_unchecked(str_bytes) };
Some((string, index_str + str_len))
Some((str_bytes, index_str + str_len))
}

/// Resolves the string for the given symbol.
Expand Down Expand Up @@ -180,8 +177,10 @@ where

#[inline]
fn resolve(&self, symbol: Self::Symbol) -> Option<&str> {
self.resolve_index_to_str(symbol.to_usize())
.map(|(string, _next_str_index)| string)
match self.resolve_index_to_str(symbol.to_usize()) {
None => None,
Some((bytes, _)) => str::from_utf8(bytes).ok(),
}
}

fn shrink_to_fit(&mut self) {
Expand Down Expand Up @@ -481,7 +480,10 @@ where
fn next(&mut self) -> Option<Self::Item> {
self.backend
.resolve_index_to_str(self.next)
.and_then(|(string, next)| {
.and_then(|(bytes, next)| {
// SAFETY: Within the iterator all indices given to `resolv_index_to_str`
// are properly pointing to the start of each interned string.
let string = unsafe { str::from_utf8_unchecked(bytes) };
let symbol = S::try_from_usize(self.next)?;
self.next = next;
self.remaining -= 1;
Expand Down
13 changes: 12 additions & 1 deletion src/interner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,12 +264,23 @@ where
self.backend.shrink_to_fit()
}

/// Returns the string for the given symbol if any.
/// Returns the string for the given `symbol`` if any.
#[inline]
pub fn resolve(&self, symbol: <B as Backend>::Symbol) -> Option<&str> {
self.backend.resolve(symbol)
}

/// Returns the string for the given `symbol` without performing any checks.
///
/// # Safety
///
/// It is the caller's responsibility to provide this method with `symbol`s
/// that are valid for the [`StringInterner`].
#[inline]
pub unsafe fn resolve_unchecked(&self, symbol: <B as Backend>::Symbol) -> &str {
unsafe { self.backend.resolve_unchecked(symbol) }
}

/// Returns an iterator that yields all interned strings and their symbols.
#[inline]
pub fn iter(&self) -> <B as Backend>::Iter<'_> {
Expand Down
20 changes: 20 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,26 @@ macro_rules! gen_tests_for_backend {
assert_eq!(interner.resolve(dd), None);
}

#[test]
fn resolve_unchecked_works() {
let mut interner = StringInterner::new();
// Insert 3 unique strings:
let aa = interner.get_or_intern("aa");
let bb = interner.get_or_intern("bb");
let cc = interner.get_or_intern("cc");
assert_eq!(interner.len(), 3);
// Resolve valid symbols:
assert_eq!(unsafe { interner.resolve_unchecked(aa) }, "aa");
assert_eq!(unsafe { interner.resolve_unchecked(bb) }, "bb");
assert_eq!(unsafe { interner.resolve_unchecked(cc) }, "cc");
assert_eq!(interner.len(), 3);
// Resolve invalid symbols:
let dd = expect_valid_symbol(1000);
assert_ne!(aa, dd);
assert_ne!(bb, dd);
assert_ne!(cc, dd);
}

#[test]
fn get_works() {
let mut interner = StringInterner::new();
Expand Down

0 comments on commit 3dbdfe1

Please sign in to comment.