feat: add new kapiche_tokenizer_lower (quickwit-oss#194)
Sidhant29 authored Mar 15, 2024
1 parent 174ba07 commit 9686fab
Showing 2 changed files with 27 additions and 0 deletions.
18 changes: 18 additions & 0 deletions src/index.rs
@@ -301,6 +301,16 @@ pub(crate) struct Index {
// It combines a WhitespaceTokenizer with an OuterPunctuationFilter
// and a PossessiveContractionFilter.
fn get_kapiche_tokenizer() -> TextAnalyzer {
    TextAnalyzer::builder(WhitespaceTokenizer::default())
        .filter(OuterPunctuationFilter::new(vec!['#', '@']))
        .filter(PossessiveContractionFilter)
        .build()
}

// Creates a custom Tokenizer in line with the requirements of Kapiche.
// It combines a WhitespaceTokenizer with a LowerCaser, OuterPunctuationFilter,
// and a PossessiveContractionFilter.
fn get_kapiche_tokenizer_lower() -> TextAnalyzer {
    TextAnalyzer::builder(WhitespaceTokenizer::default())
        .filter(LowerCaser)
        .filter(OuterPunctuationFilter::new(vec!['#', '@']))
@@ -321,6 +331,10 @@ impl Index {
        index
            .tokenizers()
            .register("kapiche_tokenizer", kapiche_tokenizer);
        let kapiche_tokenizer_lower = get_kapiche_tokenizer_lower();
        index
            .tokenizers()
            .register("kapiche_tokenizer_lower", kapiche_tokenizer_lower);

        let reader = index.reader().map_err(to_pyerr)?;
        Ok(Index { index, reader })
@@ -353,6 +367,10 @@ impl Index {
        index
            .tokenizers()
            .register("kapiche_tokenizer", kapiche_tokenizer);
        let kapiche_tokenizer_lower = get_kapiche_tokenizer_lower();
        index
            .tokenizers()
            .register("kapiche_tokenizer_lower", kapiche_tokenizer_lower);

        let reader = index.reader().map_err(to_pyerr)?;
        Ok(Index { index, reader })
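Per its doc comment, the new analyzer differs from kapiche_tokenizer only by the added LowerCaser stage, so a token such as @Sherlock's is indexed as "sherlock" rather than "Sherlock". Below is a minimal sketch of how the two registered names are picked up from Python; the field names and sample text are illustrative, not part of this commit:

from tantivy import Document, Index, SchemaBuilder

# One field per analyzer, so the same text can be compared side by side.
schema = (
    SchemaBuilder()
    .add_text_field("body", tokenizer_name="kapiche_tokenizer")
    .add_text_field("body_lower", tokenizer_name="kapiche_tokenizer_lower")
    .build()
)

index = Index(schema)
writer = index.writer()
doc = Document()
doc.add_text("body", "@Sherlock's case")
doc.add_text("body_lower", "@Sherlock's case")
writer.add_document(doc)
writer.commit()
index.reload()

# "sherlock" matches only on the lower-cased field; on "body" the indexed
# term is still "Sherlock" because no LowerCaser runs in that pipeline.
result = index.searcher().search(index.parse_query("sherlock", ["body_lower"]))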
9 changes: 9 additions & 0 deletions tests/test_stat_collector.py
@@ -104,6 +104,7 @@ def test_stat_searcher_memory():
        SchemaBuilder()
        .add_text_field("title", stored=True)
        .add_text_field("body", tokenizer_name='kapiche_tokenizer')
        .add_text_field("body_lower", tokenizer_name='kapiche_tokenizer_lower')
        .add_unsigned_field("document_id__", stored=True, indexed=True, fast=True)
        .add_unsigned_field("frame_id__", stored=True, indexed=True, fast=True)
        .add_unsigned_field("sentence_id__", stored=True, indexed=True, fast=True)
@@ -120,6 +121,7 @@ def test_stat_searcher_memory():
        doc = Document()
        doc.add_text("title", f"Paragraph {i}")
        doc.add_text("body", paragraph)
        doc.add_text("body_lower", paragraph)
        doc.add_unsigned("document_id__", i)
        doc.add_unsigned("frame_id__", i)
        doc.add_unsigned("sentence_id__", i)
@@ -152,6 +154,13 @@ def test_stat_searcher_memory():

    assert total_mem_growth < 500_000

    result = index.stat_searcher().search(query)
    items = sorted(result.unique_docs_frames)
    assert len(items) == 439
    assert items[:4] == [(0, 0), (2, 2), (11, 11), (18, 18)]

    # Test kapiche_tokenizer_lower
    query = index.parse_query("Holmes", ["body_lower"])
    result = index.stat_searcher().search(query)
    items = sorted(result.unique_docs_frames)
    assert len(items) == 441
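The two assertions bracket the same corpus: lower-casing can merge distinct surface forms but never split them, and tantivy's query parser runs each field's analyzer over the query text, so "Holmes" is searched verbatim on body but as "holmes" on body_lower. A short sketch of that relationship, assuming the index built earlier in this test:

# 439 frames match on the case-sensitive field and 441 on the lower-cased
# one; the extra matches presumably come from occurrences whose original
# casing differs from "Holmes" (e.g. all-caps headings).
q_body = index.parse_query("Holmes", ["body"])
q_lower = index.parse_query("Holmes", ["body_lower"])
n_body = len(index.stat_searcher().search(q_body).unique_docs_frames)
n_lower = len(index.stat_searcher().search(q_lower).unique_docs_frames)
assert n_lower >= n_body  # lower-casing only ever merges terms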
