diff --git a/src/index.rs b/src/index.rs
index 843e23ac..4bbe974b 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -301,6 +301,16 @@ pub(crate) struct Index {
-// It combines a WhiteSpaceTokenizer with a StopWordFilter, OuterPunctuationFilter
+// It combines a WhitespaceTokenizer with an OuterPunctuationFilter
 // and a PossessiveContractionFilter.
 fn get_kapiche_tokenizer() -> TextAnalyzer {
+    TextAnalyzer::builder(WhitespaceTokenizer::default())
+        .filter(OuterPunctuationFilter::new(vec!['#', '@']))
+        .filter(PossessiveContractionFilter)
+        .build()
+}
+
+// Creates a custom Tokenizer in line with the requirements of Kapiche.
+// It combines a WhitespaceTokenizer with a StopWordFilter, LowerCaser, OuterPunctuationFilter,
+// and a PossessiveContractionFilter.
+fn get_kapiche_tokenizer_lower() -> TextAnalyzer {
     TextAnalyzer::builder(WhitespaceTokenizer::default())
         .filter(LowerCaser)
         .filter(OuterPunctuationFilter::new(vec!['#', '@']))
@@ -321,6 +331,10 @@ impl Index {
         index
             .tokenizers()
             .register("kapiche_tokenizer", kapiche_tokenizer);
+        let kapiche_tokenizer_lower = get_kapiche_tokenizer_lower();
+        index
+            .tokenizers()
+            .register("kapiche_tokenizer_lower", kapiche_tokenizer_lower);
         let reader = index.reader().map_err(to_pyerr)?;
 
         Ok(Index { index, reader })
@@ -353,6 +367,10 @@ impl Index {
         index
             .tokenizers()
             .register("kapiche_tokenizer", kapiche_tokenizer);
+        let kapiche_tokenizer_lower = get_kapiche_tokenizer_lower();
+        index
+            .tokenizers()
+            .register("kapiche_tokenizer_lower", kapiche_tokenizer_lower);
         let reader = index.reader().map_err(to_pyerr)?;
 
         Ok(Index { index, reader })
diff --git a/tests/test_stat_collector.py b/tests/test_stat_collector.py
index 004ab510..288423fb 100644
--- a/tests/test_stat_collector.py
+++ b/tests/test_stat_collector.py
@@ -104,6 +104,7 @@ def test_stat_searcher_memory():
         SchemaBuilder()
         .add_text_field("title", stored=True)
         .add_text_field("body", tokenizer_name='kapiche_tokenizer')
+        .add_text_field("body_lower", tokenizer_name='kapiche_tokenizer_lower')
         .add_unsigned_field("document_id__", stored=True, indexed=True, fast=True)
         .add_unsigned_field("frame_id__", stored=True, indexed=True, fast=True)
         .add_unsigned_field("sentence_id__", stored=True, indexed=True, fast=True)
@@ -120,6 +121,7 @@ def test_stat_searcher_memory():
         doc = Document()
         doc.add_text("title", f"Paragraph {i}")
         doc.add_text("body", paragraph)
+        doc.add_text("body_lower", paragraph)
         doc.add_unsigned("document_id__", i)
         doc.add_unsigned("frame_id__", i)
         doc.add_unsigned("sentence_id__", i)
@@ -152,6 +154,14 @@
 
     assert total_mem_growth < 500_000
 
+    # Test kapiche_tokenizer: matching on "body" is now case-sensitive
+    result = index.stat_searcher().search(query)
+    items = sorted(result.unique_docs_frames)
+    assert len(items) == 439
+    assert items[:4] == [(0, 0), (2, 2), (11, 11), (18, 18)]
+
+    # Test kapiche_tokenizer_lower
+    query = index.parse_query("Holmes", ["body_lower"])
     result = index.stat_searcher().search(query)
     items = sorted(result.unique_docs_frames)
     assert len(items) == 441
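
Note for reviewers: below is a minimal sketch of how the two analyzers differ from the Python side. It leans only on the tantivy-py-style API already visible in tests/test_stat_collector.py (SchemaBuilder, Document, Index, parse_query, writer); the build_index helper and the sample text are hypothetical, for illustration only.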
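
```python
from tantivy import Document, Index, SchemaBuilder

# Hypothetical helper, illustration only: indexes the same text twice, once
# through the case-sensitive analyzer and once through the lowercasing one.
def build_index() -> Index:
    schema = (
        SchemaBuilder()
        .add_text_field("body", tokenizer_name="kapiche_tokenizer")
        .add_text_field("body_lower", tokenizer_name="kapiche_tokenizer_lower")
        .build()
    )
    index = Index(schema)
    writer = index.writer()
    doc = Document()
    text = "Sherlock Holmes greeted holmes the dog."
    doc.add_text("body", text)
    doc.add_text("body_lower", text)
    writer.add_document(doc)
    writer.commit()
    index.reload()
    return index

index = build_index()
# On "body" neither the indexed tokens nor the query term are lowercased,
# so "Holmes" and "holmes" are distinct terms: only the capitalized
# occurrence matches.
case_sensitive = index.parse_query("Holmes", ["body"])
# On "body_lower" both sides pass through LowerCaser, so "Holmes" matches
# either casing.
case_insensitive = index.parse_query("Holmes", ["body_lower"])
```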
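
If that reading is right, the updated test numbers follow directly: the pre-existing "Holmes" query over body drops to 439 unique (document, frame) pairs once matching is case-sensitive, while the new body_lower field reproduces the old lowercased behaviour at 441. Registering both analyzers under separate names lets callers index the same text into parallel fields and choose case sensitivity per query.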