diff --git a/be/src/clucene b/be/src/clucene index a1b2a2aefcecad..835c1ab0a39a0e 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit a1b2a2aefcecad157448de49a88abf01d18d9ccf +Subproject commit 835c1ab0a39a0e4594f7acf4fcca31a614debe8e diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index fced65724e5d05..5fd1223e0fa101 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -224,9 +224,10 @@ Status InvertedIndexReader::handle_searcher_cache( // TODO: handle null bitmap procedure in new format. InvertedIndexQueryCacheHandle null_bitmap_cache_handle; static_cast(read_null_bitmap(io_ctx, stats, &null_bitmap_cache_handle, dir.get())); - RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type())); - auto* cache_value = new InvertedIndexSearcherCache::CacheValue( - std::move(searcher), mem_tracker->consumption(), UnixMillis()); + size_t reader_size = 0; + RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, type(), reader_size)); + auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher), + reader_size, UnixMillis()); InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, inverted_index_cache_handle); return Status::OK(); @@ -235,9 +236,8 @@ Status InvertedIndexReader::handle_searcher_cache( Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, IndexSearcherPtr* searcher, - MemTracker* mem_tracker, - InvertedIndexReaderType reader_type) { - SCOPED_CONSUME_MEM_TRACKER(mem_tracker); + InvertedIndexReaderType reader_type, + size_t& reader_size) { auto index_searcher_builder = DORIS_TRY(IndexSearcherBuilder::create_index_searcher_builder(reader_type)); @@ -245,12 +245,11 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, *searcher = searcher_result; // When the meta information has been read, the ioContext needs to be reset to prevent it from being used by other queries. - auto stream = static_cast(dir)->getDorisIndexInput(); + auto* stream = static_cast(dir)->getDorisIndexInput(); stream->setIoContext(nullptr); stream->setIndexFile(false); - // NOTE: before mem_tracker hook becomes active, we caculate reader memory size by hand. - mem_tracker->consume(index_searcher_builder->get_reader_size()); + reader_size = index_searcher_builder->get_reader_size(); return Status::OK(); }; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index bbd148fae5250d..768e9533ca97db 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -220,8 +220,7 @@ class InvertedIndexReader : public std::enable_shared_from_thisopen()) { LOG(INFO) << "bkd index file " << directory->toString() << " is empty"; } + reader_size = bkd_reader->ram_bytes_used(); output_searcher = bkd_reader; _CLDECDELETE(directory) return Status::OK(); diff --git a/be/test/olap/rowset/segment_v2/inverted_index_searcher_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_searcher_test.cpp new file mode 100644 index 00000000000000..955d46c4b0a4ab --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index_searcher_test.cpp @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index_searcher.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wshadow-field" +#endif + +#include + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +#include +#include +#include + +#include +#include + +#include "common/status.h" +#include "io/fs/local_file_system.h" + +namespace doris::segment_v2 { +class InvertedIndexSearcherBuilderFlowTest : public testing::Test { +public: + const std::string kTestDir = "./ut_dir/inverted_index_searcher_flow_test"; + + void SetUp() override { + _fs = io::global_local_filesystem(); + auto st = _fs->delete_directory(kTestDir); + st = _fs->create_directory(kTestDir); + ASSERT_TRUE(st.ok()) << st; + } + + void TearDown() override { + auto st = _fs->delete_directory(kTestDir); + ASSERT_TRUE(st.ok()) << st; + } + std::string generateRandomString(int length) { + static const char alphanum[] = + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + std::string randomString; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, sizeof(alphanum) - 2); + for (int i = 0; i < length; ++i) { + randomString += alphanum[dis(gen)]; + } + return randomString; + } + + const int32_t MAX_FIELD_LEN = 0x7FFFFFFFL; + const int32_t MAX_BUFFER_DOCS = 100000000; + const int32_t MERGE_FACTOR = 100000000; + +protected: + io::FileSystemSPtr _fs; +}; + +TEST_F(InvertedIndexSearcherBuilderFlowTest, test_bkd_builder_build_success) { + const int N = 1024 * 1024; + lucene::store::Directory* tmp_dir = + lucene::store::FSDirectory::getDirectory("./ut_dir/TestBKD"); + std::unique_ptr dir = + std::unique_ptr(_CL_POINTER(tmp_dir)); + std::shared_ptr w = + std::make_shared(N, 1, 1, 4, 512, 100.0, N, true); + w->docs_seen_ = N; + + for (int docID = 0; docID < N; docID++) { + std::vector scratch(4); + + if (docID > 500000) { + lucene::util::NumericUtils::intToSortableBytes(200, scratch, 0); + + } else { + lucene::util::NumericUtils::intToSortableBytes(100, scratch, 0); + } + w->add(scratch.data(), scratch.size(), docID); + } + + int64_t indexFP; + { + std::unique_ptr out(dir->createOutput("bkd")); + std::unique_ptr meta_out(dir->createOutput("bkd_meta")); + std::unique_ptr index_out(dir->createOutput("bkd_index")); + + try { + indexFP = w->finish(out.get(), index_out.get()); + w->meta_finish(meta_out.get(), indexFP, 0); + } catch (...) { + ASSERT_TRUE(false) << "BKDIndexSearcherBuilder build error"; + } + } + + BKDIndexSearcherBuilder builder; + OptionalIndexSearcherPtr output_searcher; + + auto st = builder.build(dir.get(), output_searcher); + EXPECT_TRUE(st.ok()) << "BKDIndexSearcherBuilder build error: " << st.msg(); + EXPECT_TRUE(output_searcher.has_value()); + EXPECT_GT(builder.get_reader_size(), 0); + std::cout << "test_bkd_builder size = " << builder.get_reader_size() << std::endl; +} + +TEST_F(InvertedIndexSearcherBuilderFlowTest, test_fulltext_builder) { + auto* tmp_dir = new lucene::store::RAMDirectory(); + std::unique_ptr dir = + std::unique_ptr(_CL_POINTER(tmp_dir)); + + lucene::analysis::SimpleAnalyzer sanalyzer; + lucene::index::IndexWriter w(dir.get(), &sanalyzer, true); + w.setUseCompoundFile(false); + w.setMaxBufferedDocs(MAX_BUFFER_DOCS); + w.setRAMBufferSizeMB(256); + w.setMaxFieldLength(MAX_FIELD_LEN); + w.setMergeFactor(MERGE_FACTOR); + lucene::document::Document doc; + std::wstring field_name = L"fulltext"; + auto* field = _CLNEW lucene::document::Field( + field_name.c_str(), + int(lucene::document::Field::INDEX_TOKENIZED) | int(lucene::document::Field::STORE_NO)); + doc.add(*field); + + for (int i = 0; i <= 2000; i++) { + std::string value1 = "value1"; + if (i > 0) { + value1 = generateRandomString(2000); + } + auto* stringReader = _CLNEW lucene::util::SStringReader( + value1.c_str(), strlen(value1.c_str()), false); + auto* stream = sanalyzer.reusableTokenStream(field_name.c_str(), stringReader); + + field->setValue(stream); + w.addDocument(&doc); + _CLDELETE(stringReader); + } + doc.clear(); + w.close(); + FulltextIndexSearcherBuilder builder; + OptionalIndexSearcherPtr output_searcher; + auto st = builder.build(dir.get(), output_searcher); + EXPECT_TRUE(st.ok()) << st.to_string(); + + ASSERT_TRUE(output_searcher.has_value()); + auto searcher_variant = *output_searcher; + EXPECT_TRUE(std::holds_alternative(searcher_variant)); + auto searcher_ptr = std::get(searcher_variant); + EXPECT_NE(searcher_ptr, nullptr); + EXPECT_GT(builder.get_reader_size(), 0); + std::cout << "test_fulltext_builder size = " << builder.get_reader_size() << std::endl; +} + +TEST_F(InvertedIndexSearcherBuilderFlowTest, test_keyword_builder) { + auto* tmp_dir = new lucene::store::RAMDirectory(); + std::unique_ptr dir = + std::unique_ptr(_CL_POINTER(tmp_dir)); + + lucene::analysis::SimpleAnalyzer sanalyzer; + lucene::index::IndexWriter w(dir.get(), &sanalyzer, true); + w.setUseCompoundFile(false); + w.setMaxBufferedDocs(MAX_BUFFER_DOCS); + w.setRAMBufferSizeMB(256); + w.setMaxFieldLength(MAX_FIELD_LEN); + w.setMergeFactor(MERGE_FACTOR); + lucene::document::Document doc; + std::wstring field_name = L"keyword"; + auto* field = _CLNEW lucene::document::Field(field_name.c_str(), + int(lucene::document::Field::INDEX_UNTOKENIZED) | + int(lucene::document::Field::STORE_NO)); + doc.add(*field); + + for (int i = 0; i <= 2000; i++) { + std::string value1 = "value1"; + if (i > 0) { + value1 = generateRandomString(2000); + } + field->setValue((char*)value1.c_str(), value1.size()); + w.addDocument(&doc); + } + doc.clear(); + w.close(); + FulltextIndexSearcherBuilder builder; + OptionalIndexSearcherPtr output_searcher; + auto st = builder.build(dir.get(), output_searcher); + EXPECT_TRUE(st.ok()) << st.to_string(); + + ASSERT_TRUE(output_searcher.has_value()); + auto searcher_variant = *output_searcher; + EXPECT_TRUE(std::holds_alternative(searcher_variant)); + auto searcher_ptr = std::get(searcher_variant); + EXPECT_NE(searcher_ptr, nullptr); + EXPECT_GT(builder.get_reader_size(), 0); + std::cout << "test_keyword_builder size = " << builder.get_reader_size() << std::endl; +} +} // namespace doris::segment_v2 \ No newline at end of file