diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index c7919b3f8dca24..1758166e76edee 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -2766,7 +2766,7 @@ void Tablet::check_table_size_correctness() { const std::vector& all_rs_metas = _tablet_meta->all_rs_metas(); for (const auto& rs_meta : all_rs_metas) { int64_t total_segment_size = get_segment_file_size(rs_meta); - int64_t total_inverted_index_size = get_inverted_index_file_szie(rs_meta); + int64_t total_inverted_index_size = get_inverted_index_file_size(rs_meta); if (rs_meta->data_disk_size() != total_segment_size || rs_meta->index_disk_size() != total_inverted_index_size || rs_meta->data_disk_size() + rs_meta->index_disk_size() != rs_meta->total_disk_size()) { @@ -2817,7 +2817,7 @@ int64_t Tablet::get_segment_file_size(const RowsetMetaSharedPtr& rs_meta) { return total_segment_size; } -int64_t Tablet::get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta) { +int64_t Tablet::get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta) { const auto& fs = rs_meta->fs(); if (!fs) { LOG(WARNING) << "get fs failed, resource_id={}" << rs_meta->resource_id(); diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index d00476f044191c..afe043bf15195b 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -214,6 +214,7 @@ class Tablet final : public BaseTablet { std::mutex& get_push_lock() { return _ingest_lock; } std::mutex& get_base_compaction_lock() { return _base_compaction_lock; } std::mutex& get_cumulative_compaction_lock() { return _cumulative_compaction_lock; } + std::shared_mutex& get_meta_store_lock() { return _meta_store_lock; } std::shared_timed_mutex& get_migration_lock() { return _migration_lock; } @@ -531,7 +532,7 @@ class Tablet final : public BaseTablet { void check_table_size_correctness(); std::string get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t seg_id); int64_t get_segment_file_size(const RowsetMetaSharedPtr& rs_meta); - int64_t get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta); + int64_t get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta); public: static const int64_t K_INVALID_CUMULATIVE_POINT = -1; @@ -588,7 +589,7 @@ class Tablet final : public BaseTablet { std::shared_ptr _cumulative_compaction_policy; std::string_view _cumulative_compaction_type; - // use a seperate thread to check all tablets paths existance + // use a separate thread to check all tablets paths existence std::atomic _is_tablet_path_exists; int64_t _last_missed_version; diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index b492a929fca3bf..c5b27c823054a4 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -765,49 +765,68 @@ Status SnapshotLoader::move(const std::string& snapshot_path, TabletSharedPtr ta return Status::InternalError(err_msg); } - if (overwrite) { - std::vector snapshot_files; - RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, &snapshot_files)); - - // 1. simply delete the old dir and replace it with the snapshot dir - try { - // This remove seems soft enough, because we already get - // tablet id and schema hash from this path, which - // means this path is a valid path. - std::filesystem::remove_all(tablet_path); - VLOG_CRITICAL << "remove dir: " << tablet_path; - std::filesystem::create_directory(tablet_path); - VLOG_CRITICAL << "re-create dir: " << tablet_path; - } catch (const std::filesystem::filesystem_error& e) { - std::stringstream ss; - ss << "failed to move tablet path: " << tablet_path << ". err: " << e.what(); - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } + if (!overwrite) { + throw Exception(Status::FatalError("only support overwrite now")); + } - // link files one by one - // files in snapshot dir will be moved in snapshot clean process - std::vector linked_files; - for (auto& file : snapshot_files) { - auto full_src_path = fmt::format("{}/{}", snapshot_path, file); - auto full_dest_path = fmt::format("{}/{}", tablet_path, file); - if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) { - LOG(WARNING) << "failed to link file from " << full_src_path << " to " - << full_dest_path << ", err: " << std::strerror(errno); - - // clean the already linked files - for (auto& linked_file : linked_files) { - remove(linked_file.c_str()); - } + // Medium migration/clone/checkpoint/compaction may change or check the + // files and tablet meta, so we need to take these locks. + std::unique_lock migration_lock(tablet->get_migration_lock(), std::try_to_lock); + std::unique_lock base_compact_lock(tablet->get_base_compaction_lock(), std::try_to_lock); + std::unique_lock cumu_compact_lock(tablet->get_cumulative_compaction_lock(), std::try_to_lock); + std::unique_lock cold_compact_lock(tablet->get_cold_compaction_lock(), std::try_to_lock); + std::unique_lock build_idx_lock(tablet->get_build_inverted_index_lock(), std::try_to_lock); + std::unique_lock meta_store_lock(tablet->get_meta_store_lock(), std::try_to_lock); + if (!migration_lock.owns_lock() || !base_compact_lock.owns_lock() || + !cumu_compact_lock.owns_lock() || !cold_compact_lock.owns_lock() || + !build_idx_lock.owns_lock() || !meta_store_lock.owns_lock()) { + // This error should be retryable + auto status = Status::ObtainLockFailed("failed to get tablet locks, tablet: {}", tablet_id); + LOG(WARNING) << status << ", snapshot path: " << snapshot_path + << ", tablet path: " << tablet_path; + return status; + } - return Status::InternalError("move tablet failed"); + std::vector snapshot_files; + RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, &snapshot_files)); + + // FIXME: the below logic will demage the tablet files if failed in the middle. + + // 1. simply delete the old dir and replace it with the snapshot dir + try { + // This remove seems soft enough, because we already get + // tablet id and schema hash from this path, which + // means this path is a valid path. + std::filesystem::remove_all(tablet_path); + VLOG_CRITICAL << "remove dir: " << tablet_path; + std::filesystem::create_directory(tablet_path); + VLOG_CRITICAL << "re-create dir: " << tablet_path; + } catch (const std::filesystem::filesystem_error& e) { + std::stringstream ss; + ss << "failed to move tablet path: " << tablet_path << ". err: " << e.what(); + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + // link files one by one + // files in snapshot dir will be moved in snapshot clean process + std::vector linked_files; + for (auto& file : snapshot_files) { + auto full_src_path = fmt::format("{}/{}", snapshot_path, file); + auto full_dest_path = fmt::format("{}/{}", tablet_path, file); + if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) { + LOG(WARNING) << "failed to link file from " << full_src_path << " to " << full_dest_path + << ", err: " << std::strerror(errno); + + // clean the already linked files + for (auto& linked_file : linked_files) { + remove(linked_file.c_str()); } - linked_files.push_back(full_dest_path); - VLOG_CRITICAL << "link file from " << full_src_path << " to " << full_dest_path; - } - } else { - throw Exception(Status::FatalError("only support overwrite now")); + return Status::InternalError("move tablet failed"); + } + linked_files.push_back(full_dest_path); + VLOG_CRITICAL << "link file from " << full_src_path << " to " << full_dest_path; } // snapshot loader not need to change tablet uid