diff --git a/be/src/cloud/cloud_rowset_writer.cpp b/be/src/cloud/cloud_rowset_writer.cpp index 47cc3865c1bf65..81645c791e29c9 100644 --- a/be/src/cloud/cloud_rowset_writer.cpp +++ b/be/src/cloud/cloud_rowset_writer.cpp @@ -54,6 +54,8 @@ Status CloudRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) _rowset_meta->set_rowset_id(_context.rowset_id); _rowset_meta->set_partition_id(_context.partition_id); _rowset_meta->set_tablet_id(_context.tablet_id); + _rowset_meta->set_db_id(_context.db_id); + _rowset_meta->set_table_id(_context.table_id); _rowset_meta->set_index_id(_context.index_id); _rowset_meta->set_tablet_schema_hash(_context.tablet_schema_hash); _rowset_meta->set_rowset_type(_context.rowset_type); diff --git a/be/src/cloud/cloud_tablets_channel.cpp b/be/src/cloud/cloud_tablets_channel.cpp index 590677a38d7de0..35ba9d964f4201 100644 --- a/be/src/cloud/cloud_tablets_channel.cpp +++ b/be/src/cloud/cloud_tablets_channel.cpp @@ -25,6 +25,7 @@ #include "cloud/config.h" #include "load/channel/tablets_channel.h" #include "load/delta_writer/delta_writer.h" +#include "storage/tablet_info.h" namespace doris { @@ -42,6 +43,9 @@ std::unique_ptr CloudTabletsChannel::create_delta_writer( Status CloudTabletsChannel::add_batch(const PTabletWriterAddBlockRequest& request, PTabletWriterAddBlockResult* response) { + if (_schema != nullptr && _schema->row_binlog_index_schema() != nullptr) { + return Status::NotSupported("cloud mode does not support binlog now"); + } // FIXME(plat1ko): Too many duplicate code with `TabletsChannel` SCOPED_TIMER(_add_batch_timer); int64_t cur_seq = 0; diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index 724dde723f6c82..108d61fa18f962 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -118,11 +118,17 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in) out->set_job_id(in.job_id()); } if (in.has_commit_tso()) { - out->set_commit_tso(in.commit_tso()); + 
out->mutable_commit_tso()->CopyFrom(in.commit_tso()); } if (in.has_is_row_binlog()) { out->set_is_row_binlog(in.is_row_binlog()); } + if (in.has_db_id()) { + out->set_db_id(in.db_id()); + } + if (in.has_table_id()) { + out->set_table_id(in.table_id()); + } } void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { @@ -208,11 +214,17 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { out->set_job_id(in.job_id()); } if (in.has_commit_tso()) { - out->set_commit_tso(in.commit_tso()); + out->mutable_commit_tso()->CopyFrom(in.commit_tso()); } if (in.has_is_row_binlog()) { out->set_is_row_binlog(in.is_row_binlog()); } + if (in.has_db_id()) { + out->set_db_id(in.db_id()); + } + if (in.has_table_id()) { + out->set_table_id(in.table_id()); + } } RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) { @@ -308,11 +320,17 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) out->set_job_id(in.job_id()); } if (in.has_commit_tso()) { - out->set_commit_tso(in.commit_tso()); + out->mutable_commit_tso()->CopyFrom(in.commit_tso()); } if (in.has_is_row_binlog()) { out->set_is_row_binlog(in.is_row_binlog()); } + if (in.has_db_id()) { + out->set_db_id(in.db_id()); + } + if (in.has_table_id()) { + out->set_table_id(in.table_id()); + } } void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { @@ -397,11 +415,17 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { out->set_job_id(in.job_id()); } if (in.has_commit_tso()) { - out->set_commit_tso(in.commit_tso()); + out->mutable_commit_tso()->CopyFrom(in.commit_tso()); } if (in.has_is_row_binlog()) { out->set_is_row_binlog(in.is_row_binlog()); } + if (in.has_db_id()) { + out->set_db_id(in.db_id()); + } + if (in.has_table_id()) { + out->set_table_id(in.table_id()); + } } TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB& in) { diff --git a/be/src/exec/operator/olap_scan_operator.cpp 
b/be/src/exec/operator/olap_scan_operator.cpp index c1113ba9a3e519..619d89dd769b05 100644 --- a/be/src/exec/operator/olap_scan_operator.cpp +++ b/be/src/exec/operator/olap_scan_operator.cpp @@ -45,6 +45,7 @@ #include "service/backend_options.h" #include "storage/index/ann/ann_topn_runtime.h" #include "storage/storage_engine.h" +#include "storage/tablet/tablet.h" #include "storage/tablet/tablet_manager.h" #include "util/to_string.h" @@ -515,6 +516,8 @@ Status OlapScanLocalState::_init_scanners(std::list* scanners) { } bool enable_parallel_scan = state()->enable_parallel_scan(); + bool read_row_binlog = + p._olap_scan_node.__isset.read_row_binlog && p._olap_scan_node.read_row_binlog; // The flag of preagg's meaning is whether return pre agg data(or partial agg data) // PreAgg ON: The storage layer returns partially aggregated data without additional processing. (Fast data reading) @@ -524,7 +527,9 @@ Status OlapScanLocalState::_init_scanners(std::list* scanners) { // PreAgg OFF: The storage layer must complete pre-aggregation and return fully aggregated data. (Slow data reading) if (enable_parallel_scan && !p._should_run_serial && p._push_down_agg_type == TPushAggOp::NONE && - (_storage_no_merge() || p._olap_scan_node.is_preaggregation)) { + (_storage_no_merge() || p._olap_scan_node.is_preaggregation) + // binlog need to be read in order + && !read_row_binlog) { // Filter out the "full scan" placeholder range (has_lower_bound == false) // so that only ranges with real key bounds are forwarded to the parallel scanner. 
std::vector key_ranges; @@ -624,6 +629,7 @@ Status OlapScanLocalState::_init_scanners(std::list* scanners) { _read_sources[scan_range_idx], p._limit, p._olap_scan_node.is_preaggregation, + read_row_binlog, }); RETURN_IF_ERROR(scanner->init(state(), _conjuncts)); scanners->push_back(std::move(scanner)); @@ -793,6 +799,8 @@ Status OlapScanLocalState::prepare(RuntimeState* state) { {0, _tablets[i].version}, {.skip_missing_versions = _state->skip_missing_version(), .enable_fetch_rowsets_from_peers = config::enable_fetch_rowsets_from_peer_replicas, + .capture_row_binlog = olap_scan_node().__isset.read_row_binlog && + olap_scan_node().read_row_binlog, .enable_prefer_cached_rowset = config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false, .query_freshness_tolerance_ms = @@ -866,7 +874,10 @@ void OlapScanLocalState::set_scan_ranges(RuntimeState* state, const std::vector& scan_ranges) { const auto& cache_param = _parent->cast()._cache_param; bool hit_cache = false; - if (!cache_param.digest.empty() && !cache_param.force_refresh_query_cache) { + // read binlog scan should not participate in query cache. 
+ if (olap_scan_node().__isset.read_row_binlog && olap_scan_node().read_row_binlog) { + hit_cache = false; + } else if (!cache_param.digest.empty() && !cache_param.force_refresh_query_cache) { std::string cache_key; int64_t version = 0; auto status = QueryCache::build_cache_key(scan_ranges, cache_param, &cache_key, &version); diff --git a/be/src/exec/pipeline/pipeline_fragment_context.cpp b/be/src/exec/pipeline/pipeline_fragment_context.cpp index 0750dcbb33a74c..da8bf48b59d9dc 100644 --- a/be/src/exec/pipeline/pipeline_fragment_context.cpp +++ b/be/src/exec/pipeline/pipeline_fragment_context.cpp @@ -1095,7 +1095,7 @@ Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataS int child_node_id = pipeline->operators().back()->node_id(); if (state->query_options().enable_memtable_on_sink_node && !_has_inverted_index_v1_or_partial_update(thrift_sink.olap_table_sink) && - !config::is_cloud_mode()) { + !_has_row_binlog(thrift_sink.olap_table_sink) && !config::is_cloud_mode()) { _sink = std::make_shared( pool, next_sink_operator_id(), child_node_id + 1, row_desc, output_exprs); } else { diff --git a/be/src/exec/scan/olap_scanner.cpp b/be/src/exec/scan/olap_scanner.cpp index a7949535e37755..86fac6e3fc7f76 100644 --- a/be/src/exec/scan/olap_scanner.cpp +++ b/be/src/exec/scan/olap_scanner.cpp @@ -71,6 +71,8 @@ OlapScanner::OlapScanner(ScanLocalStateBase* parent, OlapScanner::Params&& param _key_ranges(std::move(params.key_ranges)), _tablet_reader_params({.tablet = std::move(params.tablet), .tablet_schema {}, + .reader_type = params.read_row_binlog ? 
ReaderType::READER_BINLOG + : ReaderType::READER_QUERY, .aggregation = params.aggregation, .version = {0, params.version}, .start_key {}, @@ -168,10 +170,13 @@ Status OlapScanner::_prepare_impl() { _tablet_reader->set_preferred_block_size_bytes(_state->preferred_block_size_bytes()); { TOlapScanNode& olap_scan_node = local_state->olap_scan_node(); + TabletSchemaSPtr source_tablet_schema = + _tablet_reader_params.reader_type == ReaderType::READER_BINLOG + ? tablet->row_binlog_tablet_schema() + : tablet->tablet_schema(); - // Each scanner builds its own TabletSchema to avoid concurrent modification. tablet_schema = std::make_shared(); - tablet_schema->copy_from(*tablet->tablet_schema()); + tablet_schema->copy_from(*source_tablet_schema); if (olap_scan_node.__isset.columns_desc && !olap_scan_node.columns_desc.empty() && olap_scan_node.columns_desc[0].col_unique_id >= 0) { tablet_schema->clear_columns(); @@ -204,6 +209,8 @@ Status OlapScanner::_prepare_impl() { .skip_missing_versions = _state->skip_missing_version(), .enable_fetch_rowsets_from_peers = config::enable_fetch_rowsets_from_peer_replicas, + .capture_row_binlog = + _tablet_reader_params.reader_type == ReaderType::READER_BINLOG, .enable_prefer_cached_rowset = config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false, @@ -215,7 +222,6 @@ Status OlapScanner::_prepare_impl() { LOG(WARNING) << "fail to init reader. 
res=" << maybe_read_source.error(); return maybe_read_source.error(); } - read_source = std::move(maybe_read_source.value()); if (config::enable_mow_verbose_log && tablet->enable_unique_key_merge_on_write()) { @@ -246,7 +252,7 @@ Status OlapScanner::_prepare_impl() { _tablet_reader_params.collection_statistics = std::make_shared(); io::IOContext io_ctx { - .reader_type = ReaderType::READER_QUERY, + .reader_type = _tablet_reader_params.reader_type, .expiration_time = tablet->ttl_seconds(), .query_id = &_state->query_id(), .file_cache_stats = &_tablet_reader->mutable_stats()->file_cache_stats, @@ -306,7 +312,6 @@ Status OlapScanner::_init_tablet_reader_params( RETURN_IF_ERROR(_init_variant_columns()); RETURN_IF_ERROR(_init_return_columns()); - _tablet_reader_params.reader_type = ReaderType::READER_QUERY; _tablet_reader_params.push_down_agg_type_opt = _local_state->get_push_down_agg_type(); // TODO: If a new runtime filter arrives after `_conjuncts` move to `_common_expr_ctxs_push_down`, diff --git a/be/src/exec/scan/olap_scanner.h b/be/src/exec/scan/olap_scanner.h index 0458a4a07470e4..7ed31140977842 100644 --- a/be/src/exec/scan/olap_scanner.h +++ b/be/src/exec/scan/olap_scanner.h @@ -65,6 +65,7 @@ class OlapScanner : public Scanner { TabletReadSource read_source; int64_t limit; bool aggregation; + bool read_row_binlog = false; }; OlapScanner(ScanLocalStateBase* parent, Params&& params); diff --git a/be/src/exec/scan/parallel_scanner_builder.cpp b/be/src/exec/scan/parallel_scanner_builder.cpp index 1984b85fe65c64..80cda08531f8e6 100644 --- a/be/src/exec/scan/parallel_scanner_builder.cpp +++ b/be/src/exec/scan/parallel_scanner_builder.cpp @@ -254,8 +254,10 @@ Status ParallelScannerBuilder::_load() { std::shared_ptr ParallelScannerBuilder::_build_scanner( BaseTabletSPtr tablet, int64_t version, const std::vector& key_ranges, TabletReadSource&& read_source) { - OlapScanner::Params params {_state, _scanner_profile.get(), key_ranges, std::move(tablet), - version, 
std::move(read_source), _limit, _is_preaggregation}; + OlapScanner::Params params { + _state, _scanner_profile.get(), key_ranges, std::move(tablet), + version, std::move(read_source), _limit, _is_preaggregation, + }; return OlapScanner::create_shared(_parent, std::move(params)); } diff --git a/be/src/exec/sink/autoinc_buffer.cpp b/be/src/exec/sink/autoinc_buffer.cpp index 717e517bc6745b..645d2b73c34379 100644 --- a/be/src/exec/sink/autoinc_buffer.cpp +++ b/be/src/exec/sink/autoinc_buffer.cpp @@ -152,9 +152,11 @@ Status AutoIncIDBuffer::sync_request_ids(size_t request_length, } } CHECK_EQ(request_length, 0); +#ifndef BE_TEST if (!_is_fetching && _current_volume < _low_water_level_mark()) { RETURN_IF_ERROR(_launch_async_fetch_task(_prefetch_size())); } +#endif return Status::OK(); } diff --git a/be/src/exec/sink/autoinc_buffer.h b/be/src/exec/sink/autoinc_buffer.h index d46d71bef75929..b6f51a1d72fe43 100644 --- a/be/src/exec/sink/autoinc_buffer.h +++ b/be/src/exec/sink/autoinc_buffer.h @@ -76,6 +76,14 @@ class AutoIncIDBuffer { } }; +#ifdef BE_TEST + void append_range_for_test(int64_t start, size_t length) { + std::lock_guard lock {_latch}; + _buffers.emplace_back(AutoIncRange {start, length}); + _current_volume += length; + } +#endif + private: [[nodiscard]] size_t _prefetch_size() const { return _batch_size * config::auto_inc_prefetch_size_ratio; diff --git a/be/src/information_schema/schema_rowsets_scanner.cpp b/be/src/information_schema/schema_rowsets_scanner.cpp index e5827a3617460e..8d546f17d46a98 100644 --- a/be/src/information_schema/schema_rowsets_scanner.cpp +++ b/be/src/information_schema/schema_rowsets_scanner.cpp @@ -61,7 +61,7 @@ std::vector SchemaRowsetsScanner::_s_tbls_columns = { {"CREATION_TIME", TYPE_DATETIME, sizeof(int64_t), true}, {"NEWEST_WRITE_TIMESTAMP", TYPE_DATETIME, sizeof(int64_t), true}, {"SCHEMA_VERSION", TYPE_INT, sizeof(int32_t), true}, - {"COMMIT_TSO", TYPE_BIGINT, sizeof(int64_t), true}, + {"COMMIT_TSO", TYPE_VARCHAR, 
sizeof(StringRef), true}, }; @@ -269,10 +269,13 @@ Status SchemaRowsetsScanner::_fill_block_impl(Block* block) { } // COMMIT_TSO { - std::vector srcs(fill_rowsets_num); + std::vector commit_tsos(fill_rowsets_num); + std::vector srcs(fill_rowsets_num); for (size_t i = fill_idx_begin; i < fill_idx_end; ++i) { RowsetSharedPtr rowset = rowsets_[i]; - srcs[i - fill_idx_begin] = rowset->commit_tso(); + commit_tsos[i - fill_idx_begin] = rowset->commit_tso().to_string(); + srcs[i - fill_idx_begin] = StringRef(commit_tsos[i - fill_idx_begin].c_str(), + commit_tsos[i - fill_idx_begin].size()); datas[i - fill_idx_begin] = srcs.data() + i - fill_idx_begin; } RETURN_IF_ERROR(fill_dest_column_for_range(block, 13, datas)); diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 5540d598d87612..36b20517afb87c 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -30,7 +30,9 @@ enum class ReaderType : uint8_t { READER_COLD_DATA_COMPACTION = 5, READER_SEGMENT_COMPACTION = 6, READER_FULL_COMPACTION = 7, - UNKNOWN = 8 + READER_BINLOG_COMPACTION = 8, + READER_BINLOG = 9, + UNKNOWN = 10 }; namespace io { diff --git a/be/src/load/channel/tablets_channel.cpp b/be/src/load/channel/tablets_channel.cpp index 9b730e8874eaa1..21737a00303910 100644 --- a/be/src/load/channel/tablets_channel.cpp +++ b/be/src/load/channel/tablets_channel.cpp @@ -45,6 +45,7 @@ #include "load/channel/load_channel.h" #include "load/delta_writer/delta_writer.h" #include "storage/storage_engine.h" +#include "storage/tablet/tablet_manager.h" #include "storage/tablet_info.h" #include "storage/txn/txn_manager.h" #include "util/defer_op.h" @@ -240,6 +241,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para wrequest.is_high_priority = _is_high_priority; wrequest.table_schema_param = _schema; wrequest.txn_expiration = params.txn_expiration(); // Required by CLOUD. 
+ wrequest.write_file_cache = params.write_file_cache(); wrequest.storage_vault_id = params.storage_vault_id(); auto delta_writer = create_delta_writer(wrequest); @@ -260,7 +262,41 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para } std::unique_ptr TabletsChannel::create_delta_writer(const WriteRequest& request) { - return std::make_unique(_engine, request, _profile, _load_id); + DCHECK(request.write_req_type == WriteRequestType::DATA); + DCHECK(request.table_schema_param != nullptr); + + int64_t row_binlog_index_id = 0; + for (const auto* index_schema : request.table_schema_param->indexes()) { + if (index_schema->index_id == request.index_id) { + row_binlog_index_id = index_schema->row_binlog_id; + break; + } + } + if (row_binlog_index_id <= 0) { + return std::make_unique(_engine, request, _profile, _load_id); + } + + const auto* row_binlog_index_schema = request.table_schema_param->row_binlog_index_schema(); + DCHECK(row_binlog_index_schema != nullptr); + DCHECK(row_binlog_index_schema->index_id == row_binlog_index_id); + + // group_build_req is only for the group wrapper itself. It provides the group semantics and + // metadata used by BaseDeltaWriter/GroupRowsetBuilder to expose tablet_id, txn_id, + // partition_id, load_id and profile information, while concrete rowset builders use the + // sub requests below. 
+ WriteRequest group_build_req = request; + group_build_req.write_req_type = WriteRequestType::GROUP; + + WriteRequest sub_data_req = request; + sub_data_req.write_req_type = WriteRequestType::DATA; + + WriteRequest sub_row_binlog_req = request; + sub_row_binlog_req.write_req_type = WriteRequestType::ROW_BINLOG; + sub_row_binlog_req.index_id = row_binlog_index_schema->index_id; + sub_row_binlog_req.schema_hash = row_binlog_index_schema->schema_hash; + + return std::make_unique(_engine, group_build_req, sub_data_req, sub_row_binlog_req, + _profile, _load_id); } Status TabletsChannel::close(LoadChannel* parent, const PTabletWriterAddBlockRequest& req, diff --git a/be/src/load/delta_writer/delta_writer.cpp b/be/src/load/delta_writer/delta_writer.cpp index 2fd053f765ebe9..b8ef440d3ca18c 100644 --- a/be/src/load/delta_writer/delta_writer.cpp +++ b/be/src/load/delta_writer/delta_writer.cpp @@ -70,9 +70,21 @@ BaseDeltaWriter::BaseDeltaWriter(const WriteRequest& req, RuntimeProfile* profil DeltaWriter::DeltaWriter(StorageEngine& engine, const WriteRequest& req, RuntimeProfile* profile, const UniqueId& load_id) : BaseDeltaWriter(req, profile, load_id), _engine(engine) { + DCHECK(req.write_req_type == WriteRequestType::DATA); _rowset_builder = std::make_unique(_engine, req, profile); } +DeltaWriter::DeltaWriter(StorageEngine& engine, const WriteRequest& group_build_req, + const WriteRequest& sub_data_req, const WriteRequest& sub_row_binlog_req, + RuntimeProfile* profile, const UniqueId& load_id) + : BaseDeltaWriter(group_build_req, profile, load_id), _engine(engine) { + DCHECK(group_build_req.write_req_type == WriteRequestType::GROUP && + sub_data_req.write_req_type == WriteRequestType::DATA && + sub_row_binlog_req.write_req_type == WriteRequestType::ROW_BINLOG); + _rowset_builder = std::make_unique(_engine, group_build_req, sub_data_req, + sub_row_binlog_req, profile); +} + void BaseDeltaWriter::_init_profile(RuntimeProfile* profile) { DCHECK(profile != nullptr); _profile = 
profile->create_child(fmt::format("DeltaWriter {}", _req.tablet_id), true, true); @@ -137,7 +149,7 @@ Status BaseDeltaWriter::init() { RETURN_IF_ERROR(_memtable_writer->init( _rowset_builder->rowset_writer(), _rowset_builder->tablet_schema(), _rowset_builder->get_partial_update_info(), wg_sptr, - _rowset_builder->tablet()->enable_unique_key_merge_on_write())); + _rowset_builder->tablet_sptr()->enable_unique_key_merge_on_write())); ExecEnv::GetInstance()->memtable_memory_limiter()->register_writer(_memtable_writer); _is_init = true; return Status::OK(); diff --git a/be/src/load/delta_writer/delta_writer.h b/be/src/load/delta_writer/delta_writer.h index ab9715b74b4d52..2e6d180f2ee958 100644 --- a/be/src/load/delta_writer/delta_writer.h +++ b/be/src/load/delta_writer/delta_writer.h @@ -124,6 +124,9 @@ class DeltaWriter final : public BaseDeltaWriter { public: DeltaWriter(StorageEngine& engine, const WriteRequest& req, RuntimeProfile* profile, const UniqueId& load_id); + DeltaWriter(StorageEngine& engine, const WriteRequest& group_build_req, + const WriteRequest& sub_data_req, const WriteRequest& sub_row_binlog_req, + RuntimeProfile* profile, const UniqueId& load_id); ~DeltaWriter() override; diff --git a/be/src/load/delta_writer/delta_writer_context.h b/be/src/load/delta_writer/delta_writer_context.h index 21d436c86358a1..61db94636002d0 100644 --- a/be/src/load/delta_writer/delta_writer_context.h +++ b/be/src/load/delta_writer/delta_writer_context.h @@ -30,9 +30,9 @@ class SlotDescriptor; class OlapTableSchemaParam; enum class WriteRequestType { - DATA = 0, - ROW_BINLOG = 1, - GROUP = 2, + DATA = 0, // data write + ROW_BINLOG = 1, // row binlog write + GROUP = 2, // group write for data and binlog }; struct WriteRequest { @@ -53,9 +53,4 @@ struct WriteRequest { std::string storage_vault_id; }; -struct GroupWriteRequest : public WriteRequest { - WriteRequest data_req; - WriteRequest row_binlog_req; -}; - } // namespace doris diff --git 
a/be/src/load/delta_writer/delta_writer_v2.cpp b/be/src/load/delta_writer/delta_writer_v2.cpp index 78271f2a48202e..c99c7ebe614e75 100644 --- a/be/src/load/delta_writer/delta_writer_v2.cpp +++ b/be/src/load/delta_writer/delta_writer_v2.cpp @@ -110,6 +110,8 @@ Status DeltaWriterV2::init() { context.rowset_state = PREPARED; context.segments_overlap = OVERLAPPING; context.tablet_schema = _tablet_schema; + context.db_id = _tablet_schema->db_id(); + context.table_id = _tablet_schema->table_id(); context.newest_write_timestamp = UnixSeconds(); context.tablet = nullptr; context.write_type = DataWriteType::TYPE_DIRECT; diff --git a/be/src/load/memtable/memtable_flush_executor.cpp b/be/src/load/memtable/memtable_flush_executor.cpp index ca79b1a834563f..6c598b57475462 100644 --- a/be/src/load/memtable/memtable_flush_executor.cpp +++ b/be/src/load/memtable/memtable_flush_executor.cpp @@ -29,10 +29,14 @@ #include "common/metrics/metrics.h" #include "common/metrics/system_metrics.h" #include "common/signal_handler.h" +#include "exec/sink/autoinc_buffer.h" #include "load/memtable/memtable.h" #include "runtime/thread_context.h" +#include "storage/binlog.h" +#include "storage/rowset/group_rowset_writer.h" #include "storage/rowset/rowset_writer.h" #include "storage/storage_engine.h" +#include "storage/tablet_info.h" #include "util/debug_points.h" #include "util/pretty_printer.h" #include "util/stopwatch.hpp" @@ -43,7 +47,7 @@ using namespace ErrorCode; bvar::Adder g_flush_task_num("memtable_flush_task_num"); -class MemtableFlushTask final : public Runnable { +class MemtableFlushTask : public Runnable { ENABLE_FACTORY_CREATOR(MemtableFlushTask); public: @@ -67,13 +71,38 @@ class MemtableFlushTask final : public Runnable { } } -private: +protected: std::weak_ptr _flush_token; std::shared_ptr _memtable; int32_t _segment_id; int64_t _submit_task_time; }; +class PartOfGroupMemtableFlushTask final : public MemtableFlushTask { + ENABLE_FACTORY_CREATOR(PartOfGroupMemtableFlushTask); + 
+public: + PartOfGroupMemtableFlushTask(std::shared_ptr flush_token, + std::shared_ptr shared_memtable, + WriteRequestType write_req_type, int64_t submit_task_time) + : MemtableFlushTask(flush_token, nullptr, 0, submit_task_time), + _shared_memtable(std::move(shared_memtable)), + _write_req_type(write_req_type) {} + + void run() override { + auto token = _flush_token.lock(); + if (token) { + token->_flush_group_memtable(_shared_memtable, _write_req_type, _submit_task_time); + } else { + LOG(WARNING) << "flush token is deconstructed, ignore the flush task"; + } + } + +private: + std::shared_ptr _shared_memtable; + WriteRequestType _write_req_type; +}; + std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) { os << "(flush time(ms)=" << stat.flush_time_ns / NANOS_PER_MILLIS << ", flush wait time(ms)=" << stat.flush_wait_time_ns / NANOS_PER_MILLIS @@ -85,6 +114,48 @@ std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) { return os; } +SharedMemtable::~SharedMemtable() { + if (block == nullptr) { + return; + } + DCHECK(memtable != nullptr); + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( + memtable->resource_ctx()->memory_context()->mem_tracker()->write_tracker()); + SCOPED_CONSUME_MEM_TRACKER(memtable->mem_tracker()); + block.reset(); +} + +Status FlushToken::_submit_sub_tasks(ThreadPool* pool, + std::vector> sub_tasks) { + for (int i = 0; i < sub_tasks.size(); ++i) { + { + std::shared_lock rdlk(_flush_status_lock); + DBUG_EXECUTE_IF("FlushToken.submit_sub_task_error", { + if (i != 0) { + // only affect flush binlog task + _flush_status = Status::IOError("dbug_be_memtable_submit_flush_error"); + } + }); + if (!_flush_status.ok()) { + return _flush_status; + } + } + Status submit_st = pool->submit(std::move(sub_tasks[i])); + if (UNLIKELY(!submit_st.ok())) { + { + std::lock_guard wrlk(_flush_status_lock); + if (_flush_status.ok()) { + _flush_status = submit_st; + } + } + _shutdown_flush_token(); + return submit_st; + } + 
_stats.flush_submit_count++; + } + return Status::OK(); +} + Status FlushToken::submit(std::shared_ptr mem_table) { { std::shared_lock rdlk(_flush_status_lock); @@ -100,8 +171,57 @@ Status FlushToken::submit(std::shared_ptr mem_table) { return Status::OK(); } int64_t submit_task_time = MonotonicNanos(); - auto task = MemtableFlushTask::create_shared( - shared_from_this(), mem_table, _rowset_writer->allocate_segment_id(), submit_task_time); + auto* group_rowset_writer = typeid_cast(_rowset_writer.get()); + std::shared_ptr shared_memtable; + std::vector> tasks; + if (group_rowset_writer != nullptr) { + auto data_writer = group_rowset_writer->data_writer(); + auto binlog_writer = group_rowset_writer->row_binlog_writer(); + DCHECK(data_writer != nullptr); + DCHECK(binlog_writer != nullptr); + + shared_memtable = std::make_shared(); + shared_memtable->memtable = mem_table; + // Keep data/binlog segment_id allocators in sync. + auto segment_id = data_writer->allocate_segment_id(); + auto binlog_segment_id = binlog_writer->allocate_segment_id(); + DCHECK_EQ(segment_id, binlog_segment_id); + shared_memtable->segment_id = segment_id; + + if (binlog_writer->context().write_binlog_opt().enable) { + if (_row_binlog_lsn_buffer == nullptr) { + std::unique_lock lock(_mutex); + if (_row_binlog_lsn_buffer == nullptr) { + if (_table_schema_param == nullptr) { + return Status::InternalError( + "need binlog but table_schema_param is null"); + } + _row_binlog_lsn_buffer = + GlobalAutoIncBuffers::GetInstance()->get_auto_inc_buffer( + _table_schema_param->db_id(), _table_schema_param->table_id(), + kBinlogLsnAutoIncId); + } + } + std::shared_ptr> lsn; + RETURN_IF_ERROR( + allocate_binlog_lsn(_row_binlog_lsn_buffer, mem_table->raw_rows(), &lsn)); + DCHECK(lsn != nullptr && !lsn->empty()); + const_cast(binlog_writer->context()) + .write_binlog_opt() + .write_binlog_config() + .insert_seg_lsn(shared_memtable->segment_id, lsn); + } + + 
tasks.emplace_back(PartOfGroupMemtableFlushTask::create_shared( + shared_from_this(), shared_memtable, WriteRequestType::DATA, submit_task_time)); + tasks.emplace_back(PartOfGroupMemtableFlushTask::create_shared( + shared_from_this(), shared_memtable, WriteRequestType::ROW_BINLOG, + submit_task_time)); + } else { + tasks.emplace_back(MemtableFlushTask::create_shared(shared_from_this(), mem_table, + _rowset_writer->allocate_segment_id(), + submit_task_time)); + } // NOTE: we should guarantee WorkloadGroup is not deconstructed when submit memtable flush task. // because currently WorkloadGroup's can only be destroyed when all queries in the group is finished, // but not consider whether load channel is finish. @@ -110,13 +230,26 @@ Status FlushToken::submit(std::shared_ptr mem_table) { if (wg_sptr) { wg_thread_pool = wg_sptr->get_memtable_flush_pool(); } - Status ret = wg_thread_pool ? wg_thread_pool->submit(std::move(task)) - : _thread_pool->submit(std::move(task)); - if (ret.ok()) { - // _wait_running_task_finish was executed after this function, so no need to notify _cond here - _stats.flush_submit_count++; - } - return ret; + ThreadPool* pool = wg_thread_pool ? wg_thread_pool : _thread_pool; + + return _submit_sub_tasks(pool, std::move(tasks)); +} + +void FlushToken::_flush_group_memtable(std::shared_ptr shared_memtable, + WriteRequestType write_req_type, int64_t submit_task_time) { + DCHECK(shared_memtable != nullptr); + DCHECK(shared_memtable->memtable != nullptr); + DCHECK(write_req_type == WriteRequestType::DATA || + write_req_type == WriteRequestType::ROW_BINLOG); + + auto* group_rowset_writer = typeid_cast(_rowset_writer.get()); + DCHECK(group_rowset_writer != nullptr); + auto flush_writer = write_req_type == WriteRequestType::DATA + ? 
group_rowset_writer->data_writer() + : group_rowset_writer->row_binlog_writer(); + DCHECK(flush_writer != nullptr); + _flush_memtable_impl(flush_writer.get(), shared_memtable->memtable.get(), + shared_memtable->segment_id, submit_task_time, shared_memtable.get()); } // NOTE: FlushToken's submit/cancel/wait run in one thread, @@ -182,47 +315,40 @@ Status FlushToken::_try_reserve_memory(const std::shared_ptr& r return st; } -Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t segment_id, int64_t* flush_size) { - VLOG_CRITICAL << "begin to flush memtable for tablet: " << memtable->tablet_id() - << ", memsize: " << PrettyPrinter::print_bytes(memtable->memory_usage()) - << ", rows: " << memtable->stat().raw_rows; - memtable->update_mem_type(MemType::FLUSH); - int64_t duration_ns = 0; - { - SCOPED_RAW_TIMER(&duration_ns); - SCOPED_ATTACH_TASK(memtable->resource_ctx()); - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( - memtable->resource_ctx()->memory_context()->mem_tracker()->write_tracker()); - SCOPED_CONSUME_MEM_TRACKER(memtable->mem_tracker()); - - // DEFER_RELEASE_RESERVED(); - - // auto reserve_size = memtable->get_flush_reserve_memory_size(); - // if (memtable->resource_ctx()->task_controller()->is_enable_reserve_memory() && - // reserve_size > 0) { - // RETURN_IF_ERROR(_try_reserve_memory(memtable->resource_ctx(), reserve_size)); - // } - - // Defer defer {[&]() { - // ExecEnv::GetInstance()->storage_engine().memtable_flush_executor()->dec_flushing_task(); - // }}; +Status FlushToken::_memtable2block(MemTable* memtable, SharedMemtable* shared_memtable, + std::shared_ptr& flush_block) { + DCHECK(memtable != nullptr); + + if (shared_memtable == nullptr) { std::unique_ptr block; RETURN_IF_ERROR(memtable->to_block(&block)); - RETURN_IF_ERROR(_rowset_writer->flush_memtable(block.get(), segment_id, flush_size)); - memtable->set_flush_success(); + flush_block.reset(block.release()); + return Status::OK(); + } + + std::call_once(shared_memtable->block_once, 
[&]() { + std::unique_ptr block; + shared_memtable->block_status = memtable->to_block(&block); + if (shared_memtable->block_status.ok()) { + shared_memtable->block.reset(block.release()); + } + }); + if (!shared_memtable->block_status.ok()) { + return shared_memtable->block_status; } - _memtable_stat += memtable->stat(); - DorisMetrics::instance()->memtable_flush_total->increment(1); - DorisMetrics::instance()->memtable_flush_duration_us->increment(duration_ns / 1000); - VLOG_CRITICAL << "after flush memtable for tablet: " << memtable->tablet_id() - << ", flushsize: " << PrettyPrinter::print_bytes(*flush_size); + flush_block = shared_memtable->block; + DCHECK(flush_block != nullptr); return Status::OK(); } -void FlushToken::_flush_memtable(std::shared_ptr memtable_ptr, int32_t segment_id, - int64_t submit_task_time) { - signal::set_signal_task_id(_rowset_writer->load_id()); - signal::tablet_id = memtable_ptr->tablet_id(); +void FlushToken::_flush_memtable_impl(RowsetWriter* flush_writer, MemTable* memtable, + int32_t segment_id, int64_t submit_task_time, + SharedMemtable* shared_memtable) { + DCHECK(flush_writer != nullptr); + DCHECK(memtable != nullptr); + + signal::set_signal_task_id(flush_writer->load_id()); + signal::tablet_id = memtable->tablet_id(); // Count the task as running before registering the deferred cleanup so // cancel/shutdown paths keep flush_running_count symmetric on every exit. 
_stats.flush_running_count++; @@ -262,10 +388,54 @@ void FlushToken::_flush_memtable(std::shared_ptr memtable_ptr, int32_t MonotonicStopWatch timer; timer.start(); - size_t memory_usage = memtable_ptr->memory_usage(); + size_t memory_usage = memtable->memory_usage(); - int64_t flush_size; - Status s = _do_flush_memtable(memtable_ptr.get(), segment_id, &flush_size); + int64_t flush_size = 0; + Status s; + memtable->update_mem_type(MemType::FLUSH); + int64_t duration_ns = 0; + { + s = [&]() { + SCOPED_RAW_TIMER(&duration_ns); + SCOPED_ATTACH_TASK(memtable->resource_ctx()); + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( + memtable->resource_ctx()->memory_context()->mem_tracker()->write_tracker()); + SCOPED_CONSUME_MEM_TRACKER(memtable->mem_tracker()); + + // DEFER_RELEASE_RESERVED(); + + // auto reserve_size = memtable->get_flush_reserve_memory_size(); + // if (memtable->resource_ctx()->task_controller()->is_enable_reserve_memory() && + // reserve_size > 0) { + // RETURN_IF_ERROR(_try_reserve_memory(memtable->resource_ctx(), reserve_size)); + // } + + // Defer defer {[&]() { + // ExecEnv::GetInstance()->storage_engine().memtable_flush_executor()->dec_flushing_task(); + // }}; + std::shared_ptr flush_block; + RETURN_IF_ERROR(_memtable2block(memtable, shared_memtable, flush_block)); + RETURN_IF_ERROR( + flush_writer->flush_memtable(flush_block.get(), segment_id, &flush_size)); + memtable->set_flush_success(); + + return Status::OK(); + }(); + + if (s.ok()) { + bool record_memtable_stat = shared_memtable == nullptr; + if (shared_memtable != nullptr) { + auto finished_sub_task_count = shared_memtable->add_finished_sub_task() + 1; + record_memtable_stat = + finished_sub_task_count == shared_memtable->total_sub_task_count.load(); + } + if (record_memtable_stat) { + _memtable_stat += memtable->stat(); + } + DorisMetrics::instance()->memtable_flush_total->increment(1); + DorisMetrics::instance()->memtable_flush_duration_us->increment(duration_ns / 1000); + } + } { 
std::shared_lock rdlk(_flush_status_lock); @@ -275,9 +445,12 @@ void FlushToken::_flush_memtable(std::shared_ptr memtable_ptr, int32_t } if (!s.ok()) { std::lock_guard wrlk(_flush_status_lock); - LOG(WARNING) << "Flush memtable failed with res = " << s - << ", load_id: " << print_id(_rowset_writer->load_id()); - _flush_status = s; + if (_flush_status.ok()) { + LOG(WARNING) << "Flush memtable failed with res = " << s + << ", load_id: " << print_id(flush_writer->load_id()); + _flush_status = s; + } + _shutdown_flush_token(); return; } @@ -292,10 +465,15 @@ void FlushToken::_flush_memtable(std::shared_ptr memtable_ptr, int32_t << ", disk size: " << PrettyPrinter::print_bytes(flush_size); _stats.flush_time_ns += timer.elapsed_time(); _stats.flush_finish_count++; - _stats.flush_size_bytes += memtable_ptr->memory_usage(); + _stats.flush_size_bytes += memtable->memory_usage(); _stats.flush_disk_size_bytes += flush_size; } +void FlushToken::_flush_memtable(std::shared_ptr memtable_ptr, int32_t segment_id, + int64_t submit_task_time) { + _flush_memtable_impl(_rowset_writer.get(), memtable_ptr.get(), segment_id, submit_task_time); +} + std::pair MemTableFlushExecutor::calc_flush_thread_count(int num_cpus, int num_disk, int thread_num_per_store) { if (config::enable_adaptive_flush_threads && num_cpus > 0) { @@ -346,10 +524,10 @@ void MemTableFlushExecutor::update_memtable_flush_threads() { } // NOTE: we use SERIAL mode here to ensure all mem-tables from one tablet are flushed in order. -Status MemTableFlushExecutor::create_flush_token(std::shared_ptr& flush_token, - std::shared_ptr rowset_writer, - bool is_high_priority, - std::shared_ptr wg_sptr) { +Status MemTableFlushExecutor::create_flush_token( + std::shared_ptr& flush_token, std::shared_ptr rowset_writer, + bool is_high_priority, std::shared_ptr wg_sptr, + std::shared_ptr table_schema_param) { switch (rowset_writer->type()) { case ALPHA_ROWSET: // alpha rowset do not support flush in CONCURRENT. 
and not support alpha rowset now. @@ -359,6 +537,7 @@ Status MemTableFlushExecutor::create_flush_token(std::shared_ptr& fl ThreadPool* pool = is_high_priority ? _high_prio_flush_pool.get() : _flush_pool.get(); flush_token = FlushToken::create_shared(pool, wg_sptr); flush_token->set_rowset_writer(rowset_writer); + flush_token->set_table_schema_param(std::move(table_schema_param)); return Status::OK(); } default: diff --git a/be/src/load/memtable/memtable_flush_executor.h b/be/src/load/memtable/memtable_flush_executor.h index ae08bec68c4daf..b97c34aa9af549 100644 --- a/be/src/load/memtable/memtable_flush_executor.h +++ b/be/src/load/memtable/memtable_flush_executor.h @@ -26,6 +26,7 @@ #include #include "common/status.h" +#include "load/delta_writer/delta_writer_context.h" #include "load/memtable/memtable.h" #include "util/threadpool.h" @@ -34,6 +35,10 @@ namespace doris { class DataDir; class MemTable; class MemTableMemoryLimiter; +class Block; +class GroupRowsetWriter; +class OlapTableSchemaParam; +class AutoIncIDBuffer; class RowsetWriter; class SystemMetrics; class WorkloadGroup; @@ -50,6 +55,29 @@ struct FlushStatistic { std::atomic_uint64_t flush_wait_time_ns = 0; }; +struct SharedMemtable { + std::shared_ptr memtable; + int32_t segment_id = 0; + + ~SharedMemtable(); + + std::once_flag block_once; + Status block_status; + std::shared_ptr block; + + std::atomic finished_sub_task_count {0}; + // data + binlog + std::atomic total_sub_task_count {2}; + + int add_finished_sub_task() { return finished_sub_task_count.fetch_add(1); } + + std::string debug_string() const { + return "PartOfGroupMemtableFlushTask{segment_id=" + std::to_string(segment_id) + + ", finished_sub_task_count=" + std::to_string(finished_sub_task_count.load()) + + ", total_sub_task_count=" + std::to_string(total_sub_task_count.load()) + "}"; + } +}; + std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat); // A thin wrapper of ThreadPoolToken to submit task. 
@@ -82,6 +110,17 @@ class FlushToken : public std::enable_shared_from_this { _rowset_writer = rowset_writer; } + void set_table_schema_param(std::shared_ptr table_schema_param) { + _table_schema_param = std::move(table_schema_param); + } + +#ifdef BE_TEST + void set_row_binlog_lsn_buffer_for_test( + std::shared_ptr row_binlog_lsn_buffer) { + _row_binlog_lsn_buffer = std::move(row_binlog_lsn_buffer); + } +#endif + const MemTableStat& memtable_stat() { return _memtable_stat; } private: @@ -92,11 +131,21 @@ class FlushToken : public std::enable_shared_from_this { private: friend class MemtableFlushTask; + friend class PartOfGroupMemtableFlushTask; + + Status _submit_sub_tasks(ThreadPool* pool, std::vector> sub_tasks); + + void _flush_memtable_impl(RowsetWriter* flush_writer, MemTable* memtable, int32_t segment_id, + int64_t submit_task_time, SharedMemtable* shared_memtable = nullptr); void _flush_memtable(std::shared_ptr memtable_ptr, int32_t segment_id, int64_t submit_task_time); - Status _do_flush_memtable(MemTable* memtable, int32_t segment_id, int64_t* flush_size); + void _flush_group_memtable(std::shared_ptr shared_memtable, + WriteRequestType write_req_type, int64_t submit_task_time); + + Status _memtable2block(MemTable* memtable, SharedMemtable* shared_memtable, + std::shared_ptr& flush_block); Status _try_reserve_memory(const std::shared_ptr& resource_context, int64_t size); @@ -110,6 +159,9 @@ class FlushToken : public std::enable_shared_from_this { std::shared_ptr _rowset_writer = nullptr; + std::shared_ptr _table_schema_param = nullptr; + std::shared_ptr _row_binlog_lsn_buffer = nullptr; + MemTableStat _memtable_stat; std::atomic _shutdown = false; @@ -145,7 +197,8 @@ class MemTableFlushExecutor { Status create_flush_token(std::shared_ptr& flush_token, std::shared_ptr rowset_writer, bool is_high_priority, - std::shared_ptr wg_sptr); + std::shared_ptr wg_sptr, + std::shared_ptr table_schema_param = nullptr); // return true if it already has any flushing task 
bool check_and_inc_has_any_flushing_task() { diff --git a/be/src/load/memtable/memtable_writer.cpp b/be/src/load/memtable/memtable_writer.cpp index 3179939a1d6061..865751d05e21b7 100644 --- a/be/src/load/memtable/memtable_writer.cpp +++ b/be/src/load/memtable/memtable_writer.cpp @@ -37,11 +37,11 @@ #include "runtime/memory/mem_tracker.h" #include "service/backend_options.h" #include "storage/rowset/beta_rowset_writer.h" +#include "storage/rowset/group_rowset_writer.h" #include "storage/rowset/rowset_writer.h" #include "storage/schema_change/schema_change.h" #include "storage/storage_engine.h" #include "storage/tablet/tablet_schema.h" -#include "storage/tablet_info.h" #include "util/mem_info.h" #include "util/stopwatch.hpp" @@ -80,7 +80,8 @@ Status MemTableWriter::init(std::shared_ptr rowset_writer, // we can make sure same keys sort in the same order in all replicas. RETURN_IF_ERROR( ExecEnv::GetInstance()->storage_engine().memtable_flush_executor()->create_flush_token( - _flush_token, _rowset_writer, _req.is_high_priority, wg_sptr)); + _flush_token, _rowset_writer, _req.is_high_priority, wg_sptr, + _req.table_schema_param)); _is_init = true; return Status::OK(); diff --git a/be/src/storage/binlog.h b/be/src/storage/binlog.h index 35fef1db444bb3..1a302ea26453d5 100644 --- a/be/src/storage/binlog.h +++ b/be/src/storage/binlog.h @@ -19,16 +19,41 @@ #include +#include +#include +#include +#include #include #include +#include +#include +#include "common/logging.h" // DCHECK +#include "common/status.h" +#include "exec/sink/autoinc_buffer.h" #include "storage/olap_common.h" +#include "storage/olap_define.h" // DataWriteType +#include "storage/tablet/tablet_schema.h" // TabletSchemaSPtr namespace doris { + +class AutoIncIDBuffer; +struct PartialUpdateInfo; +struct MowContext; + +// Row binlog op type. +// NOTE: The value is persisted into row binlog data, so keep it stable. 
+static constexpr int64_t ROW_BINLOG_APPEND = 0; +static constexpr int64_t ROW_BINLOG_UPDATE = 1; +static constexpr int64_t ROW_BINLOG_DELETE = 2; + constexpr std::string_view kBinlogPrefix = "binlog_"; constexpr std::string_view kBinlogMetaPrefix = "binlog_meta_"; constexpr std::string_view kBinlogDataPrefix = "binlog_data_"; constexpr std::string_view kRowBinlogPrefix = "binlog_row_"; +constexpr std::string_view kRowBinlogLsnColName = "__DORIS_BINLOG_LSN__"; +constexpr std::string_view kRowBinlogTimestampColName = "__DORIS_BINLOG_TIMESTAMP__"; +constexpr int64_t kBinlogLsnAutoIncId = -1; // used in file directory constexpr std::string_view FDRowBinlogSuffix = "_row_binlog"; @@ -98,4 +123,125 @@ inline std::string get_binlog_data_key_from_meta_key(const std::string_view meta // like "binlog_meta_6943f1585fe834b5-e542c2b83a21d0b7" => "binlog_data-6943f1585fe834b5-e542c2b83a21d0b7" return fmt::format("{}data_{}", kBinlogPrefix, meta_key.substr(kBinlogMetaPrefix.length())); } + +inline auto make_row_binlog_key_prefix(const TabletUid& tablet_uid, const RowsetId& rowset_id) { + return fmt::format("{}{}_{}_", kRowBinlogPrefix, tablet_uid.to_string(), rowset_id.to_string()); +} + +inline auto make_row_binlog_key(const TabletUid& tablet_uid, const RowsetId& rowset_id, + const RowsetId& binlog_rowset_id) { + return fmt::format("{}{}_{}_{}", kRowBinlogPrefix, tablet_uid.to_string(), + rowset_id.to_string(), binlog_rowset_id.to_string()); +} + +// Allocate per-row LSNs for row-binlog data. +// The caller must provide a valid auto-inc buffer (typically from GlobalAutoIncBuffers). 
+inline Status allocate_binlog_lsn(const std::shared_ptr& lsn_buffer, + size_t num_rows, std::shared_ptr>* lsn_ids) { + if (lsn_buffer == nullptr) { + return Status::InternalError("binlog try to get lsn buffer but null"); + } + DCHECK(lsn_ids != nullptr); + DCHECK(num_rows > 0); + + std::vector> ranges; + RETURN_IF_ERROR(lsn_buffer->sync_request_ids(num_rows, &ranges)); + + auto ids = std::make_shared>(); + ids->reserve(num_rows); + for (const auto& [start, length] : ranges) { + for (size_t i = 0; i < length; ++i) { + DCHECK_LE(start, std::numeric_limits::max() - static_cast(i)); + ids->push_back(start + static_cast(i)); + } + } + DCHECK_EQ(ids->size(), num_rows); + *lsn_ids = std::move(ids); + return Status::OK(); +} + +constexpr int64_t kTsoLogicalBits = 18; + +inline int64_t extract_tso_physical_time(int64_t tso) { + return tso <= 0 ? 0 : tso >> kTsoLogicalBits; +} + +inline int128_t make_row_binlog_lsn(int64_t tso, int128_t row_id) { + static constexpr int128_t kLow64Mask = (static_cast(1) << 64) - 1; + return (static_cast(tso) << 64) | (row_id & kLow64Mask); +} + +namespace segment_v2 { + +class SegmentWriteBinlogLsnMap { +public: + void insert_seg_lsn(int64_t seg_id, std::shared_ptr> lsn_ids) { + std::lock_guard l(_mutex); + _seg_id_to_lsn_ids.emplace(seg_id, std::move(lsn_ids)); + } + + void remove_seg(int64_t seg_id) { + std::lock_guard l(_mutex); + _seg_id_to_lsn_ids.erase(seg_id); + } + + std::shared_ptr> get_seg_lsn(int64_t seg_id) const { + std::lock_guard l(_mutex); + auto it = _seg_id_to_lsn_ids.find(seg_id); + CHECK(it != _seg_id_to_lsn_ids.end()) + << "SegmentWriteBinlogLsnMap::get_seg_lsn missing seg_id=" << seg_id + << ", existing_seg_ids=[" << ([&] { + std::string s; + for (const auto& [id, _] : _seg_id_to_lsn_ids) { + if (!s.empty()) { + s.push_back(','); + } + s.append(std::to_string(id)); + } + return s; + }()) + << "]"; + return it->second; + } + +private: + mutable std::mutex _mutex; + std::map>> _seg_id_to_lsn_ids; +}; + +struct 
SegmentWriteBinlogOptions { +public: + bool write_before = false; + + // source context, used for retrieving historical row and building binlog block + struct SourceWriteDataOptions { + TabletSchemaSPtr tablet_schema = nullptr; + std::shared_ptr partial_update_info; + std::shared_ptr mow_context; + bool is_transient_rowset_writer = false; + DataWriteType source_write_type = DataWriteType::TYPE_DEFAULT; + } source; + + void insert_seg_lsn(int64_t seg_id, std::shared_ptr> lsn_ids) { + DCHECK(lsn_map != nullptr); + lsn_map->insert_seg_lsn(seg_id, std::move(lsn_ids)); + } + + void remove_seg(int64_t seg_id) { + DCHECK(lsn_map != nullptr); + lsn_map->remove_seg(seg_id); + } + + std::shared_ptr> get_seg_lsn(int64_t seg_id) const { + DCHECK(lsn_map != nullptr); + return lsn_map->get_seg_lsn(seg_id); + } + + // Shared LSN storage for row-binlog writers. + // Keep it as a pointer so SegmentWriteBinlogOptions stays copyable. + std::shared_ptr lsn_map = + std::make_shared(); +}; +} // namespace segment_v2 + } // namespace doris diff --git a/be/src/storage/binlog_config.h b/be/src/storage/binlog_config.h index e0b4651db345e8..696eb6963229cc 100644 --- a/be/src/storage/binlog_config.h +++ b/be/src/storage/binlog_config.h @@ -66,10 +66,10 @@ class BinlogConfig { _need_historical_value = need_historical_value; } - bool isCCRBinlogFormat() const { + bool is_ccr_binlog_format() const { return _binlog_format == BinlogFormatPB::STATEMENT_AND_SNAPSHOT; } - bool isRowBinlogFormat() const { return _binlog_format == BinlogFormatPB::ROW; } + bool is_row_binlog_format() const { return _binlog_format == BinlogFormatPB::ROW; } BinlogConfig& operator=(const TBinlogConfig& config); BinlogConfig& operator=(const BinlogConfigPB& config); diff --git a/be/src/storage/compaction/compaction.cpp b/be/src/storage/compaction/compaction.cpp index f1c6ad3636b37a..658c478d0a8c1f 100644 --- a/be/src/storage/compaction/compaction.cpp +++ b/be/src/storage/compaction/compaction.cpp @@ -151,6 +151,17 @@ bool 
is_rowset_tidy(std::string& pre_max_key, bool& pre_rs_key_bounds_truncated, return true; } +TsoRange commit_tso_range(const std::vector& rowsets) { + DCHECK(!rowsets.empty()); + auto range = rowsets.front()->commit_tso(); + for (const auto& rowset : rowsets) { + const auto commit_tso = rowset->commit_tso(); + range.first = std::min(range.first, commit_tso.start_tso()); + range.second = std::max(range.second, commit_tso.end_tso()); + } + return range; +} + } // namespace Compaction::Compaction(BaseTabletSPtr tablet, const std::string& label) @@ -321,6 +332,7 @@ Status Compaction::merge_input_rowsets() { RETURN_NOT_OK_STATUS_WITH_WARN(_output_rs_writer->build(_output_rowset), fmt::format("rowset writer build failed. output_version: {}", _output_version.to_string())); + _output_rowset->rowset_meta()->set_commit_tso(commit_tso_range(_input_rowsets)); // When true, writers should remove variant extracted subcolumns from the // schema stored in RowsetMeta. This is used when compaction temporarily @@ -458,6 +470,7 @@ Status CompactionMixin::do_compact_ordered_rowsets() { !_tablet->enable_unique_key_merge_on_write()); rowset_meta->set_segments_key_bounds(segment_key_bounds, aggregate_key_bounds); rowset_meta->set_num_segment_rows(num_segment_rows); + rowset_meta->set_commit_tso(commit_tso_range(_input_rowsets)); _output_rowset = _output_rs_writer->manual_build(rowset_meta); diff --git a/be/src/storage/data_dir.cpp b/be/src/storage/data_dir.cpp index db53a2d741292c..ab7b56f610a724 100644 --- a/be/src/storage/data_dir.cpp +++ b/be/src/storage/data_dir.cpp @@ -29,8 +29,10 @@ #include // IWYU pragma: keep #include #include +#include #include #include +#include #include #include #include @@ -517,6 +519,37 @@ Status DataDir::load() { rowset_partition_id_eq_0_num, config::ignore_invalid_partition_id_rowset_num)); } + std::map> rowset_id_to_row_binlog_metas; + int64_t row_binlog_cnt {0}; + int64_t invalid_row_binlog_cnt {0}; + auto load_row_binlog_meta_func = + 
[&rowset_id_to_row_binlog_metas, &row_binlog_cnt, &invalid_row_binlog_cnt]( + const TabletUid& tablet_uid, const RowsetId& rowset_id, + const RowsetId& row_binlog_rowset_id, const std::string& val) -> bool { + RowsetMetaSharedPtr rowset_meta(new RowsetMeta()); + bool parsed = rowset_meta->init(val); + if (!parsed) { + LOG(WARNING) << "parse binlog meta string failed, tablet_uid=" << tablet_uid + << ", rowset_id=" << rowset_id + << ", row_binlog_rowset_id=" << row_binlog_rowset_id; + ++invalid_row_binlog_cnt; + return true; + } + DCHECK(rowset_meta->is_row_binlog()); + DCHECK_EQ(rowset_meta->tablet_uid(), tablet_uid); + rowset_id_to_row_binlog_metas[rowset_id].emplace_back(std::move(rowset_meta)); + ++row_binlog_cnt; + return true; + }; + + MonotonicStopWatch rb_timer; + rb_timer.start(); + RETURN_IF_ERROR(RowsetMetaManager::traverse_row_binlog_metas(_meta, load_row_binlog_meta_func)); + rb_timer.stop(); + LOG(INFO) << "load binlog meta finished, cost: " << rb_timer.elapsed_time_milliseconds() + << " ms, data dir: " << _path << ", count: " << row_binlog_cnt + << ", invalid: " << invalid_row_binlog_cnt; + // traverse rowset // 1. add committed rowset to txn map // 2. 
add visible rowset to tablet @@ -540,6 +573,37 @@ Status DataDir::load() { continue; } + std::vector attach_rowsets; + std::map attach_rowset_map; + bool invalid_attach_rowset = false; + if (auto it = rowset_id_to_row_binlog_metas.find(rowset_meta->rowset_id()); + it != rowset_id_to_row_binlog_metas.end()) { + for (auto& attach_rowset_meta : it->second) { + DCHECK_EQ(attach_rowset_meta->rowset_state(), rowset_meta->rowset_state()); + if (!attach_rowset_meta->tablet_schema()) { + attach_rowset_meta->set_tablet_schema(tablet->row_binlog_tablet_schema()); + } + + RowsetSharedPtr attach_rowset; + Status attach_create_status = + tablet->create_rowset(attach_rowset_meta, &attach_rowset); + if (!attach_create_status.ok()) { + LOG(WARNING) << "could not create rowset from binlog rowset meta: " + << " rowset_id: " << attach_rowset_meta->rowset_id() + << " rowset_type: " << attach_rowset_meta->rowset_type() + << " rowset_state: " << attach_rowset_meta->rowset_state(); + invalid_attach_rowset = true; + break; + } + attach_rowset_map.emplace(attach_rowset_meta->rowset_id(), + attach_rowset_meta->get_rowset_pb()); + attach_rowsets.emplace_back(std::move(attach_rowset)); + } + } + if (invalid_attach_rowset) { + continue; + } + RowsetSharedPtr rowset; Status create_status = tablet->create_rowset(rowset_meta, &rowset); if (!create_status) { @@ -549,51 +613,80 @@ Status DataDir::load() { << " rowset_state: " << rowset_meta->rowset_state(); continue; } + + std::optional binlog_format = std::nullopt; + const std::map* attach_rowset_map_ptr = nullptr; + if (!attach_rowset_map.empty()) { + binlog_format = BinlogFormatPB::ROW; + attach_rowset_map_ptr = &attach_rowset_map; + } + if (rowset_meta->rowset_state() == RowsetStatePB::COMMITTED && rowset_meta->tablet_uid() == tablet->tablet_uid()) { if (!rowset_meta->tablet_schema()) { rowset_meta->set_tablet_schema(tablet->tablet_schema()); - RETURN_IF_ERROR(RowsetMetaManager::save(_meta, rowset_meta->tablet_uid(), - rowset_meta->rowset_id(), 
- rowset_meta->get_rowset_pb(), false)); + RETURN_IF_ERROR(RowsetMetaManager::save( + _meta, rowset_meta->tablet_uid(), rowset_meta->rowset_id(), + rowset_meta->get_rowset_pb(), binlog_format, attach_rowset_map_ptr)); + } + std::vector rowset_ids {rowset_meta->rowset_id()}; + for (const auto& attach_rowset : attach_rowsets) { + if (attach_rowset != nullptr) { + rowset_ids.emplace_back(attach_rowset->rowset_id()); + } } Status commit_txn_status = _engine.txn_manager()->commit_txn( _meta, rowset_meta->partition_id(), rowset_meta->txn_id(), rowset_meta->tablet_id(), rowset_meta->tablet_uid(), rowset_meta->load_id(), - rowset, _engine.pending_local_rowsets().add(rowset_meta->rowset_id()), true); + rowset, _engine.pending_local_rowsets().add(rowset_ids), true, nullptr, + attach_rowsets.empty() ? nullptr : &attach_rowsets); if (commit_txn_status || commit_txn_status.is()) { LOG(INFO) << "successfully to add committed rowset: " << rowset_meta->rowset_id() << " to tablet: " << rowset_meta->tablet_id() << " schema hash: " << rowset_meta->tablet_schema_hash() - << " for txn: " << rowset_meta->txn_id(); + << " for txn: " << rowset_meta->txn_id() << ", binlog rowset: " + << (attach_rowsets.empty() ? "0" + : attach_rowsets[0]->rowset_id().to_string()); } else if (commit_txn_status.is()) { LOG(WARNING) << "failed to add committed rowset: " << rowset_meta->rowset_id() << " to tablet: " << rowset_meta->tablet_id() << " for txn: " << rowset_meta->txn_id() - << " error: " << commit_txn_status; + << " error: " << commit_txn_status << ", binlog rowset: " + << (attach_rowsets.empty() + ? "0" + : attach_rowsets[0]->rowset_id().to_string()); return commit_txn_status; } else { LOG(WARNING) << "failed to add committed rowset: " << rowset_meta->rowset_id() << " to tablet: " << rowset_meta->tablet_id() << " for txn: " << rowset_meta->txn_id() - << " error: " << commit_txn_status; + << " error: " << commit_txn_status << ", binlog rowset: " + << (attach_rowsets.empty() + ? 
"0" + : attach_rowsets[0]->rowset_id().to_string()); } } else if (rowset_meta->rowset_state() == RowsetStatePB::VISIBLE && rowset_meta->tablet_uid() == tablet->tablet_uid()) { if (!rowset_meta->tablet_schema()) { rowset_meta->set_tablet_schema(tablet->tablet_schema()); - RETURN_IF_ERROR(RowsetMetaManager::save(_meta, rowset_meta->tablet_uid(), - rowset_meta->rowset_id(), - rowset_meta->get_rowset_pb(), false)); + RETURN_IF_ERROR(RowsetMetaManager::save( + _meta, rowset_meta->tablet_uid(), rowset_meta->rowset_id(), + rowset_meta->get_rowset_pb(), binlog_format, attach_rowset_map_ptr)); } - Status publish_status = tablet->add_rowset(rowset); + DCHECK_LE(attach_rowsets.size(), 1); + Status publish_status = tablet->add_rowset( + rowset, attach_rowsets.empty() ? nullptr : attach_rowsets[0]); if (!publish_status && !publish_status.is()) { LOG(WARNING) << "add visible rowset to tablet failed rowset_id:" << rowset->rowset_id() << " tablet id: " << rowset_meta->tablet_id() << " txn id:" << rowset_meta->txn_id() << " start_version: " << rowset_meta->version().first - << " end_version: " << rowset_meta->version().second; + << " end_version: " << rowset_meta->version().second + << ", binlog rowset: " + << (attach_rowsets.empty() + ? "0" + : attach_rowsets[0]->rowset_id().to_string()); } } else { LOG(WARNING) << "find invalid rowset: " << rowset_meta->rowset_id() @@ -601,7 +694,10 @@ Status DataDir::load() { << " tablet uid: " << rowset_meta->tablet_uid() << " schema hash: " << rowset_meta->tablet_schema_hash() << " txn: " << rowset_meta->txn_id() - << " current valid tablet uid: " << tablet->tablet_uid(); + << " current valid tablet uid: " << tablet->tablet_uid() + << ", binlog rowset: " + << (attach_rowsets.empty() ? 
"0" + : attach_rowsets[0]->rowset_id().to_string()); ++invalid_rowset_counter; } } @@ -774,6 +870,9 @@ void DataDir::_perform_rowset_gc(const std::string& tablet_schema_hash_path) { tablet->traverse_rowsets( [&rowsets_in_version_map](auto& rs) { rowsets_in_version_map.insert(rs->rowset_id()); }, true); + for (const auto& [_, rb_meta] : tablet->tablet_meta()->all_row_binlog_rs_metas()) { + rowsets_in_version_map.insert(rb_meta->rowset_id()); + } DBUG_EXECUTE_IF("DataDir::_perform_rowset_gc.simulation.slow", { auto target_tablet_id = dp->param("tablet_id", -1); @@ -796,7 +895,9 @@ void DataDir::_perform_rowset_gc(const std::string& tablet_schema_hash_path) { return !rowsets_in_version_map.contains(rowset_id) && !_engine.check_rowset_id_in_unused_rowsets(rowset_id) && RowsetMetaManager::exists(get_meta(), tablet->tablet_uid(), rowset_id) - .is(); + .is() && + !RowsetMetaManager::row_binlog_meta_exists(get_meta(), tablet->tablet_uid(), + rowset_id); }; // rowset_id -> is_garbage diff --git a/be/src/storage/iterator/vcollect_iterator.cpp b/be/src/storage/iterator/vcollect_iterator.cpp index 1d70e61eee62f9..879ee982a5f5b0 100644 --- a/be/src/storage/iterator/vcollect_iterator.cpp +++ b/be/src/storage/iterator/vcollect_iterator.cpp @@ -76,6 +76,10 @@ void VCollectIterator::init(TabletReader* reader, bool ori_data_overlapping, boo _merge = false; } + if (_reader->_reader_type == ReaderType::READER_BINLOG) { + _merge = false; + } + // When data is none overlapping, no need to build heap to traverse data if (!ori_data_overlapping) { _merge = false; diff --git a/be/src/storage/iterators.h b/be/src/storage/iterators.h index 1c9b551874360c..a14c8ee3434ac5 100644 --- a/be/src/storage/iterators.h +++ b/be/src/storage/iterators.h @@ -132,6 +132,7 @@ class StorageReadOptions { RuntimeState* runtime_state = nullptr; RowsetId rowset_id; Version version; + TsoRange commit_tso; int64_t tablet_id = 0; // slots that cast may be eliminated in storage layer std::map 
target_cast_type_for_variants; diff --git a/be/src/storage/olap_common.h b/be/src/storage/olap_common.h index 2c31e92b115ed3..c98b4b025cb7bb 100644 --- a/be/src/storage/olap_common.h +++ b/be/src/storage/olap_common.h @@ -249,6 +249,16 @@ struct Version { std::string to_string() const { return fmt::format("[{}-{}]", first, second); } }; +struct TsoRange : public Version { + TsoRange() : Version(-1, -1) {} + TsoRange(int64_t start_tso, int64_t end_tso) : Version(start_tso, end_tso) {} + + int64_t start_tso() const { return first; } + int64_t end_tso() const { return second; } + + bool contains(const TsoRange& other) const { return Version::contains(other); } +}; + using Versions = std::vector; inline std::ostream& operator<<(std::ostream& os, const Version& version) { diff --git a/be/src/storage/partial_update_info.cpp b/be/src/storage/partial_update_info.cpp index 7b97ecfc081167..05d648d4819b15 100644 --- a/be/src/storage/partial_update_info.cpp +++ b/be/src/storage/partial_update_info.cpp @@ -31,6 +31,7 @@ #include "storage/olap_common.h" #include "storage/rowset/rowset.h" #include "storage/rowset/rowset_writer_context.h" +#include "storage/segment/historical_row_retriever.h" #include "storage/segment/vertical_segment_writer.h" #include "storage/tablet/base_tablet.h" #include "storage/tablet/tablet_meta.h" @@ -365,16 +366,20 @@ Status FixedReadPlan::read_columns_by_plan( } Status FixedReadPlan::fill_missing_columns( - RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const segment_v2::HistoricalRowRetrieverContext& historical_context, + const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, Block& full_block, const std::vector& use_default_or_null_flag, bool has_default_or_nullable, uint32_t segment_start_pos, const Block* block) const { auto mutable_full_columns = full_block.mutate_columns(); // create old value columns - const auto& missing_cids = rowset_ctx->partial_update_info->missing_cids; + 
DCHECK(historical_context.partial_update_info != nullptr); + DCHECK(historical_context.tablet_schema != nullptr); + const auto& partial_update_info = *historical_context.partial_update_info; + const auto& missing_cids = partial_update_info.missing_cids; bool have_input_seq_column = false; if (tablet_schema.has_sequence_col()) { - const std::vector& including_cids = rowset_ctx->partial_update_info->update_cids; + const std::vector& including_cids = partial_update_info.update_cids; have_input_seq_column = (std::find(including_cids.cbegin(), including_cids.cend(), tablet_schema.sequence_col_idx()) != including_cids.cend()); @@ -395,9 +400,9 @@ Status FixedReadPlan::fill_missing_columns( } // build default value columns auto default_value_block = old_value_block.clone_empty(); - RETURN_IF_ERROR(BaseTablet::generate_default_value_block( - tablet_schema, missing_cids, rowset_ctx->partial_update_info->default_values, - old_value_block, default_value_block)); + RETURN_IF_ERROR(BaseTablet::generate_default_value_block(tablet_schema, missing_cids, + partial_update_info.default_values, + old_value_block, default_value_block)); auto mutable_default_value_columns = default_value_block.mutate_columns(); // fill all missing value from mutable_old_columns, need to consider default value and null value @@ -434,8 +439,8 @@ Status FixedReadPlan::fill_missing_columns( auto* nullable_column = assert_cast(missing_col.get()); nullable_column->insert_many_defaults(1); } else if (tablet_schema.auto_increment_column() == tablet_column.name()) { - const auto& column = - *DORIS_TRY(rowset_ctx->tablet_schema->column(tablet_column.name())); + const auto& column = *DORIS_TRY( + historical_context.tablet_schema->column(tablet_column.name())); DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT); auto* auto_inc_column = assert_cast(missing_col.get()); int pos = block->get_position_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL); @@ -540,28 +545,30 @@ Status 
FlexibleReadPlan::read_columns_by_plan( } Status FlexibleReadPlan::fill_non_primary_key_columns( - RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const segment_v2::HistoricalRowRetrieverContext& historical_context, + const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, Block& full_block, const std::vector& use_default_or_null_flag, bool has_default_or_nullable, uint32_t segment_start_pos, uint32_t block_start_pos, const Block* block, std::vector* skip_bitmaps) const { auto mutable_full_columns = full_block.mutate_columns(); + DCHECK(historical_context.partial_update_info != nullptr); // missing_cids are all non sort key columns' cids - const auto& non_sort_key_cids = rowset_ctx->partial_update_info->missing_cids; + const auto& non_sort_key_cids = historical_context.partial_update_info->missing_cids; auto old_value_block = tablet_schema.create_block_by_cids(non_sort_key_cids); CHECK_EQ(non_sort_key_cids.size(), old_value_block.columns()); if (!use_row_store) { RETURN_IF_ERROR(fill_non_primary_key_columns_for_column_store( - rowset_ctx, rsid_to_rowset, tablet_schema, non_sort_key_cids, old_value_block, - mutable_full_columns, use_default_or_null_flag, has_default_or_nullable, - segment_start_pos, block_start_pos, block, skip_bitmaps)); + historical_context, rsid_to_rowset, tablet_schema, non_sort_key_cids, + old_value_block, mutable_full_columns, use_default_or_null_flag, + has_default_or_nullable, segment_start_pos, block_start_pos, block, skip_bitmaps)); } else { RETURN_IF_ERROR(fill_non_primary_key_columns_for_row_store( - rowset_ctx, rsid_to_rowset, tablet_schema, non_sort_key_cids, old_value_block, - mutable_full_columns, use_default_or_null_flag, has_default_or_nullable, - segment_start_pos, block_start_pos, block, skip_bitmaps)); + historical_context, rsid_to_rowset, tablet_schema, non_sort_key_cids, + old_value_block, mutable_full_columns, use_default_or_null_flag, + has_default_or_nullable, segment_start_pos, 
block_start_pos, block, skip_bitmaps)); } full_block.set_columns(std::move(mutable_full_columns)); return Status::OK(); @@ -630,13 +637,14 @@ static void fill_non_primary_key_cell_for_column_store( } Status FlexibleReadPlan::fill_non_primary_key_columns_for_column_store( - RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const segment_v2::HistoricalRowRetrieverContext& historical_context, + const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, Block& old_value_block, MutableColumns& mutable_full_columns, const std::vector& use_default_or_null_flag, bool has_default_or_nullable, uint32_t segment_start_pos, uint32_t block_start_pos, const Block* block, std::vector* skip_bitmaps) const { - auto* info = rowset_ctx->partial_update_info.get(); + auto* info = historical_context.partial_update_info.get(); int32_t seq_col_unique_id = -1; if (tablet_schema.has_sequence_col()) { seq_col_unique_id = tablet_schema.column(tablet_schema.sequence_col_idx()).unique_id(); @@ -735,13 +743,14 @@ static void fill_non_primary_key_cell_for_row_store( } Status FlexibleReadPlan::fill_non_primary_key_columns_for_row_store( - RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const segment_v2::HistoricalRowRetrieverContext& historical_context, + const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, Block& old_value_block, MutableColumns& mutable_full_columns, const std::vector& use_default_or_null_flag, bool has_default_or_nullable, uint32_t segment_start_pos, uint32_t block_start_pos, const Block* block, std::vector* skip_bitmaps) const { - auto* info = rowset_ctx->partial_update_info.get(); + auto* info = historical_context.partial_update_info.get(); int32_t seq_col_unique_id = -1; if (tablet_schema.has_sequence_col()) { seq_col_unique_id = tablet_schema.column(tablet_schema.sequence_col_idx()).unique_id(); diff --git a/be/src/storage/partial_update_info.h 
b/be/src/storage/partial_update_info.h index 6371a79fe71db9..0192f35b9dbf64 100644 --- a/be/src/storage/partial_update_info.h +++ b/be/src/storage/partial_update_info.h @@ -38,6 +38,9 @@ struct RowLocation; class Block; class MutableBlock; class IOlapColumnDataAccessor; +namespace segment_v2 { +struct HistoricalRowRetrieverContext; +} struct RowsetWriterContext; struct RowsetId; @@ -119,6 +122,7 @@ struct RidAndPos { class FixedReadPlan { public: bool empty() const; + void clear() { plan.clear(); } void prepare_to_read(const RowLocation& row_location, size_t pos); Status read_columns_by_plan(const TabletSchema& tablet_schema, std::vector cids_to_read, @@ -126,7 +130,7 @@ class FixedReadPlan { Block& block, std::map* read_index, bool force_read_old_delete_signs, const signed char* __restrict cur_delete_signs = nullptr) const; - Status fill_missing_columns(RowsetWriterContext* rowset_ctx, + Status fill_missing_columns(const segment_v2::HistoricalRowRetrieverContext& historical_context, const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, Block& full_block, const std::vector& use_default_or_null_flag, @@ -154,16 +158,16 @@ class FlexibleReadPlan { const std::map& rsid_to_rowset, Block& old_value_block, std::map* read_index) const; - Status fill_non_primary_key_columns(RowsetWriterContext* rowset_ctx, - const std::map& rsid_to_rowset, - const TabletSchema& tablet_schema, Block& full_block, - const std::vector& use_default_or_null_flag, - bool has_default_or_nullable, uint32_t segment_start_pos, - uint32_t block_start_pos, const Block* block, - std::vector* skip_bitmaps) const; + Status fill_non_primary_key_columns( + const segment_v2::HistoricalRowRetrieverContext& historical_context, + const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, Block& full_block, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + uint32_t segment_start_pos, uint32_t block_start_pos, const Block* block, + std::vector* skip_bitmaps) 
const; Status fill_non_primary_key_columns_for_column_store( - RowsetWriterContext* rowset_ctx, + const segment_v2::HistoricalRowRetrieverContext& historical_context, const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, Block& old_value_block, MutableColumns& mutable_full_columns, @@ -171,7 +175,7 @@ class FlexibleReadPlan { uint32_t segment_start_pos, uint32_t block_start_pos, const Block* block, std::vector* skip_bitmaps) const; Status fill_non_primary_key_columns_for_row_store( - RowsetWriterContext* rowset_ctx, + const segment_v2::HistoricalRowRetrieverContext& historical_context, const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, Block& old_value_block, MutableColumns& mutable_full_columns, diff --git a/be/src/storage/rowset/beta_rowset_reader.cpp b/be/src/storage/rowset/beta_rowset_reader.cpp index 61170b37226ae4..2f4dc26d07e38a 100644 --- a/be/src/storage/rowset/beta_rowset_reader.cpp +++ b/be/src/storage/rowset/beta_rowset_reader.cpp @@ -113,6 +113,7 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context _read_options.collection_statistics = _read_context->collection_statistics; _read_options.rowset_id = _rowset->rowset_id(); _read_options.version = _rowset->version(); + _read_options.commit_tso = _rowset->rowset_meta()->commit_tso(); _read_options.tablet_id = _rowset->rowset_meta()->tablet_id(); _read_options.topn_limit = _topn_limit; if (_read_context->lower_bound_keys != nullptr) { diff --git a/be/src/storage/rowset/beta_rowset_writer.cpp b/be/src/storage/rowset/beta_rowset_writer.cpp index 464eab64c2ef33..befb9be491c420 100644 --- a/be/src/storage/rowset/beta_rowset_writer.cpp +++ b/be/src/storage/rowset/beta_rowset_writer.cpp @@ -96,6 +96,8 @@ void build_rowset_meta_with_spec_field(RowsetMeta& rowset_meta, rowset_meta.set_rowset_state(spec_rowset_meta.rowset_state()); rowset_meta.set_segments_key_bounds_truncated( 
spec_rowset_meta.is_segments_key_bounds_truncated()); + rowset_meta.set_db_id(spec_rowset_meta.db_id()); + rowset_meta.set_table_id(spec_rowset_meta.table_id()); std::vector segments_key_bounds; spec_rowset_meta.get_segments_key_bounds(&segments_key_bounds); // Preserve source layout: if source was aggregated (size 1), re-aggregating @@ -318,6 +320,7 @@ BetaRowsetWriter::~BetaRowsetWriter() { Status BaseBetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) { _context = rowset_writer_context; + DCHECK(_context.tablet_schema != nullptr); _rowset_meta.reset(new RowsetMeta); if (_context.storage_resource) { _rowset_meta->set_remote_storage_resource(*_context.storage_resource); @@ -325,6 +328,8 @@ Status BaseBetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte _rowset_meta->set_rowset_id(_context.rowset_id); _rowset_meta->set_partition_id(_context.partition_id); _rowset_meta->set_tablet_id(_context.tablet_id); + _rowset_meta->set_db_id(_context.db_id); + _rowset_meta->set_table_id(_context.table_id); _rowset_meta->set_index_id(_context.index_id); _rowset_meta->set_tablet_schema_hash(_context.tablet_schema_hash); _rowset_meta->set_rowset_type(_context.rowset_type); @@ -340,9 +345,10 @@ Status BaseBetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte } _rowset_meta->set_tablet_uid(_context.tablet_uid); _rowset_meta->set_tablet_schema(_context.tablet_schema); - if (_context.write_binlog_opt().is_binlog_writer()) { + if (_context.write_binlog_opt().enable) { _rowset_meta->mark_row_binlog(); } + _rowset_meta->set_compaction_level(_context.compaction_level); _context.segment_collector = std::make_shared>(this); _context.file_writer_creator = std::make_shared>(this); return Status::OK(); @@ -354,7 +360,8 @@ Status BaseBetaRowsetWriter::add_block(const Block* block) { Status BaseBetaRowsetWriter::_generate_delete_bitmap(int32_t segment_id) { SCOPED_RAW_TIMER(&_delete_bitmap_ns); - if 
(!_context.tablet->enable_unique_key_merge_on_write() || + if (_context.is_transient_rowset_writer || + !_context.tablet->enable_unique_key_merge_on_write() || (_context.partial_update_info && _context.partial_update_info->is_partial_update())) { return Status::OK(); } diff --git a/be/src/storage/rowset/beta_rowset_writer.h b/be/src/storage/rowset/beta_rowset_writer.h index 4da74560bcad52..a7f044c10e74a2 100644 --- a/be/src/storage/rowset/beta_rowset_writer.h +++ b/be/src/storage/rowset/beta_rowset_writer.h @@ -168,11 +168,22 @@ class BaseBetaRowsetWriter : public RowsetWriter { int32_t allocate_segment_id() override { return _segment_creator.allocate_segment_id(); }; + int32_t get_allocated_segment_id() override { + return _segment_creator.get_allocated_segment_id(); + }; + void set_segment_start_id(int32_t start_id) override { _segment_creator.set_segment_start_id(start_id); _segment_start_id = start_id; } + Status force_rollback() override { + DCHECK(_context.is_transient_rowset_writer); + DCHECK(_already_built); + _already_built = false; + return Status::OK(); + } + int64_t delete_bitmap_ns() override { return _delete_bitmap_ns; } int64_t segment_writer_ns() override { return _segment_writer_ns; } diff --git a/be/src/storage/rowset/beta_rowset_writer_v2.h b/be/src/storage/rowset/beta_rowset_writer_v2.h index 7788f9d2ce670b..5ee55209b0507c 100644 --- a/be/src/storage/rowset/beta_rowset_writer_v2.h +++ b/be/src/storage/rowset/beta_rowset_writer_v2.h @@ -113,6 +113,10 @@ class BetaRowsetWriterV2 : public RowsetWriter { int32_t allocate_segment_id() override { return _segment_creator.allocate_segment_id(); }; + int32_t get_allocated_segment_id() override { + return _segment_creator.get_allocated_segment_id(); + }; + int32_t next_segment_id() { return _segment_creator.next_segment_id(); }; int64_t delete_bitmap_ns() override { return _delete_bitmap_ns; } diff --git a/be/src/storage/rowset/group_rowset_writer.cpp b/be/src/storage/rowset/group_rowset_writer.cpp index 
4d6b7f2e8c836f..3328ca484a4242 100644 --- a/be/src/storage/rowset/group_rowset_writer.cpp +++ b/be/src/storage/rowset/group_rowset_writer.cpp @@ -18,11 +18,13 @@ #include "storage/rowset/group_rowset_writer.h" #include "storage/rowset/beta_rowset_writer.h" +#include "storage/segment/segment_writer.h" +#include "util/debug_points.h" namespace doris { void GroupRowsetWriter::set_data_writer(const RowsetWriterSharedPtr& txn_rowset_writer) { - _txn_rowset_writer = std::dynamic_pointer_cast(txn_rowset_writer); + _txn_rowset_writer = txn_rowset_writer; } void GroupRowsetWriter::set_row_binlog_writer( @@ -32,21 +34,42 @@ void GroupRowsetWriter::set_row_binlog_writer( Status GroupRowsetWriter::flush_rowsets() { RETURN_IF_ERROR(_txn_rowset_writer->flush()); - if (_row_binlog_rowset_writer) { - RETURN_IF_ERROR(_row_binlog_rowset_writer->flush()); - } + RETURN_IF_ERROR(_row_binlog_rowset_writer->flush()); return Status::OK(); } Status GroupRowsetWriter::build_rowsets(std::vector& rowsets) { - if (rowsets.size() < 2) { - return Status::InvalidArgument( - "GroupRowsetWriter::build_rowsets expects at least 2 rowset slots"); + rowsets.clear(); + rowsets.reserve(2); + + RowsetSharedPtr txn_rowset; + RowsetSharedPtr row_binlog_rowset; + RETURN_IF_ERROR(_txn_rowset_writer->build(txn_rowset)); + Status st = Status::OK(); + DBUG_EXECUTE_IF("GroupRowsetWriter::build_rowsets.row_binlog_build_failed", + { st = Status::InternalError("debug row binlog build failed"); }); + if (st.ok()) { + st = _row_binlog_rowset_writer->build(row_binlog_rowset); } - RETURN_IF_ERROR(_txn_rowset_writer->build(rowsets[0])); - if (_row_binlog_rowset_writer) { - RETURN_IF_ERROR(_row_binlog_rowset_writer->build(rowsets[1])); + if (!st.ok()) { + RETURN_IF_ERROR(_txn_rowset_writer->force_rollback()); + return st; } + + rowsets.emplace_back(std::move(txn_rowset)); + rowsets.emplace_back(std::move(row_binlog_rowset)); + return Status::OK(); +} + +Status GroupRowsetWriter::flush_memtable(Block* block, int32_t 
segment_id, int64_t* flush_size) { + RETURN_IF_ERROR(_txn_rowset_writer->flush_memtable(block, segment_id, flush_size)); + RETURN_IF_ERROR(_row_binlog_rowset_writer->flush_memtable(block, segment_id, flush_size)); + return Status::OK(); +} + +Status GroupRowsetWriter::flush_single_block(const Block* block) { + RETURN_IF_ERROR(_txn_rowset_writer->flush_single_block(block)); + RETURN_IF_ERROR(_row_binlog_rowset_writer->flush_single_block(block)); return Status::OK(); } diff --git a/be/src/storage/rowset/group_rowset_writer.h b/be/src/storage/rowset/group_rowset_writer.h index a48f8aa29bc9f3..bd0b365dd5253a 100644 --- a/be/src/storage/rowset/group_rowset_writer.h +++ b/be/src/storage/rowset/group_rowset_writer.h @@ -20,6 +20,7 @@ #include "storage/rowset/rowset_writer.h" namespace doris { +class Block; class GroupRowsetWriter : public RowsetWriter { public: GroupRowsetWriter() = default; @@ -64,6 +65,10 @@ class GroupRowsetWriter : public RowsetWriter { // note that `add_row` could also trigger flush when certain conditions are met Status flush() override { return flush_rowsets(); } + Status flush_memtable(Block* block, int32_t segment_id, int64_t* flush_size) override; + + Status flush_single_block(const Block* block) override; + // GroupRowsetWriter does not support build a single rowset; its build is // delegated to underlying writers. 
Status build(RowsetSharedPtr& rowset) override { @@ -106,6 +111,14 @@ class GroupRowsetWriter : public RowsetWriter { return -1; } + int32_t get_allocated_segment_id() override { + DCHECK(_txn_rowset_writer != nullptr); + DCHECK(_row_binlog_rowset_writer != nullptr); + auto seg_id = _txn_rowset_writer->get_allocated_segment_id(); + DCHECK_EQ(seg_id, _row_binlog_rowset_writer->get_allocated_segment_id()); + return seg_id; + } + void set_segment_start_id(int num_segment) override { LOG(FATAL) << "GroupRowsetWriter::set_segment_start_id not supported"; } diff --git a/be/src/storage/rowset/rowset.h b/be/src/storage/rowset/rowset.h index f0c5403ca48b93..b952c44197cdec 100644 --- a/be/src/storage/rowset/rowset.h +++ b/be/src/storage/rowset/rowset.h @@ -166,8 +166,8 @@ class Rowset : public std::enable_shared_from_this, public MetadataAdder RowsetMetaPB get_rowset_pb() const { return rowset_meta()->get_rowset_pb(); } // The writing time of the newest data in rowset, to measure the freshness of a rowset. int64_t newest_write_timestamp() const { return rowset_meta()->newest_write_timestamp(); } - // The commit tso of the newest data in rowset. - int64_t commit_tso() const { return rowset_meta()->commit_tso(); } + // The commit tso range of the data in rowset. 
+ TsoRange commit_tso() const { return rowset_meta()->commit_tso(); } bool is_segments_overlapping() const { return rowset_meta()->is_segments_overlapping(); } KeysType keys_type() { return _schema->keys_type(); } diff --git a/be/src/storage/rowset/rowset_factory.cpp b/be/src/storage/rowset/rowset_factory.cpp index eabd48bba54ae2..e6d0c7a8e371f8 100644 --- a/be/src/storage/rowset/rowset_factory.cpp +++ b/be/src/storage/rowset/rowset_factory.cpp @@ -55,10 +55,10 @@ Result> RowsetFactory::create_rowset_writer( return ResultError(Status::Error("invalid rowset_type")); } - if (context.write_binlog_opt().is_binlog_writer()) { + if (context.write_binlog_opt().enable) { std::unique_ptr writer; if (is_vertical) { - writer = std::make_unique>(engine); + writer = std::make_unique>(engine); RETURN_IF_ERROR_RESULT(writer->init(context)); return writer; } else { diff --git a/be/src/storage/rowset/rowset_meta.h b/be/src/storage/rowset/rowset_meta.h index 39b2ea2fead726..ff4f64466d99d9 100644 --- a/be/src/storage/rowset/rowset_meta.h +++ b/be/src/storage/rowset/rowset_meta.h @@ -94,6 +94,14 @@ class RowsetMeta : public MetadataAdder { void set_tablet_id(int64_t tablet_id) { _rowset_meta_pb.set_tablet_id(tablet_id); } + int64_t db_id() const { return _rowset_meta_pb.db_id(); } + + void set_db_id(int64_t db_id) { _rowset_meta_pb.set_db_id(db_id); } + + int64_t table_id() const { return _rowset_meta_pb.table_id(); } + + void set_table_id(int64_t table_id) { _rowset_meta_pb.set_table_id(table_id); } + int64_t index_id() const { return _rowset_meta_pb.index_id(); } void set_index_id(int64_t index_id) { _rowset_meta_pb.set_index_id(index_id); } @@ -478,9 +486,18 @@ class RowsetMeta : public MetadataAdder { [algorithm]() -> Result { return algorithm; }); } - int64_t commit_tso() const { return _rowset_meta_pb.commit_tso(); } + TsoRange commit_tso() const { + const auto& commit_tso_pb = _rowset_meta_pb.commit_tso(); + return {commit_tso_pb.start_tso(), commit_tso_pb.end_tso()}; + } + + 
void set_commit_tso(const TsoRange& commit_tso) { + auto* commit_tso_pb = _rowset_meta_pb.mutable_commit_tso(); + commit_tso_pb->set_start_tso(commit_tso.start_tso()); + commit_tso_pb->set_end_tso(commit_tso.end_tso()); + } - void set_commit_tso(int64_t commit_tso) { _rowset_meta_pb.set_commit_tso(commit_tso); } + void set_commit_tso(int64_t commit_tso) { set_commit_tso({commit_tso, commit_tso}); } void set_cloud_fields_after_visible(int64_t visible_version, int64_t version_update_time_ms) { // Update rowset meta with correct version and visible_ts diff --git a/be/src/storage/rowset/rowset_meta_manager.cpp b/be/src/storage/rowset/rowset_meta_manager.cpp index 0b9e11bbfa402a..2b9e48d093808b 100644 --- a/be/src/storage/rowset/rowset_meta_manager.cpp +++ b/be/src/storage/rowset/rowset_meta_manager.cpp @@ -72,7 +72,9 @@ Status RowsetMetaManager::get_rowset_meta(OlapMeta* meta, TabletUid tablet_uid, } Status RowsetMetaManager::save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, - const RowsetMetaPB& rowset_meta_pb, bool enable_binlog) { + const RowsetMetaPB& rowset_meta_pb, + std::optional binlog_format, + const std::map* attach_rowset_map) { if (rowset_meta_pb.partition_id() <= 0) { LOG(WARNING) << "invalid partition id " << rowset_meta_pb.partition_id() << " tablet " << rowset_meta_pb.tablet_id(); @@ -87,11 +89,16 @@ Status RowsetMetaManager::save(OlapMeta* meta, TabletUid tablet_uid, const Rowse LOG(WARNING) << "set debug point RowsetMetaManager::save::zero_partition_id old=" << partition_id << " new=" << rowset_meta_pb.DebugString(); }); - if (enable_binlog) { - return _save_with_binlog(meta, tablet_uid, rowset_id, rowset_meta_pb); - } else { + if (!binlog_format.has_value()) { return _save(meta, tablet_uid, rowset_id, rowset_meta_pb); } + if (*binlog_format == BinlogFormatPB::STATEMENT_AND_SNAPSHOT) { + return _save_with_ccr_binlog(meta, tablet_uid, rowset_id, rowset_meta_pb); + } + DCHECK_EQ(*binlog_format, BinlogFormatPB::ROW); + 
DCHECK(attach_rowset_map != nullptr); + DCHECK(!attach_rowset_map->empty()); + return _save_with_row_binlog(meta, tablet_uid, rowset_id, rowset_meta_pb, *attach_rowset_map); } Status RowsetMetaManager::_save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, @@ -107,9 +114,9 @@ Status RowsetMetaManager::_save(OlapMeta* meta, TabletUid tablet_uid, const Rows return meta->put(META_COLUMN_FAMILY_INDEX, key, value); } -Status RowsetMetaManager::_save_with_binlog(OlapMeta* meta, TabletUid tablet_uid, - const RowsetId& rowset_id, - const RowsetMetaPB& rowset_meta_pb) { +Status RowsetMetaManager::_save_with_ccr_binlog(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb) { // create rowset write data std::string rowset_key = fmt::format("{}{}_{}", ROWSET_PREFIX, tablet_uid.to_string(), rowset_id.to_string()); @@ -153,6 +160,40 @@ Status RowsetMetaManager::_save_with_binlog(OlapMeta* meta, TabletUid tablet_uid return meta->put(META_COLUMN_FAMILY_INDEX, entries); } +Status RowsetMetaManager::_save_with_row_binlog( + OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb, + const std::map& attach_rowset_map) { + std::vector keys; + std::vector values; + keys.reserve(attach_rowset_map.size() + 1); + values.reserve(attach_rowset_map.size() + 1); + + keys.emplace_back( + fmt::format("{}{}_{}", ROWSET_PREFIX, tablet_uid.to_string(), rowset_id.to_string())); + if (!rowset_meta_pb.SerializeToString(&values.emplace_back())) { + return Status::Error("serialize rowset pb failed. 
rowset id:{}", + keys[0]); + } + + for (const auto& [row_binlog_rs_id, row_binlog_rs_meta_pb] : attach_rowset_map) { + keys.emplace_back(make_row_binlog_key(tablet_uid, rowset_id, row_binlog_rs_id)); + DCHECK(row_binlog_rs_meta_pb.has_is_row_binlog() && row_binlog_rs_meta_pb.is_row_binlog()) + << row_binlog_rs_meta_pb.ShortDebugString(); + if (!row_binlog_rs_meta_pb.SerializeToString(&values.emplace_back())) { + return Status::Error( + "serialize rowset pb failed. rowset id:{}", keys.back()); + } + } + + std::vector entries; + entries.reserve(keys.size()); + for (size_t i = 0; i < keys.size(); ++i) { + entries.emplace_back(keys[i], values[i]); + } + return meta->put(META_COLUMN_FAMILY_INDEX, entries); +} + std::vector RowsetMetaManager::get_binlog_filenames(OlapMeta* meta, TabletUid tablet_uid, std::string_view binlog_version, @@ -478,6 +519,78 @@ Status RowsetMetaManager::remove_binlog(OlapMeta* meta, const std::string& suffi kBinlogDataPrefix.data() + suffix}); } +Status RowsetMetaManager::remove_row_binlog(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& base_rowset_id, + const RowsetId& row_binlog_rowset_id) { + return meta->remove(META_COLUMN_FAMILY_INDEX, + make_row_binlog_key(tablet_uid, base_rowset_id, row_binlog_rowset_id)); +} + +Status RowsetMetaManager::remove_row_binlog_metas(OlapMeta* meta, TabletUid tablet_uid, + const std::set& row_binlog_rowset_ids) { + std::map base_rowset_id_to_row_binlog; + RETURN_IF_ERROR(get_row_binlog_base_rowset_ids(meta, tablet_uid, base_rowset_id_to_row_binlog, + row_binlog_rowset_ids)); + for (const auto& [base_rowset_id, row_binlog_rowset_id] : base_rowset_id_to_row_binlog) { + RETURN_IF_ERROR(remove_row_binlog(meta, tablet_uid, base_rowset_id, row_binlog_rowset_id)); + } + return Status::OK(); +} + +bool RowsetMetaManager::row_binlog_meta_exists(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& row_binlog_rowset_id) { + bool found = false; + auto probe = [&found, &row_binlog_rowset_id](std::string_view key, 
+ std::string_view /* value */) -> bool { + std::vector parts; + // key format: binlog_row_uuid_{rowset_id}_{row_binlog_rowset_id} + RETURN_IF_ERROR(split_string(key, '_', &parts)); + if (parts.size() != 5) { + LOG(WARNING) << "invalid binlog key:" << key << ", splitted size:" << parts.size(); + return true; + } + RowsetId id; + id.init(parts[4]); + if (id == row_binlog_rowset_id) { + found = true; + return false; + } + return true; + }; + static_cast(meta->iterate(META_COLUMN_FAMILY_INDEX, + std::string(kRowBinlogPrefix) + tablet_uid.to_string(), probe)); + return found; +} + +Status RowsetMetaManager::get_row_binlog_base_rowset_ids( + OlapMeta* meta, TabletUid tablet_uid, + std::map& base_rowset_id_to_row_binlog, + const std::set& row_binlog_rowset_ids) { + auto collect_row_binlog_base_rowset_id = + [&base_rowset_id_to_row_binlog, &row_binlog_rowset_ids]( + std::string_view key, std::string_view /* value */) -> bool { + std::vector parts; + // key format: binlog_row_uuid_{rowset_id}_{row_binlog_rowset_id} + RETURN_IF_ERROR(split_string(key, '_', &parts)); + if (parts.size() != 5) { + LOG(WARNING) << "invalid binlog key:" << key << ", splitted size:" << parts.size(); + return true; + } + + RowsetId rowset_id; + rowset_id.init(parts[3]); + RowsetId row_binlog_rowset_id; + row_binlog_rowset_id.init(parts[4]); + if (row_binlog_rowset_ids.contains(row_binlog_rowset_id)) { + base_rowset_id_to_row_binlog.emplace(rowset_id, row_binlog_rowset_id); + } + return true; + }; + return meta->iterate(META_COLUMN_FAMILY_INDEX, + std::string(kRowBinlogPrefix) + tablet_uid.to_string(), + collect_row_binlog_base_rowset_id); +} + Status RowsetMetaManager::ingest_binlog_metas(OlapMeta* meta, TabletUid tablet_uid, RowsetBinlogMetasPB* metas_pb) { std::vector entries; @@ -559,6 +672,36 @@ Status RowsetMetaManager::traverse_binlog_metas( return status; } +Status RowsetMetaManager::traverse_row_binlog_metas( + OlapMeta* meta, std::function const& func) { + auto 
traverse_row_binlog_rowset_meta_func = [&func](std::string_view key, + std::string_view value) -> bool { + std::vector parts; + // key format: binlog_row_uuid_{rowset_id}_{row_binlog_rowset_id} + RETURN_IF_ERROR(split_string(key, '_', &parts)); + if (parts.size() != 5) { + LOG(WARNING) << "invalid rowset key:" << key << ", splitted size:" << parts.size(); + return true; + } + std::vector uid_parts; + RETURN_IF_ERROR(split_string(parts[2], '-', &uid_parts)); + if (uid_parts.size() != 2) { + LOG(WARNING) << "invalid tablet uid in binlog key:" << key + << ", splitted size:" << uid_parts.size(); + return true; + } + TabletUid tablet_uid(uid_parts[0], uid_parts[1]); + RowsetId rowset_id; + rowset_id.init(parts[3]); + RowsetId row_binlog_rowset_id; + row_binlog_rowset_id.init(parts[4]); + return func(tablet_uid, rowset_id, row_binlog_rowset_id, std::string(value)); + }; + return meta->iterate(META_COLUMN_FAMILY_INDEX, std::string(kRowBinlogPrefix), + traverse_row_binlog_rowset_meta_func); +} + Status RowsetMetaManager::save_partial_update_info( OlapMeta* meta, int64_t tablet_id, int64_t partition_id, int64_t txn_id, const PartialUpdateInfoPB& partial_update_info_pb) { diff --git a/be/src/storage/rowset/rowset_meta_manager.h b/be/src/storage/rowset/rowset_meta_manager.h index a2d289481ed1f3..2007f43829a0da 100644 --- a/be/src/storage/rowset/rowset_meta_manager.h +++ b/be/src/storage/rowset/rowset_meta_manager.h @@ -22,6 +22,9 @@ #include #include +#include +#include +#include #include #include #include @@ -55,7 +58,18 @@ class RowsetMetaManager { RowsetMetaSharedPtr rowset_meta); // TODO(Drogon): refactor save && _save_with_binlog to one, adapt to ut temperately static Status save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, - const RowsetMetaPB& rowset_meta_pb, bool enable_binlog); + const RowsetMetaPB& rowset_meta_pb, + std::optional binlog_format = std::nullopt, + const std::map* attach_rowset_map = nullptr); + + // STATEMENT_AND_SNAPSHOT + static 
Status save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb, bool enable_binlog) { + return save(meta, tablet_uid, rowset_id, rowset_meta_pb, + enable_binlog + ? std::optional(BinlogFormatPB::STATEMENT_AND_SNAPSHOT) + : std::nullopt); + } static std::vector get_binlog_filenames(OlapMeta* meta, TabletUid tablet_uid, std::string_view binlog_version, @@ -74,6 +88,21 @@ class RowsetMetaManager { static Status remove_binlog(OlapMeta* meta, const std::string& suffix); static Status ingest_binlog_metas(OlapMeta* meta, TabletUid tablet_uid, RowsetBinlogMetasPB* metas_pb); + + static Status remove_row_binlog(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& base_rowset_id, + const RowsetId& row_binlog_rowset_id); + static Status remove_row_binlog_metas(OlapMeta* meta, TabletUid tablet_uid, + const std::set& row_binlog_rowset_ids); + static bool row_binlog_meta_exists(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& row_binlog_rowset_id); + static Status get_row_binlog_base_rowset_ids( + OlapMeta* meta, TabletUid tablet_uid, + std::map& base_rowset_id_to_row_binlog, + const std::set& row_binlog_rowset_ids); + static Status traverse_row_binlog_metas( + OlapMeta* meta, std::function const& func); static Status traverse_rowset_metas(OlapMeta* meta, std::function const& collector); @@ -101,8 +130,13 @@ class RowsetMetaManager { private: static Status _save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, const RowsetMetaPB& rowset_meta_pb); - static Status _save_with_binlog(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, - const RowsetMetaPB& rowset_meta_pb); + static Status _save_with_ccr_binlog(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb); + static Status _save_with_row_binlog(OlapMeta* meta, TabletUid tablet_uid, + const RowsetId& rowset_id, + const RowsetMetaPB& rowset_meta_pb, + const std::map& attach_rowset_map); 
static Status _get_rowset_binlog_metas(OlapMeta* meta, const TabletUid tablet_uid, const std::vector& binlog_versions, RowsetBinlogMetasPB* metas_pb); diff --git a/be/src/storage/rowset/rowset_writer.h b/be/src/storage/rowset/rowset_writer.h index 75c0bff084bd8c..0652470db92ce7 100644 --- a/be/src/storage/rowset/rowset_writer.h +++ b/be/src/storage/rowset/rowset_writer.h @@ -172,10 +172,19 @@ class RowsetWriter { virtual int32_t allocate_segment_id() = 0; + // Return the next segment id to be allocated without advancing internal state. + // NOTE: This value equals the one that would be returned by the next + // `allocate_segment_id()` call. + virtual int32_t get_allocated_segment_id() = 0; + virtual void set_segment_start_id(int num_segment) { throw Exception(Status::FatalError("not supported!")); } + virtual Status force_rollback() { + return Status::NotSupported("RowsetWriter::force_rollback not supported"); + } + virtual int64_t delete_bitmap_ns() { return 0; } virtual int64_t segment_writer_ns() { return 0; } diff --git a/be/src/storage/rowset/rowset_writer_context.h b/be/src/storage/rowset/rowset_writer_context.h index b594e756a18f17..58dd12fc1ffa8a 100644 --- a/be/src/storage/rowset/rowset_writer_context.h +++ b/be/src/storage/rowset/rowset_writer_context.h @@ -32,8 +32,10 @@ #include "io/fs/file_writer.h" #include "io/fs/packed_file_system.h" #include "runtime/exec_env.h" +#include "storage/binlog.h" #include "storage/olap_define.h" #include "storage/partial_update_info.h" +#include "storage/segment/historical_row_retriever.h" #include "storage/storage_policy.h" #include "storage/tablet/tablet.h" #include "storage/tablet/tablet_schema.h" @@ -47,6 +49,10 @@ class Tablet; class FileWriterCreator; class SegmentCollector; +namespace segment_v2 { +struct HistoricalRowRetrieverContext; +} + struct RowsetWriterContext { RowsetWriterContext() : schema_lock(new std::mutex) { load_id.set_hi(0); @@ -54,6 +60,8 @@ struct RowsetWriterContext { } RowsetId rowset_id; + 
int64_t db_id {0}; + int64_t table_id {0}; int64_t tablet_id {0}; int32_t tablet_schema_hash {0}; int64_t index_id {0}; @@ -118,6 +126,8 @@ struct RowsetWriterContext { bool is_transient_rowset_writer = false; + segment_v2::HistoricalRowRetrieverContext make_historical_row_retriever_context(); + // Intent flag: caller can actively turn merge-file feature on/off for this rowset. // This describes whether we *want* to try small-file merging. bool allow_packed_file = true; @@ -252,29 +262,24 @@ struct RowsetWriterContext { struct BinlogOptions { public: - void mark_primary_writer() { binlog_write_type = BinlogWriteType::PrimaryWriter; } + bool enable = false; - void mark_binlog_writer() { binlog_write_type = BinlogWriteType::BinlogWriter; } - - bool is_primary_writer() const { - return binlog_write_type == BinlogWriteType::PrimaryWriter; + void set_need_before(bool need_before) { + this->_need_before = need_before; + _segment_write_binlog_opt.write_before = need_before; } - bool is_binlog_writer() const { return binlog_write_type == BinlogWriteType::BinlogWriter; } - - bool need_build_binlog() const { return binlog_write_type != BinlogWriteType::Unknown; } + segment_v2::SegmentWriteBinlogOptions& write_binlog_config() { + return _segment_write_binlog_opt; + } - void set_need_before(bool need_before) { this->_need_before = need_before; } + const segment_v2::SegmentWriteBinlogOptions& write_binlog_config() const { + return _segment_write_binlog_opt; + } private: - // if you don't need to build row_binlog, `PrimaryWriter` and `BinlogWriter` are both false - // if you need to build row_binlog, the `is_primary_writer` of normal rowset writer is true - enum BinlogWriteType { - PrimaryWriter, - BinlogWriter, - Unknown - } binlog_write_type = BinlogWriteType::Unknown; bool _need_before = false; + segment_v2::SegmentWriteBinlogOptions _segment_write_binlog_opt; } _write_binlog_opt; BinlogOptions& write_binlog_opt() { return _write_binlog_opt; } @@ -282,4 +287,15 @@ struct 
RowsetWriterContext { const BinlogOptions& write_binlog_opt() const { return _write_binlog_opt; } }; +inline segment_v2::HistoricalRowRetrieverContext +RowsetWriterContext::make_historical_row_retriever_context() { + return segment_v2::HistoricalRowRetrieverContext { + .tablet = tablet, + .tablet_schema = tablet_schema, + .rowset_writer_ctx = this, + .partial_update_info = partial_update_info, + .is_transient_rowset_writer = is_transient_rowset_writer, + .write_type = write_type}; +} + } // namespace doris diff --git a/be/src/storage/rowset/segment_creator.cpp b/be/src/storage/rowset/segment_creator.cpp index 5d67ad7b93bb25..37b847ce216fd3 100644 --- a/be/src/storage/rowset/segment_creator.cpp +++ b/be/src/storage/rowset/segment_creator.cpp @@ -43,6 +43,7 @@ #include "io/fs/file_writer.h" #include "storage/olap_define.h" #include "storage/rowset/beta_rowset_writer.h" // SegmentStatistics +#include "storage/segment/row_binlog_segment_writer.h" #include "storage/segment/segment_writer.h" #include "storage/segment/vertical_segment_writer.h" #include "storage/tablet/tablet_schema.h" @@ -69,7 +70,9 @@ Status SegmentFlusher::flush_single_block(const Block* block, int32_t segment_id } Block flush_block(*block); bool no_compression = flush_block.bytes() <= config::segment_compression_threshold_kb * 1024; - if (config::enable_vertical_segment_writer) { + bool use_vertical_segment_writer = + config::enable_vertical_segment_writer && !_context.write_binlog_opt().enable; + if (use_vertical_segment_writer) { std::unique_ptr writer; RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, no_compression)); RETURN_IF_ERROR_OR_CATCH_EXCEPTION(_add_rows(writer, &flush_block, 0, flush_block.rows())); @@ -124,9 +127,16 @@ Status SegmentFlusher::_create_segment_writer(std::unique_ptr( - segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, writer_options, index_file_writer.get()); + if (_context.write_binlog_opt().enable) { + writer = 
std::make_unique( + segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, + _context.data_dir, writer_options, + _context.write_binlog_opt().write_binlog_config()); + } else { + writer = std::make_unique( + segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, + _context.data_dir, writer_options, index_file_writer.get()); + } RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); if (_context.tablet_schema->has_inverted_index() || _context.tablet_schema->has_ann_index()) { RETURN_IF_ERROR(_idx_files.add(segment_id, std::move(index_file_writer))); diff --git a/be/src/storage/rowset/segment_creator.h b/be/src/storage/rowset/segment_creator.h index cfa7c85cc685a4..034f3cfa457910 100644 --- a/be/src/storage/rowset/segment_creator.h +++ b/be/src/storage/rowset/segment_creator.h @@ -178,6 +178,9 @@ class SegmentCreator { int32_t allocate_segment_id() { return _next_segment_id.fetch_add(1); } + // Return the next segment id to be allocated without advancing internal state. 
+ int32_t get_allocated_segment_id() const { return _next_segment_id.load(); } + int32_t next_segment_id() const { return _next_segment_id.load(); } int64_t num_rows_written() const { return _segment_flusher.num_rows_written(); } diff --git a/be/src/storage/rowset/vertical_beta_rowset_writer.cpp b/be/src/storage/rowset/vertical_beta_rowset_writer.cpp index 3736124fc0ff52..260c6a87410cc9 100644 --- a/be/src/storage/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/storage/rowset/vertical_beta_rowset_writer.cpp @@ -44,6 +44,7 @@ using namespace ErrorCode; template class VerticalBetaRowsetWriter; template class VerticalBetaRowsetWriter; +template class VerticalBetaRowsetWriter; template requires std::is_base_of_v diff --git a/be/src/storage/rowset_builder.cpp b/be/src/storage/rowset_builder.cpp index aa075f386c48c3..2f2e83669a5d11 100644 --- a/be/src/storage/rowset_builder.cpp +++ b/be/src/storage/rowset_builder.cpp @@ -90,7 +90,7 @@ void BaseRowsetBuilder::_init_profile(RuntimeProfile* profile) { _profile = profile->create_child( fmt::format( "RowsetBuilder {} {}", _req.tablet_id, - _req.write_req_type == WriteRequestType::ROW_BINLOG ? "row_binlog" : "data"), + _req.write_req_type == WriteRequestType::ROW_BINLOG ? 
"binlog" : "data"), true, true); _build_rowset_timer = ADD_TIMER(_profile, "BuildRowsetTime"); _submit_delete_bitmap_timer = ADD_TIMER(_profile, "DeleteBitmapSubmitTime"); @@ -124,10 +124,6 @@ Tablet* RowsetBuilder::tablet() { return static_cast(_tablet.get()); } -TabletSharedPtr RowsetBuilder::tablet_sptr() { - return std::static_pointer_cast(_tablet); -} - void RowsetBuilder::_garbage_collection(bool cancel_txn) { Status rollback_status; bool need_clean = true; @@ -196,8 +192,9 @@ Status RowsetBuilder::check_tablet_version_count() { (version_count > max_version_config - 100) && !GlobalMemoryArbitrator::is_exceed_soft_mem_limit(GB_EXCHANGE_BYTE)) { // Trigger compaction - auto st = _engine.submit_compaction_task( - tablet_sptr(), CompactionType::CUMULATIVE_COMPACTION, true, true, 2); + auto st = _engine.submit_compaction_task(std::static_pointer_cast(tablet_sptr()), + CompactionType::CUMULATIVE_COMPACTION, true, true, + 2); if (!st.ok()) [[unlikely]] { LOG(WARNING) << "failed to trigger compaction, tablet_id=" << _tablet->tablet_id() << " : " << st; @@ -216,10 +213,6 @@ Status RowsetBuilder::init() { RETURN_IF_ERROR(_init_context_common_fields(context)); - if (tablet()->enable_row_binlog()) { - context.write_binlog_opt().mark_primary_writer(); - } - std::shared_ptr mow_context; if (_tablet->enable_unique_key_merge_on_write()) { RETURN_IF_ERROR(init_mow_context(mow_context)); @@ -272,6 +265,8 @@ Status BaseRowsetBuilder::_init_context_common_fields(RowsetWriterContext& conte context.txn_id = _req.txn_id; context.load_id = _req.load_id; + context.db_id = _req.table_schema_param->db_id(); + context.table_id = _req.table_schema_param->table_id(); context.rowset_state = PREPARED; context.segments_overlap = OVERLAPPING; context.tablet_schema = _tablet_schema; @@ -280,6 +275,9 @@ Status BaseRowsetBuilder::_init_context_common_fields(RowsetWriterContext& conte context.index_id = _req.index_id; context.tablet = _tablet; context.enable_segcompaction = true; + if 
(_req.write_req_type == WriteRequestType::ROW_BINLOG || !_attach_rowset_ids.empty()) { + context.enable_segcompaction = false; + } context.write_type = DataWriteType::TYPE_DIRECT; context.write_file_cache = _req.write_file_cache; @@ -299,6 +297,8 @@ Status BaseRowsetBuilder::build_rowset() { Status GroupRowsetBuilder::build_rowset() { // build binlog rowset first, then data rowset + // If data rowset build fails after binlog has built some segments, we rely on the + // RowsetBuilder/StorageEngine garbage-collection paths to clean up built-but-uncommitted files. RETURN_IF_ERROR(_row_binlog_rowset_builder->build_rowset()); return _txn_rs_builder->build_rowset(); } @@ -393,7 +393,7 @@ Status RowsetBuilder::commit_txn() { // Transfer ownership of `PendingRowsetGuard` to `TxnManager` Status res = _engine.txn_manager()->commit_txn( _req.partition_id, *tablet(), _req.txn_id, _req.load_id, _rowset, - std::move(_pending_rs_guard), false, _partial_update_info); + std::move(_pending_rs_guard), false, _partial_update_info, &_attach_rowsets); if (!res && !res.is()) { LOG(WARNING) << "Failed to commit txn: " << _req.txn_id @@ -401,6 +401,7 @@ Status RowsetBuilder::commit_txn() { return res; } if (_tablet->enable_unique_key_merge_on_write()) { + // no need to update binlog_delvec, it'll be updated in publish phase _engine.txn_manager()->set_txn_related_delete_bitmap( _req.partition_id, _req.txn_id, tablet()->tablet_id(), tablet()->tablet_uid(), true, _delete_bitmap, *_rowset_ids, _partial_update_info); @@ -425,19 +426,26 @@ Status BaseRowsetBuilder::cancel() { Status BaseRowsetBuilder::_build_current_tablet_schema( int64_t index_id, const OlapTableSchemaParam* table_schema_param, const TabletSchema& ori_tablet_schema) { - // find the right index id - int i = 0; - auto indexes = table_schema_param->indexes(); - for (; i < indexes.size(); i++) { - if (indexes[i]->index_id == index_id) { - break; + const OlapTableIndexSchema* index_schema = nullptr; + if (_req.write_req_type == 
WriteRequestType::ROW_BINLOG) { + const auto* row_binlog_index_schema = table_schema_param->row_binlog_index_schema(); + DCHECK(row_binlog_index_schema != nullptr); + DCHECK_EQ(row_binlog_index_schema->index_id, index_id); + index_schema = row_binlog_index_schema; + } else { + for (const auto* schema : table_schema_param->indexes()) { + if (schema->index_id == index_id) { + index_schema = schema; + break; + } } } - if (!indexes.empty() && !indexes[i]->columns.empty() && - indexes[i]->columns[0]->unique_id() >= 0) { + + if (index_schema != nullptr && !index_schema->columns.empty() && + index_schema->columns[0]->unique_id() >= 0) { _tablet_schema->shawdow_copy_without_columns(ori_tablet_schema); _tablet_schema->build_current_tablet_schema( - index_id, cast_set(table_schema_param->version()), indexes[i], + index_id, cast_set(table_schema_param->version()), index_schema, ori_tablet_schema); } else { _tablet_schema->copy_from(ori_tablet_schema); @@ -466,32 +474,32 @@ Status BaseRowsetBuilder::_build_current_tablet_schema( _tablet_schema->set_auto_increment_column(table_schema_param->auto_increment_coulumn()); } // set partial update columns info - _partial_update_info = std::make_shared(); - RETURN_IF_ERROR(_partial_update_info->init( - tablet()->tablet_id(), _req.txn_id, *_tablet_schema, - table_schema_param->unique_key_update_mode(), - table_schema_param->partial_update_new_key_policy(), - table_schema_param->partial_update_input_columns(), - table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), - table_schema_param->nano_seconds(), table_schema_param->timezone(), - table_schema_param->auto_increment_coulumn(), - table_schema_param->sequence_map_col_uid(), _max_version_in_flush_phase)); + if (is_data_builder()) { + _partial_update_info = std::make_shared(); + RETURN_IF_ERROR(_partial_update_info->init( + tablet()->tablet_id(), _req.txn_id, *_tablet_schema, + table_schema_param->unique_key_update_mode(), + 
table_schema_param->partial_update_new_key_policy(), + table_schema_param->partial_update_input_columns(), + table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), + table_schema_param->nano_seconds(), table_schema_param->timezone(), + table_schema_param->auto_increment_coulumn(), + table_schema_param->sequence_map_col_uid(), _max_version_in_flush_phase)); + } return Status::OK(); } -GroupRowsetBuilder::GroupRowsetBuilder(StorageEngine& engine, const WriteRequest& req, - const WriteRequest& row_binlog_req, RuntimeProfile* profile) - : BaseRowsetBuilder( - [](int64_t tablet_id) { - WriteRequest group_req; - group_req.tablet_id = tablet_id; - group_req.write_req_type = WriteRequestType::GROUP; - return group_req; - }(req.tablet_id), - profile) { +GroupRowsetBuilder::GroupRowsetBuilder(StorageEngine& engine, const WriteRequest& group_build_req, + const WriteRequest& sub_data_req, + const WriteRequest& sub_row_binlog_req, + RuntimeProfile* profile) + : BaseRowsetBuilder(group_build_req, profile) { + DCHECK(group_build_req.write_req_type == WriteRequestType::GROUP && + sub_data_req.write_req_type == WriteRequestType::DATA && + sub_row_binlog_req.write_req_type == WriteRequestType::ROW_BINLOG); _row_binlog_rowset_builder = - std::make_shared(engine, row_binlog_req, profile); - _txn_rs_builder = std::make_shared(engine, req, profile); + std::make_shared(engine, sub_row_binlog_req, profile); + _txn_rs_builder = std::make_shared(engine, sub_data_req, profile); } Status GroupRowsetBuilder::init() { @@ -511,6 +519,18 @@ Status GroupRowsetBuilder::init() { group_writer->set_data_writer(_txn_rs_builder->rowset_writer()); group_writer->set_row_binlog_writer(_row_binlog_rowset_builder->rowset_writer()); + { + const auto& data_ctx = _txn_rs_builder->rowset_writer()->context(); + auto& binlog_ctx = const_cast( + _row_binlog_rowset_builder->rowset_writer()->context()); + auto& cfg = binlog_ctx.write_binlog_opt().write_binlog_config(); + cfg.source.tablet_schema = 
data_ctx.tablet_schema; + cfg.source.partial_update_info = data_ctx.partial_update_info; + cfg.source.mow_context = data_ctx.mow_context; + cfg.source.is_transient_rowset_writer = data_ctx.is_transient_rowset_writer; + cfg.source.source_write_type = data_ctx.write_type; + } + _rowset_writer = std::move(group_writer); _is_init = true; return Status::OK(); @@ -528,7 +548,12 @@ Status GroupRowsetBuilder::commit_txn() { // Attach binlog rowset to txn rowset, so that commit/rollback and // clean-up are all handled by txn rowset builder. RETURN_IF_ERROR(_txn_rs_builder->attach_rowset_to_txn(_row_binlog_rowset_builder->rowset())); - return _txn_rs_builder->commit_txn(); + auto st = _txn_rs_builder->commit_txn(); + if (st.ok()) { + // Avoid RowBinlogRowsetBuilder being cleaned in its base dtor. + RETURN_IF_ERROR(_row_binlog_rowset_builder->commit_txn()); + } + return st; } Status RowBinlogRowsetBuilder::init() { @@ -540,7 +565,7 @@ Status RowBinlogRowsetBuilder::init() { RETURN_IF_ERROR(_build_current_tablet_schema( _req.index_id, _req.table_schema_param.get(), *std::dynamic_pointer_cast(_tablet)->row_binlog_tablet_schema())); - context.write_binlog_opt().mark_binlog_writer(); + context.write_binlog_opt().enable = true; _rowset_writer = DORIS_TRY(_tablet->create_rowset_writer(context, false)); // need to attach PendingRowsetGuard after txn_rs_builder init diff --git a/be/src/storage/rowset_builder.h b/be/src/storage/rowset_builder.h index edc97432eaf9c9..fe3488c7ff86be 100644 --- a/be/src/storage/rowset_builder.h +++ b/be/src/storage/rowset_builder.h @@ -72,14 +72,16 @@ class BaseRowsetBuilder { const BaseTabletSPtr& tablet() const { return _tablet; } - const RowsetSharedPtr& rowset() const { return _rowset; } + virtual const BaseTabletSPtr& tablet_sptr() const { return _tablet; } - const TabletSchemaSPtr& tablet_schema() const { return _tablet_schema; } + virtual const RowsetSharedPtr& rowset() const { return _rowset; } + + virtual const TabletSchemaSPtr& tablet_schema() 
const { return _tablet_schema; } // For UT const DeleteBitmapPtr& get_delete_bitmap() { return _delete_bitmap; } - const std::shared_ptr& get_partial_update_info() const { + virtual const std::shared_ptr& get_partial_update_info() const { return _partial_update_info; } @@ -160,8 +162,6 @@ class RowsetBuilder : public BaseRowsetBuilder { void _garbage_collection(bool cancel_txn); - TabletSharedPtr tablet_sptr(); - StorageEngine& _engine; RuntimeProfile::Counter* _commit_txn_timer = nullptr; }; @@ -183,6 +183,7 @@ class RowBinlogRowsetBuilder : public RowsetBuilder { // the owner of rowset will be changed, so cleaning rowset is handed to the // data(txn) rowset builder. Status commit_txn() override { + std::lock_guard l(_lock); _is_committed = true; return Status::OK(); } @@ -193,8 +194,9 @@ class RowBinlogRowsetBuilder : public RowsetBuilder { // Now only support one tablet class GroupRowsetBuilder : public BaseRowsetBuilder { public: - GroupRowsetBuilder(StorageEngine& engine, const WriteRequest& req, - const WriteRequest& row_binlog_req, RuntimeProfile* profile); + GroupRowsetBuilder(StorageEngine& engine, const WriteRequest& group_build_req, + const WriteRequest& sub_data_req, const WriteRequest& sub_row_binlog_req, + RuntimeProfile* profile); Status init() override; @@ -206,6 +208,18 @@ class GroupRowsetBuilder : public BaseRowsetBuilder { Status commit_txn() override; + const BaseTabletSPtr& tablet_sptr() const override { return _txn_rs_builder->tablet_sptr(); } + + const RowsetSharedPtr& rowset() const override { return _txn_rs_builder->rowset(); } + + const TabletSchemaSPtr& tablet_schema() const override { + return _txn_rs_builder->tablet_schema(); + } + + const std::shared_ptr& get_partial_update_info() const override { + return _txn_rs_builder->get_partial_update_info(); + } + RowsetBuilder* txn_rowset_builder() { return _txn_rs_builder.get(); } RowsetBuilder* row_binlog_builder() { return _row_binlog_rowset_builder.get(); } diff --git 
a/be/src/storage/rowset_version_mgr.cpp b/be/src/storage/rowset_version_mgr.cpp index 368d2539a1079b..aa6ed733c39658 100644 --- a/be/src/storage/rowset_version_mgr.cpp +++ b/be/src/storage/rowset_version_mgr.cpp @@ -64,10 +64,12 @@ static bvar::LatencyRecorder g_remote_fetch_tablet_rowsets_latency("remote_fetch [[nodiscard]] Result> BaseTablet::capture_consistent_versions_unlocked( const Version& version_range, const CaptureRowsetOps& options) const { std::vector version_path; - auto st = - _timestamped_version_tracker.capture_consistent_versions(version_range, &version_path); + auto& version_tracker = + options.capture_row_binlog ? _row_binlog_version_tracker : _timestamped_version_tracker; + auto st = version_tracker.capture_consistent_versions(version_range, &version_path); if (!st && !options.quiet) { - auto missed_versions = get_missed_versions_unlocked(version_range.second); + auto missed_versions = + get_missed_versions_unlocked(version_range.second, options.capture_row_binlog); if (missed_versions.empty()) { LOG(WARNING) << fmt::format( "version already has been merged. version_range={}, max_version={}, " @@ -109,14 +111,17 @@ static bvar::LatencyRecorder g_remote_fetch_tablet_rowsets_latency("remote_fetch auto rowset_for_version = [&](const Version& version, bool include_stale) -> Result { - if (auto it = _rs_version_map.find(version); it != _rs_version_map.end()) { + const auto& rs_version_map = + options.capture_row_binlog ? _row_binlog_rs_version_map : _rs_version_map; + if (auto it = rs_version_map.find(version); it != rs_version_map.end()) { return it->second; } else { - VLOG_NOTICE << "fail to find Rowset in rs_version for version. tablet=" - << tablet_id() << ", version='" << version.first << "-" - << version.second; + VLOG_NOTICE << "fail to find Rowset in " + << (options.capture_row_binlog ? "row_binlog_rs_version" : "rs_version") + << " for version. 
tablet=" << tablet_id() << ", version='" + << version.first << "-" << version.second; } - if (include_stale) { + if (!options.capture_row_binlog && include_stale) { if (auto it = _stale_rs_version_map.find(version); it != _stale_rs_version_map.end()) { return it->second; @@ -139,7 +144,9 @@ static bvar::LatencyRecorder g_remote_fetch_tablet_rowsets_latency("remote_fetch rowsets.push_back(std::move(ret.value())); } - if (keys_type() == KeysType::UNIQUE_KEYS && enable_unique_key_merge_on_write()) { + if (options.capture_row_binlog) { + result.delete_bitmap = _tablet_meta->binlog_delvec_ptr(); + } else if (keys_type() == KeysType::UNIQUE_KEYS && enable_unique_key_merge_on_write()) { result.delete_bitmap = _tablet_meta->delete_bitmap_ptr(); } return result; diff --git a/be/src/storage/schema.cpp b/be/src/storage/schema.cpp index 9a7c59a24d6ab9..63718a20d4b5ce 100644 --- a/be/src/storage/schema.cpp +++ b/be/src/storage/schema.cpp @@ -53,7 +53,15 @@ Schema& Schema::operator=(const Schema& other) { void Schema::_copy_from(const Schema& other) { _col_ids = other._col_ids; + _unique_ids = other._unique_ids; _num_key_columns = other._num_key_columns; + _delete_sign_idx = other._delete_sign_idx; + _has_sequence_col = other._has_sequence_col; + _rowid_col_idx = other._rowid_col_idx; + _version_col_idx = other._version_col_idx; + _lsn_col_idx = other._lsn_col_idx; + _tso_col_idx = other._tso_col_idx; + _mem_size = other._mem_size; // Deep copy _cols // TODO(lingbin): really need clone? 
diff --git a/be/src/storage/schema.h b/be/src/storage/schema.h index c3138fb49f1ddc..2eabb12853c8b4 100644 --- a/be/src/storage/schema.h +++ b/be/src/storage/schema.h @@ -31,6 +31,7 @@ #include "exprs/aggregate/aggregate_function.h" #include "io/io_common.h" #include "runtime/thread_context.h" +#include "storage/binlog.h" #include "storage/field.h" #include "storage/olap_common.h" #include "storage/tablet/tablet_schema.h" @@ -75,6 +76,12 @@ class Schema { if (column.name() == VERSION_COL) { _version_col_idx = cid; } + if (column.name() == std::string(kRowBinlogLsnColName)) { + _lsn_col_idx = cid; + } + if (column.name() == std::string(kRowBinlogTimestampColName)) { + _tso_col_idx = cid; + } columns.push_back(std::make_shared(column)); } _delete_sign_idx = tablet_schema->delete_sign_idx(); @@ -102,6 +109,12 @@ class Schema { if (columns[i]->name() == VERSION_COL) { _version_col_idx = i; } + if (columns[i]->name() == std::string(kRowBinlogLsnColName)) { + _lsn_col_idx = i; + } + if (columns[i]->name() == std::string(kRowBinlogTimestampColName)) { + _tso_col_idx = i; + } _unique_ids[i] = columns[i]->unique_id(); } _init(columns, col_ids, num_key_columns); @@ -146,6 +159,8 @@ class Schema { bool has_sequence_col() const { return _has_sequence_col; } int32_t rowid_col_idx() const { return _rowid_col_idx; } int32_t version_col_idx() const { return _version_col_idx; } + int32_t lsn_col_idx() const { return _lsn_col_idx; } + int32_t tso_col_idx() const { return _tso_col_idx; } // Don't use. // TODO: memory size of Schema cannot be accurately tracked. // In some places, temporarily use num_columns() as Schema size. 
@@ -170,6 +185,8 @@ class Schema { bool _has_sequence_col = false; int32_t _rowid_col_idx = -1; int32_t _version_col_idx = -1; + int32_t _lsn_col_idx = -1; + int32_t _tso_col_idx = -1; int64_t _mem_size = 0; }; diff --git a/be/src/storage/segment/historical_row_retriever.cpp b/be/src/storage/segment/historical_row_retriever.cpp new file mode 100644 index 00000000000000..45ed91b281c9e9 --- /dev/null +++ b/be/src/storage/segment/historical_row_retriever.cpp @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/segment/historical_row_retriever.h" + +// IWYU pragma: no_include +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" +#include "common/consts.h" +#include "common/logging.h" // LOG +#include "common/status.h" +#include "core/block/block.h" +#include "core/block/column_with_type_and_name.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type.h" +#include "core/string_ref.h" +#include "runtime/exec_env.h" +#include "service/point_query_executor.h" +#include "storage/binlog.h" +#include "storage/data_dir.h" +#include "storage/iterator/olap_data_convertor.h" +#include "storage/key_coder.h" +#include "storage/rowset/beta_rowset.h" +#include "storage/rowset/rowset.h" +#include "storage/rowset/rowset_reader_context.h" +#include "storage/rowset/rowset_writer_context.h" +#include "storage/segment/segment.h" +#include "storage/storage_engine.h" +#include "storage/tablet/tablet.h" +#include "storage/tablet/tablet_meta.h" +#include "storage/tablet/tablet_schema.h" + +namespace doris { + +namespace segment_v2 { + +using namespace ErrorCode; + +namespace { + +void insert_value_to_nullable_column(IColumn* dst_column, const IColumn& src_column, size_t pos) { + auto* nullable_column = assert_cast(dst_column); + if (src_column.is_nullable()) { + nullable_column->insert_from(src_column, pos); + return; + } + + nullable_column->get_nested_column().insert_from(src_column, pos); + nullable_column->get_null_map_data().push_back(0); +} + +} // namespace + +Status PrimaryKeyModelRowRetriever::init(const HistoricalRowRetrieverContext& context) { + _context = context; + _key_columns.resize(_context.tablet_schema->num_key_columns()); + auto& tablet_schema = _context.tablet_schema; + for (size_t cid = 0; cid < tablet_schema->num_key_columns(); ++cid) { + const auto& column = tablet_schema->column(cid); + 
_key_coders.push_back(get_key_coder(column.type())); + } + // encode the sequence id into the primary key index + if (tablet_schema->has_sequence_col()) { + const auto& column = tablet_schema->column(tablet_schema->sequence_col_idx()); + _seq_coder = const_cast(get_key_coder(column.type())); + } + return Status::OK(); +} + +Status PrimaryKeyModelRowRetriever::retrieve_historical_row(const Int8* delete_sign_column_data, + size_t row_pos, size_t num_rows) { + auto* tablet = static_cast(_context.tablet.get()); + auto& tablet_schema = _context.tablet_schema; + + DCHECK(_context.partial_update_info); + + std::vector specified_rowsets; + { + std::shared_lock rlock(_context.tablet->get_header_lock()); + specified_rowsets = _mow_context->rowset_ptrs; + } + std::vector> segment_caches(specified_rowsets.size()); + + for (size_t block_pos = row_pos; block_pos < row_pos + num_rows; block_pos++) { + // After converting to olap column, [0, num_rows) in the result column is corresponding to + // [row_pos, row_pos + num_rows) in the original block + size_t delta_pos = block_pos - row_pos; + std::string key = _full_encode_keys(_key_columns, delta_pos); + + _maybe_invalid_row_cache(key); + if (_seq_column != nullptr) { + _encode_seq_column(_seq_column, delta_pos, &key); + } + + // mark key with delete sign as deleted. + bool have_delete_sign = + (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0); + + RowLocation loc; + // save rowset shared ptr so this rowset wouldn't delete + RowsetSharedPtr rowset; + auto st = tablet->lookup_row_key(key, tablet->tablet_schema().get(), _seq_column != nullptr, + specified_rowsets, &loc, _mow_context->max_version, + segment_caches, &rowset); + if (st.is()) { + // it's an insert row + _has_default_or_nullable = true; + _use_default_or_null_flag.emplace_back(true); + _operators.emplace_back(have_delete_sign ? 
ROW_BINLOG_DELETE : ROW_BINLOG_APPEND); + continue; + } + if (!st.ok() && !st.is()) { + LOG(WARNING) << "failed to lookup row key, error: " << st; + return st; + } + + CHECK(_context.rowset_writer_ctx != nullptr); + bool write_before = + _context.rowset_writer_ctx->write_binlog_opt().write_binlog_config().write_before; + // 1. if the delete sign is marked, it means that the value columns of the row will not + // be read. So we don't need to read the missing values from the previous rows. + // 2. the one exception is when there are sequence columns in the table, we need to read + // the sequence columns, otherwise it may cause the merge-on-read based compaction + // policy to produce incorrect results. + // 3. if row binlog needs BEFORE image, delete rows must still read historical values so + // __BEFORE__* columns can be populated. + if (have_delete_sign && !tablet_schema->has_sequence_col() && !write_before) { + _has_default_or_nullable = true; + _use_default_or_null_flag.emplace_back(true); + _operators.emplace_back(ROW_BINLOG_DELETE); + } else { + // partial update should not contain invisible columns + _use_default_or_null_flag.emplace_back(false); + _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); + // currently we think row_pos must be zero, so we won't consider row_pos > 0 + DCHECK(row_pos == 0); + _rssid_to_rid.prepare_to_read(loc, delta_pos); + _operators.emplace_back(have_delete_sign ? 
ROW_BINLOG_DELETE : ROW_BINLOG_UPDATE); + } + } + + CHECK_EQ(_use_default_or_null_flag.size(), num_rows); + + return Status::OK(); +} + +Status PrimaryKeyModelRowRetriever::build_after_block(Block* block, size_t row_pos, + size_t num_rows) { + DCHECK_EQ(_use_default_or_null_flag.size(), num_rows); + if (config::is_cloud_mode()) { + return Status::NotSupported("fill_missing_columns"); + } + if (_context.partial_update_info == nullptr) { + return Status::InternalError("partial update info is null"); + } + return _rssid_to_rid.fill_missing_columns( + _context, _rsid_to_rowset, *_context.tablet_schema, *block, _use_default_or_null_flag, + _has_default_or_nullable, cast_set(row_pos), block); +} + +Status PrimaryKeyModelRowRetriever::build_before_block(Block* before_block, + const std::vector& value_cids, + size_t /*row_pos*/, size_t num_rows) { + if (config::is_cloud_mode()) { + // TODO(plat1ko): cloud mode + return Status::NotSupported("fill_before_columns"); + } + + auto& tablet_schema = _context.tablet_schema; + + if (num_rows == 0 || value_cids.empty()) { + return Status::OK(); + } + + // Create block to hold historical values for value columns. + Block old_value_block = tablet_schema->create_block_by_cids(value_cids); + CHECK_EQ(value_cids.size(), old_value_block.columns()); + + // key: logical row index in current batch; value: index in old_value_block + std::map read_index; + RETURN_IF_ERROR(_rssid_to_rid.read_columns_by_plan(*tablet_schema, value_cids, _rsid_to_rowset, + old_value_block, &read_index, false, + nullptr)); + + auto mutable_before_columns = before_block->mutate_columns(); + // Fill each row in before_block. + for (uint32_t idx = 0; idx < num_rows; ++idx) { + auto it = read_index.find(idx); + if (it == read_index.end()) { + // No historical row, fill BEFORE with NULL. 
+ for (size_t i = 0; i < value_cids.size(); ++i) { + auto* nullable_column = + assert_cast(mutable_before_columns[i].get()); + nullable_column->insert_many_defaults(1); + } + continue; + } + + uint32_t pos_in_old_block = it->second; + for (size_t i = 0; i < value_cids.size(); ++i) { + insert_value_to_nullable_column(mutable_before_columns[i].get(), + *old_value_block.get_by_position(i).column, + pos_in_old_block); + } + } + + before_block->set_columns(std::move(mutable_before_columns)); + return Status::OK(); +} + +std::string PrimaryKeyModelRowRetriever::_full_encode_keys( + const std::vector& key_columns, size_t pos, bool null_first) { + return _full_encode_keys(_key_coders, key_columns, pos, null_first); +} + +std::string PrimaryKeyModelRowRetriever::_full_encode_keys( + const std::vector& key_coders, + const std::vector& key_columns, size_t pos, bool null_first) { + assert(key_columns.size() == key_coders.size()); + + std::string encoded_keys; + size_t cid = 0; + for (const auto& column : key_columns) { + auto field = column->get_data_at(pos); + if (UNLIKELY(!field)) { + if (null_first) { + encoded_keys.push_back(KeyConsts::KEY_NULL_FIRST_MARKER); + } else { + encoded_keys.push_back(KeyConsts::KEY_NORMAL_MARKER); + } + ++cid; + continue; + } + encoded_keys.push_back(KeyConsts::KEY_NORMAL_MARKER); + DCHECK(key_coders[cid] != nullptr); + key_coders[cid]->full_encode_ascending(field, &encoded_keys); + ++cid; + } + return encoded_keys; +} + +void PrimaryKeyModelRowRetriever::_encode_seq_column(const IOlapColumnDataAccessor* seq_column, + size_t pos, std::string* encoded_keys) { + auto field = seq_column->get_data_at(pos); + // To facilitate the use of the primary key index, encode the seq column + // to the minimum value of the corresponding length when the seq column + // is null + if (UNLIKELY(!field)) { + auto& tablet_schema = _context.tablet_schema; + encoded_keys->push_back(KeyConsts::KEY_NULL_FIRST_MARKER); + size_t seq_col_length = 
tablet_schema->column(tablet_schema->sequence_col_idx()).length(); + encoded_keys->append(seq_col_length, KeyConsts::KEY_MINIMAL_MARKER); + return; + } + encoded_keys->push_back(KeyConsts::KEY_NORMAL_MARKER); + _seq_coder->full_encode_ascending(field, encoded_keys); +} + +void PrimaryKeyModelRowRetriever::_maybe_invalid_row_cache(const std::string& key) { + // Just invalid row cache for simplicity, since the rowset is not visible at present. + // If we update/insert cache, if load failed rowset will not be visible but cached data + // will be visible, and lead to inconsistency. + if (!config::disable_storage_row_cache && + _context.tablet_schema->has_row_store_for_all_columns() && + _context.write_type == DataWriteType::TYPE_DIRECT) { + // invalidate cache + RowCache::instance()->erase({static_cast(_context.tablet->tablet_id()), key}); + } +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/storage/segment/historical_row_retriever.h b/be/src/storage/segment/historical_row_retriever.h new file mode 100644 index 00000000000000..5949896a4765a1 --- /dev/null +++ b/be/src/storage/segment/historical_row_retriever.h @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "common/status.h" +#include "core/block/block.h" +#include "storage/olap_define.h" +#include "storage/olap_utils.h" +#include "storage/partial_update_info.h" +#include "storage/tablet/tablet_fwd.h" + +namespace doris { +struct RowsetWriterContext; +class KeyCoder; +struct MowContext; + +namespace segment_v2 { + +struct HistoricalRowRetrieverContext { + BaseTabletSPtr tablet; + TabletSchemaSPtr tablet_schema; + RowsetWriterContext* rowset_writer_ctx = nullptr; + std::shared_ptr partial_update_info; + bool is_transient_rowset_writer = false; + DataWriteType write_type = DataWriteType::TYPE_DEFAULT; +}; + +class HistoricalRowRetriever { +public: + HistoricalRowRetriever() = default; + virtual ~HistoricalRowRetriever() = default; + + virtual Status init(const HistoricalRowRetrieverContext& rowset_writer_context) = 0; + + virtual Status retrieve_historical_row(const Int8* delete_sign_column_data, size_t row_pos, + size_t num_rows) = 0; + + virtual Status build_after_block(Block* block, size_t row_pos, size_t num_rows) = 0; + virtual Status build_before_block(Block* before_block, const std::vector& value_cids, + size_t row_pos, size_t num_rows) = 0; + virtual void clear() = 0; + + virtual std::vector& get_operators() = 0; + +protected: + HistoricalRowRetrieverContext _context; +}; + +class PrimaryKeyModelRowRetriever : public HistoricalRowRetriever { +public: + Status init(const HistoricalRowRetrieverContext& context) override; + + Status prepare_lookup_plan_from_source_columns( + const std::vector& key_columns, + const IOlapColumnDataAccessor* seq_column, std::shared_ptr mow_context) { + _key_columns = key_columns; + _seq_column = seq_column; + _mow_context = mow_context; + return Status::OK(); + } + + Status retrieve_historical_row(const Int8* delete_sign_column_data, size_t row_pos, + size_t num_rows) override; + + Status build_after_block(Block* block, size_t row_pos, size_t num_rows) override; + + Status build_before_block(Block* 
before_block, const std::vector& value_cids, + size_t /*row_pos*/, size_t num_rows) override; + + void clear() override { + _key_columns.clear(); + _seq_column = nullptr; + _use_default_or_null_flag.clear(); + _has_default_or_nullable = false; + _rssid_to_rid.clear(); + _rsid_to_rowset.clear(); + _operators.clear(); + } + + std::vector& get_operators() override { return _operators; }; + +private: + void _maybe_invalid_row_cache(const std::string& key); + + // used for unique-key with merge on write and segment min_max key + std::string _full_encode_keys(const std::vector& key_columns, + size_t pos, bool null_first = true); + + std::string _full_encode_keys(const std::vector& key_coders, + const std::vector& key_columns, + size_t pos, bool null_first = true); + + // used for unique-key with merge on write + void _encode_seq_column(const IOlapColumnDataAccessor* seq_column, size_t pos, + std::string* encoded_keys); + + // get key_columns, seq column, delete data from source block, prepare for searching historical data + std::vector _key_columns; + const IOlapColumnDataAccessor* _seq_column = nullptr; + std::shared_ptr _mow_context; + // used for building primary key index during vectorized write. 
+ // for mow table with cluster keys, this is cluster keys + std::vector _key_coders; + KeyCoder* _seq_coder = nullptr; + + // group every rowset-segment row id to speed up reader + FixedReadPlan _rssid_to_rid; + std::map _rsid_to_rowset; + + // cache flags for filling missing columns + std::vector _use_default_or_null_flag; + bool _has_default_or_nullable = false; + + // cache operator for fill_binlog_columns + std::vector _operators; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/storage/segment/row_binlog_segment_writer.cpp b/be/src/storage/segment/row_binlog_segment_writer.cpp new file mode 100644 index 00000000000000..98c123f3aec279 --- /dev/null +++ b/be/src/storage/segment/row_binlog_segment_writer.cpp @@ -0,0 +1,522 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/segment/row_binlog_segment_writer.h" + +#include "cloud/config.h" +#include "common/cast_set.h" +#include "core/block/column_with_type_and_name.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "storage/binlog.h" +#include "storage/iterator/olap_data_convertor.h" +#include "storage/olap_utils.h" +#include "storage/rowset/rowset_writer_context.h" // RowsetWriterContext + +namespace doris { +namespace segment_v2 { + +RowBinlogSourceDataWriter::RowBinlogSourceDataWriter(const SegmentWriteBinlogOptions& opt) + : _opt(opt) {} + +RowBinlogSegmentWriter::RowBinlogSegmentWriter( + io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, + BaseTabletSPtr tablet, DataDir* data_dir, const SegmentWriterOptions& opts, + const segment_v2::SegmentWriteBinlogOptions& row_binlog_opts) + : SegmentWriter(file_writer, segment_id, tablet_schema, tablet, data_dir, opts, nullptr), + _binlog_opts(row_binlog_opts) { + if (_opts.write_type == DataWriteType::TYPE_DIRECT) { + _source_data_writer = std::make_unique(row_binlog_opts); + _lsn_ids = const_cast(row_binlog_opts).get_seg_lsn(_segment_id); + const_cast(row_binlog_opts).remove_seg(segment_id); + } +} + +RowBinlogSourceDataWriter::~RowBinlogSourceDataWriter() { + this->clear(); +} + +Status RowBinlogSegmentWriter::init() { + RETURN_IF_ERROR(SegmentWriter::init()); + + if (_opts.write_type != DataWriteType::TYPE_DIRECT) { + return Status::OK(); + } + + RETURN_IF_ERROR(_source_data_writer->init()); + _write_before = _source_data_writer->need_before(); + + const TabletSchemaSPtr& source_schema = _binlog_opts.source.tablet_schema; + if (UNLIKELY(source_schema == nullptr)) { + return Status::InternalError("binlog writer missing source_tablet_schema"); + } + + int lsn_col_id = _tablet_schema->field_index(std::string(kRowBinlogLsnColName)); + CHECK(lsn_col_id >= 0) << "binlog schema missing __DORIS_BINLOG_LSN__"; + _binlog_col_start_id = 
static_cast(lsn_col_id); + _normal_col_start_id = lsn_col_id == 0 ? BINLOG_COLNUM : 0; + + uint32_t normal_col_num = cast_set(source_schema->num_visible_columns()); + _before_col_start_id = _normal_col_start_id + normal_col_num; + + if (!_write_before && _tablet_schema->num_columns() > normal_col_num + BINLOG_COLNUM) { + // Compatibility path + _fill_empty_before_value = true; + _write_before = true; + } + + HistoricalRowRetrieverContext historical_row_retriever_context = { + .tablet = _tablet, + .tablet_schema = source_schema, + .rowset_writer_ctx = _opts.rowset_ctx, + .partial_update_info = _binlog_opts.source.partial_update_info, + .is_transient_rowset_writer = _binlog_opts.source.is_transient_rowset_writer, + .write_type = _binlog_opts.source.source_write_type}; + if (_tablet->enable_unique_key_merge_on_write()) { + _historical_data_writer = std::make_unique(); + RETURN_IF_ERROR(_historical_data_writer->init(historical_row_retriever_context)); + } else if (_tablet->keys_type() == KeysType::AGG_KEYS) { + // todo + } + return Status::OK(); +} + +Status RowBinlogSegmentWriter::append_block(const Block* block, size_t row_pos, size_t num_rows) { + if (config::is_cloud_mode()) { + // TODO(cjh): cloud mode + return Status::NotSupported("append binlog"); + } + + if (_opts.write_type != DataWriteType::TYPE_DIRECT) { + // append block directly because binlog data is completed + RETURN_IF_ERROR(_append_direct_block(block, row_pos, num_rows)); + return Status::OK(); + } + + const TabletSchemaSPtr& source_schema = _binlog_opts.source.tablet_schema; + if (UNLIKELY(source_schema == nullptr)) { + return Status::InternalError("binlog writer missing source_tablet_schema"); + } + + bool is_partial_update = _binlog_opts.source.partial_update_info && + _binlog_opts.source.partial_update_info->is_partial_update() && + _binlog_opts.source.source_write_type == DataWriteType::TYPE_DIRECT && + !_binlog_opts.source.is_transient_rowset_writer; + std::vector partial_cids = + 
is_partial_update ? _binlog_opts.source.partial_update_info->update_cids + : std::vector(); + if (is_partial_update) { + if (block->columns() <= source_schema->num_key_columns() || + block->columns() >= source_schema->num_columns()) { + return Status::InternalError(fmt::format( + "illegal partial update block columns: {}, num key columns: {}, total " + "schema columns: {}", + block->columns(), source_schema->num_key_columns(), + source_schema->num_columns())); + } + + // binlog doesn't need invisible columns + auto erase_invisible_col = std::remove_if( + partial_cids.begin(), partial_cids.end(), + [&](uint32_t cid) { return cid >= source_schema->num_visible_columns(); }); + partial_cids.erase(erase_invisible_col, partial_cids.end()); + } + + // get delete_sign_column from source block if present + const Int8* delete_sign_column_data = nullptr; + int32_t delete_sign_column_id = source_schema->delete_sign_idx(); + int32_t seq_col_id = source_schema->sequence_col_idx(); + if (is_partial_update) { + delete_sign_column_id = -1; + seq_col_id = -1; + int32_t pos = 0; + for (auto& cid : _binlog_opts.source.partial_update_info->update_cids) { + if (cid == source_schema->delete_sign_idx()) { + delete_sign_column_id = pos; + } else if (cid == source_schema->sequence_col_idx()) { + seq_col_id = pos; + } + pos++; + } + } + if (delete_sign_column_id != -1) { + const ColumnWithTypeAndName& delete_sign_column = + block->get_by_position(delete_sign_column_id); + + auto& delete_sign_col = reinterpret_cast(*(delete_sign_column.column)); + if (delete_sign_col.size() >= row_pos + num_rows) { + delete_sign_column_data = delete_sign_col.get_data().data(); + } + } + + // use full_block to hold the entire row data + Block full_block = source_schema->create_block(); + + RETURN_IF_ERROR(_source_data_writer->prepare_by_source_block(block, row_pos, num_rows, + partial_cids, &full_block)); + + if (seq_col_id != -1) { + 
RETURN_IF_ERROR(_source_data_writer->prepare_seq_column(block->get_by_position(seq_col_id), + source_schema->sequence_col_idx(), + row_pos, num_rows)); + } + + size_t max_normal_col_id = _normal_col_start_id + source_schema->num_visible_columns(); + RETURN_IF_ERROR(_source_data_writer->fill_normal_columns(_column_writers, _normal_col_start_id, + max_normal_col_id, partial_cids)); + + // We read historical rows only when we really need them: + // 1. partial update: build the full AFTER row. + // 2. write_before: fill __BEFORE__* columns. + // Otherwise we do not compare with old rows here, so row binlog op only + // keeps the simple meaning: append for non-delete rows, delete for delete rows. + if (is_partial_update || _write_before) { + auto* pk_retriever = + dynamic_cast(_historical_data_writer.get()); + DCHECK(pk_retriever != nullptr); + // DCHECK is compiled out in release builds, and init() only creates the retriever for + // merge-on-write tablets, so fail the write instead of dereferencing a null pointer. + if (UNLIKELY(pk_retriever == nullptr)) { + return Status::InternalError( + "row binlog writer has no primary key row retriever for historical lookup"); + } + RETURN_IF_ERROR(pk_retriever->prepare_lookup_plan_from_source_columns( + _source_data_writer->source_key_columns(), _source_data_writer->seq_column(), + _binlog_opts.source.mow_context)); + RETURN_IF_ERROR(_historical_data_writer->retrieve_historical_row(delete_sign_column_data, + row_pos, num_rows)); + } + + if (is_partial_update) { + std::vector row_binlog_missing_column_ids; + _source_data_writer->filter_source_ids( + _binlog_opts.source.partial_update_info->missing_cids, + row_binlog_missing_column_ids); + + // build AFTER block (fill missing columns in full_block) + RETURN_IF_ERROR(_historical_data_writer->build_after_block(&full_block, row_pos, num_rows)); + + // write AFTER missing columns from full_block to segment + auto& after_convertor = _source_data_writer->olap_data_convertor(); + RETURN_IF_ERROR(after_convertor->set_source_content_with_specifid_columns( + &full_block, row_pos, num_rows, row_binlog_missing_column_ids)); + for (auto cid : row_binlog_missing_column_ids) { + auto converted_cid = _normal_col_start_id + cid; + auto converted_result = after_convertor->convert_column_data(cid); + if 
(!converted_result.first.ok()) { + return converted_result.first; + } + RETURN_IF_ERROR(_column_writers[converted_cid]->append( + converted_result.second->get_nullmap(), converted_result.second->get_data(), + num_rows)); + } + } + + // get key column, we use them to construct key index and search historical data. + DCHECK(!_tablet_schema->has_sequence_col()); + // _converted_key_columns must be resized before fill binlog columns + _converted_key_columns.resize(_tablet_schema->num_key_columns()); + for (size_t i = _normal_col_start_id; i < _tablet_schema->num_key_columns(); i++) { + _converted_key_columns[i] = _source_data_writer->get_converted_column( + cast_set(i - _normal_col_start_id)); + } + + std::vector no_operators = std::vector {}; + std::vector& operators = + _historical_data_writer ? _historical_data_writer->get_operators() : no_operators; + if (operators.empty()) { + // haven't search historical row, only delete or append + for (size_t block_pos = row_pos; block_pos < row_pos + num_rows; block_pos++) { + bool have_delete_sign = + (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0); + if (have_delete_sign) { + operators.emplace_back(ROW_BINLOG_DELETE); + } else { + operators.emplace_back(ROW_BINLOG_APPEND); + } + } + } + + RETURN_IF_ERROR(_fill_binlog_columns(num_rows, operators)); + + // row-binlog key don't need seq column + RETURN_IF_ERROR(build_key_index(_converted_key_columns, nullptr, num_rows)); + + if (_write_before) { + RETURN_IF_ERROR(_fill_before_columns(num_rows)); + } + + _num_rows_written += num_rows; + // need to clean olap_data_convertor that be used when fill binlog columns and build key index + _olap_data_convertor->clear_source_content(); + _source_data_writer->clear(); + if (_historical_data_writer) { + _historical_data_writer->clear(); + } + return Status::OK(); +} + +Status RowBinlogSegmentWriter::_append_direct_block(const Block* block, size_t row_pos, + size_t num_rows) { + 
_olap_data_convertor->set_source_content(block, row_pos, num_rows); + + // convert column data from engine format to storage layer format + std::vector key_columns; + for (size_t id = 0; id < _column_writers.size(); ++id) { + // olap data convertor always starts from id = 0 + auto converted_result = _olap_data_convertor->convert_column_data(id); + if (!converted_result.first.ok()) { + return converted_result.first; + } + auto cid = _column_ids[id]; + if (_has_key && cid < _tablet_schema->num_key_columns()) { + key_columns.push_back(converted_result.second); + } + RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(), + converted_result.second->get_data(), num_rows)); + } + + RETURN_IF_ERROR(build_key_index(key_columns, nullptr, num_rows)); + + _num_rows_written += num_rows; + _olap_data_convertor->clear_source_content(); + + return Status::OK(); +} + +Status RowBinlogSegmentWriter::_fill_binlog_columns(size_t num_rows, + const std::vector& op_types) { + std::vector binlog_cids = {_binlog_col_start_id, _binlog_col_start_id + 1, + _binlog_col_start_id + 2}; + Block binlog_prefix_block = _tablet_schema->create_block_by_cids(binlog_cids); + MutableColumns binlog_prefix_columns = binlog_prefix_block.mutate_columns(); + // we can't get correct lsn number before commit, because we can't get the version before commit, + // but we can fill auto-inc lsn to ensure the order first, then fill version when read single rowset. 
+ IColumn* lsn_col_ptr = binlog_prefix_columns[0].get(); + CHECK(_lsn_ids->size() >= num_rows) << _lsn_ids->size() << " vs " << num_rows; + for (int i = 0; i < num_rows; i++) { + assert_cast(lsn_col_ptr) + ->insert_value(static_cast(_lsn_ids->at(i))); + } + + // wrong op only happens when partial-update, it will be fixed by delete bitmap when publish + const FieldType op_col_type = _tablet_schema->column(binlog_cids[1]).type(); + IColumn* op_col_ptr = binlog_prefix_columns[1].get(); + auto* op_nullable_column = typeid_cast(op_col_ptr); + IColumn* op_nested_column = + op_nullable_column != nullptr ? &op_nullable_column->get_nested_column() : op_col_ptr; + + CHECK(op_types.size() >= num_rows) << op_types.size() << " vs " << num_rows; + CHECK(op_col_type == FieldType::OLAP_FIELD_TYPE_BIGINT) + << "row binlog op column type must be BIGINT, actual=" << static_cast(op_col_type); + auto* op_int64_column = assert_cast(op_nested_column); + for (int i = 0; i < num_rows; i++) { + op_int64_column->insert_value(op_types[i]); + } + + // we can't get correct timestamp when commit + IColumn* ts_col_ptr = binlog_prefix_columns[2].get(); + auto timestamp = UnixMillis(); + auto* ts_nullable_column = typeid_cast(ts_col_ptr); + if (ts_nullable_column != nullptr) { + assert_cast(&ts_nullable_column->get_nested_column()) + ->insert_many_vals(timestamp, num_rows); + } else { + assert_cast(ts_col_ptr)->insert_many_vals(timestamp, num_rows); + } + + // finally update null map + for (int i = 0; i < num_rows; i++) { + //lsn_column->get_null_map_data().emplace_back(0); + if (op_nullable_column != nullptr) { + op_nullable_column->get_null_map_data().emplace_back(0); + } + if (ts_nullable_column != nullptr) { + ts_nullable_column->get_null_map_data().emplace_back(0); + } + } + + // LOG(INFO) << binlog_prefix_block.dump_data(0, num_rows); + + size_t col_pos_in_block = 0; + for (auto& cid : binlog_cids) { + // convert to olap data + 
RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + binlog_prefix_block.get_by_position(col_pos_in_block++), 0, num_rows, cid)); + auto converted_result = _olap_data_convertor->convert_column_data(cid); + if (!converted_result.first.ok()) { + return converted_result.first; + } + if (cid < _tablet_schema->num_key_columns()) { + _converted_key_columns[cid] = converted_result.second; + } + RETURN_IF_ERROR(_column_writers[cid]->append(converted_result.second->get_nullmap(), + converted_result.second->get_data(), + num_rows)); + } + + return Status::OK(); +} + +Status RowBinlogSegmentWriter::_fill_before_columns(size_t num_rows) { + const TabletSchemaSPtr& source_schema = _binlog_opts.source.tablet_schema; + if (UNLIKELY(source_schema == nullptr)) { + return Status::InternalError("row binlog writer missing source_tablet_schema"); + } + size_t value_column_num = source_schema->num_visible_value_columns(); + if (value_column_num == 0) { + // No BEFORE columns in row binlog schema. + return Status::OK(); + } + + uint32_t before_start_id = _before_col_start_id; + uint32_t before_end_id = _before_col_start_id + cast_set(value_column_num); + + std::vector before_cids; + for (uint32_t cid = before_start_id; cid < before_end_id; cid++) { + before_cids.emplace_back(cid); + } + + Block before_block = _tablet_schema->create_block_by_cids(before_cids); + + // Compatibility path: only fill empty BEFORE values. 
+ if (_fill_empty_before_value) { + MutableColumns before_mutable_columns = before_block.mutate_columns(); + for (auto& before_mutable_column : before_mutable_columns) { + auto* before_nullable_column = + reinterpret_cast(before_mutable_column.get()); + before_nullable_column->insert_many_defaults(num_rows); + } + before_block.set_columns(std::move(before_mutable_columns)); + } else { + DCHECK(_historical_data_writer != nullptr); + + std::vector value_cids; + uint32_t value_start = cast_set(source_schema->num_key_columns()); + uint32_t value_end = cast_set(source_schema->num_visible_columns()); + for (uint32_t cid = value_start; cid < value_end; ++cid) { + value_cids.emplace_back(cid); + } + + DCHECK_EQ(before_cids.size(), value_cids.size()); + RETURN_IF_ERROR(_historical_data_writer->build_before_block(&before_block, value_cids, 0, + num_rows)); + } + + size_t col_pos_in_block = 0; + for (auto& cid : before_cids) { + RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + before_block.get_by_position(col_pos_in_block++), 0, num_rows, cid)); + auto converted_result = _olap_data_convertor->convert_column_data(cid); + if (!converted_result.first.ok()) { + return converted_result.first; + } + RETURN_IF_ERROR(_column_writers[cid]->append(converted_result.second->get_nullmap(), + converted_result.second->get_data(), + num_rows)); + } + + return Status::OK(); +} + +Status RowBinlogSourceDataWriter::init() { + _olap_data_convertor = std::make_unique(); + // _normal_column_ids: the columns which we need to write into binlog from source block + if (UNLIKELY(_opt.source.tablet_schema == nullptr)) { + return Status::InternalError("row binlog writer missing source_tablet_schema"); + } + for (uint32_t i = 0; i < _opt.source.tablet_schema->num_visible_columns(); i++) { + _normal_column_ids.emplace_back(i); + } + _olap_data_convertor->reserve(_opt.source.tablet_schema->num_columns()); + for (size_t cid = 0; cid < _opt.source.tablet_schema->num_columns(); 
cid++) { + _olap_data_convertor->add_column_data_convertor(_opt.source.tablet_schema->column(cid)); + } + return Status::OK(); +} + +Status RowBinlogSourceDataWriter::prepare_by_source_block( + const Block* block, size_t row_pos, size_t num_rows, + std::vector& partial_source_cids, Block* full_block) { + _converted_columns.resize(_normal_column_ids.size()); + + // LOG(INFO) << block->dump_data(0, num_rows); + + // convert column data from engine format to storage layer format + size_t col_pos_in_block = 0; + TabletSchemaSPtr tablet_schema = _opt.source.tablet_schema; + const auto& including_cids = + partial_source_cids.empty() ? _normal_column_ids : partial_source_cids; + for (auto& cid : including_cids) { + const ColumnWithTypeAndName& col = block->get_by_position(col_pos_in_block++); + + RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + col, row_pos, num_rows, cid)); + // olap data convertor always starts from id = 0 + auto converted_result = _olap_data_convertor->convert_column_data(cid); + if (!converted_result.first.ok()) { + return converted_result.first; + } + _converted_columns[cid] = converted_result.second; + + if (cid < tablet_schema->num_key_columns()) { + _key_columns.push_back(converted_result.second); + } + // full_block is optional (the declaration defaults it to nullptr) + if (full_block != nullptr) { + full_block->replace_by_position(cid, col.column); + } + } + _num_rows = num_rows; + + return Status::OK(); +} + +Status RowBinlogSourceDataWriter::prepare_seq_column(const ColumnWithTypeAndName& col, + int32_t seq_col_id_in_schema, size_t row_pos, + size_t num_rows) { + RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + col, row_pos, num_rows, seq_col_id_in_schema)); + auto converted_result = _olap_data_convertor->convert_column_data(seq_col_id_in_schema); + if (!converted_result.first.ok()) { + return converted_result.first; + } + _seq_column = converted_result.second; + return Status::OK(); +} + +Status RowBinlogSourceDataWriter::fill_normal_columns( + std::vector>& column_writers, size_t 
start, size_t end, + std::vector& partial_source_cids) { + DCHECK_EQ(end - start, _normal_column_ids.size()); + + const auto& including_cids = + partial_source_cids.empty() ? _normal_column_ids : partial_source_cids; + for (size_t cid : including_cids) { + DCHECK(column_writers[start + cid]->get_field()->type() == + _opt.source.tablet_schema->columns()[cid]->type()) + << cid; + RETURN_IF_ERROR(column_writers[start + cid]->append(_converted_columns[cid]->get_nullmap(), + _converted_columns[cid]->get_data(), + _num_rows)); + } + + return Status::OK(); +} + +void RowBinlogSourceDataWriter::clear() { + if (_olap_data_convertor) { + _olap_data_convertor->clear_source_content(); + } + _num_rows = 0; + _key_columns.clear(); + _seq_column = nullptr; +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/storage/segment/row_binlog_segment_writer.h b/be/src/storage/segment/row_binlog_segment_writer.h new file mode 100644 index 00000000000000..bbd5e18cb1320c --- /dev/null +++ b/be/src/storage/segment/row_binlog_segment_writer.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "storage/binlog.h" +#include "storage/segment/historical_row_retriever.h" +#include "storage/segment/segment_writer.h" +namespace doris { + +namespace segment_v2 { +#define BINLOG_COLNUM 3 + +class RowBinlogSourceDataWriter { +public: + explicit RowBinlogSourceDataWriter(const SegmentWriteBinlogOptions& opt); + + ~RowBinlogSourceDataWriter(); + + Status init(); + + Status prepare_by_source_block(const Block* block, size_t row_pos, size_t num_rows, + std::vector& partial_source_cids, + Block* full_block = nullptr); + + Status prepare_seq_column(const ColumnWithTypeAndName& col, int32_t seq_col_id_in_schema, + size_t row_pos, size_t num_rows); + + Status fill_normal_columns(std::vector>& column_writers, + size_t start, size_t end, + std::vector& partial_source_cids); + + void clear(); + + IOlapColumnDataAccessor* get_converted_column(uint32_t cid) { return _converted_columns[cid]; } + + bool need_before() const { return _opt.write_before; } + + const std::vector& source_key_columns() const { return _key_columns; } + const IOlapColumnDataAccessor* seq_column() const { return _seq_column; } + + std::unique_ptr& olap_data_convertor() { return _olap_data_convertor; } + + void filter_source_ids(std::vector& full_cids, std::vector& res_cids) { + res_cids.reserve(full_cids.size()); + std::set_intersection(_normal_column_ids.begin(), _normal_column_ids.end(), + full_cids.begin(), full_cids.end(), std::back_inserter(res_cids)); + } + +private: + const SegmentWriteBinlogOptions& _opt; + std::unique_ptr _olap_data_convertor; + std::vector _normal_column_ids; + std::vector _converted_columns; + size_t _num_rows = 0; + + std::vector _key_columns; + IOlapColumnDataAccessor* _seq_column = nullptr; +}; + +class RowBinlogSegmentWriter : public SegmentWriter { +public: + explicit RowBinlogSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, + TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, + DataDir* data_dir, const SegmentWriterOptions& opts, + const 
segment_v2::SegmentWriteBinlogOptions& row_binlog_opts); + + ~RowBinlogSegmentWriter() override = default; + + Status init() override; + + Status append_block(const Block* block, size_t row_pos, size_t num_rows) override; + + // append block directly for binlog compaction + Status _append_direct_block(const Block* block, size_t row_pos, size_t num_rows); + + Status _fill_binlog_columns(size_t num_rows, const std::vector& op_types); + + Status _fill_before_columns(size_t num_rows); + +private: + bool _write_before = false; + bool _fill_empty_before_value = false; + + uint32_t _normal_col_start_id = 0; + uint32_t _before_col_start_id = 0; + uint32_t _binlog_col_start_id = 0; + + std::unique_ptr _source_data_writer; + std::unique_ptr _historical_data_writer; + + const SegmentWriteBinlogOptions& _binlog_opts; + + std::vector _converted_key_columns; + std::shared_ptr> _lsn_ids; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/storage/segment/segment_iterator.cpp b/be/src/storage/segment/segment_iterator.cpp index 47992c592da387..da4e78b4d0786b 100644 --- a/be/src/storage/segment/segment_iterator.cpp +++ b/be/src/storage/segment/segment_iterator.cpp @@ -51,6 +51,7 @@ #include "core/column/column_string.h" #include "core/column/column_variant.h" #include "core/column/column_vector.h" +#include "core/column/predicate_column.h" #include "core/data_type/data_type.h" #include "core/data_type/data_type_factory.hpp" #include "core/data_type/data_type_number.h" @@ -72,6 +73,7 @@ #include "runtime/runtime_predicate.h" #include "runtime/runtime_state.h" #include "runtime/thread_context.h" +#include "storage/binlog.h" #include "storage/compaction/collection_similarity.h" #include "storage/field.h" #include "storage/id_manager.h" @@ -2388,6 +2390,119 @@ void SegmentIterator::_replace_version_col_if_needed(const std::vector VLOG_DEBUG << "replaced version column in segment iterator, version_col_idx:" << version_idx; } +void 
SegmentIterator::_update_lsn_col_if_needed(const std::vector& column_ids, + size_t num_rows) { + // | commit tso(64) | auto-inc row_id(64) | + if (_opts.version.first != _opts.version.second) { + return; + } + + if (_opts.io_ctx.reader_type != ReaderType::READER_BINLOG && + _opts.io_ctx.reader_type != ReaderType::READER_BINLOG_COMPACTION) { + return; + } + + int32_t lsn_col_idx = _schema->lsn_col_idx(); + if (lsn_col_idx < 0 || std::ranges::find(column_ids, lsn_col_idx) == column_ids.end()) { + return; + } + + DCHECK_EQ(_opts.commit_tso.start_tso(), _opts.commit_tso.end_tso()); + const Int64 commit_tso = _opts.commit_tso.end_tso() == -1 ? 0 : _opts.commit_tso.end_tso(); + + if (_is_pred_column[lsn_col_idx]) { + auto* lsn_column = assert_cast*>( + _current_return_columns[lsn_col_idx].get()); + std::vector binlog_lsns; + binlog_lsns.reserve(num_rows); + for (size_t j = 0; j < num_rows; j++) { + const Int128 row_id = lsn_column->get_data()[j]; + binlog_lsns.emplace_back(make_row_binlog_lsn(commit_tso, row_id)); + } + _current_return_columns[lsn_col_idx]->clear(); + for (const auto& binlog_lsn : binlog_lsns) { + lsn_column->insert_data(reinterpret_cast(&binlog_lsn), 0); + } + return; + } + + auto* lsn_column = assert_cast(_current_return_columns[lsn_col_idx].get()); + const auto* column_desc = _schema->column(lsn_col_idx); + auto column = Schema::get_data_type_ptr(*column_desc)->create_column(); + DCHECK(column_desc->type() == FieldType::OLAP_FIELD_TYPE_LARGEINT); + auto* col_ptr = assert_cast(column.get()); + + for (size_t j = 0; j < num_rows; j++) { + const Int128 row_id = lsn_column->get_element(j); + col_ptr->insert_value(make_row_binlog_lsn(commit_tso, row_id)); + } + _current_return_columns[lsn_col_idx] = std::move(column); +} + +void SegmentIterator::_update_tso_col_if_needed(const std::vector& column_ids, + size_t num_rows) { + // use physical time part of commit tso to replace timestamp col + if (_opts.version.first != _opts.version.second) { + return; + } + + 
if (_opts.io_ctx.reader_type != ReaderType::READER_BINLOG && + _opts.io_ctx.reader_type != ReaderType::READER_BINLOG_COMPACTION) { + return; + } + + int32_t tso_col_idx = _schema->tso_col_idx(); + if (tso_col_idx < 0 || std::ranges::find(column_ids, tso_col_idx) == column_ids.end()) { + return; + } + + DCHECK_EQ(_opts.commit_tso.start_tso(), _opts.commit_tso.end_tso()); + Int64 commit_tso = _opts.commit_tso.end_tso() == -1 ? 0 : _opts.commit_tso.end_tso(); + Int64 commit_time = extract_tso_physical_time(commit_tso); + + if (_is_pred_column[tso_col_idx]) { + // Nullable predicate column is represented as ColumnNullable(predicate_col) + if (auto* tso_nullable = + typeid_cast(_current_return_columns[tso_col_idx].get())) { + _current_return_columns[tso_col_idx]->clear(); + auto value = commit_time; + for (size_t j = 0; j < num_rows; j++) { + tso_nullable->get_nested_column_ptr()->insert_data( + reinterpret_cast(&value), 0); + tso_nullable->get_null_map_data().emplace_back(0); + } + return; + } + + auto* tso_column = assert_cast*>( + _current_return_columns[tso_col_idx].get()); + _current_return_columns[tso_col_idx]->clear(); + auto value = commit_time; + for (size_t j = 0; j < num_rows; j++) { + tso_column->insert_data(reinterpret_cast(&value), 0); + } + return; + } + + const auto* column_desc = _schema->column(tso_col_idx); + auto column = Schema::get_data_type_ptr(*column_desc)->create_column(); + DCHECK(column_desc->type() == FieldType::OLAP_FIELD_TYPE_BIGINT); + + if (auto* tso_nullable = typeid_cast(column.get())) { + auto* col_ptr = assert_cast(&tso_nullable->get_nested_column()); + for (size_t j = 0; j < num_rows; j++) { + col_ptr->insert_value(commit_time); + tso_nullable->get_null_map_data().emplace_back(0); + } + } else { + auto* col_ptr = assert_cast(column.get()); + for (size_t j = 0; j < num_rows; j++) { + col_ptr->insert_value(commit_time); + } + } + _current_return_columns[tso_col_idx] = std::move(column); +} + uint16_t 
SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size) { SCOPED_RAW_TIMER(&_opts.stats->vec_cond_ns); @@ -2703,6 +2818,8 @@ Status SegmentIterator::_next_batch_internal(Block* block) { _selected_size = 0; RETURN_IF_ERROR(_read_columns_by_index(nrows_read_limit, _selected_size)); _replace_version_col_if_needed(_predicate_column_ids, _selected_size); + _update_lsn_col_if_needed(_predicate_column_ids, _selected_size); + _update_tso_col_if_needed(_predicate_column_ids, _selected_size); _opts.stats->blocks_load += 1; _opts.stats->raw_rows_read += _selected_size; @@ -2746,6 +2863,8 @@ Status SegmentIterator::_next_batch_internal(Block* block) { _common_expr_column_ids, _block_rowids, _sel_rowid_idx.data(), _selected_size, &_current_return_columns)); _replace_version_col_if_needed(_common_expr_column_ids, _selected_size); + _update_lsn_col_if_needed(_common_expr_column_ids, _selected_size); + _update_tso_col_if_needed(_common_expr_column_ids, _selected_size); RETURN_IF_ERROR(_process_columns(_common_expr_column_ids, block)); } @@ -2777,6 +2896,8 @@ Status SegmentIterator::_next_batch_internal(Block* block) { _selected_size, &_current_return_columns, _opts.condition_cache_digest && !_find_condition_cache)); _replace_version_col_if_needed(_non_predicate_columns, _selected_size); + _update_lsn_col_if_needed(_non_predicate_columns, _selected_size); + _update_tso_col_if_needed(_non_predicate_columns, _selected_size); } else { if (_opts.condition_cache_digest && !_find_condition_cache) { auto& condition_cache = *_condition_cache; diff --git a/be/src/storage/segment/segment_iterator.h b/be/src/storage/segment/segment_iterator.h index 3852cf8743d25b..5ff3c55d3baeee 100644 --- a/be/src/storage/segment/segment_iterator.h +++ b/be/src/storage/segment/segment_iterator.h @@ -221,6 +221,8 @@ class SegmentIterator : public RowwiseIterator { MutableColumns& column_block, size_t nrows); [[nodiscard]] Status _read_columns_by_index(uint32_t 
nrows_read_limit, uint16_t& nrows_read); void _replace_version_col_if_needed(const std::vector& column_ids, size_t num_rows); + void _update_lsn_col_if_needed(const std::vector& column_ids, size_t num_rows); + void _update_tso_col_if_needed(const std::vector& column_ids, size_t num_rows); Status _init_current_block(Block* block, std::vector& non_pred_vector, uint32_t nrows_read_limit); uint16_t _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); diff --git a/be/src/storage/segment/segment_writer.cpp b/be/src/storage/segment/segment_writer.cpp index ac6841a6cf2651..02e18dbd025401 100644 --- a/be/src/storage/segment/segment_writer.cpp +++ b/be/src/storage/segment/segment_writer.cpp @@ -63,6 +63,7 @@ #include "storage/rowset/segment_creator.h" #include "storage/segment/column_writer.h" // ColumnWriter #include "storage/segment/external_col_meta_util.h" +#include "storage/segment/historical_row_retriever.h" #include "storage/segment/page_io.h" #include "storage/segment/page_pointer.h" #include "storage/segment/segment_loader.h" @@ -628,8 +629,8 @@ Status SegmentWriter::append_block_with_partial_content(const Block* block, size // read to fill full block RETURN_IF_ERROR(read_plan.fill_missing_columns( - _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, - use_default_or_null_flag, has_default_or_nullable, + _opts.rowset_ctx->make_historical_row_retriever_context(), _rsid_to_rowset, + *_tablet_schema, full_block, use_default_or_null_flag, has_default_or_nullable, cast_set(segment_start_pos), block)); if (_tablet_schema->num_variant_columns() > 0) { @@ -724,22 +725,6 @@ Status SegmentWriter::append_block(const Block* block, size_t row_pos, size_t nu _olap_data_convertor->set_source_content(block, row_pos, num_rows); - // find all row pos for short key indexes - std::vector short_key_pos; - if (_has_key) { - // We build a short key index every `_opts.num_rows_per_block` rows. 
Specifically, we - // build a short key index using 1st rows for first block and `_short_key_row_pos - _row_count` - // for next blocks. - // Ensure we build a short key index using 1st rows only for the first block (ISSUE-9766). - if (UNLIKELY(_short_key_row_pos == 0 && _num_rows_written == 0)) { - short_key_pos.push_back(0); - } - while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + num_rows) { - _short_key_row_pos += _opts.num_rows_per_block; - short_key_pos.push_back(_short_key_row_pos - _num_rows_written); - } - } - // convert column data from engine format to storage layer format std::vector key_columns; IOlapColumnDataAccessor* seq_column = nullptr; @@ -763,54 +748,67 @@ Status SegmentWriter::append_block(const Block* block, size_t row_pos, size_t nu RETURN_IF_ERROR( _variant_stats_calculator->calculate_variant_stats(block, row_pos, num_rows)); } - if (_has_key) { - if (_is_mow_with_cluster_key()) { - // for now we don't need to query short key index for CLUSTER BY feature, - // but we still write the index for future usage. - // 1. generate primary key index, the key_columns is primary_key_columns - RETURN_IF_ERROR(_generate_primary_key_index(_primary_key_coders, key_columns, - seq_column, num_rows, true)); - // 2. 
generate short key index (use cluster key) - key_columns.clear(); - for (const auto& cid : _tablet_schema->cluster_key_uids()) { - // find cluster key index in tablet schema - auto cluster_key_index = _tablet_schema->field_index(cid); - if (cluster_key_index == -1) { - return Status::InternalError( - "could not find cluster key column with unique_id=" + - std::to_string(cid) + " in tablet schema"); - } - bool found = false; - for (auto i = 0; i < _column_ids.size(); ++i) { - if (_column_ids[i] == cluster_key_index) { - auto converted_result = _olap_data_convertor->convert_column_data(i); - if (!converted_result.first.ok()) { - return converted_result.first; - } - key_columns.push_back(converted_result.second); - found = true; - break; + + RETURN_IF_ERROR(build_key_index(key_columns, seq_column, num_rows)); + + _num_rows_written += num_rows; + _olap_data_convertor->clear_source_content(); + return Status::OK(); +} + +Status SegmentWriter::build_key_index(std::vector& key_columns, + IOlapColumnDataAccessor* seq_column, size_t num_rows) { + if (!_has_key) { + return Status::OK(); + } + + // find all row pos for short key indexes + std::vector short_key_pos; + if (UNLIKELY(_short_key_row_pos == 0 && _num_rows_written == 0)) { + short_key_pos.push_back(0); + } + while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + num_rows) { + _short_key_row_pos += _opts.num_rows_per_block; + short_key_pos.push_back(_short_key_row_pos - _num_rows_written); + } + + if (_is_mow_with_cluster_key()) { + // For CLUSTER BY tables: + // 1) generate primary key index (unique keys) + RETURN_IF_ERROR(_generate_primary_key_index(_primary_key_coders, key_columns, seq_column, + num_rows, true)); + // 2) generate short key index (cluster keys) + key_columns.clear(); + for (const auto& cid : _tablet_schema->cluster_key_uids()) { + auto cluster_key_index = _tablet_schema->field_index(cid); + if (cluster_key_index == -1) { + return Status::InternalError("could not find cluster key 
column with unique_id=" + + std::to_string(cid) + " in tablet schema"); + } + bool found = false; + for (auto i = 0; i < _column_ids.size(); ++i) { + if (_column_ids[i] == cluster_key_index) { + auto converted_result = _olap_data_convertor->convert_column_data(i); + if (!converted_result.first.ok()) { + return converted_result.first; } - } - if (!found) { - return Status::InternalError( - "could not found cluster key column with unique_id=" + - std::to_string(cid) + - ", tablet schema index=" + std::to_string(cluster_key_index)); + key_columns.push_back(converted_result.second); + found = true; + break; } } - RETURN_IF_ERROR(_generate_short_key_index(key_columns, num_rows, short_key_pos)); - } else if (_is_mow()) { - RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, - num_rows, false)); - } else { - RETURN_IF_ERROR(_generate_short_key_index(key_columns, num_rows, short_key_pos)); + if (!found) { + return Status::InternalError( + "could not found cluster key column with unique_id=" + std::to_string(cid) + + ", tablet schema index=" + std::to_string(cluster_key_index)); + } } + return _generate_short_key_index(key_columns, num_rows, short_key_pos); } - - _num_rows_written += num_rows; - _olap_data_convertor->clear_source_content(); - return Status::OK(); + if (_is_mow()) { + return _generate_primary_key_index(_key_coders, key_columns, seq_column, num_rows, false); + } + return _generate_short_key_index(key_columns, num_rows, short_key_pos); } int64_t SegmentWriter::max_row_to_add(size_t row_avg_size_in_bytes) { diff --git a/be/src/storage/segment/segment_writer.h b/be/src/storage/segment/segment_writer.h index 37b4e996448d76..9b6b8b55c3aea1 100644 --- a/be/src/storage/segment/segment_writer.h +++ b/be/src/storage/segment/segment_writer.h @@ -82,14 +82,14 @@ class SegmentWriter { explicit SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, const 
SegmentWriterOptions& opts, IndexFileWriter* inverted_file_writer); - ~SegmentWriter(); + virtual ~SegmentWriter(); - Status init(); + virtual Status init(); // for vertical compaction - Status init(const std::vector& col_ids, bool has_key); + virtual Status init(const std::vector& col_ids, bool has_key); - Status append_block(const Block* block, size_t row_pos, size_t num_rows); + virtual Status append_block(const Block* block, size_t row_pos, size_t num_rows); Status probe_key_for_mow(std::string key, std::size_t segment_pos, bool have_input_seq_column, bool have_delete_sign, const std::vector& specified_rowsets, @@ -192,7 +192,11 @@ class SegmentWriter { bool _is_mow(); bool _is_mow_with_cluster_key(); -private: +protected: + // Build key index for derived writers that override append_block. + Status build_key_index(std::vector& key_columns, + IOlapColumnDataAccessor* seq_column, size_t num_rows); + uint32_t _segment_id; TabletSchemaSPtr _tablet_schema; BaseTabletSPtr _tablet; diff --git a/be/src/storage/segment/vertical_segment_writer.cpp b/be/src/storage/segment/vertical_segment_writer.cpp index 6203bf50b233de..7e81a2ab0edbbb 100644 --- a/be/src/storage/segment/vertical_segment_writer.cpp +++ b/be/src/storage/segment/vertical_segment_writer.cpp @@ -67,6 +67,7 @@ #include "storage/rowset/segment_creator.h" #include "storage/segment/column_writer.h" // ColumnWriter #include "storage/segment/external_col_meta_util.h" +#include "storage/segment/historical_row_retriever.h" #include "storage/segment/page_io.h" #include "storage/segment/page_pointer.h" #include "storage/segment/segment_loader.h" @@ -84,8 +85,8 @@ namespace doris::segment_v2 { using namespace ErrorCode; using namespace KeyConsts; -static const char* k_segment_magic = "D0R1"; -static const uint32_t k_segment_magic_length = 4; +static constexpr const char* k_segment_magic = "D0R1"; +static constexpr uint32_t k_segment_magic_length = 4; inline std::string vertical_segment_writer_mem_tracker_name(uint32_t 
segment_id) { return "VerticalSegmentWriter:Segment-" + std::to_string(segment_id); @@ -632,8 +633,9 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da // read to fill full_block RETURN_IF_ERROR(read_plan.fill_missing_columns( - _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, - use_default_or_null_flag, has_default_or_nullable, segment_start_pos, data.block)); + _opts.rowset_ctx->make_historical_row_retriever_context(), _rsid_to_rowset, + *_tablet_schema, full_block, use_default_or_null_flag, has_default_or_nullable, + segment_start_pos, data.block)); if (_tablet_schema->num_variant_columns() > 0) { RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns( @@ -797,9 +799,9 @@ Status VerticalSegmentWriter::_append_block_with_flexible_partial_content(RowsIn // 6. read according plan to fill full_block RETURN_IF_ERROR(read_plan.fill_non_primary_key_columns( - _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, - use_default_or_null_flag, has_default_or_nullable, segment_start_pos, - cast_set(data.row_pos), data.block, skip_bitmaps)); + _opts.rowset_ctx->make_historical_row_retriever_context(), _rsid_to_rowset, + *_tablet_schema, full_block, use_default_or_null_flag, has_default_or_nullable, + segment_start_pos, cast_set(data.row_pos), data.block, skip_bitmaps)); // TODO(bobhan1): should we replace the skip bitmap column with empty bitmaps to reduce storage occupation? 
// this column is not needed in read path for merge-on-write table diff --git a/be/src/storage/storage_engine.cpp b/be/src/storage/storage_engine.cpp index 1a15929111d275..65d06de16b850f 100644 --- a/be/src/storage/storage_engine.cpp +++ b/be/src/storage/storage_engine.cpp @@ -943,6 +943,8 @@ void StorageEngine::_clean_unused_rowset_metas() { bool parsed = rowset_meta->init(meta_str); if (!parsed) { LOG(WARNING) << "parse rowset meta string failed for rowset_id:" << rowset_id; + rowset_meta->set_rowset_id(rowset_id); + rowset_meta->set_tablet_uid(tablet_uid); invalid_rowset_metas.push_back(rowset_meta); return true; } @@ -988,6 +990,45 @@ void StorageEngine::_clean_unused_rowset_metas() { } return true; }; + std::vector> invalid_row_binlog_metas; + auto clean_row_binlog_rowsets = [this, &invalid_row_binlog_metas]( + const TabletUid& tablet_uid, RowsetId rowset_id, + RowsetId row_binlog_rowset_id, + const std::string& meta_str) -> bool { + // return false will break meta iterator, return true to skip this error + RowsetMetaSharedPtr row_binlog_rowset_meta(new RowsetMeta()); + bool parsed = row_binlog_rowset_meta->init(meta_str); + if (!parsed) { + LOG(WARNING) << "parse binlog meta string failed for rowset_id:" + << row_binlog_rowset_id; + row_binlog_rowset_meta->set_rowset_id(row_binlog_rowset_id); + row_binlog_rowset_meta->set_tablet_uid(tablet_uid); + invalid_row_binlog_metas.emplace_back(rowset_id, row_binlog_rowset_meta); + return true; + } + TabletSharedPtr tablet = _tablet_manager->get_tablet(row_binlog_rowset_meta->tablet_id()); + if (tablet == nullptr) { + LOG(INFO) << "failed to find tablet " << row_binlog_rowset_meta->tablet_id() + << " for binlog: " << row_binlog_rowset_meta->rowset_id() + << ", tablet may be dropped"; + invalid_row_binlog_metas.emplace_back(rowset_id, row_binlog_rowset_meta); + return true; + } + if (tablet->tablet_uid() != row_binlog_rowset_meta->tablet_uid()) { + LOG(WARNING) << "binlog meta's tablet uid " << 
row_binlog_rowset_meta->tablet_uid() + << " does not equal to tablet uid: " << tablet->tablet_uid(); + invalid_row_binlog_metas.emplace_back(rowset_id, row_binlog_rowset_meta); + return true; + } + if (row_binlog_rowset_meta->rowset_state() == RowsetStatePB::VISIBLE && + !tablet->rowset_meta_is_useful(row_binlog_rowset_meta) && + !check_rowset_id_in_unused_rowsets(rowset_id)) { + LOG(INFO) << "binlog meta is not used any more, remove it. rowset_id=" + << row_binlog_rowset_meta->rowset_id(); + invalid_row_binlog_metas.emplace_back(rowset_id, row_binlog_rowset_meta); + } + return true; + }; auto data_dirs = get_stores(); for (auto data_dir : data_dirs) { static_cast( @@ -1016,6 +1057,17 @@ void StorageEngine::_clean_unused_rowset_metas() { } LOG(INFO) << "remove " << invalid_rowset_metas.size() << " invalid rowset meta from dir: " << data_dir->path(); + + static_cast(RowsetMetaManager::traverse_row_binlog_metas(data_dir->get_meta(), + clean_row_binlog_rowsets)); + for (auto& rs_id_to_meta : invalid_row_binlog_metas) { + static_cast(RowsetMetaManager::remove_row_binlog( + data_dir->get_meta(), rs_id_to_meta.second->tablet_uid(), rs_id_to_meta.first, + rs_id_to_meta.second->rowset_id())); + } + LOG(INFO) << "remove " << invalid_row_binlog_metas.size() + << " invalid binlog meta from dir: " << data_dir->path(); + invalid_row_binlog_metas.clear(); invalid_rowset_metas.clear(); } } diff --git a/be/src/storage/tablet/base_tablet.cpp b/be/src/storage/tablet/base_tablet.cpp index 6802092c74995e..8e1a3165d4e575 100644 --- a/be/src/storage/tablet/base_tablet.cpp +++ b/be/src/storage/tablet/base_tablet.cpp @@ -36,8 +36,10 @@ #include "common/status.h" #include "core/assert_cast.h" #include "core/data_type/data_type_factory.hpp" +#include "exec/sink/autoinc_buffer.h" // GlobalAutoIncBuffers #include "load/memtable/memtable.h" #include "service/point_query_executor.h" +#include "storage/binlog.h" #include "storage/compaction/cumulative_compaction_time_series_policy.h" #include 
"storage/delete/calc_delete_bitmap_executor.h" #include "storage/delete/delete_bitmap_calculator.h" @@ -46,9 +48,12 @@ #include "storage/partial_update_info.h" #include "storage/rowid_conversion.h" #include "storage/rowset/beta_rowset.h" +#include "storage/rowset/group_rowset_writer.h" #include "storage/rowset/rowset.h" +#include "storage/rowset/rowset_factory.h" #include "storage/rowset/rowset_fwd.h" #include "storage/rowset/rowset_reader.h" +#include "storage/rowset/rowset_writer_context.h" #include "storage/segment/column_reader.h" #include "storage/tablet/tablet_fwd.h" #include "storage/txn/txn_manager.h" @@ -123,6 +128,8 @@ BaseTablet::BaseTablet(TabletMetaSharedPtr tablet_meta) : _tablet_meta(std::move // construct _timestamped_versioned_tracker from rs and stale rs meta _timestamped_version_tracker.construct_versioned_tracker(_tablet_meta->all_rs_metas(), _tablet_meta->all_stale_rs_metas()); + _row_binlog_version_tracker.construct_versioned_tracker( + _tablet_meta->all_row_binlog_rs_metas()); // if !_tablet_meta->all_rs_metas()[0]->tablet_schema(), // that mean the tablet_meta is still no upgrade to doris 1.2 versions. @@ -297,11 +304,14 @@ Versions BaseTablet::get_missed_versions(int64_t spec_version) const { return calc_missed_versions(spec_version, std::move(existing_versions)); } -Versions BaseTablet::get_missed_versions_unlocked(int64_t spec_version) const { +Versions BaseTablet::get_missed_versions_unlocked(int64_t spec_version, + bool capture_row_binlog) const { DCHECK(spec_version > 0) << "invalid spec_version: " << spec_version; Versions existing_versions; - for (const auto& [ver, _] : _tablet_meta->all_rs_metas()) { + const auto& rs_metas = capture_row_binlog ? 
_tablet_meta->all_row_binlog_rs_metas() + : _tablet_meta->all_rs_metas(); + for (const auto& [ver, _] : rs_metas) { existing_versions.emplace_back(ver); } return calc_missed_versions(spec_version, std::move(existing_versions)); @@ -576,8 +586,10 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, Version dummy_version(end_version + 1, end_version + 1); auto rowset_schema = rowset->tablet_schema(); - PartialUpdateInfo* partial_update_info {nullptr}; - bool is_partial_update = rowset_writer && rowset_writer->is_partial_update(); + PartialUpdateInfo* partial_update_info = + rowset_writer != nullptr ? rowset_writer->get_partial_update_info().get() : nullptr; + bool is_partial_update = partial_update_info && partial_update_info->is_partial_update(); + bool need_rewrite_conflict = partial_update_info != nullptr; // `have_input_seq_column` is for fixed partial update only. For flexible partial update, we should use // the skip bitmap to determine wheather a row has specified the sequence column bool have_input_seq_column = false; @@ -585,7 +597,6 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, // another row with higher seqeucne value std::set rids_be_overwritten; if (is_partial_update) { - partial_update_info = rowset_writer->get_partial_update_info().get(); if (partial_update_info->is_fixed_partial_update() && rowset_schema->has_sequence_col()) { std::vector including_cids = rowset_writer->get_partial_update_info()->update_cids; @@ -739,7 +750,7 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, // NOTE: for partial update which doesn't specify the sequence column, we can't use the sequence column value filled in flush phase // as its final value. Otherwise it may cause inconsistency between replicas. 
} - if (is_partial_update && rowset_writer != nullptr) { + if (need_rewrite_conflict) { // In publish version, record rows to be deleted for concurrent update // For example, if version 5 and 6 update a row, but version 6 only see // version 4 when write, and when publish version, version 5's value will @@ -752,8 +763,11 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, // and read missing columns from previous rowsets to create the final block // - for flexible partial update, we should read all columns from current load's rowset // and read non sort key columns from previous rowsets to create the final block - // So we only need to record rows to read for both mode partial update - read_plan_ori.prepare_to_read(loc, pos); + // - for upsert rewrite, we should read all columns from current load's rowset + // So we only need to record rows to read for both mode partial update and upsert rewrite + if (is_partial_update) { + read_plan_ori.prepare_to_read(loc, pos); + } read_plan_update.prepare_to_read(RowLocation {rowset_id, seg->id(), row_id}, pos); // For flexible partial update, we should use skip bitmap to determine wheather @@ -803,17 +817,39 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, } if (pos > 0) { - DCHECK(partial_update_info); - if (partial_update_info->is_fixed_partial_update()) { - RETURN_IF_ERROR(generate_new_block_for_partial_update( - rowset_schema, partial_update_info, read_plan_ori, read_plan_update, - rsid_to_rowset, &block)); - } else { + DCHECK(partial_update_info != nullptr); + if (partial_update_info->is_flexible_partial_update()) { RETURN_IF_ERROR(generate_new_block_for_flexible_partial_update( rowset_schema, partial_update_info, rids_be_overwritten, read_plan_ori, read_plan_update, rsid_to_rowset, &block)); + } else { + RETURN_IF_ERROR(generate_new_block_for_partial_update( + rowset_schema, partial_update_info, read_plan_ori, read_plan_update, + rsid_to_rowset, &block)); } 
RETURN_IF_ERROR(sort_block(block, ordered_block)); + + // Publish-phase partial update may flush transient segments to a GroupRowsetWriter. + // For row-binlog writing, RowBinlogSegmentWriter requires `seg_id -> lsn_ids` to be + // registered before the segment writer is constructed. + if (auto* group_writer = typeid_cast(rowset_writer); + group_writer != nullptr) { + auto seg_id = group_writer->get_allocated_segment_id(); + auto binlog_writer = group_writer->row_binlog_writer(); + auto& binlog_ctx = const_cast(binlog_writer->context()); + if (binlog_ctx.write_binlog_opt().enable) { + auto db_id = binlog_writer->rowset_meta()->db_id(); + auto table_id = binlog_writer->rowset_meta()->table_id(); + DCHECK_GT(db_id, 0); + DCHECK_GT(table_id, 0); + auto lsn_buffer = GlobalAutoIncBuffers::GetInstance()->get_auto_inc_buffer( + db_id, table_id, kBinlogLsnAutoIncId); + std::shared_ptr> lsn_ids; + RETURN_IF_ERROR(allocate_binlog_lsn(lsn_buffer, ordered_block.rows(), &lsn_ids)); + binlog_ctx.write_binlog_opt().write_binlog_config().insert_seg_lsn( + seg_id, std::move(lsn_ids)); + } + } RETURN_IF_ERROR(rowset_writer->flush_single_block(&ordered_block)); auto cost_us = watch.get_elapse_time_us(); if (config::enable_mow_verbose_log || cost_us > 10 * 1000) { @@ -1414,18 +1450,43 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf RowsetIdUnorderedSet rowset_ids_to_add; RowsetIdUnorderedSet rowset_ids_to_del; RowsetSharedPtr rowset = txn_info->rowset; + RowsetSharedPtr row_binlog_rowset; + bool build_row_binlog = false; int64_t cur_version = rowset->start_version(); - std::unique_ptr transient_rs_writer; DeleteBitmapPtr delete_bitmap = txn_info->delete_bitmap; bool is_partial_update = txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update(); - if (is_partial_update) { + for (const auto& rs : txn_info->attach_rowsets) { + if (rs != nullptr && rs->rowset_meta() != nullptr && rs->rowset_meta()->is_row_binlog()) { + 
row_binlog_rowset = rs; + build_row_binlog = is_partial_update || + self->tablet_meta()->binlog_config().need_historical_value(); + break; + } + } + + // rewrite conflict only when partial update or need before + if (is_partial_update || build_row_binlog) { + if (txn_info->partial_update_info == nullptr) { + txn_info->partial_update_info = std::make_shared(); + } + if (txn_info->partial_update_info->partial_update_mode == UniqueKeyUpdateModePB::UPSERT) { + txn_info->partial_update_info->partial_update_input_columns.clear(); + txn_info->partial_update_info->missing_cids.clear(); + txn_info->partial_update_info->default_values.clear(); + auto& update_cids = txn_info->partial_update_info->update_cids; + update_cids.resize(rowset->tablet_schema()->num_columns()); + std::iota(update_cids.begin(), update_cids.end(), 0); + } + + DCHECK(txn_info->partial_update_info != nullptr); + transient_rs_writer = DORIS_TRY(self->create_transient_rowset_writer( *rowset, txn_info->partial_update_info, txn_expiration)); DBUG_EXECUTE_IF("BaseTablet::update_delete_bitmap.after.create_transient_rs_writer", DBUG_BLOCK); - // Partial update might generate new segments when there is conflicts while publish, and mark + // Partial update or upsert rewrite might generate new segments when there is conflicts while publish, and mark // the same key in original segments as delete. // When the new segment flush fails or the rowset build fails, the deletion marker for the // duplicate key of the original segment should not remain in `txn_info->delete_bitmap`, @@ -1523,6 +1584,43 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf RETURN_IF_ERROR(token->wait()); } + // Publish-phase partial update or upsert rewrite may generate transient segments. 
If row binlog is enabled for + // this txn (row binlog rowset attached), we need to build row binlog segments together with + // transient data segments: + // 1) create a transient row binlog writer that appends to the same version row binlog rowset + // 2) pre-allocate per-row LSNs for each transient segment and register them in binlog options + // 3) wrap both writers into a GroupRowsetWriter so calc_delete_bitmap writes to both. + if (build_row_binlog) { + DCHECK(transient_rs_writer != nullptr); + + // Create transient row binlog writer for publish-phase segment appending. + auto transient_row_binlog_writer = DORIS_TRY(self->create_transient_rowset_writer( + *row_binlog_rowset, txn_info->partial_update_info, txn_expiration)); + + // Prepare source MOW context for historical row retrieval in binlog writer. + auto& data_ctx = const_cast(transient_rs_writer->context()); + data_ctx.mow_context = std::make_shared( + cur_version - 1, txn_id, std::make_shared(), + specified_rowsets, nullptr); + + auto& binlog_ctx = const_cast(transient_row_binlog_writer->context()); + auto& cfg = binlog_ctx.write_binlog_opt().write_binlog_config(); + cfg.source.tablet_schema = data_ctx.tablet_schema; + cfg.source.partial_update_info = data_ctx.partial_update_info; + cfg.source.mow_context = data_ctx.mow_context; + cfg.source.is_transient_rowset_writer = data_ctx.is_transient_rowset_writer; + cfg.source.source_write_type = data_ctx.write_type; + + // Wrap two transient writers into a group writer for dual flush/build. 
+ RowsetWriterSharedPtr data_writer_sp(std::move(transient_rs_writer)); + RowsetWriterSharedPtr row_binlog_writer_sp(std::move(transient_row_binlog_writer)); + std::unique_ptr group_writer; + RETURN_IF_ERROR(RowsetFactory::create_empty_group_rowset_writer(&group_writer)); + group_writer->set_data_writer(data_writer_sp); + group_writer->set_row_binlog_writer(row_binlog_writer_sp); + transient_rs_writer = std::move(group_writer); + } + // When there is only one segment, it will be calculated in the current thread. // Otherwise, it will be submitted to the thread pool for calculation. if (segments.size() <= 1) { @@ -1566,7 +1664,17 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf // build rowset writer and merge transient rowset RETURN_IF_ERROR(transient_rs_writer->flush()); RowsetSharedPtr transient_rowset; - RETURN_IF_ERROR(transient_rs_writer->build(transient_rowset)); + RowsetSharedPtr transient_row_binlog; + if (build_row_binlog) { + auto* group_rowset_writer = typeid_cast(transient_rs_writer.get()); + DCHECK(group_rowset_writer != nullptr); + std::vector waited_build_rowsets; + RETURN_IF_ERROR(group_rowset_writer->build_rowsets(waited_build_rowsets)); + transient_rowset = waited_build_rowsets.at(0); + transient_row_binlog = waited_build_rowsets.at(1); + } else { + RETURN_IF_ERROR(transient_rs_writer->build(transient_rowset)); + } auto old_segments = rowset->num_segments(); rowset->merge_rowset_meta(*transient_rowset->rowset_meta()); auto new_segments = rowset->num_segments(); @@ -1575,6 +1683,19 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf << ", new segment num: " << new_segments << ")" << ", cost:" << watch.get_elapse_time_us() - t4 << "(us)"; + if (build_row_binlog) { + DCHECK(row_binlog_rowset != nullptr); + old_segments = row_binlog_rowset->num_segments(); + row_binlog_rowset->merge_rowset_meta(*transient_row_binlog->rowset_meta()); + new_segments = row_binlog_rowset->num_segments(); + 
ss << ", " << txn_info->partial_update_info->partial_update_mode_str() + << " flush binlog (old segment num: " << old_segments + << ", new segment num: " << new_segments << ")"; + + SegmentLoader::instance()->erase_segments(row_binlog_rowset->rowset_id(), + row_binlog_rowset->num_segments()); + } + // update the shared_ptr to new bitmap, which is consistent with current rowset. txn_info->delete_bitmap = delete_bitmap; // erase segment cache cause we will add a segment to rowset diff --git a/be/src/storage/tablet/base_tablet.h b/be/src/storage/tablet/base_tablet.h index 4459a62d12ab77..e677e6cd5e5f60 100644 --- a/be/src/storage/tablet/base_tablet.h +++ b/be/src/storage/tablet/base_tablet.h @@ -103,6 +103,11 @@ class BaseTablet : public std::enable_shared_from_this { return _max_version_schema; } + TabletSchemaSPtr row_binlog_tablet_schema() const { + std::shared_lock rlock(_meta_lock); + return _tablet_meta->row_binlog_schema(); + } + void set_alter_failed(bool alter_failed) { _alter_failed = alter_failed; } bool is_alter_failed() { return _alter_failed; } @@ -138,7 +143,8 @@ class BaseTablet : public std::enable_shared_from_this { // Get the missed versions until the spec_version. 
Versions get_missed_versions(int64_t spec_version) const; - Versions get_missed_versions_unlocked(int64_t spec_version) const; + Versions get_missed_versions_unlocked(int64_t spec_version, + bool capture_row_binlog = false) const; void generate_tablet_meta_copy(TabletMeta& new_tablet_meta, bool cloud_get_rowset_meta) const; void generate_tablet_meta_copy_unlocked(TabletMeta& new_tablet_meta, @@ -451,6 +457,7 @@ struct CaptureRowsetOps { bool quiet = false; bool include_stale_rowsets = true; bool enable_fetch_rowsets_from_peers = false; + bool capture_row_binlog = false; // ======== only take effect in cloud mode ======== diff --git a/be/src/storage/tablet/tablet.cpp b/be/src/storage/tablet/tablet.cpp index 9f0ba06696a3b9..1d9c17f36209f3 100644 --- a/be/src/storage/tablet/tablet.cpp +++ b/be/src/storage/tablet/tablet.cpp @@ -330,7 +330,7 @@ Status Tablet::_init_once_action() { RowsetSharedPtr rowset; res = create_rowset(row_binlog_rs_meta, &rowset); if (!res.ok()) { - LOG(WARNING) << "fail to init row_binlog rowset. tablet_id:" << tablet_id() + LOG(WARNING) << "fail to init binlog rowset. tablet_id:" << tablet_id() << ", schema_hash:" << schema_hash() << ", version=" << version << ", res:" << res; return res; @@ -501,7 +501,8 @@ Status Tablet::add_rowset(RowsetSharedPtr rowset, RowsetSharedPtr row_binlog_row // If the rowset already exist, just return directly. The rowset_id is an unique-id, // we can use it to check this situation. if (_contains_rowset(rowset->rowset_id())) { - return Status::OK(); + // Ensure binlog is also added on retry. + return _add_row_binlog_rowset_unlocked(rowset, row_binlog_rowset); } // Otherwise, the version should be not contained in any existing rowset. 
RETURN_IF_ERROR(_contains_version(rowset->version())); @@ -511,11 +512,7 @@ Status Tablet::add_rowset(RowsetSharedPtr rowset, RowsetSharedPtr row_binlog_row _timestamped_version_tracker.add_version(rowset->version()); add_compaction_score(rowset->rowset_meta()->get_compaction_score()); - if (row_binlog_rowset != nullptr) { - RETURN_IF_ERROR(_tablet_meta->add_row_binlog_rs_meta(row_binlog_rowset->rowset_meta())); - _row_binlog_rs_version_map[rowset->version()] = row_binlog_rowset; - _row_binlog_version_tracker.add_version(row_binlog_rowset->version()); - } + RETURN_IF_ERROR(_add_row_binlog_rowset_unlocked(rowset, row_binlog_rowset)); std::vector rowsets_to_delete; // yiguolei: temp code, should remove the rowset contains by this rowset @@ -535,6 +532,32 @@ Status Tablet::add_rowset(RowsetSharedPtr rowset, RowsetSharedPtr row_binlog_row return Status::OK(); } +Status Tablet::_add_row_binlog_rowset_unlocked(const RowsetSharedPtr& rowset, + const RowsetSharedPtr& row_binlog_rowset) { + if (row_binlog_rowset == nullptr) { + return Status::OK(); + } + DCHECK(rowset != nullptr); + DCHECK_EQ(row_binlog_rowset->version(), rowset->version()); + + const auto& version = row_binlog_rowset->version(); + if (auto it = _row_binlog_rs_version_map.find(version); + it != _row_binlog_rs_version_map.end()) { + if (it->second != nullptr && it->second->rowset_id() == row_binlog_rowset->rowset_id()) { + return Status::OK(); + } + return Status::Error( + "binlog version already exists. existing rowset_id={}, version={}, tablet={}", + it->second != nullptr ? 
it->second->rowset_id().to_string() : "0", + version.to_string(), tablet_id()); + } + + RETURN_IF_ERROR(_tablet_meta->add_row_binlog_rs_meta(row_binlog_rowset->rowset_meta())); + _row_binlog_rs_version_map[version] = row_binlog_rowset; + _row_binlog_version_tracker.add_version(version); + return Status::OK(); +} + bool Tablet::rowset_exists_unlocked(const RowsetSharedPtr& rowset) { if (auto it = _rs_version_map.find(rowset->version()); it == _rs_version_map.end()) { return false; @@ -710,12 +733,14 @@ RowsetSharedPtr Tablet::_rowset_with_largest_size() { } // add inc rowset should not persist tablet meta, because it will be persisted when publish txn. -Status Tablet::add_inc_rowset(const RowsetSharedPtr& rowset) { +Status Tablet::add_inc_rowset(const RowsetSharedPtr& rowset, + const RowsetSharedPtr& row_binlog_rowset) { DCHECK(rowset != nullptr); std::lock_guard wrlock(_meta_lock); SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); if (_contains_rowset(rowset->rowset_id())) { - return Status::OK(); + // Ensure binlog is also added on retry. + return _add_row_binlog_rowset_unlocked(rowset, row_binlog_rowset); } RETURN_IF_ERROR(_contains_version(rowset->version())); @@ -728,7 +753,7 @@ Status Tablet::add_inc_rowset(const RowsetSharedPtr& rowset) { add_compaction_score(rowset->rowset_meta()->get_compaction_score()); - return Status::OK(); + return _add_row_binlog_rowset_unlocked(rowset, row_binlog_rowset); } void Tablet::_delete_stale_rowset_by_version(const Version& version) { @@ -1558,6 +1583,44 @@ bool Tablet::do_tablet_meta_checkpoint() { rs_meta->set_remove_from_rowset_meta(); } + // Remove row binlog metas from rowset meta store after tablet meta is checkpointed. + // Row binlog metas are stored in meta KV with key: + // {kRowBinlogPrefix}{tablet_uid}_{base_rowset_id}_{row_binlog_rowset_id} + // Here we only have row binlog rowset metas, so locate base rowset by version. 
+ const auto& base_rs_metas = _tablet_meta->all_rs_metas(); + const auto& stale_rs_metas = _tablet_meta->all_stale_rs_metas(); + for (const auto& [_, rb_meta] : _tablet_meta->all_row_binlog_rs_metas()) { + // Reuse the same flag to avoid repeated removals across checkpoints. + if (rb_meta->is_remove_from_rowset_meta()) { + continue; + } + + RowsetMetaSharedPtr base_rs_meta; + if (auto base_it = base_rs_metas.find(rb_meta->version()); base_it != base_rs_metas.end()) { + base_rs_meta = base_it->second; + } else if (auto stale_it = stale_rs_metas.find(rb_meta->version()); + stale_it != stale_rs_metas.end()) { + base_rs_meta = stale_it->second; + } + if (base_rs_meta == nullptr) { + LOG(WARNING) << "failed to locate base rowset meta for binlog by version, tablet=" + << tablet_id() << ", version=" << rb_meta->version().to_string() + << ", binlog_rowset_id=" << rb_meta->rowset_id() + << ", try to remove by scanning row-binlog meta store"; + RETURN_FALSE_IF_ERROR(RowsetMetaManager::remove_row_binlog_metas( + _data_dir->get_meta(), tablet_uid(), {rb_meta->rowset_id()})); + rb_meta->set_remove_from_rowset_meta(); + continue; + } + + RETURN_FALSE_IF_ERROR(RowsetMetaManager::remove_row_binlog( + _data_dir->get_meta(), tablet_uid(), base_rs_meta->rowset_id(), + rb_meta->rowset_id())); + VLOG_NOTICE << "remove binlog meta from meta store, base_rowset_id=" + << base_rs_meta->rowset_id() << ", binlog_rowset_id=" << rb_meta->rowset_id(); + rb_meta->set_remove_from_rowset_meta(); + } + if (keys_type() == UNIQUE_KEYS && enable_unique_key_merge_on_write()) { RETURN_FALSE_IF_ERROR(TabletMetaManager::remove_old_version_delete_bitmap( _data_dir, tablet_id(), max_version_unlocked())); @@ -1587,6 +1650,16 @@ bool Tablet::rowset_meta_is_useful(RowsetMetaSharedPtr rowset_meta) { find_version = true; } } + if (rowset_meta->is_row_binlog()) { + for (auto& version_rowset : _row_binlog_rs_version_map) { + if (version_rowset.second->rowset_id() == rowset_meta->rowset_id()) { + return true; + } + 
if (version_rowset.second->contains_version(rowset_meta->version())) { + find_version = true; + } + } + } return !find_version; } @@ -1970,29 +2043,22 @@ Status Tablet::create_initial_rowset(const int64_t req_version) { RETURN_IF_ERROR(RowsetFactory::create_empty_group_rowset_writer(&group_rowset_writer)); RowsetWriterContext data_context; - data_context.write_binlog_opt().mark_primary_writer(); RETURN_IF_ERROR(get_rowset_writer_context(data_context, tablet_schema())); auto data_writer = DORIS_TRY(create_rowset_writer(data_context, false)); group_rowset_writer->set_data_writer(std::move(data_writer)); RowsetWriterContext row_binlog_context; - row_binlog_context.write_binlog_opt().mark_binlog_writer(); + row_binlog_context.write_binlog_opt().enable = true; RETURN_IF_ERROR(get_rowset_writer_context(row_binlog_context, row_binlog_tablet_schema())); auto row_binlog_writer = DORIS_TRY(create_rowset_writer(row_binlog_context, false)); group_rowset_writer->set_row_binlog_writer(std::move(row_binlog_writer)); RETURN_IF_ERROR(group_rowset_writer->flush_rowsets()); - RowsetSharedPtr new_data_rowset; - RowsetSharedPtr new_row_binlog_rowset; std::vector waited_build_rowsets; - waited_build_rowsets.push_back(std::move(new_data_rowset)); - waited_build_rowsets.push_back(std::move(new_row_binlog_rowset)); RETURN_IF_ERROR(group_rowset_writer->build_rowsets(waited_build_rowsets)); - // don't need to think rollback when only one rowset build success becuase they had not been persisted. 
- RETURN_IF_ERROR( - add_rowset(std::move(waited_build_rowsets.at(0)), waited_build_rowsets.at(1))); + RETURN_IF_ERROR(add_rowset(waited_build_rowsets.at(0), waited_build_rowsets.at(1))); } set_cumulative_layer_point(req_version + 1); @@ -2014,6 +2080,8 @@ Result> Tablet::create_transient_rowset_writer( RowsetWriterContext context; context.rowset_state = PREPARED; context.segments_overlap = OVERLAPPING; + context.db_id = rowset.rowset_meta()->db_id(); + context.table_id = rowset.rowset_meta()->table_id(); context.tablet_schema = std::make_shared(); // During a partial update, the extracted columns of a variant should not be included in the tablet schema. // This is because the partial update for a variant needs to ignore the extracted columns. @@ -2034,6 +2102,11 @@ Result> Tablet::create_transient_rowset_writer( context.write_type = DataWriteType::TYPE_DIRECT; context.partial_update_info = std::move(partial_update_info); context.is_transient_rowset_writer = true; + + if (rowset.rowset_meta() != nullptr && rowset.rowset_meta()->is_row_binlog()) { + context.write_binlog_opt().enable = true; + } + return create_transient_rowset_writer(context, rowset.rowset_id()) .transform([&](auto&& writer) { writer->set_segment_start_id(cast_set(rowset.num_segments())); @@ -2049,6 +2122,13 @@ Result> Tablet::create_transient_rowset_writer( } void Tablet::_init_context_common_fields(RowsetWriterContext& context) { + if (context.db_id <= 0) { + context.db_id = tablet_meta()->tablet_schema()->db_id(); + } + if (context.table_id <= 0) { + context.table_id = tablet_meta()->tablet_schema()->table_id(); + } + context.tablet_uid = tablet_uid(); context.tablet_id = tablet_id(); context.partition_id = partition_id(); @@ -2069,7 +2149,7 @@ void Tablet::_init_context_common_fields(RowsetWriterContext& context) { context.encrypt_algorithm = tablet_meta()->encryption_algorithm(); - if (context.write_binlog_opt().is_binlog_writer()) { + if (context.write_binlog_opt().enable) { 
context.tablet_schema_hash = row_binlog_schema_hash(); bool need_before = tablet_meta()->binlog_config().need_historical_value(); context.write_binlog_opt().set_need_before(need_before); @@ -2627,6 +2707,20 @@ Status Tablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, RowsetSharedPtr rowset = txn_info->rowset; int64_t cur_version = rowset->start_version(); + // For binlog publish, sync current rowset delete bitmap deltas to `binlog_delvec` + // so row binlog reads can skip rows deleted by MOW conflict resolution. + const bool build_row_binlog = !txn_info->attach_rowsets.empty(); + const RowsetId cur_build_rid = txn_info->rowset->rowset_id(); + const RowsetId binlog_rid = + build_row_binlog ? txn_info->attach_rowsets[0]->rowset_id() : cur_build_rid; + auto* binlog_delvec = txn_info->binlog_delvec.get(); + if (build_row_binlog) { + DCHECK(txn_info->attach_rowsets[0] != nullptr); + DCHECK(txn_info->attach_rowsets[0]->rowset_meta() != nullptr); + DCHECK(txn_info->attach_rowsets[0]->rowset_meta()->is_row_binlog()); + DCHECK(binlog_delvec != nullptr); + } + // update version without write lock, compaction and publish_txn // will update delete bitmap, handle compaction with _rowset_update_lock // and publish_txn runs sequential so no need to lock here @@ -2635,6 +2729,15 @@ Status Tablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, if (std::get<1>(key) != DeleteBitmap::INVALID_SEGMENT_ID) { _tablet_meta->delete_bitmap().merge({std::get<0>(key), std::get<1>(key), cur_version}, bitmap); + + if (build_row_binlog && std::get<0>(key) == cur_build_rid) { + const DeleteBitmap::SegmentId& sid = std::get<1>(key); + // Merge current delete bitmap deltas to binlog delvec because publish-phase + // partial-update and seq replace will influence the binlog file. 
+ binlog_delvec->merge({binlog_rid, sid, cast_set(cur_version)}, bitmap); + _tablet_meta->binlog_delvec().merge( + {binlog_rid, sid, cast_set(cur_version)}, bitmap); + } } } diff --git a/be/src/storage/tablet/tablet.h b/be/src/storage/tablet/tablet.h index 3c715cea7e48d0..5d2a0e5dcebe8f 100644 --- a/be/src/storage/tablet/tablet.h +++ b/be/src/storage/tablet/tablet.h @@ -182,7 +182,9 @@ class Tablet final : public BaseTablet { std::vector& to_delete, bool check_delete = false); bool rowset_exists_unlocked(const RowsetSharedPtr& rowset); - Status add_inc_rowset(const RowsetSharedPtr& rowset); + // Add a committed data rowset and its row binlog rowset + Status add_inc_rowset(const RowsetSharedPtr& rowset, + const RowsetSharedPtr& row_binlog_rowset = nullptr); /// Delete stale rowset by timing. This delete policy uses now() minutes /// config::tablet_rowset_expired_stale_sweep_time_sec to compute the deadline of expired rowset /// to delete. When rowset is deleted, it will be added to StorageEngine unused map and record @@ -483,11 +485,11 @@ class Tablet final : public BaseTablet { return config::enable_feature_binlog && _tablet_meta->binlog_config().is_enable(); } bool enable_ccr_binlog() const { - return enable_binlog() && _tablet_meta->binlog_config().isCCRBinlogFormat(); + return enable_binlog() && _tablet_meta->binlog_config().is_ccr_binlog_format(); } bool enable_row_binlog() const { return _tablet_meta->binlog_config().is_enable() && - _tablet_meta->binlog_config().isRowBinlogFormat(); + _tablet_meta->binlog_config().is_row_binlog_format(); } int64_t binlog_ttl_ms() const { return _tablet_meta->binlog_config().ttl_seconds(); } @@ -497,10 +499,6 @@ class Tablet final : public BaseTablet { // row_binlog int32_t row_binlog_schema_hash() const { return _tablet_meta->row_binlog_schema_hash(); } - TabletSchemaSPtr row_binlog_tablet_schema() { - std::shared_lock rdlock(_meta_lock); - return _tablet_meta->row_binlog_schema(); - } void set_is_full_compaction_running(bool 
is_full_compaction_running) { _is_full_compaction_running = is_full_compaction_running; @@ -534,6 +532,8 @@ class Tablet final : public BaseTablet { Status _init_once_action(); bool _contains_rowset(const RowsetId rowset_id); Status _contains_version(const Version& version); + Status _add_row_binlog_rowset_unlocked(const RowsetSharedPtr& rowset, + const RowsetSharedPtr& row_binlog_rowset); // Returns: // version: the max continuous version from beginning diff --git a/be/src/storage/tablet/tablet_manager.cpp b/be/src/storage/tablet/tablet_manager.cpp index 4c3b54e3cf7f20..4d776827ae32d2 100644 --- a/be/src/storage/tablet/tablet_manager.cpp +++ b/be/src/storage/tablet/tablet_manager.cpp @@ -480,6 +480,9 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked( _gen_tablet_dir(data_dir->path(), tablet_meta->shard_id(), request.tablet_id); string schema_hash_dir = path_util::join_path_segments( tablet_dir, std::to_string(request.tablet_schema.schema_hash)); + bool has_row_binlog = tablet_meta->binlog_config().is_enable() && + tablet_meta->binlog_config().is_row_binlog_format(); + string row_binlog_dir = path_util::join_path_segments(schema_hash_dir, "_row_binlog"); // Because the tablet is removed asynchronously, so that the dir may still exist when BE // receive create-tablet request again, For example retried schema-change request @@ -498,6 +501,15 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked( } } + if (has_row_binlog) { + Status st = io::global_local_filesystem()->create_directory(row_binlog_dir); + if (!st.ok()) { + WARN_IF_ERROR(io::global_local_filesystem()->delete_directory(schema_hash_dir), + "failed to cleanup tablet dir after create sub directory failed"); + continue; + } + } + if (tablet_meta->partition_id() <= 0) { LOG(WARNING) << "invalid partition id " << tablet_meta->partition_id() << ", tablet " << tablet_meta->tablet_id(); diff --git a/be/src/storage/tablet/tablet_meta_manager.cpp 
b/be/src/storage/tablet/tablet_meta_manager.cpp index e6d10c28665a82..45815a292aa96e 100644 --- a/be/src/storage/tablet/tablet_meta_manager.cpp +++ b/be/src/storage/tablet/tablet_meta_manager.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "common/logging.h" @@ -234,21 +235,48 @@ void NO_SANITIZE_UNDEFINED TabletMetaManager::decode_delete_bitmap_key(std::stri Status TabletMetaManager::save_delete_bitmap(DataDir* store, TTabletId tablet_id, DeleteBitmapPtr delete_bitmap, int64_t version) { + return save_delete_bitmap(store, tablet_id, std::move(delete_bitmap), nullptr, version); +} + +Status TabletMetaManager::save_delete_bitmap(DataDir* store, TTabletId tablet_id, + DeleteBitmapPtr delete_bitmap, + DeleteBitmapPtr binlog_delvec, int64_t version) { VLOG_NOTICE << "save delete bitmap, tablet_id:" << tablet_id << ", version: " << version; - if (delete_bitmap->delete_bitmap.empty()) { + if ((delete_bitmap == nullptr || delete_bitmap->delete_bitmap.empty()) && + (binlog_delvec == nullptr || binlog_delvec->delete_bitmap.empty())) { return Status::OK(); } OlapMeta* meta = store->get_meta(); DeleteBitmapPB delete_bitmap_pb; - for (auto& [id, bitmap] : delete_bitmap->delete_bitmap) { - auto& rowset_id = std::get<0>(id); - auto segment_id = std::get<1>(id); - delete_bitmap_pb.add_rowset_ids(rowset_id.to_string()); - delete_bitmap_pb.add_segment_ids(segment_id); - std::string bitmap_data(bitmap.getSizeInBytes(), '\0'); - bitmap.write(bitmap_data.data()); - *(delete_bitmap_pb.add_segment_delete_bitmaps()) = std::move(bitmap_data); + + // Normal delete bitmap. 
+ if (delete_bitmap != nullptr) { + for (auto& [id, bitmap] : delete_bitmap->delete_bitmap) { + auto& rowset_id = std::get<0>(id); + auto segment_id = std::get<1>(id); + delete_bitmap_pb.add_rowset_ids(rowset_id.to_string()); + delete_bitmap_pb.add_segment_ids(segment_id); + std::string bitmap_data(bitmap.getSizeInBytes(), '\0'); + bitmap.write(bitmap_data.data()); + *(delete_bitmap_pb.add_segment_delete_bitmaps()) = std::move(bitmap_data); + delete_bitmap_pb.add_is_binlog_delvec(false); + } } + + // Binlog delvec. + if (binlog_delvec != nullptr) { + for (auto& [id, bitmap] : binlog_delvec->delete_bitmap) { + auto& rowset_id = std::get<0>(id); + auto segment_id = std::get<1>(id); + delete_bitmap_pb.add_rowset_ids(rowset_id.to_string()); + delete_bitmap_pb.add_segment_ids(segment_id); + std::string bitmap_data(bitmap.getSizeInBytes(), '\0'); + bitmap.write(bitmap_data.data()); + *(delete_bitmap_pb.add_segment_delete_bitmaps()) = std::move(bitmap_data); + delete_bitmap_pb.add_is_binlog_delvec(true); + } + } + std::string key = encode_delete_bitmap_key(tablet_id, version); std::string val; bool ok = delete_bitmap_pb.SerializeToString(&val); diff --git a/be/src/storage/tablet/tablet_meta_manager.h b/be/src/storage/tablet/tablet_meta_manager.h index eafcce9191191f..c0676a7ef6c5d1 100644 --- a/be/src/storage/tablet/tablet_meta_manager.h +++ b/be/src/storage/tablet/tablet_meta_manager.h @@ -75,6 +75,12 @@ class TabletMetaManager { static Status save_delete_bitmap(DataDir* store, TTabletId tablet_id, DeleteBitmapPtr delete_bitmap, int64_t version); + // Persist both normal delete bitmap and binlog delvec for a specific visible version. + // `binlog_delvec` is optional. 
+ static Status save_delete_bitmap(DataDir* store, TTabletId tablet_id, + DeleteBitmapPtr delete_bitmap, DeleteBitmapPtr binlog_delvec, + int64_t version); + static Status traverse_delete_bitmap( OlapMeta* meta, std::function const& func); diff --git a/be/src/storage/tablet/tablet_reader.cpp b/be/src/storage/tablet/tablet_reader.cpp index 79a9d05599a5e7..a66493c80f0985 100644 --- a/be/src/storage/tablet/tablet_reader.cpp +++ b/be/src/storage/tablet/tablet_reader.cpp @@ -121,7 +121,8 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { } bool need_ordered_result = true; - if (read_params.reader_type == ReaderType::READER_QUERY) { + if (read_params.reader_type == ReaderType::READER_QUERY || + read_params.reader_type == ReaderType::READER_BINLOG) { if (_tablet_schema->keys_type() == DUP_KEYS) { // duplicated keys are allowed, no need to merge sort keys in rowset need_ordered_result = false; @@ -272,7 +273,8 @@ Status TabletReader::_init_params(const ReaderParams& read_params) { Status TabletReader::_init_return_columns(const ReaderParams& read_params) { SCOPED_RAW_TIMER(&_stats.tablet_reader_init_return_columns_timer_ns); - if (read_params.reader_type == ReaderType::READER_QUERY) { + if (read_params.reader_type == ReaderType::READER_QUERY || + read_params.reader_type == ReaderType::READER_BINLOG) { _return_columns = read_params.return_columns; _tablet_columns_convert_to_null_set = read_params.tablet_columns_convert_to_null_set; for (auto id : read_params.return_columns) { diff --git a/be/src/storage/tablet/tablet_schema.h b/be/src/storage/tablet/tablet_schema.h index d1f286c3a503df..dec976d26e0624 100644 --- a/be/src/storage/tablet/tablet_schema.h +++ b/be/src/storage/tablet/tablet_schema.h @@ -457,6 +457,15 @@ class TabletSchema : public MetadataAdder { void replace_column(size_t pos, TabletColumn new_col); const std::vector& columns() const; size_t num_columns() const { return _num_columns; } + size_t num_visible_columns() const { + return 
std::count_if(_cols.begin(), _cols.end(), + [](const TabletColumnPtr& column) { return column->visible(); }); + } + size_t num_visible_value_columns() const { + return std::count_if(_cols.begin(), _cols.end(), [](const TabletColumnPtr& column) { + return column->visible() && !column->is_key(); + }); + } size_t num_key_columns() const { return _num_key_columns; } const std::vector& cluster_key_uids() const { return _cluster_key_uids; } size_t num_null_columns() const { return _num_null_columns; } diff --git a/be/src/storage/tablet_info.cpp b/be/src/storage/tablet_info.cpp index 34abb1136e8e1d..518dd7baee4878 100644 --- a/be/src/storage/tablet_info.cpp +++ b/be/src/storage/tablet_info.cpp @@ -63,6 +63,9 @@ namespace doris { void OlapTableIndexSchema::to_protobuf(POlapTableIndexSchema* pindex) const { pindex->set_id(index_id); pindex->set_schema_hash(schema_hash); + if (row_binlog_id > 0) { + pindex->set_row_binlog_id(row_binlog_id); + } for (auto* slot : slots) { pindex->add_columns(slot->col_name()); } @@ -182,6 +185,9 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { auto* index = _obj_pool.add(new OlapTableIndexSchema()); index->index_id = p_index.id(); index->schema_hash = p_index.schema_hash(); + if (p_index.has_row_binlog_id()) { + index->row_binlog_id = p_index.row_binlog_id(); + } for (const auto& pcolumn_desc : p_index.columns_desc()) { if (_unique_key_update_mode != UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS || _partial_update_input_columns.contains(pcolumn_desc.name())) { @@ -219,6 +225,27 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { _indexes.emplace_back(index); } + if (pschema.has_row_binlog_index_schema()) { + const auto& p_index = pschema.row_binlog_index_schema(); + auto* index = _obj_pool.add(new OlapTableIndexSchema()); + index->index_id = p_index.id(); + index->schema_hash = p_index.schema_hash(); + if (p_index.has_row_binlog_id()) { + index->row_binlog_id = p_index.row_binlog_id(); + } + 
for (const auto& pcolumn_desc : p_index.columns_desc()) { + TabletColumn* tc = _obj_pool.add(new TabletColumn()); + tc->init_from_pb(pcolumn_desc); + index->columns.emplace_back(tc); + } + for (const auto& pindex_desc : p_index.indexes_desc()) { + TabletIndex* ti = _obj_pool.add(new TabletIndex()); + ti->init_from_pb(pindex_desc); + index->indexes.emplace_back(ti); + } + _row_binlog_index_schema = index; + } + std::sort(_indexes.begin(), _indexes.end(), [](const OlapTableIndexSchema* lhs, const OlapTableIndexSchema* rhs) { return lhs->index_id < rhs->index_id; @@ -320,6 +347,9 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { auto* index = _obj_pool.add(new OlapTableIndexSchema()); index->index_id = t_index.id; index->schema_hash = t_index.schema_hash; + if (t_index.__isset.row_binlog_id) { + index->row_binlog_id = t_index.row_binlog_id; + } for (const auto& tcolumn_desc : t_index.columns_desc) { if (_unique_key_update_mode != UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS || _partial_update_input_columns.contains(tcolumn_desc.column_name)) { @@ -372,6 +402,22 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { _indexes.emplace_back(index); } + if (tschema.__isset.row_binlog_index_schema) { + const auto& t_index = tschema.row_binlog_index_schema; + auto* index = _obj_pool.add(new OlapTableIndexSchema()); + index->index_id = t_index.id; + index->schema_hash = t_index.schema_hash; + if (t_index.__isset.row_binlog_id) { + index->row_binlog_id = t_index.row_binlog_id; + } + for (const auto& tcolumn_desc : t_index.columns_desc) { + TabletColumn* tc = _obj_pool.add(new TabletColumn()); + tc->init_from_thrift(tcolumn_desc); + index->columns.emplace_back(tc); + } + _row_binlog_index_schema = index; + } + std::sort(_indexes.begin(), _indexes.end(), [](const OlapTableIndexSchema* lhs, const OlapTableIndexSchema* rhs) { return lhs->index_id < rhs->index_id; @@ -406,6 +452,9 @@ void 
OlapTableSchemaParam::to_protobuf(POlapTableSchemaParam* pschema) const { for (auto* index : _indexes) { index->to_protobuf(pschema->add_indexes()); } + if (_row_binlog_index_schema != nullptr) { + _row_binlog_index_schema->to_protobuf(pschema->mutable_row_binlog_index_schema()); + } } std::string OlapTableSchemaParam::debug_string() const { diff --git a/be/src/storage/tablet_info.h b/be/src/storage/tablet_info.h index 0982df56b8c526..70d187ebac98f0 100644 --- a/be/src/storage/tablet_info.h +++ b/be/src/storage/tablet_info.h @@ -56,6 +56,7 @@ class TupleDescriptor; struct OlapTableIndexSchema { int64_t index_id; + int64_t row_binlog_id = 0; std::vector slots; int32_t schema_hash; std::vector columns; @@ -79,6 +80,7 @@ class OlapTableSchemaParam { TupleDescriptor* tuple_desc() const { return _tuple_desc; } const std::vector& indexes() const { return _indexes; } + const OlapTableIndexSchema* row_binlog_index_schema() const { return _row_binlog_index_schema; } void to_protobuf(POlapTableSchemaParam* pschema) const; @@ -131,6 +133,7 @@ class OlapTableSchemaParam { TupleDescriptor* _tuple_desc = nullptr; mutable POlapTableSchemaParam* _proto_schema = nullptr; std::vector _indexes; + OlapTableIndexSchema* _row_binlog_index_schema = nullptr; mutable ObjectPool _obj_pool; UniqueKeyUpdateModePB _unique_key_update_mode {UniqueKeyUpdateModePB::UPSERT}; PartialUpdateNewRowPolicyPB _partial_update_new_row_policy { diff --git a/be/src/storage/task/engine_publish_version_task.cpp b/be/src/storage/task/engine_publish_version_task.cpp index 22d78f21f94403..240cef62d0edf1 100644 --- a/be/src/storage/task/engine_publish_version_task.cpp +++ b/be/src/storage/task/engine_publish_version_task.cpp @@ -154,8 +154,9 @@ Status EnginePublishVersionTask::execute() { } map tablet_related_rs; - _engine.txn_manager()->get_txn_related_tablets(transaction_id, partition_id, - &tablet_related_rs); + map> tablet_related_attach_rowsets; + _engine.txn_manager()->get_txn_related_tablets( + 
transaction_id, partition_id, &tablet_related_rs, &tablet_related_attach_rowsets); Version version(par_ver_info.version, par_ver_info.version); @@ -169,6 +170,8 @@ Status EnginePublishVersionTask::execute() { for (auto& tablet_rs : tablet_related_rs) { TabletInfo tablet_info = tablet_rs.first; RowsetSharedPtr rowset = tablet_rs.second; + auto attach_rowsets_it = tablet_related_attach_rowsets.find(tablet_info); + DCHECK(attach_rowsets_it != tablet_related_attach_rowsets.end()); VLOG_CRITICAL << "begin to publish version on tablet. " << "tablet_id=" << tablet_info.tablet_id << ", version=" << version.first << ", transaction_id=" << transaction_id; @@ -247,8 +250,8 @@ Status EnginePublishVersionTask::execute() { } auto tablet_publish_txn_ptr = std::make_shared( - _engine, this, tablet, rowset, partition_id, transaction_id, version, - tablet_info, par_ver_info.commit_tso); + _engine, this, tablet, rowset, attach_rowsets_it->second, partition_id, + transaction_id, version, tablet_info, par_ver_info.commit_tso); tablet_tasks.push_back(tablet_publish_txn_ptr); auto submit_st = token->submit_func([=]() { tablet_publish_txn_ptr->handle(); }); #ifndef NDEBUG @@ -404,16 +407,15 @@ void EnginePublishVersionTask::_calculate_tbl_num_delta_rows( } } -TabletPublishTxnTask::TabletPublishTxnTask(StorageEngine& engine, - EnginePublishVersionTask* engine_task, - TabletSharedPtr tablet, RowsetSharedPtr rowset, - int64_t partition_id, int64_t transaction_id, - Version version, const TabletInfo& tablet_info, - int64_t commit_tso) +TabletPublishTxnTask::TabletPublishTxnTask( + StorageEngine& engine, EnginePublishVersionTask* engine_task, TabletSharedPtr tablet, + RowsetSharedPtr rowset, std::vector attach_rowsets, int64_t partition_id, + int64_t transaction_id, Version version, const TabletInfo& tablet_info, int64_t commit_tso) : _engine(engine), _engine_publish_version_task(engine_task), _tablet(std::move(tablet)), _rowset(std::move(rowset)), + _attach_rowsets(std::move(attach_rowsets)), 
_partition_id(partition_id), _transaction_id(transaction_id), _version(version), @@ -431,6 +433,7 @@ TabletPublishTxnTask::~TabletPublishTxnTask() = default; Status publish_version_and_add_rowset(StorageEngine& engine, int64_t partition_id, const TabletSharedPtr& tablet, const RowsetSharedPtr& rowset, + const std::vector& attach_rowsets, int64_t transaction_id, const Version& version, EnginePublishVersionTask* engine_publish_version_task, TabletPublishStatistics& stats, int64_t commit_tso) { @@ -456,7 +459,12 @@ Status publish_version_and_add_rowset(StorageEngine& engine, int64_t partition_i // Add visible rowset to tablet int64_t start_time = MonotonicMicros(); - result = tablet->add_inc_rowset(rowset); + RowsetSharedPtr row_binlog_rowset; + DCHECK_LE(attach_rowsets.size(), 1); + if (!attach_rowsets.empty()) { + row_binlog_rowset = attach_rowsets[0]; + } + result = tablet->add_inc_rowset(rowset, row_binlog_rowset); DBUG_EXECUTE_IF("EnginePublishVersionTask.handle.after_add_inc_rowset_rowsets_block", DBUG_BLOCK); stats.add_inc_rowset_us = MonotonicMicros() - start_time; @@ -489,7 +497,7 @@ void TabletPublishTxnTask::handle() { } _stats.schedule_time_us = MonotonicMicros() - _stats.submit_time_us; _result = publish_version_and_add_rowset(_engine, _partition_id, _tablet, _rowset, - _transaction_id, _version, + _attach_rowsets, _transaction_id, _version, _engine_publish_version_task, _stats, _commit_tso); if (!_result.ok()) { @@ -520,18 +528,22 @@ void AsyncTabletPublishTask::handle() { std::lock_guard wrlock(_tablet->get_rowset_update_lock()); _stats.schedule_time_us = MonotonicMicros() - _stats.submit_time_us; std::map tablet_related_rs; - _engine.txn_manager()->get_txn_related_tablets(_transaction_id, _partition_id, - &tablet_related_rs); + std::map> tablet_related_attach_rowsets; + _engine.txn_manager()->get_txn_related_tablets( + _transaction_id, _partition_id, &tablet_related_rs, &tablet_related_attach_rowsets); auto iter = 
tablet_related_rs.find(TabletInfo(_tablet->tablet_id(), _tablet->tablet_uid())); if (iter == tablet_related_rs.end()) { return; } + auto attach_rowsets_it = tablet_related_attach_rowsets.find( + TabletInfo(_tablet->tablet_id(), _tablet->tablet_uid())); + DCHECK(attach_rowsets_it != tablet_related_attach_rowsets.end()); RowsetSharedPtr rowset = iter->second; Version version(_version, _version); - auto publish_status = - publish_version_and_add_rowset(_engine, _partition_id, _tablet, rowset, _transaction_id, - version, nullptr, _stats, _commit_tso); + auto publish_status = publish_version_and_add_rowset(_engine, _partition_id, _tablet, rowset, + attach_rowsets_it->second, _transaction_id, + version, nullptr, _stats, _commit_tso); if (!publish_status.ok()) { return; diff --git a/be/src/storage/task/engine_publish_version_task.h b/be/src/storage/task/engine_publish_version_task.h index ba1f70588ae481..e2edb36db17fdc 100644 --- a/be/src/storage/task/engine_publish_version_task.h +++ b/be/src/storage/task/engine_publish_version_task.h @@ -64,7 +64,8 @@ struct TabletPublishStatistics { class TabletPublishTxnTask { public: TabletPublishTxnTask(StorageEngine& engine, EnginePublishVersionTask* engine_task, - TabletSharedPtr tablet, RowsetSharedPtr rowset, int64_t partition_id, + TabletSharedPtr tablet, RowsetSharedPtr rowset, + std::vector attach_rowsets, int64_t partition_id, int64_t transaction_id, Version version, const TabletInfo& tablet_info, int64_t commit_tso); ~TabletPublishTxnTask(); @@ -78,6 +79,7 @@ class TabletPublishTxnTask { TabletSharedPtr _tablet; RowsetSharedPtr _rowset; + std::vector _attach_rowsets; int64_t _partition_id; int64_t _transaction_id; Version _version; diff --git a/be/src/storage/txn/txn_manager.cpp b/be/src/storage/txn/txn_manager.cpp index d70141d24d55b7..21f8ebb4a4e013 100644 --- a/be/src/storage/txn/txn_manager.cpp +++ b/be/src/storage/txn/txn_manager.cpp @@ -31,11 +31,13 @@ #include #include #include +#include #include "common/config.h" 
#include "common/logging.h" #include "common/status.h" #include "load/delta_writer/delta_writer.h" +#include "storage/binlog.h" #include "storage/data_dir.h" #include "storage/olap_common.h" #include "storage/partial_update_info.h" @@ -191,10 +193,11 @@ Status TxnManager::commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, - std::shared_ptr partial_update_info) { + std::shared_ptr partial_update_info, + std::vector* attach_rowsets) { return commit_txn(tablet.data_dir()->get_meta(), partition_id, transaction_id, tablet.tablet_id(), tablet.tablet_uid(), load_id, rowset_ptr, - std::move(guard), is_recovery, partial_update_info); + std::move(guard), is_recovery, partial_update_info, attach_rowsets); } Status TxnManager::publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, @@ -281,7 +284,8 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, TabletUid tablet_uid, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, - std::shared_ptr partial_update_info) { + std::shared_ptr partial_update_info, + std::vector* attach_rowsets) { if (partition_id < 1 || transaction_id < 1 || tablet_id < 1) { LOG(WARNING) << "invalid commit req " << " partition_id=" << partition_id << " transaction_id=" << transaction_id @@ -375,9 +379,23 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, // save meta need access disk, it maybe very slow, so that it is not in global txn lock // it is under a single txn lock if (!is_recovery) { + std::optional binlog_format; + std::map attach_rowset_map; + if (attach_rowsets != nullptr) { + for (const auto& rs : *attach_rowsets) { + if (rs == nullptr) { + continue; + } + attach_rowset_map.emplace(rs->rowset_id(), rs->rowset_meta()->get_rowset_pb()); + } + if (!attach_rowset_map.empty()) { + 
binlog_format = BinlogFormatPB::ROW; + } + } Status save_status = RowsetMetaManager::save(meta, tablet_uid, rowset_ptr->rowset_id(), - rowset_ptr->rowset_meta()->get_rowset_pb(), false); + rowset_ptr->rowset_meta()->get_rowset_pb(), binlog_format, + attach_rowset_map.empty() ? nullptr : &attach_rowset_map); DBUG_EXECUTE_IF("TxnManager.RowsetMetaManager.save_wait", { if (auto wait = dp->param("duration", 0); wait > 0) { LOG_WARNING("TxnManager.RowsetMetaManager.save_wait") @@ -426,6 +444,9 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, { std::lock_guard wrlock(_get_txn_map_lock(transaction_id)); auto load_info = std::make_shared(load_id, rowset_ptr); + if (attach_rowsets != nullptr) { + load_info->attach_rowsets = *attach_rowsets; + } load_info->pending_rs_guard = std::move(guard); if (is_recovery) { if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { @@ -441,6 +462,15 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, } } } + + // For binlog txn (attached rowsets exist), binlog_delvec is only needed for publish + // phase to copy delete bitmap deltas. + if (attach_rowsets != nullptr && !attach_rowsets->empty()) { + TabletSharedPtr t = _engine.tablet_manager()->get_tablet(tablet_id, tablet_uid); + if (t != nullptr && t->enable_unique_key_merge_on_write()) { + load_info->binlog_delvec.reset(new DeleteBitmap(t->tablet_id())); + } + } load_info->commit(); txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id); @@ -523,6 +553,14 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, // it maybe a fatal error rowset->make_visible(version, commit_tso); + // Make all attached rowsets visible together. 
+ for (const auto& rs : tablet_txn_info->attach_rowsets) { + if (rs == nullptr) { + continue; + } + rs->make_visible(version, commit_tso); + } + DBUG_EXECUTE_IF("TxnManager.publish_txn.random_failed_after_save_rs_meta", { if (rand() % 100 < (100 * dp->param("percent", 0.5))) { LOG_WARNING("TxnManager.publish_txn.random_failed_after_save_rs_meta") @@ -560,13 +598,14 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, stats->calc_delete_bitmap_time_us = t3 - t2; RETURN_IF_ERROR(TabletMetaManager::save_delete_bitmap( tablet->data_dir(), tablet->tablet_id(), tablet_txn_info->delete_bitmap, - version.second)); + tablet_txn_info->binlog_delvec, version.second)); stats->save_meta_time_us = MonotonicMicros() - t3; } /// Step 3: add to binlog - auto enable_binlog = tablet->enable_ccr_binlog(); - if (enable_binlog) { + std::optional binlog_format; + if (tablet->enable_ccr_binlog()) { + binlog_format = BinlogFormatPB::STATEMENT_AND_SNAPSHOT; auto status = rowset->add_to_binlog(); if (!status.ok()) { return Status::Error( @@ -577,10 +616,24 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, } } + std::map attach_rowset_map; + if (tablet_txn_info != nullptr) { + for (const auto& rs : tablet_txn_info->attach_rowsets) { + if (rs == nullptr) { + continue; + } + attach_rowset_map.emplace(rs->rowset_id(), rs->rowset_meta()->get_rowset_pb()); + } + if (!attach_rowset_map.empty()) { + binlog_format = BinlogFormatPB::ROW; + } + } + /// Step 4: save meta int64_t t5 = MonotonicMicros(); auto status = RowsetMetaManager::save(meta, tablet_uid, rowset->rowset_id(), - rowset->rowset_meta()->get_rowset_pb(), enable_binlog); + rowset->rowset_meta()->get_rowset_pb(), binlog_format, + attach_rowset_map.empty() ? 
nullptr : &attach_rowset_map); stats->save_meta_time_us += MonotonicMicros() - t5; if (!status.ok()) { status.append(fmt::format(", txn id: {}", transaction_id)); @@ -602,7 +655,7 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, } // TODO(Drogon): remove these test codes - if (enable_binlog) { + if (tablet->enable_ccr_binlog()) { auto version_str = fmt::format("{}", version.first); VLOG_DEBUG << fmt::format("tabletid: {}, version: {}, binlog filepath: {}", tablet_id, version_str, tablet->get_binlog_filepath(version_str)); @@ -721,6 +774,11 @@ Status TxnManager::delete_txn(OlapMeta* meta, TPartitionId partition_id, rowset->rowset_id().to_string(), rowset->version().to_string(), RowsetStatePB_Name(rowset->rowset_meta_state())); } else { + for (const auto& attach_rowset : load_info->attach_rowsets) { + static_cast(RowsetMetaManager::remove_row_binlog( + meta, tablet_uid, rowset->rowset_id(), attach_rowset->rowset_id())); + _engine.add_unused_rowset(attach_rowset); + } static_cast(RowsetMetaManager::remove(meta, tablet_uid, rowset->rowset_id())); #ifndef BE_TEST _engine.add_unused_rowset(rowset); @@ -728,7 +786,11 @@ Status TxnManager::delete_txn(OlapMeta* meta, TPartitionId partition_id, VLOG_NOTICE << "delete transaction from engine successfully." << " partition_id: " << key.first << ", transaction_id: " << key.second << ", tablet: " << tablet_info.to_string() << ", rowset: " - << (rowset != nullptr ? rowset->rowset_id().to_string() : "0"); + << (rowset != nullptr ? rowset->rowset_id().to_string() : "0") + << ", binlog rowset: " + << (load_info->attach_rowsets.empty() + ? 
"0" + : load_info->attach_rowsets[0]->rowset_id().to_string()); } } it->second.erase(load_itr); @@ -783,6 +845,16 @@ void TxnManager::force_rollback_tablet_related_txns(OlapMeta* meta, TTabletId ta LOG(INFO) << " delete transaction from engine " << ", tablet: " << tablet_info.to_string() << ", rowset id: " << rowset->rowset_id(); + // clean attach rowset first + for (const auto& attach_rowset : load_info->attach_rowsets) { + Status status = RowsetMetaManager::remove_row_binlog( + meta, tablet_uid, rowset->rowset_id(), attach_rowset->rowset_id()); + if (!status.ok()) { + if (status.is()) { + continue; + } + } + } static_cast( RowsetMetaManager::remove(meta, tablet_uid, rowset->rowset_id())); } @@ -790,7 +862,11 @@ void TxnManager::force_rollback_tablet_related_txns(OlapMeta* meta, TTabletId ta << " partition_id: " << it->first.first << ", transaction_id: " << it->first.second << ", tablet: " << tablet_info.to_string() << ", rowset: " - << (rowset != nullptr ? rowset->rowset_id().to_string() : "0"); + << (rowset != nullptr ? rowset->rowset_id().to_string() : "0") + << ", binlog rowset: " + << (load_info->attach_rowsets.empty() + ? 
"0" + : load_info->attach_rowsets[0]->rowset_id().to_string()); it->second.erase(load_itr); } if (it->second.empty()) { @@ -811,9 +887,10 @@ void TxnManager::force_rollback_tablet_related_txns(OlapMeta* meta, TTabletId ta } } -void TxnManager::get_txn_related_tablets(const TTransactionId transaction_id, - TPartitionId partition_id, - std::map* tablet_infos) { +void TxnManager::get_txn_related_tablets( + const TTransactionId transaction_id, TPartitionId partition_id, + std::map* tablet_infos, + std::map>* tablet_attach_rowsets) { // get tablets in this transaction pair key(partition_id, transaction_id); std::shared_lock txn_rdlock(_get_txn_map_lock(transaction_id)); @@ -832,6 +909,9 @@ void TxnManager::get_txn_related_tablets(const TTransactionId transaction_id, // must not check rowset == null here, because if rowset == null // publish version should failed tablet_infos->emplace(tablet_info, load_info.second->rowset); + if (tablet_attach_rowsets != nullptr) { + tablet_attach_rowsets->emplace(tablet_info, load_info.second->attach_rowsets); + } } } diff --git a/be/src/storage/txn/txn_manager.h b/be/src/storage/txn/txn_manager.h index c7d9fc23796869..d7d3404822455a 100644 --- a/be/src/storage/txn/txn_manager.h +++ b/be/src/storage/txn/txn_manager.h @@ -73,9 +73,14 @@ struct TxnPublishInfo { struct TabletTxnInfo { PUniqueId load_id; RowsetSharedPtr rowset; + // The list of rowsets committed along with the transaction rowset + // currently contains only the binlog rowset. 
+ std::vector attach_rowsets; PendingRowsetGuard pending_rs_guard; bool unique_key_merge_on_write {false}; DeleteBitmapPtr delete_bitmap; + // copy delete_bitmap of data rowset to binlog + DeleteBitmapPtr binlog_delvec; // records rowsets calc in commit txn RowsetIdUnorderedSet rowset_ids; int64_t creation_time; @@ -168,7 +173,8 @@ class TxnManager { Status commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, - std::shared_ptr partial_update_info = nullptr); + std::shared_ptr partial_update_info = nullptr, + std::vector* attach_rowsets = nullptr); Status publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, TTransactionId transaction_id, const Version& version, @@ -186,7 +192,8 @@ class TxnManager { Status commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, - std::shared_ptr partial_update_info = nullptr); + std::shared_ptr partial_update_info = nullptr, + std::vector* attach_rowsets = nullptr); // remove a txn from txn manager // not persist rowset meta because @@ -213,8 +220,10 @@ class TxnManager { void get_tablet_related_txns(TTabletId tablet_id, TabletUid tablet_uid, int64_t* partition_id, std::set* transaction_ids); - void get_txn_related_tablets(const TTransactionId transaction_id, TPartitionId partition_ids, - std::map* tablet_infos); + void get_txn_related_tablets( + const TTransactionId transaction_id, TPartitionId partition_ids, + std::map* tablet_infos, + std::map>* tablet_attach_rowsets = nullptr); void get_all_related_tablets(std::set* tablet_infos); diff --git a/be/src/util/thrift_util.cpp b/be/src/util/thrift_util.cpp index 015b5382cfc21d..b0d6d3b3093b45 100644 --- a/be/src/util/thrift_util.cpp +++ 
b/be/src/util/thrift_util.cpp @@ -179,4 +179,17 @@ bool _has_inverted_index_v1_or_partial_update(TOlapTableSink sink) { return false; } +bool _has_row_binlog(const TOlapTableSink& sink) { + OlapTableSchemaParam schema; + if (!schema.init(sink.schema).ok()) { + return false; + } + for (const auto* index_schema : schema.indexes()) { + if (index_schema->row_binlog_id > 0) { + return true; + } + } + return false; +} + } // namespace doris diff --git a/be/src/util/thrift_util.h b/be/src/util/thrift_util.h index a7d6620d5d31f3..b4060c23fe0939 100644 --- a/be/src/util/thrift_util.h +++ b/be/src/util/thrift_util.h @@ -178,5 +178,6 @@ bool t_network_address_comparator(const TNetworkAddress& a, const TNetworkAddres PURE std::string to_string(const TUniqueId& id); PURE bool _has_inverted_index_v1_or_partial_update(TOlapTableSink sink); +PURE bool _has_row_binlog(const TOlapTableSink& sink); } // namespace doris diff --git a/be/test/load/memtable/memtable_flush_executor_test.cpp b/be/test/load/memtable/memtable_flush_executor_test.cpp index 8b5d2990c06cc4..eacf2cadcb9a9c 100644 --- a/be/test/load/memtable/memtable_flush_executor_test.cpp +++ b/be/test/load/memtable/memtable_flush_executor_test.cpp @@ -22,29 +22,261 @@ #include #include #include +#include +#include +#include #include #include #include "common/config.h" +#include "exec/sink/autoinc_buffer.h" #include "io/fs/local_file_system.h" #include "load/delta_writer/delta_writer.h" #include "load/memtable/memtable.h" #include "runtime/descriptor_helper.h" +#include "runtime/descriptors.h" #include "runtime/exec_env.h" +#include "runtime/thread_context.h" #include "storage/field.h" #include "storage/options.h" +#include "storage/rowset/group_rowset_writer.h" +#include "storage/rowset/rowset_writer.h" #include "storage/schema.h" #include "storage/storage_engine.h" #include "storage/tablet/tablet.h" +#include "storage/tablet/tablet_manager.h" #include "storage/tablet/tablet_meta_manager.h" #include "storage/utils.h" +#include 
"testutil/creators.h" namespace doris { +namespace { + +class MockRowsetWriter final : public RowsetWriter { +public: + explicit MockRowsetWriter(std::atomic* flush_cnt, bool fail_on_flush = false, + const std::string& flush_error_msg = "mock flush failed", + int flush_delay_ms = 0) + : _flush_cnt(flush_cnt), + _fail_on_flush(fail_on_flush), + _flush_error_msg(flush_error_msg), + _flush_delay_ms(flush_delay_ms) {} + + Status init(const RowsetWriterContext& ctx) override { + _context = ctx; + return Status::OK(); + } + + Status add_rowset(RowsetSharedPtr) override { return Status::OK(); } + + Status add_rowset_for_linked_schema_change(RowsetSharedPtr) override { return Status::OK(); } + + Status flush() override { return Status::OK(); } + + Status flush_memtable(Block* block, int32_t segment_id, int64_t* flush_size) override { + EXPECT_GT(block->rows(), 0); + if (_flush_delay_ms > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(_flush_delay_ms)); + } + _last_segment_id = segment_id; + ++(*_flush_cnt); + *flush_size = 1; + if (_fail_on_flush) { + return Status::InternalError(_flush_error_msg); + } + return Status::OK(); + } + + Status build(RowsetSharedPtr& rowset) override { + rowset = nullptr; + return Status::OK(); + } + + RowsetSharedPtr manual_build(const RowsetMetaSharedPtr&) override { return nullptr; } + + PUniqueId load_id() override { return _context.load_id; } + + Version version() override { return _context.version; } + + int64_t num_rows() const override { return 0; } + int64_t num_rows_updated() const override { return 0; } + int64_t num_rows_deleted() const override { return 0; } + int64_t num_rows_new_added() const override { return 0; } + int64_t num_rows_filtered() const override { return 0; } + RowsetId rowset_id() override { return _context.rowset_id; } + RowsetTypePB type() const override { return BETA_ROWSET; } + int32_t allocate_segment_id() override { return _next_segment_id++; } + int32_t get_allocated_segment_id() override { 
return _next_segment_id; } + std::shared_ptr get_partial_update_info() override { return nullptr; } + bool is_partial_update() override { return false; } + + int32_t last_segment_id() const { return _last_segment_id; } + +private: + std::atomic* _flush_cnt; + bool _fail_on_flush; + std::string _flush_error_msg; + int _flush_delay_ms; + int32_t _next_segment_id = 0; + int32_t _last_segment_id = -1; +}; + +struct GroupFlushTestContext { + TCreateTabletReq request; + TDescriptorTable tdesc_tbl; + TabletSharedPtr tablet; + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + TupleDescriptor* tuple_desc = nullptr; + std::shared_ptr memtable; +}; + +class MemTableFlushExecutorGroupFlushTest : public testing::Test { +protected: + void SetUp() override { + char buffer[1024]; + ASSERT_NE(getcwd(buffer, 1024), nullptr); + config::storage_root_path = std::string(buffer) + "/flush_test"; + auto st = io::global_local_filesystem()->delete_directory(config::storage_root_path); + ASSERT_TRUE(st.ok()) << st; + st = io::global_local_filesystem()->create_directory(config::storage_root_path); + ASSERT_TRUE(st.ok()) << st; + + std::vector paths; + paths.emplace_back(config::storage_root_path, -1); + + doris::EngineOptions options; + options.store_paths = paths; + auto engine = std::make_unique(options); + Status s = engine->open(); + ASSERT_TRUE(s.ok()) << s.to_string(); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + ExecEnv::GetInstance()->set_storage_engine(nullptr); + EXPECT_EQ(system("rm -rf ./flush_test"), 0); + EXPECT_TRUE( + io::global_local_filesystem() + ->delete_directory(std::string(getenv("DORIS_HOME")) + "/" + UNUSED_PREFIX) + .ok()); + } + + StorageEngine* storage_engine() { return &ExecEnv::GetInstance()->storage_engine().to_local(); } + + std::shared_ptr create_group_flush_table_schema_param( + const GroupFlushTestContext& ctx) { + auto table_schema_param = testutil::create_table_schema_param( + 
ctx.tdesc_tbl, ctx.request.tablet_id, ctx.request.tablet_schema.schema_hash, + ctx.request.tablet_schema.columns, ctx.request.tablet_id + 1, + ctx.request.tablet_schema.schema_hash + 1, &ctx.request.tablet_schema.columns); + EXPECT_NE(table_schema_param, nullptr); + return table_schema_param; + } + + Status create_group_rowset_writer(const GroupFlushTestContext& ctx, int64_t load_id, + const std::shared_ptr& data_writer, + const std::shared_ptr& binlog_writer, + std::shared_ptr* group_writer) { + RowsetWriterContext data_ctx; + data_ctx.tablet_schema = ctx.tablet->tablet_schema(); + data_ctx.load_id.set_hi(load_id); + data_ctx.load_id.set_lo(load_id); + RETURN_IF_ERROR(data_writer->init(data_ctx)); + + RowsetWriterContext binlog_ctx = data_ctx; + binlog_ctx.write_binlog_opt().enable = true; + RETURN_IF_ERROR(binlog_writer->init(binlog_ctx)); + + auto writer = std::make_shared(); + writer->set_data_writer(data_writer); + writer->set_row_binlog_writer(binlog_writer); + RETURN_IF_ERROR(writer->init(data_ctx)); + *group_writer = std::move(writer); + return Status::OK(); + } + + Status create_group_flush_token(const GroupFlushTestContext& ctx, + const std::shared_ptr& group_writer, + int64_t start_lsn, std::shared_ptr* flush_token, + ThreadPool* pool = nullptr) { + auto table_schema_param = create_group_flush_table_schema_param(ctx); + if (table_schema_param == nullptr) { + return Status::InternalError("failed to create group flush table schema param"); + } + + if (pool == nullptr) { + RETURN_IF_ERROR(storage_engine()->memtable_flush_executor()->create_flush_token( + *flush_token, group_writer, false, nullptr, table_schema_param)); + } else { + auto token = FlushToken::create_shared(pool, nullptr); + token->set_rowset_writer(group_writer); + token->set_table_schema_param(table_schema_param); + *flush_token = std::move(token); + } + + auto row_binlog_lsn_buffer = AutoIncIDBuffer::create_shared( + table_schema_param->db_id(), table_schema_param->table_id(), 
kBinlogLsnAutoIncId); + row_binlog_lsn_buffer->append_range_for_test(start_lsn, ctx.memtable->raw_rows()); + (*flush_token)->set_row_binlog_lsn_buffer_for_test(row_binlog_lsn_buffer); + return Status::OK(); + } + + void prepare_group_flush_test_context(int64_t tablet_id, int32_t schema_hash, + GroupFlushTestContext* ctx) { + ctx->request = testutil::create_tablet_request(tablet_id, schema_hash, 30002, 3, + TKeysType::UNIQUE_KEYS, + {{"k1", TPrimitiveType::TINYINT, true}, + {"k2", TPrimitiveType::SMALLINT, true}, + {"k3", TPrimitiveType::INT, true}}); + ctx->request.__set_enable_unique_key_merge_on_write(true); + testutil::enable_row_binlog(&ctx->request); + auto profile = std::make_unique("CreateTablet"); + ASSERT_TRUE(storage_engine()->create_tablet(ctx->request, profile.get()).ok()); + + ctx->tablet = storage_engine()->tablet_manager()->get_tablet(ctx->request.tablet_id); + ASSERT_NE(ctx->tablet, nullptr); + + ctx->tdesc_tbl = testutil::create_descriptor_table({{TYPE_TINYINT, "k1", false}, + {TYPE_SMALLINT, "k2", false}, + {TYPE_INT, "k3", false}}); + ASSERT_TRUE(DescriptorTbl::create(&ctx->obj_pool, ctx->tdesc_tbl, &ctx->desc_tbl).ok()); + ctx->tuple_desc = ctx->desc_tbl->get_tuple_descriptor(0); + ASSERT_NE(ctx->tuple_desc, nullptr); + + ctx->memtable = std::make_shared( + ctx->request.tablet_id, ctx->tablet->tablet_schema(), &ctx->tuple_desc->slots(), + ctx->tuple_desc, false, nullptr, thread_context()->resource_ctx()); + Block block; + for (const auto& slot : ctx->tuple_desc->slots()) { + block.insert(ColumnWithTypeAndName(slot->get_empty_mutable_column(), slot->type(), + slot->col_name())); + } + auto cols = block.mutate_columns(); + int8_t k1 = -127; + int16_t k2 = -32767; + int32_t k3 = -2147483647; + cols[0]->insert_data((const char*)&k1, sizeof(k1)); + cols[1]->insert_data((const char*)&k2, sizeof(k2)); + cols[2]->insert_data((const char*)&k3, sizeof(k3)); + ASSERT_TRUE(ctx->memtable->insert(&block, {0}).ok()); + } + + void drop_tablet(const 
TCreateTabletReq& request) { + EXPECT_TRUE(storage_engine() + ->tablet_manager() + ->drop_tablet(request.tablet_id, request.replica_id, false) + .ok()); + } +}; + +} // namespace + void set_up() { char buffer[1024]; - getcwd(buffer, 1024); + ASSERT_NE(getcwd(buffer, 1024), nullptr); config::storage_root_path = std::string(buffer) + "/flush_test"; auto st = io::global_local_filesystem()->delete_directory(config::storage_root_path); ASSERT_TRUE(st.ok()) << st; @@ -207,4 +439,104 @@ TEST(MemTableFlushExecutorTest, TestThreadPoolMinMaxRelationship) { tear_down(); } +TEST_F(MemTableFlushExecutorGroupFlushTest, TestGroupFlushToken) { + SCOPED_INIT_THREAD_CONTEXT(); + + { + GroupFlushTestContext ctx; + prepare_group_flush_test_context(10001, 270068373, &ctx); + + std::atomic data_flush_cnt = 0; + std::atomic binlog_flush_cnt = 0; + auto data_writer = std::make_shared(&data_flush_cnt); + auto binlog_writer = std::make_shared(&binlog_flush_cnt); + std::shared_ptr group_writer; + ASSERT_TRUE( + create_group_rowset_writer(ctx, 1, data_writer, binlog_writer, &group_writer).ok()); + + std::shared_ptr flush_token; + ASSERT_TRUE(create_group_flush_token(ctx, group_writer, 1000, &flush_token).ok()); + ASSERT_TRUE(flush_token->submit(ctx.memtable).ok()); + ASSERT_TRUE(flush_token->wait().ok()); + EXPECT_EQ(1, data_flush_cnt.load()); + EXPECT_EQ(1, binlog_flush_cnt.load()); + EXPECT_EQ(data_writer->last_segment_id(), binlog_writer->last_segment_id()); + auto seg_lsn = + binlog_writer->context().write_binlog_opt().write_binlog_config().get_seg_lsn( + binlog_writer->last_segment_id()); + ASSERT_NE(seg_lsn, nullptr); + ASSERT_EQ(ctx.memtable->raw_rows(), seg_lsn->size()); + EXPECT_EQ(1000, (*seg_lsn)[0]); + EXPECT_EQ(2, flush_token->get_stats().flush_finish_count.load()); + EXPECT_EQ(0, flush_token->get_stats().flush_submit_count.load()); + + drop_tablet(ctx.request); + } + + { + GroupFlushTestContext ctx; + prepare_group_flush_test_context(10002, 270068374, &ctx); + + std::atomic 
data_flush_cnt = 0; + std::atomic binlog_flush_cnt = 0; + auto data_writer = std::make_shared(&data_flush_cnt); + auto binlog_writer = + std::make_shared(&binlog_flush_cnt, true, "binlog flush failed"); + std::shared_ptr group_writer; + ASSERT_TRUE( + create_group_rowset_writer(ctx, 2, data_writer, binlog_writer, &group_writer).ok()); + + std::shared_ptr flush_token; + ASSERT_TRUE(create_group_flush_token(ctx, group_writer, 2000, &flush_token).ok()); + ASSERT_TRUE(flush_token->submit(ctx.memtable).ok()); + + Status wait_st = flush_token->wait(); + EXPECT_FALSE(wait_st.ok()); + EXPECT_NE(wait_st.to_string().find("binlog flush failed"), std::string::npos); + EXPECT_EQ(1, binlog_flush_cnt.load()); + // Data and binlog flush tasks run concurrently. If binlog fails first, + // data flush may be skipped by the failed flush status. + EXPECT_LE(flush_token->get_stats().flush_finish_count.load(), 1); + EXPECT_EQ(0, flush_token->get_stats().flush_submit_count.load()); + + drop_tablet(ctx.request); + } +} + +TEST_F(MemTableFlushExecutorGroupFlushTest, TestGroupFlushTokenPartialSuccess) { + SCOPED_INIT_THREAD_CONTEXT(); + + GroupFlushTestContext ctx; + prepare_group_flush_test_context(10003, 270068375, &ctx); + + std::atomic data_flush_cnt = 0; + std::atomic binlog_flush_cnt = 0; + auto data_writer = std::make_shared(&data_flush_cnt); + auto binlog_writer = + std::make_shared(&binlog_flush_cnt, true, "binlog flush failed", 100); + std::shared_ptr group_writer; + ASSERT_TRUE(create_group_rowset_writer(ctx, 3, data_writer, binlog_writer, &group_writer).ok()); + + std::unique_ptr pool; + ASSERT_TRUE(ThreadPoolBuilder("MemTableGroupFlushTestPool") + .set_min_threads(2) + .set_max_threads(2) + .build(&pool) + .ok()); + + std::shared_ptr flush_token; + ASSERT_TRUE(create_group_flush_token(ctx, group_writer, 3000, &flush_token, pool.get()).ok()); + ASSERT_TRUE(flush_token->submit(ctx.memtable).ok()); + + Status wait_st = flush_token->wait(); + EXPECT_FALSE(wait_st.ok()); + 
EXPECT_NE(wait_st.to_string().find("binlog flush failed"), std::string::npos); + EXPECT_EQ(1, data_flush_cnt.load()); + EXPECT_EQ(1, binlog_flush_cnt.load()); + EXPECT_EQ(1, flush_token->get_stats().flush_finish_count.load()); + EXPECT_EQ(0, flush_token->get_stats().flush_submit_count.load()); + + drop_tablet(ctx.request); +} + } // namespace doris diff --git a/be/test/olap/rowset/group_rowset_builder_test.cpp b/be/test/olap/rowset/group_rowset_builder_test.cpp index a55278782b6def..fe6d1300fa0746 100644 --- a/be/test/olap/rowset/group_rowset_builder_test.cpp +++ b/be/test/olap/rowset/group_rowset_builder_test.cpp @@ -32,12 +32,15 @@ #include "runtime/descriptor_helper.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" +#include "storage/binlog.h" #include "storage/data_dir.h" #include "storage/rowset_builder.h" +#include "storage/segment/segment_writer.h" #include "storage/storage_engine.h" #include "storage/tablet/tablet.h" #include "storage/tablet/tablet_manager.h" #include "storage/tablet_info.h" +#include "testutil/creators.h" namespace doris { @@ -77,86 +80,6 @@ static void tear_down() { std::string(getenv("DORIS_HOME")) + "/" + UNUSED_PREFIX)); } -static void create_tablet_request(int64_t tablet_id, int32_t schema_hash, - TCreateTabletReq* request) { - request->tablet_id = tablet_id; - request->__set_version(1); - request->partition_id = 10001; - request->tablet_schema.schema_hash = schema_hash; - request->tablet_schema.short_key_column_count = 1; - request->tablet_schema.keys_type = TKeysType::AGG_KEYS; - request->tablet_schema.storage_type = TStorageType::COLUMN; - request->__set_storage_format(TStorageFormat::V2); - - TColumn k1; - k1.column_name = "k1"; - k1.__set_is_key(true); - k1.column_type.type = TPrimitiveType::INT; - request->tablet_schema.columns.push_back(k1); - - TColumn v1; - v1.column_name = "v1"; - v1.__set_is_key(false); - v1.column_type.type = TPrimitiveType::INT; - v1.__set_aggregation_type(TAggregationType::SUM); - 
request->tablet_schema.columns.push_back(v1); -} - -static void create_tablet_request_with_row_binlog(int64_t tablet_id, int32_t schema_hash, - TCreateTabletReq* request) { - create_tablet_request(tablet_id, schema_hash, request); - TBinlogConfig binlog_config; - binlog_config.__set_enable(true); - binlog_config.__set_binlog_format(TBinlogFormat::ROW); - request->__set_binlog_config(binlog_config); - TTabletSchema row_binlog_schema = request->tablet_schema; - row_binlog_schema.schema_hash = schema_hash + 1; - request->__set_row_binlog_schema(row_binlog_schema); -} - -static TDescriptorTable create_descriptor_tablet() { - TDescriptorTableBuilder dtb; - TTupleDescriptorBuilder tuple_builder; - - tuple_builder.add_slot(TSlotDescriptorBuilder() - .type(TYPE_INT) - .column_name("k1") - .column_pos(0) - .nullable(false) - .build()); - tuple_builder.add_slot(TSlotDescriptorBuilder() - .type(TYPE_INT) - .column_name("v1") - .column_pos(1) - .nullable(false) - .build()); - tuple_builder.build(&dtb); - - return dtb.desc_tbl(); -} - -static std::shared_ptr create_table_schema_param( - const TDescriptorTable& tdesc_tbl, int64_t index_id, int32_t schema_hash, - const std::vector& columns) { - auto param = std::make_shared(); - TOlapTableSchemaParam tschema; - tschema.db_id = 1; - tschema.table_id = 2; - tschema.version = 0; - tschema.slot_descs = tdesc_tbl.slotDescriptors; - tschema.tuple_desc = tdesc_tbl.tupleDescriptors[0]; - tschema.indexes.resize(1); - tschema.indexes[0].id = index_id; - tschema.indexes[0].schema_hash = schema_hash; - tschema.indexes[0].columns_desc = columns; - for (const auto& col : columns) { - tschema.indexes[0].columns.push_back(col.column_name); - } - Status st = param->init(tschema); - EXPECT_TRUE(st.ok()) << st; - return param; -} - class GroupRowsetBuilderTest : public ::testing::Test { public: static void SetUpTestSuite() { set_up(); } @@ -165,8 +88,11 @@ class GroupRowsetBuilderTest : public ::testing::Test { TEST_F(GroupRowsetBuilderTest, 
buildWithRowBinlogMeta) { std::unique_ptr profile = std::make_unique("CreateTablet"); - TCreateTabletReq request; - create_tablet_request_with_row_binlog(10010, 270068390, &request); + auto request = testutil::create_tablet_request( + 10010, 270068390, 10001, 1, TKeysType::UNIQUE_KEYS, + {{"k1", TPrimitiveType::INT, true}, {"v1", TPrimitiveType::INT, false}}); + request.__set_enable_unique_key_merge_on_write(true); + testutil::enable_row_binlog(&request); Status res = engine_ref->create_tablet(request, profile.get()); ASSERT_TRUE(res.ok()); @@ -179,10 +105,15 @@ TEST_F(GroupRowsetBuilderTest, buildWithRowBinlogMeta) { load_id.set_hi(0); load_id.set_lo(0); const int64_t index_id = 10001; + const int64_t row_binlog_index_id = 10002; - TDescriptorTable tdesc_tbl = create_descriptor_tablet(); - auto param = create_table_schema_param(tdesc_tbl, index_id, request.tablet_schema.schema_hash, - request.tablet_schema.columns); + TDescriptorTable tdesc_tbl = + testutil::create_descriptor_table({{TYPE_INT, "k1", false}, {TYPE_INT, "v1", false}}); + auto param = testutil::create_table_schema_param( + tdesc_tbl, index_id, request.tablet_schema.schema_hash, request.tablet_schema.columns, + row_binlog_index_id, request.row_binlog_schema.schema_hash, + &request.row_binlog_schema.columns); + ASSERT_NE(param, nullptr); WriteRequest data_req; data_req.tablet_id = request.tablet_id; @@ -195,10 +126,14 @@ TEST_F(GroupRowsetBuilderTest, buildWithRowBinlogMeta) { data_req.write_req_type = WriteRequestType::DATA; WriteRequest row_binlog_req = data_req; + row_binlog_req.index_id = row_binlog_index_id; row_binlog_req.schema_hash = request.row_binlog_schema.schema_hash; row_binlog_req.write_req_type = WriteRequestType::ROW_BINLOG; - GroupRowsetBuilder builder(*engine_ref, data_req, row_binlog_req, profile.get()); + WriteRequest group_req = data_req; + group_req.write_req_type = WriteRequestType::GROUP; + + GroupRowsetBuilder builder(*engine_ref, group_req, data_req, row_binlog_req, 
profile.get()); ASSERT_TRUE(builder.init().ok()); ASSERT_TRUE(builder.rowset_writer()->flush().ok()); ASSERT_TRUE(builder.build_rowset().ok()); @@ -209,9 +144,12 @@ TEST_F(GroupRowsetBuilderTest, buildWithRowBinlogMeta) { ASSERT_FALSE(data_meta->is_row_binlog()); ASSERT_EQ(request.row_binlog_schema.schema_hash, row_binlog_meta->tablet_schema_hash()); ASSERT_EQ(request.tablet_schema.schema_hash, data_meta->tablet_schema_hash()); - ASSERT_EQ(index_id, row_binlog_meta->index_id()); + ASSERT_EQ(row_binlog_index_id, row_binlog_meta->index_id()); ASSERT_EQ(index_id, data_meta->index_id()); + // Row-binlog schema must contain LSN column so that RowBinlogSegmentWriter can locate it. + ASSERT_GE(row_binlog_meta->tablet_schema()->field_index(std::string(kRowBinlogLsnColName)), 0); + res = engine_ref->tablet_manager()->drop_tablet(request.tablet_id, request.replica_id, false); ASSERT_TRUE(res.ok()); } diff --git a/be/test/olap/rowset/group_rowset_writer_test.cpp b/be/test/olap/rowset/group_rowset_writer_test.cpp new file mode 100644 index 00000000000000..c1aa21c6b4547d --- /dev/null +++ b/be/test/olap/rowset/group_rowset_writer_test.cpp @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/rowset/group_rowset_writer.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "core/block/block.h" +#include "core/field.h" +#include "exec/sink/autoinc_buffer.h" +#include "io/fs/local_file_system.h" +#include "runtime/exec_env.h" +#include "runtime/runtime_profile.h" +#include "storage/binlog.h" +#include "storage/olap_define.h" +#include "storage/storage_engine.h" +#include "storage/tablet/tablet.h" +#include "storage/tablet/tablet_manager.h" +#include "testutil/creators.h" +#include "util/debug_points.h" + +namespace doris { + +namespace { + +constexpr uint32_t MAX_PATH_LEN = 1024; +constexpr std::string_view kStorageRootDir = "/ut_dir/group_rowset_writer_test"; + +} // namespace + +class GroupRowsetWriterTest : public testing::Test { +protected: + void SetUp() override { + char buffer[MAX_PATH_LEN]; + getcwd(buffer, MAX_PATH_LEN); + + _storage_root_path = std::string(buffer) + std::string(kStorageRootDir); + config::storage_root_path = _storage_root_path; + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_storage_root_path).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_storage_root_path).ok()); + + std::vector paths; + paths.emplace_back(_storage_root_path, -1); + EngineOptions options; + options.store_paths = paths; + auto engine = std::make_unique(options); + auto* engine_ptr = engine.get(); + ASSERT_TRUE(engine_ptr->open().ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + + _request = testutil::create_tablet_request( + 10010, 270068390, 10001, 1, TKeysType::UNIQUE_KEYS, + {{"k1", TPrimitiveType::INT, true}, {"v1", TPrimitiveType::INT, false}}); + _request.__set_enable_unique_key_merge_on_write(true); + testutil::enable_row_binlog(&_request); + auto profile = std::make_unique("GroupRowsetWriterTest"); + ASSERT_TRUE(engine_ptr->create_tablet(_request, profile.get()).ok()); + _tablet = 
engine_ptr->tablet_manager()->get_tablet(_request.tablet_id); + ASSERT_TRUE(_tablet != nullptr); + EXPECT_TRUE( + io::global_local_filesystem()->create_directory(_tablet->row_binlog_path()).ok()); + + config::enable_debug_points = true; + } + + void TearDown() override { + DebugPoints::instance()->clear(); + config::enable_debug_points = false; + _tablet.reset(); + ExecEnv::GetInstance()->set_storage_engine(nullptr); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_storage_root_path).ok()); + } + + Block create_block(int start_key, int num_rows) const { + Block block = _tablet->tablet_schema()->create_block(); + auto columns = block.mutate_columns(); + for (int i = 0; i < num_rows; ++i) { + columns[0]->insert(Field::create_field(start_key + i)); + columns[1]->insert(Field::create_field((start_key + i) * 10)); + } + block.set_columns(std::move(columns)); + return block; + } + + Status create_group_rowset_writer(std::unique_ptr* group_writer, + RowsetId* data_rowset_id, size_t num_rows) { + RowsetWriterContext data_context; + data_context.tablet = _tablet; + data_context.tablet_schema = _tablet->tablet_schema(); + data_context.rowset_state = PREPARED; + data_context.segments_overlap = OVERLAPPING; + data_context.max_rows_per_segment = 1024; + data_context.write_type = DataWriteType::TYPE_DIRECT; + data_context.is_transient_rowset_writer = true; + *data_rowset_id = ExecEnv::GetInstance()->storage_engine().next_rowset_id(); + auto data_writer_res = + _tablet->create_transient_rowset_writer(data_context, *data_rowset_id); + if (!data_writer_res.has_value()) { + return data_writer_res.error(); + } + + RowsetWriterContext row_binlog_context; + row_binlog_context.tablet = _tablet; + row_binlog_context.tablet_schema = _tablet->row_binlog_tablet_schema(); + row_binlog_context.rowset_state = PREPARED; + row_binlog_context.segments_overlap = NONOVERLAPPING; + row_binlog_context.max_rows_per_segment = 1024; + row_binlog_context.write_type = 
DataWriteType::TYPE_DIRECT; + row_binlog_context.is_transient_rowset_writer = true; + row_binlog_context.write_binlog_opt().enable = true; + auto& cfg = row_binlog_context.write_binlog_opt().write_binlog_config(); + cfg.source.tablet_schema = _tablet->tablet_schema(); + cfg.source.is_transient_rowset_writer = true; + cfg.source.source_write_type = DataWriteType::TYPE_DIRECT; + auto lsn_buffer = AutoIncIDBuffer::create_shared(1, 1, kBinlogLsnAutoIncId); + lsn_buffer->append_range_for_test(1000, num_rows); + std::shared_ptr> lsn_ids; + RETURN_IF_ERROR(allocate_binlog_lsn(lsn_buffer, num_rows, &lsn_ids)); + cfg.insert_seg_lsn(0, lsn_ids); + auto row_binlog_writer_res = _tablet->create_rowset_writer(row_binlog_context, false); + if (!row_binlog_writer_res.has_value()) { + return row_binlog_writer_res.error(); + } + + *group_writer = std::make_unique(); + (*group_writer) + ->set_data_writer( + std::shared_ptr(std::move(data_writer_res.value()))); + (*group_writer) + ->set_row_binlog_writer( + std::shared_ptr(std::move(row_binlog_writer_res.value()))); + return Status::OK(); + } + + bool file_exists(const std::string& path) const { + bool exists = false; + EXPECT_TRUE(io::global_local_filesystem()->exists(path, &exists).ok()); + return exists; + } + + TabletSharedPtr _tablet; + TCreateTabletReq _request; + std::string _storage_root_path; +}; + +TEST_F(GroupRowsetWriterTest, sub_writer_rollback) { + std::unique_ptr group_writer; + RowsetId data_rowset_id; + auto st = create_group_rowset_writer(&group_writer, &data_rowset_id, 2); + ASSERT_TRUE(st.ok()) << st; + + auto block = create_block(100, 2); + st = group_writer->flush_single_block(&block); + ASSERT_TRUE(st.ok()) << st; + + const auto transient_segment_path = + local_segment_path(_tablet->tablet_path(), data_rowset_id.to_string(), 0); + ASSERT_TRUE(file_exists(transient_segment_path)); + + DebugPoints::instance()->add("GroupRowsetWriter::build_rowsets.row_binlog_build_failed"); + std::vector rowsets; + st = 
group_writer->build_rowsets(rowsets); + DebugPoints::instance()->remove("GroupRowsetWriter::build_rowsets.row_binlog_build_failed"); + + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(rowsets.empty()); + group_writer.reset(); + bool removed = false; + for (int i = 0; i < 3; ++i) { + if (!file_exists(transient_segment_path)) { + removed = true; + break; + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + EXPECT_TRUE(removed); +} + +TEST_F(GroupRowsetWriterTest, success) { + std::unique_ptr group_writer; + RowsetId data_rowset_id; + auto st = create_group_rowset_writer(&group_writer, &data_rowset_id, 2); + ASSERT_TRUE(st.ok()) << st; + + auto block = create_block(100, 2); + st = group_writer->flush_single_block(&block); + ASSERT_TRUE(st.ok()) << st; + + std::vector rowsets; + st = group_writer->build_rowsets(rowsets); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(2, rowsets.size()); + + const auto data_segment_path = + local_segment_path(_tablet->tablet_path(), data_rowset_id.to_string(), 0); + const auto second_segment_path = + local_segment_path(_tablet->row_binlog_path(), rowsets[1]->rowset_id().to_string(), 0); + EXPECT_TRUE(file_exists(data_segment_path)); + EXPECT_TRUE(file_exists(second_segment_path)); +} + +} // namespace doris diff --git a/be/test/storage/rowset/rowset_meta_manager_test.cpp b/be/test/storage/rowset/rowset_meta_manager_test.cpp index 61d40aa63dcec1..5ab24cc2aaadb8 100644 --- a/be/test/storage/rowset/rowset_meta_manager_test.cpp +++ b/be/test/storage/rowset/rowset_meta_manager_test.cpp @@ -27,8 +27,12 @@ #include #include #include +#include #include +#include #include +#include +#include #include "common/config.h" #include "gtest/gtest_pred_impl.h" @@ -79,10 +83,145 @@ class RowsetMetaManagerTest : public testing::Test { LOG(INFO) << "TearDown"; } +protected: + RowsetMetaSharedPtr create_rowset_meta(int64_t rowset_id, RowsetStatePB state, Version version, + bool is_row_binlog = false) { + auto rowset_meta = std::make_shared(); + 
EXPECT_TRUE(rowset_meta->init_from_json(_json_rowset_meta)); + RowsetId rs_id; + rs_id.init(rowset_id); + rowset_meta->set_rowset_id(rs_id); + rowset_meta->set_tablet_uid(_tablet_uid); + rowset_meta->set_rowset_state(state); + rowset_meta->set_version(version); + if (is_row_binlog) { + rowset_meta->mark_row_binlog(); + } + return rowset_meta; + } + + RowsetMetaPB to_rowset_meta_pb(const RowsetMetaSharedPtr& rowset_meta) { + std::string serialized; + EXPECT_TRUE(rowset_meta->serialize(&serialized)); + RowsetMetaPB rowset_meta_pb; + EXPECT_TRUE(rowset_meta_pb.ParseFromString(serialized)); + return rowset_meta_pb; + } + + OlapMeta* meta() { return _meta; } + TabletUid tablet_uid() const { return _tablet_uid; } + private: OlapMeta* _meta; std::string _json_rowset_meta; TabletUid _tablet_uid {0, 0}; }; +TEST_F(RowsetMetaManagerTest, SaveAndLoad) { + auto base_rowset_meta = create_rowset_meta(20000, RowsetStatePB::COMMITTED, Version {7, 7}); + auto attach_rowset_meta = + create_rowset_meta(20001, RowsetStatePB::COMMITTED, Version {7, 7}, true); + + std::map attach_rowset_map; + attach_rowset_map.emplace(attach_rowset_meta->rowset_id(), + to_rowset_meta_pb(attach_rowset_meta)); + + auto st = RowsetMetaManager::save(meta(), tablet_uid(), base_rowset_meta->rowset_id(), + to_rowset_meta_pb(base_rowset_meta), BinlogFormatPB::ROW, + &attach_rowset_map); + ASSERT_TRUE(st.ok()) << st; + + RowsetMetaSharedPtr loaded_base_meta = std::make_shared(); + st = RowsetMetaManager::get_rowset_meta(meta(), tablet_uid(), base_rowset_meta->rowset_id(), + loaded_base_meta); + ASSERT_TRUE(st.ok()) << st; + EXPECT_EQ(loaded_base_meta->rowset_id(), base_rowset_meta->rowset_id()); + EXPECT_EQ(loaded_base_meta->tablet_uid().to_string(), tablet_uid().to_string()); + EXPECT_EQ(loaded_base_meta->version(), base_rowset_meta->version()); + + std::vector> + traversed_attach_metas; + st = RowsetMetaManager::traverse_row_binlog_metas( + meta(), [&traversed_attach_metas]( + const TabletUid& tablet_uid, 
const RowsetId& base_rowset_id, + const RowsetId& row_binlog_rowset_id, const std::string& value) { + auto rowset_meta = std::make_shared(); + EXPECT_TRUE(rowset_meta->init(value)); + traversed_attach_metas.emplace_back(tablet_uid, base_rowset_id, + row_binlog_rowset_id, std::move(rowset_meta)); + return true; + }); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(traversed_attach_metas.size(), 1); + EXPECT_EQ(std::get<0>(traversed_attach_metas[0]).to_string(), tablet_uid().to_string()); + EXPECT_EQ(std::get<1>(traversed_attach_metas[0]), base_rowset_meta->rowset_id()); + EXPECT_EQ(std::get<2>(traversed_attach_metas[0]), attach_rowset_meta->rowset_id()); + EXPECT_EQ(std::get<3>(traversed_attach_metas[0])->rowset_id(), attach_rowset_meta->rowset_id()); + EXPECT_TRUE(std::get<3>(traversed_attach_metas[0])->is_row_binlog()); +} + +TEST_F(RowsetMetaManagerTest, Remove) { + auto base_rowset_meta = create_rowset_meta(20010, RowsetStatePB::VISIBLE, Version {9, 9}); + auto attach_rowset_meta = + create_rowset_meta(20011, RowsetStatePB::VISIBLE, Version {9, 9}, true); + + std::map attach_rowset_map; + attach_rowset_map.emplace(attach_rowset_meta->rowset_id(), + to_rowset_meta_pb(attach_rowset_meta)); + + auto st = RowsetMetaManager::save(meta(), tablet_uid(), base_rowset_meta->rowset_id(), + to_rowset_meta_pb(base_rowset_meta), BinlogFormatPB::ROW, + &attach_rowset_map); + ASSERT_TRUE(st.ok()) << st; + + std::map base_rowset_id_to_row_binlog; + st = RowsetMetaManager::get_row_binlog_base_rowset_ids( + meta(), tablet_uid(), base_rowset_id_to_row_binlog, + std::set {attach_rowset_meta->rowset_id()}); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(base_rowset_id_to_row_binlog.size(), 1); + EXPECT_EQ(base_rowset_id_to_row_binlog.begin()->first, base_rowset_meta->rowset_id()); + EXPECT_EQ(base_rowset_id_to_row_binlog.begin()->second, attach_rowset_meta->rowset_id()); + + st = RowsetMetaManager::remove_row_binlog(meta(), tablet_uid(), base_rowset_meta->rowset_id(), + 
attach_rowset_meta->rowset_id()); + ASSERT_TRUE(st.ok()) << st; + + int traversed_attach_meta_count = 0; + st = RowsetMetaManager::traverse_row_binlog_metas( + meta(), [&traversed_attach_meta_count](const TabletUid&, const RowsetId&, + const RowsetId&, const std::string&) { + ++traversed_attach_meta_count; + return true; + }); + ASSERT_TRUE(st.ok()) << st; + EXPECT_EQ(traversed_attach_meta_count, 0); + + auto base_rowset_meta_2 = create_rowset_meta(20012, RowsetStatePB::VISIBLE, Version {10, 10}); + auto attach_rowset_meta_2 = + create_rowset_meta(20013, RowsetStatePB::VISIBLE, Version {10, 10}, true); + std::map attach_rowset_map_2; + attach_rowset_map_2.emplace(attach_rowset_meta_2->rowset_id(), + to_rowset_meta_pb(attach_rowset_meta_2)); + + st = RowsetMetaManager::save(meta(), tablet_uid(), base_rowset_meta_2->rowset_id(), + to_rowset_meta_pb(base_rowset_meta_2), BinlogFormatPB::ROW, + &attach_rowset_map_2); + ASSERT_TRUE(st.ok()) << st; + + st = RowsetMetaManager::remove_row_binlog_metas( + meta(), tablet_uid(), std::set {attach_rowset_meta_2->rowset_id()}); + ASSERT_TRUE(st.ok()) << st; + + traversed_attach_meta_count = 0; + st = RowsetMetaManager::traverse_row_binlog_metas( + meta(), [&traversed_attach_meta_count](const TabletUid&, const RowsetId&, + const RowsetId&, const std::string&) { + ++traversed_attach_meta_count; + return true; + }); + ASSERT_TRUE(st.ok()) << st; + EXPECT_EQ(traversed_attach_meta_count, 0); +} + } // namespace doris diff --git a/be/test/storage/segment/inverted_index_file_writer_test.cpp b/be/test/storage/segment/inverted_index_file_writer_test.cpp index 1e0126168f0b61..f0a7680d25152b 100644 --- a/be/test/storage/segment/inverted_index_file_writer_test.cpp +++ b/be/test/storage/segment/inverted_index_file_writer_test.cpp @@ -1354,6 +1354,7 @@ class MockRowsetWriter : public RowsetWriter { RowsetId rowset_id() override { return _context.rowset_id; } RowsetTypePB type() const override { return BETA_ROWSET; } int32_t 
allocate_segment_id() override { return 0; } + int32_t get_allocated_segment_id() override { return 0; } std::shared_ptr get_partial_update_info() override { return nullptr; } bool is_partial_update() override { return false; } diff --git a/be/test/storage/txn/txn_manager_test.cpp b/be/test/storage/txn/txn_manager_test.cpp index 7a39eebb0c6b6f..30e2e96233e725 100644 --- a/be/test/storage/txn/txn_manager_test.cpp +++ b/be/test/storage/txn/txn_manager_test.cpp @@ -26,8 +26,10 @@ #include #include +#include #include #include +#include #include #include "common/config.h" @@ -110,6 +112,38 @@ class TxnManagerTest : public testing::Test { tablet_map[tablet_id] = std::move(tablet); } + std::string read_rowset_meta_json(const std::string& path) { + std::ifstream infile(path); + std::string json_rowset_meta; + char buffer[1024]; + while (!infile.eof()) { + infile.getline(buffer, 1024); + json_rowset_meta += buffer; + json_rowset_meta += "\n"; + } + return json_rowset_meta.substr(0, json_rowset_meta.size() - 1); + } + + RowsetSharedPtr create_binlog_rowset(int64_t rowset_id, const Version& version) { + auto json_rowset_meta = read_rowset_meta_json(rowset_meta_path); + RowsetMetaSharedPtr rowset_meta(new RowsetMeta()); + rowset_meta->init_from_json(json_rowset_meta); + RowsetId rs_id; + rs_id.init(rowset_id); + rowset_meta->set_rowset_id(rs_id); + rowset_meta->set_tablet_id(tablet_id); + rowset_meta->set_tablet_uid(_tablet_uid); + rowset_meta->set_txn_id(transaction_id); + rowset_meta->set_rowset_state(RowsetStatePB::COMMITTED); + rowset_meta->set_version(version); + rowset_meta->mark_row_binlog(); + + RowsetSharedPtr rowset; + EXPECT_EQ(Status::OK(), + RowsetFactory::create_rowset(_schema, rowset_meta_path, rowset_meta, &rowset)); + return rowset; + } + virtual void SetUp() { config::max_runnings_transactions_per_txn_map = 500; @@ -367,7 +401,60 @@ TEST_F(TxnManagerTest, PublishVersionWithCommitTSO) { ASSERT_TRUE(st.ok()) << st; EXPECT_EQ(rowset_meta->start_version(), 10); 
EXPECT_EQ(rowset_meta->end_version(), 11); - EXPECT_EQ(rowset_meta->commit_tso(), commit_tso); + EXPECT_EQ(rowset_meta->commit_tso(), TsoRange(commit_tso, commit_tso)); +} + +TEST_F(TxnManagerTest, TxnWithRowBinlog) { + auto binlog_rowset = create_binlog_rowset(30000, _rowset->version()); + std::vector attach_rowsets {binlog_rowset}; + auto guard = k_engine->pending_local_rowsets().add( + {_rowset->rowset_id(), binlog_rowset->rowset_id()}); + + auto st = k_engine->txn_manager()->commit_txn( + _meta.get(), partition_id, transaction_id, tablet_id, _tablet_uid, load_id, _rowset, + std::move(guard), false, nullptr, &attach_rowsets); + ASSERT_TRUE(st.ok()) << st; + + std::map base_rowset_id_to_row_binlog; + st = RowsetMetaManager::get_row_binlog_base_rowset_ids( + _meta.get(), _tablet_uid, base_rowset_id_to_row_binlog, + std::set {binlog_rowset->rowset_id()}); + ASSERT_TRUE(st.ok()) << st; + EXPECT_EQ(base_rowset_id_to_row_binlog.begin()->first, _rowset->rowset_id()); + EXPECT_EQ(base_rowset_id_to_row_binlog.begin()->second, binlog_rowset->rowset_id()); + + Version new_version(10, 10); + TabletPublishStatistics stats; + std::shared_ptr tablet_txn_info = nullptr; + st = k_engine->txn_manager()->publish_txn(_meta.get(), partition_id, transaction_id, tablet_id, + _tablet_uid, new_version, &stats, tablet_txn_info); + ASSERT_TRUE(st.ok()) << st; + + RowsetMetaSharedPtr rowset_meta(new RowsetMeta()); + st = RowsetMetaManager::get_rowset_meta(_meta.get(), _tablet_uid, _rowset->rowset_id(), + rowset_meta); + ASSERT_TRUE(st.ok()) << st; + EXPECT_EQ(rowset_meta->start_version(), 10); + EXPECT_EQ(rowset_meta->end_version(), 10); + + bool found_binlog_rowset = false; + st = RowsetMetaManager::traverse_row_binlog_metas( + _meta.get(), [&](const TabletUid&, const RowsetId& base_rowset_id, + const RowsetId& row_binlog_rowset_id, const std::string& value) { + if (row_binlog_rowset_id != binlog_rowset->rowset_id()) { + return true; + } + found_binlog_rowset = true; + 
EXPECT_EQ(base_rowset_id, _rowset->rowset_id()); + RowsetMetaSharedPtr binlog_rowset_meta(new RowsetMeta()); + EXPECT_TRUE(binlog_rowset_meta->init(value)); + EXPECT_EQ(binlog_rowset_meta->version(), new_version); + EXPECT_EQ(binlog_rowset_meta->rowset_state(), RowsetStatePB::VISIBLE); + EXPECT_TRUE(binlog_rowset_meta->is_row_binlog()); + return true; + }); + ASSERT_TRUE(st.ok()) << st; + EXPECT_TRUE(found_binlog_rowset); } // 1. publish version failed if not found related txn and rowset @@ -412,6 +499,36 @@ TEST_F(TxnManagerTest, DeleteCommittedTxn) { EXPECT_FALSE(k_engine->pending_local_rowsets().contains(_rowset->rowset_id())); } +TEST_F(TxnManagerTest, DeleteCommittedTxnWithBinlogRowset) { + auto binlog_rowset = create_binlog_rowset(30002, _rowset->version()); + std::vector attach_rowsets {binlog_rowset}; + auto guard = k_engine->pending_local_rowsets().add( + {_rowset->rowset_id(), binlog_rowset->rowset_id()}); + + auto st = k_engine->txn_manager()->commit_txn( + _meta.get(), partition_id, transaction_id, tablet_id, _tablet_uid, load_id, _rowset, + std::move(guard), false, nullptr, &attach_rowsets); + ASSERT_TRUE(st.ok()) << st; + + std::map base_rowset_id_to_row_binlog; + st = RowsetMetaManager::get_row_binlog_base_rowset_ids( + _meta.get(), _tablet_uid, base_rowset_id_to_row_binlog, + std::set {binlog_rowset->rowset_id()}); + ASSERT_TRUE(st.ok()) << st; + EXPECT_EQ(base_rowset_id_to_row_binlog.begin()->first, _rowset->rowset_id()); + EXPECT_EQ(base_rowset_id_to_row_binlog.begin()->second, binlog_rowset->rowset_id()); + + st = k_engine->txn_manager()->delete_txn(_meta.get(), partition_id, transaction_id, tablet_id, + _tablet_uid); + ASSERT_TRUE(st.ok()) << st; + + EXPECT_TRUE(RowsetMetaManager::exists(_meta.get(), _tablet_uid, _rowset->rowset_id()) + .is()); + + EXPECT_FALSE(RowsetMetaManager::row_binlog_meta_exists(_meta.get(), _tablet_uid, + binlog_rowset->rowset_id())); +} + TEST_F(TxnManagerTest, TabletVersionCache) { std::unique_ptr txn_mgr = 
std::make_unique(*k_engine, 64, 1024); txn_mgr->update_tablet_version_txn(123, 100, 456); diff --git a/be/test/testutil/creators.h b/be/test/testutil/creators.h index 9150a8a32f05e0..c346020818aad7 100644 --- a/be/test/testutil/creators.h +++ b/be/test/testutil/creators.h @@ -21,8 +21,12 @@ #include #include #include +#include +#include +#include #include +#include #include #include @@ -32,6 +36,8 @@ #include "runtime/descriptor_helper.h" #include "runtime/descriptors.h" #include "runtime/query_context.h" +#include "storage/binlog.h" +#include "storage/tablet_info.h" #include "util/uid_util.h" namespace doris { @@ -101,4 +107,144 @@ inline std::unique_ptr create_spill_partitioner( return partitioner; } -} // namespace doris \ No newline at end of file +namespace testutil { + +struct DescriptorTableSlotDef { + PrimitiveType type; + std::string column_name; + bool nullable = false; +}; + +struct CreateTabletRequestColumnDef { + std::string column_name; + TPrimitiveType::type type; + bool is_key = false; + bool has_aggregation_type = false; + TAggregationType::type aggregation_type = TAggregationType::NONE; +}; + +inline TColumn create_tablet_column(const CreateTabletRequestColumnDef& column_def) { + TColumn column; + column.column_name = column_def.column_name; + column.__set_is_key(column_def.is_key); + column.column_type.type = column_def.type; + if (column_def.has_aggregation_type) { + column.__set_aggregation_type(column_def.aggregation_type); + } + return column; +} + +inline TCreateTabletReq create_tablet_request( + int64_t tablet_id, int32_t schema_hash, int64_t partition_id, + int16_t short_key_column_count, TKeysType::type keys_type, + std::initializer_list column_defs, int64_t version = 1, + TStorageType::type storage_type = TStorageType::COLUMN, + TStorageFormat::type storage_format = TStorageFormat::V2) { + TCreateTabletReq request; + request.tablet_id = tablet_id; + request.__set_version(version); + request.partition_id = partition_id; + 
request.tablet_schema.schema_hash = schema_hash; + request.tablet_schema.short_key_column_count = short_key_column_count; + request.tablet_schema.keys_type = keys_type; + request.tablet_schema.storage_type = storage_type; + request.__set_storage_format(storage_format); + request.tablet_schema.columns.reserve(column_defs.size()); + for (const auto& column_def : column_defs) { + request.tablet_schema.columns.push_back(create_tablet_column(column_def)); + } + return request; +} + +inline void enable_row_binlog(TCreateTabletReq* request, int32_t row_binlog_schema_hash = 0) { + DCHECK(request != nullptr); + + TBinlogConfig binlog_config; + binlog_config.__set_enable(true); + binlog_config.__set_binlog_format(TBinlogFormat::ROW); + request->__set_binlog_config(binlog_config); + + TTabletSchema row_binlog_schema = request->tablet_schema; + row_binlog_schema.schema_hash = row_binlog_schema_hash > 0 + ? row_binlog_schema_hash + : request->tablet_schema.schema_hash + 1; + row_binlog_schema.keys_type = TKeysType::DUP_KEYS; + + for (auto& col : row_binlog_schema.columns) { + if (!col.is_key) { + col.__set_aggregation_type(TAggregationType::NONE); + } + } + + row_binlog_schema.columns.push_back( + create_tablet_column({std::string(kRowBinlogLsnColName), TPrimitiveType::LARGEINT, + false, true, TAggregationType::NONE})); + row_binlog_schema.columns.push_back(create_tablet_column( + {"__DORIS_BINLOG_OP__", TPrimitiveType::BIGINT, false, true, TAggregationType::NONE})); + row_binlog_schema.columns.push_back( + create_tablet_column({std::string(kRowBinlogTimestampColName), TPrimitiveType::BIGINT, + false, true, TAggregationType::NONE})); + request->__set_row_binlog_schema(row_binlog_schema); +} + +inline TDescriptorTable create_descriptor_table( + std::initializer_list slot_defs) { + TDescriptorTableBuilder dtb; + TTupleDescriptorBuilder tuple_builder; + int column_pos = 0; + for (const auto& slot_def : slot_defs) { + tuple_builder.add_slot(TSlotDescriptorBuilder() + 
.type(slot_def.type) + .nullable(slot_def.nullable) + .column_name(slot_def.column_name) + .column_pos(column_pos++) + .build()); + } + tuple_builder.build(&dtb); + return dtb.desc_tbl(); +} + +inline std::shared_ptr create_table_schema_param( + const TDescriptorTable& tdesc_tbl, int64_t index_id, int32_t schema_hash, + const std::vector& columns, int64_t row_binlog_index_id = -1, + int32_t row_binlog_schema_hash = 0, + const std::vector* row_binlog_columns = nullptr, int64_t db_id = 1, + int64_t table_id = 2, int64_t version = 0) { + auto param = std::make_shared(); + TOlapTableSchemaParam tschema; + tschema.db_id = db_id; + tschema.table_id = table_id; + tschema.version = version; + tschema.slot_descs = tdesc_tbl.slotDescriptors; + tschema.tuple_desc = tdesc_tbl.tupleDescriptors[0]; + tschema.indexes.resize(1); + tschema.indexes[0].id = index_id; + tschema.indexes[0].schema_hash = schema_hash; + tschema.indexes[0].columns_desc = columns; + for (const auto& col : columns) { + tschema.indexes[0].columns.push_back(col.column_name); + } + if (row_binlog_index_id > 0) { + tschema.indexes[0].__set_row_binlog_id(row_binlog_index_id); + TOlapTableIndexSchema row_binlog_index_schema; + row_binlog_index_schema.id = row_binlog_index_id; + row_binlog_index_schema.schema_hash = row_binlog_schema_hash; + if (row_binlog_columns != nullptr) { + row_binlog_index_schema.columns_desc = *row_binlog_columns; + for (const auto& col : *row_binlog_columns) { + row_binlog_index_schema.columns.push_back(col.column_name); + } + } + tschema.__set_row_binlog_index_schema(row_binlog_index_schema); + } + Status st = param->init(tschema); + EXPECT_TRUE(st.ok()) << st; + if (!st.ok()) { + return nullptr; + } + return param; +} + +} // namespace testutil + +} // namespace doris diff --git a/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Column.java b/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Column.java index 2f00be157ac502..fd298d9872b99b 100644 --- 
a/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Column.java +++ b/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Column.java @@ -69,6 +69,7 @@ public class Column implements GsonPostProcessable { private static final String COLUMN_ARRAY_CHILDREN = "item"; private static final String COLUMN_AGG_ARGUMENT_CHILDREN = "argument"; public static final int COLUMN_UNIQUE_ID_INIT_VALUE = -1; + public static final int BINLOG_LSN_AUTO_INC_ID = -1; private static final String COLUMN_MAP_KEY = "key"; private static final String COLUMN_MAP_VALUE = "value"; public static final Column STREAM_SEQ_VIRTUAL_COLUMN = diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 7c251b2a61b452..db0522a48c95fa 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -2003,6 +2003,7 @@ nonReserved | BEGIN | BELONG | BIN + | BINLOG | BITAND | BITMAP | BITMAP_EMPTY diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java index 1142b1645a978b..2a1c3fba031f24 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java @@ -321,6 +321,12 @@ public boolean processAddColumns(AddColumnsOp addColumnsOp, OlapTable olapTable, private void addColumnRowBinlog(List rowBinlogSchema, Column newColumn, ColumnPosition columnPos, Set newColNameSet, boolean needHistoricalValue, IntSupplier columnUniqueIdSupplier) throws DdlException { + if (!newColumn.isVisible()) { + // row binlog schema is generated from visible columns only, so schema change must not + // sync hidden system columns such as sequence/delete/version/skip-bitmap columns. 
+ return; + } + if (newColumn.isAutoInc() || newColumn.getDataType().isVariantType()) { throw new DdlException("can't add AutoInc/Variant column " + " on table with binlog, column: " + newColumn.getDataType()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/AutoIncrementGenerator.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/AutoIncrementGenerator.java index 7d1977a796d509..1ebeb7cda7a422 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/AutoIncrementGenerator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/AutoIncrementGenerator.java @@ -63,6 +63,10 @@ public void setEditLog(EditLog editLog) { this.editLog = editLog; } + public long getColumnId() { + return columnId; + } + public synchronized void applyChange(long columnId, long batchNextId) { if (this.columnId == columnId && batchEndId < batchNextId) { LOG.info("[auto-inc] AutoIncrementGenerator applyChange, db_id={}, table_id={}, column_id={}, " diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java index ca3a5b688c95b6..973a3dcb09a377 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinTableValuedFunctions.java @@ -18,6 +18,7 @@ package org.apache.doris.catalog; import org.apache.doris.nereids.trees.expressions.functions.table.Backends; +import org.apache.doris.nereids.trees.expressions.functions.table.Binlog; import org.apache.doris.nereids.trees.expressions.functions.table.Catalogs; import org.apache.doris.nereids.trees.expressions.functions.table.CdcStream; import org.apache.doris.nereids.trees.expressions.functions.table.File; @@ -53,6 +54,7 @@ public class BuiltinTableValuedFunctions implements FunctionHelper { public final ImmutableList tableValuedFunctions = ImmutableList.of( tableValued(Backends.class, "backends"), + 
tableValued(Binlog.class, "binlog"), tableValued(Catalogs.class, "catalogs"), tableValued(Frontends.class, "frontends"), tableValued(FrontendsDisks.class, "frontends_disks"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionGenTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionGenTable.java index 748564e9ace159..2e81714282e61c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionGenTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionGenTable.java @@ -34,4 +34,12 @@ public TableValuedFunctionIf getTvf() { return tvf; } + @Override + public List getBaseSchema() { + // TVF columns come from the function itself rather than a regular table schema. + // If we keep filtering by visibility here, callers may fail to bind hidden columns + // returned by the TVF (for example direct reads of binlog hidden columns). + return getFullSchema(); + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 0b0458bf6c12e8..4996e5ed026329 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -1516,6 +1516,17 @@ public Partition getPartition(long partitionId) { return partition; } + /** + * Get the materialized index for scan planning. + * + *

Default behavior is equivalent to {@link Partition#getIndex(long)}. + * Wrapper tables may override this to redirect index selection (e.g. row-binlog wrapper + * uses the base index's tablets while keeping a different schema/index meta). + */ + public MaterializedIndex getPartitionIndex(Partition partition, long indexId) { + return partition.getIndex(indexId); + } + public PartitionItem getPartitionItemOrAnalysisException(String partitionName) throws AnalysisException { Partition partition = nameToPartition.get(partitionName); if (partition == null) { @@ -2350,7 +2361,8 @@ public boolean needRowBinlog() { return getBinlogConfig().isEnableForStreaming(); } - public void createNewRowBinlogMeta(IdGeneratorBuffer idGeneratorBuffer) { + public void createNewRowBinlogMeta(IdGeneratorBuffer idGeneratorBuffer, long dbId) + throws DdlException { writeLock(); try { List schema = generateTableRowBinlogSchema(); @@ -2362,6 +2374,11 @@ public void createNewRowBinlogMeta(IdGeneratorBuffer idGeneratorBuffer) { rowBinlogMeta.initSchemaColumnUniqueId(); rowBinlogMeta.setRowBinlogIndexId(indexId); this.setRowBinlogMeta(rowBinlogMeta, BinlogUtils.wrapBinlogName(this.name)); + + // todo: support multi column for autoIncrementGenerator + if (autoIncrementGenerator != null) { + throw new DdlException("enable binlog isn't allowed on the table with auto-increment column"); + } } finally { writeUnlock(); } @@ -3318,6 +3335,15 @@ public void initAutoIncrementGenerator(long dbId) { break; } } + + if (needRowBinlog()) { + // use auto-increment allocator to improve locality of Binlog LSN. 
+ Preconditions.checkState(autoIncrementGenerator == null); + MaterializedIndexMeta rowBinlogMeta = getRowBinlogMeta(); + Preconditions.checkNotNull(rowBinlogMeta); + autoIncrementGenerator = new AutoIncrementGenerator(dbId, id, Column.BINLOG_LSN_AUTO_INC_ID, 1L); + autoIncrementGenerator.setEditLog(Env.getCurrentEnv().getEditLog()); + } } public AutoIncrementGenerator getAutoIncrementGenerator() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java index c80b0f45a3f01e..c8fa7e0b0a5484 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java @@ -36,12 +36,14 @@ protected OlapTableWrapper(OlapTable originTable, String wrapperName, List of table + * A lightweight wrapper base for read binlog of table */ public class RowBinlogTableWrapper extends OlapTableWrapper { private final MaterializedIndexMeta rowBinlogMeta; public RowBinlogTableWrapper(OlapTable originTable) { - super(originTable, originTable.getName(), originTable.generateTableRowBinlogSchema(), KeysType.DUP_KEYS); + super(originTable, originTable.getName(), originTable.getRowBinlogMeta().getSchema(), KeysType.DUP_KEYS); this.rowBinlogMeta = originTable.getRowBinlogMeta(); Preconditions.checkNotNull(rowBinlogMeta, "row binlog meta is null, table=%s", originTable.getName()); this.setBaseIndexId(rowBinlogMeta.getIndexId()); @@ -37,4 +37,18 @@ public RowBinlogTableWrapper(OlapTable originTable) { public long getBaseIndexId() { return rowBinlogMeta.getIndexId(); } + + @Override + public MaterializedIndex getPartitionIndex(Partition partition, long indexId) { + MaterializedIndex index = partition.getIndex(indexId); + if (index != null) { + return index; + } + // The row-binlog index meta does not exist as a partition index. + // For scan range generation, reuse the base index's tablets. 
+ if (indexId == rowBinlogMeta.getIndexId()) { + return partition.getIndex(originTable.getBaseIndexId()); + } + return null; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java index da32fa376e8335..f389eca0a416b7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java @@ -426,7 +426,7 @@ public class SchemaTable extends Table { .column("CREATION_TIME", ScalarType.createType(PrimitiveType.DATETIME)) .column("NEWEST_WRITE_TIMESTAMP", ScalarType.createType(PrimitiveType.DATETIME)) .column("SCHEMA_VERSION", ScalarType.createType(PrimitiveType.INT)) - .column("COMMIT_TSO", ScalarType.createType(PrimitiveType.BIGINT)) + .column("COMMIT_TSO", ScalarType.createVarchar(64)) .build())) .put("parameters", new SchemaTable(SystemIdGenerator.getNextId(), "parameters", TableType.SCHEMA, builder().column("SPECIFIC_CATALOG", ScalarType.createVarchar(64)) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 5ed2a2edde3b81..3c239ed96d82da 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -2967,7 +2967,7 @@ private boolean createOlapTable(Database db, CreateTableInfo createTableInfo) th baseIndexStorageType, keysType, olapTable.getIndexes()); if (olapTable.getBinlogConfig().isEnableForStreaming()) { - olapTable.createNewRowBinlogMeta(idGeneratorBuffer); + olapTable.createNewRowBinlogMeta(idGeneratorBuffer, db.getId()); } for (AlterOp alterOp : createTableInfo.getAddRollupOps()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/DeleteHandler.java b/fe/fe-core/src/main/java/org/apache/doris/load/DeleteHandler.java index 
c9aa476b3f3ad6..8f2aee7f9af203 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/DeleteHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/DeleteHandler.java @@ -20,8 +20,10 @@ import org.apache.doris.analysis.Predicate; import org.apache.doris.catalog.Database; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.KeysType; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; +import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; import org.apache.doris.common.io.Text; @@ -110,6 +112,11 @@ public void process(Database targetDb, OlapTable targetTbl, List sele try { targetTbl.readLock(); try { + if (targetTbl.needRowBinlog() && targetTbl.getKeysType() == KeysType.DUP_KEYS) { + throw new AnalysisException( + "DELETE with predicates is not supported on DUPLICATE KEY tables when binlog " + + "is enabled. Please disable binlog for this table or avoid DELETE."); + } if (targetTbl.getState() != OlapTable.OlapTableState.NORMAL) { // table under alter operation can also do delete. // just add a comment here to notice. diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Binlog.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Binlog.java new file mode 100644 index 00000000000000..f1f1a46348cfc1 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Binlog.java @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.table; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Properties; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.coercion.AnyDataType; +import org.apache.doris.tablefunction.TableBinlogFunction; +import org.apache.doris.tablefunction.TableValuedFunctionIf; + +import java.util.Map; + +/** binlog */ +public class Binlog extends TableValuedFunction { + public Binlog(Properties properties) { + super("binlog", properties); + } + + @Override + public FunctionSignature customSignature() { + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, getArgumentsTypes()); + } + + @Override + protected TableValuedFunctionIf toCatalogFunction() { + try { + Map arguments = getTVFProperties().getMap(); + return new TableBinlogFunction(arguments); + } catch (Throwable t) { + throw new AnalysisException("Can not build TableBinlogFunction by " + + this + ": " + t.getMessage(), t); + } + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitBinlog(this, context); + } +} + diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java index 8ae391e1f0e267..34bcbe4d6dc372 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableValuedFunctionVisitor.java @@ -18,6 +18,7 @@ package org.apache.doris.nereids.trees.expressions.visitor; import org.apache.doris.nereids.trees.expressions.functions.table.Backends; +import org.apache.doris.nereids.trees.expressions.functions.table.Binlog; import org.apache.doris.nereids.trees.expressions.functions.table.Catalogs; import org.apache.doris.nereids.trees.expressions.functions.table.CdcStream; import org.apache.doris.nereids.trees.expressions.functions.table.File; @@ -48,6 +49,10 @@ default R visitBackends(Backends backends, C context) { return visitTableValuedFunction(backends, context); } + default R visitBinlog(Binlog binlog, C context) { + return visitTableValuedFunction(binlog, context); + } + default R visitCatalogs(Catalogs catalogs, C context) { return visitTableValuedFunction(catalogs, context); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java index 6ab25dcce02666..f1d89b63d51a25 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java @@ -44,6 +44,7 @@ import org.apache.doris.catalog.PartitionItem; import org.apache.doris.catalog.PartitionType; import org.apache.doris.catalog.Replica; +import org.apache.doris.catalog.RowBinlogTableWrapper; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Tablet; import org.apache.doris.catalog.stream.OlapTableStreamUpdate; @@ -189,6 +190,9 @@ public class OlapScanNode extends ScanNode { private Set distributionColumnIds; private long maxVersion = -1L; + + // Only for debug: restrict tablets to scan. 
+ private Set specifiedTabletIds = Sets.newHashSet(); private SortInfo annSortInfo = null; private long annSortLimit = -1; @@ -402,7 +406,7 @@ public void updateScanRangeVersions(Map visibleVersionMap) { .collect(Collectors.toMap(loc -> loc.getScanRange().getPaloScanRange().getTabletId(), loc -> loc)); for (Long partitionId : selectedPartitionIds) { final Partition partition = olapTable.getPartition(partitionId); - final MaterializedIndex selectedTable = partition.getIndex(selectedIndexId); + final MaterializedIndex selectedTable = olapTable.getPartitionIndex(partition, selectedIndexId); final List tablets = selectedTable.getTablets(); Long visibleVersion = visibleVersionMap.get(partitionId); assert visibleVersion != null : "the acquried version is not exists in the visible version map"; @@ -427,6 +431,10 @@ public long getMaxVersion() { return maxVersion; } + public void setSpecifiedTabletIds(Set specifiedTabletIds) { + this.specifiedTabletIds = specifiedTabletIds; + } + private void addScanRangeLocations(Partition partition, List tablets, Map> backendAlivePathHashs) throws UserException { long visibleVersion = Partition.PARTITION_INIT_VERSION; @@ -695,7 +703,7 @@ private boolean isEnableCooldownReplicaAffinity(ConnectContext connectContext) { return true; } - private void computePartitionInfo() throws AnalysisException { + public void computePartitionInfo() throws AnalysisException { long start = System.currentTimeMillis(); // Step1: compute partition ids PartitionInfo partitionInfo = olapTable.getPartitionInfo(); @@ -756,7 +764,7 @@ public void computeSampleTabletIds() { Preconditions.checkState(selectedIndexId != -1); for (Long partitionId : selectedPartitionIds) { final Partition partition = olapTable.getPartition(partitionId); - final MaterializedIndex selectedIndex = partition.getIndex(selectedIndexId); + final MaterializedIndex selectedIndex = olapTable.getPartitionIndex(partition, selectedIndexId); // selectedIndex is not expected to be null, because 
MaterializedIndex ids in one rollup's partitions // are all same. skip this partition here. if (selectedIndex != null) { @@ -787,7 +795,7 @@ public void computeSampleTabletIds() { for (int i = 0; i < selectedPartitionList.size(); i++) { int seekPid = (int) ((i + partitionSeek) % selectedPartitionList.size()); final Partition partition = olapTable.getPartition(selectedPartitionList.get(seekPid)); - final MaterializedIndex selectedTable = partition.getIndex(selectedIndexId); + final MaterializedIndex selectedTable = olapTable.getPartitionIndex(partition, selectedIndexId); List tablets = selectedTable.getTablets(); if (tablets.isEmpty()) { continue; @@ -879,7 +887,7 @@ private void computeTabletInfo() throws UserException { && connectContext.getStatementContext().isShortCircuitQuery(); for (Long partitionId : selectedPartitionIds) { final Partition partition = olapTable.getPartition(partitionId); - final MaterializedIndex selectedTable = partition.getIndex(selectedIndexId); + final MaterializedIndex selectedTable = olapTable.getPartitionIndex(partition, selectedIndexId); final List tablets = Lists.newArrayList(); List allTabletIds = selectedTable.getTabletIdsInOrder(); // point query need prune tablets at this place @@ -899,6 +907,14 @@ private void computeTabletInfo() throws UserException { } } + if (specifiedTabletIds != null && !specifiedTabletIds.isEmpty()) { + if (prunedTabletIds != null) { + prunedTabletIds.retainAll(specifiedTabletIds); + } else { + prunedTabletIds = new ArrayList<>(specifiedTabletIds); + } + } + boolean notExistsSampleAndPrunedTablets = sampleTabletIds.isEmpty() && nereidsPrunedTabletIds.isEmpty(); if (prunedTabletIds != null) { for (Long id : prunedTabletIds) { @@ -1145,6 +1161,9 @@ protected void toThrift(TPlanNode msg) { msg.olap_scan_node = new TOlapScanNode(desc.getId().asInt(), keyColumnNames, keyColumnTypes, isPreAggregation); msg.olap_scan_node.setColumnsDesc(columnsDesc); msg.olap_scan_node.setIndexesDesc(indexDesc); + if (olapTable 
instanceof RowBinlogTableWrapper) { + msg.olap_scan_node.setReadRowBinlog(true); + } if (selectedIndexId != -1) { msg.olap_scan_node.setSchemaVersion(olapTable.getIndexSchemaVersion(selectedIndexId)); } @@ -1220,6 +1239,10 @@ protected void toThrift(TPlanNode msg) { msg.olap_scan_node.setDistributeColumnIds(new ArrayList<>(distributionColumnIds)); + if (selectedIndexId != -1 && olapTable.getIndexMetaByIndexId(selectedIndexId).isRowBinlogIndex()) { + msg.olap_scan_node.setReadRowBinlog(true); + } + super.toThrift(msg); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java index dc845d4451ba94..4d95fababc149f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapTableSink.java @@ -436,6 +436,7 @@ private TOlapTableSchemaParam createSchema(long dbId, OlapTable table) throws An } TOlapTableIndexSchema indexSchema = new TOlapTableIndexSchema(pair.getKey(), columns, indexMeta.getSchemaHash()); + indexSchema.setRowBinlogId(indexMeta.getRowBinlogIndexId()); Expr whereClause = indexMeta.getWhereClause(); if (whereClause != null) { Expr expr = syncMvWhereClauses.getOrDefault(pair.getKey(), null); @@ -449,6 +450,22 @@ private TOlapTableSchemaParam createSchema(long dbId, OlapTable table) throws An indexSchema.setIndexesDesc(indexDesc); schemaParam.addToIndexes(indexSchema); } + + if (table.needRowBinlog()) { + MaterializedIndexMeta rowBinlogMeta = table.getRowBinlogMeta(); + List binlogColumns = Lists.newArrayList(); + List binlogColumnsDesc = Lists.newArrayList(); + for (Column column : rowBinlogMeta.getSchema(true)) { + TColumn tColumn = ColumnToThrift.toThrift(column); + binlogColumnsDesc.add(tColumn); + binlogColumns.add(column.getName()); + } + TOlapTableIndexSchema rowBinlogIndexSchema = new TOlapTableIndexSchema( + rowBinlogMeta.getIndexId(), binlogColumns, 
rowBinlogMeta.getSchemaHash()); + rowBinlogIndexSchema.setColumnsDesc(binlogColumnsDesc); + schemaParam.setRowBinlogIndexSchema(rowBinlogIndexSchema); + } + setPartialUpdateInfoForParam(schemaParam, table, uniqueKeyUpdateMode); schemaParam.setInvertedIndexFileStorageFormat(table.getInvertedIndexFileStorageFormat()); return schemaParam; diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableBinlogFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableBinlogFunction.java new file mode 100644 index 00000000000000..3493f04580be31 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableBinlogFunction.java @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.tablefunction; + +import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.RowBinlogTableWrapper; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.catalog.TableIf.TableType; +import org.apache.doris.catalog.info.PartitionNamesInfo; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.Config; +import org.apache.doris.common.MetaNotFoundException; +import org.apache.doris.planner.OlapScanNode; +import org.apache.doris.planner.PlanNodeId; +import org.apache.doris.planner.ScanContext; +import org.apache.doris.planner.ScanNode; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; + +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import org.apache.commons.lang3.StringUtils; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * The implementation of table valued function `binlog`. + * + * This TVF is debug-only, used to read binlog data from a table. 
+ */ +public class TableBinlogFunction extends TableValuedFunctionIf { + public static final String NAME = "binlog"; + + private static final String DB = "db"; + private static final String TABLE = "table"; + private static final String PARTITION = "partition"; + private static final String TABLET = "tablet"; + + private static final ImmutableSet PROPERTIES_SET = ImmutableSet.of(DB, TABLE, PARTITION, TABLET); + + private final String dbName; + private final String tableName; + private final PartitionNamesInfo partitionNamesInfo; + private final Set specifiedTabletIds; + private final OlapTable originTable; + private final RowBinlogTableWrapper rowBinlogTableWrapper; + + public TableBinlogFunction(Map params) throws AnalysisException { + // Cloud mode uses a single-version snapshot for scan range versioning and is not supported here. + if (Config.isCloudMode()) { + throw new AnalysisException("binlog table valued function is not supported in cloud mode"); + } + + Map validParams = Maps.newHashMap(); + for (Map.Entry e : params.entrySet()) { + String key = StringUtils.lowerCase(e.getKey()); + if (!PROPERTIES_SET.contains(key)) { + throw new AnalysisException("'" + e.getKey() + "' is invalid property"); + } + validParams.put(key, e.getValue()); + } + + this.tableName = StringUtils.trimToEmpty(validParams.get(TABLE)); + if (Strings.isNullOrEmpty(tableName)) { + throw new AnalysisException("'table' is required for binlog"); + } + + String db = Strings.nullToEmpty(validParams.get(DB)).trim(); + if (db.isEmpty()) { + ConnectContext ctx = ConnectContext.get(); + if (ctx != null) { + db = Strings.nullToEmpty(ctx.getDatabase()).trim(); + } + } + if (db.isEmpty()) { + throw new AnalysisException("'db' is required for binlog"); + } + this.dbName = db; + + this.partitionNamesInfo = parsePartitionNamesInfo(validParams.get(PARTITION)); + this.specifiedTabletIds = parseTabletIds(validParams.get(TABLET)); + + DatabaseIf dbIf; + TableIf tableIf; + try { + dbIf = 
Env.getCurrentEnv().getInternalCatalog().getDbOrMetaException(dbName); + tableIf = dbIf.getTableOrMetaException(tableName, TableType.OLAP); + } catch (MetaNotFoundException e) { + throw new AnalysisException(e.getMessage(), e); + } + if (tableIf.getType() != TableType.OLAP) { + throw new AnalysisException("binlog only supports OLAP table, table=" + tableName); + } + + this.originTable = (OlapTable) tableIf; + originTable.readLock(); + try { + if (!originTable.needRowBinlog()) { + throw new AnalysisException("binlog is not enabled for table=" + originTable.getName()); + } + this.rowBinlogTableWrapper = new RowBinlogTableWrapper(originTable); + } finally { + originTable.readUnlock(); + } + } + + @Override + public String getTableName() { + return "BinlogTableFunction"; + } + + @Override + public List getTableColumns() throws AnalysisException { + originTable.readLock(); + try { + return originTable.getRowBinlogMeta().getSchema(true); + } finally { + originTable.readUnlock(); + } + } + + @Override + public ScanNode getScanNode(PlanNodeId id, TupleDescriptor desc, SessionVariable sv) { + // Replace tvf FunctionGenTable with the binlog OlapTable wrapper. + desc.setTable(rowBinlogTableWrapper); + OlapScanNode olapScanNode = new OlapScanNode(id, desc, "OlapScanNode", + ScanContext.builder().clusterName(sv.resolveCloudClusterName()).build()); + olapScanNode.setSelectedIndexInfo(rowBinlogTableWrapper.getBaseIndexId(), false, "binlog read"); + if (specifiedTabletIds != null && !specifiedTabletIds.isEmpty()) { + olapScanNode.setSpecifiedTabletIds(specifiedTabletIds); + } + // Resolve partition names to IDs, same pattern as Nereids PhysicalPlanTranslator. 
+ if (partitionNamesInfo != null && !partitionNamesInfo.getPartitionNames().isEmpty()) { + List partitionIds = Lists.newArrayList(); + originTable.readLock(); + try { + for (String partName : partitionNamesInfo.getPartitionNames()) { + Partition partition = originTable.getPartition(partName); + if (partition == null) { + throw new IllegalStateException("Partition not found: " + partName); + } + partitionIds.add(partition.getId()); + } + } finally { + originTable.readUnlock(); + } + olapScanNode.setSelectedPartitionIds(partitionIds); + } else { + try { + olapScanNode.computePartitionInfo(); + } catch (AnalysisException e) { + throw new IllegalStateException(e.getMessage(), e); + } + } + return olapScanNode; + } + + private static PartitionNamesInfo parsePartitionNamesInfo(String partitions) throws AnalysisException { + if (Strings.isNullOrEmpty(partitions)) { + return null; + } + List partitionNames = Lists.newArrayList( + partitions.split(",", -1)).stream().map(String::trim).filter(s -> !s.isEmpty()).collect( + Collectors.toList()); + if (partitionNames.isEmpty()) { + throw new AnalysisException("Invalid partition names: " + partitions); + } + return new PartitionNamesInfo(false, partitionNames); + } + + private static Set parseTabletIds(String tabletIds) throws AnalysisException { + if (Strings.isNullOrEmpty(tabletIds)) { + return null; + } + try { + Set tabletIdSet = Lists.newArrayList(tabletIds.split(",", -1)).stream().map(String::trim).filter( + s -> !s.isEmpty()).map(Long::parseLong).collect(Collectors.toSet()); + if (tabletIdSet.isEmpty()) { + throw new AnalysisException("Invalid tablet ids: " + tabletIds); + } + return tabletIdSet; + } catch (NumberFormatException e) { + throw new AnalysisException("Invalid tablet ids: " + tabletIds); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java index 
6e489d08c93f36..a36c289aca182c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java @@ -102,6 +102,8 @@ public static TableValuedFunctionIf getTableFunction(String funcName, Map