|
| 1 | +#include <Storages/MergeTree/IMergeTreeCleanupThread.h> |
| 2 | + |
| 3 | +#include <Interpreters/Context.h> |
| 4 | +#include <Storages/MergeTree/MergeTreeData.h> |
| 5 | +#include <Storages/MergeTree/MergeTreeSettings.h> |
| 6 | +#include <Common/ZooKeeper/KeeperException.h> |
| 7 | + |
| 8 | +namespace DB |
| 9 | +{ |
| 10 | + |
| 11 | +namespace MergeTreeSetting |
| 12 | +{ |
| 13 | + extern const MergeTreeSettingsUInt64 cleanup_delay_period; |
| 14 | + extern const MergeTreeSettingsUInt64 cleanup_delay_period_random_add; |
| 15 | + extern const MergeTreeSettingsUInt64 cleanup_thread_preferred_points_per_iteration; |
| 16 | + extern const MergeTreeSettingsUInt64 max_cleanup_delay_period; |
| 17 | +} |
| 18 | + |
| 19 | +IMergeTreeCleanupThread::IMergeTreeCleanupThread(MergeTreeData & data_) |
| 20 | + : data(data_) |
| 21 | + , log_name(data.getStorageID().getFullTableName() + " (CleanupThread)") |
| 22 | + , log(getLogger(log_name)) |
| 23 | + , sleep_ms((*data.getSettings())[MergeTreeSetting::cleanup_delay_period] * 1000) |
| 24 | +{ |
| 25 | + task = data.getContext()->getSchedulePool().createTask(log_name, [this] { run(); }); |
| 26 | +} |
| 27 | + |
| 28 | +IMergeTreeCleanupThread::~IMergeTreeCleanupThread() = default; |
| 29 | + |
| 30 | +void IMergeTreeCleanupThread::start() |
| 31 | +{ |
| 32 | + task->activateAndSchedule(); |
| 33 | +} |
| 34 | + |
| 35 | +void IMergeTreeCleanupThread::wakeup() |
| 36 | +{ |
| 37 | + task->schedule(); |
| 38 | +} |
| 39 | + |
| 40 | +void IMergeTreeCleanupThread::stop() |
| 41 | +{ |
| 42 | + task->deactivate(); |
| 43 | +} |
| 44 | + |
| 45 | +void IMergeTreeCleanupThread::wakeupEarlierIfNeeded() |
| 46 | +{ |
| 47 | + /// It may happen that the tables was idle for a long time, but then a user started to aggressively insert (or mutate) data. |
| 48 | + /// In this case, sleep_ms was set to the highest possible value, the task is not going to wake up soon, |
| 49 | + /// but the number of objects to clean up is growing. We need to wakeup the task earlier. |
| 50 | + auto storage_settings = data.getSettings(); |
| 51 | + if (!(*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration]) |
| 52 | + return; |
| 53 | + |
| 54 | + /// The number of other objects (logs, blocks, etc) is usually correlated with the number of Outdated parts. |
| 55 | + /// Do not wake up unless we have too many. |
| 56 | + size_t number_of_outdated_objects = data.getOutdatedPartsCount(); |
| 57 | + if (number_of_outdated_objects < (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration] * 2) |
| 58 | + return; |
| 59 | + |
| 60 | + /// A race condition is possible here, but it's okay |
| 61 | + if (is_running.load(std::memory_order_relaxed)) |
| 62 | + return; |
| 63 | + |
| 64 | + /// Do not re-check all parts too often (avoid constantly calling getNumberOfOutdatedPartsWithExpiredRemovalTime()) |
| 65 | + if (!wakeup_check_timer.compareAndRestart(static_cast<double>((*storage_settings)[MergeTreeSetting::cleanup_delay_period]) / 4.0)) |
| 66 | + return; |
| 67 | + |
| 68 | + UInt64 prev_run_timestamp_ms = prev_cleanup_timestamp_ms.load(std::memory_order_relaxed); |
| 69 | + UInt64 now_ms = clock_gettime_ns_adjusted(prev_run_timestamp_ms * 1'000'000) / 1'000'000; |
| 70 | + if (!prev_run_timestamp_ms || now_ms <= prev_run_timestamp_ms) |
| 71 | + return; |
| 72 | + |
| 73 | + /// Don't run it more often than cleanup_delay_period |
| 74 | + UInt64 seconds_passed = (now_ms - prev_run_timestamp_ms) / 1000; |
| 75 | + if (seconds_passed < (*storage_settings)[MergeTreeSetting::cleanup_delay_period]) |
| 76 | + return; |
| 77 | + |
| 78 | + /// Do not count parts that cannot be removed anyway. Do not wake up unless we have too many. |
| 79 | + number_of_outdated_objects = data.getNumberOfOutdatedPartsWithExpiredRemovalTime(); |
| 80 | + if (number_of_outdated_objects < (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration] * 2) |
| 81 | + return; |
| 82 | + |
| 83 | + LOG_TRACE( |
| 84 | + log, |
| 85 | + "Waking up cleanup thread because there are {} outdated objects and previous cleanup finished {}s ago", |
| 86 | + number_of_outdated_objects, |
| 87 | + seconds_passed); |
| 88 | + |
| 89 | + wakeup(); |
| 90 | +} |
| 91 | + |
| 92 | +void IMergeTreeCleanupThread::run() |
| 93 | +{ |
| 94 | + if (cleanup_blocker.isCancelled()) |
| 95 | + { |
| 96 | + LOG_TRACE(LogFrequencyLimiter(log, 30), "Cleanup is cancelled, exiting"); |
| 97 | + return; |
| 98 | + } |
| 99 | + |
| 100 | + SCOPE_EXIT({ is_running.store(false, std::memory_order_relaxed); }); |
| 101 | + is_running.store(true, std::memory_order_relaxed); |
| 102 | + |
| 103 | + auto storage_settings = data.getSettings(); |
| 104 | + |
| 105 | + Float32 cleanup_points = 0; |
| 106 | + try |
| 107 | + { |
| 108 | + cleanup_points = iterate(); |
| 109 | + } |
| 110 | + catch (const Coordination::Exception & e) |
| 111 | + { |
| 112 | + tryLogCurrentException(log, __PRETTY_FUNCTION__); |
| 113 | + |
| 114 | + if (e.code == Coordination::Error::ZSESSIONEXPIRED) |
| 115 | + return; |
| 116 | + } |
| 117 | + catch (...) |
| 118 | + { |
| 119 | + tryLogCurrentException(log, __PRETTY_FUNCTION__); |
| 120 | + } |
| 121 | + |
| 122 | + UInt64 prev_timestamp = prev_cleanup_timestamp_ms.load(std::memory_order_relaxed); |
| 123 | + UInt64 now_ms = clock_gettime_ns_adjusted(prev_timestamp * 1'000'000) / 1'000'000; |
| 124 | + |
| 125 | + /// Do not adjust sleep_ms on the first run after starting the server |
| 126 | + if (prev_timestamp && (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration]) |
| 127 | + { |
| 128 | + /// We don't want to run the task too often when the table was barely changed and there's almost nothing to cleanup. |
| 129 | + /// But we cannot simply sleep max_cleanup_delay_period (300s) when nothing was cleaned up and cleanup_delay_period (30s) |
| 130 | + /// when we removed something, because inserting one part per 30s will lead to running cleanup each 30s just to remove one part. |
| 131 | + /// So we need some interpolation based on preferred batch size. |
| 132 | + auto expected_cleanup_points = (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration]; |
| 133 | + |
| 134 | + /// How long should we sleep to remove cleanup_thread_preferred_points_per_iteration on the next iteration? |
| 135 | + Float32 ratio = cleanup_points / static_cast<Float32>(expected_cleanup_points); |
| 136 | + if (ratio == 0) |
| 137 | + sleep_ms = (*storage_settings)[MergeTreeSetting::max_cleanup_delay_period] * 1000; |
| 138 | + else |
| 139 | + sleep_ms = static_cast<UInt64>(static_cast<Float32>(sleep_ms) / ratio); |
| 140 | + |
| 141 | + sleep_ms = std::clamp( |
| 142 | + sleep_ms, |
| 143 | + (*storage_settings)[MergeTreeSetting::cleanup_delay_period] * 1000, |
| 144 | + (*storage_settings)[MergeTreeSetting::max_cleanup_delay_period] * 1000); |
| 145 | + |
| 146 | + UInt64 interval_ms = now_ms - prev_timestamp; |
| 147 | + LOG_TRACE( |
| 148 | + log, |
| 149 | + "Scheduling next cleanup after {}ms (points: {}, interval: {}ms, ratio: {}, points per minute: {})", |
| 150 | + sleep_ms, |
| 151 | + cleanup_points, |
| 152 | + interval_ms, |
| 153 | + ratio, |
| 154 | + cleanup_points / static_cast<Float32>(interval_ms * 60'000)); |
| 155 | + } |
| 156 | + prev_cleanup_timestamp_ms.store(now_ms, std::memory_order_relaxed); |
| 157 | + |
| 158 | + sleep_ms += std::uniform_int_distribution<UInt64>(0, (*storage_settings)[MergeTreeSetting::cleanup_delay_period_random_add] * 1000)(rng); |
| 159 | + task->scheduleAfter(sleep_ms); |
| 160 | +} |
| 161 | + |
| 162 | +} |
0 commit comments