From d318d413b986e0e5a3b79415b1b1577de17a85be Mon Sep 17 00:00:00 2001 From: xylaaaaa <2392805527@qq.com> Date: Fri, 27 Feb 2026 16:59:07 +0800 Subject: [PATCH 1/2] [fix](paimon-cpp) deduplicate Arrow linking to fix SIGSEGV in FilterRowGroupsByPredicate When ENABLE_PAIMON_CPP is ON, both Doris's own libarrow.a and paimon-cpp's libarrow.a were linked into doris_be, causing 3698 duplicate global symbols. This led to SIGSEGV crashes in paimon::parquet::ParquetFileBatchReader::FilterRowGroupsByPredicate when libarrow_dataset.a resolved arrow core calls to the wrong copy (compiled with different feature flags). Both are Arrow 17.0.0 but compiled with different options: - Doris: COMPUTE=OFF, DATASET=OFF, ACERO=OFF, FLIGHT=ON - paimon: COMPUTE=ON, DATASET=ON, ACERO=ON, FLIGHT=OFF Fix: when paimon_deps Arrow stack is selected, remove Doris's 'arrow' from COMMON_THIRDPARTY. paimon's libarrow.a is a superset and provides all symbols needed by Doris's arrow_flight / arrow_flight_sql. --- be/CMakeLists.txt | 147 ++++++++++++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 51 deletions(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 924a88706bbe67..c0bc3a4d8c7b05 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -630,62 +630,107 @@ set(PAIMON_ARROW_FILESYSTEM_LIB) set(PAIMON_ARROW_DATASET_LIB) set(PAIMON_ARROW_ACERO_LIB) if (ENABLE_PAIMON_CPP) - set(_paimon_arrow_core_candidates - ${THIRDPARTY_DIR}/paimon-cpp/lib64/paimon_deps/libarrow.a - ${THIRDPARTY_DIR}/lib64/libarrow.a - ${THIRDPARTY_DIR}/lib/libarrow.a - ) - foreach(_paimon_arrow_core_candidate IN LISTS _paimon_arrow_core_candidates) - if (EXISTS "${_paimon_arrow_core_candidate}") - add_library(paimon_arrow_core STATIC IMPORTED) - set_target_properties(paimon_arrow_core PROPERTIES - IMPORTED_LOCATION ${_paimon_arrow_core_candidate}) - set(PAIMON_ARROW_CORE_LIB paimon_arrow_core) - break() + # Select Arrow as one consistent stack (Doris or paimon_deps) to avoid + # mixing different
Arrow versions (e.g. Doris core + paimon dataset/acero), + # which causes link/runtime instability. + set(_doris_arrow_core) + set(_doris_arrow_dataset) + set(_doris_arrow_acero) + set(_doris_arrow_filesystem) + foreach(_doris_arrow_dir IN ITEMS ${THIRDPARTY_DIR}/lib64 ${THIRDPARTY_DIR}/lib) + if (NOT _doris_arrow_core AND EXISTS "${_doris_arrow_dir}/libarrow.a") + set(_doris_arrow_core "${_doris_arrow_dir}/libarrow.a") endif() - endforeach() - set(_paimon_arrow_filesystem_candidates - ${THIRDPARTY_DIR}/paimon-cpp/lib64/paimon_deps/libarrow_filesystem.a - ${THIRDPARTY_DIR}/lib64/libarrow_filesystem.a - ${THIRDPARTY_DIR}/lib/libarrow_filesystem.a - ) - foreach(_paimon_arrow_filesystem_candidate IN LISTS _paimon_arrow_filesystem_candidates) - if (EXISTS "${_paimon_arrow_filesystem_candidate}") - add_library(paimon_arrow_filesystem STATIC IMPORTED) - set_target_properties(paimon_arrow_filesystem PROPERTIES - IMPORTED_LOCATION ${_paimon_arrow_filesystem_candidate}) - set(PAIMON_ARROW_FILESYSTEM_LIB paimon_arrow_filesystem) - break() + if (NOT _doris_arrow_dataset AND EXISTS "${_doris_arrow_dir}/libarrow_dataset.a") + set(_doris_arrow_dataset "${_doris_arrow_dir}/libarrow_dataset.a") endif() - endforeach() - set(_paimon_arrow_dataset_candidates - ${THIRDPARTY_DIR}/paimon-cpp/lib64/paimon_deps/libarrow_dataset.a - ${THIRDPARTY_DIR}/lib64/libarrow_dataset.a - ${THIRDPARTY_DIR}/lib/libarrow_dataset.a - ) - foreach(_paimon_arrow_dataset_candidate IN LISTS _paimon_arrow_dataset_candidates) - if (EXISTS "${_paimon_arrow_dataset_candidate}") - add_library(paimon_arrow_dataset STATIC IMPORTED) - set_target_properties(paimon_arrow_dataset PROPERTIES - IMPORTED_LOCATION ${_paimon_arrow_dataset_candidate}) - set(PAIMON_ARROW_DATASET_LIB paimon_arrow_dataset) - break() + if (NOT _doris_arrow_acero AND EXISTS "${_doris_arrow_dir}/libarrow_acero.a") + set(_doris_arrow_acero "${_doris_arrow_dir}/libarrow_acero.a") endif() - endforeach() - set(_paimon_arrow_acero_candidates - 
${THIRDPARTY_DIR}/paimon-cpp/lib64/paimon_deps/libarrow_acero.a - ${THIRDPARTY_DIR}/lib64/libarrow_acero.a - ${THIRDPARTY_DIR}/lib/libarrow_acero.a - ) - foreach(_paimon_arrow_acero_candidate IN LISTS _paimon_arrow_acero_candidates) - if (EXISTS "${_paimon_arrow_acero_candidate}") - add_library(paimon_arrow_acero STATIC IMPORTED) - set_target_properties(paimon_arrow_acero PROPERTIES - IMPORTED_LOCATION ${_paimon_arrow_acero_candidate}) - set(PAIMON_ARROW_ACERO_LIB paimon_arrow_acero) - break() + if (NOT _doris_arrow_filesystem AND EXISTS "${_doris_arrow_dir}/libarrow_filesystem.a") + set(_doris_arrow_filesystem "${_doris_arrow_dir}/libarrow_filesystem.a") endif() endforeach() + + set(_paimon_arrow_dir ${THIRDPARTY_DIR}/paimon-cpp/lib64/paimon_deps) + set(_paimon_arrow_core) + set(_paimon_arrow_dataset) + set(_paimon_arrow_acero) + set(_paimon_arrow_filesystem) + if (EXISTS "${_paimon_arrow_dir}/libarrow.a") + set(_paimon_arrow_core "${_paimon_arrow_dir}/libarrow.a") + endif() + if (EXISTS "${_paimon_arrow_dir}/libarrow_dataset.a") + set(_paimon_arrow_dataset "${_paimon_arrow_dir}/libarrow_dataset.a") + endif() + if (EXISTS "${_paimon_arrow_dir}/libarrow_acero.a") + set(_paimon_arrow_acero "${_paimon_arrow_dir}/libarrow_acero.a") + endif() + if (EXISTS "${_paimon_arrow_dir}/libarrow_filesystem.a") + set(_paimon_arrow_filesystem "${_paimon_arrow_dir}/libarrow_filesystem.a") + endif() + + set(_selected_arrow_core) + set(_selected_arrow_dataset) + set(_selected_arrow_acero) + set(_selected_arrow_filesystem) + set(_selected_arrow_stack) + if (_doris_arrow_core AND _doris_arrow_dataset AND _doris_arrow_acero) + set(_selected_arrow_stack "doris") + set(_selected_arrow_core "${_doris_arrow_core}") + set(_selected_arrow_dataset "${_doris_arrow_dataset}") + set(_selected_arrow_acero "${_doris_arrow_acero}") + set(_selected_arrow_filesystem "${_doris_arrow_filesystem}") + elseif (_paimon_arrow_core AND _paimon_arrow_dataset AND _paimon_arrow_acero) + set(_selected_arrow_stack 
"paimon_deps") + set(_selected_arrow_core "${_paimon_arrow_core}") + set(_selected_arrow_dataset "${_paimon_arrow_dataset}") + set(_selected_arrow_acero "${_paimon_arrow_acero}") + set(_selected_arrow_filesystem "${_paimon_arrow_filesystem}") + endif() + + if (_selected_arrow_core) + add_library(paimon_arrow_core STATIC IMPORTED) + set_target_properties(paimon_arrow_core PROPERTIES + IMPORTED_LOCATION ${_selected_arrow_core}) + set(PAIMON_ARROW_CORE_LIB paimon_arrow_core) + endif() + if (_selected_arrow_filesystem) + add_library(paimon_arrow_filesystem STATIC IMPORTED) + set_target_properties(paimon_arrow_filesystem PROPERTIES + IMPORTED_LOCATION ${_selected_arrow_filesystem}) + set(PAIMON_ARROW_FILESYSTEM_LIB paimon_arrow_filesystem) + endif() + if (_selected_arrow_dataset) + add_library(paimon_arrow_dataset STATIC IMPORTED) + set_target_properties(paimon_arrow_dataset PROPERTIES + IMPORTED_LOCATION ${_selected_arrow_dataset}) + set(PAIMON_ARROW_DATASET_LIB paimon_arrow_dataset) + endif() + if (_selected_arrow_acero) + add_library(paimon_arrow_acero STATIC IMPORTED) + set_target_properties(paimon_arrow_acero PROPERTIES + IMPORTED_LOCATION ${_selected_arrow_acero}) + set(PAIMON_ARROW_ACERO_LIB paimon_arrow_acero) + endif() + + if (_selected_arrow_stack) + message(STATUS "Paimon C++: using ${_selected_arrow_stack} Arrow stack") + if (_selected_arrow_stack STREQUAL "paimon_deps") + # paimon's libarrow.a is a superset of Doris's libarrow.a + # (COMPUTE/FILESYSTEM/DATASET=ON vs OFF), same Arrow 17.0.0. + # Remove Doris's arrow from COMMON_THIRDPARTY to avoid duplicate + # symbols (3698 overlapping globals), which cause SIGSEGV when + # libarrow_dataset.a resolves arrow core calls to the wrong copy. + # Doris's arrow_flight / arrow_flight_sql remain – they only + # depend on arrow core ABI, which paimon's libarrow.a provides. 
+ list(REMOVE_ITEM COMMON_THIRDPARTY arrow) + message(STATUS "Paimon C++: removed Doris 'arrow' from COMMON_THIRDPARTY to avoid duplicate libarrow.a") + endif() + else() + message(STATUS "Paimon C++: no complete Arrow stack found (need libarrow + libarrow_dataset + libarrow_acero)") + endif() + if (PAIMON_ARROW_DATASET_LIB) # paimon_parquet_file_format depends on Arrow Dataset symbols. # Force-link it only when arrow_dataset is available. From 4a6ddee0c5455e597777f18f7ccfa5811a02272f Mon Sep 17 00:00:00 2001 From: xylaaaaa <2392805527@qq.com> Date: Sat, 28 Feb 2026 13:38:30 +0800 Subject: [PATCH 2/2] [chore](paimon-cpp) reuse Doris Arrow stack for paimon-cpp build --- be/CMakeLists.txt | 134 +++----------------------------------- be/cmake/thirdparty.cmake | 2 + 2 files changed, 10 insertions(+), 126 deletions(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index c0bc3a4d8c7b05..05153524f22600 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -625,122 +625,16 @@ if (BUILD_BENCHMARK) endif() set(PAIMON_FACTORY_REGISTRY_LIBS) -set(PAIMON_ARROW_CORE_LIB) -set(PAIMON_ARROW_FILESYSTEM_LIB) -set(PAIMON_ARROW_DATASET_LIB) -set(PAIMON_ARROW_ACERO_LIB) if (ENABLE_PAIMON_CPP) - # Select Arrow as one consistent stack (Doris or paimon_deps) to avoid - # mixing different Arrow versions (e.g. Doris core + paimon dataset/acero), - # which causes link/runtime instability. 
- set(_doris_arrow_core) - set(_doris_arrow_dataset) - set(_doris_arrow_acero) - set(_doris_arrow_filesystem) - foreach(_doris_arrow_dir IN ITEMS ${THIRDPARTY_DIR}/lib64 ${THIRDPARTY_DIR}/lib) - if (NOT _doris_arrow_core AND EXISTS "${_doris_arrow_dir}/libarrow.a") - set(_doris_arrow_core "${_doris_arrow_dir}/libarrow.a") - endif() - if (NOT _doris_arrow_dataset AND EXISTS "${_doris_arrow_dir}/libarrow_dataset.a") - set(_doris_arrow_dataset "${_doris_arrow_dir}/libarrow_dataset.a") - endif() - if (NOT _doris_arrow_acero AND EXISTS "${_doris_arrow_dir}/libarrow_acero.a") - set(_doris_arrow_acero "${_doris_arrow_dir}/libarrow_acero.a") - endif() - if (NOT _doris_arrow_filesystem AND EXISTS "${_doris_arrow_dir}/libarrow_filesystem.a") - set(_doris_arrow_filesystem "${_doris_arrow_dir}/libarrow_filesystem.a") - endif() - endforeach() - - set(_paimon_arrow_dir ${THIRDPARTY_DIR}/paimon-cpp/lib64/paimon_deps) - set(_paimon_arrow_core) - set(_paimon_arrow_dataset) - set(_paimon_arrow_acero) - set(_paimon_arrow_filesystem) - if (EXISTS "${_paimon_arrow_dir}/libarrow.a") - set(_paimon_arrow_core "${_paimon_arrow_dir}/libarrow.a") - endif() - if (EXISTS "${_paimon_arrow_dir}/libarrow_dataset.a") - set(_paimon_arrow_dataset "${_paimon_arrow_dir}/libarrow_dataset.a") - endif() - if (EXISTS "${_paimon_arrow_dir}/libarrow_acero.a") - set(_paimon_arrow_acero "${_paimon_arrow_dir}/libarrow_acero.a") - endif() - if (EXISTS "${_paimon_arrow_dir}/libarrow_filesystem.a") - set(_paimon_arrow_filesystem "${_paimon_arrow_dir}/libarrow_filesystem.a") - endif() - - set(_selected_arrow_core) - set(_selected_arrow_dataset) - set(_selected_arrow_acero) - set(_selected_arrow_filesystem) - set(_selected_arrow_stack) - if (_doris_arrow_core AND _doris_arrow_dataset AND _doris_arrow_acero) - set(_selected_arrow_stack "doris") - set(_selected_arrow_core "${_doris_arrow_core}") - set(_selected_arrow_dataset "${_doris_arrow_dataset}") - set(_selected_arrow_acero "${_doris_arrow_acero}") - 
set(_selected_arrow_filesystem "${_doris_arrow_filesystem}") - elseif (_paimon_arrow_core AND _paimon_arrow_dataset AND _paimon_arrow_acero) - set(_selected_arrow_stack "paimon_deps") - set(_selected_arrow_core "${_paimon_arrow_core}") - set(_selected_arrow_dataset "${_paimon_arrow_dataset}") - set(_selected_arrow_acero "${_paimon_arrow_acero}") - set(_selected_arrow_filesystem "${_paimon_arrow_filesystem}") - endif() - - if (_selected_arrow_core) - add_library(paimon_arrow_core STATIC IMPORTED) - set_target_properties(paimon_arrow_core PROPERTIES - IMPORTED_LOCATION ${_selected_arrow_core}) - set(PAIMON_ARROW_CORE_LIB paimon_arrow_core) - endif() - if (_selected_arrow_filesystem) - add_library(paimon_arrow_filesystem STATIC IMPORTED) - set_target_properties(paimon_arrow_filesystem PROPERTIES - IMPORTED_LOCATION ${_selected_arrow_filesystem}) - set(PAIMON_ARROW_FILESYSTEM_LIB paimon_arrow_filesystem) - endif() - if (_selected_arrow_dataset) - add_library(paimon_arrow_dataset STATIC IMPORTED) - set_target_properties(paimon_arrow_dataset PROPERTIES - IMPORTED_LOCATION ${_selected_arrow_dataset}) - set(PAIMON_ARROW_DATASET_LIB paimon_arrow_dataset) - endif() - if (_selected_arrow_acero) - add_library(paimon_arrow_acero STATIC IMPORTED) - set_target_properties(paimon_arrow_acero PROPERTIES - IMPORTED_LOCATION ${_selected_arrow_acero}) - set(PAIMON_ARROW_ACERO_LIB paimon_arrow_acero) - endif() + # Doris Arrow is now built with COMPUTE/DATASET/ACERO/FILESYSTEM, + # so arrow, arrow_dataset, arrow_acero are all in COMMON_THIRDPARTY via + # thirdparty.cmake. paimon-cpp reuses the same Arrow (no paimon_deps). + # No dual-stack selection needed — single Arrow for everything. - if (_selected_arrow_stack) - message(STATUS "Paimon C++: using ${_selected_arrow_stack} Arrow stack") - if (_selected_arrow_stack STREQUAL "paimon_deps") - # paimon's libarrow.a is a superset of Doris's libarrow.a - # (COMPUTE/FILESYSTEM/DATASET=ON vs OFF), same Arrow 17.0.0.
- # Remove Doris's arrow from COMMON_THIRDPARTY to avoid duplicate - # symbols (3698 overlapping globals), which cause SIGSEGV when - # libarrow_dataset.a resolves arrow core calls to the wrong copy. - # Doris's arrow_flight / arrow_flight_sql remain – they only - # depend on arrow core ABI, which paimon's libarrow.a provides. - list(REMOVE_ITEM COMMON_THIRDPARTY arrow) - message(STATUS "Paimon C++: removed Doris 'arrow' from COMMON_THIRDPARTY to avoid duplicate libarrow.a") - endif() - else() - message(STATUS "Paimon C++: no complete Arrow stack found (need libarrow + libarrow_dataset + libarrow_acero)") - endif() - - if (PAIMON_ARROW_DATASET_LIB) - # paimon_parquet_file_format depends on Arrow Dataset symbols. - # Force-link it only when arrow_dataset is available. - set(PAIMON_FACTORY_REGISTRY_LIBS - paimon_parquet_file_format - ) - list(REMOVE_ITEM COMMON_THIRDPARTY ${PAIMON_FACTORY_REGISTRY_LIBS}) - else() - message(STATUS "Paimon C++: libarrow_dataset.a not found, keep paimon_parquet_file_format as regular static lib") - endif() + # paimon_parquet_file_format depends on Arrow Dataset symbols. + # Force-link it with --whole-archive so its factory registration runs. 
+ set(PAIMON_FACTORY_REGISTRY_LIBS paimon_parquet_file_format) + list(REMOVE_ITEM COMMON_THIRDPARTY ${PAIMON_FACTORY_REGISTRY_LIBS}) endif() set(DORIS_DEPENDENCIES @@ -767,18 +661,6 @@ if (ENABLE_PAIMON_CPP) ${PAIMON_FACTORY_REGISTRY_LIBS} -Wl,--no-whole-archive) endif() - if (PAIMON_ARROW_CORE_LIB) - set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${PAIMON_ARROW_CORE_LIB}) - endif() - if (PAIMON_ARROW_FILESYSTEM_LIB) - set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${PAIMON_ARROW_FILESYSTEM_LIB}) - endif() - if (PAIMON_ARROW_DATASET_LIB) - set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${PAIMON_ARROW_DATASET_LIB}) - endif() - if (PAIMON_ARROW_ACERO_LIB) - set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${PAIMON_ARROW_ACERO_LIB}) - endif() # paimon-cpp internal dependencies (renamed with _paimon suffix) # These must come after paimon libraries to resolve symbols. diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake index 441ebe8dc738ba..227f81411f1322 100644 --- a/be/cmake/thirdparty.cmake +++ b/be/cmake/thirdparty.cmake @@ -106,6 +106,8 @@ add_thirdparty(zstd LIB64) add_thirdparty(arrow LIB64) add_thirdparty(arrow_flight LIB64) add_thirdparty(arrow_flight_sql LIB64) +add_thirdparty(arrow_dataset LIB64) +add_thirdparty(arrow_acero LIB64) add_thirdparty(parquet LIB64) add_thirdparty(brpc LIB64) add_thirdparty(rocksdb)