From cc8cfae715db5854183ec397975b54e5a02c45b6 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Sat, 8 Nov 2025 11:57:38 +0800
Subject: [PATCH 01/23] support gpu direct rdma

1) recv all data on gpu first
2) the gpu block is alloced from a gpu block pool
3) brpc header, meta and body will be copied from gpu to cpu to process.
4) To decrease the d2h counts, we will prefetch 512B to memory

Co-authored-by: sunce4t <847480001@qq.com>
---
 BUILD.bazel                            |  13 +
 bazel/config/BUILD.bazel               |   8 +-
 src/brpc/policy/baidu_rpc_protocol.cpp | 142 +++++++-
 src/brpc/rdma/rdma_endpoint.cpp        | 131 +++++--
 src/brpc/rdma/rdma_endpoint.h          |   4 +-
 src/brpc/rdma/rdma_helper.cpp          |  18 +-
 src/brpc/rdma/rdma_helper.h            |   3 +
 src/butil/gpu/gpu_block_pool.cpp       | 450 +++++++++++++++++++++++++
 src/butil/gpu/gpu_block_pool.h         | 201 +++++++++++
 src/butil/iobuf.cpp                    | 109 ++++++
 src/butil/iobuf.h                      |   9 +-
 11 files changed, 1056 insertions(+), 32 deletions(-)
 create mode 100644 src/butil/gpu/gpu_block_pool.cpp
 create mode 100644 src/butil/gpu/gpu_block_pool.h

diff --git a/BUILD.bazel b/BUILD.bazel
index 138e416b10..e1a853d2db 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -54,6 +54,9 @@ COPTS = [
 }) + select({
     "//bazel/config:brpc_with_asan": ["-fsanitize=address"],
     "//conditions:default": [""],
+}) + select({
+    ":brpc_with_gdr": ["-DBRPC_WITH_GDR=1"],
+    "//conditions:default": [""],
 }) + select({
     "//bazel/config:brpc_with_no_pthread_mutex_hook": ["-DNO_PTHREAD_MUTEX_HOOK"],
     "//conditions:default": [""],
@@ -232,6 +235,7 @@ BUTIL_SRCS = [
     "src/butil/iobuf.cpp",
     "src/butil/single_iobuf.cpp",
     "src/butil/iobuf_profiler.cpp",
+    "src/butil/gpu/gpu_block_pool.cpp",
     "src/butil/binary_printer.cpp",
     "src/butil/recordio.cc",
     "src/butil/popen.cpp",
@@ -337,6 +341,9 @@ cc_library(
             "-DUNIT_TEST",
         ],
         "//conditions:default": [],
+    }) + select({
+        ":brpc_with_gdr": ["@local_config_cuda//cuda:cuda_headers"],
+        "//conditions:default": [],
     }),
     includes = [
         "src/",
@@ -356,6 +363,9 @@ cc_library(
     }) + select({
         "//bazel/config:brpc_with_boringssl": ["@boringssl//:ssl", "@boringssl//:crypto"],
         "//conditions:default": ["@openssl//:ssl", "@openssl//:crypto"],
+    }) + select({
+        ":brpc_with_gdr": ["@local_config_cuda//cuda:cuda_headers"],
+        "//conditions:default": [],
     }),
 )
 
@@ -573,6 +583,9 @@ cc_library(
             "@org_apache_thrift//:thrift",
         ],
         "//conditions:default": [],
+    }) + select({
+        ":brpc_with_gdr": ["@local_config_cuda//cuda:cuda_headers"],
+        "//conditions:default": [],
     }),
 )
 
diff --git a/bazel/config/BUILD.bazel b/bazel/config/BUILD.bazel
index d08ea2ec23..06376cf85c 100644
--- a/bazel/config/BUILD.bazel
+++ b/bazel/config/BUILD.bazel
@@ -104,6 +104,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "brpc_with_gdr",
+    define_values = {"BRPC_WITH_GDR": "true"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "brpc_with_boringssl",
     define_values = {"BRPC_WITH_BORINGSSL": "true"},
@@ -148,4 +154,4 @@ config_setting(
     name = "with_babylon_counter",
     define_values = {"with_babylon_counter": "true"},
     visibility = ["//visibility:public"],
-)
\ No newline at end of file
+)
diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index 5adf77b2c5..2e37bae3a3 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -23,6 +23,7 @@
 #include <google/protobuf/text_format.h>
 #include "butil/logging.h"                       // LOG()
 #include "butil/iobuf.h"                         // butil::IOBuf
+#include "butil/gpu/gpu_block_pool.h"
 #include "butil/raw_pack.h"                      // RawPacker RawUnpacker
 #include "butil/memory/scope_guard.h"
 #include "json2pb/json_to_pb.h"
@@ -69,6 +70,10 @@ DECLARE_bool(pb_enum_as_number);
 // 5. Not supported: chunk_info
 
 // Pack header into `buf'
+
+const int header_size = 12;
+const int prefetch_d2h_size = 512;
+
 inline void PackRpcHeader(char* rpc_header, uint32_t meta_size, int payload_size) {
     uint32_t* dummy = (uint32_t*)rpc_header;  // suppress strict-alias warning
     *dummy = *(uint32_t*)"PRPC";
@@ -101,31 +106,81 @@ static void SerializeRpcHeaderAndMeta(
 
 ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
                             bool /*read_eof*/, const void*) {
+
     char header_buf[12];
-    const size_t n = source->copy_to(header_buf, sizeof(header_buf));
+    size_t n = 0;
+#if BRPC_WITH_GDR
+    void* prefetch_d2h_data = NULL;
+
+    uint64_t data_meta = source->get_first_data_meta();
+    bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+    butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
+    if (is_gpu_memory) {
+        prefetch_d2h_data = host_allocator->AllocateRaw(prefetch_d2h_size);
+        if (prefetch_d2h_data == NULL) {
+            LOG(FATAL) << "alloc host data failed!!!";
+        }
+        n = source->copy_from_gpu(prefetch_d2h_data, prefetch_d2h_size);
+        size_t copy_size = n > 12 ? 12 : n;
+        memcpy(header_buf, prefetch_d2h_data, copy_size);
+    } else {
+        n = source->copy_to(header_buf, sizeof(header_buf));
+    }
+#else
+    n = source->copy_to(header_buf, sizeof(header_buf));
+#endif  // BRPC_WITH_GDR
     if (n >= 4) {
         void* dummy = header_buf;
         if (*(const uint32_t*)dummy != *(const uint32_t*)"PRPC") {
+#if BRPC_WITH_GDR
+            if (is_gpu_memory) {
+                host_allocator->DeallocateRaw(prefetch_d2h_data);
+            }
+#endif  // BRPC_WITH_GDR
             return MakeParseError(PARSE_ERROR_TRY_OTHERS);
         }
     } else {
         if (memcmp(header_buf, "PRPC", n) != 0) {
+#if BRPC_WITH_GDR
+            if (is_gpu_memory) {
+                host_allocator->DeallocateRaw(prefetch_d2h_data);
+            }
+#endif  // BRPC_WITH_GDR
             return MakeParseError(PARSE_ERROR_TRY_OTHERS);
         }
     }
     if (n < sizeof(header_buf)) {
+#if BRPC_WITH_GDR
+        if (is_gpu_memory) {
+            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        }
+#endif  // BRPC_WITH_GDR
         return MakeParseError(PARSE_ERROR_NOT_ENOUGH_DATA);
     }
     uint32_t body_size;
     uint32_t meta_size;
     butil::RawUnpacker(header_buf + 4).unpack32(body_size).unpack32(meta_size);
+    if (body_size > 128 * 1024 * 1024) {
+        LOG(ERROR) << "body_size=" << body_size << " from "
+                   << socket->remote_side() << " is too large";
+    }
     if (body_size > FLAGS_max_body_size) {
         // We need this log to report the body_size to give users some clues
         // which is not printed in InputMessenger.
         LOG(ERROR) << "body_size=" << body_size << " from "
                    << socket->remote_side() << " is too large";
+#if BRPC_WITH_GDR
+        if (is_gpu_memory) {
+            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        }
+#endif  // BRPC_WITH_GDR
         return MakeParseError(PARSE_ERROR_TOO_BIG_DATA);
     } else if (source->length() < sizeof(header_buf) + body_size) {
+#if BRPC_WITH_GDR
+        if (is_gpu_memory) {
+            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        }
+#endif  // BRPC_WITH_GDR
         return MakeParseError(PARSE_ERROR_NOT_ENOUGH_DATA);
     }
     if (meta_size > body_size) {
@@ -133,12 +188,34 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
                    << body_size;
         // Pop the message
         source->pop_front(sizeof(header_buf) + body_size);
+#if BRPC_WITH_GDR
+        if (is_gpu_memory) {
+            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        }
+#endif  // BRPC_WITH_GDR
         return MakeParseError(PARSE_ERROR_TRY_OTHERS);
     }
     source->pop_front(sizeof(header_buf));
     MostCommonMessage* msg = MostCommonMessage::Get();
+#if BRPC_WITH_GDR
+    if (is_gpu_memory) {
+        if (header_size + meta_size <= n) {
+            auto deleter = [host_allocator, prefetch_d2h_data](void* data) { host_allocator->DeallocateRaw(prefetch_d2h_data); };
+            msg->meta.append_user_data_with_meta((char*)prefetch_d2h_data + header_size, meta_size, deleter, n);
+            source->pop_front(meta_size);
+        } else {
+            host_allocator->DeallocateRaw(prefetch_d2h_data);
+            source->cutn_from_gpu(&msg->meta, meta_size);
+        }
+        source->cutn(&msg->payload, body_size - meta_size);
+    } else {
+        source->cutn(&msg->meta, meta_size);
+        source->cutn(&msg->payload, body_size - meta_size);
+    }
+#else
     source->cutn(&msg->meta, meta_size);
     source->cutn(&msg->payload, body_size - meta_size);
+#endif  // BRPC_WITH_GDR
     return MakeMessage(msg);
 }
 
@@ -793,7 +870,29 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
 
             butil::IOBuf req_buf;
             int body_without_attachment_size = req_size - meta.attachment_size();
+#if BRPC_WITH_GDR
+            int meta_size = msg->meta.size();
+            uint64_t data_meta = msg->payload.get_first_data_meta();
+            bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+            if(is_gpu_memory) {
+                int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
+                if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
+                    void* data = msg->meta.get_first_data_ptr();
+                    if (data == nullptr) {
+                        LOG(FATAL) << "illegal data!!!";
+                    }
+                    req_buf.append((char*)data + meta_size, body_without_attachment_size);
+                    msg->payload.pop_front(body_without_attachment_size);
+                } else {
+                    msg->payload.cutn_from_gpu(&req_buf, body_without_attachment_size);
+                }
+            }
+            else {
+                msg->payload.cutn(&req_buf, body_without_attachment_size);
+            }
+#else
             msg->payload.cutn(&req_buf, body_without_attachment_size);
+#endif  // BRPC_WITH_GDR
             if (meta.attachment_size() > 0) {
                 cntl->request_attachment().swap(msg->payload);
             }
@@ -963,8 +1062,14 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
         } 
         // Parse response message iff error code from meta is 0
         butil::IOBuf res_buf;
+        int meta_size = msg->meta.size();
         const int res_size = msg->payload.length();
         butil::IOBuf* res_buf_ptr = &msg->payload;
+
+#if BRPC_WITH_GDR
+        uint64_t data_meta = msg->payload.get_first_data_meta();
+        bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+#endif  // BRPC_WITH_GDR
         if (meta.has_attachment_size()) {
             if (meta.attachment_size() > res_size) {
                 cntl->SetFailed(
@@ -973,9 +1078,44 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
                 break;
             }
             int body_without_attachment_size = res_size - meta.attachment_size();
+
+#if BRPC_WITH_GDR
+            if(is_gpu_memory) {
+                int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
+                if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
+                    void* data = msg->meta.get_first_data_ptr();
+                    if (data == nullptr) {
+                        LOG(FATAL) << "illegal data!!!";
+                    }
+                    res_buf.append((char*)data + meta_size, body_without_attachment_size);
+                    msg->payload.pop_front(body_without_attachment_size);
+                } else {
+                    msg->payload.cutn_from_gpu(&res_buf, body_without_attachment_size);
+                }
+            }
+            else {
+                msg->payload.cutn(&res_buf, body_without_attachment_size);
+            }
+#else
             msg->payload.cutn(&res_buf, body_without_attachment_size);
+#endif  // BRPC_WITH_GDR
             res_buf_ptr = &res_buf;
             cntl->response_attachment().swap(msg->payload);
+#if BRPC_WITH_GDR
+        } else if(is_gpu_memory) {
+            int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
+            if (header_size + meta_size + res_size <= real_prefetch_d2h_size) {
+                void* data = msg->meta.get_first_data_ptr();
+                if (data == nullptr) {
+                    LOG(FATAL) << "illegal data!!!";
+                }
+                res_buf.append((char*)data + meta_size, res_size);
+                msg->payload.pop_front(res_size);
+            } else {
+                msg->payload.cutn_from_gpu(&res_buf, res_size);
+            }
+            res_buf_ptr = &res_buf;
+#endif  // BRPC_WITH_GDR
         }
 
         ContentType content_type = meta.content_type();
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 1d502a98f7..fc6ce20928 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -20,6 +20,7 @@
 #include <gflags/gflags.h>
 #include "butil/fd_utility.h"
 #include "butil/logging.h"                   // CHECK, LOG
+#include "butil/gpu/gpu_block_pool.h"
 #include "butil/sys_byteorder.h"             // HostToNet,NetToHost
 #include "bthread/bthread.h"
 #include "brpc/errno.pb.h"
@@ -30,6 +31,7 @@
 #include "brpc/rdma/block_pool.h"
 #include "brpc/rdma/rdma_helper.h"
 #include "brpc/rdma/rdma_endpoint.h"
+#include "brpc/traceprintf.h"
 
 DECLARE_int32(task_group_ntags);
 
@@ -48,15 +50,15 @@ extern int (*IbvQueryQp)(ibv_qp*, ibv_qp_attr*, ibv_qp_attr_mask, ibv_qp_init_at
 extern int (*IbvDestroyQp)(ibv_qp*);
 extern bool g_skip_rdma_init;
 
-DEFINE_int32(rdma_sq_size, 128, "SQ size for RDMA");
-DEFINE_int32(rdma_rq_size, 128, "RQ size for RDMA");
+DEFINE_int32(rdma_sq_size, 64, "SQ size for RDMA");
+DEFINE_int32(rdma_rq_size, 64, "RQ size for RDMA");
 DEFINE_bool(rdma_recv_zerocopy, true, "Enable zerocopy for receive side");
 DEFINE_int32(rdma_zerocopy_min_size, 512, "The minimal size for receive zerocopy");
 DEFINE_string(rdma_recv_block_type, "default", "Default size type for recv WR: "
               "default(8KB - 32B)/large(64KB - 32B)/huge(2MB - 32B)");
 DEFINE_int32(rdma_cqe_poll_once, 32, "The maximum of cqe number polled once.");
 DEFINE_int32(rdma_prepared_qp_size, 128, "SQ and RQ size for prepared QP.");
-DEFINE_int32(rdma_prepared_qp_cnt, 1024, "Initial count of prepared QP.");
+DEFINE_int32(rdma_prepared_qp_cnt, 256, "Initial count of prepared QP.");
 DEFINE_bool(rdma_trace_verbose, false, "Print log message verbosely");
 BRPC_VALIDATE_GFLAG(rdma_trace_verbose, brpc::PassValidate);
 DEFINE_bool(rdma_use_polling, false, "Use polling mode for RDMA.");
@@ -98,6 +100,7 @@ static const uint16_t MIN_QP_SIZE = 16;
 static const uint16_t MAX_QP_SIZE = 4096;
 static const uint16_t MIN_BLOCK_SIZE = 1024;
 static const uint32_t ACK_MSG_RDMA_OK = 0x1;
+static const uint64_t FIXED_ACK_WR_ID = 1;
 
 static butil::Mutex* g_rdma_resource_mutex = NULL;
 static RdmaResource* g_rdma_resource_list = NULL;
@@ -191,6 +194,7 @@ RdmaEndpoint::RdmaEndpoint(Socket* s)
     , _remote_window_capacity(0)
     , _window_size(0)
     , _new_rq_wrs(0)
+    , _remote_recv_window(0)
 {
     if (_sq_size < MIN_QP_SIZE) {
         _sq_size = MIN_QP_SIZE;
@@ -208,6 +212,7 @@ RdmaEndpoint::RdmaEndpoint(Socket* s)
 }
 
 RdmaEndpoint::~RdmaEndpoint() {
+    // LOG(INFO) << _window_size << " " << _remote_recv_window << " " <<  _sq_unsignaled;
     Reset();
     bthread::butex_destroy(_read_butex);
 }
@@ -231,6 +236,7 @@ void RdmaEndpoint::Reset() {
     _new_rq_wrs = 0;
     _sq_sent = 0;
     _rq_received = 0;
+    _remote_recv_window.store(0, butil::memory_order_relaxed);
 }
 
 void RdmaConnect::StartConnect(const Socket* socket,
@@ -514,7 +520,7 @@ void* RdmaEndpoint::ProcessHandshakeAtClient(void* arg) {
         ep->_remote_window_capacity = 
             std::min(ep->_rq_size, remote_msg.sq_size) - RESERVED_WR_NUM,
         ep->_window_size.store(ep->_local_window_capacity, butil::memory_order_relaxed);
-
+        ep->_remote_recv_window.store(ep->_remote_window_capacity, butil::memory_order_relaxed);
         ep->_state = C_BRINGUP_QP;
         if (ep->BringUpQp(remote_msg.lid, remote_msg.gid, remote_msg.qp_num) < 0) {
             LOG(WARNING) << "Fail to bringup QP, fallback to tcp:" << s->description();
@@ -622,7 +628,7 @@ void* RdmaEndpoint::ProcessHandshakeAtServer(void* arg) {
         ep->_remote_window_capacity = 
             std::min(ep->_rq_size, remote_msg.sq_size) - RESERVED_WR_NUM,
         ep->_window_size.store(ep->_local_window_capacity, butil::memory_order_relaxed);
-
+        ep->_remote_recv_window.store(ep->_remote_window_capacity, butil::memory_order_relaxed);
         ep->_state = S_ALLOC_QPCQ;
         if (ep->AllocateResources() < 0) {
             LOG(WARNING) << "Fail to allocate rdma resources, fallback to tcp:"
@@ -716,7 +722,7 @@ bool RdmaEndpoint::IsWritable() const {
         return false;
     }
 
-    return _window_size.load(butil::memory_order_relaxed) > 0;
+    return _window_size.load(butil::memory_order_relaxed) > 0 && _remote_recv_window.load(butil::memory_order_relaxed) > 0;
 }
 
 // RdmaIOBuf inherits from IOBuf to provide a new function.
@@ -787,12 +793,14 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
     size_t total_len = 0;
     size_t current = 0;
     uint32_t window = 0;
+    uint32_t recv_window = 0;
     ibv_send_wr wr;
     int max_sge = GetRdmaMaxSge();
     ibv_sge sglist[max_sge];
     while (current < ndata) {
-        window = _window_size.load(butil::memory_order_relaxed);
-        if (window == 0) {
+        window = _window_size.load(butil::memory_order_acquire);
+        recv_window = _remote_recv_window.load(butil::memory_order_acquire);
+        if (window == 0 || recv_window == 0) {
             if (total_len > 0) {
                 break;
             } else {
@@ -883,6 +891,8 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
             // We use other way to guarantee the Send Queue is not full.
             // So we just consider this error as an unrecoverable error.
             LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
+                         << ", window_size:" <<  _window_size
+                         << ", emote_recv_window: " << _remote_recv_window
                          << ", window=" << window
                          << ", sq_current=" << _sq_current;
             errno = err;
@@ -898,7 +908,8 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
         // Because there is at most one thread can enter this function for each
         // Socket, and the other thread of HandleCompletion can only add this
         // counter.
-        _window_size.fetch_sub(1, butil::memory_order_relaxed);
+        _window_size.fetch_sub(1, butil::memory_order_release);
+        _remote_recv_window.fetch_sub(1, butil::memory_order_release);
     }
 
     return total_len;
@@ -922,13 +933,16 @@ int RdmaEndpoint::SendImm(uint32_t imm) {
     wr.imm_data = butil::HostToNet32(imm);
     wr.send_flags |= IBV_SEND_SOLICITED;
     wr.send_flags |= IBV_SEND_SIGNALED;
+    wr.wr_id = FIXED_ACK_WR_ID;
 
     ibv_send_wr* bad = NULL;
     int err = ibv_post_send(_resource->qp, &wr, &bad);
     if (err != 0) {
         // We use other way to guarantee the Send Queue is not full.
         // So we just consider this error as an unrecoverable error.
-        LOG(WARNING) << "Fail to ibv_post_send: " << berror(err);
+        LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
+                     << ", window_size:" <<  _window_size
+                     << ", emote_recv_window: " << _remote_recv_window;
         return -1;
     }
     return 0;
@@ -936,17 +950,40 @@ int RdmaEndpoint::SendImm(uint32_t imm) {
 
 ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
     bool zerocopy = FLAGS_rdma_recv_zerocopy;
+    //LOG(INFO) << "Handle Completion: " << wc.opcode;
     switch (wc.opcode) {
     case IBV_WC_SEND: {  // send completion
-        // Do nothing
+        if (wc.wr_id == 0) {
+            uint16_t wnd_to_update = _local_window_capacity / 4;
+            uint32_t num = wnd_to_update;
+            while(num > 0) {
+                _sbuf[_sq_sent++].clear();
+                if (_sq_sent == _sq_size - RESERVED_WR_NUM) {
+                    _sq_sent = 0;
+                }
+                --num;
+            }
+            butil::subtle::MemoryBarrier();
+            uint32_t wnd_thresh = _local_window_capacity / 8;
+            _window_size.fetch_add(wnd_to_update, butil::memory_order_release);
+            //if ((_remote_recv_window.load(butil::memory_order_relaxed) >= wnd_thresh)) {
+             // Do not wake up writing thread right after _window_size > 0.
+            // Otherwise the writing thread may switch to background too quickly.
+            _socket->WakeAsEpollOut();
+        //}
+        }
         break;
     }
     case IBV_WC_RECV: {  // recv completion
         // Please note that only the first wc.byte_len bytes is valid
         if (wc.byte_len > 0) {
+#if BRPC_WITH_GDR
+            zerocopy = true;
+#else
             if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
                 zerocopy = false;
             }
+#endif // BRPC_WITH_GDR
             CHECK(_state != FALLBACK_TCP);
             if (zerocopy) {
                 butil::IOBuf tmp;
@@ -958,24 +995,10 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
             }
         }
         if (wc.imm_data > 0) {
-            // Clear sbuf here because we ignore event wakeup for send completions
             uint32_t acks = butil::NetToHost32(wc.imm_data);
-            uint32_t num = acks;
-            while (num > 0) {
-                _sbuf[_sq_sent++].clear();
-                if (_sq_sent == _sq_size - RESERVED_WR_NUM) {
-                    _sq_sent = 0;
-                }
-                --num;
-            }
-            butil::subtle::MemoryBarrier();
-
-            // Update window
             uint32_t wnd_thresh = _local_window_capacity / 8;
-            if (_window_size.fetch_add(acks, butil::memory_order_relaxed) >= wnd_thresh
-                    || acks >= wnd_thresh) {
-                // Do not wake up writing thread right after _window_size > 0.
-                // Otherwise the writing thread may switch to background too quickly.
+            if(_remote_recv_window.fetch_add(acks, butil::memory_order_release) >= wnd_thresh ||
+                    acks >= wnd_thresh) {
                 _socket->WakeAsEpollOut();
             }
         }
@@ -1017,11 +1040,43 @@ int RdmaEndpoint::DoPostRecv(void* block, size_t block_size) {
     return 0;
 }
 
+int RdmaEndpoint::DoPostRecvGDR(void* block, size_t block_size, uint32_t lkey) {
+    ibv_recv_wr wr;
+    memset(&wr, 0, sizeof(wr));
+    ibv_sge sge;
+    sge.addr = (uint64_t)block;
+    sge.length = block_size;
+    sge.lkey = lkey;
+    wr.num_sge = 1;
+    wr.sg_list = &sge;
+    //LOG(INFO) << "POST recv: addr=0x" << std::hex << sge.addr
+    //      << std::dec  << " length=0x" << sge.length
+    //      << " lkey=0x" << sge.lkey;
+    //LOG(INFO) << block << " " << _device_allocator->get_lkey();
+    ibv_recv_wr* bad = NULL;
+    int err = ibv_post_recv(_resource->qp, &wr, &bad);
+    if (err != 0) {
+        LOG(WARNING) << "Fail to ibv_post_recv: " << berror(err);
+        return -1;
+    }
+    return 0;
+}
+
 int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
     // We do the post repeatedly from the _rbuf[_rq_received].
     while (num > 0) {
+        uint32_t lkey = 0;
         if (zerocopy) {
             _rbuf[_rq_received].clear();
+
+#if BRPC_WITH_GDR
+            butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
+            void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
+            auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
+            lkey = device_allocator->get_lkey(device_ptr);
+            _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , lkey);
+            _rbuf_data[_rq_received] = device_ptr;
+#else
             butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
                     g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
             int size = 0;
@@ -1032,11 +1087,20 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
             } else {
                 CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
             }
+#endif  // if BRPC_WITH_GDR
         }
+#if BRPC_WITH_GDR
+        if (DoPostRecvGDR(_rbuf_data[_rq_received], g_rdma_recv_block_size, lkey) < 0) {
+            _rbuf[_rq_received].clear();
+            return -1;
+        }
+#else
         if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
             _rbuf[_rq_received].clear();
             return -1;
         }
+#endif  // if BRPC_WITH_GDR
+
         --num;
         ++_rq_received;
         if (_rq_received == _rq_size) {
@@ -1504,6 +1568,10 @@ void RdmaEndpoint::DebugInfo(std::ostream& os) const {
 }
 
 int RdmaEndpoint::GlobalInitialize() {
+#if BRPC_WITH_GDR
+    LOG(INFO) << ", gdr_block_size_kb: " << butil::gdr::gdr_block_size_kb;
+    g_rdma_recv_block_size = butil::gdr::gdr_block_size_kb * 1024 - IOBUF_BLOCK_HEADER_LEN;
+#else
     if (FLAGS_rdma_recv_block_type == "default") {
         g_rdma_recv_block_size = GetBlockSize(0) - IOBUF_BLOCK_HEADER_LEN;
     } else if (FLAGS_rdma_recv_block_type == "large") {
@@ -1514,6 +1582,15 @@ int RdmaEndpoint::GlobalInitialize() {
         errno = EINVAL;
         return -1;
     }
+#endif // BRPC_WITH_GDR
+
+    LOG(INFO) << "rdma_use_polling :" << FLAGS_rdma_use_polling
+      << ", rdma_poller_num : " << FLAGS_rdma_poller_num
+      << ", rdma_poller_yield : " << FLAGS_rdma_poller_yield
+      << ", rdma_sq_size: " << FLAGS_rdma_sq_size
+      << ", rdma_rq_size: " << FLAGS_rdma_rq_size
+      << ", rdma_zerocopy_min_size: " << FLAGS_rdma_zerocopy_min_size
+      << ", g_rdma_recv_block_size: " << g_rdma_recv_block_size;
 
     g_rdma_resource_mutex = new butil::Mutex;
     for (int i = 0; i < FLAGS_rdma_prepared_qp_cnt; ++i) {
diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index de7cd5f6d8..6f0c917e9e 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -31,7 +31,6 @@
 #include "butil/containers/mpsc_queue.h"
 #include "brpc/socket.h"
 
-
 namespace brpc {
 class Socket;
 namespace rdma {
@@ -173,6 +172,8 @@ friend class brpc::Socket;
     //     -1:  failed, errno set
     int DoPostRecv(void* block, size_t block_size);
 
+
+    int DoPostRecvGDR(void* block, size_t block_size, uint32_t lkey);
     // Read at most len bytes from fd in _socket to data
     // wait for _read_butex if encounter EAGAIN
     // return -1 if encounter other errno (including EOF)
@@ -262,6 +263,7 @@ friend class brpc::Socket;
     // The number of new WRs posted in the local Recv Queue
     butil::atomic<uint16_t> _new_rq_wrs;
 
+    butil::atomic<uint16_t> _remote_recv_window;
     // butex for inform read events on TCP fd during handshake
     butil::atomic<int> *_read_butex;
 
diff --git a/src/brpc/rdma/rdma_helper.cpp b/src/brpc/rdma/rdma_helper.cpp
index 9bad33750c..1b6ad85ae0 100644
--- a/src/brpc/rdma/rdma_helper.cpp
+++ b/src/brpc/rdma/rdma_helper.cpp
@@ -25,6 +25,7 @@
 #include "butil/containers/flat_map.h"            // butil::FlatMap
 #include "butil/fd_guard.h"
 #include "butil/fd_utility.h"                     // butil::make_non_blocking
+#include "butil/gpu/gpu_block_pool.h"
 #include "butil/logging.h"
 #include "brpc/socket.h"
 #include "brpc/rdma/block_pool.h"
@@ -84,6 +85,8 @@ static uint16_t g_lid;
 static int g_max_sge = 0;
 static uint8_t g_port_num = 1;
 
+static int g_gpu_index = 0;
+
 static int g_comp_vector_index = 0;
 
 butil::atomic<bool> g_rdma_available(false);
@@ -93,7 +96,7 @@ DEFINE_string(rdma_device, "", "The name of the HCA device used "
                                "(Empty means using the first active device)");
 DEFINE_int32(rdma_port, 1, "The port number to use. For RoCE, it is always 1.");
 DEFINE_int32(rdma_gid_index, -1, "The GID index to use. -1 means using the last one.");
-
+DEFINE_int32(gpu_index, 0, "The GPU device index to use.In GDR, we suggest to use the GPU that is connected to the same PCIe switch with rdma devices");
 // static const size_t SYSFS_SIZE = 4096;
 static ibv_device** g_devices = NULL;
 static ibv_context* g_context = NULL;
@@ -477,6 +480,7 @@ static void GlobalRdmaInitializeOrDieImpl() {
         ExitWithError();
     }
 
+    g_gpu_index = FLAGS_gpu_index;
     // Find the first active port
     g_port_num = FLAGS_rdma_port;
     int available_devices;
@@ -552,6 +556,13 @@ static void GlobalRdmaInitializeOrDieImpl() {
         ExitWithError();
     }
 
+#if BRPC_WITH_GDR
+    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
+        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
+        ExitWithError();
+    }
+#endif  // if BRPC_WITH_GDR
+
     if (RdmaEndpoint::GlobalInitialize() < 0) {
         LOG(ERROR) << "rdma_recv_block_type incorrect "
                    << "(valid value: default/large/huge)";
@@ -679,6 +690,11 @@ uint8_t GetRdmaPortNum() {
     return g_port_num;
 }
 
+int GetGPUIndex() {
+    return g_gpu_index;
+}
+
+
 bool IsRdmaAvailable() {
     return g_rdma_available.load(butil::memory_order_acquire);
 }
diff --git a/src/brpc/rdma/rdma_helper.h b/src/brpc/rdma/rdma_helper.h
index 052763325b..25a93476e7 100644
--- a/src/brpc/rdma/rdma_helper.h
+++ b/src/brpc/rdma/rdma_helper.h
@@ -74,6 +74,9 @@ int GetRdmaCompVector();
 // Return current port number used
 uint8_t GetRdmaPortNum();
 
+// Get GPU index used
+int GetGPUIndex();
+
 // Get max_sge supported by the device
 int GetRdmaMaxSge();
 
diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
new file mode 100644
index 0000000000..dac26f4b5c
--- /dev/null
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -0,0 +1,450 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#if BRPC_WITH_GDR
+
+#include <iostream>
+#include <chrono>
+#include "butil/fast_rand.h"
+#include "gpu_block_pool.h"
+namespace butil {
+namespace gdr {
+
+#define CHECK_CUDA(call)  \
+do {                                                                     \
+    auto _sts = (call);                                                    \
+    if (_sts != cudaSuccess) {                                             \
+        LOG(FATAL) << " cuda error:"                                       \
+        << (cudaGetErrorString(_sts)) << std::string(" at ")    \
+        << __FILE__ << ": " << __LINE__;                        \
+    }                                                                      \
+} while (0);
+
+bool verify_same_context() {
+  static int original_device = -1;
+  static bool first_call = true;
+
+  int current_device;
+  cudaGetDevice(&current_device);
+
+  if (first_call) {
+      original_device = current_device;
+      first_call = false;
+      return true;
+  }
+
+  return (current_device == original_device);
+}
+
+void* get_gpu_mem(int gpu_id, int64_t gpu_mem_size) {
+    CHECK_CUDA(cudaSetDevice(gpu_id));
+    void *d_data;
+
+    LOG(INFO) << "try to alloc " << gpu_mem_size << " bytes from gpu " << gpu_id;
+
+    CHECK_CUDA(cudaMalloc(&d_data, gpu_mem_size));
+    cudaDeviceSynchronize();
+    return (void *)d_data;
+}
+
+void* get_cpu_mem(int gpu_id, int64_t cpu_mem_size) {
+    CHECK_CUDA(cudaSetDevice(gpu_id));
+
+    LOG(INFO) << "try to alloc " << cpu_mem_size << " bytes from gpu " << gpu_id << "on host";
+
+    void* mem = NULL;
+
+    CHECK_CUDA(cudaMallocHost(&mem, cpu_mem_size));
+
+    cudaDeviceSynchronize();
+
+    return mem;
+}
+
+
+BlockPoolAllocators* BlockPoolAllocators::instance_ = nullptr;
+
+BlockPoolAllocators* BlockPoolAllocators::singleton() {
+    static std::mutex mutex;
+    if (instance_ == nullptr) {
+        std::lock_guard<std::mutex> l(mutex);
+        if(instance_ == nullptr) {
+            instance_ = new BlockPoolAllocators();
+            std::atomic_thread_fence(std::memory_order_release);
+        }
+    }
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return instance_;
+}
+
+bool InitGPUBlockPool(int gpu_id, ibv_pd* pd) {
+    BlockPoolAllocators::singleton()->init(gpu_id, pd);
+    return true;
+}
+
+class BlockHeaderList {
+  public:
+    BlockHeaderList() {
+        objects_.reserve(kMaxObjects);
+    }
+    virtual ~BlockHeaderList() {
+        for (size_t i = 0; i < objects_.size(); i++) {
+            delete objects_[i];
+      }
+    }
+
+    BlockHeader* New() {
+        {
+            std::lock_guard<std::mutex> lock(mu_);
+            if (!objects_.empty()) {
+                BlockHeader* result = objects_.back();
+                objects_.pop_back();
+                return result;
+            }
+        }
+        return new BlockHeader;
+    }
+    void Release(BlockHeader* obj) {
+        obj->Reset();
+        {
+            std::lock_guard<std::mutex> lock(mu_);
+            if (objects_.size() < kMaxObjects) {
+                objects_.push_back(obj);
+                return;
+            }
+        }
+        delete obj;
+    }
+
+  private:
+    static const int kMaxObjects = 100000;
+
+    std::mutex mu_;
+    std::vector<BlockHeader*> objects_;
+};
+
+static BlockHeaderList* get_bh_list() {
+    static BlockHeaderList* bh_list = new BlockHeaderList();
+    return bh_list;
+}
+
+
+BlockPoolAllocator::BlockPoolAllocator(int gpuId, bool onGpu, ibv_pd* brpc_pd,
+    size_t blockSize, size_t regionSize) :
+    gpu_id(gpuId)
+    , on_gpu(onGpu)
+    , pd(brpc_pd)
+    , BLOCK_SIZE(std::max(blockSize, sizeof(BlockHeader)))
+    , REGION_SIZE((regionSize / blockSize) * blockSize)  // 对齐到块大小的倍数
+    , freeList(nullptr)
+    , g_region_num(0)
+    , totalAllocated(0)
+    , totalDeallocated(0)
+    , peakUsage(0) {
+    LOG(INFO) << "Memory Pool initialized: block_size=" << BLOCK_SIZE 
+      << ", region_size=" << REGION_SIZE 
+      << ", gpu_id=" << gpu_id << ", on_gpu=" << on_gpu << ", pd=" << pd;
+
+    extendRegion();
+}
+
+BlockPoolAllocator::~BlockPoolAllocator() {
+#ifdef DEBUG
+    printStatistics();
+#endif
+
+    for (int i = 0; i < max_regions; i++) {
+        Region* r = &g_regions[i];
+        if (!r->mr) {
+            return;
+        }
+
+        LOG(INFO) << "try to free " << r->size << " bytes from gpu " << gpu_id << ", on_gpu " << on_gpu;
+        ibv_dereg_mr(r->mr);
+        if (on_gpu) {
+            CHECK_CUDA(cudaFree(reinterpret_cast<void*>(r->start)));
+        } else {
+            CHECK_CUDA(cudaFreeHost(reinterpret_cast<void*>(r->start)));
+        }
+    }
+}
+
+Region* BlockPoolAllocator::GetRegion(const void* buf) {
+    if (!buf) {
+        errno = EINVAL;
+        return NULL;
+    }
+    Region* r = NULL;
+    uintptr_t addr = (uintptr_t)buf;
+    for (int i = 0; i < max_regions; ++i) {
+        if (g_regions[i].aligned_start == 0) {
+            break;
+        }
+        if (addr >= g_regions[i].aligned_start &&
+                addr < g_regions[i].aligned_start + g_regions[i].aligned_size) {
+            r = &g_regions[i];
+            break;
+        }
+    }
+    return r;
+}
+
+uint32_t BlockPoolAllocator::get_lkey(const void* buf) {
+    Region* r = GetRegion(buf);
+    if (!r) {
+        LOG(ERROR) << "can not get a region for buf " << buf;
+        return 0;
+    }
+    return r->lkey;
+}
+
+void* BlockPoolAllocator::AllocateRaw(size_t num_bytes) {
+    if (num_bytes == 0) {
+        return nullptr;
+    }
+    if (num_bytes > BLOCK_SIZE) {
+        LOG(FATAL) << "try to alloc " << num_bytes << " bytes, its bigger than block_size " << BLOCK_SIZE;
+    }
+
+    auto startTime = std::chrono::high_resolution_clock::now();
+
+    std::lock_guard<std::mutex> lock(poolMutex);
+
+    if (!freeList) {
+        extendRegion();
+    }
+
+    BlockHeader* block = freeList;
+    freeList = freeList->next;
+
+    void* addr = block->addr;
+    get_bh_list()->Release(block);
+
+    totalAllocated++;
+    peakUsage = std::max(peakUsage, totalAllocated - totalDeallocated);
+
+    auto endTime = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);
+
+#ifdef DEBUG
+    if (duration.count() > 1000) {  // 如果分配时间超过1微秒
+        LOG(INFO) << "Slow allocation: " << duration.count() << " ns";
+    }
+#endif
+
+    return addr;
+}
+
+void BlockPoolAllocator::DeallocateRaw(void* ptr) {
+    if (!ptr) return;
+
+    std::lock_guard<std::mutex> lock(poolMutex);
+
+    BlockHeader* block = get_bh_list()->New();
+    block->addr = ptr;
+    block->next = freeList;
+    freeList = block;
+
+    totalDeallocated++;
+}
+
+// 获取统计信息
+void BlockPoolAllocator::printStatistics() const {
+    LOG(INFO) << "=== Memory Pool Statistics ===";
+    LOG(INFO) << "Total regions: " << g_region_num
+        << ", Total blocks allocated: " << totalAllocated
+        << ", Total blocks deallocated: " << totalDeallocated
+        << ", Current usage: " << (totalAllocated - totalDeallocated) << " blocks"
+        << ", Peak usage: " << peakUsage << " blocks"
+        << ", Memory efficiency: "
+        << (static_cast<double>(totalAllocated - totalDeallocated) /
+                (g_region_num * (REGION_SIZE / BLOCK_SIZE)) * 100)
+        << "%";
+}
+
+void BlockPoolAllocator::extendRegion() {
+    if (g_region_num == max_regions) {
+        LOG(FATAL) << "Gdr Memory pool reaches max regions";
+        return ;
+    }
+
+    auto startTime = std::chrono::high_resolution_clock::now();
+    void* ptr = nullptr;
+    void* aligned_ptr = nullptr;
+    int alignment = 4096;
+
+    if (on_gpu) {
+        ptr = get_gpu_mem(gpu_id, REGION_SIZE);
+    } else {
+        ptr = get_cpu_mem(gpu_id, REGION_SIZE);
+    }
+
+    aligned_ptr = (void*)(((uintptr_t)ptr + alignment - 1) & ~(alignment - 1));
+
+    int64_t aligned_bytes = REGION_SIZE;
+    if (ptr != aligned_ptr) {
+        uintptr_t region_end = uintptr_t(ptr) + REGION_SIZE;
+        uintptr_t aligned_end_ptr = (region_end + alignment - 1) & ~(alignment - 1);
+        aligned_bytes = uintptr_t(aligned_end_ptr) - uintptr_t(aligned_ptr);
+        LOG(WARNING) << "addr is not aligned with 4096: " << ptr << ", aligned_bytes: " << aligned_bytes
+            << ", region_size: " << REGION_SIZE;
+    }
+
+    LOG(INFO) << "reg_mr for ptr: " << aligned_ptr << ", size:" << aligned_bytes;
+    auto mr = ibv_reg_mr(pd, aligned_ptr, aligned_bytes,
+            IBV_ACCESS_LOCAL_WRITE |
+            IBV_ACCESS_REMOTE_READ |
+            IBV_ACCESS_REMOTE_WRITE);
+    //IBV_ACCESS_RELAXED_ORDERING);
+
+    if (!mr) {
+        LOG(FATAL) << "Failed to register MR: " << strerror(errno)
+            << ", pd " << pd << ", aligned_ptr:" << aligned_ptr;
+    } else {
+        LOG(INFO) << "Success to register MR: "
+            << ", pd " << pd << ", aligned_ptr:" << aligned_ptr;
+    }
+
+    LOG(INFO) << "try to init region, g_region_num:" << g_region_num;
+    size_t blockCount = aligned_bytes / BLOCK_SIZE;
+    Region* region = &g_regions[g_region_num++];
+    region->start = (uintptr_t)ptr;
+    region->aligned_start = (uintptr_t)aligned_ptr;
+    region->mr = mr;
+    region->size = REGION_SIZE;
+    region->aligned_size = aligned_bytes;
+    region->lkey = mr->lkey;
+    region->blockCount = blockCount;
+
+
+    LOG(INFO) << "try to insert list, freeList:" << freeList << ", blockCount:" << blockCount;
+    BlockHeader* lastBlock = nullptr;
+    for (size_t i = 0; i < blockCount; ++i) {
+        BlockHeader* block = get_bh_list()->New();
+        block->addr = reinterpret_cast<void*>(static_cast<char*>(aligned_ptr) + i * BLOCK_SIZE);
+        if (lastBlock != nullptr) {
+            lastBlock->next = block;
+        } else {
+            freeList = block;
+        }
+        lastBlock = block;
+    }
+
+    if (lastBlock) {
+        lastBlock->next = nullptr;
+    }
+
+    auto endTime = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);
+
+    LOG(INFO) << "Extended region #" << g_region_num << ": " << blockCount 
+        << " blocks (" << (REGION_SIZE / (1024 * 1024)) << " MB)" << ", on_gpu " << on_gpu
+        << ", cost " << duration.count()  << " ns";
+}
+
+GPUStreamPool::GPUStreamPool(int gpu_id) :
+                    gpu_id_(gpu_id) {
+    CHECK_CUDA(cudaSetDevice(gpu_id));
+    d2d_streams_.resize(kMaxConcurrent);
+    d2h_streams_.resize(kMaxConcurrent);
+    for (int i = 0; i < kMaxConcurrent; i++) {
+        CHECK_CUDA(cudaStreamCreate(&d2d_streams_[i]));
+        CHECK_CUDA(cudaStreamCreate(&d2h_streams_[i]));
+    }
+    CHECK_CUDA(cudaDeviceSynchronize());
+}
+
+GPUStreamPool::~GPUStreamPool() {
+    CHECK_CUDA(cudaDeviceSynchronize());
+    for (int i = 0; i < kMaxConcurrent; i++) {
+        CHECK_CUDA(cudaStreamDestroy(d2d_streams_[i]));
+        CHECK_CUDA(cudaStreamDestroy(d2h_streams_[i]));
+    }
+}
+
+void GPUStreamPool::fast_d2d(std::vector<void*>& src_list,
+                                  std::vector<int64_t>& length_list,
+                                  void* dst) {
+#ifdef DEBUG
+    if (!verify_same_context()) {
+        LOG(FATAL) << "Context mismatch!";
+        return;
+    }
+#endif
+    int64_t offset = 0;
+    int segs = src_list.size();
+    if (segs == 0) return;
+    if (segs != length_list.size()) {
+        LOG(FATAL) << "src list size is not equal with length list size!!!";
+    }
+
+    int stream_idx = 0;
+    {
+        std::lock_guard<std::mutex> stream_lb_lock(d2d_lb_lock_);
+        d2d_cnt_.fetch_add(1);
+        stream_idx = d2d_cnt_ % kMaxConcurrent;
+    }
+    std::lock_guard<std::mutex> stream_lock(d2d_locks_[stream_idx]);
+    CHECK_CUDA(cudaStreamSynchronize(d2d_streams_[stream_idx]));
+    for (int i = 0; i < segs; i++) {
+        if (length_list[i] == 0) {
+            continue;
+        }
+        CHECK_CUDA(cudaMemcpyAsync(static_cast<char*>(dst) + offset, src_list[i], length_list[i],
+                    cudaMemcpyDeviceToDevice, d2d_streams_[stream_idx]));
+        offset += length_list[i];
+    }
+    CHECK_CUDA(cudaStreamSynchronize(d2d_streams_[stream_idx]));
+}
+
+void GPUStreamPool::fast_d2h(std::vector<void*>& src_list,
+                                  std::vector<int64_t>& length_list,
+                                  void* dst) {
+    if (!verify_same_context()) {
+        LOG(FATAL) << "Context mismatch!";
+        return;
+    }
+    int64_t offset = 0;
+    int segs = src_list.size();
+    if (segs == 0) return;
+    if (segs != length_list.size()) {
+        LOG(FATAL) << "src list size is not equal with length list size!!!";
+    }
+
+    int stream_idx = 0;
+    {
+        std::lock_guard<std::mutex> stream_lb_lock(d2h_lb_lock_);
+        d2h_cnt_.fetch_add(1);
+        stream_idx = d2h_cnt_ % kMaxConcurrent;
+    }
+    std::lock_guard<std::mutex> stream_lock(d2h_locks_[stream_idx]);
+    CHECK_CUDA(cudaStreamSynchronize(d2h_streams_[stream_idx]));
+    for (int i = 0; i < segs; i++) {
+        if (length_list[i] == 0) {
+            continue;
+        }
+        CHECK_CUDA(cudaMemcpyAsync(static_cast<char*>(dst) + offset, src_list[i], length_list[i],
+                    cudaMemcpyDeviceToHost, d2h_streams_[stream_idx]));
+        offset += length_list[i];
+    }
+    CHECK_CUDA(cudaStreamSynchronize(d2h_streams_[stream_idx]));
+}
+
+}
+}
+
+#endif // BRPC_WITH_GDR
diff --git a/src/butil/gpu/gpu_block_pool.h b/src/butil/gpu/gpu_block_pool.h
new file mode 100644
index 0000000000..1d6a444a36
--- /dev/null
+++ b/src/butil/gpu/gpu_block_pool.h
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#ifndef BUTIL_GPU_GPU_BLOCK_POOL_H
+#define BUTIL_GPU_GPU_BLOCK_POOL_H
+
+#if BRPC_WITH_GDR
+
+#include <infiniband/verbs.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <linux/types.h>
+#include <string>
+#include <vector>
+#include <mutex>
+#include <infiniband/verbs.h>
+#include "butil/containers/hash_tables.h"
+#include "butil/logging.h"
+#include <cuda_runtime.h>
+#include "cuda.h"
+
+// #include "gdrapi.h"
+namespace butil {
+namespace gdr {
+
+static int gdr_block_size_kb = [](){
+    int ret = 64;
+    const char* env_var_val = getenv("GDR_BLOCK_SIZE_KB");
+    if (env_var_val == nullptr) {
+        return ret;
+    }
+    ret = std::stoi(env_var_val);
+
+    return ret;
+}();
+
+void* get_gpu_mem(int gpu_id, int64_t gpu_mem_size);
+void* get_cpu_mem(int gpu_id, int64_t cpu_mem_size);
+
+bool InitGPUBlockPool(int gpu_id, ibv_pd* pd);
+
+struct Region {
+    Region() { start = 0; aligned_start = 0;}
+    uintptr_t start;
+    uintptr_t aligned_start;
+
+    size_t size;
+    size_t aligned_size;
+    size_t blockCount;
+    struct ibv_mr *mr {nullptr};
+    uint32_t lkey;
+};
+
+struct BlockHeader {
+    BlockHeader() { addr = nullptr; next = nullptr;}
+    void Reset() { addr = nullptr; next = nullptr; }
+    void* addr;
+    BlockHeader* next;
+};
+
+class BlockPoolAllocator {
+  private:
+    int gpu_id;
+    bool on_gpu;
+    ibv_pd* pd {nullptr};
+
+    const size_t BLOCK_SIZE;
+    const size_t REGION_SIZE;
+
+    BlockHeader* freeList;
+    static constexpr size_t max_regions = 16;
+    int g_region_num {0};
+    Region g_regions[max_regions];
+    std::mutex poolMutex;
+
+    // 统计信息
+    size_t totalAllocated;
+    size_t totalDeallocated;
+    size_t peakUsage;
+
+  public:
+    explicit BlockPoolAllocator(int gpu_id,
+                                bool on_gpu, ibv_pd* pd,
+                                size_t blockSize, size_t regionSize);
+
+    ~BlockPoolAllocator();
+
+    void* AllocateRaw(size_t num_bytes);
+
+    void DeallocateRaw(void* ptr);
+
+    // 获取统计信息
+    void printStatistics() const;
+
+    int64_t getCurrentUsage() const {
+      return totalAllocated - totalDeallocated;
+    }
+
+    int64_t getTotalMemory() const {
+      return g_region_num * REGION_SIZE;
+    }
+
+    int64_t get_block_size() const {
+      return BLOCK_SIZE;
+    }
+
+    uint32_t get_lkey(const void* buf);
+
+  private:
+    Region* GetRegion(const void* buf);
+    void extendRegion();
+};
+
+class GPUStreamPool {
+public:
+    explicit GPUStreamPool(int gpu_id);
+
+    ~GPUStreamPool();
+
+    GPUStreamPool(const GPUStreamPool&) = delete;
+    GPUStreamPool& operator=(const GPUStreamPool&) = delete;
+
+    void fast_d2h(std::vector<void*>& src_list, std::vector<int64_t>& length_list, void* dst);
+
+    void fast_d2d(std::vector<void*>& src_list, std::vector<int64_t>& length_list, void* dst);
+
+    static constexpr int kMaxConcurrent = 32;
+private:
+    int gpu_id_ {-1};
+    std::atomic<int64_t> d2h_cnt_ {0};
+    std::atomic<int64_t> d2d_cnt_ {0};
+    std::mutex d2h_locks_[kMaxConcurrent];
+    std::mutex d2d_locks_[kMaxConcurrent];
+    std::mutex d2h_lb_lock_;
+    std::mutex d2d_lb_lock_;
+    std::vector<cudaStream_t> d2h_streams_;
+    std::vector<cudaStream_t> d2d_streams_;
+};
+
+class BlockPoolAllocators {
+public:
+    static BlockPoolAllocators* singleton();
+    BlockPoolAllocators() {}
+    virtual ~BlockPoolAllocators() {
+        CHECK_EQ(this, instance_);
+        instance_ = nullptr;
+    }
+
+    void init(int gpu_id, ibv_pd* pd) {
+        LOG(INFO) << "set GPU BlockPoolAllocator for " << gpu_id;
+        size_t region_size = 512LL * 1024 * 1024;
+        size_t block_size = gdr_block_size_kb * 1024;
+        gpu_mem_alloc = new BlockPoolAllocator(gpu_id, true, pd, block_size, region_size);
+
+        region_size = 32LL * 1024 * 1024;
+        block_size = 512;
+        cpu_mem_alloc = new BlockPoolAllocator(gpu_id, false, pd, block_size, region_size);
+
+        gpu_stream_pool = new GPUStreamPool(gpu_id);
+    }
+
+    BlockPoolAllocator* get_gpu_allocator() {
+        return gpu_mem_alloc;
+    }
+
+    BlockPoolAllocator* get_cpu_allocator() {
+        return cpu_mem_alloc;
+    }
+
+    GPUStreamPool* get_gpu_stream_pool() {
+        return gpu_stream_pool;
+    }
+
+public:
+    static BlockPoolAllocators* instance_;
+
+private:
+    BlockPoolAllocator* gpu_mem_alloc {nullptr};
+    BlockPoolAllocator* cpu_mem_alloc {nullptr};
+    GPUStreamPool*      gpu_stream_pool {nullptr};
+};
+
+}
+}
+
+#endif // BRPC_WITH_GDR
+
+#endif
diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp
index 26046e3745..b111f00454 100644
--- a/src/butil/iobuf.cpp
+++ b/src/butil/iobuf.cpp
@@ -40,6 +40,7 @@
 #include "butil/fd_guard.h"                 // butil::fd_guard
 #include "butil/iobuf.h"
 #include "butil/iobuf_profiler.h"
+#include "butil/gpu/gpu_block_pool.h"
 
 namespace butil {
 namespace iobuf {
@@ -722,6 +723,52 @@ size_t IOBuf::cutn(IOBuf* out, size_t n) {
     return saved_n;
 }
 
+#if BRPC_WITH_GDR
+size_t IOBuf::cutn_from_gpu(IOBuf* out, size_t n) {
+    if (n == 0) {
+      return 0;
+    }
+
+    butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
+    bool alloc_from_host_alloc = (n <= host_allocator->get_block_size());
+    void* mem = NULL;
+    if (alloc_from_host_alloc) {
+      mem = host_allocator->AllocateRaw(n);
+    } else {
+      mem = malloc(n);
+    }
+
+    if (mem == NULL) {
+        return -1;
+    }
+    struct timespec start, end;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    size_t saved_n = copy_from_gpu(mem, n, 0, false);
+    if (saved_n > 0) {
+      if (alloc_from_host_alloc) {
+        auto deleter = [host_allocator](void* data) { host_allocator->DeallocateRaw(data); };
+        out->append_user_data(mem, saved_n, deleter);
+      } else {
+        auto deleter = [](void* data) { free(data); };
+        out->append_user_data(mem, saved_n, deleter);
+      }
+      pop_front(saved_n);
+    } else {
+      if (alloc_from_host_alloc) {
+        host_allocator->DeallocateRaw(mem);
+      } else {
+        free(mem);
+      }
+    }
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    double time_us = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_nsec - start.tv_nsec) / 1e3;
+
+    // LOG(INFO) << "GDRCopy: " << saved_n << " bytes, "
+    //   << time_us << " us";
+    return saved_n;
+}
+#endif  // BRPC_WITH_GDR
+
 size_t IOBuf::cutn(void* out, size_t n) {
     const size_t len = length();
     if (n > len) {
@@ -1155,6 +1202,15 @@ uint64_t IOBuf::get_first_data_meta() {
     return r.block->u.data_meta;
 }
 
+void* IOBuf::get_first_data_ptr() {
+    if (_ref_num() == 0) {
+        return 0;
+    }
+    IOBuf::BlockRef const& r = _ref_at(0);
+    return r.block->data;
+}
+
+
 int IOBuf::resize(size_t n, char c) {
     const size_t saved_len = length();
     if (n < saved_len) {
@@ -1317,6 +1373,59 @@ size_t IOBuf::copy_to(void* d, size_t n, size_t pos) const {
     return n - m;
 }
 
+#if BRPC_WITH_GDR
+size_t IOBuf::copy_from_gpu(void* d, size_t n, size_t pos, bool to_gpu) const {
+    if (n == 0) {
+        return 0;
+    }
+    const size_t nref = _ref_num();
+    // Skip `pos' bytes. `offset' is the starting position in starting BlockRef.
+    size_t offset = pos;
+    size_t i = 0;
+    for (; offset != 0 && i < nref; ++i) {
+        IOBuf::BlockRef const& r = _ref_at(i);
+        if (offset < (size_t)r.length) {
+            break;
+        }
+        offset -= r.length;
+    }
+
+    butil::gdr::GPUStreamPool* gpu_stream_pool = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_stream_pool();
+    struct timespec start, end;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    size_t m = n;
+    std::vector<void*> src_list;
+    std::vector<int64_t> length_list;
+    for (; m != 0 && i < nref; ++i) {
+        IOBuf::BlockRef const& r = _ref_at(i);
+        const size_t nc = std::min(m, (size_t)r.length - offset);
+        void* gpu_src = r.block->data + r.offset + offset;
+        // cudaMemcpy(d, gpu_src, nc, cudaMemcpyDeviceToDevice);
+        src_list.push_back(gpu_src);
+        length_list.push_back(nc);
+        //cuMemcpyDtoH(d, (CUdeviceptr)(r.block->data + r.offset + offset), nc);
+        // gdr_copy_from_mapping(allocator->mh(), d, allocator->ToCPUPtr(gpu_src), nc);
+        offset = 0;
+        // d = (char*)d + nc;
+        m -= nc;
+    }
+    if (to_gpu) {
+        gpu_stream_pool->fast_d2d(src_list, length_list, d);
+    } else {
+        gpu_stream_pool->fast_d2h(src_list, length_list, d);
+    }
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    double time_us = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_nsec - start.tv_nsec) / 1e3;
+    size_t copied_bytes = n - m;
+
+    // LOG(INFO) << "GDRCopy: " << copied_bytes << " bytes, "
+    //           << time_us << " us" << ", to_gpu " << to_gpu;
+    //cuCtxSetCurrent(saved_context);
+    // If nref == 0, here returns 0 correctly
+    return n - m;
+}
+#endif  // BRPC_WITH_GDR
+
 size_t IOBuf::copy_to(std::string* s, size_t n, size_t pos) const {
     const size_t len = length();
     if (len <= pos) {
diff --git a/src/butil/iobuf.h b/src/butil/iobuf.h
index 239e82d950..e554dd0e40 100644
--- a/src/butil/iobuf.h
+++ b/src/butil/iobuf.h
@@ -141,6 +141,12 @@ friend class SingleIOBuf;
     size_t cutn(IOBuf* out, size_t n);
     size_t cutn(void* out, size_t n);
     size_t cutn(std::string* out, size_t n);
+
+#if BRPC_WITH_GDR
+    size_t cutn_from_gpu(IOBuf* out, size_t n);
+    size_t copy_from_gpu(void* d, size_t n, size_t pos = 0, bool to_gpu = false) const;
+#endif  // BRPC_WITH_GDR
+
     // Cut off 1 byte from the front side and set to *c
     // Return true on cut, false otherwise.
     bool cut1(void* c);
@@ -259,6 +265,7 @@ friend class SingleIOBuf;
     // The meta is specified with append_user_data_with_meta before.
     // 0 means the meta is invalid.
     uint64_t get_first_data_meta();
+    void* get_first_data_ptr();
 
     // Resizes the buf to a length of n characters.
     // If n is smaller than the current length, all bytes after n will be
@@ -775,4 +782,4 @@ inline void swap(butil::IOBuf& a, butil::IOBuf& b) {
 
 #include "butil/iobuf_inl.h"
 
-#endif  // BUTIL_IOBUF_H
\ No newline at end of file
+#endif  // BUTIL_IOBUF_H

From 4811b060b1fcb1243eae69c19d7328e78b4b43f5 Mon Sep 17 00:00:00 2001
From: randomkang <75484924+randomkang@users.noreply.github.com>
Date: Sun, 9 Nov 2025 20:17:11 +0800
Subject: [PATCH 02/23] Apply suggestions from code review

Fix code style

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/brpc/rdma/rdma_endpoint.cpp  | 6 +++---
 src/brpc/rdma/rdma_helper.cpp    | 2 +-
 src/butil/gpu/gpu_block_pool.cpp | 2 +-
 src/butil/gpu/gpu_block_pool.h   | 1 -
 src/butil/iobuf.cpp              | 2 +-
 5 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index fc6ce20928..2a47a23a8a 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -892,7 +892,7 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
             // So we just consider this error as an unrecoverable error.
             LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
                          << ", window_size:" <<  _window_size
-                         << ", emote_recv_window: " << _remote_recv_window
+                         << ", remote_recv_window: " << _remote_recv_window
                          << ", window=" << window
                          << ", sq_current=" << _sq_current;
             errno = err;
@@ -942,7 +942,7 @@ int RdmaEndpoint::SendImm(uint32_t imm) {
         // So we just consider this error as an unrecoverable error.
         LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
                      << ", window_size:" <<  _window_size
-                     << ", emote_recv_window: " << _remote_recv_window;
+                     << ", remote_recv_window: " << _remote_recv_window;
         return -1;
     }
     return 0;
@@ -997,7 +997,7 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
         if (wc.imm_data > 0) {
             uint32_t acks = butil::NetToHost32(wc.imm_data);
             uint32_t wnd_thresh = _local_window_capacity / 8;
-            if(_remote_recv_window.fetch_add(acks, butil::memory_order_release) >= wnd_thresh ||
+            if (_remote_recv_window.fetch_add(acks, butil::memory_order_release) >= wnd_thresh ||
                     acks >= wnd_thresh) {
                 _socket->WakeAsEpollOut();
             }
diff --git a/src/brpc/rdma/rdma_helper.cpp b/src/brpc/rdma/rdma_helper.cpp
index 1b6ad85ae0..3b45b2621c 100644
--- a/src/brpc/rdma/rdma_helper.cpp
+++ b/src/brpc/rdma/rdma_helper.cpp
@@ -96,7 +96,7 @@ DEFINE_string(rdma_device, "", "The name of the HCA device used "
                                "(Empty means using the first active device)");
 DEFINE_int32(rdma_port, 1, "The port number to use. For RoCE, it is always 1.");
 DEFINE_int32(rdma_gid_index, -1, "The GID index to use. -1 means using the last one.");
-DEFINE_int32(gpu_index, 0, "The GPU device index to use.In GDR, we suggest to use the GPU that is connected to the same PCIe switch with rdma devices");
+DEFINE_int32(gpu_index, 0, "The GPU device index to use. In GDR, we suggest to use the GPU that is connected to the same PCIe switch with rdma devices");
 // static const size_t SYSFS_SIZE = 4096;
 static ibv_device** g_devices = NULL;
 static ibv_context* g_context = NULL;
diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
index dac26f4b5c..336d4b6dca 100644
--- a/src/butil/gpu/gpu_block_pool.cpp
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -82,7 +82,7 @@ BlockPoolAllocators* BlockPoolAllocators::singleton() {
     static std::mutex mutex;
     if (instance_ == nullptr) {
         std::lock_guard<std::mutex> l(mutex);
-        if(instance_ == nullptr) {
+        if (instance_ == nullptr) {
             instance_ = new BlockPoolAllocators();
             std::atomic_thread_fence(std::memory_order_release);
         }
diff --git a/src/butil/gpu/gpu_block_pool.h b/src/butil/gpu/gpu_block_pool.h
index 1d6a444a36..6106952c76 100644
--- a/src/butil/gpu/gpu_block_pool.h
+++ b/src/butil/gpu/gpu_block_pool.h
@@ -192,7 +192,6 @@ class BlockPoolAllocators {
     BlockPoolAllocator* cpu_mem_alloc {nullptr};
     GPUStreamPool*      gpu_stream_pool {nullptr};
 };
-
 }
 }
 
diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp
index b111f00454..79e992034e 100644
--- a/src/butil/iobuf.cpp
+++ b/src/butil/iobuf.cpp
@@ -739,7 +739,7 @@ size_t IOBuf::cutn_from_gpu(IOBuf* out, size_t n) {
     }
 
     if (mem == NULL) {
-        return -1;
+        return 0;
     }
     struct timespec start, end;
     clock_gettime(CLOCK_MONOTONIC, &start);

From 31a30f7cd6ecd95e7e89295c2d9b88c08352a602 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Thu, 13 Nov 2025 23:45:12 +0800
Subject: [PATCH 03/23] clean unused code

---
 src/butil/iobuf.cpp | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp
index 79e992034e..469e3775b3 100644
--- a/src/butil/iobuf.cpp
+++ b/src/butil/iobuf.cpp
@@ -741,8 +741,6 @@ size_t IOBuf::cutn_from_gpu(IOBuf* out, size_t n) {
     if (mem == NULL) {
         return 0;
     }
-    struct timespec start, end;
-    clock_gettime(CLOCK_MONOTONIC, &start);
     size_t saved_n = copy_from_gpu(mem, n, 0, false);
     if (saved_n > 0) {
       if (alloc_from_host_alloc) {
@@ -760,11 +758,7 @@ size_t IOBuf::cutn_from_gpu(IOBuf* out, size_t n) {
         free(mem);
       }
     }
-    clock_gettime(CLOCK_MONOTONIC, &end);
-    double time_us = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_nsec - start.tv_nsec) / 1e3;
 
-    // LOG(INFO) << "GDRCopy: " << saved_n << " bytes, "
-    //   << time_us << " us";
     return saved_n;
 }
 #endif  // BRPC_WITH_GDR
@@ -1391,8 +1385,6 @@ size_t IOBuf::copy_from_gpu(void* d, size_t n, size_t pos, bool to_gpu) const {
     }
 
     butil::gdr::GPUStreamPool* gpu_stream_pool = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_stream_pool();
-    struct timespec start, end;
-    clock_gettime(CLOCK_MONOTONIC, &start);
     size_t m = n;
     std::vector<void*> src_list;
     std::vector<int64_t> length_list;
@@ -1400,13 +1392,9 @@ size_t IOBuf::copy_from_gpu(void* d, size_t n, size_t pos, bool to_gpu) const {
         IOBuf::BlockRef const& r = _ref_at(i);
         const size_t nc = std::min(m, (size_t)r.length - offset);
         void* gpu_src = r.block->data + r.offset + offset;
-        // cudaMemcpy(d, gpu_src, nc, cudaMemcpyDeviceToDevice);
         src_list.push_back(gpu_src);
         length_list.push_back(nc);
-        //cuMemcpyDtoH(d, (CUdeviceptr)(r.block->data + r.offset + offset), nc);
-        // gdr_copy_from_mapping(allocator->mh(), d, allocator->ToCPUPtr(gpu_src), nc);
         offset = 0;
-        // d = (char*)d + nc;
         m -= nc;
     }
     if (to_gpu) {
@@ -1414,13 +1402,6 @@ size_t IOBuf::copy_from_gpu(void* d, size_t n, size_t pos, bool to_gpu) const {
     } else {
         gpu_stream_pool->fast_d2h(src_list, length_list, d);
     }
-    clock_gettime(CLOCK_MONOTONIC, &end);
-    double time_us = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_nsec - start.tv_nsec) / 1e3;
-    size_t copied_bytes = n - m;
-
-    // LOG(INFO) << "GDRCopy: " << copied_bytes << " bytes, "
-    //           << time_us << " us" << ", to_gpu " << to_gpu;
-    //cuCtxSetCurrent(saved_context);
     // If nref == 0, here returns 0 correctly
     return n - m;
 }
@@ -2211,4 +2192,4 @@ bool IOBufBytesIterator::forward_one_block(const void** data, size_t* size) {
 
 void* fast_memcpy(void *__restrict dest, const void *__restrict src, size_t n) {
     return butil::iobuf::cp(dest, src, n);
-} // namespace butil
\ No newline at end of file
+} // namespace butil

From 88fc739ba11ac02e9cd85697c45e567f214e184a Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Thu, 13 Nov 2025 23:47:57 +0800
Subject: [PATCH 04/23] revert the fix of rdma window

---
 src/brpc/rdma/rdma_endpoint.cpp | 70 ++++++++++++---------------------
 src/brpc/rdma/rdma_endpoint.h   |  1 -
 2 files changed, 26 insertions(+), 45 deletions(-)

diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 2a47a23a8a..6f451e841f 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -31,7 +31,6 @@
 #include "brpc/rdma/block_pool.h"
 #include "brpc/rdma/rdma_helper.h"
 #include "brpc/rdma/rdma_endpoint.h"
-#include "brpc/traceprintf.h"
 
 DECLARE_int32(task_group_ntags);
 
@@ -50,8 +49,8 @@ extern int (*IbvQueryQp)(ibv_qp*, ibv_qp_attr*, ibv_qp_attr_mask, ibv_qp_init_at
 extern int (*IbvDestroyQp)(ibv_qp*);
 extern bool g_skip_rdma_init;
 
-DEFINE_int32(rdma_sq_size, 64, "SQ size for RDMA");
-DEFINE_int32(rdma_rq_size, 64, "RQ size for RDMA");
+DEFINE_int32(rdma_sq_size, 128, "SQ size for RDMA");
+DEFINE_int32(rdma_rq_size, 128, "RQ size for RDMA");
 DEFINE_bool(rdma_recv_zerocopy, true, "Enable zerocopy for receive side");
 DEFINE_int32(rdma_zerocopy_min_size, 512, "The minimal size for receive zerocopy");
 DEFINE_string(rdma_recv_block_type, "default", "Default size type for recv WR: "
@@ -100,7 +99,6 @@ static const uint16_t MIN_QP_SIZE = 16;
 static const uint16_t MAX_QP_SIZE = 4096;
 static const uint16_t MIN_BLOCK_SIZE = 1024;
 static const uint32_t ACK_MSG_RDMA_OK = 0x1;
-static const uint64_t FIXED_ACK_WR_ID = 1;
 
 static butil::Mutex* g_rdma_resource_mutex = NULL;
 static RdmaResource* g_rdma_resource_list = NULL;
@@ -194,7 +192,6 @@ RdmaEndpoint::RdmaEndpoint(Socket* s)
     , _remote_window_capacity(0)
     , _window_size(0)
     , _new_rq_wrs(0)
-    , _remote_recv_window(0)
 {
     if (_sq_size < MIN_QP_SIZE) {
         _sq_size = MIN_QP_SIZE;
@@ -212,7 +209,6 @@ RdmaEndpoint::RdmaEndpoint(Socket* s)
 }
 
 RdmaEndpoint::~RdmaEndpoint() {
-    // LOG(INFO) << _window_size << " " << _remote_recv_window << " " <<  _sq_unsignaled;
     Reset();
     bthread::butex_destroy(_read_butex);
 }
@@ -236,7 +232,6 @@ void RdmaEndpoint::Reset() {
     _new_rq_wrs = 0;
     _sq_sent = 0;
     _rq_received = 0;
-    _remote_recv_window.store(0, butil::memory_order_relaxed);
 }
 
 void RdmaConnect::StartConnect(const Socket* socket,
@@ -520,7 +515,7 @@ void* RdmaEndpoint::ProcessHandshakeAtClient(void* arg) {
         ep->_remote_window_capacity = 
             std::min(ep->_rq_size, remote_msg.sq_size) - RESERVED_WR_NUM,
         ep->_window_size.store(ep->_local_window_capacity, butil::memory_order_relaxed);
-        ep->_remote_recv_window.store(ep->_remote_window_capacity, butil::memory_order_relaxed);
+
         ep->_state = C_BRINGUP_QP;
         if (ep->BringUpQp(remote_msg.lid, remote_msg.gid, remote_msg.qp_num) < 0) {
             LOG(WARNING) << "Fail to bringup QP, fallback to tcp:" << s->description();
@@ -628,7 +623,7 @@ void* RdmaEndpoint::ProcessHandshakeAtServer(void* arg) {
         ep->_remote_window_capacity = 
             std::min(ep->_rq_size, remote_msg.sq_size) - RESERVED_WR_NUM,
         ep->_window_size.store(ep->_local_window_capacity, butil::memory_order_relaxed);
-        ep->_remote_recv_window.store(ep->_remote_window_capacity, butil::memory_order_relaxed);
+
         ep->_state = S_ALLOC_QPCQ;
         if (ep->AllocateResources() < 0) {
             LOG(WARNING) << "Fail to allocate rdma resources, fallback to tcp:"
@@ -722,7 +717,7 @@ bool RdmaEndpoint::IsWritable() const {
         return false;
     }
 
-    return _window_size.load(butil::memory_order_relaxed) > 0 && _remote_recv_window.load(butil::memory_order_relaxed) > 0;
+    return _window_size.load(butil::memory_order_relaxed) > 0;
 }
 
 // RdmaIOBuf inherits from IOBuf to provide a new function.
@@ -793,14 +788,12 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
     size_t total_len = 0;
     size_t current = 0;
     uint32_t window = 0;
-    uint32_t recv_window = 0;
     ibv_send_wr wr;
     int max_sge = GetRdmaMaxSge();
     ibv_sge sglist[max_sge];
     while (current < ndata) {
-        window = _window_size.load(butil::memory_order_acquire);
-        recv_window = _remote_recv_window.load(butil::memory_order_acquire);
-        if (window == 0 || recv_window == 0) {
+        window = _window_size.load(butil::memory_order_relaxed);
+        if (window == 0) {
             if (total_len > 0) {
                 break;
             } else {
@@ -891,8 +884,6 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
             // We use other way to guarantee the Send Queue is not full.
             // So we just consider this error as an unrecoverable error.
             LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
-                         << ", window_size:" <<  _window_size
-                         << ", remote_recv_window: " << _remote_recv_window
                          << ", window=" << window
                          << ", sq_current=" << _sq_current;
             errno = err;
@@ -908,8 +899,7 @@ ssize_t RdmaEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
         // Because there is at most one thread can enter this function for each
         // Socket, and the other thread of HandleCompletion can only add this
         // counter.
-        _window_size.fetch_sub(1, butil::memory_order_release);
-        _remote_recv_window.fetch_sub(1, butil::memory_order_release);
+        _window_size.fetch_sub(1, butil::memory_order_relaxed);
     }
 
     return total_len;
@@ -933,16 +923,13 @@ int RdmaEndpoint::SendImm(uint32_t imm) {
     wr.imm_data = butil::HostToNet32(imm);
     wr.send_flags |= IBV_SEND_SOLICITED;
     wr.send_flags |= IBV_SEND_SIGNALED;
-    wr.wr_id = FIXED_ACK_WR_ID;
 
     ibv_send_wr* bad = NULL;
     int err = ibv_post_send(_resource->qp, &wr, &bad);
     if (err != 0) {
         // We use other way to guarantee the Send Queue is not full.
         // So we just consider this error as an unrecoverable error.
-        LOG(WARNING) << "Fail to ibv_post_send: " << berror(err)
-                     << ", window_size:" <<  _window_size
-                     << ", remote_recv_window: " << _remote_recv_window;
+        LOG(WARNING) << "Fail to ibv_post_send: " << berror(err);
         return -1;
     }
     return 0;
@@ -950,28 +937,9 @@ int RdmaEndpoint::SendImm(uint32_t imm) {
 
 ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
     bool zerocopy = FLAGS_rdma_recv_zerocopy;
-    //LOG(INFO) << "Handle Completion: " << wc.opcode;
     switch (wc.opcode) {
     case IBV_WC_SEND: {  // send completion
-        if (wc.wr_id == 0) {
-            uint16_t wnd_to_update = _local_window_capacity / 4;
-            uint32_t num = wnd_to_update;
-            while(num > 0) {
-                _sbuf[_sq_sent++].clear();
-                if (_sq_sent == _sq_size - RESERVED_WR_NUM) {
-                    _sq_sent = 0;
-                }
-                --num;
-            }
-            butil::subtle::MemoryBarrier();
-            uint32_t wnd_thresh = _local_window_capacity / 8;
-            _window_size.fetch_add(wnd_to_update, butil::memory_order_release);
-            //if ((_remote_recv_window.load(butil::memory_order_relaxed) >= wnd_thresh)) {
-             // Do not wake up writing thread right after _window_size > 0.
-            // Otherwise the writing thread may switch to background too quickly.
-            _socket->WakeAsEpollOut();
-        //}
-        }
+        // Do nothing
         break;
     }
     case IBV_WC_RECV: {  // recv completion
@@ -995,10 +963,24 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
             }
         }
         if (wc.imm_data > 0) {
+            // Clear sbuf here because we ignore event wakeup for send completions
             uint32_t acks = butil::NetToHost32(wc.imm_data);
+            uint32_t num = acks;
+            while (num > 0) {
+                _sbuf[_sq_sent++].clear();
+                if (_sq_sent == _sq_size - RESERVED_WR_NUM) {
+                    _sq_sent = 0;
+                }
+                --num;
+            }
+            butil::subtle::MemoryBarrier();
+
+            // Update window
             uint32_t wnd_thresh = _local_window_capacity / 8;
-            if (_remote_recv_window.fetch_add(acks, butil::memory_order_release) >= wnd_thresh ||
-                    acks >= wnd_thresh) {
+            if (_window_size.fetch_add(acks, butil::memory_order_relaxed) >= wnd_thresh
+                    || acks >= wnd_thresh) {
+                // Do not wake up writing thread right after _window_size > 0.
+                // Otherwise the writing thread may switch to background too quickly.
                 _socket->WakeAsEpollOut();
             }
         }
diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index 6f0c917e9e..4705e362ea 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -263,7 +263,6 @@ friend class brpc::Socket;
     // The number of new WRs posted in the local Recv Queue
     butil::atomic<uint16_t> _new_rq_wrs;
 
-    butil::atomic<uint16_t> _remote_recv_window;
     // butex for inform read events on TCP fd during handshake
     butil::atomic<int> *_read_butex;
 

From cec0bf8130a8c47560dce961bda10e98d3b01720 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 14 Nov 2025 23:11:46 +0800
Subject: [PATCH 05/23] fix align

---
 src/butil/gpu/gpu_block_pool.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
index 336d4b6dca..b768e408e8 100644
--- a/src/butil/gpu/gpu_block_pool.cpp
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -298,7 +298,7 @@ void BlockPoolAllocator::extendRegion() {
     int64_t aligned_bytes = REGION_SIZE;
     if (ptr != aligned_ptr) {
         uintptr_t region_end = uintptr_t(ptr) + REGION_SIZE;
-        uintptr_t aligned_end_ptr = (region_end + alignment - 1) & ~(alignment - 1);
+        uintptr_t aligned_end_ptr = region_end & ~(alignment - 1);
         aligned_bytes = uintptr_t(aligned_end_ptr) - uintptr_t(aligned_ptr);
         LOG(WARNING) << "addr is not aligned with 4096: " << ptr << ", aligned_bytes: " << aligned_bytes
             << ", region_size: " << REGION_SIZE;

From b856888fdc01491df4b92d87714c572bbf3e94cb Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 14 Nov 2025 23:24:17 +0800
Subject: [PATCH 06/23] reorg code

---
 src/brpc/policy/baidu_rpc_protocol.cpp | 97 +++++++++++---------------
 1 file changed, 42 insertions(+), 55 deletions(-)

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index 2e37bae3a3..bbc678c921 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -109,9 +109,12 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
 
     char header_buf[12];
     size_t n = 0;
+    uint32_t body_size;
+    uint32_t meta_size;
+    ParseError pe = PARSE_OK;
+
 #if BRPC_WITH_GDR
     void* prefetch_d2h_data = NULL;
-
     uint64_t data_meta = source->get_first_data_meta();
     bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
     butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
@@ -129,74 +132,58 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
 #else
     n = source->copy_to(header_buf, sizeof(header_buf));
 #endif  // BRPC_WITH_GDR
-    if (n >= 4) {
-        void* dummy = header_buf;
-        if (*(const uint32_t*)dummy != *(const uint32_t*)"PRPC") {
-#if BRPC_WITH_GDR
-            if (is_gpu_memory) {
-                host_allocator->DeallocateRaw(prefetch_d2h_data);
+
+    do {
+        if (n >= 4) {
+            void* dummy = header_buf;
+            if (*(const uint32_t*)dummy != *(const uint32_t*)"PRPC") {
+                pe = PARSE_ERROR_TRY_OTHERS;
+                break;
             }
-#endif  // BRPC_WITH_GDR
-            return MakeParseError(PARSE_ERROR_TRY_OTHERS);
-        }
-    } else {
-        if (memcmp(header_buf, "PRPC", n) != 0) {
-#if BRPC_WITH_GDR
-            if (is_gpu_memory) {
-                host_allocator->DeallocateRaw(prefetch_d2h_data);
+        } else {
+            if (memcmp(header_buf, "PRPC", n) != 0) {
+                pe = PARSE_ERROR_TRY_OTHERS;
+                break;
             }
-#endif  // BRPC_WITH_GDR
-            return MakeParseError(PARSE_ERROR_TRY_OTHERS);
         }
-    }
-    if (n < sizeof(header_buf)) {
-#if BRPC_WITH_GDR
-        if (is_gpu_memory) {
-            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        if (n < sizeof(header_buf)) {
+            pe = PARSE_ERROR_NOT_ENOUGH_DATA;
+            break;
         }
-#endif  // BRPC_WITH_GDR
-        return MakeParseError(PARSE_ERROR_NOT_ENOUGH_DATA);
-    }
-    uint32_t body_size;
-    uint32_t meta_size;
-    butil::RawUnpacker(header_buf + 4).unpack32(body_size).unpack32(meta_size);
-    if (body_size > 128 * 1024 * 1024) {
-        LOG(ERROR) << "body_size=" << body_size << " from "
-                   << socket->remote_side() << " is too large";
-    }
-    if (body_size > FLAGS_max_body_size) {
-        // We need this log to report the body_size to give users some clues
-        // which is not printed in InputMessenger.
-        LOG(ERROR) << "body_size=" << body_size << " from "
-                   << socket->remote_side() << " is too large";
-#if BRPC_WITH_GDR
-        if (is_gpu_memory) {
-            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        butil::RawUnpacker(header_buf + 4).unpack32(body_size).unpack32(meta_size);
+        if (body_size > FLAGS_max_body_size) {
+            // We need this log to report the body_size to give users some clues
+            // which is not printed in InputMessenger.
+            LOG(ERROR) << "body_size=" << body_size << " from "
+                       << socket->remote_side() << " is too large";
+            pe = PARSE_ERROR_TOO_BIG_DATA;
+            break;
+        } else if (source->length() < sizeof(header_buf) + body_size) {
+            pe = PARSE_ERROR_NOT_ENOUGH_DATA;
+            break;
         }
-#endif  // BRPC_WITH_GDR
-        return MakeParseError(PARSE_ERROR_TOO_BIG_DATA);
-    } else if (source->length() < sizeof(header_buf) + body_size) {
-#if BRPC_WITH_GDR
-        if (is_gpu_memory) {
-            host_allocator->DeallocateRaw(prefetch_d2h_data);
+        if (meta_size > body_size) {
+            LOG(ERROR) << "meta_size=" << meta_size << " is bigger than body_size="
+                       << body_size;
+            // Pop the message
+            source->pop_front(sizeof(header_buf) + body_size);
+            pe = PARSE_ERROR_TRY_OTHERS;
+            break;
         }
-#endif  // BRPC_WITH_GDR
-        return MakeParseError(PARSE_ERROR_NOT_ENOUGH_DATA);
-    }
-    if (meta_size > body_size) {
-        LOG(ERROR) << "meta_size=" << meta_size << " is bigger than body_size="
-                   << body_size;
-        // Pop the message
-        source->pop_front(sizeof(header_buf) + body_size);
+    } while (0);
+
+    if (pe != PARSE_OK) {
 #if BRPC_WITH_GDR
         if (is_gpu_memory) {
             host_allocator->DeallocateRaw(prefetch_d2h_data);
         }
 #endif  // BRPC_WITH_GDR
-        return MakeParseError(PARSE_ERROR_TRY_OTHERS);
+        return MakeParseError(pe);
     }
+
     source->pop_front(sizeof(header_buf));
     MostCommonMessage* msg = MostCommonMessage::Get();
+
 #if BRPC_WITH_GDR
     if (is_gpu_memory) {
         if (header_size + meta_size <= n) {

From 1fb9370dfad21d3a3f9ebaa3695ff0338aa3426e Mon Sep 17 00:00:00 2001
From: sunce4t <qssue@foxmail.com>
Date: Wed, 26 Nov 2025 10:17:33 +0800
Subject: [PATCH 07/23] Change GPU memory detection logics in
 baidu_rpc_protocol

---
 src/brpc/policy/baidu_rpc_protocol.cpp | 12 ++++++------
 src/brpc/rdma/rdma_endpoint.cpp        |  3 ++-
 src/butil/iobuf.cpp                    | 16 +++++++++++++++-
 src/butil/iobuf.h                      | 10 ++++++++++
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index bbc678c921..7a2c079c24 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -115,8 +115,8 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
 
 #if BRPC_WITH_GDR
     void* prefetch_d2h_data = NULL;
-    uint64_t data_meta = source->get_first_data_meta();
-    bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+    uint32_t data_meta = source->get_first_data_meta_high32();
+    bool is_gpu_memory = (data_meta == static_cast<uint32_t>(butil::IOBuf::GPU_MEMORY));
     butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
     if (is_gpu_memory) {
         prefetch_d2h_data = host_allocator->AllocateRaw(prefetch_d2h_size);
@@ -859,8 +859,8 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
             int body_without_attachment_size = req_size - meta.attachment_size();
 #if BRPC_WITH_GDR
             int meta_size = msg->meta.size();
-            uint64_t data_meta = msg->payload.get_first_data_meta();
-            bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+            uint32_t data_meta = msg->payload.get_first_data_meta_high32();
+            bool is_gpu_memory = (data_meta == static_cast<uint32_t>(butil::IOBuf::GPU_MEMORY));
             if(is_gpu_memory) {
                 int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
                 if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
@@ -1054,8 +1054,8 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
         butil::IOBuf* res_buf_ptr = &msg->payload;
 
 #if BRPC_WITH_GDR
-        uint64_t data_meta = msg->payload.get_first_data_meta();
-        bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+        uint32_t data_meta = msg->payload.get_first_data_meta_high32();
+        bool is_gpu_memory = (data_meta == static_cast<uint32_t>(butil::IOBuf::GPU_MEMORY));
 #endif  // BRPC_WITH_GDR
         if (meta.has_attachment_size()) {
             if (meta.attachment_size() > res_size) {
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 6f451e841f..73bf974330 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -1056,7 +1056,8 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
             void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
             auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
             lkey = device_allocator->get_lkey(device_ptr);
-            _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , lkey);
+            uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
+            _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , data_meta);
             _rbuf_data[_rq_received] = device_ptr;
 #else
             butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp
index 469e3775b3..ce3c0cc0bb 100644
--- a/src/butil/iobuf.cpp
+++ b/src/butil/iobuf.cpp
@@ -1193,7 +1193,21 @@ uint64_t IOBuf::get_first_data_meta() {
     if (!(r.block->flags & IOBUF_BLOCK_FLAGS_USER_DATA)) {
         return 0;
     }
-    return r.block->u.data_meta;
+    return (r.block->u.data_meta & 0x00000000FFFFFFFF);
+}
+
+// only when user use append_user_data_with_meta(), lkey is stored in data_meta
+// We add this function for GDR, we want to know whether the data is in Host memory or GPU memory
+// since lkey is uint32_t type, thus we use the high 32 bit to store
+uint32_t IOBuf::get_first_data_meta_high32() {
+    if (_ref_num() == 0) {
+        return 0;
+    }
+    IOBuf::BlockRef const& r = _ref_at(0);
+    if (!(r.block->flags & IOBUF_BLOCK_FLAGS_USER_DATA)) {
+        return 0;
+    }
+    return (uint32_t)(r.block->u.data_meta >> 32);
 }
 
 void* IOBuf::get_first_data_ptr() {
diff --git a/src/butil/iobuf.h b/src/butil/iobuf.h
index e554dd0e40..14077f0c29 100644
--- a/src/butil/iobuf.h
+++ b/src/butil/iobuf.h
@@ -70,6 +70,11 @@ friend class SingleIOBuf;
     static const size_t DEFAULT_BLOCK_SIZE = 8192;
     static const size_t INITIAL_CAP = 32; // must be power of 2
 
+    enum MemoryMeta {
+        HOST_MEMORY = 0,
+        GPU_MEMORY = 1
+    };
+
     struct Block;
 
     // can't directly use `struct iovec' here because we also need to access the
@@ -265,6 +270,11 @@ friend class SingleIOBuf;
     // The meta is specified with append_user_data_with_meta before.
     // 0 means the meta is invalid.
     uint64_t get_first_data_meta();
+
+    // Get the high 32 bits of the data meta of the first byte in this IOBuf.
+    // The meta is specified with append_user_data_with_meta before.
+    // we use 0 to specify host memory, 1 to specify GPU memory
+    uint32_t get_first_data_meta_high32();
     void* get_first_data_ptr();
 
     // Resizes the buf to a length of n characters.

From 007c073dc0610c309d1c12c394545ec8fc292aef Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Wed, 21 Jan 2026 01:00:48 +0800
Subject: [PATCH 08/23] use IBV_MTU_4096 in gdr

---
 src/brpc/rdma/rdma_endpoint.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 73bf974330..d21082ed0a 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -1256,7 +1256,11 @@ int RdmaEndpoint::BringUpQp(uint16_t lid, ibv_gid gid, uint32_t qp_num) {
     }
 
     attr.qp_state = IBV_QPS_RTR;
+#if BRPC_WITH_GDR
+    attr.path_mtu = IBV_MTU_4096;  // TODO: detect mtu automatically
+#else
     attr.path_mtu = IBV_MTU_1024;  // TODO: support more mtu in future
+#endif  // if BRPC_WITH_GDR
     attr.ah_attr.grh.dgid = gid;
     attr.ah_attr.grh.flow_label = 0;
     attr.ah_attr.grh.sgid_index = GetRdmaGidIndex();

From 7a8bd8f009c9d7df91caf444030479a12aea7193 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Sun, 22 Mar 2026 01:21:52 +0800
Subject: [PATCH 09/23] Add use_gdr in ChannelOptions

---
 src/brpc/acceptor.cpp                      |  1 +
 src/brpc/acceptor.h                        |  3 +
 src/brpc/channel.cpp                       | 13 ++-
 src/brpc/channel.h                         |  4 +
 src/brpc/details/naming_service_thread.cpp |  3 +-
 src/brpc/details/naming_service_thread.h   |  4 +-
 src/brpc/rdma/rdma_endpoint.cpp            | 98 +++++++++++++---------
 src/brpc/rdma/rdma_endpoint.h              |  9 +-
 src/brpc/rdma/rdma_helper.cpp              | 31 +++++--
 src/brpc/rdma/rdma_helper.h                |  4 +
 src/brpc/server.cpp                        |  7 ++
 src/brpc/server.h                          |  4 +
 src/brpc/socket.cpp                        |  4 +-
 src/brpc/socket.h                          |  1 +
 src/brpc/socket_map.cpp                    |  5 +-
 src/brpc/socket_map.h                      | 10 ++-
 16 files changed, 145 insertions(+), 56 deletions(-)

diff --git a/src/brpc/acceptor.cpp b/src/brpc/acceptor.cpp
index 616c1a3044..4487fc2df8 100644
--- a/src/brpc/acceptor.cpp
+++ b/src/brpc/acceptor.cpp
@@ -41,6 +41,7 @@ Acceptor::Acceptor(bthread_keytable_pool_t* pool)
     , _force_ssl(false)
     , _ssl_ctx(NULL) 
     , _use_rdma(false)
+    , _use_gdr(false)
     , _bthread_tag(BTHREAD_TAG_DEFAULT) {
 }
 
diff --git a/src/brpc/acceptor.h b/src/brpc/acceptor.h
index 69f632aaca..66f85c4904 100644
--- a/src/brpc/acceptor.h
+++ b/src/brpc/acceptor.h
@@ -113,6 +113,9 @@ friend class Server;
     // Whether to use rdma or not
     bool _use_rdma;
 
+    // Whether to use gdr or not
+    bool _use_gdr;
+
     // Acceptor belongs to this tag
     bthread_tag_t _bthread_tag;
 };
diff --git a/src/brpc/channel.cpp b/src/brpc/channel.cpp
index 0252e97d74..bce554380b 100644
--- a/src/brpc/channel.cpp
+++ b/src/brpc/channel.cpp
@@ -61,6 +61,7 @@ ChannelOptions::ChannelOptions()
     , succeed_without_server(true)
     , log_succeed_without_server(true)
     , use_rdma(false)
+    , use_gdr(false)
     , auth(NULL)
     , backup_request_policy(NULL)
     , retry_policy(NULL)
@@ -123,6 +124,9 @@ static ChannelSignature ComputeChannelSignature(const ChannelOptions& opt) {
         if (opt.use_rdma) {
             buf.append("|rdma");
         }
+        if (opt.use_gdr) {
+            buf.append("|gdr");
+        }
         butil::MurmurHash3_x64_128_Update(&mm_ctx, buf.data(), buf.size());
         buf.clear();
     
@@ -197,6 +201,11 @@ int Channel::InitChannelOptions(const ChannelOptions* options) {
             return -1;
         }
         rdma::GlobalRdmaInitializeOrDie();
+#if BRPC_WITH_GDR
+        if (_options.use_gdr) {
+            rdma::GlobalGdrInitializeOrDie();
+        }
+#endif // BRPC_WITH_GDR
         if (!rdma::InitPollingModeWithTag(bthread_self_tag())) {
             return -1;
         }
@@ -369,7 +378,8 @@ int Channel::InitSingle(const butil::EndPoint& server_addr_and_port,
         return -1;
     }
     if (SocketMapInsert(SocketMapKey(server_addr_and_port, sig),
-                        &_server_id, ssl_ctx, _options.use_rdma, _options.hc_option) != 0) {
+                        &_server_id, ssl_ctx, _options.use_rdma, _options.use_gdr,
+                        _options.hc_option) != 0) {
         LOG(ERROR) << "Fail to insert into SocketMap";
         return -1;
     }
@@ -407,6 +417,7 @@ int Channel::Init(const char* ns_url,
     ns_opt.succeed_without_server = _options.succeed_without_server;
     ns_opt.log_succeed_without_server = _options.log_succeed_without_server;
     ns_opt.use_rdma = _options.use_rdma;
+    ns_opt.use_gdr = _options.use_gdr;
     ns_opt.channel_signature = ComputeChannelSignature(_options);
     ns_opt.hc_option =  _options.hc_option;
     if (CreateSocketSSLContext(_options, &ns_opt.ssl_ctx) != 0) {
diff --git a/src/brpc/channel.h b/src/brpc/channel.h
index c970209b3a..160651882b 100644
--- a/src/brpc/channel.h
+++ b/src/brpc/channel.h
@@ -109,6 +109,10 @@ struct ChannelOptions {
     // Default: false
     bool use_rdma;
 
+    // Let this channel use gdu direct rdma.
+    // Default: false
+    bool use_gdr;
+
     // Turn on authentication for this channel if `auth' is not NULL.
     // Note `auth' will not be deleted by channel and must remain valid when
     // the channel is being used.
diff --git a/src/brpc/details/naming_service_thread.cpp b/src/brpc/details/naming_service_thread.cpp
index 341ca35b09..5c62de9ad6 100644
--- a/src/brpc/details/naming_service_thread.cpp
+++ b/src/brpc/details/naming_service_thread.cpp
@@ -126,7 +126,8 @@ void NamingServiceThread::Actions::ResetServers(
         //       to pick those Sockets with the right settings during OnAddedServers
         const SocketMapKey key(_added[i], _owner->_options.channel_signature);
         CHECK_EQ(0, SocketMapInsert(key, &tagged_id.id, _owner->_options.ssl_ctx,
-                                    _owner->_options.use_rdma, _owner->_options.hc_option));
+                                    _owner->_options.use_rdma, _owner->_options.use_gdr,
+                                    _owner->_options.hc_option));
         _added_sockets.push_back(tagged_id);
     }
 
diff --git a/src/brpc/details/naming_service_thread.h b/src/brpc/details/naming_service_thread.h
index 1745e5f267..4f9c2b744e 100644
--- a/src/brpc/details/naming_service_thread.h
+++ b/src/brpc/details/naming_service_thread.h
@@ -45,11 +45,13 @@ struct GetNamingServiceThreadOptions {
     GetNamingServiceThreadOptions()
         : succeed_without_server(false)
         , log_succeed_without_server(true)
-        , use_rdma(false) {}
+        , use_rdma(false)
+        , use_gdr(false) {}
     
     bool succeed_without_server;
     bool log_succeed_without_server;
     bool use_rdma;
+    bool use_gdr;
     HealthCheckOption hc_option;
     ChannelSignature channel_signature;
     std::shared_ptr<SocketSSLContext> ssl_ctx;
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index d21082ed0a..a8bb0a5e77 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -66,6 +66,12 @@ DEFINE_bool(rdma_poller_yield, false, "Yield thread in RDMA polling mode.");
 DEFINE_bool(rdma_edisp_unsched, false, "Disable event dispatcher schedule");
 DEFINE_bool(rdma_disable_bthread, false, "Disable bthread in RDMA");
 
+namespace butil {
+    namespace gdr {
+        extern int gdr_block_size_kb;
+    }
+}
+
 static const size_t IOBUF_BLOCK_HEADER_LEN = 32; // implementation-dependent
 
 // DO NOT change this value unless you know the safe value!!!
@@ -90,6 +96,7 @@ static uint16_t g_rdma_hello_msg_len = 40;  // In Byte
 static uint16_t g_rdma_hello_version = 2;
 static uint16_t g_rdma_impl_version = 1;
 static uint32_t g_rdma_recv_block_size = 0;
+static uint32_t g_gdr_recv_block_size = 0;
 
 // static const uint32_t MAX_INLINE_DATA = 64;
 static const uint8_t MAX_HOP_LIMIT = 16;
@@ -169,8 +176,9 @@ RdmaResource::~RdmaResource() {
     }
 }
 
-RdmaEndpoint::RdmaEndpoint(Socket* s)
+RdmaEndpoint::RdmaEndpoint(Socket* s, bool use_gdr)
     : _socket(s)
+    , _use_gdr(use_gdr)
     , _state(UNINIT)
     , _resource(NULL)
     , _cq_events(0)
@@ -440,6 +448,7 @@ void* RdmaEndpoint::ProcessHandshakeAtClient(void* arg) {
     local_msg.hello_ver = g_rdma_hello_version;
     local_msg.impl_ver = g_rdma_impl_version;
     local_msg.block_size = g_rdma_recv_block_size;
+    local_msg.block_size = ep->use_gdr() ? g_gdr_recv_block_size : g_rdma_recv_block_size;
     local_msg.sq_size = ep->_sq_size;
     local_msg.rq_size = ep->_rq_size;
     local_msg.lid = GetRdmaLid();
@@ -649,7 +658,7 @@ void* RdmaEndpoint::ProcessHandshakeAtServer(void* arg) {
     } else {
         local_msg.lid = GetRdmaLid();
         local_msg.gid = GetRdmaGid();
-        local_msg.block_size = g_rdma_recv_block_size;
+        local_msg.block_size = ep->use_gdr() ? g_gdr_recv_block_size : g_rdma_recv_block_size;
         local_msg.sq_size = ep->_sq_size;
         local_msg.rq_size = ep->_rq_size;
         local_msg.hello_ver = g_rdma_hello_version;
@@ -946,12 +955,15 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
         // Please note that only the first wc.byte_len bytes is valid
         if (wc.byte_len > 0) {
 #if BRPC_WITH_GDR
-            zerocopy = true;
-#else
-            if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
-                zerocopy = false;
-            }
+            if (_use_gdr) {
+                zerocopy = true;
+            } else
 #endif // BRPC_WITH_GDR
+            {
+                if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
+                    zerocopy = false;
+                }
+            }
             CHECK(_state != FALLBACK_TCP);
             if (zerocopy) {
                 butil::IOBuf tmp;
@@ -1052,37 +1064,43 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
             _rbuf[_rq_received].clear();
 
 #if BRPC_WITH_GDR
-            butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
-            void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
-            auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
-            lkey = device_allocator->get_lkey(device_ptr);
-            uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
-            _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , data_meta);
-            _rbuf_data[_rq_received] = device_ptr;
-#else
-            butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
-                    g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
-            int size = 0;
-            if (!os.Next(&_rbuf_data[_rq_received], &size)) {
-                // Memory is not enough for preparing a block
-                PLOG(WARNING) << "Fail to allocate rbuf";
-                return -1;
-            } else {
-                CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
-            }
+            if (_use_gdr) {
+                butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
+                void* device_ptr = device_allocator->AllocateRaw(g_gdr_recv_block_size);
+                auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
+                lkey = device_allocator->get_lkey(device_ptr);
+                uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
+                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_gdr_recv_block_size, deleter , data_meta);
+                _rbuf_data[_rq_received] = device_ptr;
+            } else
 #endif  // if BRPC_WITH_GDR
+            {
+                butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
+                        g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
+                int size = 0;
+                if (!os.Next(&_rbuf_data[_rq_received], &size)) {
+                    // Memory is not enough for preparing a block
+                    PLOG(WARNING) << "Fail to allocate rbuf";
+                    return -1;
+                } else {
+                    CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
+                }
+             }
         }
 #if BRPC_WITH_GDR
-        if (DoPostRecvGDR(_rbuf_data[_rq_received], g_rdma_recv_block_size, lkey) < 0) {
-            _rbuf[_rq_received].clear();
-            return -1;
-        }
-#else
-        if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
-            _rbuf[_rq_received].clear();
-            return -1;
-        }
+        if (_use_gdr) {
+            if (DoPostRecvGDR(_rbuf_data[_rq_received], g_gdr_recv_block_size, lkey) < 0) {
+                _rbuf[_rq_received].clear();
+                return -1;
+            }
+        } else
 #endif  // if BRPC_WITH_GDR
+        {
+            if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
+                _rbuf[_rq_received].clear();
+                return -1;
+            }
+        }
 
         --num;
         ++_rq_received;
@@ -1554,11 +1572,14 @@ void RdmaEndpoint::DebugInfo(std::ostream& os) const {
        << "\n";
 }
 
-int RdmaEndpoint::GlobalInitialize() {
+int RdmaEndpoint::GlobalGdrInitialize() {
 #if BRPC_WITH_GDR
-    LOG(INFO) << ", gdr_block_size_kb: " << butil::gdr::gdr_block_size_kb;
-    g_rdma_recv_block_size = butil::gdr::gdr_block_size_kb * 1024 - IOBUF_BLOCK_HEADER_LEN;
-#else
+    LOG(INFO) << "gdr_block_size_kb: " << butil::gdr::gdr_block_size_kb;
+    g_gdr_recv_block_size = butil::gdr::gdr_block_size_kb * 1024 - IOBUF_BLOCK_HEADER_LEN;
+#endif // BRPC_WITH_GDR
+    return 0;
+}
+int RdmaEndpoint::GlobalInitialize() {
     if (FLAGS_rdma_recv_block_type == "default") {
         g_rdma_recv_block_size = GetBlockSize(0) - IOBUF_BLOCK_HEADER_LEN;
     } else if (FLAGS_rdma_recv_block_type == "large") {
@@ -1569,7 +1590,6 @@ int RdmaEndpoint::GlobalInitialize() {
         errno = EINVAL;
         return -1;
     }
-#endif // BRPC_WITH_GDR
 
     LOG(INFO) << "rdma_use_polling :" << FLAGS_rdma_use_polling
       << ", rdma_poller_num : " << FLAGS_rdma_poller_num
diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index 4705e362ea..a258abfb74 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -71,18 +71,23 @@ class BAIDU_CACHELINE_ALIGNMENT RdmaEndpoint : public SocketUser {
 friend class RdmaConnect;
 friend class brpc::Socket;
 public:
-    RdmaEndpoint(Socket* s);
+    RdmaEndpoint(Socket* s, bool use_gdr);
     ~RdmaEndpoint();
 
     // Global initialization
     // Return 0 if success, -1 if failed and errno set
     static int GlobalInitialize();
 
+    // Global initialization for gdr
+    static int GlobalGdrInitialize();
+
     static void GlobalRelease();
 
     // Reset the endpoint (for next use)
     void Reset();
 
+    bool use_gdr() { return _use_gdr; }
+
     // Cut data from the given IOBuf list and use RDMA to send
     // Return bytes cut if success, -1 if failed and errno set
     ssize_t CutFromIOBufList(butil::IOBuf** data, size_t ndata);
@@ -294,6 +299,8 @@ friend class brpc::Socket;
         std::atomic<bool> running;
     };
     static std::vector<PollerGroup> _poller_groups;
+ 
+    bool _use_gdr;
 };
 
 }  // namespace rdma
diff --git a/src/brpc/rdma/rdma_helper.cpp b/src/brpc/rdma/rdma_helper.cpp
index 3b45b2621c..cbd7889084 100644
--- a/src/brpc/rdma/rdma_helper.cpp
+++ b/src/brpc/rdma/rdma_helper.cpp
@@ -25,7 +25,9 @@
 #include "butil/containers/flat_map.h"            // butil::FlatMap
 #include "butil/fd_guard.h"
 #include "butil/fd_utility.h"                     // butil::make_non_blocking
+#if BRPC_WITH_GDR
 #include "butil/gpu/gpu_block_pool.h"
+#endif // BRPC_WITH_GDR
 #include "butil/logging.h"
 #include "brpc/socket.h"
 #include "brpc/rdma/block_pool.h"
@@ -556,13 +558,6 @@ static void GlobalRdmaInitializeOrDieImpl() {
         ExitWithError();
     }
 
-#if BRPC_WITH_GDR
-    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
-        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
-        ExitWithError();
-    }
-#endif  // if BRPC_WITH_GDR
-
     if (RdmaEndpoint::GlobalInitialize() < 0) {
         LOG(ERROR) << "rdma_recv_block_type incorrect "
                    << "(valid value: default/large/huge)";
@@ -591,7 +586,21 @@ static void GlobalRdmaInitializeOrDieImpl() {
     g_rdma_available.store(true, butil::memory_order_relaxed);
 }
 
+static void GlobalGdrInitializeOrDieImpl() {
+#if BRPC_WITH_GDR
+    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
+        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
+        ExitWithError();
+    }
+    if (RdmaEndpoint::GlobalGdrInitialize() < 0) {
+        LOG(ERROR) << "g_gdr_recv_block_size incorrect.";
+        ExitWithError();
+    }
+#endif  // if BRPC_WITH_GDR
+}
+
 static pthread_once_t initialize_rdma_once = PTHREAD_ONCE_INIT;
+static pthread_once_t initialize_gdr_once = PTHREAD_ONCE_INIT;
 
 void GlobalRdmaInitializeOrDie() {
     if (pthread_once(&initialize_rdma_once,
@@ -601,6 +610,14 @@ void GlobalRdmaInitializeOrDie() {
     }
 }
 
+void GlobalGdrInitializeOrDie() {
+    if (pthread_once(&initialize_gdr_once,
+                     GlobalGdrInitializeOrDieImpl) != 0) {
+        LOG(FATAL) << "Fail to pthread_once GlobalGdrInitializeOrDie";
+        exit(1);
+    }
+}
+
 uint32_t RegisterMemoryForRdma(void* buf, size_t len) {
     ibv_mr* mr = IbvRegMr(g_pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
     if (!mr) {
diff --git a/src/brpc/rdma/rdma_helper.h b/src/brpc/rdma/rdma_helper.h
index 25a93476e7..06cbb1f5c2 100644
--- a/src/brpc/rdma/rdma_helper.h
+++ b/src/brpc/rdma/rdma_helper.h
@@ -33,6 +33,10 @@ namespace rdma {
 // Exit if failed
 void GlobalRdmaInitializeOrDie();
 
+// Initialize GDR environment
+// Exit if failed
+void GlobalGdrInitializeOrDie();
+
 // Initialize RDMA polling mode with tag
 bool InitPollingModeWithTag(bthread_tag_t tag,
                             std::function<void(void)> callback = nullptr,
diff --git a/src/brpc/server.cpp b/src/brpc/server.cpp
index cd83053a42..38bf808e48 100644
--- a/src/brpc/server.cpp
+++ b/src/brpc/server.cpp
@@ -146,6 +146,7 @@ ServerOptions::ServerOptions()
     , has_builtin_services(true)
     , force_ssl(false)
     , use_rdma(false)
+    , use_gdr(false)
     , baidu_master_service(NULL)
     , http_master_service(NULL)
     , health_reporter(NULL)
@@ -895,6 +896,11 @@ int Server::StartInternal(const butil::EndPoint& endpoint,
             return -1;
         }
         rdma::GlobalRdmaInitializeOrDie();
+#if BRPC_WITH_GDR
+        if (_options.use_gdr) {
+            rdma::GlobalGdrInitializeOrDie();
+        }
+#endif // BRPC_WITH_GDR
         if (!rdma::InitPollingModeWithTag(_options.bthread_tag)) {
             return -1;
         }
@@ -1170,6 +1176,7 @@ int Server::StartInternal(const butil::EndPoint& endpoint,
                 return -1;
             }
             _am->_use_rdma = _options.use_rdma;
+            _am->_use_gdr = _options.use_gdr;
             _am->_bthread_tag = _options.bthread_tag;
         }
         // Set `_status' to RUNNING before accepting connections
diff --git a/src/brpc/server.h b/src/brpc/server.h
index 2cf34dbd82..f720956bb3 100644
--- a/src/brpc/server.h
+++ b/src/brpc/server.h
@@ -227,6 +227,10 @@ struct ServerOptions {
     // Default: false
     bool use_rdma;
 
+    // Whether the server uses gdr or not
+    // Default: false
+    bool use_gdr;
+
     // [CAUTION] This option is for implementing specialized baidu-std proxies,
     // most users don't need it. Don't change this option unless you fully
     // understand the description below.
diff --git a/src/brpc/socket.cpp b/src/brpc/socket.cpp
index 73ea309a71..e6938b332a 100644
--- a/src/brpc/socket.cpp
+++ b/src/brpc/socket.cpp
@@ -759,7 +759,7 @@ int Socket::OnCreated(const SocketOptions& options) {
 #if BRPC_WITH_RDMA
     CHECK(_rdma_ep == NULL);
     if (options.use_rdma) {
-        _rdma_ep = new (std::nothrow)rdma::RdmaEndpoint(this);
+        _rdma_ep = new (std::nothrow)rdma::RdmaEndpoint(this, options.use_gdr);
         if (!_rdma_ep) {
             const int saved_errno = errno;
             PLOG(ERROR) << "Fail to create RdmaEndpoint";
@@ -2811,6 +2811,7 @@ int Socket::GetPooledSocket(SocketUniquePtr* pooled_socket) {
         opt.keytable_pool = _keytable_pool;
         opt.app_connect = _app_connect;
         opt.use_rdma =  (_rdma_ep) ? true : false;
+        opt.use_gdr =  (_rdma_ep) ? _rdma_ep->use_gdr() : false;
         socket_pool = new SocketPool(opt);
         SocketPool* expected = NULL;
         if (!main_sp->socket_pool.compare_exchange_strong(
@@ -2912,6 +2913,7 @@ int Socket::GetShortSocket(SocketUniquePtr* short_socket) {
     opt.keytable_pool = _keytable_pool;
     opt.app_connect = _app_connect;
     opt.use_rdma =  (_rdma_ep) ? true : false;
+    opt.use_gdr =  (_rdma_ep) ? _rdma_ep->use_gdr() : false;
     if (get_client_side_messenger()->Create(opt, &id) != 0 ||
         Socket::Address(id, short_socket) != 0) {
         return -1;
diff --git a/src/brpc/socket.h b/src/brpc/socket.h
index 03ad43f867..4dee716fd8 100644
--- a/src/brpc/socket.h
+++ b/src/brpc/socket.h
@@ -271,6 +271,7 @@ struct SocketOptions {
     bool force_ssl{false};
     std::shared_ptr<SocketSSLContext> initial_ssl_ctx;
     bool use_rdma{false};
+    bool use_gdr{false};
     bthread_keytable_pool_t* keytable_pool{NULL};
     SocketConnection* conn{NULL};
     std::shared_ptr<AppConnect> app_connect;
diff --git a/src/brpc/socket_map.cpp b/src/brpc/socket_map.cpp
index c5c94bc747..2397f62819 100644
--- a/src/brpc/socket_map.cpp
+++ b/src/brpc/socket_map.cpp
@@ -92,8 +92,9 @@ SocketMap* get_or_new_client_side_socket_map() {
 int SocketMapInsert(const SocketMapKey& key, SocketId* id,
                     const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                     bool use_rdma,
+                    bool use_gdr,
                     const HealthCheckOption& hc_option) {
-    return get_or_new_client_side_socket_map()->Insert(key, id, ssl_ctx, use_rdma, hc_option);
+    return get_or_new_client_side_socket_map()->Insert(key, id, ssl_ctx, use_rdma, use_gdr, hc_option);
 }    
 
 int SocketMapFind(const SocketMapKey& key, SocketId* id) {
@@ -227,6 +228,7 @@ void SocketMap::ShowSocketMapInBvarIfNeed() {
 int SocketMap::Insert(const SocketMapKey& key, SocketId* id,
                       const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                       bool use_rdma,
+                      bool use_gdr,
                       const HealthCheckOption& hc_option) {
     ShowSocketMapInBvarIfNeed();
 
@@ -251,6 +253,7 @@ int SocketMap::Insert(const SocketMapKey& key, SocketId* id,
     opt.remote_side = key.peer.addr;
     opt.initial_ssl_ctx = ssl_ctx;
     opt.use_rdma = use_rdma;
+    opt.use_gdr = use_gdr;
     opt.hc_option = hc_option;
     if (_options.socket_creator->CreateSocket(opt, &tmp_id) != 0) {
         PLOG(FATAL) << "Fail to create socket to " << key.peer;
diff --git a/src/brpc/socket_map.h b/src/brpc/socket_map.h
index b0d542e78e..698bdab7ed 100644
--- a/src/brpc/socket_map.h
+++ b/src/brpc/socket_map.h
@@ -82,18 +82,19 @@ struct SocketMapKeyHasher {
 int SocketMapInsert(const SocketMapKey& key, SocketId* id,
                     const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                     bool use_rdma,
+                    bool use_gdr,
                     const HealthCheckOption& hc_option);
 
 inline int SocketMapInsert(const SocketMapKey& key, SocketId* id,
                     const std::shared_ptr<SocketSSLContext>& ssl_ctx) {
     HealthCheckOption hc_option;
-    return SocketMapInsert(key, id, ssl_ctx, false, hc_option);
+    return SocketMapInsert(key, id, ssl_ctx, false, false, hc_option);
 }
 
 inline int SocketMapInsert(const SocketMapKey& key, SocketId* id) {
     std::shared_ptr<SocketSSLContext> empty_ptr;
     HealthCheckOption hc_option;
-    return SocketMapInsert(key, id, empty_ptr, false, hc_option);
+    return SocketMapInsert(key, id, empty_ptr, false, false, hc_option);
 }
 
 // Find the SocketId associated with `key'.
@@ -155,17 +156,18 @@ class SocketMap {
     int Insert(const SocketMapKey& key, SocketId* id,
                const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                bool use_rdma,
+               bool use_gdr,
                const HealthCheckOption& hc_option);
 
     int Insert(const SocketMapKey& key, SocketId* id,
                const std::shared_ptr<SocketSSLContext>& ssl_ctx) {
         HealthCheckOption hc_option;
-        return Insert(key, id, ssl_ctx, false, hc_option);   
+        return Insert(key, id, ssl_ctx, false, false, hc_option);
     }
     int Insert(const SocketMapKey& key, SocketId* id) {
         std::shared_ptr<SocketSSLContext> empty_ptr;
         HealthCheckOption hc_option;
-        return Insert(key, id, empty_ptr, false, hc_option);
+        return Insert(key, id, empty_ptr, false, false, hc_option);
     }
 
     void Remove(const SocketMapKey& key, SocketId expected_id);

From 8f6132123198ad538a459c0d8e42c54e539930f1 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Wed, 27 May 2026 00:48:13 +0800
Subject: [PATCH 10/23] Revert "Add use_gdr in ChannelOptions"

This reverts commit 7a8bd8f009c9d7df91caf444030479a12aea7193.
---
 src/brpc/acceptor.cpp                      |  1 -
 src/brpc/acceptor.h                        |  3 -
 src/brpc/channel.cpp                       | 13 +--
 src/brpc/channel.h                         |  4 -
 src/brpc/details/naming_service_thread.cpp |  3 +-
 src/brpc/details/naming_service_thread.h   |  4 +-
 src/brpc/rdma/rdma_endpoint.cpp            | 98 +++++++++-------------
 src/brpc/rdma/rdma_endpoint.h              |  9 +-
 src/brpc/rdma/rdma_helper.cpp              | 31 ++-----
 src/brpc/rdma/rdma_helper.h                |  4 -
 src/brpc/server.cpp                        |  7 --
 src/brpc/server.h                          |  4 -
 src/brpc/socket.cpp                        |  4 +-
 src/brpc/socket.h                          |  1 -
 src/brpc/socket_map.cpp                    |  5 +-
 src/brpc/socket_map.h                      | 10 +--
 16 files changed, 56 insertions(+), 145 deletions(-)

diff --git a/src/brpc/acceptor.cpp b/src/brpc/acceptor.cpp
index 4487fc2df8..616c1a3044 100644
--- a/src/brpc/acceptor.cpp
+++ b/src/brpc/acceptor.cpp
@@ -41,7 +41,6 @@ Acceptor::Acceptor(bthread_keytable_pool_t* pool)
     , _force_ssl(false)
     , _ssl_ctx(NULL) 
     , _use_rdma(false)
-    , _use_gdr(false)
     , _bthread_tag(BTHREAD_TAG_DEFAULT) {
 }
 
diff --git a/src/brpc/acceptor.h b/src/brpc/acceptor.h
index 66f85c4904..69f632aaca 100644
--- a/src/brpc/acceptor.h
+++ b/src/brpc/acceptor.h
@@ -113,9 +113,6 @@ friend class Server;
     // Whether to use rdma or not
     bool _use_rdma;
 
-    // Whether to use gdr or not
-    bool _use_gdr;
-
     // Acceptor belongs to this tag
     bthread_tag_t _bthread_tag;
 };
diff --git a/src/brpc/channel.cpp b/src/brpc/channel.cpp
index bce554380b..0252e97d74 100644
--- a/src/brpc/channel.cpp
+++ b/src/brpc/channel.cpp
@@ -61,7 +61,6 @@ ChannelOptions::ChannelOptions()
     , succeed_without_server(true)
     , log_succeed_without_server(true)
     , use_rdma(false)
-    , use_gdr(false)
     , auth(NULL)
     , backup_request_policy(NULL)
     , retry_policy(NULL)
@@ -124,9 +123,6 @@ static ChannelSignature ComputeChannelSignature(const ChannelOptions& opt) {
         if (opt.use_rdma) {
             buf.append("|rdma");
         }
-        if (opt.use_gdr) {
-            buf.append("|gdr");
-        }
         butil::MurmurHash3_x64_128_Update(&mm_ctx, buf.data(), buf.size());
         buf.clear();
     
@@ -201,11 +197,6 @@ int Channel::InitChannelOptions(const ChannelOptions* options) {
             return -1;
         }
         rdma::GlobalRdmaInitializeOrDie();
-#if BRPC_WITH_GDR
-        if (_options.use_gdr) {
-            rdma::GlobalGdrInitializeOrDie();
-        }
-#endif // BRPC_WITH_GDR
         if (!rdma::InitPollingModeWithTag(bthread_self_tag())) {
             return -1;
         }
@@ -378,8 +369,7 @@ int Channel::InitSingle(const butil::EndPoint& server_addr_and_port,
         return -1;
     }
     if (SocketMapInsert(SocketMapKey(server_addr_and_port, sig),
-                        &_server_id, ssl_ctx, _options.use_rdma, _options.use_gdr,
-                        _options.hc_option) != 0) {
+                        &_server_id, ssl_ctx, _options.use_rdma, _options.hc_option) != 0) {
         LOG(ERROR) << "Fail to insert into SocketMap";
         return -1;
     }
@@ -417,7 +407,6 @@ int Channel::Init(const char* ns_url,
     ns_opt.succeed_without_server = _options.succeed_without_server;
     ns_opt.log_succeed_without_server = _options.log_succeed_without_server;
     ns_opt.use_rdma = _options.use_rdma;
-    ns_opt.use_gdr = _options.use_gdr;
     ns_opt.channel_signature = ComputeChannelSignature(_options);
     ns_opt.hc_option =  _options.hc_option;
     if (CreateSocketSSLContext(_options, &ns_opt.ssl_ctx) != 0) {
diff --git a/src/brpc/channel.h b/src/brpc/channel.h
index 160651882b..c970209b3a 100644
--- a/src/brpc/channel.h
+++ b/src/brpc/channel.h
@@ -109,10 +109,6 @@ struct ChannelOptions {
     // Default: false
     bool use_rdma;
 
-    // Let this channel use gdu direct rdma.
-    // Default: false
-    bool use_gdr;
-
     // Turn on authentication for this channel if `auth' is not NULL.
     // Note `auth' will not be deleted by channel and must remain valid when
     // the channel is being used.
diff --git a/src/brpc/details/naming_service_thread.cpp b/src/brpc/details/naming_service_thread.cpp
index 5c62de9ad6..341ca35b09 100644
--- a/src/brpc/details/naming_service_thread.cpp
+++ b/src/brpc/details/naming_service_thread.cpp
@@ -126,8 +126,7 @@ void NamingServiceThread::Actions::ResetServers(
         //       to pick those Sockets with the right settings during OnAddedServers
         const SocketMapKey key(_added[i], _owner->_options.channel_signature);
         CHECK_EQ(0, SocketMapInsert(key, &tagged_id.id, _owner->_options.ssl_ctx,
-                                    _owner->_options.use_rdma, _owner->_options.use_gdr,
-                                    _owner->_options.hc_option));
+                                    _owner->_options.use_rdma, _owner->_options.hc_option));
         _added_sockets.push_back(tagged_id);
     }
 
diff --git a/src/brpc/details/naming_service_thread.h b/src/brpc/details/naming_service_thread.h
index 4f9c2b744e..1745e5f267 100644
--- a/src/brpc/details/naming_service_thread.h
+++ b/src/brpc/details/naming_service_thread.h
@@ -45,13 +45,11 @@ struct GetNamingServiceThreadOptions {
     GetNamingServiceThreadOptions()
         : succeed_without_server(false)
         , log_succeed_without_server(true)
-        , use_rdma(false)
-        , use_gdr(false) {}
+        , use_rdma(false) {}
     
     bool succeed_without_server;
     bool log_succeed_without_server;
     bool use_rdma;
-    bool use_gdr;
     HealthCheckOption hc_option;
     ChannelSignature channel_signature;
     std::shared_ptr<SocketSSLContext> ssl_ctx;
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index a8bb0a5e77..d21082ed0a 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -66,12 +66,6 @@ DEFINE_bool(rdma_poller_yield, false, "Yield thread in RDMA polling mode.");
 DEFINE_bool(rdma_edisp_unsched, false, "Disable event dispatcher schedule");
 DEFINE_bool(rdma_disable_bthread, false, "Disable bthread in RDMA");
 
-namespace butil {
-    namespace gdr {
-        extern int gdr_block_size_kb;
-    }
-}
-
 static const size_t IOBUF_BLOCK_HEADER_LEN = 32; // implementation-dependent
 
 // DO NOT change this value unless you know the safe value!!!
@@ -96,7 +90,6 @@ static uint16_t g_rdma_hello_msg_len = 40;  // In Byte
 static uint16_t g_rdma_hello_version = 2;
 static uint16_t g_rdma_impl_version = 1;
 static uint32_t g_rdma_recv_block_size = 0;
-static uint32_t g_gdr_recv_block_size = 0;
 
 // static const uint32_t MAX_INLINE_DATA = 64;
 static const uint8_t MAX_HOP_LIMIT = 16;
@@ -176,9 +169,8 @@ RdmaResource::~RdmaResource() {
     }
 }
 
-RdmaEndpoint::RdmaEndpoint(Socket* s, bool use_gdr)
+RdmaEndpoint::RdmaEndpoint(Socket* s)
     : _socket(s)
-    , _use_gdr(use_gdr)
     , _state(UNINIT)
     , _resource(NULL)
     , _cq_events(0)
@@ -448,7 +440,6 @@ void* RdmaEndpoint::ProcessHandshakeAtClient(void* arg) {
     local_msg.hello_ver = g_rdma_hello_version;
     local_msg.impl_ver = g_rdma_impl_version;
     local_msg.block_size = g_rdma_recv_block_size;
-    local_msg.block_size = ep->use_gdr() ? g_gdr_recv_block_size : g_rdma_recv_block_size;
     local_msg.sq_size = ep->_sq_size;
     local_msg.rq_size = ep->_rq_size;
     local_msg.lid = GetRdmaLid();
@@ -658,7 +649,7 @@ void* RdmaEndpoint::ProcessHandshakeAtServer(void* arg) {
     } else {
         local_msg.lid = GetRdmaLid();
         local_msg.gid = GetRdmaGid();
-        local_msg.block_size = ep->use_gdr() ? g_gdr_recv_block_size : g_rdma_recv_block_size;
+        local_msg.block_size = g_rdma_recv_block_size;
         local_msg.sq_size = ep->_sq_size;
         local_msg.rq_size = ep->_rq_size;
         local_msg.hello_ver = g_rdma_hello_version;
@@ -955,15 +946,12 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
         // Please note that only the first wc.byte_len bytes is valid
         if (wc.byte_len > 0) {
 #if BRPC_WITH_GDR
-            if (_use_gdr) {
-                zerocopy = true;
-            } else
-#endif // BRPC_WITH_GDR
-            {
-                if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
-                    zerocopy = false;
-                }
+            zerocopy = true;
+#else
+            if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
+                zerocopy = false;
             }
+#endif // BRPC_WITH_GDR
             CHECK(_state != FALLBACK_TCP);
             if (zerocopy) {
                 butil::IOBuf tmp;
@@ -1064,43 +1052,37 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
             _rbuf[_rq_received].clear();
 
 #if BRPC_WITH_GDR
-            if (_use_gdr) {
-                butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
-                void* device_ptr = device_allocator->AllocateRaw(g_gdr_recv_block_size);
-                auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
-                lkey = device_allocator->get_lkey(device_ptr);
-                uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
-                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_gdr_recv_block_size, deleter , data_meta);
-                _rbuf_data[_rq_received] = device_ptr;
-            } else
-#endif  // if BRPC_WITH_GDR
-            {
-                butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
-                        g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
-                int size = 0;
-                if (!os.Next(&_rbuf_data[_rq_received], &size)) {
-                    // Memory is not enough for preparing a block
-                    PLOG(WARNING) << "Fail to allocate rbuf";
-                    return -1;
-                } else {
-                    CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
-                }
-             }
-        }
-#if BRPC_WITH_GDR
-        if (_use_gdr) {
-            if (DoPostRecvGDR(_rbuf_data[_rq_received], g_gdr_recv_block_size, lkey) < 0) {
-                _rbuf[_rq_received].clear();
+            butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
+            void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
+            auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
+            lkey = device_allocator->get_lkey(device_ptr);
+            uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
+            _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , data_meta);
+            _rbuf_data[_rq_received] = device_ptr;
+#else
+            butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
+                    g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
+            int size = 0;
+            if (!os.Next(&_rbuf_data[_rq_received], &size)) {
+                // Memory is not enough for preparing a block
+                PLOG(WARNING) << "Fail to allocate rbuf";
                 return -1;
+            } else {
+                CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
             }
-        } else
 #endif  // if BRPC_WITH_GDR
-        {
-            if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
-                _rbuf[_rq_received].clear();
-                return -1;
-            }
         }
+#if BRPC_WITH_GDR
+        if (DoPostRecvGDR(_rbuf_data[_rq_received], g_rdma_recv_block_size, lkey) < 0) {
+            _rbuf[_rq_received].clear();
+            return -1;
+        }
+#else
+        if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
+            _rbuf[_rq_received].clear();
+            return -1;
+        }
+#endif  // if BRPC_WITH_GDR
 
         --num;
         ++_rq_received;
@@ -1572,14 +1554,11 @@ void RdmaEndpoint::DebugInfo(std::ostream& os) const {
        << "\n";
 }
 
-int RdmaEndpoint::GlobalGdrInitialize() {
-#if BRPC_WITH_GDR
-    LOG(INFO) << "gdr_block_size_kb: " << butil::gdr::gdr_block_size_kb;
-    g_gdr_recv_block_size = butil::gdr::gdr_block_size_kb * 1024 - IOBUF_BLOCK_HEADER_LEN;
-#endif // BRPC_WITH_GDR
-    return 0;
-}
 int RdmaEndpoint::GlobalInitialize() {
+#if BRPC_WITH_GDR
+    LOG(INFO) << ", gdr_block_size_kb: " << butil::gdr::gdr_block_size_kb;
+    g_rdma_recv_block_size = butil::gdr::gdr_block_size_kb * 1024 - IOBUF_BLOCK_HEADER_LEN;
+#else
     if (FLAGS_rdma_recv_block_type == "default") {
         g_rdma_recv_block_size = GetBlockSize(0) - IOBUF_BLOCK_HEADER_LEN;
     } else if (FLAGS_rdma_recv_block_type == "large") {
@@ -1590,6 +1569,7 @@ int RdmaEndpoint::GlobalInitialize() {
         errno = EINVAL;
         return -1;
     }
+#endif // BRPC_WITH_GDR
 
     LOG(INFO) << "rdma_use_polling :" << FLAGS_rdma_use_polling
       << ", rdma_poller_num : " << FLAGS_rdma_poller_num
diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index a258abfb74..4705e362ea 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -71,23 +71,18 @@ class BAIDU_CACHELINE_ALIGNMENT RdmaEndpoint : public SocketUser {
 friend class RdmaConnect;
 friend class brpc::Socket;
 public:
-    RdmaEndpoint(Socket* s, bool use_gdr);
+    RdmaEndpoint(Socket* s);
     ~RdmaEndpoint();
 
     // Global initialization
     // Return 0 if success, -1 if failed and errno set
     static int GlobalInitialize();
 
-    // Global initialization for gdr
-    static int GlobalGdrInitialize();
-
     static void GlobalRelease();
 
     // Reset the endpoint (for next use)
     void Reset();
 
-    bool use_gdr() { return _use_gdr; }
-
     // Cut data from the given IOBuf list and use RDMA to send
     // Return bytes cut if success, -1 if failed and errno set
     ssize_t CutFromIOBufList(butil::IOBuf** data, size_t ndata);
@@ -299,8 +294,6 @@ friend class brpc::Socket;
         std::atomic<bool> running;
     };
     static std::vector<PollerGroup> _poller_groups;
- 
-    bool _use_gdr;
 };
 
 }  // namespace rdma
diff --git a/src/brpc/rdma/rdma_helper.cpp b/src/brpc/rdma/rdma_helper.cpp
index cbd7889084..3b45b2621c 100644
--- a/src/brpc/rdma/rdma_helper.cpp
+++ b/src/brpc/rdma/rdma_helper.cpp
@@ -25,9 +25,7 @@
 #include "butil/containers/flat_map.h"            // butil::FlatMap
 #include "butil/fd_guard.h"
 #include "butil/fd_utility.h"                     // butil::make_non_blocking
-#if BRPC_WITH_GDR
 #include "butil/gpu/gpu_block_pool.h"
-#endif // BRPC_WITH_GDR
 #include "butil/logging.h"
 #include "brpc/socket.h"
 #include "brpc/rdma/block_pool.h"
@@ -558,6 +556,13 @@ static void GlobalRdmaInitializeOrDieImpl() {
         ExitWithError();
     }
 
+#if BRPC_WITH_GDR
+    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
+        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
+        ExitWithError();
+    }
+#endif  // if BRPC_WITH_GDR
+
     if (RdmaEndpoint::GlobalInitialize() < 0) {
         LOG(ERROR) << "rdma_recv_block_type incorrect "
                    << "(valid value: default/large/huge)";
@@ -586,21 +591,7 @@ static void GlobalRdmaInitializeOrDieImpl() {
     g_rdma_available.store(true, butil::memory_order_relaxed);
 }
 
-static void GlobalGdrInitializeOrDieImpl() {
-#if BRPC_WITH_GDR
-    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
-        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
-        ExitWithError();
-    }
-    if (RdmaEndpoint::GlobalGdrInitialize() < 0) {
-        LOG(ERROR) << "g_gdr_recv_block_size incorrect.";
-        ExitWithError();
-    }
-#endif  // if BRPC_WITH_GDR
-}
-
 static pthread_once_t initialize_rdma_once = PTHREAD_ONCE_INIT;
-static pthread_once_t initialize_gdr_once = PTHREAD_ONCE_INIT;
 
 void GlobalRdmaInitializeOrDie() {
     if (pthread_once(&initialize_rdma_once,
@@ -610,14 +601,6 @@ void GlobalRdmaInitializeOrDie() {
     }
 }
 
-void GlobalGdrInitializeOrDie() {
-    if (pthread_once(&initialize_gdr_once,
-                     GlobalGdrInitializeOrDieImpl) != 0) {
-        LOG(FATAL) << "Fail to pthread_once GlobalGdrInitializeOrDie";
-        exit(1);
-    }
-}
-
 uint32_t RegisterMemoryForRdma(void* buf, size_t len) {
     ibv_mr* mr = IbvRegMr(g_pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
     if (!mr) {
diff --git a/src/brpc/rdma/rdma_helper.h b/src/brpc/rdma/rdma_helper.h
index 06cbb1f5c2..25a93476e7 100644
--- a/src/brpc/rdma/rdma_helper.h
+++ b/src/brpc/rdma/rdma_helper.h
@@ -33,10 +33,6 @@ namespace rdma {
 // Exit if failed
 void GlobalRdmaInitializeOrDie();
 
-// Initialize GDR environment
-// Exit if failed
-void GlobalGdrInitializeOrDie();
-
 // Initialize RDMA polling mode with tag
 bool InitPollingModeWithTag(bthread_tag_t tag,
                             std::function<void(void)> callback = nullptr,
diff --git a/src/brpc/server.cpp b/src/brpc/server.cpp
index 38bf808e48..cd83053a42 100644
--- a/src/brpc/server.cpp
+++ b/src/brpc/server.cpp
@@ -146,7 +146,6 @@ ServerOptions::ServerOptions()
     , has_builtin_services(true)
     , force_ssl(false)
     , use_rdma(false)
-    , use_gdr(false)
     , baidu_master_service(NULL)
     , http_master_service(NULL)
     , health_reporter(NULL)
@@ -896,11 +895,6 @@ int Server::StartInternal(const butil::EndPoint& endpoint,
             return -1;
         }
         rdma::GlobalRdmaInitializeOrDie();
-#if BRPC_WITH_GDR
-        if (_options.use_gdr) {
-            rdma::GlobalGdrInitializeOrDie();
-        }
-#endif // BRPC_WITH_GDR
         if (!rdma::InitPollingModeWithTag(_options.bthread_tag)) {
             return -1;
         }
@@ -1176,7 +1170,6 @@ int Server::StartInternal(const butil::EndPoint& endpoint,
                 return -1;
             }
             _am->_use_rdma = _options.use_rdma;
-            _am->_use_gdr = _options.use_gdr;
             _am->_bthread_tag = _options.bthread_tag;
         }
         // Set `_status' to RUNNING before accepting connections
diff --git a/src/brpc/server.h b/src/brpc/server.h
index f720956bb3..2cf34dbd82 100644
--- a/src/brpc/server.h
+++ b/src/brpc/server.h
@@ -227,10 +227,6 @@ struct ServerOptions {
     // Default: false
     bool use_rdma;
 
-    // Whether the server uses gdr or not
-    // Default: false
-    bool use_gdr;
-
     // [CAUTION] This option is for implementing specialized baidu-std proxies,
     // most users don't need it. Don't change this option unless you fully
     // understand the description below.
diff --git a/src/brpc/socket.cpp b/src/brpc/socket.cpp
index e6938b332a..73ea309a71 100644
--- a/src/brpc/socket.cpp
+++ b/src/brpc/socket.cpp
@@ -759,7 +759,7 @@ int Socket::OnCreated(const SocketOptions& options) {
 #if BRPC_WITH_RDMA
     CHECK(_rdma_ep == NULL);
     if (options.use_rdma) {
-        _rdma_ep = new (std::nothrow)rdma::RdmaEndpoint(this, options.use_gdr);
+        _rdma_ep = new (std::nothrow)rdma::RdmaEndpoint(this);
         if (!_rdma_ep) {
             const int saved_errno = errno;
             PLOG(ERROR) << "Fail to create RdmaEndpoint";
@@ -2811,7 +2811,6 @@ int Socket::GetPooledSocket(SocketUniquePtr* pooled_socket) {
         opt.keytable_pool = _keytable_pool;
         opt.app_connect = _app_connect;
         opt.use_rdma =  (_rdma_ep) ? true : false;
-        opt.use_gdr =  (_rdma_ep) ? _rdma_ep->use_gdr() : false;
         socket_pool = new SocketPool(opt);
         SocketPool* expected = NULL;
         if (!main_sp->socket_pool.compare_exchange_strong(
@@ -2913,7 +2912,6 @@ int Socket::GetShortSocket(SocketUniquePtr* short_socket) {
     opt.keytable_pool = _keytable_pool;
     opt.app_connect = _app_connect;
     opt.use_rdma =  (_rdma_ep) ? true : false;
-    opt.use_gdr =  (_rdma_ep) ? _rdma_ep->use_gdr() : false;
     if (get_client_side_messenger()->Create(opt, &id) != 0 ||
         Socket::Address(id, short_socket) != 0) {
         return -1;
diff --git a/src/brpc/socket.h b/src/brpc/socket.h
index 4dee716fd8..03ad43f867 100644
--- a/src/brpc/socket.h
+++ b/src/brpc/socket.h
@@ -271,7 +271,6 @@ struct SocketOptions {
     bool force_ssl{false};
     std::shared_ptr<SocketSSLContext> initial_ssl_ctx;
     bool use_rdma{false};
-    bool use_gdr{false};
     bthread_keytable_pool_t* keytable_pool{NULL};
     SocketConnection* conn{NULL};
     std::shared_ptr<AppConnect> app_connect;
diff --git a/src/brpc/socket_map.cpp b/src/brpc/socket_map.cpp
index 2397f62819..c5c94bc747 100644
--- a/src/brpc/socket_map.cpp
+++ b/src/brpc/socket_map.cpp
@@ -92,9 +92,8 @@ SocketMap* get_or_new_client_side_socket_map() {
 int SocketMapInsert(const SocketMapKey& key, SocketId* id,
                     const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                     bool use_rdma,
-                    bool use_gdr,
                     const HealthCheckOption& hc_option) {
-    return get_or_new_client_side_socket_map()->Insert(key, id, ssl_ctx, use_rdma, use_gdr, hc_option);
+    return get_or_new_client_side_socket_map()->Insert(key, id, ssl_ctx, use_rdma, hc_option);
 }    
 
 int SocketMapFind(const SocketMapKey& key, SocketId* id) {
@@ -228,7 +227,6 @@ void SocketMap::ShowSocketMapInBvarIfNeed() {
 int SocketMap::Insert(const SocketMapKey& key, SocketId* id,
                       const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                       bool use_rdma,
-                      bool use_gdr,
                       const HealthCheckOption& hc_option) {
     ShowSocketMapInBvarIfNeed();
 
@@ -253,7 +251,6 @@ int SocketMap::Insert(const SocketMapKey& key, SocketId* id,
     opt.remote_side = key.peer.addr;
     opt.initial_ssl_ctx = ssl_ctx;
     opt.use_rdma = use_rdma;
-    opt.use_gdr = use_gdr;
     opt.hc_option = hc_option;
     if (_options.socket_creator->CreateSocket(opt, &tmp_id) != 0) {
         PLOG(FATAL) << "Fail to create socket to " << key.peer;
diff --git a/src/brpc/socket_map.h b/src/brpc/socket_map.h
index 698bdab7ed..b0d542e78e 100644
--- a/src/brpc/socket_map.h
+++ b/src/brpc/socket_map.h
@@ -82,19 +82,18 @@ struct SocketMapKeyHasher {
 int SocketMapInsert(const SocketMapKey& key, SocketId* id,
                     const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                     bool use_rdma,
-                    bool use_gdr,
                     const HealthCheckOption& hc_option);
 
 inline int SocketMapInsert(const SocketMapKey& key, SocketId* id,
                     const std::shared_ptr<SocketSSLContext>& ssl_ctx) {
     HealthCheckOption hc_option;
-    return SocketMapInsert(key, id, ssl_ctx, false, false, hc_option);
+    return SocketMapInsert(key, id, ssl_ctx, false, hc_option);
 }
 
 inline int SocketMapInsert(const SocketMapKey& key, SocketId* id) {
     std::shared_ptr<SocketSSLContext> empty_ptr;
     HealthCheckOption hc_option;
-    return SocketMapInsert(key, id, empty_ptr, false, false, hc_option);
+    return SocketMapInsert(key, id, empty_ptr, false, hc_option);
 }
 
 // Find the SocketId associated with `key'.
@@ -156,18 +155,17 @@ class SocketMap {
     int Insert(const SocketMapKey& key, SocketId* id,
                const std::shared_ptr<SocketSSLContext>& ssl_ctx,
                bool use_rdma,
-               bool use_gdr,
                const HealthCheckOption& hc_option);
 
     int Insert(const SocketMapKey& key, SocketId* id,
                const std::shared_ptr<SocketSSLContext>& ssl_ctx) {
         HealthCheckOption hc_option;
-        return Insert(key, id, ssl_ctx, false, false, hc_option);
+        return Insert(key, id, ssl_ctx, false, hc_option);   
     }
     int Insert(const SocketMapKey& key, SocketId* id) {
         std::shared_ptr<SocketSSLContext> empty_ptr;
         HealthCheckOption hc_option;
-        return Insert(key, id, empty_ptr, false, false, hc_option);
+        return Insert(key, id, empty_ptr, false, hc_option);
     }
 
     void Remove(const SocketMapKey& key, SocketId expected_id);

From 2a3208a58764d5385064b7ea0fd7d1fc25a4b9b9 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Wed, 27 May 2026 00:48:39 +0800
Subject: [PATCH 11/23] Revert "use IBV_MTU_4096 in gdr"

This reverts commit 007c073dc0610c309d1c12c394545ec8fc292aef.
---
 src/brpc/rdma/rdma_endpoint.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index d21082ed0a..73bf974330 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -1256,11 +1256,7 @@ int RdmaEndpoint::BringUpQp(uint16_t lid, ibv_gid gid, uint32_t qp_num) {
     }
 
     attr.qp_state = IBV_QPS_RTR;
-#if BRPC_WITH_GDR
-    attr.path_mtu = IBV_MTU_4096;  // TODO: detect mtu automatically
-#else
     attr.path_mtu = IBV_MTU_1024;  // TODO: support more mtu in future
-#endif  // if BRPC_WITH_GDR
     attr.ah_attr.grh.dgid = gid;
     attr.ah_attr.grh.flow_label = 0;
     attr.ah_attr.grh.sgid_index = GetRdmaGidIndex();

From f491a316e38fcaf6db3197c63a02edf5f80318a0 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Thu, 28 May 2026 00:04:28 +0800
Subject: [PATCH 12/23] fix build

---
 BUILD.bazel                         | 20 +++----
 bazel/config/BUILD.bazel            |  5 ++
 config_brpc.sh                      | 17 +++++-
 example/BUILD.bazel                 |  5 +-
 example/rdma_performance/client.cpp | 59 +++++++++++++++++++--
 example/rdma_performance/server.cpp |  7 +++
 src/brpc/acceptor.h                 |  2 +-
 src/brpc/channel.cpp                |  2 +
 src/brpc/rdma/rdma_endpoint.cpp     | 82 +++++++++++++++++------------
 src/brpc/rdma/rdma_endpoint.h       |  8 ++-
 src/brpc/rdma/rdma_helper.cpp       | 34 +++++++++---
 src/brpc/rdma/rdma_helper.h         |  4 ++
 src/brpc/rdma_transport.cpp         | 11 ++--
 src/brpc/rdma_transport.h           |  3 +-
 src/brpc/socket_mode.h              |  5 +-
 src/brpc/transport_factory.cpp      | 17 +++++-
 src/butil/gpu/gpu_block_pool.cpp    | 12 +++--
 src/butil/gpu/gpu_block_pool.h      | 14 +----
 18 files changed, 224 insertions(+), 83 deletions(-)

diff --git a/BUILD.bazel b/BUILD.bazel
index 12aeb03cbb..d4c56fe9ce 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -45,6 +45,9 @@ COPTS = [
 }) + select({
     "//bazel/config:brpc_with_rdma": ["-DBRPC_WITH_RDMA=1"],
     "//conditions:default": [""],
+}) + select({
+    "//bazel/config:brpc_with_gdr": ["-DBRPC_WITH_gdr=1"],
+    "//conditions:default": [""],
 }) + select({
     "//bazel/config:brpc_with_debug_bthread_sche_safety": ["-DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=1"],
     "//conditions:default": ["-DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=0"],
@@ -54,9 +57,6 @@ COPTS = [
 }) + select({
     "//bazel/config:brpc_with_asan": ["-fsanitize=address"],
     "//conditions:default": [""],
-}) + select({
-    ":brpc_with_gdr": ["-DBRPC_WITH_GDR=1"],
-    "//conditions:default": [""],
 }) + select({
     "//bazel/config:brpc_with_no_pthread_mutex_hook": ["-DNO_PTHREAD_MUTEX_HOOK"],
     "//conditions:default": [""],
@@ -94,6 +94,11 @@ LINKOPTS = [
         "-libverbs",
     ],
     "//conditions:default": [],
+}) + select({
+    "//bazel/config:brpc_with_gdr": [
+        "-lcuda -lcudart",
+    ],
+    "//conditions:default": [],
 }) + select({
         "//bazel/config:brpc_with_asan": ["-fsanitize=address"],
         "//conditions:default": [""],
@@ -341,9 +346,6 @@ cc_library(
             "-DUNIT_TEST",
         ],
         "//conditions:default": [],
-    }) + select({
-        ":brpc_with_gdr": ["@local_config_cuda//cuda:cuda_headers"],
-        "//conditions:default": [],
     }),
     includes = [
         "src/",
@@ -363,9 +365,6 @@ cc_library(
     }) + select({
         "//bazel/config:brpc_with_boringssl": ["@boringssl//:ssl", "@boringssl//:crypto"],
         "//conditions:default": ["@openssl//:ssl", "@openssl//:crypto"],
-    }) + select({
-        ":brpc_with_gdr": ["@local_config_cuda//cuda:cuda_headers"],
-        "//conditions:default": [],
     }),
 )
 
@@ -583,9 +582,6 @@ cc_library(
             "@org_apache_thrift//:thrift",
         ],
         "//conditions:default": [],
-    }) + select({
-        ":brpc_with_gdr": ["@local_config_cuda//cuda:cuda_headers"],
-        "//conditions:default": [],
     }),
 )
 
diff --git a/bazel/config/BUILD.bazel b/bazel/config/BUILD.bazel
index 06376cf85c..17dbe2bb94 100644
--- a/bazel/config/BUILD.bazel
+++ b/bazel/config/BUILD.bazel
@@ -104,6 +104,11 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "brpc_with_gdr",
+    define_values = {"BRPC_WITH_GDR": "true"},
+    visibility = ["//visibility:public"],
+
 config_setting(
     name = "brpc_with_gdr",
     define_values = {"BRPC_WITH_GDR": "true"},
diff --git a/config_brpc.sh b/config_brpc.sh
index 4526d218a8..de2dfdc74a 100755
--- a/config_brpc.sh
+++ b/config_brpc.sh
@@ -54,10 +54,11 @@ else
     LDD=ldd
 fi
 
-TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-mesalink,with-bthread-tracer,with-debug-bthread-sche-safety,with-debug-lock,with-asan,nodebugsymbols,werror -n 'config_brpc' -- "$@"`
+TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-gdr,with-mesalink,with-bthread-tracer,with-debug-bthread-sche-safety,with-debug-lock,with-asan,nodebugsymbols,werror -n 'config_brpc' -- "$@"`
 WITH_GLOG=0
 WITH_THRIFT=0
 WITH_RDMA=0
+WITH_GDR=0
 WITH_MESALINK=0
 WITH_BTHREAD_TRACER=0
 WITH_ASAN=0
@@ -87,6 +88,7 @@ while true; do
         --with-glog ) WITH_GLOG=1; shift 1 ;;
         --with-thrift) WITH_THRIFT=1; shift 1 ;;
         --with-rdma) WITH_RDMA=1; shift 1 ;;
+        --with-gdr) WITH_GDR=1; shift 1 ;;
         --with-mesalink) WITH_MESALINK=1; shift 1 ;;
         --with-bthread-tracer) WITH_BTHREAD_TRACER=1; shift 1 ;;
         --with-debug-bthread-sche-safety ) BRPC_DEBUG_BTHREAD_SCHE_SAFETY=1; shift 1 ;;
@@ -532,6 +534,18 @@ if [ $WITH_RDMA != 0 ]; then
     append_to_output "WITH_RDMA=1"
 fi
 
+if [ $WITH_GDR != 0 ]; then
+    CUDA_LIB="/usr/local/cuda/lib64"
+    CUDA_HDR="/usr/local/cuda/include"
+    append_to_output_libs "$CUDA_LIB"
+    append_to_output_headers "$CUDA_HDR"
+
+    CPPFLAGS="${CPPFLAGS} -DBRPC_WITH_GDR"
+
+    append_to_output "DYNAMIC_LINKINGS+=-lcuda -lcudart"
+    append_to_output "WITH_GDR=1"
+fi
+
 if [ $WITH_MESALINK != 0 ]; then
     CPPFLAGS="${CPPFLAGS} -DUSE_MESALINK"
 fi
@@ -652,6 +666,7 @@ print_info "System:    $SYSTEM"
 if [ $WITH_GLOG -ne 0 ]; then print_info "With glog: yes"; fi
 if [ $WITH_THRIFT -ne 0 ]; then print_info "With thrift: yes"; fi
 if [ $WITH_RDMA -ne 0 ]; then print_info "With RDMA: yes"; fi
+if [ $WITH_GDR -ne 0 ]; then print_info "With GDR: yes"; fi
 if [ $WITH_MESALINK -ne 0 ]; then print_info "With MesaLink: yes"; fi
 if [ $WITH_BTHREAD_TRACER -ne 0 ]; then print_info "With bthread tracer: yes"; fi
 if [ $WITH_ASAN -ne 0 ]; then print_info "With ASAN: yes"; fi
diff --git a/example/BUILD.bazel b/example/BUILD.bazel
index df2722a4f6..c6bbafac82 100644
--- a/example/BUILD.bazel
+++ b/example/BUILD.bazel
@@ -34,6 +34,9 @@ COPTS = [
 }) + select({
     "//bazel/config:brpc_with_rdma": ["-DBRPC_WITH_RDMA=1"],
     "//conditions:default": [""],
+}) + select({
+    "//bazel/config:brpc_with_gdr": ["-DBRPC_WITH_GDR=1"],
+    "//conditions:default": [""],
 })
 
 proto_library(
@@ -133,4 +136,4 @@ cc_binary(
     deps = [
         "//:brpc",
     ],
-)
\ No newline at end of file
+)
diff --git a/example/rdma_performance/client.cpp b/example/rdma_performance/client.cpp
index 2e8acc4051..f957afd2a4 100644
--- a/example/rdma_performance/client.cpp
+++ b/example/rdma_performance/client.cpp
@@ -15,6 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#ifdef BRPC_WITH_GDR
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
 #include <stdlib.h>
 #include <unistd.h>
 #include <vector>
@@ -42,6 +46,7 @@ DEFINE_string(connection_type, "single", "Connection type of the channel");
 DEFINE_string(protocol, "baidu_std", "Protocol type.");
 DEFINE_string(servers, "0.0.0.0:8002+0.0.0.0:8002", "IP Address of servers");
 DEFINE_bool(use_rdma, true, "Use RDMA or not");
+DEFINE_bool(use_gdr, false, "Use GDR or not");
 DEFINE_int32(rpc_timeout_ms, 2000, "RPC call timeout");
 DEFINE_int32(test_seconds, 20, "Test running time");
 DEFINE_int32(test_iterations, 0, "Test iterations");
@@ -84,16 +89,47 @@ class PerformanceTest {
         , _stop(false)
     {
         if (attachment_size > 0) {
-            _addr = malloc(attachment_size);
-            butil::fast_rand_bytes(_addr, attachment_size);
-            _attachment.append(_addr, attachment_size);
+#ifdef BRPC_WITH_GDR
+            if (FLAGS_use_gdr) {
+                int gpu_id = 0;
+                cudaSetDevice(gpu_id);
+                cudaMalloc(&_addr, attachment_size);
+                auto pd = brpc::rdma::GetRdmaPd();
+                mr = ibv_reg_mr(pd, _addr, attachment_size,
+                        IBV_ACCESS_LOCAL_WRITE |
+                        IBV_ACCESS_LOCAL_READ |
+                        IBV_ACCESS_REMOTE_WRITE |
+                        );
+                if (!mr) {
+                    LOG(FATAL) << "Failed to register MR:" << strerror(errno)
+                        << ", addr:" << _addr;
+                }
+                auto deleter = [](void* date) {};
+                _attachment.append_user_data_with_meta(_addr, attachment_size, deleter, mr->lkey);
+            }
+            else
+#endif
+            {
+                _addr = malloc(attachment_size);
+                butil::fast_rand_bytes(_addr, attachment_size);
+                _attachment.append(_addr, attachment_size);
+            }
         }
         _echo_attachment = echo_attachment;
     }
 
     ~PerformanceTest() {
         if (_addr) {
-            free(_addr);
+#ifdef BRPC_WITH_GDR
+            if (FLAGS_use_gdr) {
+                ibv_dereg_mr(mr);
+                cudaFree(_addr);
+            }
+            else
+#endif
+            {
+                free(_addr);
+            }
         }
         delete _channel;
     }
@@ -103,6 +139,11 @@ class PerformanceTest {
     int Init() {
         brpc::ChannelOptions options;
         options.socket_mode = FLAGS_use_rdma? brpc::SOCKET_MODE_RDMA : brpc::SOCKET_MODE_TCP;
+#ifdef BRPC_WITH_GDR
+        if (FLAGS_use_gdr) {
+            options.socket_mode = brpc::SOCKET_MODE_GDR;
+        }
+#endif
         options.protocol = FLAGS_protocol;
         options.connection_type = FLAGS_connection_type;
         options.timeout_ms = FLAGS_rpc_timeout_ms;
@@ -203,6 +244,9 @@ class PerformanceTest {
     }
 
 private:
+#ifdef BRPC_WITH_GDR
+    ibv_mr* mr;
+#endif
     void* _addr;
     brpc::Channel* _channel;
     uint64_t _start_time;
@@ -223,6 +267,7 @@ void Test(int thread_num, int attachment_size) {
         << ", Depth: " << FLAGS_queue_depth
         << ", Attachment: " << attachment_size << "B"
         << ", RDMA: " << (FLAGS_use_rdma ? "yes" : "no")
+        << ", GDR: " << (FLAGS_use_gdr ? "yes" : "no")
         << ", Echo: " << (FLAGS_echo_attachment ? "yes]" : "no]")
         << std::endl;
     g_total_bytes.store(0, butil::memory_order_relaxed);
@@ -278,6 +323,12 @@ int main(int argc, char* argv[]) {
     if (FLAGS_use_rdma) {
         brpc::rdma::GlobalRdmaInitializeOrDie();
     }
+#ifdef BRPC_WITH_GDR
+    else if (FLAGS_use_gdr) {
+        brpc::rdma::GlobalRdmaInitializeOrDie();
+        brpc::rdma::GlobalGdrInitializeOrDie();
+    }
+#endif
 
     brpc::StartDummyServerAt(FLAGS_dummy_port);
 
diff --git a/example/rdma_performance/server.cpp b/example/rdma_performance/server.cpp
index 2e93e1eec7..5eaa43a307 100644
--- a/example/rdma_performance/server.cpp
+++ b/example/rdma_performance/server.cpp
@@ -28,6 +28,7 @@
 
 DEFINE_int32(port, 8002, "TCP Port of this server");
 DEFINE_bool(use_rdma, true, "Use RDMA or not");
+DEFINE_bool(use_gdr, false, "Use RDMA or not");
 
 butil::atomic<uint64_t> g_last_time(0);
 
@@ -77,6 +78,12 @@ int main(int argc, char* argv[]) {
 
     brpc::ServerOptions options;
     options.socket_mode = FLAGS_use_rdma? brpc::SOCKET_MODE_RDMA : brpc::SOCKET_MODE_TCP;
+#ifdef BRPC_WITH_GDR
+    if (FLAGS_use_gdr) {
+        options.socket_mode = brpc::SOCKET_MODE_GDR;
+    }
+#endif
+
     if (server.Start(FLAGS_port, &options) != 0) {
         LOG(ERROR) << "Fail to start EchoServer";
         return -1;
diff --git a/src/brpc/acceptor.h b/src/brpc/acceptor.h
index 77942beca2..f28d3f5bce 100644
--- a/src/brpc/acceptor.h
+++ b/src/brpc/acceptor.h
@@ -111,7 +111,7 @@ friend class Server;
     bool _force_ssl;
     std::shared_ptr<SocketSSLContext> _ssl_ctx;
 
-    // Choose to use a certain socket: 0 TCP, 1 RDMA
+    // Choose to use a certain socket: 0 TCP, 1 RDMA, 2 GDR
     SocketMode _socket_mode;
 
     // Acceptor belongs to this tag
diff --git a/src/brpc/channel.cpp b/src/brpc/channel.cpp
index a8caeaf953..08f1445a58 100644
--- a/src/brpc/channel.cpp
+++ b/src/brpc/channel.cpp
@@ -134,6 +134,8 @@ static ChannelSignature ComputeChannelSignature(const ChannelOptions& opt) {
         }
         if (opt.socket_mode == SOCKET_MODE_RDMA) {
             buf.append("|rdma");
+        } else if (opt.socket_mode == SOCKET_MODE_GDR) {
+            buf.append("|gdr");
         }
         butil::MurmurHash3_x64_128_Update(&mm_ctx, buf.data(), buf.size());
         buf.clear();
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 090e788b5c..f4251b4fec 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -169,8 +169,9 @@ RdmaResource::~RdmaResource() {
     }
 }
 
-RdmaEndpoint::RdmaEndpoint(Socket* s)
+RdmaEndpoint::RdmaEndpoint(Socket* s, bool use_gdr)
     : _socket(s)
+    : _use_gdr(use_gdr)
     , _state(UNINIT)
     , _resource(NULL)
     , _send_cq_events(0)
@@ -453,7 +454,7 @@ void* RdmaEndpoint::ProcessHandshakeAtClient(void* arg) {
     local_msg.msg_len = g_rdma_hello_msg_len;
     local_msg.hello_ver = g_rdma_hello_version;
     local_msg.impl_ver = g_rdma_impl_version;
-    local_msg.block_size = g_rdma_recv_block_size;
+    local_msg.block_size = ep->use_gdr() ? g_gdr_recv_block_size : g_rdma_recv_block_size;
     local_msg.sq_size = ep->_sq_size;
     local_msg.rq_size = ep->_rq_size;
     local_msg.lid = GetRdmaLid();
@@ -671,7 +672,7 @@ void* RdmaEndpoint::ProcessHandshakeAtServer(void* arg) {
     } else {
         local_msg.lid = GetRdmaLid();
         local_msg.gid = GetRdmaGid();
-        local_msg.block_size = g_rdma_recv_block_size;
+        local_msg.block_size = ep->use_gdr() ? g_gdr_recv_block_size : g_rdma_recv_block_size;
         local_msg.sq_size = ep->_sq_size;
         local_msg.rq_size = ep->_rq_size;
         local_msg.hello_ver = g_rdma_hello_version;
@@ -1005,10 +1006,14 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
         // Please note that only the first wc.byte_len bytes is valid
         if (wc.byte_len > 0) {
 #if BRPC_WITH_GDR
-            zerocopy = true;
+            if (_use_gdr) {
+                zerocopy = true;
+            } else
 #else
-            if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
-                zerocopy = false;
+            {
+                if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
+                    zerocopy = false;
+                }
             }
 #endif // BRPC_WITH_GDR
             CHECK(_state != FALLBACK_TCP);
@@ -1100,35 +1105,43 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
             _rbuf[_rq_received].clear();
 
 #if BRPC_WITH_GDR
-            butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
-            void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
-            auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
-            lkey = device_allocator->get_lkey(device_ptr);
-            uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
-            _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , data_meta);
-            _rbuf_data[_rq_received] = device_ptr;
+            if (_use_gdr) {
+                butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
+                void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
+                auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
+                lkey = device_allocator->get_lkey(device_ptr);
+                uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
+                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , data_meta);
+                _rbuf_data[_rq_received] = device_ptr;
+            } else
 #else
-            butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
-                    g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
-            int size = 0;
-            if (!os.Next(&_rbuf_data[_rq_received], &size)) {
-                // Memory is not enough for preparing a block
-                PLOG(WARNING) << "Fail to allocate rbuf";
-                return -1;
-            } else {
-                CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
+            {
+                butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
+                        g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
+                int size = 0;
+                if (!os.Next(&_rbuf_data[_rq_received], &size)) {
+                    // Memory is not enough for preparing a block
+                    PLOG(WARNING) << "Fail to allocate rbuf";
+                    return -1;
+                } else {
+                    CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
+                }
             }
 #endif  // if BRPC_WITH_GDR
         }
 #if BRPC_WITH_GDR
-        if (DoPostRecvGDR(_rbuf_data[_rq_received], g_rdma_recv_block_size, lkey) < 0) {
-            _rbuf[_rq_received].clear();
-            return -1;
-        }
+        if (_use_gdr) {
+            if (DoPostRecvGDR(_rbuf_data[_rq_received], g_rdma_recv_block_size, lkey) < 0) {
+                _rbuf[_rq_received].clear();
+                return -1;
+            }
+        } else
 #else
-        if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
-            _rbuf[_rq_received].clear();
-            return -1;
+        {
+            if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
+                _rbuf[_rq_received].clear();
+                return -1;
+            }
         }
 #endif  // if BRPC_WITH_GDR
 
@@ -1676,9 +1689,13 @@ void RdmaEndpoint::DebugInfo(std::ostream& os, butil::StringPiece connector) con
 
 int RdmaEndpoint::GlobalInitialize() {
 #if BRPC_WITH_GDR
-    LOG(INFO) << ", gdr_block_size_kb: " << butil::gdr::gdr_block_size_kb;
-    g_rdma_recv_block_size = butil::gdr::gdr_block_size_kb * 1024 - IOBUF_BLOCK_HEADER_LEN;
-#else
+    g_gdr_recv_block_size = butil::gdr::GetGdrBlockSize() * 1024 - IOBUF_BLOCK_HEADER_LEN;
+    LOG(INFO) << "g_gdr_recv_block_size: " << g_gdr_recv_block_size;
+#endif // BRPC_WITH_GDR
+    return 0;
+}
+
+int RdmaEndpoint::GlobalInitialize() {
     if (FLAGS_rdma_recv_block_type == "default") {
         g_rdma_recv_block_size = GetBlockSize(0) - IOBUF_BLOCK_HEADER_LEN;
     } else if (FLAGS_rdma_recv_block_type == "large") {
@@ -1691,7 +1708,6 @@ int RdmaEndpoint::GlobalInitialize() {
         errno = EINVAL;
         return -1;
     }
-#endif // BRPC_WITH_GDR
 
     LOG(INFO) << "rdma_use_polling :" << FLAGS_rdma_use_polling
       << ", rdma_poller_num : " << FLAGS_rdma_poller_num
diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index 03a3e8415b..ed8c0ed56f 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -74,15 +74,19 @@ class BAIDU_CACHELINE_ALIGNMENT RdmaEndpoint : public SocketUser {
 friend class RdmaConnect;
 friend class Socket;
 public:
-    explicit RdmaEndpoint(Socket* s);
+    explicit RdmaEndpoint(Socket* s, bool use_gdr = false);
     ~RdmaEndpoint() override;
 
     // Global initialization
     // Return 0 if success, -1 if failed and errno set
     static int GlobalInitialize();
 
+    static int GlobalGdrInitialize();
+
     static void GlobalRelease();
 
+    bool use_gdr() { return _use_gdr; }
+
     // Reset the endpoint (for next use)
     void Reset();
 
@@ -306,6 +310,8 @@ friend class Socket;
         std::atomic<bool> running;
     };
     static std::vector<PollerGroup> _poller_groups;
+
+    bool _use_gdr;
 };
 
 }  // namespace rdma
diff --git a/src/brpc/rdma/rdma_helper.cpp b/src/brpc/rdma/rdma_helper.cpp
index 62d1a1707a..0c49e1fdae 100644
--- a/src/brpc/rdma/rdma_helper.cpp
+++ b/src/brpc/rdma/rdma_helper.cpp
@@ -25,7 +25,9 @@
 #include "butil/containers/flat_map.h"            // butil::FlatMap
 #include "butil/fd_guard.h"
 #include "butil/fd_utility.h"                     // butil::make_non_blocking
+#if BRPC_WITH_GDR
 #include "butil/gpu/gpu_block_pool.h"
+#endif
 #include "butil/logging.h"
 #include "brpc/socket.h"
 #include "brpc/rdma/block_pool.h"
@@ -559,13 +561,6 @@ static void GlobalRdmaInitializeOrDieImpl() {
         ExitWithError();
     }
 
-#if BRPC_WITH_GDR
-    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
-        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
-        ExitWithError();
-    }
-#endif  // if BRPC_WITH_GDR
-
     if (RdmaEndpoint::GlobalInitialize() < 0) {
         LOG(ERROR) << "rdma_recv_block_type incorrect "
                    << "(valid value: default/large/huge)";
@@ -594,7 +589,24 @@ static void GlobalRdmaInitializeOrDieImpl() {
     g_rdma_available.store(true, butil::memory_order_relaxed);
 }
 
+static void GlobalGdrInitializeOrDieImpl() {
+#if BRPC_WITH_GDR
+    if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
+        PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
+        ExitWithError();
+    }
+    if (RdmaEndpoint::GlobalGdrInitialize() < 0) {
+        LOG(ERROR) << "gdr_block_size_kb incorrect "
+                   << "(must be larger than 0)";
+        ExitWithError();
+    }
+
+#endif  // if BRPC_WITH_GDR
+
+}
+
 static pthread_once_t initialize_rdma_once = PTHREAD_ONCE_INIT;
+static pthread_once_t initialize_gdr_once = PTHREAD_ONCE_INIT;
 
 void GlobalRdmaInitializeOrDie() {
     if (pthread_once(&initialize_rdma_once,
@@ -604,6 +616,14 @@ void GlobalRdmaInitializeOrDie() {
     }
 }
 
+void GlobalGdrInitializeOrDie() {
+    if (pthread_once(&initialize_gdr_once,
+                     GlobalGdrInitializeOrDieImpl) != 0) {
+        LOG(FATAL) << "Fail to pthread_once GlobalGdrInitializeOrDie";
+        exit(1);
+    }
+}
+
 uint32_t RegisterMemoryForRdma(void* buf, size_t len) {
     ibv_mr* mr = IbvRegMr(g_pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
     if (!mr) {
diff --git a/src/brpc/rdma/rdma_helper.h b/src/brpc/rdma/rdma_helper.h
index 25a93476e7..06cbb1f5c2 100644
--- a/src/brpc/rdma/rdma_helper.h
+++ b/src/brpc/rdma/rdma_helper.h
@@ -33,6 +33,10 @@ namespace rdma {
 // Exit if failed
 void GlobalRdmaInitializeOrDie();
 
+// Initialize GDR environment
+// Exit if failed
+void GlobalGdrInitializeOrDie();
+
 // Initialize RDMA polling mode with tag
 bool InitPollingModeWithTag(bthread_tag_t tag,
                             std::function<void(void)> callback = nullptr,
diff --git a/src/brpc/rdma_transport.cpp b/src/brpc/rdma_transport.cpp
index 88d89a7b06..b3c8404763 100644
--- a/src/brpc/rdma_transport.cpp
+++ b/src/brpc/rdma_transport.cpp
@@ -29,10 +29,11 @@ DECLARE_bool(usercode_in_pthread);
 
 extern SocketVarsCollector *g_vars;
 
-void RdmaTransport::Init(Socket *socket, const SocketOptions &options) {
+void RdmaTransport::DoInit(Socket *socket, const SocketOptions &options, bool use_gdr) {
     CHECK(_rdma_ep == NULL);
-    if (options.socket_mode == SOCKET_MODE_RDMA) {
-        _rdma_ep = new(std::nothrow)rdma::RdmaEndpoint(socket);
+    if (options.socket_mode == SOCKET_MODE_RDMA ||
+            options.socket_mode == SOCKET_MODE_GDR) {
+        _rdma_ep = new(std::nothrow)rdma::RdmaEndpoint(socket, use_gdr);
         if (!_rdma_ep) {
             const int saved_errno = errno;
             PLOG(ERROR) << "Fail to create RdmaEndpoint";
@@ -54,6 +55,10 @@ void RdmaTransport::Init(Socket *socket, const SocketOptions &options) {
     _tcp_transport->Init(socket, options);
 }
 
+void RdmaTransport::Init(Socket *socket, const SocketOptions &options) {
+    DoInit(socket, options, false);
+}
+
 void RdmaTransport::Release() {
     if (_rdma_ep) {
         delete _rdma_ep;
diff --git a/src/brpc/rdma_transport.h b/src/brpc/rdma_transport.h
index 65ae88f7a6..bb579c6ac5 100644
--- a/src/brpc/rdma_transport.h
+++ b/src/brpc/rdma_transport.h
@@ -29,6 +29,7 @@ class RdmaTransport : public Transport {
     friend class rdma::RdmaEndpoint;
     friend class rdma::RdmaConnect;
 public:
+    void DoInit(Socket* socket, const SocketOptions& options, bool use_gdr);
     void Init(Socket* socket, const SocketOptions& options) override;
     void Release() override;
     int Reset(int32_t expected_nref) override;
@@ -62,4 +63,4 @@ class RdmaTransport : public Transport {
 };
 } // namespace brpc
 #endif // BRPC_WITH_RDMA
-#endif //BRPC_RDMA_TRANSPORT_H
\ No newline at end of file
+#endif //BRPC_RDMA_TRANSPORT_H
diff --git a/src/brpc/socket_mode.h b/src/brpc/socket_mode.h
index b5d42be4aa..9fb0276efa 100644
--- a/src/brpc/socket_mode.h
+++ b/src/brpc/socket_mode.h
@@ -20,7 +20,8 @@
 namespace brpc {
 enum SocketMode {
     SOCKET_MODE_TCP = 0,
-    SOCKET_MODE_RDMA = 1
+    SOCKET_MODE_RDMA = 1,
+    SOCKET_MODE_GDR = 2
 };
 } // namespace brpc
-#endif //BRPC_SOCKET_MODE_H
\ No newline at end of file
+#endif //BRPC_SOCKET_MODE_H
diff --git a/src/brpc/transport_factory.cpp b/src/brpc/transport_factory.cpp
index b689e2edd2..1ad61e7ff1 100644
--- a/src/brpc/transport_factory.cpp
+++ b/src/brpc/transport_factory.cpp
@@ -18,6 +18,7 @@
 #include "brpc/transport_factory.h"
 #include "brpc/tcp_transport.h"
 #include "brpc/rdma_transport.h"
+#include "brpc/gdr_transport.h"
 
 namespace brpc {
 int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const void* _options) {
@@ -28,6 +29,15 @@ int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const
     else if (mode == SOCKET_MODE_RDMA) {
         return RdmaTransport::ContextInitOrDie(serverOrNot, _options);
     }
+#endif
+#if BRPC_WITH_GDR
+    else if (mode == SOCKET_MODE_GDR) {
+        // gdr is a special case of rdma, so we should init rdma first;
+        if (RdmaTransport::ContextInitOrDie(serverOrNot, _options) < 0) {
+            return -1;
+        }
+        return GdrTransport::GdrContextInitOrDie(serverOrNot, _options);
+    }
 #endif
     else {
         LOG(ERROR) << "unknown transport type  " << mode;
@@ -43,10 +53,15 @@ std::unique_ptr<Transport> TransportFactory::CreateTransport(SocketMode mode) {
     else if (mode == SOCKET_MODE_RDMA) {
         return std::unique_ptr<RdmaTransport>(new RdmaTransport());
     }
+#endif
+#if BRPC_WITH_GDR
+    else if (mode == SOCKET_MODE_GDR) {
+        return std::unique_ptr<GdrTransport>(new GdrTransport());
+    }
 #endif
     else {
         LOG(ERROR) << "socket_mode set error";
         return nullptr;
     }
 }
-} // namespace brpc
\ No newline at end of file
+} // namespace brpc
diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
index b768e408e8..52b673b14f 100644
--- a/src/butil/gpu/gpu_block_pool.cpp
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -23,6 +23,8 @@
 #include "gpu_block_pool.h"
 namespace butil {
 namespace gdr {
+DEFINE_int32(gdr_block_size_kb, 512, "gdr block size in KB");
+DEFINE_int32(max_gdr_regions, 32, "max num of gdr regions");
 
 #define CHECK_CUDA(call)  \
 do {                                                                     \
@@ -34,6 +36,10 @@ do {                                                                     \
     }                                                                      \
 } while (0);
 
+size_t GetGdrBlockSize() {
+    return FLAGS_gdr_block_size_kb * 1024;
+}
+
 bool verify_same_context() {
   static int original_device = -1;
   static bool first_call = true;
@@ -167,7 +173,7 @@ BlockPoolAllocator::~BlockPoolAllocator() {
     printStatistics();
 #endif
 
-    for (int i = 0; i < max_regions; i++) {
+    for (int i = 0; i < FLAGS_max_gdr_regions; i++) {
         Region* r = &g_regions[i];
         if (!r->mr) {
             return;
@@ -190,7 +196,7 @@ Region* BlockPoolAllocator::GetRegion(const void* buf) {
     }
     Region* r = NULL;
     uintptr_t addr = (uintptr_t)buf;
-    for (int i = 0; i < max_regions; ++i) {
+    for (int i = 0; i < FLAGS_max_gdr_regions; ++i) {
         if (g_regions[i].aligned_start == 0) {
             break;
         }
@@ -277,7 +283,7 @@ void BlockPoolAllocator::printStatistics() const {
 }
 
 void BlockPoolAllocator::extendRegion() {
-    if (g_region_num == max_regions) {
+    if (g_region_num == FLAGS_max_gdr_regions) {
         LOG(FATAL) << "Gdr Memory pool reaches max regions";
         return ;
     }
diff --git a/src/butil/gpu/gpu_block_pool.h b/src/butil/gpu/gpu_block_pool.h
index 6106952c76..c568b731e6 100644
--- a/src/butil/gpu/gpu_block_pool.h
+++ b/src/butil/gpu/gpu_block_pool.h
@@ -36,17 +36,7 @@
 namespace butil {
 namespace gdr {
 
-static int gdr_block_size_kb = [](){
-    int ret = 64;
-    const char* env_var_val = getenv("GDR_BLOCK_SIZE_KB");
-    if (env_var_val == nullptr) {
-        return ret;
-    }
-    ret = std::stoi(env_var_val);
-
-    return ret;
-}();
-
+size_t GetGdrBlockSize();
 void* get_gpu_mem(int gpu_id, int64_t gpu_mem_size);
 void* get_cpu_mem(int gpu_id, int64_t cpu_mem_size);
 
@@ -81,9 +71,7 @@ class BlockPoolAllocator {
     const size_t REGION_SIZE;
 
     BlockHeader* freeList;
-    static constexpr size_t max_regions = 16;
     int g_region_num {0};
-    Region g_regions[max_regions];
     std::mutex poolMutex;
 
     // 统计信息

From 92b99be985fc335fdda8d790ec90df0f4acd79bf Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 29 May 2026 00:40:16 +0800
Subject: [PATCH 13/23] Revert "Change GPU memory detection logics in
 baidu_rpc_protocol"

This reverts commit 1fb9370dfad21d3a3f9ebaa3695ff0338aa3426e.
---
 src/brpc/policy/baidu_rpc_protocol.cpp | 12 ++++++------
 src/butil/iobuf.cpp                    | 16 +---------------
 src/butil/iobuf.h                      | 10 ----------
 3 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index ccff27fd4e..e6605bfc1d 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -119,8 +119,8 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
 
 #if BRPC_WITH_GDR
     void* prefetch_d2h_data = NULL;
-    uint32_t data_meta = source->get_first_data_meta_high32();
-    bool is_gpu_memory = (data_meta == static_cast<uint32_t>(butil::IOBuf::GPU_MEMORY));
+    uint64_t data_meta = source->get_first_data_meta();
+    bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
     butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
     if (is_gpu_memory) {
         prefetch_d2h_data = host_allocator->AllocateRaw(prefetch_d2h_size);
@@ -863,8 +863,8 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
             int body_without_attachment_size = req_size - meta.attachment_size();
 #if BRPC_WITH_GDR
             int meta_size = msg->meta.size();
-            uint32_t data_meta = msg->payload.get_first_data_meta_high32();
-            bool is_gpu_memory = (data_meta == static_cast<uint32_t>(butil::IOBuf::GPU_MEMORY));
+            uint64_t data_meta = msg->payload.get_first_data_meta();
+            bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
             if(is_gpu_memory) {
                 int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
                 if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
@@ -1058,8 +1058,8 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
         butil::IOBuf* res_buf_ptr = &msg->payload;
 
 #if BRPC_WITH_GDR
-        uint32_t data_meta = msg->payload.get_first_data_meta_high32();
-        bool is_gpu_memory = (data_meta == static_cast<uint32_t>(butil::IOBuf::GPU_MEMORY));
+        uint64_t data_meta = msg->payload.get_first_data_meta();
+        bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
 #endif  // BRPC_WITH_GDR
         if (meta.has_attachment_size()) {
             if (meta.attachment_size() > res_size) {
diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp
index bc1e295115..02cb457ff6 100644
--- a/src/butil/iobuf.cpp
+++ b/src/butil/iobuf.cpp
@@ -1193,21 +1193,7 @@ uint64_t IOBuf::get_first_data_meta() {
     if (!(r.block->flags & IOBUF_BLOCK_FLAGS_USER_DATA)) {
         return 0;
     }
-    return (r.block->u.data_meta & 0x00000000FFFFFFFF);
-}
-
-// only when user use append_user_data_with_meta(), lkey is stored in data_meta
-// We add this function for GDR, we want to know whether the data is in Host memory or GPU memory
-// since lkey is uint32_t type, thus we use the high 32 bit to store
-uint32_t IOBuf::get_first_data_meta_high32() {
-    if (_ref_num() == 0) {
-        return 0;
-    }
-    IOBuf::BlockRef const& r = _ref_at(0);
-    if (!(r.block->flags & IOBUF_BLOCK_FLAGS_USER_DATA)) {
-        return 0;
-    }
-    return (uint32_t)(r.block->u.data_meta >> 32);
+    return r.block->u.data_meta;
 }
 
 void* IOBuf::get_first_data_ptr() {
diff --git a/src/butil/iobuf.h b/src/butil/iobuf.h
index 14077f0c29..e554dd0e40 100644
--- a/src/butil/iobuf.h
+++ b/src/butil/iobuf.h
@@ -70,11 +70,6 @@ friend class SingleIOBuf;
     static const size_t DEFAULT_BLOCK_SIZE = 8192;
     static const size_t INITIAL_CAP = 32; // must be power of 2
 
-    enum MemoryMeta {
-        HOST_MEMORY = 0,
-        GPU_MEMORY = 1
-    };
-
     struct Block;
 
     // can't directly use `struct iovec' here because we also need to access the
@@ -270,11 +265,6 @@ friend class SingleIOBuf;
     // The meta is specified with append_user_data_with_meta before.
     // 0 means the meta is invalid.
     uint64_t get_first_data_meta();
-
-    // Get the high 32 bits of the data meta of the first byte in this IOBuf.
-    // The meta is specified with append_user_data_with_meta before.
-    // we use 0 to specify host memory, 1 to specify GPU memory
-    uint32_t get_first_data_meta_high32();
     void* get_first_data_ptr();
 
     // Resizes the buf to a length of n characters.

From 31a911935e4414e99f2209be84a90759345f9cec Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 29 May 2026 00:46:33 +0800
Subject: [PATCH 14/23] Fix

---
 bazel/config/BUILD.bazel            |  5 -----
 example/rdma_performance/client.cpp |  2 +-
 src/brpc/gdr_transport.cpp          | 35 +++++++++++++++++++++++++++++
 src/brpc/gdr_transport.h            | 32 ++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 6 deletions(-)
 create mode 100644 src/brpc/gdr_transport.cpp
 create mode 100644 src/brpc/gdr_transport.h

diff --git a/bazel/config/BUILD.bazel b/bazel/config/BUILD.bazel
index 17dbe2bb94..06376cf85c 100644
--- a/bazel/config/BUILD.bazel
+++ b/bazel/config/BUILD.bazel
@@ -104,11 +104,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-config_setting(
-    name = "brpc_with_gdr",
-    define_values = {"BRPC_WITH_GDR": "true"},
-    visibility = ["//visibility:public"],
-
 config_setting(
     name = "brpc_with_gdr",
     define_values = {"BRPC_WITH_GDR": "true"},
diff --git a/example/rdma_performance/client.cpp b/example/rdma_performance/client.cpp
index f957afd2a4..5dcf2a26f3 100644
--- a/example/rdma_performance/client.cpp
+++ b/example/rdma_performance/client.cpp
@@ -104,7 +104,7 @@ class PerformanceTest {
                     LOG(FATAL) << "Failed to register MR:" << strerror(errno)
                         << ", addr:" << _addr;
                 }
-                auto deleter = [](void* date) {};
+                auto deleter = [](void* data) {};
                 _attachment.append_user_data_with_meta(_addr, attachment_size, deleter, mr->lkey);
             }
             else
diff --git a/src/brpc/gdr_transport.cpp b/src/brpc/gdr_transport.cpp
new file mode 100644
index 0000000000..005b191380
--- /dev/null
+++ b/src/brpc/gdr_transport.cpp
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#if BRPC_WITH_GDR
+
+#include "brpc/gdr_transport.h"
+#include "brpc/rdma/rdma_helper.h"
+
+namespace brpc {
+
+void GdrTransport::Init(Socket *socket, const SocketOptions &options) {
+    DoInit(socket, options, true);
+}
+
+int GdrTransport::GdrContextInitOrDie() {
+    rdma::GlobalGdrInitializeOrDie();
+    return 0;
+}
+
+} // namespace brpc
+#endif
diff --git a/src/brpc/gdr_transport.h b/src/brpc/gdr_transport.h
new file mode 100644
index 0000000000..0f41c0c4db
--- /dev/null
+++ b/src/brpc/gdr_transport.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_GDR_TRANSPORT_H
+#define BRPC_GDR_TRANSPORT_H
+
+#if BRPC_WITH_GDR
+#include "brpc/rdma_transport.h"
+
+namespace brpc {
+class GdrTransport : public RdmaTransport {
+public:
+    void Init(Socket* socket, const SocketOptions& options) override;
+    static int GdrContextInitOrDie();
+};
+} // namespace brpc
+#endif // BRPC_WITH_GDR
+#endif //BRPC_GDR_TRANSPORT_H

From 72b2f889cc2f4a6ebbd16fff4dc94f40ae167019 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 29 May 2026 02:01:47 +0800
Subject: [PATCH 15/23] Fix

---
 src/brpc/policy/baidu_rpc_protocol.cpp | 46 +++++++++++--------------
 src/brpc/rdma/rdma_endpoint.cpp        | 21 +++---------
 src/brpc/rdma/rdma_endpoint.h          | 14 ++++++--
 src/brpc/rdma/rdma_helper.cpp          |  4 +--
 src/brpc/rdma_transport.cpp            |  2 ++
 src/brpc/transport_factory.cpp         |  2 +-
 src/butil/gpu/gpu_block_pool.cpp       | 47 ++++++++++++++++++--------
 src/butil/gpu/gpu_block_pool.h         | 22 +++---------
 src/butil/iobuf.cpp                    | 12 +++++++
 src/butil/iobuf.h                      |  3 ++
 10 files changed, 94 insertions(+), 79 deletions(-)

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index e6605bfc1d..f9decff638 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -76,6 +76,11 @@ DECLARE_bool(pb_enum_as_number);
 // Pack header into `buf'
 
 const int header_size = 12;
+// if we recv data into gpu, the header/meta/body will be copied to cpu and processed.
+// in to to limit the count of d2h, we will prefetch 512B from gpu to cpu.
+// if header_size + meta_size + body_size(without attachment) is less than 512, then one
+// d2h is enough for one rpc.
+
 const int prefetch_d2h_size = 512;
 
 inline void PackRpcHeader(char* rpc_header, uint32_t meta_size, int payload_size) {
@@ -119,8 +124,7 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
 
 #if BRPC_WITH_GDR
     void* prefetch_d2h_data = NULL;
-    uint64_t data_meta = source->get_first_data_meta();
-    bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+    bool is_gpu_memory = source->is_gpu_memory();
     butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
     if (is_gpu_memory) {
         prefetch_d2h_data = host_allocator->AllocateRaw(prefetch_d2h_size);
@@ -130,12 +134,11 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
         n = source->copy_from_gpu(prefetch_d2h_data, prefetch_d2h_size);
         size_t copy_size = n > 12 ? 12 : n;
         memcpy(header_buf, prefetch_d2h_data, copy_size);
-    } else {
+    } else
+#endif  // BRPC_WITH_GDR
+    {
         n = source->copy_to(header_buf, sizeof(header_buf));
     }
-#else
-    n = source->copy_to(header_buf, sizeof(header_buf));
-#endif  // BRPC_WITH_GDR
 
     do {
         if (n >= 4) {
@@ -199,14 +202,12 @@ ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
             source->cutn_from_gpu(&msg->meta, meta_size);
         }
         source->cutn(&msg->payload, body_size - meta_size);
-    } else {
+    } else
+#endif  // BRPC_WITH_GDR
+    {
         source->cutn(&msg->meta, meta_size);
         source->cutn(&msg->payload, body_size - meta_size);
     }
-#else
-    source->cutn(&msg->meta, meta_size);
-    source->cutn(&msg->payload, body_size - meta_size);
-#endif  // BRPC_WITH_GDR
     return MakeMessage(msg);
 }
 
@@ -862,9 +863,7 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
             butil::IOBuf req_buf;
             int body_without_attachment_size = req_size - meta.attachment_size();
 #if BRPC_WITH_GDR
-            int meta_size = msg->meta.size();
-            uint64_t data_meta = msg->payload.get_first_data_meta();
-            bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+            bool is_gpu_memory = msg->payload.is_gpu_memory();
             if(is_gpu_memory) {
                 int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
                 if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
@@ -877,13 +876,11 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
                 } else {
                     msg->payload.cutn_from_gpu(&req_buf, body_without_attachment_size);
                 }
-            }
-            else {
+            } else
+#endif  // BRPC_WITH_GDR
+            {
                 msg->payload.cutn(&req_buf, body_without_attachment_size);
             }
-#else
-            msg->payload.cutn(&req_buf, body_without_attachment_size);
-#endif  // BRPC_WITH_GDR
             if (meta.attachment_size() > 0) {
                 cntl->request_attachment().swap(msg->payload);
             }
@@ -1058,8 +1055,7 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
         butil::IOBuf* res_buf_ptr = &msg->payload;
 
 #if BRPC_WITH_GDR
-        uint64_t data_meta = msg->payload.get_first_data_meta();
-        bool is_gpu_memory = (data_meta > 0 && data_meta <= UINT_MAX);
+        bool is_gpu_memory = msg->payload.is_gpu_memory();
 #endif  // BRPC_WITH_GDR
         if (meta.has_attachment_size()) {
             if (meta.attachment_size() > res_size) {
@@ -1083,13 +1079,11 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
                 } else {
                     msg->payload.cutn_from_gpu(&res_buf, body_without_attachment_size);
                 }
-            }
-            else {
+            } else
+#endif  // BRPC_WITH_GDR
+            {
                 msg->payload.cutn(&res_buf, body_without_attachment_size);
             }
-#else
-            msg->payload.cutn(&res_buf, body_without_attachment_size);
-#endif  // BRPC_WITH_GDR
             res_buf_ptr = &res_buf;
             cntl->response_attachment().swap(msg->payload);
 #if BRPC_WITH_GDR
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index f4251b4fec..57314a06fa 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -19,8 +19,8 @@
 
 #include <gflags/gflags.h>
 #include "butil/fd_utility.h"
-#include "butil/logging.h"                   // CHECK, LOG
 #include "butil/gpu/gpu_block_pool.h"
+#include "butil/logging.h"                   // CHECK, LOG
 #include "butil/sys_byteorder.h"             // HostToNet,NetToHost
 #include "bthread/bthread.h"
 #include "brpc/errno.pb.h"
@@ -1107,14 +1107,13 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
 #if BRPC_WITH_GDR
             if (_use_gdr) {
                 butil::gdr::BlockPoolAllocator* device_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_gpu_allocator();
-                void* device_ptr = device_allocator->AllocateRaw(g_rdma_recv_block_size);
+                void* device_ptr = device_allocator->AllocateRaw(g_gdr_recv_block_size);
                 auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
                 lkey = device_allocator->get_lkey(device_ptr);
-                uint64_t data_meta = (static_cast<uint64_t>(butil::IOBuf::GPU_MEMORY) << 32) | lkey;
-                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_rdma_recv_block_size, deleter , data_meta);
+                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_gdr_recv_block_size, deleter, data_meta, lkey);
                 _rbuf_data[_rq_received] = device_ptr;
             } else
-#else
+#endif  // if BRPC_WITH_GDR
             {
                 butil::IOBufAsZeroCopyOutputStream os(&_rbuf[_rq_received],
                         g_rdma_recv_block_size + IOBUF_BLOCK_HEADER_LEN);
@@ -1127,7 +1126,6 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
                     CHECK(static_cast<uint32_t>(size) == g_rdma_recv_block_size) << size;
                 }
             }
-#endif  // if BRPC_WITH_GDR
         }
 #if BRPC_WITH_GDR
         if (_use_gdr) {
@@ -1136,14 +1134,13 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
                 return -1;
             }
         } else
-#else
+#endif  // if BRPC_WITH_GDR
         {
             if (DoPostRecv(_rbuf_data[_rq_received], g_rdma_recv_block_size) < 0) {
                 _rbuf[_rq_received].clear();
                 return -1;
             }
         }
-#endif  // if BRPC_WITH_GDR
 
         --num;
         ++_rq_received;
@@ -1709,14 +1706,6 @@ int RdmaEndpoint::GlobalInitialize() {
         return -1;
     }
 
-    LOG(INFO) << "rdma_use_polling :" << FLAGS_rdma_use_polling
-      << ", rdma_poller_num : " << FLAGS_rdma_poller_num
-      << ", rdma_poller_yield : " << FLAGS_rdma_poller_yield
-      << ", rdma_sq_size: " << FLAGS_rdma_sq_size
-      << ", rdma_rq_size: " << FLAGS_rdma_rq_size
-      << ", rdma_zerocopy_min_size: " << FLAGS_rdma_zerocopy_min_size
-      << ", g_rdma_recv_block_size: " << g_rdma_recv_block_size;
-
     g_rdma_resource_mutex = new butil::Mutex;
     for (int i = 0; i < FLAGS_rdma_prepared_qp_cnt; ++i) {
         RdmaResource* res = AllocateQpCq(FLAGS_rdma_prepared_qp_size,
diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index ed8c0ed56f..84a5e270c5 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -77,10 +77,12 @@ friend class Socket;
     explicit RdmaEndpoint(Socket* s, bool use_gdr = false);
     ~RdmaEndpoint() override;
 
-    // Global initialization
+    // Global Rdma initialization
     // Return 0 if success, -1 if failed and errno set
     static int GlobalInitialize();
 
+    // Global Gdr initialization
+    // Return 0 if success, -1 if failed and errno set
     static int GlobalGdrInitialize();
 
     static void GlobalRelease();
@@ -180,8 +182,16 @@ friend class Socket;
     //     -1:  failed, errno set
     int DoPostRecv(void* block, size_t block_size);
 
-
+    // Post a WR pointing to the gpu block to the local Recv Queue
+    // Arguments:
+    //     block: the gpu addr to receive data (ibv_sge.addr)
+    //     block_size: the maximum length can be received (ibv_sge.length)
+    //     lkey: the lkey of block
+    // Return:
+    //     0:   success
+    //     -1:  failed, errno set
     int DoPostRecvGDR(void* block, size_t block_size, uint32_t lkey);
+
     // Read at most len bytes from fd in _socket to data
     // wait for _read_butex if encounter EAGAIN
     // return -1 if encounter other errno (including EOF)
diff --git a/src/brpc/rdma/rdma_helper.cpp b/src/brpc/rdma/rdma_helper.cpp
index 0c49e1fdae..7c6820d914 100644
--- a/src/brpc/rdma/rdma_helper.cpp
+++ b/src/brpc/rdma/rdma_helper.cpp
@@ -486,7 +486,6 @@ static void GlobalRdmaInitializeOrDieImpl() {
         ExitWithError();
     }
 
-    g_gpu_index = FLAGS_gpu_index;
     // Find the first active port
     g_port_num = FLAGS_rdma_port;
     int available_devices;
@@ -591,6 +590,8 @@ static void GlobalRdmaInitializeOrDieImpl() {
 
 static void GlobalGdrInitializeOrDieImpl() {
 #if BRPC_WITH_GDR
+    g_gpu_index = FLAGS_gpu_index;
+
     if (!butil::gdr::InitGPUBlockPool(g_gpu_index, GetRdmaPd())) {
         PLOG(ERROR) << "Fail to initialize RDMA GPU memory pool";
         ExitWithError();
@@ -717,7 +718,6 @@ int GetGPUIndex() {
     return g_gpu_index;
 }
 
-
 bool IsRdmaAvailable() {
     return g_rdma_available.load(butil::memory_order_acquire);
 }
diff --git a/src/brpc/rdma_transport.cpp b/src/brpc/rdma_transport.cpp
index b3c8404763..97b231ddcc 100644
--- a/src/brpc/rdma_transport.cpp
+++ b/src/brpc/rdma_transport.cpp
@@ -31,6 +31,8 @@ extern SocketVarsCollector *g_vars;
 
 void RdmaTransport::DoInit(Socket *socket, const SocketOptions &options, bool use_gdr) {
     CHECK(_rdma_ep == NULL);
+    // gdr mode is a special mode of rdma mode.
+    // both rdma mode and gdr mode need init rdma::RdmaEndpoint.
     if (options.socket_mode == SOCKET_MODE_RDMA ||
             options.socket_mode == SOCKET_MODE_GDR) {
         _rdma_ep = new(std::nothrow)rdma::RdmaEndpoint(socket, use_gdr);
diff --git a/src/brpc/transport_factory.cpp b/src/brpc/transport_factory.cpp
index 1ad61e7ff1..76623f505c 100644
--- a/src/brpc/transport_factory.cpp
+++ b/src/brpc/transport_factory.cpp
@@ -36,7 +36,7 @@ int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const
         if (RdmaTransport::ContextInitOrDie(serverOrNot, _options) < 0) {
             return -1;
         }
-        return GdrTransport::GdrContextInitOrDie(serverOrNot, _options);
+        return GdrTransport::GdrContextInitOrDie();
     }
 #endif
     else {
diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
index 52b673b14f..7169df02c4 100644
--- a/src/butil/gpu/gpu_block_pool.cpp
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -19,6 +19,8 @@
 
 #include <iostream>
 #include <chrono>
+#include <gflags/gflags.h>
+#include <sys/stat.h>
 #include "butil/fast_rand.h"
 #include "gpu_block_pool.h"
 namespace butil {
@@ -97,6 +99,19 @@ BlockPoolAllocators* BlockPoolAllocators::singleton() {
     return instance_;
 }
 
+void BlockPoolAllocators::init(int gpu_id, ibv_pd* pd) {
+    LOG(INFO) << "set GPU BlockPoolAllocator for " << gpu_id;
+    size_t region_size = 1024LL * 1024 * 1024;
+    size_t block_size = FLAGS_gdr_block_size_kb * 1024;
+    gpu_mem_alloc = new BlockPoolAllocator(gpu_id, true, pd, block_size, region_size);
+
+    region_size = 32LL * 1024 * 1024;
+    block_size = 512;
+    cpu_mem_alloc = new BlockPoolAllocator(gpu_id, false, pd, block_size, region_size);
+
+    gpu_stream_pool = new GPUStreamPool(gpu_id);
+}
+
 bool InitGPUBlockPool(int gpu_id, ibv_pd* pd) {
     BlockPoolAllocators::singleton()->init(gpu_id, pd);
     return true;
@@ -148,21 +163,21 @@ static BlockHeaderList* get_bh_list() {
     return bh_list;
 }
 
-
-BlockPoolAllocator::BlockPoolAllocator(int gpuId, bool onGpu, ibv_pd* brpc_pd,
-    size_t blockSize, size_t regionSize) :
-    gpu_id(gpuId)
-    , on_gpu(onGpu)
+BlockPoolAllocator::BlockPoolAllocator(int gpu_id, bool on_gpu, ibv_pd* brpc_pd,
+    size_t block_size, size_t region_size) :
+    gpu_id(gpu_id)
+    , on_gpu(on_gpu)
     , pd(brpc_pd)
-    , BLOCK_SIZE(std::max(blockSize, sizeof(BlockHeader)))
-    , REGION_SIZE((regionSize / blockSize) * blockSize)  // 对齐到块大小的倍数
+    , BLOCK_SIZE(std::max(block_size, sizeof(BlockHeader)))
+    , REGION_SIZE((region_size / block_size) * block_size)  // 对齐到块大小的倍数
     , freeList(nullptr)
     , g_region_num(0)
     , totalAllocated(0)
     , totalDeallocated(0)
     , peakUsage(0) {
+    g_regions.resize(FLAGS_max_gdr_regions);
     LOG(INFO) << "Memory Pool initialized: block_size=" << BLOCK_SIZE 
-      << ", region_size=" << REGION_SIZE 
+      << ", region_size=" << REGION_SIZE  << ", max_gdr_regions=" << FLAGS_max_gdr_regions
       << ", gpu_id=" << gpu_id << ", on_gpu=" << on_gpu << ", pd=" << pd;
 
     extendRegion();
@@ -215,7 +230,13 @@ uint32_t BlockPoolAllocator::get_lkey(const void* buf) {
         LOG(ERROR) << "can not get a region for buf " << buf;
         return 0;
     }
-    return r->lkey;
+    
+    if(!r->mr) {
+        LOG(FATAL) << "region has not been registered into rdma yet, addr:" << r->start;
+        return 0;
+    }
+
+    return r->mr->lkey;
 }
 
 void* BlockPoolAllocator::AllocateRaw(size_t num_bytes) {
@@ -247,7 +268,7 @@ void* BlockPoolAllocator::AllocateRaw(size_t num_bytes) {
     auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);
 
 #ifdef DEBUG
-    if (duration.count() > 1000) {  // 如果分配时间超过1微秒
+    if (duration.count() > 1000) {
         LOG(INFO) << "Slow allocation: " << duration.count() << " ns";
     }
 #endif
@@ -268,7 +289,6 @@ void BlockPoolAllocator::DeallocateRaw(void* ptr) {
     totalDeallocated++;
 }
 
-// 获取统计信息
 void BlockPoolAllocator::printStatistics() const {
     LOG(INFO) << "=== Memory Pool Statistics ===";
     LOG(INFO) << "Total regions: " << g_region_num
@@ -314,8 +334,8 @@ void BlockPoolAllocator::extendRegion() {
     auto mr = ibv_reg_mr(pd, aligned_ptr, aligned_bytes,
             IBV_ACCESS_LOCAL_WRITE |
             IBV_ACCESS_REMOTE_READ |
-            IBV_ACCESS_REMOTE_WRITE);
-    //IBV_ACCESS_RELAXED_ORDERING);
+            IBV_ACCESS_REMOTE_WRITE |
+            IBV_ACCESS_RELAXED_ORDERING);
 
     if (!mr) {
         LOG(FATAL) << "Failed to register MR: " << strerror(errno)
@@ -333,7 +353,6 @@ void BlockPoolAllocator::extendRegion() {
     region->mr = mr;
     region->size = REGION_SIZE;
     region->aligned_size = aligned_bytes;
-    region->lkey = mr->lkey;
     region->blockCount = blockCount;
 
 
diff --git a/src/butil/gpu/gpu_block_pool.h b/src/butil/gpu/gpu_block_pool.h
index c568b731e6..f790f9a2f1 100644
--- a/src/butil/gpu/gpu_block_pool.h
+++ b/src/butil/gpu/gpu_block_pool.h
@@ -26,13 +26,11 @@
 #include <string>
 #include <vector>
 #include <mutex>
-#include <infiniband/verbs.h>
 #include "butil/containers/hash_tables.h"
 #include "butil/logging.h"
 #include <cuda_runtime.h>
 #include "cuda.h"
 
-// #include "gdrapi.h"
 namespace butil {
 namespace gdr {
 
@@ -51,7 +49,6 @@ struct Region {
     size_t aligned_size;
     size_t blockCount;
     struct ibv_mr *mr {nullptr};
-    uint32_t lkey;
 };
 
 struct BlockHeader {
@@ -74,7 +71,7 @@ class BlockPoolAllocator {
     int g_region_num {0};
     std::mutex poolMutex;
 
-    // 统计信息
+    // stat
     size_t totalAllocated;
     size_t totalDeallocated;
     size_t peakUsage;
@@ -90,7 +87,6 @@ class BlockPoolAllocator {
 
     void DeallocateRaw(void* ptr);
 
-    // 获取统计信息
     void printStatistics() const;
 
     int64_t getCurrentUsage() const {
@@ -105,10 +101,11 @@ class BlockPoolAllocator {
       return BLOCK_SIZE;
     }
 
+    Region* GetRegion(const void* buf);
+
     uint32_t get_lkey(const void* buf);
 
   private:
-    Region* GetRegion(const void* buf);
     void extendRegion();
 };
 
@@ -147,18 +144,7 @@ class BlockPoolAllocators {
         instance_ = nullptr;
     }
 
-    void init(int gpu_id, ibv_pd* pd) {
-        LOG(INFO) << "set GPU BlockPoolAllocator for " << gpu_id;
-        size_t region_size = 512LL * 1024 * 1024;
-        size_t block_size = gdr_block_size_kb * 1024;
-        gpu_mem_alloc = new BlockPoolAllocator(gpu_id, true, pd, block_size, region_size);
-
-        region_size = 32LL * 1024 * 1024;
-        block_size = 512;
-        cpu_mem_alloc = new BlockPoolAllocator(gpu_id, false, pd, block_size, region_size);
-
-        gpu_stream_pool = new GPUStreamPool(gpu_id);
-    }
+    void init(int gpu_id, ibv_pd* pd);
 
     BlockPoolAllocator* get_gpu_allocator() {
         return gpu_mem_alloc;
diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp
index 02cb457ff6..3dcb52982d 100644
--- a/src/butil/iobuf.cpp
+++ b/src/butil/iobuf.cpp
@@ -40,7 +40,9 @@
 #include "butil/fd_guard.h"                 // butil::fd_guard
 #include "butil/iobuf.h"
 #include "butil/iobuf_profiler.h"
+#ifdef BRPC_WITH_GDR
 #include "butil/gpu/gpu_block_pool.h"
+#endif
 
 namespace butil {
 namespace iobuf {
@@ -1553,6 +1555,16 @@ bool IOBuf::equals(const butil::IOBuf& other) const {
     return true;
 }
 
+#if BRPC_WITH_GDR
+// when IOBuf is used for send, data_meta is set by user;
+// when IOBf is used for recv and gdr is open, data_meta is set by brpc
+//  and it is lkey.
+bool IOBuf::is_gpu_memory() {
+    uint64_t data_meta = get_first_data_meta();
+    return (data_meta > 0 && data_meta <= UINT_MAX);
+}
+#endif
+
 ////////////////////////////// IOPortal //////////////////
 IOPortal::~IOPortal() { return_cached_blocks(); }
 
diff --git a/src/butil/iobuf.h b/src/butil/iobuf.h
index e554dd0e40..417c4aced8 100644
--- a/src/butil/iobuf.h
+++ b/src/butil/iobuf.h
@@ -145,6 +145,7 @@ friend class SingleIOBuf;
 #if BRPC_WITH_GDR
     size_t cutn_from_gpu(IOBuf* out, size_t n);
     size_t copy_from_gpu(void* d, size_t n, size_t pos = 0, bool to_gpu = false) const;
+    bool is_gpu_memory();
 #endif  // BRPC_WITH_GDR
 
     // Cut off 1 byte from the front side and set to *c
@@ -265,6 +266,8 @@ friend class SingleIOBuf;
     // The meta is specified with append_user_data_with_meta before.
     // 0 means the meta is invalid.
     uint64_t get_first_data_meta();
+
+    // Get the data addr of the first byte in this IOBuf.
     void* get_first_data_ptr();
 
     // Resizes the buf to a length of n characters.

From e01a54addb23cf4241973be7b17713eaa0c1290d Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 29 May 2026 23:10:27 +0800
Subject: [PATCH 16/23] Fix

---
 Makefile                               |  1 +
 src/brpc/policy/baidu_rpc_protocol.cpp |  8 +++++---
 src/brpc/rdma/rdma_endpoint.cpp        |  7 +++++--
 src/butil/gpu/gpu_block_pool.cpp       | 14 +++++++-------
 src/butil/gpu/gpu_block_pool.h         |  4 ++--
 5 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index abe029e360..29390ac6f5 100644
--- a/Makefile
+++ b/Makefile
@@ -97,6 +97,7 @@ BUTIL_SOURCES = \
     src/butil/files/scoped_temp_dir.cc \
     src/butil/file_util.cc \
     src/butil/file_util_posix.cc \
+    src/butil/gpu/gpu_block_pool.cpp \
     src/butil/guid.cc \
     src/butil/guid_posix.cc \
     src/butil/hash.cc \
diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index f9decff638..24e3b1d3b4 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -22,9 +22,11 @@
 #include <google/protobuf/io/coded_stream.h>
 #include <google/protobuf/text_format.h>
 
-#include "butil/logging.h"                       // LOG()
-#include "butil/iobuf.h"                         // butil::IOBuf
+#if BRPC_WITH_GDR
 #include "butil/gpu/gpu_block_pool.h"
+#endif
+#include "butil/iobuf.h"                         // butil::IOBuf
+#include "butil/logging.h"                       // LOG()
 #include "butil/raw_pack.h"                      // RawPacker RawUnpacker
 #include "butil/memory/scope_guard.h"
 #include "butil/raw_pack.h"                      // RawPacker RawUnpacker
@@ -1050,11 +1052,11 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
         } 
         // Parse response message iff error code from meta is 0
         butil::IOBuf res_buf;
-        int meta_size = msg->meta.size();
         const int res_size = msg->payload.length();
         butil::IOBuf* res_buf_ptr = &msg->payload;
 
 #if BRPC_WITH_GDR
+        int meta_size = msg->meta.size();
         bool is_gpu_memory = msg->payload.is_gpu_memory();
 #endif  // BRPC_WITH_GDR
         if (meta.has_attachment_size()) {
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 57314a06fa..d39a4e41af 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -19,7 +19,9 @@
 
 #include <gflags/gflags.h>
 #include "butil/fd_utility.h"
+#if BRPC_WITH_GDR
 #include "butil/gpu/gpu_block_pool.h"
+#endif
 #include "butil/logging.h"                   // CHECK, LOG
 #include "butil/sys_byteorder.h"             // HostToNet,NetToHost
 #include "bthread/bthread.h"
@@ -93,6 +95,7 @@ static uint16_t g_rdma_hello_msg_len = 40;  // In Byte
 static uint16_t g_rdma_hello_version = 2;
 static uint16_t g_rdma_impl_version = 1;
 static uint32_t g_rdma_recv_block_size = 0;
+static uint32_t g_gdr_recv_block_size = 0;
 
 // static const uint32_t MAX_INLINE_DATA = 64;
 static const uint8_t MAX_HOP_LIMIT = 16;
@@ -1129,7 +1132,7 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
         }
 #if BRPC_WITH_GDR
         if (_use_gdr) {
-            if (DoPostRecvGDR(_rbuf_data[_rq_received], g_rdma_recv_block_size, lkey) < 0) {
+            if (DoPostRecvGDR(_rbuf_data[_rq_received], g_gdr_recv_block_size, lkey) < 0) {
                 _rbuf[_rq_received].clear();
                 return -1;
             }
@@ -1684,7 +1687,7 @@ void RdmaEndpoint::DebugInfo(std::ostream& os, butil::StringPiece connector) con
        << connector << "rdma_unsignaled_sq_wr=" << _sq_unsignaled;
 }
 
-int RdmaEndpoint::GlobalInitialize() {
+int RdmaEndpoint::GlobalGdrInitialize() {
 #if BRPC_WITH_GDR
     g_gdr_recv_block_size = butil::gdr::GetGdrBlockSize() * 1024 - IOBUF_BLOCK_HEADER_LEN;
     LOG(INFO) << "g_gdr_recv_block_size: " << g_gdr_recv_block_size;
diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
index 7169df02c4..a100cf8fb0 100644
--- a/src/butil/gpu/gpu_block_pool.cpp
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -163,13 +163,13 @@ static BlockHeaderList* get_bh_list() {
     return bh_list;
 }
 
-BlockPoolAllocator::BlockPoolAllocator(int gpu_id, bool on_gpu, ibv_pd* brpc_pd,
-    size_t block_size, size_t region_size) :
-    gpu_id(gpu_id)
-    , on_gpu(on_gpu)
-    , pd(brpc_pd)
-    , BLOCK_SIZE(std::max(block_size, sizeof(BlockHeader)))
-    , REGION_SIZE((region_size / block_size) * block_size)  // 对齐到块大小的倍数
+BlockPoolAllocator::BlockPoolAllocator(int gpuId, bool onGpu, ibv_pd* ibvPd,
+    size_t blockSize, size_t regionSize) :
+    gpu_id(gpuId)
+    , on_gpu(onGpu)
+    , pd(ibvPd)
+    , BLOCK_SIZE(std::max(blockSize, sizeof(BlockHeader)))
+    , REGION_SIZE((regionSize / blockSize) * blockSize)  // 对齐到块大小的倍数
     , freeList(nullptr)
     , g_region_num(0)
     , totalAllocated(0)
diff --git a/src/butil/gpu/gpu_block_pool.h b/src/butil/gpu/gpu_block_pool.h
index f790f9a2f1..16da712178 100644
--- a/src/butil/gpu/gpu_block_pool.h
+++ b/src/butil/gpu/gpu_block_pool.h
@@ -77,8 +77,8 @@ class BlockPoolAllocator {
     size_t peakUsage;
 
   public:
-    explicit BlockPoolAllocator(int gpu_id,
-                                bool on_gpu, ibv_pd* pd,
+    explicit BlockPoolAllocator(int gpuId,
+                                bool onGpu, ibv_pd* ibvPd,
                                 size_t blockSize, size_t regionSize);
 
     ~BlockPoolAllocator();

From 0e940678520dc721b0632d0c5530b00fd976d025 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 29 May 2026 23:21:27 +0800
Subject: [PATCH 17/23] Fix

---
 src/brpc/policy/baidu_rpc_protocol.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index 24e3b1d3b4..be0dc4c045 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -77,6 +77,7 @@ DECLARE_bool(pb_enum_as_number);
 
 // Pack header into `buf'
 
+#if BRPC_WITH_GDR
 const int header_size = 12;
 // if we recv data into gpu, the header/meta/body will be copied to cpu and processed.
 // in to to limit the count of d2h, we will prefetch 512B from gpu to cpu.
@@ -84,6 +85,7 @@ const int header_size = 12;
 // d2h is enough for one rpc.
 
 const int prefetch_d2h_size = 512;
+#endif
 
 inline void PackRpcHeader(char* rpc_header, uint32_t meta_size, int payload_size) {
     uint32_t* dummy = (uint32_t*)rpc_header;  // suppress strict-alias warning

From 812cdc64cf8efcacb56ed22e22f0547523eda688 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Fri, 29 May 2026 23:36:18 +0800
Subject: [PATCH 18/23] Fix

---
 src/brpc/rdma/rdma_endpoint.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index d39a4e41af..5b1e0a933b 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -174,7 +174,7 @@ RdmaResource::~RdmaResource() {
 
 RdmaEndpoint::RdmaEndpoint(Socket* s, bool use_gdr)
     : _socket(s)
-    : _use_gdr(use_gdr)
+    , _use_gdr(use_gdr)
     , _state(UNINIT)
     , _resource(NULL)
     , _send_cq_events(0)
@@ -1103,7 +1103,9 @@ int RdmaEndpoint::DoPostRecvGDR(void* block, size_t block_size, uint32_t lkey) {
 int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
     // We do the post repeatedly from the _rbuf[_rq_received].
     while (num > 0) {
+#if BRPC_WITH_GDR
         uint32_t lkey = 0;
+#endif  // if BRPC_WITH_GDR
         if (zerocopy) {
             _rbuf[_rq_received].clear();
 

From 1f2fa3a2128d40148309b3ac008c41c4e559da8c Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Sat, 30 May 2026 00:22:08 +0800
Subject: [PATCH 19/23] Fix

---
 src/brpc/rdma/rdma_endpoint.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/brpc/rdma/rdma_endpoint.h b/src/brpc/rdma/rdma_endpoint.h
index 84a5e270c5..d6e891903e 100644
--- a/src/brpc/rdma/rdma_endpoint.h
+++ b/src/brpc/rdma/rdma_endpoint.h
@@ -237,6 +237,9 @@ friend class Socket;
     // Not owner
     Socket* _socket;
 
+    // whether open gpu direct rdma
+    bool _use_gdr;
+
     // State of Handshake
     State _state;
 
@@ -320,8 +323,6 @@ friend class Socket;
         std::atomic<bool> running;
     };
     static std::vector<PollerGroup> _poller_groups;
-
-    bool _use_gdr;
 };
 
 }  // namespace rdma

From 3a17a4dcff99efaf8f12c8c74f2c8b43f7e884eb Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Sat, 30 May 2026 14:02:15 +0800
Subject: [PATCH 20/23] Fix

---
 src/brpc/policy/baidu_rpc_protocol.cpp | 1 +
 src/brpc/rdma/rdma_endpoint.cpp        | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index be0dc4c045..decb07fc9a 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -867,6 +867,7 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
             butil::IOBuf req_buf;
             int body_without_attachment_size = req_size - meta.attachment_size();
 #if BRPC_WITH_GDR
+            int meta_size = msg->meta.size();
             bool is_gpu_memory = msg->payload.is_gpu_memory();
             if(is_gpu_memory) {
                 int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 5b1e0a933b..6f77d2cf3d 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -1115,7 +1115,7 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
                 void* device_ptr = device_allocator->AllocateRaw(g_gdr_recv_block_size);
                 auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
                 lkey = device_allocator->get_lkey(device_ptr);
-                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_gdr_recv_block_size, deleter, data_meta, lkey);
+                _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_gdr_recv_block_size, deleter, lkey);
                 _rbuf_data[_rq_received] = device_ptr;
             } else
 #endif  // if BRPC_WITH_GDR

From 77043baf009ae355081d465044771ac47c2d7a6b Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Sat, 30 May 2026 14:27:03 +0800
Subject: [PATCH 21/23] baidu_rpc_protocol_gpu.cpp

---
 src/brpc/policy/baidu_rpc_protocol.cpp     | 204 +++++-------------
 src/brpc/policy/baidu_rpc_protocol.h       |  10 +
 src/brpc/policy/baidu_rpc_protocol_gpu.cpp | 227 +++++++++++++++++++++
 src/brpc/rdma/rdma_endpoint.cpp            |   1 +
 4 files changed, 288 insertions(+), 154 deletions(-)
 create mode 100644 src/brpc/policy/baidu_rpc_protocol_gpu.cpp

diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index decb07fc9a..43bf7a8b6d 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -22,12 +22,8 @@
 #include <google/protobuf/io/coded_stream.h>
 #include <google/protobuf/text_format.h>
 
-#if BRPC_WITH_GDR
-#include "butil/gpu/gpu_block_pool.h"
-#endif
 #include "butil/iobuf.h"                         // butil::IOBuf
 #include "butil/logging.h"                       // LOG()
-#include "butil/raw_pack.h"                      // RawPacker RawUnpacker
 #include "butil/memory/scope_guard.h"
 #include "butil/raw_pack.h"                      // RawPacker RawUnpacker
 #include "butil/strings/string_util.h"
@@ -76,17 +72,6 @@ DECLARE_bool(pb_enum_as_number);
 // 5. Not supported: chunk_info
 
 // Pack header into `buf'
-
-#if BRPC_WITH_GDR
-const int header_size = 12;
-// if we recv data into gpu, the header/meta/body will be copied to cpu and processed.
-// in to to limit the count of d2h, we will prefetch 512B from gpu to cpu.
-// if header_size + meta_size + body_size(without attachment) is less than 512, then one
-// d2h is enough for one rpc.
-
-const int prefetch_d2h_size = 512;
-#endif
-
 inline void PackRpcHeader(char* rpc_header, uint32_t meta_size, int payload_size) {
     uint32_t* dummy = (uint32_t*)rpc_header;  // suppress strict-alias warning
     *dummy = *(uint32_t*)"PRPC";
@@ -119,99 +104,50 @@ static void SerializeRpcHeaderAndMeta(
 
 ParseResult ParseRpcMessage(butil::IOBuf* source, Socket* socket,
                             bool /*read_eof*/, const void*) {
-
-    char header_buf[12];
-    size_t n = 0;
-    uint32_t body_size;
-    uint32_t meta_size;
-    ParseError pe = PARSE_OK;
-
 #if BRPC_WITH_GDR
-    void* prefetch_d2h_data = NULL;
     bool is_gpu_memory = source->is_gpu_memory();
-    butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
     if (is_gpu_memory) {
-        prefetch_d2h_data = host_allocator->AllocateRaw(prefetch_d2h_size);
-        if (prefetch_d2h_data == NULL) {
-            LOG(FATAL) << "alloc host data failed!!!";
-        }
-        n = source->copy_from_gpu(prefetch_d2h_data, prefetch_d2h_size);
-        size_t copy_size = n > 12 ? 12 : n;
-        memcpy(header_buf, prefetch_d2h_data, copy_size);
-    } else
-#endif  // BRPC_WITH_GDR
-    {
-        n = source->copy_to(header_buf, sizeof(header_buf));
+        return ParseRpcMessageGpu(source, socket, false /* not use */, nullptr /* not use */);
     }
-
-    do {
-        if (n >= 4) {
-            void* dummy = header_buf;
-            if (*(const uint32_t*)dummy != *(const uint32_t*)"PRPC") {
-                pe = PARSE_ERROR_TRY_OTHERS;
-                break;
-            }
-        } else {
-            if (memcmp(header_buf, "PRPC", n) != 0) {
-                pe = PARSE_ERROR_TRY_OTHERS;
-                break;
-            }
-        }
-        if (n < sizeof(header_buf)) {
-            pe = PARSE_ERROR_NOT_ENOUGH_DATA;
-            break;
-        }
-        butil::RawUnpacker(header_buf + 4).unpack32(body_size).unpack32(meta_size);
-        if (body_size > FLAGS_max_body_size) {
-            // We need this log to report the body_size to give users some clues
-            // which is not printed in InputMessenger.
-            LOG(ERROR) << "body_size=" << body_size << " from "
-                       << socket->remote_side() << " is too large";
-            pe = PARSE_ERROR_TOO_BIG_DATA;
-            break;
-        } else if (source->length() < sizeof(header_buf) + body_size) {
-            pe = PARSE_ERROR_NOT_ENOUGH_DATA;
-            break;
-        }
-        if (meta_size > body_size) {
-            LOG(ERROR) << "meta_size=" << meta_size << " is bigger than body_size="
-                       << body_size;
-            // Pop the message
-            source->pop_front(sizeof(header_buf) + body_size);
-            pe = PARSE_ERROR_TRY_OTHERS;
-            break;
+#endif  // BRPC_WITH_GDR
+    char header_buf[12];
+    const size_t n = source->copy_to(header_buf, sizeof(header_buf));
+    if (n >= 4) {
+        void* dummy = header_buf;
+        if (*(const uint32_t*)dummy != *(const uint32_t*)"PRPC") {
+            return MakeParseError(PARSE_ERROR_TRY_OTHERS);
         }
-    } while (0);
-
-    if (pe != PARSE_OK) {
-#if BRPC_WITH_GDR
-        if (is_gpu_memory) {
-            host_allocator->DeallocateRaw(prefetch_d2h_data);
+    } else {
+        if (memcmp(header_buf, "PRPC", n) != 0) {
+            return MakeParseError(PARSE_ERROR_TRY_OTHERS);
         }
-#endif  // BRPC_WITH_GDR
-        return MakeParseError(pe);
     }
-
+    if (n < sizeof(header_buf)) {
+        return MakeParseError(PARSE_ERROR_NOT_ENOUGH_DATA);
+    }
+    uint32_t body_size;
+    uint32_t meta_size;
+    butil::RawUnpacker(header_buf + 4).unpack32(body_size).unpack32(meta_size);
+    if (body_size > FLAGS_max_body_size) {
+        // We need this log to report the body_size to give users some clues
+        // which is not printed in InputMessenger.
+        LOG(ERROR) << "body_size=" << body_size << " from "
+                   << socket->remote_side() << " is too large";
+        return MakeParseError(PARSE_ERROR_TOO_BIG_DATA);
+    } else if (source->length() < sizeof(header_buf) + body_size) {
+        return MakeParseError(PARSE_ERROR_NOT_ENOUGH_DATA);
+    }
+    if (meta_size > body_size) {
+        LOG(ERROR) << "meta_size=" << meta_size << " is bigger than body_size="
+                   << body_size;
+        // Pop the message
+        source->pop_front(sizeof(header_buf) + body_size);
+        return MakeParseError(PARSE_ERROR_TRY_OTHERS);
+    }
     source->pop_front(sizeof(header_buf));
     MostCommonMessage* msg = MostCommonMessage::Get();
-
-#if BRPC_WITH_GDR
-    if (is_gpu_memory) {
-        if (header_size + meta_size <= n) {
-            auto deleter = [host_allocator, prefetch_d2h_data](void* data) { host_allocator->DeallocateRaw(prefetch_d2h_data); };
-            msg->meta.append_user_data_with_meta((char*)prefetch_d2h_data + header_size, meta_size, deleter, n);
-            source->pop_front(meta_size);
-        } else {
-            host_allocator->DeallocateRaw(prefetch_d2h_data);
-            source->cutn_from_gpu(&msg->meta, meta_size);
-        }
-        source->cutn(&msg->payload, body_size - meta_size);
-    } else
-#endif  // BRPC_WITH_GDR
-    {
-        source->cutn(&msg->meta, meta_size);
-        source->cutn(&msg->payload, body_size - meta_size);
-    }
+    source->cutn(&msg->meta, meta_size);
+    source->cutn(&msg->payload, body_size - meta_size);
     return MakeMessage(msg);
 }
 
@@ -869,18 +805,8 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
 #if BRPC_WITH_GDR
             int meta_size = msg->meta.size();
             bool is_gpu_memory = msg->payload.is_gpu_memory();
-            if(is_gpu_memory) {
-                int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
-                if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
-                    void* data = msg->meta.get_first_data_ptr();
-                    if (data == nullptr) {
-                        LOG(FATAL) << "illegal data!!!";
-                    }
-                    req_buf.append((char*)data + meta_size, body_without_attachment_size);
-                    msg->payload.pop_front(body_without_attachment_size);
-                } else {
-                    msg->payload.cutn_from_gpu(&req_buf, body_without_attachment_size);
-                }
+            if (is_gpu_memory) {
+                FillReqBufGpu(&req_buf, msg, body_without_attachment_size);
             } else
 #endif  // BRPC_WITH_GDR
             {
@@ -1057,55 +983,25 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
         butil::IOBuf res_buf;
         const int res_size = msg->payload.length();
         butil::IOBuf* res_buf_ptr = &msg->payload;
-
 #if BRPC_WITH_GDR
-        int meta_size = msg->meta.size();
         bool is_gpu_memory = msg->payload.is_gpu_memory();
+        if (is_gpu_memory) {
+            FillResBufGpu(&res_buf, msg, meta);
+        } else
 #endif  // BRPC_WITH_GDR
-        if (meta.has_attachment_size()) {
-            if (meta.attachment_size() > res_size) {
-                cntl->SetFailed(
-                    ERESPONSE, "attachment_size=%d is larger than response_size=%d",
-                    meta.attachment_size(), res_size);
-                break;
-            }
-            int body_without_attachment_size = res_size - meta.attachment_size();
-
-#if BRPC_WITH_GDR
-            if(is_gpu_memory) {
-                int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
-                if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
-                    void* data = msg->meta.get_first_data_ptr();
-                    if (data == nullptr) {
-                        LOG(FATAL) << "illegal data!!!";
-                    }
-                    res_buf.append((char*)data + meta_size, body_without_attachment_size);
-                    msg->payload.pop_front(body_without_attachment_size);
-                } else {
-                    msg->payload.cutn_from_gpu(&res_buf, body_without_attachment_size);
+        {
+            if (meta.has_attachment_size()) {
+                if (meta.attachment_size() > res_size) {
+                    cntl->SetFailed(
+                            ERESPONSE, "attachment_size=%d is larger than response_size=%d",
+                            meta.attachment_size(), res_size);
+                    break;
                 }
-            } else
-#endif  // BRPC_WITH_GDR
-            {
+                int body_without_attachment_size = res_size - meta.attachment_size();
                 msg->payload.cutn(&res_buf, body_without_attachment_size);
+                res_buf_ptr = &res_buf;
+                cntl->response_attachment().swap(msg->payload);
             }
-            res_buf_ptr = &res_buf;
-            cntl->response_attachment().swap(msg->payload);
-#if BRPC_WITH_GDR
-        } else if(is_gpu_memory) {
-            int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
-            if (header_size + meta_size + res_size <= real_prefetch_d2h_size) {
-                void* data = msg->meta.get_first_data_ptr();
-                if (data == nullptr) {
-                    LOG(FATAL) << "illegal data!!!";
-                }
-                res_buf.append((char*)data + meta_size, res_size);
-                msg->payload.pop_front(res_size);
-            } else {
-                msg->payload.cutn_from_gpu(&res_buf, res_size);
-            }
-            res_buf_ptr = &res_buf;
-#endif  // BRPC_WITH_GDR
         }
 
         ContentType content_type = meta.content_type();
diff --git a/src/brpc/policy/baidu_rpc_protocol.h b/src/brpc/policy/baidu_rpc_protocol.h
index 77ecc780a2..cd0046302d 100644
--- a/src/brpc/policy/baidu_rpc_protocol.h
+++ b/src/brpc/policy/baidu_rpc_protocol.h
@@ -53,6 +53,16 @@ void PackRpcRequest(butil::IOBuf* buf,
 // Returns the `name' of the 'content_type'.
 const char* ContentTypeToCStr(ContentType content_type);
 
+#if BRPC_WITH_GDR
+// Parse binary format of baidu_std
+ParseResult ParseRpcMessageGpu(butil::IOBuf* source, Socket *socket, bool read_eof,
+                            const void *arg);
+
+void FillReqBufGpu(butil::IOBuf* req_buf, MostCommonMessage* msg, int body_without_attachment_size);
+
+void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta& meta);
+
+#endif
 }  // namespace policy
 } // namespace brpc
 
diff --git a/src/brpc/policy/baidu_rpc_protocol_gpu.cpp b/src/brpc/policy/baidu_rpc_protocol_gpu.cpp
new file mode 100644
index 0000000000..6ec373b9e0
--- /dev/null
+++ b/src/brpc/policy/baidu_rpc_protocol_gpu.cpp
@@ -0,0 +1,227 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#if BRPC_WITH_GDR
+
+#include <google/protobuf/descriptor.h>         // MethodDescriptor
+#include <google/protobuf/message.h>            // Message
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/text_format.h>
+
+#include "butil/gpu/gpu_block_pool.h"
+#include "butil/iobuf.h"                         // butil::IOBuf
+#include "butil/logging.h"                       // LOG()
+#include "butil/raw_pack.h"                      // RawPacker RawUnpacker
+#include "butil/memory/scope_guard.h"
+#include "butil/raw_pack.h"                      // RawPacker RawUnpacker
+#include "butil/strings/string_util.h"
+
+#include "json2pb/json_to_pb.h"
+#include "json2pb/pb_to_json.h"
+#include "brpc/controller.h"                    // Controller
+#include "brpc/socket.h"                        // Socket
+#include "brpc/server.h"                        // Server
+#include "brpc/span.h"
+#include "brpc/compress.h"                      // ParseFromCompressedData
+#include "brpc/checksum.h"
+#include "brpc/stream_impl.h"
+#include "brpc/rpc_dump.h"                      // SampledRequest
+#include "brpc/rpc_pb_message_factory.h"
+#include "brpc/policy/baidu_rpc_meta.pb.h"      // RpcRequestMeta
+#include "brpc/policy/baidu_rpc_protocol.h"
+#include "brpc/policy/most_common_message.h"
+#include "brpc/policy/streaming_rpc_protocol.h"
+#include "brpc/details/usercode_backup_pool.h"
+#include "brpc/details/controller_private_accessor.h"
+#include "brpc/details/server_private_accessor.h"
+
+namespace brpc {
+namespace policy {
+
+// Notes:
+// 1. 12-byte header [PRPC][body_size][meta_size]
+// 2. body_size and meta_size are in network byte order
+// 3. Use service->full_name() + method_name to specify the method to call
+// 4. `attachment_size' is set iff request/response has attachment
+// 5. Not supported: chunk_info
+
+// Pack header into `buf'
+
+const int header_size = 12;
+// if we recv data into gpu, the header/meta/body will be copied to cpu and processed.
+// in to to limit the count of d2h, we will prefetch 512B from gpu to cpu.
+// if header_size + meta_size + body_size(without attachment) is less than 512, then one
+// d2h is enough for one rpc.
+
+const int prefetch_d2h_size = 512;
+
+ParseResult ParseRpcMessageGpu(butil::IOBuf* source, Socket* socket,
+                            bool /*read_eof*/, const void*) {
+
+    char header_buf[12];
+    size_t n = 0;
+    uint32_t body_size;
+    uint32_t meta_size;
+    ParseError pe = PARSE_OK;
+
+    void* prefetch_d2h_data = NULL;
+    bool is_gpu_memory = source->is_gpu_memory();
+    if (!is_gpu_memory) {
+        LOG(FATAL) << "RpcMessage is not in gpu!!!";
+    }
+    butil::gdr::BlockPoolAllocator* host_allocator = butil::gdr::BlockPoolAllocators::singleton()->get_cpu_allocator();
+    prefetch_d2h_data = host_allocator->AllocateRaw(prefetch_d2h_size);
+    if (prefetch_d2h_data == NULL) {
+        LOG(FATAL) << "alloc host data failed!!!";
+    }
+
+    // n is the bytes we real frefetch, n maybe less than prefetch_d2h_size;
+    n = source->copy_from_gpu(prefetch_d2h_data, prefetch_d2h_size);
+    size_t copy_size = n > 12 ? 12 : n;
+    memcpy(header_buf, prefetch_d2h_data, copy_size);
+
+    do {
+        if (n >= 4) {
+            void* dummy = header_buf;
+            if (*(const uint32_t*)dummy != *(const uint32_t*)"PRPC") {
+                pe = PARSE_ERROR_TRY_OTHERS;
+                break;
+            }
+        } else {
+            if (memcmp(header_buf, "PRPC", n) != 0) {
+                pe = PARSE_ERROR_TRY_OTHERS;
+                break;
+            }
+        }
+        if (n < sizeof(header_buf)) {
+            pe = PARSE_ERROR_NOT_ENOUGH_DATA;
+            break;
+        }
+        butil::RawUnpacker(header_buf + 4).unpack32(body_size).unpack32(meta_size);
+        if (body_size > FLAGS_max_body_size) {
+            // We need this log to report the body_size to give users some clues
+            // which is not printed in InputMessenger.
+            LOG(ERROR) << "body_size=" << body_size << " from "
+                       << socket->remote_side() << " is too large";
+            pe = PARSE_ERROR_TOO_BIG_DATA;
+            break;
+        } else if (source->length() < sizeof(header_buf) + body_size) {
+            pe = PARSE_ERROR_NOT_ENOUGH_DATA;
+            break;
+        }
+        if (meta_size > body_size) {
+            LOG(ERROR) << "meta_size=" << meta_size << " is bigger than body_size="
+                       << body_size;
+            // Pop the message
+            source->pop_front(sizeof(header_buf) + body_size);
+            pe = PARSE_ERROR_TRY_OTHERS;
+            break;
+        }
+    } while (0);
+
+    if (pe != PARSE_OK) {
+        host_allocator->DeallocateRaw(prefetch_d2h_data);
+        return MakeParseError(pe);
+    }
+
+    source->pop_front(sizeof(header_buf));
+    MostCommonMessage* msg = MostCommonMessage::Get();
+
+    if (header_size + meta_size <= n) {
+        auto deleter = [host_allocator, prefetch_d2h_data](void* data) { host_allocator->DeallocateRaw(prefetch_d2h_data); };
+        // n is the bytes we real frefetch. We set n as the meta and n will be used in ProcessRpcRequest/ProcessRpcResponse.
+        // This is a trick, we should keep n in another better way.
+        msg->meta.append_user_data_with_meta((char*)prefetch_d2h_data + header_size, meta_size, deleter, n);
+        source->pop_front(meta_size);
+    } else {
+        host_allocator->DeallocateRaw(prefetch_d2h_data);
+        source->cutn_from_gpu(&msg->meta, meta_size);
+    }
+    source->cutn(&msg->payload, body_size - meta_size);
+    return MakeMessage(msg);
+}
+
+
+void FillReqBufGpu(butil::IOBuf* req_buf, MostCommonMessage* msg, int body_without_attachment_size) {
+    int meta_size = msg->meta.size();
+    bool is_gpu_memory = msg->payload.is_gpu_memory();
+    if (!is_gpu_memory) {
+        LOG(FATAL) << "message is not on gpu!!!";
+    }
+    int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
+    if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
+        void* data = msg->meta.get_first_data_ptr();
+        if (data == nullptr) {
+            LOG(FATAL) << "illegal data!!!";
+        }
+        req_buf->append((char*)data + meta_size, body_without_attachment_size);
+        msg->payload.pop_front(body_without_attachment_size);
+    } else {
+        msg->payload.cutn_from_gpu(req_buf, body_without_attachment_size);
+    }
+}
+
+void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta& meta) {
+    const int res_size = msg->payload.length();
+    int meta_size = msg->meta.size();
+    butil::IOBuf* res_buf_ptr = &msg->payload;
+    bool is_gpu_memory = msg->payload.is_gpu_memory();
+    if (!is_gpu_memory) {
+        LOG(FATAL) << "message is not on gpu!!!";
+    }
+    if (meta.has_attachment_size()) {
+        if (meta.attachment_size() > res_size) {
+            cntl->SetFailed(
+                    ERESPONSE, "attachment_size=%d is larger than response_size=%d",
+                    meta.attachment_size(), res_size);
+            break;
+        }
+        int body_without_attachment_size = res_size - meta.attachment_size();
+
+        int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
+        if (header_size + meta_size + body_without_attachment_size <= real_prefetch_d2h_size) {
+            void* data = msg->meta.get_first_data_ptr();
+            if (data == nullptr) {
+                LOG(FATAL) << "illegal data!!!";
+            }
+            res_buf->append((char*)data + meta_size, body_without_attachment_size);
+            msg->payload.pop_front(body_without_attachment_size);
+        } else {
+            msg->payload.cutn_from_gpu(res_buf, body_without_attachment_size);
+        }
+        res_buf_ptr = res_buf;
+        cntl->response_attachment().swap(msg->payload);
+    } else {
+        int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
+        if (header_size + meta_size + res_size <= real_prefetch_d2h_size) {
+            void* data = msg->meta.get_first_data_ptr();
+            if (data == nullptr) {
+                LOG(FATAL) << "illegal data!!!";
+            }
+            res_buf->append((char*)data + meta_size, res_size);
+            msg->payload.pop_front(res_size);
+        } else {
+            msg->payload.cutn_from_gpu(res_buf, res_size);
+        }
+        res_buf_ptr = res_buf;
+    }
+}
+
+#endif
+}  // namespace policy
+} // namespace brpc
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 6f77d2cf3d..6d26779eb7 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -1115,6 +1115,7 @@ int RdmaEndpoint::PostRecv(uint32_t num, bool zerocopy) {
                 void* device_ptr = device_allocator->AllocateRaw(g_gdr_recv_block_size);
                 auto deleter = [device_allocator](void* data) { device_allocator->DeallocateRaw(data); };
                 lkey = device_allocator->get_lkey(device_ptr);
+                // we keep lkey into the meta, and this is a thick. we also keep prefetch d2h size in meta too.
                 _rbuf[_rq_received].append_user_data_with_meta(device_ptr, g_gdr_recv_block_size, deleter, lkey);
                 _rbuf_data[_rq_received] = device_ptr;
             } else

From aeec3c7fb35d83e7f0ba6d01092cf5090af86e46 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Sun, 31 May 2026 23:45:18 +0800
Subject: [PATCH 22/23] Fix

---
 BUILD.bazel                                |  2 +-
 example/rdma_performance/client.cpp        |  5 ++---
 example/rdma_performance/server.cpp        |  2 +-
 src/brpc/policy/baidu_rpc_protocol.cpp     |  5 ++---
 src/brpc/policy/baidu_rpc_protocol.h       |  5 ++++-
 src/brpc/policy/baidu_rpc_protocol_gpu.cpp | 10 +++++-----
 src/brpc/rdma/rdma_endpoint.cpp            |  5 ++---
 src/butil/gpu/gpu_block_pool.cpp           |  2 +-
 src/butil/gpu/gpu_block_pool.h             |  1 +
 9 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/BUILD.bazel b/BUILD.bazel
index d4c56fe9ce..bff12cbc1e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -46,7 +46,7 @@ COPTS = [
     "//bazel/config:brpc_with_rdma": ["-DBRPC_WITH_RDMA=1"],
     "//conditions:default": [""],
 }) + select({
-    "//bazel/config:brpc_with_gdr": ["-DBRPC_WITH_gdr=1"],
+    "//bazel/config:brpc_with_gdr": ["-DBRPC_WITH_GDR=1"],
     "//conditions:default": [""],
 }) + select({
     "//bazel/config:brpc_with_debug_bthread_sche_safety": ["-DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=1"],
diff --git a/example/rdma_performance/client.cpp b/example/rdma_performance/client.cpp
index 5dcf2a26f3..40944d65a1 100644
--- a/example/rdma_performance/client.cpp
+++ b/example/rdma_performance/client.cpp
@@ -97,9 +97,8 @@ class PerformanceTest {
                 auto pd = brpc::rdma::GetRdmaPd();
                 mr = ibv_reg_mr(pd, _addr, attachment_size,
                         IBV_ACCESS_LOCAL_WRITE |
-                        IBV_ACCESS_LOCAL_READ |
-                        IBV_ACCESS_REMOTE_WRITE |
-                        );
+                        IBV_ACCESS_REMOTE_READ |
+                        IBV_ACCESS_REMOTE_WRITE);
                 if (!mr) {
                     LOG(FATAL) << "Failed to register MR:" << strerror(errno)
                         << ", addr:" << _addr;
diff --git a/example/rdma_performance/server.cpp b/example/rdma_performance/server.cpp
index 5eaa43a307..eca2641513 100644
--- a/example/rdma_performance/server.cpp
+++ b/example/rdma_performance/server.cpp
@@ -28,7 +28,7 @@
 
 DEFINE_int32(port, 8002, "TCP Port of this server");
 DEFINE_bool(use_rdma, true, "Use RDMA or not");
-DEFINE_bool(use_gdr, false, "Use RDMA or not");
+DEFINE_bool(use_gdr, false, "Use GDR or not");
 
 butil::atomic<uint64_t> g_last_time(0);
 
diff --git a/src/brpc/policy/baidu_rpc_protocol.cpp b/src/brpc/policy/baidu_rpc_protocol.cpp
index 43bf7a8b6d..56b694726d 100644
--- a/src/brpc/policy/baidu_rpc_protocol.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol.cpp
@@ -803,10 +803,9 @@ void ProcessRpcRequest(InputMessageBase* msg_base) {
             butil::IOBuf req_buf;
             int body_without_attachment_size = req_size - meta.attachment_size();
 #if BRPC_WITH_GDR
-            int meta_size = msg->meta.size();
             bool is_gpu_memory = msg->payload.is_gpu_memory();
             if (is_gpu_memory) {
-                FillReqBufGpu(&req_buf, msg, body_without_attachment_size);
+                FillReqBufGpu(&req_buf, msg.get(), body_without_attachment_size);
             } else
 #endif  // BRPC_WITH_GDR
             {
@@ -986,7 +985,7 @@ void ProcessRpcResponse(InputMessageBase* msg_base) {
 #if BRPC_WITH_GDR
         bool is_gpu_memory = msg->payload.is_gpu_memory();
         if (is_gpu_memory) {
-            FillResBufGpu(&res_buf, msg, meta);
+            FillResBufGpu(&res_buf, msg.get(), meta, &res_buf_ptr, cntl);
         } else
 #endif  // BRPC_WITH_GDR
         {
diff --git a/src/brpc/policy/baidu_rpc_protocol.h b/src/brpc/policy/baidu_rpc_protocol.h
index cd0046302d..6a3c379142 100644
--- a/src/brpc/policy/baidu_rpc_protocol.h
+++ b/src/brpc/policy/baidu_rpc_protocol.h
@@ -19,6 +19,8 @@
 #ifndef BRPC_POLICY_BRPC_PROTOCOL_H
 #define BRPC_POLICY_BRPC_PROTOCOL_H
 
+#include "brpc/policy/baidu_rpc_meta.pb.h"      // RpcRequestMeta
+#include "brpc/policy/most_common_message.h"
 #include "brpc/protocol.h"
 
 namespace brpc {
@@ -60,7 +62,8 @@ ParseResult ParseRpcMessageGpu(butil::IOBuf* source, Socket *socket, bool read_e
 
 void FillReqBufGpu(butil::IOBuf* req_buf, MostCommonMessage* msg, int body_without_attachment_size);
 
-void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta& meta);
+void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta& meta,
+                   butil::IOBuf** res_buf_ptr, Controller* cntl);
 
 #endif
 }  // namespace policy
diff --git a/src/brpc/policy/baidu_rpc_protocol_gpu.cpp b/src/brpc/policy/baidu_rpc_protocol_gpu.cpp
index 6ec373b9e0..67b06cdcf4 100644
--- a/src/brpc/policy/baidu_rpc_protocol_gpu.cpp
+++ b/src/brpc/policy/baidu_rpc_protocol_gpu.cpp
@@ -176,10 +176,10 @@ void FillReqBufGpu(butil::IOBuf* req_buf, MostCommonMessage* msg, int body_witho
     }
 }
 
-void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta& meta) {
+void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta& meta,
+                   butil::IOBuf** res_buf_ptr, Controller* cntl) {
     const int res_size = msg->payload.length();
     int meta_size = msg->meta.size();
-    butil::IOBuf* res_buf_ptr = &msg->payload;
     bool is_gpu_memory = msg->payload.is_gpu_memory();
     if (!is_gpu_memory) {
         LOG(FATAL) << "message is not on gpu!!!";
@@ -189,7 +189,7 @@ void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta&
             cntl->SetFailed(
                     ERESPONSE, "attachment_size=%d is larger than response_size=%d",
                     meta.attachment_size(), res_size);
-            break;
+            return;
         }
         int body_without_attachment_size = res_size - meta.attachment_size();
 
@@ -204,7 +204,7 @@ void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta&
         } else {
             msg->payload.cutn_from_gpu(res_buf, body_without_attachment_size);
         }
-        res_buf_ptr = res_buf;
+        *res_buf_ptr = res_buf;
         cntl->response_attachment().swap(msg->payload);
     } else {
         int64_t real_prefetch_d2h_size = msg->meta.get_first_data_meta();
@@ -218,7 +218,7 @@ void FillResBufGpu(butil::IOBuf* res_buf, MostCommonMessage* msg, const RpcMeta&
         } else {
             msg->payload.cutn_from_gpu(res_buf, res_size);
         }
-        res_buf_ptr = res_buf;
+        *res_buf_ptr = res_buf;
     }
 }
 
diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp
index 6d26779eb7..159a8008ed 100644
--- a/src/brpc/rdma/rdma_endpoint.cpp
+++ b/src/brpc/rdma/rdma_endpoint.cpp
@@ -1012,13 +1012,12 @@ ssize_t RdmaEndpoint::HandleCompletion(ibv_wc& wc) {
             if (_use_gdr) {
                 zerocopy = true;
             } else
-#else
+#endif // BRPC_WITH_GDR
             {
                 if (wc.byte_len < (uint32_t)FLAGS_rdma_zerocopy_min_size) {
                     zerocopy = false;
                 }
             }
-#endif // BRPC_WITH_GDR
             CHECK(_state != FALLBACK_TCP);
             if (zerocopy) {
                 _rbuf[_rq_received].cutn(&_socket->_read_buf, wc.byte_len);
@@ -1692,7 +1691,7 @@ void RdmaEndpoint::DebugInfo(std::ostream& os, butil::StringPiece connector) con
 
 int RdmaEndpoint::GlobalGdrInitialize() {
 #if BRPC_WITH_GDR
-    g_gdr_recv_block_size = butil::gdr::GetGdrBlockSize() * 1024 - IOBUF_BLOCK_HEADER_LEN;
+    g_gdr_recv_block_size = butil::gdr::GetGdrBlockSize() - IOBUF_BLOCK_HEADER_LEN;
     LOG(INFO) << "g_gdr_recv_block_size: " << g_gdr_recv_block_size;
 #endif // BRPC_WITH_GDR
     return 0;
diff --git a/src/butil/gpu/gpu_block_pool.cpp b/src/butil/gpu/gpu_block_pool.cpp
index a100cf8fb0..86a307ba06 100644
--- a/src/butil/gpu/gpu_block_pool.cpp
+++ b/src/butil/gpu/gpu_block_pool.cpp
@@ -231,7 +231,7 @@ uint32_t BlockPoolAllocator::get_lkey(const void* buf) {
         return 0;
     }
     
-    if(!r->mr) {
+    if (!r->mr) {
         LOG(FATAL) << "region has not been registered into rdma yet, addr:" << r->start;
         return 0;
     }
diff --git a/src/butil/gpu/gpu_block_pool.h b/src/butil/gpu/gpu_block_pool.h
index 16da712178..655c487a92 100644
--- a/src/butil/gpu/gpu_block_pool.h
+++ b/src/butil/gpu/gpu_block_pool.h
@@ -69,6 +69,7 @@ class BlockPoolAllocator {
 
     BlockHeader* freeList;
     int g_region_num {0};
+    std::vector<Region> g_regions;
     std::mutex poolMutex;
 
     // stat

From ec969ab6112134410650647edc7f94eb89ecdf35 Mon Sep 17 00:00:00 2001
From: randomkang <550941794@qq.com>
Date: Mon, 1 Jun 2026 00:14:57 +0800
Subject: [PATCH 23/23] Add docs

---
 docs/cn/gdr.md | 43 +++++++++++++++++++++++++++++++++++++++++++
 docs/en/gdr.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 docs/cn/gdr.md
 create mode 100644 docs/en/gdr.md

diff --git a/docs/cn/gdr.md b/docs/cn/gdr.md
new file mode 100644
index 0000000000..7e46f19d2f
--- /dev/null
+++ b/docs/cn/gdr.md
@@ -0,0 +1,43 @@
+# 编译
+
+GDR: GPU Direct Rdma, gdr 是rdma的一种特殊模式，其通过rdma将数据直接收到了gpu的显存上。
+
+由于GDR对驱动与硬件有要求，目前仅支持在Linux系统编译并运行GDR功能。
+
+目前GDR只支持baidu std protocol。
+
+使用config_brpc：
+```bash
+sh config_brpc.sh --with-rdma --with-gdr --headers="/usr/include" --libs="/usr/lib64 /usr/bin"
+make
+
+cd example/rdma_performance  # 示例程序
+make
+```
+
+使用bazel:
+```bash
+# Server
+bazel build --define=BRPC_WITH_RDMA=true --define=BRPC_WITH_GDR=true example:rdma_performance_server
+# Client
+bazel build --define=BRPC_WITH_RDMA=true --define=BRPC_WITH_GDR=true example:rdma_performance_client
+```
+
+# 基本实现
+
+GDR是RDMA的一种特殊形式，在使用GDR之前，必须对RDMA和GDR都进行Global Init。
+GDR新增了一个显存池，类似于RDMA内存池，显存池的数据也是按照block进行组织的。
+当打开GDR功能后，框架通过DoPostRecvGDR来发起显存上的WQE。
+在接收到数据后，我们将header、meta、body（不包括attachment）copy回内存进行处理。
+AttachMent位于显存上，用户可以调用IOBuf::copy_from_gpu接口将attachment从brpc框架层copy到应用层进行处理。
+
+
+注意：
+1. 在使用gdr功能时，需要将环境变量MLX5_SCATTER_TO_CQE设置为0.
+
+
+# 参数
+
+可配置参数说明：
+* gdr_block_size_kb: 使用gdr传送数据时，block的大小（单位为KB），默认为512；
+* max_gdr_regions: gdr显存池所使用Region的最大个数，每个Region大小为1GB； 
diff --git a/docs/en/gdr.md b/docs/en/gdr.md
new file mode 100644
index 0000000000..2e968f3e69
--- /dev/null
+++ b/docs/en/gdr.md
@@ -0,0 +1,44 @@
+Compile GDR:
+
+GPU Direct RDMA. GDR is a special mode of RDMA that allows data to be received directly into the GPU’s memory through RDMA.
+Because GDR requires specific drivers and hardware support, it is currently only available for compilation and execution on Linux systems.
+At present, GDR only supports the Baidu STD protocol.
+
+To use config_brpc:
+
+sh config_brpc.sh --with-rdma --with-gdr --headers="/usr/include" --libs="/usr/lib64 /usr/bin"
+make
+cd example/rdma_performance   # Example program
+make
+
+To use Bazel:
+
+# Server
+bazel build --define=BRPC_WITH_RDMA=true --define=BRPC_WITH_GDR=true example:rdma_performance_server
+
+# Client
+bazel build --define=BRPC_WITH_RDMA=true --define=BRPC_WITH_GDR=true example:rdma_performance_client
+
+
+Basic Implementation:
+
+GDR is a special form of RDMA. Before using GDR, both RDMA and GDR must be globally initialized.
+
+GDR introduces a GPU memory pool, similar to the RDMA memory pool. Data in the GPU memory pool is also organized in blocks.
+
+When GDR is enabled, the framework initiates WQEs on GPU memory through DoPostRecvGDR.
+
+After receiving data, the header, meta, and body (excluding attachments) are copied back to host memory for processing.
+Attachments remain in GPU memory, and users can call IOBuf::copy_from_gpu to copy attachments from the brpc framework layer to the application layer.
+
+Note:
+
+When using GDR, the environment variable MLX5_SCATTER_TO_CQE must be set to 0.
+
+Parameters
+
+Configurable parameters:
+
+gdr_block_size_kb: The block size (in KB) used when transferring data via GDR. Default is 512.
+
+max_gdr_regions: The maximum number of regions used by the GDR GPU memory pool. Each region is 1 GB.