2 changes: 1 addition & 1 deletion examples/best_practices/DeepseekV3/dsv3_128k_config.yaml
@@ -68,5 +68,5 @@ use_attn_mask_startend_row_indices: true
using_fake_gate: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
- use_fused_rms_norm: true
+ fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 1024
2 changes: 1 addition & 1 deletion examples/best_practices/DeepseekV3/dsv3_32k_config.yaml
@@ -68,5 +68,5 @@ use_attn_mask_startend_row_indices: true
using_fake_gate: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
- use_fused_rms_norm: true
+ fuse_rms_norm: true
Collaborator comment: The DeepSeek V3 pretrain changes need to be confirmed with 张博.
moe_subbatch_token_num_before_dispatch: 0
2 changes: 1 addition & 1 deletion examples/best_practices/DeepseekV3/dsv3_4k_config.yaml
@@ -68,5 +68,5 @@ use_attn_mask_startend_row_indices: true
using_fake_gate: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
- use_fused_rms_norm: true
+ fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 0
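The three best-practice YAML configs above switch from `use_fused_rms_norm` to `fuse_rms_norm`. For anyone carrying local copies of these configs, a minimal migration sketch is shown below; the `migrate_config` helper and key map are illustrative only (not part of this PR) and assume PyYAML is available.

```python
# Hypothetical helper, not part of this PR: rewrite the renamed op-fusion keys
# in a local copy of a best-practice YAML config.
import yaml  # assumes PyYAML is installed

RENAMED_KEYS = {
    "use_fused_rms_norm": "fuse_rms_norm",
    "use_fused_rope": "apply_rope_fusion",
    "fuse_rope": "apply_rope_fusion",
}

def migrate_config(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    for old, new in RENAMED_KEYS.items():
        if old in cfg:
            cfg[new] = cfg.pop(old)  # keep the existing value under the new key
    with open(path, "w") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)

# Example (path from this PR):
# migrate_config("examples/best_practices/DeepseekV3/dsv3_4k_config.yaml")
```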
4 changes: 2 additions & 2 deletions examples/experiments/deepseek_v3_pretrain/config/config.json
@@ -61,9 +61,9 @@
"v_head_dim": 128,
"vocab_size": 129280,
"using_flex_token": true,
"use_fused_rms_norm": true,
"fuse_rms_norm": true,
"fuse_attention_ffn": true,
"use_fused_rope": true,
"apply_rope_fusion": true,
"token_drop_steps": 0,
"recompute_fwd_gate_up": true,
"adaptive_remained_O1_recompute_ratio": 0,
@@ -42,9 +42,9 @@
"unified_checkpoint": true,
"save_total_limit": 2,
"skip_profile_timer": false,
"use_fused_rms_norm": true,
"fuse_rms_norm": true,
"fuse_attention_ffn": true,
"use_fused_rope": true,
"apply_rope_fusion": true,
"save_sharded_model": false,
"load_sharded_model": false,
"use_expert_parallel": true,
@@ -42,9 +42,9 @@ distributed_dataloader: 1
unified_checkpoint: true
save_total_limit: 2
skip_profile_timer: false
- use_fused_rms_norm: true
+ fuse_rms_norm: true
fuse_attention_ffn: true
- use_fused_rope: true
+ apply_rope_fusion: true
save_sharded_model: false
load_sharded_model: false
use_expert_parallel: true
10 changes: 5 additions & 5 deletions examples/experiments/deepseek_v3_pretrain/modeling.py
@@ -655,7 +655,7 @@ def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: bool = Fal
self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

self.is_causal = True
- self.fuse_rope = config.use_fused_rope
+ self.apply_rope_fusion = config.apply_rope_fusion

if config.num_nextn_predict_layers > 0:
self.seq_length = config.seq_length - config.num_nextn_predict_layers
@@ -858,7 +858,7 @@ def forward(
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
cos = cos[None, :, None, :]
sin = sin[None, :, None, :]
- q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.fuse_rope)
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.apply_rope_fusion)

query_states = paddle.cat([q_nope, q_pe], axis=-1)
key_states = paddle.cat([k_nope, k_pe], axis=-1)
@@ -1975,7 +1975,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps=1e-6, use
mark_as_sequence_parallel_parameter(self.weight)

def forward(self, hidden_states):
- if self.config.use_fused_rms_norm:
+ if self.config.fuse_rms_norm:
return RmsNormFunction.apply(hidden_states, self.weight, self.variance_epsilon)

with paddle.amp.auto_cast(False):
@@ -1991,7 +1991,7 @@ def extra_repr(self):
return f"hidden_size={self.hidden_size}, dtype={self.weight.dtype}"


- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, fuse_rope=False):
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion=False):
"""Applies Rotary Position Embedding to the query and key tensors.

Args:
@@ -2018,7 +2018,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, fuse_rope=False):
b, s, h, d = k.shape
k = k.reshape([b, s, h, d // 2, 2]).transpose([0, 1, 2, 4, 3]).reshape([b, s, h, d])

- if (get_env_device() == "xpu" or get_env_device() == "gpu") and fuse_rope:
+ if (get_env_device() == "xpu" or get_env_device() == "gpu") and apply_rope_fusion:
q_embed, k_embed, _ = fused_rotary_position_embedding(
q,
k,
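In this modeling.py, `config.use_fused_rms_norm` becomes `config.fuse_rms_norm` as the switch into `RmsNormFunction`, and the RoPE helpers take `apply_rope_fusion` instead of `fuse_rope`. As a reference point, here is a minimal sketch of the eager RMSNorm fallback that the non-fused branch typically computes (an assumed simplification, not this file's exact code):

```python
import paddle

def rms_norm_eager(hidden_states, weight, variance_epsilon):
    # Assumed eager fallback when fuse_rms_norm is False:
    # x * rsqrt(mean(x^2) + eps) * weight, computed in float32 for stability.
    with paddle.amp.auto_cast(False):
        x = hidden_states.astype("float32")
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * paddle.rsqrt(variance + variance_epsilon)
    return (x * weight.astype("float32")).astype(hidden_states.dtype)
```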
4 changes: 2 additions & 2 deletions examples/experiments/deepseek_v3_pretrain/run_pretrain.py
@@ -509,9 +509,9 @@ def main():
# config.using_flex_token = True
# config.num_nextn_predict_layers = 1
# config.using_fake_gate = True
- # config.use_fused_rms_norm = True
+ # config.fuse_rms_norm = True
# config.fuse_attention_ffn = True
- # config.use_fused_rope = True
+ # config.apply_rope_fusion = True
# config.token_drop_steps = 0
model = model_class.from_config(config, dtype=dtype)

@@ -32,7 +32,7 @@
"fuse_attn_ffn": true,
"fuse_linear": true,
"rope_reorder": false,
"fuse_rope": true,
"apply_rope_fusion": true,
"fuse_swiglu": true,
"fuse_gate_detach_matmul": true,
"remove_tail_layer": 2,
@@ -31,7 +31,7 @@
"fuse_attn_ffn": true,
"fuse_linear": true,
"rope_reorder": false,
"fuse_rope": true,
"apply_rope_fusion": true,
"fuse_swiglu": true,
"fuse_gate_detach_matmul": true,
"remove_tail_layer": 2,
@@ -99,7 +99,7 @@ def __init__(
expert_mlp_use_bias=None,
rope_reorder=True,
rope_theta=10000,
- fuse_rope=False,
+ apply_rope_fusion=False,
use_fast_ln=False,
weight_share_add_bias=True,
fuse_linear=False,
@@ -230,7 +230,7 @@ def __init__(
self.weight_share_add_bias = weight_share_add_bias
self.rope_reorder = rope_reorder
self.rope_theta = rope_theta
- self.fuse_rope = fuse_rope
+ self.apply_rope_fusion = apply_rope_fusion
self.use_fast_ln = use_fast_ln

self.fuse_linear = fuse_linear
8 changes: 4 additions & 4 deletions examples/experiments/ernie_pretrain/models/ernie/modeling.py
@@ -847,12 +847,12 @@ def __init__(self, config, layer_idx=0):
self.use_recompute_attn = config.use_recompute_attn
logger.info(f"using recompute attn={self.use_recompute_attn}")
self.is_gqa = config.num_key_value_heads is not None and config.num_key_value_heads != self.num_heads
- if config.fuse_rope:
+ if config.apply_rope_fusion:
assert fused_rope is not None, "fused_rope is not supported"
- self.fuse_rope = config.fuse_rope
+ self.apply_rope_fusion = config.apply_rope_fusion
self.rope_3d = config.rope_3d
if self.rope_3d:
- assert not self.fuse_rope, "does not support fuse rope when rope_3d is on for now."
+ assert not self.apply_rope_fusion, "does not support fuse rope when rope_3d is on for now."
assert not config.rope_reorder, "does not support rope_reorder when rope_3d is on for now."
assert config.freq_allocation is not None, "freq_allocation must be provided if rope_3d is on."

@@ -1135,7 +1135,7 @@ def rope_attn(
offset=offset if position_ids is None else 0,
)
else:
- if offset > 0 or position_ids is not None or not self.fuse_rope:
+ if offset > 0 or position_ids is not None or not self.apply_rope_fusion:
if not self.rope_3d:
cos_sin = self.rotary_emb(kv_seq_len, position_ids).transpose([0, 2, 1, 3])
if offset > 0 and position_ids is None:
1 change: 0 additions & 1 deletion paddleformers/cli/hparams/model_args.py
@@ -80,7 +80,6 @@ class ModelArguments:
default=False,
metadata={"help": "Whether to fuse softmax and add"},
)
- fuse_rms_norm: bool = field(default=True, metadata={"help": "Whether to fuse RMSNorm for efficiency"})
use_fast_layer_norm: bool = field(
default=False,
metadata={"help": "GPT3 model, use fast layernorm"},
4 changes: 2 additions & 2 deletions paddleformers/cli/train/pretrain/workflow.py
@@ -514,9 +514,9 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args):
# config.using_flex_token = True
# config.num_nextn_predict_layers = 1
# config.using_fake_gate = True
- # config.use_fused_rms_norm = True
+ # config.fuse_rms_norm = True
# config.fuse_attention_ffn = True
- # config.use_fused_rope = True
+ # config.apply_rope_fusion = True
# config.token_drop_steps = 0
model = model_class.from_config(config, dtype=dtype)

4 changes: 2 additions & 2 deletions paddleformers/nn/pp_model.py
@@ -267,7 +267,7 @@ def forward(self, args):
)
input_ids.stop_gradient = True
emb = self.embed_tokens(input_ids).astype(self.embed_tokens.weight.dtype)
- if position_ids is None and not self.config.fuse_rope:
+ if position_ids is None and not self.config.apply_rope_fusion:
position_ids = (
paddle.arange(
0,
@@ -277,7 +277,7 @@
.unsqueeze(0)
.tile([input_ids.shape[0], 1])
)
- if self.config.fuse_rope:
+ if self.config.apply_rope_fusion:
position_embeddings = None
else:
position_embeddings = paddle.stack(self.rotary_emb(emb, position_ids)) # cos and sin
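In `pp_model.py`, the renamed `config.apply_rope_fusion` decides whether default `position_ids` and precomputed cos/sin are built at all. A condensed sketch of that gating (assuming, as the code above suggests, that the fused path computes rotary embeddings inside the kernel and therefore skips both):

```python
import paddle

def maybe_build_position_ids(input_ids, apply_rope_fusion: bool):
    # When RoPE fusion is enabled, position_ids / cos-sin stay None and are
    # handled by the fused kernel; otherwise build default 0..seq_len-1 ids.
    if apply_rope_fusion:
        return None
    batch_size, seq_len = input_ids.shape
    return paddle.arange(0, seq_len, dtype="int64").unsqueeze(0).tile([batch_size, 1])
```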
9 changes: 3 additions & 6 deletions paddleformers/transformers/configuration_utils.py
@@ -229,14 +229,11 @@ def llmmetaclass(cls):
class LlmMetaConfig:
op_fusion_attributes = [
# name, type, default_value, comment
("use_flash_attention", bool, False, "Whether to use flash attention to accelerate training."),
("use_fused_rms_norm", bool, False, "llama or other model, use_fused_rms_norm"),
("use_fused_rope", bool, False, "Enable rope fusion or not."),
("use_fused_linear", bool, False, "GPT3 model, use fused linear layer"),
("use_fused_dropout_add", bool, False, "GPT3 model, use fused `dropout + residual add` op."),
("use_flash_attention", bool, False, "Only used in `ernie45_vl` and `deepseek_v3_pretrain`."),
("fuse_rms_norm", bool, True, "Whether to fuse RMSNorm for efficiency"),
("use_fused_linear_cross_entropy", bool, False, "use fused `linear + cross_entropy` fuse op."),
("fuse_linear", bool, False, "Use fused linear layer instead of normal linear layer."),
("fuse_rope", bool, False, "Whether to fuse RoPE operation"),
("apply_rope_fusion", bool, False, "Whether to fuse RoPE operation"),
("fuse_swiglu", bool, False, "Whether to fuse SwiGLU operations"),
("fuse_attention_qkv", bool, False, "Whether to fuse Attention QKV operations"),
("fuse_attention_ffn", bool, False, "Whether to fuse Attention FFN operations"),
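The `LlmMetaConfig` table above consolidates the fusion switches: `use_fused_rms_norm`, `use_fused_rope`, `fuse_rope`, `use_fused_linear`, and `use_fused_dropout_add` are dropped, `fuse_rms_norm` (default `True`) and `apply_rope_fusion` (default `False`) replace them, and `use_flash_attention` is narrowed to `ernie45_vl` and `deepseek_v3_pretrain`. An illustrative mapping of caller-side overrides to the new names (with `SimpleNamespace` standing in for a real config object):

```python
from types import SimpleNamespace

config = SimpleNamespace()          # stand-in for the model's config object
config.fuse_rms_norm = True         # was: config.use_fused_rms_norm
config.apply_rope_fusion = True     # was: config.use_fused_rope / config.fuse_rope
config.fuse_attention_ffn = True    # unchanged by this PR

assert config.apply_rope_fusion
```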
10 changes: 5 additions & 5 deletions paddleformers/transformers/deepseek_v3/modeling.py
@@ -201,7 +201,7 @@ def rotate_half(x):
return paddle.cat([-x2, x1], axis=-1) # shape is the same as x


- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, fuse_rope=False):
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion=False):
"""Applies Rotary Position Embedding to the query and key tensors.

Args:
@@ -535,7 +535,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int):
self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

self.is_causal = True
- self.fuse_rope = config.use_fused_rope
+ self.apply_rope_fusion = config.apply_rope_fusion

self.seq_length = config.seq_length
self.tensor_parallel = config.tensor_model_parallel_size > 1
@@ -702,7 +702,7 @@ def forward(
cos, sin = position_embeddings[0], position_embeddings[1]
cos = cos[None, :, None, :]
sin = sin[None, :, None, :]
- q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.fuse_rope)
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.apply_rope_fusion)
query_states = paddle.cat([q_nope, q_pe], axis=-1)
key_states = paddle.cat([k_nope, k_pe], axis=-1)

@@ -1551,7 +1551,7 @@ def forward(
if past_key_values is not None:
seq_length_with_past += past_key_values_length

- if position_ids is None and not self.config.fuse_rope:
+ if position_ids is None and not self.config.apply_rope_fusion:
position_ids = (
paddle.arange(
0,
@@ -2234,7 +2234,7 @@ def forward(self, args):
)
attn_mask = attn_mask_startend_row_indices if attn_mask_startend_row_indices is not None else attn_mask

- if position_ids is None and not self.config.fuse_rope:
+ if position_ids is None and not self.config.apply_rope_fusion:
position_ids = (
paddle.arange(
0,
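In the DeepseekV3 attention path, `apply_rotary_pos_emb(..., apply_rope_fusion=...)` keeps the same two branches as before the rename: a fused rotary op when the flag is set and an eager rotate-half formulation otherwise. A minimal eager-path sketch (an assumed simplification of this file's code):

```python
import paddle

def rotate_half(x):
    # Split the last dimension in half and rotate: (x1, x2) -> (-x2, x1).
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return paddle.concat([-x2, x1], axis=-1)

def apply_rotary_pos_emb_eager(q, k, cos, sin):
    # Eager path used when apply_rope_fusion is False.
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```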
9 changes: 3 additions & 6 deletions paddleformers/transformers/ernie4_5/configuration.py
@@ -40,7 +40,6 @@ def __init__(
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=False,
- use_flash_attention=False,
recompute=False,
recompute_granularity="core_attn",
recompute_use_reentrant=False,
@@ -50,7 +49,7 @@
eos_token_id=2,
use_bias=False,
rope_theta=10000,
- fuse_rope=False,
+ apply_rope_fusion=False,
fuse_softmax_mask=False,
fuse_linear=False,
max_sequence_length=None,
@@ -77,7 +76,6 @@
num_attention_heads (int): Number of attention heads for each attention layer
rms_norm_eps (float): The epsilon used by the RMS normalization layers
use_cache (bool): Whether to use caching for faster generation (decoding)
- use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
recompute (bool): Whether to use gradient checkpointing to save memory
recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.)
recompute_use_reentrant (bool): Whether to use reentrant checkpointing
@@ -89,7 +87,7 @@
eos_token_id (int): Token ID used for end-of-sequence
use_bias (bool): Whether to use bias terms in linear layers
rope_theta (float): The base period of the RoPE embeddings
- fuse_rope (bool): Whether to fuse RoPE operations
+ apply_rope_fusion (bool): Whether to fuse RoPE operations
fuse_linear (bool): Whether to fuse linear operations
fuse_up_gate (bool): Whether to fuse up_proj and gate_proj to a single linear layer
max_sequence_length (int): Maximum sequence length for positional embeddings
@@ -123,7 +121,6 @@ def __init__(
self.use_cache = use_cache
self.recompute = recompute
self.recompute_granularity = recompute_granularity
- self.use_flash_attention = use_flash_attention
self.recompute_use_reentrant = recompute_use_reentrant
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
@@ -134,7 +131,7 @@
self.use_bias = use_bias
self.rope_theta = rope_theta
self.tie_word_embeddings = tie_word_embeddings
- self.fuse_rope = fuse_rope
+ self.apply_rope_fusion = apply_rope_fusion
self.fuse_softmax_mask = fuse_softmax_mask
self.fuse_linear = fuse_linear
self.ignored_index = ignored_index
6 changes: 3 additions & 3 deletions paddleformers/transformers/ernie4_5/modeling.py
@@ -296,7 +296,7 @@ def forward(

attention_interface = ALL_ATTENTION_FUNCTIONS[self.attn_implementation]

- if self.config.fuse_rope:
+ if self.config.apply_rope_fusion:
query_states, key_states = apply_fused_rope(query_states, key_states, self.config.rope_theta)
else:
cos, sin = position_embeddings
@@ -668,7 +668,7 @@ def forward(
if position_ids is None:
position_ids = paddle.arange(kv_seq_len, seq_length).unsqueeze(0).tile((bsz, 1))

- if not self.config.fuse_rope:
+ if not self.config.apply_rope_fusion:
position_embeddings = self.rotary_emb(hidden_states, position_ids) # cos and sin
else:
position_embeddings = None
@@ -768,7 +768,7 @@ def __init__(self, config):

def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id):
"""Avoid using attention_mask with flash_attn on generation."""
- if self.config.use_flash_attention:
+ if self.config._attn_implementation == "sdpa":
return None
return super().prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)

3 changes: 0 additions & 3 deletions paddleformers/transformers/ernie4_5_moe/configuration.py
@@ -46,7 +46,6 @@ def __init__(
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=False,
- use_flash_attention=True,
use_rmsnorm=True,
pad_token_id=0,
bos_token_id=1,
@@ -110,7 +109,6 @@ def __init__(
hidden_act (str): Name of the activation function used in the feed-forward network
rms_norm_eps (float): The epsilon used by the RMS normalization layers
use_cache (bool): Whether to use caching for faster generation (decoding)
- use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
recompute (bool): Whether to use gradient checkpointing to save memory
recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.)
recompute_use_reentrant (bool): Whether to use reentrant checkpointing
@@ -187,7 +185,6 @@ def __init__(
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.use_flash_attention = use_flash_attention
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
4 changes: 2 additions & 2 deletions paddleformers/transformers/ernie4_5_moe/modeling.py
@@ -907,7 +907,7 @@ def forward(

hidden_states = inputs_embeds

- if self.config.fuse_rope:
+ if self.config.apply_rope_fusion:
position_embeddings = None
else:
position_embeddings = self.rotary_emb(hidden_states, position_ids) # cos and sin
@@ -1111,7 +1111,7 @@ def __init__(self, config):

def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id):
"""Avoid using attention_mask with flash_attn on generation."""
- if self.config.use_flash_attention:
+ if self.config._attn_implementation == "sdpa":
return None
return super().prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
