2 changes: 1 addition & 1 deletion examples/best_practices/DeepseekV3/dsv3_128k_config.yaml
@@ -68,5 +68,5 @@ use_attn_mask_startend_row_indices: true
using_fake_gate: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
- use_fused_rms_norm: true
+ fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 1024
2 changes: 1 addition & 1 deletion examples/best_practices/DeepseekV3/dsv3_32k_config.yaml
@@ -68,5 +68,5 @@ use_attn_mask_startend_row_indices: true
using_fake_gate: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
- use_fused_rms_norm: true
+ fuse_rms_norm: true
Collaborator comment: The DeepSeek V3 pretrain changes need to be confirmed with 张博.
moe_subbatch_token_num_before_dispatch: 0
2 changes: 1 addition & 1 deletion examples/best_practices/DeepseekV3/dsv3_4k_config.yaml
@@ -68,5 +68,5 @@ use_attn_mask_startend_row_indices: true
using_fake_gate: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
- use_fused_rms_norm: true
+ fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 0
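The three best-practice YAML configs above switch from `use_fused_rms_norm` to `fuse_rms_norm`. For anyone carrying local copies of these configs, a minimal migration sketch is shown below; the `migrate_config` helper and key map are illustrative only (not part of this PR) and assume PyYAML is available.

```python
# Hypothetical helper, not part of this PR: rewrite the renamed op-fusion keys
# in a local copy of a best-practice YAML config.
import yaml  # assumes PyYAML is installed

RENAMED_KEYS = {
    "use_fused_rms_norm": "fuse_rms_norm",
    "use_fused_rope": "apply_rope_fusion",
    "fuse_rope": "apply_rope_fusion",
}

def migrate_config(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    for old, new in RENAMED_KEYS.items():
        if old in cfg:
            cfg[new] = cfg.pop(old)  # keep the existing value under the new key
    with open(path, "w") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)

# Example (path from this PR):
# migrate_config("examples/best_practices/DeepseekV3/dsv3_4k_config.yaml")
```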
4 changes: 2 additions & 2 deletions examples/experiments/deepseek_v3_pretrain/config/config.json
@@ -61,9 +61,9 @@
"v_head_dim": 128,
"vocab_size": 129280,
"using_flex_token": true,
"use_fused_rms_norm": true,
"fuse_rms_norm": true,
"fuse_attention_ffn": true,
"use_fused_rope": true,
"apply_rope_fusion": true,
"token_drop_steps": 0,
"recompute_fwd_gate_up": true,
"adaptive_remained_O1_recompute_ratio": 0,
@@ -42,9 +42,9 @@
"unified_checkpoint": true,
"save_total_limit": 2,
"skip_profile_timer": false,
"use_fused_rms_norm": true,
"fuse_rms_norm": true,
"fuse_attention_ffn": true,
"use_fused_rope": true,
"apply_rope_fusion": true,
"save_sharded_model": false,
"load_sharded_model": false,
"use_expert_parallel": true,
@@ -42,9 +42,9 @@ distributed_dataloader: 1
unified_checkpoint: true
save_total_limit: 2
skip_profile_timer: false
- use_fused_rms_norm: true
+ fuse_rms_norm: true
fuse_attention_ffn: true
- use_fused_rope: true
+ apply_rope_fusion: true
save_sharded_model: false
load_sharded_model: false
use_expert_parallel: true
10 changes: 5 additions & 5 deletions examples/experiments/deepseek_v3_pretrain/modeling.py
@@ -655,7 +655,7 @@ def __init__(self, config: DeepseekV2FastConfig, layerwise_recompute: bool = Fal
self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

self.is_causal = True
- self.fuse_rope = config.use_fused_rope
+ self.apply_rope_fusion = config.apply_rope_fusion

if config.num_nextn_predict_layers > 0:
self.seq_length = config.seq_length - config.num_nextn_predict_layers
@@ -858,7 +858,7 @@ def forward(
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
cos = cos[None, :, None, :]
sin = sin[None, :, None, :]
- q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.fuse_rope)
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.apply_rope_fusion)

query_states = paddle.cat([q_nope, q_pe], axis=-1)
key_states = paddle.cat([k_nope, k_pe], axis=-1)
@@ -1975,7 +1975,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps=1e-6, use
mark_as_sequence_parallel_parameter(self.weight)

def forward(self, hidden_states):
- if self.config.use_fused_rms_norm:
+ if self.config.fuse_rms_norm:
return RmsNormFunction.apply(hidden_states, self.weight, self.variance_epsilon)

with paddle.amp.auto_cast(False):
@@ -1991,7 +1991,7 @@ def extra_repr(self):
return f"hidden_size={self.hidden_size}, dtype={self.weight.dtype}"


- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, fuse_rope=False):
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion=False):
"""Applies Rotary Position Embedding to the query and key tensors.

Args:
@@ -2018,7 +2018,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, fuse_rope=False):
b, s, h, d = k.shape
k = k.reshape([b, s, h, d // 2, 2]).transpose([0, 1, 2, 4, 3]).reshape([b, s, h, d])

- if (get_env_device() == "xpu" or get_env_device() == "gpu") and fuse_rope:
+ if (get_env_device() == "xpu" or get_env_device() == "gpu") and apply_rope_fusion:
q_embed, k_embed, _ = fused_rotary_position_embedding(
q,
k,
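In this modeling.py, `config.use_fused_rms_norm` becomes `config.fuse_rms_norm` as the switch into `RmsNormFunction`, and the RoPE helpers take `apply_rope_fusion` instead of `fuse_rope`. As a reference point, here is a minimal sketch of the eager RMSNorm fallback that the non-fused branch typically computes (an assumed simplification, not this file's exact code):

```python
import paddle

def rms_norm_eager(hidden_states, weight, variance_epsilon):
    # Assumed eager fallback when fuse_rms_norm is False:
    # x * rsqrt(mean(x^2) + eps) * weight, computed in float32 for stability.
    with paddle.amp.auto_cast(False):
        x = hidden_states.astype("float32")
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * paddle.rsqrt(variance + variance_epsilon)
    return (x * weight.astype("float32")).astype(hidden_states.dtype)
```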
4 changes: 2 additions & 2 deletions examples/experiments/deepseek_v3_pretrain/run_pretrain.py
@@ -509,9 +509,9 @@ def main():
# config.using_flex_token = True
# config.num_nextn_predict_layers = 1
# config.using_fake_gate = True
- # config.use_fused_rms_norm = True
+ # config.fuse_rms_norm = True
# config.fuse_attention_ffn = True
- # config.use_fused_rope = True
+ # config.apply_rope_fusion = True
# config.token_drop_steps = 0
model = model_class.from_config(config, dtype=dtype)

@@ -32,7 +32,7 @@
"fuse_attn_ffn": true,
"fuse_linear": true,
"rope_reorder": false,
"fuse_rope": true,
"apply_rope_fusion": true,
"fuse_swiglu": true,
"fuse_gate_detach_matmul": true,
"remove_tail_layer": 2,
@@ -31,7 +31,7 @@
"fuse_attn_ffn": true,
"fuse_linear": true,
"rope_reorder": false,
"fuse_rope": true,
"apply_rope_fusion": true,
"fuse_swiglu": true,
"fuse_gate_detach_matmul": true,
"remove_tail_layer": 2,
@@ -99,7 +99,7 @@ def __init__(
expert_mlp_use_bias=None,
rope_reorder=True,
rope_theta=10000,
- fuse_rope=False,
+ apply_rope_fusion=False,
use_fast_ln=False,
weight_share_add_bias=True,
fuse_linear=False,
@@ -230,7 +230,7 @@ def __init__(
self.weight_share_add_bias = weight_share_add_bias
self.rope_reorder = rope_reorder
self.rope_theta = rope_theta
- self.fuse_rope = fuse_rope
+ self.apply_rope_fusion = apply_rope_fusion
self.use_fast_ln = use_fast_ln

self.fuse_linear = fuse_linear
8 changes: 4 additions & 4 deletions examples/experiments/ernie_pretrain/models/ernie/modeling.py
@@ -847,12 +847,12 @@ def __init__(self, config, layer_idx=0):
self.use_recompute_attn = config.use_recompute_attn
logger.info(f"using recompute attn={self.use_recompute_attn}")
self.is_gqa = config.num_key_value_heads is not None and config.num_key_value_heads != self.num_heads
- if config.fuse_rope:
+ if config.apply_rope_fusion:
assert fused_rope is not None, "fused_rope is not supported"
- self.fuse_rope = config.fuse_rope
+ self.apply_rope_fusion = config.apply_rope_fusion
self.rope_3d = config.rope_3d
if self.rope_3d:
- assert not self.fuse_rope, "does not support fuse rope when rope_3d is on for now."
+ assert not self.apply_rope_fusion, "does not support fuse rope when rope_3d is on for now."
assert not config.rope_reorder, "does not support rope_reorder when rope_3d is on for now."
assert config.freq_allocation is not None, "freq_allocation must be provided if rope_3d is on."

@@ -1135,7 +1135,7 @@ def rope_attn(
offset=offset if position_ids is None else 0,
)
else:
- if offset > 0 or position_ids is not None or not self.fuse_rope:
+ if offset > 0 or position_ids is not None or not self.apply_rope_fusion:
if not self.rope_3d:
cos_sin = self.rotary_emb(kv_seq_len, position_ids).transpose([0, 2, 1, 3])
if offset > 0 and position_ids is None:
1 change: 0 additions & 1 deletion paddleformers/cli/hparams/model_args.py
@@ -80,7 +80,6 @@ class ModelArguments:
default=False,
metadata={"help": "Whether to fuse softmax and add"},
)
- fuse_rms_norm: bool = field(default=True, metadata={"help": "Whether to fuse RMSNorm for efficiency"})
use_fast_layer_norm: bool = field(
default=False,
metadata={"help": "GPT3 model, use fast layernorm"},
4 changes: 2 additions & 2 deletions paddleformers/cli/train/pretrain/workflow.py
@@ -514,9 +514,9 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args):
# config.using_flex_token = True
# config.num_nextn_predict_layers = 1
# config.using_fake_gate = True
- # config.use_fused_rms_norm = True
+ # config.fuse_rms_norm = True
# config.fuse_attention_ffn = True
- # config.use_fused_rope = True
+ # config.apply_rope_fusion = True
# config.token_drop_steps = 0
model = model_class.from_config(config, dtype=dtype)

4 changes: 2 additions & 2 deletions paddleformers/nn/pp_model.py
@@ -267,7 +267,7 @@ def forward(self, args):
)
input_ids.stop_gradient = True
emb = self.embed_tokens(input_ids).astype(self.embed_tokens.weight.dtype)
- if position_ids is None and not self.config.fuse_rope:
+ if position_ids is None and not self.config.apply_rope_fusion:
position_ids = (
paddle.arange(
0,
@@ -277,7 +277,7 @@
.unsqueeze(0)
.tile([input_ids.shape[0], 1])
)
- if self.config.fuse_rope:
+ if self.config.apply_rope_fusion:
position_embeddings = None
else:
position_embeddings = paddle.stack(self.rotary_emb(emb, position_ids)) # cos and sin
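In `pp_model.py`, the renamed `config.apply_rope_fusion` decides whether default `position_ids` and precomputed cos/sin are built at all. A condensed sketch of that gating (assuming, as the code above suggests, that the fused path computes rotary embeddings inside the kernel and therefore skips both):

```python
import paddle

def maybe_build_position_ids(input_ids, apply_rope_fusion: bool):
    # When RoPE fusion is enabled, position_ids / cos-sin stay None and are
    # handled by the fused kernel; otherwise build default 0..seq_len-1 ids.
    if apply_rope_fusion:
        return None
    batch_size, seq_len = input_ids.shape
    return paddle.arange(0, seq_len, dtype="int64").unsqueeze(0).tile([batch_size, 1])
```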
9 changes: 3 additions & 6 deletions paddleformers/transformers/configuration_utils.py
@@ -229,14 +229,11 @@ def llmmetaclass(cls):
class LlmMetaConfig:
op_fusion_attributes = [
# name, type, default_value, comment
("use_flash_attention", bool, False, "Whether to use flash attention to accelerate training."),
("use_fused_rms_norm", bool, False, "llama or other model, use_fused_rms_norm"),
("use_fused_rope", bool, False, "Enable rope fusion or not."),
("use_fused_linear", bool, False, "GPT3 model, use fused linear layer"),
("use_fused_dropout_add", bool, False, "GPT3 model, use fused `dropout + residual add` op."),
("use_flash_attention", bool, False, "Only used in `ernie45_vl` and `deepseek_v3_pretrain`."),
("fuse_rms_norm", bool, True, "Whether to fuse RMSNorm for efficiency"),
("use_fused_linear_cross_entropy", bool, False, "use fused `linear + cross_entropy` fuse op."),
("fuse_linear", bool, False, "Use fused linear layer instead of normal linear layer."),
("fuse_rope", bool, False, "Whether to fuse RoPE operation"),
("apply_rope_fusion", bool, False, "Whether to fuse RoPE operation"),
("fuse_swiglu", bool, False, "Whether to fuse SwiGLU operations"),
("fuse_attention_qkv", bool, False, "Whether to fuse Attention QKV operations"),
("fuse_attention_ffn", bool, False, "Whether to fuse Attention FFN operations"),
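The `LlmMetaConfig` table above consolidates the fusion switches: `use_fused_rms_norm`, `use_fused_rope`, `fuse_rope`, `use_fused_linear`, and `use_fused_dropout_add` are dropped, `fuse_rms_norm` (default `True`) and `apply_rope_fusion` (default `False`) replace them, and `use_flash_attention` is narrowed to `ernie45_vl` and `deepseek_v3_pretrain`. An illustrative mapping of caller-side overrides to the new names (with `SimpleNamespace` standing in for a real config object):

```python
from types import SimpleNamespace

config = SimpleNamespace()          # stand-in for the model's config object
config.fuse_rms_norm = True         # was: config.use_fused_rms_norm
config.apply_rope_fusion = True     # was: config.use_fused_rope / config.fuse_rope
config.fuse_attention_ffn = True    # unchanged by this PR

assert config.apply_rope_fusion
```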
10 changes: 5 additions & 5 deletions paddleformers/transformers/deepseek_v3/modeling.py
@@ -201,7 +201,7 @@ def rotate_half(x):
return paddle.cat([-x2, x1], axis=-1) # shape is the same as x


- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, fuse_rope=False):
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, apply_rope_fusion=False):
"""Applies Rotary Position Embedding to the query and key tensors.

Args:
@@ -535,7 +535,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int):
self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

self.is_causal = True
- self.fuse_rope = config.use_fused_rope
+ self.apply_rope_fusion = config.apply_rope_fusion

self.seq_length = config.seq_length
self.tensor_parallel = config.tensor_model_parallel_size > 1
@@ -702,7 +702,7 @@ def forward(
cos, sin = position_embeddings[0], position_embeddings[1]
cos = cos[None, :, None, :]
sin = sin[None, :, None, :]
- q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.fuse_rope)
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.apply_rope_fusion)
query_states = paddle.cat([q_nope, q_pe], axis=-1)
key_states = paddle.cat([k_nope, k_pe], axis=-1)

@@ -1551,7 +1551,7 @@ def forward(
if past_key_values is not None:
seq_length_with_past += past_key_values_length

- if position_ids is None and not self.config.fuse_rope:
+ if position_ids is None and not self.config.apply_rope_fusion:
position_ids = (
paddle.arange(
0,
@@ -2234,7 +2234,7 @@ def forward(self, args):
)
attn_mask = attn_mask_startend_row_indices if attn_mask_startend_row_indices is not None else attn_mask

- if position_ids is None and not self.config.fuse_rope:
+ if position_ids is None and not self.config.apply_rope_fusion:
position_ids = (
paddle.arange(
0,
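In the DeepseekV3 attention path, `apply_rotary_pos_emb(..., apply_rope_fusion=...)` keeps the same two branches as before the rename: a fused rotary op when the flag is set and an eager rotate-half formulation otherwise. A minimal eager-path sketch (an assumed simplification of this file's code):

```python
import paddle

def rotate_half(x):
    # Split the last dimension in half and rotate: (x1, x2) -> (-x2, x1).
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return paddle.concat([-x2, x1], axis=-1)

def apply_rotary_pos_emb_eager(q, k, cos, sin):
    # Eager path used when apply_rope_fusion is False.
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```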
9 changes: 3 additions & 6 deletions paddleformers/transformers/ernie4_5/configuration.py
@@ -40,7 +40,6 @@ def __init__(
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=False,
- use_flash_attention=False,
recompute=False,
recompute_granularity="core_attn",
recompute_use_reentrant=False,
@@ -50,7 +49,7 @@
eos_token_id=2,
use_bias=False,
rope_theta=10000,
- fuse_rope=False,
+ apply_rope_fusion=False,
fuse_softmax_mask=False,
fuse_linear=False,
max_sequence_length=None,
@@ -77,7 +76,6 @@
num_attention_heads (int): Number of attention heads for each attention layer
rms_norm_eps (float): The epsilon used by the RMS normalization layers
use_cache (bool): Whether to use caching for faster generation (decoding)
- use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
recompute (bool): Whether to use gradient checkpointing to save memory
recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.)
recompute_use_reentrant (bool): Whether to use reentrant checkpointing
@@ -89,7 +87,7 @@
eos_token_id (int): Token ID used for end-of-sequence
use_bias (bool): Whether to use bias terms in linear layers
rope_theta (float): The base period of the RoPE embeddings
- fuse_rope (bool): Whether to fuse RoPE operations
+ apply_rope_fusion (bool): Whether to fuse RoPE operations
fuse_linear (bool): Whether to fuse linear operations
fuse_up_gate (bool): Whether to fuse up_proj and gate_proj to a single linear layer
max_sequence_length (int): Maximum sequence length for positional embeddings
@@ -123,7 +121,6 @@ def __init__(
self.use_cache = use_cache
self.recompute = recompute
self.recompute_granularity = recompute_granularity
- self.use_flash_attention = use_flash_attention
self.recompute_use_reentrant = recompute_use_reentrant
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
@@ -134,7 +131,7 @@
self.use_bias = use_bias
self.rope_theta = rope_theta
self.tie_word_embeddings = tie_word_embeddings
- self.fuse_rope = fuse_rope
+ self.apply_rope_fusion = apply_rope_fusion
self.fuse_softmax_mask = fuse_softmax_mask
self.fuse_linear = fuse_linear
self.ignored_index = ignored_index
6 changes: 3 additions & 3 deletions paddleformers/transformers/ernie4_5/modeling.py
@@ -296,7 +296,7 @@ def forward(

attention_interface = ALL_ATTENTION_FUNCTIONS[self.attn_implementation]

- if self.config.fuse_rope:
+ if self.config.apply_rope_fusion:
query_states, key_states = apply_fused_rope(query_states, key_states, self.config.rope_theta)
else:
cos, sin = position_embeddings
@@ -668,7 +668,7 @@ def forward(
if position_ids is None:
position_ids = paddle.arange(kv_seq_len, seq_length).unsqueeze(0).tile((bsz, 1))

- if not self.config.fuse_rope:
+ if not self.config.apply_rope_fusion:
position_embeddings = self.rotary_emb(hidden_states, position_ids) # cos and sin
else:
position_embeddings = None
@@ -768,7 +768,7 @@ def __init__(self, config):

def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id):
"""Avoid using attention_mask with flash_attn on generation."""
- if self.config.use_flash_attention:
+ if self.config._attn_implementation == "sdpa":
return None
return super().prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)

3 changes: 0 additions & 3 deletions paddleformers/transformers/ernie4_5_moe/configuration.py
@@ -46,7 +46,6 @@ def __init__(
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=False,
- use_flash_attention=True,
use_rmsnorm=True,
pad_token_id=0,
bos_token_id=1,
@@ -110,7 +109,6 @@ def __init__(
hidden_act (str): Name of the activation function used in the feed-forward network
rms_norm_eps (float): The epsilon used by the RMS normalization layers
use_cache (bool): Whether to use caching for faster generation (decoding)
- use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
recompute (bool): Whether to use gradient checkpointing to save memory
recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.)
recompute_use_reentrant (bool): Whether to use reentrant checkpointing
@@ -187,7 +185,6 @@ def __init__(
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.use_flash_attention = use_flash_attention
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
4 changes: 2 additions & 2 deletions paddleformers/transformers/ernie4_5_moe/modeling.py
@@ -907,7 +907,7 @@ def forward(

hidden_states = inputs_embeds

- if self.config.fuse_rope:
+ if self.config.apply_rope_fusion:
position_embeddings = None
else:
position_embeddings = self.rotary_emb(hidden_states, position_ids) # cos and sin
@@ -1111,7 +1111,7 @@ def __init__(self, config):

def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id):
"""Avoid using attention_mask with flash_attn on generation."""
- if self.config.use_flash_attention:
+ if self.config._attn_implementation == "sdpa":
return None
return super().prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
