Commit 8597e7d

Small updates on MoE configuration
1 parent ac74931 commit 8597e7d

File tree

3 files changed: +7 -7 lines changed


docs/reference/core_concepts/moe_configuration.md

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ Dropping:
 `first_num_dense_layers`: The number of initial dense layers before the first MoE layer is introduced.

-`float32_weight_sum`: If enabled, performs the summation of expert weights using float32 precision for improved numerical stability.
+`float32_weight_sum`: If enabled, performs the summation of expert weights using float32 precision for improved numerical stability. Recommended specifically when lower precision types cause convergence or quality issues.

 ### Routing Mechanism

 `use_random_routing`: If enabled, ignores the gate logits and routes tokens to random experts. This is designed to simulate load balancing for debugging and performance testing purposes.
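
The `float32_weight_sum` behavior described above can be illustrated with a minimal sketch (not the MaxText implementation; the function name, argument names, and shapes are assumptions): the per-expert outputs for each token are combined with the router weights, and the weighted sum is accumulated in float32 before casting back.

```python
import jax.numpy as jnp

def combine_expert_outputs(expert_outputs, router_weights, float32_weight_sum=True):
  """Weighted sum of per-expert outputs for each token (illustrative only).

  expert_outputs: [tokens, experts_per_token, hidden], e.g. bfloat16 activations.
  router_weights: [tokens, experts_per_token] gate weights for the chosen experts.
  """
  if float32_weight_sum:
    # Accumulate in float32 for numerical stability, then cast back to the
    # activation dtype.
    out = jnp.einsum(
        "teh,te->th",
        expert_outputs.astype(jnp.float32),
        router_weights.astype(jnp.float32),
    )
    return out.astype(expert_outputs.dtype)
  return jnp.einsum("teh,te->th", expert_outputs, router_weights)
```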
@@ -80,9 +80,9 @@ Dropping:
 * Value > 0: Enforces a strict capacity limit; tokens exceeding this limit are dropped.
 * Value = -1: Dropless with dense matrix multiplication, which is computationally expensive and typically used only as a baseline.

-`use_custom_sort_vjp`: If enabled, use a custom Vector-Jacobian Product (VJP) sort for efficient backward pass processing in sparse matmul.
+`use_custom_sort_vjp`: If enabled, use a custom Vector-Jacobian Product (VJP) sort for efficient backward pass processing in sparse matmul. Recommended to replace the inefficient scatter-add generated by `jax.numpy.take` in the backward pass.

-`mlp_bias`: If enabled, add bias terms within the expert MLP layers.
+`mlp_bias`: If enabled, add learnable bias terms to the expert MLP matmuls. Originally implemented to support the GPT-OSS model architecture.

 `use_batch_split_schedule` (experimental): If enabled, split batch into micro-batches to hide communications.
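
The `use_custom_sort_vjp` note above concerns how tokens are gathered into expert order for the sparse matmul. A rough sketch of the idea, assuming the gather indices form a permutation (`sort_activations` and the shapes are illustrative, not the actual MaxText kernel): the backward pass can itself be a gather with the inverse permutation instead of the scatter-add that `jax.numpy.take`'s default gradient rule produces.

```python
import jax
import jax.numpy as jnp

def sort_activations(x, sort_indices):
  """Gather rows of x into expert-sorted order, with a sort-based backward rule."""

  @jax.custom_vjp
  def gather(x):
    return jnp.take(x, sort_indices, axis=0)

  def gather_fwd(x):
    return gather(x), None  # no residuals needed; sort_indices is closed over

  def gather_bwd(_, g):
    # sort_indices is a permutation, so its inverse (argsort) lets the backward
    # pass be another cheap gather rather than a scatter-add.
    return (jnp.take(g, jnp.argsort(sort_indices), axis=0),)

  gather.defvjp(gather_fwd, gather_bwd)
  return gather(x)
```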

src/MaxText/configs/base.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ logits_dot_in_fp32: false # whether to use fp32 in logits_dense or shared_embed
 cast_logits_to_fp32: true # whether to cast the logits to fp32. the higher precision is generally beneficial, but it can vary slightly.
 float32_qk_product: false # in dot_product attention, whether to cast to fp32 the inputs to qk product
 float32_logits: false # in dot_product attention, whether to cast to fp32 the inputs to softmax
-float32_weight_sum: true # whether to use full fp32 precision for weight_sum during final unpermute in moe
+float32_weight_sum: true # whether to use full fp32 precision to sum expert weights for numerical stability

 # multi-token prediction configs
 # the number of auxiliary prediction layers to use for mtp.
@@ -179,7 +179,7 @@ sparse_matmul: true
 capacity_factor: -1.0 # a factor to decide expert capacity for token dropping, and no dropping by default
 load_balance_loss_weight: 0.01 # weight for the load balance loss
 use_random_routing: false # whether to use random routing for debug/test purpose
-use_custom_sort_vjp: true # whether to use a custom sort vjp for sparse matmul ops
+use_custom_sort_vjp: true # whether to use a custom VJP sort for efficient backward pass processing in sparse matmul
 use_ring_of_experts: false # whether to use ring of experts for sparse matmul expert parallelism
 # tunable tiling dimensions used for mlp gmm
 # megablox/jax ragged dot - supports forward pass only (6 configs: `wi_tile_fwd...` and `wo_tile_fwd_...`)
@@ -212,7 +212,7 @@ expert_shard_attention_option: "fsdp"
 # when moe weight matrices are sharded on both fsdp and fsdp-transpose axes, use two separate all-gather calls
 moe_fsdp_use_two_stage_all_gather: false
-# shard the moe weights on num_expert_dim. this can be performanct when num_expert % fdsp_parallisum
+# shard the expert dimension of the MoE MLP weights on the fsdp axis; recommended when num_experts is a multiple of fsdp_parallelism
 fsdp_shard_on_exp: False
 # use fsdp and fsdp_transpose axes for sharding the moe weights
 use_2d_fsdp_sharding: False
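
As an illustration of the `fsdp_shard_on_exp` comment above, the sketch below shows what sharding the expert dimension on an FSDP mesh axis looks like in plain JAX; the mesh layout, axis name, and weight shapes are assumptions, not MaxText's actual sharding setup.

```python
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# One-dimensional "fsdp" mesh over all available devices (illustrative).
mesh = Mesh(np.array(jax.devices()), axis_names=("fsdp",))

# Expert MLP weight: [num_experts, emb_dim, mlp_dim]; assumes num_experts is a
# multiple of the fsdp axis size, as the config comment recommends.
num_experts, emb_dim, mlp_dim = 8, 1024, 4096
wi = jnp.zeros((num_experts, emb_dim, mlp_dim), dtype=jnp.bfloat16)

# fsdp_shard_on_exp-style layout: split the leading expert dimension across the
# fsdp axis and keep each expert's matrix unsharded on the other dimensions.
wi = jax.device_put(wi, NamedSharding(mesh, P("fsdp", None, None)))
```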

src/MaxText/configs/types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ class MoEGeneral(BaseModel):
   )
   float32_weight_sum: bool = Field(
       True,
-      description="Whether to use full fp32 precision for weight_sum during final unpermute in MoE.",
+      description="Whether to use full fp32 precision to sum expert weights for numerical stability.",
   )
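
As a small, self-contained illustration of how a `Field` default like this behaves, here is a sketch using a simplified stand-in for `MoEGeneral` (assuming Pydantic is installed; this is not MaxText's config loader):

```python
from pydantic import BaseModel, Field

class MoEGeneralSketch(BaseModel):
  """Simplified stand-in for the MoEGeneral config model."""
  float32_weight_sum: bool = Field(
      True,
      description="Whether to use full fp32 precision to sum expert weights for numerical stability.",
  )

print(MoEGeneralSketch().float32_weight_sum)                          # True (default)
print(MoEGeneralSketch(float32_weight_sum=False).float32_weight_sum)  # False (explicit override)
```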