Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ NVIDIA Model Optimizer Changelog (Linux)
- Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
- Add support for rotating the input before quantization for RHT.
- Add support for advanced weight scale search for NVFP4 quantization and its export path.
- Enable PTQ workflow for Qwen3.5 MoE models.

0.42 (2026-02-xx)
^^^^^^^^^^^^^^^^^
Expand Down
3 changes: 2 additions & 1 deletion examples/llm_ptq/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
| Llama-Nemotron Ultra | ✅ | ❌ | ❌ | ❌ | ❌ |
| Gemma 3 | ✅<sup>2</sup> | - | ✅ | - | - |
| QWen 2, 2.5 <sup>4</sup> | ✅ | ✅ | ✅ | ✅ | ✅ |
| QWen3 MOE, Next <sup>6</sup> | ✅ | - | - | - | ✅ |
| QWen3, 3.5 MOE, Next <sup>6</sup> | ✅ | - | - | - | ✅ |
| QwQ | ✅ | - | - | - | ✅ |
| DeepSeek V3, R1, V3.1, V3.2<sup>7</sup> | - | - | - | - | ✅ |
| GLM-4.7<sup>8</sup> | ✅ | - | - | - | ✅ |
Expand Down Expand Up @@ -402,6 +402,7 @@ print(llm_fp8.generate(["What's the age of the earth? "]))
| QWen3 | FP4 | ✅ | ✅ | - |
| QWen3 MoE | FP8 | ✅ | ✅ | ✅ |
| QWen3 MoE | FP4 | ✅ | - | - |
| QWen3.5 MoE | FP4 | - | - | ✅ |
| QWen2.5 | FP8 | ✅ | ✅ | ✅ |
| QWen2.5 | FP4 | ✅ | ✅ | - |
| QwQ-32B | FP8 | ✅ | ✅ | ✅ |
Expand Down
9 changes: 6 additions & 3 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,16 +650,19 @@ def export_quantized(
extra_state_dict=mtp_state_dict,
)

# Copy custom model files (Python files and JSON configs) if trust_remote_code is used
copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)

# Restore default padding and export the tokenizer as well.
if tokenizer is not None:
tokenizer.padding_side = default_padding_side
if default_pad_token is not None:
tokenizer.pad_token = default_pad_token
tokenizer.save_pretrained(export_path)

# Copy custom model files (Python files and JSON configs) if trust_remote_code is used.
# This must run AFTER tokenizer.save_pretrained() so original tokenizer files
# from the source checkpoint take precedence over regenerated ones (which may
# differ in format due to newer transformers versions).
copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)

end_time = time.time()
print(
f"Quantized model exported to: {export_path}. Total time used {end_time - start_time}s"
Expand Down
28 changes: 12 additions & 16 deletions modelopt/torch/export/layer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,20 +327,12 @@ def is_mlp(module: nn.Module) -> bool:

def is_moe(module: nn.Module) -> bool:
    """Returns whether the module is an MOE layer."""
    cls_name = type(module).__name__.lower()
    # Common naming conventions: "*SparseMoeBlock" suffixes and "*MoELayer*"
    # names cover Mixtral, Phi-MoE, and the Qwen MoE families without having
    # to enumerate each architecture explicitly.
    if cls_name.endswith("sparsemoeblock"):
        return True
    if "moelayer" in cls_name:
        return True
    # Architectures whose MoE block follows neither convention.
    for marker in ("arcticmoe", "deepseekmoe", "dbrxffn"):
        if marker in cls_name:
            return True
    return False


def is_quantlinear(module: nn.Module) -> bool:
Expand Down Expand Up @@ -1006,6 +998,7 @@ def module_match_name_list(module, name_list):
"Qwen2MoeSparseMoeBlock",
"Qwen3MoeSparseMoeBlock",
"Qwen3NextSparseMoeBlock",
"Qwen3_5MoeSparseMoeBlock",
"DeepseekMoE",
],
):
Expand Down Expand Up @@ -1141,7 +1134,10 @@ def set_expert_quantizer_amax(
# Apply target amax to quantizers that need it
for module, attr_name, quantizer in all_quantizers:
# Check if quantizer needs amax (use property for consistency)
needs_amax = getattr(quantizer, "amax", None) is None
# Also treat zero amax as needing recalibration — a zero amax is never valid
# and indicates the quantizer wasn't activated during calibration
amax = getattr(quantizer, "amax", None)
needs_amax = amax is None or (isinstance(amax, torch.Tensor) and torch.all(amax == 0))

# Skip dynamic quantizers for input quantizers
if "input_quantizer" in attr_name and getattr(quantizer, "_dynamic", False):
Expand Down Expand Up @@ -1747,7 +1743,7 @@ def _split_fused_qkv_weight_and_scaling(

qkv_in = weight.shape[-1] if weight_dim > 1 else 1

num_kv_heads = num_kv_heads if num_kv_heads else num_heads
num_kv_heads = num_kv_heads or num_heads
assert num_heads % num_kv_heads == 0, (
f"num_heads({num_heads}) must be divisible by num_kv_heads({num_kv_heads}))."
)
Expand Down
74 changes: 66 additions & 8 deletions modelopt/torch/export/unified_export_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ def _process_quantized_modules(
"""
fsdp_module_to_reshard = None

for _, sub_module in model.named_modules():
for name, sub_module in model.named_modules():
# Optimization to perform resharding only once per decoder layer to avoid extra communication overhead
if isinstance(sub_module, FSDPModule):
# Every time we encounter a new FSDPModule, the previous decoder layer is fully processed.
Expand All @@ -610,8 +610,13 @@ def _process_quantized_modules(
sub_module.unpack_weight()
if get_quantization_format(sub_module) != QUANTIZATION_NONE:
if is_quantlinear(sub_module):
with fsdp2_aware_weight_update(model, sub_module, reshard=False):
_export_quantized_weight(sub_module, dtype)
try:
with fsdp2_aware_weight_update(model, sub_module, reshard=False):
_export_quantized_weight(sub_module, dtype)
except AssertionError as e:
raise AssertionError(
f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}"
) from e
elif (
"Llama4TextExperts" in type(sub_module).__name__
or "GptOssExperts" in type(sub_module).__name__
Expand Down Expand Up @@ -988,6 +993,50 @@ def _export_diffusers_checkpoint(
print(f"Export complete. Saved to: {export_dir}")


# TODO: Remove this workaround once HuggingFace fixes revert_weight_conversion to handle
# scalar (0-d) tensors. The bug is in transformers' Chunk.convert() which calls
# tensor.size(self.dim) on quantization scale buffers that are 0-d scalars, causing
# IndexError. Confirmed still present in transformers 5.2.0.
# See: transformers/core_model_loading.py, Chunk.convert()
def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict:
"""No-op replacement for transformers' revert_weight_conversion."""
return state_dict


def _try_patch_module(mod_path: str) -> tuple[Any, Any] | None:
"""Try to patch revert_weight_conversion in a single module."""
import importlib

try:
mod = importlib.import_module(mod_path)
if hasattr(mod, "revert_weight_conversion"):
original = getattr(mod, "revert_weight_conversion")
setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop)
return (mod, original)
except (ImportError, AttributeError):
pass
return None


def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]:
    """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors.

    Returns:
        A list of ``(module, original_function)`` pairs for every module that
        was successfully patched, suitable for passing to
        ``_unpatch_revert_weight_conversion``.
    """
    # Both the defining module and the re-importing module must be patched,
    # since modeling_utils binds the function name at import time.
    target_modules = (
        "transformers.core_model_loading",
        "transformers.modeling_utils",
    )
    attempts = (_try_patch_module(path) for path in target_modules)
    return [patch for patch in attempts if patch is not None]


def _unpatch_revert_weight_conversion(patches: list[tuple[Any, Any]]) -> None:
"""Restore the original revert_weight_conversion functions."""
for mod, original in patches:
mod.revert_weight_conversion = original


def export_hf_checkpoint(
model: Any,
dtype: torch.dtype | None = None,
Expand Down Expand Up @@ -1047,11 +1096,20 @@ def export_hf_checkpoint(
model.hf_quantizer = None

# Save model
model.save_pretrained(
export_dir,
state_dict={**post_state_dict, **(extra_state_dict or {})},
save_modelopt_state=save_modelopt_state,
)
# Temporarily disable revert_weight_conversion if available — it doesn't handle
# quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
# We must patch both the source module and the importing module since
# modeling_utils does `from core_model_loading import revert_weight_conversion`.
_patches = _patch_revert_weight_conversion()

try:
model.save_pretrained(
export_dir,
state_dict={**post_state_dict, **(extra_state_dict or {})},
save_modelopt_state=save_modelopt_state,
)
finally:
_unpatch_revert_weight_conversion(_patches)

Comment on lines +1099 to 1113
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Serialize global patching to avoid cross-export races.

The patch/unpatch sequence mutates module globals process-wide. Concurrent exports can interleave and restore the wrong function, causing flaky behavior.

🔒 Proposed fix (serialize patch window)
+import threading
...
+_REVERT_WEIGHT_CONVERSION_PATCH_LOCK = threading.Lock()
...
-        _patches = _patch_revert_weight_conversion()
-
-        try:
-            model.save_pretrained(
-                export_dir,
-                state_dict={**post_state_dict, **(extra_state_dict or {})},
-                save_modelopt_state=save_modelopt_state,
-            )
-        finally:
-            _unpatch_revert_weight_conversion(_patches)
+        with _REVERT_WEIGHT_CONVERSION_PATCH_LOCK:
+            _patches = _patch_revert_weight_conversion()
+            try:
+                model.save_pretrained(
+                    export_dir,
+                    state_dict={**post_state_dict, **(extra_state_dict or {})},
+                    save_modelopt_state=save_modelopt_state,
+                )
+            finally:
+                _unpatch_revert_weight_conversion(_patches)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Temporarily disable revert_weight_conversion if available — it doesn't handle
# quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
# We must patch both the source module and the importing module since
# modeling_utils does `from core_model_loading import revert_weight_conversion`.
_patches = _patch_revert_weight_conversion()
try:
model.save_pretrained(
export_dir,
state_dict={**post_state_dict, **(extra_state_dict or {})},
save_modelopt_state=save_modelopt_state,
)
finally:
_unpatch_revert_weight_conversion(_patches)
# Temporarily disable revert_weight_conversion if available — it doesn't handle
# quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
# We must patch both the source module and the importing module since
# modeling_utils does `from core_model_loading import revert_weight_conversion`.
with _REVERT_WEIGHT_CONVERSION_PATCH_LOCK:
_patches = _patch_revert_weight_conversion()
try:
model.save_pretrained(
export_dir,
state_dict={**post_state_dict, **(extra_state_dict or {})},
save_modelopt_state=save_modelopt_state,
)
finally:
_unpatch_revert_weight_conversion(_patches)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/export/unified_export_hf.py` around lines 1060 - 1074, The
patch/unpatch sequence around
_patch_revert_weight_conversion/_unpatch_revert_weight_conversion mutates
process globals and must be serialized to avoid cross-export races; add a
module-level lock (e.g., threading.RLock) and acquire it before calling
_patch_revert_weight_conversion, keep it held across model.save_pretrained(...)
and the finally block, then release after _unpatch_revert_weight_conversion so
only one export at a time can patch globals; update any helper initialization to
use the new lock and ensure exceptions still trigger unpatch+release.

original_config = f"{export_dir}/config.json"
config_data = {}
Expand Down
115 changes: 115 additions & 0 deletions modelopt/torch/quantization/plugins/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,107 @@ def forward(
return next_states


class _Qwen35MoeExpertModule(nn.Module):
"""Container for a single Qwen3.5 MoE expert's linear layers.

Produces the naming pattern: experts.{id}.gate_proj.weight
(consistent with standard Qwen3 MoE per-expert module structure).
"""

def __init__(self, hidden_dim: int, expert_dim: int):
super().__init__()
self.gate_proj = nn.Linear(hidden_dim, expert_dim, bias=False)
self.up_proj = nn.Linear(hidden_dim, expert_dim, bias=False)
self.down_proj = nn.Linear(expert_dim, hidden_dim, bias=False)


class _QuantQwen35MoeExperts(QuantModule):
    """Quantized wrapper for Qwen3.5 MoE fused experts.

    Splits the fused 3-D expert weight tensors into per-expert
    ``_Qwen35MoeExpertModule`` children so each expert's projections can be
    quantized and exported individually.
    """

    def _setup(self):
        """Modify the Qwen3_5MoeExperts by using per-expert nn.Module containers.

        This produces the naming pattern: experts.{id}.gate_proj.weight
        (consistent with standard Qwen3 MoE).
        """
        from accelerate import init_empty_weights

        dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device

        def _copy_weight(module, weight):
            # Materialize meta-initialized parameters, then copy the slice of
            # the fused weight into the per-expert linear layer.
            module.to_empty(device=device)
            with torch.no_grad():
                module.weight.data = weight.detach().data.to(dtype=dtype, device=device)

        # Qwen3.5 exposes `intermediate_dim`; fall back to `intermediate_size`
        # for model variants using the more common HF attribute name (same
        # fallback as the Qwen3-VL MoE wrapper).
        expert_dim = (
            self.intermediate_dim if hasattr(self, "intermediate_dim") else self.intermediate_size
        )
        with init_empty_weights():
            expert_modules = nn.ModuleList(
                [
                    _Qwen35MoeExpertModule(self.hidden_dim, expert_dim)
                    for _ in range(self.num_experts)
                ]
            )

        for idx in range(self.num_experts):
            # gate_up_proj shape: (num_experts, 2*intermediate_dim, hidden_dim)
            # Already in (out_features, in_features) format, no transpose needed
            _copy_weight(expert_modules[idx].gate_proj, self.gate_up_proj[idx, :expert_dim, :])
            _copy_weight(expert_modules[idx].up_proj, self.gate_up_proj[idx, expert_dim:, :])
            # down_proj shape: (num_experts, hidden_dim, intermediate_dim)
            # Already in (out_features, in_features) format
            _copy_weight(expert_modules[idx].down_proj, self.down_proj[idx])

        delattr(self, "gate_up_proj")
        delattr(self, "down_proj")
        # Register expert modules directly as numbered children (like nn.ModuleList)
        # so the naming pattern is: experts.{id}.gate_proj.weight (no extra nesting)
        for idx in range(self.num_experts):
            self.add_module(str(idx), expert_modules[idx])

    def __len__(self):
        """Support len() so the module is iterable like standard MoE experts."""
        return self.num_experts

    def __iter__(self):
        """Support iteration over expert modules."""
        for idx in range(self.num_experts):
            yield getattr(self, str(idx))

    def __getitem__(self, idx):
        """Support indexing to get individual expert modules."""
        return getattr(self, str(int(idx)))

    def forward(
        self,
        hidden_states: torch.Tensor,
        top_k_index: torch.Tensor,
        top_k_weights: torch.Tensor,
    ) -> torch.Tensor:
        """Route tokens through their selected experts and sum weighted outputs.

        Args:
            hidden_states: Token activations, one row per token.
            top_k_index: Selected expert ids per token.
            top_k_weights: Routing weights aligned with top_k_index.

        Returns:
            Tensor of the same shape as hidden_states with per-expert outputs
            accumulated via index_add_.
        """
        final_hidden_states = torch.zeros_like(hidden_states)
        with torch.no_grad():
            # Mask of which tokens hit which expert; only experts with at
            # least one routed token are processed below.
            expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
            expert_mask = expert_mask.permute(2, 1, 0)
            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
        for expert_idx in expert_hit:
            expert_idx = expert_idx[0]
            # Defensive guard against an out-of-range (padding) expert id;
            # one_hot indices are < num_experts, so normally never taken.
            if expert_idx == self.num_experts:
                continue
            with torch.no_grad():
                top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
            current_state = hidden_states[token_idx]
            expert = self[expert_idx]
            gate = expert.gate_proj(current_state)
            up = expert.up_proj(current_state)
            current_hidden_states = self.act_fn(gate) * up
            current_hidden_states = expert.down_proj(current_hidden_states)
            current_hidden_states = (
                current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
            )
            final_hidden_states.index_add_(
                0, token_idx, current_hidden_states.to(final_hidden_states.dtype)
            )
        return final_hidden_states


class _QuantDbrxFFN(_QuantSparseMoe):
@property
def num_experts(self):
Expand Down Expand Up @@ -882,6 +983,20 @@ def unpack_weight(self):
pass


try:
    # Qwen3.5 MoE modeling code only exists in sufficiently new transformers
    # releases; on older installs the import fails and registration is skipped.
    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeExperts

    # Qwen3_5MoeSparseMoeBlock registration is handled by register_sparse_moe_on_the_fly
    # (auto-detected via gate.top_k + gate.num_experts + experts pattern).
    # Only the fused expert weights need explicit registration.
    if Qwen3_5MoeExperts not in QuantModuleRegistry:
        QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})(
            _QuantQwen35MoeExperts
        )
except ImportError:
    pass


class _QuantGptOssExperts(_QuantFunctionalMixin):
"""Quantized wrapper for `transformers.GptOssExperts`.

Expand Down
4 changes: 2 additions & 2 deletions modelopt/torch/utils/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def get_dataset_dataloader(
An instance of dataloader.
"""
assert tokenizer is not None, "Please provide a tokenizer."
# batch_encode_plus will modify the tokenizer in place, so we need to clone it.
# Tokenizer encoding may modify the tokenizer in place, so we need to clone it.
tokenizer = copy.deepcopy(tokenizer)

if tokenizer.padding_side != "left":
Expand All @@ -323,7 +323,7 @@ def get_dataset_dataloader(
)
all_samples.extend(samples)

batch_encoded = tokenizer.batch_encode_plus(
batch_encoded = tokenizer(
all_samples,
return_tensors="pt",
padding=True,
Expand Down