Commit fd5694d

Consolidate lm-eval scripts: merge AnyModel auto-detection into lm_eval_hf.py (#1206)
## Summary

- Merge `examples/puzzletron/evaluation/lm_eval_anymodel.py` into the existing `examples/llm_eval/lm_eval_hf.py` so there is a single evaluation entry point for both standard HF and AnyModel/Puzzletron checkpoints.
- AnyModel support is auto-detected at load time via `resolve_descriptor_from_pretrained`; the `puzzletron` extra is optional.

## Notes

AnyModel auto-detection uses `resolve_descriptor_from_pretrained`, which currently relies on a hardcoded `_MODEL_TYPE_TO_DESCRIPTOR` dict that must be kept in sync manually with descriptor registrations. This should be addressed in the future.

## Summary by CodeRabbit

- **New Features**
  - Automated detection and correct loading of Puzzletron heterogeneous pruned checkpoints via the main evaluation entrypoint.
- **Documentation**
  - Added a "Heterogeneous Pruned Checkpoints (Puzzletron)" subsection with install notes, example evaluation commands, and smoke-test guidance.
- **Chores**
  - Removed the separate Puzzletron evaluation script and consolidated evaluation into the primary lm-eval workflow.

Signed-off-by: jrausch <jrausch@nvidia.com>
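The Notes above flag that `_MODEL_TYPE_TO_DESCRIPTOR` is a hardcoded dict that must be kept in sync by hand. One common way to remove that manual step is a decorator-based registry that descriptor classes populate at definition time. The sketch below is purely illustrative — the names (`register_descriptor`, `LlamaDescriptor`, `resolve_descriptor`) are hypothetical and not the real Puzzletron API:

```python
# Hypothetical sketch: a self-populating registry instead of a hand-maintained
# dict, so registration and lookup can never drift apart.
_MODEL_TYPE_TO_DESCRIPTOR: dict[str, type] = {}


def register_descriptor(model_type: str):
    """Class decorator mapping an HF config model_type to its descriptor class."""

    def wrap(cls: type) -> type:
        _MODEL_TYPE_TO_DESCRIPTOR[model_type] = cls
        return cls

    return wrap


@register_descriptor("llama")
class LlamaDescriptor:
    """Stand-in descriptor; registration happens as a side effect of definition."""


def resolve_descriptor(model_type: str) -> type:
    try:
        return _MODEL_TYPE_TO_DESCRIPTOR[model_type]
    except KeyError:
        raise ValueError(f"No descriptor registered for model_type={model_type!r}")
```

With this pattern, adding a new descriptor class automatically makes it resolvable; there is no second table to update.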
1 parent 25266b8 commit fd5694d

5 files changed: 65 additions & 119 deletions

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -96,7 +96,6 @@ repos:
           examples/llm_eval/modeling.py|
           examples/llm_qat/main.py|
           examples/llm_sparsity/weight_sparsity/finetune.py|
-          examples/puzzletron/evaluation/lm_eval_anymodel.py|
           examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
           examples/speculative_decoding/main.py|
           examples/speculative_decoding/medusa_utils.py|
```

examples/llm_eval/README.md

Lines changed: 16 additions & 0 deletions
````diff
@@ -40,6 +40,22 @@ accelerate launch --multi_gpu --num_processes <num_copies_of_your_model> \
     --batch_size 4
 ```
 
+### Heterogeneous Pruned Checkpoints (Puzzletron)
+
+Heterogeneous pruned checkpoints produced by Puzzletron are automatically detected and loaded with the appropriate model patcher. No additional flags are needed beyond specifying the checkpoint path:
+
+```sh
+python lm_eval_hf.py --model hf \
+    --model_args pretrained=path/to/anymodel/checkpoint,dtype=bfloat16,parallelize=True \
+    --tasks mmlu \
+    --num_fewshot 5 \
+    --batch_size 4
+```
+
+For a quick smoke test, add `--limit 10`.
+
+> **Note:** Requires the `puzzletron` extra to be installed (`pip install -e ".[puzzletron]"`).
+
 ### Quantized (simulated)
 
 - For simulated quantization with any of the default quantization formats:
````

examples/llm_eval/lm_eval_hf.py

Lines changed: 48 additions & 2 deletions
```diff
@@ -36,6 +36,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 import warnings
 
 import datasets
@@ -50,9 +51,33 @@
 from modelopt.torch.quantization.utils import is_quantized
 from modelopt.torch.sparsity.attention_sparsity.conversion import is_attn_sparsified
 
+try:
+    import modelopt.torch.puzzletron.anymodel.models  # noqa: F401
+    from modelopt.torch.puzzletron.anymodel.model_descriptor.model_descriptor_factory import (
+        resolve_descriptor_from_pretrained,
+    )
+    from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
+
+    _ANYMODEL_AVAILABLE = True
+except ImportError:
+    _ANYMODEL_AVAILABLE = False
+
+
+def _anymodel_patcher_context(pretrained, trust_remote_code=False):
+    """Return a deci_x_patcher context if *pretrained* is a Puzzletron checkpoint, else a no-op."""
+    if not _ANYMODEL_AVAILABLE or not pretrained:
+        return contextlib.nullcontext()
+    try:
+        descriptor = resolve_descriptor_from_pretrained(
+            pretrained, trust_remote_code=trust_remote_code
+        )
+    except (ValueError, AttributeError):
+        return contextlib.nullcontext()
+    return deci_x_patcher(model_descriptor=descriptor)
+
 
 def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict | None = None) -> T:
-    """Overrides the HFLM.create_from_arg_obj"""
+    """Override HFLM.create_from_arg_obj to add quantization, sparsity, and Puzzletron support."""
 
     quant_cfg = arg_dict.pop("quant_cfg", None)
     auto_quantize_bits = arg_dict.pop("auto_quantize_bits", None)
@@ -72,7 +97,10 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
     # Enable automatic save/load of modelopt state huggingface checkpointing
     mto.enable_huggingface_checkpointing()
 
-    model_obj = cls(**arg_dict, **additional_config)
+    with _anymodel_patcher_context(
+        arg_dict.get("pretrained"), arg_dict.get("trust_remote_code", False)
+    ):
+        model_obj = cls(**arg_dict, **additional_config)
     model_obj.tokenizer.padding_side = "left"
     if is_quantized(model_obj.model):
         # return if model is already quantized
@@ -109,10 +137,28 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
     return model_obj
 
 
+def create_from_arg_string(
+    cls: type[T], arg_string: str, additional_config: dict | None = None
+) -> T:
+    """Override HFLM.create_from_arg_string to support Puzzletron checkpoints."""
+    args = utils.simple_parse_args_string(arg_string)
+    additional_config = {} if additional_config is None else additional_config
+    args2 = {k: v for k, v in additional_config.items() if v is not None}
+
+    mto.enable_huggingface_checkpointing()
+
+    with _anymodel_patcher_context(args.get("pretrained"), args.get("trust_remote_code", False)):
+        model_obj = cls(**args, **args2)
+
+    return model_obj
+
+
 HFLM.create_from_arg_obj = classmethod(create_from_arg_obj)
+HFLM.create_from_arg_string = classmethod(create_from_arg_string)
 
 
 def setup_parser_with_modelopt_args():
+    """Extend the lm-eval argument parser with ModelOpt quantization and sparsity options."""
     parser = setup_parser()
     parser.add_argument(
         "--quant_cfg",
```
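The core of this change is the `_anymodel_patcher_context` helper: attempt descriptor resolution, and fall back to a no-op context manager when the optional dependency is missing or the checkpoint is a plain HF model. The standalone sketch below demonstrates that pattern with stand-ins (`fake_resolve`, `fake_patcher`) in place of the real Puzzletron APIs:

```python
import contextlib


# fake_resolve stands in for resolve_descriptor_from_pretrained: it raises
# ValueError for anything that is not an AnyModel checkpoint.
def fake_resolve(pretrained: str) -> str:
    if pretrained == "path/to/anymodel":
        return "anymodel-descriptor"
    raise ValueError("not an AnyModel checkpoint")


# fake_patcher stands in for deci_x_patcher; the real one patches model
# classes while the checkpoint is being loaded.
@contextlib.contextmanager
def fake_patcher(descriptor: str):
    yield descriptor


def patcher_context(pretrained: str):
    """Return a patcher context for AnyModel checkpoints, else a no-op."""
    if not pretrained:
        return contextlib.nullcontext()
    try:
        descriptor = fake_resolve(pretrained)
    except (ValueError, AttributeError):
        # Plain HF checkpoint: load normally, no patching.
        return contextlib.nullcontext()
    return fake_patcher(descriptor)
```

Returning `contextlib.nullcontext()` on failure is what makes the consolidated script safe for ordinary HF models: the `with` statement in `create_from_arg_obj` runs either way.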

examples/puzzletron/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -235,7 +235,7 @@ The plot shows how token accuracy changes with different compression rates. High
 Evaluate AnyModel checkpoints using [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) directly.
 
 ```bash
-python examples/puzzletron/evaluation/lm_eval_anymodel.py \
+python examples/llm_eval/lm_eval_hf.py \
     --model hf \
     --model_args pretrained=path/to/checkpoint,dtype=bfloat16,parallelize=True \
     --tasks mmlu \
````
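The `--model_args` value in the command above is a comma-separated `key=value` string that lm-eval parses into keyword arguments (via `utils.simple_parse_args_string` in the script). The following is an illustrative re-implementation of that idea — not lm-eval's actual parser — showing how a string like `pretrained=...,dtype=bfloat16,parallelize=True` becomes a dict:

```python
def parse_model_args(arg_string: str) -> dict:
    """Illustrative parser: split 'k1=v1,k2=v2' into a dict, coercing booleans."""
    out = {}
    for pair in filter(None, arg_string.split(",")):
        key, _, value = pair.partition("=")
        # lm-eval coerces literal True/False; everything else stays a string here.
        if value in ("True", "False"):
            out[key.strip()] = value == "True"
        else:
            out[key.strip()] = value
    return out
```

This is why `parallelize=True` reaches the HF loader as a boolean while `dtype=bfloat16` arrives as a string.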

examples/puzzletron/evaluation/lm_eval_anymodel.py

Lines changed: 0 additions & 115 deletions
This file was deleted.
