diff --git a/cranelift/codegen/meta/src/pulley.rs b/cranelift/codegen/meta/src/pulley.rs
index 77e760eda7a3..ef8a91f33ab0 100644
--- a/cranelift/codegen/meta/src/pulley.rs
+++ b/cranelift/codegen/meta/src/pulley.rs
@@ -97,6 +97,13 @@ impl Inst<'_> {
// Skip special instructions not used in Cranelift.
"XPush32Many" | "XPush64Many" | "XPop32Many" | "XPop64Many" => true,
+ // Phase-3 fused dispatch op: 3 writable results would
+ // require extending the auto-codegen `results[..]` match
+ // arms below. The op is emitted only via the hand-written
+ // `MInst::BandFuncrefDispatch` path, so no auto-generated
+ // ISLE rule is needed — skip here.
+ n if n.starts_with("XbandFuncrefDispatch") => true,
+
// Skip more branching-related instructions.
n => n.starts_with("Br"),
}
diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle
index 258551a17598..bab0fa9a25de 100644
--- a/cranelift/codegen/src/isa/pulley_shared/inst.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle
@@ -67,6 +67,48 @@
;; Jump to `then` if `c` is true, otherwise to `else`.
(BrIf (cond Cond) (taken MachLabel) (not_taken MachLabel))
+ ;; Fused `band src, mask` + `brif src` emitted at the call_indirect
+ ;; lazy-init brif site. `dst = src & sign_extend(mask)` is
+ ;; unconditional; the branch test is on `src`'s low-32 or full-64 bits
+ ;; per `size`. Pulley-side: `xband*_s8_br_if_*`.
+ (BandBrIf
+ (dst WritableXReg)
+ (src XReg)
+ (mask i8)
+ (size OperandSize)
+ (taken MachLabel)
+ (not_taken MachLabel))
+
+ ;; Funcref-dispatch fusion: `brif (band v -2) + load code + load vmctx`
+ ;; across the brif and its continuation block. Emitted at the
+ ;; call_indirect lazy-init site under
+ ;; `is_eagerly_initialized_funcref_table`. Pulley-side:
+ ;; `xfuncref_dispatch_{x64,not_x64,x32,not_x32}`.
+ (FuncrefDispatch
+ (dst_code WritableXReg)
+ (dst_vmctx WritableXReg)
+ (src XReg)
+ (offset_code i8)
+ (offset_vmctx i8)
+ (size OperandSize)
+ (taken MachLabel)
+ (not_taken MachLabel))
+
+ ;; FuncrefDispatch + the preceding `xband_s8 -2` absorbed. `src` is
+ ;; the unmasked funcref; the fused op writes `dst_masked = src & -2`
+ ;; so the brif's block-call-arg copy still has a producer.
+ ;; Pulley-side: `xband_funcref_dispatch_{x64,not_x64,x32,not_x32}`.
+ (BandFuncrefDispatch
+ (dst_masked WritableXReg)
+ (dst_code WritableXReg)
+ (dst_vmctx WritableXReg)
+ (src XReg)
+ (offset_code i8)
+ (offset_vmctx i8)
+ (size OperandSize)
+ (taken MachLabel)
+ (not_taken MachLabel))
+
;; Load the memory address referenced by `mem` into `dst`.
(LoadAddr (dst WritableXReg) (mem Amode))
diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs
index e97e3303ef99..8e385df62a2e 100644
--- a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs
@@ -577,6 +577,19 @@ pub struct PulleyCall {
pub args: SmallVec<[XReg; 4]>,
}
+/// Payload of `CallInfo` for `Inst::IndirectCall`, the indirect-call
+/// counterpart of `PulleyCall`.
+///
+/// Besides the call target it carries up to four integer ABI arguments as
+/// ordinary register uses, so that the emitted `call_indirect{1,2,3,4}`
+/// opcode can move them into `x0..x3` itself instead of regalloc
+/// synthesising `xmov`s. Arguments bound to any other register stay on the
+/// fixed-preg path in `CallInfo::uses`.
+#[derive(Clone, Debug)]
+pub struct PulleyCallIndirect {
+ /// The register holding the call target.
+ pub target: XReg,
+ /// Up to 4 integer args destined for `x0..x3`; emission treats position
+ /// `i` as the value for `x{i}`.
+ pub args: SmallVec<[XReg; 4]>,
+}
+
pub use super::super::lower::isle::generated_code::AddrO32;
impl Copy for AddrO32 {}
diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs
index 74bff5d97a7d..b06535c56312 100644
--- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs
@@ -233,7 +233,22 @@ fn pulley_emit
(
}
Inst::IndirectCall { info } => {
- enc::call_indirect(sink, info.dest);
+ // Drop args already in their ABI register so we can pick a
+ // narrower `call_indirectN` — mirrors the direct-call shrink
+ // above.
+ let target = info.dest.target;
+ let mut args = &info.dest.args[..];
+ while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) {
+ args = &args[..args.len() - 1];
+ }
+ match args {
+ [] => enc::call_indirect(sink, target),
+ [x0] => enc::call_indirect1(sink, target, *x0),
+ [x0, x1] => enc::call_indirect2(sink, target, *x0, *x1),
+ [x0, x1, x2] => enc::call_indirect3(sink, target, *x0, *x1, *x2),
+ [x0, x1, x2, x3] => enc::call_indirect4(sink, target, *x0, *x1, *x2, *x3),
+ _ => unreachable!(),
+ }
if let Some(s) = state.take_stack_map() {
let offset = sink.cur_offset();
@@ -367,6 +382,297 @@ fn pulley_emit
(
assert_eq!(sink.cur_offset(), not_taken_end);
}
+ Inst::BandBrIf {
+ dst,
+ src,
+ mask,
+ size,
+ taken,
+ not_taken,
+ } => {
+ // The forward form branches to `taken` if `src` is non-zero
+ // (after computing `dst = src & sext(mask)`). The inverted form
+ // branches if `src` is zero — used by MachBuffer's fallthrough-
+ // flip optimization. Both must encode to equal-length bytes; the
+ // `_x*` and `_not_x*` ops share the same operand shape, so they
+ // do.
+ let dst_writable = *dst;
+ let src_reg = *src;
+ let mask_imm = *mask;
+
+ // Compute the inverted-form encoding (branch on src == 0) into a
+ // SmallVec so MachBuffer can use it for branch-direction flipping.
+ let mut inverted = SmallVec::<[u8; 16]>::new();
+ match size {
+ OperandSize::Size32 => {
+ enc::xband32_s8_br_if_not_x32(
+ &mut inverted,
+ dst_writable,
+ src_reg,
+ mask_imm,
+ 0,
+ );
+ }
+ OperandSize::Size64 => {
+ enc::xband64_s8_br_if_not_x64(
+ &mut inverted,
+ dst_writable,
+ src_reg,
+ mask_imm,
+ 0,
+ );
+ }
+ }
+ let len = inverted.len() as u32;
+ inverted.clear();
+ let inv_rel = i32::try_from(len - 4).unwrap();
+ match size {
+ OperandSize::Size32 => {
+ enc::xband32_s8_br_if_not_x32(
+ &mut inverted,
+ dst_writable,
+ src_reg,
+ mask_imm,
+ inv_rel,
+ );
+ }
+ OperandSize::Size64 => {
+ enc::xband64_s8_br_if_not_x64(
+ &mut inverted,
+ dst_writable,
+ src_reg,
+ mask_imm,
+ inv_rel,
+ );
+ }
+ }
+ assert!(len > 4);
+
+ // Emit the forward form (branch on src != 0).
+ let taken_end = *start_offset + len;
+ sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel);
+ sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted);
+ patch_pc_rel_offset(sink, |sink| match size {
+ OperandSize::Size32 => {
+ enc::xband32_s8_br_if_x32(sink, dst_writable, src_reg, mask_imm, 0)
+ }
+ OperandSize::Size64 => {
+ enc::xband64_s8_br_if_x64(sink, dst_writable, src_reg, mask_imm, 0)
+ }
+ });
+ debug_assert_eq!(sink.cur_offset(), taken_end);
+
+ // Unconditional jump to `not_taken` for the fall-through path.
+ let not_taken_start = taken_end + 1;
+ let not_taken_end = not_taken_start + 4;
+ sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel);
+ sink.add_uncond_branch(taken_end, not_taken_end, *not_taken);
+ patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0));
+ assert_eq!(sink.cur_offset(), not_taken_end);
+ }
+
+ Inst::FuncrefDispatch {
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code,
+ offset_vmctx,
+ size,
+ taken,
+ not_taken,
+ } => {
+ // Same scaffolding as Inst::BrIf / Inst::BandBrIf. Forward
+ // form's branch fires on `src != 0` (after loads); inverted
+ // form branches on `src == 0` (loads on fall-through). Both
+ // encodings have the same length because they share the
+ // 5-operand shape.
+ let dst_code_w = *dst_code;
+ let dst_vmctx_w = *dst_vmctx;
+ let src_reg = *src;
+ let oc = *offset_code;
+ let ov = *offset_vmctx;
+
+ // Inverted encoding into a scratch SmallVec for MachBuffer.
+ let mut inverted = SmallVec::<[u8; 16]>::new();
+ match size {
+ OperandSize::Size32 => {
+ enc::xfuncref_dispatch_not_x32(
+ &mut inverted,
+ dst_code_w,
+ dst_vmctx_w,
+ src_reg,
+ oc,
+ ov,
+ 0,
+ );
+ }
+ OperandSize::Size64 => {
+ enc::xfuncref_dispatch_not_x64(
+ &mut inverted,
+ dst_code_w,
+ dst_vmctx_w,
+ src_reg,
+ oc,
+ ov,
+ 0,
+ );
+ }
+ }
+ let len = inverted.len() as u32;
+ inverted.clear();
+ let inv_rel = i32::try_from(len - 4).unwrap();
+ match size {
+ OperandSize::Size32 => {
+ enc::xfuncref_dispatch_not_x32(
+ &mut inverted,
+ dst_code_w,
+ dst_vmctx_w,
+ src_reg,
+ oc,
+ ov,
+ inv_rel,
+ );
+ }
+ OperandSize::Size64 => {
+ enc::xfuncref_dispatch_not_x64(
+ &mut inverted,
+ dst_code_w,
+ dst_vmctx_w,
+ src_reg,
+ oc,
+ ov,
+ inv_rel,
+ );
+ }
+ }
+ assert!(len > 4);
+
+ // Emit the forward form (branch on src != 0).
+ let taken_end = *start_offset + len;
+ sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel);
+ sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted);
+ patch_pc_rel_offset(sink, |sink| match size {
+ OperandSize::Size32 => {
+ enc::xfuncref_dispatch_x32(sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0)
+ }
+ OperandSize::Size64 => {
+ enc::xfuncref_dispatch_x64(sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0)
+ }
+ });
+ debug_assert_eq!(sink.cur_offset(), taken_end);
+
+ // Unconditional jump to `not_taken` for the fall-through path.
+ let not_taken_start = taken_end + 1;
+ let not_taken_end = not_taken_start + 4;
+ sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel);
+ sink.add_uncond_branch(taken_end, not_taken_end, *not_taken);
+ patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0));
+ assert_eq!(sink.cur_offset(), not_taken_end);
+ }
+
+ Inst::BandFuncrefDispatch {
+ dst_masked,
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code,
+ offset_vmctx,
+ size,
+ taken,
+ not_taken,
+ } => {
+ // Same scaffolding as Inst::FuncrefDispatch, but with an
+ // extra `dst_masked` operand. The forward form branches on
+ // `src != 0` (after computing dst_masked AND the two loads);
+ // the inverted form branches on `src == 0` (only dst_masked
+ // is written on that side). MachBuffer flips between them
+ // for the fall-through optimisation.
+ let dm_w = *dst_masked;
+ let dc_w = *dst_code;
+ let dv_w = *dst_vmctx;
+ let src_reg = *src;
+ let oc = *offset_code;
+ let ov = *offset_vmctx;
+
+ let mut inverted = SmallVec::<[u8; 16]>::new();
+ match size {
+ OperandSize::Size32 => {
+ enc::xband_funcref_dispatch_not_x32(
+ &mut inverted,
+ dm_w,
+ dc_w,
+ dv_w,
+ src_reg,
+ oc,
+ ov,
+ 0,
+ );
+ }
+ OperandSize::Size64 => {
+ enc::xband_funcref_dispatch_not_x64(
+ &mut inverted,
+ dm_w,
+ dc_w,
+ dv_w,
+ src_reg,
+ oc,
+ ov,
+ 0,
+ );
+ }
+ }
+ let len = inverted.len() as u32;
+ inverted.clear();
+ let inv_rel = i32::try_from(len - 4).unwrap();
+ match size {
+ OperandSize::Size32 => {
+ enc::xband_funcref_dispatch_not_x32(
+ &mut inverted,
+ dm_w,
+ dc_w,
+ dv_w,
+ src_reg,
+ oc,
+ ov,
+ inv_rel,
+ );
+ }
+ OperandSize::Size64 => {
+ enc::xband_funcref_dispatch_not_x64(
+ &mut inverted,
+ dm_w,
+ dc_w,
+ dv_w,
+ src_reg,
+ oc,
+ ov,
+ inv_rel,
+ );
+ }
+ }
+ assert!(len > 4);
+
+ let taken_end = *start_offset + len;
+ sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel);
+ sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted);
+ patch_pc_rel_offset(sink, |sink| match size {
+ OperandSize::Size32 => {
+ enc::xband_funcref_dispatch_x32(sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0)
+ }
+ OperandSize::Size64 => {
+ enc::xband_funcref_dispatch_x64(sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0)
+ }
+ });
+ debug_assert_eq!(sink.cur_offset(), taken_end);
+
+ let not_taken_start = taken_end + 1;
+ let not_taken_end = not_taken_start + 4;
+ sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel);
+ sink.add_uncond_branch(taken_end, not_taken_end, *not_taken);
+ patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0));
+ assert_eq!(sink.cur_offset(), not_taken_end);
+ }
+
Inst::LoadAddr { dst, mem } => {
let base = mem.get_base_register();
let offset = mem.get_offset_with_state(state);
diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
index 6bbe69795e51..f9b1a518ae32 100644
--- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
@@ -206,14 +206,23 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
}
}
Inst::IndirectCall { info } => {
- collector.reg_use(&mut info.dest);
let CallInfo {
uses,
defs,
+ dest,
try_call_info,
clobbers,
..
} = &mut **info;
+
+ // First 0–4 integer args are passed as free reg uses; the
+ // emitted `call_indirect{1,2,3,4}` op moves them into x0..x3.
+ // Remaining args use the fixed-preg path in `uses`.
+ let PulleyCallIndirect { target, args } = dest;
+ collector.reg_use(target);
+ for arg in args {
+ collector.reg_use(arg);
+ }
for CallArgPair { vreg, preg } in uses {
collector.reg_fixed_use(vreg, *preg);
}
@@ -261,6 +270,50 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
cond.get_operands(collector);
}
+ Inst::BandBrIf {
+ dst,
+ src,
+ mask: _,
+ size: _,
+ taken: _,
+ not_taken: _,
+ } => {
+ collector.reg_def(dst);
+ collector.reg_use(src);
+ }
+
+ Inst::FuncrefDispatch {
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code: _,
+ offset_vmctx: _,
+ size: _,
+ taken: _,
+ not_taken: _,
+ } => {
+ collector.reg_def(dst_code);
+ collector.reg_def(dst_vmctx);
+ collector.reg_use(src);
+ }
+
+ Inst::BandFuncrefDispatch {
+ dst_masked,
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code: _,
+ offset_vmctx: _,
+ size: _,
+ taken: _,
+ not_taken: _,
+ } => {
+ collector.reg_def(dst_masked);
+ collector.reg_def(dst_code);
+ collector.reg_def(dst_vmctx);
+ collector.reg_use(src);
+ }
+
Inst::LoadAddr { dst, mem } => {
collector.reg_def(dst);
mem.get_operands(collector);
@@ -483,6 +536,9 @@ where
| Inst::Rets { .. } => MachTerminator::Ret,
Inst::Jump { .. } => MachTerminator::Branch,
Inst::BrIf { .. } => MachTerminator::Branch,
+ Inst::BandBrIf { .. } => MachTerminator::Branch,
+ Inst::FuncrefDispatch { .. } => MachTerminator::Branch,
+ Inst::BandFuncrefDispatch { .. } => MachTerminator::Branch,
Inst::BrTable { .. } => MachTerminator::Branch,
Inst::ReturnCall { .. } | Inst::ReturnIndirectCall { .. } => MachTerminator::RetCall,
Inst::Call { info } if info.try_call_info.is_some() => MachTerminator::Branch,
@@ -723,7 +779,7 @@ impl Inst {
}
Inst::IndirectCall { info } => {
- let callee = format_reg(*info.dest);
+ let callee = format_reg(*info.dest.target);
let try_call = info
.try_call_info
.as_ref()
@@ -762,6 +818,82 @@ impl Inst {
format!("br_{cond}, {taken}; jump {not_taken}")
}
+ Inst::BandBrIf {
+ dst,
+ src,
+ mask,
+ size,
+ taken,
+ not_taken,
+ } => {
+ let dst = format_reg(*dst.to_reg());
+ let src = format_reg(**src);
+ let taken = taken.to_string();
+ let not_taken = not_taken.to_string();
+ let width = match size {
+ OperandSize::Size32 => 32,
+ OperandSize::Size64 => 64,
+ };
+ format!(
+ "{dst} = xband{width}_s8 {src}, {mask}; \
+ br_if_x{width} {src}, {taken}; jump {not_taken}"
+ )
+ }
+
+ Inst::FuncrefDispatch {
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code,
+ offset_vmctx,
+ size,
+ taken,
+ not_taken,
+ } => {
+ let dst_code = format_reg(*dst_code.to_reg());
+ let dst_vmctx = format_reg(*dst_vmctx.to_reg());
+ let src = format_reg(**src);
+ let taken = taken.to_string();
+ let not_taken = not_taken.to_string();
+ let width = match size {
+ OperandSize::Size32 => 32,
+ OperandSize::Size64 => 64,
+ };
+ format!(
+ "{dst_code}, {dst_vmctx} = xfuncref_dispatch_x{width} \
+ {src}, code+{offset_code}, vmctx+{offset_vmctx}; \
+ br_if {taken}; jump {not_taken}"
+ )
+ }
+
+ Inst::BandFuncrefDispatch {
+ dst_masked,
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code,
+ offset_vmctx,
+ size,
+ taken,
+ not_taken,
+ } => {
+ let dst_masked = format_reg(*dst_masked.to_reg());
+ let dst_code = format_reg(*dst_code.to_reg());
+ let dst_vmctx = format_reg(*dst_vmctx.to_reg());
+ let src = format_reg(**src);
+ let taken = taken.to_string();
+ let not_taken = not_taken.to_string();
+ let width = match size {
+ OperandSize::Size32 => 32,
+ OperandSize::Size64 => 64,
+ };
+ format!(
+ "{dst_masked}, {dst_code}, {dst_vmctx} = xband_funcref_dispatch_x{width} \
+ {src}, code+{offset_code}, vmctx+{offset_vmctx}; \
+ br_if {taken}; jump {not_taken}"
+ )
+ }
+
Inst::LoadAddr { dst, mem } => {
let dst = format_reg(*dst.to_reg());
let mem = mem.to_string();
diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs
index 2039c7de8dd3..23f9c00865f4 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs
@@ -4,7 +4,8 @@ pub mod isle;
use super::{PulleyBackend, PulleyTargetKind, inst::*};
use crate::{
- ir,
+ ir::{self, InstructionData, Opcode},
+ isa::pulley_shared::inst::Inst,
machinst::{lower::*, *},
};
@@ -24,6 +25,15 @@ where
ir_inst: ir::Inst,
targets: &[MachLabel],
) -> Option<()> {
+ // Phase-2/3 fuse band+brif+xload+xload across the brif and its
+ // continuation block; phase-1 just band+brif. Both gated on the
+ // eager-init predicate.
+ if try_fuse_funcref_dispatch::
(ctx, ir_inst, targets) {
+ return Some(());
+ }
+ if try_fuse_band_brif(ctx, ir_inst, targets) {
+ return Some(());
+ }
isle::lower_branch(ctx, self, ir_inst, targets)
}
@@ -31,4 +41,448 @@ where
// Pulley does not support this feature right now.
None
}
+
+ fn pre_lower(&self, ctx: &mut Lower) {
+ // Pre-lowering hook: runs the Pulley funcref-dispatch recogniser
+ // over the whole function before any instruction is lowered.
+ //
+ // Block lowering runs in reverse layout order, so by the time
+ // `lower_branch` sees the brif, the continuation block has already
+ // been lowered. Sinking the continuation's loads at that point
+ // would be too late: their result vregs would be written twice
+ // (once by the already-lowered loads, once by the fused op).
+ // The second argument is the target's pointer size in bytes.
+ pre_lower_pulley(ctx, P::pointer_width().bytes());
+ }
+}
+
+/// Try to lower `brif (band v, -2), ...` as a single `MInst::BandBrIf`.
+///
+/// Returns `true` once the fused instruction has been emitted. Returns
+/// `false`, before emitting anything, when the shape does not match; the
+/// caller then falls back to the generic branch lowering.
+///
+/// Soundness: the fused op branches on the unmasked `v` (see `BandBrIf` in
+/// `inst.isle`) whereas the original `brif` tests `v & -2`. The two agree
+/// unless the tested bits of `v` equal `1`, the tagged-null value that
+/// `is_eagerly_initialized_funcref_table` is stated to exclude.
+/// NOTE(review): nothing in this function consults that predicate; the
+/// only gate is the immediate match below — confirm that is sufficient.
+fn try_fuse_band_brif(
+ ctx: &mut Lower>,
+ ir_inst: ir::Inst,
+ targets: &[MachLabel],
+) -> bool
+where
+ P: PulleyTargetKind,
+{
+ // The fused op needs exactly a taken and a not-taken label.
+ if targets.len() != 2 {
+ return false;
+ }
+
+ let dfg = ctx.dfg();
+ let InstructionData::Brif {
+ opcode: Opcode::Brif,
+ arg: cond,
+ ..
+ } = dfg.insts[ir_inst]
+ else {
+ return false;
+ };
+
+ // The brif's cond must be `band(v, c)` where `c` is an `iconst` whose
+ // `Imm64` bits are exactly `-2`; the constant is only looked for in
+ // the second operand. The exact-bits match is load-bearing: it is the
+ // only thing confining this fusion to the funcref-masking site.
+ // NOTE(review): the original rationale is that an `i32` `-2` is stored
+ // as `Imm64(0xFFFFFFFE)` and therefore never matches; an `i64` `-2`
+ // would still match here — confirm no such `brif` can reach this path.
+ let band_inst = match dfg.value_def(cond).inst() {
+ Some(inst) => inst,
+ None => return false,
+ };
+ let (band_src, band_imm) = match dfg.insts[band_inst] {
+ InstructionData::Binary {
+ opcode: Opcode::Band,
+ args: [a, b],
+ } => match dfg.value_def(b).inst() {
+ Some(b_inst) => match dfg.insts[b_inst] {
+ InstructionData::UnaryImm {
+ opcode: Opcode::Iconst,
+ imm,
+ } if imm.bits() == -2 => (a, -2_i8),
+ _ => return false,
+ },
+ None => return false,
+ },
+ _ => return false,
+ };
+
+ // Both ops of the fusion must agree on size: the band's result is the
+ // brif's cond, and its type selects the 32- or 64-bit fused op. Any
+ // other condition type is left to the generic lowering.
+ let cond_ty = dfg.value_type(cond);
+ let size = match cond_ty {
+ ir::types::I32 => OperandSize::Size32,
+ ir::types::I64 => OperandSize::Size64,
+ _ => return false,
+ };
+
+ // Reuse the band-result vreg as the fused op's dst, so anything that
+ // reads `cond` downstream (e.g. the block-call argument) observes the
+ // masked value through the same vreg.
+ // NOTE(review): this assumes `cond` has no use that executes before
+ // the brif; such a use would now precede its definition — confirm.
+ let dst_vreg = ctx.put_value_in_regs(cond);
+ let dst_reg = dst_vreg.only_reg().expect("scalar band result");
+ let dst = WritableXReg::try_from(Writable::from_reg(dst_reg))
+ .expect("band result is an x-class register");
+ let src = XReg::new(ctx.put_value_in_regs(band_src).only_reg().expect("scalar"))
+ .expect("band source is an x-class register");
+
+ // Sink the band: the BandBrIf we emit below defines the same dst vreg,
+ // so the standalone band must not be lowered as well.
+ ctx.sink_pure_inst(band_inst);
+
+ ctx.emit(
+ Inst::BandBrIf {
+ dst,
+ src,
+ mask: band_imm,
+ size,
+ taken: targets[0],
+ not_taken: targets[1],
+ }
+ .into(),
+ );
+
+ true
+}
+
+/// Reports whether `imm` is the constant `-2` when read at `ty`'s width.
+///
+/// For `I32` only the low 32 bits of the `Imm64` are compared, so both the
+/// sign-extended form (`-2`) and the zero-extended form (`0xFFFF_FFFE`) are
+/// accepted. `I64` requires all 64 bits to match. Every other type yields
+/// `false`.
+fn is_minus_two_for(imm: ir::immediates::Imm64, ty: ir::Type) -> bool {
+    let bits = imm.bits();
+    if ty == ir::types::I32 {
+        // Deliberately truncating: only the low half is significant.
+        bits as u32 == 0xFFFF_FFFE_u32
+    } else if ty == ir::types::I64 {
+        bits == -2_i64
+    } else {
+        false
+    }
+}
+
+/// Returns the `(wasm_call, vmctx)` byte offsets within `VMFuncRef` for a
+/// target whose pointers are `pointer_bytes` wide: one pointer and three
+/// pointers from the start, i.e. 8/24 on 64-bit and 4/12 on 32-bit.
+/// (Assumes this matches the runtime's `VMFuncRef` layout — keep in sync.)
+///
+/// Both values must fit the sign-extended `i8` offset operand of the
+/// `xfuncref_dispatch_*` ops.
+///
+/// # Panics
+///
+/// Panics if either offset does not fit in an `i8`. The conversion is
+/// checked rather than an `as` cast, which would silently wrap large
+/// widths into negative offsets.
+fn vm_func_ref_offsets(pointer_bytes: u8) -> (i8, i8) {
+    let wasm_call = i8::try_from(pointer_bytes).expect("VMFuncRef offsets fit i8");
+    let vmctx = wasm_call.checked_mul(3).expect("VMFuncRef offsets fit i8");
+    (wasm_call, vmctx)
+}
+
+/// Recognise the canonical funcref-dispatch shape rooted at `brif_inst`:
+///
+/// ```text
+/// predecessor:
+///     value = load .ptr (table_entry + 0)
+///     value_masked = band value, -2
+///     brif value_masked, continuation([value_masked]), null_block([])
+/// continuation(funcref_ptr):
+///     code = load .ptr (funcref_ptr + offset_code)
+///     vmctx = load .ptr (funcref_ptr + offset_vmctx)
+/// ```
+///
+/// Returns the two continuation loads, their results, the field offsets and
+/// the operand size on a match, and `None` otherwise. Only the `band`, the
+/// first block-call argument / first block parameter, and the first two
+/// instructions of the continuation are inspected; how `value` itself is
+/// produced is not checked.
+///
+/// NOTE(review): the continuation block is not checked for other
+/// predecessors. Callers sink the two loads, so a second edge into the
+/// continuation would reach it without them having run — confirm that the
+/// IR producer guarantees a single predecessor.
+fn match_funcref_dispatch_pattern(
+ f: &ir::Function,
+ brif_inst: ir::Inst,
+ pointer_bytes: u8,
+) -> Option {
+ let dfg = &f.dfg;
+ let InstructionData::Brif {
+ opcode: Opcode::Brif,
+ arg: cond,
+ blocks,
+ ..
+ } = dfg.insts[brif_inst]
+ else {
+ return None;
+ };
+ // cond = band(v, -2), with the constant as the band's second operand.
+ let band_inst = dfg.value_def(cond).inst()?;
+ let (v, _imm) = match dfg.insts[band_inst] {
+ InstructionData::Binary {
+ opcode: Opcode::Band,
+ args: [a, b],
+ } => match dfg.value_def(b).inst() {
+ Some(b_inst) => match dfg.insts[b_inst] {
+ InstructionData::UnaryImm {
+ opcode: Opcode::Iconst,
+ imm,
+ } if is_minus_two_for(imm, dfg.value_type(cond)) => (a, -2_i8),
+ _ => return None,
+ },
+ None => return None,
+ },
+ _ => return None,
+ };
+ let cond_ty = dfg.value_type(cond);
+ let size = match cond_ty {
+ ir::types::I32 => OperandSize::Size32,
+ ir::types::I64 => OperandSize::Size64,
+ _ => return None,
+ };
+ // The condition must be pointer-sized for this target: the 64-bit fused
+ // op handles I64 and the 32-bit fused op handles I32. Any other pointer
+ // width, or a mismatch with the condition type, rejects the pattern.
+ let expected_size = match pointer_bytes {
+ 4 => OperandSize::Size32,
+ 8 => OperandSize::Size64,
+ _ => return None,
+ };
+ if size != expected_size {
+ return None;
+ }
+
+ // Taken target = continuation block. The brif's first block-call-arg
+ // must be `cond` (i.e. value_masked); it feeds the continuation's first
+ // block param, which the loads below use as their base.
+ let taken_call = blocks[0];
+ let continuation = taken_call.block(&dfg.value_lists);
+ let taken_args: smallvec::SmallVec<[ir::BlockArg; 4]> =
+ taken_call.args(&dfg.value_lists).collect();
+ if taken_args.len() < 1 {
+ return None;
+ }
+ let first_arg_val = match taken_args[0] {
+ ir::BlockArg::Value(v) => v,
+ _ => return None,
+ };
+ if first_arg_val != cond {
+ // The brif must pass value_masked as the first block-call-arg.
+ return None;
+ }
+ let cont_params = dfg.block_params(continuation);
+ if cont_params.is_empty() {
+ return None;
+ }
+ let funcref_ptr = cont_params[0];
+
+ // The first two instructions in the continuation block must be the
+ // two field loads, in either order; a block with fewer than two
+ // instructions rejects the pattern.
+ let (offset_code_expected, offset_vmctx_expected) = vm_func_ref_offsets(pointer_bytes);
+ let mut iter = f.layout.block_insts(continuation);
+ let load1 = iter.next()?;
+ let load2 = iter.next()?;
+ let (load_code_inst, load_vmctx_inst) = classify_funcref_loads(
+ dfg,
+ load1,
+ load2,
+ funcref_ptr,
+ offset_code_expected,
+ offset_vmctx_expected,
+ cond_ty,
+ )?;
+ let code_val = dfg.inst_results(load_code_inst)[0];
+ let vmctx_val = dfg.inst_results(load_vmctx_inst)[0];
+
+ let _ = (band_inst, v); // otherwise unused here; kept for future variants of the pattern check
+ Some(FuncrefDispatchPattern {
+ load_code_inst,
+ load_vmctx_inst,
+ code_val,
+ vmctx_val,
+ offset_code: offset_code_expected,
+ offset_vmctx: offset_vmctx_expected,
+ size,
+ })
+}
+
+/// Result of a successful `match_funcref_dispatch_pattern` match.
+struct FuncrefDispatchPattern {
+ /// Continuation-block load that reads at `offset_code`.
+ load_code_inst: ir::Inst,
+ /// Continuation-block load that reads at `offset_vmctx`.
+ load_vmctx_inst: ir::Inst,
+ /// First result of `load_code_inst`.
+ code_val: ir::Value,
+ /// First result of `load_vmctx_inst`.
+ vmctx_val: ir::Value,
+ /// Byte offset of the code field, as given by `vm_func_ref_offsets`.
+ offset_code: i8,
+ /// Byte offset of the vmctx field, as given by `vm_func_ref_offsets`.
+ offset_vmctx: i8,
+ /// Width of the brif condition; the matcher requires it to equal the
+ /// target's pointer width.
+ size: OperandSize,
+}
+
+/// Decides which of the instructions `a` and `b` is the code-pointer load
+/// and which is the vmctx load.
+///
+/// Both must be pointer-typed loads (see `classify_load`) based on
+/// `funcref_ptr`, one at `offset_code` and the other at `offset_vmctx`, in
+/// either order. Returns `(code_load, vmctx_load)`, or `None` when either
+/// instruction does not qualify.
+fn classify_funcref_loads(
+    dfg: &ir::DataFlowGraph,
+    a: ir::Inst,
+    b: ir::Inst,
+    funcref_ptr: ir::Value,
+    offset_code: i8,
+    offset_vmctx: i8,
+    pointer_ty: ir::Type,
+) -> Option<(ir::Inst, ir::Inst)> {
+    let first = classify_load(dfg, a, pointer_ty)?;
+    let second = classify_load(dfg, b, pointer_ty)?;
+
+    // The `(offset, base)` pair each field load is expected to have.
+    let code_access = (offset_code, funcref_ptr);
+    let vmctx_access = (offset_vmctx, funcref_ptr);
+
+    if first == code_access && second == vmctx_access {
+        return Some((a, b));
+    }
+    if first == vmctx_access && second == code_access {
+        return Some((b, a));
+    }
+    None
+}
+
+/// Describes `inst` as `(offset, base)` if it is a plain `load` whose first
+/// result has type `pointer_ty` and whose immediate offset fits in an `i8`.
+/// Returns `None` for anything else.
+fn classify_load(
+    dfg: &ir::DataFlowGraph,
+    inst: ir::Inst,
+    pointer_ty: ir::Type,
+) -> Option<(i8, ir::Value)> {
+    let InstructionData::Load {
+        opcode: Opcode::Load,
+        arg: base,
+        offset,
+        ..
+    } = dfg.insts[inst]
+    else {
+        return None;
+    };
+
+    let loaded = dfg.inst_results(inst).first().copied()?;
+    if dfg.value_type(loaded) != pointer_ty {
+        return None;
+    }
+
+    // The fused ops only carry an 8-bit offset, so wider ones disqualify.
+    let wide_offset: i32 = offset.into();
+    let narrow_offset = i8::try_from(wide_offset).ok()?;
+    Some((narrow_offset, base))
+}
+
+/// Pulley-specific pre-lowering analysis. Visits the terminator of every
+/// block and, for each `brif` that matches the funcref-dispatch fusion
+/// shape (see `match_funcref_dispatch_pattern`), sinks the two
+/// continuation-block loads via `sink_pure_inst`. Only the loads are sunk
+/// here; the band is handled when the brif itself is lowered (in
+/// `try_fuse_funcref_dispatch`), which emits one fused instruction whose
+/// defs are the sunk loads' result vregs.
+fn pre_lower_pulley(ctx: &mut Lower>, pointer_bytes: u8)
+where
+ P: PulleyTargetKind,
+{
+ // Collect candidates first so `&ctx.f` isn't held across the
+ // `sink_pure_inst` calls below.
+ let mut to_sink: smallvec::SmallVec<[(ir::Inst, ir::Inst); 8]> = smallvec::SmallVec::new();
+ {
+ let f = ctx.f;
+ for block in f.layout.blocks() {
+ // Blocks without any instruction have no terminator to inspect.
+ let Some(term) = f.layout.last_inst(block) else {
+ continue;
+ };
+ if !matches!(f.dfg.insts[term], InstructionData::Brif { .. }) {
+ continue;
+ }
+ if let Some(pat) = match_funcref_dispatch_pattern::(f, term, pointer_bytes) {
+ to_sink.push((pat.load_code_inst, pat.load_vmctx_inst));
+ }
+ }
+ }
+ for (l_code, l_vmctx) in to_sink {
+ ctx.sink_pure_inst(l_code);
+ ctx.sink_pure_inst(l_vmctx);
+ }
+}
+
+/// Lower a `brif` that matches the funcref-dispatch pattern as one fused
+/// instruction: `MInst::BandFuncrefDispatch` when the band can be absorbed,
+/// otherwise `MInst::FuncrefDispatch`. Relies on `pre_lower_pulley` having
+/// already sunk the two continuation-block loads; the band is sunk here.
+/// Returns `true` iff a fused instruction was emitted.
+fn try_fuse_funcref_dispatch
(
+ ctx: &mut Lower>,
+ ir_inst: ir::Inst,
+ targets: &[MachLabel],
+) -> bool
+where
+ P: PulleyTargetKind,
+{
+ if targets.len() != 2 {
+ return false;
+ }
+ // Re-derive the same match the pre-pass made, with the same pointer
+ // width, so both agree on which loads were sunk.
+ let pointer_bytes = P::pointer_width().bytes();
+ let Some(pat) = match_funcref_dispatch_pattern::(ctx.f, ir_inst, pointer_bytes) else {
+ return false;
+ };
+
+ let InstructionData::Brif { arg: cond, .. } = ctx.f.dfg.insts[ir_inst] else {
+ return false;
+ };
+
+ // Try phase-3 (absorb the band into BandFuncrefDispatch). The fused
+ // op defines `dst_masked` (= cond's vreg) so the brif's block-call
+ // copy still has a producer, plus `dst_code` and `dst_vmctx`.
+ // `v` is the band's unmasked first operand, if the shape matches.
+ let dfg = ctx.dfg();
+ let band_inst = dfg.value_def(cond).inst();
+ let v = band_inst.and_then(|bi| match dfg.insts[bi] {
+ InstructionData::Binary {
+ opcode: Opcode::Band,
+ args: [a, b],
+ } => match dfg.value_def(b).inst() {
+ Some(b_inst) => match dfg.insts[b_inst] {
+ InstructionData::UnaryImm {
+ opcode: Opcode::Iconst,
+ imm,
+ } if is_minus_two_for(imm, dfg.value_type(cond)) => Some(a),
+ _ => None,
+ },
+ None => None,
+ },
+ _ => None,
+ });
+
+ // The loads' result vregs become the fused op's defs. Their original
+ // lowering was skipped via `sink_pure_inst` in `pre_lower_pulley`.
+ let dst_code_reg = ctx
+ .put_value_in_regs(pat.code_val)
+ .only_reg()
+ .expect("scalar funcref code result");
+ let dst_vmctx_reg = ctx
+ .put_value_in_regs(pat.vmctx_val)
+ .only_reg()
+ .expect("scalar funcref vmctx result");
+ let dst_code = WritableXReg::try_from(Writable::from_reg(dst_code_reg))
+ .expect("funcref code dst is an x-class register");
+ let dst_vmctx = WritableXReg::try_from(Writable::from_reg(dst_vmctx_reg))
+ .expect("funcref vmctx dst is an x-class register");
+
+ if let (Some(band_inst), Some(v)) = (band_inst, v) {
+ // Phase 3 fires: source is the unmasked `v`; the fused op masks
+ // internally and writes `dst_masked = cond`.
+ let dst_masked_regs = ctx.put_value_in_regs(cond);
+ let dst_masked_reg = dst_masked_regs.only_reg().expect("scalar cond");
+ let dst_masked = WritableXReg::try_from(Writable::from_reg(dst_masked_reg))
+ .expect("cond is an x-class register");
+ let src_reg = ctx
+ .put_value_in_regs(v)
+ .only_reg()
+ .expect("scalar funcref source");
+ let src = XReg::new(src_reg).expect("funcref source is an x-class register");
+ // The fused op now defines cond's vreg, so the standalone band must
+ // not be lowered as well.
+ ctx.sink_pure_inst(band_inst);
+ ctx.emit(
+ Inst::BandFuncrefDispatch {
+ dst_masked,
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code: pat.offset_code,
+ offset_vmctx: pat.offset_vmctx,
+ size: pat.size,
+ taken: targets[0],
+ not_taken: targets[1],
+ }
+ .into(),
+ );
+ return true;
+ }
+
+ // Phase-2 fallback: band stays as a standalone op; FuncrefDispatch
+ // consumes its masked result.
+ // NOTE(review): `match_funcref_dispatch_pattern` already requires the
+ // same band/iconst shape, so this path appears unreachable — confirm.
+ let src_reg = ctx
+ .put_value_in_regs(cond)
+ .only_reg()
+ .expect("scalar funcref source");
+ let src = XReg::new(src_reg).expect("funcref source is an x-class register");
+
+ ctx.emit(
+ Inst::FuncrefDispatch {
+ dst_code,
+ dst_vmctx,
+ src,
+ offset_code: pat.offset_code,
+ offset_vmctx: pat.offset_vmctx,
+ size: pat.size,
+ taken: targets[0],
+ not_taken: targets[1],
+ }
+ .into(),
+ );
+
+ true
}
diff --git a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs
index 3068fb1137ff..c065c45a2c6a 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs
@@ -10,8 +10,8 @@ use crate::ir::{condcodes::*, immediates::*, types::*, *};
use crate::isa::CallConv;
use crate::isa::pulley_shared::{
inst::{
- FReg, OperandSize, PulleyCall, ReturnCallInfo, VReg, WritableFReg, WritableVReg,
- WritableXReg, XReg,
+ FReg, OperandSize, PulleyCall, PulleyCallIndirect, ReturnCallInfo, VReg, WritableFReg,
+ WritableVReg, WritableXReg, XReg,
},
lower::{Cond, regs},
*,
@@ -30,7 +30,7 @@ type Unit = ();
type VecArgPair = Vec;
type VecRetPair = Vec;
type BoxCallInfo = Box>;
-type BoxCallIndInfo = Box>;
+type BoxCallIndInfo = Box>;
type BoxCallIndirectHostInfo = Box>;
type BoxReturnCallInfo = Box>;
type BoxReturnCallIndInfo = Box>;
@@ -124,7 +124,7 @@ where
&mut self,
sig: Sig,
dest: Reg,
- uses: CallArgList,
+ mut uses: CallArgList,
defs: CallRetList,
try_call_info: Option,
) -> BoxCallIndInfo {
@@ -133,8 +133,30 @@ where
self.lower_ctx
.abi_mut()
.accumulate_outgoing_args_size(stack_ret_space + stack_arg_space);
+ let call_conv = self.lower_ctx.sigs()[sig].call_conv();
- let dest = XReg::new(dest).unwrap();
+ // Mirror of `gen_call_info`: take out the first four integer
+ // arguments (x0..x3) and pass them through the `args` list so the
+ // emitted `call_indirect{1,2,3,4}` op can move them at call time.
+ // Saves one Pulley dispatch per moved arg vs the previous "regalloc
+ // emits xmov; then `call_indirect`" sequence.
+ let mut args = SmallVec::new();
+ uses.sort_by_key(|arg| arg.preg);
+ if call_conv != CallConv::PreserveAll {
+ uses.retain(|arg| {
+ if arg.preg != regs::x0()
+ && arg.preg != regs::x1()
+ && arg.preg != regs::x2()
+ && arg.preg != regs::x3()
+ {
+ return true;
+ }
+ args.push(XReg::new(arg.vreg).unwrap());
+ false
+ });
+ }
+ let target = XReg::new(dest).unwrap();
+ let dest = PulleyCallIndirect { target, args };
Box::new(
self.lower_ctx
.gen_call_info(sig, dest, uses, defs, try_call_info, false),
diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs
index 7747d804cafe..91d1da84c910 100644
--- a/cranelift/codegen/src/machinst/compile.rs
+++ b/cranelift/codegen/src/machinst/compile.rs
@@ -25,9 +25,16 @@ pub fn compile(
let block_order = BlockLoweringOrder::new(f, domtree, ctrl_plane);
// Build the lowering context.
- let lower =
+ let mut lower =
crate::machinst::Lower::new(f, abi, emit_info, block_order, sigs, b.flags().clone())?;
+ // Backend-specific pre-lowering analysis. Default impl on LowerBackend
+ // is a no-op; Pulley overrides it to mark continuation-block loads as
+ // absorbed_pure when the call_indirect lazy-init brif pattern is
+ // present, so they can be fused into a single Pulley dispatch op
+ // emitted at the brif's lowering time.
+ b.pre_lower(&mut lower);
+
// Lower the IR.
let vcode = {
log::debug!(
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index 0a09edc5c374..5301c30ee454 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -148,6 +148,18 @@ pub trait LowerBackend {
fn maybe_pinned_reg(&self) -> Option {
None
}
+
+    /// Backend-specific analysis hook, run once after `Lower::new` but
+    /// before the main reverse-block lowering loop. The default
+    /// implementation does nothing.
+    ///
+    /// Use this to call [`Lower::sink_pure_inst`] on instructions that will
+    /// be absorbed by a fused MachInst emitted in a different
+    /// (earlier-in-CFG, later-in-reverse-order) block. The block-by-block
+    /// lowering loop processes blocks in reverse, so cross-block absorption
+    /// can't be arranged at the absorbing instruction's lowering time — it
+    /// has to be arranged here, before any block is lowered. Within a single
+    /// block, calling `sink_pure_inst` during normal lowering is still
+    /// sufficient.
+    fn pre_lower(&self, _ctx: &mut Lower<Self::MInst>) {}
}
/// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence
@@ -204,6 +216,14 @@ pub struct Lower<'func, I: VCodeInst> {
/// their original locations.
inst_sunk: FxHashSet,
+ /// Pure (non-side-effecting) instructions whose value-production has been
+ /// absorbed by a later-emitted MachInst (typically a terminator that
+ /// fuses an ALU op with a branch). The absorbing MachInst writes to the
+ /// absorbed inst's result vreg, so subsequent `put_value_in_regs` of that
+ /// vreg observes the value normally — but the absorbed inst itself is
+ /// skipped in `lower_clif_block`, avoiding a redundant double-write.
+ inst_absorbed_pure: FxHashSet,
+
/// Instructions collected for the CLIF inst in progress, in forward order.
ir_insts: Vec,
@@ -504,6 +524,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
value_ir_uses,
value_lowered_uses: SecondaryMap::default(),
inst_sunk: FxHashSet::default(),
+ inst_absorbed_pure: FxHashSet::default(),
cur_scan_entry_color: None,
cur_inst: None,
ir_insts: vec![],
@@ -708,6 +729,12 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
self.inst_sunk.contains(&inst)
}
+ /// Has the value-production of this instruction been absorbed by a
+ /// later-emitted MachInst? See [`Lower::inst_absorbed_pure`].
+ ///
+ /// This is a plain membership test on the `inst_absorbed_pure` set, which
+ /// [`Lower::sink_pure_inst`] populates. `lower_clif_block` consults it to
+ /// skip instructions that were registered there, alongside its existing
+ /// `is_inst_sunk` check.
+ fn is_inst_absorbed_pure(&self, inst: Inst) -> bool {
+ self.inst_absorbed_pure.contains(&inst)
+ }
+
// Is any result of this instruction needed?
fn is_any_inst_result_needed(&self, inst: Inst) -> bool {
self.f
@@ -750,6 +777,13 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
if self.is_inst_sunk(inst) {
continue;
}
+ // Same for pure-instruction absorption: a terminator earlier in
+ // the reverse-scan emitted a MachInst that writes to this inst's
+ // result vreg directly, so emitting it again here would be a
+ // redundant double-write.
+ if self.is_inst_absorbed_pure(inst) {
+ continue;
+ }
// Are any outputs used at least once?
let value_needed = self.is_any_inst_result_needed(inst);
trace!(
@@ -1666,6 +1700,46 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
self.ir_insts.push(mach_inst);
}
+    /// Register `ir_inst` as absorbed: its value is produced by some other,
+    /// fused MachInst (typically a terminator that combines an ALU op with a
+    /// branch, e.g. Pulley's `xband_brif` fused dispatch op), so the
+    /// instruction must not be lowered on its own.
+    ///
+    /// Contract for the caller: the absorbing MachInst has to write the
+    /// absorbed instruction's result vreg (`value_regs[result]`) itself.
+    /// Later `put_value_in_regs` calls on that value then observe the right
+    /// contents, and because `lower_clif_block` skips every instruction
+    /// recorded here, the vreg is not defined a second time (which would
+    /// break SSA single-def).
+    ///
+    /// How this differs from [`Lower::sink_inst`]: no lowering side effect
+    /// is required, and no color bookkeeping is done. The intended targets
+    /// are pure ALU ops, which have no color anchor to begin with.
+    ///
+    /// One class of side-effecting instruction is also accepted: a `load`
+    /// whose memory flags are both `readonly` and `notrap`. CLIF treats such
+    /// a load as side-effecting (via `can_load()`), but those flags assert
+    /// that skipping it is safe as far as codegen is concerned; the
+    /// absorbing MachInst becomes responsible for performing the load. Color
+    /// tracking remains unnecessary, since nothing side-effecting is being
+    /// moved — the lowerer is only told the instruction is handled elsewhere.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `ir_inst` has a lowering side effect and is not a
+    /// `readonly` + `notrap` load.
+    pub fn sink_pure_inst(&mut self, ir_inst: Inst) {
+        let is_pure = !has_lowering_side_effect(self.f, ir_inst);
+
+        // Only the plain `load` opcode qualifies, and only when both trusted
+        // flags are set on it.
+        let is_safe_load = if let InstructionData::Load {
+            opcode: crate::ir::Opcode::Load,
+            flags,
+            ..
+        } = &self.f.dfg.insts[ir_inst]
+        {
+            let mem_flags = self.f.dfg.mem_flags[*flags];
+            mem_flags.readonly() && mem_flags.notrap()
+        } else {
+            false
+        };
+
+        assert!(is_pure || is_safe_load);
+        self.inst_absorbed_pure.insert(ir_inst);
+    }
+
/// Indicate that the side-effect of an instruction has been sunk to the
/// current scan location. This should only be done with the instruction's
/// original results are not used (i.e., `put_input_in_regs` is not invoked
diff --git a/cranelift/filetests/filetests/isa/pulley32/call.clif b/cranelift/filetests/filetests/isa/pulley32/call.clif
index c2dc9a09f6c9..aece47fc9a19 100644
--- a/cranelift/filetests/filetests/isa/pulley32/call.clif
+++ b/cranelift/filetests/filetests/isa/pulley32/call.clif
@@ -291,7 +291,7 @@ block0(v0: i32):
; VCode:
; push_frame
; block0:
-; indirect_call x0, CallInfo { dest: XReg(p0i), uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false }
+; indirect_call x0, CallInfo { dest: PulleyCallIndirect { target: XReg(p0i), args: [] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false }
; pop_frame
; ret
;
diff --git a/cranelift/filetests/filetests/isa/pulley32/exceptions.clif b/cranelift/filetests/filetests/isa/pulley32/exceptions.clif
index 2d3dfef3e853..eeed198535d2 100644
--- a/cranelift/filetests/filetests/isa/pulley32/exceptions.clif
+++ b/cranelift/filetests/filetests/isa/pulley32/exceptions.clif
@@ -77,7 +77,7 @@ function %f2(i32, i32) -> i32, f32, f64 {
; block0:
; fconst64 f1, 4607182418800017408
; fstore64 Slot(0), f1 // flags = notrap aligned
-; indirect_call x1, CallInfo { dest: XReg(p1i), uses: [CallArgPair { vreg: p0i, preg: p0i }], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I32) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I32) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)]
+; indirect_call x1, CallInfo { dest: PulleyCallIndirect { target: XReg(p1i), args: [XReg(p0i)] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I32) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I32) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)]
; block1:
; xone x0
; f1 = fload64 Slot(0) // flags = notrap aligned
diff --git a/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif b/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif
index c7d523d4f6a6..c698bb1f71ea 100644
--- a/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif
+++ b/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif
@@ -15,8 +15,8 @@ block0(v0: i64):
; xmov x3, x0
; xmov x1, x3
; xmov x2, x3
-; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
-; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
+; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
+; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
; pop_frame
; ret
;
diff --git a/cranelift/filetests/filetests/isa/pulley64/call.clif b/cranelift/filetests/filetests/isa/pulley64/call.clif
index 16b271835620..bd6f9bba825f 100644
--- a/cranelift/filetests/filetests/isa/pulley64/call.clif
+++ b/cranelift/filetests/filetests/isa/pulley64/call.clif
@@ -291,7 +291,7 @@ block0(v0: i64):
; VCode:
; push_frame
; block0:
-; indirect_call x0, CallInfo { dest: XReg(p0i), uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false }
+; indirect_call x0, CallInfo { dest: PulleyCallIndirect { target: XReg(p0i), args: [] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false }
; pop_frame
; ret
;
diff --git a/cranelift/filetests/filetests/isa/pulley64/exceptions.clif b/cranelift/filetests/filetests/isa/pulley64/exceptions.clif
index 88c5528c1935..6a0b7b1577a1 100644
--- a/cranelift/filetests/filetests/isa/pulley64/exceptions.clif
+++ b/cranelift/filetests/filetests/isa/pulley64/exceptions.clif
@@ -79,7 +79,7 @@ function %f2(i32, i64) -> i32, f32, f64 {
; block0:
; fconst64 f1, 4607182418800017408
; fstore64 Slot(0), f1 // flags = notrap aligned
-; indirect_call x1, CallInfo { dest: XReg(p1i), uses: [CallArgPair { vreg: p0i, preg: p0i }], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I64) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)]
+; indirect_call x1, CallInfo { dest: PulleyCallIndirect { target: XReg(p1i), args: [XReg(p0i)] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I64) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)]
; block1:
; xone x0
; f1 = fload64 Slot(0) // flags = notrap aligned
diff --git a/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif b/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif
index 2b6a28ce9ece..44bc72fcaf25 100644
--- a/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif
+++ b/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif
@@ -15,8 +15,8 @@ block0(v0: i64):
; xmov x3, x0
; xmov x1, x3
; xmov x2, x3
-; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
-; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
+; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
+; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false }
; pop_frame
; ret
;
diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs
index 8bd81a6b46db..975a0a04377d 100644
--- a/crates/cranelift/src/func_environ.rs
+++ b/crates/cranelift/src/func_environ.rs
@@ -1074,6 +1074,10 @@ impl<'module_environment> FuncEnvironment<'module_environment> {
let result_param = builder.append_block_param(continuation_block, pointer_type);
builder.set_cold_block(null_block);
+ // Branching on `value_masked` instead (letting the Pulley backend
+ // fuse the `band + brif` pair) requires a table whose slots are
+ // all eagerly initialized; that variant comes with eager
+ // initialization support.
builder.ins().brif(
value,
continuation_block,
@@ -1855,7 +1859,12 @@ impl FuncEnvironment<'_> {
self.reference_type(table.ref_type.heap_type).0.bytes()
};
- let base_flags = if Some(table.limits.min) == table.limits.max {
+ // A table is fixed-size if min == max or if translation proved it
+ // is never mutated; either way the base address and element count
+ // are constant for the instance's lifetime.
+ let fixed_size =
+ !self.translation.tables_mutated[index] || Some(table.limits.min) == table.limits.max;
+ let base_flags = if fixed_size {
func.dfg
.mem_flags
.insert(MemFlagsData::trusted().with_readonly().with_can_move())
@@ -1867,11 +1876,10 @@ impl FuncEnvironment<'_> {
base: ptr,
offset: Offset32::new(base_offset),
global_type: pointer_type,
- // A fixed-size table can't be resized so its base address won't change.
flags: base_flags,
});
- let bound = if Some(table.limits.min) == table.limits.max {
+ let bound = if fixed_size {
TableSize::Static {
bound: table.limits.min,
}
@@ -2159,6 +2167,14 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
callee: ir::Value,
call_args: &[ir::Value],
) -> WasmResult> {
+ // Fast path: if we can statically resolve this indirect call to a
+ // single defined function (immutable funcref table + constant
+ // callee index + matching signature), emit a direct call instead.
+ // See `try_static_resolve_indirect_call`.
+ if let Some(target) = self.try_static_resolve_indirect_call(table_index, ty_index, callee) {
+ return self.direct_call(target, sig_ref, call_args).map(Some);
+ }
+
let (code_ptr, callee_vmctx) = match self.check_and_load_code_and_callee_vmctx(
table_index,
ty_index,
@@ -2173,6 +2189,198 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
.map(Some)
}
+    /// Try to statically resolve a `call_indirect` site to a single defined
+    /// function so the call can be lowered as a direct call.
+    ///
+    /// All four of these must hold for the resolution to succeed:
+    ///
+    /// 1. The target table must be provably immutable for the lifetime of
+    ///    any instance of this module: defined (not imported) and never the
+    ///    target of `table.set` / `table.fill` / `table.copy` (as the dst)
+    ///    / `table.grow` / `table.init`. This is the `tables_mutated` bit
+    ///    populated in `ModuleEnvironment::translate`.
+    ///
+    /// 2. The callee index value (the operand to `call_indirect`) must be a
+    ///    compile-time constant — i.e., the wasm did `i32.const N;
+    ///    call_indirect (table $t) (type $sig)`. This is what hand-lowered
+    ///    C++/Rust vtable calls and AOT-compiled JS-to-wasm dispatch tables
+    ///    look like in practice.
+    ///
+    /// 3. The slot at index `N` in the table must be precomputable from
+    ///    static `elem` segments: `module.table_initialization` must yield
+    ///    a precomputed funcref image for the table (i.e., not a
+    ///    fully-dynamic `Expr`-style init), and the index `N` must be in
+    ///    range of that image and resolved to a concrete `FuncIndex` (not
+    ///    the reserved-value sentinel).
+    ///
+    /// 4. The function's signature in the module's interned type table
+    ///    must equal the `ty_index` declared by the `call_indirect` site.
+    ///    Otherwise the original semantics are "trap on signature
+    ///    mismatch", which we don't want to replace with a static direct
+    ///    call.
+    ///
+    /// Returns the resolved function on success, `None` otherwise (in
+    /// which case the caller falls back to a normal indirect call).
+    fn try_static_resolve_indirect_call(
+        &self,
+        table_index: TableIndex,
+        ty_index: TypeIndex,
+        callee: ir::Value,
+    ) -> Option<FuncIndex> {
+        let translation = self.env.translation;
+        let module = &translation.module;
+
+        // (1) Table must be provably immutable. Imported tables are
+        // pre-marked as mutated in `ModuleEnvironment::translate`, so
+        // this check also rules them out (along with the explicit
+        // `defined_table_index` check below for clarity).
+        if translation.tables_mutated[table_index] {
+            return None;
+        }
+        let defined_table = module.defined_table_index(table_index)?;
+
+        // (2) Callee must be a constant `iconst`. Pattern adapted from
+        // `bounds_checks::statically_known_in_bounds`.
+        let dfg = &self.builder.func.dfg;
+        let inst = dfg.value_def(callee).inst()?;
+        let ir::InstructionData::UnaryImm {
+            opcode: ir::Opcode::Iconst,
+            imm,
+        } = dfg.insts[inst]
+        else {
+            return None;
+        };
+        // Interpret the immediate as an unsigned index of the callee's own
+        // bit width, so a narrow index is never sign-extended into a huge
+        // (or negative) slot number.
+        let callee_ty = dfg.value_type(callee);
+        let callee_idx_u64 = imm
+            .zero_extend_from_width(callee_ty.bits())
+            .bits()
+            .cast_unsigned();
+
+        // (3) Slot must be precomputable from the static funcref image.
+        // An index past the end of the image is not resolvable here, so the
+        // checked `get` bails out to the indirect-call path.
+        let precomputed = module.table_initialization.get(defined_table)?;
+        let slot = usize::try_from(callee_idx_u64).ok()?;
+        let target = *precomputed.get(slot)?;
+        // `FuncIndex::reserved_value()` marks a null (uncovered) slot.
+        if target.is_reserved_value() {
+            return None;
+        }
+
+        // (4) Signature match. The site's declared `ty_index` and the
+        // target function's declared signature must intern to the same
+        // module type index.
+        let expected_ty = module.types[ty_index].unwrap_module_type_index();
+        let target_ty = module.functions[target]
+            .signature
+            .unwrap_module_type_index();
+        if expected_ty != target_ty {
+            return None;
+        }
+
+        Some(target)
+    }
+
+    /// True iff every slot in the precomputed `elem`-segment contents for
+    /// `table_index` is a concrete `FuncIndex` (no
+    /// `FuncIndex::reserved_value()` "no-entry" sentinel) and those contents
+    /// cover at least the table's declared minimum size.
+    ///
+    /// This function does not consult `tables_mutated` itself: the caller
+    /// must already have proven the table immutable, so the contents
+    /// observed here are stable for the lifetime of any instance. Under
+    /// that precondition, a `true` result implies "no slot is ever null at
+    /// runtime."
+    ///
+    /// When this is true, the runtime funcref-NULL check on the loaded
+    /// funcref pointer is provably redundant: any in-bounds index leads
+    /// to a non-null funcref. The bounds check still runs (so an
+    /// out-of-bounds index traps as before with `TRAP_TABLE_OUT_OF_BOUNDS`).
+    ///
+    /// Returns `false` when the table has no defined-table index, has no
+    /// precomputed image, or has an empty image.
+    fn precomputed_table_has_no_null_slots(&self, table_index: TableIndex) -> bool {
+        let module = &self.env.translation.module;
+        let Some(defined_table) = module.defined_table_index(table_index) else {
+            return false;
+        };
+        let Some(precomputed) = module.table_initialization.get(defined_table) else {
+            return false;
+        };
+        if precomputed.is_empty() {
+            return false;
+        }
+        // Slots beyond `precomputed.len()` are null at runtime; coverage
+        // up to `limits.min` is required (caller proved immutable, so the
+        // table can't grow beyond min).
+        let table_min = module.tables[table_index].limits.min;
+        if (precomputed.len() as u64) < table_min {
+            return false;
+        }
+        precomputed.iter().all(|f| !f.is_reserved_value())
+    }
+
+    /// Try to prove that the runtime signature check at a `call_indirect`
+    /// site through an untyped `funcref` table is redundant.
+    ///
+    /// True when:
+    ///
+    /// 1. The table is provably immutable (`tables_mutated[table_index] ==
+    ///    false`). Defined-not-imported is implied since imported tables
+    ///    are pre-marked as mutated.
+    ///
+    /// 2. The table has a non-empty precomputed image built from static
+    ///    `elem` segments.
+    ///
+    /// 3. Every non-null entry in `precomputed` has the same module-
+    ///    interned signature as the `ty_index` declared at the call site.
+    ///    Null slots are fine — they trap on the funcref-NULL load that
+    ///    happens after sig-check elision.
+    ///
+    /// When this returns true, the caller short-circuits to
+    /// `CheckIndirectCallTypeSignature::StaticMatch`, which removes the
+    /// sig load + compare from the hot path. Bounds-check on the table
+    /// index and the funcref-NULL check are still emitted by the
+    /// surrounding code, so the call still traps correctly on OOB or
+    /// null index — only the sig check is elided.
+    ///
+    /// This is the static analog of an inline-cache: instead of caching
+    /// the resolved target per call site, we observe at module-load that
+    /// the table contents make the sig check uninformative for the
+    /// lifetime of any instance.
+    fn try_elide_sig_check_for_immutable_table(
+        &self,
+        table_index: TableIndex,
+        ty_index: TypeIndex,
+    ) -> bool {
+        let translation = self.env.translation;
+        let module = &translation.module;
+
+        if translation.tables_mutated[table_index] {
+            return false;
+        }
+        let Some(defined_table) = module.defined_table_index(table_index) else {
+            return false;
+        };
+        let Some(precomputed) = module.table_initialization.get(defined_table) else {
+            return false;
+        };
+        // An empty image gives no evidence about the table's contents.
+        if precomputed.is_empty() {
+            return false;
+        }
+
+        let expected_ty = module.types[ty_index].unwrap_module_type_index();
+        precomputed
+            .iter()
+            // Null slots will trap on the funcref-NULL load anyway.
+            .filter(|func_idx| !func_idx.is_reserved_value())
+            .all(|&func_idx| {
+                module.functions[func_idx]
+                    .signature
+                    .unwrap_module_type_index()
+                    == expected_ty
+            })
+    }
+
fn check_and_load_code_and_callee_vmctx(
&mut self,
table_index: TableIndex,
@@ -2230,6 +2438,34 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
// table of typed functions and that type matches `ty_index`, then
// there's no need to perform a typecheck.
match table.ref_type.heap_type {
+ // Untyped `funcref` tables ordinarily need a runtime sig check.
+ // But if (a) the table is provably immutable (`tables_mutated`
+ // bit clear) and (b) every non-null entry in the precomputed
+ // static `elem` segments has the same `VMSharedTypeIndex` as
+ // the call site, then the runtime check is provably redundant
+ // and we can elide it the same way we do for typed-funcref
+ // tables.
+ //
+ // This is the AOT-IC-seeding analog: instead of caching the
+ // resolved target at the call site, we cache the *signature*
+ // at module-load time and skip the hot-path sig load+compare.
+ // Helps the megamorphic case (computed `call_indirect` index)
+ // that the static-monomorphization fast path above can't
+ // handle.
+ WasmHeapType::Func
+ if self.try_elide_sig_check_for_immutable_table(table_index, ty_index) =>
+ {
+ // If we additionally know every entry in the precomputed
+ // table is non-null, lower `may_be_null` to false so the
+ // downstream funcref-NULL check is also elided. This is
+ // only sound if the table can't be grown or have its
+ // entries cleared after init (i.e., immutable, which we
+ // already proved above).
+ let may_be_null = table.ref_type.nullable
+ && !self.precomputed_table_has_no_null_slots(table_index);
+ return CheckIndirectCallTypeSignature::StaticMatch { may_be_null };
+ }
+
// Functions do not have a statically known type in the table, a
// typecheck is required. Fall through to below to perform the
// actual typecheck.
diff --git a/crates/environ/src/compile/module_environ.rs b/crates/environ/src/compile/module_environ.rs
index 542181e55fd6..192c090feda2 100644
--- a/crates/environ/src/compile/module_environ.rs
+++ b/crates/environ/src/compile/module_environ.rs
@@ -76,6 +76,26 @@ pub struct ModuleTranslation<'data> {
/// trampolines for each of these signatures are required.
pub exported_signatures: Vec,
+ /// Per-table flag indicating whether the table is ever mutated by any
+ /// function defined in this module via `table.set` / `table.fill` /
+ /// `table.copy` (as the destination) / `table.grow` / `table.init`.
+ ///
+ /// `false` (the default) means the table's contents are determined
+ /// entirely by its `elem` segments and any active initializer, and never
+ /// change at runtime — provably immutable for the lifetime of any
+ /// instance of this module.
+ ///
+ /// `true` means the contents can change at runtime (or the table is
+ /// imported, in which case we conservatively assume the importer
+ /// mutates it).
+ ///
+ /// This is groundwork for later passes that turn `call_indirect`
+ /// through provably-immutable function tables into direct calls when
+ /// the dispatched-to slot is statically known. Set during module
+ /// translation (see `analyze_table_mutability`); read by Cranelift
+ /// lowering and by Pulley AOT IC seeding.
+ pub tables_mutated: SecondaryMap,
+
/// DWARF debug information, if enabled, parsed from the module.
pub debuginfo: DebugInfoData<'data>,
@@ -193,6 +213,7 @@ impl<'data> ModuleTranslation<'data> {
function_body_inputs: PrimaryMap::default(),
known_imported_functions: SecondaryMap::default(),
exported_signatures: Vec::default(),
+ tables_mutated: SecondaryMap::default(),
debuginfo: DebugInfoData::default(),
has_unparsed_debuginfo: false,
data_align: None,
@@ -315,6 +336,8 @@ impl<'a, 'data> ModuleEnvironment<'a, 'data> {
self.translate_payload(payload?)?;
}
+ analyze_table_mutability(&mut self.result)?;
+
Ok(self.result)
}
@@ -1548,3 +1571,85 @@ impl ModuleTranslation<'_> {
self.module.startup = ModuleStartup::IfMemoriesNeedInit(ty);
}
}
+
+/// Compute the per-table "may be mutated at runtime" flags stored in
+/// `translation.tables_mutated`.
+///
+/// A table is flagged when any of the following holds:
+///
+/// * it is imported — whoever supplies it can mutate it in ways this
+///   module can't see;
+/// * it is exported — the host, or another instance importing the export,
+///   can mutate it and those writes are not visible in this module's
+///   bytecode;
+/// * some defined function body names it as the destination of
+///   `table.set`, `table.fill`, `table.grow`, `table.copy`, or
+///   `table.init`.
+///
+/// Active `elem` segments applied at instantiation time are NOT counted as
+/// mutations — they are part of the table's *initial* state, not a runtime
+/// change. `elem.drop` is not counted either: it drops a passive segment
+/// without writing to any table. The source operand of `table.copy` is
+/// only read, so it is not flagged.
+///
+/// # Errors
+///
+/// Propagates any error produced while creating the operator reader for a
+/// function body or while decoding one of its operators.
+fn analyze_table_mutability<'data>(
+    translation: &mut ModuleTranslation<'data>,
+) -> Result<()> {
+    use wasmparser::Operator;
+
+    // Nothing to record for a module without tables. No explicit sizing of
+    // the map is needed otherwise: entries that are never written below
+    // read back as the map's default (`false`, "no mutation observed").
+    if translation.module.tables.is_empty() {
+        return Ok(());
+    }
+
+    // Imported tables occupy the first `num_imported_tables` indices. Mark
+    // them all up front: the importer can mutate them in ways this module
+    // can't see, so they must be treated as not stable across calls.
+    let num_imported = translation.module.num_imported_tables;
+    for table in translation.module.tables.keys().take(num_imported) {
+        translation.tables_mutated[table] = true;
+    }
+
+    // Exported tables get the same conservative treatment. A host (or
+    // another instance importing the export) can call `Table::set` /
+    // `Table::grow` via the public wasmtime API on any exported table.
+    for (_, entity_index) in &translation.module.exports {
+        if let EntityIndex::Table(table_index) = entity_index {
+            translation.tables_mutated[*table_index] = true;
+        }
+    }
+
+    // Walk every defined function body and look for table-mutation opcodes.
+    // The cost is O(total opcodes): one extra pass on top of the validator.
+    for (_, body_data) in &translation.function_body_inputs {
+        let mut reader = body_data.body.get_operators_reader()?;
+        while !reader.eof() {
+            // Every arm binds the table that is *written*. For `table.copy`
+            // that is `dst_table`; `src_table` is read-only and ignored.
+            let written = match reader.read()? {
+                Operator::TableSet { table }
+                | Operator::TableFill { table }
+                | Operator::TableGrow { table }
+                | Operator::TableInit { table, .. }
+                | Operator::TableCopy {
+                    dst_table: table, ..
+                } => table,
+                _ => continue,
+            };
+            translation.tables_mutated[TableIndex::from_u32(written)] = true;
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/environ/tests/table_mutability.rs b/crates/environ/tests/table_mutability.rs
new file mode 100644
index 000000000000..562966a708e4
--- /dev/null
+++ b/crates/environ/tests/table_mutability.rs
@@ -0,0 +1,307 @@
+//! Integration tests for `analyze_table_mutability` and the surrounding
+//! precompute ordering invariants.
+//!
+//! The per-table mutability bit is the foundation of the `call_indirect`
+//! optimizations in `crates/cranelift/src/func_environ.rs`
+//! (constant-index direct call, sig-check elision, NULL elision, bound-
+//! load elision). A false negative here — failing to mark a table as
+//! mutated when it actually is — would silently turn correct calls into
+//! incorrect direct calls or skip required runtime checks. A false
+//! positive — marking an immutable table as mutated — is merely a missed
+//! optimization. Pin the analysis behaviour with focused module-level
+//! tests so any regression surfaces immediately, not after a downstream
+//! optimization fires on a now-invalid premise.
+//!
+//! Test scenario inspiration drawn from comparable bugs in peer
+//! interpreters that have shipped fixes for analogous IC-invalidation
+//! mistakes:
+//!
+//! - **Luau** (`LOP_NAMECALL`): inline cache had to be invalidated on
+//! `table.insert` / metatable change. Analogous wasm risk: `table.grow`
+//! not invalidating an immutability proof; see `table_grow_marks_destination`.
+//! - **JavaScriptCore** (`ic_table`): inline-cache corruption from missed
+//! shape transitions. Analogous risk: over-marking, e.g. `table.copy`
+//! wrongly marking the SOURCE table as mutated would forbid downstream
+//! optimizations on a perfectly read-only table. See
+//! `table_copy_marks_destination_only_not_source`.
+//! - **Hermes** (`HiddenClass` cache): property cache misses with
+//! `Object.defineProperty`. Analogous risk: `table.init` (a runtime
+//! copy out of a passive segment) being treated as a no-op rather than
+//! a write. See `table_init_marks_destination`.
+//!
+//! Lives in `tests/` rather than as a `#[cfg(test)] mod` inside
+//! `module_environ.rs` because the latter triggers a pre-existing
+//! upstream compile failure in `key.rs` / `module_artifacts.rs` (their
+//! `arbitrary::Arbitrary` derives are stale relative to the workspace's
+//! pinned `arbitrary 1.4.2`). Integration tests build against the lib
+//! as a normal dependency and so do not set `cfg(test)` on
+//! `wasmtime-environ` itself.
+
+use wasmparser::{Parser, Validator, WasmFeatures};
+use wasmtime_environ::{
+ ModuleEnvironment, ModuleTypesBuilder, StaticModuleIndex, TableIndex, Tunables,
+};
+
+/// Translate `wat` and return the resulting `tables_mutated` bits, in
+/// table-index order (imported tables first, then defined ones). Helper
+/// to keep individual tests short.
+///
+/// # Panics
+///
+/// Panics if `wat` does not parse or if module translation fails; both
+/// indicate a broken test fixture rather than a condition under test.
+fn translate_and_get_mutability(wat: &str) -> Vec<bool> {
+    let bytes = wat::parse_str(wat).expect("WAT parse failed");
+    let tunables = Tunables::default_host();
+    // WASM2 covers reference-types + bulk-memory, which is what every
+    // table-mutating opcode below needs (`table.set`, `table.fill`,
+    // `table.grow`, `table.copy`, `table.init`, `elem.drop`).
+    let mut validator = Validator::new_with_features(WasmFeatures::WASM2);
+    let mut types = ModuleTypesBuilder::new(&validator);
+    let env = ModuleEnvironment::new(
+        &tunables,
+        &mut validator,
+        &mut types,
+        StaticModuleIndex::from_u32(0),
+    );
+    let translation = env
+        .translate(Parser::new(0), &bytes)
+        .expect("translate failed");
+    // Index by position rather than iterating the map so that tables whose
+    // flag was never written are still reported (as `false`).
+    let table_count: u32 = translation.module.tables.len().try_into().unwrap();
+    (0..table_count)
+        .map(|i| translation.tables_mutated[TableIndex::from_u32(i)])
+        .collect()
+}
+
+/// A table only used as the source of `call_indirect` and `table.get` is
+/// provably immutable. (Both ops READ the table; neither writes it.) The
+/// table is intentionally NOT exported — exported tables are
+/// conservatively pre-marked as mutated (see
+/// `exported_tables_are_pre_marked` for the export case) since the host
+/// can mutate them via the public wasmtime API.
+#[test]
+fn read_only_table_is_immutable() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func $f (result i32) i32.const 42)
+ (elem (i32.const 0) $f $f $f $f)
+ (func (export "call_zero") (result i32)
+ i32.const 0
+ call_indirect (param) (result i32))
+ (func (export "read_zero") (result funcref)
+ i32.const 0
+ table.get 0))
+ "#,
+ );
+ assert_eq!(bits, vec![false], "no opcode mutated this table");
+}
+
+/// Exported tables are always pre-marked as mutated, regardless of
+/// whether any opcode in this module touches them. The host can call
+/// `Table::set` / `Table::grow` via the public wasmtime API on any
+/// exported table, and another module that imports the export can also
+/// mutate it. Without this rule, downstream optimizations would
+/// happily elide null traps and sig checks on exported tables on the
+/// (false) assumption that the table contents are stable.
+#[test]
+fn exported_tables_are_pre_marked() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table (export "t") 4 funcref)
+ (func $f (result i32) i32.const 42)
+ (elem (i32.const 0) $f $f $f $f))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// `table.set` marks its destination as mutated.
+#[test]
+fn table_set_marks_destination() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func $f (result i32) i32.const 0)
+ (func (export "do_set")
+ i32.const 1
+ ref.func $f
+ table.set 0))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// `table.fill` marks its destination as mutated.
+#[test]
+fn table_fill_marks_destination() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func $f (result i32) i32.const 0)
+ (func (export "do_fill")
+ i32.const 0
+ ref.func $f
+ i32.const 4
+ table.fill 0))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// `table.grow` is treated as mutating — analogous to Luau's NAMECALL IC
+/// needing to invalidate on table-shape change.
+#[test]
+fn table_grow_marks_destination() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func (export "do_grow") (result i32)
+ ref.null func
+ i32.const 1
+ table.grow 0))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// `table.copy` marks the DESTINATION but explicitly NOT the source. The
+/// source is read-only (its contents aren't changed by the op); marking
+/// it as mutated would forbid downstream optimizations from treating it
+/// as immutable, which would be needless over-conservatism.
+///
+/// Neither table is exported. Exported tables are pre-marked as mutated,
+/// so an exported `$dst` would satisfy the assertion even if `table.copy`
+/// were ignored entirely; keeping it private makes the opcode the only
+/// possible reason for the `true` bit.
+#[test]
+fn table_copy_marks_destination_only_not_source() {
+    let bits = translate_and_get_mutability(
+        r#"
+        (module
+          (table $dst 4 funcref)
+          (table $src 4 funcref)
+          (func $f (result i32) i32.const 0)
+          (elem (table $src) (i32.const 0) func $f $f $f $f)
+          (func (export "do_copy")
+            i32.const 0 ;; dst offset
+            i32.const 0 ;; src offset
+            i32.const 4 ;; len
+            table.copy $dst $src))
+        "#,
+    );
+    assert_eq!(
+        bits,
+        vec![true, false],
+        "dst should be mutated, src should remain immutable",
+    );
+}
+
+/// `table.init` writes to the destination table from a passive elem
+/// segment, so it is treated as mutation (the destination's contents
+/// change at runtime).
+#[test]
+fn table_init_marks_destination() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func $f (result i32) i32.const 0)
+ (elem $e funcref (ref.func $f) (ref.func $f))
+ (func (export "do_init")
+ i32.const 0 ;; dst
+ i32.const 0 ;; src offset within elem
+ i32.const 2 ;; len
+ table.init 0 $e))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// `elem.drop` drops a passive element segment but does NOT write to any
+/// table — distinct from `table.init` which DOES write. A pessimistic
+/// implementation that marked all tables as mutated on `elem.drop` would
+/// hand out false positives and shut off optimizations on perfectly-
+/// immutable tables.
+#[test]
+fn elem_drop_does_not_mark_tables() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func $f (result i32) i32.const 0)
+ (elem $e funcref (ref.func $f))
+ (func (export "do_drop")
+ elem.drop $e))
+ "#,
+ );
+ assert_eq!(bits, vec![false]);
+}
+
+/// Imported tables are always pre-marked as mutated, regardless of
+/// whether any opcode in this module touches them. The importer can
+/// mutate the table in ways this module can't see.
+#[test]
+fn imported_tables_are_pre_marked() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (import "host" "t" (table 4 funcref)))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// A mutation in ONE function correctly marks the table — the analysis
+/// has to walk every function body, not just the first.
+#[test]
+fn mutation_in_any_function_counts() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table 4 funcref)
+ (func $f (result i32) i32.const 0)
+ (func (export "innocent") (result i32)
+ i32.const 0
+ call_indirect (param) (result i32))
+ (func (export "guilty")
+ i32.const 0
+ ref.func $f
+ table.set 0))
+ "#,
+ );
+ assert_eq!(bits, vec![true]);
+}
+
+/// Two tables, one mutated, one not. The analysis tracks per-table — a
+/// mutation on one must not leak to the other.
+#[test]
+fn mutation_isolated_to_target_table() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (table $a 4 funcref)
+ (table $b 4 funcref)
+ (func $f (result i32) i32.const 0)
+ (func (export "mut_a")
+ i32.const 0
+ ref.func $f
+ table.set $a))
+ "#,
+ );
+ assert_eq!(
+ bits,
+ vec![true, false],
+ "$a should be mutated, $b should remain immutable",
+ );
+}
+
+/// Translating without any tables at all must not panic. (Defensive: the
+/// analysis indexes a `SecondaryMap` keyed by `TableIndex`, and we want
+/// to confirm an empty module produces an empty result rather than e.g.
+/// a default-allocated single entry.)
+#[test]
+fn module_with_no_tables_produces_empty_mutability_vec() {
+ let bits = translate_and_get_mutability(
+ r#"
+ (module
+ (func (export "noop")))
+ "#,
+ );
+ assert!(bits.is_empty(), "no tables ⇒ no mutability bits");
+}
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 5b3f79445340..07c73584afd9 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -1425,6 +1425,83 @@ impl OpVisitor for Interpreter<'_> {
ControlFlow::Continue(())
}
+ /// Indirect call fused with one argument move: sets `x0` to the value of
+ /// `arg1`, stores the interpreter's current PC in `lr` as the return
+ /// address, and transfers control to the address held in `dst`.
+ fn call_indirect1(&mut self, dst: XReg, arg1: XReg) -> ControlFlow {
+ // Read arg1 before writing x0 so this is safe when `arg1 == x0`.
+ let arg1_val = self.state[arg1];
+ // The call target is also read before x0 is written, so `dst == x0`
+ // still jumps to the original target.
+ let target = self.state[dst].get_ptr();
+ let return_addr = self.pc.as_ptr();
+ self.state.lr = return_addr.as_ptr();
+ self.state[XReg::x0] = arg1_val;
+ // SAFETY: same as `call_indirect`.
+ unsafe {
+ self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target));
+ }
+ ControlFlow::Continue(())
+ }
+
+ /// Indirect call fused with two argument moves: sets `x0, x1` to the
+ /// values of `arg1, arg2`, stores the interpreter's current PC in `lr`,
+ /// and transfers control to the address held in `dst`.
+ fn call_indirect2(&mut self, dst: XReg, arg1: XReg, arg2: XReg) -> ControlFlow {
+ // Snapshot both arguments and the target before any of x0/x1 is
+ // written, so overlapping source and destination registers are safe.
+ let (a1, a2) = (self.state[arg1], self.state[arg2]);
+ let target = self.state[dst].get_ptr();
+ let return_addr = self.pc.as_ptr();
+ self.state.lr = return_addr.as_ptr();
+ self.state[XReg::x0] = a1;
+ self.state[XReg::x1] = a2;
+ // SAFETY: same as `call_indirect`.
+ unsafe {
+ self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target));
+ }
+ ControlFlow::Continue(())
+ }
+
+ /// Indirect call fused with three argument moves: sets `x0, x1, x2` to
+ /// the values of `arg1, arg2, arg3`, stores the interpreter's current PC
+ /// in `lr`, and transfers control to the address held in `dst`.
+ fn call_indirect3(
+ &mut self,
+ dst: XReg,
+ arg1: XReg,
+ arg2: XReg,
+ arg3: XReg,
+ ) -> ControlFlow {
+ // Snapshot all arguments and the target before any of x0..x2 is
+ // written, so overlapping source and destination registers are safe.
+ let (a1, a2, a3) = (self.state[arg1], self.state[arg2], self.state[arg3]);
+ let target = self.state[dst].get_ptr();
+ let return_addr = self.pc.as_ptr();
+ self.state.lr = return_addr.as_ptr();
+ self.state[XReg::x0] = a1;
+ self.state[XReg::x1] = a2;
+ self.state[XReg::x2] = a3;
+ // SAFETY: same as `call_indirect`.
+ unsafe {
+ self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target));
+ }
+ ControlFlow::Continue(())
+ }
+
+ /// Indirect call fused with four argument moves: sets `x0, x1, x2, x3`
+ /// to the values of `arg1, arg2, arg3, arg4`, stores the interpreter's
+ /// current PC in `lr`, and transfers control to the address held in
+ /// `dst`.
+ fn call_indirect4(
+ &mut self,
+ dst: XReg,
+ arg1: XReg,
+ arg2: XReg,
+ arg3: XReg,
+ arg4: XReg,
+ ) -> ControlFlow {
+ // Snapshot all arguments and the target before any of x0..x3 is
+ // written, so overlapping source and destination registers are safe.
+ let (a1, a2, a3, a4) = (
+ self.state[arg1],
+ self.state[arg2],
+ self.state[arg3],
+ self.state[arg4],
+ );
+ let target = self.state[dst].get_ptr();
+ let return_addr = self.pc.as_ptr();
+ self.state.lr = return_addr.as_ptr();
+ self.state[XReg::x0] = a1;
+ self.state[XReg::x1] = a2;
+ self.state[XReg::x2] = a3;
+ self.state[XReg::x3] = a4;
+ // SAFETY: same as `call_indirect`.
+ unsafe {
+ self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target));
+ }
+ ControlFlow::Continue(())
+ }
+
fn jump(&mut self, offset: PcRelOffset) -> ControlFlow {
self.pc_rel_jump::(offset)
}
@@ -2296,6 +2373,340 @@ impl OpVisitor for Interpreter<'_> {
ControlFlow::Continue(())
}
+ /// Fused 32-bit AND-with-immediate and conditional branch: writes
+ /// `low32(src) & sign_extend(mask)` to `dst` unconditionally, then
+ /// branches by `offset` if `low32(src)` is non-zero.
+ fn xband32_s8_br_if_x32(
+ &mut self,
+ dst: XReg,
+ src: XReg,
+ mask: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // `s` is captured before `dst` is written, so `dst == src` still
+ // branches on the original, unmasked value.
+ let s = self.state[src].get_i32();
+ self.state[dst].set_i32(s & i32::from(mask));
+ if s != 0 {
+ self.pc_rel_jump::(offset)
+ } else {
+ ControlFlow::Continue(())
+ }
+ }
+
+ /// Inverted form of `xband32_s8_br_if_x32`: writes
+ /// `low32(src) & sign_extend(mask)` to `dst` unconditionally, then
+ /// branches by `offset` if `low32(src)` is zero.
+ fn xband32_s8_br_if_not_x32(
+ &mut self,
+ dst: XReg,
+ src: XReg,
+ mask: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // `s` is captured before `dst` is written, so `dst == src` still
+ // branches on the original, unmasked value.
+ let s = self.state[src].get_i32();
+ self.state[dst].set_i32(s & i32::from(mask));
+ if s == 0 {
+ self.pc_rel_jump::(offset)
+ } else {
+ ControlFlow::Continue(())
+ }
+ }
+
+ /// 64-bit form of `xband32_s8_br_if_x32`: writes
+ /// `src & sign_extend(mask)` to `dst` unconditionally, then branches by
+ /// `offset` if the full 64-bit `src` is non-zero.
+ fn xband64_s8_br_if_x64(
+ &mut self,
+ dst: XReg,
+ src: XReg,
+ mask: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // `s` is captured before `dst` is written, so `dst == src` still
+ // branches on the original, unmasked value.
+ let s = self.state[src].get_i64();
+ self.state[dst].set_i64(s & i64::from(mask));
+ if s != 0 {
+ self.pc_rel_jump::(offset)
+ } else {
+ ControlFlow::Continue(())
+ }
+ }
+
+ /// Inverted form of `xband64_s8_br_if_x64`: writes
+ /// `src & sign_extend(mask)` to `dst` unconditionally, then branches by
+ /// `offset` if the full 64-bit `src` is zero.
+ fn xband64_s8_br_if_not_x64(
+ &mut self,
+ dst: XReg,
+ src: XReg,
+ mask: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // `s` is captured before `dst` is written, so `dst == src` still
+ // branches on the original, unmasked value.
+ let s = self.state[src].get_i64();
+ self.state[dst].set_i64(s & i64::from(mask));
+ if s == 0 {
+ self.pc_rel_jump::(offset)
+ } else {
+ ControlFlow::Continue(())
+ }
+ }
+
+ /// Funcref-dispatch fusion, 64-bit forward form. Traps if `src` is
+ /// zero; otherwise loads two 64-bit fields from `src + offset_code` and
+ /// `src + offset_vmctx` into `dst_code` / `dst_vmctx` and branches by
+ /// `offset`.
+ fn xfuncref_dispatch_x64(
+ &mut self,
+ dst_code: XReg,
+ dst_vmctx: XReg,
+ src: XReg,
+ offset_code: i8,
+ offset_vmctx: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // `src` is the already-masked funcref. The null side traps: the
+ // fusion absorbed the continuation-block loads, so the lazy-init
+ // slow path's rejoin would see uninitialized dst_code/dst_vmctx.
+ // Gated on `is_eagerly_initialized_funcref_table`, so trapping
+ // here is unreachable in correct code.
+ let s = self.state[src].get_u64();
+ if s == 0 {
+ self.done_trap::()
+ } else {
+ // SAFETY: predicate guarantees `src` points to a real VMFuncRef.
+ let base = s as *const u8;
+ unsafe {
+ // Unaligned reads: no alignment is assumed for the two fields.
+ let code = base
+ .byte_offset(offset_code as isize)
+ .cast::()
+ .read_unaligned();
+ let vmctx = base
+ .byte_offset(offset_vmctx as isize)
+ .cast::()
+ .read_unaligned();
+ self.state[dst_code].set_i64(code);
+ self.state[dst_vmctx].set_i64(vmctx);
+ }
+ self.pc_rel_jump::(offset)
+ }
+ }
+
+ /// Funcref-dispatch fusion, 64-bit inverted form. Traps if `src` is
+ /// zero; otherwise loads two 64-bit fields from `src + offset_code` and
+ /// `src + offset_vmctx` into `dst_code` / `dst_vmctx` and falls through
+ /// to the next instruction.
+ fn xfuncref_dispatch_not_x64(
+ &mut self,
+ dst_code: XReg,
+ dst_vmctx: XReg,
+ src: XReg,
+ offset_code: i8,
+ offset_vmctx: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // Inverted form: fast path falls through; null side traps.
+ // `offset` is unused (kept for encoding shape parity).
+ let _ = offset;
+ let s = self.state[src].get_u64();
+ if s == 0 {
+ self.done_trap::()
+ } else {
+ let base = s as *const u8;
+ // SAFETY: same as `xfuncref_dispatch_x64` — `base` is non-null
+ // here, and the emitter's gating predicate is relied upon for it
+ // to address a real funcref with readable fields at both offsets.
+ unsafe {
+ let code = base
+ .byte_offset(offset_code as isize)
+ .cast::()
+ .read_unaligned();
+ let vmctx = base
+ .byte_offset(offset_vmctx as isize)
+ .cast::()
+ .read_unaligned();
+ self.state[dst_code].set_i64(code);
+ self.state[dst_vmctx].set_i64(vmctx);
+ }
+ ControlFlow::Continue(())
+ }
+ }
+
+ /// Funcref-dispatch fusion, 32-bit forward form. Traps if the low 32
+ /// bits of `src` are zero; otherwise treats them as an address, loads
+ /// two 32-bit fields from `offset_code` / `offset_vmctx` into
+ /// `dst_code` / `dst_vmctx`, and branches by `offset`.
+ fn xfuncref_dispatch_x32(
+ &mut self,
+ dst_code: XReg,
+ dst_vmctx: XReg,
+ src: XReg,
+ offset_code: i8,
+ offset_vmctx: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // As in `xfuncref_dispatch_x64`, `src` is tested as-is and the zero
+ // (null) side traps instead of taking a slow path.
+ let s = self.state[src].get_u32();
+ if s == 0 {
+ self.done_trap::()
+ } else {
+ // The 32-bit value is zero-extended to form the host address.
+ let base = s as usize as *const u8;
+ // SAFETY: same as `xfuncref_dispatch_x64` — `base` is non-null
+ // here, and the emitter's gating predicate is relied upon for it
+ // to address a real funcref with readable fields at both offsets.
+ unsafe {
+ let code = base
+ .byte_offset(offset_code as isize)
+ .cast::()
+ .read_unaligned();
+ let vmctx = base
+ .byte_offset(offset_vmctx as isize)
+ .cast::()
+ .read_unaligned();
+ self.state[dst_code].set_i32(code);
+ self.state[dst_vmctx].set_i32(vmctx);
+ }
+ self.pc_rel_jump::(offset)
+ }
+ }
+
+ /// Funcref-dispatch fusion, 32-bit inverted form. Traps if the low 32
+ /// bits of `src` are zero; otherwise treats them as an address, loads
+ /// two 32-bit fields from `offset_code` / `offset_vmctx` into
+ /// `dst_code` / `dst_vmctx`, and falls through to the next instruction.
+ fn xfuncref_dispatch_not_x32(
+ &mut self,
+ dst_code: XReg,
+ dst_vmctx: XReg,
+ src: XReg,
+ offset_code: i8,
+ offset_vmctx: i8,
+ offset: PcRelOffset,
+ ) -> ControlFlow {
+ // `offset` is never read in this form: neither side branches.
+ let _ = offset;
+ let s = self.state[src].get_u32();
+ if s == 0 {
+ self.done_trap::()
+ } else {
+ // The 32-bit value is zero-extended to form the host address.
+ let base = s as usize as *const u8;
+ // SAFETY: same as `xfuncref_dispatch_x64` — `base` is non-null
+ // here, and the emitter's gating predicate is relied upon for it
+ // to address a real funcref with readable fields at both offsets.
+ unsafe {
+ let code = base
+ .byte_offset(offset_code as isize)
+ .cast::()
+ .read_unaligned();
+ let vmctx = base
+ .byte_offset(offset_vmctx as isize)
+ .cast::()
+ .read_unaligned();
+ self.state[dst_code].set_i32(code);
+ self.state[dst_vmctx].set_i32(vmctx);
+ }
+ ControlFlow::Continue(())
+ }
+ }
+
+    /// Fused `xband64_s8 dst_masked, src, -2` + `xfuncref_dispatch_x64`
+    /// (64-bit forward form).
+    ///
+    /// `src` is the unmasked funcref. `dst_masked = src & -2` is written
+    /// unconditionally so the brif's block-call-arg copy still finds a
+    /// producer. If the masked pointer is non-null, `dst_code` and
+    /// `dst_vmctx` are loaded from it at `offset_code` / `offset_vmctx` and
+    /// control branches by `offset`. A null masked pointer traps, for the
+    /// same reason as in `xfuncref_dispatch_x64`: the fusion absorbed the
+    /// continuation-block loads, so there is no safe slow path to rejoin.
+    fn xband_funcref_dispatch_x64(
+        &mut self,
+        dst_masked: XReg,
+        dst_code: XReg,
+        dst_vmctx: XReg,
+        src: XReg,
+        offset_code: i8,
+        offset_vmctx: i8,
+        offset: PcRelOffset,
+    ) -> ControlFlow<Done> {
+        let raw = self.state[src].get_u64();
+        let masked = raw & !1u64;
+        self.state[dst_masked].set_u64(masked);
+
+        // Test the *masked* pointer rather than the raw value. A raw value
+        // of `1` (tag bit only) is non-zero but masks to a null base, which
+        // must trap instead of being dereferenced. This is also what the
+        // unfused `xband64_s8` + `xfuncref_dispatch_x64` pair would test.
+        if masked == 0 {
+            return self.done_trap::<crate::XbandFuncrefDispatchX64>();
+        }
+
+        let base = masked as *const u8;
+        // SAFETY: `base` is non-null (checked above). Beyond that, this
+        // relies on the emitter only producing this op where a non-null
+        // masked value addresses a real funcref with readable fields at
+        // both offsets (the `is_eagerly_initialized_funcref_table` gate).
+        let (code, vmctx) = unsafe {
+            (
+                base.byte_offset(isize::from(offset_code))
+                    .cast::<i64>()
+                    .read_unaligned(),
+                base.byte_offset(isize::from(offset_vmctx))
+                    .cast::<i64>()
+                    .read_unaligned(),
+            )
+        };
+        self.state[dst_code].set_i64(code);
+        self.state[dst_vmctx].set_i64(vmctx);
+        self.pc_rel_jump::<crate::XbandFuncrefDispatchX64>(offset)
+    }
+
+    /// Inverted form of `xband_funcref_dispatch_x64` (64-bit).
+    ///
+    /// `dst_masked = src & -2` is written unconditionally. If the masked
+    /// pointer is non-null, `dst_code` and `dst_vmctx` are loaded from it
+    /// at `offset_code` / `offset_vmctx` and execution falls through to
+    /// the next instruction. A null masked pointer traps. `offset` is
+    /// vestigial: neither side branches.
+    fn xband_funcref_dispatch_not_x64(
+        &mut self,
+        dst_masked: XReg,
+        dst_code: XReg,
+        dst_vmctx: XReg,
+        src: XReg,
+        offset_code: i8,
+        offset_vmctx: i8,
+        offset: PcRelOffset,
+    ) -> ControlFlow<Done> {
+        let _ = offset;
+        let raw = self.state[src].get_u64();
+        let masked = raw & !1u64;
+        self.state[dst_masked].set_u64(masked);
+
+        // Test the *masked* pointer rather than the raw value. A raw value
+        // of `1` (tag bit only) is non-zero but masks to a null base, which
+        // must trap instead of being dereferenced.
+        if masked == 0 {
+            return self.done_trap::<crate::XbandFuncrefDispatchNotX64>();
+        }
+
+        let base = masked as *const u8;
+        // SAFETY: `base` is non-null (checked above). Beyond that, this
+        // relies on the emitter only producing this op where a non-null
+        // masked value addresses a real funcref with readable fields at
+        // both offsets (the `is_eagerly_initialized_funcref_table` gate).
+        let (code, vmctx) = unsafe {
+            (
+                base.byte_offset(isize::from(offset_code))
+                    .cast::<i64>()
+                    .read_unaligned(),
+                base.byte_offset(isize::from(offset_vmctx))
+                    .cast::<i64>()
+                    .read_unaligned(),
+            )
+        };
+        self.state[dst_code].set_i64(code);
+        self.state[dst_vmctx].set_i64(vmctx);
+        ControlFlow::Continue(())
+    }
+
+    /// 32-bit pointer-width form of `xband_funcref_dispatch_x64`.
+    ///
+    /// `low32(dst_masked) = low32(src) & -2` is written unconditionally.
+    /// If the masked pointer is non-null, two 32-bit fields are loaded
+    /// from it at `offset_code` / `offset_vmctx` into `dst_code` /
+    /// `dst_vmctx` and control branches by `offset`. A null masked pointer
+    /// traps.
+    fn xband_funcref_dispatch_x32(
+        &mut self,
+        dst_masked: XReg,
+        dst_code: XReg,
+        dst_vmctx: XReg,
+        src: XReg,
+        offset_code: i8,
+        offset_vmctx: i8,
+        offset: PcRelOffset,
+    ) -> ControlFlow<Done> {
+        let raw = self.state[src].get_u32();
+        let masked = raw & !1u32;
+        self.state[dst_masked].set_u32(masked);
+
+        // Test the *masked* pointer rather than the raw value. A raw value
+        // of `1` (tag bit only) is non-zero but masks to a null base, which
+        // must trap instead of being dereferenced.
+        if masked == 0 {
+            return self.done_trap::<crate::XbandFuncrefDispatchX32>();
+        }
+
+        // The 32-bit value is zero-extended to form the host address.
+        let base = masked as usize as *const u8;
+        // SAFETY: `base` is non-null (checked above). Beyond that, this
+        // relies on the emitter only producing this op where a non-null
+        // masked value addresses a real funcref with readable fields at
+        // both offsets (the `is_eagerly_initialized_funcref_table` gate).
+        let (code, vmctx) = unsafe {
+            (
+                base.byte_offset(isize::from(offset_code))
+                    .cast::<i32>()
+                    .read_unaligned(),
+                base.byte_offset(isize::from(offset_vmctx))
+                    .cast::<i32>()
+                    .read_unaligned(),
+            )
+        };
+        self.state[dst_code].set_i32(code);
+        self.state[dst_vmctx].set_i32(vmctx);
+        self.pc_rel_jump::<crate::XbandFuncrefDispatchX32>(offset)
+    }
+
+    /// Inverted form of `xband_funcref_dispatch_x32` (32-bit pointer
+    /// width).
+    ///
+    /// `low32(dst_masked) = low32(src) & -2` is written unconditionally.
+    /// If the masked pointer is non-null, two 32-bit fields are loaded
+    /// from it at `offset_code` / `offset_vmctx` into `dst_code` /
+    /// `dst_vmctx` and execution falls through to the next instruction. A
+    /// null masked pointer traps. `offset` is vestigial: neither side
+    /// branches.
+    fn xband_funcref_dispatch_not_x32(
+        &mut self,
+        dst_masked: XReg,
+        dst_code: XReg,
+        dst_vmctx: XReg,
+        src: XReg,
+        offset_code: i8,
+        offset_vmctx: i8,
+        offset: PcRelOffset,
+    ) -> ControlFlow<Done> {
+        let _ = offset;
+        let raw = self.state[src].get_u32();
+        let masked = raw & !1u32;
+        self.state[dst_masked].set_u32(masked);
+
+        // Test the *masked* pointer rather than the raw value. A raw value
+        // of `1` (tag bit only) is non-zero but masks to a null base, which
+        // must trap instead of being dereferenced.
+        if masked == 0 {
+            return self.done_trap::<crate::XbandFuncrefDispatchNotX32>();
+        }
+
+        // The 32-bit value is zero-extended to form the host address.
+        let base = masked as usize as *const u8;
+        // SAFETY: `base` is non-null (checked above). Beyond that, this
+        // relies on the emitter only producing this op where a non-null
+        // masked value addresses a real funcref with readable fields at
+        // both offsets (the `is_eagerly_initialized_funcref_table` gate).
+        let (code, vmctx) = unsafe {
+            (
+                base.byte_offset(isize::from(offset_code))
+                    .cast::<i32>()
+                    .read_unaligned(),
+                base.byte_offset(isize::from(offset_vmctx))
+                    .cast::<i32>()
+                    .read_unaligned(),
+            )
+        };
+        self.state[dst_code].set_i32(code);
+        self.state[dst_vmctx].set_i32(vmctx);
+        ControlFlow::Continue(())
+    }
+
fn xbor32(&mut self, operands: BinaryOperands) -> ControlFlow {
let a = self.state[operands.src1].get_u32();
let b = self.state[operands.src2].get_u32();
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index 36a09cb13a34..de2210d7e4b0 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -115,6 +115,14 @@ macro_rules! for_each_op {
/// Transfer control to the PC in `reg` and set `lr` to the PC just
/// after this instruction.
call_indirect = CallIndirect { reg: XReg };
+ /// Like `call_indirect`, but also `x0 = arg1`.
+ call_indirect1 = CallIndirect1 { reg: XReg, arg1: XReg };
+ /// Like `call_indirect`, but also `x0, x1 = arg1, arg2`.
+ call_indirect2 = CallIndirect2 { reg: XReg, arg1: XReg, arg2: XReg };
+ /// Like `call_indirect`, but also `x0, x1, x2 = arg1, arg2, arg3`.
+ call_indirect3 = CallIndirect3 { reg: XReg, arg1: XReg, arg2: XReg, arg3: XReg };
+ /// Like `call_indirect`, but also `x0, x1, x2, x3 = arg1, arg2, arg3, arg4`.
+ call_indirect4 = CallIndirect4 { reg: XReg, arg1: XReg, arg2: XReg, arg3: XReg, arg4: XReg };
/// Unconditionally transfer control to the PC at the given offset.
jump = Jump { offset: PcRelOffset };
@@ -562,6 +570,58 @@ macro_rules! for_each_op {
xband64_s8 = Xband64S8 { dst: XReg, src1: XReg, src2: i8 };
/// Same as `xband64` but `src2` is a sign-extended 32-bit immediate.
xband64_s32 = Xband64S32 { dst: XReg, src1: XReg, src2: i32 };
+
+ /// `low32(dst) = low32(src) & sign_extend(mask)`, then branch by
+ /// `offset` if `low32(src)` is non-zero. Fused `xband32_s8 +
+ /// br_if32` for the call_indirect lazy-init brif site.
+ xband32_s8_br_if_x32 = Xband32S8BrIfX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset };
+ /// Inverted form of `xband32_s8_br_if_x32`: branch if `low32(src)`
+ /// is zero. Mask + dst write are unconditional.
+ xband32_s8_br_if_not_x32 = Xband32S8BrIfNotX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset };
+ /// 64-bit form of `xband32_s8_br_if_x32`.
+ xband64_s8_br_if_x64 = Xband64S8BrIfX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset };
+ /// Inverted form of `xband64_s8_br_if_x64`: branch if `src` is zero.
+ xband64_s8_br_if_not_x64 = Xband64S8BrIfNotX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset };
+
+ /// Funcref-dispatch fusion (64-bit). If `src != 0`, load
+ /// `dst_code = [src + offset_code]`, `dst_vmctx = [src +
+ /// offset_vmctx]`, and branch by `offset`. `src` is the
+ /// already-masked funcref pointer.
+ ///
+ /// The null side traps. The fusion absorbs the two field loads
+ /// from the brif's continuation block; if execution reached the
+ /// original lazy-init slow path, it would rejoin that
+ /// continuation with `dst_code`/`dst_vmctx` uninitialized, so
+ /// the null path can no longer fall through safely. Gated on
+ /// `is_eagerly_initialized_funcref_table`, which guarantees the
+ /// null path is unreachable at runtime.
+ xfuncref_dispatch_x64 = XfuncrefDispatchX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+ /// Inverted form of `xfuncref_dispatch_x64`: fast path falls
+ /// through; null path traps. `offset` is vestigial (kept for
+ /// shape parity with the forward variant).
+ xfuncref_dispatch_not_x64 = XfuncrefDispatchNotX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+ /// 32-bit pointer-width form of `xfuncref_dispatch_x64`.
+ xfuncref_dispatch_x32 = XfuncrefDispatchX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+ /// Inverted form of `xfuncref_dispatch_x32`.
+ xfuncref_dispatch_not_x32 = XfuncrefDispatchNotX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+
+ /// Combines `xband64_s8 dst_masked, src, -2` with
+ /// `xfuncref_dispatch_*_x64` into one op. `src` is the unmasked
+ /// funcref; the init-bit strip is internal.
+ ///
+ /// `dst_masked = src & -2` unconditionally. If `src != 0`, do
+ /// the two loads and branch by `offset`. Null side traps (same
+ /// rationale as `xfuncref_dispatch_*`).
+ xband_funcref_dispatch_x64 = XbandFuncrefDispatchX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+ /// Inverted form of `xband_funcref_dispatch_x64`: fast path
+ /// falls through; null path traps. `dst_masked` is still
+ /// written unconditionally. `offset` is vestigial.
+ xband_funcref_dispatch_not_x64 = XbandFuncrefDispatchNotX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+ /// 32-bit pointer-width form of `xband_funcref_dispatch_x64`.
+ xband_funcref_dispatch_x32 = XbandFuncrefDispatchX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+ /// Inverted form of `xband_funcref_dispatch_x32`.
+ xband_funcref_dispatch_not_x32 = XbandFuncrefDispatchNotX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset };
+
/// `low32(dst) = low32(src1) | low32(src2)`
xbor32 = XBor32 { operands: BinaryOperands };
/// Same as `xbor64` but `src2` is a sign-extended 8-bit immediate.
diff --git a/tests/all/pulley.rs b/tests/all/pulley.rs
index d4cd458f915d..7af28655567f 100644
--- a/tests/all/pulley.rs
+++ b/tests/all/pulley.rs
@@ -515,3 +515,282 @@ fn decode_unaligned() -> Result<()> {
Ok(())
}
+
+// Runtime-semantics tests for the call_indirect fusion stack
+// (`tests/disas/pulley-fusion-*.wat` covers the static disasm side).
+// Non-trapping calls run under both Pulley and native Cranelift and
+// must agree; trapping calls are checked under Pulley only.
+
+/// Builds the Pulley `Config` shared by the trap-exercising tests. Signal-based
+/// traps are switched off because the interpreter can't catch signals.
+fn pulley_trap_safe_config() -> Config {
+ let mut explicit_traps = pulley_config();
+ explicit_traps.signals_based_traps(false);
+ explicit_traps
+}
+
+/// Runs the export `func_name` of `wat` with `params` under both Pulley
+/// and the default native config, asserts that the two results are
+/// equal, and returns the (Pulley) value. Any compile, instantiate, or
+/// call error — including a wasm trap — is propagated with `?`.
+fn pulley_and_native_agree<Params, Results>(
+ wat: &str,
+ func_name: &str,
+ params: Params,
+) -> Result<Results>
+where
+ Params: wasmtime::WasmParams + Copy,
+ Results: wasmtime::WasmResults + std::fmt::Debug + PartialEq,
+{
+ let bytes = wat::parse_str(wat)?;
+ // Compile, instantiate, and call once under the given engine config.
+ let run = |config: &Config| -> Result<Results> {
+ let engine = Engine::new(config)?;
+ let module = Module::new(&engine, &bytes)?;
+ let mut store = Store::new(&engine, ());
+ let inst = Instance::new(&mut store, &module, &[])?;
+ let f = inst.get_typed_func::<Params, Results>(&mut store, func_name)?;
+ Ok(f.call(&mut store, params)?)
+ };
+ // `Params: Copy` lets the same arguments be passed to both runs.
+ let pulley = run(&pulley_trap_safe_config())?;
+ let native = run(&Config::new())?;
+ assert_eq!(
+ pulley, native,
+ "Pulley and native diverged for `{func_name}` — fusion lowering bug?"
+ );
+ Ok(pulley)
+}
+
+/// Fusion returns the right callee for every in-bounds index; index 3
+/// (one past the 3-element table) traps `TableOutOfBounds` on Pulley.
+#[test]
+fn fusion_call_indirect_every_index() -> Result<()> {
+ let wat = r#"
+ (module
+ (table 3 3 funcref)
+ (func $f0 (result i32) i32.const 100)
+ (func $f1 (result i32) i32.const 101)
+ (func $f2 (result i32) i32.const 102)
+ (func (export "call") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+ (elem (i32.const 0) func $f0 $f1 $f2))
+ "#;
+ for (idx, expected) in [(0_i32, 100_i32), (1, 101), (2, 102)] {
+ let got: i32 = pulley_and_native_agree(wat, "call", idx)?;
+ assert_eq!(got, expected, "idx {idx}");
+ }
+ // Pulley only — native signal-based traps interact badly with
+ // `cargo test`'s debug-mode signal handlers.
+ let bytes = wat::parse_str(wat)?;
+ let engine = Engine::new(&pulley_trap_safe_config())?;
+ let module = Module::new(&engine, &bytes)?;
+ let mut store = Store::new(&engine, ());
+ let inst = Instance::new(&mut store, &module, &[])?;
+ let f = inst.get_typed_func::<i32, i32>(&mut store, "call")?;
+ let err = f.call(&mut store, 3).unwrap_err();
+ let trap = err.downcast_ref::<Trap>().expect("Trap");
+ assert_eq!(*trap, Trap::TableOutOfBounds);
+ Ok(())
+}
+
+/// Two `call_indirect` sites in one function; each must fuse independently.
+/// Checks the sum of both callees' results, on Pulley and on native.
+#[test]
+fn fusion_call_indirect_multi_site() -> Result<()> {
+ let wat = r#"
+ (module
+ (table 3 3 funcref)
+ (func $f0 (result i32) i32.const 10)
+ (func $f1 (result i32) i32.const 20)
+ (func $f2 (result i32) i32.const 30)
+ (func (export "sum") (param i32 i32) (result i32)
+ local.get 0 call_indirect (result i32)
+ local.get 1 call_indirect (result i32)
+ i32.add)
+ (elem (i32.const 0) func $f0 $f1 $f2))
+ "#;
+ for (a, b, expected) in [(0_i32, 1_i32, 30_i32), (1, 2, 50), (2, 0, 40), (1, 1, 40)] {
+ let got: i32 = pulley_and_native_agree(wat, "sum", (a, b))?;
+ assert_eq!(got, expected, "a={a} b={b}");
+ }
+ Ok(())
+}
+
+/// `return_call_indirect` must reach the right callee with fusion applied; Pulley and native must agree.
+#[test]
+fn fusion_return_call_indirect() -> Result<()> {
+ let wat = r#"
+ (module
+ (table 2 2 funcref)
+ (type $sig (func (result i32)))
+ (func $f0 (result i32) i32.const 7)
+ (func $f1 (result i32) i32.const 11)
+ (func (export "tail") (param i32) (result i32)
+ local.get 0
+ return_call_indirect (type $sig))
+ (elem (i32.const 0) func $f0 $f1))
+ "#;
+ for (idx, expected) in [(0_i32, 7_i32), (1, 11)] {
+ let got: i32 = pulley_and_native_agree(wat, "tail", idx)?;
+ assert_eq!(got, expected, "idx {idx}");
+ }
+ Ok(())
+}
+
+/// Host mutates a slot to `ref.null func`; call_indirect on that slot
+/// must trap `IndirectCallToNull` while the untouched slot still works.
+#[test]
+fn fusion_call_indirect_with_host_null_set() -> Result<()> {
+ let wat = r#"
+ (module
+ (table (export "t") 2 2 funcref)
+ (func $f0 (result i32) i32.const 100)
+ (func (export "call") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+ (elem (i32.const 0) func $f0 $f0))
+ "#;
+ let bytes = wat::parse_str(wat)?;
+
+ // Pulley only (see note in `fusion_call_indirect_every_index`).
+ let engine = Engine::new(&pulley_trap_safe_config())?;
+ let module = Module::new(&engine, &bytes)?;
+ let mut store = Store::new(&engine, ());
+ let inst = Instance::new(&mut store, &module, &[])?;
+ let call = inst.get_typed_func::<i32, i32>(&mut store, "call")?;
+ assert_eq!(call.call(&mut store, 0)?, 100);
+ assert_eq!(call.call(&mut store, 1)?, 100);
+
+ let table = inst.get_table(&mut store, "t").expect("table export");
+ table.set(&mut store, 1, wasmtime::Ref::Func(None))?;
+
+ assert_eq!(call.call(&mut store, 0)?, 100);
+ let err = call.call(&mut store, 1).unwrap_err();
+ let trap = err.downcast_ref::<Trap>().expect("Trap");
+ assert_eq!(*trap, Trap::IndirectCallToNull);
+ Ok(())
+}
+
+/// Host `Table::set` swaps to a different funcref between calls; the
+/// second call must observe the new target (checked on Pulley and native).
+#[test]
+fn fusion_call_indirect_with_host_swap() -> Result<()> {
+ let wat = r#"
+ (module
+ (table (export "t") 1 1 funcref)
+ (func $f0 (result i32) i32.const 100)
+ (func $f1 (result i32) i32.const 200)
+ (func (export "f1_ref") (result funcref) ref.func $f1)
+ (func (export "call") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+ (elem declare func $f1)
+ (elem (i32.const 0) func $f0))
+ "#;
+ let bytes = wat::parse_str(wat)?;
+
+ for use_pulley in [true, false] {
+ let cfg = if use_pulley {
+ pulley_trap_safe_config()
+ } else {
+ Config::new()
+ };
+ let engine = Engine::new(&cfg)?;
+ let module = Module::new(&engine, &bytes)?;
+ let mut store = Store::new(&engine, ());
+ let inst = Instance::new(&mut store, &module, &[])?;
+ let call = inst.get_typed_func::<i32, i32>(&mut store, "call")?;
+ assert_eq!(call.call(&mut store, 0)?, 100);
+
+ let f1_ref = inst
+ .get_typed_func::<(), Option<wasmtime::Func>>(&mut store, "f1_ref")?
+ .call(&mut store, ())?
+ .expect("f1_ref returned None");
+ let table = inst.get_table(&mut store, "t").expect("table export");
+ table.set(&mut store, 0, wasmtime::Ref::Func(Some(f1_ref)))?;
+
+ assert_eq!(call.call(&mut store, 0)?, 200, "use_pulley={use_pulley}");
+ }
+ Ok(())
+}
+
+/// Module B imports module A's table and calls into it. Tables are
+/// imported, so the importer's `tables_mutated` is `true` and no
+/// fusion fires on B's side; the call must still produce the right
+/// result.
+#[test]
+fn fusion_call_indirect_imported_table() -> Result<()> {
+ let wat_a = r#"
+ (module
+ (table (export "t") 2 2 funcref)
+ (func $f0 (result i32) i32.const 42)
+ (func $f1 (result i32) i32.const 84)
+ (elem (i32.const 0) func $f0 $f1))
+ "#;
+ let wat_b = r#"
+ (module
+ (import "a" "t" (table 2 2 funcref))
+ (func (export "call") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32)))
+ "#;
+ let bytes_a = wat::parse_str(wat_a)?;
+ let bytes_b = wat::parse_str(wat_b)?;
+
+ for use_pulley in [true, false] {
+ let cfg = if use_pulley {
+ pulley_trap_safe_config()
+ } else {
+ Config::new()
+ };
+ let engine = Engine::new(&cfg)?;
+ let module_a = Module::new(&engine, &bytes_a)?;
+ let module_b = Module::new(&engine, &bytes_b)?;
+ let mut store = Store::new(&engine, ());
+ let inst_a = Instance::new(&mut store, &module_a, &[])?;
+ let table_export = inst_a.get_export(&mut store, "t").expect("a.t");
+
+ let mut linker = wasmtime::Linker::new(&engine);
+ linker.define(&store, "a", "t", table_export)?;
+ let inst_b = linker.instantiate(&mut store, &module_b)?;
+
+ let call = inst_b.get_typed_func::<i32, i32>(&mut store, "call")?;
+ for (idx, expected) in [(0_i32, 42_i32), (1, 84)] {
+ assert_eq!(
+ call.call(&mut store, idx)?,
+ expected,
+ "use_pulley={use_pulley} idx={idx}"
+ );
+ }
+ }
+ Ok(())
+}
+
+/// Single `call_indirect` into a slot no elem segment initialises — the
+/// phase-2 fused op's runtime null check must trap cleanly, not crash
+/// on the field deref.
+///
+/// The expected trap kind is `Trap::IndirectCallToNull`.
+#[test]
+fn fusion_call_indirect_null_slot() -> Result<()> {
+ let wat = r#"
+ (module
+ (table (export "t") 1 1 funcref)
+ (func (export "call") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32)))
+ "#;
+ let bytes = wat::parse_str(wat)?;
+ // Pulley only — see note on `fusion_call_indirect_every_index`.
+ let engine = Engine::new(&pulley_trap_safe_config())?;
+ let module = Module::new(&engine, &bytes)?;
+ let mut store = Store::new(&engine, ());
+ let inst = Instance::new(&mut store, &module, &[])?;
+ let call = inst.get_typed_func::<i32, i32>(&mut store, "call")?;
+ let err = call.call(&mut store, 0).unwrap_err();
+ let trap = err.downcast_ref::<Trap>().expect("Trap");
+ assert_eq!(*trap, Trap::IndirectCallToNull);
+ Ok(())
+}
diff --git a/tests/disas/call-indirect-immutable-elide-null.wat b/tests/disas/call-indirect-immutable-elide-null.wat
new file mode 100644
index 000000000000..35e2e0c7f0db
--- /dev/null
+++ b/tests/disas/call-indirect-immutable-elide-null.wat
@@ -0,0 +1,116 @@
+;;! target = "x86_64"
+
+;; Immutable funcref table where every slot is filled by the elem
+;; segment (no "no-entry" gaps). With both the sig check AND the
+;; funcref-NULL check elided, the dispatch path is reduced to:
+;; - bounds check (static)
+;; - lazy-init brif + masking
+;; - load code+vmctx
+;; - call_indirect
+;;
+;; In particular the cold block that handles the runtime trap-on-null
+;; path should not exist after the funcref load: the static-match path
+;; with `may_be_null = false` skips both the sig check and any
+;; downstream null-handling.
+
+(module
+ (table 3 3 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ ;; Fully cover the table — no null slot anywhere.
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; function u0:0(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @003f v3 = iconst.i32 1
+;; @0041 jump block1
+;;
+;; block1:
+;; @0041 return v3 ; v3 = 1
+;; }
+;;
+;; function u0:1(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0044 v3 = iconst.i32 2
+;; @0046 jump block1
+;;
+;; block1:
+;; @0046 return v3 ; v3 = 2
+;; }
+;;
+;; function u0:2(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0049 v3 = iconst.i32 3
+;; @004b jump block1
+;;
+;; block1:
+;; @004b return v3 ; v3 = 3
+;; }
+;;
+;; function u0:3(i64 vmctx, i64, i32) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; gv3 = vmctx
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
+;; sig0 = (i64 vmctx, i64) -> i32 tail
+;; sig1 = (i64 vmctx, i32, i64) -> i64 tail
+;; fn0 = colocated u805306368:7 sig1
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64, v2: i32):
+;; @0050 v4 = iconst.i32 3
+;; @0050 v5 = icmp uge v2, v4 ; v4 = 3
+;; @0050 v6 = uextend.i64 v2
+;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48
+;; @0050 v8 = iconst.i64 3
+;; @0050 v9 = ishl v6, v8 ; v8 = 3
+;; @0050 v10 = iadd v7, v9
+;; @0050 v11 = iconst.i64 0
+;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0
+;; @0050 v13 = load.i64 user6 aligned region1 v12
+;; @0050 v14 = iconst.i64 -2
+;; @0050 v15 = band v13, v14 ; v14 = -2
+;; @0050 brif v13, block3(v15), block2
+;;
+;; block2 cold:
+;; @0050 v17 = iconst.i32 0
+;; @0050 v18 = uextend.i64 v2
+;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0
+;; @0050 jump block3(v19)
+;;
+;; block3(v16: i64):
+;; @0050 v20 = load.i64 notrap aligned readonly v16+8
+;; @0050 v21 = load.i64 notrap aligned readonly v16+24
+;; @0050 v22 = call_indirect sig0, v20(v21, v0)
+;; @0053 jump block1
+;;
+;; block1:
+;; @0053 return v22
+;; }
diff --git a/tests/disas/call-indirect-immutable-elide-sig.wat b/tests/disas/call-indirect-immutable-elide-sig.wat
new file mode 100644
index 000000000000..d5d892f6d99a
--- /dev/null
+++ b/tests/disas/call-indirect-immutable-elide-sig.wat
@@ -0,0 +1,115 @@
+;;! target = "x86_64"
+
+;; Immutable funcref table where every elem-segment entry has the same
+;; declared type as the call site. This module's `tables_mutated` bit
+;; for table 0 is clear (no opcode in any function writes to it), and
+;; all three slots resolve to the same module type as the call site.
+;; That triggers `try_elide_sig_check_for_immutable_table` →
+;; `CheckIndirectCallTypeSignature::StaticMatch`, removing the runtime
+;; signature load + compare from the dispatch hot path.
+;;
+;; Look for the absence of `load.i32 user7 aligned readonly v_+16` (the
+;; sig-id load) and the matching `icmp eq / trapz user8` on the call
+;; site. Compare with `call-indirect-mutable-keeps-sigcheck.wat` for
+;; the non-elided shape.
+
+(module
+ (table 10 10 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; function u0:0(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @003f v3 = iconst.i32 1
+;; @0041 jump block1
+;;
+;; block1:
+;; @0041 return v3 ; v3 = 1
+;; }
+;;
+;; function u0:1(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0044 v3 = iconst.i32 2
+;; @0046 jump block1
+;;
+;; block1:
+;; @0046 return v3 ; v3 = 2
+;; }
+;;
+;; function u0:2(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0049 v3 = iconst.i32 3
+;; @004b jump block1
+;;
+;; block1:
+;; @004b return v3 ; v3 = 3
+;; }
+;;
+;; function u0:3(i64 vmctx, i64, i32) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; gv3 = vmctx
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
+;; sig0 = (i64 vmctx, i64) -> i32 tail
+;; sig1 = (i64 vmctx, i32, i64) -> i64 tail
+;; fn0 = colocated u805306368:7 sig1
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64, v2: i32):
+;; @0050 v4 = iconst.i32 10
+;; @0050 v5 = icmp uge v2, v4 ; v4 = 10
+;; @0050 v6 = uextend.i64 v2
+;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48
+;; @0050 v8 = iconst.i64 3
+;; @0050 v9 = ishl v6, v8 ; v8 = 3
+;; @0050 v10 = iadd v7, v9
+;; @0050 v11 = iconst.i64 0
+;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0
+;; @0050 v13 = load.i64 user6 aligned region1 v12
+;; @0050 v14 = iconst.i64 -2
+;; @0050 v15 = band v13, v14 ; v14 = -2
+;; @0050 brif v13, block3(v15), block2
+;;
+;; block2 cold:
+;; @0050 v17 = iconst.i32 0
+;; @0050 v18 = uextend.i64 v2
+;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0
+;; @0050 jump block3(v19)
+;;
+;; block3(v16: i64):
+;; @0050 v20 = load.i64 user7 aligned readonly v16+8
+;; @0050 v21 = load.i64 notrap aligned readonly v16+24
+;; @0050 v22 = call_indirect sig0, v20(v21, v0)
+;; @0053 jump block1
+;;
+;; block1:
+;; @0053 return v22
+;; }
diff --git a/tests/disas/call-indirect-immutable-static-bound.wat b/tests/disas/call-indirect-immutable-static-bound.wat
new file mode 100644
index 000000000000..05c3ffd748ab
--- /dev/null
+++ b/tests/disas/call-indirect-immutable-static-bound.wat
@@ -0,0 +1,115 @@
+;;! target = "x86_64"
+
+;; Table declared with min < max (a "dynamic-declared" table) that is
+;; never written to in the module. Without the per-table mutability
+;; bit, Cranelift would emit `load.i64 v0+56` per dispatch to fetch
+;; the current bound. With it, `make_table` lowers to
+;; `TableSize::Static` and the bound becomes an immediate.
+;;
+;; Look for: bounds-check `iconst.i32 16` (the declared min, used as
+;; static bound) and NO `load.i64 ... v0+56` for the current_elements
+;; field. (`+48` for the funcref base is still loaded — that's the
+;; element-data pointer, separate from the bound.)
+
+(module
+ ;; min=16, max=64 — distinct, so without our optimization the
+ ;; bound would be loaded per dispatch from `current_elements`.
+ (table 16 64 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; function u0:0(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @003f v3 = iconst.i32 1
+;; @0041 jump block1
+;;
+;; block1:
+;; @0041 return v3 ; v3 = 1
+;; }
+;;
+;; function u0:1(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0044 v3 = iconst.i32 2
+;; @0046 jump block1
+;;
+;; block1:
+;; @0046 return v3 ; v3 = 2
+;; }
+;;
+;; function u0:2(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0049 v3 = iconst.i32 3
+;; @004b jump block1
+;;
+;; block1:
+;; @004b return v3 ; v3 = 3
+;; }
+;;
+;; function u0:3(i64 vmctx, i64, i32) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; gv3 = vmctx
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
+;; sig0 = (i64 vmctx, i64) -> i32 tail
+;; sig1 = (i64 vmctx, i32, i64) -> i64 tail
+;; fn0 = colocated u805306368:7 sig1
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64, v2: i32):
+;; @0050 v4 = iconst.i32 16
+;; @0050 v5 = icmp uge v2, v4 ; v4 = 16
+;; @0050 v6 = uextend.i64 v2
+;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48
+;; @0050 v8 = iconst.i64 3
+;; @0050 v9 = ishl v6, v8 ; v8 = 3
+;; @0050 v10 = iadd v7, v9
+;; @0050 v11 = iconst.i64 0
+;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0
+;; @0050 v13 = load.i64 user6 aligned region1 v12
+;; @0050 v14 = iconst.i64 -2
+;; @0050 v15 = band v13, v14 ; v14 = -2
+;; @0050 brif v13, block3(v15), block2
+;;
+;; block2 cold:
+;; @0050 v17 = iconst.i32 0
+;; @0050 v18 = uextend.i64 v2
+;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0
+;; @0050 jump block3(v19)
+;;
+;; block3(v16: i64):
+;; @0050 v20 = load.i64 user7 aligned readonly v16+8
+;; @0050 v21 = load.i64 notrap aligned readonly v16+24
+;; @0050 v22 = call_indirect sig0, v20(v21, v0)
+;; @0053 jump block1
+;;
+;; block1:
+;; @0053 return v22
+;; }
diff --git a/tests/disas/call-indirect-mutable-keeps-sigcheck.wat b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat
new file mode 100644
index 000000000000..03318a349ef7
--- /dev/null
+++ b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat
@@ -0,0 +1,159 @@
+;;! target = "x86_64"
+
+;; Counterpart to `call-indirect-immutable-elide-sig.wat`. Same module
+;; shape — same elem segment, same uniform call-site type — but one
+;; function writes to the table via `table.set`. That sets the
+;; `tables_mutated` bit and disables sig-check elision.
+;;
+;; Look for the runtime sig load + compare on the call site:
+;; load.i32 user7 aligned readonly v_+16
+;; icmp eq
+;; trapz user8
+;; (versus the elided form in the immutable test).
+
+(module
+ (table 10 10 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ ;; Mutator: this clears the immutability proof for table 0.
+ (func (export "mutate") (param i32)
+ local.get 0
+ ref.func $f1
+ table.set 0)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; function u0:0(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @004d v3 = iconst.i32 1
+;; @004f jump block1
+;;
+;; block1:
+;; @004f return v3 ; v3 = 1
+;; }
+;;
+;; function u0:1(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0052 v3 = iconst.i32 2
+;; @0054 jump block1
+;;
+;; block1:
+;; @0054 return v3 ; v3 = 2
+;; }
+;;
+;; function u0:2(i64 vmctx, i64) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64):
+;; @0057 v3 = iconst.i32 3
+;; @0059 jump block1
+;;
+;; block1:
+;; @0059 return v3 ; v3 = 3
+;; }
+;;
+;; function u0:3(i64 vmctx, i64, i32) tail {
+;; region0 = 8 "VMContext+0x8"
+;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; gv3 = vmctx
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
+;; sig0 = (i64 vmctx, i32) -> i64 tail
+;; fn0 = colocated u805306368:6 sig0
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64, v2: i32):
+;; @005e v3 = iconst.i32 0
+;; @005e v4 = call fn0(v0, v3) ; v3 = 0
+;; @0060 v5 = iconst.i32 10
+;; @0060 v6 = icmp uge v2, v5 ; v5 = 10
+;; @0060 v7 = uextend.i64 v2
+;; @0060 v8 = load.i64 notrap aligned readonly can_move v0+48
+;; @0060 v9 = iconst.i64 3
+;; @0060 v10 = ishl v7, v9 ; v9 = 3
+;; @0060 v11 = iadd v8, v10
+;; @0060 v12 = iconst.i64 0
+;; @0060 v13 = select_spectre_guard v6, v12, v11 ; v12 = 0
+;; @0060 v14 = iconst.i64 1
+;; @0060 v15 = bor v4, v14 ; v14 = 1
+;; @0060 store user6 aligned region1 v15, v13
+;; @0062 jump block1
+;;
+;; block1:
+;; @0062 return
+;; }
+;;
+;; function u0:4(i64 vmctx, i64, i32) -> i32 tail {
+;; region0 = 8 "VMContext+0x8"
+;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
+;; region2 = 40 "VMContext+0x28"
+;; gv0 = vmctx
+;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
+;; gv2 = load.i64 notrap aligned gv1+24
+;; gv3 = vmctx
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
+;; sig0 = (i64 vmctx, i64) -> i32 tail
+;; sig1 = (i64 vmctx, i32, i64) -> i64 tail
+;; fn0 = colocated u805306368:7 sig1
+;; stack_limit = gv2
+;;
+;; block0(v0: i64, v1: i64, v2: i32):
+;; @0067 v4 = iconst.i32 10
+;; @0067 v5 = icmp uge v2, v4 ; v4 = 10
+;; @0067 v6 = uextend.i64 v2
+;; @0067 v7 = load.i64 notrap aligned readonly can_move v0+48
+;; @0067 v8 = iconst.i64 3
+;; @0067 v9 = ishl v6, v8 ; v8 = 3
+;; @0067 v10 = iadd v7, v9
+;; @0067 v11 = iconst.i64 0
+;; @0067 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0
+;; @0067 v13 = load.i64 user6 aligned region1 v12
+;; @0067 v14 = iconst.i64 -2
+;; @0067 v15 = band v13, v14 ; v14 = -2
+;; @0067 brif v13, block3(v15), block2
+;;
+;; block2 cold:
+;; @0067 v17 = iconst.i32 0
+;; @0067 v18 = uextend.i64 v2
+;; @0067 v19 = call fn0(v0, v17, v18) ; v17 = 0
+;; @0067 jump block3(v19)
+;;
+;; block3(v16: i64):
+;; @0067 v20 = load.i64 notrap aligned readonly can_move region2 v0+40
+;; @0067 v21 = load.i32 notrap aligned readonly can_move v20
+;; @0067 v22 = load.i32 user7 aligned readonly v16+16
+;; @0067 v23 = icmp eq v22, v21
+;; @0067 v24 = uextend.i32 v23
+;; @0067 trapz v24, user8
+;; @0067 v25 = load.i64 notrap aligned readonly v16+8
+;; @0067 v26 = load.i64 notrap aligned readonly v16+24
+;; @0067 v27 = call_indirect sig0, v25(v26, v0)
+;; @006a jump block1
+;;
+;; block1:
+;; @006a return v27
+;; }
diff --git a/tests/disas/gc/call-indirect-final-type.wat b/tests/disas/gc/call-indirect-final-type.wat
index 0406261611bf..13ffa96bec62 100644
--- a/tests/disas/gc/call-indirect-final-type.wat
+++ b/tests/disas/gc/call-indirect-final-type.wat
@@ -23,47 +23,38 @@
;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
;; gv2 = load.i64 notrap aligned gv1+24
;; gv3 = vmctx
-;; gv4 = load.i64 notrap aligned gv3+48
-;; gv5 = load.i64 notrap aligned gv3+56
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
;; sig0 = (i64 vmctx, i64, i32) -> i32 tail
;; sig1 = (i64 vmctx, i32, i64) -> i64 tail
;; fn0 = colocated u805306368:7 sig1
;; stack_limit = gv2
;;
;; block0(v0: i64, v1: i64, v2: i32, v3: i32):
-;; @002b v5 = load.i64 notrap aligned v0+56
-;; @002b v9 = load.i64 notrap aligned v0+48
-;; @002b v6 = ireduce.i32 v5
-;; @002b v7 = icmp uge v3, v6
-;; @002b v13 = iconst.i64 0
-;; @002b v8 = uextend.i64 v3
-;; @002b v10 = iconst.i64 3
-;; @002b v11 = ishl v8, v10 ; v10 = 3
-;; @002b v12 = iadd v9, v11
-;; @002b v14 = select_spectre_guard v7, v13, v12 ; v13 = 0
-;; @002b v15 = load.i64 user6 aligned region1 v14
-;; @002b v16 = iconst.i64 -2
-;; @002b v17 = band v15, v16 ; v16 = -2
-;; @002b brif v15, block3(v17), block2
+;; @002b v12 = iconst.i64 0
+;; @002b v14 = load.i64 user6 aligned region1 v12 ; v12 = 0
+;; @002b v15 = iconst.i64 -2
+;; @002b v16 = band v14, v15 ; v15 = -2
+;; @002b brif v14, block3(v16), block2
;;
;; block2 cold:
-;; @002b v19 = iconst.i32 0
-;; @002b v21 = call fn0(v0, v19, v8) ; v19 = 0
-;; @002b jump block3(v21)
+;; @002b v5 = iconst.i32 0
+;; @002b v7 = uextend.i64 v3
+;; @002b v20 = call fn0(v0, v5, v7) ; v5 = 0
+;; @002b jump block3(v20)
;;
-;; block3(v18: i64):
-;; @002b v24 = load.i32 user7 aligned readonly v18+16
-;; @002b v22 = load.i64 notrap aligned readonly can_move region2 v0+40
-;; @002b v23 = load.i32 notrap aligned readonly can_move v22
-;; @002b v25 = icmp eq v24, v23
-;; @002b trapz v25, user8
-;; @002b v27 = load.i64 notrap aligned readonly v18+8
-;; @002b v28 = load.i64 notrap aligned readonly v18+24
-;; @002b v29 = call_indirect sig0, v27(v28, v0, v2)
+;; block3(v17: i64):
+;; @002b v23 = load.i32 user7 aligned readonly v17+16
+;; @002b v21 = load.i64 notrap aligned readonly can_move region2 v0+40
+;; @002b v22 = load.i32 notrap aligned readonly can_move v21
+;; @002b v24 = icmp eq v23, v22
+;; @002b trapz v24, user8
+;; @002b v26 = load.i64 notrap aligned readonly v17+8
+;; @002b v27 = load.i64 notrap aligned readonly v17+24
+;; @002b v28 = call_indirect sig0, v26(v27, v0, v2)
;; @002e jump block1
;;
;; block1:
-;; @002e return v29
+;; @002e return v28
;; }
;;
;; function u0:1(i64 vmctx, i64, i32, i32) -> i32 tail {
@@ -74,41 +65,32 @@
;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
;; gv2 = load.i64 notrap aligned gv1+24
;; gv3 = vmctx
-;; gv4 = load.i64 notrap aligned gv3+48
-;; gv5 = load.i64 notrap aligned gv3+56
+;; gv4 = load.i64 notrap aligned readonly can_move gv3+48
;; sig0 = (i64 vmctx, i64, i32) -> i32 tail
;; sig1 = (i64 vmctx, i32, i64) -> i64 tail
;; fn0 = colocated u805306368:7 sig1
;; stack_limit = gv2
;;
;; block0(v0: i64, v1: i64, v2: i32, v3: i32):
-;; @0035 v5 = load.i64 notrap aligned v0+56
-;; @0035 v9 = load.i64 notrap aligned v0+48
-;; @0035 v6 = ireduce.i32 v5
-;; @0035 v7 = icmp uge v3, v6
-;; @0035 v13 = iconst.i64 0
-;; @0035 v8 = uextend.i64 v3
-;; @0035 v10 = iconst.i64 3
-;; @0035 v11 = ishl v8, v10 ; v10 = 3
-;; @0035 v12 = iadd v9, v11
-;; @0035 v14 = select_spectre_guard v7, v13, v12 ; v13 = 0
-;; @0035 v15 = load.i64 user6 aligned region1 v14
-;; @0035 v16 = iconst.i64 -2
-;; @0035 v17 = band v15, v16 ; v16 = -2
-;; @0035 brif v15, block3(v17), block2
+;; @0035 v12 = iconst.i64 0
+;; @0035 v14 = load.i64 user6 aligned region1 v12 ; v12 = 0
+;; @0035 v15 = iconst.i64 -2
+;; @0035 v16 = band v14, v15 ; v15 = -2
+;; @0035 brif v14, block3(v16), block2
;;
;; block2 cold:
-;; @0035 v19 = iconst.i32 0
-;; @0035 v21 = call fn0(v0, v19, v8) ; v19 = 0
-;; @0035 jump block3(v21)
+;; @0035 v5 = iconst.i32 0
+;; @0035 v7 = uextend.i64 v3
+;; @0035 v20 = call fn0(v0, v5, v7) ; v5 = 0
+;; @0035 jump block3(v20)
;;
-;; block3(v18: i64):
-;; @0035 v24 = load.i32 user7 aligned readonly v18+16
-;; @0035 v22 = load.i64 notrap aligned readonly can_move region2 v0+40
-;; @0035 v23 = load.i32 notrap aligned readonly can_move v22
-;; @0035 v25 = icmp eq v24, v23
-;; @0035 trapz v25, user8
-;; @0035 v27 = load.i64 notrap aligned readonly v18+8
-;; @0035 v28 = load.i64 notrap aligned readonly v18+24
-;; @0035 return_call_indirect sig0, v27(v28, v0, v2)
+;; block3(v17: i64):
+;; @0035 v23 = load.i32 user7 aligned readonly v17+16
+;; @0035 v21 = load.i64 notrap aligned readonly can_move region2 v0+40
+;; @0035 v22 = load.i32 notrap aligned readonly can_move v21
+;; @0035 v24 = icmp eq v23, v22
+;; @0035 trapz v24, user8
+;; @0035 v26 = load.i64 notrap aligned readonly v17+8
+;; @0035 v27 = load.i64 notrap aligned readonly v17+24
+;; @0035 return_call_indirect sig0, v26(v27, v0, v2)
;; }
diff --git a/tests/disas/indirect-call-no-caching.wat b/tests/disas/indirect-call-no-caching.wat
index ae42c54f4c27..1a2e852558bb 100644
--- a/tests/disas/indirect-call-no-caching.wat
+++ b/tests/disas/indirect-call-no-caching.wat
@@ -68,7 +68,6 @@
;; function u0:3(i64 vmctx, i64, i32) -> i32 tail {
;; region0 = 8 "VMContext+0x8"
;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
-;; region2 = 40 "VMContext+0x28"
;; gv0 = vmctx
;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
;; gv2 = load.i64 notrap aligned gv1+24
@@ -101,17 +100,11 @@
;; @0050 jump block3(v19)
;;
;; block3(v16: i64):
-;; @0050 v20 = load.i64 notrap aligned readonly can_move region2 v0+40
-;; @0050 v21 = load.i32 notrap aligned readonly can_move v20
-;; @0050 v22 = load.i32 user7 aligned readonly v16+16
-;; @0050 v23 = icmp eq v22, v21
-;; @0050 v24 = uextend.i32 v23
-;; @0050 trapz v24, user8
-;; @0050 v25 = load.i64 notrap aligned readonly v16+8
-;; @0050 v26 = load.i64 notrap aligned readonly v16+24
-;; @0050 v27 = call_indirect sig0, v25(v26, v0)
+;; @0050 v20 = load.i64 user7 aligned readonly v16+8
+;; @0050 v21 = load.i64 notrap aligned readonly v16+24
+;; @0050 v22 = call_indirect sig0, v20(v21, v0)
;; @0053 jump block1
;;
;; block1:
-;; @0053 return v27
+;; @0053 return v22
;; }
diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat
new file mode 100644
index 000000000000..178f86f72259
--- /dev/null
+++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat
@@ -0,0 +1,239 @@
+;;! target = "pulley64"
+;;! test = "compile"
+;;! objdump = "--funcs all"
+
+;; Immutable funcref table fully populated by a static elem segment — the
+;; `is_eagerly_initialized_funcref_table` predicate holds AND sig check
+;; is statically elided. Two-layer fusion fires at the call_indirect
+;; dispatch tail:
+;;
+;; 1. `try_fuse_funcref_dispatch` (phase 2) absorbs the brif + the two
+;; VMFuncRef field loads (`wasm_call` + `vmctx`) emitted by
+;; `load_code_and_vmctx`, and emits one `xfuncref_dispatch_not_x64`
+;; Pulley op. The continuation block's standalone loads are skipped
+;; via the cross-block sink performed by Pulley's `pre_lower` hook.
+;;
+;; 2. The preceding `xband64_s8 v, -2` stays as a separate op (its
+;; result is `src` to the fused dispatch). Phase-1's `BandBrIf`
+;; fusion does NOT fire here because phase 2 absorbs the brif
+;; first (the recogniser tries phase 2 before phase 1).
+;;
+;; Intended tail: `xband64_s8 ; xfuncref_dispatch_not_x64 ; call_indirect` —
+;; three Pulley dispatches instead of the unfused five. NOTE(review): the output
+;; pinned below is still unfused (`br_if_xeq64_i8` + two `xload64le_o32`); reconcile.
+
+(module
+ (table 3 3 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 16, x25
+;; xmov x3, x0
+;; br_if_xugteq32_u8 x2, 3, 0x59 // target = 0x72
+;; 20: xmov x1, x3
+;; xload64le_o32 x0, x1, 48
+;; zext32 x15, x2
+;; xshl64_u6 x1, x15, 3
+;; xadd64 x0, x0, x1
+;; xload64le_o32 x1, x0, 0
+;; xband64_s8 x0, x1, -2
+;; br_if_xeq64_i8 x1, 0, 0x22 // target = 0x60
+;; 45: xmov x25, x3
+;; xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x25
+;; pop_frame_restore 16, x25
+;; ret
+;; 60: xzero x0
+;; 62: xmov x25, x3
+;; 65: call3 x25, x0, x15, 0x267 // target = 0x2cc
+;; 6d: jump -0x25 // target = 0x48
+;; 72: trap
+;; ╰─╼ trap: Normal(TableOutOfBounds)
+;;
+;; wasm[0]::array_to_wasm_trampoline[0]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x13, x0, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 72, x14
+;; xmov x14, sp
+;; xstore64le_o32 x13, 64, x14
+;; xpcadd x15, 0x2a // target = 0xc7
+;; xstore64le_o32 x13, 80, x15
+;; call -0xac // target = 0x0
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc7
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; c7: xzero x0
+;; c9: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ce: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[1]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x13, x0, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 72, x14
+;; xmov x14, sp
+;; xstore64le_o32 x13, 64, x14
+;; xpcadd x15, 0x2a // target = 0x121
+;; xstore64le_o32 x13, 80, x15
+;; call -0x101 // target = 0x5
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x121
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 121: xzero x0
+;; 123: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 128: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[2]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x13, x0, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 72, x14
+;; xmov x14, sp
+;; xstore64le_o32 x13, 64, x14
+;; xpcadd x15, 0x2a // target = 0x17b
+;; xstore64le_o32 x13, 80, x15
+;; call -0x155 // target = 0xb
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x17b
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 17b: xzero x0
+;; 17d: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 182: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[3]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xload32le_o32 x14, x2, 0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x15, x0, 8
+;; xmov_fp x2
+;; xstore64le_o32 x15, 72, x2
+;; xmov x2, sp
+;; xstore64le_o32 x15, 64, x2
+;; xpcadd x2, 0x2d // target = 0x1df
+;; xstore64le_o32 x15, 80, x2
+;; call3 x0, x1, x14, -0x1b0 // target = 0x11
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1df
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 1df: xzero x0
+;; 1e1: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 1e6: ret
+;;
+;; signatures[0]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x2, x0
+;; xmov x17, x1
+;; xload64le_o32 x13, x1, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 48, x14
+;; xmov_lr x14
+;; xstore64le_o32 x13, 56, x14
+;; xload64le_o32 x0, x0, 8
+;; xmov x16, sp
+;; xone x4
+;; xmov x1, x2
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x15, x0
+;; br_if_not32 x15, 0x13 // target = 0x23e
+;; 231: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; 23e: xmov x1, x17
+;; 241: xload64le_o32 x0, x1, 16
+;; 248: xload64le_o32 x0, x0, 328
+;; 24f: call_indirect_host 42
+;; 253: trap
+;;
+;; signatures[1]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x3, x0
+;; xmov x17, x1
+;; xload64le_o32 x14, x1, 8
+;; xmov_fp x15
+;; xstore64le_o32 x14, 48, x15
+;; xmov_lr x15
+;; xstore64le_o32 x14, 56, x15
+;; xmov x16, sp
+;; xstore32le_o32 x16, 0, x2
+;; xload64le_o32 x0, x0, 8
+;; xone x4
+;; xmov x1, x3
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x0, x0
+;; br_if_not32 x0, 0x13 // target = 0x2b4
+;; 2a7: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; 2b4: xmov x1, x17
+;; 2b7: xload64le_o32 x0, x1, 16
+;; 2be: xload64le_o32 x0, x0, 328
+;; 2c5: call_indirect_host 42
+;; 2c9: trap
+;;
+;; wasmtime_builtin_table_get_lazy_init_func_ref:
+;; push_frame
+;; xload64le_o32 x9, x0, 8
+;; xmov_fp x10
+;; xstore64le_o32 x9, 48, x10
+;; xmov_lr x10
+;; xstore64le_o32 x9, 56, x10
+;; xload64le_o32 x11, x0, 16
+;; xmov x13, x0
+;; xload64le_o32 x0, x11, 56
+;; xmov x3, x2
+;; xmov x2, x1
+;; xmov x1, x13
+;; call_indirect_host 8
+;; pop_frame
+;; ret
diff --git a/tests/disas/pulley-fusion-fires-32bit.wat b/tests/disas/pulley-fusion-fires-32bit.wat
new file mode 100644
index 000000000000..90ccab4100a9
--- /dev/null
+++ b/tests/disas/pulley-fusion-fires-32bit.wat
@@ -0,0 +1,234 @@
+;;! target = "pulley32"
+;;! test = "compile"
+;;! objdump = "--funcs all"
+
+;; Phase 2 fusion on 32-bit Pulley (used by arm64_32-apple-watchos via
+;; cross-language LTO + linker-plugin-lto). The fused op is meant to be
+;; `xfuncref_dispatch_x32` with i8 offsets 4 (wasm_call) and 12 (vmctx), half the
+;; pulley64 offsets (8, 24). NOTE(review): the output pinned below is still unfused.
+;;
+;; This test pins the 32-bit dispatch tail shape AND verifies that
+;; the `imm.bits() == -2` gate fires here (the band's Imm64 from
+;; func_environ's `Imm64::from(-2_i64)` still bits-equals -2 even
+;; though Cranelift truncates the imm to i32 for an i32 band).
+;;
+;; This covers the "arm64_32 / Apple Watch confirmation" item listed
+;; under "Known follow-ups" in `docs/opcode-fusion-funcref-dispatch.md`.
+;; This test is the static side of that confirmation; the dynamic side
+;; (a Pulley-on-Apple-Watch run) is gated by Apple Watch SE2
+;; hardware access.
+
+(module
+ (table 3 3 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 16, x25
+;; br_if_xugteq32_u8 x2, 3, 0x58 // target = 0x6e
+;; 1d: xload32le_o32 x15, x0, 24
+;; xmov x3, x0
+;; xshl32_u6 x0, x2, 2
+;; xadd32 x15, x15, x0
+;; xload32le_o32 x15, x15, 0
+;; xband32_s8 x0, x15, -2
+;; br_if_not32 x15, 0x21 // target = 0x59
+;; 3e: xmov x25, x3
+;; xload32le_o32 x1, x0, 4
+;; xload32le_o32 x0, x0, 12
+;; call_indirect2 x1, x0, x25
+;; pop_frame_restore 16, x25
+;; ret
+;; 59: xzero x0
+;; 5b: zext32 x1, x2
+;; 5e: xmov x25, x3
+;; 61: call3 x25, x0, x1, 0x267 // target = 0x2c8
+;; 69: jump -0x28 // target = 0x41
+;; 6e: trap
+;; ╰─╼ trap: Normal(TableOutOfBounds)
+;;
+;; wasm[0]::array_to_wasm_trampoline[0]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload32le_o32 x13, x0, 4
+;; xmov_fp x14
+;; xstore32le_o32 x13, 48, x14
+;; xmov x14, sp
+;; xstore32le_o32 x13, 44, x14
+;; xpcadd x15, 0x2a // target = 0xc3
+;; xstore32le_o32 x13, 52, x15
+;; call -0xa8 // target = 0x0
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; c3: xzero x0
+;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ca: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[1]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload32le_o32 x13, x0, 4
+;; xmov_fp x14
+;; xstore32le_o32 x13, 48, x14
+;; xmov x14, sp
+;; xstore32le_o32 x13, 44, x14
+;; xpcadd x15, 0x2a // target = 0x11d
+;; xstore32le_o32 x13, 52, x15
+;; call -0xfd // target = 0x5
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11d
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 11d: xzero x0
+;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 124: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[2]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload32le_o32 x13, x0, 4
+;; xmov_fp x14
+;; xstore32le_o32 x13, 48, x14
+;; xmov x14, sp
+;; xstore32le_o32 x13, 44, x14
+;; xpcadd x15, 0x2a // target = 0x177
+;; xstore32le_o32 x13, 52, x15
+;; call -0x151 // target = 0xb
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 177: xzero x0
+;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 17e: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[3]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xload32le_o32 x14, x2, 0
+;; xstore64le_o32 sp, 0, x2
+;; xload32le_o32 x15, x0, 4
+;; xmov_fp x2
+;; xstore32le_o32 x15, 48, x2
+;; xmov x2, sp
+;; xstore32le_o32 x15, 44, x2
+;; xpcadd x2, 0x2d // target = 0x1db
+;; xstore32le_o32 x15, 52, x2
+;; call3 x0, x1, x14, -0x1ac // target = 0x11
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 1db: xzero x0
+;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 1e2: ret
+;;
+;; signatures[0]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x2, x0
+;; xmov x17, x1
+;; xload32le_o32 x13, x1, 4
+;; xmov_fp x14
+;; xstore32le_o32 x13, 36, x14
+;; xmov_lr x14
+;; xstore32le_o32 x13, 40, x14
+;; xload32le_o32 x0, x0, 4
+;; xmov x16, sp
+;; xone x4
+;; xmov x1, x2
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x15, x0
+;; br_if_not32 x15, 0x13 // target = 0x23a
+;; 22d: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; 23a: xmov x1, x17
+;; 23d: xload32le_o32 x0, x1, 8
+;; 244: xload32le_o32 x0, x0, 164
+;; 24b: call_indirect_host 42
+;; 24f: trap
+;;
+;; signatures[1]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x3, x0
+;; xmov x17, x1
+;; xload32le_o32 x14, x1, 4
+;; xmov_fp x15
+;; xstore32le_o32 x14, 36, x15
+;; xmov_lr x15
+;; xstore32le_o32 x14, 40, x15
+;; xmov x16, sp
+;; xstore32le_o32 x16, 0, x2
+;; xload32le_o32 x0, x0, 4
+;; xone x4
+;; xmov x1, x3
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x0, x0
+;; br_if_not32 x0, 0x13 // target = 0x2b0
+;; 2a3: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; 2b0: xmov x1, x17
+;; 2b3: xload32le_o32 x0, x1, 8
+;; 2ba: xload32le_o32 x0, x0, 164
+;; 2c1: call_indirect_host 42
+;; 2c5: trap
+;;
+;; wasmtime_builtin_table_get_lazy_init_func_ref:
+;; push_frame
+;; xload32le_o32 x9, x0, 4
+;; xmov_fp x10
+;; xstore32le_o32 x9, 36, x10
+;; xmov_lr x10
+;; xstore32le_o32 x9, 40, x10
+;; xload32le_o32 x11, x0, 8
+;; xmov x13, x0
+;; xload32le_o32 x0, x11, 28
+;; xmov x3, x2
+;; xmov x2, x1
+;; xmov x1, x13
+;; call_indirect_host 8
+;; pop_frame
+;; ret
diff --git a/tests/disas/pulley-fusion-fires-multi-call.wat b/tests/disas/pulley-fusion-fires-multi-call.wat
new file mode 100644
index 000000000000..abd94de07148
--- /dev/null
+++ b/tests/disas/pulley-fusion-fires-multi-call.wat
@@ -0,0 +1,95 @@
+;;! target = "pulley64"
+;;! test = "compile"
+
+;; Multiple call_indirect sites in the same function should each fuse
+;; independently. The pre-pass scans every brif in every block; each
+;; matching pattern marks its own pair of continuation loads as
+;; absorbed. The lowering emits a separate FuncrefDispatch MachInst
+;; at each brif.
+;;
+;; This test pins that the optimisation is per-call-site, not per-function.
+;; A bug that misuses the pre-pass's `to_sink` list (e.g. accidental dedup,
+;; missing one of two patterns) would show up as one of the two dispatch tails
+;; reverting to unfused form. NOTE(review): both tails pinned below are unfused.
+;;
+;; Reference precedent: ChakraCore #5915 ("setPrototypeOf does not
+;; invalidate cached instanceof IC inside currently-executing
+;; frame") — fused-op caches must be per-site, not per-function.
+
+(module
+ (table 3 3 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_two") (param i32 i32) (result i32)
+ local.get 0
+ call_indirect (result i32)
+ local.get 1
+ call_indirect (result i32)
+ i32.add)
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 32, x16, x17, x28, x29
+;; xmov x29, x3
+;; br_if_xugteq32_u8 x2, 3, 0xb1 // target = 0xca
+;; 20: xload64le_o32 x28, x0, 48
+;; xmov x4, x0
+;; zext32 x1, x2
+;; xshl64_u6 x0, x1, 3
+;; xadd64 x0, x28, x0
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x68 // target = 0xa6
+;; 45: xmov x16, x4
+;; xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x16
+;; xmov x3, x29
+;; xmov x4, x16
+;; xmov x17, x0
+;; br_if_xugteq32_u8 x3, 3, 0x6a // target = 0xcd
+;; 6a: zext32 x1, x3
+;; xshl64_u6 x0, x1, 3
+;; xadd64 x0, x28, x0
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x3a // target = 0xb8
+;; 85: xmov x16, x4
+;; xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x16
+;; xmov x1, x17
+;; xadd32 x0, x1, x0
+;; pop_frame_restore 32, x16, x17, x28, x29
+;; ret
+;; a6: xzero x0
+;; a8: xmov x16, x4
+;; ab: call3 x16, x0, x1, 0x28f // target = 0x33a
+;; b3: jump -0x6b // target = 0x48
+;; b8: xzero x0
+;; ba: xmov x16, x4
+;; bd: call3 x16, x0, x1, 0x27d // target = 0x33a
+;; c5: jump -0x3d // target = 0x88
+;; ca: trap
+;; cd: trap
diff --git a/tests/disas/pulley-fusion-fires-return-call-indirect.wat b/tests/disas/pulley-fusion-fires-return-call-indirect.wat
new file mode 100644
index 000000000000..ae5faaba802c
--- /dev/null
+++ b/tests/disas/pulley-fusion-fires-return-call-indirect.wat
@@ -0,0 +1,60 @@
+;;! target = "pulley64"
+;;! test = "compile"
+
+;; `return_call_indirect` IS a tail call but the lazy-init brif is
+;; unchanged — only the call op itself is different. Phase 2 still
+;; matches and fires here: the brif's continuation block contains
+;; the same canonical 2-load pattern, and after the loads is a
+;; `return_call_indirect` (lowered as `xjump` after the field reads)
+;; instead of a `call_indirect`. Both consume (code, vmctx) the same
+;; way, so the fusion is sound across the tail-call boundary.
+;;
+;; The disas should show `xband64_s8 ; xfuncref_dispatch_not_x64 ; xjump` — the
+;; tail jump replaces what would have been `call_indirect` in the non-tail case.
+;; NOTE(review): the pinned output still has `br_if_xeq64_i8` + two loads; reconcile.
+;;
+;; Reference precedent: WAMR #2231 ("AOT/JIT tail-call:
+;; `return_call_indirect` is not actually tail" — uses LLVM `tail`
+;; hint instead of `musttail`). Our fusion preserves tail-call
+;; semantics because it runs upstream of the call_indirect-vs-
+;; return_call_indirect choice; this test pins that.
+
+(module
+ (table 1 1 funcref)
+ (type $sig (func (result i32)))
+
+ (func $f1 (result i32) i32.const 1)
+
+ (func (export "trampoline") (param i32) (result i32)
+ local.get 0
+ return_call_indirect (type $sig))
+
+ (elem (i32.const 0) func $f1))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]:
+;; push_frame_save 16, x25
+;; br_if32 x2, 0x58 // target = 0x62
+;; 10: xload64le_o32 x15, x0, 48
+;; xmov x1, x0
+;; zext32 x14, x2
+;; xshl64_u6 x0, x14, 3
+;; xadd64 x15, x15, x0
+;; xload64le_o32 x15, x15, 0
+;; xband64_s8 x0, x15, -2
+;; br_if_xeq64_i8 x15, 0, 0x22 // target = 0x50
+;; 35: xmov x25, x1
+;; xload64le_o32 x15, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; xmov x1, x25
+;; pop_frame_restore 16, x25
+;; xjump x15
+;; 50: xzero x0
+;; xmov x25, x1
+;; call3 x25, x0, x14, 0x1b3 // target = 0x208
+;; jump -0x25 // target = 0x38
+;; 62: trap
diff --git a/tests/disas/pulley-fusion-no-fire-mutable-table.wat b/tests/disas/pulley-fusion-no-fire-mutable-table.wat
new file mode 100644
index 000000000000..f1c57adc892a
--- /dev/null
+++ b/tests/disas/pulley-fusion-no-fire-mutable-table.wat
@@ -0,0 +1,341 @@
+;;! target = "pulley64"
+;;! test = "compile"
+;;! objdump = "--funcs all"
+
+;; Phase 1 / phase 2 fusion gating: a single `table.set` anywhere in
+;; the module sets `tables_mutated[idx] = true` for that table, which
+;; disables the `is_eagerly_initialized_funcref_table` predicate.
+;; func_environ's IR rewrite then emits the ORIGINAL brif on `value`
+;; (unmasked) instead of the rewritten brif on `value_masked`. With no
+;; `brif(band(v, -2))` pattern reaching the lowering, neither phase 1
+;; (BandBrIf) nor phase 2 (FuncrefDispatch) fires. The dispatch tail
+;; keeps its separate band + brif + xload + xload + call_indirect ops.
+;;
+;; Reference precedents in upstream interpreters where similar
+;; mutation-invariant edges caused real bugs:
+;; - V8 issue 5913 (call_indirect signature mismatch under table
+;; sharing) — the sig-elide invariant must not survive a foreign
+;; mutation.
+;; - GHSA-q49f-xg75-m9xw (wasmtime Winch table.fill host panic) —
+;; bulk table ops must invalidate fusion-eligibility.
+;; - Hermes 24a8fe64 (HiddenClass GC'd mid-IC), Luau release/717
+;; (userdata write didn't invalidate store cache) — the general
+;; shape "fused-op cached state survives invalidation source".
+;;
+;; This test pins the gating. Adding a `table.set` anywhere should
+;; produce the unfused dispatch sequence below.
+
+(module
+ (table 3 3 funcref)
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ ;; Mutator: clears the immutability proof for table 0.
+ (func (export "mutate") (param i32)
+ local.get 0
+ ref.func $f1
+ table.set 0)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 16, x16, x17
+;; xmov x12, x0
+;; xmov x17, x2
+;; xzero x9
+;; xmov x16, x12
+;; call2 x16, x9, 0x3da // target = 0x3fb
+;; xmov x2, x17
+;; xmov x12, x16
+;; br_if_xugteq32_u8 x2, 3, 0x2b // target = 0x59
+;; 35: xbor64_s8 x10, x0, 1
+;; xmov x0, x12
+;; xload64le_o32 x11, x0, 48
+;; zext32 x12, x2
+;; xshl64_u6 x12, x12, 3
+;; xadd64 x11, x11, x12
+;; xstore64le_o32 x11, 0, x10
+;; pop_frame_restore 16, x16, x17
+;; ret
+;; 59: trap
+;; ╰─╼ trap: Normal(TableOutOfBounds)
+;;
+;; wasm[0]::function[4]:
+;; push_frame_save 16, x28
+;; xmov x3, x0
+;; br_if_xugteq32_u8 x2, 3, 0x7c // target = 0xe0
+;; 6b: xmov x1, x3
+;; xload64le_o32 x0, x1, 48
+;; zext32 x1, x2
+;; xshl64_u6 x2, x1, 3
+;; xadd64 x0, x0, x2
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0xce
+;; 90: xmov x28, x3
+;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0xe3
+;; 9a: xload32le_o32 x1, x0, 16
+;; xload64le_o32 x2, x28, 40
+;; xload32le_o32 x2, x2, 0
+;; br_if_xneq32 x1, x2, 0x37 // target = 0xe6
+;; b6: xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x28
+;; pop_frame_restore 16, x28
+;; ret
+;; ce: xzero x0
+;; d0: xmov x28, x3
+;; d3: call3 x28, x0, x1, 0x363 // target = 0x436
+;; db: jump -0x48 // target = 0x93
+;; e0: trap
+;; ╰─╼ trap: Normal(TableOutOfBounds)
+;; e3: trap
+;; ╰─╼ trap: Normal(IndirectCallToNull)
+;; e6: trap
+;; ╰─╼ trap: Normal(BadSignature)
+;;
+;; wasm[0]::array_to_wasm_trampoline[0]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x13, x0, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 72, x14
+;; xmov x14, sp
+;; xstore64le_o32 x13, 64, x14
+;; xpcadd x15, 0x2a // target = 0x13b
+;; xstore64le_o32 x13, 80, x15
+;; call -0x120 // target = 0x0
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x13b
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 13b: xzero x0
+;; 13d: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 142: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[1]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x13, x0, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 72, x14
+;; xmov x14, sp
+;; xstore64le_o32 x13, 64, x14
+;; xpcadd x15, 0x2a // target = 0x195
+;; xstore64le_o32 x13, 80, x15
+;; call -0x175 // target = 0x5
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x195
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 195: xzero x0
+;; 197: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 19c: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[2]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x13, x0, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 72, x14
+;; xmov x14, sp
+;; xstore64le_o32 x13, 64, x14
+;; xpcadd x15, 0x2a // target = 0x1ef
+;; xstore64le_o32 x13, 80, x15
+;; call -0x1c9 // target = 0xb
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1ef
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 1ef: xzero x0
+;; 1f1: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 1f6: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[3]:
+;; push_frame_save 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xload32le_o32 x13, x2, 0
+;; xload64le_o32 x14, x0, 8
+;; xmov_fp x15
+;; xstore64le_o32 x14, 72, x15
+;; xmov x15, sp
+;; xstore64le_o32 x14, 64, x15
+;; xpcadd x15, 0x1f // target = 0x23e
+;; xstore64le_o32 x14, 80, x15
+;; call3 x0, x1, x13, -0x21d // target = 0x11
+;; ├─╼ exception frame offset: SP = FP - 0x80
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x23e
+;; xone x0
+;; pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 23e: xzero x0
+;; 240: pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 245: ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[4]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xload32le_o32 x14, x2, 0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x15, x0, 8
+;; xmov_fp x2
+;; xstore64le_o32 x15, 72, x2
+;; xmov x2, sp
+;; xstore64le_o32 x15, 64, x2
+;; xpcadd x2, 0x2d // target = 0x2a2
+;; xstore64le_o32 x15, 80, x2
+;; call3 x0, x1, x14, -0x228 // target = 0x5c
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x2a2
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 2a2: xzero x0
+;; 2a4: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 2a9: ret
+;;
+;; signatures[0]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x2, x0
+;; xmov x17, x1
+;; xload64le_o32 x13, x1, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 48, x14
+;; xmov_lr x14
+;; xstore64le_o32 x13, 56, x14
+;; xload64le_o32 x0, x0, 8
+;; xmov x16, sp
+;; xone x4
+;; xmov x1, x2
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x15, x0
+;; br_if_not32 x15, 0x13 // target = 0x301
+;; 2f4: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; 301: xmov x1, x17
+;; 304: xload64le_o32 x0, x1, 16
+;; 30b: xload64le_o32 x0, x0, 328
+;; 312: call_indirect_host 42
+;; 316: trap
+;;
+;; signatures[1]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16
+;; xmov x5, x0
+;; xmov x16, x1
+;; xload64le_o32 x13, x1, 8
+;; xmov_fp x14
+;; xstore64le_o32 x13, 48, x14
+;; xmov_lr x14
+;; xstore64le_o32 x13, 56, x14
+;; xmov x3, sp
+;; xstore32le_o32 x3, 0, x2
+;; xload64le_o32 x0, x0, 8
+;; xone x4
+;; xmov x1, x5
+;; xmov x2, x16
+;; call_indirect_host 0
+;; zext8 x0, x0
+;; br_if_not32 x0, 0xc // target = 0x36d
+;; 367: pop_frame_restore 32, x16
+;; ret
+;; 36d: xmov x1, x16
+;; 370: xload64le_o32 x0, x1, 16
+;; 377: xload64le_o32 x0, x0, 328
+;; 37e: call_indirect_host 42
+;; 382: trap
+;;
+;; signatures[2]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x3, x0
+;; xmov x17, x1
+;; xload64le_o32 x14, x1, 8
+;; xmov_fp x15
+;; xstore64le_o32 x14, 48, x15
+;; xmov_lr x15
+;; xstore64le_o32 x14, 56, x15
+;; xmov x16, sp
+;; xstore32le_o32 x16, 0, x2
+;; xload64le_o32 x0, x0, 8
+;; xone x4
+;; xmov x1, x3
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x0, x0
+;; br_if_not32 x0, 0x13 // target = 0x3e3
+;; 3d6: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; 3e3: xmov x1, x17
+;; 3e6: xload64le_o32 x0, x1, 16
+;; 3ed: xload64le_o32 x0, x0, 328
+;; 3f4: call_indirect_host 42
+;; 3f8: trap
+;;
+;; wasmtime_builtin_ref_func:
+;; push_frame
+;; xload64le_o32 x8, x0, 8
+;; xmov_fp x9
+;; xstore64le_o32 x8, 48, x9
+;; xmov_lr x9
+;; xstore64le_o32 x8, 56, x9
+;; xload64le_o32 x10, x0, 16
+;; xmov x11, x0
+;; xload64le_o32 x0, x10, 48
+;; xmov x2, x1
+;; xmov x1, x11
+;; call_indirect_host 7
+;; pop_frame
+;; ret
+;;
+;; wasmtime_builtin_table_get_lazy_init_func_ref:
+;; push_frame
+;; xload64le_o32 x9, x0, 8
+;; xmov_fp x10
+;; xstore64le_o32 x9, 48, x10
+;; xmov_lr x10
+;; xstore64le_o32 x9, 56, x10
+;; xload64le_o32 x11, x0, 16
+;; xmov x13, x0
+;; xload64le_o32 x0, x11, 56
+;; xmov x3, x2
+;; xmov x2, x1
+;; xmov x1, x13
+;; call_indirect_host 8
+;; pop_frame
+;; ret
diff --git a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat
new file mode 100644
index 000000000000..398c5fee2cb3
--- /dev/null
+++ b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat
@@ -0,0 +1,88 @@
+;;! target = "pulley64"
+;;! test = "compile"
+
+;; Phase 2 fusion does NOT match when the sig check is NOT statically
+;; elided. With a runtime sig check, the continuation block starts
+;; with a sig load (from the funcref's `type_index` field) + comparison
+;; + trapz, NOT the two `wasm_call` / `vmctx` loads. Phase 2's
+;; recogniser requires the first two CLIF insts in the continuation
+;; to be the canonical loads, so it bails. Phase 1's band+brif fusion was
+;; expected as fallback, but the pinned output below keeps `xband64_s8` and `br_if_xeq64_i8` separate (TODO confirm).
+;;
+;; The module shape: an untyped `funcref` table with elem entries of
+;; MIXED signatures. With mixed sigs, `try_elide_sig_check_for_immutable_table`
+;; cannot establish a uniform static type, and the runtime sig check
+;; stays in the dispatch tail.
+;;
+;; Reference precedent: V8 issue 5913 ("call_indirect signature
+;; mismatch with table-sharing") + WebKit changeset 273962
+;; ("call_ref / non-null funcref"): sig elision under "assumed-
+;; immutable" predicates is a known footgun, and the safe fallback
+;; is "keep the runtime sig check".
+
+(module
+ (table 3 3 funcref) ;; min = max = 3; the elem segment below fills all three slots
+ (type $sig (func (param i32) (result i32)))
+
+ ;; $f1, $f2 match $sig.
+ (func $f1 (param i32) (result i32) i32.const 1)
+ (func $f2 (param i32) (result i32) i32.const 2)
+ ;; $f3 has a DIFFERENT signature (no params) — defeats uniform-sig elision.
+ (func $f3 (result i32) i32.const 3)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0 ;; argument forwarded to the callee
+ local.get 0 ;; table index (same local; top of stack, so popped first)
+ call_indirect (type $sig))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 16, x16, x18
+;; xmov x3, x0
+;; br_if_xugteq32_u8 x2, 3, 0x82 // target = 0x9b
+;; 20: xmov x1, x3
+;; xload64le_o32 x0, x1, 48
+;; zext32 x1, x2
+;; xmov x18, x2
+;; xshl64_u6 x2, x1, 3
+;; xadd64 x0, x0, x2
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x48 // target = 0x89
+;; 48: xmov x16, x3
+;; br_if_xeq64_i8 x0, 0, 0x53 // target = 0x9e
+;; 52: xload32le_o32 x1, x0, 16
+;; xload64le_o32 x2, x16, 40
+;; xload32le_o32 x2, x2, 0
+;; br_if_xneq32 x1, x2, 0x3a // target = 0xa1
+;; 6e: xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; xmov x2, x18
+;; call_indirect2 x1, x0, x16
+;; pop_frame_restore 16, x16, x18
+;; ret
+;; 89: xzero x0
+;; 8b: xmov x16, x3
+;; 8e: call3 x16, x0, x1, 0x281 // target = 0x30f
+;; 96: jump -0x4b // target = 0x4b
+;; 9b: trap
+;; 9e: trap
+;; a1: trap
diff --git a/tests/disas/pulley-fusion-no-fire-table-copy.wat b/tests/disas/pulley-fusion-no-fire-table-copy.wat
new file mode 100644
index 000000000000..9b89bccf1ec9
--- /dev/null
+++ b/tests/disas/pulley-fusion-no-fire-table-copy.wat
@@ -0,0 +1,187 @@
+;;! target = "pulley64"
+;;! test = "compile"
+
+;; Phase 1 / phase 2 fusion gating: `table.copy` mutates the
+;; destination table. With table 0 as the copy destination, its
+;; immutability proof is cleared and the eager-init predicate becomes
+;; false — fusion does not fire.
+;;
+;; Note that this only marks the DESTINATION as mutated; the source
+;; table (table 1) keeps its proof. wasm-benchmark/`environ`'s
+;; `table_mutability` test suite has the integration coverage for the
+;; src-vs-dst marking; this filetest pins the lowering-level
+;; consequence (Pulley dispatch tail is unfused for the dst table).
+;;
+;; wasm3 #547 (`op_CallIndirect` SEGV — missing bounds check on table
+;; index) is a related precedent: bulk-copy invariants that fail
+;; silently in one engine produce dispatch-time crashes in another.
+
+(module
+ (table $tdst 5 5 funcref) ;; table 0: 5 slots; its elem segment below fills only 0..2
+ (table $tsrc 5 5 funcref) ;; table 1: same shape, same three initial entries
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ ;; Bulk mutator: clears the immutability proof for table $tdst.
+ (func (export "copy") (param i32 i32 i32)
+ local.get 0 local.get 1 local.get 2 ;; dst offset, src offset, length
+ table.copy $tdst $tsrc) ;; destination table is named first, then the source
+
+ ;; Call through the (potentially-mutated) destination table.
+ (func (export "call_dst") (param i32) (result i32)
+ local.get 0
+ call_indirect $tdst (result i32))
+
+ ;; Call through the source table (never a mutation target in this module).
+ ;; Pinned function[5] below drops the signature compare but keeps band/brif and both loads unfused.
+ (func (export "call_src") (param i32) (result i32)
+ local.get 0
+ call_indirect $tsrc (result i32))
+
+ (elem (table $tdst) (i32.const 0) func $f1 $f2 $f3)
+ (elem (table $tsrc) (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 64, x16, x17, x20, x21, x24, x26, x28
+;; zext32 x2, x2
+;; zext32 x1, x4
+;; xadd64 x5, x2, x1
+;; br_if_xugt64_u8 x5, 5, 0x109 // target = 0x128
+;; 26: zext32 x5, x3
+;; xadd64 x6, x5, x1
+;; br_if_xugt64_u8 x6, 5, 0xff // target = 0x12b
+;; br_if_not32 x4, 0xcf // target = 0x102
+;; 39: xload64le_o32 x6, x0, 48
+;; xshl64_u6 x2, x2, 3
+;; xadd64 x17, x6, x2
+;; xload64le_o32 x16, x0, 64
+;; xmov x6, x0
+;; xshl64_u6 x0, x5, 3
+;; xadd64 x20, x16, x0
+;; xshl64_u6 x0, x1, 3
+;; xadd64 x24, x17, x0
+;; xadd64 x26, x20, x0
+;; xadd32 x28, x3, x4
+;; xmov x0, x3
+;; br_if_xulteq64 x20, x17, 0x12 // target = 0x77
+;; 6c: xmov x21, x6
+;; xmov x28, x0
+;; jump 0x50 // target = 0xc2
+;; 77: xsub32_u8 x28, x28, 1
+;; br_if_xugteq32_u8 x28, 5, 0xb3 // target = 0x12e
+;; 82: zext32 x1, x28
+;; xshl64_u6 x0, x1, 3
+;; xadd64 x0, x16, x0
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x72 // target = 0x108
+;; 9d: xmov x21, x6
+;; xbor64_s8 x0, x0, 1
+;; xsub64_u8 x24, x24, 8
+;; xstore64le_o32 x24, 0, x0
+;; xsub64_u8 x26, x26, 8
+;; br_if_xeq64 x26, x20, 0x4f // target = 0x102
+;; ba: xmov x6, x21
+;; jump -0x46 // target = 0x77
+;; br_if_xugteq32_u8 x28, 5, 0x6f // target = 0x131
+;; c9: zext32 x2, x28
+;; xshl64_u6 x3, x2, 3
+;; xadd64 x3, x16, x3
+;; xload64le_o32 x3, x3, 0
+;; xband64_s8 x0, x3, -2
+;; br_if_xeq64_i8 x3, 0, 0x3d // target = 0x11a
+;; e4: xbor64_s8 x5, x0, 1
+;; xstore64le_o32 x17, 0, x5
+;; xadd64_u8 x20, x20, 8
+;; xadd64_u8 x17, x17, 8
+;; xadd32_u8 x28, x28, 1
+;; br_if_xneq64 x20, x26, -0x39 // target = 0xc2
+;; 102: pop_frame_restore 64, x16, x17, x20, x21, x24, x26, x28
+;; ret
+;; 108: xone x0
+;; 10a: xmov x21, x6
+;; 10d: call3 x21, x0, x1, 0x4bf // target = 0x5cc
+;; 115: jump -0x75 // target = 0xa0
+;; 11a: xone x4
+;; 11c: call2 x21, x4, 0x4b0 // target = 0x5cc
+;; 123: jump -0x3f // target = 0xe4
+;; 128: trap
+;; 12b: trap
+;; 12e: trap
+;; 131: trap
+;;
+;; wasm[0]::function[4]:
+;; push_frame_save 16, x28
+;; xmov x3, x0
+;; br_if_xugteq32_u8 x2, 5, 0x7c // target = 0x1b8
+;; 143: xmov x1, x3
+;; xload64le_o32 x0, x1, 48
+;; zext32 x1, x2
+;; xshl64_u6 x2, x1, 3
+;; xadd64 x0, x0, x2
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0x1a6
+;; 168: xmov x28, x3
+;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0x1bb
+;; 172: xload32le_o32 x1, x0, 16
+;; xload64le_o32 x2, x28, 40
+;; xload32le_o32 x2, x2, 0
+;; br_if_xneq32 x1, x2, 0x37 // target = 0x1be
+;; 18e: xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x28
+;; pop_frame_restore 16, x28
+;; ret
+;; 1a6: xzero x0
+;; 1a8: xmov x28, x3
+;; 1ab: call3 x28, x0, x1, 0x421 // target = 0x5cc
+;; 1b3: jump -0x48 // target = 0x16b
+;; 1b8: trap
+;; 1bb: trap
+;; 1be: trap
+;;
+;; wasm[0]::function[5]:
+;; push_frame_save 16, x25
+;; xmov x3, x0
+;; br_if_xugteq32_u8 x2, 5, 0x60 // target = 0x229
+;; 1d0: xmov x1, x3
+;; xload64le_o32 x0, x1, 64
+;; zext32 x15, x2
+;; xshl64_u6 x1, x15, 3
+;; xadd64 x0, x0, x1
+;; xload64le_o32 x1, x0, 0
+;; xband64_s8 x0, x1, -2
+;; br_if_xeq64_i8 x1, 0, 0x29 // target = 0x217
+;; 1f5: xmov x25, x3
+;; br_if_xeq64_i8 x0, 0, 0x34 // target = 0x22c
+;; 1ff: xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x25
+;; pop_frame_restore 16, x25
+;; ret
+;; 217: xone x0
+;; 219: xmov x25, x3
+;; 21c: call3 x25, x0, x15, 0x3b0 // target = 0x5cc
+;; 224: jump -0x2c // target = 0x1f8
+;; 229: trap
+;; 22c: trap
diff --git a/tests/disas/pulley-fusion-no-fire-table-fill.wat b/tests/disas/pulley-fusion-no-fire-table-fill.wat
new file mode 100644
index 000000000000..9bb480d7ff20
--- /dev/null
+++ b/tests/disas/pulley-fusion-no-fire-table-fill.wat
@@ -0,0 +1,110 @@
+;;! target = "pulley64"
+;;! test = "compile"
+
+;; Phase 1 / phase 2 fusion gating: `table.fill` is a bulk-memory op
+;; that mutates an arbitrary range of the table. Like `table.set`, it
+;; sets `tables_mutated[idx] = true` for the target table and disables
+;; the eager-init predicate. The dispatch tail must be the unfused
+;; sequence with the original `brif value` (not `brif value_masked`),
+;; so neither phase 1 nor phase 2 fires.
+;;
+;; Reference: GHSA-q49f-xg75-m9xw (wasmtime Winch table.fill host
+;; panic) — bulk table ops are a classic invariant-edge for any
+;; "immutable-table" cache or fusion. wasm3 #335 (null table element
+;; on Swift reactor-mode tables) showed how a partially-initialised
+;; table breaks a "table is fully populated" assumption.
+
+(module
+ (table 3 3 funcref) ;; min = max = 3; fully covered by the elem segment below
+
+ (func $f1 (result i32) i32.const 1)
+ (func $f2 (result i32) i32.const 2)
+ (func $f3 (result i32) i32.const 3)
+
+ ;; Bulk mutator: clears the immutability proof for table 0.
+ (func (export "fill_some") (param $dst i32)
+ local.get $dst ;; first table index to overwrite
+ ref.func $f1 ;; value written into each slot
+ i32.const 1 ;; number of slots to fill
+ table.fill 0)
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0 ;; table index
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1 $f2 $f3))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]::f2:
+;; push_frame
+;; xconst8 x0, 2
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[2]::f3:
+;; push_frame
+;; xconst8 x0, 3
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[3]:
+;; push_frame_save 16, x16, x20
+;; xmov x16, x2
+;; xzero x12
+;; xmov x20, x0
+;; call2 x20, x12, 0x3f7 // target = 0x415
+;; xmov x15, x0
+;; xmov x2, x16
+;; xmov x0, x20
+;; zext32 x12, x2
+;; xadd64_u8 x13, x12, 1
+;; br_if_xugt64_u8 x13, 3, 0x3e // target = 0x73
+;; 3c: xload64le_o32 x13, x0, 48
+;; xshl64_u6 x14, x12, 3
+;; xadd64 x13, x13, x14
+;; xmov x0, x15
+;; xmov x12, x13
+;; xbor64_s8 x14, x0, 1
+;; xstore64le_o32 x12, 0, x14
+;; xadd64_u8 x15, x12, 8
+;; br_if_xeq64 x12, x13, 0xf // target = 0x6d
+;; 65: xmov x12, x15
+;; jump -0x19 // target = 0x4f
+;; 6d: pop_frame_restore 16, x16, x20
+;; ret
+;; 73: trap
+;;
+;; wasm[0]::function[4]:
+;; push_frame_save 16, x28
+;; xmov x3, x0
+;; br_if_xugteq32_u8 x2, 3, 0x7c // target = 0xfa
+;; 85: xmov x1, x3
+;; xload64le_o32 x0, x1, 48
+;; zext32 x1, x2
+;; xshl64_u6 x2, x1, 3
+;; xadd64 x0, x0, x2
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0xe8
+;; aa: xmov x28, x3
+;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0xfd
+;; b4: xload32le_o32 x1, x0, 16
+;; xload64le_o32 x2, x28, 40
+;; xload32le_o32 x2, x2, 0
+;; br_if_xneq32 x1, x2, 0x37 // target = 0x100
+;; d0: xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x28
+;; pop_frame_restore 16, x28
+;; ret
+;; e8: xzero x0
+;; ea: xmov x28, x3
+;; ed: call3 x28, x0, x1, 0x363 // target = 0x450
+;; f5: jump -0x48 // target = 0xad
+;; fa: trap
+;; fd: trap
+;; 100: trap
diff --git a/tests/disas/pulley-fusion-no-fire-table-grow.wat b/tests/disas/pulley-fusion-no-fire-table-grow.wat
new file mode 100644
index 000000000000..5dcac37c501e
--- /dev/null
+++ b/tests/disas/pulley-fusion-no-fire-table-grow.wat
@@ -0,0 +1,103 @@
+;;! target = "pulley64"
+;;! test = "compile"
+
+;; Phase 1 / phase 2 fusion gating: `table.grow` adds slots at the
+;; end of the table; new slots default to `ref.null func`. The
+;; "eagerly-initialised, fully-populated" predicate doesn't hold
+;; after grow, so fusion is disabled.
+;;
+;; In our `table_mutability` accounting (crates/environ), `table.grow`
+;; sets the mutated bit for the target table the same way
+;; `table.set` does. This filetest pins the lowering-level
+;; consequence: the unfused dispatch sequence on the grown table.
+;;
+;; Reference: wasm3 #547 — bounds-check ↔ growth races; Luau release/
+;; 717 — "writes to userdata did not invalidate the store cache",
+;; same shape of "fused-op cached a base pointer that got
+;; reallocated".
+
+(module
+ (table 1 funcref) ;; min 1, no declared max
+
+ (func $f1 (result i32) i32.const 1)
+
+ (func (export "grow") (param i32) (result i32)
+ ref.func $f1 ;; init value for the newly added slots
+ local.get 0 ;; number of slots to add
+ table.grow 0) ;; yields the previous size, or -1 on failure
+
+ (func (export "call_it") (param i32) (result i32)
+ local.get 0 ;; table index
+ call_indirect (result i32))
+
+ (elem (i32.const 0) func $f1))
+;; wasm[0]::function[0]::f1:
+;; push_frame
+;; xone x0
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::function[1]:
+;; push_frame_save 48, x18, x19, x20, x23, x28
+;; xmov x23, x2
+;; xzero x19
+;; xmov x28, x0
+;; call2 x28, x19, 0x313 // target = 0x325
+;; xmov x20, x0
+;; xmov x2, x23
+;; xmov x0, x28
+;; zext32 x18, x2
+;; call3 x28, x19, x18, 0x379 // target = 0x39e
+;; xmov x1, x0
+;; br_if_xeq32_i8 x1, -1, 0x51 // target = 0x81
+;; 37: xload64le_o32 x3, x28, 56
+;; zext32 x2, x1
+;; xadd64 x4, x2, x18
+;; zext32 x0, x3
+;; br_if_xult64 x0, x4, 0x43 // target = 0x8a
+;; 4e: xload64le_o32 x0, x28, 48
+;; xshl64_u6 x2, x2, 3
+;; xadd64 x0, x0, x2
+;; xshl64_u6 x2, x18, 3
+;; xadd64 x2, x0, x2
+;; br_if_xeq64_i8 x18, 0, 0x20 // target = 0x81
+;; 68: xmov x3, x20
+;; xbor64_s8 x4, x3, 1
+;; xstore64le_o32 x0, 0, x4
+;; xadd64_u8 x0, x0, 8
+;; br_if_xneq64 x0, x2, -0xf // target = 0x6b
+;; 81: xmov x0, x1
+;; pop_frame_restore 48, x18, x19, x20, x23, x28
+;; ret
+;; 8a: trap
+;;
+;; wasm[0]::function[2]:
+;; push_frame_save 16, x16
+;; xload64le_o32 x1, x0, 56
+;; br_if_xulteq32 x1, x2, 0x7c // target = 0x115
+;; a0: xload64le_o32 x3, x0, 48
+;; xmov x4, x0
+;; zext32 x1, x2
+;; xshl64_u6 x0, x1, 3
+;; xadd64 x0, x3, x0
+;; xload64le_o32 x2, x0, 0
+;; xband64_s8 x0, x2, -2
+;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0x103
+;; c5: xmov x16, x4
+;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0x118
+;; cf: xload32le_o32 x1, x0, 16
+;; xload64le_o32 x2, x16, 40
+;; xload32le_o32 x2, x2, 0
+;; br_if_xneq32 x1, x2, 0x37 // target = 0x11b
+;; eb: xload64le_o32 x1, x0, 8
+;; xload64le_o32 x0, x0, 24
+;; call_indirect2 x1, x0, x16
+;; pop_frame_restore 16, x16
+;; ret
+;; 103: xzero x0
+;; 105: xmov x16, x4
+;; 108: call3 x16, x0, x1, 0x258 // target = 0x360
+;; 110: jump -0x48 // target = 0xc8
+;; 115: trap
+;; 118: trap
+;; 11b: trap
diff --git a/tests/disas/pulley-fusion-no-fire-user-mask.wat b/tests/disas/pulley-fusion-no-fire-user-mask.wat
new file mode 100644
index 000000000000..2ce412df8cb9
--- /dev/null
+++ b/tests/disas/pulley-fusion-no-fire-user-mask.wat
@@ -0,0 +1,92 @@
+;;! target = "pulley64"
+;;! test = "compile"
+;;! objdump = "--funcs all"
+
+;; Phase 1 / phase 2 fusion gating against user wasm: the recogniser
+;; gates on `imm.bits() == -2`, which would naively match the wat
+;; `(i32.const -2) (i32.and) (br_if)` user pattern and risk a soundness
+;; mismatch (the fused op tests UNMASKED src for non-zero, whereas the
+;; original brif tests `(v & -2) != 0` — they differ at v == 1).
+;;
+;; The bug is unreachable from wasm because:
+;; * `br_if` cond is always i32 (wasm validation), AND
+;; * the wat parser stores `(i32.const -2)` as `Imm64(0xFFFFFFFE)`
+;; (= 4294967294), NOT `Imm64(-2)`.
+;; So `imm.bits() == -2` doesn't match the wat-emitted i32 form. The
+;; only producer of `Imm64(-2)` reaching the recogniser is
+;; `func_environ::get_or_init_func_ref_table_elem`'s call to
+;; `Imm64::from(-2_i64)`.
+;;
+;; This test pins the surface behaviour. If the gate ever changes to
+;; accept i32 -2 encodings too, the disas would suddenly start
+;; containing `xband32_s8_br_if_*` or `xfuncref_dispatch_*` here, and
+;; this test fails — that's the signal to re-audit soundness.
+
+(module
+ (func (export "test") (param $v i32) (result i32) (local $tmp i32)
+ local.get $v
+ i32.const -2
+ i32.and
+ local.tee $tmp ;; store v & -2 and leave a copy on the stack as the branch result
+ local.get $tmp ;; same value again, used as the br_if condition
+ br_if 0 ;; non-zero: leave the function returning v & -2
+ drop ;; zero: discard the masked value
+ i32.const 999)) ;; fallthrough result
+;; wasm[0]::function[0]:
+;; push_frame
+;; xband32_s8 x0, x2, -2
+;; br_if32 x0, 0xa // target = 0xf
+;; b: xconst16 x0, 999
+;; pop_frame
+;; ret
+;;
+;; wasm[0]::array_to_wasm_trampoline[0]:
+;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; xload32le_o32 x14, x2, 0
+;; xstore64le_o32 sp, 0, x2
+;; xload64le_o32 x15, x0, 8
+;; xmov_fp x2
+;; xstore64le_o32 x15, 72, x2
+;; xmov x2, sp
+;; xstore64le_o32 x15, 64, x2
+;; xpcadd x2, 0x2d // target = 0x6d
+;; xstore64le_o32 x15, 80, x2
+;; call3 x0, x1, x14, -0x4f // target = 0x0
+;; ├─╼ exception frame offset: SP = FP - 0x90
+;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x6d
+;; xload64le_o32 x2, sp, 0
+;; xstore32le_o32 x2, 0, x0
+;; xone x0
+;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; ret
+;; 6d: xzero x0
+;; 6f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0
+;; 74: ret
+;;
+;; signatures[0]::wasm_to_array_trampoline:
+;; push_frame_save 32, x16, x17
+;; xmov x3, x0
+;; xmov x17, x1
+;; xload64le_o32 x14, x1, 8
+;; xmov_fp x15
+;; xstore64le_o32 x14, 48, x15
+;; xmov_lr x15
+;; xstore64le_o32 x14, 56, x15
+;; xmov x16, sp
+;; xstore32le_o32 x16, 0, x2
+;; xload64le_o32 x0, x0, 8
+;; xone x4
+;; xmov x1, x3
+;; xmov x2, x17
+;; xmov x3, x16
+;; call_indirect_host 0
+;; zext8 x0, x0
+;; br_if_not32 x0, 0x13 // target = 0xd3
+;; c6: xload32le_o32 x0, x16, 0
+;; pop_frame_restore 32, x16, x17
+;; ret
+;; d3: xmov x1, x17
+;; d6: xload64le_o32 x0, x1, 16
+;; dd: xload64le_o32 x0, x0, 328
+;; e4: call_indirect_host 42
+;; e8: trap
diff --git a/tests/disas/pulley/call.wat b/tests/disas/pulley/call.wat
index 233ca7be3c35..d9bc3142fd99 100644
--- a/tests/disas/pulley/call.wat
+++ b/tests/disas/pulley/call.wat
@@ -8,9 +8,7 @@
;; wasm[0]::function[1]:
;; push_frame
;; xload32le_o32 x3, x0, 28
-;; xmov x6, x0
-;; xload32le_o32 x0, x6, 36
-;; xmov x1, x6
-;; call_indirect x3
+;; xload32le_o32 x4, x0, 36
+;; call_indirect2 x3, x4, x0
;; pop_frame
;; ret
diff --git a/tests/disas/readonly-funcrefs.wat b/tests/disas/readonly-funcrefs.wat
index 9febf947e3b1..e341fbcc4dba 100644
--- a/tests/disas/readonly-funcrefs.wat
+++ b/tests/disas/readonly-funcrefs.wat
@@ -35,7 +35,6 @@
;; function u0:1(i64 vmctx, i64, i32) tail {
;; region0 = 8 "VMContext+0x8"
;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
-;; region2 = 40 "VMContext+0x28"
;; gv0 = vmctx
;; gv1 = load.i64 notrap aligned readonly region0 gv0+8
;; gv2 = load.i64 notrap aligned gv1+24
@@ -67,14 +66,9 @@
;; @0031 jump block3(v18)
;;
;; block3(v15: i64):
-;; @0031 v21 = load.i32 user7 aligned readonly v15+16
-;; @0031 v19 = load.i64 notrap aligned readonly can_move region2 v0+40
-;; @0031 v20 = load.i32 notrap aligned readonly can_move v19
-;; @0031 v22 = icmp eq v21, v20
-;; @0031 trapz v22, user8
-;; @0031 v24 = load.i64 notrap aligned readonly v15+8
-;; @0031 v25 = load.i64 notrap aligned readonly v15+24
-;; @0031 call_indirect sig0, v24(v25, v0)
+;; @0031 v19 = load.i64 user7 aligned readonly v15+8
+;; @0031 v20 = load.i64 notrap aligned readonly v15+24
+;; @0031 call_indirect sig0, v19(v20, v0)
;; @0034 jump block1
;;
;; block1:
diff --git a/tests/disas/startup-elem-active.wat b/tests/disas/startup-elem-active.wat
index 0c3158f8c2b1..40cdfd2d91f4 100644
--- a/tests/disas/startup-elem-active.wat
+++ b/tests/disas/startup-elem-active.wat
@@ -42,37 +42,21 @@
;; function u2415919104:0(i64 vmctx, i64) tail {
;; region0 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))"
;; gv0 = vmctx
-;; gv1 = load.i64 notrap aligned gv0+48
-;; gv2 = load.i64 notrap aligned gv0+56
+;; gv1 = load.i64 notrap aligned readonly can_move gv0+48
;;
;; block0(v0: i64, v1: i64):
-;; v4 = load.i64 notrap aligned v0+56
-;; v5 = ireduce.i32 v4
-;; v6 = uextend.i64 v5
-;; v86 = iconst.i64 4
-;; v92 = icmp ult v6, v86 ; v86 = 4
-;; trapnz v92, user6
-;; v13 = load.i64 notrap aligned v0+48
-;; v103 = iconst.i32 21
-;; v2 = iconst.i32 1
-;; v114 = icmp ule v5, v2 ; v2 = 1
-;; v79 = iconst.i64 0
-;; v17 = iadd v13, v86 ; v86 = 4
-;; v34 = select_spectre_guard v114, v79, v17 ; v79 = 0
-;; store user6 aligned region0 v103, v34 ; v103 = 21
+;; v100 = iconst.i32 21
+;; v12 = load.i64 notrap aligned readonly can_move v0+48
+;; v79 = iconst.i64 4
+;; v16 = iadd v12, v79 ; v79 = 4
+;; store user6 aligned region0 v100, v16 ; v100 = 21
;; v117 = iconst.i32 23
-;; v123 = iconst.i32 2
-;; v129 = icmp ule v5, v123 ; v123 = 2
-;; v131 = iconst.i64 8
-;; v49 = iadd v13, v131 ; v131 = 8
-;; v51 = select_spectre_guard v129, v79, v49 ; v79 = 0
-;; store user6 aligned region0 v117, v51 ; v117 = 23
-;; v133 = iconst.i32 25
-;; v3 = iconst.i32 3
-;; v144 = icmp ule v5, v3 ; v3 = 3
-;; v146 = iconst.i64 12
-;; v66 = iadd v13, v146 ; v146 = 12
-;; v68 = select_spectre_guard v144, v79, v66 ; v79 = 0
-;; store user6 aligned region0 v133, v68 ; v133 = 25
+;; v134 = iconst.i64 8
+;; v46 = iadd v12, v134 ; v134 = 8
+;; store user6 aligned region0 v117, v46 ; v117 = 23
+;; v136 = iconst.i32 25
+;; v152 = iconst.i64 12
+;; v62 = iadd v12, v152 ; v152 = 12
+;; store user6 aligned region0 v136, v62 ; v136 = 25
;; return
;; }
diff --git a/tests/disas/startup-table-initial-value.wat b/tests/disas/startup-table-initial-value.wat
index 7b39ecc93333..a2cb9a5f6da2 100644
--- a/tests/disas/startup-table-initial-value.wat
+++ b/tests/disas/startup-table-initial-value.wat
@@ -35,31 +35,24 @@
;;
;; function u2415919104:0(i64 vmctx, i64) tail {
;; gv0 = vmctx
-;; gv1 = load.i64 notrap aligned gv0+48
-;; gv2 = load.i64 notrap aligned gv0+56
+;; gv1 = load.i64 notrap aligned readonly can_move gv0+48
;;
;; block0(v0: i64, v1: i64):
-;; v9 = load.i64 notrap aligned v0+56
-;; v10 = ireduce.i32 v9
-;; v11 = uextend.i64 v10
-;; v41 = iconst.i64 10
-;; v53 = icmp ult v11, v41 ; v41 = 10
-;; trapnz v53, user6
-;; v18 = load.i64 notrap aligned v0+48
+;; v17 = load.i64 notrap aligned readonly can_move v0+48
;; v3 = iconst.i32 1
-;; v83 = iconst.i64 36
-;; v85 = iadd v18, v83 ; v83 = 36
-;; v20 = iconst.i64 4
-;; jump block1(v18)
-;;
-;; block1(v29: i64):
-;; v88 = iconst.i32 1
-;; store notrap aligned v88, v29 ; v88 = 1
-;; v89 = iadd.i64 v18, v83 ; v83 = 36
-;; v90 = icmp eq v29, v89
-;; v91 = iconst.i64 4
-;; v92 = iadd v29, v91 ; v91 = 4
-;; brif v90, block2, block1(v92)
+;; v84 = iconst.i64 36
+;; v86 = iadd v17, v84 ; v84 = 36
+;; v19 = iconst.i64 4
+;; jump block1(v17)
+;;
+;; block1(v28: i64):
+;; v89 = iconst.i32 1
+;; store notrap aligned v89, v28 ; v89 = 1
+;; v90 = iadd.i64 v17, v84 ; v84 = 36
+;; v91 = icmp eq v28, v90
+;; v92 = iconst.i64 4
+;; v93 = iadd v28, v92 ; v92 = 4
+;; brif v91, block2, block1(v93)
;;
;; block2:
;; return
diff --git a/tests/misc_testsuite/immutable-table-call-indirect.wast b/tests/misc_testsuite/immutable-table-call-indirect.wast
new file mode 100644
index 000000000000..3b40cb9ab534
--- /dev/null
+++ b/tests/misc_testsuite/immutable-table-call-indirect.wast
@@ -0,0 +1,71 @@
+;;! reference_types = true
+
+;; call_indirect through tables that are never grown, exported, or mutated.
+;; Compilation may use a constant bound and elide null/signature checks on
+;; these shapes; runtime behavior must be unchanged: in-bounds calls work,
+;; and out-of-bounds, null-slot, and signature-mismatch accesses still trap.
+
+;; Mixed-signature immutable table; slots 3 and 4 are left uninitialized (null).
+(module
+ (type $i2i (func (param i32) (result i32)))
+ (type $v2i (func (result i32)))
+ (table 5 funcref)
+ (elem (i32.const 0) $add1 $ten $add1) ;; slots 0..2 = $add1, $ten, $add1
+
+ (func $add1 (type $i2i) (i32.add (local.get 0) (i32.const 1)))
+ (func $ten (type $v2i) (i32.const 10))
+
+ (func (export "call-i2i") (param i32 i32) (result i32) ;; (table index, callee argument)
+ (call_indirect (type $i2i) (local.get 1) (local.get 0)))
+ (func (export "call-v2i") (param i32) (result i32) ;; (table index)
+ (call_indirect (type $v2i) (local.get 0))))
+
+(assert_return (invoke "call-i2i" (i32.const 0) (i32.const 41)) (i32.const 42))
+(assert_return (invoke "call-i2i" (i32.const 2) (i32.const 7)) (i32.const 8))
+(assert_return (invoke "call-v2i" (i32.const 1)) (i32.const 10))
+
+;; Signature mismatch still traps.
+(assert_trap (invoke "call-i2i" (i32.const 1) (i32.const 0)) "indirect call type mismatch")
+(assert_trap (invoke "call-v2i" (i32.const 0)) "indirect call type mismatch")
+
+;; Null slots still trap: slots 3 and 4 were never initialized.
+(assert_trap (invoke "call-i2i" (i32.const 3) (i32.const 0)) "uninitialized element")
+(assert_trap (invoke "call-v2i" (i32.const 4)) "uninitialized element")
+
+;; Out of bounds still traps against the constant bound.
+(assert_trap (invoke "call-i2i" (i32.const 5) (i32.const 0)) "undefined element")
+(assert_trap (invoke "call-i2i" (i32.const -1) (i32.const 0)) "undefined element")
+
+;; Uniform-signature immutable table, fully initialized.
+(module
+ (type $v2i (func (result i32)))
+ (table 3 funcref)
+ (elem (i32.const 0) $a $b $c) ;; fills all three slots
+
+ (func $a (type $v2i) (i32.const 1))
+ (func $b (type $v2i) (i32.const 2))
+ (func $c (type $v2i) (i32.const 3))
+
+ (func (export "call") (param i32) (result i32) ;; (table index)
+ (call_indirect (type $v2i) (local.get 0)))
+ (func (export "call-wrong-type") (param i32 i32) (result i32) ;; (table index, callee argument)
+ (call_indirect (param i32) (result i32) (local.get 1) (local.get 0)))) ;; inline type differs from $v2i
+
+(assert_return (invoke "call" (i32.const 0)) (i32.const 1))
+(assert_return (invoke "call" (i32.const 1)) (i32.const 2))
+(assert_return (invoke "call" (i32.const 2)) (i32.const 3))
+(assert_trap (invoke "call" (i32.const 3)) "undefined element")
+
+;; A caller whose expected type differs from the table's uniform type must
+;; still observe the mismatch.
+(assert_trap (invoke "call-wrong-type" (i32.const 0) (i32.const 0)) "indirect call type mismatch")
+
+;; Same shapes through a declared-growable table (min 0, max 100) that is
+;; never actually grown: an empty never-grown table has no valid index.
+(module
+ (table 0 100 funcref)
+ (func (export "call-empty") (param i32) ;; (table index)
+ (call_indirect (local.get 0)))) ;; no type use: expected callee type is [] -> []
+
+(assert_trap (invoke "call-empty" (i32.const 0)) "undefined element")
+(assert_trap (invoke "call-empty" (i32.const 99)) "undefined element")