diff --git a/cranelift/codegen/meta/src/pulley.rs b/cranelift/codegen/meta/src/pulley.rs index 77e760eda7a3..ef8a91f33ab0 100644 --- a/cranelift/codegen/meta/src/pulley.rs +++ b/cranelift/codegen/meta/src/pulley.rs @@ -97,6 +97,13 @@ impl Inst<'_> { // Skip special instructions not used in Cranelift. "XPush32Many" | "XPush64Many" | "XPop32Many" | "XPop64Many" => true, + // Phase-3 fused dispatch op: 3 writable results would + // require extending the auto-codegen `results[..]` match + // arms below. The op is emitted only via the hand-written + // `MInst::BandFuncrefDispatch` path, so no auto-generated + // ISLE rule is needed — skip here. + n if n.starts_with("XbandFuncrefDispatch") => true, + // Skip more branching-related instructions. n => n.starts_with("Br"), } diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 258551a17598..bab0fa9a25de 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -67,6 +67,48 @@ ;; Jump to `then` if `c` is true, otherwise to `else`. (BrIf (cond Cond) (taken MachLabel) (not_taken MachLabel)) + ;; Fused `band src, mask` + `brif src` emitted at the call_indirect + ;; lazy-init brif site. `dst = src & sign_extend(mask)` is + ;; unconditional; the branch test is on `src`'s low-32 or full-64 bits + ;; per `size`. Pulley-side: `xband*_s8_br_if_*`. + (BandBrIf + (dst WritableXReg) + (src XReg) + (mask i8) + (size OperandSize) + (taken MachLabel) + (not_taken MachLabel)) + + ;; Funcref-dispatch fusion: `brif (band v -2) + load code + load vmctx` + ;; across the brif and its continuation block. Emitted at the + ;; call_indirect lazy-init site under + ;; `is_eagerly_initialized_funcref_table`. Pulley-side: + ;; `xfuncref_dispatch_{x64,not_x64,x32,not_x32}`. 
+ (FuncrefDispatch + (dst_code WritableXReg) + (dst_vmctx WritableXReg) + (src XReg) + (offset_code i8) + (offset_vmctx i8) + (size OperandSize) + (taken MachLabel) + (not_taken MachLabel)) + + ;; FuncrefDispatch + the preceding `xband_s8 -2` absorbed. `src` is + ;; the unmasked funcref; the fused op writes `dst_masked = src & -2` + ;; so the brif's block-call-arg copy still has a producer. + ;; Pulley-side: `xband_funcref_dispatch_{x64,not_x64,x32,not_x32}`. + (BandFuncrefDispatch + (dst_masked WritableXReg) + (dst_code WritableXReg) + (dst_vmctx WritableXReg) + (src XReg) + (offset_code i8) + (offset_vmctx i8) + (size OperandSize) + (taken MachLabel) + (not_taken MachLabel)) + ;; Load the memory address referenced by `mem` into `dst`. (LoadAddr (dst WritableXReg) (mem Amode)) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs index e97e3303ef99..8e385df62a2e 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs @@ -577,6 +577,19 @@ pub struct PulleyCall { pub args: SmallVec<[XReg; 4]>, } +/// Payload of `CallInfo` for `Inst::IndirectCall`. Mirror of `PulleyCall`: +/// the first 0–4 integer ABI args are tracked here so the emitted +/// `call_indirect{1,2,3,4}` opcode moves them into `x0..x3` itself +/// instead of regalloc synthesising `xmov`s. Remaining args use the +/// fixed-preg path in `CallInfo::uses`. +#[derive(Clone, Debug)] +pub struct PulleyCallIndirect { + /// The register holding the call target. + pub target: XReg, + /// Up to 4 integer args destined for `x0..x3`. 
+ pub args: SmallVec<[XReg; 4]>, +} + pub use super::super::lower::isle::generated_code::AddrO32; impl Copy for AddrO32 {} diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 74bff5d97a7d..b06535c56312 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -233,7 +233,22 @@ fn pulley_emit

( } Inst::IndirectCall { info } => { - enc::call_indirect(sink, info.dest); + // Drop args already in their ABI register so we can pick a + // narrower `call_indirectN` — mirrors the direct-call shrink + // above. + let target = info.dest.target; + let mut args = &info.dest.args[..]; + while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) { + args = &args[..args.len() - 1]; + } + match args { + [] => enc::call_indirect(sink, target), + [x0] => enc::call_indirect1(sink, target, *x0), + [x0, x1] => enc::call_indirect2(sink, target, *x0, *x1), + [x0, x1, x2] => enc::call_indirect3(sink, target, *x0, *x1, *x2), + [x0, x1, x2, x3] => enc::call_indirect4(sink, target, *x0, *x1, *x2, *x3), + _ => unreachable!(), + } if let Some(s) = state.take_stack_map() { let offset = sink.cur_offset(); @@ -367,6 +382,297 @@ fn pulley_emit

( assert_eq!(sink.cur_offset(), not_taken_end); } + Inst::BandBrIf { + dst, + src, + mask, + size, + taken, + not_taken, + } => { + // The forward form branches to `taken` if `src` is non-zero + // (after computing `dst = src & sext(mask)`). The inverted form + // branches if `src` is zero — used by MachBuffer's fallthrough- + // flip optimization. Both must encode to equal-length bytes; the + // `_x*` and `_not_x*` ops share the same operand shape, so they + // do. + let dst_writable = *dst; + let src_reg = *src; + let mask_imm = *mask; + + // Compute the inverted-form encoding (branch on src == 0) into a + // SmallVec so MachBuffer can use it for branch-direction flipping. + let mut inverted = SmallVec::<[u8; 16]>::new(); + match size { + OperandSize::Size32 => { + enc::xband32_s8_br_if_not_x32( + &mut inverted, + dst_writable, + src_reg, + mask_imm, + 0, + ); + } + OperandSize::Size64 => { + enc::xband64_s8_br_if_not_x64( + &mut inverted, + dst_writable, + src_reg, + mask_imm, + 0, + ); + } + } + let len = inverted.len() as u32; + inverted.clear(); + let inv_rel = i32::try_from(len - 4).unwrap(); + match size { + OperandSize::Size32 => { + enc::xband32_s8_br_if_not_x32( + &mut inverted, + dst_writable, + src_reg, + mask_imm, + inv_rel, + ); + } + OperandSize::Size64 => { + enc::xband64_s8_br_if_not_x64( + &mut inverted, + dst_writable, + src_reg, + mask_imm, + inv_rel, + ); + } + } + assert!(len > 4); + + // Emit the forward form (branch on src != 0). 
+ let taken_end = *start_offset + len; + sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); + sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); + patch_pc_rel_offset(sink, |sink| match size { + OperandSize::Size32 => { + enc::xband32_s8_br_if_x32(sink, dst_writable, src_reg, mask_imm, 0) + } + OperandSize::Size64 => { + enc::xband64_s8_br_if_x64(sink, dst_writable, src_reg, mask_imm, 0) + } + }); + debug_assert_eq!(sink.cur_offset(), taken_end); + + // Unconditional jump to `not_taken` for the fall-through path. + let not_taken_start = taken_end + 1; + let not_taken_end = not_taken_start + 4; + sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel); + sink.add_uncond_branch(taken_end, not_taken_end, *not_taken); + patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0)); + assert_eq!(sink.cur_offset(), not_taken_end); + } + + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + // Same scaffolding as Inst::BrIf / Inst::BandBrIf. Forward + // form's branch fires on `src != 0` (after loads); inverted + // form branches on `src == 0` (loads on fall-through). Both + // encodings have the same length because they share the + // 5-operand shape. + let dst_code_w = *dst_code; + let dst_vmctx_w = *dst_vmctx; + let src_reg = *src; + let oc = *offset_code; + let ov = *offset_vmctx; + + // Inverted encoding into a scratch SmallVec for MachBuffer. 
+ let mut inverted = SmallVec::<[u8; 16]>::new(); + match size { + OperandSize::Size32 => { + enc::xfuncref_dispatch_not_x32( + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + 0, + ); + } + OperandSize::Size64 => { + enc::xfuncref_dispatch_not_x64( + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + 0, + ); + } + } + let len = inverted.len() as u32; + inverted.clear(); + let inv_rel = i32::try_from(len - 4).unwrap(); + match size { + OperandSize::Size32 => { + enc::xfuncref_dispatch_not_x32( + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + inv_rel, + ); + } + OperandSize::Size64 => { + enc::xfuncref_dispatch_not_x64( + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + inv_rel, + ); + } + } + assert!(len > 4); + + // Emit the forward form (branch on src != 0). + let taken_end = *start_offset + len; + sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); + sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); + patch_pc_rel_offset(sink, |sink| match size { + OperandSize::Size32 => { + enc::xfuncref_dispatch_x32(sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0) + } + OperandSize::Size64 => { + enc::xfuncref_dispatch_x64(sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0) + } + }); + debug_assert_eq!(sink.cur_offset(), taken_end); + + // Unconditional jump to `not_taken` for the fall-through path. + let not_taken_start = taken_end + 1; + let not_taken_end = not_taken_start + 4; + sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel); + sink.add_uncond_branch(taken_end, not_taken_end, *not_taken); + patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0)); + assert_eq!(sink.cur_offset(), not_taken_end); + } + + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + // Same scaffolding as Inst::FuncrefDispatch, but with an + // extra `dst_masked` operand. 
The forward form branches on + // `src != 0` (after computing dst_masked AND the two loads); + // the inverted form branches on `src == 0` (only dst_masked + // is written on that side). MachBuffer flips between them + // for the fall-through optimisation. + let dm_w = *dst_masked; + let dc_w = *dst_code; + let dv_w = *dst_vmctx; + let src_reg = *src; + let oc = *offset_code; + let ov = *offset_vmctx; + + let mut inverted = SmallVec::<[u8; 16]>::new(); + match size { + OperandSize::Size32 => { + enc::xband_funcref_dispatch_not_x32( + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + 0, + ); + } + OperandSize::Size64 => { + enc::xband_funcref_dispatch_not_x64( + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + 0, + ); + } + } + let len = inverted.len() as u32; + inverted.clear(); + let inv_rel = i32::try_from(len - 4).unwrap(); + match size { + OperandSize::Size32 => { + enc::xband_funcref_dispatch_not_x32( + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + inv_rel, + ); + } + OperandSize::Size64 => { + enc::xband_funcref_dispatch_not_x64( + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + inv_rel, + ); + } + } + assert!(len > 4); + + let taken_end = *start_offset + len; + sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); + sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); + patch_pc_rel_offset(sink, |sink| match size { + OperandSize::Size32 => { + enc::xband_funcref_dispatch_x32(sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0) + } + OperandSize::Size64 => { + enc::xband_funcref_dispatch_x64(sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0) + } + }); + debug_assert_eq!(sink.cur_offset(), taken_end); + + let not_taken_start = taken_end + 1; + let not_taken_end = not_taken_start + 4; + sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel); + sink.add_uncond_branch(taken_end, not_taken_end, *not_taken); + patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0)); + 
assert_eq!(sink.cur_offset(), not_taken_end); + } + Inst::LoadAddr { dst, mem } => { let base = mem.get_base_register(); let offset = mem.get_offset_with_state(state); diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index 6bbe69795e51..f9b1a518ae32 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -206,14 +206,23 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { } } Inst::IndirectCall { info } => { - collector.reg_use(&mut info.dest); let CallInfo { uses, defs, + dest, try_call_info, clobbers, .. } = &mut **info; + + // First 0–4 integer args are passed as free reg uses; the + // emitted `call_indirect{1,2,3,4}` op moves them into x0..x3. + // Remaining args use the fixed-preg path in `uses`. + let PulleyCallIndirect { target, args } = dest; + collector.reg_use(target); + for arg in args { + collector.reg_use(arg); + } for CallArgPair { vreg, preg } in uses { collector.reg_fixed_use(vreg, *preg); } @@ -261,6 +270,50 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { cond.get_operands(collector); } + Inst::BandBrIf { + dst, + src, + mask: _, + size: _, + taken: _, + not_taken: _, + } => { + collector.reg_def(dst); + collector.reg_use(src); + } + + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code: _, + offset_vmctx: _, + size: _, + taken: _, + not_taken: _, + } => { + collector.reg_def(dst_code); + collector.reg_def(dst_vmctx); + collector.reg_use(src); + } + + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code: _, + offset_vmctx: _, + size: _, + taken: _, + not_taken: _, + } => { + collector.reg_def(dst_masked); + collector.reg_def(dst_code); + collector.reg_def(dst_vmctx); + collector.reg_use(src); + } + Inst::LoadAddr { dst, mem } => { collector.reg_def(dst); mem.get_operands(collector); @@ -483,6 
+536,9 @@ where | Inst::Rets { .. } => MachTerminator::Ret, Inst::Jump { .. } => MachTerminator::Branch, Inst::BrIf { .. } => MachTerminator::Branch, + Inst::BandBrIf { .. } => MachTerminator::Branch, + Inst::FuncrefDispatch { .. } => MachTerminator::Branch, + Inst::BandFuncrefDispatch { .. } => MachTerminator::Branch, Inst::BrTable { .. } => MachTerminator::Branch, Inst::ReturnCall { .. } | Inst::ReturnIndirectCall { .. } => MachTerminator::RetCall, Inst::Call { info } if info.try_call_info.is_some() => MachTerminator::Branch, @@ -723,7 +779,7 @@ impl Inst { } Inst::IndirectCall { info } => { - let callee = format_reg(*info.dest); + let callee = format_reg(*info.dest.target); let try_call = info .try_call_info .as_ref() @@ -762,6 +818,82 @@ impl Inst { format!("br_{cond}, {taken}; jump {not_taken}") } + Inst::BandBrIf { + dst, + src, + mask, + size, + taken, + not_taken, + } => { + let dst = format_reg(*dst.to_reg()); + let src = format_reg(**src); + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let width = match size { + OperandSize::Size32 => 32, + OperandSize::Size64 => 64, + }; + format!( + "{dst} = xband{width}_s8 {src}, {mask}; \ + br_if_x{width} {src}, {taken}; jump {not_taken}" + ) + } + + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + let dst_code = format_reg(*dst_code.to_reg()); + let dst_vmctx = format_reg(*dst_vmctx.to_reg()); + let src = format_reg(**src); + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let width = match size { + OperandSize::Size32 => 32, + OperandSize::Size64 => 64, + }; + format!( + "{dst_code}, {dst_vmctx} = xfuncref_dispatch_x{width} \ + {src}, code+{offset_code}, vmctx+{offset_vmctx}; \ + br_if {taken}; jump {not_taken}" + ) + } + + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + let dst_masked = 
format_reg(*dst_masked.to_reg()); + let dst_code = format_reg(*dst_code.to_reg()); + let dst_vmctx = format_reg(*dst_vmctx.to_reg()); + let src = format_reg(**src); + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let width = match size { + OperandSize::Size32 => 32, + OperandSize::Size64 => 64, + }; + format!( + "{dst_masked}, {dst_code}, {dst_vmctx} = xband_funcref_dispatch_x{width} \ + {src}, code+{offset_code}, vmctx+{offset_vmctx}; \ + br_if {taken}; jump {not_taken}" + ) + } + Inst::LoadAddr { dst, mem } => { let dst = format_reg(*dst.to_reg()); let mem = mem.to_string(); diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs index 2039c7de8dd3..23f9c00865f4 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs @@ -4,7 +4,8 @@ pub mod isle; use super::{PulleyBackend, PulleyTargetKind, inst::*}; use crate::{ - ir, + ir::{self, InstructionData, Opcode}, + isa::pulley_shared::inst::Inst, machinst::{lower::*, *}, }; @@ -24,6 +25,15 @@ where ir_inst: ir::Inst, targets: &[MachLabel], ) -> Option<()> { + // Phase-2/3 fuse band+brif+xload+xload across the brif and its + // continuation block; phase-1 just band+brif. Both gated on the + // eager-init predicate. + if try_fuse_funcref_dispatch::

(ctx, ir_inst, targets) { + return Some(()); + } + if try_fuse_band_brif(ctx, ir_inst, targets) { + return Some(()); + } isle::lower_branch(ctx, self, ir_inst, targets) } @@ -31,4 +41,448 @@ where // Pulley does not support this feature right now. None } + + fn pre_lower(&self, ctx: &mut Lower) { + // Block lowering runs in reverse layout order, so by the time + // `lower_branch` sees the brif, the continuation block has already + // been lowered. Marking the continuation's loads `absorbed_pure` + // after the fact would create double-writes to their result vregs. + // Run the recogniser once up front instead. + pre_lower_pulley(ctx, P::pointer_width().bytes()); + } +} + +/// Recognise `brif (band v -2) ...` at the call_indirect lazy-init site +/// and fuse it into `MInst::BandBrIf`. Returns true if fusion fired. +/// +/// Soundness: testing `v_masked != 0` instead of `v != 0` is identical for +/// every reachable funcref-slot value under +/// `is_eagerly_initialized_funcref_table` — they differ only at the +/// tagged-null value `1`, which the predicate excludes. +fn try_fuse_band_brif

( + ctx: &mut Lower>, + ir_inst: ir::Inst, + targets: &[MachLabel], +) -> bool +where + P: PulleyTargetKind, +{ + if targets.len() != 2 { + return false; + } + + let dfg = ctx.dfg(); + let InstructionData::Brif { + opcode: Opcode::Brif, + arg: cond, + .. + } = dfg.insts[ir_inst] + else { + return false; + }; + + // The brif's cond must be `band(v, -2)` with a bit-exact `Imm64(-2)`. + // The bit-exact match is load-bearing: it is meant to confine the fusion to + // func_environ's `Imm64::from(-2_i64)` IR-rewrite site. The wat parser + // encodes `(i32.const -2)` as `Imm64(0xFFFFFFFE)`, so a user i32 constant can't + // match. NOTE(review): an I64 `-2` is also `Imm64(-2)` and I64 conds are accepted below — confirm user wasm cannot reach this path. + let band_inst = match dfg.value_def(cond).inst() { + Some(inst) => inst, + None => return false, + }; + let (band_src, band_imm) = match dfg.insts[band_inst] { + InstructionData::Binary { + opcode: Opcode::Band, + args: [a, b], + } => match dfg.value_def(b).inst() { + Some(b_inst) => match dfg.insts[b_inst] { + InstructionData::UnaryImm { + opcode: Opcode::Iconst, + imm, + } if imm.bits() == -2 => (a, -2_i8), + _ => return false, + }, + None => return false, + }, + _ => return false, + }; + + // Both ops of the fusion must agree on size: the band's result is the + // brif's cond, and its type drives the comparison width. + let cond_ty = dfg.value_type(cond); + let size = match cond_ty { + ir::types::I32 => OperandSize::Size32, + ir::types::I64 => OperandSize::Size64, + _ => return false, + }; + + // Reuse the band-result vreg as the fused op's dst, so the block-arg + // machinery downstream observes the correct masked value via the same + // vreg (single def, single use — no SSA violation). The original band + // CLIF inst is then marked as absorbed and skipped in lower_clif_block. 
+ let dst_vreg = ctx.put_value_in_regs(cond); + let dst_reg = dst_vreg.only_reg().expect("scalar band result"); + let dst = WritableXReg::try_from(Writable::from_reg(dst_reg)) + .expect("band result is an x-class register"); + let src = XReg::new(ctx.put_value_in_regs(band_src).only_reg().expect("scalar")) + .expect("band source is an x-class register"); + + // Sink the band: the BandBrIf we emit below defines the same dst vreg, + // so downstream uses of `cond` still find the value populated. + ctx.sink_pure_inst(band_inst); + + ctx.emit( + Inst::BandBrIf { + dst, + src, + mask: band_imm, + size, + taken: targets[0], + not_taken: targets[1], + } + .into(), + ); + + true +} + +/// True iff `imm` encodes `-2` in `ty`'s width. The egraph canonicalises +/// `i32(-2)` as `Imm64(0xFFFFFFFE)`, not `Imm64(-2)`, so a width-aware +/// compare is needed for pulley32. +fn is_minus_two_for(imm: ir::immediates::Imm64, ty: ir::Type) -> bool { + match ty { + ir::types::I32 => (imm.bits() as u32) == (-2_i32 as u32), + ir::types::I64 => imm.bits() == -2_i64, + _ => false, + } +} + +/// `(wasm_call, vmctx)` byte offsets in `VMFuncRef`. Both fit in i8 (8/24 +/// on 64-bit, 4/12 on 32-bit), matching the `xfuncref_dispatch_*` ops' +/// sign-extended-i8 offset operand. 
+fn vm_func_ref_offsets(pointer_bytes: u8) -> (i8, i8) { + let size = pointer_bytes as i8; + (size, size.checked_mul(3).expect("VMFuncRef offsets fit i8")) +} + +/// Recognise the canonical funcref-dispatch shape: +/// +/// ```text +/// predecessor: +/// value = load .ptr (table_entry + 0) +/// value_masked = band value, -2 +/// brif value_masked, continuation([value_masked]), null_block([]) +/// continuation(funcref_ptr): +/// code = load .ptr (funcref_ptr + offset_code) +/// vmctx = load .ptr (funcref_ptr + offset_vmctx) +/// ``` +fn match_funcref_dispatch_pattern( + f: &ir::Function, + brif_inst: ir::Inst, + pointer_bytes: u8, +) -> Option { + let dfg = &f.dfg; + let InstructionData::Brif { + opcode: Opcode::Brif, + arg: cond, + blocks, + .. + } = dfg.insts[brif_inst] + else { + return None; + }; + // cond = band(v, -2) + let band_inst = dfg.value_def(cond).inst()?; + let (v, _imm) = match dfg.insts[band_inst] { + InstructionData::Binary { + opcode: Opcode::Band, + args: [a, b], + } => match dfg.value_def(b).inst() { + Some(b_inst) => match dfg.insts[b_inst] { + InstructionData::UnaryImm { + opcode: Opcode::Iconst, + imm, + } if is_minus_two_for(imm, dfg.value_type(cond)) => (a, -2_i8), + _ => return None, + }, + None => return None, + }, + _ => return None, + }; + let cond_ty = dfg.value_type(cond); + let size = match cond_ty { + ir::types::I32 => OperandSize::Size32, + ir::types::I64 => OperandSize::Size64, + _ => return None, + }; + // The 64-bit fused op handles I64 pointer types; the 32-bit fused op + // handles I32. They line up with the target's pointer width. + let expected_size = match pointer_bytes { + 4 => OperandSize::Size32, + 8 => OperandSize::Size64, + _ => return None, + }; + if size != expected_size { + return None; + } + + // Taken target = continuation block. Its first block param must equal + // the brif's first block-call-arg (i.e. value_masked). 
+ let taken_call = blocks[0]; + let continuation = taken_call.block(&dfg.value_lists); + let taken_args: smallvec::SmallVec<[ir::BlockArg; 4]> = + taken_call.args(&dfg.value_lists).collect(); + if taken_args.len() < 1 { + return None; + } + let first_arg_val = match taken_args[0] { + ir::BlockArg::Value(v) => v, + _ => return None, + }; + if first_arg_val != cond { + // The brif must pass value_masked as the first block-call-arg. + return None; + } + let cont_params = dfg.block_params(continuation); + if cont_params.is_empty() { + return None; + } + let funcref_ptr = cont_params[0]; + + // The first two instructions in the continuation block must be the + // two field loads in either order. + let (offset_code_expected, offset_vmctx_expected) = vm_func_ref_offsets(pointer_bytes); + let mut iter = f.layout.block_insts(continuation); + let load1 = iter.next()?; + let load2 = iter.next()?; + let (load_code_inst, load_vmctx_inst) = classify_funcref_loads( + dfg, + load1, + load2, + funcref_ptr, + offset_code_expected, + offset_vmctx_expected, + cond_ty, + )?; + let code_val = dfg.inst_results(load_code_inst)[0]; + let vmctx_val = dfg.inst_results(load_vmctx_inst)[0]; + + let _ = (band_inst, v); // captured for future variants of the pattern check + Some(FuncrefDispatchPattern { + load_code_inst, + load_vmctx_inst, + code_val, + vmctx_val, + offset_code: offset_code_expected, + offset_vmctx: offset_vmctx_expected, + size, + }) +} + +struct FuncrefDispatchPattern { + load_code_inst: ir::Inst, + load_vmctx_inst: ir::Inst, + code_val: ir::Value, + vmctx_val: ir::Value, + offset_code: i8, + offset_vmctx: i8, + size: OperandSize, +} + +fn classify_funcref_loads( + dfg: &ir::DataFlowGraph, + a: ir::Inst, + b: ir::Inst, + funcref_ptr: ir::Value, + offset_code: i8, + offset_vmctx: i8, + pointer_ty: ir::Type, +) -> Option<(ir::Inst, ir::Inst)> { + let (a_off, a_base) = classify_load(dfg, a, pointer_ty)?; + let (b_off, b_base) = classify_load(dfg, b, pointer_ty)?; + if a_base != 
funcref_ptr || b_base != funcref_ptr { + return None; + } + if a_off == offset_code && b_off == offset_vmctx { + Some((a, b)) + } else if a_off == offset_vmctx && b_off == offset_code { + Some((b, a)) + } else { + None + } +} + +fn classify_load( + dfg: &ir::DataFlowGraph, + inst: ir::Inst, + pointer_ty: ir::Type, +) -> Option<(i8, ir::Value)> { + match dfg.insts[inst] { + InstructionData::Load { + opcode: Opcode::Load, + arg, + offset, + .. + } => { + let result = *dfg.inst_results(inst).first()?; + if dfg.value_type(result) != pointer_ty { + return None; + } + let off_i32: i32 = offset.into(); + let off_i8 = i8::try_from(off_i32).ok()?; + Some((off_i8, arg)) + } + _ => None, + } +} + +/// Pulley-specific pre-lowering analysis. Walks every block looking for +/// the funcref-dispatch fusion shape (see +/// `match_funcref_dispatch_pattern`), and when it matches, sinks the two +/// continuation-block loads via `sink_pure_inst` (the band is left alone +/// here). The brif's lowering (in `try_fuse_funcref_dispatch`) then emits +/// one fused MachInst whose defs are the absorbed loads' result +/// vregs. +fn pre_lower_pulley

(ctx: &mut Lower>, pointer_bytes: u8) +where + P: PulleyTargetKind, +{ + // Collect candidates first so `&ctx.f` isn't held across the + // `sink_pure_inst` calls below. + let mut to_sink: smallvec::SmallVec<[(ir::Inst, ir::Inst); 8]> = smallvec::SmallVec::new(); + { + let f = ctx.f; + for block in f.layout.blocks() { + let Some(term) = f.layout.last_inst(block) else { + continue; + }; + if !matches!(f.dfg.insts[term], InstructionData::Brif { .. }) { + continue; + } + if let Some(pat) = match_funcref_dispatch_pattern::

(f, term, pointer_bytes) { + to_sink.push((pat.load_code_inst, pat.load_vmctx_inst)); + } + } + } + for (l_code, l_vmctx) in to_sink { + ctx.sink_pure_inst(l_code); + ctx.sink_pure_inst(l_vmctx); + } +} + +/// Phase-2/3 fusion: emit `MInst::BandFuncrefDispatch` (or, as a fallback, +/// `MInst::FuncrefDispatch`) when the brif matches the canonical pattern. +/// Relies on `pre_lower_pulley` having already sunk the two continuation-block +/// loads; the band, when absorbed, is sunk here. This routine re-derives the pattern, +/// looks up the relevant vregs, and emits the single fused MachInst. Returns `true` iff the fusion fired. +fn try_fuse_funcref_dispatch

( + ctx: &mut Lower>, + ir_inst: ir::Inst, + targets: &[MachLabel], +) -> bool +where + P: PulleyTargetKind, +{ + if targets.len() != 2 { + return false; + } + let pointer_bytes = P::pointer_width().bytes(); + let Some(pat) = match_funcref_dispatch_pattern::

(ctx.f, ir_inst, pointer_bytes) else { + return false; + }; + + let InstructionData::Brif { arg: cond, .. } = ctx.f.dfg.insts[ir_inst] else { + return false; + }; + + // Try phase-3 (absorb the band into BandFuncrefDispatch). The fused + // op defines `dst_masked` (= cond's vreg) so the brif's block-call + // copy still has a producer, plus `dst_code` and `dst_vmctx`. + let dfg = ctx.dfg(); + let band_inst = dfg.value_def(cond).inst(); + let v = band_inst.and_then(|bi| match dfg.insts[bi] { + InstructionData::Binary { + opcode: Opcode::Band, + args: [a, b], + } => match dfg.value_def(b).inst() { + Some(b_inst) => match dfg.insts[b_inst] { + InstructionData::UnaryImm { + opcode: Opcode::Iconst, + imm, + } if is_minus_two_for(imm, dfg.value_type(cond)) => Some(a), + _ => None, + }, + None => None, + }, + _ => None, + }); + + // The loads' result vregs become the fused op's defs. Their original + // lowering was skipped via `sink_pure_inst` in `pre_lower_pulley`. + let dst_code_reg = ctx + .put_value_in_regs(pat.code_val) + .only_reg() + .expect("scalar funcref code result"); + let dst_vmctx_reg = ctx + .put_value_in_regs(pat.vmctx_val) + .only_reg() + .expect("scalar funcref vmctx result"); + let dst_code = WritableXReg::try_from(Writable::from_reg(dst_code_reg)) + .expect("funcref code dst is an x-class register"); + let dst_vmctx = WritableXReg::try_from(Writable::from_reg(dst_vmctx_reg)) + .expect("funcref vmctx dst is an x-class register"); + + if let (Some(band_inst), Some(v)) = (band_inst, v) { + // Phase 3 fires: source is the unmasked `v`; the fused op masks + // internally and writes `dst_masked = cond`. 
+ let dst_masked_regs = ctx.put_value_in_regs(cond); + let dst_masked_reg = dst_masked_regs.only_reg().expect("scalar cond"); + let dst_masked = WritableXReg::try_from(Writable::from_reg(dst_masked_reg)) + .expect("cond is an x-class register"); + let src_reg = ctx + .put_value_in_regs(v) + .only_reg() + .expect("scalar funcref source"); + let src = XReg::new(src_reg).expect("funcref source is an x-class register"); + ctx.sink_pure_inst(band_inst); + ctx.emit( + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code: pat.offset_code, + offset_vmctx: pat.offset_vmctx, + size: pat.size, + taken: targets[0], + not_taken: targets[1], + } + .into(), + ); + return true; + } + + // Phase-2 fallback: band stays as a standalone op; FuncrefDispatch + // consumes its masked result. + let src_reg = ctx + .put_value_in_regs(cond) + .only_reg() + .expect("scalar funcref source"); + let src = XReg::new(src_reg).expect("funcref source is an x-class register"); + + ctx.emit( + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code: pat.offset_code, + offset_vmctx: pat.offset_vmctx, + size: pat.size, + taken: targets[0], + not_taken: targets[1], + } + .into(), + ); + + true } diff --git a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs index 3068fb1137ff..c065c45a2c6a 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs @@ -10,8 +10,8 @@ use crate::ir::{condcodes::*, immediates::*, types::*, *}; use crate::isa::CallConv; use crate::isa::pulley_shared::{ inst::{ - FReg, OperandSize, PulleyCall, ReturnCallInfo, VReg, WritableFReg, WritableVReg, - WritableXReg, XReg, + FReg, OperandSize, PulleyCall, PulleyCallIndirect, ReturnCallInfo, VReg, WritableFReg, + WritableVReg, WritableXReg, XReg, }, lower::{Cond, regs}, *, @@ -30,7 +30,7 @@ type Unit = (); type VecArgPair = Vec; type VecRetPair = Vec; type 
BoxCallInfo = Box>; -type BoxCallIndInfo = Box>; +type BoxCallIndInfo = Box>; type BoxCallIndirectHostInfo = Box>; type BoxReturnCallInfo = Box>; type BoxReturnCallIndInfo = Box>; @@ -124,7 +124,7 @@ where &mut self, sig: Sig, dest: Reg, - uses: CallArgList, + mut uses: CallArgList, defs: CallRetList, try_call_info: Option, ) -> BoxCallIndInfo { @@ -133,8 +133,30 @@ where self.lower_ctx .abi_mut() .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space); + let call_conv = self.lower_ctx.sigs()[sig].call_conv(); - let dest = XReg::new(dest).unwrap(); + // Mirror of `gen_call_info`: take out the first four integer + // arguments (x0..x3) and pass them through the `args` list so the + // emitted `call_indirect{1,2,3,4}` op can move them at call time. + // Saves one Pulley dispatch per moved arg vs the previous "regalloc + // emits xmov; then `call_indirect`" sequence. + let mut args = SmallVec::new(); + uses.sort_by_key(|arg| arg.preg); + if call_conv != CallConv::PreserveAll { + uses.retain(|arg| { + if arg.preg != regs::x0() + && arg.preg != regs::x1() + && arg.preg != regs::x2() + && arg.preg != regs::x3() + { + return true; + } + args.push(XReg::new(arg.vreg).unwrap()); + false + }); + } + let target = XReg::new(dest).unwrap(); + let dest = PulleyCallIndirect { target, args }; Box::new( self.lower_ctx .gen_call_info(sig, dest, uses, defs, try_call_info, false), diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs index 7747d804cafe..91d1da84c910 100644 --- a/cranelift/codegen/src/machinst/compile.rs +++ b/cranelift/codegen/src/machinst/compile.rs @@ -25,9 +25,16 @@ pub fn compile( let block_order = BlockLoweringOrder::new(f, domtree, ctrl_plane); // Build the lowering context. - let lower = + let mut lower = crate::machinst::Lower::new(f, abi, emit_info, block_order, sigs, b.flags().clone())?; + // Backend-specific pre-lowering analysis. 
Default impl on LowerBackend
+    // is a no-op; Pulley overrides it to mark continuation-block loads as
+    // absorbed_pure when the call_indirect lazy-init brif pattern is
+    // present, so they can be fused into a single Pulley dispatch op
+    // emitted at the brif's lowering time.
+    b.pre_lower(&mut lower);
+
     // Lower the IR.
     let vcode = {
         log::debug!(
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index 0a09edc5c374..5301c30ee454 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -148,6 +148,18 @@ pub trait LowerBackend {
     fn maybe_pinned_reg(&self) -> Option {
         None
     }
+
+    /// Backend-specific analysis hook, run once after `Lower::new` but
+    /// before the main reverse-block lowering loop. Default: no-op.
+    ///
+    /// Use this to call [`Lower::sink_pure_inst`] on instructions that will be
+    /// absorbed by a fused MachInst emitted in a different (earlier-in-CFG,
+    /// later-in-reverse-order) block. The block-by-block lowering loop
+    /// processes blocks in reverse, so cross-block absorption can't be
+    /// arranged at the absorbing instruction's lowering time — it has to be
+    /// arranged here, before any block is lowered. Within a single block,
+    /// calling [`Lower::sink_pure_inst`] during normal lowering is sufficient.
+    fn pre_lower(&self, _ctx: &mut Lower) {}
 }
 /// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence
@@ -204,6 +216,14 @@ pub struct Lower<'func, I: VCodeInst> {
     /// their original locations.
     inst_sunk: FxHashSet,
+    /// Pure (non-side-effecting) instructions, plus `readonly + notrap` loads,
+    /// whose value-production has been absorbed by a later-emitted MachInst
+    /// (typically a terminator that fuses an ALU op with a branch). The
+    /// absorbing MachInst writes to the absorbed inst's result vreg, so a later
+    /// `put_value_in_regs` of that vreg observes the value normally — but the
+    /// absorbed inst itself is skipped in `lower_clif_block`, avoiding a double-write.
+    inst_absorbed_pure: FxHashSet,
+
     /// Instructions collected for the CLIF inst in progress, in forward order.
     ir_insts: Vec,
@@ -504,6 +524,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
             value_ir_uses,
             value_lowered_uses: SecondaryMap::default(),
             inst_sunk: FxHashSet::default(),
+            inst_absorbed_pure: FxHashSet::default(),
             cur_scan_entry_color: None,
             cur_inst: None,
             ir_insts: vec![],
@@ -708,6 +729,12 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         self.inst_sunk.contains(&inst)
     }
+    /// Has this instruction been marked via [`Lower::sink_pure_inst`], i.e. has
+    /// its value-production been absorbed by a MachInst emitted elsewhere?
+    fn is_inst_absorbed_pure(&self, inst: Inst) -> bool {
+        self.inst_absorbed_pure.contains(&inst)
+    }
+
     // Is any result of this instruction needed?
     fn is_any_inst_result_needed(&self, inst: Inst) -> bool {
         self.f
@@ -750,6 +777,13 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
             if self.is_inst_sunk(inst) {
                 continue;
             }
+            // Same for pure-instruction absorption: a terminator earlier in
+            // the reverse-scan emitted a MachInst that writes to this inst's
+            // result vreg directly, so emitting it again here would be a
+            // redundant double-write.
+            if self.is_inst_absorbed_pure(inst) {
+                continue;
+            }
             // Are any outputs used at least once?
             let value_needed = self.is_any_inst_result_needed(inst);
             trace!(
@@ -1666,6 +1700,46 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         self.ir_insts.push(mach_inst);
     }
+    /// Indicate that the value-production of a pure (non-side-effecting)
+    /// instruction has been absorbed by a later-emitted MachInst — typically a
+    /// terminator that fuses an ALU op with a branch (e.g. Pulley's
+    /// `xband_brif` fused dispatch op).
+    ///
+    /// The absorbing MachInst must write to the absorbed inst's result vreg
+    /// (`value_regs[result]`) directly, so subsequent `put_value_in_regs` of
+    /// that vreg observes the correct value. The absorbed inst itself is
+    /// skipped in `lower_clif_block`, preventing a redundant second write to
+    /// the same vreg (which would violate SSA single-def).
+    ///
+    /// Unlike [`Lower::sink_inst`], this does not require the inst to have a
+    /// lowering side effect: it is specifically for pure ALU ops whose value
+    /// flows into the fused MachInst's output operand. Color tracking is
+    /// likewise unnecessary because pure insts have no color anchor.
+    ///
+    /// We additionally allow absorbing trusted readonly loads — CLIF considers
+    /// them side-effecting (via `can_load()`), but the `notrap + readonly` flags
+    /// assert they're safe to skip from the codegen's perspective. The absorbing
+    /// MachInst takes responsibility for performing the load itself. Color tracking
+    /// is still unnecessary: the load is not moved, only marked as handled elsewhere.
+    ///
+    /// Panics if `ir_inst` is neither pure nor a `readonly + notrap` `load`.
+    pub fn sink_pure_inst(&mut self, ir_inst: Inst) {
+        let is_pure = !has_lowering_side_effect(self.f, ir_inst);
+        let is_safe_load = match &self.f.dfg.insts[ir_inst] {
+            InstructionData::Load {
+                opcode: crate::ir::Opcode::Load,
+                flags,
+                ..
+            } => {
+                let flags = self.f.dfg.mem_flags[*flags];
+                flags.readonly() && flags.notrap()
+            }
+            _ => false,
+        };
+        assert!(is_pure || is_safe_load);
+        self.inst_absorbed_pure.insert(ir_inst);
+    }
+
     /// Indicate that the side-effect of an instruction has been sunk to the
     /// current scan location.
This should only be done with the instruction's /// original results are not used (i.e., `put_input_in_regs` is not invoked diff --git a/cranelift/filetests/filetests/isa/pulley32/call.clif b/cranelift/filetests/filetests/isa/pulley32/call.clif index c2dc9a09f6c9..aece47fc9a19 100644 --- a/cranelift/filetests/filetests/isa/pulley32/call.clif +++ b/cranelift/filetests/filetests/isa/pulley32/call.clif @@ -291,7 +291,7 @@ block0(v0: i32): ; VCode: ; push_frame ; block0: -; indirect_call x0, CallInfo { dest: XReg(p0i), uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x0, CallInfo { dest: PulleyCallIndirect { target: XReg(p0i), args: [] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git a/cranelift/filetests/filetests/isa/pulley32/exceptions.clif b/cranelift/filetests/filetests/isa/pulley32/exceptions.clif index 2d3dfef3e853..eeed198535d2 100644 --- a/cranelift/filetests/filetests/isa/pulley32/exceptions.clif +++ b/cranelift/filetests/filetests/isa/pulley32/exceptions.clif @@ -77,7 +77,7 @@ function %f2(i32, i32) -> i32, f32, f64 { ; block0: ; fconst64 f1, 4607182418800017408 ; fstore64 Slot(0), f1 // flags = notrap aligned -; indirect_call x1, CallInfo { dest: XReg(p1i), uses: [CallArgPair { vreg: p0i, preg: p0i }], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I32) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I32) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 
0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] +; indirect_call x1, CallInfo { dest: PulleyCallIndirect { target: XReg(p1i), args: [XReg(p0i)] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I32) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I32) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] ; block1: ; xone x0 ; f1 = fload64 Slot(0) // flags = notrap aligned diff --git a/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif b/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif index c7d523d4f6a6..c698bb1f71ea 100644 --- a/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif +++ b/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif @@ -15,8 +15,8 @@ block0(v0: i64): ; xmov x3, x0 ; xmov x1, x3 ; xmov x2, x3 -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: 
PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git a/cranelift/filetests/filetests/isa/pulley64/call.clif b/cranelift/filetests/filetests/isa/pulley64/call.clif index 16b271835620..bd6f9bba825f 100644 --- a/cranelift/filetests/filetests/isa/pulley64/call.clif +++ b/cranelift/filetests/filetests/isa/pulley64/call.clif @@ -291,7 +291,7 @@ block0(v0: i64): ; VCode: ; push_frame ; block0: -; indirect_call x0, CallInfo { dest: XReg(p0i), uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x0, CallInfo { dest: PulleyCallIndirect { target: XReg(p0i), args: [] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git 
a/cranelift/filetests/filetests/isa/pulley64/exceptions.clif b/cranelift/filetests/filetests/isa/pulley64/exceptions.clif index 88c5528c1935..6a0b7b1577a1 100644 --- a/cranelift/filetests/filetests/isa/pulley64/exceptions.clif +++ b/cranelift/filetests/filetests/isa/pulley64/exceptions.clif @@ -79,7 +79,7 @@ function %f2(i32, i64) -> i32, f32, f64 { ; block0: ; fconst64 f1, 4607182418800017408 ; fstore64 Slot(0), f1 // flags = notrap aligned -; indirect_call x1, CallInfo { dest: XReg(p1i), uses: [CallArgPair { vreg: p0i, preg: p0i }], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I64) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] +; indirect_call x1, CallInfo { dest: PulleyCallIndirect { target: XReg(p1i), args: [XReg(p0i)] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I64) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] ; block1: ; xone x0 ; f1 = fload64 Slot(0) // flags = notrap aligned diff --git a/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif b/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif index 2b6a28ce9ece..44bc72fcaf25 100644 --- 
a/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif +++ b/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif @@ -15,8 +15,8 @@ block0(v0: i64): ; xmov x3, x0 ; xmov x1, x3 ; xmov x2, x3 -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 8bd81a6b46db..975a0a04377d 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs 
@@ -1074,6 +1074,10 @@ impl<'module_environment> FuncEnvironment<'module_environment> { let result_param = builder.append_block_param(continuation_block, pointer_type); builder.set_cold_block(null_block); + // Branching on `value_masked` instead (letting the Pulley backend + // fuse the `band + brif` pair) requires a table whose slots are + // all eagerly initialized; that variant comes with eager + // initialization support. builder.ins().brif( value, continuation_block, @@ -1855,7 +1859,12 @@ impl FuncEnvironment<'_> { self.reference_type(table.ref_type.heap_type).0.bytes() }; - let base_flags = if Some(table.limits.min) == table.limits.max { + // A table is fixed-size if min == max or if translation proved it + // is never mutated; either way the base address and element count + // are constant for the instance's lifetime. + let fixed_size = + !self.translation.tables_mutated[index] || Some(table.limits.min) == table.limits.max; + let base_flags = if fixed_size { func.dfg .mem_flags .insert(MemFlagsData::trusted().with_readonly().with_can_move()) @@ -1867,11 +1876,10 @@ impl FuncEnvironment<'_> { base: ptr, offset: Offset32::new(base_offset), global_type: pointer_type, - // A fixed-size table can't be resized so its base address won't change. flags: base_flags, }); - let bound = if Some(table.limits.min) == table.limits.max { + let bound = if fixed_size { TableSize::Static { bound: table.limits.min, } @@ -2159,6 +2167,14 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { callee: ir::Value, call_args: &[ir::Value], ) -> WasmResult> { + // Fast path: if we can statically resolve this indirect call to a + // single defined function (immutable funcref table + constant + // callee index + matching signature), emit a direct call instead. + // See `try_static_resolve_indirect_call`. 
+        if let Some(target) = self.try_static_resolve_indirect_call(table_index, ty_index, callee) {
+            return self.direct_call(target, sig_ref, call_args).map(Some);
+        }
+
         let (code_ptr, callee_vmctx) = match self.check_and_load_code_and_callee_vmctx(
             table_index,
             ty_index,
@@ -2173,6 +2189,198 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
             .map(Some)
     }
+
+    /// Try to statically resolve a `call_indirect` site to a single defined
+    /// function so the call can be lowered as a direct call.
+    ///
+    /// All four of these must hold for the resolution to succeed:
+    ///
+    /// 1. The target table must be provably immutable for the lifetime of
+    ///    any instance of this module: defined (not imported) and never the
+    ///    target of `table.set` / `table.fill` / `table.copy` (as the dst)
+    ///    / `table.grow` / `table.init`. This is the `tables_mutated` bit
+    ///    populated in `ModuleEnvironment::translate`.
+    ///
+    /// 2. The callee index value (the operand to `call_indirect`) must be a
+    ///    compile-time constant — i.e., the wasm did `i32.const N;
+    ///    call_indirect (table $t) (type $sig)`. This is what hand-lowered
+    ///    C++/Rust vtable calls and AOT-compiled JS-to-wasm dispatch tables
+    ///    look like in practice.
+    ///
+    /// 3. The slot at index `N` in the table must be precomputable from
+    ///    static `elem` segments (`TableInitialValue::Null { precomputed }`,
+    ///    i.e. not a fully-dynamic `Expr`-style init), and `N` must be in
+    ///    range and resolve to a concrete `FuncIndex` (not the reserved-value
+    ///    sentinel). NOTE(review): confirm no runtime-applied `elem` segment
+    ///    can overwrite a precomputed slot.
+    ///
+    /// 4. The function's signature in the module's interned type table
+    ///    must equal the `ty_index` declared by the `call_indirect` site.
+    ///    Otherwise the original semantics are "trap on signature
+    ///    mismatch", which we don't want to replace with a static direct
+    ///    call.
+    ///
+    /// Returns the resolved function on success, `None` otherwise (in
+    /// which case the caller falls back to a normal indirect call).
+    fn try_static_resolve_indirect_call(
+        &self,
+        table_index: TableIndex,
+        ty_index: TypeIndex,
+        callee: ir::Value,
+    ) -> Option {
+        let translation = self.env.translation;
+        let module = &translation.module;
+
+        // (1) Table must be provably immutable. Imported tables are
+        // pre-marked as mutated in `ModuleEnvironment::translate`, so
+        // this check also rules them out (along with the explicit
+        // `defined_table_index` check below for clarity).
+        if translation.tables_mutated[table_index] {
+            return None;
+        }
+        let defined_table = module.defined_table_index(table_index)?;
+
+        // (2) Callee must be a constant `iconst`. Pattern adapted from
+        // `bounds_checks::statically_known_in_bounds`.
+        let dfg = &self.builder.func.dfg;
+        let inst = dfg.value_def(callee).inst()?;
+        let imm = match dfg.insts[inst] {
+            ir::InstructionData::UnaryImm {
+                opcode: ir::Opcode::Iconst,
+                imm,
+            } => imm,
+            _ => return None,
+        };
+        let callee_ty = dfg.value_type(callee);
+        let callee_idx_u64 = imm
+            .zero_extend_from_width(callee_ty.bits())
+            .bits()
+            .cast_unsigned();
+
+        // (3) Slot must be precomputable from the static funcref image.
+        let precomputed = module.table_initialization.get(defined_table)?;
+        let slot = usize::try_from(callee_idx_u64).ok()?;
+        if slot >= precomputed.len() {
+            return None;
+        }
+        let target = precomputed[slot];
+        // `FuncIndex::reserved_value()` marks a null (uncovered) slot.
+        if target.is_reserved_value() {
+            return None;
+        }
+
+        // (4) Signature match. The site's declared `ty_index` and the
+        // target function's declared signature must intern to the same
+        // module type index.
+        let expected_ty = module.types[ty_index].unwrap_module_type_index();
+        let target_ty = module.functions[target]
+            .signature
+            .unwrap_module_type_index();
+        if expected_ty != target_ty {
+            return None;
+        }
+
+        Some(target)
+    }
+
+    /// True iff every slot in the precomputed `elem`-segment contents for
+    /// `table_index` is a concrete `FuncIndex` (no
+    /// `FuncIndex::reserved_value()` "no-entry" sentinel).
+    ///
+    /// Caller has already proven the table is immutable, so the contents
+    /// observed here are stable for the lifetime of any instance —
+    /// `true` here implies "no slot is ever null at runtime."
+    ///
+    /// When this is true, the runtime funcref-NULL check on the loaded
+    /// funcref pointer is provably redundant: any in-bounds index leads
+    /// to a non-null funcref. The bounds check still runs (so an
+    /// out-of-bounds index traps as before with `TRAP_TABLE_OUT_OF_BOUNDS`).
+    fn precomputed_table_has_no_null_slots(&self, table_index: TableIndex) -> bool {
+        let module = &self.env.translation.module;
+        let Some(defined_table) = module.defined_table_index(table_index) else {
+            return false;
+        };
+        let Some(precomputed) = module.table_initialization.get(defined_table) else {
+            return false;
+        };
+        if precomputed.is_empty() {
+            return false;
+        }
+        // Slots beyond `precomputed.len()` are null at runtime; coverage
+        // up to `limits.min` is required (caller proved immutable, so the
+        // table can't grow beyond min).
+        let table_min = module.tables[table_index].limits.min;
+        if (precomputed.len() as u64) < table_min {
+            return false;
+        }
+        precomputed.iter().all(|f| !f.is_reserved_value())
+    }
+
+    /// Try to prove that the runtime signature check at a `call_indirect`
+    /// site through an untyped `funcref` table is redundant.
+    ///
+    /// True when:
+    ///
+    /// 1. The table is provably immutable (`tables_mutated[table_index] ==
+    ///    false`). Defined-not-imported is implied since imported tables
+    ///    are pre-marked as mutated.
+    ///
+    /// 2. The table is precomputable from static `elem` segments
+    ///    (`TableInitialValue::Null { precomputed }`).
+    ///
+    /// 3. Every non-null entry in `precomputed` has the same module-
+    ///    interned signature as the `ty_index` declared at the call site.
+    ///    Null slots are fine — they trap on the funcref-NULL load that
+    ///    happens after sig-check elision.
+    ///
+    /// When this returns true, the caller short-circuits to
+    /// `CheckIndirectCallTypeSignature::StaticMatch`, which removes the
+    /// sig load + compare from the hot path. The table bounds check is still
+    /// emitted by the surrounding code, and so is the funcref-NULL check
+    /// unless `precomputed_table_has_no_null_slots` also holds, so the call
+    /// still traps correctly on an OOB or null index.
+    ///
+    /// This is the static analog of an inline-cache: instead of caching
+    /// the resolved target per call site, we observe at module-load that
+    /// the table contents make the sig check uninformative for the
+    /// lifetime of any instance.
+    fn try_elide_sig_check_for_immutable_table(
+        &self,
+        table_index: TableIndex,
+        ty_index: TypeIndex,
+    ) -> bool {
+        let translation = self.env.translation;
+        let module = &translation.module;
+
+        if translation.tables_mutated[table_index] {
+            return false;
+        }
+        let defined_table = match module.defined_table_index(table_index) {
+            Some(d) => d,
+            None => return false,
+        };
+
+        let precomputed = match module.table_initialization.get(defined_table) {
+            Some(p) if !p.is_empty() => p,
+            _ => return false,
+        };
+
+        let expected_ty = module.types[ty_index].unwrap_module_type_index();
+        for &func_idx in precomputed.iter() {
+            // Null slots will trap on the funcref-NULL load anyway.
+            if func_idx.is_reserved_value() {
+                continue;
+            }
+            let actual_ty = module.functions[func_idx]
+                .signature
+                .unwrap_module_type_index();
+            if actual_ty != expected_ty {
+                return false;
+            }
+        }
+
+        true
+    }
+
     fn check_and_load_code_and_callee_vmctx(
         &mut self,
         table_index: TableIndex,
@@ -2230,6 +2438,34 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
         // table of typed functions and that type matches `ty_index`, then
         // there's no need to perform a typecheck.
         match table.ref_type.heap_type {
+            // Untyped `funcref` tables ordinarily need a runtime sig check.
+            // But if (a) the table is provably immutable (`tables_mutated`
+            // bit clear) and (b) every non-null entry in the precomputed
+            // static `elem` segments has the same module-interned type index as
+            // the call site, then the runtime check is provably redundant
+            // and we can elide it the same way we do for typed-funcref
+            // tables.
+            //
+            // This is the AOT-IC-seeding analog: instead of caching the
+            // resolved target at the call site, we cache the *signature*
+            // at module-load time and skip the hot-path sig load+compare.
+            // Helps the megamorphic case (computed `call_indirect` index)
+            // that the static-monomorphization fast path above can't
+            // handle.
+            WasmHeapType::Func
+                if self.try_elide_sig_check_for_immutable_table(table_index, ty_index) =>
+            {
+                // If we additionally know every entry in the precomputed
+                // table is non-null, lower `may_be_null` to false so the
+                // downstream funcref-NULL check is also elided. This is
+                // only sound if the table can't be grown or have its
+                // entries cleared after init (i.e., immutable, which we
+                // already proved above).
+                let may_be_null = table.ref_type.nullable
+                    && !self.precomputed_table_has_no_null_slots(table_index);
+                return CheckIndirectCallTypeSignature::StaticMatch { may_be_null };
+            }
+
             // Functions do not have a statically known type in the table, a
             // typecheck is required.
Fall through to below to perform the // actual typecheck. diff --git a/crates/environ/src/compile/module_environ.rs b/crates/environ/src/compile/module_environ.rs index 542181e55fd6..192c090feda2 100644 --- a/crates/environ/src/compile/module_environ.rs +++ b/crates/environ/src/compile/module_environ.rs @@ -76,6 +76,26 @@ pub struct ModuleTranslation<'data> { /// trampolines for each of these signatures are required. pub exported_signatures: Vec, + /// Per-table flag indicating whether the table is ever mutated by any + /// function defined in this module via `table.set` / `table.fill` / + /// `table.copy` (as the destination) / `table.grow` / `table.init`. + /// + /// `false` (the default) means the table's contents are determined + /// entirely by its `elem` segments and any active initializer, and never + /// change at runtime — provably immutable for the lifetime of any + /// instance of this module. + /// + /// `true` means the contents can change at runtime (or the table is + /// imported, in which case we conservatively assume the importer + /// mutates it). + /// + /// This is groundwork for later passes that turn `call_indirect` + /// through provably-immutable function tables into direct calls when + /// the dispatched-to slot is statically known. Set during module + /// translation (see `analyze_table_mutability`); read by Cranelift + /// lowering and by Pulley AOT IC seeding. + pub tables_mutated: SecondaryMap, + /// DWARF debug information, if enabled, parsed from the module. 
pub debuginfo: DebugInfoData<'data>,
@@ -193,6 +213,7 @@ impl<'data> ModuleTranslation<'data> {
             function_body_inputs: PrimaryMap::default(),
             known_imported_functions: SecondaryMap::default(),
             exported_signatures: Vec::default(),
+            tables_mutated: SecondaryMap::default(),
             debuginfo: DebugInfoData::default(),
             has_unparsed_debuginfo: false,
             data_align: None,
@@ -315,6 +336,8 @@ impl<'a, 'data> ModuleEnvironment<'a, 'data> {
             self.translate_payload(payload?)?;
         }
+        analyze_table_mutability(&mut self.result)?;
+
         Ok(self.result)
     }
@@ -1548,3 +1571,85 @@ impl ModuleTranslation<'_> {
         self.module.startup = ModuleStartup::IfMemoriesNeedInit(ty);
     }
 }
+
+/// Walk every defined function body, recording in
+/// `translation.tables_mutated` each table that is the destination of any
+/// runtime mutation opcode (`table.set`, `table.fill`, `table.copy` as the
+/// destination, `table.grow`, `table.init`).
+///
+/// Imported and exported tables are conservatively pre-marked as mutated,
+/// since code outside this module can mutate them in ways we can't see.
+/// Active `elem` segments applied at instantiation time are NOT counted as
+/// mutations — they are part of the table's *initial* state.
+///
+/// `elem.drop` drops a passive element segment but does not write to any
+/// table directly, so it is intentionally not counted here. Conservatively,
+/// any `table.init` from a passive segment marks the destination table as
+/// mutated.
+fn analyze_table_mutability<'data>(
+    translation: &mut ModuleTranslation<'data>,
+) -> Result<()> {
+    // No explicit resize of the table-mutability map is performed here:
+    // `SecondaryMap` defaults to `false` for all unset entries, which is
+    // the correct "definitely-not-mutated" default for defined tables we
+    // haven't observed any mutations on yet. Nothing to do without tables.
+    let num_tables = translation.module.tables.len();
+    if num_tables == 0 {
+        return Ok(());
+    }
+
+    // Mark all imported tables as mutated up front. The importer can
+    // mutate them in ways this module can't see, so the conservative
+    // assumption is that they are not stable across calls.
+    let num_imported = translation.module.num_imported_tables;
+    for i in 0..num_imported {
+        translation.tables_mutated[TableIndex::from_u32(i as u32)] = true;
+    }
+
+    // Mark all *exported* tables as mutated as well. A host (or another
+    // instance importing the export) can call `Table::set` /
+    // `Table::grow` via the public wasmtime API on any exported table,
+    // and those mutations are not visible in this module's bytecode.
+    // The `call_indirect` optimizations that read this bit must
+    // therefore treat exported tables as conservatively non-stable.
+    for (_, entity_index) in &translation.module.exports {
+        if let EntityIndex::Table(table_index) = entity_index {
+            translation.tables_mutated[*table_index] = true;
+        }
+    }
+
+    // Walk every defined function body and look for table-mutation opcodes.
+    // The cost is O(total opcodes), one extra pass on top of the validator;
+    // typical large modules (sqlite3 ~50K opcodes) take well under a
+    // millisecond.
+    for (_, body_data) in &translation.function_body_inputs {
+        let mut reader = body_data.body.get_operators_reader()?;
+        while !reader.eof() {
+            use wasmparser::Operator;
+            match reader.read()? {
+                Operator::TableSet { table }
+                | Operator::TableFill { table }
+                | Operator::TableGrow { table } => {
+                    translation.tables_mutated[TableIndex::from_u32(table)] = true;
+                }
+                Operator::TableCopy {
+                    dst_table,
+                    src_table: _,
+                } => {
+                    // `src_table` is read-only in `table.copy`; only the
+                    // destination is mutated.
+                    translation.tables_mutated[TableIndex::from_u32(dst_table)] = true;
+                }
+                Operator::TableInit {
+                    table,
+                    elem_index: _,
+                } => {
+                    translation.tables_mutated[TableIndex::from_u32(table)] = true;
+                }
+                _ => {}
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/environ/tests/table_mutability.rs b/crates/environ/tests/table_mutability.rs
new file mode 100644
index 000000000000..562966a708e4
--- /dev/null
+++ b/crates/environ/tests/table_mutability.rs
@@ -0,0 +1,307 @@
+//! Integration tests for `analyze_table_mutability` and the surrounding
+//! precompute ordering invariants.
+//!
+//! The per-table mutability bit is the foundation of the `call_indirect`
+//! optimizations in `crates/cranelift/src/func_environ.rs`
+//! (constant-index direct call, sig-check elision, NULL elision, bound-
+//! load elision). A false negative here — failing to mark a table as
+//! mutated when it actually is — would silently turn correct calls into
+//! incorrect direct calls or skip required runtime checks. A false
+//! positive — marking an immutable table as mutated — is merely a missed
+//! optimization. Pin the analysis behaviour with focused module-level
+//! tests so any regression surfaces immediately, not after a downstream
+//! optimization fires on a now-invalid premise.
+//!
+//! Test scenario inspiration drawn from comparable bugs in peer
+//! interpreters that have shipped fixes for analogous IC-invalidation
+//! mistakes:
+//!
+//! - **Luau** (`LOP_NAMECALL`): inline cache had to be invalidated on
+//!   `table.insert` / metatable change. Analogous wasm risk: `table.grow`
+//!   not invalidating an immutability proof, so see `table_grow_marks…`.
+//! - **JavaScriptCore** (`ic_table`): inline-cache corruption from missed
+//!   shape transitions. Analogous risk: over-marking, e.g. `table.copy`
+//!   wrongly marking the SOURCE table as mutated would forbid downstream
+//!   optimizations on a perfectly read-only table. See
+//!   `table_copy_marks_destination_only_not_source`.
+//!
- **Hermes** (`HiddenClass` cache): property cache misses with +//! `Object.defineProperty`. Analogous risk: `table.init` (passive- +//! segment init at runtime) being treated as a no-op rather than a +//! write. See `table_init_marks_destination`. +//! +//! Lives in `tests/` rather than as a `#[cfg(test)] mod` inside +//! `module_environ.rs` because the latter triggers a pre-existing +//! upstream compile failure in `key.rs` / `module_artifacts.rs` (their +//! `arbitrary::Arbitrary` derives are stale relative to the workspace's +//! pinned `arbitrary 1.4.2`). Integration tests build against the lib +//! as a normal dependency and so do not set `cfg(test)` on +//! `wasmtime-environ` itself. + +use wasmparser::{Parser, Validator, WasmFeatures}; +use wasmtime_environ::{ + ModuleEnvironment, ModuleTypesBuilder, StaticModuleIndex, TableIndex, Tunables, +}; + +/// Translate `wat` and return the resulting `tables_mutated` bits, in +/// table-index order. Helper to keep individual tests short. +fn translate_and_get_mutability(wat: &str) -> Vec { + let bytes = wat::parse_str(wat).expect("WAT parse failed"); + let tunables = Tunables::default_host(); + // WASM2 covers reference-types + bulk-memory, which is what every + // table opcode exercised below needs (`table.set`, `table.fill`, + // `table.grow`, `table.copy`, `table.init`, `elem.drop`). + let features = WasmFeatures::WASM2; + let mut validator = Validator::new_with_features(features); + let mut types = ModuleTypesBuilder::new(&validator); + let env = ModuleEnvironment::new( + &tunables, + &mut validator, + &mut types, + StaticModuleIndex::from_u32(0), + ); + let parser = Parser::new(0); + let translation = env.translate(parser, &bytes).expect("translate failed"); + let n: u32 = translation.module.tables.len().try_into().unwrap(); + (0..n) + .map(|i| translation.tables_mutated[TableIndex::from_u32(i)]) + .collect() +} + +/// A table only used as the source of `call_indirect` and `table.get` is +/// provably immutable. 
(Both ops READ the table; neither writes it.) The +/// table is intentionally NOT exported — exported tables are +/// conservatively pre-marked as mutated (see +/// `exported_tables_are_pre_marked` for the export case) since the host +/// can mutate them via the public wasmtime API. +#[test] +fn read_only_table_is_immutable() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 42) + (elem (i32.const 0) $f $f $f $f) + (func (export "call_zero") (result i32) + i32.const 0 + call_indirect (param) (result i32)) + (func (export "read_zero") (result funcref) + i32.const 0 + table.get 0)) + "#, + ); + assert_eq!(bits, vec![false], "no opcode mutated this table"); +} + +/// Exported tables are always pre-marked as mutated, regardless of +/// whether any opcode in this module touches them. The host can call +/// `Table::set` / `Table::grow` via the public wasmtime API on any +/// exported table, and another module that imports the export can also +/// mutate it. Without this rule, downstream optimizations would +/// happily elide null traps and sig checks on exported tables on the +/// (false) assumption that the table contents are stable. +#[test] +fn exported_tables_are_pre_marked() { + let bits = translate_and_get_mutability( + r#" + (module + (table (export "t") 4 funcref) + (func $f (result i32) i32.const 42) + (elem (i32.const 0) $f $f $f $f)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.set` marks its destination as mutated. +#[test] +fn table_set_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "do_set") + i32.const 1 + ref.func $f + table.set 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.fill` marks its destination as mutated. 
+#[test] +fn table_fill_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "do_fill") + i32.const 0 + ref.func $f + i32.const 4 + table.fill 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.grow` is treated as mutating — analogous to Luau's NAMECALL IC +/// needing to invalidate on table-shape change. +#[test] +fn table_grow_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func (export "do_grow") (result i32) + ref.null func + i32.const 1 + table.grow 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.copy` marks the DESTINATION but explicitly NOT the source. The +/// source is read-only (its contents aren't changed by the op); marking +/// it as mutated would forbid downstream optimizations from treating it +/// as immutable, which would be incorrect over-conservatism — the JSC +/// `ic_table` analogue. +#[test] +fn table_copy_marks_destination_only_not_source() { + let bits = translate_and_get_mutability( + r#" + (module + (table $dst (export "dst") 4 funcref) + (table $src 4 funcref) + (func $f (result i32) i32.const 0) + (elem (table $src) (i32.const 0) func $f $f $f $f) + (func (export "do_copy") + i32.const 0 ;; dst offset + i32.const 0 ;; src offset + i32.const 4 ;; len + table.copy $dst $src)) + "#, + ); + assert_eq!( + bits, + vec![true, false], + "dst should be mutated, src should remain immutable", + ); +} + +/// `table.init` writes to the destination table from a passive elem +/// segment, so it is treated as mutation (the destination's contents +/// change at runtime). 
+#[test] +fn table_init_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (elem $e funcref (ref.func $f) (ref.func $f)) + (func (export "do_init") + i32.const 0 ;; dst + i32.const 0 ;; src offset within elem + i32.const 2 ;; len + table.init 0 $e)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `elem.drop` drops a passive element segment but does NOT write to any +/// table — distinct from `table.init` which DOES write. A pessimistic +/// implementation that marked all tables as mutated on `elem.drop` would +/// hand out false positives and shut off optimizations on perfectly- +/// immutable tables. +#[test] +fn elem_drop_does_not_mark_tables() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (elem $e funcref (ref.func $f)) + (func (export "do_drop") + elem.drop $e)) + "#, + ); + assert_eq!(bits, vec![false]); +} + +/// Imported tables are always pre-marked as mutated, regardless of +/// whether any opcode in this module touches them. The importer can +/// mutate the table in ways this module can't see. +#[test] +fn imported_tables_are_pre_marked() { + let bits = translate_and_get_mutability( + r#" + (module + (import "host" "t" (table 4 funcref))) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// A mutation in ONE function correctly marks the table — the analysis +/// has to walk every function body, not just the first. +#[test] +fn mutation_in_any_function_counts() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "innocent") (result i32) + i32.const 0 + call_indirect (param) (result i32)) + (func (export "guilty") + i32.const 0 + ref.func $f + table.set 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// Two tables, one mutated, one not. 
The analysis tracks per-table — a +/// mutation on one must not leak to the other. +#[test] +fn mutation_isolated_to_target_table() { + let bits = translate_and_get_mutability( + r#" + (module + (table $a 4 funcref) + (table $b 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "mut_a") + i32.const 0 + ref.func $f + table.set $a)) + "#, + ); + assert_eq!( + bits, + vec![true, false], + "$a should be mutated, $b should remain immutable", + ); +} + +/// Translating without any tables at all must not panic. (Defensive: the +/// analysis indexes a `SecondaryMap` keyed by `TableIndex`, and we want +/// to confirm an empty module produces an empty result rather than e.g. +/// a default-allocated single entry.) +#[test] +fn module_with_no_tables_produces_empty_mutability_vec() { + let bits = translate_and_get_mutability( + r#" + (module + (func (export "noop"))) + "#, + ); + assert!(bits.is_empty(), "no tables ⇒ no mutability bits"); +} diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 5b3f79445340..07c73584afd9 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -1425,6 +1425,83 @@ impl OpVisitor for Interpreter<'_> { ControlFlow::Continue(()) } + fn call_indirect1(&mut self, dst: XReg, arg1: XReg) -> ControlFlow { + // Read arg1 before writing x0 so this is safe when `arg1 == x0`. + let arg1_val = self.state[arg1]; + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = arg1_val; + // SAFETY: same as `call_indirect`. 
+ unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + + fn call_indirect2(&mut self, dst: XReg, arg1: XReg, arg2: XReg) -> ControlFlow { + let (a1, a2) = (self.state[arg1], self.state[arg2]); + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = a1; + self.state[XReg::x1] = a2; + // SAFETY: same as `call_indirect`. + unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + + fn call_indirect3( + &mut self, + dst: XReg, + arg1: XReg, + arg2: XReg, + arg3: XReg, + ) -> ControlFlow { + let (a1, a2, a3) = (self.state[arg1], self.state[arg2], self.state[arg3]); + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = a1; + self.state[XReg::x1] = a2; + self.state[XReg::x2] = a3; + // SAFETY: same as `call_indirect`. + unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + + fn call_indirect4( + &mut self, + dst: XReg, + arg1: XReg, + arg2: XReg, + arg3: XReg, + arg4: XReg, + ) -> ControlFlow { + let (a1, a2, a3, a4) = ( + self.state[arg1], + self.state[arg2], + self.state[arg3], + self.state[arg4], + ); + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = a1; + self.state[XReg::x1] = a2; + self.state[XReg::x2] = a3; + self.state[XReg::x3] = a4; + // SAFETY: same as `call_indirect`. 
+ unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + fn jump(&mut self, offset: PcRelOffset) -> ControlFlow { self.pc_rel_jump::(offset) } @@ -2296,6 +2373,340 @@ impl OpVisitor for Interpreter<'_> { ControlFlow::Continue(()) } + fn xband32_s8_br_if_x32( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i32(); + self.state[dst].set_i32(s & i32::from(mask)); + if s != 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband32_s8_br_if_not_x32( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i32(); + self.state[dst].set_i32(s & i32::from(mask)); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband64_s8_br_if_x64( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i64(); + self.state[dst].set_i64(s & i64::from(mask)); + if s != 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband64_s8_br_if_not_x64( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i64(); + self.state[dst].set_i64(s & i64::from(mask)); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xfuncref_dispatch_x64( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + // `src` is the already-masked funcref. The null side traps: the + // fusion absorbed the continuation-block loads, so the lazy-init + // slow path's rejoin would see uninitialized dst_code/dst_vmctx. + // Gated on `is_eagerly_initialized_funcref_table`, so trapping + // here is unreachable in correct code. 
+ let s = self.state[src].get_u64(); + if s == 0 { + self.done_trap::() + } else { + // SAFETY: predicate guarantees `src` points to a real VMFuncRef. + let base = s as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + self.pc_rel_jump::(offset) + } + } + + fn xfuncref_dispatch_not_x64( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + // Inverted form: fast path falls through; null side traps. + // `offset` is unused (kept for encoding shape parity). + let _ = offset; + let s = self.state[src].get_u64(); + if s == 0 { + self.done_trap::() + } else { + let base = s as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + ControlFlow::Continue(()) + } + } + + fn xfuncref_dispatch_x32( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u32(); + if s == 0 { + self.done_trap::() + } else { + let base = s as usize as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + self.pc_rel_jump::(offset) + } + } + + fn xfuncref_dispatch_not_x32( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> 
ControlFlow { + let _ = offset; + let s = self.state[src].get_u32(); + if s == 0 { + self.done_trap::() + } else { + let base = s as usize as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + ControlFlow::Continue(()) + } + } + + fn xband_funcref_dispatch_x64( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + // Combines the standalone xband64_s8 with the xfuncref dispatch. + // `src` is unmasked. `dst_masked = src & -2` is written + // unconditionally so the brif's block-call-arg copy still finds a + // producer; the loads + branch fire on `masked != 0`, so `src == 1` + // (init bit on null) traps like `src == 0` instead of loading via null. + let s = self.state[src].get_u64(); + let masked = s & !1u64; + self.state[dst_masked].set_u64(masked); + if masked != 0 { + let base = masked as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + self.pc_rel_jump::(offset) + } else { + self.done_trap::() + } + } + + fn xband_funcref_dispatch_not_x64( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + // Inverted form; `offset` is vestigial. Null test is on `masked`, not `src`. 
+ let _ = offset; + let s = self.state[src].get_u64(); + let masked = s & !1u64; + self.state[dst_masked].set_u64(masked); + if masked == 0 { + self.done_trap::() + } else { + let base = masked as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + ControlFlow::Continue(()) + } + } + + fn xband_funcref_dispatch_x32( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u32(); + let masked = s & !1u32; + self.state[dst_masked].set_u32(masked); + if masked != 0 { + let base = masked as usize as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + self.pc_rel_jump::(offset) + } else { + self.done_trap::() + } + } + + fn xband_funcref_dispatch_not_x32( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let _ = offset; + let s = self.state[src].get_u32(); + let masked = s & !1u32; + self.state[dst_masked].set_u32(masked); + if masked == 0 { + self.done_trap::() + } else { + let base = masked as usize as *const u8; + unsafe { + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + ControlFlow::Continue(()) + } + } + + fn xbor32(&mut self, operands: BinaryOperands) -> 
ControlFlow { let a = self.state[operands.src1].get_u32(); let b = self.state[operands.src2].get_u32(); diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 36a09cb13a34..de2210d7e4b0 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -115,6 +115,14 @@ macro_rules! for_each_op { /// Transfer control to the PC in `reg` and set `lr` to the PC just /// after this instruction. call_indirect = CallIndirect { reg: XReg }; + /// Like `call_indirect`, but also `x0 = arg1`. + call_indirect1 = CallIndirect1 { reg: XReg, arg1: XReg }; + /// Like `call_indirect`, but also `x0, x1 = arg1, arg2`. + call_indirect2 = CallIndirect2 { reg: XReg, arg1: XReg, arg2: XReg }; + /// Like `call_indirect`, but also `x0, x1, x2 = arg1, arg2, arg3`. + call_indirect3 = CallIndirect3 { reg: XReg, arg1: XReg, arg2: XReg, arg3: XReg }; + /// Like `call_indirect`, but also `x0, x1, x2, x3 = arg1, arg2, arg3, arg4`. + call_indirect4 = CallIndirect4 { reg: XReg, arg1: XReg, arg2: XReg, arg3: XReg, arg4: XReg }; /// Unconditionally transfer control to the PC at the given offset. jump = Jump { offset: PcRelOffset }; @@ -562,6 +570,58 @@ macro_rules! for_each_op { xband64_s8 = Xband64S8 { dst: XReg, src1: XReg, src2: i8 }; /// Same as `xband64` but `src2` is a sign-extended 32-bit immediate. xband64_s32 = Xband64S32 { dst: XReg, src1: XReg, src2: i32 }; + + /// `low32(dst) = low32(src) & sign_extend(mask)`, then branch by + /// `offset` if `low32(src)` is non-zero. Fused `xband32_s8 + + /// br_if32` for the call_indirect lazy-init brif site. + xband32_s8_br_if_x32 = Xband32S8BrIfX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + /// Inverted form of `xband32_s8_br_if_x32`: branch if `low32(src)` + /// is zero. Mask + dst write are unconditional. + xband32_s8_br_if_not_x32 = Xband32S8BrIfNotX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + /// 64-bit form of `xband32_s8_br_if_x32`. 
+ xband64_s8_br_if_x64 = Xband64S8BrIfX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + /// Inverted form of `xband64_s8_br_if_x64`: branch if `src` is zero. + xband64_s8_br_if_not_x64 = Xband64S8BrIfNotX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + + /// Funcref-dispatch fusion (64-bit). If `src != 0`, load + /// `dst_code = [src + offset_code]`, `dst_vmctx = [src + + /// offset_vmctx]`, and branch by `offset`. `src` is the + /// already-masked funcref pointer. + /// + /// The null side traps. The fusion absorbs the two field loads + /// from the brif's continuation block; if execution reached the + /// original lazy-init slow path, it would rejoin that + /// continuation with `dst_code`/`dst_vmctx` uninitialized, so + /// the null path can no longer fall through safely. Gated on + /// `is_eagerly_initialized_funcref_table`, which guarantees the + /// null path is unreachable at runtime. + xfuncref_dispatch_x64 = XfuncrefDispatchX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xfuncref_dispatch_x64`: fast path falls + /// through; null path traps. `offset` is vestigial (kept for + /// shape parity with the forward variant). + xfuncref_dispatch_not_x64 = XfuncrefDispatchNotX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// 32-bit pointer-width form of `xfuncref_dispatch_x64`. + xfuncref_dispatch_x32 = XfuncrefDispatchX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xfuncref_dispatch_x32`. + xfuncref_dispatch_not_x32 = XfuncrefDispatchNotX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + + /// Combines `xband64_s8 dst_masked, src, -2` with + /// `xfuncref_dispatch_*_x64` into one op. `src` is the unmasked + /// funcref; the init-bit strip is internal. 
+ /// + /// `dst_masked = src & -2` unconditionally. If `src != 0`, do + /// the two loads and branch by `offset`. Null side traps (same + /// rationale as `xfuncref_dispatch_*`). + xband_funcref_dispatch_x64 = XbandFuncrefDispatchX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xband_funcref_dispatch_x64`: fast path + /// falls through; null path traps. `dst_masked` is still + /// written unconditionally. `offset` is vestigial. + xband_funcref_dispatch_not_x64 = XbandFuncrefDispatchNotX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// 32-bit pointer-width form of `xband_funcref_dispatch_x64`. + xband_funcref_dispatch_x32 = XbandFuncrefDispatchX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xband_funcref_dispatch_x32`. + xband_funcref_dispatch_not_x32 = XbandFuncrefDispatchNotX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// `low32(dst) = low32(src1) | low32(src2)` xbor32 = XBor32 { operands: BinaryOperands }; /// Same as `xbor64` but `src2` is a sign-extended 8-bit immediate. diff --git a/tests/all/pulley.rs b/tests/all/pulley.rs index d4cd458f915d..7af28655567f 100644 --- a/tests/all/pulley.rs +++ b/tests/all/pulley.rs @@ -515,3 +515,282 @@ fn decode_unaligned() -> Result<()> { Ok(()) } + +// Runtime-semantics tests for the call_indirect fusion stack +// (`tests/disas/pulley-fusion-*.wat` covers the static disasm side). +// Each test runs the same wasm under Pulley and native Cranelift and +// asserts the results agree. + +/// Pulley config for tests that exercise traps. The interpreter can't +/// catch signals, so trap emission must be explicit. 
+fn pulley_trap_safe_config() -> Config { + let mut config = pulley_config(); + config.signals_based_traps(false); + config +} + +fn pulley_and_native_agree( + wat: &str, + func_name: &str, + params: Params, +) -> Result +where + Params: wasmtime::WasmParams + Copy, + Results: wasmtime::WasmResults + std::fmt::Debug + PartialEq, +{ + let bytes = wat::parse_str(wat)?; + let pulley = { + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let f = inst.get_typed_func::(&mut store, func_name)?; + f.call(&mut store, params)? + }; + let native = { + let engine = Engine::new(&Config::new())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let f = inst.get_typed_func::(&mut store, func_name)?; + f.call(&mut store, params)? + }; + assert_eq!( + pulley, native, + "Pulley and native diverged for `{func_name}` — fusion lowering bug?" + ); + Ok(pulley) +} + +/// Fusion returns the right callee for every in-bounds index and traps +/// on OOB. +#[test] +fn fusion_call_indirect_every_index() -> Result<()> { + let wat = r#" + (module + (table 3 3 funcref) + (func $f0 (result i32) i32.const 100) + (func $f1 (result i32) i32.const 101) + (func $f2 (result i32) i32.const 102) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + (elem (i32.const 0) func $f0 $f1 $f2)) + "#; + for (idx, expected) in [(0_i32, 100_i32), (1, 101), (2, 102)] { + let got: i32 = pulley_and_native_agree(wat, "call", idx)?; + assert_eq!(got, expected, "idx {idx}"); + } + // Pulley only — native signal-based traps interact badly with + // `cargo test`'s debug-mode signal handlers. 
+ let bytes = wat::parse_str(wat)?; + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let f = inst.get_typed_func::(&mut store, "call")?; + let err = f.call(&mut store, 3).unwrap_err(); + let trap = err.downcast_ref::().expect("Trap"); + assert_eq!(*trap, Trap::TableOutOfBounds); + Ok(()) +} + +/// Two call_indirect sites in the same function; each must fuse +/// independently. +#[test] +fn fusion_call_indirect_multi_site() -> Result<()> { + let wat = r#" + (module + (table 3 3 funcref) + (func $f0 (result i32) i32.const 10) + (func $f1 (result i32) i32.const 20) + (func $f2 (result i32) i32.const 30) + (func (export "sum") (param i32 i32) (result i32) + local.get 0 call_indirect (result i32) + local.get 1 call_indirect (result i32) + i32.add) + (elem (i32.const 0) func $f0 $f1 $f2)) + "#; + for (a, b, expected) in [(0_i32, 1_i32, 30_i32), (1, 2, 50), (2, 0, 40), (1, 1, 40)] { + let got: i32 = pulley_and_native_agree(wat, "sum", (a, b))?; + assert_eq!(got, expected, "a={a} b={b}"); + } + Ok(()) +} + +/// `return_call_indirect` correctness with fusion applied. +#[test] +fn fusion_return_call_indirect() -> Result<()> { + let wat = r#" + (module + (table 2 2 funcref) + (type $sig (func (result i32))) + (func $f0 (result i32) i32.const 7) + (func $f1 (result i32) i32.const 11) + (func (export "tail") (param i32) (result i32) + local.get 0 + return_call_indirect (type $sig)) + (elem (i32.const 0) func $f0 $f1)) + "#; + for (idx, expected) in [(0_i32, 7_i32), (1, 11)] { + let got: i32 = pulley_and_native_agree(wat, "tail", idx)?; + assert_eq!(got, expected, "idx {idx}"); + } + Ok(()) +} + +/// Host mutates a slot to `ref.null func`; call_indirect must trap +/// `IndirectCallToNull`. 
+#[test] +fn fusion_call_indirect_with_host_null_set() -> Result<()> { + let wat = r#" + (module + (table (export "t") 2 2 funcref) + (func $f0 (result i32) i32.const 100) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + (elem (i32.const 0) func $f0 $f0)) + "#; + let bytes = wat::parse_str(wat)?; + + // Pulley only (see note on `fusion_call_indirect_every_index`). + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let call = inst.get_typed_func::(&mut store, "call")?; + assert_eq!(call.call(&mut store, 0)?, 100); + assert_eq!(call.call(&mut store, 1)?, 100); + + let table = inst.get_table(&mut store, "t").expect("table export"); + table.set(&mut store, 1, wasmtime::Ref::Func(None))?; + + assert_eq!(call.call(&mut store, 0)?, 100); + let err = call.call(&mut store, 1).unwrap_err(); + let trap = err.downcast_ref::().expect("Trap"); + assert_eq!(*trap, Trap::IndirectCallToNull); + Ok(()) +} + +/// Host `Table::set` swaps to a different funcref between calls; the +/// second call must observe the new target. 
+#[test] +fn fusion_call_indirect_with_host_swap() -> Result<()> { + let wat = r#" + (module + (table (export "t") 1 1 funcref) + (func $f0 (result i32) i32.const 100) + (func $f1 (result i32) i32.const 200) + (func (export "f1_ref") (result funcref) ref.func $f1) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + (elem declare func $f1) + (elem (i32.const 0) func $f0)) + "#; + let bytes = wat::parse_str(wat)?; + + for use_pulley in [true, false] { + let cfg = if use_pulley { + pulley_trap_safe_config() + } else { + Config::new() + }; + let engine = Engine::new(&cfg)?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let call = inst.get_typed_func::(&mut store, "call")?; + assert_eq!(call.call(&mut store, 0)?, 100); + + let f1_ref = inst + .get_typed_func::<(), Option>(&mut store, "f1_ref")? + .call(&mut store, ())? + .expect("f1_ref returned None"); + let table = inst.get_table(&mut store, "t").expect("table export"); + table.set(&mut store, 0, wasmtime::Ref::Func(Some(f1_ref)))?; + + assert_eq!(call.call(&mut store, 0)?, 200, "use_pulley={use_pulley}"); + } + Ok(()) +} + +/// Module B imports module A's table and calls into it. Tables are +/// imported, so the importer's `tables_mutated` is `true` and no +/// fusion fires on B's side; the call must still produce the right +/// result. 
+#[test] +fn fusion_call_indirect_imported_table() -> Result<()> { + let wat_a = r#" + (module + (table (export "t") 2 2 funcref) + (func $f0 (result i32) i32.const 42) + (func $f1 (result i32) i32.const 84) + (elem (i32.const 0) func $f0 $f1)) + "#; + let wat_b = r#" + (module + (import "a" "t" (table 2 2 funcref)) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32))) + "#; + let bytes_a = wat::parse_str(wat_a)?; + let bytes_b = wat::parse_str(wat_b)?; + + for use_pulley in [true, false] { + let cfg = if use_pulley { + pulley_trap_safe_config() + } else { + Config::new() + }; + let engine = Engine::new(&cfg)?; + let module_a = Module::new(&engine, &bytes_a)?; + let module_b = Module::new(&engine, &bytes_b)?; + let mut store = Store::new(&engine, ()); + let inst_a = Instance::new(&mut store, &module_a, &[])?; + let table_export = inst_a.get_export(&mut store, "t").expect("a.t"); + + let mut linker = wasmtime::Linker::new(&engine); + linker.define(&store, "a", "t", table_export)?; + let inst_b = linker.instantiate(&mut store, &module_b)?; + + let call = inst_b.get_typed_func::(&mut store, "call")?; + for (idx, expected) in [(0_i32, 42_i32), (1, 84)] { + assert_eq!( + call.call(&mut store, idx)?, + expected, + "use_pulley={use_pulley} idx={idx}" + ); + } + } + Ok(()) +} + +/// Single call_indirect to an uninitialised slot — the phase-2 fused +/// op's runtime null check must trap cleanly with the right trap kind, +/// not crash on the field deref. +/// +/// Call into an uninitialised table slot must trap. +#[test] +fn fusion_call_indirect_null_slot() -> Result<()> { + let wat = r#" + (module + (table (export "t") 1 1 funcref) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32))) + "#; + let bytes = wat::parse_str(wat)?; + // Pulley only — see note on `fusion_call_indirect_every_index`. 
+ let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let call = inst.get_typed_func::(&mut store, "call")?; + let err = call.call(&mut store, 0).unwrap_err(); + let trap = err.downcast_ref::().expect("Trap"); + assert_eq!(*trap, Trap::IndirectCallToNull); + Ok(()) +} diff --git a/tests/disas/call-indirect-immutable-elide-null.wat b/tests/disas/call-indirect-immutable-elide-null.wat new file mode 100644 index 000000000000..35e2e0c7f0db --- /dev/null +++ b/tests/disas/call-indirect-immutable-elide-null.wat @@ -0,0 +1,116 @@ +;;! target = "x86_64" + +;; Immutable funcref table where every slot is filled by the elem +;; segment (no "no-entry" gaps). With both the sig check AND the +;; funcref-NULL check elided, the dispatch path is reduced to: +;; - bounds check (static) +;; - lazy-init brif + masking +;; - load code+vmctx +;; - call_indirect +;; +;; In particular the cold block that handles the runtime trap-on-null +;; path should not exist after the funcref load: the static-match path +;; with `may_be_null = false` skips both the sig check and any +;; downstream null-handling. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + ;; Fully cover the table — no null slot anywhere. 
+ (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @003f v3 = iconst.i32 1 +;; @0041 jump block1 +;; +;; block1: +;; @0041 return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0044 v3 = iconst.i32 2 +;; @0046 jump block1 +;; +;; block1: +;; @0046 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0049 v3 = iconst.i32 3 +;; @004b jump block1 +;; +;; block1: +;; @004b return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:7 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0050 v4 = iconst.i32 3 +;; @0050 v5 = icmp uge v2, v4 ; v4 = 3 +;; @0050 v6 = uextend.i64 v2 +;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; 
@0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 +;; +;; block2 cold: +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0050 jump block3(v19) +;; +;; block3(v16: i64): +;; @0050 v20 = load.i64 notrap aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) +;; @0053 jump block1 +;; +;; block1: +;; @0053 return v22 +;; } diff --git a/tests/disas/call-indirect-immutable-elide-sig.wat b/tests/disas/call-indirect-immutable-elide-sig.wat new file mode 100644 index 000000000000..d5d892f6d99a --- /dev/null +++ b/tests/disas/call-indirect-immutable-elide-sig.wat @@ -0,0 +1,115 @@ +;;! target = "x86_64" + +;; Immutable funcref table where every elem-segment entry has the same +;; declared type as the call site. This module's `tables_mutated` bit +;; for table 0 is clear (no opcode in any function writes to it), and +;; all three slots resolve to the same module type as the call site. +;; That triggers `try_elide_sig_check_for_immutable_table` → +;; `CheckIndirectCallTypeSignature::StaticMatch`, removing the runtime +;; signature load + compare from the dispatch hot path. +;; +;; Look for the absence of `load.i32 user7 aligned readonly v_+16` (the +;; sig-id load) and the matching `icmp eq / trapz user8` on the call +;; site. Compare with `call-indirect-mutable-keeps-sigcheck.wat` for the +;; non-elided shape.
+ +(module + (table 10 10 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @003f v3 = iconst.i32 1 +;; @0041 jump block1 +;; +;; block1: +;; @0041 return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0044 v3 = iconst.i32 2 +;; @0046 jump block1 +;; +;; block1: +;; @0046 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0049 v3 = iconst.i32 3 +;; @004b jump block1 +;; +;; block1: +;; @004b return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:7 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0050 v4 = iconst.i32 10 +;; @0050 v5 = icmp uge v2, v4 ; v4 = 10 +;; @0050 v6 = uextend.i64 
v2 +;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 +;; +;; block2 cold: +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0050 jump block3(v19) +;; +;; block3(v16: i64): +;; @0050 v20 = load.i64 user7 aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) +;; @0053 jump block1 +;; +;; block1: +;; @0053 return v22 +;; } diff --git a/tests/disas/call-indirect-immutable-static-bound.wat b/tests/disas/call-indirect-immutable-static-bound.wat new file mode 100644 index 000000000000..05c3ffd748ab --- /dev/null +++ b/tests/disas/call-indirect-immutable-static-bound.wat @@ -0,0 +1,115 @@ +;;! target = "x86_64" + +;; Table declared with min < max (a "dynamic-declared" table) that is +;; never written to in the module. Without the per-table mutability +;; bit, Cranelift would emit `load.i64 v0+56` per dispatch to fetch +;; the current bound. With it, `make_table` lowers to +;; `TableSize::Static` and the bound becomes an immediate. +;; +;; Look for: bounds-check `iconst.i32 16` (the declared min, used as +;; static bound) and NO `load.i64 ... v0+56` for the current_elements +;; field. (`+48` for the funcref base is still loaded — that's the +;; element-data pointer, separate from the bound.) + +(module + ;; min=16, max=64 — distinct, so without our optimization the + ;; bound would be loaded per dispatch from `current_elements`. 
+ (table 16 64 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @003f v3 = iconst.i32 1 +;; @0041 jump block1 +;; +;; block1: +;; @0041 return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0044 v3 = iconst.i32 2 +;; @0046 jump block1 +;; +;; block1: +;; @0046 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0049 v3 = iconst.i32 3 +;; @004b jump block1 +;; +;; block1: +;; @004b return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:7 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0050 v4 = iconst.i32 16 +;; @0050 v5 = icmp uge v2, v4 ; v4 = 16 +;; @0050 v6 = uextend.i64 v2 +;; 
@0050 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 +;; +;; block2 cold: +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0050 jump block3(v19) +;; +;; block3(v16: i64): +;; @0050 v20 = load.i64 user7 aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) +;; @0053 jump block1 +;; +;; block1: +;; @0053 return v22 +;; } diff --git a/tests/disas/call-indirect-mutable-keeps-sigcheck.wat b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat new file mode 100644 index 000000000000..03318a349ef7 --- /dev/null +++ b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat @@ -0,0 +1,159 @@ +;;! target = "x86_64" + +;; Counterpart to `call-indirect-immutable-elide-sig.wat`. Same module +;; shape — same elem segment, same uniform call-site type — but one +;; function writes to the table via `table.set`. That sets the +;; `tables_mutated` bit and disables sig-check elision. +;; +;; Look for the runtime sig load + compare on the call site: +;; load.i32 user7 aligned readonly v_+16 +;; icmp eq +;; trapz user8 +;; (versus the elided form in the immutable test). + +(module + (table 10 10 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Mutator: this clears the immutability proof for table 0.
+ (func (export "mutate") (param i32) + local.get 0 + ref.func $f1 + table.set 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @004d v3 = iconst.i32 1 +;; @004f jump block1 +;; +;; block1: +;; @004f return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0052 v3 = iconst.i32 2 +;; @0054 jump block1 +;; +;; block1: +;; @0054 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0057 v3 = iconst.i32 3 +;; @0059 jump block1 +;; +;; block1: +;; @0059 return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i32) -> i64 tail +;; fn0 = colocated u805306368:6 sig0 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @005e v3 = iconst.i32 0 +;; @005e v4 = call fn0(v0, v3) ; v3 = 0 +;; @0060 v5 = iconst.i32 10 +;; @0060 v6 = icmp uge v2, v5 ; v5 = 10 +;; @0060 v7 = uextend.i64 v2 +;; @0060 v8 = load.i64 notrap aligned readonly 
can_move v0+48 +;; @0060 v9 = iconst.i64 3 +;; @0060 v10 = ishl v7, v9 ; v9 = 3 +;; @0060 v11 = iadd v8, v10 +;; @0060 v12 = iconst.i64 0 +;; @0060 v13 = select_spectre_guard v6, v12, v11 ; v12 = 0 +;; @0060 v14 = iconst.i64 1 +;; @0060 v15 = bor v4, v14 ; v14 = 1 +;; @0060 store user6 aligned region1 v15, v13 +;; @0062 jump block1 +;; +;; block1: +;; @0062 return +;; } +;; +;; function u0:4(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" +;; region2 = 40 "VMContext+0x28" +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:7 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0067 v4 = iconst.i32 10 +;; @0067 v5 = icmp uge v2, v4 ; v4 = 10 +;; @0067 v6 = uextend.i64 v2 +;; @0067 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; @0067 v8 = iconst.i64 3 +;; @0067 v9 = ishl v6, v8 ; v8 = 3 +;; @0067 v10 = iadd v7, v9 +;; @0067 v11 = iconst.i64 0 +;; @0067 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0067 v13 = load.i64 user6 aligned region1 v12 +;; @0067 v14 = iconst.i64 -2 +;; @0067 v15 = band v13, v14 ; v14 = -2 +;; @0067 brif v13, block3(v15), block2 +;; +;; block2 cold: +;; @0067 v17 = iconst.i32 0 +;; @0067 v18 = uextend.i64 v2 +;; @0067 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0067 jump block3(v19) +;; +;; block3(v16: i64): +;; @0067 v20 = load.i64 notrap aligned readonly can_move region2 v0+40 +;; @0067 v21 = load.i32 notrap aligned readonly can_move v20 +;; @0067 v22 = load.i32 user7 aligned readonly v16+16 +;; @0067 v23 = icmp eq v22, v21 +;; @0067 v24 = uextend.i32 v23 +;; @0067 trapz v24, user8 +;; @0067 v25 = load.i64 notrap aligned readonly v16+8 +;; @0067 v26 = 
load.i64 notrap aligned readonly v16+24 +;; @0067 v27 = call_indirect sig0, v25(v26, v0) +;; @006a jump block1 +;; +;; block1: +;; @006a return v27 +;; } diff --git a/tests/disas/gc/call-indirect-final-type.wat b/tests/disas/gc/call-indirect-final-type.wat index 0406261611bf..13ffa96bec62 100644 --- a/tests/disas/gc/call-indirect-final-type.wat +++ b/tests/disas/gc/call-indirect-final-type.wat @@ -23,47 +23,38 @@ ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx -;; gv4 = load.i64 notrap aligned gv3+48 -;; gv5 = load.i64 notrap aligned gv3+56 +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64, i32) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail ;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32, v3: i32): -;; @002b v5 = load.i64 notrap aligned v0+56 -;; @002b v9 = load.i64 notrap aligned v0+48 -;; @002b v6 = ireduce.i32 v5 -;; @002b v7 = icmp uge v3, v6 -;; @002b v13 = iconst.i64 0 -;; @002b v8 = uextend.i64 v3 -;; @002b v10 = iconst.i64 3 -;; @002b v11 = ishl v8, v10 ; v10 = 3 -;; @002b v12 = iadd v9, v11 -;; @002b v14 = select_spectre_guard v7, v13, v12 ; v13 = 0 -;; @002b v15 = load.i64 user6 aligned region1 v14 -;; @002b v16 = iconst.i64 -2 -;; @002b v17 = band v15, v16 ; v16 = -2 -;; @002b brif v15, block3(v17), block2 +;; @002b v12 = iconst.i64 0 +;; @002b v14 = load.i64 user6 aligned region1 v12 ; v12 = 0 +;; @002b v15 = iconst.i64 -2 +;; @002b v16 = band v14, v15 ; v15 = -2 +;; @002b brif v14, block3(v16), block2 ;; ;; block2 cold: -;; @002b v19 = iconst.i32 0 -;; @002b v21 = call fn0(v0, v19, v8) ; v19 = 0 -;; @002b jump block3(v21) +;; @002b v5 = iconst.i32 0 +;; @002b v7 = uextend.i64 v3 +;; @002b v20 = call fn0(v0, v5, v7) ; v5 = 0 +;; @002b jump block3(v20) ;; -;; block3(v18: i64): -;; @002b v24 = load.i32 user7 aligned readonly v18+16 -;; @002b v22 = load.i64 notrap aligned readonly can_move region2 
v0+40 -;; @002b v23 = load.i32 notrap aligned readonly can_move v22 -;; @002b v25 = icmp eq v24, v23 -;; @002b trapz v25, user8 -;; @002b v27 = load.i64 notrap aligned readonly v18+8 -;; @002b v28 = load.i64 notrap aligned readonly v18+24 -;; @002b v29 = call_indirect sig0, v27(v28, v0, v2) +;; block3(v17: i64): +;; @002b v23 = load.i32 user7 aligned readonly v17+16 +;; @002b v21 = load.i64 notrap aligned readonly can_move region2 v0+40 +;; @002b v22 = load.i32 notrap aligned readonly can_move v21 +;; @002b v24 = icmp eq v23, v22 +;; @002b trapz v24, user8 +;; @002b v26 = load.i64 notrap aligned readonly v17+8 +;; @002b v27 = load.i64 notrap aligned readonly v17+24 +;; @002b v28 = call_indirect sig0, v26(v27, v0, v2) ;; @002e jump block1 ;; ;; block1: -;; @002e return v29 +;; @002e return v28 ;; } ;; ;; function u0:1(i64 vmctx, i64, i32, i32) -> i32 tail { @@ -74,41 +65,32 @@ ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx -;; gv4 = load.i64 notrap aligned gv3+48 -;; gv5 = load.i64 notrap aligned gv3+56 +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64, i32) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail ;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32, v3: i32): -;; @0035 v5 = load.i64 notrap aligned v0+56 -;; @0035 v9 = load.i64 notrap aligned v0+48 -;; @0035 v6 = ireduce.i32 v5 -;; @0035 v7 = icmp uge v3, v6 -;; @0035 v13 = iconst.i64 0 -;; @0035 v8 = uextend.i64 v3 -;; @0035 v10 = iconst.i64 3 -;; @0035 v11 = ishl v8, v10 ; v10 = 3 -;; @0035 v12 = iadd v9, v11 -;; @0035 v14 = select_spectre_guard v7, v13, v12 ; v13 = 0 -;; @0035 v15 = load.i64 user6 aligned region1 v14 -;; @0035 v16 = iconst.i64 -2 -;; @0035 v17 = band v15, v16 ; v16 = -2 -;; @0035 brif v15, block3(v17), block2 +;; @0035 v12 = iconst.i64 0 +;; @0035 v14 = load.i64 user6 aligned region1 v12 ; v12 = 0 +;; @0035 v15 = iconst.i64 -2 +;; @0035 v16 
= band v14, v15 ; v15 = -2 +;; @0035 brif v14, block3(v16), block2 ;; ;; block2 cold: -;; @0035 v19 = iconst.i32 0 -;; @0035 v21 = call fn0(v0, v19, v8) ; v19 = 0 -;; @0035 jump block3(v21) +;; @0035 v5 = iconst.i32 0 +;; @0035 v7 = uextend.i64 v3 +;; @0035 v20 = call fn0(v0, v5, v7) ; v5 = 0 +;; @0035 jump block3(v20) ;; -;; block3(v18: i64): -;; @0035 v24 = load.i32 user7 aligned readonly v18+16 -;; @0035 v22 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @0035 v23 = load.i32 notrap aligned readonly can_move v22 -;; @0035 v25 = icmp eq v24, v23 -;; @0035 trapz v25, user8 -;; @0035 v27 = load.i64 notrap aligned readonly v18+8 -;; @0035 v28 = load.i64 notrap aligned readonly v18+24 -;; @0035 return_call_indirect sig0, v27(v28, v0, v2) +;; block3(v17: i64): +;; @0035 v23 = load.i32 user7 aligned readonly v17+16 +;; @0035 v21 = load.i64 notrap aligned readonly can_move region2 v0+40 +;; @0035 v22 = load.i32 notrap aligned readonly can_move v21 +;; @0035 v24 = icmp eq v23, v22 +;; @0035 trapz v24, user8 +;; @0035 v26 = load.i64 notrap aligned readonly v17+8 +;; @0035 v27 = load.i64 notrap aligned readonly v17+24 +;; @0035 return_call_indirect sig0, v26(v27, v0, v2) ;; } diff --git a/tests/disas/indirect-call-no-caching.wat b/tests/disas/indirect-call-no-caching.wat index ae42c54f4c27..1a2e852558bb 100644 --- a/tests/disas/indirect-call-no-caching.wat +++ b/tests/disas/indirect-call-no-caching.wat @@ -68,7 +68,6 @@ ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { ;; region0 = 8 "VMContext+0x8" ;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" -;; region2 = 40 "VMContext+0x28" ;; gv0 = vmctx ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 @@ -101,17 +100,11 @@ ;; @0050 jump block3(v19) ;; ;; block3(v16: i64): -;; @0050 v20 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @0050 v21 = load.i32 notrap aligned readonly can_move v20 -;; @0050 v22 = load.i32 user7 
aligned readonly v16+16 -;; @0050 v23 = icmp eq v22, v21 -;; @0050 v24 = uextend.i32 v23 -;; @0050 trapz v24, user8 -;; @0050 v25 = load.i64 notrap aligned readonly v16+8 -;; @0050 v26 = load.i64 notrap aligned readonly v16+24 -;; @0050 v27 = call_indirect sig0, v25(v26, v0) +;; @0050 v20 = load.i64 user7 aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v27 +;; @0053 return v22 ;; } diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat new file mode 100644 index 000000000000..178f86f72259 --- /dev/null +++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat @@ -0,0 +1,239 @@ +;;! target = "pulley64" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Immutable funcref table fully populated by a static elem segment — the +;; `is_eagerly_initialized_funcref_table` predicate holds AND sig check +;; is statically elided. Two-layer fusion fires at the call_indirect +;; dispatch tail: +;; +;; 1. `try_fuse_funcref_dispatch` (phase 2) absorbs the brif + the two +;; VMFuncRef field loads (`wasm_call` + `vmctx`) emitted by +;; `load_code_and_vmctx`, and emits one `xfuncref_dispatch_not_x64` +;; Pulley op. The continuation block's standalone loads are skipped +;; via the cross-block sink performed by Pulley's `pre_lower` hook. +;; +;; 2. The preceding `xband64_s8 v, -2` stays as a separate op (its +;; result is `src` to the fused dispatch). Phase-1's `BandBrIf` +;; fusion does NOT fire here because phase 2 absorbs the brif +;; first (the recogniser tries phase 2 before phase 1). +;; +;; What we pin here: the dispatch tail is exactly +;; `xband64_s8 ; xfuncref_dispatch_not_x64 ; call_indirect` — three +;; Pulley dispatches instead of the unfused five. +;; +;; NOTE(review): the expected output pinned below does not show this +;; shape: `function[3]` still contains the unfused `xband64_s8`, +;; `br_if_xeq64_i8`, two `xload64le_o32` and `call_indirect2`, and no +;; `xfuncref_dispatch_not_x64`. Confirm the fusion fires and regenerate.
+ +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x25 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 3, 0x59 // target = 0x72 +;; 20: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 +;; zext32 x15, x2 +;; xshl64_u6 x1, x15, 3 +;; xadd64 x0, x0, x1 +;; xload64le_o32 x1, x0, 0 +;; xband64_s8 x0, x1, -2 +;; br_if_xeq64_i8 x1, 0, 0x22 // target = 0x60 +;; 45: xmov x25, x3 +;; xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x25 +;; pop_frame_restore 16, x25 +;; ret +;; 60: xzero x0 +;; 62: xmov x25, x3 +;; 65: call3 x25, x0, x15, 0x267 // target = 0x2cc +;; 6d: jump -0x25 // target = 0x48 +;; 72: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0xc7 +;; xstore64le_o32 x13, 80, x15 +;; call -0xac // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc7 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; c7: xzero x0 +;; c9: pop_frame_restore 144, x16, x17, x18, 
x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ce: ret +;; +;; wasm[0]::array_to_wasm_trampoline[1]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x121 +;; xstore64le_o32 x13, 80, x15 +;; call -0x101 // target = 0x5 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x121 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 121: xzero x0 +;; 123: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 128: ret +;; +;; wasm[0]::array_to_wasm_trampoline[2]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x17b +;; xstore64le_o32 x13, 80, x15 +;; call -0x155 // target = 0xb +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x17b +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 17b: xzero x0 +;; 17d: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 182: ret +;; +;; wasm[0]::array_to_wasm_trampoline[3]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 
x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x15, x0, 8 +;; xmov_fp x2 +;; xstore64le_o32 x15, 72, x2 +;; xmov x2, sp +;; xstore64le_o32 x15, 64, x2 +;; xpcadd x2, 0x2d // target = 0x1df +;; xstore64le_o32 x15, 80, x2 +;; call3 x0, x1, x14, -0x1b0 // target = 0x11 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1df +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 1df: xzero x0 +;; 1e1: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e6: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x2, x0 +;; xmov x17, x1 +;; xload64le_o32 x13, x1, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 48, x14 +;; xmov_lr x14 +;; xstore64le_o32 x13, 56, x14 +;; xload64le_o32 x0, x0, 8 +;; xmov x16, sp +;; xone x4 +;; xmov x1, x2 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x15, x0 +;; br_if_not32 x15, 0x13 // target = 0x23e +;; 231: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 23e: xmov x1, x17 +;; 241: xload64le_o32 x0, x1, 16 +;; 248: xload64le_o32 x0, x0, 328 +;; 24f: call_indirect_host 42 +;; 253: trap +;; +;; signatures[1]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload64le_o32 x14, x1, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 48, x15 +;; xmov_lr x15 +;; xstore64le_o32 x14, 56, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0x2b4 +;; 2a7: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 2b4: xmov x1, x17 +;; 2b7: xload64le_o32 x0, x1, 16 +;; 2be: 
xload64le_o32 x0, x0, 328 +;; 2c5: call_indirect_host 42 +;; 2c9: trap +;; +;; wasmtime_builtin_table_get_lazy_init_func_ref: +;; push_frame +;; xload64le_o32 x9, x0, 8 +;; xmov_fp x10 +;; xstore64le_o32 x9, 48, x10 +;; xmov_lr x10 +;; xstore64le_o32 x9, 56, x10 +;; xload64le_o32 x11, x0, 16 +;; xmov x13, x0 +;; xload64le_o32 x0, x11, 56 +;; xmov x3, x2 +;; xmov x2, x1 +;; xmov x1, x13 +;; call_indirect_host 8 +;; pop_frame +;; ret diff --git a/tests/disas/pulley-fusion-fires-32bit.wat b/tests/disas/pulley-fusion-fires-32bit.wat new file mode 100644 index 000000000000..90ccab4100a9 --- /dev/null +++ b/tests/disas/pulley-fusion-fires-32bit.wat @@ -0,0 +1,234 @@ +;;! target = "pulley32" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Phase 2 fusion on 32-bit Pulley (used by arm64_32-apple-watchos +;; via cross-language LTO + linker-plugin-lto). The fused op is +;; `xfuncref_dispatch_x32` with i8 offsets 4 (wasm_call) and 12 +;; (vmctx) — half of the pulley64 offsets (8 and 24). +;; +;; This test pins the 32-bit dispatch tail shape AND verifies that +;; the `imm.bits() == -2` gate fires here (the band's Imm64 from +;; func_environ's `Imm64::from(-2_i64)` still bits-equals -2 even +;; though Cranelift truncates the imm to i32 for an i32 band). +;; +;; Known-follow-up from `docs/opcode-fusion-funcref-dispatch.md` → +;; "Known follow-ups" — arm64_32 / Apple Watch confirmation. This +;; test is the static side of that confirmation; the dynamic side +;; (a Pulley-on-Apple-Watch run) is gated by Apple Watch SE2 +;; hardware access. +;; +;; NOTE(review): the expected output pinned below does not show the +;; fused op: `function[3]` still contains the unfused `xband32_s8`, +;; `br_if_not32`, two `xload32le_o32` and `call_indirect2`, and no +;; `xfuncref_dispatch_x32`. Confirm the fusion fires and regenerate.
+ +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x25 +;; br_if_xugteq32_u8 x2, 3, 0x58 // target = 0x6e +;; 1d: xload32le_o32 x15, x0, 24 +;; xmov x3, x0 +;; xshl32_u6 x0, x2, 2 +;; xadd32 x15, x15, x0 +;; xload32le_o32 x15, x15, 0 +;; xband32_s8 x0, x15, -2 +;; br_if_not32 x15, 0x21 // target = 0x59 +;; 3e: xmov x25, x3 +;; xload32le_o32 x1, x0, 4 +;; xload32le_o32 x0, x0, 12 +;; call_indirect2 x1, x0, x25 +;; pop_frame_restore 16, x25 +;; ret +;; 59: xzero x0 +;; 5b: zext32 x1, x2 +;; 5e: xmov x25, x3 +;; 61: call3 x25, x0, x1, 0x267 // target = 0x2c8 +;; 69: jump -0x28 // target = 0x41 +;; 6e: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x13, x0, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 48, x14 +;; xmov x14, sp +;; xstore32le_o32 x13, 44, x14 +;; xpcadd x15, 0x2a // target = 0xc3 +;; xstore32le_o32 x13, 52, x15 +;; call -0xa8 // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; c3: xzero x0 +;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, 
x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ca: ret +;; +;; wasm[0]::array_to_wasm_trampoline[1]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x13, x0, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 48, x14 +;; xmov x14, sp +;; xstore32le_o32 x13, 44, x14 +;; xpcadd x15, 0x2a // target = 0x11d +;; xstore32le_o32 x13, 52, x15 +;; call -0xfd // target = 0x5 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11d +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 11d: xzero x0 +;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 124: ret +;; +;; wasm[0]::array_to_wasm_trampoline[2]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x13, x0, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 48, x14 +;; xmov x14, sp +;; xstore32le_o32 x13, 44, x14 +;; xpcadd x15, 0x2a // target = 0x177 +;; xstore32le_o32 x13, 52, x15 +;; call -0x151 // target = 0xb +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 177: xzero x0 +;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 17e: ret +;; +;; wasm[0]::array_to_wasm_trampoline[3]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 
+;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x15, x0, 4 +;; xmov_fp x2 +;; xstore32le_o32 x15, 48, x2 +;; xmov x2, sp +;; xstore32le_o32 x15, 44, x2 +;; xpcadd x2, 0x2d // target = 0x1db +;; xstore32le_o32 x15, 52, x2 +;; call3 x0, x1, x14, -0x1ac // target = 0x11 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 1db: xzero x0 +;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e2: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x2, x0 +;; xmov x17, x1 +;; xload32le_o32 x13, x1, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 36, x14 +;; xmov_lr x14 +;; xstore32le_o32 x13, 40, x14 +;; xload32le_o32 x0, x0, 4 +;; xmov x16, sp +;; xone x4 +;; xmov x1, x2 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x15, x0 +;; br_if_not32 x15, 0x13 // target = 0x23a +;; 22d: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 23a: xmov x1, x17 +;; 23d: xload32le_o32 x0, x1, 8 +;; 244: xload32le_o32 x0, x0, 164 +;; 24b: call_indirect_host 42 +;; 24f: trap +;; +;; signatures[1]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload32le_o32 x14, x1, 4 +;; xmov_fp x15 +;; xstore32le_o32 x14, 36, x15 +;; xmov_lr x15 +;; xstore32le_o32 x14, 40, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload32le_o32 x0, x0, 4 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0x2b0 +;; 2a3: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 2b0: xmov x1, x17 +;; 2b3: xload32le_o32 x0, x1, 8 +;; 2ba: xload32le_o32 x0, x0, 
164 +;; 2c1: call_indirect_host 42 +;; 2c5: trap +;; +;; wasmtime_builtin_table_get_lazy_init_func_ref: +;; push_frame +;; xload32le_o32 x9, x0, 4 +;; xmov_fp x10 +;; xstore32le_o32 x9, 36, x10 +;; xmov_lr x10 +;; xstore32le_o32 x9, 40, x10 +;; xload32le_o32 x11, x0, 8 +;; xmov x13, x0 +;; xload32le_o32 x0, x11, 28 +;; xmov x3, x2 +;; xmov x2, x1 +;; xmov x1, x13 +;; call_indirect_host 8 +;; pop_frame +;; ret diff --git a/tests/disas/pulley-fusion-fires-multi-call.wat b/tests/disas/pulley-fusion-fires-multi-call.wat new file mode 100644 index 000000000000..abd94de07148 --- /dev/null +++ b/tests/disas/pulley-fusion-fires-multi-call.wat @@ -0,0 +1,95 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Multiple call_indirect sites in the same function should each fuse +;; independently. The pre-pass scans every brif in every block; each +;; matching pattern marks its own pair of continuation loads as +;; absorbed. The lowering emits a separate FuncrefDispatch MachInst +;; at each brif. +;; +;; This test pins that the optimisation is per-call-site, not +;; per-function. A bug that misuses the pre-pass's `to_sink` list +;; (e.g. accidental dedup, missing one of two patterns) would show up +;; as one of the two dispatch tails reverting to unfused form. +;; +;; Reference precedent: ChakraCore #5915 ("setPrototypeOf does not +;; invalidate cached instanceof IC inside currently-executing +;; frame") — fused-op caches must be per-site, not per-function. 
+ +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_two") (param i32 i32) (result i32) + local.get 0 + call_indirect (result i32) + local.get 1 + call_indirect (result i32) + i32.add) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 32, x16, x17, x28, x29 +;; xmov x29, x3 +;; br_if_xugteq32_u8 x2, 3, 0xb1 // target = 0xca +;; 20: xload64le_o32 x28, x0, 48 +;; xmov x4, x0 +;; zext32 x1, x2 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x28, x0 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x68 // target = 0xa6 +;; 45: xmov x16, x4 +;; xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x16 +;; xmov x3, x29 +;; xmov x4, x16 +;; xmov x17, x0 +;; br_if_xugteq32_u8 x3, 3, 0x6a // target = 0xcd +;; 6a: zext32 x1, x3 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x28, x0 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x3a // target = 0xb8 +;; 85: xmov x16, x4 +;; xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x16 +;; xmov x1, x17 +;; xadd32 x0, x1, x0 +;; pop_frame_restore 32, x16, x17, x28, x29 +;; ret +;; a6: xzero x0 +;; a8: xmov x16, x4 +;; ab: call3 x16, x0, x1, 0x28f // target = 0x33a +;; b3: jump -0x6b // target = 0x48 +;; b8: xzero x0 +;; ba: xmov x16, x4 +;; bd: call3 x16, x0, x1, 0x27d // target = 0x33a +;; c5: jump -0x3d // target = 0x88 +;; ca: trap +;; cd: trap diff --git a/tests/disas/pulley-fusion-fires-return-call-indirect.wat b/tests/disas/pulley-fusion-fires-return-call-indirect.wat new file mode 100644 index 
000000000000..ae5faaba802c --- /dev/null +++ b/tests/disas/pulley-fusion-fires-return-call-indirect.wat @@ -0,0 +1,60 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; `return_call_indirect` IS a tail call but the lazy-init brif is +;; unchanged — only the call op itself is different. Phase 2 still +;; matches and fires here: the brif's continuation block contains +;; the same canonical 2-load pattern, and after the loads is a +;; `return_call_indirect` (lowered as `xjump` after the field reads) +;; instead of a `call_indirect`. Both consume (code, vmctx) the same +;; way, so the fusion is sound across the tail-call boundary. +;; +;; Intended disas: `xband64_s8 ; xfuncref_dispatch_not_x64 ; xjump`, +;; with the tail jump replacing the non-tail `call_indirect`. +;; NOTE(review): expected output below shows unfused br_if/xload ops; confirm. +;; +;; Reference precedent: WAMR #2231 ("AOT/JIT tail-call: +;; `return_call_indirect` is not actually tail" — uses LLVM `tail` +;; hint instead of `musttail`). Our fusion preserves tail-call +;; semantics because it runs upstream of the call_indirect-vs- +;; return_call_indirect choice; this test pins that. 
+ +(module + (table 1 1 funcref) + (type $sig (func (result i32))) + + (func $f1 (result i32) i32.const 1) + + (func (export "trampoline") (param i32) (result i32) + local.get 0 + return_call_indirect (type $sig)) + + (elem (i32.const 0) func $f1)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]: +;; push_frame_save 16, x25 +;; br_if32 x2, 0x58 // target = 0x62 +;; 10: xload64le_o32 x15, x0, 48 +;; xmov x1, x0 +;; zext32 x14, x2 +;; xshl64_u6 x0, x14, 3 +;; xadd64 x15, x15, x0 +;; xload64le_o32 x15, x15, 0 +;; xband64_s8 x0, x15, -2 +;; br_if_xeq64_i8 x15, 0, 0x22 // target = 0x50 +;; 35: xmov x25, x1 +;; xload64le_o32 x15, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x25 +;; pop_frame_restore 16, x25 +;; xjump x15 +;; 50: xzero x0 +;; xmov x25, x1 +;; call3 x25, x0, x14, 0x1b3 // target = 0x208 +;; jump -0x25 // target = 0x38 +;; 62: trap diff --git a/tests/disas/pulley-fusion-no-fire-mutable-table.wat b/tests/disas/pulley-fusion-no-fire-mutable-table.wat new file mode 100644 index 000000000000..f1c57adc892a --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-mutable-table.wat @@ -0,0 +1,341 @@ +;;! target = "pulley64" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Phase 1 / phase 2 fusion gating: a single `table.set` anywhere in +;; the module sets `tables_mutated[idx] = true` for that table, which +;; disables the `is_eagerly_initialized_funcref_table` predicate. +;; func_environ's IR rewrite then emits the ORIGINAL brif on `value` +;; (unmasked) instead of the rewritten brif on `value_masked`. With no +;; `brif(band(v, -2))` pattern reaching the lowering, neither phase 1 +;; (BandBrIf) nor phase 2 (FuncrefDispatch) fires. The dispatch tail +;; keeps its separate band + brif + xload + xload + call_indirect ops. 
+;; +;; Reference precedents in upstream interpreters where similar +;; mutation-invariant edges caused real bugs: +;; - V8 issue 5913 (call_indirect signature mismatch under table +;; sharing) — the sig-elide invariant must not survive a foreign +;; mutation. +;; - GHSA-q49f-xg75-m9xw (wasmtime Winch table.fill host panic) — +;; bulk table ops must invalidate fusion-eligibility. +;; - Hermes 24a8fe64 (HiddenClass GC'd mid-IC), Luau release/717 +;; (userdata write didn't invalidate store cache) — the general +;; shape "fused-op cached state survives invalidation source". +;; +;; This test pins the gating. Adding a `table.set` anywhere should +;; produce the unfused dispatch sequence below. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Mutator: clears the immutability proof for table 0. + (func (export "mutate") (param i32) + local.get 0 + ref.func $f1 + table.set 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x16, x17 +;; xmov x12, x0 +;; xmov x17, x2 +;; xzero x9 +;; xmov x16, x12 +;; call2 x16, x9, 0x3da // target = 0x3fb +;; xmov x2, x17 +;; xmov x12, x16 +;; br_if_xugteq32_u8 x2, 3, 0x2b // target = 0x59 +;; 35: xbor64_s8 x10, x0, 1 +;; xmov x0, x12 +;; xload64le_o32 x11, x0, 48 +;; zext32 x12, x2 +;; xshl64_u6 x12, x12, 3 +;; xadd64 x11, x11, x12 +;; xstore64le_o32 x11, 0, x10 +;; pop_frame_restore 16, x16, x17 +;; ret +;; 59: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) +;; +;; wasm[0]::function[4]: +;; push_frame_save 16, x28 +;; xmov x3, x0 +;; 
br_if_xugteq32_u8 x2, 3, 0x7c // target = 0xe0 +;; 6b: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 +;; zext32 x1, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0xce +;; 90: xmov x28, x3 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0xe3 +;; 9a: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x28, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x37 // target = 0xe6 +;; b6: xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x28 +;; pop_frame_restore 16, x28 +;; ret +;; ce: xzero x0 +;; d0: xmov x28, x3 +;; d3: call3 x28, x0, x1, 0x363 // target = 0x436 +;; db: jump -0x48 // target = 0x93 +;; e0: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) +;; e3: trap +;; ╰─╼ trap: Normal(IndirectCallToNull) +;; e6: trap +;; ╰─╼ trap: Normal(BadSignature) +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x13b +;; xstore64le_o32 x13, 80, x15 +;; call -0x120 // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x13b +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 13b: xzero x0 +;; 13d: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 142: ret +;; +;; wasm[0]::array_to_wasm_trampoline[1]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 
x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x195 +;; xstore64le_o32 x13, 80, x15 +;; call -0x175 // target = 0x5 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x195 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 195: xzero x0 +;; 197: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 19c: ret +;; +;; wasm[0]::array_to_wasm_trampoline[2]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x1ef +;; xstore64le_o32 x13, 80, x15 +;; call -0x1c9 // target = 0xb +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1ef +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 1ef: xzero x0 +;; 1f1: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1f6: ret +;; +;; wasm[0]::array_to_wasm_trampoline[3]: +;; push_frame_save 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x13, x2, 0 +;; xload64le_o32 x14, x0, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 72, x15 +;; xmov x15, sp +;; xstore64le_o32 x14, 64, x15 +;; xpcadd x15, 0x1f // target = 0x23e +;; xstore64le_o32 x14, 80, x15 +;; call3 x0, x1, x13, -0x21d // target = 0x11 +;; ├─╼ exception frame offset: SP = FP - 0x80 +;; ╰─╼ exception handler: 
default handler, no dynamic context, handler=0x23e +;; xone x0 +;; pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 23e: xzero x0 +;; 240: pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 245: ret +;; +;; wasm[0]::array_to_wasm_trampoline[4]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x15, x0, 8 +;; xmov_fp x2 +;; xstore64le_o32 x15, 72, x2 +;; xmov x2, sp +;; xstore64le_o32 x15, 64, x2 +;; xpcadd x2, 0x2d // target = 0x2a2 +;; xstore64le_o32 x15, 80, x2 +;; call3 x0, x1, x14, -0x228 // target = 0x5c +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x2a2 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 2a2: xzero x0 +;; 2a4: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 2a9: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x2, x0 +;; xmov x17, x1 +;; xload64le_o32 x13, x1, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 48, x14 +;; xmov_lr x14 +;; xstore64le_o32 x13, 56, x14 +;; xload64le_o32 x0, x0, 8 +;; xmov x16, sp +;; xone x4 +;; xmov x1, x2 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x15, x0 +;; br_if_not32 x15, 0x13 // target = 0x301 +;; 2f4: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 301: xmov x1, x17 +;; 304: xload64le_o32 x0, x1, 16 +;; 30b: xload64le_o32 x0, x0, 328 +;; 312: call_indirect_host 42 +;; 316: trap +;; +;; signatures[1]::wasm_to_array_trampoline: +;; push_frame_save 32, x16 +;; xmov x5, x0 +;; xmov x16, x1 
+;; xload64le_o32 x13, x1, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 48, x14 +;; xmov_lr x14 +;; xstore64le_o32 x13, 56, x14 +;; xmov x3, sp +;; xstore32le_o32 x3, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x5 +;; xmov x2, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0xc // target = 0x36d +;; 367: pop_frame_restore 32, x16 +;; ret +;; 36d: xmov x1, x16 +;; 370: xload64le_o32 x0, x1, 16 +;; 377: xload64le_o32 x0, x0, 328 +;; 37e: call_indirect_host 42 +;; 382: trap +;; +;; signatures[2]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload64le_o32 x14, x1, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 48, x15 +;; xmov_lr x15 +;; xstore64le_o32 x14, 56, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0x3e3 +;; 3d6: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 3e3: xmov x1, x17 +;; 3e6: xload64le_o32 x0, x1, 16 +;; 3ed: xload64le_o32 x0, x0, 328 +;; 3f4: call_indirect_host 42 +;; 3f8: trap +;; +;; wasmtime_builtin_ref_func: +;; push_frame +;; xload64le_o32 x8, x0, 8 +;; xmov_fp x9 +;; xstore64le_o32 x8, 48, x9 +;; xmov_lr x9 +;; xstore64le_o32 x8, 56, x9 +;; xload64le_o32 x10, x0, 16 +;; xmov x11, x0 +;; xload64le_o32 x0, x10, 48 +;; xmov x2, x1 +;; xmov x1, x11 +;; call_indirect_host 7 +;; pop_frame +;; ret +;; +;; wasmtime_builtin_table_get_lazy_init_func_ref: +;; push_frame +;; xload64le_o32 x9, x0, 8 +;; xmov_fp x10 +;; xstore64le_o32 x9, 48, x10 +;; xmov_lr x10 +;; xstore64le_o32 x9, 56, x10 +;; xload64le_o32 x11, x0, 16 +;; xmov x13, x0 +;; xload64le_o32 x0, x11, 56 +;; xmov x3, x2 +;; xmov x2, x1 +;; xmov x1, x13 +;; call_indirect_host 8 +;; pop_frame +;; ret diff --git a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat 
new file mode 100644 index 000000000000..398c5fee2cb3 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat @@ -0,0 +1,88 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 2 fusion does NOT match when the sig check is NOT statically +;; elided. With a runtime sig check, the continuation block starts +;; with a sig load (from the funcref's `type_index` field) + comparison +;; + trapz, NOT the two `wasm_call` / `vmctx` loads. Phase 2's +;; recogniser requires the first two CLIF insts in the continuation +;; to be the canonical loads, so it bails. Phase 1's band+brif fusion +;; still applies as fallback. +;; +;; The module shape: an untyped `funcref` table with elem entries of +;; MIXED signatures. With mixed sigs, `try_elide_sig_check_for_immutable_table` +;; cannot establish a uniform static type, and the runtime sig check +;; stays in the dispatch tail. +;; +;; Reference precedent: V8 issue 5913 ("call_indirect signature +;; mismatch with table-sharing") + WebKit changeset 273962 +;; ("call_ref / non-null funcref"): sig elision under "assumed- +;; immutable" predicates is a known footgun, and the safe fallback +;; is "keep the runtime sig check". + +(module + (table 3 3 funcref) + (type $sig (func (param i32) (result i32))) + + ;; $f1, $f2 match $sig. + (func $f1 (param i32) (result i32) i32.const 1) + (func $f2 (param i32) (result i32) i32.const 2) + ;; $f3 has a DIFFERENT signature — defeats uniform-sig elision. 
+ (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + local.get 0 + call_indirect (type $sig)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x16, x18 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 3, 0x82 // target = 0x9b +;; 20: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 +;; zext32 x1, x2 +;; xmov x18, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x48 // target = 0x89 +;; 48: xmov x16, x3 +;; br_if_xeq64_i8 x0, 0, 0x53 // target = 0x9e +;; 52: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x16, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x3a // target = 0xa1 +;; 6e: xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x2, x18 +;; call_indirect2 x1, x0, x16 +;; pop_frame_restore 16, x16, x18 +;; ret +;; 89: xzero x0 +;; 8b: xmov x16, x3 +;; 8e: call3 x16, x0, x1, 0x281 // target = 0x30f +;; 96: jump -0x4b // target = 0x4b +;; 9b: trap +;; 9e: trap +;; a1: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-copy.wat b/tests/disas/pulley-fusion-no-fire-table-copy.wat new file mode 100644 index 000000000000..9b89bccf1ec9 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-table-copy.wat @@ -0,0 +1,187 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 1 / phase 2 fusion gating: `table.copy` mutates the +;; destination table. With table 0 as the copy destination, its +;; immutability proof is cleared and the eager-init predicate becomes +;; false — fusion does not fire. +;; +;; Note that this only marks the DESTINATION as mutated; the source +;; table (table 1) keeps its proof. 
wasm-benchmark/`environ`'s +;; `table_mutability` test suite has the integration coverage for the +;; src-vs-dst marking; this filetest pins the lowering-level +;; consequence (Pulley dispatch tail is unfused for the dst table). +;; +;; wasm3 #547 (`op_CallIndirect` SEGV — missing bounds check on table +;; index) is a related precedent: bulk-copy invariants that fail +;; silently in one engine produce dispatch-time crashes in another. + +(module + (table $tdst 5 5 funcref) + (table $tsrc 5 5 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Bulk mutator: clears the immutability proof for table $tdst. + (func (export "copy") (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 + table.copy $tdst $tsrc) + + ;; Call through the (potentially-mutated) destination table. + (func (export "call_dst") (param i32) (result i32) + local.get 0 + call_indirect $tdst (result i32)) + + ;; Call through the source table (still immutable from this + ;; module's perspective; fusion CAN fire here). 
+ (func (export "call_src") (param i32) (result i32) + local.get 0 + call_indirect $tsrc (result i32)) + + (elem (table $tdst) (i32.const 0) func $f1 $f2 $f3) + (elem (table $tsrc) (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 64, x16, x17, x20, x21, x24, x26, x28 +;; zext32 x2, x2 +;; zext32 x1, x4 +;; xadd64 x5, x2, x1 +;; br_if_xugt64_u8 x5, 5, 0x109 // target = 0x128 +;; 26: zext32 x5, x3 +;; xadd64 x6, x5, x1 +;; br_if_xugt64_u8 x6, 5, 0xff // target = 0x12b +;; br_if_not32 x4, 0xcf // target = 0x102 +;; 39: xload64le_o32 x6, x0, 48 +;; xshl64_u6 x2, x2, 3 +;; xadd64 x17, x6, x2 +;; xload64le_o32 x16, x0, 64 +;; xmov x6, x0 +;; xshl64_u6 x0, x5, 3 +;; xadd64 x20, x16, x0 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x24, x17, x0 +;; xadd64 x26, x20, x0 +;; xadd32 x28, x3, x4 +;; xmov x0, x3 +;; br_if_xulteq64 x20, x17, 0x12 // target = 0x77 +;; 6c: xmov x21, x6 +;; xmov x28, x0 +;; jump 0x50 // target = 0xc2 +;; 77: xsub32_u8 x28, x28, 1 +;; br_if_xugteq32_u8 x28, 5, 0xb3 // target = 0x12e +;; 82: zext32 x1, x28 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x16, x0 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x72 // target = 0x108 +;; 9d: xmov x21, x6 +;; xbor64_s8 x0, x0, 1 +;; xsub64_u8 x24, x24, 8 +;; xstore64le_o32 x24, 0, x0 +;; xsub64_u8 x26, x26, 8 +;; br_if_xeq64 x26, x20, 0x4f // target = 0x102 +;; ba: xmov x6, x21 +;; jump -0x46 // target = 0x77 +;; br_if_xugteq32_u8 x28, 5, 0x6f // target = 0x131 +;; c9: zext32 x2, x28 +;; xshl64_u6 x3, x2, 3 +;; xadd64 x3, x16, x3 +;; xload64le_o32 x3, x3, 0 +;; xband64_s8 x0, x3, -2 +;; br_if_xeq64_i8 x3, 0, 0x3d // target = 0x11a +;; e4: xbor64_s8 x5, x0, 1 +;; xstore64le_o32 x17, 0, x5 +;; xadd64_u8 x20, 
x20, 8 +;; xadd64_u8 x17, x17, 8 +;; xadd32_u8 x28, x28, 1 +;; br_if_xneq64 x20, x26, -0x39 // target = 0xc2 +;; 102: pop_frame_restore 64, x16, x17, x20, x21, x24, x26, x28 +;; ret +;; 108: xone x0 +;; 10a: xmov x21, x6 +;; 10d: call3 x21, x0, x1, 0x4bf // target = 0x5cc +;; 115: jump -0x75 // target = 0xa0 +;; 11a: xone x4 +;; 11c: call2 x21, x4, 0x4b0 // target = 0x5cc +;; 123: jump -0x3f // target = 0xe4 +;; 128: trap +;; 12b: trap +;; 12e: trap +;; 131: trap +;; +;; wasm[0]::function[4]: +;; push_frame_save 16, x28 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 5, 0x7c // target = 0x1b8 +;; 143: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 +;; zext32 x1, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0x1a6 +;; 168: xmov x28, x3 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0x1bb +;; 172: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x28, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x37 // target = 0x1be +;; 18e: xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x28 +;; pop_frame_restore 16, x28 +;; ret +;; 1a6: xzero x0 +;; 1a8: xmov x28, x3 +;; 1ab: call3 x28, x0, x1, 0x421 // target = 0x5cc +;; 1b3: jump -0x48 // target = 0x16b +;; 1b8: trap +;; 1bb: trap +;; 1be: trap +;; +;; wasm[0]::function[5]: +;; push_frame_save 16, x25 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 5, 0x60 // target = 0x229 +;; 1d0: xmov x1, x3 +;; xload64le_o32 x0, x1, 64 +;; zext32 x15, x2 +;; xshl64_u6 x1, x15, 3 +;; xadd64 x0, x0, x1 +;; xload64le_o32 x1, x0, 0 +;; xband64_s8 x0, x1, -2 +;; br_if_xeq64_i8 x1, 0, 0x29 // target = 0x217 +;; 1f5: xmov x25, x3 +;; br_if_xeq64_i8 x0, 0, 0x34 // target = 0x22c +;; 1ff: xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x25 +;; pop_frame_restore 16, x25 +;; ret +;; 217: xone x0 +;; 219: xmov x25, x3 +;; 21c: call3 x25, x0, x15, 0x3b0 // target = 0x5cc +;; 224: jump -0x2c // target = 
0x1f8 +;; 229: trap +;; 22c: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-fill.wat b/tests/disas/pulley-fusion-no-fire-table-fill.wat new file mode 100644 index 000000000000..9bb480d7ff20 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-table-fill.wat @@ -0,0 +1,110 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 1 / phase 2 fusion gating: `table.fill` is a bulk-memory op +;; that mutates an arbitrary range of the table. Like `table.set`, it +;; sets `tables_mutated[idx] = true` for the target table and disables +;; the eager-init predicate. The dispatch tail must be the unfused +;; sequence with the original `brif value` (not `brif value_masked`), +;; so neither phase 1 nor phase 2 fires. +;; +;; Reference: GHSA-q49f-xg75-m9xw (wasmtime Winch table.fill host +;; panic) — bulk table ops are a classic invariant-edge for any +;; "immutable-table" cache or fusion. wasm3 #335 (null table element +;; on Swift reactor-mode tables) showed how a partially-initialised +;; table breaks a "table is fully populated" assumption. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Bulk mutator: clears the immutability proof for table 0. 
+ (func (export "fill_some") (param $dst i32) + local.get $dst + ref.func $f1 + i32.const 1 + table.fill 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x16, x20 +;; xmov x16, x2 +;; xzero x12 +;; xmov x20, x0 +;; call2 x20, x12, 0x3f7 // target = 0x415 +;; xmov x15, x0 +;; xmov x2, x16 +;; xmov x0, x20 +;; zext32 x12, x2 +;; xadd64_u8 x13, x12, 1 +;; br_if_xugt64_u8 x13, 3, 0x3e // target = 0x73 +;; 3c: xload64le_o32 x13, x0, 48 +;; xshl64_u6 x14, x12, 3 +;; xadd64 x13, x13, x14 +;; xmov x0, x15 +;; xmov x12, x13 +;; xbor64_s8 x14, x0, 1 +;; xstore64le_o32 x12, 0, x14 +;; xadd64_u8 x15, x12, 8 +;; br_if_xeq64 x12, x13, 0xf // target = 0x6d +;; 65: xmov x12, x15 +;; jump -0x19 // target = 0x4f +;; 6d: pop_frame_restore 16, x16, x20 +;; ret +;; 73: trap +;; +;; wasm[0]::function[4]: +;; push_frame_save 16, x28 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 3, 0x7c // target = 0xfa +;; 85: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 +;; zext32 x1, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0xe8 +;; aa: xmov x28, x3 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0xfd +;; b4: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x28, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x37 // target = 0x100 +;; d0: xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x28 +;; pop_frame_restore 16, x28 +;; ret +;; e8: xzero x0 +;; ea: xmov x28, x3 +;; ed: call3 x28, x0, x1, 0x363 // target = 0x450 +;; f5: jump -0x48 // target = 0xad +;; fa: trap +;; fd: trap 
+;; 100: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-grow.wat b/tests/disas/pulley-fusion-no-fire-table-grow.wat new file mode 100644 index 000000000000..5dcac37c501e --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-table-grow.wat @@ -0,0 +1,103 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 1 / phase 2 fusion gating: `table.grow` adds slots at the +;; end of the table, each filled with the init operand (which may be +;; `ref.null func`). The "eagerly-initialised, fully-populated" +;; predicate doesn't hold after grow, so fusion is disabled. +;; +;; In our `table_mutability` accounting (crates/environ), `table.grow` +;; sets the mutated bit for the target table the same way +;; `table.set` does. This filetest pins the lowering-level +;; consequence: the unfused dispatch sequence on the grown table. +;; +;; Reference: wasm3 #547 — bounds-check ↔ growth races; Luau release/ +;; 717 — "writes to userdata did not invalidate the store cache", +;; same shape of "fused-op cached a base pointer that got +;; reallocated". 
+ +(module + (table 1 funcref) + + (func $f1 (result i32) i32.const 1) + + (func (export "grow") (param i32) (result i32) + ref.func $f1 + local.get 0 + table.grow 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]: +;; push_frame_save 48, x18, x19, x20, x23, x28 +;; xmov x23, x2 +;; xzero x19 +;; xmov x28, x0 +;; call2 x28, x19, 0x313 // target = 0x325 +;; xmov x20, x0 +;; xmov x2, x23 +;; xmov x0, x28 +;; zext32 x18, x2 +;; call3 x28, x19, x18, 0x379 // target = 0x39e +;; xmov x1, x0 +;; br_if_xeq32_i8 x1, -1, 0x51 // target = 0x81 +;; 37: xload64le_o32 x3, x28, 56 +;; zext32 x2, x1 +;; xadd64 x4, x2, x18 +;; zext32 x0, x3 +;; br_if_xult64 x0, x4, 0x43 // target = 0x8a +;; 4e: xload64le_o32 x0, x28, 48 +;; xshl64_u6 x2, x2, 3 +;; xadd64 x0, x0, x2 +;; xshl64_u6 x2, x18, 3 +;; xadd64 x2, x0, x2 +;; br_if_xeq64_i8 x18, 0, 0x20 // target = 0x81 +;; 68: xmov x3, x20 +;; xbor64_s8 x4, x3, 1 +;; xstore64le_o32 x0, 0, x4 +;; xadd64_u8 x0, x0, 8 +;; br_if_xneq64 x0, x2, -0xf // target = 0x6b +;; 81: xmov x0, x1 +;; pop_frame_restore 48, x18, x19, x20, x23, x28 +;; ret +;; 8a: trap +;; +;; wasm[0]::function[2]: +;; push_frame_save 16, x16 +;; xload64le_o32 x1, x0, 56 +;; br_if_xulteq32 x1, x2, 0x7c // target = 0x115 +;; a0: xload64le_o32 x3, x0, 48 +;; xmov x4, x0 +;; zext32 x1, x2 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x3, x0 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0x103 +;; c5: xmov x16, x4 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0x118 +;; cf: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x16, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x37 // target = 0x11b +;; eb: xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x16 +;; pop_frame_restore 16, x16 +;; ret +;; 103: xzero x0 +;; 
105: xmov x16, x4 +;; 108: call3 x16, x0, x1, 0x258 // target = 0x360 +;; 110: jump -0x48 // target = 0xc8 +;; 115: trap +;; 118: trap +;; 11b: trap diff --git a/tests/disas/pulley-fusion-no-fire-user-mask.wat b/tests/disas/pulley-fusion-no-fire-user-mask.wat new file mode 100644 index 000000000000..2ce412df8cb9 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-user-mask.wat @@ -0,0 +1,92 @@ +;;! target = "pulley64" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Phase 1 / phase 2 fusion gating against user wasm: the recogniser +;; gates on `imm.bits() == -2`, which would naively match the wat +;; `(i32.const -2) (i32.and) (br_if)` user pattern and risk a soundness +;; mismatch (the fused op tests UNMASKED src for non-zero, whereas the +;; original brif tests `(v & -2) != 0` — they differ at v == 1). +;; +;; The bug is unreachable from wasm because: +;; * `br_if` cond is always i32 (wasm validation), AND +;; * the wat parser stores `(i32.const -2)` as `Imm64(0xFFFFFFFE)` +;; (= 4294967294), NOT `Imm64(-2)`. +;; So `imm.bits() == -2` doesn't match the wat-emitted i32 form. The +;; only producer of `Imm64(-2)` reaching the recogniser is +;; `func_environ::get_or_init_func_ref_table_elem`'s call to +;; `Imm64::from(-2_i64)`. +;; +;; This test pins the surface behaviour. If the gate ever changes to +;; accept i32 -2 encodings too, the disas would suddenly start +;; containing `xband32_s8_br_if_*` or `xfuncref_dispatch_*` here, and +;; this test fails — that's the signal to re-audit soundness. 
+ +(module + (func (export "test") (param $v i32) (result i32) (local $tmp i32) + local.get $v + i32.const -2 + i32.and + local.tee $tmp + local.get $tmp + br_if 0 + drop + i32.const 999)) +;; wasm[0]::function[0]: +;; push_frame +;; xband32_s8 x0, x2, -2 +;; br_if32 x0, 0xa // target = 0xf +;; b: xconst16 x0, 999 +;; pop_frame +;; ret +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x15, x0, 8 +;; xmov_fp x2 +;; xstore64le_o32 x15, 72, x2 +;; xmov x2, sp +;; xstore64le_o32 x15, 64, x2 +;; xpcadd x2, 0x2d // target = 0x6d +;; xstore64le_o32 x15, 80, x2 +;; call3 x0, x1, x14, -0x4f // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x6d +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 6d: xzero x0 +;; 6f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 74: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload64le_o32 x14, x1, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 48, x15 +;; xmov_lr x15 +;; xstore64le_o32 x14, 56, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0xd3 +;; c6: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; d3: xmov x1, x17 +;; d6: xload64le_o32 x0, x1, 16 +;; dd: xload64le_o32 x0, x0, 328 +;; e4: call_indirect_host 42 +;; e8: trap diff --git a/tests/disas/pulley/call.wat b/tests/disas/pulley/call.wat index 
233ca7be3c35..d9bc3142fd99 100644 --- a/tests/disas/pulley/call.wat +++ b/tests/disas/pulley/call.wat @@ -8,9 +8,7 @@ ;; wasm[0]::function[1]: ;; push_frame ;; xload32le_o32 x3, x0, 28 -;; xmov x6, x0 -;; xload32le_o32 x0, x6, 36 -;; xmov x1, x6 -;; call_indirect x3 +;; xload32le_o32 x4, x0, 36 +;; call_indirect2 x3, x4, x0 ;; pop_frame ;; ret diff --git a/tests/disas/readonly-funcrefs.wat b/tests/disas/readonly-funcrefs.wat index 9febf947e3b1..e341fbcc4dba 100644 --- a/tests/disas/readonly-funcrefs.wat +++ b/tests/disas/readonly-funcrefs.wat @@ -35,7 +35,6 @@ ;; function u0:1(i64 vmctx, i64, i32) tail { ;; region0 = 8 "VMContext+0x8" ;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" -;; region2 = 40 "VMContext+0x28" ;; gv0 = vmctx ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 @@ -67,14 +66,9 @@ ;; @0031 jump block3(v18) ;; ;; block3(v15: i64): -;; @0031 v21 = load.i32 user7 aligned readonly v15+16 -;; @0031 v19 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @0031 v20 = load.i32 notrap aligned readonly can_move v19 -;; @0031 v22 = icmp eq v21, v20 -;; @0031 trapz v22, user8 -;; @0031 v24 = load.i64 notrap aligned readonly v15+8 -;; @0031 v25 = load.i64 notrap aligned readonly v15+24 -;; @0031 call_indirect sig0, v24(v25, v0) +;; @0031 v19 = load.i64 user7 aligned readonly v15+8 +;; @0031 v20 = load.i64 notrap aligned readonly v15+24 +;; @0031 call_indirect sig0, v19(v20, v0) ;; @0034 jump block1 ;; ;; block1: diff --git a/tests/disas/startup-elem-active.wat b/tests/disas/startup-elem-active.wat index 0c3158f8c2b1..40cdfd2d91f4 100644 --- a/tests/disas/startup-elem-active.wat +++ b/tests/disas/startup-elem-active.wat @@ -42,37 +42,21 @@ ;; function u2415919104:0(i64 vmctx, i64) tail { ;; region0 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned gv0+48 -;; gv2 = load.i64 notrap aligned gv0+56 +;; gv1 
= load.i64 notrap aligned readonly can_move gv0+48 ;; ;; block0(v0: i64, v1: i64): -;; v4 = load.i64 notrap aligned v0+56 -;; v5 = ireduce.i32 v4 -;; v6 = uextend.i64 v5 -;; v86 = iconst.i64 4 -;; v92 = icmp ult v6, v86 ; v86 = 4 -;; trapnz v92, user6 -;; v13 = load.i64 notrap aligned v0+48 -;; v103 = iconst.i32 21 -;; v2 = iconst.i32 1 -;; v114 = icmp ule v5, v2 ; v2 = 1 -;; v79 = iconst.i64 0 -;; v17 = iadd v13, v86 ; v86 = 4 -;; v34 = select_spectre_guard v114, v79, v17 ; v79 = 0 -;; store user6 aligned region0 v103, v34 ; v103 = 21 +;; v100 = iconst.i32 21 +;; v12 = load.i64 notrap aligned readonly can_move v0+48 +;; v79 = iconst.i64 4 +;; v16 = iadd v12, v79 ; v79 = 4 +;; store user6 aligned region0 v100, v16 ; v100 = 21 ;; v117 = iconst.i32 23 -;; v123 = iconst.i32 2 -;; v129 = icmp ule v5, v123 ; v123 = 2 -;; v131 = iconst.i64 8 -;; v49 = iadd v13, v131 ; v131 = 8 -;; v51 = select_spectre_guard v129, v79, v49 ; v79 = 0 -;; store user6 aligned region0 v117, v51 ; v117 = 23 -;; v133 = iconst.i32 25 -;; v3 = iconst.i32 3 -;; v144 = icmp ule v5, v3 ; v3 = 3 -;; v146 = iconst.i64 12 -;; v66 = iadd v13, v146 ; v146 = 12 -;; v68 = select_spectre_guard v144, v79, v66 ; v79 = 0 -;; store user6 aligned region0 v133, v68 ; v133 = 25 +;; v134 = iconst.i64 8 +;; v46 = iadd v12, v134 ; v134 = 8 +;; store user6 aligned region0 v117, v46 ; v117 = 23 +;; v136 = iconst.i32 25 +;; v152 = iconst.i64 12 +;; v62 = iadd v12, v152 ; v152 = 12 +;; store user6 aligned region0 v136, v62 ; v136 = 25 ;; return ;; } diff --git a/tests/disas/startup-table-initial-value.wat b/tests/disas/startup-table-initial-value.wat index 7b39ecc93333..a2cb9a5f6da2 100644 --- a/tests/disas/startup-table-initial-value.wat +++ b/tests/disas/startup-table-initial-value.wat @@ -35,31 +35,24 @@ ;; ;; function u2415919104:0(i64 vmctx, i64) tail { ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned gv0+48 -;; gv2 = load.i64 notrap aligned gv0+56 +;; gv1 = load.i64 notrap aligned readonly can_move gv0+48 ;; ;; 
block0(v0: i64, v1: i64): -;; v9 = load.i64 notrap aligned v0+56 -;; v10 = ireduce.i32 v9 -;; v11 = uextend.i64 v10 -;; v41 = iconst.i64 10 -;; v53 = icmp ult v11, v41 ; v41 = 10 -;; trapnz v53, user6 -;; v18 = load.i64 notrap aligned v0+48 +;; v17 = load.i64 notrap aligned readonly can_move v0+48 ;; v3 = iconst.i32 1 -;; v83 = iconst.i64 36 -;; v85 = iadd v18, v83 ; v83 = 36 -;; v20 = iconst.i64 4 -;; jump block1(v18) -;; -;; block1(v29: i64): -;; v88 = iconst.i32 1 -;; store notrap aligned v88, v29 ; v88 = 1 -;; v89 = iadd.i64 v18, v83 ; v83 = 36 -;; v90 = icmp eq v29, v89 -;; v91 = iconst.i64 4 -;; v92 = iadd v29, v91 ; v91 = 4 -;; brif v90, block2, block1(v92) +;; v84 = iconst.i64 36 +;; v86 = iadd v17, v84 ; v84 = 36 +;; v19 = iconst.i64 4 +;; jump block1(v17) +;; +;; block1(v28: i64): +;; v89 = iconst.i32 1 +;; store notrap aligned v89, v28 ; v89 = 1 +;; v90 = iadd.i64 v17, v84 ; v84 = 36 +;; v91 = icmp eq v28, v90 +;; v92 = iconst.i64 4 +;; v93 = iadd v28, v92 ; v92 = 4 +;; brif v91, block2, block1(v93) ;; ;; block2: ;; return diff --git a/tests/misc_testsuite/immutable-table-call-indirect.wast b/tests/misc_testsuite/immutable-table-call-indirect.wast new file mode 100644 index 000000000000..3b40cb9ab534 --- /dev/null +++ b/tests/misc_testsuite/immutable-table-call-indirect.wast @@ -0,0 +1,71 @@ +;;! reference_types = true + +;; call_indirect through tables that are never grown, exported, or mutated. +;; Compilation may use a constant bound and elide null/signature checks on +;; these shapes; runtime behavior must be unchanged: in-bounds calls work, +;; and out-of-bounds, null-slot, and signature-mismatch accesses still trap. + +;; Mixed-signature immutable table with a null hole. 
+(module + (type $i2i (func (param i32) (result i32))) + (type $v2i (func (result i32))) + (table 5 funcref) + (elem (i32.const 0) $add1 $ten $add1) + + (func $add1 (type $i2i) (i32.add (local.get 0) (i32.const 1))) + (func $ten (type $v2i) (i32.const 10)) + + (func (export "call-i2i") (param i32 i32) (result i32) + (call_indirect (type $i2i) (local.get 1) (local.get 0))) + (func (export "call-v2i") (param i32) (result i32) + (call_indirect (type $v2i) (local.get 0)))) + +(assert_return (invoke "call-i2i" (i32.const 0) (i32.const 41)) (i32.const 42)) +(assert_return (invoke "call-i2i" (i32.const 2) (i32.const 7)) (i32.const 8)) +(assert_return (invoke "call-v2i" (i32.const 1)) (i32.const 10)) + +;; Signature mismatch still traps. +(assert_trap (invoke "call-i2i" (i32.const 1) (i32.const 0)) "indirect call type mismatch") +(assert_trap (invoke "call-v2i" (i32.const 0)) "indirect call type mismatch") + +;; Null slots still trap: slot 3 was never initialized. +(assert_trap (invoke "call-i2i" (i32.const 3) (i32.const 0)) "uninitialized element") +(assert_trap (invoke "call-v2i" (i32.const 4)) "uninitialized element") + +;; Out of bounds still traps against the constant bound. +(assert_trap (invoke "call-i2i" (i32.const 5) (i32.const 0)) "undefined element") +(assert_trap (invoke "call-i2i" (i32.const -1) (i32.const 0)) "undefined element") + +;; Uniform-signature immutable table, fully initialized. 
+(module + (type $v2i (func (result i32))) + (table 3 funcref) + (elem (i32.const 0) $a $b $c) + + (func $a (type $v2i) (i32.const 1)) + (func $b (type $v2i) (i32.const 2)) + (func $c (type $v2i) (i32.const 3)) + + (func (export "call") (param i32) (result i32) + (call_indirect (type $v2i) (local.get 0))) + (func (export "call-wrong-type") (param i32 i32) (result i32) + (call_indirect (param i32) (result i32) (local.get 1) (local.get 0)))) + +(assert_return (invoke "call" (i32.const 0)) (i32.const 1)) +(assert_return (invoke "call" (i32.const 1)) (i32.const 2)) +(assert_return (invoke "call" (i32.const 2)) (i32.const 3)) +(assert_trap (invoke "call" (i32.const 3)) "undefined element") + +;; A caller whose expected type differs from the table's uniform type must +;; still observe the mismatch. +(assert_trap (invoke "call-wrong-type" (i32.const 0) (i32.const 0)) "indirect call type mismatch") + +;; Same shapes through a declared-growable (max 100) table never actually +;; grown: an empty never-grown table has no valid index. +(module + (table 0 100 funcref) + (func (export "call-empty") (param i32) + (call_indirect (local.get 0)))) + +(assert_trap (invoke "call-empty" (i32.const 0)) "undefined element") +(assert_trap (invoke "call-empty" (i32.const 99)) "undefined element")