// splicer 2.4.1 - Docs.rs

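//! A [`wit_bindgen_core::abi::Bindgen`] implementation that emits
//! [`wasm_encoder`] instructions, used to drive
//! [`wit_bindgen_core::abi::lift_from_memory`] when the adapter needs
//! to load an async task's result from linear memory onto the wasm
//! value stack for `task.return`. The same bindgen also drives
//! `lower_to_memory` (flat wasm function params → linear memory); see
//! the "Lift vs. lower" notes on `WasmEncoderBindgen`.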
//!
//! ## Operand model
//!
//! `Operand = ()` — the wasm value stack is the source of truth. The
//! generator's internal operand stack tracks counts, not identities;
//! our `emit` pushes/pops placeholders to match the declared arity of
//! each [`Instruction`] variant.
//!
//! ## Address handling
//!
//! The base address for all loads (and, in lower mode, stores) lives
//! in a local (`addr_local`) that the caller sets before invoking
//! `lift_from_memory` / `lower_to_memory`. Every load emit prepends
//! `local.get $addr` so the load has the address on top of the wasm
//! stack; inside a list iteration block the block's own iter local is
//! read instead of the outer one. The generator's "address operand"
//! can be cloned freely because our impl never pops a wasm value for
//! it — it always re-reads from the local.
//!
//! ## Block-capture IR
//!
//! Variant / option / result lifts require per-arm bodies that later
//! get wrapped in a wasm `block ... br_table ... end` structure.
//! Fixed-size lists (see below) need the element-read body replayed
//! N times with an advancing base address. Both cases use the same
//! mechanism: when `push_block` fires we start a fresh buffer; emits
//! redirect to the top-of-stack buffer; `finish_block` pops it to
//! [`CompletedBlock`]s that the variant / list-lift emit consumes.
//!
//! ## Fixed-size vs dynamic lists
//!
//! `list<T>` (dynamic) flattens to `[i32 ptr, i32 len]` — a
//! heap-like reference. `list<T, N>` (fixed-size) flattens to
//! `N × flat(T)` inlined on the wasm stack, semantically like
//! `tuple<T, …, T>`. When a fixed-size list is stored in memory
//! we walk the N contiguous slots and push each element's flat
//! values. See the `FixedLengthListLiftFromMemory` emit arm for
//! the full rationale and emission strategy.
//!
//! ## Authoritative canonical-ABI references
//!
//! When in doubt, these docs are ground truth for flatten / load /
//! store semantics that this Bindgen implements:
//!
//! - Spec narrative:
//!   <https://github.com/WebAssembly/component-model/blob/main/design/mvp/CanonicalABI.md>
//! - Python reference implementation (precise semantics):
//!   <https://github.com/WebAssembly/component-model/blob/main/design/mvp/canonical-abi/definitions.py>
//!
//! Individual emit arms below link to the specific spec functions
//! (`flatten_type`, `load`, `lift_flat`, etc.) they correspond to
//! when the mapping isn't obvious from the Instruction name.

use std::borrow::Cow;
use std::collections::HashMap;

use wasm_encoder::{BlockType, Instruction, MemArg, ValType};
use wit_bindgen_core::abi::{Bindgen, Bitcast, Instruction as AbiInst, WasmType};
use wit_parser::{Alignment, ArchitectureSize, FlagsRepr, Resolve, SizeAlign, Type};

use super::super::indices::LocalsBuilder;
use super::compat::{cast, flat_types};
use super::emit::wasm_type_to_val;

/// Bindgen that accumulates `wasm_encoder::Instruction`s into buffers,
/// ready to be flushed into a `Function` by [`WasmEncoderBindgen::drain_into`].
///
/// ## Lift vs. lower
///
/// The same bindgen drives both `lift_from_memory` (memory → flat
/// values on the wasm stack) and `lower_to_memory` (flat values from
/// wasm function params → memory). Lift uses [`Self::addr_local`] +
/// the load arms; lower additionally uses [`Self::flat_cursor`] +
/// [`Self::param_flat_count`] to feed the scalar lift-to-flat arms
/// (`I32FromU32`, etc.) plus the store arms. Pure-lift contexts can
/// skip [`Self::with_param_flat_count`].
///
/// Lower-mode invariant: wasm local index == `flat_cursor`. The
/// caller arranges for the wrapper's first `param_flat_count` locals
/// to be exactly the flat params, in canonical order. The replay
/// helpers rely on this to remap `LocalGet(k)` by cursor arithmetic.
/// Variant arms canonically overlap on the joined-flat window, so
/// `finish_block_body` rewinds the cursor for arm blocks tagged via
/// `VariantPayloadName`.
// NOTE(review): the inherent impl that follows exposes
// `into_instructions` as the way to take the accumulated buffer; no
// `drain_into` is defined there. Confirm the link target above still
// exists (it may live in another impl block) or retarget it.
pub(crate) struct WasmEncoderBindgen<'a> {
    /// Top-level instruction buffer — the final output that goes into
    /// the target Function. Emits land here when no block is active.
    main: Vec<Instruction<'static>>,
    /// Stack of active blocks. When non-empty, emits go to the top
    /// block's buffer instead of `main`. Populated by `push_block`,
    /// drained by `finish_block`.
    block_buffers: Vec<ActiveBlock>,
    /// Completed blocks waiting to be consumed by `VariantLift` /
    /// `OptionLift` / `ResultLift` / `FixedLengthListLiftFromMemory`.
    /// LIFO order — last `finish_block` is at the top.
    /// `into_instructions` asserts this is empty, so every captured
    /// block must have been consumed by then.
    completed_blocks: Vec<CompletedBlock>,
    /// Canonical-ABI sizes, required by `Bindgen::sizes`.
    sizes: &'a SizeAlign,
    /// Local index holding the base address for all loads/stores at
    /// the outermost scope. Iteration blocks override this via their
    /// own `iter_addr_local`; see [`WasmEncoderBindgen::current_addr_local`].
    /// For lower-mode the caller writes the per-param effective
    /// address into this local before each `lower_to_memory` invocation.
    addr_local: u32,
    /// Shared local allocator — the bindgen routes its dynamic
    /// allocations through the same [`LocalsBuilder`] the caller
    /// uses for its own locals, so all of the function's locals land
    /// in one contiguous, correctly-indexed block.
    indices: &'a mut LocalsBuilder,
    /// Width of the wrapper's flat-param block. `0` for lift-only.
    /// Set by `with_param_flat_count`; `emit_get_flat_slot` asserts
    /// the cursor stays strictly below it.
    param_flat_count: u32,
    /// Next flat slot to consume; also the wasm-local index, by the
    /// lower-mode invariant. Advanced by `emit_get_flat_slot`,
    /// rewound by `finish_block_body` for variant-arm blocks, and
    /// repositioned past the joined flat by
    /// `emit_variant_dispatch_for_lower`.
    flat_cursor: u32,
    /// Lazy per-`ValType` scratch local for the value-shuffle template
    /// stores need (`local.set $tmp; local.get $addr; local.get $tmp;
    /// <store>`). Allocated through [`Self::indices`] on first use,
    /// reused across stores that share a wasm value type.
    store_tmp_by_valtype: HashMap<ValType, u32>,
}

/// An active block being captured. Tracks its instruction buffer and
/// — for list / fixed-size-list iteration blocks — the i32 local that
/// holds the current element's base address. When the block body
/// emits loads, they read from this local (if set) rather than the
/// outer `addr_local`.
struct ActiveBlock {
    /// Instructions emitted while this block is the top of the
    /// block stack. Moved into [`CompletedBlock::body`] when the
    /// block is finished.
    buffer: Vec<Instruction<'static>>,
    /// Allocated lazily on the first `IterBasePointer` emitted inside
    /// this block. `None` for blocks that aren't iteration bodies
    /// (e.g. variant arm blocks).
    iter_addr_local: Option<u32>,
    /// `flat_cursor` value at the moment this block was opened.
    /// Lower-mode emit arms (variant dispatch, fixed-list iter)
    /// snapshot this so they can rewrite the captured `LocalGet`
    /// indices to map per-arm / per-iter cursor reads onto the
    /// actual canonical-ABI flat-slot positions at replay time.
    /// Unused on the lift side (lifts don't read flat slots).
    start_cursor: u32,
    /// Set on `VariantPayloadName`; triggers cursor rewind in
    /// `finish_block_body`. Starts out `false` for every new block.
    is_variant_arm: bool,
}

/// A captured block body — the wasm instructions emitted between a
/// `push_block` / `finish_block` pair. The variant-lift emit splices
/// these into the `block ... br_table ... end` dispatch structure;
/// the fixed-size-list emit replays them N times with per-iteration
/// base-address advancement.
///
/// We don't track the Bindgen operand count (`finish_block`'s
/// `operand.len()`) here because it counts the generator's abstract
/// operand stack — which collapses compound types via aggregate lifts
/// like `RecordLift`. For our purposes the *wasm* stack count is what
/// matters for widening, and that's driven by `push_flat(arm_type)`
/// at variant-emit time, not by the block itself.
struct CompletedBlock {
    /// The captured instructions, taken over from the active block's
    /// buffer when it was finished. Consumers clone these into the
    /// active buffer at replay time, optionally rewriting `LocalGet`
    /// indices that fall inside `[start_cursor, end_cursor)`.
    body: Vec<Instruction<'static>>,
    /// The iteration local the body's loads read from, if this was
    /// an iteration block. `None` for variant-arm blocks.
    iter_addr_local: Option<u32>,
    /// Cursor at the moment this block was opened (lower-mode only).
    /// Together with `end_cursor`, gives the [start, end) range of
    /// flat-param indices the body's `LocalGet`s reference.
    start_cursor: u32,
    /// Cursor at the moment this block was closed. `end - start` is
    /// the number of flat slots this block consumed. Recorded before
    /// any variant-arm cursor rewind, so the range stays intact.
    end_cursor: u32,
}

impl<'a> WasmEncoderBindgen<'a> {
    /// Build a bindgen with empty buffers, no active or completed
    /// blocks, and lower-mode switched off (`param_flat_count == 0`,
    /// cursor at slot 0).
    ///
    /// * `sizes` — canonical-ABI size/alignment table, borrowed for
    ///   the bindgen's lifetime.
    /// * `addr_local` — index of the i32 local holding the outermost
    ///   base address for loads; the caller is responsible for
    ///   setting it up.
    /// * `indices` — shared local allocator; every local the bindgen
    ///   allocates later goes through it.
    pub fn new(sizes: &'a SizeAlign, addr_local: u32, indices: &'a mut LocalsBuilder) -> Self {
        // Lift-only defaults; `with_param_flat_count` opts into lower-mode.
        let (param_flat_count, flat_cursor) = (0, 0);
        Self {
            sizes,
            addr_local,
            indices,
            param_flat_count,
            flat_cursor,
            main: Vec::new(),
            block_buffers: Vec::new(),
            completed_blocks: Vec::new(),
            store_tmp_by_valtype: HashMap::new(),
        }
    }

    /// Builder-style switch into lower-mode: records that the
    /// wrapper's locals `0..count` are the flat params, in canonical
    /// order, and hands the bindgen back.
    pub fn with_param_flat_count(self, count: u32) -> Self {
        let mut configured = self;
        configured.param_flat_count = count;
        configured
    }

    /// Stage a constant base address: appends `i32.const value`
    /// followed by `local.set $addr_local` to whichever buffer is
    /// currently active. Lower-mode callers use this between
    /// `lower_to_memory` calls so a single bindgen (and its
    /// `store_tmp_by_valtype` cache) can serve every param.
    pub fn emit_set_addr_const(&mut self, value: i32) {
        let staging = [
            Instruction::I32Const(value),
            Instruction::LocalSet(self.addr_local),
        ];
        for inst in staging {
            self.emit_one(inst);
        }
    }

    /// Finish emission and hand back the top-level instruction
    /// buffer. Any locals were allocated through the caller's
    /// [`LocalsBuilder`], so nothing else needs to be returned.
    ///
    /// # Panics
    ///
    /// Panics if a block is still being captured, or if a captured
    /// block was never consumed.
    pub fn into_instructions(self) -> Vec<Instruction<'static>> {
        let no_open_blocks = self.block_buffers.is_empty();
        assert!(
            no_open_blocks,
            "into_instructions called mid-block — push_block/finish_block unbalanced"
        );
        let no_pending_blocks = self.completed_blocks.is_empty();
        assert!(
            no_pending_blocks,
            "into_instructions called with unconsumed completed blocks \
             (variant emit missing?)"
        );
        self.main
    }

    /// Allocate a new local of the given type via the shared
    /// [`LocalsBuilder`].
    fn alloc_local(&mut self, ty: ValType) -> u32 {
        self.indices.alloc_local(ty)
    }

    /// Push a single instruction onto the active buffer — the
    /// innermost block being captured, or `main` when none is open.
    fn emit_one(&mut self, inst: Instruction<'static>) {
        let sink = self.active_buf();
        sink.push(inst);
    }

    /// Buffer that emits currently land in: the top block's buffer
    /// when a block is being captured, otherwise `main`.
    fn active_buf(&mut self) -> &mut Vec<Instruction<'static>> {
        if let Some(block) = self.block_buffers.last_mut() {
            return &mut block.buffer;
        }
        &mut self.main
    }

    /// Local index that loads and stores should read their base
    /// address from right now. Searches the active blocks from the
    /// innermost outwards for one that has an `iter_addr_local`, and
    /// falls back to the outer `addr_local` when none does — so a
    /// variant arm nested inside a fixed-size list picks up the
    /// list's iter local rather than the top-level one.
    fn current_addr_local(&self) -> u32 {
        self.block_buffers
            .iter()
            .rev()
            .find_map(|block| block.iter_addr_local)
            .unwrap_or(self.addr_local)
    }

    /// Shared path for every memory load: emits `local.get $addr`
    /// followed by the load instruction for `load`, using `offset`
    /// (wasm32 size) as the static offset and the load's natural
    /// alignment. `$addr` is whatever [`Self::current_addr_local`]
    /// resolves to, so loads inside an iteration block use that
    /// block's iter local rather than the outer base.
    fn emit_load(&mut self, offset: ArchitectureSize, load: LoadKind) {
        let base = self.current_addr_local();
        let load_inst = load.to_instruction(MemArg {
            offset: offset.size_wasm32() as u64,
            align: load.natural_align_log2(),
            memory_index: 0,
        });
        self.emit_one(Instruction::LocalGet(base));
        self.emit_one(load_inst);
    }

    /// Shared path for every scalar store. Expects the value to be
    /// stored on top of the wasm stack and leaves the stack one
    /// shorter.
    ///
    /// Wasm store opcodes want `[addr, value]` with the address
    /// underneath, but the address is never materialized on the wasm
    /// stack by this bindgen (`Operand = ()`). So the value is parked
    /// in a per-`ValType` scratch local, the address local is pushed,
    /// the value is pushed back, and only then is the store emitted.
    fn emit_store(&mut self, offset: ArchitectureSize, store: StoreKind) {
        // Scratch allocation happens first so local numbering does
        // not depend on what gets emitted afterwards.
        let scratch = self.alloc_store_tmp(store.value_valtype());
        let base = self.current_addr_local();
        let store_inst = store.to_instruction(MemArg {
            offset: offset.size_wasm32() as u64,
            align: store.natural_align_log2(),
            memory_index: 0,
        });
        let sequence = [
            Instruction::LocalSet(scratch),
            Instruction::LocalGet(base),
            Instruction::LocalGet(scratch),
            store_inst,
        ];
        self.active_buf().extend(sequence);
    }

    /// Lazy-allocated per-`ValType` scratch local used by [`Self::emit_store`].
    /// Reused across all stores of the same wasm value type (e.g.
    /// every i32 store shares one i32 tmp), so the per-fn local count
    /// scales with the number of distinct flat types in the params,
    /// not the number of stores.
    fn alloc_store_tmp(&mut self, vt: ValType) -> u32 {
        if let Some(&idx) = self.store_tmp_by_valtype.get(&vt) {
            return idx;
        }
        let idx = self.indices.alloc_local(vt);
        self.store_tmp_by_valtype.insert(vt, idx);
        idx
    }

    /// Read the next flat param: emits `local.get` for the local at
    /// the current cursor, then advances the cursor by one slot.
    ///
    /// # Panics
    ///
    /// Panics if the cursor has already reached `param_flat_count`.
    fn emit_get_flat_slot(&mut self) {
        let (idx, count) = (self.flat_cursor, self.param_flat_count);
        assert!(
            idx < count,
            "lift-to-flat past end of param flat (cursor={idx}, count={count})",
        );
        self.emit_one(Instruction::LocalGet(idx));
        self.flat_cursor += 1;
    }

    /// Emit a bitcast sequence to convert the top-of-stack value's
    /// wasm type. Decomposes `Bitcast::Sequence` recursively and maps
    /// each leaf bitcast to its wasm instruction.
    fn emit_bitcast(&mut self, bc: &Bitcast) {
        use Bitcast::*;
        match bc {
            None => {}
            I32ToI64 => self.emit_one(Instruction::I64ExtendI32U),
            I64ToI32 => self.emit_one(Instruction::I32WrapI64),
            F32ToI32 => self.emit_one(Instruction::I32ReinterpretF32),
            I32ToF32 => self.emit_one(Instruction::F32ReinterpretI32),
            F64ToI64 => self.emit_one(Instruction::I64ReinterpretF64),
            I64ToF64 => self.emit_one(Instruction::F64ReinterpretI64),
            F32ToI64 => {
                self.emit_one(Instruction::I32ReinterpretF32);
                self.emit_one(Instruction::I64ExtendI32U);
            }
            I64ToF32 => {
                self.emit_one(Instruction::I32WrapI64);
                self.emit_one(Instruction::F32ReinterpretI32);
            }
            // Wasm32 mapping: `Pointer` and `Length` are `i32`,
            // `PointerOrI64` is `i64`. Casts between types that
            // collapse to the same wasm type are genuine no-ops; the
            // ones that cross the i32/i64 boundary need the
            // corresponding wasm extend/wrap.
            PToI32 | I32ToP | I32ToL | LToI32 | PToL | LToP => {}
            I64ToP64 | P64ToI64 => {}
            PToP64 | LToI64 => self.emit_one(Instruction::I64ExtendI32U),
            P64ToP | I64ToL => self.emit_one(Instruction::I32WrapI64),
            Sequence(pair) => {
                let [a, b] = pair.as_ref();
                self.emit_bitcast(a);
                self.emit_bitcast(b);
            }
        }
    }

    /// Open a new capture block on top of the block stack. It starts
    /// with an empty buffer, remembers the current `flat_cursor`, is
    /// not yet marked as a variant arm, and has no iteration address
    /// local — that one is only allocated if the block turns out to
    /// be an iteration body (i.e. emits an `IterBasePointer`).
    fn start_block(&mut self) {
        let start_cursor = self.flat_cursor;
        let fresh = ActiveBlock {
            start_cursor,
            buffer: Vec::new(),
            iter_addr_local: None,
            is_variant_arm: false,
        };
        self.block_buffers.push(fresh);
    }

    /// Pop the top active block and record it as a completed block.
    fn finish_block_body(&mut self) {
        let active = self
            .block_buffers
            .pop()
            .expect("finish_block without matching push_block");
        let end_cursor = self.flat_cursor;
        // Sibling variant arms overlap on the joined-flat window;
        // rewind so the next arm starts at the same cursor. The
        // captured `end_cursor` keeps the replay range intact.
        if active.is_variant_arm {
            self.flat_cursor = active.start_cursor;
        }
        self.completed_blocks.push(CompletedBlock {
            body: active.buffer,
            iter_addr_local: active.iter_addr_local,
            start_cursor: active.start_cursor,
            end_cursor,
        });
    }

    /// Variant dispatch for write_to_memory (results=&[]): read disc,
    /// br_table per arm, pre-load arm-typed locals from joined-flat
    /// slots with joined→arm bitcasts so each arm sees its arm-flat
    /// types (the wrapper params are in joined-flat). Cursor lands at
    /// `variant_start + joined_flat.len()`. Nested-variant correctness
    /// is pinned by `lower_nested_variant_routes_through_replay`.
    ///
    /// Consumes the top `arm_types.len()` entries of
    /// `completed_blocks` (one per arm, in arm order). `arm_types[i]`
    /// is the payload type of arm `i`, or `None` for a payload-less
    /// arm.
    ///
    /// # Panics
    ///
    /// Panics if the variant or any arm has no flat representation
    /// (`flat_types` returns `None`), if the joined flat is empty, if
    /// fewer blocks were captured than there are arms, or if
    /// `arm_types` is empty.
    fn emit_variant_dispatch_for_lower(
        &mut self,
        resolve: &Resolve,
        variant_type: &Type,
        arm_types: &[Option<Type>],
    ) {
        let n_arms = arm_types.len();
        let joined = flat_types(resolve, variant_type, None).unwrap_or_else(|| {
            panic!(
                "variant flat must fit in MAX_FLAT_PARAMS ({}) — larger variants are invalid",
                Resolve::MAX_FLAT_PARAMS,
            )
        });
        // Same guard the lift-side `emit_variant_dispatch` applies:
        // slot 0 of the joined flat is the discriminant, and the
        // payload reads below index from `joined[1..]`.
        assert!(
            !joined.is_empty(),
            "variant joined flat must include at least a discriminant"
        );
        let joined_total = joined.len() as u32;

        let start = self
            .completed_blocks
            .len()
            .checked_sub(n_arms)
            .expect("fewer captured arm blocks than arms");
        let arms: Vec<CompletedBlock> = self.completed_blocks.drain(start..).collect();
        // All arms were opened at the same cursor (arm blocks rewind
        // on finish), so the first arm's start is the variant's start.
        let variant_start = arms
            .first()
            .expect("variant dispatch needs at least one arm block")
            .start_cursor;

        // Pre-compute each arm's flat shape — drives the per-arm
        // local allocation + the bitcast sequence joined→arm.
        let arm_flats: Vec<Vec<WasmType>> = arm_types
            .iter()
            .map(|at| match at {
                None => Vec::new(),
                Some(ty) => flat_types(resolve, ty, None).unwrap_or_else(|| {
                    panic!(
                        "variant arm flat must fit in MAX_FLAT_PARAMS ({}) — larger arms invalid",
                        Resolve::MAX_FLAT_PARAMS,
                    )
                }),
            })
            .collect();

        // Disc lives at the variant's first flat slot.
        let disc_param_local = variant_start;
        let disc_local = self.alloc_local(ValType::I32);
        self.emit_one(Instruction::LocalGet(disc_param_local));
        self.emit_one(Instruction::LocalSet(disc_local));

        // Nested blocks: $end / $default / $case_{n-1} … $case_0.
        self.emit_one(Instruction::Block(BlockType::Empty)); // $end
        self.emit_one(Instruction::Block(BlockType::Empty)); // $default
        for _ in 0..n_arms {
            self.emit_one(Instruction::Block(BlockType::Empty)); // $case_i
        }
        self.emit_one(Instruction::LocalGet(disc_local));
        let table: Cow<'static, [u32]> = Cow::Owned((0..n_arms as u32).collect());
        self.emit_one(Instruction::BrTable(table, n_arms as u32));
        self.emit_one(Instruction::End); // close $case_0

        for (i, (arm, arm_flat)) in arms.iter().zip(&arm_flats).enumerate() {
            // Pre-load arm-typed locals from the joined slots with
            // joined→arm bitcasts. The arm body's `LocalGet`s then
            // read directly from these locals, sidestepping the
            // type mismatch the wrapper's joined-flat sig would
            // otherwise produce at validation.
            let arm_locals: Vec<u32> = arm_flat
                .iter()
                .map(|wt| self.alloc_local(wasm_type_to_val(*wt)))
                .collect();
            for (m, &arm_wt) in arm_flat.iter().enumerate() {
                // `+ 1` skips the discriminant slot.
                let joined_local = variant_start + 1 + m as u32;
                self.emit_one(Instruction::LocalGet(joined_local));
                self.emit_bitcast(&cast(joined[m + 1], arm_wt));
                self.emit_one(Instruction::LocalSet(arm_locals[m]));
            }
            // Replay arm body, mapping its in-block `LocalGet(k)`s
            // (where k = block_start + position) to the matching
            // arm_locals[position].
            let block_range = (arm.start_cursor, arm.end_cursor);
            self.replay_block_with_arm_locals(&arm.body, block_range, &arm_locals);
            // br $end. Depth from inside case_i:
            // (n_arms-1-i) sibling cases + default + end → n_arms - i.
            let depth = (n_arms - i) as u32;
            self.emit_one(Instruction::Br(depth));
            self.emit_one(Instruction::End); // close this case
        }

        // After all case Ends control falls into $default's body
        // (out-of-range disc). Trap, then close $end.
        self.emit_one(Instruction::Unreachable);
        self.emit_one(Instruction::End); // close $end

        self.flat_cursor = variant_start + joined_total;
    }

    /// Re-emit a captured arm body into the active buffer,
    /// redirecting flat-slot reads to arm-typed locals: every
    /// `LocalGet(k)` with `k` inside `block_range` (start inclusive,
    /// end exclusive) becomes `LocalGet(arm_locals[k - start])`. All
    /// other instructions are cloned through unchanged. Variant-arm
    /// counterpart of [`Self::replay_block_remapped`].
    fn replay_block_with_arm_locals(
        &mut self,
        body: &[Instruction<'static>],
        block_range: (u32, u32),
        arm_locals: &[u32],
    ) {
        let (lo, hi) = block_range;
        let cursor_window = lo..hi;
        for inst in body {
            let replayed = match inst {
                Instruction::LocalGet(k) if cursor_window.contains(k) => {
                    let pos = (*k - lo) as usize;
                    debug_assert!(
                        pos < arm_locals.len(),
                        "arm-local pos {pos} >= arm_locals.len() ({})",
                        arm_locals.len(),
                    );
                    Instruction::LocalGet(arm_locals[pos])
                }
                untouched => untouched.clone(),
            };
            self.emit_one(replayed);
        }
    }

    /// Re-emit a captured body into the active buffer with its
    /// flat-slot reads shifted: every `LocalGet(k)` with `k` inside
    /// `block_range` (start inclusive, end exclusive) becomes
    /// `LocalGet(new_base + (k - start))`. `LocalGet`s outside that
    /// range (address / scratch locals) and all other instructions
    /// are cloned through unchanged. Used by fixed-list iteration,
    /// where the per-iteration shift is `i × elem_flat_width`.
    fn replay_block_remapped(
        &mut self,
        body: &[Instruction<'static>],
        block_range: (u32, u32),
        new_base: u32,
    ) {
        let (lo, hi) = block_range;
        let shifted = body.iter().map(|inst| match inst {
            Instruction::LocalGet(k) if (lo..hi).contains(k) => {
                Instruction::LocalGet(new_base + (*k - lo))
            }
            untouched => untouched.clone(),
        });
        self.active_buf().extend(shifted);
    }

    /// Emit a zero constant for the given flat wasm type — used to
    /// pad variant arms whose natural flat is shorter than the joined
    /// payload.
    fn emit_const_zero(&mut self, wt: WasmType) {
        use WasmType::*;
        let inst = match wt {
            I32 | Pointer | Length => Instruction::I32Const(0),
            I64 | PointerOrI64 => Instruction::I64Const(0),
            F32 => Instruction::F32Const(0.0f32.into()),
            F64 => Instruction::F64Const(0.0f64.into()),
        };
        self.emit_one(inst);
    }

    /// Emit the block / `br_table` dispatch structure for a variant
    /// lift, plus per-arm widening to the joined flat. Consumes the
    /// top `n` entries of `completed_blocks` (one per arm, in arm
    /// order) and the disc value on the wasm value stack.
    ///
    /// After this runs, the wasm stack holds the full joined flat
    /// `[disc, ...joined_payload]` for the variant.
    ///
    /// `arm_types[i]` is the payload type of arm `i`, or `None` for a
    /// payload-less arm; `variant_type` is the type whose flat form
    /// is the joined layout.
    ///
    /// ## Structure
    ///
    /// A wasm block with a multi-value result type (e.g. `block
    /// (result i32 i64)`) requires registering a function type in
    /// the module's type section — an awkward cross-cutting concern
    /// for a function-body-only emitter. Instead we route each arm's
    /// widened values through locals: the arm body widens-and-stores
    /// into per-variant locals, every block in the br_table chain
    /// uses `BlockType::Empty`, and after the chain closes we
    /// re-push `disc` followed by the payload locals to form the
    /// joined flat on the stack.
    ///
    /// ## Nested variants
    ///
    /// The `disc_local` is allocated FRESH per call, not shared
    /// across variants. If an arm contains a nested variant, the
    /// nested emit allocated its own `disc_local`, so it never
    /// writes to this call's local; the outer disc stays intact for
    /// the final re-push.
    ///
    /// # Panics
    ///
    /// Panics if the variant or any arm has no flat representation
    /// (`flat_types` returns `None`), if the joined flat is empty, or
    /// if fewer blocks were captured than there are arms. Indexing
    /// `joined_payload[j]` also panics if an arm's flat is longer
    /// than the joined payload.
    fn emit_variant_dispatch(
        &mut self,
        resolve: &Resolve,
        variant_type: &Type,
        arm_types: &[Option<Type>],
    ) {
        // Joined flat: [disc, ...joined_payload].
        let joined = flat_types(resolve, variant_type, None).unwrap_or_else(|| {
            panic!(
                "variant flat must fit in MAX_FLAT_PARAMS ({}) — larger variants are invalid \
                 per the canonical ABI spec",
                Resolve::MAX_FLAT_PARAMS
            )
        });
        assert!(
            !joined.is_empty(),
            "variant joined flat must include at least a discriminant"
        );
        let joined_payload: Vec<WasmType> = joined[1..].to_vec();

        // Allocate a fresh disc local for this variant. Sharing one
        // across nested variants would cause an inner emit to
        // overwrite the outer's disc before the outer's final
        // re-push reads it back.
        let disc_local = self.alloc_local(ValType::I32);
        // The disc is on top of the wasm stack on entry; park it.
        self.emit_one(Instruction::LocalSet(disc_local));

        // Allocate fresh locals for each joined payload slot. These
        // are per-variant; nested variants allocate their own sets.
        let payload_locals: Vec<u32> = joined_payload
            .iter()
            .map(|wt| self.alloc_local(wasm_type_to_val(*wt)))
            .collect();

        // Pop the arm blocks (most recent n, in arm order).
        let n = arm_types.len();
        let start = self
            .completed_blocks
            .len()
            .checked_sub(n)
            .expect("fewer captured arm blocks than arms");
        let arm_blocks: Vec<CompletedBlock> = self.completed_blocks.drain(start..).collect();

        // Compute arm natural flats.
        let arm_flats: Vec<Vec<WasmType>> = arm_types
            .iter()
            .map(|opt_ty| match opt_ty {
                None => Vec::new(),
                Some(ty) => flat_types(resolve, ty, None).unwrap_or_else(|| {
                    panic!(
                        "arm flat must fit in MAX_FLAT_PARAMS ({}) — larger arms are invalid \
                         per the canonical ABI spec",
                        Resolve::MAX_FLAT_PARAMS
                    )
                }),
            })
            .collect();

        // Emit nested blocks: $end, $default, $case_n-1, ..., $case_0.
        self.emit_one(Instruction::Block(BlockType::Empty)); // $end
        self.emit_one(Instruction::Block(BlockType::Empty)); // $default
        for _ in 0..n {
            self.emit_one(Instruction::Block(BlockType::Empty)); // $case_i
        }
        // br_table dispatch inside the innermost block. Label `i`
        // exits `i + 1` blocks, landing on arm `i`'s body; the
        // default label `n` lands on the trap emitted after the
        // last arm.
        self.emit_one(Instruction::LocalGet(disc_local));
        let table: Cow<'static, [u32]> = Cow::Owned((0..n as u32).collect());
        self.emit_one(Instruction::BrTable(table, n as u32));
        self.emit_one(Instruction::End); // close $case_0

        // Emit each arm body, widening + stashing into payload_locals.
        // The widening loop's bounds come from `arm_flat.len()` — the
        // count of values the arm body leaves on the *wasm* value
        // stack — not from the generator's operand-stack view, which
        // collapses compound types via aggregate lifts like
        // `RecordLift`.
        for (i, arm) in arm_blocks.iter().enumerate() {
            let arm_flat = &arm_flats[i];

            // Run the recorded arm body — pushes arm's natural flat
            // on the wasm value stack.
            for inst in &arm.body {
                self.emit_one(inst.clone());
            }

            // Widen from top of stack down. Each pop-widen-store
            // sequence peels one value off, so the ORDER is reverse
            // of the flat layout (top-of-stack = last-pushed).
            for j in (0..arm_flat.len()).rev() {
                self.emit_bitcast(&cast(arm_flat[j], joined_payload[j]));
                self.emit_one(Instruction::LocalSet(payload_locals[j]));
            }

            // Zero-pad any joined payload slots this arm didn't fill.
            for j in arm_flat.len()..joined_payload.len() {
                self.emit_const_zero(joined_payload[j]);
                self.emit_one(Instruction::LocalSet(payload_locals[j]));
            }

            // br $end. Depth: after case_i's End, we're inside
            //   case_{i+1}, ..., case_{n-1}, $default, $end
            // → (n-1-i) + 2 enclosing blocks, so $end is at depth
            // (n-1-i) + 1 = n - i.
            let depth = (n - i) as u32;
            self.emit_one(Instruction::Br(depth));
            // Close this case's block.
            self.emit_one(Instruction::End);
        }

        // After the loop's n Ends, $case_0 / ... / $case_{n-1} /
        // $default are all closed; control falls into $end's body
        // area. Emit the default-path trap here (runs when disc was
        // out of range, since all valid cases br'd to $end), then
        // close $end.
        self.emit_one(Instruction::Unreachable);
        self.emit_one(Instruction::End); // close $end

        // Re-push [disc, ...payload] to form the joined flat on the stack.
        self.emit_one(Instruction::LocalGet(disc_local));
        for idx in &payload_locals {
            self.emit_one(Instruction::LocalGet(*idx));
        }
    }
}

/// The eight load-instruction shapes the canonical ABI reads from
/// memory. Collapses wit-bindgen-core's 10-way split (with Pointer /
/// Length duplicates) to the actual wasm instructions.
#[derive(Clone, Copy)]
enum LoadKind {
    /// Full-width `i32.load`.
    I32Load,
    /// `i32.load8_u`.
    I32Load8U,
    /// `i32.load8_s`.
    I32Load8S,
    /// `i32.load16_u`.
    I32Load16U,
    /// `i32.load16_s`.
    I32Load16S,
    /// Full-width `i64.load`.
    I64Load,
    /// `f32.load`.
    F32Load,
    /// `f64.load`.
    F64Load,
}

impl LoadKind {
    /// Build the `wasm_encoder` load instruction for this kind using
    /// the supplied memory argument.
    fn to_instruction(self, mem_arg: MemArg) -> Instruction<'static> {
        // Every load variant of `Instruction` is a one-argument tuple
        // constructor, so pick the constructor first and apply it once.
        let build: fn(MemArg) -> Instruction<'static> = match self {
            Self::I32Load => Instruction::I32Load,
            Self::I32Load8U => Instruction::I32Load8U,
            Self::I32Load8S => Instruction::I32Load8S,
            Self::I32Load16U => Instruction::I32Load16U,
            Self::I32Load16S => Instruction::I32Load16S,
            Self::I64Load => Instruction::I64Load,
            Self::F32Load => Instruction::F32Load,
            Self::F64Load => Instruction::F64Load,
        };
        build(mem_arg)
    }

    /// Natural alignment in log2 bytes, per the canonical ABI's
    /// memory-alignment rules for each load width.
    fn natural_align_log2(self) -> u32 {
        // Width of the memory access in bytes; always a power of two,
        // so its log2 is the number of trailing zero bits.
        let access_bytes: u32 = match self {
            Self::I32Load8U | Self::I32Load8S => 1,
            Self::I32Load16U | Self::I32Load16S => 2,
            Self::I32Load | Self::F32Load => 4,
            Self::I64Load | Self::F64Load => 8,
        };
        access_bytes.trailing_zeros()
    }
}

/// The six store-instruction shapes the canonical ABI writes to
/// memory for scalar values. Mirror of [`LoadKind`], minus the
/// signed/unsigned split: each narrow store has a single form.
#[derive(Clone, Copy)]
enum StoreKind {
    I32Store,
    I32Store8,
    I32Store16,
    I64Store,
    F32Store,
    F64Store,
}

impl StoreKind {
    /// Build the `wasm_encoder` store instruction for this kind using
    /// the supplied memory argument.
    fn to_instruction(self, mem_arg: MemArg) -> Instruction<'static> {
        // Every store variant of `Instruction` is a one-argument tuple
        // constructor, so pick the constructor first and apply it once.
        let build: fn(MemArg) -> Instruction<'static> = match self {
            Self::I32Store => Instruction::I32Store,
            Self::I32Store8 => Instruction::I32Store8,
            Self::I32Store16 => Instruction::I32Store16,
            Self::I64Store => Instruction::I64Store,
            Self::F32Store => Instruction::F32Store,
            Self::F64Store => Instruction::F64Store,
        };
        build(mem_arg)
    }

    /// Natural alignment in log2 bytes, per the canonical ABI's
    /// memory-alignment rules for each store width.
    fn natural_align_log2(self) -> u32 {
        // Width of the memory access in bytes; always a power of two,
        // so its log2 is the number of trailing zero bits.
        let access_bytes: u32 = match self {
            Self::I32Store8 => 1,
            Self::I32Store16 => 2,
            Self::I32Store | Self::F32Store => 4,
            Self::I64Store | Self::F64Store => 8,
        };
        access_bytes.trailing_zeros()
    }

    /// Wasm value-stack type the store consumes — drives the
    /// [`WasmEncoderBindgen::store_tmp_by_valtype`] scratch lookup.
    /// The narrow `i32.store{8,16}` variants still consume an `i32`
    /// from the stack; the truncation happens in the store opcode.
    fn value_valtype(self) -> ValType {
        match self {
            Self::F64Store => ValType::F64,
            Self::F32Store => ValType::F32,
            Self::I64Store => ValType::I64,
            // Full-width and narrow i32 stores all take an i32 operand.
            Self::I32Store | Self::I32Store8 | Self::I32Store16 => ValType::I32,
        }
    }
}

impl Bindgen for WasmEncoderBindgen<'_> {
    /// Operands carry no information: the generator's operand stack
    /// is only used to track counts, never identities.
    type Operand = ();

    /// Translate one abstract canonical-ABI instruction into wasm
    /// instructions, and push the instruction's declared number of
    /// placeholder results onto `results`.
    ///
    /// `operands` is consulted only by the `Bitcasts` arm (for its
    /// length); every other arm ignores it because `Operand = ()`.
    ///
    /// # Panics
    ///
    /// Panics if an instruction that consumes a captured block
    /// (`MapLower`, `MapLift`, the fixed-length-list memory ops) fires
    /// without a completed block available, if `IterBasePointer` or
    /// `VariantPayloadName` fires outside an active block, or if the
    /// instruction has no arm here (`unimplemented!`).
    fn emit(
        &mut self,
        resolve: &Resolve,
        inst: &AbiInst<'_>,
        operands: &mut Vec<()>,
        results: &mut Vec<()>,
    ) {
        // Most of our arms don't look at operand/results contents —
        // Operand = () carries no info. We still must push the
        // declared number of results, which `produce_n` handles.
        match inst {
            // ── Memory loads ────────────────────────────────────
            // Wasm32: Pointer and Length are both i32, so they share
            // the plain i32 load.
            AbiInst::I32Load { offset }
            | AbiInst::PointerLoad { offset }
            | AbiInst::LengthLoad { offset } => {
                self.emit_load(*offset, LoadKind::I32Load);
                produce_n(results, 1);
            }
            AbiInst::I32Load8U { offset } => {
                self.emit_load(*offset, LoadKind::I32Load8U);
                produce_n(results, 1);
            }
            AbiInst::I32Load8S { offset } => {
                self.emit_load(*offset, LoadKind::I32Load8S);
                produce_n(results, 1);
            }
            AbiInst::I32Load16U { offset } => {
                self.emit_load(*offset, LoadKind::I32Load16U);
                produce_n(results, 1);
            }
            AbiInst::I32Load16S { offset } => {
                self.emit_load(*offset, LoadKind::I32Load16S);
                produce_n(results, 1);
            }
            AbiInst::I64Load { offset } => {
                self.emit_load(*offset, LoadKind::I64Load);
                produce_n(results, 1);
            }
            AbiInst::F32Load { offset } => {
                self.emit_load(*offset, LoadKind::F32Load);
                produce_n(results, 1);
            }
            AbiInst::F64Load { offset } => {
                self.emit_load(*offset, LoadKind::F64Load);
                produce_n(results, 1);
            }

            // ── Scalar "lift" instructions: no-op on wasm side ──
            // The wasm value loaded by the preceding Load is already
            // the canonical representation; the interface-type cast
            // is a source-language concept we don't model.
            AbiInst::BoolFromI32
            | AbiInst::S8FromI32
            | AbiInst::U8FromI32
            | AbiInst::S16FromI32
            | AbiInst::U16FromI32
            | AbiInst::S32FromI32
            | AbiInst::U32FromI32
            | AbiInst::S64FromI64
            | AbiInst::U64FromI64
            | AbiInst::CharFromI32
            | AbiInst::F32FromCoreF32
            | AbiInst::F64FromCoreF64 => {
                produce_n(results, 1);
            }

            // ── Scalar lift-to-flat (lower direction) ───────────
            // Pulls the next flat slot off the wrapper export's wasm
            // function params via `param_flat_locals`. The interface-
            // type → wasm-type "From" arms are no-ops at the wasm
            // layer (sign/width narrowing is a source-language
            // concept; `i32.store8` etc. handles the actual
            // truncation downstream), so each just emits one
            // `local.get`.
            AbiInst::I32FromBool
            | AbiInst::I32FromS8
            | AbiInst::I32FromU8
            | AbiInst::I32FromS16
            | AbiInst::I32FromU16
            | AbiInst::I32FromS32
            | AbiInst::I32FromU32
            | AbiInst::I64FromS64
            | AbiInst::I64FromU64
            | AbiInst::I32FromChar
            | AbiInst::CoreF32FromF32
            | AbiInst::CoreF64FromF64 => {
                self.emit_get_flat_slot();
                produce_n(results, 1);
            }

            // ── Memory stores ───────────────────────────────────
            // Stores produce no results. Pointer / Length lower as
            // i32 on wasm32, so they share the plain i32 store.
            AbiInst::I32Store { offset }
            | AbiInst::PointerStore { offset }
            | AbiInst::LengthStore { offset } => {
                self.emit_store(*offset, StoreKind::I32Store);
            }
            AbiInst::I32Store8 { offset } => {
                self.emit_store(*offset, StoreKind::I32Store8);
            }
            AbiInst::I32Store16 { offset } => {
                self.emit_store(*offset, StoreKind::I32Store16);
            }
            AbiInst::I64Store { offset } => {
                self.emit_store(*offset, StoreKind::I64Store);
            }
            AbiInst::F32Store { offset } => {
                self.emit_store(*offset, StoreKind::F32Store);
            }
            AbiInst::F64Store { offset } => {
                self.emit_store(*offset, StoreKind::F64Store);
            }

            // ── Aggregate lowers (lower direction) ──────────────
            // Records and tuples decompose 1 abstract value into N
            // abstract fields/elements; the fields are then lowered
            // individually, each firing its own scalar lift-to-flat
            // (`local.get $cursor++`). Enum / flags lowering yields
            // the integer discriminant directly: same shape as a
            // scalar lift, just typed as an interface enum / flags
            // value at the source-language layer.
            AbiInst::RecordLower { record, .. } => {
                produce_n(results, record.fields.len());
            }
            AbiInst::TupleLower { tuple, .. } => {
                produce_n(results, tuple.types.len());
            }
            AbiInst::EnumLower { .. } => {
                self.emit_get_flat_slot();
                produce_n(results, 1);
            }
            AbiInst::FlagsLower { flags, .. } => {
                // Component Model caps `flags` at 32 members → repr
                // is always 1 wasm slot today. The U32(n>1) branch is
                // defensive against a future spec relaxation; mirrors
                // wit-parser's instruction arity contract.
                let n = match flags.repr() {
                    FlagsRepr::U8 | FlagsRepr::U16 => 1usize,
                    FlagsRepr::U32(n) => n,
                };
                for _ in 0..n {
                    self.emit_get_flat_slot();
                }
                produce_n(results, n);
            }

            // ── Pass-through lowers (lower direction) ──────────
            // Host's canon-lower already deposited backing data; we
            // just thread (ptr, len) / i32 through. `realloc` field
            // intentionally ignored — see `build_lower_params_to_memory`.
            AbiInst::StringLower { .. } | AbiInst::ListCanonLower { .. } => {
                self.emit_get_flat_slot(); // ptr
                self.emit_get_flat_slot(); // len
                produce_n(results, 2);
            }
            AbiInst::HandleLower { .. }
            | AbiInst::ErrorContextLower
            | AbiInst::FutureLower { .. }
            | AbiInst::StreamLower { .. } => {
                self.emit_get_flat_slot();
                produce_n(results, 1);
            }
            // Map at the wrapper-flat boundary is (ptr, len), same as a
            // list. Upstream's `lower(map)` wraps a kv-pair-write block
            // that's meaningless in our pass-through model; discard it
            // and roll cursor back before reading (ptr, len).
            AbiInst::MapLower { .. } => {
                let discarded = self
                    .completed_blocks
                    .pop()
                    .expect("MapLower without matching block");
                self.flat_cursor = discarded.start_cursor;
                self.emit_get_flat_slot(); // ptr
                self.emit_get_flat_slot(); // len
                produce_n(results, 2);
            }
            AbiInst::IterMapKey { .. } | AbiInst::IterMapValue { .. } => {
                produce_n(results, 1);
            }

            // ── Constants / placeholders used by aggregate lowers ──
            // I32Const fires for variant-arm disc constants;
            // IterElem / VariantPayloadName push abstract-operand
            // placeholders that the cursor model doesn't materialize.
            AbiInst::I32Const { val } => {
                self.emit_one(Instruction::I32Const(*val));
                produce_n(results, 1);
            }
            AbiInst::IterElem { .. } => {
                produce_n(results, 1);
            }
            AbiInst::VariantPayloadName => {
                // Tag the arm so `finish_block_body` rewinds its cursor.
                self.block_buffers
                    .last_mut()
                    .expect("VariantPayloadName outside a block")
                    .is_variant_arm = true;
                produce_n(results, 1);
            }

            // ── Fixed-list lower decomposition ────────────────
            // 1 → size abstract decomposition; the per-element
            // `lower(elem_ty)` calls that follow advance the cursor.
            AbiInst::FixedLengthListLower { size, .. } => {
                produce_n(results, *size as usize);
            }

            // ── Variant / option / result lowers (write_to_memory ctx) ──
            // `results` is empty in the write_to_memory path — arms
            // store directly to memory. The dispatch reads disc from
            // `param_flat_locals[variant_start]` and `br_table`s over
            // per-arm captured blocks.
            AbiInst::VariantLower {
                variant,
                ty,
                results: r,
                ..
            } => {
                debug_assert!(
                    r.is_empty(),
                    "VariantLower in lower-flat (non-empty results) context not yet supported",
                );
                let arms: Vec<Option<Type>> = variant.cases.iter().map(|c| c.ty).collect();
                self.emit_variant_dispatch_for_lower(resolve, &Type::Id(*ty), &arms);
                produce_n(results, r.len());
            }
            AbiInst::OptionLower {
                payload,
                ty,
                results: r,
            } => {
                debug_assert!(
                    r.is_empty(),
                    "OptionLower in lower-flat (non-empty results) context not yet supported",
                );
                // An option dispatches like a two-case variant:
                // case 0 = none (no payload), case 1 = some(payload).
                let arms = vec![None, Some(**payload)];
                self.emit_variant_dispatch_for_lower(resolve, &Type::Id(*ty), &arms);
                produce_n(results, r.len());
            }
            AbiInst::ResultLower {
                result,
                ty,
                results: r,
            } => {
                debug_assert!(
                    r.is_empty(),
                    "ResultLower in lower-flat (non-empty results) context not yet supported",
                );
                // A result dispatches like a two-case variant:
                // case 0 = ok, case 1 = err.
                let arms = vec![result.ok, result.err];
                self.emit_variant_dispatch_for_lower(resolve, &Type::Id(*ty), &arms);
                produce_n(results, r.len());
            }

            // ── Bitcasts ───────────────────────────────────────
            // One cast is emitted per entry in `casts`; the result
            // count mirrors the number of operands handed in.
            AbiInst::Bitcasts { casts } => {
                for bc in casts.iter() {
                    self.emit_bitcast(bc);
                }
                produce_n(results, operands.len());
            }

            // ── Aggregate lifts: no-op ─────────────────────────
            // The N wasm values on the value stack already represent
            // the aggregated value (record / tuple / handle / flags /
            // enum / future / stream / error-context / fixed-list).
            // No wasm emission; just collapse N operands → 1.
            AbiInst::RecordLift { .. }
            | AbiInst::TupleLift { .. }
            | AbiInst::HandleLift { .. }
            | AbiInst::FutureLift { .. }
            | AbiInst::StreamLift { .. }
            | AbiInst::EnumLift { .. }
            | AbiInst::FlagsLift { .. }
            | AbiInst::ErrorContextLift
            | AbiInst::FixedLengthListLift { .. }
            | AbiInst::StringLift
            | AbiInst::ListCanonLift { .. }
            | AbiInst::ListLift { .. } => {
                produce_n(results, 1);
            }
            // `is_list_canonical = true` short-circuits the list lift,
            // but `lift(Map)` unconditionally pushes a per-entry block
            // and emits `MapLift`. Drop the block (we have our own
            // per-element loop) and collapse operands.
            AbiInst::MapLift { .. } => {
                self.completed_blocks
                    .pop()
                    .expect("MapLift without matching block");
                produce_n(results, 1);
            }

            // ── Variant / option / result lifts ────────────────
            AbiInst::VariantLift { variant, ty, .. } => {
                let arms: Vec<Option<Type>> = variant.cases.iter().map(|c| c.ty).collect();
                self.emit_variant_dispatch(resolve, &Type::Id(*ty), &arms);
                produce_n(results, 1);
            }
            AbiInst::OptionLift { payload, ty } => {
                let arms = vec![None, Some(**payload)];
                self.emit_variant_dispatch(resolve, &Type::Id(*ty), &arms);
                produce_n(results, 1);
            }
            AbiInst::ResultLift { result, ty } => {
                let arms = vec![result.ok, result.err];
                self.emit_variant_dispatch(resolve, &Type::Id(*ty), &arms);
                produce_n(results, 1);
            }

            // ── Fixed-size list lift ───────────────────────────
            //
            // The Canonical ABI treats `list<T, N>` (fixed-size)
            // fundamentally differently from `list<T>` (dynamic):
            //
            // | Type       | Flat form        | In memory            |
            // |------------|------------------|----------------------|
            // | list<T>    | `[i32 ptr, i32 len]` | elements at `*ptr` |
            // | list<T, N> | `N × flat(T)` inlined | N contiguous elements |
            //
            // The fixed-size variant is semantically a
            // `tuple<T, …, T>` (N times), so it flattens the same
            // way tuples do — every element becomes a value on the
            // wasm stack (or in a retptr buffer if `N × flat(T)` >
            // `Resolve::MAX_FLAT_PARAMS`). The payoff is zero-copy passing
            // of small fixed arrays (hashes, UUIDs, 3D vectors,
            // small buffers) without the realloc + pointer-chase
            // that dynamic lists require.
            //
            // When a fixed-size list lives inside a container
            // (record field, async result buffer, …) it's stored
            // as N contiguous element slots in memory. This
            // instruction materializes the inlined flat form by
            // reading N elements out.
            //
            // Emission strategy: the generator captures the
            // per-element read as a block body (with
            // `IterBasePointer` marking where the element base
            // address is used), then fires this instruction to
            // iterate. We unroll at emission time: allocate an
            // iter local, initialize it to the list's base
            // address, and replay the block body once per element
            // with the local advanced by `elem_size` each step.
            // Loads inside the block body reference the iter
            // local via [`current_addr_local`] so they hit the
            // right element.
            //
            // Dynamic lists (`list<T>` / `TypeDefKind::List`) hit
            // the `PointerLoad` + `LengthLoad` pair in
            // `read_list_from_memory` and then `ListCanonLift`
            // above, which is a no-op in our emit — the `(ptr,
            // len)` pair is already the flat form.
            AbiInst::IterBasePointer => {
                // Lazily allocate the iteration address local on the
                // current active block — the fixed-length-list memory
                // arms read it off the completed block below. A repeat
                // firing inside the same block reuses the first local.
                let active = self
                    .block_buffers
                    .last_mut()
                    .expect("IterBasePointer must fire inside a block");
                if active.iter_addr_local.is_none() {
                    active.iter_addr_local = Some(self.indices.alloc_local(ValType::I32));
                }
                produce_n(results, 1);
            }
            // Mirror of FixedLengthListLiftFromMemory for lowering:
            // replay the captured per-element write N times, shifting
            // the body's cursor reads by `i * elem_flat_width` per iter.
            AbiInst::FixedLengthListLowerToMemory { element, size, .. } => {
                let elem_size = self.sizes.size(element).size_wasm32() as u32;
                let block = self
                    .completed_blocks
                    .pop()
                    .expect("FixedLengthListLowerToMemory without a matching block");
                let iter_addr = block.iter_addr_local.expect(
                    "fixed-size-list block must have allocated an iter_addr_local via \
                     IterBasePointer",
                );
                // Seed the iteration local with the enclosing base
                // address before the first element is written.
                let parent_addr = self.current_addr_local();
                self.emit_one(Instruction::LocalGet(parent_addr));
                self.emit_one(Instruction::LocalSet(iter_addr));
                // Number of flat slots one element consumed while the
                // body was captured.
                let elem_flat = block.end_cursor - block.start_cursor;
                let block_range = (block.start_cursor, block.end_cursor);
                for i in 0..*size {
                    if i > 0 {
                        // iter_addr += elem_size
                        self.emit_one(Instruction::LocalGet(iter_addr));
                        self.emit_one(Instruction::I32Const(elem_size as i32));
                        self.emit_one(Instruction::I32Add);
                        self.emit_one(Instruction::LocalSet(iter_addr));
                    }
                    let new_base = block.start_cursor + i * elem_flat;
                    self.replay_block_remapped(&block.body, block_range, new_base);
                }
                // Cursor was already advanced once during capture
                // (to end_cursor); ensure it reflects the full size *
                // elem_flat advance for any code that follows.
                self.flat_cursor = block.start_cursor + *size * elem_flat;
                // Writes to memory only: nothing is left for the generator.
                produce_n(results, 0);
            }
            AbiInst::FixedLengthListLiftFromMemory { element, size, .. } => {
                let elem_size = self.sizes.size(element).size_wasm32() as u32;
                let block = self
                    .completed_blocks
                    .pop()
                    .expect("FixedLengthListLiftFromMemory without a matching block");
                let iter_addr = block.iter_addr_local.expect(
                    "fixed-size-list block must have allocated an iter_addr_local via \
                     IterBasePointer",
                );
                // Initialize iter_addr_local to the current base —
                // the parent's address (outer addr_local, or a
                // parent iteration's iter local for nested lists).
                let parent_addr = self.current_addr_local();
                self.emit_one(Instruction::LocalGet(parent_addr));
                self.emit_one(Instruction::LocalSet(iter_addr));
                for i in 0..*size {
                    if i > 0 {
                        // Advance by elem_size. `elem_size == 0` is
                        // possible for zero-sized records; the add is
                        // a no-op in that case but harmless.
                        self.emit_one(Instruction::LocalGet(iter_addr));
                        self.emit_one(Instruction::I32Const(elem_size as i32));
                        self.emit_one(Instruction::I32Add);
                        self.emit_one(Instruction::LocalSet(iter_addr));
                    }
                    // Replay the captured per-element read verbatim.
                    for inst in &block.body {
                        self.emit_one(inst.clone());
                    }
                }
                produce_n(results, 1);
            }

            // ── Unsupported instructions ───────────────────────
            // Anything without an arm above is outside the subset of
            // memory lift / lower instructions this generator handles.
            other => unimplemented!(
                "WasmEncoderBindgen::emit hit unsupported instruction: {:?}. \
                 This path is only exercised by lift_from_memory; other entry \
                 points aren't supported.",
                other
            ),
        }
    }

    /// Not supported by this generator: always panics via
    /// `unimplemented!`.
    fn return_pointer(&mut self, _size: ArchitectureSize, _align: Alignment) {
        unimplemented!(
            "return_pointer is only called on lowering paths; \
             lift_from_memory never invokes it"
        );
    }

    /// Begin capturing a nested block; delegates to `start_block`.
    fn push_block(&mut self) {
        self.start_block();
    }

    /// Finish the block currently being captured; delegates to
    /// `finish_block_body` after discarding the generator's operands.
    fn finish_block(&mut self, operand: &mut Vec<()>) {
        // The generator's operand-stack count at block exit isn't
        // meaningful for our wasm emission — see `CompletedBlock`.
        operand.clear();
        self.finish_block_body();
    }

    /// The size/alignment table this generator was constructed with.
    fn sizes(&self) -> &SizeAlign {
        self.sizes
    }

    /// Every list element type is reported as canonical, regardless
    /// of `_resolve` / `_element`.
    fn is_list_canonical(&self, _resolve: &Resolve, _element: &Type) -> bool {
        // For lift_from_memory of a list, the canonical representation
        // means we stop at `(ptr, len)` on the stack rather than iterate
        // each element. That's what the adapter wants — a `(ptr, len)`
        // pair is two i32s and matches the joined flat for the list
        // type. Return true unconditionally.
        true
    }
}

/// Append `n` unit placeholders to `results`, leaving any entries
/// already present untouched. Used to mirror the result arity that
/// each `Instruction` variant declares, so the generator's operand
/// bookkeeping stays consistent.
fn produce_n(results: &mut Vec<()>, n: usize) {
    results.extend(std::iter::repeat(()).take(n));
}

#[cfg(test)]
impl WasmEncoderBindgen<'_> {
    /// Test-only: inspect the accumulated main-buffer instructions
    /// without draining them into a Function.
    ///
    /// Borrows `self.main` as a slice; the bindgen is left unchanged.
    pub(crate) fn instructions(&self) -> &[Instruction<'static>] {
        &self.main
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use wit_bindgen_core::abi::{lift_from_memory, lower_to_memory};
    use wit_parser::{Docs, Field, Record, Span, Stability, TypeDef, TypeDefKind, TypeOwner};

    /// Helper: build a default [`SizeAlign`] and `fill` it from `resolve`.
    fn new_sizes(resolve: &Resolve) -> SizeAlign {
        let mut table = SizeAlign::default();
        table.fill(resolve);
        table
    }

    /// Helper: number of instructions in the bindgen's main buffer for
    /// which `pred` returns `true`.
    fn count<F>(bg: &WasmEncoderBindgen<'_>, pred: F) -> usize
    where
        F: Fn(&Instruction<'static>) -> bool,
    {
        let mut hits = 0;
        for inst in bg.instructions() {
            if pred(inst) {
                hits += 1;
            }
        }
        hits
    }

    /// Lifting a bare `u32` emits exactly one `local.get` and one
    /// `i32.load`.
    #[test]
    fn lift_u32_emits_one_load() {
        let resolve = Resolve::default();
        let size_table = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&size_table, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &Type::U32);

        let local_gets = count(&bindgen, |inst| matches!(inst, Instruction::LocalGet(_)));
        assert_eq!(local_gets, 1);
        let i32_loads = count(&bindgen, |inst| matches!(inst, Instruction::I32Load(_)));
        assert_eq!(i32_loads, 1);
    }

    /// Lifting a `u64` emits one `i64.load`, addressed through the
    /// address local handed to the constructor (index 3 here).
    #[test]
    fn lift_u64_emits_i64_load() {
        let resolve = Resolve::default();
        let size_table = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(4);
        let mut bg = WasmEncoderBindgen::new(&size_table, 3, &mut locals);
        lift_from_memory(&resolve, &mut bg, (), &Type::U64);

        let i64_loads = count(&bg, |inst| matches!(inst, Instruction::I64Load(_)));
        assert_eq!(i64_loads, 1);
        // The load's address operand must come from local 3.
        assert!(bg
            .instructions()
            .iter()
            .any(|i| matches!(i, Instruction::LocalGet(3))));
    }

    /// A record `{ a: u32, b: u64, c: u8 }` lifts with one load per
    /// field, each load width matching the field's type.
    #[test]
    fn lift_record_emits_one_load_per_field() {
        // Local builder so the three fields differ only in name / type.
        let field = |name: &str, ty: Type| Field {
            name: name.to_string(),
            ty,
            docs: Docs::default(),
            span: Span::default(),
        };
        let record = Record {
            fields: vec![
                field("a", Type::U32),
                field("b", Type::U64),
                field("c", Type::U8),
            ],
        };

        let mut resolve = Resolve::default();
        let record_id = resolve.types.alloc(TypeDef {
            name: Some("r".to_string()),
            kind: TypeDefKind::Record(record),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        });
        let size_table = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&size_table, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &Type::Id(record_id));

        // 3 fields → 3 load instructions, each paired with a LocalGet
        let local_gets = count(&bindgen, |inst| matches!(inst, Instruction::LocalGet(_)));
        assert_eq!(local_gets, 3);
        let i32_loads = count(&bindgen, |inst| matches!(inst, Instruction::I32Load(_)));
        assert_eq!(i32_loads, 1);
        let i64_loads = count(&bindgen, |inst| matches!(inst, Instruction::I64Load(_)));
        assert_eq!(i64_loads, 1);
        let u8_loads = count(&bindgen, |inst| matches!(inst, Instruction::I32Load8U(_)));
        assert_eq!(u8_loads, 1);
    }

    /// A `string` lifts as a `(ptr, len)` pair, i.e. two `i32.load`s.
    #[test]
    fn lift_string_emits_ptr_len_loads() {
        let resolve = Resolve::default();
        let size_table = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&size_table, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &Type::String);

        // One i32 load for the pointer, one for the length.
        let i32_loads = count(&bindgen, |inst| matches!(inst, Instruction::I32Load(_)));
        assert_eq!(i32_loads, 2);
    }

    /// `result<u32, u32>` — both arms flatten to `[i32]`, so the
    /// joined flat is `[i32 (disc), i32 (payload)]` and neither arm
    /// needs a widening bitcast. Checks the shape of the emitted
    /// block / `br_table` dispatch and the locals it allocates.
    #[test]
    fn lift_homogeneous_result_emits_dispatch_structure() {
        let kind = TypeDefKind::Result(wit_parser::Result_ {
            ok: Some(Type::U32),
            err: Some(Type::U32),
        });
        let mut resolve = Resolve::default();
        let result_id = resolve.types.alloc(TypeDef {
            name: Some("r".to_string()),
            kind,
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        });
        let size_table = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&size_table, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &Type::Id(result_id));

        // The discriminant is read with a single 1-byte load.
        let disc_loads = count(&bindgen, |inst| matches!(inst, Instruction::I32Load8U(_)));
        assert_eq!(disc_loads, 1);
        // Dispatch structure: 4 nested blocks ($end, $default,
        // $case_0, $case_1), one br_table, one default-path trap.
        let blocks = count(&bindgen, |inst| matches!(inst, Instruction::Block(_)));
        assert_eq!(blocks, 4);
        let br_tables = count(&bindgen, |inst| matches!(inst, Instruction::BrTable(_, _)));
        assert_eq!(br_tables, 1);
        let traps = count(&bindgen, |inst| matches!(inst, Instruction::Unreachable));
        assert_eq!(traps, 1);

        // Consume the bindgen so `locals` can be inspected.
        let _drained = bindgen.into_instructions();
        // Bindgen allocated exactly one disc local + one payload local (both i32).
        assert_eq!(locals.freeze().locals, vec![ValType::I32, ValType::I32]);
    }

    /// `result<u8, u64>` — the arms disagree on payload type, so the
    /// joined flat is `[i32 (disc), i64 (payload)]`. The `Ok` arm's
    /// `[i32]` (u8 loaded as i32) must be widened with
    /// `i64.extend_i32_u`; the `Err` arm is already `[i64]`.
    #[test]
    fn lift_heterogeneous_result_emits_widening() {
        let kind = TypeDefKind::Result(wit_parser::Result_ {
            ok: Some(Type::U8),
            err: Some(Type::U64),
        });
        let mut resolve = Resolve::default();
        let result_id = resolve.types.alloc(TypeDef {
            name: Some("r".to_string()),
            kind,
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        });
        let size_table = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&size_table, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &Type::Id(result_id));

        // Widening bitcast i32 → i64 for the ok arm.
        let widenings = count(&bindgen, |inst| matches!(inst, Instruction::I64ExtendI32U));
        assert_eq!(
            widenings, 1,
            "ok (u8) arm should widen to i64 to match joined payload"
        );

        // Consume the bindgen so `locals` can be inspected.
        let _drained = bindgen.into_instructions();
        // Disc local (i32) + payload local (i64).
        assert_eq!(locals.freeze().locals, vec![ValType::I32, ValType::I64]);
    }

    /// Lifting `option<u32>`: the None arm carries no payload, so its
    /// joined_payload slot has to be filled with a zero constant.
    #[test]
    fn lift_option_pads_none_arm_with_zero() {
        let mut resolve = Resolve::default();
        let option_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("o")),
            kind: TypeDefKind::Option(Type::U32),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&sizes, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &option_ty);

        // The payload-less None arm shows up as an `i32.const 0` pad.
        let has_zero_pad = bindgen
            .instructions()
            .iter()
            .any(|inst| matches!(inst, Instruction::I32Const(0)));
        assert!(
            has_zero_pad,
            "option's None arm should emit i32.const 0 to pad joined payload"
        );
    }

    /// Lifting `result<string, u64>` needs a `Pointer → PointerOrI64`
    /// cast at payload position 0. The ok arm flattens to
    /// `[Pointer, Length]`, the err arm to `[I64]`, and joining them
    /// position by position gives `[PointerOrI64, Length]`. The ok
    /// arm therefore has to emit `i64.extend_i32_u` so its i32 pointer
    /// fills the joined i64 slot; otherwise the stack type disagrees
    /// with the joined-flat block signature and wasm validation
    /// rejects with "expected i64, found i32".
    #[test]
    fn lift_result_string_u64_widens_pointer_to_pointer_or_i64() {
        let mut resolve = Resolve::default();
        let kind = TypeDefKind::Result(wit_parser::Result_ {
            ok: Some(Type::String),
            err: Some(Type::U64),
        });
        let result_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("r")),
            kind,
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&sizes, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &result_ty);

        // A single `i64.extend_i32_u` is expected: the ok arm widening
        // its Pointer (i32) into the joined PointerOrI64 (i64) at
        // position 0. Length at position 1 is i32 on both sides, and
        // the err arm's I64 → PointerOrI64 is i64→i64, so neither of
        // those contributes an instruction.
        let widenings = count(&bindgen, |inst| matches!(inst, Instruction::I64ExtendI32U));
        assert_eq!(
            widenings, 1,
            "ok (string) arm should widen Pointer to PointerOrI64"
        );

        // Joined flat: [disc=i32, PointerOrI64→i64, Length→i32], so the
        // locals are disc(i32), payload[0]=i64, payload[1]=i32.
        let _emitted = bindgen.into_instructions();
        let frozen = locals.freeze();
        assert_eq!(
            frozen.locals,
            vec![ValType::I32, ValType::I64, ValType::I32]
        );
    }

    /// Lifting `list<u32, 4>`, a fixed-size list of four u32s. The
    /// expected output is the iteration init (`LocalGet $parent;
    /// LocalSet $iter`) followed by 4 unrolled element loads, with
    /// the iter local bumped by 4 bytes between consecutive loads.
    #[test]
    fn lift_fixed_size_list_unrolls_n_loads() {
        let mut resolve = Resolve::default();
        let list_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("l")),
            kind: TypeDefKind::FixedLengthList(Type::U32, 4),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&sizes, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &list_ty);

        // One i32 load per element.
        let element_loads = count(&bindgen, |inst| matches!(inst, Instruction::I32Load(_)));
        assert_eq!(element_loads, 4);
        // iter_addr is advanced between iterations 0→1, 1→2 and 2→3;
        // the first iteration reads at the base, so three `I32Add`s.
        let advances = count(&bindgen, |inst| matches!(inst, Instruction::I32Add));
        assert_eq!(advances, 3);
        // Every advance adds `I32Const(4)` (elem_size = 4 for u32).
        let strides = count(&bindgen, |inst| matches!(inst, Instruction::I32Const(4)));
        assert_eq!(strides, 3);

        // Bindgen allocated one i32 local for the iteration address.
        let _emitted = bindgen.into_instructions();
        let frozen = locals.freeze();
        assert_eq!(frozen.locals, vec![ValType::I32]);
    }

    /// A dynamic `list<T>` flattens to `[Pointer, Length]`, exactly
    /// like `string`, so `result<list<T>, u64>` goes through the same
    /// `Pointer → PointerOrI64` widening as the string case. It is a
    /// separate test so that the assertion pins down the list path
    /// should list and string lowering ever diverge.
    #[test]
    fn lift_result_list_u64_widens_pointer_to_pointer_or_i64() {
        let mut resolve = Resolve::default();
        let bytes_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("l")),
            kind: TypeDefKind::List(Type::U8),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let kind = TypeDefKind::Result(wit_parser::Result_ {
            ok: Some(bytes_ty),
            err: Some(Type::U64),
        });
        let result_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("r")),
            kind,
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        let mut locals = LocalsBuilder::new(1);
        let mut bindgen = WasmEncoderBindgen::new(&sizes, 0, &mut locals);
        lift_from_memory(&resolve, &mut bindgen, (), &result_ty);

        let widenings = count(&bindgen, |inst| matches!(inst, Instruction::I64ExtendI32U));
        assert_eq!(
            widenings, 1,
            "ok (list) arm should widen Pointer to PointerOrI64"
        );
    }

    /// Lowering `variant { a(u64), b(u64), c(u64) }`: all three arms
    /// overlap on the same joined-flat payload slot. If the arm-rewind
    /// were missing, arm 2's body would read past `param_flat_count`
    /// and panic at codegen.
    #[test]
    fn lower_variant_arms_share_joined_payload_slot() {
        let mut resolve = Resolve::default();
        // Three cases that differ only by name; each carries a u64.
        let cases: Vec<_> = ["a", "b", "c"]
            .into_iter()
            .map(|label| wit_parser::Case {
                name: label.to_string(),
                ty: Some(Type::U64),
                docs: Docs::default(),
                span: Span::default(),
            })
            .collect();
        let variant_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("v")),
            kind: TypeDefKind::Variant(wit_parser::Variant { cases }),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        // Joined flat = [disc, i64 payload] = 2 wrapper params.
        let mut locals = LocalsBuilder::new(2);
        let addr_local = locals.alloc_local(ValType::I32);
        let mut bindgen =
            WasmEncoderBindgen::new(&sizes, addr_local, &mut locals).with_param_flat_count(2);
        bindgen.emit_set_addr_const(0);
        lower_to_memory(&resolve, &mut bindgen, (), (), &variant_ty);

        let emitted = bindgen.into_instructions();
        let br_tables = emitted
            .iter()
            .filter(|inst| matches!(inst, Instruction::BrTable(..)))
            .count();
        // Number of `local.get` instructions that read local `index`.
        let local_gets = |index: u32| {
            emitted
                .iter()
                .filter(|inst| matches!(inst, Instruction::LocalGet(n) if *n == index))
                .count()
        };
        // A single br_table; the disc is read once; the payload param
        // is pre-loaded once per arm (3 arms → 3 reads of local 1).
        assert_eq!(br_tables, 1);
        assert_eq!(local_gets(0), 1);
        assert_eq!(local_gets(1), 3);
    }

    /// Lowering `option<u32>`: a single `br_table` dispatches over
    /// the two arms, and the payload param is pre-loaded for `Some`
    /// only.
    #[test]
    fn lower_option_emits_dispatch() {
        let mut resolve = Resolve::default();
        let option_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("o")),
            kind: TypeDefKind::Option(Type::U32),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        // option<u32> joined flat = [disc, i32] = 2 slots.
        let mut locals = LocalsBuilder::new(2);
        let addr_local = locals.alloc_local(ValType::I32);
        let mut bindgen =
            WasmEncoderBindgen::new(&sizes, addr_local, &mut locals).with_param_flat_count(2);
        bindgen.emit_set_addr_const(0);
        lower_to_memory(&resolve, &mut bindgen, (), (), &option_ty);

        let emitted = bindgen.into_instructions();
        let br_tables = emitted
            .iter()
            .filter(|inst| matches!(inst, Instruction::BrTable(..)))
            .count();
        // Number of `local.get` instructions that read local `index`.
        let local_gets = |index: u32| {
            emitted
                .iter()
                .filter(|inst| matches!(inst, Instruction::LocalGet(n) if *n == index))
                .count()
        };
        // A single br_table; the disc is read once; the payload is
        // pre-loaded for Some only (None has no payload).
        assert_eq!(br_tables, 1);
        assert_eq!(local_gets(0), 1);
        assert_eq!(local_gets(1), 1);
    }

    /// Lowering a nested variant,
    /// `variant { x(variant { p(u64), q(u64), r(u64) }), y(u8) }`.
    /// The captured body of outer arm 0 holds the inner variant's
    /// whole dispatch; on replay the inner emit's cursor-space
    /// `LocalGet`s are rewritten onto outer arm 0's arm-locals.
    /// Joined flat: outer = `[i32 disc, i32 (inner disc | u8), i64 (u64)]`.
    #[test]
    fn lower_nested_variant_routes_through_replay() {
        let mut resolve = Resolve::default();
        // Inner variant: three u64-carrying cases.
        let u64_case = |label: &str| wit_parser::Case {
            name: label.to_string(),
            ty: Some(Type::U64),
            docs: Docs::default(),
            span: Span::default(),
        };
        let inner_cases = vec![u64_case("p"), u64_case("q"), u64_case("r")];
        let inner_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("inner")),
            kind: TypeDefKind::Variant(wit_parser::Variant { cases: inner_cases }),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        // Outer variant: `x` wraps the inner variant, `y` carries a u8.
        let outer_cases = vec![
            wit_parser::Case {
                name: String::from("x"),
                ty: Some(inner_ty),
                docs: Docs::default(),
                span: Span::default(),
            },
            wit_parser::Case {
                name: String::from("y"),
                ty: Some(Type::U8),
                docs: Docs::default(),
                span: Span::default(),
            },
        ];
        let outer_ty = Type::Id(resolve.types.alloc(TypeDef {
            name: Some(String::from("outer")),
            kind: TypeDefKind::Variant(wit_parser::Variant { cases: outer_cases }),
            owner: TypeOwner::None,
            docs: Docs::default(),
            stability: Stability::default(),
            span: Span::default(),
        }));
        let sizes = new_sizes(&resolve);
        // Outer joined flat = 3 slots: [disc, joined-payload-0, joined-payload-1].
        let mut locals = LocalsBuilder::new(3);
        let addr_local = locals.alloc_local(ValType::I32);
        let mut bindgen =
            WasmEncoderBindgen::new(&sizes, addr_local, &mut locals).with_param_flat_count(3);
        bindgen.emit_set_addr_const(0);
        lower_to_memory(&resolve, &mut bindgen, (), (), &outer_ty);

        let emitted = bindgen.into_instructions();
        // Two br_tables: the outer dispatch plus the replayed inner one.
        let br_tables = emitted
            .iter()
            .filter(|inst| matches!(inst, Instruction::BrTable(..)))
            .count();
        assert_eq!(br_tables, 2);
        // The outer disc is read once at slot 0. The inner arm bodies'
        // cursor-space LocalGet(0)s were remapped, so none survive.
        let slot0_reads = emitted
            .iter()
            .filter(|inst| matches!(inst, Instruction::LocalGet(0)))
            .count();
        assert_eq!(slot0_reads, 1);
    }
}