trueno-gpu 0.4.17

Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc
//! PTX Module and Kernel Builder
//!
//! Provides a fluent builder API for constructing PTX modules and kernels.
//!
//! ## Extension Traits
//!
//! The builder functionality is split into focused extension traits for maintainability:
//!
//! - [`PtxArithmetic`]: Add, sub, mul, fma, dp4a, transcendentals
//! - [`PtxComparison`]: setp operations for predicates
//! - [`PtxMemory`]: Global and shared memory load/store
//! - [`PtxControl`]: Labels, branches, returns, immediate moves
//! - [`PtxSync`]: Barriers, shuffles, warp votes, bit manipulation
//! - [`PtxAtomic`]: Atomic memory operations
//!
//! All traits are automatically implemented for `KernelBuilder` via blanket impls.
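//!
//! ## Example
//!
//! A minimal sketch of the builder flow (the `trueno_gpu::ptx` import path is
//! assumed; kernel and parameter names are illustrative):
//!
//! ```ignore
//! use trueno_gpu::ptx::{PtxKernel, PtxModule, PtxType};
//!
//! let kernel = PtxKernel::new("scale_inplace")
//!     .param(PtxType::U64, "data")
//!     .build(|b| {
//!         let ptr = b.load_param_u64("data");
//!         let x = b.ld_global_f32(ptr);
//!         let two_x = b.add_f32(x, x); // PtxArithmetic trait method
//!         b.st_global_f32(ptr, two_x);
//!         // (a return would normally be emitted via the PtxControl trait)
//!     });
//!
//! let ptx = PtxModule::new()
//!     .target("sm_70")
//!     .add_kernel(kernel)
//!     .emit();
//! assert!(ptx.contains(".visible .entry scale_inplace"));
//! ```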

// Extension trait modules
mod arithmetic;
mod atomic;
mod comparison;
mod control;
mod core;
mod emit;
mod memory;
mod sync;

// Re-export extension traits for easy use
pub use arithmetic::PtxArithmetic;
pub use atomic::PtxAtomic;
pub use comparison::PtxComparison;
pub use control::PtxControl;
pub use core::KernelBuilderCore;
pub use memory::PtxMemory;
pub use sync::PtxSync;

use std::fmt::Write;

use super::instructions::{
    CmpOp, Operand, Predicate, PtxInstruction, PtxOp, RoundingMode, WmmaLayout,
};
use super::registers::{PtxReg, RegisterAllocator, VirtualReg};
use super::types::{PtxStateSpace, PtxType};
use super::{validate_target, validate_version};
use crate::error::Result;

// Emit functions extracted to emit.rs (PMAT-018 domain separation)
use emit::write_instruction;

/// Macro for dp4a operations (in-place variant)
macro_rules! impl_dp4a_inplace {
    ($fn_name:ident, $op:ident, $ty:ident, $doc:expr) => {
        #[doc = $doc]
        pub fn $fn_name(&mut self, acc: VirtualReg, a: VirtualReg, b: VirtualReg) {
            self.instructions.push(
                PtxInstruction::new(PtxOp::$op, PtxType::$ty)
                    .dst(Operand::Reg(acc))
                    .src(Operand::Reg(a))
                    .src(Operand::Reg(b))
                    .src(Operand::Reg(acc)),
            );
        }
    };
}

/// PTX Module builder
#[derive(Debug, Clone)]
pub struct PtxModule {
    /// PTX version (major, minor)
    version: (u32, u32),
    /// Target compute capability (e.g., `sm_70`)
    target: String,
    /// Address size (32 or 64)
    address_size: u32,
    /// Kernels in this module
    kernels: Vec<PtxKernel>,
}

impl PtxModule {
    /// Create a new PTX module with defaults
    #[must_use]
    pub fn new() -> Self {
        Self {
            version: (8, 0),
            target: "sm_70".to_string(),
            address_size: 64,
            kernels: Vec::new(),
        }
    }

    /// Set PTX version
    #[must_use]
    pub fn version(mut self, major: u32, minor: u32) -> Self {
        self.version = (major, minor);
        self
    }

    /// Get PTX version
    #[must_use]
    pub const fn get_version(&self) -> (u32, u32) {
        self.version
    }

    /// Set target compute capability
    #[must_use]
    pub fn target(mut self, target: impl Into<String>) -> Self {
        self.target = target.into();
        self
    }

    /// Get target
    #[must_use]
    pub fn get_target(&self) -> &str {
        &self.target
    }

    /// Set address size
    #[must_use]
    pub const fn address_size(mut self, size: u32) -> Self {
        self.address_size = size;
        self
    }

    /// Get address size
    #[must_use]
    pub const fn get_address_size(&self) -> u32 {
        self.address_size
    }

    /// Add a kernel to the module
    #[must_use]
    pub fn add_kernel(mut self, kernel: PtxKernel) -> Self {
        self.kernels.push(kernel);
        self
    }

    /// Validate the module configuration
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The PTX version is below the minimum supported (7.0)
    /// - The target compute capability is invalid
    pub fn validate(&self) -> Result<()> {
        validate_version(self.version.0, self.version.1)?;
        validate_target(&self.target)?;
        Ok(())
    }

    /// Emit PTX source code
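    ///
    /// Emitting an empty module with the default version and address size:
    ///
    /// ```ignore
    /// let ptx = PtxModule::new().target("sm_80").emit();
    /// assert!(ptx.contains(".version 8.0"));
    /// assert!(ptx.contains(".target sm_80"));
    /// assert!(ptx.contains(".address_size 64"));
    /// ```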
    #[must_use]
    pub fn emit(&self) -> String {
        let mut ptx = String::new();

        // Header comment
        ptx.push_str("// Generated by trueno-gpu\n");
        ptx.push_str("// Pure Rust PTX generation - no external dependencies\n\n");

        // Version directive
        let _ = writeln!(ptx, ".version {}.{}", self.version.0, self.version.1);

        // Target directive
        let _ = writeln!(ptx, ".target {}", self.target);

        // Address size directive
        let _ = writeln!(ptx, ".address_size {}\n", self.address_size);

        // Emit each kernel
        for kernel in &self.kernels {
            ptx.push_str(&kernel.emit());
            ptx.push('\n');
        }

        ptx
    }
}

impl Default for PtxModule {
    fn default() -> Self {
        Self::new()
    }
}

/// Kernel parameter
#[derive(Debug, Clone)]
pub struct KernelParam {
    /// Parameter type
    pub ty: PtxType,
    /// Parameter name
    pub name: String,
}

/// PTX Kernel builder
#[derive(Debug, Clone)]
pub struct PtxKernel {
    /// Kernel name
    name: String,
    /// Parameters
    params: Vec<KernelParam>,
    /// Shared memory size in bytes
    shared_memory: usize,
    /// Instructions
    instructions: Vec<PtxInstruction>,
    /// Register allocator
    registers: RegisterAllocator,
    /// Labels
    labels: Vec<String>,
}

impl PtxKernel {
    /// Create a new kernel
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            params: Vec::new(),
            shared_memory: 0,
            instructions: Vec::new(),
            registers: RegisterAllocator::new(),
            labels: Vec::new(),
        }
    }

    /// Add a parameter
    #[must_use]
    pub fn param(mut self, ty: PtxType, name: impl Into<String>) -> Self {
        self.params.push(KernelParam {
            ty,
            name: name.into(),
        });
        self
    }

    /// Set shared memory size
    #[must_use]
    pub const fn shared_memory(mut self, bytes: usize) -> Self {
        self.shared_memory = bytes;
        self
    }

    /// Get shared memory size
    #[must_use]
    pub const fn shared_memory_bytes(&self) -> usize {
        self.shared_memory
    }

    /// Build kernel body with a closure
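    ///
    /// Sketch of a single-element copy kernel (indexing and bounds checks
    /// omitted; parameter names are illustrative):
    ///
    /// ```ignore
    /// let kernel = PtxKernel::new("copy_one")
    ///     .param(PtxType::U64, "src")
    ///     .param(PtxType::U64, "dst")
    ///     .build(|b| {
    ///         let s = b.load_param_u64("src");
    ///         let d = b.load_param_u64("dst");
    ///         let v = b.ld_global_f32(s);
    ///         b.st_global_f32(d, v);
    ///     });
    /// ```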
    #[must_use]
    pub fn build<F>(mut self, builder_fn: F) -> Self
    where
        F: FnOnce(&mut KernelBuilder<'_>),
    {
        let mut builder = KernelBuilder::new(&mut self.registers);
        builder_fn(&mut builder);
        self.instructions = builder.instructions;
        self.labels = builder.labels;
        self
    }

    /// Build kernel body with optimization passes (Issue #72, #73)
    ///
    /// Applies FMA fusion and tile validation passes to the instruction sequence.
    ///
    /// # Arguments
    ///
    /// * `builder_fn` - Closure that builds the kernel body
    ///
    /// # Returns
    ///
    /// Result containing the kernel or an error if tile validation fails
    ///
    /// # cuda-tile-behavior.md References
    ///
    /// - Section 3.5: FMA Fusion Detection
    /// - Section 3.4: Tile Dimension Constraints
    pub fn build_optimized<F>(mut self, builder_fn: F) -> crate::error::Result<Self>
    where
        F: FnOnce(&mut KernelBuilder<'_>),
    {
        let mut builder = KernelBuilder::new(&mut self.registers);
        builder_fn(&mut builder);

        // Apply optimization passes (FMA fusion + tile validation)
        self.instructions = super::optimize::optimize(builder.instructions)?;
        self.labels = builder.labels;
        Ok(self)
    }

    /// Emit kernel PTX
    #[must_use]
    pub fn emit(&self) -> String {
        use std::fmt::Write;
        // Pre-allocate with estimated size: ~100 bytes per instruction + header overhead
        let estimated_size = 512 + self.instructions.len() * 100;
        let mut ptx = String::with_capacity(estimated_size);

        // Kernel entry point
        let _ = writeln!(ptx, ".visible .entry {}(", self.name);

        // Parameters
        for (i, param) in self.params.iter().enumerate() {
            let comma = if i < self.params.len() - 1 { "," } else { "" };
            let _ = writeln!(
                ptx,
                "    .param {} {}{}",
                param.ty.to_ptx_string(),
                param.name,
                comma
            );
        }

        ptx.push_str(") {\n");

        // Register declarations
        ptx.push_str(&self.registers.emit_declarations());

        // Shared memory declaration (if any)
        if self.shared_memory > 0 {
            let _ = writeln!(
                ptx,
                "    .shared .align 16 .b8 smem[{}];",
                self.shared_memory
            );
        }

        ptx.push('\n');

        // Instructions - write directly to ptx buffer to avoid allocations
        for instr in &self.instructions {
            write_instruction(instr, &mut ptx);
        }

        ptx.push_str("}\n");
        ptx
    }
}

/// Kernel builder context (passed to build closure)
pub struct KernelBuilder<'a> {
    /// Register allocator
    registers: &'a mut RegisterAllocator,
    /// Instructions
    instructions: Vec<PtxInstruction>,
    /// Labels
    labels: Vec<String>,
}

// Implement KernelBuilderCore to enable extension traits
impl<'a> core::KernelBuilderCore for KernelBuilder<'a> {
    fn registers_mut(&mut self) -> &mut RegisterAllocator {
        self.registers
    }

    fn instructions_mut(&mut self) -> &mut Vec<PtxInstruction> {
        &mut self.instructions
    }

    fn labels_mut(&mut self) -> &mut Vec<String> {
        &mut self.labels
    }
}

impl<'a> KernelBuilder<'a> {
    fn new(registers: &'a mut RegisterAllocator) -> Self {
        Self {
            registers,
            instructions: Vec::new(),
            labels: Vec::new(),
        }
    }

    // ===== Special Registers =====

    /// Read a special register into a virtual register
    pub fn special_reg(&mut self, reg: PtxReg) -> VirtualReg {
        let vreg = self.registers.allocate_virtual(reg.data_type());
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, reg.data_type())
                .dst(Operand::Reg(vreg))
                .src(Operand::SpecialReg(reg)),
        );
        vreg
    }

    // ===== Parameter Loading =====

    /// Load a u32 parameter
    pub fn load_param_u32(&mut self, name: &str) -> VirtualReg {
        let vreg = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::LdParam, PtxType::U32)
                .dst(Operand::Reg(vreg))
                .src(Operand::Param(name.to_string())),
        );
        vreg
    }

    /// Load a u64 parameter
    pub fn load_param_u64(&mut self, name: &str) -> VirtualReg {
        let vreg = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::LdParam, PtxType::U64)
                .dst(Operand::Reg(vreg))
                .src(Operand::Param(name.to_string())),
        );
        vreg
    }

    /// Load an f32 parameter
    pub fn load_param_f32(&mut self, name: &str) -> VirtualReg {
        let vreg = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::LdParam, PtxType::F32)
                .dst(Operand::Reg(vreg))
                .src(Operand::Param(name.to_string())),
        );
        vreg
    }

    // ===== Register Reuse Operations (not in traits) =====

    /// Move u64 immediate into existing register (register reuse)
    pub fn mov_u64_into(&mut self, dst: VirtualReg, val: u64) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmU64(val)),
        );
    }

    /// Move u32 immediate into existing register (register reuse)
    pub fn mov_u32_into(&mut self, dst: VirtualReg, val: u32) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmI64(val as i64)),
        );
    }

    // Comparison and Memory operations now in PtxComparison and PtxMemory traits

    // ===== Memory Operations (vectorized - not in traits) =====

    /// Load f32 from global memory (kept for compatibility; duplicates the PtxMemory trait method)
    pub fn ld_global_f32(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::F32)
                .space(PtxStateSpace::Global)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
        );
        dst
    }

    /// Store f32 to global memory
    pub fn st_global_f32(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::F32)
                .space(PtxStateSpace::Global)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val)),
        );
    }

    /// Load 4 consecutive f32 values from global memory (vectorized, 16-byte load)
    ///
    /// Returns 4 registers containing the loaded values.
    /// Address must be 16-byte aligned for optimal performance.
    ///
    /// PTX: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [addr];
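    ///
    /// Sketch: vectorized load followed by a horizontal sum (assumes `addr`
    /// holds a 16-byte aligned global address):
    ///
    /// ```ignore
    /// let [v0, v1, v2, v3] = b.ld_global_f32_v4(addr);
    /// let s0 = b.add_f32(v0, v1);
    /// let s1 = b.add_f32(v2, v3);
    /// let sum = b.add_f32(s0, s1);
    /// ```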
    pub fn ld_global_f32_v4(&mut self, addr: VirtualReg) -> [VirtualReg; 4] {
        let r0 = self.registers.allocate_virtual(PtxType::F32);
        let r1 = self.registers.allocate_virtual(PtxType::F32);
        let r2 = self.registers.allocate_virtual(PtxType::F32);
        let r3 = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::V4F32)
                .space(PtxStateSpace::Global)
                .dst(Operand::Reg(r0))
                .dst(Operand::Reg(r1))
                .dst(Operand::Reg(r2))
                .dst(Operand::Reg(r3))
                .src(Operand::Reg(addr)),
        );
        [r0, r1, r2, r3]
    }

    // ===== Immediate Arithmetic (not in traits - different signatures) =====

    /// Add u32 with immediate (different from trait version which takes two registers)
    pub fn add_u32(&mut self, a: VirtualReg, b: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Add, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::ImmU64(b as u64)),
        );
        dst
    }

    /// PAR-063: Dot product of 4 x u8 vectors with accumulate
    ///
    /// Computes: d = dot4(a, b) + c
    /// where a and b are u32 containing 4 x u8 values each
    ///
    /// This is the key SIMD instruction used by llama.cpp for Q4K inference.
    /// Each dp4a computes 4 multiply-adds in one instruction.
    ///
    /// # Example
    /// ```ignore
    /// let a = 0x01020304u32;  // bytes [4, 3, 2, 1]
    /// let b = 0x05060708u32;  // bytes [8, 7, 6, 5]
    /// let c = 0u32;           // accumulator
    /// // d = 4*8 + 3*7 + 2*6 + 1*5 = 32 + 21 + 12 + 5 = 70
    /// ```
    pub fn dp4a_u32(&mut self, a: VirtualReg, b: VirtualReg, c: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Dp4a, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b))
                .src(Operand::Reg(c)),
        );
        dst
    }

    // ===== DP4A In-Place Operations (Macro-Generated) =====
    // Dot product of 4 x u8/s8 vectors with accumulate, generated via macro.
    // PAR-063: Key SIMD instruction for Q4K inference (llama.cpp pattern).

    impl_dp4a_inplace!(
        dp4a_u32_inplace,
        Dp4a,
        U32,
        "DP4A u32 in-place: acc += dot4(a, b) where a,b are packed u8x4"
    );
    impl_dp4a_inplace!(
        dp4a_u32_s32_inplace,
        Dp4aUS,
        S32,
        "DP4A u32×s32 in-place: acc += dot4(u8x4, s8x4)"
    );
    impl_dp4a_inplace!(
        dp4a_s32_inplace,
        Dp4aS32,
        S32,
        "DP4A s32 in-place: acc += dot4(s8x4, s8x4)"
    );

    /// Barrier synchronization (all threads in block must reach this point)
    pub fn bar_sync(&mut self, barrier_id: u32) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Bar, PtxType::B32).label(format!("sync {}", barrier_id)),
        );
    }

    /// Memory fence at CTA (thread block) level
    ///
    /// Ensures all prior memory operations are visible to other threads in the block.
    /// PTX: membar.cta;
    pub fn membar_cta(&mut self) {
        self.instructions
            .push(PtxInstruction::new(PtxOp::MemBar, PtxType::B32).label("cta".to_string()));
    }

    /// Memory fence at GPU level
    ///
    /// Ensures all prior memory operations are visible to other threads on the GPU.
    /// PTX: membar.gl;
    pub fn membar_gl(&mut self) {
        self.instructions
            .push(PtxInstruction::new(PtxOp::MemBar, PtxType::B32).label("gl".to_string()));
    }

    // ===== Shared Memory Operations =====
    // These operations are provided by the PtxMemory extension trait in memory.rs.
    // KernelBuilder implements KernelBuilderCore, so it automatically gets all
    // PtxMemory methods via blanket impl. Available methods:
    //   - ld_shared_f32, st_shared_f32
    //   - ld_shared_u32, st_shared_u32
    //   - ld_shared_u32_volatile
    //   - st_shared_f16

    /// Warp shuffle down (for reductions)
    /// Format: shfl.sync.down.b32 dst, src, delta, clamp, membermask
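    ///
    /// Sketch of a full-warp sum reduction (mask 0xFFFF_FFFF = all 32 lanes;
    /// `addr` is an illustrative global address register):
    ///
    /// ```ignore
    /// let mut v = b.ld_global_f32(addr);
    /// for offset in [16u32, 8, 4, 2, 1] {
    ///     let peer = b.shfl_down_f32(v, offset, 0xFFFF_FFFF);
    ///     v = b.add_f32(v, peer);
    /// }
    /// // Lane 0 now holds the warp-wide sum.
    /// ```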
    pub fn shfl_down_f32(&mut self, val: VirtualReg, offset: u32, mask: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::ShflDown, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::ImmU64(offset as u64))
                .src(Operand::ImmU64(31)) // clamp to warp size
                .src(Operand::ImmU64(mask as u64)), // membermask
        );
        dst
    }

    /// Warp shuffle indexed (for broadcasts - gets value from specific lane)
    ///
    /// Format: shfl.sync.idx.b32 dst, src, srcLane, width, membermask
    ///
    /// IMPORTANT: For shfl.idx, the third parameter is WIDTH (not clamp!)
    /// Width must be a power of 2: 1, 2, 4, 8, 16, or 32.
    /// Use 32 for full-warp broadcasts.
    pub fn shfl_idx_f32(&mut self, val: VirtualReg, src_lane: u32, mask: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::ShflIdx, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::ImmU64(src_lane as u64))
                .src(Operand::ImmU64(32)) // Width for shfl.idx (must be power of 2!)
                .src(Operand::ImmU64(mask as u64)), // membermask
        );
        dst
    }

    /// Warp shuffle indexed for u32 values (broadcasts, lane selection)
    ///
    /// Format: shfl.sync.idx.b32 dst, src, srcLane, width, membermask
    ///
    /// IMPORTANT: For shfl.idx, the third parameter is WIDTH (not clamp!)
    /// Width must be a power of 2: 1, 2, 4, 8, 16, or 32.
    /// Use 32 for full-warp broadcasts.
    ///
    /// Used for KF-000A hypothesis test: Can shfl.sync values be stored without F081/F082 crash?
    pub fn shfl_idx_u32(&mut self, val: VirtualReg, src_lane: u32, mask: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::ShflIdx, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::ImmU64(src_lane as u64))
                .src(Operand::ImmU64(32)) // Width for shfl.idx (must be power of 2!)
                .src(Operand::ImmU64(mask as u64)), // membermask
        );
        dst
    }

    /// Warp shuffle indexed with dynamic lane (from register)
    ///
    /// Format: shfl.sync.idx.b32 dst, src, srcLane, width, membermask
    /// srcLane comes from a register instead of immediate.
    pub fn shfl_idx_u32_reg(
        &mut self,
        val: VirtualReg,
        src_lane_reg: VirtualReg,
        mask: u32,
    ) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::ShflIdx, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::Reg(src_lane_reg))
                .src(Operand::ImmU64(32)) // Width
                .src(Operand::ImmU64(mask as u64)), // membermask
        );
        dst
    }

    // ===== KF-002: Warp Vote and Bit Manipulation =====

    /// Warp ballot - returns bitmask of lanes where predicate is true
    ///
    /// Format: vote.sync.ballot.b32 dst, pred, membermask;
    ///
    /// Returns a u32 where bit i is set if lane i has predicate true.
    /// Used for finding which lanes have matching hash values in LZ4 compression.
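    ///
    /// Sketch: count the lanes where a predicate holds (`pred` is an
    /// illustrative predicate register from a setp operation):
    ///
    /// ```ignore
    /// let mask = b.ballot_sync(pred, 0xFFFF_FFFF);
    /// let n_matches = b.popc_u32(mask);
    /// ```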
    pub fn ballot_sync(&mut self, pred: VirtualReg, mask: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::VoteBallot, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(pred))
                .src(Operand::ImmU64(mask as u64)),
        );
        dst
    }

    /// Population count - counts number of 1 bits in a u32
    ///
    /// Format: popc.b32 dst, src;
    ///
    /// Used for counting matches in ballot results.
    pub fn popc_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Popc, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Find the most significant set bit
    ///
    /// Format: bfind.u32 dst, src;
    ///
    /// Returns the bit position (0-31) of the most significant set bit,
    /// or 0xFFFFFFFF if src == 0.
    /// To get a lane number from a ballot mask: use bfind or clz+subtract.
    pub fn bfind_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Bfind, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Count leading zeros
    ///
    /// Format: clz.b32 dst, src;
    ///
    /// Used with ballot to locate the highest matching lane: lane = 31 - clz(ballot)
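    ///
    /// Sketch: recover the highest matching lane from a ballot mask (`pred`
    /// is an illustrative predicate register):
    ///
    /// ```ignore
    /// let mask = b.ballot_sync(pred, 0xFFFF_FFFF);
    /// let lz = b.clz_u32(mask);
    /// let thirty_one = b.mov_u32_imm(31);
    /// let lane = b.sub_u32_reg(thirty_one, lz); // lane = 31 - clz(mask)
    /// ```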
    pub fn clz_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Clz, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Store u16 to shared memory (for hash table positions)
    ///
    /// Format: st.shared.u16 [addr], val;
    ///
    /// Used for storing 16-bit positions in LZ4 hash table (2048 entries × 2 bytes).
    /// IMPORTANT: This is WRITE-ONLY usage - no ld.shared.u16 needed, avoiding F081!
    pub fn st_shared_u16(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U16)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Shared),
        );
    }

    /// Min u32 of two values
    pub fn min_u32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Min, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Subtract u32 registers
    pub fn sub_u32_reg(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Sub, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Base-2 exponential f32 (approximate): dst = 2^val
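    ///
    /// Sketch: natural exp via base-2, e^x = 2^(x * log2(e)):
    ///
    /// ```ignore
    /// let log2_e = b.mov_f32_imm(std::f32::consts::LOG2_E);
    /// let scaled = b.mul_f32(x, log2_e);
    /// let exp_x = b.ex2_f32(scaled);
    /// ```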
    pub fn ex2_f32(&mut self, val: VirtualReg) -> VirtualReg {
        // ex2.approx.f32 computes 2^val; callers scale the input by log2(e) for natural exp
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ex2, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// AND two predicates: dst = a AND b
    /// Used for combining bounds checks (PARITY-114)
    pub fn and_pred(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::Pred);
        self.instructions.push(
            PtxInstruction::new(PtxOp::And, PtxType::Pred)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Multiply u32
    pub fn mul_u32(&mut self, a: VirtualReg, b: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::ImmU64(b as u64)),
        );
        dst
    }

    /// Multiply u32 (register * register)
    pub fn mul_u32_reg(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Add u32 (register + register)
    pub fn add_u32_reg(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Add, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Convert u32 to u64 (zero extend)
    pub fn cvt_u64_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert u32 to u64 into existing register (register reuse)
    pub fn cvt_u64_u32_into(&mut self, dst: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
    }

    /// Convert u64 to u32 (truncate)
    pub fn cvt_u32_u64(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert u32 to f32
    pub fn cvt_f32_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .rounding(RoundingMode::Rn),
        );
        dst
    }

    /// Convert signed int8 to signed int32: dst = sext(val)
    /// Used for Q8_0 dequantization (int8 quantized values)
    /// Note: the input register holds a zero-extended byte (from ld.global.u8), so sign extension is done manually
    pub fn cvt_s32_s8(&mut self, val: VirtualReg) -> VirtualReg {
        // First convert u8 -> u32 (zero extend)
        let u32_val = self.cvt_u32_u8(val);
        // For sign extension: if val >= 128, subtract 256
        // signed = unsigned - ((unsigned >= 128) ? 256 : 0)
        let const_128 = self.mov_u32_imm(128);
        let is_negative = self.setp_ge_u32(u32_val, const_128);
        let const_256 = self.mov_u32_imm(256);
        let zero = self.mov_u32_imm(0);
        // Select 256 if negative, else 0
        let adjust = self.selp_u32(is_negative, const_256, zero);
        // Compute signed value
        self.sub_u32_reg(u32_val, adjust)
    }

    /// Convert signed int32 to f32: dst = (f32)val
    /// Used for Q8_0 dequantization after s8->s32 conversion
    /// Emits cvt.rn.f32.s32 which interprets the source bits as signed
    pub fn cvt_f32_s32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .with_src_type(PtxType::S32) // Force .s32 source type
                .rounding(RoundingMode::Rn),
        );
        dst
    }

    /// Reciprocal square root f32: dst = 1/sqrt(val)
    pub fn rsqrt_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Rsqrt, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Sine f32 (approximate): dst = sin(val)
    /// PAR-060: Used for RoPE (Rotary Position Embedding) kernel
    pub fn sin_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Sin, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Cosine f32 (approximate): dst = cos(val)
    /// PAR-060: Used for RoPE (Rotary Position Embedding) kernel
    pub fn cos_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cos, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// CORRECTNESS-013: Precise sine using a 7th-order odd polynomial
    /// with range reduction to [-π, π]
    /// For RoPE with high theta (1M), the range reduction step is essential
    ///
    /// Uses sin(x) ≈ x * (1 + c1*x² + c2*x⁴ + c3*x⁶) with truncated
    /// Taylor-series coefficients (-1/6, 1/120, -1/5040)
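    ///
    /// Sketch of a RoPE-style rotation using the precise sin/cos pair
    /// (`angle`, `x0`, `x1` are illustrative registers):
    ///
    /// ```ignore
    /// let s = b.sin_f32_precise(angle);
    /// let c = b.cos_f32_precise(angle);
    /// let x1_s = b.mul_f32(x1, s);
    /// let neg_x1_s = b.neg_f32(x1_s);
    /// let rotated = b.fma_f32(x0, c, neg_x1_s); // x0*cos(θ) - x1*sin(θ)
    /// ```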
    pub fn sin_f32_precise(&mut self, x: VirtualReg) -> VirtualReg {
        // Range reduction: x_reduced = x - 2π * round(x / (2π))
        let inv_two_pi = self.mov_f32_imm(1.0 / std::f32::consts::TAU);
        let half = self.mov_f32_imm(0.5);

        // n = round(x / 2π) = floor(x / 2π + 0.5)
        let x_scaled = self.mul_f32(x, inv_two_pi);
        let x_plus_half = self.add_f32(x_scaled, half);
        let n_f32 = self.floor_f32(x_plus_half);

        // x_reduced = x - n * 2π (using FMA for precision)
        let neg_two_pi = self.mov_f32_imm(-std::f32::consts::TAU);
        let x_reduced = self.fma_f32(n_f32, neg_two_pi, x);

        // Now x_reduced is in [-π, π]
        // Polynomial coefficients (truncated Taylor series)
        let c1 = self.mov_f32_imm(-0.166_666_67_f32); // -1/6
        let c2 = self.mov_f32_imm(0.008_333_334_f32); // 1/120
        let c3 = self.mov_f32_imm(-0.000_198_412_7_f32); // -1/5040

        // x² and higher powers
        let x2 = self.mul_f32(x_reduced, x_reduced);
        let x4 = self.mul_f32(x2, x2);
        let x6 = self.mul_f32(x4, x2);

        // Horner's method: 1 + c1*x² + c2*x⁴ + c3*x⁶
        let term3 = self.mul_f32(c3, x6);
        let term2 = self.fma_f32(c2, x4, term3);
        let term1 = self.fma_f32(c1, x2, term2);
        let one = self.mov_f32_imm(1.0);
        let poly = self.add_f32(one, term1);

        // sin(x) = x * poly
        self.mul_f32(x_reduced, poly)
    }

    /// CORRECTNESS-013: Precise cosine using a 6th-order even polynomial
    /// with the same range reduction to [-π, π]
    ///
    /// Uses cos(x) ≈ 1 + c1*x² + c2*x⁴ + c3*x⁶ with truncated Taylor-series
    /// coefficients (-1/2, 1/24, -1/720)
    pub fn cos_f32_precise(&mut self, x: VirtualReg) -> VirtualReg {
        // Range reduction: x_reduced = x - 2π * round(x / (2π))
        let inv_two_pi = self.mov_f32_imm(1.0 / std::f32::consts::TAU);
        let half = self.mov_f32_imm(0.5);

        let x_scaled = self.mul_f32(x, inv_two_pi);
        let x_plus_half = self.add_f32(x_scaled, half);
        let n_f32 = self.floor_f32(x_plus_half);

        let neg_two_pi = self.mov_f32_imm(-std::f32::consts::TAU);
        let x_reduced = self.fma_f32(n_f32, neg_two_pi, x);

        // Polynomial coefficients (truncated Taylor series)
        let c1 = self.mov_f32_imm(-0.5_f32); // -1/2
        let c2 = self.mov_f32_imm(0.041_666_668_f32); // 1/24
        let c3 = self.mov_f32_imm(-0.001_388_888_9_f32); // -1/720

        let x2 = self.mul_f32(x_reduced, x_reduced);
        let x4 = self.mul_f32(x2, x2);
        let x6 = self.mul_f32(x4, x2);

        // Horner's method: 1 + c1*x² + c2*x⁴ + c3*x⁶
        let term3 = self.mul_f32(c3, x6);
        let term2 = self.fma_f32(c2, x4, term3);
        let term1 = self.fma_f32(c1, x2, term2);
        let one = self.mov_f32_imm(1.0);
        self.add_f32(one, term1)
    }

    /// Floor f32: dst = floor(val)
    /// Uses cvt.rmi.f32.f32 (round toward minus infinity)
    pub fn floor_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::F32)
                .with_src_type(PtxType::F32)
                .rounding(RoundingMode::Rmi) // round-to-integer-toward-minus-infinity (floor)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Negate f32: dst = -val
    /// PAR-060: Used for RoPE (Rotary Position Embedding) kernel
    pub fn neg_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Neg, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// CORRECTNESS-013: Precise exp2 (2^x) using polynomial approximation
    ///
    /// Avoids ex2.approx.f32 which has ~2^-21 error.
    /// Uses range reduction: 2^x = 2^n * 2^f where n = round(x), f = x - n
    /// Then 2^f is computed with a polynomial for f ∈ [-0.5, 0.5]
    pub fn ex2_f32_precise(&mut self, x: VirtualReg) -> VirtualReg {
        // Range reduction: split x into integer and fractional parts
        // n = round(x), f = x - n where f ∈ [-0.5, 0.5]
        let half = self.mov_f32_imm(0.5);
        let x_plus_half = self.add_f32(x, half);
        let n_f32 = self.floor_f32(x_plus_half); // n = floor(x + 0.5) = round(x)
        let neg_one = self.mov_f32_imm(-1.0);
        let f = self.fma_f32(n_f32, neg_one, x); // f = x - n

        // Polynomial for 2^f where f ∈ [-0.5, 0.5]
        // Degree-5 truncated Taylor series of 2^f = e^(f·ln 2):
        // 2^f ≈ c0 + c1*f + c2*f² + c3*f³ + c4*f⁴ + c5*f⁵, with c_k = ln(2)^k / k!
        let c0 = self.mov_f32_imm(1.0);
        let c1 = self.mov_f32_imm(std::f32::consts::LN_2); // ln(2)
        let c2 = self.mov_f32_imm(0.240_226_5_f32); // ln(2)²/2
        let c3 = self.mov_f32_imm(0.055_503_19_f32); // ln(2)³/6
        let c4 = self.mov_f32_imm(0.009_618_342_f32); // ln(2)⁴/24
        let c5 = self.mov_f32_imm(0.001_333_355_9_f32); // ln(2)⁵/120

        // Horner's method: p = c0 + f*(c1 + f*(c2 + f*(c3 + f*(c4 + f*c5))))
        let p5 = c5;
        let p4 = self.fma_f32(p5, f, c4);
        let p3 = self.fma_f32(p4, f, c3);
        let p2 = self.fma_f32(p3, f, c2);
        let p1 = self.fma_f32(p2, f, c1);
        let exp2_f = self.fma_f32(p1, f, c0);

        // Now compute 2^n using scalbn (ldexp)
        // In PTX, we can use the fact that 2^n for integer n can be computed via:
        // scalbn(1.0, n) = ldexp(1.0, n)
        // For simplicity, we'll use ex2.approx for the integer part (which is exact!)
        // Since n is an integer, ex2(n) = 2^n exactly (no approximation error)
        let two_pow_n = self.ex2_f32(n_f32);

        // Final result: 2^x = 2^n * 2^f
        self.mul_f32(two_pow_n, exp2_f)
    }

    /// Integer division u32
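    ///
    /// Sketch: decompose a linear index into (row, col) for a fixed width
    /// (pairs with rem_u32 below; `idx` is an illustrative register):
    ///
    /// ```ignore
    /// let row = b.div_u32(idx, 256);
    /// let col = b.rem_u32(idx, 256);
    /// ```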
    pub fn div_u32(&mut self, a: VirtualReg, b: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Div, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::ImmU64(b as u64)),
        );
        dst
    }

    /// Integer remainder (modulo) u32
    pub fn rem_u32(&mut self, a: VirtualReg, b: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Rem, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::ImmU64(b as u64)),
        );
        dst
    }

    /// Move immediate u64 value
    pub fn mov_u64_imm(&mut self, val: u64) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmU64(val)),
        );
        dst
    }

    /// Multiply u64 by immediate
    pub fn mul_u64(&mut self, a: VirtualReg, b: u64) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::ImmU64(b)),
        );
        dst
    }

    /// Multiply u64 (register * register)
    pub fn mul_u64_reg(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Branch if predicate is false (negated predicate)
    pub fn branch_if_not(&mut self, pred: VirtualReg, label: &str) {
        let predicate = Predicate {
            reg: pred,
            negated: true,
        };
        self.instructions.push(
            PtxInstruction::new(PtxOp::Bra, PtxType::B32)
                .predicated(predicate)
                .label(label),
        );
    }

    /// Load u32 from global memory
    pub fn ld_global_u32(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Load u32 from global memory into existing register (register reuse)
    pub fn ld_global_u32_into(&mut self, dst: VirtualReg, addr: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .space(PtxStateSpace::Global),
        );
    }

    /// Store u32 to global memory
    pub fn st_global_u32(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U32)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
    }

    /// Load u64 from global memory (PAR-118: for pointer arrays in batched attention)
    pub fn ld_global_u64(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Store u64 to global memory
    pub fn st_global_u64(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U64)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
    }

    /// Load u8 from global memory
    ///
    /// NOTE: PTX does not support .u8 register types (minimum is 16-bit).
    /// We allocate a U16 register and use ld.global.u8 which zero-extends
    /// the loaded byte into the 16-bit register.
    pub fn ld_global_u8(&mut self, addr: VirtualReg) -> VirtualReg {
        // CRITICAL: PTX requires registers to be at least 16-bit
        // ld.global.u8 zero-extends the byte into the U16 register
        let dst = self.registers.allocate_virtual(PtxType::U16);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U8)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Store u8 to global memory
    ///
    /// NOTE: PTX requires stores to come from at least a 16-bit register.
    /// The low 8 bits of the source register are stored to the address.
    pub fn st_global_u8(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U8)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
    }

    /// Store u16 to global memory
    pub fn st_global_u16(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U16)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
    }

    // ========================================================================
    // ATOMIC OPERATIONS - For debugging and synchronization
    // ========================================================================

    /// Atomic add to global memory, returns old value
    ///
    /// PTX: atom.global.add.u32 dst, [addr], val
    /// Atomically: old = *addr; *addr = old + val; return old
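    ///
    /// Sketch: bump a global counter once per thread (`counter_ptr` is an
    /// illustrative u64 pointer register):
    ///
    /// ```ignore
    /// let one = b.mov_u32_imm(1);
    /// let old = b.atom_add_global_u32(counter_ptr, one);
    /// ```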
    pub fn atom_add_global_u32(&mut self, addr: VirtualReg, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::AtomAdd, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Atomic exchange on global memory, returns old value
    ///
    /// PTX: atom.global.exch.u32 dst, [addr], val
    /// Atomically: old = *addr; *addr = val; return old
    pub fn atom_exch_global_u32(&mut self, addr: VirtualReg, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::AtomExch, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Atomic min on global memory, returns old value
    ///
    /// PTX: atom.global.min.u32 dst, [addr], val
    pub fn atom_min_global_u32(&mut self, addr: VirtualReg, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::AtomMin, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Atomic max on global memory, returns old value
    ///
    /// PTX: atom.global.max.u32 dst, [addr], val
    pub fn atom_max_global_u32(&mut self, addr: VirtualReg, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::AtomMax, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Atomic exchange on shared memory, returns old value
    ///
    /// PTX: atom.shared.exch.u32 dst, [addr], val
    /// Atomically: old = *addr; *addr = val; return old
    ///
    /// NOTE: This is a workaround for a ptxas bug where regular st.shared
    /// with computed addresses crashes the JIT compiler.
    pub fn atom_exch_shared_u32(&mut self, addr: VirtualReg, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::AtomExch, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Shared),
        );
        dst
    }

    // ========================================================================
    // DEBUG HELPERS - Printf-style debugging for PTX kernels
    // ========================================================================

    /// Emit a debug marker to a debug buffer
    ///
    /// This atomically increments a counter at debug_buf[0] and writes
    /// the marker value to debug_buf[old_counter + 1].
    ///
    /// Usage:
    /// - Pass a debug buffer with at least (max_markers + 1) u32 elements
    /// - debug_buf[0] = counter (starts at 0)
    /// - debug_buf[1..] = marker values written by emit_debug_marker
    ///
    /// Returns the slot index where the marker was written (for chaining)
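    ///
    /// Host-side decode sketch (illustrative; `debug_buf: &[u32]` is the buffer
    /// copied back from the device, with `count <= max_markers`):
    /// ```text
    /// let count = debug_buf[0] as usize;
    /// let markers = &debug_buf[1..=count]; // values in emission order
    /// ```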
    pub fn emit_debug_marker(&mut self, debug_buf_ptr: VirtualReg, marker: u32) -> VirtualReg {
        // Atomically get next slot: slot = atomicAdd(debug_buf[0], 1)
        let one = self.mov_u32_imm(1);
        let slot = self.atom_add_global_u32(debug_buf_ptr, one);

        // Compute address: addr = debug_buf_ptr + (slot + 1) * 4
        let slot_plus_1 = self.add_u32(slot, 1);
        let offset = self.mul_u32(slot_plus_1, 4);
        let offset_64 = self.cvt_u64_u32(offset);
        let addr = self.add_u64(debug_buf_ptr, offset_64);

        // Write marker value
        let marker_val = self.mov_u32_imm(marker);
        self.st_global_u32(addr, marker_val);

        slot
    }

    /// Emit a debug value to a debug buffer (for variables)
    ///
    /// Similar to emit_debug_marker but writes an arbitrary register value
    pub fn emit_debug_value(&mut self, debug_buf_ptr: VirtualReg, value: VirtualReg) -> VirtualReg {
        // Atomically get next slot
        let one = self.mov_u32_imm(1);
        let slot = self.atom_add_global_u32(debug_buf_ptr, one);

        // Compute address
        let slot_plus_1 = self.add_u32(slot, 1);
        let offset = self.mul_u32(slot_plus_1, 4);
        let offset_64 = self.cvt_u64_u32(offset);
        let addr = self.add_u64(debug_buf_ptr, offset_64);

        // Write value
        self.st_global_u32(addr, value);

        slot
    }

    /// Load u16 from global memory (for f16 as raw bits)
    pub fn ld_global_u16(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U16);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U16)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Convert u8 to u32 (zero extend)
    pub fn cvt_u32_u8(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert u16 to u32 (zero extend)
    pub fn cvt_u32_u16(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert u32 to u16 (truncate)
    ///
    /// Takes the low 16 bits of the u32 value.
    /// Use this before storing u32 values to u16 memory locations.
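    ///
    /// Illustrative sketch (hypothetical registers; pairs with `st_global_u16`):
    /// ```text
    /// let lo16 = k.cvt_u16_u32(value_u32); // keep low 16 bits
    /// k.st_global_u16(addr, lo16);
    /// ```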
    pub fn cvt_u16_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U16);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U16)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Shift right u32 (logical shift)
    ///
    /// NOTE: emitted as `shr.b32` (a logical shift). PTX also accepts `shr.u32`
    /// here, but `shl` takes only bit-size types, so `.b32` keeps the shift
    /// helpers uniform.
    pub fn shr_u32(&mut self, val: VirtualReg, shift: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            // .b32 => logical shift; matches shl, which accepts only bit-size types
            PtxInstruction::new(PtxOp::Shr, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::Reg(shift)),
        );
        dst
    }

    /// Shift right u32 by immediate (logical shift)
    ///
    /// Uses an immediate for the shift amount, which sidesteps register-clobbering
    /// issues: in loops with a constant shift, no shift-amount register exists for
    /// the SASS-level allocator to reuse while it is still live.
    pub fn shr_u32_imm(&mut self, val: VirtualReg, shift: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Shr, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::ImmU64(shift as u64)),
        );
        dst
    }

    /// Bitwise AND u32 (register AND register)
    ///
    /// NOTE: PTX requires .b32 (bitwise) type for and/or/xor, not .u32
    pub fn and_u32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            // PTX requires .b32 for bitwise ops, not .u32
            PtxInstruction::new(PtxOp::And, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Bitwise OR u32 (register OR register)
    ///
    /// NOTE: PTX requires .b32 (bitwise) type for and/or/xor, not .u32
    pub fn or_u32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            // PTX requires .b32 for bitwise ops, not .u32
            PtxInstruction::new(PtxOp::Or, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Bitwise OR u32 into existing register (register reuse)
    pub fn or_u32_into(&mut self, dst: VirtualReg, a: VirtualReg, b: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Or, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
    }

    /// Shift left u32 (register << register)
    ///
    /// NOTE: PTX `shl` accepts only bit-size types (.b16/.b32/.b64), so .b32 is required
    pub fn shl_u32(&mut self, val: VirtualReg, shift: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            // PTX shl accepts only bit-size types; .b32 is required
            PtxInstruction::new(PtxOp::Shl, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::Reg(shift)),
        );
        dst
    }

    /// Shift left u32 by immediate (register << immediate)
    pub fn shl_u32_imm(&mut self, val: VirtualReg, shift: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Shl, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::ImmU64(shift as u64)),
        );
        dst
    }

    /// Select based on predicate: dst = pred ? true_val : false_val
    ///
    /// PTX format: selp.u32 d, a, b, p
    /// where d = destination, a = value if true, b = value if false, p = predicate
    pub fn selp_u32(
        &mut self,
        pred: VirtualReg,
        true_val: VirtualReg,
        false_val: VirtualReg,
    ) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Selp, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(true_val))
                .src(Operand::Reg(false_val))
                .src(Operand::Reg(pred)),
        );
        dst
    }

    /// Select f32 based on predicate: dst = pred ? true_val : false_val
    ///
    /// PTX format: selp.f32 d, a, b, p
    /// PAR-062: Used by ArgMax kernel for conditional max tracking
    pub fn selp_f32(
        &mut self,
        pred: VirtualReg,
        true_val: VirtualReg,
        false_val: VirtualReg,
    ) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Selp, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(true_val))
                .src(Operand::Reg(false_val))
                .src(Operand::Reg(pred)),
        );
        dst
    }

    /// Compare f32 greater than: pred = a > b
    ///
    /// PTX format: setp.gt.f32 p, a, b
    /// PAR-062: Used by ArgMax kernel for max comparison
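    ///
    /// Illustrative running-max sketch in the ArgMax style (hypothetical
    /// registers `val`, `idx`, `max_val`, `max_idx`):
    /// ```text
    /// let gt      = k.setp_gt_f32(val, max_val);
    /// let new_max = k.selp_f32(gt, val, max_val);
    /// let new_idx = k.selp_u32(gt, idx, max_idx);
    /// ```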
    pub fn setp_gt_f32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let pred = self.registers.allocate_virtual(PtxType::Pred);
        let mut instr = PtxInstruction::new(PtxOp::Setp, PtxType::F32)
            .dst(Operand::Reg(pred))
            .src(Operand::Reg(a))
            .src(Operand::Reg(b));
        instr.label = Some(CmpOp::Gt.to_ptx_string().to_string());
        self.instructions.push(instr);
        pred
    }

    /// Get shared memory base pointer
    ///
    /// PAR-062: Returns base address of shared memory for this block
    pub fn shared_ptr(&mut self) -> VirtualReg {
        self.shared_base_addr()
    }

    /// Warp shuffle down for u32: exchange with lane + offset
    ///
    /// PTX format: shfl.sync.down.b32 d, a, offset, clamp, mask
    /// PAR-062: Used by ArgMax kernel for warp-level index reduction
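    ///
    /// Illustrative warp-sum sketch (assumes a full-warp mask 0xffff_ffff and a
    /// hypothetical u32 accumulator `acc`; the host-side loop unrolls into five
    /// shuffle/add pairs):
    /// ```text
    /// for off in [16u32, 8, 4, 2, 1] {
    ///     let other = k.shfl_down_u32(acc, off, 0xffff_ffff);
    ///     k.add_u32_reg_inplace(acc, other);
    /// }
    /// ```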
    pub fn shfl_down_u32(&mut self, val: VirtualReg, offset: u32, mask: u32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::ShflDown, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .src(Operand::ImmU64(offset as u64))
                .src(Operand::ImmU64(31)) // clamp to warp size
                .src(Operand::ImmU64(mask as u64)),
        );
        dst
    }

    /// Load f32 immediate constant
    ///
    /// PAR-062: Used for NEG_INFINITY initialization
    pub fn const_f32(&mut self, val: f32) -> VirtualReg {
        self.mov_f32_imm(val)
    }

    /// Load u32 immediate constant
    ///
    /// PAR-062: Used for index initialization
    pub fn const_u32(&mut self, val: u32) -> VirtualReg {
        self.mov_u32_imm(val)
    }

    /// Bitwise AND u32 with immediate
    ///
    /// PAR-062: Used for lane_id extraction (tid & 31)
    pub fn and_u32_imm(&mut self, a: VirtualReg, imm: u32) -> VirtualReg {
        let imm_reg = self.mov_u32_imm(imm);
        self.and_u32(a, imm_reg)
    }

    // ===== In-Place Updates (for loops) =====

    /// Add u32 immediate in-place: dst = dst + imm
    /// Used for loop counter updates where SSA would allocate a new register
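    ///
    /// Illustrative loop-counter sketch (hypothetical register `i`; the loop
    /// branching machinery lives elsewhere in the builder):
    /// ```text
    /// let i = k.mov_u32_imm(0);
    /// // ... loop body ...
    /// k.add_u32_inplace(i, 1); // i += 1 without allocating a new register
    /// ```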
    pub fn add_u32_inplace(&mut self, dst: VirtualReg, imm: u32) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Add, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::ImmU64(imm as u64)),
        );
    }

    /// Add u32 register in-place: dst = dst + src
    /// Used for running-total updates where SSA would allocate a new register
    pub fn add_u32_reg_inplace(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.registers.extend_live_range(src);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Add, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::Reg(src)),
        );
    }

    /// Add f32 value in-place: dst = dst + src
    pub fn add_f32_inplace(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Add, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::Reg(src))
                .rounding(RoundingMode::Rn),
        );
    }

    /// Shift right u32 in-place by immediate: dst = dst >> imm
    /// Used for stride halving in reduction loops
    ///
    /// NOTE: emitted as `shr.b32` (a logical shift); `.b32` keeps the shift
    /// helpers uniform with `shl`, which accepts only bit-size types
    pub fn shr_u32_inplace(&mut self, dst: VirtualReg, imm: u32) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            // .b32 => logical shift; matches shl, which accepts only bit-size types
            PtxInstruction::new(PtxOp::Shr, PtxType::B32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::ImmU64(imm as u64)),
        );
    }

    /// Fused multiply-add in-place: dst = a * b + dst
    /// Used for GEMM accumulation
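    ///
    /// Illustrative GEMM inner-loop sketch (hypothetical registers `acc`,
    /// `a_val`, `b_val` loaded earlier in the K loop):
    /// ```text
    /// k.fma_f32_inplace(acc, a_val, b_val); // acc = a_val * b_val + acc
    /// ```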
    pub fn fma_f32_inplace(&mut self, dst: VirtualReg, a: VirtualReg, b: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Fma, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b))
                .src(Operand::Reg(dst))
                .rounding(RoundingMode::Rn),
        );
    }

    /// Max in-place: dst = max(dst, src)
    /// Used for online softmax running max
    pub fn max_f32_inplace(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Max, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::Reg(src)),
        );
    }

    /// Copy f32 register: dst = src
    /// Used for accumulator state updates
    pub fn mov_f32_reg(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(src)),
        );
    }

    /// Copy u32 register: dst = src
    /// Used for loop counter updates
    pub fn mov_u32_reg(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(src)),
        );
    }

    /// Copy u64 register: dst = src
    pub fn mov_u64_reg(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(src)),
        );
    }

    /// Multiply in-place: dst = dst * src
    /// Used for scaling operations
    pub fn mul_f32_inplace(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::Reg(src))
                .rounding(RoundingMode::Rn),
        );
    }

    /// Divide in-place: dst = dst / src
    /// Used for normalization
    pub fn div_f32_inplace(&mut self, dst: VirtualReg, src: VirtualReg) {
        self.registers.extend_live_range(dst);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Div, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(dst))
                .src(Operand::Reg(src))
                .rounding(RoundingMode::Rn),
        );
    }

    // ===== Tensor Core (WMMA) Operations =====
    // These require sm_70+ and generate WMMA PTX intrinsics

    /// Load F16 matrix fragment A for WMMA (16x16x16 tile)
    /// Returns fragment registers for use in wmma_mma
    pub fn wmma_load_a_f16(
        &mut self,
        addr: VirtualReg,
        stride: u32,
        layout: WmmaLayout,
    ) -> Vec<VirtualReg> {
        // WMMA 16x16x16 F16 requires 8 F16x2 registers (16 half values)
        let mut frag = Vec::with_capacity(8);
        for _ in 0..8 {
            frag.push(self.registers.allocate_virtual(PtxType::B32));
        }
        // Build instruction with all 8 destination registers
        let mut instr = PtxInstruction::new(PtxOp::WmmaLoadA, PtxType::F16).label(format!(
            "m16n16k16.{}.f16.stride.{}",
            layout.to_ptx_string(),
            stride
        ));
        // Add all fragment registers as destinations (use push_dst for vector dests)
        for reg in &frag {
            instr = instr.push_dst(Operand::Reg(*reg));
        }
        // Source is address and stride immediate
        instr = instr.src(Operand::Reg(addr));
        instr = instr.src(Operand::ImmI64(i64::from(stride)));
        self.instructions.push(instr);
        frag
    }

    /// Load F16 matrix fragment B for WMMA (16x16x16 tile)
    pub fn wmma_load_b_f16(
        &mut self,
        addr: VirtualReg,
        stride: u32,
        layout: WmmaLayout,
    ) -> Vec<VirtualReg> {
        let mut frag = Vec::with_capacity(8);
        for _ in 0..8 {
            frag.push(self.registers.allocate_virtual(PtxType::B32));
        }
        // Build instruction with all 8 destination registers
        let mut instr = PtxInstruction::new(PtxOp::WmmaLoadB, PtxType::F16).label(format!(
            "m16n16k16.{}.f16.stride.{}",
            layout.to_ptx_string(),
            stride
        ));
        for reg in &frag {
            instr = instr.push_dst(Operand::Reg(*reg));
        }
        instr = instr.src(Operand::Reg(addr));
        instr = instr.src(Operand::ImmI64(i64::from(stride)));
        self.instructions.push(instr);
        frag
    }

    /// Load F32 accumulator fragment C for WMMA (16x16x16 tile)
    pub fn wmma_load_c_f32(
        &mut self,
        addr: VirtualReg,
        stride: u32,
        layout: WmmaLayout,
    ) -> Vec<VirtualReg> {
        // Accumulator is 8 F32 values
        let mut frag = Vec::with_capacity(8);
        for _ in 0..8 {
            frag.push(self.registers.allocate_virtual(PtxType::F32));
        }
        // Build instruction with all 8 destination registers
        let mut instr = PtxInstruction::new(PtxOp::WmmaLoadC, PtxType::F32).label(format!(
            "m16n16k16.{}.f32.stride.{}",
            layout.to_ptx_string(),
            stride
        ));
        for reg in &frag {
            instr = instr.push_dst(Operand::Reg(*reg));
        }
        instr = instr.src(Operand::Reg(addr));
        instr = instr.src(Operand::ImmI64(i64::from(stride)));
        self.instructions.push(instr);
        frag
    }

    /// Initialize F32 accumulator fragment C to zero (WAPR-PERF-010)
    /// This avoids loading from memory address 0, which is invalid.
    pub fn wmma_init_c_zero(&mut self) -> Vec<VirtualReg> {
        // Accumulator is 8 F32 values, initialize all to 0.0
        let mut frag = Vec::with_capacity(8);
        for _ in 0..8 {
            let reg = self.registers.allocate_virtual(PtxType::F32);
            self.instructions.push(
                PtxInstruction::new(PtxOp::Mov, PtxType::F32)
                    .dst(Operand::Reg(reg))
                    .src(Operand::ImmF32(0.0)),
            );
            frag.push(reg);
        }
        frag
    }

    /// WMMA matrix multiply-accumulate: D = A * B + C
    /// Takes A, B, C fragment registers and returns D fragment registers
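    ///
    /// Illustrative one-tile pipeline sketch (hypothetical addresses and strides;
    /// `WmmaLayout::Row`/`WmmaLayout::Col` are assumed variant names):
    /// ```text
    /// let a = k.wmma_load_a_f16(a_addr, 16, WmmaLayout::Row);
    /// let b = k.wmma_load_b_f16(b_addr, 16, WmmaLayout::Col);
    /// let c = k.wmma_init_c_zero();
    /// let d = k.wmma_mma_f16_f32(&a, &b, &c);
    /// k.wmma_store_d_f32(d_addr, &d, 16, WmmaLayout::Row);
    /// ```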
    #[allow(clippy::similar_names)]
    pub fn wmma_mma_f16_f32(
        &mut self,
        frag_a: &[VirtualReg],
        frag_b: &[VirtualReg],
        frag_c: &[VirtualReg],
    ) -> Vec<VirtualReg> {
        // Output accumulator D (8 F32 values)
        let mut frag_d = Vec::with_capacity(8);
        for _ in 0..8 {
            frag_d.push(self.registers.allocate_virtual(PtxType::F32));
        }

        // MMA instruction with all fragment registers
        // Format: wmma.mma.sync.aligned.m16n16k16.row.col.f32.f32 {d0-d7}, {a0-a7}, {b0-b7}, {c0-c7}
        let mut instr =
            PtxInstruction::new(PtxOp::WmmaMma, PtxType::F32).label("m16n16k16.row.col.f32.f32");

        // Add all D registers as destinations (use push_dst for vector dests)
        for reg in &frag_d {
            instr = instr.push_dst(Operand::Reg(*reg));
        }

        // Add all A, B, C fragment registers as sources (in order)
        for reg in frag_a {
            instr = instr.src(Operand::Reg(*reg));
        }
        for reg in frag_b {
            instr = instr.src(Operand::Reg(*reg));
        }
        for reg in frag_c {
            instr = instr.src(Operand::Reg(*reg));
        }

        self.instructions.push(instr);
        frag_d
    }

    /// Store F32 accumulator fragment D to memory
    pub fn wmma_store_d_f32(
        &mut self,
        addr: VirtualReg,
        frag_d: &[VirtualReg],
        stride: u32,
        layout: WmmaLayout,
    ) {
        if frag_d.is_empty() {
            return;
        }
        // Format: wmma.store.d.sync.aligned.m16n16k16.row.f32 [addr], {d0-d7}, stride
        let mut instr = PtxInstruction::new(PtxOp::WmmaStoreD, PtxType::F32).label(format!(
            "m16n16k16.{}.f32.stride.{}",
            layout.to_ptx_string(),
            stride
        ));
        // Address is first source
        instr = instr.src(Operand::Reg(addr));
        // All fragment registers
        for reg in frag_d {
            instr = instr.src(Operand::Reg(*reg));
        }
        // Stride
        instr = instr.src(Operand::ImmI64(i64::from(stride)));
        self.instructions.push(instr);
    }

    /// Convert F32 values to F16 (for feeding tensor cores)
    pub fn cvt_f16_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F16);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::F16)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .rounding(RoundingMode::Rn),
        );
        dst
    }

    /// Convert F16 value to F32 (for accumulation)
    pub fn cvt_f32_f16(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Load F16 from global memory
    ///
    /// NOTE: PTX uses `.b16` (binary 16-bit) for half-precision loads,
    /// not `.f16`. The loaded value is still interpreted as f16 for
    /// subsequent operations.
    pub fn ld_global_f16(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F16);
        self.instructions.push(
            // PTX requires ld.global.b16, not ld.global.f16
            PtxInstruction::new(PtxOp::Ld, PtxType::B16)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr))
                .space(PtxStateSpace::Global),
        );
        dst
    }

    /// Store F16 to global memory
    ///
    /// PTX uses `.b16` (binary 16-bit) for half-precision stores, not `.f16`.
    /// The PTX ISA does not support `st.global.f16` — only `.b16` for 16-bit stores.
    /// This matches `ld_global_f16` which already uses `PtxType::B16`.
    pub fn st_global_f16(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::B16)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val))
                .space(PtxStateSpace::Global),
        );
    }

    // =========================================================================
    // COALESCED GEMV SUPPORT - DECODER THROUGHPUT SPEC §5.3
    // =========================================================================

    /// Multiply low u32 (register * register -> u32)
    ///
    /// Unlike mul_wide, this keeps only the low 32 bits of the result.
    /// Used for computing block offsets: col_base = block_id * block_size
    pub fn mul_lo_u32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    // =========================================================================
    // PAR-063-V4: Additional ops for Q8 quantization kernels
    // =========================================================================

    /// Signed 32-bit multiply (low 32 bits of result)
    pub fn mul_lo_s32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mul, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Absolute value of f32
    pub fn abs_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Abs, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Minimum of two signed 32-bit integers
    pub fn min_s32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Min, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Maximum of two signed 32-bit integers
    pub fn max_s32(&mut self, a: VirtualReg, b: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Max, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(a))
                .src(Operand::Reg(b)),
        );
        dst
    }

    /// Convert f32 to s32 with round-to-nearest-integer
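    ///
    /// Illustrative Q8 quantize-and-clamp sketch using only methods from this
    /// file (hypothetical registers; computes q = clamp(round(x * inv_scale), -127, 127)):
    /// ```text
    /// k.mul_f32_inplace(x, inv_scale);
    /// let q  = k.cvt_rni_s32_f32(x);
    /// let lo = k.mov_s32_imm(-127);
    /// let hi = k.mov_s32_imm(127);
    /// let q  = k.max_s32(q, lo);
    /// let q  = k.min_s32(q, hi);
    /// let b  = k.cvt_u8_s32(q); // low 8 bits hold the signed value
    /// ```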
    pub fn cvt_rni_s32_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val))
                .rounding(RoundingMode::Rni),
        );
        dst
    }

    /// Move immediate to s32 register
    pub fn mov_s32_imm(&mut self, val: i32) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmI64(i64::from(val))),
        );
        dst
    }

    /// Reinterpret u32 bits as s32 (emitted as a plain `mov`; the bits are
    /// unchanged, only the register type differs)
    pub fn mov_s32_from_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert s32 to u8 (truncate to low 8 bits)
    pub fn cvt_u8_s32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U8);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::U8)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert u8 to s32 with sign extension
    pub fn cvt_s32_u8_sx(&mut self, val: VirtualReg) -> VirtualReg {
        // For sign extension, we treat the u8 as s8 and extend to s32
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Convert u32 to s32 (reinterpret bits)
    pub fn cvt_s32_u32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::S32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::S32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Reciprocal approximation (1/x)
    pub fn rcp_f32(&mut self, val: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Rcp, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(val)),
        );
        dst
    }

    /// Move u32 immediate into existing register
    pub fn mov_u32_inplace(&mut self, dst: VirtualReg, val: u32) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmU64(u64::from(val))),
        );
    }

    /// Get base address of shared memory array 'smem' as generic address
    ///
    /// Returns a u64 pointer to the beginning of the shared memory region
    /// declared by `.shared .align 16 .b8 smem[N]`.
    ///
    /// NOTE: This returns a GENERIC address (via a `cvta` shared-to-generic
    /// conversion). Use it with ld/st WITHOUT a state space (generic addressing);
    /// WMMA operations in particular require generic pointers.
    ///
    /// For shared-space ld.shared/st.shared, use `shared_base_addr_local()` instead.
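    ///
    /// Illustrative sketch (hypothetical `offset_u32` register):
    /// ```text
    /// let base  = k.shared_base_addr();
    /// let off64 = k.cvt_u64_u32(offset_u32);
    /// let addr  = k.add_u64(base, off64);
    /// let v     = k.ld_generic_f32(addr); // generic load, no .shared
    /// ```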
    pub fn shared_base_addr(&mut self) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        // Convert the `smem` label to a generic address. In PTX the
        // shared-to-generic direction is spelled `cvta.shared.u64 %rd, smem;`
        // (`cvta.to.shared` converts the opposite way, generic to shared).
        // A generic pointer is REQUIRED for WMMA operations.
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvta, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Label("smem".to_string()))
                .space(PtxStateSpace::Shared),
        );
        dst
    }

    /// Load u32 from generic address (unified address space)
    ///
    /// Use this after `shared_base_addr()` + offset computation for shared memory.
    /// Generic addressing allows the hardware to resolve the actual memory space.
    pub fn ld_generic_u32(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
            // No .space() means generic addressing
        );
        dst
    }

    /// Load u32 from generic address into existing register (register reuse)
    pub fn ld_generic_u32_into(&mut self, dst: VirtualReg, addr: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
        );
    }

    /// Store u32 to generic address (unified address space)
    ///
    /// Use this after `shared_base_addr()` + offset computation for shared memory.
    /// Generic addressing allows the hardware to resolve the actual memory space.
    pub fn st_generic_u32(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U32)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val)),
            // No .space() means generic addressing
        );
    }

    /// Load u64 from generic address (unified address space)
    pub fn ld_generic_u64(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U64);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U64)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
        );
        dst
    }

    /// Store u64 to generic address (unified address space)
    pub fn st_generic_u64(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U64)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val)),
        );
    }

    /// Load u8 from generic address (unified address space)
    ///
    /// Use this for byte-level operations on shared memory.
    /// Returns value in a U16 register (PTX minimum register size).
    pub fn ld_generic_u8(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U16);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U8)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
            // No .space() means generic addressing
        );
        dst
    }

    /// Store u8 to generic address (unified address space)
    ///
    /// Use this for byte-level writes to shared memory.
    /// Source should be in a U16 or U32 register (low 8 bits stored).
    pub fn st_generic_u8(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U8)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val)),
            // No .space() means generic addressing
        );
    }

    /// Load u16 from generic address (unified address space)
    ///
    /// Use this for 16-bit operations on shared memory (e.g., hash table entries).
    pub fn ld_generic_u16(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::U16);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::U16)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
            // No .space() means generic addressing
        );
        dst
    }

    /// Store u16 to generic address (unified address space)
    ///
    /// Use this for 16-bit writes to shared memory (e.g., hash table entries).
    pub fn st_generic_u16(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::U16)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val)),
            // No .space() means generic addressing
        );
    }

    /// Load f32 from generic address (unified address space)
    ///
    /// Use this after `shared_base_addr()` + offset computation for shared memory.
    /// Generic addressing allows the hardware to resolve the actual memory space.
    pub fn ld_generic_f32(&mut self, addr: VirtualReg) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
            // No .space() means generic addressing
        );
        dst
    }

    /// Store f32 to generic address (unified address space)
    ///
    /// Use this after `shared_base_addr()` + offset computation for shared memory.
    /// Generic addressing allows the hardware to resolve the actual memory space.
    pub fn st_generic_f32(&mut self, addr: VirtualReg, val: VirtualReg) {
        self.instructions.push(
            PtxInstruction::new(PtxOp::St, PtxType::F32)
                .src(Operand::Reg(addr))
                .src(Operand::Reg(val)),
            // No .space() means generic addressing
        );
    }

    /// Predicated load f32 from global memory with default value
    ///
    /// If predicate is true: loads value from addr
    /// If predicate is false: returns default_val (no memory access)
    ///
    /// Implementation:
    /// ```ptx
    /// mov.f32 %dst, default_val;     // Initialize with default
    /// @pred ld.global.f32 %dst, [addr];  // Conditional load
    /// ```
    ///
    /// Used for bounds-checked loads in GEMV:
    /// ```text
    /// let valid = setp_lt_u32(idx, n);
    /// let val = ld_global_f32_predicated(addr, valid, 0.0);
    /// ```
    pub fn ld_global_f32_predicated(
        &mut self,
        addr: VirtualReg,
        pred: VirtualReg,
        default_val: f32,
    ) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);

        // 1. Initialize with default value
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmF32(default_val)),
        );

        // 2. Predicated load - only executes if pred is true
        // If pred is false, dst keeps the default value
        let predicate = Predicate {
            reg: pred,
            negated: false,
        };

        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::F32)
                .space(PtxStateSpace::Global)
                .predicated(predicate)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(addr)),
        );

        dst
    }

    /// PAR-028: Load F16 from global memory with predicate guard
    ///
    /// If predicate is true: loads from addr, converts to F32
    /// If predicate is false: returns 0.0 (no memory access)
    ///
    /// Implementation:
    /// ```ptx
    /// mov.f32 %dst, 0.0;                  // Initialize with default
    /// @pred ld.global.b16 %tmp, [addr];   // Conditional load F16
    /// @pred cvt.f32.f16 %dst, %tmp;       // Conditional convert to F32
    /// ```
    ///
    /// Used for FP16 KV cache in attention kernels:
    /// ```text
    /// let valid = setp_lt_u32(idx, head_dim);
    /// let k_val = ld_global_f16_to_f32_predicated(addr, valid);
    /// ```
    pub fn ld_global_f16_to_f32_predicated(
        &mut self,
        addr: VirtualReg,
        pred: VirtualReg,
    ) -> VirtualReg {
        let dst = self.registers.allocate_virtual(PtxType::F32);
        let tmp = self.registers.allocate_virtual(PtxType::F16);

        // 1. Initialize with default value (0.0)
        self.instructions.push(
            PtxInstruction::new(PtxOp::Mov, PtxType::F32)
                .dst(Operand::Reg(dst))
                .src(Operand::ImmF32(0.0)),
        );

        // 2. Predicated F16 load - only executes if pred is true
        let predicate = Predicate {
            reg: pred,
            negated: false,
        };

        // Load F16 (using .b16 as PTX requires)
        self.instructions.push(
            PtxInstruction::new(PtxOp::Ld, PtxType::B16)
                .space(PtxStateSpace::Global)
                .predicated(predicate.clone())
                .dst(Operand::Reg(tmp))
                .src(Operand::Reg(addr)),
        );

        // 3. Predicated convert F16 to F32
        self.instructions.push(
            PtxInstruction::new(PtxOp::Cvt, PtxType::F32)
                .predicated(predicate)
                .dst(Operand::Reg(dst))
                .src(Operand::Reg(tmp)),
        );

        dst
    }
}

// Tests (~3K lines extracted for TDG compliance)
#[cfg(test)]
mod tests;