kaio-core 0.5.0

//! PTX module — the top-level IR container.

use std::fmt;

use super::instruction::PtxInstruction;
use super::kernel::PtxKernel;
use crate::instr::{MemoryOp, TensorCoreOp};
use crate::types::PtxType;

/// A complete PTX module: version/target metadata plus its kernels.
///
/// Corresponds to a single `.ptx` file — a header followed by the
/// module's `.entry` kernel definitions. A module built with
/// [`PtxModule::new`] starts with no kernels; add them with
/// [`PtxModule::add_kernel`].
///
/// All fields are public, so callers may adjust the defaults chosen by
/// [`PtxModule::new`] after construction.
#[derive(Debug, Clone)]
pub struct PtxModule {
    /// PTX ISA version (e.g. `"7.8"`). [`PtxModule::new`] sets `"8.7"`.
    pub version: String,
    /// Target SM architecture (e.g. `"sm_89"`).
    ///
    /// Stored verbatim. [`PtxModule::validate`] only applies its
    /// SM-capability check when this has the plain `sm_NN` form.
    pub target: String,
    /// Address size in bits (32 or 64). [`PtxModule::new`] sets `64`.
    pub address_size: u32,
    /// Kernel definitions in this module, in insertion order.
    pub kernels: Vec<PtxKernel>,
}

impl PtxModule {
    /// Build an empty module for the given target SM architecture string
    /// (e.g. `"sm_89"`).
    ///
    /// The remaining fields take fixed defaults: PTX version `8.7`
    /// (CUDA 12.8), address size `64`, and no kernels. The target string
    /// is stored verbatim; it is only interpreted later, by
    /// [`validate`](Self::validate).
    pub fn new(target: &str) -> Self {
        Self {
            version: String::from("8.7"),
            target: target.to_owned(),
            address_size: 64,
            kernels: Vec::new(),
        }
    }

    /// Append `kernel` to the end of this module's kernel list.
    pub fn add_kernel(&mut self, kernel: PtxKernel) {
        self.kernels.push(kernel);
    }

    /// Extract the numeric SM version from the target string
    /// (`"sm_89"` → `89`).
    ///
    /// Returns `None` when the target does not start with `sm_` or when
    /// the remainder is not a plain `u32` — so `"compute_90a"` and
    /// suffixed forms such as `"sm_90a"` both yield `None`.
    /// [`validate`](Self::validate) treats `None` as "skip the SM check":
    /// unusual or custom target strings are let through rather than
    /// blocked.
    fn parse_sm_target(&self) -> Option<u32> {
        let digits = self.target.strip_prefix("sm_")?;
        digits.parse().ok()
    }

    /// Check every instruction of every kernel, in order, and report the
    /// first problem found.
    ///
    /// Two checks run per instruction:
    ///
    /// 1. **Target-agnostic tensor-core checks** — dtype-routing and
    ///    register-class rules (see `validate_tensor_core_op`). These run
    ///    whether or not the target string parses.
    /// 2. **Target-capability check** — a feature with a minimum SM
    ///    requirement (tensor-core ops, each reporting its own minimum,
    ///    and the `cp.async` family at SM 8.0) must not exceed the
    ///    module's declared target. Skipped entirely when the target is
    ///    not of the `sm_NN` form.
    ///
    /// This is not a semantic or dataflow pass. The goal is to surface
    /// clean, named errors at emit-time instead of cryptic ptxas messages
    /// downstream.
    ///
    /// # Errors
    ///
    /// - [`ValidationError::MmaSyncBf16Rejected`] or
    ///   [`ValidationError::LdMatrixBadRegType`] from the target-agnostic
    ///   checks.
    /// - [`ValidationError::SmTooLow`] when a feature needs a higher SM
    ///   than the declared target.
    ///
    /// For a given instruction the target-agnostic checks are evaluated
    /// before the SM check.
    pub fn validate(&self) -> Result<(), ValidationError> {
        let declared_sm = self.parse_sm_target();

        for kernel in &self.kernels {
            for instr in &kernel.body {
                // Shape/dtype routing rules apply on every target.
                if let PtxInstruction::TensorCore(op) = instr {
                    validate_tensor_core_op(op)?;
                }

                // No parseable SM version: nothing to compare against.
                let Some(actual) = declared_sm else {
                    continue;
                };
                // Instruction carries no SM requirement.
                let Some((required, feature)) = instruction_sm_requirement(instr) else {
                    continue;
                };
                if actual < required {
                    return Err(ValidationError::SmTooLow {
                        required,
                        actual,
                        feature,
                    });
                }
            }
        }
        Ok(())
    }
}

/// Per-instruction target-agnostic IR validation for tensor-core ops.
///
/// Rejects bf16 dtype tags on the generic [`TensorCoreOp::MmaSync`]
/// variant — bf16 emission must go through [`TensorCoreOp::MmaSyncBf16`] so
/// the fragment types and instruction dtype stay aligned at the IR boundary.
///
/// Also rejects mis-typed registers on [`TensorCoreOp::LdMatrix`]: unlike
/// the mma variants, whose operands are typed fragment wrappers (allocated
/// with the correct register class by construction), `LdMatrix` carries raw
/// [`Register`](crate::ir::Register)s — so the `.b32`-class requirement
/// (`PtxType::U32`, the `alloc_packed_half2` packed-pair convention) is
/// enforced here, surfacing a named error at module load instead of a
/// cryptic ptxas failure at JIT time.
fn validate_tensor_core_op(op: &TensorCoreOp) -> Result<(), ValidationError> {
    match op {
        TensorCoreOp::MmaSync { a_ty, b_ty, .. } => {
            if *a_ty == PtxType::BF16 {
                return Err(ValidationError::MmaSyncBf16Rejected { operand: "a_ty" });
            }
            if *b_ty == PtxType::BF16 {
                return Err(ValidationError::MmaSyncBf16Rejected { operand: "b_ty" });
            }
        }
        TensorCoreOp::LdMatrix { dst, addr, .. } => {
            for reg in dst.regs() {
                if reg.ptx_type != PtxType::U32 {
                    return Err(ValidationError::LdMatrixBadRegType {
                        operand: "dst",
                        found: reg.ptx_type,
                    });
                }
            }
            if addr.ptx_type != PtxType::U32 {
                return Err(ValidationError::LdMatrixBadRegType {
                    operand: "addr",
                    found: addr.ptx_type,
                });
            }
        }
        _ => {}
    }
    Ok(())
}

/// Look up the minimum SM version an instruction needs, if any.
///
/// Returns `Some((min_sm, feature_label))` for instructions that carry an
/// SM requirement and `None` for SM-agnostic ones:
///
/// - tensor-core ops report their own minimum and label via
///   `min_sm()` / `feature_label()`;
/// - the `cp.async` copy, commit-group and wait-group memory ops all
///   require SM 80 and share the label `"cp.async"`.
fn instruction_sm_requirement(instr: &PtxInstruction) -> Option<(u32, String)> {
    // Every cp.async form handled here shares one minimum tier.
    const CP_ASYNC_MIN_SM: u32 = 80;

    let requirement = match instr {
        PtxInstruction::TensorCore(op) => (op.min_sm(), op.feature_label()),
        PtxInstruction::Memory(mem_op)
            if matches!(
                mem_op,
                MemoryOp::CpAsyncCaSharedGlobal { .. }
                    | MemoryOp::CpAsyncCommitGroup
                    | MemoryOp::CpAsyncWaitGroup { .. }
            ) =>
        {
            (CP_ASYNC_MIN_SM, String::from("cp.async"))
        }
        _ => return None,
    };
    Some(requirement)
}

/// Errors returned by [`PtxModule::validate`].
///
/// Scope is intentionally narrow: one target-capability error
/// ([`SmTooLow`](Self::SmTooLow)) plus two target-agnostic, per-instruction
/// IR checks on tensor-core ops
/// ([`MmaSyncBf16Rejected`](Self::MmaSyncBf16Rejected),
/// [`LdMatrixBadRegType`](Self::LdMatrixBadRegType)). No semantic or
/// dataflow analysis is performed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
    /// A feature used by the module requires a higher SM target than
    /// the module declares.
    ///
    /// Example: a kernel containing `mma.sync.m16n8k16` in a module
    /// with `.target sm_70` would yield
    /// `required: 80, actual: 70, feature: "mma.sync.m16n8k16"`.
    ///
    /// Only produced when the module's target string has the plain
    /// `sm_NN` form; other targets skip this check.
    SmTooLow {
        /// Minimum SM version required by the offending feature.
        required: u32,
        /// SM version parsed from the module's target string.
        actual: u32,
        /// Human-readable name of the offending feature.
        feature: String,
    },
    /// A [`TensorCoreOp::MmaSync`] instruction was constructed with
    /// `PtxType::BF16` on `a_ty` or `b_ty`. Bf16 emission must use the
    /// dedicated [`TensorCoreOp::MmaSyncBf16`] variant so fragment types
    /// and instruction dtype stay aligned at the IR boundary.
    ///
    /// Target-agnostic: reported regardless of the module's target string.
    ///
    /// Introduced in Sprint 9.1 cleanup; closes the legacy hole where the
    /// generic `MmaSync` path silently emitted a bf16 instruction from
    /// `FragmentA_F16` / `FragmentB_F16` operands.
    MmaSyncBf16Rejected {
        /// Which operand carried the rejected dtype tag (`"a_ty"` or
        /// `"b_ty"`). When both are bf16, `"a_ty"` is reported.
        operand: &'static str,
    },
    /// A [`TensorCoreOp::LdMatrix`] instruction carries a register whose
    /// declared type is not `PtxType::U32` (`.b32` class). The mma
    /// variants get this for free from their typed fragment wrappers;
    /// `LdMatrix` takes raw registers, so the check lives here.
    ///
    /// Destination registers hold packed 16-bit pairs
    /// ([`alloc_packed_half2`](crate::ir::RegisterAllocator::alloc_packed_half2)
    /// convention) and the address register is a shared-space `.u32`
    /// byte address. Introduced in Sprint 9.3.
    LdMatrixBadRegType {
        /// Which operand carried the rejected register (`"dst"` or
        /// `"addr"`). Destination registers are checked before the
        /// address register.
        operand: &'static str,
        /// The register's declared PTX type.
        found: PtxType,
    },
}

impl fmt::Display for ValidationError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::SmTooLow {
                required,
                actual,
                feature,
            } => {
                write!(
                    f,
                    "{feature} requires sm_{required}+, target is sm_{actual}"
                )
            }
            Self::MmaSyncBf16Rejected { operand } => {
                write!(
                    f,
                    "TensorCoreOp::MmaSync with PtxType::BF16 on {operand} is rejected; use TensorCoreOp::MmaSyncBf16 for bf16 emission"
                )
            }
            Self::LdMatrixBadRegType { operand, found } => {
                write!(
                    f,
                    "TensorCoreOp::LdMatrix {operand} register must be PtxType::U32 (.b32 packed-pair convention, see alloc_packed_half2), found {found:?}"
                )
            }
        }
    }
}

// Marker impl: every `Error` method keeps its default, so these errors
// report no underlying `source()`; the `Display` text is the full message.
impl std::error::Error for ValidationError {}

// Unit tests for `PtxModule::validate` and `PtxModule::parse_sm_target`.
//
// Layout: small kernel-building helpers, each followed by the tests that
// use them. Each helper kernel contains exactly one instruction, so every
// expected error maps to a single known instruction.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::fragment::{alloc_a_f16, alloc_b_f16, alloc_c};
    use crate::instr::{MemoryOp, MmaShape, TensorCoreOp};
    use crate::ir::{PtxInstruction, PtxKernel, Register, RegisterAllocator};
    use crate::types::{PtxType, RegKind};

    // Build a `Register` literal directly from its three fields, for tests
    // that need a specific kind/index/type without going through an
    // allocator.
    fn reg(kind: RegKind, index: u32, ptx_type: PtxType) -> Register {
        Register {
            kind,
            index,
            ptx_type,
        }
    }

    // Kernel "has_mma": a single generic `MmaSync` (shape M16N8K16) with
    // f16 A/B tags and f32 C/D tags — a well-formed op whose only possible
    // validation failure is the SM requirement.
    fn tc_kernel() -> PtxKernel {
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("has_mma");
        k.push(PtxInstruction::TensorCore(TensorCoreOp::MmaSync {
            d: alloc_c(&mut alloc),
            a: alloc_a_f16(&mut alloc),
            b: alloc_b_f16(&mut alloc),
            c: alloc_c(&mut alloc),
            shape: MmaShape::M16N8K16,
            d_ty: PtxType::F32,
            a_ty: PtxType::F16,
            b_ty: PtxType::F16,
            c_ty: PtxType::F32,
        }));
        k
    }

    // mma.sync below its minimum tier: checks both the structured error
    // and its rendered message.
    #[test]
    fn validate_rejects_mma_on_sm_70() {
        let mut module = PtxModule::new("sm_70");
        module.add_kernel(tc_kernel());
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::SmTooLow {
                required: 80,
                actual: 70,
                feature: "mma.sync.m16n8k16".to_string(),
            }
        );
        assert_eq!(
            err.to_string(),
            "mma.sync.m16n8k16 requires sm_80+, target is sm_70"
        );
    }

    // Boundary case: target exactly equal to the requirement passes.
    #[test]
    fn validate_accepts_mma_on_sm_80() {
        let mut module = PtxModule::new("sm_80");
        module.add_kernel(tc_kernel());
        assert!(module.validate().is_ok());
    }

    // Target above the requirement passes.
    #[test]
    fn validate_accepts_mma_on_sm_89() {
        let mut module = PtxModule::new("sm_89");
        module.add_kernel(tc_kernel());
        assert!(module.validate().is_ok());
    }

    // Kernel "has_mma_int8": a single `MmaSyncInt8` built from the
    // M16N8K32 fragment allocators.
    fn tc_int8_kernel() -> PtxKernel {
        use crate::fragment::{alloc_a_M16N8K32, alloc_b_M16N8K32, alloc_c_M16N8K32};
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("has_mma_int8");
        k.push(PtxInstruction::TensorCore(TensorCoreOp::MmaSyncInt8 {
            d: alloc_c_M16N8K32(&mut alloc),
            a: alloc_a_M16N8K32(&mut alloc),
            b: alloc_b_M16N8K32(&mut alloc),
            c: alloc_c_M16N8K32(&mut alloc),
        }));
        k
    }

    // The int8 mma variant is gated at the same tier as the f16 one, but
    // reports its own feature label.
    #[test]
    fn validate_rejects_mma_int8_on_sm_70() {
        let mut module = PtxModule::new("sm_70");
        module.add_kernel(tc_int8_kernel());
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::SmTooLow {
                required: 80,
                actual: 70,
                feature: "mma.sync.m16n8k32.s8.s8.s32".to_string(),
            }
        );
        assert_eq!(
            err.to_string(),
            "mma.sync.m16n8k32.s8.s8.s32 requires sm_80+, target is sm_70"
        );
    }

    // Boundary case for the int8 variant.
    #[test]
    fn validate_accepts_mma_int8_on_sm_80() {
        let mut module = PtxModule::new("sm_80");
        module.add_kernel(tc_int8_kernel());
        assert!(module.validate().is_ok());
    }

    // Above-requirement case for the int8 variant.
    #[test]
    fn validate_accepts_mma_int8_on_sm_89() {
        let mut module = PtxModule::new("sm_89");
        module.add_kernel(tc_int8_kernel());
        assert!(module.validate().is_ok());
    }

    // Kernel "has_ldmatrix": a single well-formed, non-transposed
    // `LdMatrix` with four packed-half2 destination registers and a `U32`
    // address register.
    fn ldmatrix_kernel() -> PtxKernel {
        use crate::instr::LdMatrixDst;
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("has_ldmatrix");
        k.push(PtxInstruction::TensorCore(TensorCoreOp::LdMatrix {
            dst: LdMatrixDst::X4([
                alloc.alloc_packed_half2(),
                alloc.alloc_packed_half2(),
                alloc.alloc_packed_half2(),
                alloc.alloc_packed_half2(),
            ]),
            addr: alloc.alloc(PtxType::U32),
            trans: false,
        }));
        k
    }

    // ldmatrix is the first sub-80 TensorCore instruction; these tests
    // protect the shared validation path for the new 75 tier
    // (Sprint 9.3): sm_75 accepts ldmatrix, sm_70 still rejects it, and
    // mma stays gated at 80 even in a module whose ldmatrix is fine.
    #[test]
    fn validate_accepts_ldmatrix_on_sm_75() {
        let mut module = PtxModule::new("sm_75");
        module.add_kernel(ldmatrix_kernel());
        assert!(module.validate().is_ok());
    }

    // Below the 75 tier: checks the structured error and its message.
    #[test]
    fn validate_rejects_ldmatrix_on_sm_70() {
        let mut module = PtxModule::new("sm_70");
        module.add_kernel(ldmatrix_kernel());
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::SmTooLow {
                required: 75,
                actual: 70,
                feature: "ldmatrix.m8n8.x4".to_string(),
            }
        );
        assert_eq!(
            err.to_string(),
            "ldmatrix.m8n8.x4 requires sm_75+, target is sm_70"
        );
    }

    // Two kernels in one module: the first (ldmatrix) passes at sm_75, so
    // the reported error must come from the second (mma) kernel.
    #[test]
    fn validate_rejects_mma_at_sm_75_even_with_ldmatrix_present() {
        let mut module = PtxModule::new("sm_75");
        module.add_kernel(ldmatrix_kernel());
        module.add_kernel(tc_kernel());
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::SmTooLow {
                required: 80,
                actual: 75,
                feature: "mma.sync.m16n8k16".to_string(),
            }
        );
    }

    // Register-class check on a destination register. The target (sm_80)
    // satisfies the SM requirement, so only the register check can fail.
    #[test]
    fn validate_rejects_ldmatrix_bad_dst_reg_type() {
        use crate::instr::LdMatrixDst;
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("bad_ldmatrix_dst");
        // Third dst register is an .f32 — not the .b32 packed-pair class.
        k.push(PtxInstruction::TensorCore(TensorCoreOp::LdMatrix {
            dst: LdMatrixDst::X4([
                alloc.alloc_packed_half2(),
                alloc.alloc_packed_half2(),
                alloc.alloc(PtxType::F32),
                alloc.alloc_packed_half2(),
            ]),
            addr: alloc.alloc(PtxType::U32),
            trans: false,
        }));
        let mut module = PtxModule::new("sm_80");
        module.add_kernel(k);
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::LdMatrixBadRegType {
                operand: "dst",
                found: PtxType::F32,
            }
        );
    }

    // Register-class check on the address register, using the X2 form
    // with `trans: true`. Also pins that the message mentions
    // `alloc_packed_half2`.
    #[test]
    fn validate_rejects_ldmatrix_bad_addr_reg_type() {
        use crate::instr::LdMatrixDst;
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("bad_ldmatrix_addr");
        // Shared addresses are 32-bit byte offsets in this IR — a .u64
        // address register is a wiring bug.
        k.push(PtxInstruction::TensorCore(TensorCoreOp::LdMatrix {
            dst: LdMatrixDst::X2([alloc.alloc_packed_half2(), alloc.alloc_packed_half2()]),
            addr: alloc.alloc(PtxType::U64),
            trans: true,
        }));
        let mut module = PtxModule::new("sm_80");
        module.add_kernel(k);
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::LdMatrixBadRegType {
                operand: "addr",
                found: PtxType::U64,
            }
        );
        assert!(err.to_string().contains("alloc_packed_half2"));
    }

    // cp.async is gated at 80 even on sm_75, a tier that does accept
    // ldmatrix.
    // NOTE(review): the `new_cp_async_ca` arguments look like (shared
    // address, global address, byte count) — confirm against `MemoryOp`.
    #[test]
    fn validate_rejects_cp_async_on_sm_75() {
        let mut module = PtxModule::new("sm_75");
        let mut k = PtxKernel::new("has_cp_async");
        k.push(PtxInstruction::Memory(MemoryOp::new_cp_async_ca(
            reg(RegKind::R, 0, PtxType::U32),
            reg(RegKind::Rd, 0, PtxType::U64),
            16,
        )));
        module.add_kernel(k);
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::SmTooLow {
                required: 80,
                actual: 75,
                feature: "cp.async".to_string(),
            }
        );
    }

    #[test]
    fn validate_accepts_scalar_kernel_on_sm_70() {
        // A module with no tensor-core or cp.async features should pass
        // validation even on sm_70.
        let mut module = PtxModule::new("sm_70");
        let k = PtxKernel::new("scalar_only");
        module.add_kernel(k);
        assert!(module.validate().is_ok());
    }

    #[test]
    fn validate_skips_unparseable_target() {
        // Don't block weird custom targets.
        let mut module = PtxModule::new("compute_90a");
        module.add_kernel(tc_kernel());
        assert!(module.validate().is_ok());
    }

    // Direct coverage of the private parser: two plain `sm_NN` targets
    // and one string without the `sm_` prefix.
    #[test]
    fn parse_sm_target() {
        let m = PtxModule::new("sm_89");
        assert_eq!(m.parse_sm_target(), Some(89));
        let m2 = PtxModule::new("sm_80");
        assert_eq!(m2.parse_sm_target(), Some(80));
        let m3 = PtxModule::new("compute_90a");
        assert_eq!(m3.parse_sm_target(), None);
    }

    // Kernel "legacy_bf16_on_mma_sync": a generic `MmaSync` whose A/B
    // dtype tags are both bf16 while its fragments come from the f16
    // allocators — the mismatch the validator must reject.
    fn mma_sync_with_bf16_tags() -> PtxKernel {
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("legacy_bf16_on_mma_sync");
        k.push(PtxInstruction::TensorCore(TensorCoreOp::MmaSync {
            d: alloc_c(&mut alloc),
            a: alloc_a_f16(&mut alloc),
            b: alloc_b_f16(&mut alloc),
            c: alloc_c(&mut alloc),
            shape: MmaShape::M16N8K16,
            d_ty: PtxType::F32,
            a_ty: PtxType::BF16,
            b_ty: PtxType::BF16,
            c_ty: PtxType::F32,
        }));
        k
    }

    // With both tags bf16, `a_ty` is the operand reported. Also pins the
    // full rendered message.
    #[test]
    fn validate_rejects_mma_sync_bf16_a_ty() {
        let mut module = PtxModule::new("sm_89");
        module.add_kernel(mma_sync_with_bf16_tags());
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::MmaSyncBf16Rejected { operand: "a_ty" }
        );
        assert_eq!(
            err.to_string(),
            "TensorCoreOp::MmaSync with PtxType::BF16 on a_ty is rejected; \
             use TensorCoreOp::MmaSyncBf16 for bf16 emission"
        );
    }

    #[test]
    fn validate_rejects_mma_sync_bf16_b_ty_only() {
        // Mixed: a_ty F16 + b_ty BF16 still rejected.
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("mixed_bf16_b_only");
        k.push(PtxInstruction::TensorCore(TensorCoreOp::MmaSync {
            d: alloc_c(&mut alloc),
            a: alloc_a_f16(&mut alloc),
            b: alloc_b_f16(&mut alloc),
            c: alloc_c(&mut alloc),
            shape: MmaShape::M16N8K16,
            d_ty: PtxType::F32,
            a_ty: PtxType::F16,
            b_ty: PtxType::BF16,
            c_ty: PtxType::F32,
        }));
        let mut module = PtxModule::new("sm_89");
        module.add_kernel(k);
        let err = module.validate().unwrap_err();
        assert_eq!(
            err,
            ValidationError::MmaSyncBf16Rejected { operand: "b_ty" }
        );
    }

    #[test]
    fn validate_rejects_mma_sync_bf16_even_on_unparseable_target() {
        // The dtype-routing check is target-agnostic — it fires regardless
        // of whether the target string is `sm_NN`.
        let mut module = PtxModule::new("compute_90a");
        module.add_kernel(mma_sync_with_bf16_tags());
        assert_eq!(
            module.validate().unwrap_err(),
            ValidationError::MmaSyncBf16Rejected { operand: "a_ty" }
        );
    }

    // Kernel "native_bf16": a single `MmaSyncBf16` built from the bf16
    // fragment allocators — the supported route for bf16.
    fn mma_sync_bf16_kernel() -> PtxKernel {
        use crate::fragment::{alloc_a_bf16, alloc_b_bf16};
        let mut alloc = RegisterAllocator::new();
        let mut k = PtxKernel::new("native_bf16");
        k.push(PtxInstruction::TensorCore(TensorCoreOp::MmaSyncBf16 {
            d: alloc_c(&mut alloc),
            a: alloc_a_bf16(&mut alloc),
            b: alloc_b_bf16(&mut alloc),
            c: alloc_c(&mut alloc),
        }));
        k
    }

    // The dedicated bf16 variant must not trip the bf16 rejection that
    // applies to the generic `MmaSync`.
    #[test]
    fn validate_accepts_mma_sync_bf16_dedicated_variant() {
        let mut module = PtxModule::new("sm_89");
        module.add_kernel(mma_sync_bf16_kernel());
        assert!(module.validate().is_ok());
    }
}