vyre-emit-ptx 0.6.1

PTX text emitter for vyre KernelDescriptor. Produces NVRTC-compatible CUDA assembly.
Documentation
#![allow(
    clippy::doc_lazy_continuation,
    clippy::double_must_use,
    clippy::manual_div_ceil,
    clippy::needless_range_loop,
    clippy::collapsible_if,
    clippy::match_like_matches_macro,
    clippy::redundant_closure,
    clippy::too_many_arguments,
    clippy::nonminimal_bool,
    clippy::derivable_impls
)]
//! PTX text emitter for vyre `KernelDescriptor`.
//!
//! Consumes a substrate-neutral `vyre_lower::KernelDescriptor` and
//! produces NVRTC-compatible PTX assembly text. The emitter owns only
//! PTX construction; descriptor shaping and substrate-neutral
//! analyses stay in `vyre-lower`.
//!
//! ## Op coverage
//!
//! Mirrors `vyre-emit-naga` for parity:
//! - `Literal` (U32, I32, F32, Bool)
//! - `LocalInvocationId` / `GlobalInvocationId` / `WorkgroupId` (axis 0/1/2)
//! - `LoadGlobal` / `StoreGlobal` (scalar U32/I32/F32/Bool, plus packed
//!   `v2`/`v4` U32/I32/F32 chains when the descriptor presents unit-stride
//!   adjacent accesses)
//! - `BinOpKind` for the common arithmetic/logic set
//! - `UnOpKind` for Negate / LogicalNot / BitNot
//! - `Cast` between scalar types
//! - `Select`, `Fma`
//! - `StructuredIfThen`, `StructuredIfThenElse`, `StructuredBlock`,
//!   `Region`, `Return`, workgroup-scope `Barrier`
//!
//! Out of scope (returns `EmitError::UnsupportedOp` or
//! `EmitError::InvalidDescriptor`): indirect-dispatch (host concern),
//! `MemoryOrdering::GridSync` until a native cooperative-grid lowering is
//! wired, and descriptor forms without a PTX-safe lowering.
//!
//! ## PTX output shape
//!
//! ```text
//! //
//! // Generated by vyre-emit-ptx (target sm_70)
//! //
//! .version 7.0
//! .target sm_70
//! .address_size 64
//!
//! .visible .entry main(
//!     .param .u64 _arg_<binding_name>
//! )
//! {
//!     .reg .pred  %p<N>;
//!     .reg .u32   %r<N>;
//!     .reg .s32   %s<N>;
//!     .reg .f32   %f<N>;
//!     .reg .u64   %rd<N>;
//!
//!     <body>
//!
//!     ret;
//! }
//! ```

mod emitter;
mod error;
mod index_facts;
pub mod patterns;
mod reg;
mod target;

use vyre_lower::KernelDescriptor;

pub use error::EmitError;
pub use target::{ComputeCapability, PtxEmitOptions};

/// Emit PTX text for `desc` using the default compute capability.
///
/// This is a thin wrapper: it forwards to [`emit_with_target`] with
/// `ComputeCapability::default()`. The wrapper itself does not invoke
/// the `vyre_lower` rewrite pipeline; see [`emit_optimized`] for that.
///
/// # Errors
///
/// Propagates any [`EmitError`] returned by [`emit_with_target`].
pub fn emit(desc: &KernelDescriptor) -> Result<String, EmitError> {
    let default_target = ComputeCapability::default();
    emit_with_target(desc, default_target)
}

/// Emit PTX text for `desc`, targeting a specific compute capability.
///
/// The emit options are derived from `target` via
/// [`PtxEmitOptions::for_target`] and then handed to
/// [`emit_with_options`].
///
/// # Errors
///
/// Propagates any [`EmitError`] returned by [`emit_with_options`].
pub fn emit_with_target(
    desc: &KernelDescriptor,
    target: ComputeCapability,
) -> Result<String, EmitError> {
    let options = PtxEmitOptions::for_target(target);
    emit_with_options(desc, options)
}

/// Emit PTX text for `desc` with fully explicit emit options.
///
/// `options.subgroup_size` is validated before any text is produced;
/// the descriptor and options are then passed to the internal emitter.
///
/// # Errors
///
/// Returns [`EmitError::InvalidDescriptor`] when `options.subgroup_size`
/// is zero, greater than 32, or not a power of two. Any error produced
/// by the internal emitter is propagated unchanged.
pub fn emit_with_options(
    desc: &KernelDescriptor,
    options: PtxEmitOptions,
) -> Result<String, EmitError> {
    let subgroup_size = options.subgroup_size;
    // Accept only non-zero powers of two up to 32.
    let is_valid_subgroup =
        subgroup_size != 0 && subgroup_size <= 32 && subgroup_size.is_power_of_two();

    if is_valid_subgroup {
        emitter::emit_text(desc, options)
    } else {
        Err(EmitError::InvalidDescriptor(format!(
            "invalid CUDA subgroup size {}. Fix: pass the probed CUDA warp size.",
            subgroup_size
        )))
    }
}

/// Emit PTX text from a `KernelDescriptor` after running the full
/// `vyre_lower::rewrites` optimization pipeline.
///
/// Recommended over [`emit`] for production use: fewer dead
/// instructions, fewer redundant loads, lower register pressure.
/// The optimization statistics are computed but discarded; call
/// [`emit_optimized_with_stats`] to receive them.
///
/// # Errors
///
/// Propagates any [`EmitError`] returned by
/// [`emit_optimized_with_stats`].
pub fn emit_optimized(desc: &KernelDescriptor) -> Result<String, EmitError> {
    let (ptx, _stats) = emit_optimized_with_stats(desc)?;
    Ok(ptx)
}

/// Like [`emit_optimized`] but also returns
/// [`vyre_lower::rewrites::OptimizationStats`].
pub fn emit_optimized_with_stats(
    desc: &KernelDescriptor,
) -> Result<(String, vyre_lower::rewrites::OptimizationStats), EmitError> {
    let (optimized, stats) = vyre_lower::rewrites::run_all_with_stats(desc);
    debug_assert!(
        vyre_lower::verify::verify(&optimized).is_ok(),
        "rewrite pipeline produced an invalid descriptor  -  see vyre_lower::verify for the contract"
    );
    let ptx = emit(&optimized)?;
    Ok((ptx, stats))
}

/// Same as [`emit_with_target`] but runs the optimization pipeline
/// first.
///
/// The optimization statistics are discarded; call
/// [`emit_optimized_with_target_with_stats`] to receive them.
///
/// # Errors
///
/// Propagates any [`EmitError`] returned by
/// [`emit_optimized_with_target_with_stats`].
pub fn emit_optimized_with_target(
    desc: &KernelDescriptor,
    target: ComputeCapability,
) -> Result<String, EmitError> {
    let (ptx, _stats) = emit_optimized_with_target_with_stats(desc, target)?;
    Ok(ptx)
}

/// The full-power variant: optimize first, target a specific compute
/// capability, and surface the `OptimizationStats`.
///
/// Combines [`emit_optimized_with_target`] and
/// [`emit_optimized_with_stats`]: the descriptor is run through
/// `vyre_lower::rewrites::run_all_with_stats`, and the optimized result
/// is emitted with [`emit_with_target`].
///
/// # Errors
///
/// Propagates any [`EmitError`] returned by [`emit_with_target`] for the
/// optimized descriptor.
///
/// # Panics
///
/// In builds with debug assertions enabled, panics if the rewrite
/// pipeline yields a descriptor that fails `vyre_lower::verify::verify`.
pub fn emit_optimized_with_target_with_stats(
    desc: &KernelDescriptor,
    target: ComputeCapability,
) -> Result<(String, vyre_lower::rewrites::OptimizationStats), EmitError> {
    let (optimized, stats) = vyre_lower::rewrites::run_all_with_stats(desc);

    // Debug-only sanity check on the rewrite output; compiled out of
    // release builds.
    debug_assert!(
        vyre_lower::verify::verify(&optimized).is_ok(),
        "rewrite pipeline produced an invalid descriptor  -  see vyre_lower::verify for the contract"
    );

    emit_with_target(&optimized, target).map(|ptx| (ptx, stats))
}

#[cfg(test)]
mod tests;