vyre-driver-cuda 0.6.1

CUDA/PTX backend for vyre through the CUDA driver API.
Documentation
//! Live CUDA dispatch conformance: elementwise programs, async dispatch,
//! and single-rank WORLD collectives.

mod common;
use common::{bytes_u32, u32_bytes};
use vyre_driver::DispatchConfig;
use vyre_driver_cuda::CudaBackend;
use vyre_foundation::ir::{BufferDecl, CommGroup, DataType, Expr, Node, Program};

/// Dispatches `out[gid_x] = input[gid_x] + 1` over eight `u32` values and checks
/// the readback bytes plus the telemetry counters read after a telemetry reset.
#[test]
fn cuda_runs_u32_add_one_program_end_to_end() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    // One read-only input at binding 0 and one output at binding 1, eight elements each.
    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(8),
        BufferDecl::output("out", 1, DataType::U32).with_count(8),
    ];
    let store_index = Expr::gid_x();
    let incremented = Expr::add(Expr::load("input", Expr::gid_x()), Expr::u32(1));
    let program = Program::wrapped(
        buffers,
        [128, 1, 1],
        vec![Node::store("out", store_index, incremented)],
    );

    // Reset before dispatching: the counter assertions below expect exact values.
    backend.reset_telemetry();
    // The largest input is `u32::MAX - 1`, so the expected result tops out at `u32::MAX`.
    let host_inputs = [u32_bytes(&[0, 1, 2, 3, 9, 10, 99, u32::MAX - 1])];
    let config = DispatchConfig::default();
    let outputs = backend
        .dispatch(&program, &host_inputs, &config)
        .expect("Fix: CUDA backend must execute the minimal u32 add-one program end-to-end.");

    assert_eq!(outputs.len(), 1);
    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(
        decoded,
        vec![1, 2, 3, 4, 10, 11, 100, u32::MAX],
        "Fix: CUDA dispatch output must be byte-exact for u32 add-one."
    );

    let snapshot = backend.telemetry_snapshot();
    assert_eq!(
        snapshot.param_upload_bytes, 12,
        "Fix: CUDA host dispatch must report the exact non-empty launch parameter bytes."
    );
    assert_eq!(
        snapshot.host_upload_operations, 2,
        "Fix: CUDA host dispatch must count one input upload plus one non-empty parameter upload."
    );
    // NOTE(review): 80 presumably decomposes as 32 (input) + 32 (output) + 16 (rounded
    // parameter buffer) — confirm against the backend's allocation rounding.
    assert_eq!(
        snapshot.transient_allocation_bytes_requested, 80,
        "Fix: CUDA host dispatch transient allocation telemetry must include input, output, and the rounded non-empty parameter buffer only."
    );
}

/// Dispatches `out[gid_x] = select(input[gid_x] > 10, 1, 0)` over four `u32`
/// values and checks the readback; the inputs include 10 and 11, on either
/// side of the threshold.
#[test]
fn cuda_runs_simple_if_select_program_end_to_end() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(4),
        BufferDecl::output("out", 1, DataType::U32).with_count(4),
    ];
    let store_index = Expr::gid_x();
    let above_ten = Expr::gt(Expr::load("input", Expr::gid_x()), Expr::u32(10));
    let flag = Expr::select(above_ten, Expr::u32(1), Expr::u32(0));
    let program = Program::wrapped(
        buffers,
        [64, 1, 1],
        vec![Node::store("out", store_index, flag)],
    );

    let host_inputs = [u32_bytes(&[1, 10, 11, 99])];
    let config = DispatchConfig::default();
    let outputs = backend
        .dispatch(&program, &host_inputs, &config)
        .expect("Fix: CUDA backend must execute select/comparison subset end-to-end.");

    // The comparison is strict: 10 maps to 0, 11 maps to 1.
    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(decoded, vec![0, 0, 1, 1]);
}

/// Submits `out[gid_x] = input[gid_x] + 9` through `dispatch_async`, waits on
/// the returned handle with `await_result`, and checks the readback values.
#[test]
fn cuda_async_dispatch_returns_pending_gpu_work() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(4),
        BufferDecl::output("out", 1, DataType::U32).with_count(4),
    ];
    let store_index = Expr::gid_x();
    let plus_nine = Expr::add(Expr::load("input", Expr::gid_x()), Expr::u32(9));
    let program = Program::wrapped(
        buffers,
        [64, 1, 1],
        vec![Node::store("out", store_index, plus_nine)],
    );

    // Step 1: enqueue and obtain the pending handle.
    let host_inputs = [u32_bytes(&[1, 2, 3, 4])];
    let config = DispatchConfig::default();
    let pending = backend
        .dispatch_async(&program, &host_inputs, &config)
        .expect("Fix: CUDA async dispatch must enqueue kernel work and return a pending handle.");

    // Step 2: resolve the handle into the output buffers.
    let outputs = pending
        .await_result()
        .expect("Fix: CUDA async pending dispatch must return completed readback bytes.");

    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(decoded, vec![10, 11, 12, 13]);
}

/// Dispatches a program whose only node is a `CommGroup::WORLD` `AllGather`
/// from `input` to `out`, and expects the readback to equal the eight input
/// values unchanged.
#[test]
fn cuda_executes_world_allgather_as_single_rank_copy() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(8),
        BufferDecl::output("out", 1, DataType::U32).with_count(8),
    ];
    let gather = Node::AllGather {
        input: "input".into(),
        output: "out".into(),
        group: CommGroup::WORLD,
    };
    let program = Program::wrapped(buffers, [64, 1, 1], vec![gather]);

    let host_inputs = [u32_bytes(&[3, 1, 4, 1, 5, 9, 2, 6])];
    let config = DispatchConfig::default();
    let outputs = backend
        .dispatch(&program, &host_inputs, &config)
        .expect("Fix: CUDA must lower WORLD AllGather into a real single-rank device copy.");

    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(decoded, vec![3, 1, 4, 1, 5, 9, 2, 6]);
}

/// Compiles a program whose only node is a `CommGroup::WORLD` sum
/// `ReduceScatter` via `compile_native`, dispatches the resulting pipeline,
/// and expects the readback to equal the four input values unchanged.
#[test]
fn cuda_compiled_pipeline_executes_world_reduce_scatter_as_single_rank_copy() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(4),
        BufferDecl::output("out", 1, DataType::U32).with_count(4),
    ];
    let reduce_scatter = Node::ReduceScatter {
        input: "input".into(),
        output: "out".into(),
        op: vyre_foundation::ir::CollectiveOp::Sum,
        group: CommGroup::WORLD,
    };
    let program = Program::wrapped(buffers, [64, 1, 1], vec![reduce_scatter]);

    // Compile step: a failure here is reported separately from a dispatch failure.
    let compile_config = DispatchConfig::default();
    let pipeline = backend
        .compile_native(&program, &compile_config)
        .expect("Fix: CUDA native compile must pre-lower WORLD ReduceScatter before PTX emission.");

    // Dispatch step on the compiled pipeline.
    let host_inputs = [u32_bytes(&[8, 6, 7, 5])];
    let dispatch_config = DispatchConfig::default();
    let outputs = pipeline
        .dispatch(&host_inputs, &dispatch_config)
        .expect("Fix: compiled CUDA pipeline must execute single-rank ReduceScatter.");

    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(decoded, vec![8, 6, 7, 5]);
}

/// Copies `input` into `out`, then applies a `CommGroup::WORLD` sum
/// `AllReduce` on `out`, and expects the readback to equal the eight input
/// values unchanged.
#[test]
fn cuda_executes_world_allreduce_as_single_rank_identity() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(8),
        BufferDecl::output("out", 1, DataType::U32).with_count(8),
    ];
    // First node fills `out` from `input`; the second reduces `out` in place.
    let copy_into_out = Node::store("out", Expr::gid_x(), Expr::load("input", Expr::gid_x()));
    let all_reduce = Node::AllReduce {
        buffer: "out".into(),
        op: vyre_foundation::ir::CollectiveOp::Sum,
        group: CommGroup::WORLD,
    };
    let program = Program::wrapped(buffers, [8, 1, 1], vec![copy_into_out, all_reduce]);

    let host_inputs = [u32_bytes(&[13, 21, 34, 55, 89, 144, 233, 377])];
    let config = DispatchConfig::default();
    let outputs = backend
        .dispatch(&program, &host_inputs, &config)
        .expect("Fix: CUDA must lower WORLD AllReduce into a single-rank identity.");

    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(decoded, vec![13, 21, 34, 55, 89, 144, 233, 377]);
}

/// Compiles a program that copies `input` into `out` and then applies a
/// `CommGroup::WORLD` `Broadcast` with root 0 on `out`, dispatches the
/// pipeline, and expects the readback to equal the four input values,
/// which include 0 and `u32::MAX`.
#[test]
fn cuda_compiled_pipeline_executes_world_broadcast_root_zero_as_identity() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    let buffers = vec![
        BufferDecl::read("input", 0, DataType::U32).with_count(4),
        BufferDecl::output("out", 1, DataType::U32).with_count(4),
    ];
    let copy_into_out = Node::store("out", Expr::gid_x(), Expr::load("input", Expr::gid_x()));
    let broadcast = Node::Broadcast {
        buffer: "out".into(),
        root: 0,
        group: CommGroup::WORLD,
    };
    let program = Program::wrapped(buffers, [4, 1, 1], vec![copy_into_out, broadcast]);

    // Compile step: a failure here is reported separately from a dispatch failure.
    let compile_config = DispatchConfig::default();
    let pipeline = backend.compile_native(&program, &compile_config).expect(
        "Fix: CUDA native compile must pre-lower WORLD Broadcast root 0 before PTX emission.",
    );

    // Dispatch step on the compiled pipeline.
    let host_inputs = [u32_bytes(&[0, 1, u32::MAX - 1, u32::MAX])];
    let dispatch_config = DispatchConfig::default();
    let outputs = pipeline
        .dispatch(&host_inputs, &dispatch_config)
        .expect("Fix: compiled CUDA pipeline must execute single-rank Broadcast root 0.");

    let decoded = bytes_u32(&outputs[0]);
    assert_eq!(decoded, vec![0, 1, u32::MAX - 1, u32::MAX]);
}

/// Sweeps a list of element counts and, for each, dispatches both a
/// `CommGroup::WORLD` `AllGather` and a sum `ReduceScatter` from `input` to
/// `out`, expecting the readback to equal the generated input every time.
#[test]
fn generated_cuda_world_copy_collectives_cover_boundary_shapes() {
    // Counts at, just below, and just above 64/128/256, plus 1, 7, 8 and 1024.
    const BOUNDARY_COUNTS: [u32; 12] = [1, 7, 8, 63, 64, 65, 127, 128, 255, 256, 257, 1024];

    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    for count in BOUNDARY_COUNTS {
        // Deterministic per-index values that also depend on `count`.
        let input: Vec<u32> = (0..count)
            .map(|index| {
                let scrambled = index.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
                let rotated = count.rotate_left(index & 15);
                scrambled ^ rotated
            })
            .collect();

        // `reduce == false` exercises AllGather, `reduce == true` exercises ReduceScatter.
        for reduce in [false, true] {
            let node = if reduce {
                Node::ReduceScatter {
                    input: "input".into(),
                    output: "out".into(),
                    op: vyre_foundation::ir::CollectiveOp::Sum,
                    group: CommGroup::WORLD,
                }
            } else {
                Node::AllGather {
                    input: "input".into(),
                    output: "out".into(),
                    group: CommGroup::WORLD,
                }
            };
            let buffers = vec![
                BufferDecl::read("input", 0, DataType::U32).with_count(count),
                BufferDecl::output("out", 1, DataType::U32).with_count(count),
            ];
            let program = Program::wrapped(buffers, [64, 1, 1], vec![node]);

            let host_inputs = [u32_bytes(&input)];
            let config = DispatchConfig::default();
            let outputs = match backend.dispatch(&program, &host_inputs, &config) {
                Ok(outputs) => outputs,
                Err(error) => panic!(
                    "Fix: CUDA WORLD copy collective must dispatch at boundary count={count} reduce={reduce}: {error}"
                ),
            };

            let decoded = bytes_u32(&outputs[0]);
            assert_eq!(
                decoded, input,
                "Fix: CUDA WORLD copy collective mismatch at count={count} reduce={reduce}"
            );
        }
    }
}

/// Dispatches a `CommGroup::WORLD` `Broadcast` with `root: 1` and expects the
/// backend to return an error whose text contains "root 0".
#[test]
fn cuda_rejects_nonzero_single_rank_broadcast_root() {
    let backend =
        CudaBackend::acquire().expect("Fix: CUDA backend acquire failed on a GPU-required host.");

    // Only an output buffer is declared; the dispatch below passes no host inputs.
    let buffers = vec![BufferDecl::output("out", 0, DataType::U32).with_count(4)];
    let broadcast_from_rank_one = Node::Broadcast {
        buffer: "out".into(),
        root: 1,
        group: CommGroup::WORLD,
    };
    let program = Program::wrapped(buffers, [64, 1, 1], vec![broadcast_from_rank_one]);

    let config = DispatchConfig::default();
    let error = backend
        .dispatch(&program, &[], &config)
        .expect_err("Fix: CUDA must reject a single-rank broadcast root other than rank 0.");

    let rendered = error.to_string();
    assert!(
        rendered.contains("root 0"),
        "Fix: CUDA single-rank broadcast rejection must explain the root invariant: {error}"
    );
}