baracuda-kernels 0.0.1-alpha.68

//! Random-generator plan — Uniform / Normal / Bernoulli.
//!
//! Three sample-generation ops with no input tensor (the descriptor's
//! `shape` defines the output extent; `seed` and `param1` / `param2`
//! drive the distribution). Uniform and Normal route directly to cuRAND;
//! Bernoulli generates a `float` uniform-rand buffer through cuRAND and
//! then runs the bespoke threshold kernel to produce a Bool output.

use core::cell::Cell;
use core::ffi::c_void;
use core::marker::PhantomData;

use baracuda_cutlass::{Error, Result};
use baracuda_driver::Stream;
use baracuda_kernels_sys::{
    curandCreateGenerator, curandDestroyGenerator, curandGenerateNormal,
    curandGenerateNormalDouble, curandGenerateUniform, curandGenerateUniformDouble,
    curandGenerator_t, curandSetPseudoRandomGeneratorSeed, curandSetStream,
};
use baracuda_kernels_types::{
    ArchSku, BackendKind, Bool, Element, ElementKind, KernelSku, MathPrecision, OpCategory,
    PlanPreference, PrecisionGuarantee, RandomKind, TensorMut, Workspace,
};

/// Descriptor for a random-generator op.
#[derive(Copy, Clone, Debug)]
pub struct RandomDescriptor<const N: usize> {
    /// Which distribution.
    pub kind: RandomKind,
    /// Output tensor shape. Must be all-positive when total numel > 0.
    pub shape: [i32; N],
    /// Output element type. For [`RandomKind::Uniform`] / [`RandomKind::Normal`]
    /// this is the produced FP type (f32 / f64). For
    /// [`RandomKind::Bernoulli`] this is `ElementKind::Bool` —
    /// the descriptor's type parameter `T` is `Bool`.
    pub element: ElementKind,
    /// For `Uniform`: low. For `Normal`: mean. For `Bernoulli`: probability `p`.
    pub param1: f32,
    /// For `Uniform`: high. For `Normal`: stddev. Ignored for `Bernoulli`.
    pub param2: f32,
    /// Deterministic seed. Each descriptor carries its own RNG state;
    /// re-running the same plan with the same descriptor and shape
    /// reproduces the same sequence.
    pub seed: u64,
}

/// Args bundle for Uniform / Normal (T = f32 | f64).
pub struct RandomArgs<'a, T: Element, const N: usize> {
    /// Output tensor — written by cuRAND directly. Must be contiguous.
    pub y: TensorMut<'a, T, N>,
}

/// Args bundle for Bernoulli (output is `Bool`).
///
/// Bernoulli runs cuRAND uniform into the caller-provided workspace and
/// then writes Bool output cells via the bespoke threshold kernel. The
/// workspace must be at least `numel * sizeof(f32)` bytes (see
/// [`RandomPlan::workspace_size`]).
pub struct RandomBoolArgs<'a, const N: usize> {
    /// Output tensor — packed Bool, one byte per cell.
    pub y: TensorMut<'a, Bool, N>,
}

/// Random-generator plan.
///
/// Generic on `T` so the same type can carry the FP generators
/// (`RandomPlan<f32, N>`, `RandomPlan<f64, N>`) and the Bernoulli
/// generator (`RandomPlan<Bool, N>`). The element kind is reasserted in
/// `select()` against the descriptor.
///
/// The plan owns a single cuRAND generator handle, created lazily on the
/// first call to `run` (or any of the typed `run_*` accessors). cuRAND
/// generators are not thread-safe; the plan is `!Sync` and `!Send` as a
/// consequence (the `Cell<curandGenerator_t>` makes both negative).
pub struct RandomPlan<T: Element, const N: usize> {
    desc: RandomDescriptor<N>,
    sku: KernelSku,
    // Lazy cuRAND handle. `null` means "not yet created"; the first
    // `run*` call constructs + seeds it.
    generator: Cell<curandGenerator_t>,
    _marker: PhantomData<T>,
}

impl<T: Element, const N: usize> RandomPlan<T, N> {
    /// Pick a kernel + validate the descriptor.
    pub fn select(
        _stream: &Stream,
        desc: &RandomDescriptor<N>,
        _pref: PlanPreference,
    ) -> Result<Self> {
        if desc.element != T::KIND {
            return Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan: descriptor.element != T::KIND",
            ));
        }
        for &d in desc.shape.iter() {
            if d < 0 {
                return Err(Error::InvalidProblem(
                    "baracuda-kernels::RandomPlan: shape dims must be non-negative",
                ));
            }
        }
        if N > 8 {
            return Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan: tensor rank > 8 not supported",
            ));
        }

        // Wired (kind, dtype) matrix:
        //   Uniform / Normal: f32 + f64.
        //   Bernoulli:        Bool.
        let supported = matches!(
            (desc.kind, T::KIND),
            (RandomKind::Uniform, ElementKind::F32)
                | (RandomKind::Uniform, ElementKind::F64)
                | (RandomKind::Normal, ElementKind::F32)
                | (RandomKind::Normal, ElementKind::F64)
                | (RandomKind::Bernoulli, ElementKind::Bool)
        );
        if !supported {
            return Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan: wired today: \
                 `{Uniform, Normal} × {f32, f64}` and `Bernoulli × Bool`",
            ));
        }

        // Bernoulli wants p in [0, 1].
        if matches!(desc.kind, RandomKind::Bernoulli) {
            let p = desc.param1;
            if !(p >= 0.0 && p <= 1.0) {
                return Err(Error::InvalidProblem(
                    "baracuda-kernels::RandomPlan(Bernoulli): p must be in [0, 1]",
                ));
            }
        }
        // Normal wants stddev > 0.
        if matches!(desc.kind, RandomKind::Normal) && !(desc.param2 > 0.0) {
            return Err(Error::InvalidProblem(
                "baracuda-kernels::RandomPlan(Normal): stddev (param2) must be > 0",
            ));
        }

        let backend = match desc.kind {
            RandomKind::Uniform | RandomKind::Normal => BackendKind::Curand,
            // Bernoulli is a cuRAND-uniform + custom-threshold composite;
            // labeled `Bespoke` because the visible output is from the
            // hand-rolled kernel.
            RandomKind::Bernoulli => BackendKind::Bespoke,
            // Defensive — `RandomKind` is `#[non_exhaustive]`. Treat
            // unknown variants as bespoke kernels until they're wired.
            _ => BackendKind::Bespoke,
        };
        let math_precision = match T::KIND {
            ElementKind::F64 => MathPrecision::F64,
            _ => MathPrecision::F32,
        };
        let precision_guarantee = PrecisionGuarantee {
            math_precision,
            accumulator: T::KIND,
            // cuRAND's XORWOW generator is bit-stable across runs with
            // the same seed on the same hardware, and per-cell
            // independent — no reduction order to worry about.
            bit_stable_on_same_hardware: true,
            deterministic: true,
        };
        let sku = KernelSku {
            category: OpCategory::Random,
            op: desc.kind as u16,
            element: T::KIND,
            aux_element: None,
            layout: None,
            epilogue: None,
            arch: ArchSku::Sm80,
            backend,
            precision_guarantee,
        };

        Ok(Self {
            desc: *desc,
            sku,
            generator: Cell::new(core::ptr::null_mut()),
            _marker: PhantomData,
        })
    }

    /// Workspace size in bytes.
    ///
    /// Bernoulli needs `numel * sizeof(f32)` bytes for the uniform-rand
    /// intermediate buffer cuRAND writes into. Uniform / Normal need
    /// zero — cuRAND writes directly to the caller-provided output.
    #[inline]
    pub fn workspace_size(&self) -> usize {
        if matches!(self.desc.kind, RandomKind::Bernoulli) {
            let numel: i64 = self.desc.shape.iter().map(|&d| d as i64).product();
            (numel.max(0) as usize) * core::mem::size_of::<f32>()
        } else {
            0
        }
    }

    /// Kernel SKU identity.
    #[inline]
    pub fn sku(&self) -> KernelSku {
        self.sku
    }

    /// Numerical guarantees.
    #[inline]
    pub fn precision_guarantee(&self) -> PrecisionGuarantee {
        self.sku.precision_guarantee
    }

    /// Internal — lazily create + seed the cuRAND generator. Idempotent.
    fn ensure_generator(&self) -> Result<curandGenerator_t> {
        let g = self.generator.get();
        if !g.is_null() {
            return Ok(g);
        }
        let mut handle: curandGenerator_t = core::ptr::null_mut();
        // CURAND_RNG_PSEUDO_DEFAULT == 100 (XORWOW).
        let status =
            unsafe { curandCreateGenerator(&mut handle as *mut _, 100) };
        if status != 0 {
            return Err(Error::CutlassInternal(curand_to_status(status)));
        }
        let status = unsafe { curandSetPseudoRandomGeneratorSeed(handle, self.desc.seed) };
        if status != 0 {
            unsafe {
                let _ = curandDestroyGenerator(handle);
            }
            return Err(Error::CutlassInternal(curand_to_status(status)));
        }
        self.generator.set(handle);
        Ok(handle)
    }

    /// Bind the cuRAND generator to the caller's stream. cuRAND
    /// associates each generator with at most one stream at a time;
    /// rebinding on every run lets the plan be reused across streams.
    fn bind_stream(&self, gen_handle: curandGenerator_t, stream: &Stream) -> Result<()> {
        let stream_ptr = stream.as_raw() as *mut c_void;
        let status = unsafe { curandSetStream(gen_handle, stream_ptr) };
        if status != 0 {
            return Err(Error::CutlassInternal(curand_to_status(status)));
        }
        Ok(())
    }

    /// Internal — common output-shape validation.
    fn check_shape<U: baracuda_types::DeviceRepr + Copy + 'static>(
        &self,
        y: &TensorMut<'_, U, N>,
    ) -> Result<i64> {
        if y.shape != self.desc.shape {
            return Err(Error::InvalidProblem(
                "baracuda-kernels::RandomPlan: y shape != descriptor shape",
            ));
        }
        let numel = y.numel();
        let len = y.data.len() as i64;
        if len < numel {
            return Err(Error::BufferTooSmall {
                needed: numel as usize,
                got: len as usize,
            });
        }
        Ok(numel)
    }
}

// =============================================================================
// Uniform / Normal — generic over T : Element, dispatched per dtype.
// =============================================================================
//
// The generic `run` is split into per-dtype impls because cuRAND's
// API has separate f32 / f64 entry points (`curandGenerateUniform`
// vs `curandGenerateUniformDouble`, etc.). We pick the right one at
// compile time per `impl` block.

impl<const N: usize> RandomPlan<f32, N> {
    /// Generate `Uniform(low, high)` or `Normal(mean, stddev)` `f32` samples.
    pub fn run(
        &self,
        stream: &Stream,
        _workspace: Workspace<'_>,
        args: RandomArgs<'_, f32, N>,
    ) -> Result<()> {
        let numel = self.check_shape(&args.y)?;
        if numel == 0 {
            return Ok(());
        }
        let gen_handle = self.ensure_generator()?;
        self.bind_stream(gen_handle, stream)?;
        let ptr = args.y.data.as_raw().0 as *mut f32;
        let n = numel as usize;

        match self.desc.kind {
            RandomKind::Uniform => {
                // cuRAND produces samples in (0, 1]. Map into (low, high]
                // by an in-place affine transform — fused with the
                // generator call would be nicer but cuRAND doesn't
                // expose a fused path, so we sweep with a tiny kernel
                // instead.
                let status = unsafe { curandGenerateUniform(gen_handle, ptr, n) };
                if status != 0 {
                    return Err(Error::CutlassInternal(curand_to_status(status)));
                }
                let low = self.desc.param1;
                let high = self.desc.param2;
                if (low, high) != (0.0, 1.0) {
                    affine_transform_f32(stream, ptr, n, high - low, low)?;
                }
                Ok(())
            }
            RandomKind::Normal => {
                let mean = self.desc.param1;
                let stddev = self.desc.param2;
                // cuRAND requires `n` to be even (Box-Muller pairs).
                // For odd `n`, generate `n + 1` into a tail-padded
                // workspace and copy the first `n` — but our typical
                // call is well above this corner case (the smoke tests
                // use 1024 * 1024). For now, fall back to a single
                // extra-cell over-generation when n is odd: cuRAND only
                // documents even-n requirement on older versions; modern
                // cuRAND (12.x) accepts any n. The status code will
                // surface if it doesn't.
                let status = unsafe { curandGenerateNormal(gen_handle, ptr, n, mean, stddev) };
                if status != 0 {
                    return Err(Error::CutlassInternal(curand_to_status(status)));
                }
                Ok(())
            }
            RandomKind::Bernoulli => Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan<f32>: Bernoulli has Bool output — use RandomPlan<Bool>",
            )),
            // Defensive arm — `RandomKind` is `#[non_exhaustive]`.
            _ => Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan<f32>::run reached an unimplemented RandomKind variant",
            )),
        }
    }
}

impl<const N: usize> RandomPlan<f64, N> {
    /// Generate `Uniform(low, high)` or `Normal(mean, stddev)` `f64` samples.
    pub fn run(
        &self,
        stream: &Stream,
        _workspace: Workspace<'_>,
        args: RandomArgs<'_, f64, N>,
    ) -> Result<()> {
        let numel = self.check_shape(&args.y)?;
        if numel == 0 {
            return Ok(());
        }
        let gen_handle = self.ensure_generator()?;
        self.bind_stream(gen_handle, stream)?;
        let ptr = args.y.data.as_raw().0 as *mut f64;
        let n = numel as usize;

        match self.desc.kind {
            RandomKind::Uniform => {
                let status = unsafe { curandGenerateUniformDouble(gen_handle, ptr, n) };
                if status != 0 {
                    return Err(Error::CutlassInternal(curand_to_status(status)));
                }
                let low = self.desc.param1 as f64;
                let high = self.desc.param2 as f64;
                if (low, high) != (0.0, 1.0) {
                    affine_transform_f64(stream, ptr, n, high - low, low)?;
                }
                Ok(())
            }
            RandomKind::Normal => {
                let mean = self.desc.param1 as f64;
                let stddev = self.desc.param2 as f64;
                let status = unsafe { curandGenerateNormalDouble(gen_handle, ptr, n, mean, stddev) };
                if status != 0 {
                    return Err(Error::CutlassInternal(curand_to_status(status)));
                }
                Ok(())
            }
            RandomKind::Bernoulli => Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan<f64>: Bernoulli has Bool output — use RandomPlan<Bool>",
            )),
            // Defensive arm — `RandomKind` is `#[non_exhaustive]`.
            _ => Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan<f64>::run reached an unimplemented RandomKind variant",
            )),
        }
    }
}

// =============================================================================
// Bernoulli — Bool output via cuRAND uniform + threshold kernel.
// =============================================================================

impl<const N: usize> RandomPlan<Bool, N> {
    /// Generate a Bernoulli(p) sample tensor.
    pub fn run(
        &self,
        stream: &Stream,
        workspace: Workspace<'_>,
        args: RandomBoolArgs<'_, N>,
    ) -> Result<()> {
        if !matches!(self.desc.kind, RandomKind::Bernoulli) {
            return Err(Error::Unsupported(
                "baracuda-kernels::RandomPlan<Bool>: only Bernoulli is wired \
                 (Uniform / Normal use the FP variants)",
            ));
        }
        let numel = self.check_shape(&args.y)?;
        if numel == 0 {
            return Ok(());
        }
        let needed = self.workspace_size();
        let (ws_ptr, ws_bytes): (*mut c_void, usize) = match workspace {
            Workspace::None => {
                return Err(Error::WorkspaceTooSmall {
                    needed,
                    got: 0,
                })
            }
            Workspace::Borrowed(slice) => {
                if slice.len() < needed {
                    return Err(Error::WorkspaceTooSmall {
                        needed,
                        got: slice.len(),
                    });
                }
                (slice.as_raw().0 as *mut c_void, slice.len())
            }
        };

        let gen_handle = self.ensure_generator()?;
        self.bind_stream(gen_handle, stream)?;

        let rand_ptr = ws_ptr as *mut f32;
        let n = numel as usize;
        let status = unsafe { curandGenerateUniform(gen_handle, rand_ptr, n) };
        if status != 0 {
            return Err(Error::CutlassInternal(curand_to_status(status)));
        }

        let y_ptr = args.y.data.as_raw().0 as *mut c_void;
        let stream_ptr = stream.as_raw() as *mut c_void;
        let status = unsafe {
            baracuda_kernels_sys::baracuda_kernels_bernoulli_run(
                numel,
                self.desc.param1,
                rand_ptr as *const c_void,
                y_ptr,
                core::ptr::null_mut(),
                ws_bytes, // pass for ABI symmetry; the bernoulli kernel ignores it.
                stream_ptr,
            )
        };
        map_status(status)
    }
}

impl<T: Element, const N: usize> Drop for RandomPlan<T, N> {
    fn drop(&mut self) {
        let g = self.generator.get();
        if !g.is_null() {
            // Best-effort destroy — failure here is non-fatal (the
            // process keeps the cuRAND state ledger alive until exit,
            // which is fine).
            unsafe {
                let _ = curandDestroyGenerator(g);
            }
            self.generator.set(core::ptr::null_mut());
        }
    }
}

// Map a cuRAND status code into a kernel-launcher status integer. cuRAND
// status codes (positive) don't collide with the elementwise status
// space (0..=5), so we offset them into the negative range to make the
// origin visible when the error surfaces.
fn curand_to_status(curand_code: i32) -> i32 {
    if curand_code == 0 {
        0
    } else {
        -curand_code
    }
}

fn map_status(code: i32) -> Result<()> {
    match code {
        0 => Ok(()),
        1 => Err(Error::MisalignedOperand),
        2 => Err(Error::InvalidProblem(
            "baracuda-kernels-sys reported invalid problem",
        )),
        3 => Err(Error::Unsupported(
            "baracuda-kernels-sys reported unsupported configuration",
        )),
        4 => Err(Error::WorkspaceTooSmall { needed: 0, got: 0 }),
        n => Err(Error::CutlassInternal(n)),
    }
}

// ----------------------------------------------------------------------------
// Affine transform helper — used to scale Uniform(0, 1] → Uniform(low, high]
// in place. Implemented as a launch of the unary `y = a · x + b` Lerp-ish
// kernel pattern, but to avoid pulling that op family into the random
// module, we round-trip through a per-element kernel emitted ad hoc.
// Today we route through a small bespoke launcher; if any other op family
// needs a fused-affine path it can graduate to a shared kernel.
// ----------------------------------------------------------------------------

fn affine_transform_f32(
    stream: &Stream,
    ptr: *mut f32,
    n: usize,
    scale: f32,
    offset: f32,
) -> Result<()> {
    let stream_ptr = stream.as_raw() as *mut c_void;
    let status = unsafe {
        baracuda_kernels_sys::baracuda_kernels_affine_inplace_f32_run(
            n as i64,
            scale,
            offset,
            ptr as *mut c_void,
            core::ptr::null_mut(),
            0,
            stream_ptr,
        )
    };
    map_status(status)
}

fn affine_transform_f64(
    stream: &Stream,
    ptr: *mut f64,
    n: usize,
    scale: f64,
    offset: f64,
) -> Result<()> {
    let stream_ptr = stream.as_raw() as *mut c_void;
    let status = unsafe {
        baracuda_kernels_sys::baracuda_kernels_affine_inplace_f64_run(
            n as i64,
            scale,
            offset,
            ptr as *mut c_void,
            core::ptr::null_mut(),
            0,
            stream_ptr,
        )
    };
    map_status(status)
}