// oxicuda-blas 0.1.3
//
// OxiCUDA BLAS - GPU-accelerated BLAS operations (cuBLAS equivalent)
// Documentation
//! Reduction operations for OxiCUDA BLAS.
//!
//! This module provides parallel reduction primitives over device buffers:
//! sum, max, min, mean, variance, and softmax. Each operation generates PTX
//! via templates from `oxicuda-ptx`, performs a two-phase block-level
//! reduction when needed, and writes the scalar (or vector) result to device
//! memory.

// Implementation submodules. They are declared without `pub`, so they are
// private to this module; outside code reaches their contents only through
// the re-exports below.
mod max;
mod mean;
mod min;
mod ops;
mod softmax;
mod sum;
mod variance;

// Public surface of this module: one item re-exported from each submodule,
// so callers do not need access to the private submodule paths.
// NOTE(review): apart from `ReductionOp`, each re-exported item shares its
// submodule's name and is presumably the entry point for the corresponding
// reduction listed in the module docs — confirm in the submodules.
pub use max::max;
pub use mean::mean;
pub use min::min;
pub use ops::ReductionOp;
pub use softmax::softmax;
pub use sum::sum;
pub use variance::variance;