// baracuda_random.cuh
//
// Templated kernels and INSTANTIATE macros for the random / sampling op
// family (Phase 4.5 Category Q of the comprehensive plan).
//
// Scope today:
// * Bernoulli — `y = (rand < p) ? 1 : 0`, Bool output. Reads a
// caller-supplied uniform-rand `float` buffer.
// * Dropout (FW) — `y = mask * x * scale`, `mask = (rand < 1 - p)`
// Bool. `scale = 1 / (1 - p)`.
// * Dropout (BW) — `dx = mask * dy * scale`.
//
// `Uniform` and `Normal` route directly through the cuRAND host API at
// the safe-plan layer — no bespoke kernel needed.
//
// Status codes mirror the elementwise family:
// 0 success, 1 misaligned, 2 invalid problem, 3 unsupported,
// 4 workspace too small, 5 internal launch failure.
#ifndef BARACUDA_RANDOM_CUH
#define BARACUDA_RANDOM_CUH
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace baracuda { namespace random {
// Bernoulli — one Bool (uint8_t 0/1) per output cell.
__global__ inline void bernoulli_kernel(
const float* __restrict__ rand,
uint8_t* __restrict__ y,
int64_t numel,
float p)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
y[i] = (rand[i] < p) ? (uint8_t)1 : (uint8_t)0;
}
}
__host__ inline int32_t launch_bernoulli(
const float* rand,
uint8_t* y,
int64_t numel,
float p,
cudaStream_t stream)
{
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
bernoulli_kernel<<<blocks, kBlock, 0, stream>>>(rand, y, numel, p);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
// Dropout FW — writes `y` and `mask` simultaneously. `scale` is passed
// by value (caller computes `1 / (1 - p)` to avoid a divide-by-zero check
// inside the kernel hot path; the safe-plan layer handles `p == 1` by
// short-circuiting before invoking this kernel).
//
// `T` is the element type for `x` and `y`. The uniform-rand buffer is
// always `float` regardless of `T` (cuRAND's f32 uniform generator is
// fast and the `<` comparison loses no useful entropy for the dropout
// use case).
template <typename T, typename Scale>
__global__ void dropout_fw_kernel(
const T* __restrict__ x,
const float* __restrict__ rand,
T* __restrict__ y,
uint8_t* __restrict__ mask,
int64_t numel,
float keep_prob,
Scale scale)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
uint8_t m = (rand[i] < keep_prob) ? (uint8_t)1 : (uint8_t)0;
mask[i] = m;
// mask * scale is 0 or scale; multiplying x by that yields the
// dropout output without a branch on the data path.
y[i] = static_cast<T>(static_cast<Scale>(x[i]) * (m ? scale : (Scale)0));
}
}
template <typename T, typename Scale>
__host__ inline int32_t launch_dropout_fw(
const T* x,
const float* rand,
T* y,
uint8_t* mask,
int64_t numel,
float keep_prob,
Scale scale,
cudaStream_t stream)
{
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
dropout_fw_kernel<T, Scale><<<blocks, kBlock, 0, stream>>>(
x, rand, y, mask, numel, keep_prob, scale);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
// Dropout BW — `dx = dy * mask * scale`.
template <typename T, typename Scale>
__global__ void dropout_bw_kernel(
const T* __restrict__ dy,
const uint8_t* __restrict__ mask,
T* __restrict__ dx,
int64_t numel,
Scale scale)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
uint8_t m = mask[i];
dx[i] = static_cast<T>(static_cast<Scale>(dy[i]) * (m ? scale : (Scale)0));
}
}
template <typename T, typename Scale>
__host__ inline int32_t launch_dropout_bw(
const T* dy,
const uint8_t* mask,
T* dx,
int64_t numel,
Scale scale,
cudaStream_t stream)
{
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
dropout_bw_kernel<T, Scale><<<blocks, kBlock, 0, stream>>>(
dy, mask, dx, numel, scale);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
// In-place affine `y = scale * y + offset`. Used to remap a cuRAND
// uniform-(0, 1] buffer into Uniform(low, high].
template <typename T>
__global__ void affine_inplace_kernel(
T* __restrict__ y,
int64_t numel,
T scale,
T offset)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
y[i] = static_cast<T>(static_cast<T>(y[i]) * scale + offset);
}
}
template <typename T>
__host__ inline int32_t launch_affine_inplace(
T* y,
int64_t numel,
T scale,
T offset,
cudaStream_t stream)
{
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
affine_inplace_kernel<T><<<blocks, kBlock, 0, stream>>>(y, numel, scale, offset);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
// Half-precision in-place affine — compute at f32, store at __half.
// Mirrors the forward `affine_contig_kernel_f16` upcast/downcast pattern;
// `__half * __half + __half` has no native operator path (would require
// `__hmul`/`__hadd` intrinsics).
__global__ inline void affine_inplace_kernel_f16(
__half* __restrict__ y,
int64_t numel,
float scale,
float offset)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
float yi = __half2float(y[i]);
y[i] = __float2half(scale * yi + offset);
}
}
// bf16 in-place affine — compute at f32, store at __nv_bfloat16.
__global__ inline void affine_inplace_kernel_bf16(
__nv_bfloat16* __restrict__ y,
int64_t numel,
float scale,
float offset)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
float yi = __bfloat162float(y[i]);
y[i] = __float2bfloat16(scale * yi + offset);
}
}
__host__ inline int32_t launch_affine_inplace_f16(
__half* y,
int64_t numel,
float scale,
float offset,
cudaStream_t stream)
{
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
affine_inplace_kernel_f16<<<blocks, kBlock, 0, stream>>>(y, numel, scale, offset);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
__host__ inline int32_t launch_affine_inplace_bf16(
__nv_bfloat16* y,
int64_t numel,
float scale,
float offset,
cudaStream_t stream)
{
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
affine_inplace_kernel_bf16<<<blocks, kBlock, 0, stream>>>(y, numel, scale, offset);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
// =============================================================================
// Phase 62 — strided in-place affine `y[off(i)] = scale * y[off(i)] + offset`.
//
// Stride-aware variant of `affine_inplace_kernel<T>`. Each thread decomposes
// its linear index `i` into a multi-coord, computes a single stride-aware
// offset `off(i)` into the target buffer, reads `y[off(i)]`, applies the
// affine, writes back. Single-pointer ABI: caller passes `y` + `stride_y`
// only; in-place is implicit in the kernel structure.
//
// Per-thread access pattern is identical to contig in-place: one read
// then one write at the same address, so the same aliasing-safety
// reasoning holds. The pre-launch contract for callers that want this
// to *replace* a forward strided affine call with `x_ptr == y_ptr` is:
// `stride_x == stride_y` AND `stride_y` is a valid permutation (no
// zero strides on `y`, no two `i` mapping to the same `off`). Both are
// caller obligations — the kernel does no validation.
// =============================================================================
inline constexpr int MAX_RANK = 8;
struct DimsI32 { int32_t v[MAX_RANK]; };
struct DimsI64 { int64_t v[MAX_RANK]; };
// Templated strided in-place affine — compute and storage at T.
template <typename T>
__global__ void affine_inplace_strided_kernel(
T* __restrict__ y,
int64_t numel,
int32_t rank,
DimsI32 shape,
DimsI64 stride_y,
T scale,
T offset)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
int64_t linear = i;
int64_t off_y = 0;
for (int d = rank - 1; d >= 0; --d) {
int32_t s = shape.v[d];
int64_t c = (s == 0) ? 0 : (linear % (int64_t)s);
if (s != 0) linear /= (int64_t)s;
off_y += c * stride_y.v[d];
}
y[off_y] = static_cast<T>(static_cast<T>(y[off_y]) * scale + offset);
}
}
// f16-storage / f32-compute strided in-place — matches forward
// `affine_strided_kernel_f16` upcast/downcast pattern.
__global__ inline void affine_inplace_strided_kernel_f16(
__half* __restrict__ y,
int64_t numel,
int32_t rank,
DimsI32 shape,
DimsI64 stride_y,
float scale,
float offset)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
int64_t linear = i;
int64_t off_y = 0;
for (int d = rank - 1; d >= 0; --d) {
int32_t s = shape.v[d];
int64_t c = (s == 0) ? 0 : (linear % (int64_t)s);
if (s != 0) linear /= (int64_t)s;
off_y += c * stride_y.v[d];
}
float yi = __half2float(y[off_y]);
y[off_y] = __float2half(scale * yi + offset);
}
}
// bf16-storage / f32-compute strided in-place.
__global__ inline void affine_inplace_strided_kernel_bf16(
__nv_bfloat16* __restrict__ y,
int64_t numel,
int32_t rank,
DimsI32 shape,
DimsI64 stride_y,
float scale,
float offset)
{
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t step = (int64_t)gridDim.x * (int64_t)blockDim.x;
for (int64_t i = tid; i < numel; i += step) {
int64_t linear = i;
int64_t off_y = 0;
for (int d = rank - 1; d >= 0; --d) {
int32_t s = shape.v[d];
int64_t c = (s == 0) ? 0 : (linear % (int64_t)s);
if (s != 0) linear /= (int64_t)s;
off_y += c * stride_y.v[d];
}
float yi = __bfloat162float(y[off_y]);
y[off_y] = __float2bfloat16(scale * yi + offset);
}
}
template <typename T>
__host__ inline int32_t launch_affine_inplace_strided(
T* y,
int64_t numel,
int32_t rank,
const int32_t* shape_host,
const int64_t* stride_y_host,
T scale,
T offset,
cudaStream_t stream)
{
if (rank < 0 || rank > MAX_RANK) return 2;
DimsI32 shape{};
DimsI64 sy{};
for (int d = 0; d < rank; ++d) {
shape.v[d] = shape_host[d];
sy.v[d] = stride_y_host[d];
}
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
affine_inplace_strided_kernel<T><<<blocks, kBlock, 0, stream>>>(
y, numel, rank, shape, sy, scale, offset);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
__host__ inline int32_t launch_affine_inplace_strided_f16(
__half* y,
int64_t numel,
int32_t rank,
const int32_t* shape_host,
const int64_t* stride_y_host,
float scale,
float offset,
cudaStream_t stream)
{
if (rank < 0 || rank > MAX_RANK) return 2;
DimsI32 shape{};
DimsI64 sy{};
for (int d = 0; d < rank; ++d) {
shape.v[d] = shape_host[d];
sy.v[d] = stride_y_host[d];
}
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
affine_inplace_strided_kernel_f16<<<blocks, kBlock, 0, stream>>>(
y, numel, rank, shape, sy, scale, offset);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
__host__ inline int32_t launch_affine_inplace_strided_bf16(
__nv_bfloat16* y,
int64_t numel,
int32_t rank,
const int32_t* shape_host,
const int64_t* stride_y_host,
float scale,
float offset,
cudaStream_t stream)
{
if (rank < 0 || rank > MAX_RANK) return 2;
DimsI32 shape{};
DimsI64 sy{};
for (int d = 0; d < rank; ++d) {
shape.v[d] = shape_host[d];
sy.v[d] = stride_y_host[d];
}
constexpr int kBlock = 256;
constexpr int64_t kMaxBlocks = 65535;
int64_t blocks_i64 = (numel + kBlock - 1) / kBlock;
int blocks = static_cast<int>(blocks_i64 > kMaxBlocks ? kMaxBlocks : blocks_i64);
if (blocks <= 0) blocks = 1;
affine_inplace_strided_kernel_bf16<<<blocks, kBlock, 0, stream>>>(
y, numel, rank, shape, sy, scale, offset);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess) ? 0 : 5;
}
} } // namespace baracuda::random
// =============================================================================
// Instantiation macros — emit `extern "C"` launcher symbols.
// =============================================================================
// ABI: `(numel, p, rand, y, ws, ws_bytes, stream) -> i32`.
//
// `y` is a packed-Bool buffer (`uint8_t`). `rand` is a `float` buffer
// generated by cuRAND.
#define BARACUDA_KERNELS_BERNOULLI_INSTANTIATE() \
extern "C" int32_t baracuda_kernels_bernoulli_run( \
int64_t numel, \
float p, \
const void* rand, \
void* y, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (rand == nullptr || y == nullptr) return 2; \
if (!(p >= 0.0f && p <= 1.0f)) return 2; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_bernoulli( \
static_cast<const float*>(rand), \
static_cast<uint8_t*>(y), \
numel, p, stream); \
} \
extern "C" int32_t baracuda_kernels_bernoulli_can_implement( \
int64_t numel, \
float p, \
const void* /*rand*/, \
const void* /*y*/) \
{ \
if (numel < 0) return 2; \
if (!(p >= 0.0f && p <= 1.0f)) return 2; \
return 0; \
}
// ABI: `(numel, p, scale, x, rand, y, mask, ws, ws_bytes, stream) -> i32`.
//
// `x` / `y` are `T`-typed; `rand` is `float`; `mask` is packed-Bool
// (`uint8_t`). The safe-plan layer enforces `0 <= p < 1` before
// dispatching (so `scale = 1/(1-p)` is finite).
#define BARACUDA_KERNELS_DROPOUT_INSTANTIATE(NAME, T, SCALE_T) \
extern "C" int32_t baracuda_kernels_dropout_##NAME##_run( \
int64_t numel, \
float p, \
SCALE_T scale, \
const void* x, \
const void* rand, \
void* y, \
void* mask, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (x == nullptr || rand == nullptr || y == nullptr || mask == nullptr) return 2; \
if (!(p >= 0.0f && p < 1.0f)) return 2; \
float keep_prob = 1.0f - p; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_dropout_fw<T, SCALE_T>( \
static_cast<const T*>(x), \
static_cast<const float*>(rand), \
static_cast<T*>(y), \
static_cast<uint8_t*>(mask), \
numel, keep_prob, scale, stream); \
} \
extern "C" int32_t baracuda_kernels_dropout_##NAME##_can_implement( \
int64_t numel, \
float p, \
SCALE_T /*scale*/, \
const void* /*x*/, \
const void* /*rand*/, \
const void* /*y*/, \
const void* /*mask*/) \
{ \
if (numel < 0) return 2; \
if (!(p >= 0.0f && p < 1.0f)) return 2; \
return 0; \
}
// ABI: `(numel, scale, dy, mask, dx, ws, ws_bytes, stream) -> i32`.
#define BARACUDA_KERNELS_DROPOUT_BACKWARD_INSTANTIATE(NAME, T, SCALE_T) \
extern "C" int32_t baracuda_kernels_dropout_backward_##NAME##_run( \
int64_t numel, \
SCALE_T scale, \
const void* dy, \
const void* mask, \
void* dx, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (dy == nullptr || mask == nullptr || dx == nullptr) return 2; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_dropout_bw<T, SCALE_T>( \
static_cast<const T*>(dy), \
static_cast<const uint8_t*>(mask), \
static_cast<T*>(dx), \
numel, scale, stream); \
} \
extern "C" int32_t baracuda_kernels_dropout_backward_##NAME##_can_implement( \
int64_t numel, \
SCALE_T /*scale*/, \
const void* /*dy*/, \
const void* /*mask*/, \
const void* /*dx*/) \
{ \
if (numel < 0) return 2; \
return 0; \
}
// ABI: `(numel, scale, offset, y, ws, ws_bytes, stream) -> i32`.
//
// In-place affine map `y[i] = scale * y[i] + offset`. Used to remap a
// cuRAND uniform-(0, 1] buffer into Uniform(low, high] without a second
// kernel set, and (Phase 61) for in-place weight-decay / Op::AddScalar /
// Op::MulScalar on contiguous tensors in Fuel's executor.
#define BARACUDA_KERNELS_AFFINE_INPLACE_INSTANTIATE(NAME, T) \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_run( \
int64_t numel, \
T scale, \
T offset, \
void* y, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (y == nullptr) return 2; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_affine_inplace<T>( \
static_cast<T*>(y), numel, scale, offset, stream); \
} \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_can_implement( \
int64_t numel, \
T /*scale*/, \
T /*offset*/, \
const void* /*y*/) \
{ \
if (numel < 0) return 2; \
return 0; \
}
// f32-scalar variant for half-precision storage types (Phase 61).
//
// Same ABI shape, but `scale` and `offset` are always `float` regardless
// of the storage dtype. This matches the forward `affine_{f16,bf16}_run`
// convention (avoids passing `__half` / `__nv_bfloat16` by value through
// C ABI, which is compiler-dependent) and matches how Fuel's CPU
// `affine_inplace_{f16,bf16}` kernels carry their scalars. Half-precision
// kernels use the same upcast-to-f32, compute, downcast pattern as the
// forward `affine_contig_kernel_{f16,bf16}`.
//
// NAME : `f16` or `bf16` — must match the suffix of the
// `launch_affine_inplace_<NAME>` host wrapper.
// T : `__half` or `__nv_bfloat16` — storage type, for the y pointer
// cast.
#define BARACUDA_KERNELS_AFFINE_INPLACE_F32SCALAR_INSTANTIATE(NAME, T) \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_run( \
int64_t numel, \
float scale, \
float offset, \
void* y, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (y == nullptr) return 2; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_affine_inplace_##NAME( \
static_cast<T*>(y), numel, scale, offset, stream); \
} \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_can_implement( \
int64_t numel, \
float /*scale*/, \
float /*offset*/, \
const void* /*y*/) \
{ \
if (numel < 0) return 2; \
return 0; \
}
// Phase 62 — strided in-place affine, ABI:
// (numel, rank, shape, stride_y, scale, offset, y, ws, ws_bytes, stream) -> i32
//
// shape and stride_y are host-side arrays of length `rank` (int32_t for
// shape, int64_t for stride). MAX_RANK = 8; rank > 8 returns
// STATUS_INVALID_ARG.
#define BARACUDA_KERNELS_AFFINE_INPLACE_STRIDED_INSTANTIATE(NAME, T) \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_strided_run( \
int64_t numel, \
int32_t rank, \
const int32_t* shape, \
const int64_t* stride_y, \
T scale, \
T offset, \
void* y, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (y == nullptr) return 2; \
if (rank > 0 && (shape == nullptr || stride_y == nullptr)) return 2; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_affine_inplace_strided<T>( \
static_cast<T*>(y), numel, rank, shape, stride_y, scale, offset, stream); \
} \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_strided_can_implement( \
int64_t numel, \
int32_t rank, \
const int32_t* shape, \
const int64_t* stride_y, \
T /*scale*/, \
T /*offset*/, \
const void* /*y*/) \
{ \
if (numel < 0) return 2; \
if (rank < 0) return 2; \
if (rank > 0 && (shape == nullptr || stride_y == nullptr)) return 2; \
return 0; \
}
// Phase 62 — f32-scalar strided in-place variant for half-precision.
// Matches the contig f32-scalar pattern (`scale` and `offset` are
// always `float`).
#define BARACUDA_KERNELS_AFFINE_INPLACE_STRIDED_F32SCALAR_INSTANTIATE(NAME, T) \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_strided_run( \
int64_t numel, \
int32_t rank, \
const int32_t* shape, \
const int64_t* stride_y, \
float scale, \
float offset, \
void* y, \
void* /*workspace*/, size_t /*workspace_bytes*/, \
void* stream_ptr) \
{ \
if (numel < 0) return 2; \
if (numel == 0) return 0; \
if (y == nullptr) return 2; \
if (rank > 0 && (shape == nullptr || stride_y == nullptr)) return 2; \
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr); \
return baracuda::random::launch_affine_inplace_strided_##NAME( \
static_cast<T*>(y), numel, rank, shape, stride_y, scale, offset, stream); \
} \
extern "C" int32_t baracuda_kernels_affine_inplace_##NAME##_strided_can_implement( \
int64_t numel, \
int32_t rank, \
const int32_t* shape, \
const int64_t* stride_y, \
float /*scale*/, \
float /*offset*/, \
const void* /*y*/) \
{ \
if (numel < 0) return 2; \
if (rank < 0) return 2; \
if (rank > 0 && (shape == nullptr || stride_y == nullptr)) return 2; \
return 0; \
}
#endif // BARACUDA_RANDOM_CUH