#![no_std]
use core::ffi::c_void;
// Foreign (C ABI) declarations for the `gemm_f16_rcr_sm80` and
// `gemm_bf16_rcr_sm80` families, compiled only when the `sm80` or `sm90a`
// feature is enabled.
//
// Each family declares three symbols: `*_run`, `*_workspace_size` and
// `*_can_implement`. All buffers are untyped `c_void` pointers (`a`, `b`, `c`
// are `*const`, `d` is `*mut`); `lda`..`ldd` are `i64`, `alpha`/`beta` are `f32`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): the names suggest a CUTLASS GEMM of the form
// D = alpha * A * B + beta * C, with "rcr" presumably denoting the A/B/C
// layouts and the `i32` return presumably a status code — none of that is
// visible from these declarations; confirm against the native implementation.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_f16_rcr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_f16_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_f16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_f16_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_f16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_f16_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
/// `run` entry point of the `gemm_bf16_rcr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_bf16_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bf16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bf16_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bf16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bf16_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_bias`, `gemm_bias_relu`,
// `gemm_bias_gelu` and `gemm_bias_silu` families in their `f16_rcr_sm80` and
// `bf16_rcr_sm80` variants, compiled only when the `sm80` or `sm90a` feature
// is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`.
// Compared with a signature lacking it, `run` and `can_implement` here carry
// an extra `bias: *const c_void` argument placed right after `ldd`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): the names suggest a fused GEMM + bias (+ ReLU/GELU/SiLU)
// epilogue, and the `i32` return is presumably a status code; the expected
// shape of `bias` is not visible here — confirm against the native side.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_bias_f16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_f16_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
bias: *const c_void,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_f16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_f16_rcr_sm80_workspace_size(
m: i32,
n: i32,
k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_f16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_f16_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_bf16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_bf16_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
bias: *const c_void,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_bf16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_bf16_rcr_sm80_workspace_size(
m: i32,
n: i32,
k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_bf16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_bf16_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_f16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_f16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_relu_f16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_bf16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_bf16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_relu_bf16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_f16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_f16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_f16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_bf16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_bf16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_bf16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_f16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_f16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_silu_f16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_bf16_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_bf16_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_silu_bf16_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_f16_rrr_sm80` and
// `gemm_bf16_rrr_sm80` families, compiled only when the `sm80` or `sm90a`
// feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`.
// All buffers are untyped `c_void` pointers; `lda`..`ldd` are `i64` and
// `alpha`/`beta` are `f32`. No item uses the `safe` qualifier, so every
// function is `unsafe` to call.
//
// NOTE(review): "rrr" presumably denotes the A/B/C layouts and the `i32`
// return is presumably a status code — not visible from these declarations;
// confirm against the native implementation.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_f16_rrr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_f16_rrr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_f16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_f16_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_f16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_f16_rrr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
/// `run` entry point of the `gemm_bf16_rrr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_bf16_rrr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bf16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bf16_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bf16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bf16_rrr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_bias`, `gemm_bias_relu`,
// `gemm_bias_gelu` and `gemm_bias_silu` families in their `f16_rrr_sm80` and
// `bf16_rrr_sm80` variants, compiled only when the `sm80` or `sm90a` feature
// is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`;
// `run` and `can_implement` take a `bias: *const c_void` right after `ldd`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): the names suggest a fused GEMM + bias (+ ReLU/GELU/SiLU)
// epilogue with "rrr" presumably denoting the A/B/C layouts; the `i32`
// return is presumably a status code — confirm against the native side.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_bias_f16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_f16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_f16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_f16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_f16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_f16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_bf16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_bf16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_bf16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_bf16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_bf16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_bf16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_f16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_f16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_relu_f16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_bf16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_bf16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_relu_bf16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_f16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_f16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_f16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_bf16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_bf16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_bf16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_f16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_f16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_silu_f16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_bf16_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_bf16_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_silu_bf16_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_tf32_rcr_sm80` and
// `gemm_tf32_rrr_sm80` families, compiled only when the `sm80` or `sm90a`
// feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`.
// All buffers are untyped `c_void` pointers; `lda`..`ldd` are `i64` and
// `alpha`/`beta` are `f32`. No item uses the `safe` qualifier, so every
// function is `unsafe` to call.
//
// NOTE(review): "tf32" presumably refers to the tensor-core math mode and the
// element type behind the pointers is not visible here; the `i32` return is
// presumably a status code — confirm against the native implementation.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_tf32_rcr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_tf32_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_tf32_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_tf32_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_tf32_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_tf32_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
/// `run` entry point of the `gemm_tf32_rrr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_tf32_rrr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_tf32_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_tf32_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_tf32_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_tf32_rrr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_bias`, `gemm_bias_relu`,
// `gemm_bias_gelu` and `gemm_bias_silu` families in their `tf32_rcr_sm80`
// variant, compiled only when the `sm80` or `sm90a` feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`;
// `run` and `can_implement` take a `bias: *const c_void` right after `ldd`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): the names suggest a fused GEMM + bias (+ activation)
// epilogue; the `i32` return is presumably a status code and the expected
// shape of `bias` is not visible here — confirm against the native side.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_bias_tf32_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_tf32_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_tf32_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_tf32_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_tf32_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_tf32_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_tf32_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_tf32_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_relu_tf32_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_tf32_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_tf32_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_tf32_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_tf32_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_tf32_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_silu_tf32_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_bias`, `gemm_bias_relu`,
// `gemm_bias_gelu` and `gemm_bias_silu` families in their `tf32_rrr_sm80`
// variant, compiled only when the `sm80` or `sm90a` feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`;
// `run` and `can_implement` take a `bias: *const c_void` right after `ldd`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): the names suggest a fused GEMM + bias (+ activation)
// epilogue; the `i32` return is presumably a status code and the expected
// shape of `bias` is not visible here — confirm against the native side.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_bias_tf32_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_tf32_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_tf32_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_tf32_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_tf32_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_tf32_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_tf32_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_tf32_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_relu_tf32_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_tf32_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_tf32_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_tf32_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_tf32_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_tf32_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_workspace_size(
m: i32, n: i32, k: i32,
) -> usize;
/// `can_implement` query of the `gemm_bias_silu_tf32_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_f32_simt_rcr_sm80` and
// `gemm_f32_simt_rrr_sm80` families, compiled only when the `sm80` or
// `sm90a` feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`.
// All buffers are untyped `c_void` pointers; `lda`..`ldd` are `i64` and
// `alpha`/`beta` are `f32`. No item uses the `safe` qualifier, so every
// function is `unsafe` to call.
//
// NOTE(review): "simt" presumably means a non-tensor-core f32 kernel and the
// `i32` return is presumably a status code — not visible from these
// declarations; confirm against the native implementation.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_f32_simt_rcr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_f32_simt_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_f32_simt_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_f32_simt_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_f32_simt_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
) -> i32;
/// `run` entry point of the `gemm_f32_simt_rrr_sm80` family; returns an `i32`.
pub fn baracuda_cutlass_gemm_f32_simt_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_f32_simt_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_f32_simt_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_f32_simt_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_bias`, `gemm_bias_relu`,
// `gemm_bias_gelu` and `gemm_bias_silu` families in their
// `f32_simt_rcr_sm80` and `f32_simt_rrr_sm80` variants, compiled only when
// the `sm80` or `sm90a` feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`;
// `run` and `can_implement` take a `bias: *const c_void` right after `ldd`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): the names suggest a fused GEMM + bias (+ activation)
// epilogue on a non-tensor-core f32 kernel; the `i32` return is presumably a
// status code — confirm against the native implementation.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_bias_f32_simt_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_f32_simt_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_f32_simt_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_f32_simt_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_f32_simt_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_relu_f32_simt_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_f32_simt_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_f32_simt_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_f32_simt_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_f32_simt_rcr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_f32_simt_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_silu_f32_simt_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_f32_simt_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_f32_simt_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_f32_simt_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_relu_f32_simt_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_relu_f32_simt_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_relu_f32_simt_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_gelu_f32_simt_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_gelu_f32_simt_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_gelu_f32_simt_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// `run` entry point of the `gemm_bias_silu_f32_simt_rrr_sm80` family; returns an `i32`.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f32, beta: f32,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_bias_silu_f32_simt_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_bias_silu_f32_simt_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
}
// Foreign (C ABI) declarations for the `gemm_f64_rcr_sm80` and
// `gemm_f64_rrr_sm80` families, compiled only when the `sm80` or `sm90a`
// feature is enabled.
//
// Each family declares `*_run`, `*_workspace_size` and `*_can_implement`.
// All buffers are untyped `c_void` pointers and `lda`..`ldd` are `i64`.
// In these `run` signatures `alpha` and `beta` are `f64`, not `f32`.
// No item uses the `safe` qualifier, so every function is `unsafe` to call.
//
// NOTE(review): "rcr"/"rrr" presumably denote the A/B/C layouts and the
// `i32` return is presumably a status code — not visible from these
// declarations; confirm against the native implementation.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point of the `gemm_f64_rcr_sm80` family; `alpha`/`beta` are `f64`.
pub fn baracuda_cutlass_gemm_f64_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_f64_rcr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_f64_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_f64_rcr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_f64_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
) -> i32;
/// `run` entry point of the `gemm_f64_rrr_sm80` family; `alpha`/`beta` are `f64`.
pub fn baracuda_cutlass_gemm_f64_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `workspace_size` query of the `gemm_f64_rrr_sm80` family; takes only `m`, `n`, `k`.
pub fn baracuda_cutlass_gemm_f64_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` query of the `gemm_f64_rrr_sm80` family; no scalars, workspace or stream.
pub fn baracuda_cutlass_gemm_f64_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
) -> i32;
}
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_f64_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
pub fn baracuda_cutlass_gemm_bias_f64_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
pub fn baracuda_cutlass_gemm_bias_f64_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
pub fn baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
pub fn baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
pub fn baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
pub fn baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
pub fn baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
pub fn baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_f64_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
pub fn baracuda_cutlass_gemm_bias_f64_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
pub fn baracuda_cutlass_gemm_bias_f64_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// C-ABI declaration of the `run` entry point for the `gemm_bias_relu_f64_rrr_sm80` kernel.
///
/// Takes the `i32` extents `m`, `n`, `k`; the operand pointers `a`, `b`, `c` (read-only) and
/// `d` (mutable), each paired with an `i64` leading-dimension argument; a read-only `bias`
/// pointer; the `f64` scalars `alpha` and `beta`; a workspace pointer with its byte length;
/// and an opaque `stream` pointer. Returns an `i32`.
///
/// NOTE(review): presumably computes `D = relu(alpha * A * B + beta * C + bias)` on the
/// given CUDA stream and returns a status code — confirm against the native implementation.
///
/// # Safety
/// This is a foreign function; Rust cannot check any of the raw-pointer arguments. The
/// caller must uphold whatever validity, size and alignment rules the native side imposes.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// C-ABI declaration of the workspace-size query for the `gemm_bias_relu_f64_rrr_sm80` kernel.
///
/// Takes only the `i32` extents `m`, `n`, `k` and returns a `usize`.
/// NOTE(review): presumably the byte count to pass as `workspace_bytes` to the matching
/// `_run` call — confirm against the native implementation.
pub fn baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// C-ABI declaration of the `can_implement` check for the `gemm_bias_relu_f64_rrr_sm80` kernel.
///
/// Carries the operand portion of the matching `_run` signature (extents, operand pointers,
/// leading dimensions, `bias`) but no scalars, workspace or stream. Returns an `i32`.
///
/// NOTE(review): presumably reports whether the kernel supports this problem without
/// launching it; the encoding of the return value is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
pub fn baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// C-ABI declaration of the `run` entry point for the `gemm_bias_gelu_f64_rrr_sm80` kernel.
///
/// Takes the `i32` extents `m`, `n`, `k`; the operand pointers `a`, `b`, `c` (read-only) and
/// `d` (mutable), each paired with an `i64` leading-dimension argument; a read-only `bias`
/// pointer; the `f64` scalars `alpha` and `beta`; a workspace pointer with its byte length;
/// and an opaque `stream` pointer. Returns an `i32`.
///
/// NOTE(review): presumably computes `D = gelu(alpha * A * B + beta * C + bias)` on the
/// given CUDA stream and returns a status code — confirm against the native implementation.
///
/// # Safety
/// This is a foreign function; Rust cannot check any of the raw-pointer arguments. The
/// caller must uphold whatever validity, size and alignment rules the native side imposes.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// C-ABI declaration of the workspace-size query for the `gemm_bias_gelu_f64_rrr_sm80` kernel.
///
/// Takes only the `i32` extents `m`, `n`, `k` and returns a `usize`.
/// NOTE(review): presumably the byte count to pass as `workspace_bytes` to the matching
/// `_run` call — confirm against the native implementation.
pub fn baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// C-ABI declaration of the `can_implement` check for the `gemm_bias_gelu_f64_rrr_sm80` kernel.
///
/// Carries the operand portion of the matching `_run` signature (extents, operand pointers,
/// leading dimensions, `bias`) but no scalars, workspace or stream. Returns an `i32`.
///
/// NOTE(review): presumably reports whether the kernel supports this problem without
/// launching it; the encoding of the return value is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
pub fn baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
/// C-ABI declaration of the `run` entry point for the `gemm_bias_silu_f64_rrr_sm80` kernel.
///
/// Takes the `i32` extents `m`, `n`, `k`; the operand pointers `a`, `b`, `c` (read-only) and
/// `d` (mutable), each paired with an `i64` leading-dimension argument; a read-only `bias`
/// pointer; the `f64` scalars `alpha` and `beta`; a workspace pointer with its byte length;
/// and an opaque `stream` pointer. Returns an `i32`.
///
/// NOTE(review): presumably computes `D = silu(alpha * A * B + beta * C + bias)` on the
/// given CUDA stream and returns a status code — confirm against the native implementation.
///
/// # Safety
/// This is a foreign function; Rust cannot check any of the raw-pointer arguments. The
/// caller must uphold whatever validity, size and alignment rules the native side imposes.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_run(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
alpha: f64, beta: f64,
workspace: *mut c_void, workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// C-ABI declaration of the workspace-size query for the `gemm_bias_silu_f64_rrr_sm80` kernel.
///
/// Takes only the `i32` extents `m`, `n`, `k` and returns a `usize`.
/// NOTE(review): presumably the byte count to pass as `workspace_bytes` to the matching
/// `_run` call — confirm against the native implementation.
pub fn baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// C-ABI declaration of the `can_implement` check for the `gemm_bias_silu_f64_rrr_sm80` kernel.
///
/// Carries the operand portion of the matching `_run` signature (extents, operand pointers,
/// leading dimensions, `bias`) but no scalars, workspace or stream. Returns an `i32`.
///
/// NOTE(review): presumably reports whether the kernel supports this problem without
/// launching it; the encoding of the return value is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
pub fn baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_can_implement(
m: i32, n: i32, k: i32,
a: *const c_void, lda: i64,
b: *const c_void, ldb: i64,
c: *const c_void, ldc: i64,
d: *mut c_void, ldd: i64,
bias: *const c_void,
) -> i32;
}
// Strided-batched GEMM entry points for f16 and bf16 (`rcr`, sm80 build).
//
// Compared with the non-batched f16/bf16 declarations at the top of this file, every
// operand gains an `i64` `stride_*` argument and the calls take a `batch_count`.
// Only compiled when the `sm80` or `sm90a` feature is enabled.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// C-ABI declaration of the `run` entry point for the batched f16 `rcr` sm80 kernel.
///
/// Each operand pointer (`a`, `b`, `c` read-only; `d` mutable) is paired with an `i64`
/// leading dimension and an `i64` stride. `alpha`/`beta` are `f32`. Returns an `i32`.
///
/// NOTE(review): `stride_*` is presumably the element distance between consecutive
/// batch entries and the return value presumably a status code — confirm against the
/// native implementation.
///
/// # Safety
/// Foreign function; the caller must guarantee every raw pointer meets the native
/// side's validity requirements for the whole batch.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_batched_f16_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
stride_a: i64,
b: *const c_void,
ldb: i64,
stride_b: i64,
c: *const c_void,
ldc: i64,
stride_c: i64,
d: *mut c_void,
ldd: i64,
stride_d: i64,
alpha: f32,
beta: f32,
batch_count: i32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// Workspace-size query for the batched f16 kernel; takes the extents plus
/// `batch_count` and returns a `usize`.
///
/// NOTE(review): presumably the byte count expected as `workspace_bytes` by the
/// matching `_run` — confirm.
pub fn baracuda_cutlass_gemm_batched_f16_rcr_sm80_workspace_size(
m: i32,
n: i32,
k: i32,
batch_count: i32,
) -> usize;
/// `can_implement` check for the batched f16 kernel.
///
/// Takes the same extents, operand pointers, leading dimensions, strides and
/// `batch_count` as the matching `_run`, without scalars, workspace or stream.
/// NOTE(review): meaning of the returned `i32` is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_batched_f16_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
stride_a: i64,
b: *const c_void,
ldb: i64,
stride_b: i64,
c: *const c_void,
ldc: i64,
stride_c: i64,
d: *mut c_void,
ldd: i64,
stride_d: i64,
batch_count: i32,
) -> i32;
/// C-ABI declaration of the `run` entry point for the batched bf16 `rcr` sm80 kernel.
///
/// Signature is identical to the batched f16 `_run` declared above in this block.
/// NOTE(review): operand element type is presumably bf16 — confirm against the
/// native implementation.
///
/// # Safety
/// Foreign function; the caller must guarantee every raw pointer meets the native
/// side's validity requirements for the whole batch.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_batched_bf16_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
stride_a: i64,
b: *const c_void,
ldb: i64,
stride_b: i64,
c: *const c_void,
ldc: i64,
stride_c: i64,
d: *mut c_void,
ldd: i64,
stride_d: i64,
alpha: f32,
beta: f32,
batch_count: i32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// Workspace-size query for the batched bf16 kernel; takes the extents plus
/// `batch_count` and returns a `usize`.
///
/// NOTE(review): presumably the byte count expected as `workspace_bytes` by the
/// matching `_run` — confirm.
pub fn baracuda_cutlass_gemm_batched_bf16_rcr_sm80_workspace_size(
m: i32,
n: i32,
k: i32,
batch_count: i32,
) -> usize;
/// `can_implement` check for the batched bf16 kernel.
///
/// Takes the same extents, operand pointers, leading dimensions, strides and
/// `batch_count` as the matching `_run`, without scalars, workspace or stream.
/// NOTE(review): meaning of the returned `i32` is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_gemm_batched_bf16_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
stride_a: i64,
b: *const c_void,
ldb: i64,
stride_b: i64,
c: *const c_void,
ldc: i64,
stride_c: i64,
d: *mut c_void,
ldd: i64,
stride_d: i64,
batch_count: i32,
) -> i32;
}
// Grouped GEMM entry points for f16 and bf16 (`rcr`, sm80 build).
//
// Each dtype exposes four functions: `sufficient`, `scratch_bytes`, `can_implement`
// and `run`. The first three take per-group extent arrays (`h_m`, `h_n`, `h_k`) plus
// `group_count`; `run` takes type-erased pointers for problem sizes, operand pointer
// tables and leading-dimension tables.
//
// NOTE(review): the `h_` / `d_` prefixes presumably mean host / device memory, and
// `sufficient` presumably yields the `threadblock_count` consumed by `scratch_bytes`
// and `run` — neither is established by this file; confirm against the native side.
//
// Only compiled when the `sm80` or `sm90a` feature is enabled.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `sufficient` query for the grouped f16 kernel; reads three `i32` arrays and a
/// group count, returns an `i32`.
///
/// # Safety
/// Foreign function; `h_m`, `h_n`, `h_k` must be valid for whatever length the
/// native side reads (presumably `group_count` elements — confirm).
pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_sufficient(
h_m: *const i32,
h_n: *const i32,
h_k: *const i32,
group_count: i32,
) -> i32;
/// Scratch-size query for the grouped f16 kernel; same inputs as `sufficient` plus
/// `threadblock_count`, returns a `usize`.
///
/// NOTE(review): presumably the byte count expected as `scratch_bytes` by `_run`.
///
/// # Safety
/// Foreign function; the array pointers must be valid for whatever length the
/// native side reads.
pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_scratch_bytes(
h_m: *const i32,
h_n: *const i32,
h_k: *const i32,
group_count: i32,
threadblock_count: i32,
) -> usize;
/// `can_implement` check for the grouped f16 kernel; same inputs as `sufficient`.
///
/// NOTE(review): meaning of the returned `i32` is not visible here — confirm.
///
/// # Safety
/// Foreign function; the array pointers must be valid for whatever length the
/// native side reads.
pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_can_implement(
h_m: *const i32,
h_n: *const i32,
h_k: *const i32,
group_count: i32,
) -> i32;
/// `run` entry point for the grouped f16 kernel.
///
/// All table arguments are type-erased (`*const c_void` / `*mut c_void`); only
/// `d_ptr_d` and `scratch` are mutable. `alpha`/`beta` are `f32`. Returns an `i32`.
///
/// NOTE(review): the element types behind the `c_void` tables (problem-size
/// structs, pointer arrays, leading-dimension arrays) are defined by the native
/// side and cannot be determined from this declaration — confirm before use.
///
/// # Safety
/// Foreign function; every pointer must satisfy the native side's requirements.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_run(
group_count: i32,
threadblock_count: i32,
d_problem_sizes: *const c_void,
d_ptr_a: *const c_void,
d_ptr_b: *const c_void,
d_ptr_c: *const c_void,
d_ptr_d: *mut c_void,
d_lda: *const c_void,
d_ldb: *const c_void,
d_ldc: *const c_void,
d_ldd: *const c_void,
h_problem_sizes: *const c_void,
alpha: f32,
beta: f32,
scratch: *mut c_void,
scratch_bytes: usize,
stream: *mut c_void,
) -> i32;
/// `sufficient` query for the grouped bf16 kernel; reads three `i32` arrays and a
/// group count, returns an `i32`.
///
/// # Safety
/// Foreign function; `h_m`, `h_n`, `h_k` must be valid for whatever length the
/// native side reads (presumably `group_count` elements — confirm).
pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_sufficient(
h_m: *const i32,
h_n: *const i32,
h_k: *const i32,
group_count: i32,
) -> i32;
/// Scratch-size query for the grouped bf16 kernel; same inputs as `sufficient` plus
/// `threadblock_count`, returns a `usize`.
///
/// NOTE(review): presumably the byte count expected as `scratch_bytes` by `_run`.
///
/// # Safety
/// Foreign function; the array pointers must be valid for whatever length the
/// native side reads.
pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_scratch_bytes(
h_m: *const i32,
h_n: *const i32,
h_k: *const i32,
group_count: i32,
threadblock_count: i32,
) -> usize;
/// `can_implement` check for the grouped bf16 kernel; same inputs as `sufficient`.
///
/// NOTE(review): meaning of the returned `i32` is not visible here — confirm.
///
/// # Safety
/// Foreign function; the array pointers must be valid for whatever length the
/// native side reads.
pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_can_implement(
h_m: *const i32,
h_n: *const i32,
h_k: *const i32,
group_count: i32,
) -> i32;
/// `run` entry point for the grouped bf16 kernel.
///
/// Signature is identical to the grouped f16 `_run` declared above in this block:
/// type-erased tables, `f32` scalars, a scratch buffer with its byte length and an
/// opaque `stream` pointer. Returns an `i32`.
///
/// # Safety
/// Foreign function; every pointer must satisfy the native side's requirements.
#[allow(clippy::too_many_arguments)]
pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_run(
group_count: i32,
threadblock_count: i32,
d_problem_sizes: *const c_void,
d_ptr_a: *const c_void,
d_ptr_b: *const c_void,
d_ptr_c: *const c_void,
d_ptr_d: *mut c_void,
d_lda: *const c_void,
d_ldb: *const c_void,
d_ldc: *const c_void,
d_ldd: *const c_void,
h_problem_sizes: *const c_void,
alpha: f32,
beta: f32,
scratch: *mut c_void,
scratch_bytes: usize,
stream: *mut c_void,
) -> i32;
}
// Plain (no bias, no activation argument) GEMM entry points for the `s8` and `u8`
// kernel families (`rcr`, sm80 build). The parameter lists match the f16/bf16
// declarations at the top of this file, including `f32` `alpha`/`beta`.
//
// Only compiled when the `sm80` or `sm90a` feature is enabled.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
/// `run` entry point for the `s8` `rcr` sm80 GEMM kernel.
///
/// Takes `i32` extents, four operand pointers (`a`, `b`, `c` read-only; `d` mutable)
/// each with an `i64` leading dimension, `f32` scalars, a workspace pointer with its
/// byte length and an opaque `stream` pointer. Returns an `i32`.
///
/// NOTE(review): operand/accumulator element types and the meaning of the return
/// value are defined by the native side and not visible here — confirm.
///
/// # Safety
/// Foreign function; Rust cannot check the raw pointers. The caller must uphold the
/// native side's validity, size and alignment requirements. The `int8_bias_ffi!`
/// generated declarations in this file refer to this function for their contract.
pub fn baracuda_cutlass_gemm_s8_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// Workspace-size query for the `s8` kernel; takes the extents and returns a `usize`.
///
/// NOTE(review): presumably the byte count expected as `workspace_bytes` by the
/// matching `_run` — confirm.
pub fn baracuda_cutlass_gemm_s8_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` check for the `s8` kernel.
///
/// Takes the extents, operand pointers and leading dimensions of the matching
/// `_run`, without scalars, workspace or stream. Returns an `i32`.
/// NOTE(review): meaning of the returned value is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
pub fn baracuda_cutlass_gemm_s8_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
/// `run` entry point for the `u8` `rcr` sm80 GEMM kernel.
///
/// Signature is identical to the `s8` `_run` declared above in this block.
/// NOTE(review): operand element type is presumably unsigned 8-bit — confirm
/// against the native implementation.
///
/// # Safety
/// Foreign function; Rust cannot check the raw pointers. The caller must uphold the
/// native side's validity, size and alignment requirements.
pub fn baracuda_cutlass_gemm_u8_rcr_sm80_run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
/// Workspace-size query for the `u8` kernel; takes the extents and returns a `usize`.
///
/// NOTE(review): presumably the byte count expected as `workspace_bytes` by the
/// matching `_run` — confirm.
pub fn baracuda_cutlass_gemm_u8_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
/// `can_implement` check for the `u8` kernel.
///
/// Takes the extents, operand pointers and leading dimensions of the matching
/// `_run`, without scalars, workspace or stream. Returns an `i32`.
/// NOTE(review): meaning of the returned value is not visible here — confirm.
///
/// # Safety
/// Foreign function taking raw pointers; the caller must satisfy the native side's
/// requirements for each of them.
pub fn baracuda_cutlass_gemm_u8_rcr_sm80_can_implement(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
) -> i32;
}
// Generates the three foreign declarations that make up one int8 bias-fused GEMM
// variant, so the signature and its rustdoc are written once instead of per kernel.
//
// Arguments (all identifiers, naming the exported C symbols):
//   $run - the launch entry point (operands + `bias` + `f32` alpha/beta + workspace + stream)
//   $ws  - the workspace-size query taking `(m, n, k)`
//   $ck  - the pre-launch check taking the operands and `bias` only
//
// Each invocation expands to its own `unsafe extern "C"` block. The expansion names
// `c_void` unqualified, so the invoking scope must have `c_void` imported.
//
// The macro itself carries no `#[cfg]`; feature gating is applied where it is invoked.
macro_rules! int8_bias_ffi {
($run:ident, $ws:ident, $ck:ident) => {
unsafe extern "C" {
// Docs are attached via `#[doc = ...]` so the same text lands on every generated
// `_run` symbol.
#[doc = concat!(
"int8 bias-fused GEMM with optional fused activation.\n\n",
"Computes `D = saturating_cast(activation(alpha * (A * B) ",
"+ beta * C + bias_broadcast(N)))`. See the section header for ",
"the layout / accumulator / clamp contract.\n\n",
"# Safety\nSame contract as ",
"[`baracuda_cutlass_gemm_s8_rcr_sm80_run`]."
)]
pub fn $run(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
// Type-erased: the same signature serves every generated variant.
bias: *const c_void,
alpha: f32,
beta: f32,
workspace: *mut c_void,
workspace_bytes: usize,
stream: *mut c_void,
) -> i32;
#[doc = "Workspace size in bytes for the corresponding `_run` entry point."]
pub fn $ws(m: i32, n: i32, k: i32) -> usize;
#[doc = concat!(
"Pre-launch implementability check for the corresponding ",
"`_run` entry point.\n\n# Safety\nSame pointer-validity ",
"contract as the matching `_run`, but only host-side ",
"alignment and leading-dimension checks occur."
)]
// Same operand list as `$run`, minus the scalars, workspace and stream.
pub fn $ck(
m: i32,
n: i32,
k: i32,
a: *const c_void,
lda: i64,
b: *const c_void,
ldb: i64,
c: *const c_void,
ldc: i64,
d: *mut c_void,
ldd: i64,
bias: *const c_void,
) -> i32;
}
};
}
// Private module holding the `int8_bias_ffi!`-generated declarations.
//
// Sixteen variants are instantiated, one per combination of:
//   operand family : `s8`, `u8`
//   bias tag       : `f32bias`, `i32bias`
//   activation tag : none, `relu`, `gelu`, `silu`
// Each invocation declares a `_run`, `_workspace_size` and `_can_implement` symbol.
//
// Only compiled when the `sm80` or `sm90a` feature is enabled.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
mod int8_bias_decls {
// The macro expansion refers to `c_void` unqualified, so it must be in scope here.
use super::c_void;
// --- s8 operands, f32bias: none / relu / gelu / silu ---
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_can_implement
);
// --- s8 operands, i32bias: none / relu / gelu / silu ---
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_can_implement
);
// --- u8 operands, f32bias: none / relu / gelu / silu ---
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_can_implement
);
// --- u8 operands, i32bias: none / relu / gelu / silu ---
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_can_implement
);
int8_bias_ffi!(
baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_can_implement
);
}
// Re-export every declaration from the private `int8_bias_decls` module so callers
// reach them at the same path as the hand-written extern declarations in this file.
// Gated by the same features as the module itself.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
pub use int8_bias_decls::*;