#![allow(non_camel_case_types)]
#![allow(clippy::too_many_arguments)]
use core::ffi::c_void;
use baracuda_cutlass_kernels_sys as kk;
// Generates three exported C-ABI wrappers (workspace-size query,
// can-implement check, run) for one bias-free GEMM kernel whose `alpha` and
// `beta` scalars are `f32`.
//
// The first three identifiers are the names of the exported wrappers (run,
// workspace-size, can-implement); the last three are the functions inside
// `kk` that the wrappers forward to, given in the same order.
macro_rules! gemm_nobias_f32 {
    (
        $export_run:ident, $export_ws:ident, $export_ci:ident,
        $sys_run:ident, $sys_ws:ident, $sys_ci:ident
    ) => {
        /// Forwards `m`, `n`, `k` to the wrapped workspace-size function and
        /// returns its result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function; this wrapper performs no checks of its own.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ws(m: i32, n: i32, k: i32) -> usize {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ws(m, n, k) }
        }

        /// Forwards the problem description to the wrapped can-implement
        /// function and returns its `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ci(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ci(m, n, k, a, lda, b, ldb, c, ldc, d, ldd) }
        }

        /// Forwards every argument, in order, to the wrapped run function
        /// and returns its `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers,
        /// the workspace buffer and the stream handle.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_run(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
            alpha: f32,
            beta: f32,
            workspace: *mut c_void,
            workspace_bytes: usize,
            stream: *mut c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe {
                kk::$sys_run(
                    m, n, k,
                    a, lda,
                    b, ldb,
                    c, ldc,
                    d, ldd,
                    alpha, beta,
                    workspace, workspace_bytes,
                    stream,
                )
            }
        }
    };
}
// Generates three exported C-ABI wrappers (workspace-size query,
// can-implement check, run) for one bias-free GEMM kernel whose `alpha` and
// `beta` scalars are `f64`.
//
// The first three identifiers are the names of the exported wrappers (run,
// workspace-size, can-implement); the last three are the functions inside
// `kk` that the wrappers forward to, given in the same order.
macro_rules! gemm_nobias_f64 {
    (
        $export_run:ident, $export_ws:ident, $export_ci:ident,
        $sys_run:ident, $sys_ws:ident, $sys_ci:ident
    ) => {
        /// Forwards `m`, `n`, `k` to the wrapped workspace-size function and
        /// returns its result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function; this wrapper performs no checks of its own.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ws(m: i32, n: i32, k: i32) -> usize {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ws(m, n, k) }
        }

        /// Forwards the problem description to the wrapped can-implement
        /// function and returns its `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ci(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ci(m, n, k, a, lda, b, ldb, c, ldc, d, ldd) }
        }

        /// Forwards every argument, in order, to the wrapped run function
        /// and returns its `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers,
        /// the workspace buffer and the stream handle.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_run(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
            alpha: f64,
            beta: f64,
            workspace: *mut c_void,
            workspace_bytes: usize,
            stream: *mut c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe {
                kk::$sys_run(
                    m, n, k,
                    a, lda,
                    b, ldb,
                    c, ldc,
                    d, ldd,
                    alpha, beta,
                    workspace, workspace_bytes,
                    stream,
                )
            }
        }
    };
}
// Generates three exported C-ABI wrappers (workspace-size query,
// can-implement check, run) for one GEMM kernel that additionally takes a
// `bias` pointer and whose `alpha` and `beta` scalars are `f32`.
//
// The first three identifiers are the names of the exported wrappers (run,
// workspace-size, can-implement); the last three are the functions inside
// `kk` that the wrappers forward to, given in the same order.
macro_rules! gemm_bias_f32 {
    (
        $export_run:ident, $export_ws:ident, $export_ci:ident,
        $sys_run:ident, $sys_ws:ident, $sys_ci:ident
    ) => {
        /// Forwards `m`, `n`, `k` to the wrapped workspace-size function and
        /// returns its result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function; this wrapper performs no checks of its own.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ws(m: i32, n: i32, k: i32) -> usize {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ws(m, n, k) }
        }

        /// Forwards the problem description, with `bias` as the final
        /// argument, to the wrapped can-implement function and returns its
        /// `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ci(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
            bias: *const c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ci(m, n, k, a, lda, b, ldb, c, ldc, d, ldd, bias) }
        }

        /// Forwards every argument, in order (`bias` sits between `ldd` and
        /// `alpha`), to the wrapped run function and returns its `i32`
        /// result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers,
        /// the workspace buffer and the stream handle.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_run(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
            bias: *const c_void,
            alpha: f32,
            beta: f32,
            workspace: *mut c_void,
            workspace_bytes: usize,
            stream: *mut c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe {
                kk::$sys_run(
                    m, n, k,
                    a, lda,
                    b, ldb,
                    c, ldc,
                    d, ldd,
                    bias,
                    alpha, beta,
                    workspace, workspace_bytes,
                    stream,
                )
            }
        }
    };
}
// Generates three exported C-ABI wrappers (workspace-size query,
// can-implement check, run) for one GEMM kernel that additionally takes a
// `bias` pointer and whose `alpha` and `beta` scalars are `f64`.
//
// The first three identifiers are the names of the exported wrappers (run,
// workspace-size, can-implement); the last three are the functions inside
// `kk` that the wrappers forward to, given in the same order.
macro_rules! gemm_bias_f64 {
    (
        $export_run:ident, $export_ws:ident, $export_ci:ident,
        $sys_run:ident, $sys_ws:ident, $sys_ci:ident
    ) => {
        /// Forwards `m`, `n`, `k` to the wrapped workspace-size function and
        /// returns its result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function; this wrapper performs no checks of its own.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ws(m: i32, n: i32, k: i32) -> usize {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ws(m, n, k) }
        }

        /// Forwards the problem description, with `bias` as the final
        /// argument, to the wrapped can-implement function and returns its
        /// `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ci(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
            bias: *const c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ci(m, n, k, a, lda, b, ldb, c, ldc, d, ldd, bias) }
        }

        /// Forwards every argument, in order (`bias` sits between `ldd` and
        /// `alpha`), to the wrapped run function and returns its `i32`
        /// result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers,
        /// the workspace buffer and the stream handle.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_run(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            b: *const c_void,
            ldb: i64,
            c: *const c_void,
            ldc: i64,
            d: *mut c_void,
            ldd: i64,
            bias: *const c_void,
            alpha: f64,
            beta: f64,
            workspace: *mut c_void,
            workspace_bytes: usize,
            stream: *mut c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe {
                kk::$sys_run(
                    m, n, k,
                    a, lda,
                    b, ldb,
                    c, ldc,
                    d, ldd,
                    bias,
                    alpha, beta,
                    workspace, workspace_bytes,
                    stream,
                )
            }
        }
    };
}
// Generates three exported C-ABI wrappers (workspace-size query,
// can-implement check, run) for one batched GEMM kernel. Each operand
// carries a pointer, a leading dimension and a per-operand stride, and the
// calls also take a `batch_count`; `alpha` and `beta` are `f32`.
//
// The first three identifiers are the names of the exported wrappers (run,
// workspace-size, can-implement); the last three are the functions inside
// `kk` that the wrappers forward to, given in the same order.
macro_rules! gemm_batched_f32 {
    (
        $export_run:ident, $export_ws:ident, $export_ci:ident,
        $sys_run:ident, $sys_ws:ident, $sys_ci:ident
    ) => {
        /// Forwards `m`, `n`, `k` and `batch_count` to the wrapped
        /// workspace-size function and returns its result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function; this wrapper performs no checks of its own.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ws(
            m: i32,
            n: i32,
            k: i32,
            batch_count: i32,
        ) -> usize {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe { kk::$sys_ws(m, n, k, batch_count) }
        }

        /// Forwards the batched problem description (with `batch_count`
        /// last) to the wrapped can-implement function and returns its
        /// `i32` result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_ci(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            stride_a: i64,
            b: *const c_void,
            ldb: i64,
            stride_b: i64,
            c: *const c_void,
            ldc: i64,
            stride_c: i64,
            d: *mut c_void,
            ldd: i64,
            stride_d: i64,
            batch_count: i32,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe {
                kk::$sys_ci(
                    m, n, k,
                    a, lda, stride_a,
                    b, ldb, stride_b,
                    c, ldc, stride_c,
                    d, ldd, stride_d,
                    batch_count,
                )
            }
        }

        /// Forwards every argument, in order (`batch_count` follows
        /// `beta`), to the wrapped run function and returns its `i32`
        /// result unchanged.
        ///
        /// # Safety
        ///
        /// The caller must meet every precondition of the wrapped `kk`
        /// function, including whatever it requires of the raw pointers,
        /// the workspace buffer and the stream handle.
        #[unsafe(no_mangle)]
        pub unsafe extern "C" fn $export_run(
            m: i32,
            n: i32,
            k: i32,
            a: *const c_void,
            lda: i64,
            stride_a: i64,
            b: *const c_void,
            ldb: i64,
            stride_b: i64,
            c: *const c_void,
            ldc: i64,
            stride_c: i64,
            d: *mut c_void,
            ldd: i64,
            stride_d: i64,
            alpha: f32,
            beta: f32,
            batch_count: i32,
            workspace: *mut c_void,
            workspace_bytes: usize,
            stream: *mut c_void,
        ) -> i32 {
            // SAFETY: arguments are passed through untouched; the caller of
            // this `unsafe fn` is responsible for the callee's contract.
            unsafe {
                kk::$sys_run(
                    m, n, k,
                    a, lda, stride_a,
                    b, ldb, stride_b,
                    c, ldc, stride_c,
                    d, ldd, stride_d,
                    alpha, beta,
                    batch_count,
                    workspace, workspace_bytes,
                    stream,
                )
            }
        }
    };
}
// ---------------------------------------------------------------------------
// Bias-free GEMM exports with `f32` alpha/beta.
//
// Each invocation expands `gemm_nobias_f32!` into three exported
// `baracuda_kernels_*` functions (`_run`, `_workspace_size`,
// `_can_implement`) that forward to the `baracuda_cutlass_*` functions of the
// same suffix in `kk`. Every invocation is compiled only when the `sm80` or
// `sm90a` Cargo feature is enabled.
//
// NOTE(review): the `f16` / `bf16` / `tf32` / `f32_simt` / `s8` / `u8` and
// `rcr` / `rrr` name fragments presumably encode element type and operand
// layouts — confirm against the kernel crate.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_f16_rcr_sm80_run,
baracuda_kernels_gemm_f16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_f16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_f16_rcr_sm80_run,
baracuda_cutlass_gemm_f16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_f16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_f16_rrr_sm80_run,
baracuda_kernels_gemm_f16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_f16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_f16_rrr_sm80_run,
baracuda_cutlass_gemm_f16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_f16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_bf16_rcr_sm80_run,
baracuda_kernels_gemm_bf16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bf16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bf16_rcr_sm80_run,
baracuda_cutlass_gemm_bf16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bf16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_bf16_rrr_sm80_run,
baracuda_kernels_gemm_bf16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bf16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bf16_rrr_sm80_run,
baracuda_cutlass_gemm_bf16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bf16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_tf32_rcr_sm80_run,
baracuda_kernels_gemm_tf32_rcr_sm80_workspace_size,
baracuda_kernels_gemm_tf32_rcr_sm80_can_implement,
baracuda_cutlass_gemm_tf32_rcr_sm80_run,
baracuda_cutlass_gemm_tf32_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_tf32_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_tf32_rrr_sm80_run,
baracuda_kernels_gemm_tf32_rrr_sm80_workspace_size,
baracuda_kernels_gemm_tf32_rrr_sm80_can_implement,
baracuda_cutlass_gemm_tf32_rrr_sm80_run,
baracuda_cutlass_gemm_tf32_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_tf32_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_f32_simt_rcr_sm80_run,
baracuda_kernels_gemm_f32_simt_rcr_sm80_workspace_size,
baracuda_kernels_gemm_f32_simt_rcr_sm80_can_implement,
baracuda_cutlass_gemm_f32_simt_rcr_sm80_run,
baracuda_cutlass_gemm_f32_simt_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_f32_simt_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_f32_simt_rrr_sm80_run,
baracuda_kernels_gemm_f32_simt_rrr_sm80_workspace_size,
baracuda_kernels_gemm_f32_simt_rrr_sm80_can_implement,
baracuda_cutlass_gemm_f32_simt_rrr_sm80_run,
baracuda_cutlass_gemm_f32_simt_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_f32_simt_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_s8_rcr_sm80_run,
baracuda_kernels_gemm_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_s8_rcr_sm80_run,
baracuda_cutlass_gemm_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f32!(
baracuda_kernels_gemm_u8_rcr_sm80_run,
baracuda_kernels_gemm_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_u8_rcr_sm80_run,
baracuda_cutlass_gemm_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_u8_rcr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// Bias-free GEMM exports with `f64` alpha/beta.
//
// Each invocation expands `gemm_nobias_f64!` into three exported
// `baracuda_kernels_*` functions (`_run`, `_workspace_size`,
// `_can_implement`) that forward to the `baracuda_cutlass_*` functions of the
// same suffix in `kk`. Compiled only when the `sm80` or `sm90a` Cargo feature
// is enabled.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f64!(
baracuda_kernels_gemm_f64_rcr_sm80_run,
baracuda_kernels_gemm_f64_rcr_sm80_workspace_size,
baracuda_kernels_gemm_f64_rcr_sm80_can_implement,
baracuda_cutlass_gemm_f64_rcr_sm80_run,
baracuda_cutlass_gemm_f64_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_f64_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_nobias_f64!(
baracuda_kernels_gemm_f64_rrr_sm80_run,
baracuda_kernels_gemm_f64_rrr_sm80_workspace_size,
baracuda_kernels_gemm_f64_rrr_sm80_can_implement,
baracuda_cutlass_gemm_f64_rrr_sm80_run,
baracuda_cutlass_gemm_f64_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_f64_rrr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `gemm_bias_*` exports with `f32` alpha/beta.
//
// Each invocation expands `gemm_bias_f32!` into three exported
// `baracuda_kernels_*` functions (`_run`, `_workspace_size`,
// `_can_implement`) that take an extra `bias` pointer and forward to the
// `baracuda_cutlass_*` functions of the same suffix in `kk`. Compiled only
// when the `sm80` or `sm90a` Cargo feature is enabled.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_f16_rcr_sm80_run,
baracuda_kernels_gemm_bias_f16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_f16_rrr_sm80_run,
baracuda_kernels_gemm_bias_f16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_f16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_bf16_rcr_sm80_run,
baracuda_kernels_gemm_bias_bf16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_bf16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_bf16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_bf16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_bf16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_bf16_rrr_sm80_run,
baracuda_kernels_gemm_bias_bf16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_bf16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_bf16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_bf16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_bf16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_tf32_rcr_sm80_run,
baracuda_kernels_gemm_bias_tf32_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_tf32_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_tf32_rcr_sm80_run,
baracuda_cutlass_gemm_bias_tf32_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_tf32_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_tf32_rrr_sm80_run,
baracuda_kernels_gemm_bias_tf32_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_tf32_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_tf32_rrr_sm80_run,
baracuda_cutlass_gemm_bias_tf32_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_tf32_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_f32_simt_rcr_sm80_run,
baracuda_kernels_gemm_bias_f32_simt_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f32_simt_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_f32_simt_rrr_sm80_run,
baracuda_kernels_gemm_bias_f32_simt_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f32_simt_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_run,
baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `gemm_bias_relu_*` exports with `f32` alpha/beta.
//
// Same wrapper shape as the plain `gemm_bias_*` group: each invocation
// expands `gemm_bias_f32!` into `_run`, `_workspace_size` and
// `_can_implement` exports that forward to the `baracuda_cutlass_*` functions
// of the same suffix in `kk`. Compiled only when the `sm80` or `sm90a` Cargo
// feature is enabled.
//
// NOTE(review): `relu` presumably names an activation fused into the kernel;
// nothing in this file applies it — confirm against the kernel crate.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_f16_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_f16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_f16_rrr_sm80_run,
baracuda_kernels_gemm_bias_relu_f16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_bf16_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_bf16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_bf16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_bf16_rrr_sm80_run,
baracuda_kernels_gemm_bias_relu_bf16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_bf16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_tf32_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_tf32_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_tf32_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_tf32_rrr_sm80_run,
baracuda_kernels_gemm_bias_relu_tf32_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_tf32_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_run,
baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_f32_simt_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_f32_simt_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f32_simt_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_f32_simt_rrr_sm80_run,
baracuda_kernels_gemm_bias_relu_f32_simt_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f32_simt_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `gemm_bias_gelu_*` exports with `f32` alpha/beta.
//
// Same wrapper shape as the plain `gemm_bias_*` group: each invocation
// expands `gemm_bias_f32!` into `_run`, `_workspace_size` and
// `_can_implement` exports that forward to the `baracuda_cutlass_*` functions
// of the same suffix in `kk`. Compiled only when the `sm80` or `sm90a` Cargo
// feature is enabled.
//
// NOTE(review): `gelu` presumably names an activation fused into the kernel;
// nothing in this file applies it — confirm against the kernel crate.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_f16_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_f16_rrr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_bf16_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_bf16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_bf16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_bf16_rrr_sm80_run,
baracuda_kernels_gemm_bias_gelu_bf16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_bf16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_tf32_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_tf32_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_tf32_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_tf32_rrr_sm80_run,
baracuda_kernels_gemm_bias_gelu_tf32_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_tf32_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_f32_simt_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f32_simt_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f32_simt_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_f32_simt_rrr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f32_simt_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f32_simt_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `gemm_bias_silu_*` exports with `f32` alpha/beta.
//
// Same wrapper shape as the plain `gemm_bias_*` group: each invocation
// expands `gemm_bias_f32!` into `_run`, `_workspace_size` and
// `_can_implement` exports that forward to the `baracuda_cutlass_*` functions
// of the same suffix in `kk`. Compiled only when the `sm80` or `sm90a` Cargo
// feature is enabled.
//
// NOTE(review): `silu` presumably names an activation fused into the kernel;
// nothing in this file applies it — confirm against the kernel crate.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_f16_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_f16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_f16_rrr_sm80_run,
baracuda_kernels_gemm_bias_silu_f16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_bf16_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_bf16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_bf16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_bf16_rrr_sm80_run,
baracuda_kernels_gemm_bias_silu_bf16_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_bf16_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_run,
baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_tf32_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_tf32_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_tf32_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_tf32_rrr_sm80_run,
baracuda_kernels_gemm_bias_silu_tf32_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_tf32_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_run,
baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_f32_simt_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_f32_simt_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f32_simt_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_f32_simt_rrr_sm80_run,
baracuda_kernels_gemm_bias_silu_f32_simt_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f32_simt_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `*_f32bias_s8_*` / `*_f32bias_u8_*` exports (plain, relu, gelu, silu).
//
// Each invocation expands `gemm_bias_f32!`, so alpha/beta are `f32` and
// `bias` is passed as an untyped `*const c_void`. The generated `_run`,
// `_workspace_size` and `_can_implement` exports forward to the
// `baracuda_cutlass_*` functions of the same suffix in `kk`. Compiled only
// when the `sm80` or `sm90a` Cargo feature is enabled.
//
// NOTE(review): `f32bias` presumably states the element type behind the
// `bias` pointer and `s8` / `u8` the input element type — confirm against
// the kernel crate.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_f32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_f32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_f32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_f32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_f32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_f32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_f32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_f32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_f32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_f32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `*_i32bias_s8_*` / `*_i32bias_u8_*` exports (plain, relu, gelu, silu).
//
// Each invocation expands `gemm_bias_f32!`, so alpha/beta are still `f32`
// and `bias` is passed as an untyped `*const c_void`. The generated `_run`,
// `_workspace_size` and `_can_implement` exports forward to the
// `baracuda_cutlass_*` functions of the same suffix in `kk`. Compiled only
// when the `sm80` or `sm90a` Cargo feature is enabled.
//
// NOTE(review): `i32bias` presumably states the element type behind the
// `bias` pointer; verify that the wrapped `kk` functions really take `f32`
// alpha/beta for these integer-bias kernels.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_i32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_i32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_i32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_i32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_i32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_i32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_i32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_i32bias_s8_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_i32bias_s8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_i32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_i32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_i32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_relu_i32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_i32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_gelu_i32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_i32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f32!(
baracuda_kernels_gemm_bias_silu_i32bias_u8_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_i32bias_u8_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `gemm_bias_*_f64_*` exports (plain, relu, gelu, silu) with `f64`
// alpha/beta.
//
// Each invocation expands `gemm_bias_f64!` into `_run`, `_workspace_size`
// and `_can_implement` exports that take an extra `bias` pointer and forward
// to the `baracuda_cutlass_*` functions of the same suffix in `kk`. Compiled
// only when the `sm80` or `sm90a` Cargo feature is enabled.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_f64_rcr_sm80_run,
baracuda_kernels_gemm_bias_f64_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f64_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f64_rcr_sm80_run,
baracuda_cutlass_gemm_bias_f64_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f64_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_f64_rrr_sm80_run,
baracuda_kernels_gemm_bias_f64_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_f64_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_f64_rrr_sm80_run,
baracuda_cutlass_gemm_bias_f64_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_f64_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_relu_f64_rcr_sm80_run,
baracuda_kernels_gemm_bias_relu_f64_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f64_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_relu_f64_rrr_sm80_run,
baracuda_kernels_gemm_bias_relu_f64_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_relu_f64_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_run,
baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_gelu_f64_rcr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f64_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f64_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_gelu_f64_rrr_sm80_run,
baracuda_kernels_gemm_bias_gelu_f64_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_gelu_f64_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_run,
baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_silu_f64_rcr_sm80_run,
baracuda_kernels_gemm_bias_silu_f64_rcr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f64_rcr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_bias_f64!(
baracuda_kernels_gemm_bias_silu_f64_rrr_sm80_run,
baracuda_kernels_gemm_bias_silu_f64_rrr_sm80_workspace_size,
baracuda_kernels_gemm_bias_silu_f64_rrr_sm80_can_implement,
baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_run,
baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_workspace_size,
baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_can_implement
);
// ---------------------------------------------------------------------------
// `gemm_batched_*` exports with `f32` alpha/beta.
//
// Each invocation expands `gemm_batched_f32!` into `_run`, `_workspace_size`
// and `_can_implement` exports whose signatures carry per-operand strides and
// a `batch_count`, and which forward to the `baracuda_cutlass_*` functions of
// the same suffix in `kk`. Compiled only when the `sm80` or `sm90a` Cargo
// feature is enabled.
// ---------------------------------------------------------------------------
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_batched_f32!(
baracuda_kernels_gemm_batched_f16_rcr_sm80_run,
baracuda_kernels_gemm_batched_f16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_batched_f16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_batched_f16_rcr_sm80_run,
baracuda_cutlass_gemm_batched_f16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_batched_f16_rcr_sm80_can_implement
);
#[cfg(any(feature = "sm80", feature = "sm90a"))]
gemm_batched_f32!(
baracuda_kernels_gemm_batched_bf16_rcr_sm80_run,
baracuda_kernels_gemm_batched_bf16_rcr_sm80_workspace_size,
baracuda_kernels_gemm_batched_bf16_rcr_sm80_can_implement,
baracuda_cutlass_gemm_batched_bf16_rcr_sm80_run,
baracuda_cutlass_gemm_batched_bf16_rcr_sm80_workspace_size,
baracuda_cutlass_gemm_batched_bf16_rcr_sm80_can_implement
);