#![allow(dead_code)]
#![allow(improper_ctypes)]
use std::ffi::c_void;
// -----------------------------------------------------------------------------
// Declaration macros
//
// Each macro below expands to one `pub fn ...;` foreign-function declaration
// per identifier it is given, and must therefore be invoked inside an
// `extern "C"` block. Every family of entry points shares one signature, so the
// signature is written exactly once and the variants are listed by name; this
// keeps the variants from drifting apart.
//
// NOTE(review): the `launch_*` naming and the trailing `stream` argument suggest
// these are GPU kernel launchers. The native implementation is not visible from
// this file, so the parameter meanings are not documented here; the declared
// types must match the native definitions exactly — confirm against them when
// changing anything.
// -----------------------------------------------------------------------------

// Fused Q/K/V signature: three const input pointers (`vx_q`, `vx_k`, `vx_v`),
// one const `vy` pointer, three mutable destination pointers, six `i32`
// arguments and a `stream` pointer.
// Accepts a single identifier or a comma-separated list of identifiers.
macro_rules! declare_mmvq_fused_qkv {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                vx_q: *const c_void,
                vx_k: *const c_void,
                vx_v: *const c_void,
                vy: *const c_void,
                q_dst: *mut c_void,
                k_dst: *mut c_void,
                v_dst: *mut c_void,
                ncols_x: i32,
                nrows_q: i32,
                nrows_k: i32,
                nrows_v: i32,
                stride_col_y: i32,
                b_size: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `quantize_q8_1` signature taking an untyped (`*const c_void`) input pointer,
// a mutable output pointer, three `i32` arguments and a `stream` pointer.
macro_rules! declare_quantize_q8_1_untyped {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                x: *const c_void,
                vy: *mut c_void,
                kx: i32,
                kx_padded: i32,
                num_rows: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `indexed_moe_forward` signature: weights, inputs, `u32` indices, `f32`
// outputs, six `i32` arguments and a `stream` pointer.
macro_rules! declare_indexed_moe_forward {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                all_weights: *const c_void,
                all_inputs: *const c_void,
                indices: *const u32,
                all_outputs: *mut f32,
                n: i32,
                k: i32,
                batch: i32,
                topk: i32,
                k_padded: i32,
                input_dim1: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `moe_grouped_gemm` signature: weights, inputs, two `i32` index arrays, `f32`
// top-k weights, `f32` outputs, six `i32` arguments and a `stream` pointer.
macro_rules! declare_moe_grouped_gemm {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                all_weights: *const c_void,
                all_inputs: *const c_void,
                expert_bounds: *const i32,
                sorted_token_ids: *const i32,
                topk_weights: *const f32,
                all_outputs: *mut f32,
                n: i32,
                k: i32,
                k_padded: i32,
                num_experts: i32,
                topk: i32,
                input_dim1: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `moe_gemv_fused_gate_up` signature: gate and up weights, inputs, `u32`
// indices, `f32` outputs, six `i32` arguments (the last one is `act_type`) and
// a `stream` pointer.
macro_rules! declare_moe_gemv_fused_gate_up {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                gate_weights: *const c_void,
                up_weights: *const c_void,
                all_inputs: *const c_void,
                indices: *const u32,
                all_outputs: *mut f32,
                n: i32,
                k: i32,
                batch: i32,
                topk: i32,
                k_padded: i32,
                act_type: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `moe_gemv_down_aggregate` signature: weights, inputs, `u32` indices, `f32`
// top-k weights, `f32` outputs, five `i32` arguments and a `stream` pointer.
macro_rules! declare_moe_gemv_down_aggregate {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                all_weights: *const c_void,
                all_inputs: *const c_void,
                indices: *const u32,
                topk_weights: *const f32,
                all_outputs: *mut f32,
                n: i32,
                k: i32,
                batch: i32,
                topk: i32,
                k_padded: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `mmvq_gguf_*_plain` signature: two const input pointers, one mutable
// destination pointer, five `i32` arguments and a `stream` pointer.
macro_rules! declare_mmvq_plain {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                vx: *const c_void,
                vy: *const c_void,
                dst: *mut c_void,
                ncols_x: i32,
                nrows_x: i32,
                stride_col_y: i32,
                stride_col_dst: i32,
                b_size: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `mmvq_gguf_*_fused_glu` signature: gate and up input pointers, `vy`, one
// mutable destination pointer, six `i32` arguments (the last one is
// `activation`) and a `stream` pointer.
macro_rules! declare_mmvq_fused_glu {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                vx_gate: *const c_void,
                vx_up: *const c_void,
                vy: *const c_void,
                dst: *mut c_void,
                ncols_x: i32,
                nrows_x: i32,
                stride_col_y: i32,
                stride_col_dst: i32,
                b_size: i32,
                activation: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `mmq_quantize_q8_1_*` signature: input pointer, `i32` ids, output pointer,
// an `i32` `type_x`, eight `i64` arguments and a `stream` pointer.
macro_rules! declare_mmq_quantize_q8_1 {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                x: *const c_void,
                ids: *const i32,
                vy: *mut c_void,
                type_x: i32,
                ne00: i64,
                s01: i64,
                s02: i64,
                s03: i64,
                ne0: i64,
                ne1: i64,
                ne2: i64,
                ne3: i64,
                stream: *mut c_void,
            );
        )+
    };
}

// `mmq_quantize_glu_q8_1_*_f32` signature: `f32` gate and up pointers, `i32`
// ids, output pointer, four `i64` arguments, an `i32` `activation` and a
// `stream` pointer.
macro_rules! declare_mmq_quantize_glu_q8_1_f32 {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                gate: *const f32,
                up: *const f32,
                ids: *const i32,
                vy: *mut c_void,
                ne00: i64,
                s01: i64,
                ne0: i64,
                ne1: i64,
                activation: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `mmq_gguf_*` signature: a mutable `tmp_fixup` pointer, two const input
// pointers, a mutable destination pointer, five `i64` arguments, then
// `cc: i32`, `nsm: i32`, `smpbo: i64`, `warp_size: i32` and a `stream` pointer.
macro_rules! declare_mmq_gguf {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                tmp_fixup: *mut c_void,
                x: *const c_void,
                y: *const c_void,
                dst: *mut c_void,
                ncols_x: i64,
                nrows_x: i64,
                ncols_y: i64,
                stride_row_x: i64,
                stride_col_dst: i64,
                cc: i32,
                nsm: i32,
                smpbo: i64,
                warp_size: i32,
                stream: *mut c_void,
            );
        )+
    };
}

// `mmq_gguf_*_moe` signature: like `declare_mmq_gguf!` but with two extra
// `*const i32` arrays (`ids_dst`, `expert_bounds`) before `dst` and seven `i64`
// arguments instead of five.
macro_rules! declare_mmq_gguf_moe {
    ($($fn_name:ident),+ $(,)?) => {
        $(
            pub fn $fn_name(
                tmp_fixup: *mut c_void,
                x: *const c_void,
                y: *const c_void,
                ids_dst: *const i32,
                expert_bounds: *const i32,
                dst: *mut c_void,
                ncols_x: i64,
                nrows_x: i64,
                ncols_dst: i64,
                stride_row_x: i64,
                stride_col_dst: i64,
                num_experts: i64,
                ncols_max: i64,
                cc: i32,
                nsm: i32,
                smpbo: i64,
                warp_size: i32,
                stream: *mut c_void,
            );
        )+
    };
}

extern "C" {
    // ---- q8_1 quantization -------------------------------------------------

    /// The only `quantize_q8_1` entry point with a typed `f32` input pointer;
    /// it also takes an extra `num_blocks_x` argument that the untyped
    /// variants do not have.
    pub fn launch_quantize_q8_1(
        x: *const f32,
        vy: *mut c_void,
        kx: i32,
        kx_padded: i32,
        num_blocks_x: i32,
        num_rows: i32,
        stream: *mut c_void,
    );

    declare_quantize_q8_1_untyped!(
        launch_quantize_q8_1_bf16,
        launch_quantize_q8_1_f16,
        launch_mmvq_gguf_quantize_q8_1_bf16,
        launch_mmvq_gguf_quantize_q8_1_f16,
        launch_mmvq_gguf_quantize_q8_1_f32,
    );

    // ---- indexed MoE forward -----------------------------------------------

    declare_indexed_moe_forward!(
        launch_indexed_moe_forward_q2k_q8_1,
        launch_indexed_moe_forward_q3k_q8_1,
        launch_indexed_moe_forward_q4k_q8_1,
        launch_indexed_moe_forward_q5k_q8_1,
        launch_indexed_moe_forward_q6k_q8_1,
        launch_indexed_moe_forward_q8_0_q8_1,
        launch_indexed_moe_forward_q4_0_q8_1,
        launch_indexed_moe_forward_q4_1_q8_1,
        launch_indexed_moe_forward_q5_0_q8_1,
        launch_indexed_moe_forward_q5_1_q8_1,
        launch_indexed_moe_forward_q8_1_q8_1,
    );

    // ---- MoE dispatch / reduce ---------------------------------------------

    /// Takes one const `i32` array (`topk_ids`), five mutable `i32` arrays,
    /// three `i32` arguments and a `stream` pointer.
    pub fn launch_moe_dispatch(
        topk_ids: *const i32,
        expert_bounds: *mut i32,
        sorted_token_ids: *mut i32,
        sorted_source_ids: *mut i32,
        total_assignments: i32,
        num_experts: i32,
        topk: i32,
        expert_counts: *mut i32,
        expert_cursors: *mut i32,
        stream: *mut c_void,
    );

    /// Takes two const `f32` arrays, one mutable `f32` array, three `i32`
    /// arguments and a `stream` pointer.
    pub fn launch_moe_weighted_reduce_flat(
        inputs: *const f32,
        topk_weights: *const f32,
        outputs: *mut f32,
        num_tokens: i32,
        hidden: i32,
        topk: i32,
        stream: *mut c_void,
    );

    // ---- MoE grouped GEMM --------------------------------------------------

    declare_moe_grouped_gemm!(
        launch_moe_grouped_gemm_q8_0,
        launch_moe_grouped_gemm_q4_0,
        launch_moe_grouped_gemm_q4_1,
        launch_moe_grouped_gemm_q5_0,
        launch_moe_grouped_gemm_q5_1,
        launch_moe_grouped_gemm_q8_1,
        launch_moe_grouped_gemm_q2k,
        launch_moe_grouped_gemm_q3k,
        launch_moe_grouped_gemm_q4k,
        launch_moe_grouped_gemm_q5k,
        launch_moe_grouped_gemm_q6k,
    );

    // ---- MoE GEMV: fused gate/up -------------------------------------------

    declare_moe_gemv_fused_gate_up!(
        launch_moe_gemv_fused_gate_up_q8_0_q8_1,
        launch_moe_gemv_fused_gate_up_q4_0_q8_1,
        launch_moe_gemv_fused_gate_up_q4_1_q8_1,
        launch_moe_gemv_fused_gate_up_q5_0_q8_1,
        launch_moe_gemv_fused_gate_up_q5_1_q8_1,
        launch_moe_gemv_fused_gate_up_q8_1_q8_1,
        launch_moe_gemv_fused_gate_up_q2k_q8_1,
        launch_moe_gemv_fused_gate_up_q3k_q8_1,
        launch_moe_gemv_fused_gate_up_q4k_q8_1,
        launch_moe_gemv_fused_gate_up_q5k_q8_1,
        launch_moe_gemv_fused_gate_up_q6k_q8_1,
    );

    // ---- MoE GEMV: down + aggregate ----------------------------------------

    declare_moe_gemv_down_aggregate!(
        launch_moe_gemv_down_aggregate_q8_0_q8_1,
        launch_moe_gemv_down_aggregate_q4_0_q8_1,
        launch_moe_gemv_down_aggregate_q4_1_q8_1,
        launch_moe_gemv_down_aggregate_q5_0_q8_1,
        launch_moe_gemv_down_aggregate_q5_1_q8_1,
        launch_moe_gemv_down_aggregate_q8_1_q8_1,
        launch_moe_gemv_down_aggregate_q2k_q8_1,
        launch_moe_gemv_down_aggregate_q3k_q8_1,
        launch_moe_gemv_down_aggregate_q4k_q8_1,
        launch_moe_gemv_down_aggregate_q5k_q8_1,
        launch_moe_gemv_down_aggregate_q6k_q8_1,
    );

    // ---- MMVQ: plain -------------------------------------------------------

    declare_mmvq_plain!(
        launch_mmvq_gguf_q4_0_bf16_plain,
        launch_mmvq_gguf_q4_1_bf16_plain,
        launch_mmvq_gguf_q5_0_bf16_plain,
        launch_mmvq_gguf_q5_1_bf16_plain,
        launch_mmvq_gguf_q8_0_bf16_plain,
        launch_mmvq_gguf_q2_k_bf16_plain,
        launch_mmvq_gguf_q3_k_bf16_plain,
        launch_mmvq_gguf_q4_k_bf16_plain,
        launch_mmvq_gguf_q5_k_bf16_plain,
        launch_mmvq_gguf_q6_k_bf16_plain,
    );
    declare_mmvq_plain!(
        launch_mmvq_gguf_q4_0_f32_plain,
        launch_mmvq_gguf_q4_1_f32_plain,
        launch_mmvq_gguf_q5_0_f32_plain,
        launch_mmvq_gguf_q5_1_f32_plain,
        launch_mmvq_gguf_q8_0_f32_plain,
        launch_mmvq_gguf_q2_k_f32_plain,
        launch_mmvq_gguf_q3_k_f32_plain,
        launch_mmvq_gguf_q4_k_f32_plain,
        launch_mmvq_gguf_q5_k_f32_plain,
        launch_mmvq_gguf_q6_k_f32_plain,
    );
    declare_mmvq_plain!(
        launch_mmvq_gguf_q4_0_f16_plain,
        launch_mmvq_gguf_q4_1_f16_plain,
        launch_mmvq_gguf_q5_0_f16_plain,
        launch_mmvq_gguf_q5_1_f16_plain,
        launch_mmvq_gguf_q8_0_f16_plain,
        launch_mmvq_gguf_q2_k_f16_plain,
        launch_mmvq_gguf_q3_k_f16_plain,
        launch_mmvq_gguf_q4_k_f16_plain,
        launch_mmvq_gguf_q5_k_f16_plain,
        launch_mmvq_gguf_q6_k_f16_plain,
    );

    // ---- MMVQ: fused GLU ---------------------------------------------------

    declare_mmvq_fused_glu!(
        launch_mmvq_gguf_q8_0_bf16_fused_glu,
        launch_mmvq_gguf_q8_0_f16_fused_glu,
        launch_mmvq_gguf_q8_0_f32_fused_glu,
    );

    // ---- MMVQ: fused QKV ---------------------------------------------------

    declare_mmvq_fused_qkv!(
        launch_mmvq_gguf_q4_0_bf16_fused_qkv,
        launch_mmvq_gguf_q4_1_bf16_fused_qkv,
        launch_mmvq_gguf_q5_0_bf16_fused_qkv,
        launch_mmvq_gguf_q5_1_bf16_fused_qkv,
        launch_mmvq_gguf_q8_0_bf16_fused_qkv,
        launch_mmvq_gguf_q2_k_bf16_fused_qkv,
        launch_mmvq_gguf_q3_k_bf16_fused_qkv,
        launch_mmvq_gguf_q4_k_bf16_fused_qkv,
        launch_mmvq_gguf_q5_k_bf16_fused_qkv,
        launch_mmvq_gguf_q6_k_bf16_fused_qkv,
    );
    declare_mmvq_fused_qkv!(
        launch_mmvq_gguf_q4_0_f16_fused_qkv,
        launch_mmvq_gguf_q4_1_f16_fused_qkv,
        launch_mmvq_gguf_q5_0_f16_fused_qkv,
        launch_mmvq_gguf_q5_1_f16_fused_qkv,
        launch_mmvq_gguf_q8_0_f16_fused_qkv,
        launch_mmvq_gguf_q2_k_f16_fused_qkv,
        launch_mmvq_gguf_q3_k_f16_fused_qkv,
        launch_mmvq_gguf_q4_k_f16_fused_qkv,
        launch_mmvq_gguf_q5_k_f16_fused_qkv,
        launch_mmvq_gguf_q6_k_f16_fused_qkv,
    );
    declare_mmvq_fused_qkv!(
        launch_mmvq_gguf_q4_0_f32_fused_qkv,
        launch_mmvq_gguf_q4_1_f32_fused_qkv,
        launch_mmvq_gguf_q5_0_f32_fused_qkv,
        launch_mmvq_gguf_q5_1_f32_fused_qkv,
        launch_mmvq_gguf_q8_0_f32_fused_qkv,
        launch_mmvq_gguf_q2_k_f32_fused_qkv,
        launch_mmvq_gguf_q3_k_f32_fused_qkv,
        launch_mmvq_gguf_q4_k_f32_fused_qkv,
        launch_mmvq_gguf_q5_k_f32_fused_qkv,
        launch_mmvq_gguf_q6_k_f32_fused_qkv,
    );

    // ---- MMQ: quantization -------------------------------------------------

    declare_mmq_quantize_q8_1!(
        launch_mmq_quantize_q8_1_D4,
        launch_mmq_quantize_q8_1_DS4,
        launch_mmq_quantize_q8_1_D2S6,
    );

    declare_mmq_quantize_glu_q8_1_f32!(
        launch_mmq_quantize_glu_q8_1_D4_f32,
        launch_mmq_quantize_glu_q8_1_DS4_f32,
        launch_mmq_quantize_glu_q8_1_D2S6_f32,
    );

    // ---- MMQ: matrix multiply ----------------------------------------------

    declare_mmq_gguf!(
        launch_mmq_gguf_q4_0,
        launch_mmq_gguf_q4_1,
        launch_mmq_gguf_q5_0,
        launch_mmq_gguf_q5_1,
        launch_mmq_gguf_q8_0,
        launch_mmq_gguf_q2_k,
        launch_mmq_gguf_q3_k,
        launch_mmq_gguf_q4_k,
        launch_mmq_gguf_q5_k,
        launch_mmq_gguf_q6_k,
    );

    declare_mmq_gguf_moe!(
        launch_mmq_gguf_q4_0_moe,
        launch_mmq_gguf_q4_1_moe,
        launch_mmq_gguf_q5_0_moe,
        launch_mmq_gguf_q5_1_moe,
        launch_mmq_gguf_q8_0_moe,
        launch_mmq_gguf_q2_k_moe,
        launch_mmq_gguf_q3_k_moe,
        launch_mmq_gguf_q4_k_moe,
        launch_mmq_gguf_q5_k_moe,
        launch_mmq_gguf_q6_k_moe,
    );
}