pub struct CpuBackend;
Trait Implementations§
Source§impl Backend for CpuBackend
impl Backend for CpuBackend
type Buffer = Vec<f32>
Source§type GptqStore = CpuGptqStore
type GptqStore = CpuGptqStore
Opaque per-backend GPTQ weight representation. Read more
Source§type QuantStore = CpuQuantStore
type QuantStore = CpuQuantStore
Single backend-specific store for all GGUF k-quant flavours
(Q4_K_M today; Q5_K_M / Q6_K / Q8_0 etc. become enum variants
without changing the trait shape). Read more
Source§fn new_context() -> Self::Context
fn new_context() -> Self::Context
Create a new execution context (begin accumulating work).
Source§fn sync(_ctx: &mut Self::Context)
fn sync(_ctx: &mut Self::Context)
Flush accumulated work and wait for completion.
CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.
Source§fn load_gptq(
qweight: &[i32],
scales: &[f32],
qzeros: &[i32],
_g_idx: Option<&[i32]>,
bits: u32,
group_size: usize,
k: usize,
n: usize,
) -> Result<Self::GptqStore>
fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Self::GptqStore>
Repack raw GPTQ tensors into the backend’s preferred format.
Called once per layer at model load time. Read more
Source§fn gemm_gptq(
ctx: &mut Self::Context,
a: &Self::Buffer,
weight: &Self::GptqStore,
out: &mut Self::Buffer,
m: usize,
) -> Result<()>
fn gemm_gptq( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::GptqStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>
GEMM with pre-loaded GPTQ weights.
out[m, n] = a[m, k] @ dequant(weight)^T
Source§fn load_quant(
kind: GgufQuantType,
bytes: &[u8],
n_rows: usize,
n_cols: usize,
) -> Result<Self::QuantStore>
fn load_quant( kind: GgufQuantType, bytes: &[u8], n_rows: usize, n_cols: usize, ) -> Result<Self::QuantStore>
Load GGUF k-quant weights into the backend’s preferred format. Read more
Source§fn gemm_quant(
ctx: &mut Self::Context,
a: &Self::Buffer,
weight: &Self::QuantStore,
out: &mut Self::Buffer,
m: usize,
) -> Result<()>
fn gemm_quant( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::QuantStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>
GEMM with k-quant weights. Mirrors the gemm / gemm_gptq shape:
out[m, n] = a[m, k] @ dequant(weight)^T. The dispatch on the
quant flavour happens inside the backend’s QuantStore enum.
fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )
fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )
Source§fn copy_slice(
_ctx: &mut Self::Context,
src: &Self::Buffer,
src_offset: usize,
dst: &mut Self::Buffer,
dst_offset: usize,
len: usize,
)
fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )
fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )
Source§fn split_qkv(
_ctx: &mut Self::Context,
qkv: &Self::Buffer,
q: &mut Self::Buffer,
k: &mut Self::Buffer,
v: &mut Self::Buffer,
tokens: usize,
q_dim: usize,
kv_dim: usize,
)
fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )
Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers.
Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim]
Source§fn fused_silu_mul_split(
_ctx: &mut Self::Context,
gate_up: &Self::Buffer,
out: &mut Self::Buffer,
tokens: usize,
im: usize,
)
fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )
Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im],
then compute SiLU(gate) * up → out [tokens, im].
Source§fn qk_norm_rope(
_ctx: &mut Self::Context,
input: &Self::Buffer,
norm_w: &Self::Buffer,
cos: &Self::Buffer,
sin: &Self::Buffer,
output: &mut Self::Buffer,
tokens: usize,
heads: usize,
head_dim: usize,
pos_offset: usize,
eps: f32,
mode: i32,
)
fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )
Fused QK-norm + RoPE + transpose-to-head-major. Read more
Source§fn kv_cache_append_head_major(
_ctx: &mut Self::Context,
cache_k: &mut Self::Buffer,
cache_v: &mut Self::Buffer,
cache_len: usize,
cache_capacity: usize,
new_k_head_major: &Self::Buffer,
new_v_head_major: &Self::Buffer,
new_tokens: usize,
nkv: usize,
hd: usize,
)
fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )
Append new K/V into a pre-allocated head-major cache buffer. Read more
Source§fn transpose_head_to_token(
_ctx: &mut Self::Context,
src: &Self::Buffer,
dst: &mut Self::Buffer,
tokens: usize,
heads: usize,
dim: usize,
)
fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )
Transpose [heads, tokens, dim] → [tokens, heads, dim].
Called after flash_attention to restore token-major layout for O-proj.
Source§fn add_inplace(
_ctx: &mut Self::Context,
residual: &mut Self::Buffer,
x: &Self::Buffer,
len: usize,
)
fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )
residual[i] += x[i] (in-place)
Source§fn scaled_add_inplace(
_ctx: &mut Self::Context,
dst: &mut Self::Buffer,
src: &Self::Buffer,
scale: f32,
len: usize,
)
fn scaled_add_inplace( _ctx: &mut Self::Context, dst: &mut Self::Buffer, src: &Self::Buffer, scale: f32, len: usize, )
dst[i] += scale * src[i] — scalar-broadcast scaled add, in place. Read more
Source§fn add_bias(
_ctx: &mut Self::Context,
data: &mut Self::Buffer,
bias: &Self::Buffer,
rows: usize,
cols: usize,
)
fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )
Broadcast bias add: data[r, c] += bias[c] for every row.
Required by Bert / Clip / Whisper whose linear projections carry a bias.
Source§fn layer_norm(
_ctx: &mut Self::Context,
x: &Self::Buffer,
gamma: &Self::Buffer,
beta: &Self::Buffer,
eps: f32,
out: &mut Self::Buffer,
tokens: usize,
dim: usize,
)
fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
Full LayerNorm (mean + variance normalisation + affine), distinct from
the rms_norm used by Llama-family decoders.
out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c]
where mean and var are reduced over the last dim (cols).
Source§fn gelu(
_ctx: &mut Self::Context,
x: &Self::Buffer,
out: &mut Self::Buffer,
len: usize,
)
fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )
Element-wise GELU activation (erf-based, matches PyTorch default).
fn alloc(len: usize) -> Self::Buffer
fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>
fn from_slice(data: &[f32]) -> Self::Buffer
Source§fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)
fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)
Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).
Source§fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)
fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)
Toggle between scalar-arg kernels (normal) and _dyn kernels that
read their dynamic scalar args from device memory (graph-friendly).
Source§fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>
fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>
Begin stream capture. Subsequent kernel launches are recorded into
a pending graph instead of executing eagerly.
Source§fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>
fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>
End stream capture and install the captured graph as this context’s
“last graph” for future replay_last_graph calls.
Source§fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>
fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>
Replay the last captured graph. Returns Ok(false) if no graph
is cached; the caller should then run eagerly.
Source§fn reset_graph(_ctx: &mut Self::Context)
fn reset_graph(_ctx: &mut Self::Context)
Drop the cached decode graph — required when the KV cache it
was captured against is about to be freed (e.g. request release),
since the graph holds raw device pointers into that cache.
Source§fn load_quant_fused(
_parts: &[(GgufQuantType, &[u8], usize)],
_n_cols: usize,
) -> Result<Self::QuantStore>
fn load_quant_fused( _parts: &[(GgufQuantType, &[u8], usize)], _n_cols: usize, ) -> Result<Self::QuantStore>
Build a fused QuantStore from multiple (kind, bytes, n_rows)
parts that share n_cols. Used by GgufLoader::load_fused when
parts have heterogeneous quant kinds (e.g. Qwen3 qkv_proj where
q+k are Q4_K but v is Q6_K) — byte-concatenation isn’t possible,
so each part stays as its own QuantStore and the gemm dispatches
one matvec per part with output offsets. Read more
Source§fn load_quant_experts(
_kind: GgufQuantType,
_bytes: &[u8],
_num_experts: usize,
_n_rows: usize,
_n_cols: usize,
) -> Result<Self::QuantStore>
fn load_quant_experts( _kind: GgufQuantType, _bytes: &[u8], _num_experts: usize, _n_rows: usize, _n_cols: usize, ) -> Result<Self::QuantStore>
Build a stacked-experts QuantStore from a contiguous 3-D weight
payload of [num_experts, n_rows, n_cols/256] super-blocks.
Used for the MoE indirect-dispatch fast path; backends without
such a kernel return Err(unsupported) and the model code falls
back to the per-expert loop. Read more
Source§fn gemm_quant_moe_id(
_ctx: &mut Self::Context,
_a: &Self::Buffer,
_weight: &Self::QuantStore,
_ids: &Self::Buffer,
_tpe: &Self::Buffer,
_out: &mut Self::Buffer,
_ne11: usize,
_top_k: usize,
_max_per_expert: usize,
_batch: usize,
) -> Result<()>
fn gemm_quant_moe_id( _ctx: &mut Self::Context, _a: &Self::Buffer, _weight: &Self::QuantStore, _ids: &Self::Buffer, _tpe: &Self::Buffer, _out: &mut Self::Buffer, _ne11: usize, _top_k: usize, _max_per_expert: usize, _batch: usize, ) -> Result<()>
MoE 2-D indirect-dispatch GEMM (prefill m > 1). Read more
Source§fn route_topk_softmax(
_ctx: &mut Self::Context,
_logits: &Self::Buffer,
_out_ids: &mut Self::Buffer,
_out_weights: &mut Self::Buffer,
_batch: usize,
_num_experts: usize,
_top_k: usize,
_norm_topk_prob: bool,
) -> Result<()>
fn route_topk_softmax( _ctx: &mut Self::Context, _logits: &Self::Buffer, _out_ids: &mut Self::Buffer, _out_weights: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _norm_topk_prob: bool, ) -> Result<()>
GPU-side MoE router: [batch, num_experts] logits → [batch, top_k]
expert IDs (i32) + [batch, top_k] combine weights (f32). Read more
Source§fn compute_ids_tpe_gpu(
_ctx: &mut Self::Context,
_selected_ids: &Self::Buffer,
_tpe: &mut Self::Buffer,
_ids: &mut Self::Buffer,
_gate_up_args: &mut Self::Buffer,
_down_args: &mut Self::Buffer,
_batch: usize,
_num_experts: usize,
_top_k: usize,
_m_gate_up: usize,
_m_down: usize,
) -> Result<()>
fn compute_ids_tpe_gpu( _ctx: &mut Self::Context, _selected_ids: &Self::Buffer, _tpe: &mut Self::Buffer, _ids: &mut Self::Buffer, _gate_up_args: &mut Self::Buffer, _down_args: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _m_gate_up: usize, _m_down: usize, ) -> Result<()>
GPU-side bucket sort: turn [batch, top_k] selected expert IDs
(from Self::route_topk_softmax) into tpe[num_experts] /
ids[num_experts * row_stride] arrays consumed by the batched
MoE GEMM, and emit indirect-dispatch args for the consumer GEMM. Read more
Source§fn gemm_quant_moe_id_indirect(
_ctx: &mut Self::Context,
_src1: &Self::Buffer,
_weights: &Self::QuantStore,
_ids: &Self::Buffer,
_tpe: &Self::Buffer,
_out: &mut Self::Buffer,
_args_buf: &Self::Buffer,
_ne11: usize,
_top_k: usize,
_max_per_expert: usize,
_batch: usize,
) -> Result<()>
fn gemm_quant_moe_id_indirect( _ctx: &mut Self::Context, _src1: &Self::Buffer, _weights: &Self::QuantStore, _ids: &Self::Buffer, _tpe: &Self::Buffer, _out: &mut Self::Buffer, _args_buf: &Self::Buffer, _ne11: usize, _top_k: usize, _max_per_expert: usize, _batch: usize, ) -> Result<()>
Indirect-dispatch variant of gemm_quant_moe_id. Read more
Source§fn silu_mul_batched(
_ctx: &mut Self::Context,
_gate: &Self::Buffer,
_up: &Self::Buffer,
_out: &mut Self::Buffer,
_total_pairs: usize,
_ffn: usize,
) -> Result<()>
fn silu_mul_batched( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _total_pairs: usize, _ffn: usize, ) -> Result<()>
Stacked SiLU·gate over [batch * top_k, ffn] rows (prefill version
of silu_mul_stacked).
Source§fn weighted_sum_residual_stacked(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_residual: &mut Self::Buffer,
_n_slots: usize,
_hidden: usize,
) -> Result<()>
fn weighted_sum_residual_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>
Fused weighted-sum + residual-add: residual[i] += Σ_k weights[k] · slots[k, i].
A single dispatch replaces the (weighted_sum → moe_out) +
(add_inplace residual += moe_out) pair on the decode hot path.
Source§fn weighted_sum_residual_norm_stacked(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_residual: &mut Self::Buffer,
_next_norm_w: &Self::Buffer,
_normed_out: &mut Self::Buffer,
_n_slots: usize,
_hidden: usize,
_eps: f32,
) -> Result<()>
fn weighted_sum_residual_norm_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _next_norm_w: &Self::Buffer, _normed_out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, _eps: f32, ) -> Result<()>
Fused weighted-sum-residual + RMSNorm: combines this layer’s
weighted_sum_residual_stacked with the next layer’s leading
rms_norm into a single dispatch. Read more
Source§fn weighted_sum_batched(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_out: &mut Self::Buffer,
_batch: usize,
_top_k: usize,
_hidden: usize,
) -> Result<()>
fn weighted_sum_batched( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _top_k: usize, _hidden: usize, ) -> Result<()>
Per-batch weighted sum: out[b, h] = Σ_k weights[b, k] · slots[b, k, h].
A single dispatch covers the whole batch (prefill version of
weighted_sum_stacked, which only handled one token).
Source§fn weighted_sum_batched_offset(
ctx: &mut Self::Context,
slots: &Self::Buffer,
weights: &Self::Buffer,
weights_offset: usize,
out: &mut Self::Buffer,
out_offset: usize,
batch: usize,
top_k: usize,
hidden: usize,
) -> Result<()>
fn weighted_sum_batched_offset( ctx: &mut Self::Context, slots: &Self::Buffer, weights: &Self::Buffer, weights_offset: usize, out: &mut Self::Buffer, out_offset: usize, batch: usize, top_k: usize, hidden: usize, ) -> Result<()>
Offset-aware variant of Self::weighted_sum_batched —
weights is read starting at weights_offset (in elements; points at
the start of [batch, top_k]) and out is written starting at out_offset
(in elements; points at the start of [batch, hidden]). Used by
the per-item batched-decode path to skip copy_slice round-trips.
The default implementation falls back to the non-offset variant via two copies.
Source§fn gemv_quant_moe_id(
_ctx: &mut Self::Context,
_a: &Self::Buffer,
_weight: &Self::QuantStore,
_ids: &Self::Buffer,
_out: &mut Self::Buffer,
_n_selected: usize,
_src1_stride: usize,
) -> Result<()>
fn gemv_quant_moe_id( _ctx: &mut Self::Context, _a: &Self::Buffer, _weight: &Self::QuantStore, _ids: &Self::Buffer, _out: &mut Self::Buffer, _n_selected: usize, _src1_stride: usize, ) -> Result<()>
MoE indirect-dispatch GEMV: out[i, :] = a[i, :] @ dequant(weight[ids[i], :])^T
for each i ∈ [0, n_selected). A single backend dispatch covers
all selected (token, expert) pairs. Read more
Source§fn gemv_quant_moe_id_offset(
ctx: &mut Self::Context,
a: &Self::Buffer,
a_offset: usize,
weight: &Self::QuantStore,
ids: &Self::Buffer,
ids_offset: usize,
out: &mut Self::Buffer,
n_selected: usize,
src1_stride: usize,
) -> Result<()>
fn gemv_quant_moe_id_offset( ctx: &mut Self::Context, a: &Self::Buffer, a_offset: usize, weight: &Self::QuantStore, ids: &Self::Buffer, ids_offset: usize, out: &mut Self::Buffer, n_selected: usize, src1_stride: usize, ) -> Result<()>
Offset-aware variant of Self::gemv_quant_moe_id — reads a
from a_offset (in elements; meaningful only when src1_stride=0
for the broadcast case, or as the start of an n_selected × K
strided read when src1_stride≥K), reads ids from ids_offset
(the i-th top_k block in a stacked-batch [M, top_k] ids
buffer), and writes out from offset 0 (output stays per-iter
scratch). Used by the per-item batched-decode path so the M=N
concurrent decodes can read directly from the M-batch
selected_ids_buf / norm_out without materialising
per-iteration copies.
Source§fn from_slice_i32(data: &[i32]) -> Self::Buffer
fn from_slice_i32(data: &[i32]) -> Self::Buffer
Allocate a backend buffer of i32-typed values for kernels that
need integer indices (MoE expert IDs, scatter indices, etc.). Read more
Source§fn write_i32_into(buf: &mut Self::Buffer, data: &[i32])
fn write_i32_into(buf: &mut Self::Buffer, data: &[i32])
Overwrite an existing i32 buffer’s contents in place. Used on
the MoE decode hot path: per-layer expert-id updates do an
in-place memcpy instead of allocating a fresh device buffer
(48 layers × 128 tokens = 6144 fresh allocations per decode
run otherwise — allocator pressure dominates the secondary cost). Read more
Source§fn write_f32_into(buf: &mut Self::Buffer, data: &[f32])
fn write_f32_into(buf: &mut Self::Buffer, data: &[f32])
Overwrite an existing f32 buffer’s contents in place. Counterpart
to write_i32_into for f32 data — used to write the per-token
MoE combine weights into a pre-allocated scratch buffer instead
of allocating a fresh from_slice buffer 6144 times per decode
run.
Source§fn silu_mul_stacked(
_ctx: &mut Self::Context,
_gate: &Self::Buffer,
_up: &Self::Buffer,
_out: &mut Self::Buffer,
_n_slots: usize,
_ffn: usize,
) -> Result<()>
fn silu_mul_stacked( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _ffn: usize, ) -> Result<()>
Stacked SiLU·gate over [n_slots, ffn] rows. Read more
Source§fn gemv_quant_moe_id_gate_up_silu(
_ctx: &mut Self::Context,
_a: &Self::Buffer,
_gate_w: &Self::QuantStore,
_up_w: &Self::QuantStore,
_ids: &Self::Buffer,
_silu_out: &mut Self::Buffer,
_n_selected: usize,
) -> Result<()>
fn gemv_quant_moe_id_gate_up_silu( _ctx: &mut Self::Context, _a: &Self::Buffer, _gate_w: &Self::QuantStore, _up_w: &Self::QuantStore, _ids: &Self::Buffer, _silu_out: &mut Self::Buffer, _n_selected: usize, ) -> Result<()>
Fused gate+up MoE GEMV with in-register SiLU(gate) * up. Read more
Source§fn supports_fused_moe_gate_up_silu() -> bool
fn supports_fused_moe_gate_up_silu() -> bool
Capability probe for Self::gemv_quant_moe_id_gate_up_silu. Read more
Source§fn gemv_quant_moe_id_batched(
_ctx: &mut Self::Context,
_a: &Self::Buffer,
_weight: &Self::QuantStore,
_ids: &Self::Buffer,
_out: &mut Self::Buffer,
_m: usize,
_top_k: usize,
_src1_outer_stride: usize,
_src1_inner_stride: usize,
) -> Result<()>
fn gemv_quant_moe_id_batched( _ctx: &mut Self::Context, _a: &Self::Buffer, _weight: &Self::QuantStore, _ids: &Self::Buffer, _out: &mut Self::Buffer, _m: usize, _top_k: usize, _src1_outer_stride: usize, _src1_inner_stride: usize, ) -> Result<()>
Batched MoE indirect-dispatch GEMV — one Metal launch covers
all m * top_k (token, expert) pairs at once. Read more
Source§fn supports_batched_moe_gemv() -> bool
fn supports_batched_moe_gemv() -> bool
Capability probe for Self::gemv_quant_moe_id_batched.
Source§fn supports_paged_kv() -> bool
fn supports_paged_kv() -> bool
Whether this backend has a paged-KV decode path
(paged_decode_attention etc.). Currently true for Metal, false
for CPU. Used to decide the default of FERRUM_METAL_PAGED_KV —
the serve path should opt in automatically when supported so
users get the bench-quality concurrent-decode numbers without
having to learn the flag.
Source§fn gemv_quant_moe_id_gate_up_silu_batched(
_ctx: &mut Self::Context,
_a: &Self::Buffer,
_gate_w: &Self::QuantStore,
_up_w: &Self::QuantStore,
_ids: &Self::Buffer,
_silu_out: &mut Self::Buffer,
_m: usize,
_top_k: usize,
_src1_outer_stride: usize,
_src1_inner_stride: usize,
) -> Result<()>
fn gemv_quant_moe_id_gate_up_silu_batched( _ctx: &mut Self::Context, _a: &Self::Buffer, _gate_w: &Self::QuantStore, _up_w: &Self::QuantStore, _ids: &Self::Buffer, _silu_out: &mut Self::Buffer, _m: usize, _top_k: usize, _src1_outer_stride: usize, _src1_inner_stride: usize, ) -> Result<()>
Batched fused gate+up MoE GEMV with in-register SiLU(gate) * up. Read more
Source§fn supports_batched_moe_gate_up_silu() -> bool
fn supports_batched_moe_gate_up_silu() -> bool
Capability probe for Self::gemv_quant_moe_id_gate_up_silu_batched.
Source§fn weighted_sum_stacked(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_out: &mut Self::Buffer,
_n_slots: usize,
_hidden: usize,
) -> Result<()>
fn weighted_sum_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>
Source§fn mla_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_kv_compressed: &Self::Buffer,
_kv_rope: &Self::Buffer,
_out: &mut Self::Buffer,
_batch: usize,
_q_len: usize,
_kv_len: usize,
_pos_offset: usize,
_cfg: &AttnConfig,
_kv_lora_rank: usize,
_qk_rope_head_dim: usize,
) -> Result<()>
fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>
Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV
attention variant. Extension point only; no backend implements it
yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more
Source§fn split_qkv_norm_rope(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_k_out: &mut Self::Buffer,
_v_out: &mut Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
) -> Result<()>
fn split_qkv_norm_rope( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _k_out: &mut Self::Buffer, _v_out: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, ) -> Result<()>
Fused split-QKV + QK-norm + RoPE + head-major transpose. Read more
Source§fn split_qkv_norm_rope_into_cache(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
_cache_len: usize,
_cache_capacity: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _cache_capacity: usize, ) -> Result<()>
Variant of Backend::split_qkv_norm_rope that writes the new
K and V directly into pre-allocated head-major KV cache buffers
at slot [kv_heads, cache_len .. cache_len + tokens, hd].
Eliminates the trailing kv_cache_append_head_major dispatch on
the decode hot path. Q still lands in per-token head-major
scratch (flash-attention reads it as the query). Read more
Source§fn split_qkv_norm_rope_into_paged_cache(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_qkv_byte_offset: u64,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_q_out_byte_offset: u64,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_block_table: &Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
_cache_len: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_paged_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _qkv_byte_offset: u64, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _q_out_byte_offset: u64, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _block_table: &Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Paged-KV variant of Self::split_qkv_norm_rope_into_cache. Read more
Source§fn paged_decode_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_block_tables: &Self::Buffer,
_context_lens: &Self::Buffer,
_num_seqs: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
_q_len: usize,
) -> Result<()>
fn paged_decode_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _q_len: usize, ) -> Result<()>
Paged-KV variant of Self::flash_attention. Read more
Source§fn alloc_u32(n: usize) -> Self::Buffer
fn alloc_u32(n: usize) -> Self::Buffer
Allocate a u32 buffer of length n for paged-KV bookkeeping
(block tables, context lens). The default uses the existing
from_slice_i32 route and then bit-casts; backends with a faster
path can override.
Source§fn write_u32(_ctx: &mut Self::Context, _dst: &mut Self::Buffer, _data: &[u32])
fn write_u32(_ctx: &mut Self::Context, _dst: &mut Self::Buffer, _data: &[u32])
Write a u32 slice into a buffer previously allocated via
Self::alloc_u32. Used for live block_tables / context_lens
updates between decode steps. Read more
Source§fn from_weight_bytes(raw: &[u8], src_dtype: SrcDtype) -> Self::Buffer
fn from_weight_bytes(raw: &[u8], src_dtype: SrcDtype) -> Self::Buffer
Load a weight tensor straight from its on-disk byte representation,
letting the backend pick its preferred storage dtype. Read more
fn world_size(_ctx: &Self::Context) -> usize
fn rank(_ctx: &Self::Context) -> usize
fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )
fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )
fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )
Auto Trait Implementations§
impl Freeze for CpuBackend
impl RefUnwindSafe for CpuBackend
impl Send for CpuBackend
impl Sync for CpuBackend
impl Unpin for CpuBackend
impl UnsafeUnpin for CpuBackend
impl UnwindSafe for CpuBackend
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
T: ?Sized,
impl<T> BorrowMut<T> for T
where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more