Skip to main content

CpuBackend

Struct CpuBackend 

Source
pub struct CpuBackend;

Trait Implementations§

Source§

impl Backend for CpuBackend

Source§

type Buffer = Vec<f32>

Source§

type Context = ()

Execution context that accumulates GPU work. Read more
Source§

type GptqStore = CpuGptqStore

Opaque per-backend GPTQ weight representation. Read more
Source§

type QuantStore = CpuQuantStore

Single backend-specific store for all GGUF k-quant flavours (Q4_K_M today; Q5_K_M / Q6_K / Q8_0 etc. become enum variants without changing the trait shape). Read more
Source§

fn new_context() -> Self::Context

Create a new execution context (begin accumulating work).
Source§

fn sync(_ctx: &mut Self::Context)

Flush accumulated work and wait for completion. CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.
Source§

fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Self::GptqStore>

Repack raw GPTQ tensors into the backend’s preferred format. Called once per layer at model load time. Read more
Source§

fn gemm_gptq( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::GptqStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>

GEMM with pre-loaded GPTQ weights. out[m, n] = a[m, k] @ dequant(weight)^T
Source§

fn load_quant( kind: GgufQuantType, bytes: &[u8], n_rows: usize, n_cols: usize, ) -> Result<Self::QuantStore>

Load GGUF k-quant weights into the backend’s preferred format. Read more
Source§

fn gemm_quant( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::QuantStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>

GEMM with k-quant weights. Mirrors gemm / gemm_gptq shape: out[m, n] = a[m, k] @ dequant(weight)^T. The dispatch on the quant flavour happens inside the backend’s QuantStore enum.
Source§

fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )

Source§

fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source§

fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source§

fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )

Source§

fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )

Copy len floats from src[src_offset..] to dst[dst_offset..]. Read more
Source§

fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )

Source§

fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )

Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers. Shapes — Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim].
Source§

fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )

Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im], then compute SiLU(gate) * up → out [tokens, im].
Source§

fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )

Fused QK-norm + RoPE + transpose-to-head-major. Read more
Source§

fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )

Append new K/V into a pre-allocated head-major cache buffer. Read more
Source§

fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )

Transpose [heads, tokens, dim] → [tokens, heads, dim]. Called after flash_attention to restore token-major layout for O-proj.
Source§

fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )

residual[i] += x[i] (in-place)
Source§

fn scaled_add_inplace( _ctx: &mut Self::Context, dst: &mut Self::Buffer, src: &Self::Buffer, scale: f32, len: usize, )

dst[i] += scale * src[i] — scalar-broadcast scaled add, in place. Read more
Source§

fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )

Broadcast bias add: data[r, c] += bias[c] for every row. Required by Bert / Clip / Whisper whose linear projections carry a bias.
Source§

fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Full LayerNorm (mean + variance normalisation + affine), distinct from the rms_norm used by Llama-family decoders. Computes out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c], where mean and var are reduced over the last dim (cols).
Source§

fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )

Element-wise GELU activation (erf-based, matches PyTorch default).
Source§

fn alloc(len: usize) -> Self::Buffer

Source§

fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>

Source§

fn from_slice(data: &[f32]) -> Self::Buffer

Source§

fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)

Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).
Source§

fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)

Toggle between scalar-arg kernels (normal) and _dyn kernels that read their dynamic scalar args from device memory (graph-friendly).
Source§

fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>

Begin stream capture. Subsequent kernel launches are recorded into a pending graph instead of executing eagerly.
Source§

fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>

End stream capture and install the captured graph as this context’s “last graph” for future replay_last_graph calls.
Source§

fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>

Replay the last captured graph. Returns Ok(false) if no graph is cached; caller should run eager.
Source§

fn reset_graph(_ctx: &mut Self::Context)

Drop the cached decode graph — required when the KV cache it was captured against is about to be freed (e.g. request release), since the graph holds raw device pointers into that cache.
Source§

fn load_quant_fused( _parts: &[(GgufQuantType, &[u8], usize)], _n_cols: usize, ) -> Result<Self::QuantStore>

Build a fused QuantStore from multiple (kind, bytes, n_rows) parts that share n_cols. Used by GgufLoader::load_fused when parts have heterogeneous quant kinds (e.g. Qwen3 qkv_proj where q+k are Q4_K but v is Q6_K) — byte-concatenation isn’t possible, so each part stays as its own QuantStore and the gemm dispatches one matvec per part with output offsets. Read more
Source§

fn load_quant_experts( _kind: GgufQuantType, _bytes: &[u8], _num_experts: usize, _n_rows: usize, _n_cols: usize, ) -> Result<Self::QuantStore>

Build a stacked-experts QuantStore from a contiguous 3-D weight payload [num_experts, n_rows, n_cols/256] super-blocks. Used for the MoE indirect-dispatch fast path; backends without such a kernel return Err(unsupported) and the model code falls back to the per-expert loop. Read more
Source§

fn gemm_quant_moe_id( _ctx: &mut Self::Context, _a: &Self::Buffer, _weight: &Self::QuantStore, _ids: &Self::Buffer, _tpe: &Self::Buffer, _out: &mut Self::Buffer, _ne11: usize, _top_k: usize, _max_per_expert: usize, _batch: usize, ) -> Result<()>

MoE 2-D indirect-dispatch GEMM (prefill m > 1). Read more
Source§

fn route_topk_softmax( _ctx: &mut Self::Context, _logits: &Self::Buffer, _out_ids: &mut Self::Buffer, _out_weights: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _norm_topk_prob: bool, ) -> Result<()>

GPU-side MoE router: [batch, num_experts] logits → [batch, top_k] expert IDs (i32) + [batch, top_k] combine weights (f32). Read more
Source§

fn compute_ids_tpe_gpu( _ctx: &mut Self::Context, _selected_ids: &Self::Buffer, _tpe: &mut Self::Buffer, _ids: &mut Self::Buffer, _gate_up_args: &mut Self::Buffer, _down_args: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _m_gate_up: usize, _m_down: usize, ) -> Result<()>

GPU-side bucket sort: turn [batch, top_k] selected expert IDs (from Self::route_topk_softmax) into tpe[num_experts] / ids[num_experts * row_stride] arrays consumed by the batched MoE GEMM, and emit indirect-dispatch args for the consumer GEMM. Read more
Source§

fn gemm_quant_moe_id_indirect( _ctx: &mut Self::Context, _src1: &Self::Buffer, _weights: &Self::QuantStore, _ids: &Self::Buffer, _tpe: &Self::Buffer, _out: &mut Self::Buffer, _args_buf: &Self::Buffer, _ne11: usize, _top_k: usize, _max_per_expert: usize, _batch: usize, ) -> Result<()>

Indirect-dispatch variant of gemm_quant_moe_id. Read more
Source§

fn silu_mul_batched( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _total_pairs: usize, _ffn: usize, ) -> Result<()>

Stacked SiLU·gate over [batch * top_k, ffn] rows (prefill version of silu_mul_stacked).
Source§

fn weighted_sum_residual_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>

Fused weighted-sum + residual-add: residual[i] += Σ_k weights[k] · slots[k, i]. Single dispatch replaces the (weighted_sum → moe_out) + (add_inplace residual += moe_out) pair on the decode hot path.
Source§

fn weighted_sum_residual_norm_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _next_norm_w: &Self::Buffer, _normed_out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, _eps: f32, ) -> Result<()>

Fused weighted-sum-residual + RMSNorm: combines this layer’s weighted_sum_residual_stacked with the next layer’s leading rms_norm into a single dispatch. Read more
Source§

fn weighted_sum_batched( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _top_k: usize, _hidden: usize, ) -> Result<()>

Per-batch weighted sum: out[b, h] = Σ_k weights[b, k] · slots[b, k, h]. Single dispatch covers the whole batch (prefill version of weighted_sum_stacked which only handled one token).
Source§

fn weighted_sum_batched_offset( ctx: &mut Self::Context, slots: &Self::Buffer, weights: &Self::Buffer, weights_offset: usize, out: &mut Self::Buffer, out_offset: usize, batch: usize, top_k: usize, hidden: usize, ) -> Result<()>

Offset-aware variant of Self::weighted_sum_batched — reads weights starting at weights_offset (in elements, pointing at the start of [batch, top_k]) and writes out starting at out_offset (in elements, pointing at the start of [batch, hidden]). Used by the per-item batched-decode path to skip copy_slice round-trips. Default falls back to the non-offset variant via two copies.
Source§

fn gemv_quant_moe_id( _ctx: &mut Self::Context, _a: &Self::Buffer, _weight: &Self::QuantStore, _ids: &Self::Buffer, _out: &mut Self::Buffer, _n_selected: usize, _src1_stride: usize, ) -> Result<()>

MoE indirect-dispatch GEMV: out[i, :] = a[i, :] @ dequant(weight[ids[i], :])^T for each i ∈ [0, n_selected). Single backend dispatch covers all selected (token, expert) pairs. Read more
Source§

fn gemv_quant_moe_id_offset( ctx: &mut Self::Context, a: &Self::Buffer, a_offset: usize, weight: &Self::QuantStore, ids: &Self::Buffer, ids_offset: usize, out: &mut Self::Buffer, n_selected: usize, src1_stride: usize, ) -> Result<()>

Offset-aware variant of Self::gemv_quant_moe_id — reads a from a_offset (in elements; meaningful only when src1_stride=0 for the broadcast case, or as the start of an n_selected × K strided read when src1_stride≥K), reads ids from ids_offset (the i-th top_k block in a stacked-batch [M, top_k] ids buffer), and writes out from offset 0 (output stays per-iter scratch). Used by the per-item batched-decode path so the M=N concurrent decodes can read directly from the M-batch selected_ids_buf / norm_out without materialising per-iteration copies.
Source§

fn from_slice_i32(data: &[i32]) -> Self::Buffer

Allocate a backend buffer of i32-typed values for kernels that need integer indices (MoE expert IDs, scatter indices, etc.). Read more
Source§

fn write_i32_into(buf: &mut Self::Buffer, data: &[i32])

Overwrite an existing i32 buffer’s contents in place. Used on the MoE decode hot path: per-layer expert-id updates do an in-place memcpy instead of allocating a fresh device buffer (48 layers × 128 tokens = 6144 fresh allocations per decode run otherwise — allocator pressure dominates the secondary cost). Read more
Source§

fn write_f32_into(buf: &mut Self::Buffer, data: &[f32])

Overwrite an existing f32 buffer’s contents in place. Counterpart to write_i32_into for f32 data — used to update the per-token MoE combine weights into a pre-allocated scratch buffer instead of allocating a fresh from_slice buffer 6144 times per decode run.
Source§

fn silu_mul_stacked( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _ffn: usize, ) -> Result<()>

Stacked SiLU·gate over [n_slots, ffn] rows. Read more
Source§

fn gemv_quant_moe_id_gate_up_silu( _ctx: &mut Self::Context, _a: &Self::Buffer, _gate_w: &Self::QuantStore, _up_w: &Self::QuantStore, _ids: &Self::Buffer, _silu_out: &mut Self::Buffer, _n_selected: usize, ) -> Result<()>

Fused gate+up MoE GEMV with in-register SiLU(gate) * up. Read more
Source§

fn supports_fused_moe_gate_up_silu() -> bool

Source§

fn gemv_quant_moe_id_batched( _ctx: &mut Self::Context, _a: &Self::Buffer, _weight: &Self::QuantStore, _ids: &Self::Buffer, _out: &mut Self::Buffer, _m: usize, _top_k: usize, _src1_outer_stride: usize, _src1_inner_stride: usize, ) -> Result<()>

Batched MoE indirect-dispatch GEMV — one Metal launch covers all m * top_k (token, expert) pairs at once. Read more
Source§

fn supports_batched_moe_gemv() -> bool

Capability probe for Self::gemv_quant_moe_id_batched.
Source§

fn supports_paged_kv() -> bool

Whether this backend has a paged-KV decode path (paged_decode_attention etc.). Currently true for Metal, false for CPU. Used to decide the default of FERRUM_METAL_PAGED_KV — the serve path should opt in automatically when supported so users get the bench-quality concurrent-decode numbers without having to learn the flag.
Source§

fn gemv_quant_moe_id_gate_up_silu_batched( _ctx: &mut Self::Context, _a: &Self::Buffer, _gate_w: &Self::QuantStore, _up_w: &Self::QuantStore, _ids: &Self::Buffer, _silu_out: &mut Self::Buffer, _m: usize, _top_k: usize, _src1_outer_stride: usize, _src1_inner_stride: usize, ) -> Result<()>

Batched fused gate+up MoE GEMV with in-register SiLU(gate) * up. Read more
Source§

fn supports_batched_moe_gate_up_silu() -> bool

Source§

fn weighted_sum_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>

Weighted sum across n_slots rows of [hidden]. Read more
Source§

fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>

Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV attention variant. Extension point only; no backend implements it yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more
Source§

fn split_qkv_norm_rope( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _k_out: &mut Self::Buffer, _v_out: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, ) -> Result<()>

Fused split-QKV + QK-norm + RoPE + head-major transpose. Read more
Source§

fn split_qkv_norm_rope_into_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _cache_capacity: usize, ) -> Result<()>

Variant of Backend::split_qkv_norm_rope that writes the new K and V directly into pre-allocated head-major KV cache buffers at slot [kv_heads, cache_len .. cache_len + tokens, hd]. Eliminates the trailing kv_cache_append_head_major dispatch on the decode hot path. Q still lands in per-token head-major scratch (flash-attention reads it as the query). Read more
Source§

fn split_qkv_norm_rope_into_paged_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _qkv_byte_offset: u64, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _q_out_byte_offset: u64, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _block_table: &Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Source§

fn paged_decode_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _q_len: usize, ) -> Result<()>

Paged-KV variant of Self::flash_attention. Read more
Source§

fn alloc_u32(n: usize) -> Self::Buffer

Allocate a u32 buffer of length n for paged-KV bookkeeping (block tables, context lens). Default uses the existing from_slice_i32 route then bit-casts; backends with a faster path can override.
Source§

fn write_u32(_ctx: &mut Self::Context, _dst: &mut Self::Buffer, _data: &[u32])

Write a u32 slice into a buffer previously allocated via Self::alloc_u32. Used for live block_tables / context_lens updates between decode steps. Read more
Source§

fn from_weight_bytes(raw: &[u8], src_dtype: SrcDtype) -> Self::Buffer

Load a weight tensor straight from its on-disk byte representation, letting the backend pick its preferred storage dtype. Read more
Source§

fn world_size(_ctx: &Self::Context) -> usize

Source§

fn rank(_ctx: &Self::Context) -> usize

Source§

fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )

Source§

fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )

Source§

fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V