Skip to main content

CpuBackend

Struct CpuBackend 

Source
pub struct CpuBackend;

Trait Implementations§

Source§

impl Backend for CpuBackend

Source§

type Buffer = Vec<f32>

Source§

type Context = ()

Execution context that accumulates GPU work. Read more
Source§

type GptqStore = CpuGptqStore

Opaque per-backend GPTQ weight representation. Read more
Source§

fn new_context() -> Self::Context

Create a new execution context (begin accumulating work).
Source§

fn sync(_ctx: &mut Self::Context)

Flush accumulated work and wait for completion. CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.
Source§

fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Self::GptqStore>

Repack raw GPTQ tensors into the backend’s preferred format. Called once per layer at model load time. Read more
Source§

fn gemm_gptq( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::GptqStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>

GEMM with pre-loaded GPTQ weights. out[m, n] = a[m, k] @ dequant(weight)^T
Source§

fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )

Source§

fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source§

fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source§

fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )

Source§

fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )

Copy len floats from src[src_offset..] to dst[dst_offset..]. Read more
Source§

fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )

Source§

fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )

Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers. Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim]
Source§

fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )

Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im], then compute SiLU(gate) * up → out [tokens, im].
Source§

fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )

Fused QK-norm + RoPE + transpose-to-head-major. Read more
Source§

fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )

Append new K/V into a pre-allocated head-major cache buffer. Read more
Source§

fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )

Transpose [heads, tokens, dim] → [tokens, heads, dim]. Called after flash_attention to restore token-major layout for O-proj.
Source§

fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )

residual[i] += x[i] (in-place)
Source§

fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )

Broadcast bias add: data[r, c] += bias[c] for every row. Required by Bert / Clip / Whisper whose linear projections carry a bias.
Source§

fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Full LayerNorm (mean + variance normalisation + affine), distinct from the rms_norm used by Llama-family decoders. out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c] Where mean and var are reduced over the last dim (cols).
Source§

fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )

Element-wise GELU activation (erf-based, matches PyTorch default).
Source§

fn alloc(len: usize) -> Self::Buffer

Source§

fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>

Source§

fn from_slice(data: &[f32]) -> Self::Buffer

Source§

fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)

Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).
Source§

fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)

Toggle between scalar-arg kernels (normal) and _dyn kernels that read their dynamic scalar args from device memory (graph-friendly).
Source§

fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>

Begin stream capture. Subsequent kernel launches are recorded into a pending graph instead of executing eagerly.
Source§

fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>

End stream capture and install the captured graph as this context’s “last graph” for future replay_last_graph calls.
Source§

fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>

Replay the last captured graph. Returns Ok(false) if no graph is cached; caller should run eager.
Source§

fn reset_graph(_ctx: &mut Self::Context)

Drop the cached decode graph — required when the KV cache it was captured against is about to be freed (e.g. request release), since the graph holds raw device pointers into that cache.
Source§

fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>

Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV attention variant. Extension point only; no backend implements it yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more
Source§

fn gemm_quant( _ctx: &mut Self::Context, _a: &Self::Buffer, _weights: &QuantWeights<'_, Self>, _out: &mut Self::Buffer, _m: usize, _n: usize, _k: usize, kind: &QuantKind, ) -> Result<()>

GEMM with packed-quantized B matrix. m/n/k describe the dense equivalent ([m,n] = [m,k] @ [k,n]^T).
Source§

fn world_size(_ctx: &Self::Context) -> usize

Source§

fn rank(_ctx: &Self::Context) -> usize

Source§

fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )

Source§

fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )

Source§

fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.