Struct CpuBackend

Source

pub struct CpuBackend;

Trait Implementations§

Source §

impl Backend for CpuBackend

Source §

type Context = ()

Execution context that accumulates GPU work. Read more

Source §

type GptqStore = CpuGptqStore

Opaque per-backend GPTQ weight representation. Read more

Source §

fn new_context() -> Self::Context

Create a new execution context (begin accumulating work).

Source §

fn sync(_ctx: &mut Self::Context)

Flush accumulated work and wait for completion. CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.

Source §

fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Self::GptqStore>

Repack raw GPTQ tensors into the backend’s preferred format. Called once per layer at model load time. Read more

Source §

fn gemm_gptq( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::GptqStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>

GEMM with pre-loaded GPTQ weights. out[m, n] = a[m, k] @ dequant(weight)^T

Source §

fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )

Source §

fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source §

fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source §

fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )

Source §

fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )

Copy len floats from src[src_offset..] to dst[dst_offset..]. Read more

Source §

fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )

Source §

fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )

Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers. Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim]

Source §

fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )

Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im], then compute SiLU(gate) * up → out [tokens, im].

Source §

fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )

Fused QK-norm + RoPE + transpose-to-head-major. Read more

Source §

fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )

Append new K/V into a pre-allocated head-major cache buffer. Read more

Source §

fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )

Transpose [heads, tokens, dim] → [tokens, heads, dim]. Called after flash_attention to restore token-major layout for O-proj.

Source §

fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )

residual[i] += x[i] (in-place)

Source §

fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )

Broadcast bias add: data[r, c] += bias[c] for every row. Required by Bert / Clip / Whisper whose linear projections carry a bias.

Source §

fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Full LayerNorm (mean + variance normalisation + affine), distinct from the rms_norm used by Llama-family decoders: out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c], where mean and var are reduced over the last dim (cols).

Source §

fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )

Element-wise GELU activation (erf-based, matches PyTorch default).

Source §

fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)

Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).

Source §

fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)

Toggle between scalar-arg kernels (normal) and _dyn kernels that read their dynamic scalar args from device memory (graph-friendly).

Source §

fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>

Begin stream capture. Subsequent kernel launches are recorded into a pending graph instead of executing eagerly.

Source §

fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>

End stream capture and install the captured graph as this context’s “last graph” for future replay_last_graph calls.

Source §

fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>

Replay the last captured graph. Returns Ok(false) if no graph is cached; caller should run eager.

Source §

fn reset_graph(_ctx: &mut Self::Context)

Drop the cached decode graph — required when the KV cache it was captured against is about to be freed (e.g. request release), since the graph holds raw device pointers into that cache.

Source §

fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>

Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV attention variant. Extension point only; no backend implements it yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more

Source §

fn gemm_quant( _ctx: &mut Self::Context, _a: &Self::Buffer, _weights: &QuantWeights<'_, Self>, _out: &mut Self::Buffer, _m: usize, _n: usize, _k: usize, kind: &QuantKind, ) -> Result<()>

GEMM with packed-quantized B matrix. m/n/k describe the dense equivalent ([m,n] = [m,k] @ [n,k]^T).

Source §

fn world_size(_ctx: &Self::Context) -> usize

Source §

fn rank(_ctx: &Self::Context) -> usize

Source §

fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )

Source §

fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )

Source §

fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )

Auto Trait Implementations§

§

impl Freeze for CpuBackend

§

impl RefUnwindSafe for CpuBackend

§

impl Send for CpuBackend

§

impl Sync for CpuBackend

§

impl Unpin for CpuBackend

§

impl UnsafeUnpin for CpuBackend

§

impl UnwindSafe for CpuBackend

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

CpuBackend

Struct CpuBackend

Trait Implementations§

impl Backend for CpuBackend

type Buffer = Vec<f32>

type Context = ()

type GptqStore = CpuGptqStore

fn new_context() -> Self::Context

fn sync(_ctx: &mut Self::Context)

fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Self::GptqStore>

fn gemm_gptq( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::GptqStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>

fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )

fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )

fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )

fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )

fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )

fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )

fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )

fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )

fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )

fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )

fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )

fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )

fn alloc(len: usize) -> Self::Buffer

fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>

fn from_slice(data: &[f32]) -> Self::Buffer

fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)

fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)

fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>

fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>

fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>

fn reset_graph(_ctx: &mut Self::Context)

fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>

fn gemm_quant( _ctx: &mut Self::Context, _a: &Self::Buffer, _weights: &QuantWeights<'_, Self>, _out: &mut Self::Buffer, _m: usize, _n: usize, _k: usize, kind: &QuantKind, ) -> Result<()>

fn world_size(_ctx: &Self::Context) -> usize

fn rank(_ctx: &Self::Context) -> usize

fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )

fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )

fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )

Auto Trait Implementations§

impl Freeze for CpuBackend

impl RefUnwindSafe for CpuBackend

impl Send for CpuBackend

impl Sync for CpuBackend

impl Unpin for CpuBackend

impl UnsafeUnpin for CpuBackend

impl UnwindSafe for CpuBackend

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct CpuBackend

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,