pub struct CpuBackend;
Trait Implementations§
Source§impl Backend for CpuBackend
impl Backend for CpuBackend
type Buffer = Vec<f32>
Source§type GptqStore = CpuGptqStore
type GptqStore = CpuGptqStore
Opaque per-backend GPTQ weight representation. Read more
Source§fn new_context() -> Self::Context
fn new_context() -> Self::Context
Create a new execution context (begin accumulating work).
Source§fn sync(_ctx: &mut Self::Context)
fn sync(_ctx: &mut Self::Context)
Flush accumulated work and wait for completion.
CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.
Source§fn load_gptq(
qweight: &[i32],
scales: &[f32],
qzeros: &[i32],
_g_idx: Option<&[i32]>,
bits: u32,
group_size: usize,
k: usize,
n: usize,
) -> Result<Self::GptqStore>
fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Self::GptqStore>
Repack raw GPTQ tensors into the backend’s preferred format.
Called once per layer at model load time. Read more
Source§fn gemm_gptq(
ctx: &mut Self::Context,
a: &Self::Buffer,
weight: &Self::GptqStore,
out: &mut Self::Buffer,
m: usize,
) -> Result<()>
fn gemm_gptq( ctx: &mut Self::Context, a: &Self::Buffer, weight: &Self::GptqStore, out: &mut Self::Buffer, m: usize, ) -> Result<()>
GEMM with pre-loaded GPTQ weights.
out[m, n] = a[m, k] @ dequant(weight)^T
fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )
fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )
Source§fn copy_slice(
_ctx: &mut Self::Context,
src: &Self::Buffer,
src_offset: usize,
dst: &mut Self::Buffer,
dst_offset: usize,
len: usize,
)
fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )
fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )
Source§fn split_qkv(
_ctx: &mut Self::Context,
qkv: &Self::Buffer,
q: &mut Self::Buffer,
k: &mut Self::Buffer,
v: &mut Self::Buffer,
tokens: usize,
q_dim: usize,
kv_dim: usize,
)
fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )
Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers.
Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim]
Source§fn fused_silu_mul_split(
_ctx: &mut Self::Context,
gate_up: &Self::Buffer,
out: &mut Self::Buffer,
tokens: usize,
im: usize,
)
fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )
Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im],
then compute SiLU(gate) * up → out [tokens, im].
Source§fn qk_norm_rope(
_ctx: &mut Self::Context,
input: &Self::Buffer,
norm_w: &Self::Buffer,
cos: &Self::Buffer,
sin: &Self::Buffer,
output: &mut Self::Buffer,
tokens: usize,
heads: usize,
head_dim: usize,
pos_offset: usize,
eps: f32,
mode: i32,
)
fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )
Fused QK-norm + RoPE + transpose-to-head-major. Read more
Source§fn kv_cache_append_head_major(
_ctx: &mut Self::Context,
cache_k: &mut Self::Buffer,
cache_v: &mut Self::Buffer,
cache_len: usize,
cache_capacity: usize,
new_k_head_major: &Self::Buffer,
new_v_head_major: &Self::Buffer,
new_tokens: usize,
nkv: usize,
hd: usize,
)
fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )
Append new K/V into a pre-allocated head-major cache buffer. Read more
Source§fn transpose_head_to_token(
_ctx: &mut Self::Context,
src: &Self::Buffer,
dst: &mut Self::Buffer,
tokens: usize,
heads: usize,
dim: usize,
)
fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )
Transpose [heads, tokens, dim] → [tokens, heads, dim].
Called after `flash_attention` to restore token-major layout for O-proj.
Source§fn add_inplace(
_ctx: &mut Self::Context,
residual: &mut Self::Buffer,
x: &Self::Buffer,
len: usize,
)
fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )
residual[i] += x[i] (in-place)
Source§fn add_bias(
_ctx: &mut Self::Context,
data: &mut Self::Buffer,
bias: &Self::Buffer,
rows: usize,
cols: usize,
)
fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )
Broadcast bias add: `data[r, c] += bias[c]` for every row.
Required by Bert / Clip / Whisper whose linear projections carry a bias.
Source§fn layer_norm(
_ctx: &mut Self::Context,
x: &Self::Buffer,
gamma: &Self::Buffer,
beta: &Self::Buffer,
eps: f32,
out: &mut Self::Buffer,
tokens: usize,
dim: usize,
)
fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
Full LayerNorm (mean + variance normalisation + affine), distinct from the `rms_norm` used by Llama-family decoders.
out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c]
where mean and var are reduced over the last dim (cols).
Source§fn gelu(
_ctx: &mut Self::Context,
x: &Self::Buffer,
out: &mut Self::Buffer,
len: usize,
)
fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )
Element-wise GELU activation (erf-based, matches PyTorch default).
fn alloc(len: usize) -> Self::Buffer
fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>
fn from_slice(data: &[f32]) -> Self::Buffer
Source§fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)
fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)
Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).
Source§fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)
fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)
Toggle between scalar-arg kernels (normal) and `_dyn` kernels that read their dynamic scalar args from device memory (graph-friendly).
Source§fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>
fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>
Begin stream capture. Subsequent kernel launches are recorded into
a pending graph instead of executing eagerly.
Source§fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>
fn end_graph_capture(_ctx: &mut Self::Context) -> Result<()>
End stream capture and install the captured graph as this context’s “last graph” for future `replay_last_graph` calls.
Source§fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>
fn replay_last_graph(_ctx: &mut Self::Context) -> Result<bool>
Replay the last captured graph. Returns `Ok(false)` if no graph is cached; in that case the caller should fall back to eager execution.
Source§fn reset_graph(_ctx: &mut Self::Context)
fn reset_graph(_ctx: &mut Self::Context)
Drop the cached decode graph — required when the KV cache it
was captured against is about to be freed (e.g. request release),
since the graph holds raw device pointers into that cache.
Source§fn mla_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_kv_compressed: &Self::Buffer,
_kv_rope: &Self::Buffer,
_out: &mut Self::Buffer,
_batch: usize,
_q_len: usize,
_kv_len: usize,
_pos_offset: usize,
_cfg: &AttnConfig,
_kv_lora_rank: usize,
_qk_rope_head_dim: usize,
) -> Result<()>
fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>
Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV
attention variant. Extension point only; no backend implements it
yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more
Source§fn gemm_quant(
_ctx: &mut Self::Context,
_a: &Self::Buffer,
_weights: &QuantWeights<'_, Self>,
_out: &mut Self::Buffer,
_m: usize,
_n: usize,
_k: usize,
kind: &QuantKind,
) -> Result<()>
fn gemm_quant( _ctx: &mut Self::Context, _a: &Self::Buffer, _weights: &QuantWeights<'_, Self>, _out: &mut Self::Buffer, _m: usize, _n: usize, _k: usize, kind: &QuantKind, ) -> Result<()>
GEMM with packed-quantized B matrix. `m`/`n`/`k` describe the dense equivalent ([m,n] = [m,k] @ [k,n]^T).
fn world_size(_ctx: &Self::Context) -> usize
fn rank(_ctx: &Self::Context) -> usize
fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )
fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )
fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )
Auto Trait Implementations§
impl Freeze for CpuBackend
impl RefUnwindSafe for CpuBackend
impl Send for CpuBackend
impl Sync for CpuBackend
impl Unpin for CpuBackend
impl UnsafeUnpin for CpuBackend
impl UnwindSafe for CpuBackend
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more