pub struct CpuBackend { /* private fields */ }

Implementations
impl CpuBackend
Trait Implementations
impl ComputeBackend for CpuBackend
fn matmul(
    &self,
    a: &BufferHandle,
    b: &BufferHandle,
    out: &BufferHandle,
    m: u32,
    n: u32,
    k: u32,
) -> Result<()>
Matrix multiply: C[m,n] = A[m,k] * B[k,n]. For single-token decode (m = 1), this is a GEMV. Buffers hold row-major f32 data.
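As a sketch of the arithmetic this method performs, here is a reference implementation over plain f32 slices. The slices stand in for buffer contents (an assumption, since BufferHandle's fields are private); the crate's actual CPU kernel may be blocked or vectorized differently.

```rust
/// Reference row-major matmul: c[m, n] = a[m, k] * b[k, n].
/// For m == 1 the outer loop runs once and this degenerates to a GEMV,
/// which is the single-token decode hot path.
fn matmul_ref(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
    assert_eq!(a.len(), m * k);
    assert_eq!(b.len(), k * n);
    assert_eq!(c.len(), m * n);
    for i in 0..m {
        for j in 0..n {
            let mut acc = 0.0f32;
            for p in 0..k {
                // Row-major indexing: a[i][p] * b[p][j]
                acc += a[i * k + p] * b[p * n + j];
            }
            c[i * n + j] = acc;
        }
    }
}
```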
fn softmax(
    &self,
    input: &BufferHandle,
    output: &BufferHandle,
    size: u32,
) -> Result<()>
Softmax: out[i] = exp(input[i] - max) / sum(exp(input - max)). Numerically stable via max subtraction.
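The max-subtraction trick above can be sketched over plain slices (slices standing in for buffer contents, an assumption about the private buffer layout):

```rust
/// Numerically stable softmax: subtracting the running max before exp()
/// keeps every exponent <= 0, so nothing overflows to infinity.
fn softmax_ref(input: &[f32], output: &mut [f32]) {
    let max = input.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mut sum = 0.0f32;
    for (o, &x) in output.iter_mut().zip(input) {
        *o = (x - max).exp();
        sum += *o;
    }
    for o in output.iter_mut() {
        *o /= sum; // normalize so the outputs sum to 1
    }
}
```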
fn rms_norm(
    &self,
    input: &BufferHandle,
    weight: &BufferHandle,
    output: &BufferHandle,
    size: u32,
    eps: f32,
) -> Result<()>
RMSNorm: out = (input / rms) * weight, where rms = sqrt(mean(input^2) + eps).
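A reference sketch of that formula over plain slices (the actual backend reads through BufferHandle, so this is illustrative only):

```rust
/// RMSNorm: scale each element by 1/rms, then by a learned per-element weight.
/// eps keeps the sqrt away from zero for all-zero inputs.
fn rms_norm_ref(input: &[f32], weight: &[f32], output: &mut [f32], eps: f32) {
    let mean_sq = input.iter().map(|x| x * x).sum::<f32>() / input.len() as f32;
    let rms = (mean_sq + eps).sqrt();
    for ((o, &x), &w) in output.iter_mut().zip(input).zip(weight) {
        *o = x / rms * w;
    }
}
```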
fn rope(
    &self,
    q: &BufferHandle,
    k: &BufferHandle,
    pos: u32,
    head_dim: u32,
    freq_base: f32,
    _n_heads_q: u32,
    _n_heads_k: u32,
) -> Result<()>
RoPE (Rotary Position Embedding), applied in-place to the Q and K buffers. head_dim is the dimension per attention head (typically 128). Rotation is applied in pairs: (q[2i], q[2i+1]) is rotated by the angle pos * freq.
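A sketch of the pairwise rotation on a single head vector, assuming the common convention freq_i = freq_base^(-2i/head_dim); the real method presumably loops this over every Q and K head, and the crate's pairing convention may differ:

```rust
/// In-place RoPE on one head vector: rotate each pair (x[2i], x[2i+1])
/// by the angle pos * freq_i, where freq_i = freq_base^(-2i / head_dim).
/// Low-index pairs rotate fast, high-index pairs slowly.
fn rope_ref(x: &mut [f32], pos: u32, head_dim: usize, freq_base: f32) {
    for i in 0..head_dim / 2 {
        let freq = freq_base.powf(-2.0 * i as f32 / head_dim as f32);
        let angle = pos as f32 * freq;
        let (sin, cos) = angle.sin_cos();
        let (x0, x1) = (x[2 * i], x[2 * i + 1]);
        // Standard 2D rotation of the pair.
        x[2 * i] = x0 * cos - x1 * sin;
        x[2 * i + 1] = x0 * sin + x1 * cos;
    }
}
```

Since rotation is an isometry, RoPE preserves the norm of each pair, which is a cheap sanity check for any implementation.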
fn silu(
    &self,
    input: &BufferHandle,
    output: &BufferHandle,
    size: u32,
) -> Result<()>
SiLU (Sigmoid Linear Unit): out[i] = input[i] * sigmoid(input[i]). Also known as Swish. Used in the SwiGLU FFN.
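The activation reduces to one expression per element; a reference sketch over plain slices:

```rust
/// SiLU / Swish: x * sigmoid(x), written as x / (1 + e^-x) to
/// compute the sigmoid and product in one step.
fn silu_ref(input: &[f32], output: &mut [f32]) {
    for (o, &x) in output.iter_mut().zip(input) {
        *o = x / (1.0 + (-x).exp());
    }
}
```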
fn element_mul(
    &self,
    a: &BufferHandle,
    b: &BufferHandle,
    output: &BufferHandle,
    size: u32,
) -> Result<()>
Element-wise multiply: out[i] = a[i] * b[i]
fn add(
    &self,
    a: &BufferHandle,
    b: &BufferHandle,
    output: &BufferHandle,
    size: u32,
) -> Result<()>
Element-wise add: out[i] = a[i] + b[i]
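Both element-wise ops above are single fused loops; a reference sketch over plain slices (again standing in for buffer contents):

```rust
/// out[i] = a[i] * b[i] — used e.g. for the gate * up product in SwiGLU.
fn element_mul_ref(a: &[f32], b: &[f32], out: &mut [f32]) {
    for ((o, &x), &y) in out.iter_mut().zip(a).zip(b) {
        *o = x * y;
    }
}

/// out[i] = a[i] + b[i] — used e.g. for residual connections.
fn add_ref(a: &[f32], b: &[f32], out: &mut [f32]) {
    for ((o, &x), &y) in out.iter_mut().zip(a).zip(b) {
        *o = x + y;
    }
}
```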
fn name(&self) -> &str
fn device_info(&self) -> DeviceInfo
fn allocate(&self, size_bytes: usize) -> Result<BufferHandle>
fn free(&self, handle: BufferHandle) -> Result<()>
fn copy_to_device(&self, data: &[u8], handle: &BufferHandle) -> Result<()>
fn copy_from_device(&self, handle: &BufferHandle, data: &mut [u8]) -> Result<()>
fn copy_buffer( &self, src: &BufferHandle, dst: &BufferHandle, size: usize, ) -> Result<()>
fn copy_buffer_offset( &self, src: &BufferHandle, dst: &BufferHandle, src_offset: usize, dst_offset: usize, size: usize, ) -> Result<()>
fn synchronize(&self) -> Result<()>
fn attn_score(
    &self,
    _q: &BufferHandle,
    _k_cache: &BufferHandle,
    _scores: &BufferHandle,
    _head_dim: u32,
    _seq_len: u32,
    _head_offset: u32,
    _kv_offset: u32,
    _kv_stride: u32,
) -> Result<()>
Compute attention scores: scores[pos] = Q[head_offset..] · K_cache[pos*kv_stride+kv_offset..]
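A sketch of that dot-product loop over plain slices. Note the documented formula has no 1/sqrt(head_dim) scaling, so presumably the caller applies it (or folds it into the weights) elsewhere; that is an assumption, as is the slice layout:

```rust
/// scores[pos] = dot(q[head_offset..][..head_dim],
///                   k_cache[pos * kv_stride + kv_offset..][..head_dim])
/// head_offset selects the query head; kv_offset/kv_stride address the
/// matching KV head inside each cached position's row.
fn attn_score_ref(
    q: &[f32], k_cache: &[f32], scores: &mut [f32],
    head_dim: usize, seq_len: usize,
    head_offset: usize, kv_offset: usize, kv_stride: usize,
) {
    let qh = &q[head_offset..][..head_dim];
    for pos in 0..seq_len {
        let kh = &k_cache[pos * kv_stride + kv_offset..][..head_dim];
        scores[pos] = qh.iter().zip(kh).map(|(a, b)| a * b).sum();
    }
}
```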
fn attn_value(
    &self,
    _weights: &BufferHandle,
    _v_cache: &BufferHandle,
    _output: &BufferHandle,
    _head_dim: u32,
    _seq_len: u32,
    _kv_offset: u32,
    _kv_stride: u32,
    _out_offset: u32,
) -> Result<()>
Compute weighted value aggregation: out[out_offset+d] = sum_pos(weights[pos] * V[pos*kv_stride+kv_offset+d])
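The aggregation formula above, sketched over plain slices (slice layout is an assumption about the private buffers):

```rust
/// out[out_offset + d] = sum over pos of
///     weights[pos] * v_cache[pos * kv_stride + kv_offset + d]
/// i.e. the softmaxed attention weights blend the cached value vectors.
fn attn_value_ref(
    weights: &[f32], v_cache: &[f32], output: &mut [f32],
    head_dim: usize, seq_len: usize,
    kv_offset: usize, kv_stride: usize, out_offset: usize,
) {
    for d in 0..head_dim {
        let mut acc = 0.0f32;
        for pos in 0..seq_len {
            acc += weights[pos] * v_cache[pos * kv_stride + kv_offset + d];
        }
        output[out_offset + d] = acc;
    }
}
```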
fn quantized_matmul(
    &self,
    _weights: &BufferHandle,
    _input: &BufferHandle,
    _output: &BufferHandle,
    _n_rows: u32,
    _n_cols: u32,
    _dtype: DType,
) -> Result<()>
Fused dequantize + matrix-vector multiply for quantized weights. GPU backends override this with fused kernels that operate on the quantized data directly in VRAM; the default implementation falls back to regular matmul and assumes the data has already been dequantized.
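The concrete layouts behind DType are not shown on this page, so the following is only an illustrative dequantize-then-matvec for a hypothetical symmetric int8 format with one f32 scale per weight row; the crate's real quantization formats and kernels may differ:

```rust
/// Hypothetical symmetric int8 scheme: row r stores n_cols i8 quants plus
/// one f32 scale. Accumulating in the quantized domain and scaling once
/// per row is what a fused kernel would do instead of materializing f32s.
fn dequant_matvec_i8(
    scales: &[f32], quants: &[i8], input: &[f32],
    output: &mut [f32], n_rows: usize, n_cols: usize,
) {
    for r in 0..n_rows {
        let row = &quants[r * n_cols..][..n_cols];
        let mut acc = 0.0f32;
        for (q, &x) in row.iter().zip(input) {
            acc += *q as f32 * x;
        }
        output[r] = acc * scales[r]; // one multiply per row, not per element
    }
}
```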
Auto Trait Implementations
impl !Freeze for CpuBackend
impl RefUnwindSafe for CpuBackend
impl Send for CpuBackend
impl Sync for CpuBackend
impl Unpin for CpuBackend
impl UnsafeUnpin for CpuBackend
impl UnwindSafe for CpuBackend
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,

fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.