Skip to main content

CpuBackend

Struct CpuBackend 

Source
pub struct CpuBackend;

Trait Implementations§

Source§

impl Backend for CpuBackend

Source§

fn alloc_typed(dtype: Dtype, n: usize) -> Self::Buffer

Phase D step 2+3: typed alloc. CPU Buffer is Vec — bytes are dtype-erased, so we size the underlying Vec to hold n elements of dtype (bit-cast at read/write time).

Source§

fn from_slice_typed<T: HostDtype>(data: &[T]) -> Self::Buffer

Phase D step 2+3: typed upload. Bit-cast host data into f32 words (CPU buffer is dtype-erased Vec, see alloc_typed).

Source§

fn write_typed<T: HostDtype>( _ctx: &mut Self::Context, dst: &mut Self::Buffer, data: &[T], )

Phase D step 2+3: typed in-place write. Bit-cast bytes into the dtype-erased f32 storage.

Source§

type Buffer = Vec<f32>

Source§

type Context = ()

Execution context that accumulates GPU work. Read more
Source§

type Timer = CpuTimer

GPU-side timer scoped to this backend. See super::timer — CPU: Instant; Metal: sync-wrap; CUDA: cuEvent. PLAYBOOK § 1.1.
Source§

fn make_timer() -> Self::Timer

Factory for Self::Timer — exists so call sites that have a <B: Backend> parameter can spawn a timer without importing the concrete impl. PLAYBOOK § 1.2.
Source§

fn new_context() -> Self::Context

Opaque per-backend GPTQ weight representation. Read more
Source§

fn sync(_ctx: &mut Self::Context)

Flush accumulated work and wait for completion. CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.
Source§

fn activation_elem_size_bytes() -> usize

Byte width of buffers returned by Self::alloc. Read more
Source§

fn fused_silu_mul_split_strided( _ctx: &mut Self::Context, gate_up: &Self::Buffer, in_row_offset: usize, out: &mut Self::Buffer, out_row_offset: usize, tokens: usize, intermediate: usize, )

Strided variant of Backend::fused_silu_mul_split for the bucketed MoE path: reads gate_up rows starting at in_row_offset, writes out rows starting at out_row_offset.
Source§

fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )

Source§

fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source§

fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Source§

fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )

Source§

fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )

Copy len floats from src[src_offset..] to dst[dst_offset..]. Read more
Source§

fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )

Source§

fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )

Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers. Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim]
Source§

fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )

Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im], then compute SiLU(gate) * up → out [tokens, im].
Source§

fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )

Fused QK-norm + RoPE + transpose-to-head-major. Read more
Source§

fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )

Append new K/V into a pre-allocated head-major cache buffer. Read more
Source§

fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )

Transpose [heads, tokens, dim] → [tokens, heads, dim]. Called after flash_attention to restore token-major layout for O-proj.
Source§

fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )

residual[i] += x[i] (in-place)
Source§

fn scaled_add_inplace( _ctx: &mut Self::Context, dst: &mut Self::Buffer, src: &Self::Buffer, scale: f32, len: usize, )

dst[i] += scale * src[i] — scalar-broadcast scaled add, in place. Read more
Source§

fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )

Broadcast bias add: data[r, c] += bias[c] for every row. Required by Bert / Clip / Whisper whose linear projections carry a bias.
Source§

fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )

Full LayerNorm (mean + variance normalisation + affine), distinct from the rms_norm used by Llama-family decoders. out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c] Where mean and var are reduced over the last dim (cols).
Source§

fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )

Element-wise GELU activation (erf-based, matches PyTorch default).
Source§

fn alloc(len: usize) -> Self::Buffer

Source§

fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>

Source§

fn from_slice(data: &[f32]) -> Self::Buffer

Source§

fn with_device_ordinal<R>( _device_ordinal: Option<usize>, body: impl FnOnce() -> R, ) -> R

Run body while binding context-free backend operations to an explicit device ordinal when the backend supports multi-device scopes. Read more
Source§

fn supports_device_ordinal_scope() -> bool

Whether Self::with_device_ordinal actually switches backend execution to the requested ordinal.
Source§

fn sync_before_host_readback(_ctx: &mut Self::Context)

Prepare pending GPU work for a following host readback. Read more
Source§

fn supports_llama_family_batched_decode() -> bool

Whether LlamaFamilyModel::decode_batch_internal may use its optimized batched decode path on this backend. Read more
Source§

fn zero_buffer( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, ) -> Result<()>

Zero the first len elements of a Self::Buffer. CUDA path uses cuMemsetD16Async; default returns unsupported.
Source§

fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>

Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV attention variant. Extension point only; no backend implements it yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more
Source§

fn embedding_lookup_dev( ctx: &mut Self::Context, table: &Self::Buffer, ids: &Self::Buffer, out: &mut Self::Buffer, batch: usize, dim: usize, )

Device-buffer variant of embedding_lookup for graph-capturable MoE routing — the gather step before phase-1 GEMM in moe_forward_bucketed. The host-slice embedding_lookup does clone_htod(ids) internally, which records stale host pointers under CUDA Graph capture replay. Read more
Source§

fn kv_cache_append_batched_per_cache( _ctx: &mut Self::Context, _caches: &[&Self::Buffer], _new_data: &Self::Buffer, _cache_lens: &Self::Buffer, _capacity: usize, _m: usize, _nkv: usize, _hd: usize, _slot: usize, ) -> Result<()>

Batched kv_cache_append across M caches in one launch. Each item writes its (head-major) K-or-V row into its own cache at offset read from cache_lens[i]. Replaces M sequential kv_cache_append_head_major calls with a single dispatch. Read more
Source§

fn flash_attention_batched_per_cache( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_caches: &[&Self::Buffer], _v_caches: &[&Self::Buffer], _kv_lens: &Self::Buffer, _out: &mut Self::Buffer, _nq: usize, _nkv: usize, _hd: usize, _scale: f32, _max_valid_kv: usize, _capacity: usize, _slot: usize, ) -> Result<()>

Batched flash_attention across M decode caches in one launch. Replaces the per-item flash_attention(q_len=1, ...) × M loop in the non-paged batched-decode path. Read more
Source§

fn qk_norm_rope_batched_per_item( _ctx: &mut Self::Context, _input: &Self::Buffer, _norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _output: &mut Self::Buffer, _positions: &Self::Buffer, _m: usize, _heads: usize, _head_dim: usize, _eps: f32, _mode: i32, ) -> Result<()>

Batched per-item-position variant of qk_norm_rope for the non-paged batched-decode path. Each of the m items has its own absolute RoPE position (read from a device i32 buffer of length m). Layout is item-major in both input and output: Read more
Source§

fn split_qkv_norm_rope( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _k_out: &mut Self::Buffer, _v_out: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, ) -> Result<()>

Fused split-QKV + QK-norm + RoPE + head-major transpose. Read more
Source§

fn split_qkv_norm_rope_into_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _cache_capacity: usize, ) -> Result<()>

Variant of Backend::split_qkv_norm_rope that writes the new K and V directly into pre-allocated head-major KV cache buffers at slot [kv_heads, cache_len .. cache_len + tokens, hd]. Eliminates the trailing kv_cache_append_head_major dispatch on the decode hot path. Q still lands in per-token head-major scratch (flash-attention reads it as the query). Read more
Source§

fn transpose_token_to_head( _ctx: &mut Self::Context, _src: &Self::Buffer, _dst: &mut Self::Buffer, _tokens: usize, _heads: usize, _dim: usize, )

Inverse of transpose_head_to_token: [tokens, heads, dim] → [heads, tokens, dim]. Used by the CUDA paged_decode_attention wrapper to convert paged_varlen_attention’s token-major output back to the head-major layout that Qwen3MoeModel expects. Default panics — backends without a paged-KV CUDA path don’t hit this code.
Source§

fn write_f32_to_activation( ctx: &mut Self::Context, dst: &mut Self::Buffer, data: &[f32], )

Source§

fn argmax_rows_f16( _ctx: &mut Self::Context, logits: &Self::Buffer, m: usize, n: usize, ) -> Result<Vec<u32>>

Greedy-decode fast path: GPU argmax over each row of a [m, n] FP16 logits buffer, returning the m token indices on the host. Saves m × n × 2 bytes of D2H per call (e.g. 19.5 MB at c=32, vocab=152064) and the host-side argmax scan (~150 µs × m). Read more
Source§

fn from_weight_bytes(raw: &[u8], src_dtype: SrcDtype) -> Self::Buffer

Load a weight tensor straight from its on-disk byte representation, letting the backend pick its preferred storage dtype. Read more
Source§

impl BackendCollective for CpuBackend

Source§

fn world_size(_ctx: &Self::Context) -> usize

Source§

fn rank(_ctx: &Self::Context) -> usize

Source§

fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )

Source§

fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )

Source§

fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )

Source§

impl BackendGraph for CpuBackend

Source§

fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)

Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).
Source§

fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)

Toggle between scalar-arg kernels (normal) and _dyn kernels that read their dynamic scalar args from device memory (graph-friendly).
Source§

fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>

Begin stream capture. Subsequent kernel launches are recorded into a pending graph instead of executing eagerly.
Source§

fn end_graph_capture(_ctx: &mut Self::Context, _key: u64) -> Result<()>

End stream capture and install the captured graph keyed by _key (opaque caller-chosen u64; the model uses m_padded so that different batch shapes don’t thrash a single slot).
Source§

fn replay_graph(_ctx: &mut Self::Context, _key: u64) -> Result<bool>

Replay the captured graph for _key. Returns Ok(false) if no graph is cached for that key; caller should run eager.
Source§

fn reset_graph(_ctx: &mut Self::Context, _key: u64)

Drop the cached graph for _key — required when its kernel-arg pointers (KV cache, scratch) might no longer be valid. Use reset_all_graphs when EVERY cached graph should be evicted (hard model reload / scratch realloc).
Source§

fn reset_all_graphs(_ctx: &mut Self::Context)

Drop ALL cached graphs — used by hard reset paths.
Source§

impl BackendKvDtype<KvFp16> for CpuBackend

Source§

type KvBuffer = <CpuBackend as Backend>::Buffer

Per-layer K/V element storage.
Source§

type KvScales = ()

Per-token per-kv-head scale storage. () for FP16 (no scales).
Source§

impl BackendMoeFused for CpuBackend

Source§

fn upload_moe_routing( _ctx: &mut Self::Context, _sorted_token_ids: &[i32], _expert_ids: &[i32], _num_tokens_past_padded: &[i32], ) -> Result<MoeRouting<Self>>

Routing inputs for moe_gemm_phase_vllm — host-built i32 arrays uploaded once per layer (or per token, depending on caller cadence). Matches the shape contract of moe_align_block_size outputs but is usable on backends that build the indices on host. Read more
Source§

fn route_topk_softmax( _ctx: &mut Self::Context, _logits: &Self::Buffer, _out_ids: &mut Self::Buffer, _out_weights: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _norm_topk_prob: bool, ) -> Result<()>

GPU-side MoE router: [batch, num_experts] logits → [batch, top_k] expert IDs (i32) + [batch, top_k] combine weights (f32). Read more
Source§

fn try_gpu_route_topk_into_host( _ctx: &mut Self::Context, _logits_dev: &Self::Buffer, _out_ids_host: &mut Vec<u32>, _out_weights_host: &mut Vec<f32>, _batch: usize, _num_experts: usize, _top_k: usize, _norm_topk_prob: bool, ) -> Result<()>

GPU-side fast-path for the host route() leg of the bucketed MoE forward (moe_forward_bucketed in ferrum-models). Replaces the B::sync(ctx) + B::to_vec(logits) + crate::moe::router:: route_into(...) triple with a single GPU kernel + small D2H of [batch, top_k] ids + weights. Read more
Source§

fn moe_build_pairs_by_token( _ctx: &mut Self::Context, _expert_ids: &Self::Buffer, _pairs_by_token: &mut Self::Buffer, _packed_token_idx: &mut Self::Buffer, _expert_offsets: &mut Self::Buffer, _batch_x_topk: usize, _num_experts: usize, _top_k: usize, ) -> Result<()>

GPU-side moe_align_block_size — prep for a future fused MoE Marlin kernel. Takes per-pair expert assignments (from Self::route_topk_softmax) and produces: Read more
Source§

fn moe_align_block_size( _ctx: &mut Self::Context, _expert_ids_per_pair: &Self::Buffer, _sorted_token_ids: &mut Self::Buffer, _block_ids: &mut Self::Buffer, _total_tokens_post_pad: &mut Self::Buffer, _batch_x_topk: usize, _num_experts: usize, _block_size: usize, _sorted_max_size: usize, ) -> Result<()>

b[block_ids[blockIdx.y] * n_per_expert + ...]. Read more
Source§

fn moe_align_block_size_pair_ids( _ctx: &mut Self::Context, _expert_ids_per_pair: &Self::Buffer, _sorted_token_ids: &mut Self::Buffer, _block_ids: &mut Self::Buffer, _total_tokens_post_pad: &mut Self::Buffer, _batch_x_topk: usize, _num_experts: usize, _block_size: usize, _sorted_max_size: usize, ) -> Result<()>

vLLM-native align variant: sorted_token_ids stores flattened (token, top_k_slot) pair ids, not Ferrum’s pre-gathered packed rows. This lets marlin_moe read gate_up input as A[pair_id / top_k].
Source§

fn compute_ids_tpe_gpu( _ctx: &mut Self::Context, _selected_ids: &Self::Buffer, _tpe: &mut Self::Buffer, _ids: &mut Self::Buffer, _gate_up_args: &mut Self::Buffer, _down_args: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _m_gate_up: usize, _m_down: usize, ) -> Result<()>

GPU-side bucket sort: turn [batch, top_k] selected expert IDs (from Self::route_topk_softmax) into tpe[num_experts] / ids[num_experts * row_stride] arrays consumed by the batched MoE GEMM, and emit indirect-dispatch args for the consumer GEMM. Read more
Source§

fn silu_mul_batched( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _total_pairs: usize, _ffn: usize, ) -> Result<()>

Stacked SiLU·gate over [batch * top_k, ffn] rows (prefill version of silu_mul_stacked).
Source§

fn weighted_sum_residual_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>

Fused weighted-sum + residual-add: residual[i] += Σ_k weights[k] · slots[k, i]. Single dispatch replaces the (weighted_sum → moe_out) + (add_inplace residual += moe_out) pair on the decode hot path.
Source§

fn weighted_sum_residual_norm_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _next_norm_w: &Self::Buffer, _normed_out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, _eps: f32, ) -> Result<()>

Fused weighted-sum-residual + RMSNorm: combines this layer’s weighted_sum_residual_stacked with the next layer’s leading rms_norm into a single dispatch. Read more
Source§

fn weighted_sum_batched( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _top_k: usize, _hidden: usize, ) -> Result<()>

Per-batch weighted sum: out[b, h] = Σ_k weights[b, k] · slots[b, k, h]. Single dispatch covers the whole batch (prefill version of weighted_sum_stacked which only handled one token).
Source§

fn weighted_sum_batched_offset( ctx: &mut Self::Context, slots: &Self::Buffer, weights: &Self::Buffer, weights_offset: usize, out: &mut Self::Buffer, out_offset: usize, batch: usize, top_k: usize, hidden: usize, ) -> Result<()>

Offset-aware variant of Self::weighted_sum_batchedweights reads from weights_offset (in elements, points at the start of [batch, top_k]), out writes from out_offset (in elements, points at start of [batch, hidden]). Used by the per-item batched-decode path to skip copy_slice round-trips. Default falls back to the non-offset variant via two copies.
Source§

fn silu_mul_stacked( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _ffn: usize, ) -> Result<()>

Stacked SiLU·gate over [n_slots, ffn] rows. Read more
Source§

fn supports_fused_moe_gate_up_silu() -> bool

Capability probe for [Self::gemv_quant_moe_id_gate_up_silu]. Read more
Source§

fn supports_batched_moe_gemv() -> bool

Capability probe for [Self::gemv_quant_moe_id_batched].
Source§

fn supports_batched_moe_gate_up_silu() -> bool

Capability probe for [Self::gemv_quant_moe_id_gate_up_silu_batched].
Source§

fn weighted_sum_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>

Weighted sum across n_slots rows of [hidden]. Read more
Source§

fn moe_combine( ctx: &mut Self::Context, packed_down: &Self::Buffer, pairs_by_token: &Self::Buffer, pair_weights: &Self::Buffer, out: &mut Self::Buffer, batch: usize, hidden: usize, top_k: usize, total_pairs: usize, )

MoE combine: per-token weighted sum across top_k expert outputs. Read more
Source§

impl BackendPagedKv for CpuBackend

Source§

fn supports_paged_kv() -> bool

Whether this backend has a paged-KV decode path (paged_decode_attention etc.). Currently true for Metal, false for CPU. Used to decide the default of FERRUM_METAL_PAGED_KV — the serve path should opt in automatically when supported so users get the bench-quality concurrent-decode numbers without having to learn the flag.
Source§

fn populate_batched_pointers( _ctx: &mut Self::Context, _k_caches: &[&Self::Buffer], _v_caches: &[&Self::Buffer], _num_layers: usize, _m: usize, ) -> Result<()>

Pre-populate the per-slot device-pointer scratch arrays used by the batched kernels (kv_cache_append_batched_per_cache and flash_attention_batched_per_cache). Required by the CUDA-graph capture path: the captured graph contains only kernel launches (no captured memcpy_htod), so the device scratch must be fresh when the graph replays. Read more
Source§

fn split_qkv_norm_rope_into_paged_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _qkv_byte_offset: u64, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _q_out_byte_offset: u64, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _block_table: &Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Paged-KV variant of [Self::split_qkv_norm_rope_into_cache]. Read more
Source§

fn paged_decode_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _q_len: usize, ) -> Result<()>

Paged-KV variant of [Self::flash_attention]. Read more
Source§

fn supports_varlen_qkv() -> bool

Capability: does this backend implement split_qkv_norm_rope_into_paged_cache_varlen and paged_varlen_attention? Required by the unified mixed-batch forward path used by LlamaFamilyModel::unified_forward. Default false; backends that ship the varlen kernels override.
Source§

fn split_qkv_norm_rope_into_paged_cache_varlen( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _m_total: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _eps: f32, _qk_mode: i32, _block_size: usize, _max_blocks_per_seq: usize, ) -> Result<()>

Source§

fn paged_varlen_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _total_q_tokens: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Variable-length paged attention with GQA + causal mask. Read more
Source§

fn paged_varlen_attention_fa2_ffi( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _lse: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _seq_lens: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _total_q_tokens: usize, _max_q_len: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Opt-in vLLM FlashAttention-2 FFI path for FA-layout paged KV. Read more
Source§

fn paged_batched_decode_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _valid_kv_lens: &Self::Buffer, _num_seqs: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Batched paged decode attention — multi-seq, single token per seq. Faster path for the unified_forward layer when m_total == num_seqs (every item is a single-token decode). Skips the cu_seqlens_q linear scan that paged_varlen_attention does in the fully-mixed case. Read more
Source§

fn supports_vllm_paged_attn() -> bool

Capability: backend has vLLM-layout paged KV write kernels and the paged_attention_v2 decode kernel. Models that opt into this layout at construction time (via FERRUM_USE_VLLM_PAGED_ATTN=1) must dispatch ALL paged writes and reads through the _vllm variants — the layouts are not compatible. Default false.
Source§

fn split_qkv_norm_rope_into_paged_cache_vllm( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _qkv_byte_offset: u64, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _q_out_byte_offset: u64, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _block_table: &Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

vLLM-layout variant of Self::split_qkv_norm_rope_into_paged_cache. K/V are written in vLLM’s paged_attention_v2 layout: K is [num_blocks, kv_heads, head_dim/x, block_size, x] (x = 16/sizeof(elem)), V is [num_blocks, kv_heads, head_dim, block_size]. Q output and every other argument matches the non-vllm variant exactly so the model layer can swap dispatchers based on a single flag.
Source§

fn split_qkv_norm_rope_into_paged_cache_varlen_vllm( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _m_total: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _eps: f32, _qk_mode: i32, _block_size: usize, _max_blocks_per_seq: usize, ) -> Result<()>

vLLM-layout variant of Self::split_qkv_norm_rope_into_paged_cache_varlen. Same signature — only the K/V cache layout changes.
Source§

fn paged_decode_attention_v2( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _max_seq_len: usize, ) -> Result<()>

vLLM paged_attention_v2 — multi-partition split-K decode attention reading the vLLM K/V layout. q_len is implicitly 1 (decode only; vLLM’s v2 kernel does not support q_len > 1). max_seq_len is the max kv_len across the batch — used to size the partition reduction.
Source§

fn paged_varlen_attention_vllm_layout( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _q_len: usize, ) -> Result<()>

q_len>1 prefill/chunk-prefill attention over vLLM-layout paged KV. This keeps cache layout consistent when FERRUM_USE_VLLM_PAGED_ATTN=1 and the prompt path writes K/V in the layout consumed later by paged_decode_attention_v2.
Source§

fn paged_varlen_attention_vllm( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _total_q_tokens: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Variable-length paged attention over vLLM-layout paged KV. Read more
Source§

fn paged_varlen_attention_vllm_tiled_q4( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _tile_seqs: &Self::Buffer, _tile_starts: &Self::Buffer, _num_tiles: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>

Q-tiled vLLM-layout varlen attention. tile_seqs and tile_starts describe a compact list of q-token tiles, avoiding empty grid blocks for mixed batches that contain both long prefill items and q_len=1 decode items. Semantics match Self::paged_varlen_attention_vllm.
Source§

impl BackendQuantGguf for CpuBackend

Source§

fn load_quant( kind: GgufQuantType, bytes: &[u8], n_rows: usize, n_cols: usize, ) -> Result<Box<dyn Linear<Self> + Send + Sync>>

Load GGUF k-quant weights into the backend’s preferred format. Read more
Source§

fn load_quant_fused( _parts: &[(GgufQuantType, &[u8], usize)], _n_cols: usize, ) -> Result<Box<dyn Linear<Self> + Send + Sync>>

Build a fused linear from multiple (kind, bytes, n_rows) parts that share n_cols. Used by GgufLoader::load_fused when parts have heterogeneous quant kinds (e.g. Qwen3 qkv_proj where q+k are Q4_K but v is Q6_K) — byte-concatenation isn’t possible, so each part stays as its own QuantStore and the gemm dispatches one matvec per part with output offsets. Read more
Source§

fn load_quant_experts( _kind: GgufQuantType, _bytes: &[u8], _num_experts: usize, _n_rows: usize, _n_cols: usize, ) -> Result<Box<dyn StackedExpertGgufLinear<Self>>>

Build a stacked-experts MoE linear from a contiguous 3-D weight payload [num_experts, n_rows, n_cols/256] super-blocks. Used for the MoE indirect-dispatch fast path; backends without such a kernel return Err(unsupported) and the model code falls back to the per-expert Box<dyn Linear<Self>> loop. Read more
Source§

impl BackendQuantMarlin for CpuBackend

Source§

fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bias_host: Option<&[f32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Box<dyn Linear<Self> + Send + Sync>>

Repack raw GPTQ tensors into a backend-specific Linear<Self> impl. Called once per layer at model load time. Read more
Source§

fn load_gptq_stacked( qweights: &[&[i32]], scales: &[&[f32]], qzeros: &[&[i32]], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n_per_expert: usize, ) -> Result<Arc<dyn MarlinExpertStack<Self>>>

Load num_experts GPTQ weight tiles into ONE stacked store, with the property that each expert’s packed bytes are contiguous in the resulting store. This is what the offset GEMM needs to dispatch per expert via pointer offset alone. Read more
Source§

fn pregrow_marlin_gather_scratch(_ctx: &mut Self::Context, _required: usize)

Pre-grow any backend-internal scratch slots whose size depends on m_total * intermediate_size (the largest matmul fan-in inside unified_forward_internal). Default no-op. CUDA implements this to grow the perm-aware Marlin gather scratch EAGERLY before the caller enters a CUDA-graph capture region — cuLaunchKernel after a runtime alloc inside a captured stream returns CUDA_ERROR_INVALID_VALUE.
Source§

impl BackendTimer<CpuBackend> for CpuTimer

Source§

fn new() -> Self

Allocate timer state. On CUDA this creates two cuEvent_t handles; on Metal it’s a no-op; on CPU it’s two Option<Instant>.
Source§

fn record_start(&mut self, _ctx: &mut <CpuBackend as Backend>::Context)

Record the “start” timestamp on the current ctx’s stream/command buffer. Returns immediately on CUDA (async); on Metal forces a sync to flush any pending work first.
Source§

fn record_end(&mut self, _ctx: &mut <CpuBackend as Backend>::Context)

Record the “end” timestamp.
Source§

fn elapsed_ms(&self) -> f64

Synchronize on the recorded events and return the elapsed time in milliseconds. Blocks the calling thread on CUDA; instant on CPU/Metal. Read more
Source§

impl Linear<CpuBackend> for CpuGptqLinear

Source§

fn in_features(&self) -> usize

Source§

fn out_features(&self) -> usize

Source§

fn forward( &self, ctx: &mut <CpuBackend as Backend>::Context, input: &<CpuBackend as Backend>::Buffer, out: &mut <CpuBackend as Backend>::Buffer, m: usize, )

Append GEMM work onto ctx. Caller flushes the context when results must be materialised.
Source§

impl Linear<CpuBackend> for CpuGgufLinear

Source§

fn in_features(&self) -> usize

Source§

fn out_features(&self) -> usize

Source§

fn forward( &self, ctx: &mut <CpuBackend as Backend>::Context, input: &<CpuBackend as Backend>::Buffer, out: &mut <CpuBackend as Backend>::Buffer, m: usize, )

Append GEMM work onto ctx. Caller flushes the context when results must be materialised.
Source§

impl MarlinExpertStack<CpuBackend> for CpuMarlinExpertStack

Source§

fn n_per_expert(&self) -> usize

Per-expert output width (N tile cols).
Source§

fn k(&self) -> usize

Input width (K), common across experts.
Source§

fn num_experts(&self) -> usize

Number of experts packed into the tile.
Source§

fn as_any(&self) -> &dyn Any

Downcast hook — used at FFN dispatch boundaries where the caller needs to reach into the concrete store to e.g. share workspace memory across phases. Standard dyn Any pattern.
Source§

fn zero_workspace( &self, _ctx: &mut <CpuBackend as Backend>::Context, ) -> Result<()>

Bulk-zero the per-expert Marlin workspace mutex slots. Call ONCE before a batch of bucketed gemm_phase_batched calls — saves the per-call cuMemsetD32Async (one launch each → one launch total). At c=32 with 128 active experts × 2 phases × 48 layers that’s ~12k memset launches/token reduced to ~96.
Source§

fn gemm_phase_batched( &self, ctx: &mut <CpuBackend as Backend>::Context, input: &<CpuBackend as Backend>::Buffer, dispatches: &[(usize, usize, usize, usize)], output: &mut <CpuBackend as Backend>::Buffer, k: usize, ) -> Result<()>

Batched per-expert offset GEMM. dispatches[i] = (expert_idx, in_row_offset, out_row_offset, m). Runs each expert’s (m × K) @ tile[expert] = m × n_per_expert slice; CUDA backend overlaps via multi-stream round-robin.
Source§

fn make_expert_linear( self: Arc<Self>, expert_offset: usize, expert_n: usize, bias_host: Option<&[f32]>, ) -> Result<Box<dyn Linear<CpuBackend> + Send + Sync>>

Build a single-expert Linear<B> view onto this stack’s [expert_offset .. expert_offset + expert_n) column slice. Used for per-expert dispatch outside the MoE phase batching (e.g. shared-experts code paths). expert_offset and expert_n MUST be multiples of the backend’s Marlin N tile (64 on CUDA).
Source§

fn gemm_phase_vllm( &self, _ctx: &mut B::Context, _input: &B::Buffer, _sorted_token_ids: &B::Buffer, _expert_ids: &B::Buffer, _num_tokens_past_padded: &B::Buffer, _output: &mut B::Buffer, _prob_m: usize, _moe_block_size: usize, _top_k: usize, ) -> Result<()>

vLLM marlin_moe_wna16 fused GEMM (single launch, per-block expert routing inside the kernel). Caller responsibilities: Read more

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> LlmBackend for T

Source§

impl<T> MoeLlmBackend for T

Source§

impl<T> QuantLlmBackend for T

Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more