pub struct CpuBackend;Trait Implementations§
Source§impl Backend for CpuBackend
impl Backend for CpuBackend
Source§fn alloc_typed(dtype: Dtype, n: usize) -> Self::Buffer
fn alloc_typed(dtype: Dtype, n: usize) -> Self::Buffer
Phase D step 2+3: typed alloc. CPU Buffer is Vecn
elements of dtype (bit-cast at read/write time).
Source§fn from_slice_typed<T: HostDtype>(data: &[T]) -> Self::Buffer
fn from_slice_typed<T: HostDtype>(data: &[T]) -> Self::Buffer
Phase D step 2+3: typed upload. Bit-cast host data into f32
words (CPU buffer is dtype-erased Vec
Source§fn write_typed<T: HostDtype>(
_ctx: &mut Self::Context,
dst: &mut Self::Buffer,
data: &[T],
)
fn write_typed<T: HostDtype>( _ctx: &mut Self::Context, dst: &mut Self::Buffer, data: &[T], )
Phase D step 2+3: typed in-place write. Bit-cast bytes into the dtype-erased f32 storage.
type Buffer = Vec<f32>
Source§type Timer = CpuTimer
type Timer = CpuTimer
GPU-side timer scoped to this backend. See
super::timer —
CPU: Instant; Metal: sync-wrap; CUDA: cuEvent.
PLAYBOOK § 1.1.Source§fn make_timer() -> Self::Timer
fn make_timer() -> Self::Timer
Factory for
Self::Timer — exists so call sites that have a
<B: Backend> parameter can spawn a timer without importing the
concrete impl. PLAYBOOK § 1.2.Source§fn new_context() -> Self::Context
fn new_context() -> Self::Context
Opaque per-backend GPTQ weight representation. Read more
Source§fn sync(_ctx: &mut Self::Context)
fn sync(_ctx: &mut Self::Context)
Flush accumulated work and wait for completion.
CPU: no-op. Metal: commit + waitUntilCompleted. CUDA: stream sync.
Source§fn activation_elem_size_bytes() -> usize
fn activation_elem_size_bytes() -> usize
Byte width of buffers returned by
Self::alloc. Read moreSource§fn fused_silu_mul_split_strided(
_ctx: &mut Self::Context,
gate_up: &Self::Buffer,
in_row_offset: usize,
out: &mut Self::Buffer,
out_row_offset: usize,
tokens: usize,
intermediate: usize,
)
fn fused_silu_mul_split_strided( _ctx: &mut Self::Context, gate_up: &Self::Buffer, in_row_offset: usize, out: &mut Self::Buffer, out_row_offset: usize, tokens: usize, intermediate: usize, )
Strided variant of
Backend::fused_silu_mul_split for the
bucketed MoE path: reads gate_up rows starting at
in_row_offset, writes out rows starting at out_row_offset.fn gemm( _ctx: &mut Self::Context, a: &Self::Buffer, b: &Self::Buffer, out: &mut Self::Buffer, m: usize, n: usize, k: usize, )
fn rms_norm( _ctx: &mut Self::Context, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
fn fused_add_rms_norm( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, w: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
fn flash_attention( _ctx: &mut Self::Context, q: &Self::Buffer, k: &Self::Buffer, v: &Self::Buffer, out: &mut Self::Buffer, batch: usize, q_len: usize, kv_len: usize, pos_offset: usize, cfg: &AttnConfig, )
Source§fn copy_slice(
_ctx: &mut Self::Context,
src: &Self::Buffer,
src_offset: usize,
dst: &mut Self::Buffer,
dst_offset: usize,
len: usize,
)
fn copy_slice( _ctx: &mut Self::Context, src: &Self::Buffer, src_offset: usize, dst: &mut Self::Buffer, dst_offset: usize, len: usize, )
fn embedding_lookup( _ctx: &mut Self::Context, table: &Self::Buffer, ids: &[u32], out: &mut Self::Buffer, dim: usize, )
Source§fn split_qkv(
_ctx: &mut Self::Context,
qkv: &Self::Buffer,
q: &mut Self::Buffer,
k: &mut Self::Buffer,
v: &mut Self::Buffer,
tokens: usize,
q_dim: usize,
kv_dim: usize,
)
fn split_qkv( _ctx: &mut Self::Context, qkv: &Self::Buffer, q: &mut Self::Buffer, k: &mut Self::Buffer, v: &mut Self::Buffer, tokens: usize, q_dim: usize, kv_dim: usize, )
Split fused QKV [tokens, q_dim+2*kv_dim] into separate Q, K, V buffers.
Q: [tokens, q_dim], K: [tokens, kv_dim], V: [tokens, kv_dim]
Source§fn fused_silu_mul_split(
_ctx: &mut Self::Context,
gate_up: &Self::Buffer,
out: &mut Self::Buffer,
tokens: usize,
im: usize,
)
fn fused_silu_mul_split( _ctx: &mut Self::Context, gate_up: &Self::Buffer, out: &mut Self::Buffer, tokens: usize, im: usize, )
Split fused gate_up [tokens, 2*im] into gate [tokens, im] and up [tokens, im],
then compute SiLU(gate) * up → out [tokens, im].
Source§fn qk_norm_rope(
_ctx: &mut Self::Context,
input: &Self::Buffer,
norm_w: &Self::Buffer,
cos: &Self::Buffer,
sin: &Self::Buffer,
output: &mut Self::Buffer,
tokens: usize,
heads: usize,
head_dim: usize,
pos_offset: usize,
eps: f32,
mode: i32,
)
fn qk_norm_rope( _ctx: &mut Self::Context, input: &Self::Buffer, norm_w: &Self::Buffer, cos: &Self::Buffer, sin: &Self::Buffer, output: &mut Self::Buffer, tokens: usize, heads: usize, head_dim: usize, pos_offset: usize, eps: f32, mode: i32, )
Fused QK-norm + RoPE + transpose-to-head-major. Read more
Source§fn kv_cache_append_head_major(
_ctx: &mut Self::Context,
cache_k: &mut Self::Buffer,
cache_v: &mut Self::Buffer,
cache_len: usize,
cache_capacity: usize,
new_k_head_major: &Self::Buffer,
new_v_head_major: &Self::Buffer,
new_tokens: usize,
nkv: usize,
hd: usize,
)
fn kv_cache_append_head_major( _ctx: &mut Self::Context, cache_k: &mut Self::Buffer, cache_v: &mut Self::Buffer, cache_len: usize, cache_capacity: usize, new_k_head_major: &Self::Buffer, new_v_head_major: &Self::Buffer, new_tokens: usize, nkv: usize, hd: usize, )
Append new K/V into a pre-allocated head-major cache buffer. Read more
Source§fn transpose_head_to_token(
_ctx: &mut Self::Context,
src: &Self::Buffer,
dst: &mut Self::Buffer,
tokens: usize,
heads: usize,
dim: usize,
)
fn transpose_head_to_token( _ctx: &mut Self::Context, src: &Self::Buffer, dst: &mut Self::Buffer, tokens: usize, heads: usize, dim: usize, )
Transpose [heads, tokens, dim] → [tokens, heads, dim].
Called after
flash_attention to restore token-major layout for O-proj.Source§fn add_inplace(
_ctx: &mut Self::Context,
residual: &mut Self::Buffer,
x: &Self::Buffer,
len: usize,
)
fn add_inplace( _ctx: &mut Self::Context, residual: &mut Self::Buffer, x: &Self::Buffer, len: usize, )
residual[i] += x[i] (in-place)
Source§fn scaled_add_inplace(
_ctx: &mut Self::Context,
dst: &mut Self::Buffer,
src: &Self::Buffer,
scale: f32,
len: usize,
)
fn scaled_add_inplace( _ctx: &mut Self::Context, dst: &mut Self::Buffer, src: &Self::Buffer, scale: f32, len: usize, )
dst[i] += scale * src[i] — scalar-broadcast scaled add, in place. Read moreSource§fn add_bias(
_ctx: &mut Self::Context,
data: &mut Self::Buffer,
bias: &Self::Buffer,
rows: usize,
cols: usize,
)
fn add_bias( _ctx: &mut Self::Context, data: &mut Self::Buffer, bias: &Self::Buffer, rows: usize, cols: usize, )
Broadcast bias add:
data[r, c] += bias[c] for every row.
Required by Bert / Clip / Whisper whose linear projections carry a bias.Source§fn layer_norm(
_ctx: &mut Self::Context,
x: &Self::Buffer,
gamma: &Self::Buffer,
beta: &Self::Buffer,
eps: f32,
out: &mut Self::Buffer,
tokens: usize,
dim: usize,
)
fn layer_norm( _ctx: &mut Self::Context, x: &Self::Buffer, gamma: &Self::Buffer, beta: &Self::Buffer, eps: f32, out: &mut Self::Buffer, tokens: usize, dim: usize, )
Full LayerNorm (mean + variance normalisation + affine), distinct from
the
rms_norm used by Llama-family decoders.
out[r, c] = ((x[r, c] - mean) / sqrt(var + eps)) * gamma[c] + beta[c]
Where mean and var are reduced over the last dim (cols).Source§fn gelu(
_ctx: &mut Self::Context,
x: &Self::Buffer,
out: &mut Self::Buffer,
len: usize,
)
fn gelu( _ctx: &mut Self::Context, x: &Self::Buffer, out: &mut Self::Buffer, len: usize, )
Element-wise GELU activation (erf-based, matches PyTorch default).
fn alloc(len: usize) -> Self::Buffer
fn to_vec(buf: &Self::Buffer, len: usize) -> Vec<f32>
fn from_slice(data: &[f32]) -> Self::Buffer
Source§fn with_device_ordinal<R>(
_device_ordinal: Option<usize>,
body: impl FnOnce() -> R,
) -> R
fn with_device_ordinal<R>( _device_ordinal: Option<usize>, body: impl FnOnce() -> R, ) -> R
Run
body while binding context-free backend operations to an
explicit device ordinal when the backend supports multi-device scopes. Read moreSource§fn supports_device_ordinal_scope() -> bool
fn supports_device_ordinal_scope() -> bool
Whether
Self::with_device_ordinal actually switches backend
execution to the requested ordinal.Source§fn sync_before_host_readback(_ctx: &mut Self::Context)
fn sync_before_host_readback(_ctx: &mut Self::Context)
Prepare pending GPU work for a following host readback. Read more
Source§fn supports_llama_family_batched_decode() -> bool
fn supports_llama_family_batched_decode() -> bool
Whether
LlamaFamilyModel::decode_batch_internal may use its optimized
batched decode path on this backend. Read moreSource§fn zero_buffer(
_ctx: &mut Self::Context,
_buf: &mut Self::Buffer,
_len: usize,
) -> Result<()>
fn zero_buffer( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, ) -> Result<()>
Zero the first
len elements of a Self::Buffer. CUDA path uses
cuMemsetD16Async; default returns unsupported.Source§fn mla_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_kv_compressed: &Self::Buffer,
_kv_rope: &Self::Buffer,
_out: &mut Self::Buffer,
_batch: usize,
_q_len: usize,
_kv_len: usize,
_pos_offset: usize,
_cfg: &AttnConfig,
_kv_lora_rank: usize,
_qk_rope_head_dim: usize,
) -> Result<()>
fn mla_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _kv_compressed: &Self::Buffer, _kv_rope: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _q_len: usize, _kv_len: usize, _pos_offset: usize, _cfg: &AttnConfig, _kv_lora_rank: usize, _qk_rope_head_dim: usize, ) -> Result<()>
Multi-Head Latent Attention — DeepSeek V2 / V3’s compressed-KV
attention variant. Extension point only; no backend implements it
yet. DeepSeek V3 landing in Phase D/E will fill this in. Read more
Source§fn embedding_lookup_dev(
ctx: &mut Self::Context,
table: &Self::Buffer,
ids: &Self::Buffer,
out: &mut Self::Buffer,
batch: usize,
dim: usize,
)
fn embedding_lookup_dev( ctx: &mut Self::Context, table: &Self::Buffer, ids: &Self::Buffer, out: &mut Self::Buffer, batch: usize, dim: usize, )
Device-buffer variant of
embedding_lookup for graph-capturable
MoE routing — the gather step before phase-1 GEMM in
moe_forward_bucketed. The host-slice embedding_lookup does
clone_htod(ids) internally, which records stale host pointers
under CUDA Graph capture replay. Read moreSource§fn kv_cache_append_batched_per_cache(
_ctx: &mut Self::Context,
_caches: &[&Self::Buffer],
_new_data: &Self::Buffer,
_cache_lens: &Self::Buffer,
_capacity: usize,
_m: usize,
_nkv: usize,
_hd: usize,
_slot: usize,
) -> Result<()>
fn kv_cache_append_batched_per_cache( _ctx: &mut Self::Context, _caches: &[&Self::Buffer], _new_data: &Self::Buffer, _cache_lens: &Self::Buffer, _capacity: usize, _m: usize, _nkv: usize, _hd: usize, _slot: usize, ) -> Result<()>
Batched kv_cache_append across M caches in one launch. Each item
writes its (head-major) K-or-V row into its own cache at offset
read from
cache_lens[i]. Replaces M sequential
kv_cache_append_head_major calls with a single dispatch. Read moreSource§fn flash_attention_batched_per_cache(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_caches: &[&Self::Buffer],
_v_caches: &[&Self::Buffer],
_kv_lens: &Self::Buffer,
_out: &mut Self::Buffer,
_nq: usize,
_nkv: usize,
_hd: usize,
_scale: f32,
_max_valid_kv: usize,
_capacity: usize,
_slot: usize,
) -> Result<()>
fn flash_attention_batched_per_cache( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_caches: &[&Self::Buffer], _v_caches: &[&Self::Buffer], _kv_lens: &Self::Buffer, _out: &mut Self::Buffer, _nq: usize, _nkv: usize, _hd: usize, _scale: f32, _max_valid_kv: usize, _capacity: usize, _slot: usize, ) -> Result<()>
Batched flash_attention across M decode caches in one launch.
Replaces the per-item
flash_attention(q_len=1, ...) × M
loop in the non-paged batched-decode path. Read moreSource§fn qk_norm_rope_batched_per_item(
_ctx: &mut Self::Context,
_input: &Self::Buffer,
_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_output: &mut Self::Buffer,
_positions: &Self::Buffer,
_m: usize,
_heads: usize,
_head_dim: usize,
_eps: f32,
_mode: i32,
) -> Result<()>
fn qk_norm_rope_batched_per_item( _ctx: &mut Self::Context, _input: &Self::Buffer, _norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _output: &mut Self::Buffer, _positions: &Self::Buffer, _m: usize, _heads: usize, _head_dim: usize, _eps: f32, _mode: i32, ) -> Result<()>
Batched per-item-position variant of
qk_norm_rope for the
non-paged batched-decode path. Each of the m items has its own
absolute RoPE position (read from a device i32 buffer of length
m). Layout is item-major in both input and output: Read moreSource§fn split_qkv_norm_rope(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_k_out: &mut Self::Buffer,
_v_out: &mut Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
) -> Result<()>
fn split_qkv_norm_rope( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _k_out: &mut Self::Buffer, _v_out: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, ) -> Result<()>
Fused split-QKV + QK-norm + RoPE + head-major transpose. Read more
Source§fn split_qkv_norm_rope_into_cache(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
_cache_len: usize,
_cache_capacity: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _cache_capacity: usize, ) -> Result<()>
Variant of
Backend::split_qkv_norm_rope that writes the new
K and V directly into pre-allocated head-major KV cache buffers
at slot [kv_heads, cache_len .. cache_len + tokens, hd].
Eliminates the trailing kv_cache_append_head_major dispatch on
the decode hot path. Q still lands in per-token head-major
scratch (flash-attention reads it as the query). Read moreSource§fn transpose_token_to_head(
_ctx: &mut Self::Context,
_src: &Self::Buffer,
_dst: &mut Self::Buffer,
_tokens: usize,
_heads: usize,
_dim: usize,
)
fn transpose_token_to_head( _ctx: &mut Self::Context, _src: &Self::Buffer, _dst: &mut Self::Buffer, _tokens: usize, _heads: usize, _dim: usize, )
Inverse of
transpose_head_to_token: [tokens, heads, dim] →
[heads, tokens, dim]. Used by the CUDA paged_decode_attention
wrapper to convert paged_varlen_attention’s token-major output
back to the head-major layout that Qwen3MoeModel expects.
Default panics — backends without a paged-KV CUDA path don’t
hit this code.fn write_f32_to_activation( ctx: &mut Self::Context, dst: &mut Self::Buffer, data: &[f32], )
Source§fn argmax_rows_f16(
_ctx: &mut Self::Context,
logits: &Self::Buffer,
m: usize,
n: usize,
) -> Result<Vec<u32>>
fn argmax_rows_f16( _ctx: &mut Self::Context, logits: &Self::Buffer, m: usize, n: usize, ) -> Result<Vec<u32>>
Greedy-decode fast path: GPU argmax over each row of a
[m, n] FP16 logits buffer, returning the m token indices on the
host. Saves m × n × 2 bytes of D2H per call (e.g. 19.5 MB at
c=32, vocab=152064) and the host-side argmax scan (~150 µs × m). Read moreSource§impl BackendCollective for CpuBackend
impl BackendCollective for CpuBackend
fn world_size(_ctx: &Self::Context) -> usize
fn rank(_ctx: &Self::Context) -> usize
fn all_reduce( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _op: ReduceOp, )
fn all_gather( _ctx: &mut Self::Context, _local: &Self::Buffer, _global: &mut Self::Buffer, _local_len: usize, )
fn broadcast( _ctx: &mut Self::Context, _buf: &mut Self::Buffer, _len: usize, _src_rank: usize, )
Source§impl BackendGraph for CpuBackend
impl BackendGraph for CpuBackend
Source§fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)
fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32)
Update per-step dynamic state (token id, step/pos). Fast (3x memcpy).
Source§fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)
fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool)
Toggle between scalar-arg kernels (normal) and
_dyn kernels that
read their dynamic scalar args from device memory (graph-friendly).Source§fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>
fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()>
Begin stream capture. Subsequent kernel launches are recorded into
a pending graph instead of executing eagerly.
Source§fn end_graph_capture(_ctx: &mut Self::Context, _key: u64) -> Result<()>
fn end_graph_capture(_ctx: &mut Self::Context, _key: u64) -> Result<()>
End stream capture and install the captured graph keyed by
_key (opaque caller-chosen u64; the model uses m_padded so
that different batch shapes don’t thrash a single slot).Source§fn replay_graph(_ctx: &mut Self::Context, _key: u64) -> Result<bool>
fn replay_graph(_ctx: &mut Self::Context, _key: u64) -> Result<bool>
Replay the captured graph for
_key. Returns Ok(false) if no
graph is cached for that key; caller should run eager.Source§fn reset_graph(_ctx: &mut Self::Context, _key: u64)
fn reset_graph(_ctx: &mut Self::Context, _key: u64)
Drop the cached graph for
_key — required when its kernel-arg
pointers (KV cache, scratch) might no longer be valid. Use
reset_all_graphs when EVERY cached graph should be evicted
(hard model reload / scratch realloc).Source§fn reset_all_graphs(_ctx: &mut Self::Context)
fn reset_all_graphs(_ctx: &mut Self::Context)
Drop ALL cached graphs — used by hard reset paths.
Source§impl BackendKvDtype<KvFp16> for CpuBackend
impl BackendKvDtype<KvFp16> for CpuBackend
Source§impl BackendMoeFused for CpuBackend
impl BackendMoeFused for CpuBackend
Source§fn upload_moe_routing(
_ctx: &mut Self::Context,
_sorted_token_ids: &[i32],
_expert_ids: &[i32],
_num_tokens_past_padded: &[i32],
) -> Result<MoeRouting<Self>>
fn upload_moe_routing( _ctx: &mut Self::Context, _sorted_token_ids: &[i32], _expert_ids: &[i32], _num_tokens_past_padded: &[i32], ) -> Result<MoeRouting<Self>>
Routing inputs for
moe_gemm_phase_vllm — host-built i32 arrays
uploaded once per layer (or per token, depending on caller cadence).
Matches the shape contract of moe_align_block_size outputs but is
usable on backends that build the indices on host. Read moreSource§fn route_topk_softmax(
_ctx: &mut Self::Context,
_logits: &Self::Buffer,
_out_ids: &mut Self::Buffer,
_out_weights: &mut Self::Buffer,
_batch: usize,
_num_experts: usize,
_top_k: usize,
_norm_topk_prob: bool,
) -> Result<()>
fn route_topk_softmax( _ctx: &mut Self::Context, _logits: &Self::Buffer, _out_ids: &mut Self::Buffer, _out_weights: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _norm_topk_prob: bool, ) -> Result<()>
GPU-side MoE router:
[batch, num_experts] logits → [batch, top_k]
expert IDs (i32) + [batch, top_k] combine weights (f32). Read moreSource§fn try_gpu_route_topk_into_host(
_ctx: &mut Self::Context,
_logits_dev: &Self::Buffer,
_out_ids_host: &mut Vec<u32>,
_out_weights_host: &mut Vec<f32>,
_batch: usize,
_num_experts: usize,
_top_k: usize,
_norm_topk_prob: bool,
) -> Result<()>
fn try_gpu_route_topk_into_host( _ctx: &mut Self::Context, _logits_dev: &Self::Buffer, _out_ids_host: &mut Vec<u32>, _out_weights_host: &mut Vec<f32>, _batch: usize, _num_experts: usize, _top_k: usize, _norm_topk_prob: bool, ) -> Result<()>
GPU-side fast-path for the host route() leg of the bucketed
MoE forward (
moe_forward_bucketed in ferrum-models). Replaces
the B::sync(ctx) + B::to_vec(logits) + crate::moe::router:: route_into(...) triple with a single GPU kernel + small D2H of
[batch, top_k] ids + weights. Read moreSource§fn moe_build_pairs_by_token(
_ctx: &mut Self::Context,
_expert_ids: &Self::Buffer,
_pairs_by_token: &mut Self::Buffer,
_packed_token_idx: &mut Self::Buffer,
_expert_offsets: &mut Self::Buffer,
_batch_x_topk: usize,
_num_experts: usize,
_top_k: usize,
) -> Result<()>
fn moe_build_pairs_by_token( _ctx: &mut Self::Context, _expert_ids: &Self::Buffer, _pairs_by_token: &mut Self::Buffer, _packed_token_idx: &mut Self::Buffer, _expert_offsets: &mut Self::Buffer, _batch_x_topk: usize, _num_experts: usize, _top_k: usize, ) -> Result<()>
GPU-side moe_align_block_size — prep for a future fused MoE
Marlin kernel. Takes per-pair expert assignments (from
Self::route_topk_softmax) and produces: Read moreSource§fn moe_align_block_size(
_ctx: &mut Self::Context,
_expert_ids_per_pair: &Self::Buffer,
_sorted_token_ids: &mut Self::Buffer,
_block_ids: &mut Self::Buffer,
_total_tokens_post_pad: &mut Self::Buffer,
_batch_x_topk: usize,
_num_experts: usize,
_block_size: usize,
_sorted_max_size: usize,
) -> Result<()>
fn moe_align_block_size( _ctx: &mut Self::Context, _expert_ids_per_pair: &Self::Buffer, _sorted_token_ids: &mut Self::Buffer, _block_ids: &mut Self::Buffer, _total_tokens_post_pad: &mut Self::Buffer, _batch_x_topk: usize, _num_experts: usize, _block_size: usize, _sorted_max_size: usize, ) -> Result<()>
b[block_ids[blockIdx.y] * n_per_expert + ...]. Read moreSource§fn moe_align_block_size_pair_ids(
_ctx: &mut Self::Context,
_expert_ids_per_pair: &Self::Buffer,
_sorted_token_ids: &mut Self::Buffer,
_block_ids: &mut Self::Buffer,
_total_tokens_post_pad: &mut Self::Buffer,
_batch_x_topk: usize,
_num_experts: usize,
_block_size: usize,
_sorted_max_size: usize,
) -> Result<()>
fn moe_align_block_size_pair_ids( _ctx: &mut Self::Context, _expert_ids_per_pair: &Self::Buffer, _sorted_token_ids: &mut Self::Buffer, _block_ids: &mut Self::Buffer, _total_tokens_post_pad: &mut Self::Buffer, _batch_x_topk: usize, _num_experts: usize, _block_size: usize, _sorted_max_size: usize, ) -> Result<()>
vLLM-native align variant:
sorted_token_ids stores flattened
(token, top_k_slot) pair ids, not Ferrum’s pre-gathered packed rows.
This lets marlin_moe read gate_up input as A[pair_id / top_k].Source§fn compute_ids_tpe_gpu(
_ctx: &mut Self::Context,
_selected_ids: &Self::Buffer,
_tpe: &mut Self::Buffer,
_ids: &mut Self::Buffer,
_gate_up_args: &mut Self::Buffer,
_down_args: &mut Self::Buffer,
_batch: usize,
_num_experts: usize,
_top_k: usize,
_m_gate_up: usize,
_m_down: usize,
) -> Result<()>
fn compute_ids_tpe_gpu( _ctx: &mut Self::Context, _selected_ids: &Self::Buffer, _tpe: &mut Self::Buffer, _ids: &mut Self::Buffer, _gate_up_args: &mut Self::Buffer, _down_args: &mut Self::Buffer, _batch: usize, _num_experts: usize, _top_k: usize, _m_gate_up: usize, _m_down: usize, ) -> Result<()>
GPU-side bucket sort: turn
[batch, top_k] selected expert IDs
(from Self::route_topk_softmax) into tpe[num_experts] /
ids[num_experts * row_stride] arrays consumed by the batched
MoE GEMM, and emit indirect-dispatch args for the consumer GEMM. Read moreSource§fn silu_mul_batched(
_ctx: &mut Self::Context,
_gate: &Self::Buffer,
_up: &Self::Buffer,
_out: &mut Self::Buffer,
_total_pairs: usize,
_ffn: usize,
) -> Result<()>
fn silu_mul_batched( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _total_pairs: usize, _ffn: usize, ) -> Result<()>
Stacked SiLU·gate over
[batch * top_k, ffn] rows (prefill version
of silu_mul_stacked).Source§fn weighted_sum_residual_stacked(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_residual: &mut Self::Buffer,
_n_slots: usize,
_hidden: usize,
) -> Result<()>
fn weighted_sum_residual_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _n_slots: usize, _hidden: usize, ) -> Result<()>
Fused weighted-sum + residual-add:
residual[i] += Σ_k weights[k] · slots[k, i].
Single dispatch replaces the (weighted_sum → moe_out) +
(add_inplace residual += moe_out) pair on the decode hot path.Source§fn weighted_sum_residual_norm_stacked(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_residual: &mut Self::Buffer,
_next_norm_w: &Self::Buffer,
_normed_out: &mut Self::Buffer,
_n_slots: usize,
_hidden: usize,
_eps: f32,
) -> Result<()>
fn weighted_sum_residual_norm_stacked( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _residual: &mut Self::Buffer, _next_norm_w: &Self::Buffer, _normed_out: &mut Self::Buffer, _n_slots: usize, _hidden: usize, _eps: f32, ) -> Result<()>
Fused weighted-sum-residual + RMSNorm: combines this layer’s
weighted_sum_residual_stacked with the next layer’s leading
rms_norm into a single dispatch. Read moreSource§fn weighted_sum_batched(
_ctx: &mut Self::Context,
_slots: &Self::Buffer,
_weights: &Self::Buffer,
_out: &mut Self::Buffer,
_batch: usize,
_top_k: usize,
_hidden: usize,
) -> Result<()>
fn weighted_sum_batched( _ctx: &mut Self::Context, _slots: &Self::Buffer, _weights: &Self::Buffer, _out: &mut Self::Buffer, _batch: usize, _top_k: usize, _hidden: usize, ) -> Result<()>
Per-batch weighted sum:
out[b, h] = Σ_k weights[b, k] · slots[b, k, h].
Single dispatch covers the whole batch (prefill version of
weighted_sum_stacked which only handled one token).Source§fn weighted_sum_batched_offset(
ctx: &mut Self::Context,
slots: &Self::Buffer,
weights: &Self::Buffer,
weights_offset: usize,
out: &mut Self::Buffer,
out_offset: usize,
batch: usize,
top_k: usize,
hidden: usize,
) -> Result<()>
fn weighted_sum_batched_offset( ctx: &mut Self::Context, slots: &Self::Buffer, weights: &Self::Buffer, weights_offset: usize, out: &mut Self::Buffer, out_offset: usize, batch: usize, top_k: usize, hidden: usize, ) -> Result<()>
Offset-aware variant of
Self::weighted_sum_batched —
weights reads from weights_offset (in elements, points at
the start of [batch, top_k]), out writes from out_offset
(in elements, points at start of [batch, hidden]). Used by
the per-item batched-decode path to skip copy_slice round-trips.
Default falls back to the non-offset variant via two copies.Source§fn silu_mul_stacked(
_ctx: &mut Self::Context,
_gate: &Self::Buffer,
_up: &Self::Buffer,
_out: &mut Self::Buffer,
_n_slots: usize,
_ffn: usize,
) -> Result<()>
fn silu_mul_stacked( _ctx: &mut Self::Context, _gate: &Self::Buffer, _up: &Self::Buffer, _out: &mut Self::Buffer, _n_slots: usize, _ffn: usize, ) -> Result<()>
Stacked SiLU·gate over
[n_slots, ffn] rows. Read moreSource§fn supports_fused_moe_gate_up_silu() -> bool
fn supports_fused_moe_gate_up_silu() -> bool
Capability probe for [
Self::gemv_quant_moe_id_gate_up_silu]. Read moreSource§fn supports_batched_moe_gemv() -> bool
fn supports_batched_moe_gemv() -> bool
Capability probe for [
Self::gemv_quant_moe_id_batched].Source§fn supports_batched_moe_gate_up_silu() -> bool
fn supports_batched_moe_gate_up_silu() -> bool
Capability probe for [
Self::gemv_quant_moe_id_gate_up_silu_batched].Source§impl BackendPagedKv for CpuBackend
impl BackendPagedKv for CpuBackend
Source§fn supports_paged_kv() -> bool
fn supports_paged_kv() -> bool
Whether this backend has a paged-KV decode path
(
paged_decode_attention etc.). Currently true for Metal, false
for CPU. Used to decide the default of FERRUM_METAL_PAGED_KV —
the serve path should opt in automatically when supported so
users get the bench-quality concurrent-decode numbers without
having to learn the flag.Source§fn populate_batched_pointers(
_ctx: &mut Self::Context,
_k_caches: &[&Self::Buffer],
_v_caches: &[&Self::Buffer],
_num_layers: usize,
_m: usize,
) -> Result<()>
fn populate_batched_pointers( _ctx: &mut Self::Context, _k_caches: &[&Self::Buffer], _v_caches: &[&Self::Buffer], _num_layers: usize, _m: usize, ) -> Result<()>
Pre-populate the per-slot device-pointer scratch arrays used by
the batched kernels (
kv_cache_append_batched_per_cache and
flash_attention_batched_per_cache). Required by the CUDA-graph
capture path: the captured graph contains only kernel launches
(no captured memcpy_htod), so the device scratch must be fresh
when the graph replays. Read moreSource§fn split_qkv_norm_rope_into_paged_cache(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_qkv_byte_offset: u64,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_q_out_byte_offset: u64,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_block_table: &Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
_cache_len: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_paged_cache( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _qkv_byte_offset: u64, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _q_out_byte_offset: u64, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _block_table: &Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Paged-KV variant of [
Self::split_qkv_norm_rope_into_cache]. Read moreSource§fn paged_decode_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_block_tables: &Self::Buffer,
_context_lens: &Self::Buffer,
_num_seqs: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
_q_len: usize,
) -> Result<()>
fn paged_decode_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _q_len: usize, ) -> Result<()>
Paged-KV variant of [
Self::flash_attention]. Read moreSource§fn supports_varlen_qkv() -> bool
fn supports_varlen_qkv() -> bool
Capability: does this backend implement
split_qkv_norm_rope_into_paged_cache_varlen and
paged_varlen_attention? Required by the unified mixed-batch
forward path used by LlamaFamilyModel::unified_forward. Default
false; backends that ship the varlen kernels override.Source§fn split_qkv_norm_rope_into_paged_cache_varlen(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_cu_seqlens_q: &Self::Buffer,
_pos_offsets: &Self::Buffer,
_block_tables: &Self::Buffer,
_num_seqs: usize,
_m_total: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_eps: f32,
_qk_mode: i32,
_block_size: usize,
_max_blocks_per_seq: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_paged_cache_varlen( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _m_total: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _eps: f32, _qk_mode: i32, _block_size: usize, _max_blocks_per_seq: usize, ) -> Result<()>
Varlen variant of
Self::split_qkv_norm_rope_into_paged_cache. Read moreSource§fn paged_varlen_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_cu_seqlens_q: &Self::Buffer,
_pos_offsets: &Self::Buffer,
_block_tables: &Self::Buffer,
_num_seqs: usize,
_total_q_tokens: usize,
_max_kv_len: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn paged_varlen_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _total_q_tokens: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Variable-length paged attention with GQA + causal mask. Read more
Source§fn paged_varlen_attention_fa2_ffi(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_lse: &mut Self::Buffer,
_cu_seqlens_q: &Self::Buffer,
_seq_lens: &Self::Buffer,
_block_tables: &Self::Buffer,
_num_seqs: usize,
_total_q_tokens: usize,
_max_q_len: usize,
_max_kv_len: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn paged_varlen_attention_fa2_ffi( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _lse: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _seq_lens: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _total_q_tokens: usize, _max_q_len: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Opt-in vLLM FlashAttention-2 FFI path for FA-layout paged KV. Read more
Source§fn paged_batched_decode_attention(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_block_tables: &Self::Buffer,
_valid_kv_lens: &Self::Buffer,
_num_seqs: usize,
_max_kv_len: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn paged_batched_decode_attention( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _valid_kv_lens: &Self::Buffer, _num_seqs: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Batched paged decode attention — multi-seq, single token per seq.
Faster path for the unified_forward layer when m_total == num_seqs
(every item is a single-token decode). Skips the cu_seqlens_q
linear scan that
paged_varlen_attention does in the fully-mixed
case. Read moreSource§fn supports_vllm_paged_attn() -> bool
fn supports_vllm_paged_attn() -> bool
Capability: backend has vLLM-layout paged KV write kernels and the
paged_attention_v2 decode kernel. Models that opt into this layout
at construction time (via FERRUM_USE_VLLM_PAGED_ATTN=1) must
dispatch ALL paged writes and reads through the _vllm variants —
the layouts are not compatible. Default false.Source§fn split_qkv_norm_rope_into_paged_cache_vllm(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_qkv_byte_offset: u64,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_q_out_byte_offset: u64,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_block_table: &Self::Buffer,
_tokens: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
_cache_len: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_paged_cache_vllm( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _qkv_byte_offset: u64, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _q_out_byte_offset: u64, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _block_table: &Self::Buffer, _tokens: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, _cache_len: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
vLLM-layout variant of
Self::split_qkv_norm_rope_into_paged_cache. K/V are written in
vLLM’s paged_attention_v2 layout: K is
[num_blocks, kv_heads, head_dim/x, block_size, x] (x = 16/sizeof(elem)),
V is [num_blocks, kv_heads, head_dim, block_size]. Q output and
every other argument matches the non-vllm variant exactly so the
model layer can swap dispatchers based on a single flag.Source§fn split_qkv_norm_rope_into_paged_cache_varlen_vllm(
_ctx: &mut Self::Context,
_qkv: &Self::Buffer,
_q_norm_w: &Self::Buffer,
_k_norm_w: &Self::Buffer,
_cos: &Self::Buffer,
_sin: &Self::Buffer,
_q_out: &mut Self::Buffer,
_cache_k: &mut Self::Buffer,
_cache_v: &mut Self::Buffer,
_cu_seqlens_q: &Self::Buffer,
_pos_offsets: &Self::Buffer,
_block_tables: &Self::Buffer,
_num_seqs: usize,
_m_total: usize,
_q_heads: usize,
_kv_heads: usize,
_head_dim: usize,
_eps: f32,
_qk_mode: i32,
_block_size: usize,
_max_blocks_per_seq: usize,
) -> Result<()>
fn split_qkv_norm_rope_into_paged_cache_varlen_vllm( _ctx: &mut Self::Context, _qkv: &Self::Buffer, _q_norm_w: &Self::Buffer, _k_norm_w: &Self::Buffer, _cos: &Self::Buffer, _sin: &Self::Buffer, _q_out: &mut Self::Buffer, _cache_k: &mut Self::Buffer, _cache_v: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _m_total: usize, _q_heads: usize, _kv_heads: usize, _head_dim: usize, _eps: f32, _qk_mode: i32, _block_size: usize, _max_blocks_per_seq: usize, ) -> Result<()>
vLLM-layout variant of
Self::split_qkv_norm_rope_into_paged_cache_varlen. Same signature
— only the K/V cache layout changes.Source§fn paged_decode_attention_v2(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_block_tables: &Self::Buffer,
_context_lens: &Self::Buffer,
_num_seqs: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
_max_seq_len: usize,
) -> Result<()>
fn paged_decode_attention_v2( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _max_seq_len: usize, ) -> Result<()>
vLLM
paged_attention_v2 — multi-partition split-K decode attention
reading the vLLM K/V layout. q_len is implicitly 1 (decode only;
vLLM’s v2 kernel does not support q_len > 1). max_seq_len is the
max kv_len across the batch — used to size the partition reduction.Source§fn paged_varlen_attention_vllm_layout(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_block_tables: &Self::Buffer,
_context_lens: &Self::Buffer,
_num_seqs: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
_q_len: usize,
) -> Result<()>
fn paged_varlen_attention_vllm_layout( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _block_tables: &Self::Buffer, _context_lens: &Self::Buffer, _num_seqs: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, _q_len: usize, ) -> Result<()>
q_len>1 prefill/chunk-prefill attention over vLLM-layout paged KV.
This keeps cache layout consistent when
FERRUM_USE_VLLM_PAGED_ATTN=1
and the prompt path writes K/V in the layout consumed later by
paged_decode_attention_v2.Source§fn paged_varlen_attention_vllm(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_cu_seqlens_q: &Self::Buffer,
_pos_offsets: &Self::Buffer,
_block_tables: &Self::Buffer,
_num_seqs: usize,
_total_q_tokens: usize,
_max_kv_len: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn paged_varlen_attention_vllm( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _num_seqs: usize, _total_q_tokens: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Variable-length paged attention over vLLM-layout paged KV. Read more
Source§fn paged_varlen_attention_vllm_tiled_q4(
_ctx: &mut Self::Context,
_q: &Self::Buffer,
_k_pool: &Self::Buffer,
_v_pool: &Self::Buffer,
_out: &mut Self::Buffer,
_cu_seqlens_q: &Self::Buffer,
_pos_offsets: &Self::Buffer,
_block_tables: &Self::Buffer,
_tile_seqs: &Self::Buffer,
_tile_starts: &Self::Buffer,
_num_tiles: usize,
_max_kv_len: usize,
_num_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_block_size: usize,
_max_num_blocks_per_seq: usize,
) -> Result<()>
fn paged_varlen_attention_vllm_tiled_q4( _ctx: &mut Self::Context, _q: &Self::Buffer, _k_pool: &Self::Buffer, _v_pool: &Self::Buffer, _out: &mut Self::Buffer, _cu_seqlens_q: &Self::Buffer, _pos_offsets: &Self::Buffer, _block_tables: &Self::Buffer, _tile_seqs: &Self::Buffer, _tile_starts: &Self::Buffer, _num_tiles: usize, _max_kv_len: usize, _num_heads: usize, _num_kv_heads: usize, _head_dim: usize, _block_size: usize, _max_num_blocks_per_seq: usize, ) -> Result<()>
Q-tiled vLLM-layout varlen attention.
tile_seqs and tile_starts
describe a compact list of q-token tiles, avoiding empty grid blocks
for mixed batches that contain both long prefill items and q_len=1
decode items. Semantics match Self::paged_varlen_attention_vllm.Source§impl BackendQuantGguf for CpuBackend
impl BackendQuantGguf for CpuBackend
Source§fn load_quant(
kind: GgufQuantType,
bytes: &[u8],
n_rows: usize,
n_cols: usize,
) -> Result<Box<dyn Linear<Self> + Send + Sync>>
fn load_quant( kind: GgufQuantType, bytes: &[u8], n_rows: usize, n_cols: usize, ) -> Result<Box<dyn Linear<Self> + Send + Sync>>
Load GGUF k-quant weights into the backend’s preferred format. Read more
Source§fn load_quant_fused(
_parts: &[(GgufQuantType, &[u8], usize)],
_n_cols: usize,
) -> Result<Box<dyn Linear<Self> + Send + Sync>>
fn load_quant_fused( _parts: &[(GgufQuantType, &[u8], usize)], _n_cols: usize, ) -> Result<Box<dyn Linear<Self> + Send + Sync>>
Build a fused linear from multiple
(kind, bytes, n_rows)
parts that share n_cols. Used by GgufLoader::load_fused when
parts have heterogeneous quant kinds (e.g. Qwen3 qkv_proj where
q+k are Q4_K but v is Q6_K) — byte-concatenation isn’t possible,
so each part stays as its own QuantStore and the gemm dispatches
one matvec per part with output offsets. Read moreSource§fn load_quant_experts(
_kind: GgufQuantType,
_bytes: &[u8],
_num_experts: usize,
_n_rows: usize,
_n_cols: usize,
) -> Result<Box<dyn StackedExpertGgufLinear<Self>>>
fn load_quant_experts( _kind: GgufQuantType, _bytes: &[u8], _num_experts: usize, _n_rows: usize, _n_cols: usize, ) -> Result<Box<dyn StackedExpertGgufLinear<Self>>>
Build a stacked-experts MoE linear from a contiguous 3-D weight
payload
[num_experts, n_rows, n_cols/256] super-blocks. Used for
the MoE indirect-dispatch fast path; backends without such a kernel
return Err(unsupported) and the model code falls back to the
per-expert Box<dyn Linear<Self>> loop. Read moreSource§impl BackendQuantMarlin for CpuBackend
impl BackendQuantMarlin for CpuBackend
Source§fn load_gptq(
qweight: &[i32],
scales: &[f32],
qzeros: &[i32],
_g_idx: Option<&[i32]>,
bias_host: Option<&[f32]>,
bits: u32,
group_size: usize,
k: usize,
n: usize,
) -> Result<Box<dyn Linear<Self> + Send + Sync>>
fn load_gptq( qweight: &[i32], scales: &[f32], qzeros: &[i32], _g_idx: Option<&[i32]>, bias_host: Option<&[f32]>, bits: u32, group_size: usize, k: usize, n: usize, ) -> Result<Box<dyn Linear<Self> + Send + Sync>>
Repack raw GPTQ tensors into a backend-specific
Linear<Self> impl.
Called once per layer at model load time. Read moreSource§fn load_gptq_stacked(
qweights: &[&[i32]],
scales: &[&[f32]],
qzeros: &[&[i32]],
_g_idx: Option<&[i32]>,
bits: u32,
group_size: usize,
k: usize,
n_per_expert: usize,
) -> Result<Arc<dyn MarlinExpertStack<Self>>>
fn load_gptq_stacked( qweights: &[&[i32]], scales: &[&[f32]], qzeros: &[&[i32]], _g_idx: Option<&[i32]>, bits: u32, group_size: usize, k: usize, n_per_expert: usize, ) -> Result<Arc<dyn MarlinExpertStack<Self>>>
Load num_experts GPTQ weight tiles into ONE stacked store, with
the property that each expert’s packed bytes are contiguous
in the resulting store. This is what the offset GEMM needs to
dispatch per expert via pointer offset alone. Read more
Source§fn pregrow_marlin_gather_scratch(_ctx: &mut Self::Context, _required: usize)
fn pregrow_marlin_gather_scratch(_ctx: &mut Self::Context, _required: usize)
Pre-grow any backend-internal scratch slots whose size depends
on
m_total * intermediate_size (the largest matmul fan-in
inside unified_forward_internal). Default no-op. CUDA
implements this to grow the perm-aware Marlin gather scratch
EAGERLY before the caller enters a CUDA-graph capture region —
cuLaunchKernel after a runtime alloc inside a captured
stream returns CUDA_ERROR_INVALID_VALUE.Source§impl BackendTimer<CpuBackend> for CpuTimer
impl BackendTimer<CpuBackend> for CpuTimer
Source§fn new() -> Self
fn new() -> Self
Allocate timer state. On CUDA this creates two
cuEvent_t
handles; on Metal it’s a no-op; on CPU it’s two Option<Instant>.Source§fn record_start(&mut self, _ctx: &mut <CpuBackend as Backend>::Context)
fn record_start(&mut self, _ctx: &mut <CpuBackend as Backend>::Context)
Record the “start” timestamp on the current ctx’s stream/command
buffer. Returns immediately on CUDA (async); on Metal forces a
sync to flush any pending work first.
Source§fn record_end(&mut self, _ctx: &mut <CpuBackend as Backend>::Context)
fn record_end(&mut self, _ctx: &mut <CpuBackend as Backend>::Context)
Record the “end” timestamp.
Source§fn elapsed_ms(&self) -> f64
fn elapsed_ms(&self) -> f64
Synchronize on the recorded events and return the elapsed time
in milliseconds. Blocks the calling thread on CUDA; instant on CPU/Metal. Read more
Source§impl Linear<CpuBackend> for CpuGptqLinear
impl Linear<CpuBackend> for CpuGptqLinear
fn in_features(&self) -> usize
fn out_features(&self) -> usize
Source§fn forward(
&self,
ctx: &mut <CpuBackend as Backend>::Context,
input: &<CpuBackend as Backend>::Buffer,
out: &mut <CpuBackend as Backend>::Buffer,
m: usize,
)
fn forward( &self, ctx: &mut <CpuBackend as Backend>::Context, input: &<CpuBackend as Backend>::Buffer, out: &mut <CpuBackend as Backend>::Buffer, m: usize, )
Append GEMM work onto
ctx. Caller flushes the context when results
must be materialised.Source§impl Linear<CpuBackend> for CpuGgufLinear
impl Linear<CpuBackend> for CpuGgufLinear
fn in_features(&self) -> usize
fn out_features(&self) -> usize
Source§fn forward(
&self,
ctx: &mut <CpuBackend as Backend>::Context,
input: &<CpuBackend as Backend>::Buffer,
out: &mut <CpuBackend as Backend>::Buffer,
m: usize,
)
fn forward( &self, ctx: &mut <CpuBackend as Backend>::Context, input: &<CpuBackend as Backend>::Buffer, out: &mut <CpuBackend as Backend>::Buffer, m: usize, )
Append GEMM work onto
ctx. Caller flushes the context when results
must be materialised.Source§impl MarlinExpertStack<CpuBackend> for CpuMarlinExpertStack
impl MarlinExpertStack<CpuBackend> for CpuMarlinExpertStack
Source§fn n_per_expert(&self) -> usize
fn n_per_expert(&self) -> usize
Per-expert output width (N tile cols).
Source§fn num_experts(&self) -> usize
fn num_experts(&self) -> usize
Number of experts packed into the tile.
Source§fn as_any(&self) -> &dyn Any
fn as_any(&self) -> &dyn Any
Downcast hook — used at FFN dispatch boundaries where the
caller needs to reach into the concrete store to e.g. share
workspace memory across phases. Standard
dyn Any pattern.Source§fn zero_workspace(
&self,
_ctx: &mut <CpuBackend as Backend>::Context,
) -> Result<()>
fn zero_workspace( &self, _ctx: &mut <CpuBackend as Backend>::Context, ) -> Result<()>
Bulk-zero the per-expert Marlin workspace mutex slots. Call ONCE
before a batch of bucketed
gemm_phase_batched calls — saves
the per-call cuMemsetD32Async (one launch each → one launch
total). At c=32 with 128 active experts × 2 phases × 48 layers
that’s ~12k memset launches/token reduced to ~96.Source§fn gemm_phase_batched(
&self,
ctx: &mut <CpuBackend as Backend>::Context,
input: &<CpuBackend as Backend>::Buffer,
dispatches: &[(usize, usize, usize, usize)],
output: &mut <CpuBackend as Backend>::Buffer,
k: usize,
) -> Result<()>
fn gemm_phase_batched( &self, ctx: &mut <CpuBackend as Backend>::Context, input: &<CpuBackend as Backend>::Buffer, dispatches: &[(usize, usize, usize, usize)], output: &mut <CpuBackend as Backend>::Buffer, k: usize, ) -> Result<()>
Batched per-expert offset GEMM.
dispatches[i] = (expert_idx, in_row_offset, out_row_offset, m). Runs each
expert’s (m × K) @ tile[expert] = m × n_per_expert slice;
CUDA backend overlaps via multi-stream round-robin.Source§fn make_expert_linear(
self: Arc<Self>,
expert_offset: usize,
expert_n: usize,
bias_host: Option<&[f32]>,
) -> Result<Box<dyn Linear<CpuBackend> + Send + Sync>>
fn make_expert_linear( self: Arc<Self>, expert_offset: usize, expert_n: usize, bias_host: Option<&[f32]>, ) -> Result<Box<dyn Linear<CpuBackend> + Send + Sync>>
Build a single-expert
Linear<B> view onto this stack’s
[expert_offset .. expert_offset + expert_n) column slice.
Used for per-expert dispatch outside the MoE phase batching
(e.g. shared-experts code paths). expert_offset and expert_n
MUST be multiples of the backend’s Marlin N tile (64 on CUDA).Source§fn gemm_phase_vllm(
&self,
_ctx: &mut B::Context,
_input: &B::Buffer,
_sorted_token_ids: &B::Buffer,
_expert_ids: &B::Buffer,
_num_tokens_past_padded: &B::Buffer,
_output: &mut B::Buffer,
_prob_m: usize,
_moe_block_size: usize,
_top_k: usize,
) -> Result<()>
fn gemm_phase_vllm( &self, _ctx: &mut B::Context, _input: &B::Buffer, _sorted_token_ids: &B::Buffer, _expert_ids: &B::Buffer, _num_tokens_past_padded: &B::Buffer, _output: &mut B::Buffer, _prob_m: usize, _moe_block_size: usize, _top_k: usize, ) -> Result<()>
vLLM
marlin_moe_wna16 fused GEMM (single launch, per-block
expert routing inside the kernel). Caller responsibilities: Read moreAuto Trait Implementations§
impl Freeze for CpuBackend
impl RefUnwindSafe for CpuBackend
impl Send for CpuBackend
impl Sync for CpuBackend
impl Unpin for CpuBackend
impl UnsafeUnpin for CpuBackend
impl UnwindSafe for CpuBackend
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more