baracuda_kernels::attention::flash_decoding

Struct FlashDecodingDescriptor

pub struct FlashDecodingDescriptor {
    pub batch_size: i32,
    pub num_heads: i32,
    pub num_kv_heads: i32,
    pub k_len: i32,
    pub head_dim: i32,
    pub scale: f32,
    pub element: ElementKind,
}

Expand description

Descriptor for a FlashDecoding op.

num_kv_heads is the GQA grouping signal: when it equals num_heads the workload is full MHA; when it’s smaller (e.g. 8 for Llama 3 8B at H_q=32) every K/V head is shared by group_size = num_heads / num_kv_heads Q heads. The launcher uses group_size to pick between the warp-cooperative SIMT kernel (Tier-1) and the GQA-batched WMMA kernel (Tier-2, gated on group_size ≥ 4 and head_dim aligned to 16).

Fields§

§batch_size: i32

Batch size (B).

§num_heads: i32

Number of query / output heads (H_q).

§num_kv_heads: i32

Number of K/V heads (H_kv). Must divide num_heads evenly. num_kv_heads == num_heads → pure MHA; num_kv_heads == 1 → MQA; 1 < num_kv_heads < num_heads → GQA.

§k_len: i32

K/V sequence length (the full attended prefix, not just the new step). Arbitrary; the split-K factor adapts via [CHUNK_K].

§head_dim: i32

Per-head feature dimension. d_q == d_k == d_v is enforced — the decode regime doesn’t justify the d_k != d_v complication the prefill kernel handles.

§scale: f32

Score scaling factor — typically 1.0 / sqrt(head_dim).

§element: ElementKind

Element type — must match the plan’s type parameter.

Struct FlashDecodingDescriptor Copy item path

Fields§

Implementations§

impl FlashDecodingDescriptor

pub fn new( batch_size: i32, num_heads: i32, k_len: i32, head_dim: i32, element: ElementKind, ) -> Self

pub fn new_gqa( batch_size: i32, num_heads: i32, num_kv_heads: i32, k_len: i32, head_dim: i32, element: ElementKind, ) -> Self

pub fn with_scale(self, scale: f32) -> Self

pub fn group_size(&self) -> i32

Trait Implementations§

impl Clone for FlashDecodingDescriptor

fn clone(&self) -> FlashDecodingDescriptor

fn clone_from(&mut self, source: &Self)

impl Copy for FlashDecodingDescriptor

impl Debug for FlashDecodingDescriptor

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Auto Trait Implementations§

impl Freeze for FlashDecodingDescriptor

impl RefUnwindSafe for FlashDecodingDescriptor

impl Send for FlashDecodingDescriptor

impl Sync for FlashDecodingDescriptor

impl Unpin for FlashDecodingDescriptor

impl UnsafeUnpin for FlashDecodingDescriptor

impl UnwindSafe for FlashDecodingDescriptor

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct FlashDecodingDescriptor

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,