Struct ExpertStack

Source

/// Per-layer expert weights, materialised as `[num_experts]`-long vectors of
/// boxed `Linear` objects.
///
/// Each entry of `gate_up` / `down` runs the corresponding expert's fused
/// `[gate; up]` projection or its down projection. The `*_stacked` fields are
/// an optional alternative representation; callers fall back to the
/// per-expert `gate_up` / `down` linears when they are `None`.
pub struct ExpertStack<B: Backend> {
    /// Fused `[gate; up]` projection per expert. Output shape per token:
    /// `[2 * expert_intermediate]` — the lower half is gate, the upper half is up.
    pub gate_up: Vec<Box<dyn Linear<B>>>,
    /// Down projection per expert. Output shape per token: `[hidden_size]`.
    pub down: Vec<Box<dyn Linear<B>>>,
    /// Stacked-experts representation for backends that have a batched MoE
    /// indirect-dispatch kernel. Holds all experts for one matmul role in a
    /// single `B::QuantStore`, with a byte stride between expert slabs, so one
    /// dispatch can cover all selected (token, expert) pairs.
    ///
    /// `None` on backends without such a kernel and on quant flavours that
    /// have no stacked path yet.
    pub gate_stacked: Option<B::QuantStore>,
    /// NOTE(review): no separate description accompanies this field —
    /// presumably the up-projection counterpart of `gate_stacked`; confirm.
    pub up_stacked: Option<B::QuantStore>,
    /// NOTE(review): no separate description accompanies this field —
    /// presumably the down-projection counterpart of `gate_stacked`; confirm.
    pub down_stacked: Option<B::QuantStore>,
}

Expand description

Per-layer expert weights, materialised as [num_experts]-long vectors of Box<dyn Linear>. Each entry runs the corresponding expert’s fused [gate; up] projection or its down projection.

B::Buffer is hidden behind Linear so this struct is generic over backend, but Phase 2’s only consumer (moe_forward_cpu) is CPU-only — generic moe_forward is deferred until the trait gains scaled-accumulate + cheap buffer slicing.

Fields§

§gate_up: Vec<Box<dyn Linear>>

Fused [gate; up] projection per expert. Output shape per token: [2 * expert_intermediate] — the lower half is gate, upper is up.

§down: Vec<Box<dyn Linear>>

down projection per expert. Output shape per token: [hidden_size].

§gate_stacked: Option<B::QuantStore>

Stacked-experts representation for backends that have a batched MoE indirect-dispatch kernel (Metal gemv_q4kw_moe_id_f32 / gemv_q6kw_moe_id_f32). Holds all experts for one matmul role in a single B::QuantStore with byte stride between expert slabs, so a single dispatch can cover all selected (token, expert) pairs at decode m=1.

None on backends without the kernel (CPU, CUDA-without-MoE-kernel) and on quant flavours that don’t have a stacked path yet — callers fall back to the per-expert gate_up / down Linears in those cases.

§up_stacked: Option<B::QuantStore>

§down_stacked: Option<B::QuantStore>

Struct ExpertStack Copy item path

Fields§

Implementations§

impl<B: Backend> ExpertStack<B>

pub fn from_dense_stacks( gate_stack: &[f32], up_stack: &[f32], down_stack: &[f32], num_experts: usize, hidden_size: usize, expert_intermediate: usize, ) -> Result<Self>

pub fn load_from_gguf( gguf: &GgufFile, layer_idx: usize, num_experts: usize, hidden_size: usize, expert_intermediate: usize, ) -> Result<Self>

pub fn open_and_load( path: impl AsRef<Path>, layer_idx: usize, num_experts: usize, hidden_size: usize, expert_intermediate: usize, ) -> Result<Self>

pub fn num_experts(&self) -> usize

Auto Trait Implementations§

impl<B> Freeze for ExpertStack<B> where <B as Backend>::QuantStore: Freeze,

impl<B> !RefUnwindSafe for ExpertStack<B>

impl<B> Send for ExpertStack<B>

impl<B> Sync for ExpertStack<B>

impl<B> Unpin for ExpertStack<B> where <B as Backend>::QuantStore: Unpin,

impl<B> UnsafeUnpin for ExpertStack<B> where <B as Backend>::QuantStore: UnsafeUnpin,

impl<B> !UnwindSafe for ExpertStack<B>

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<F, T> IntoSample<T> for F where T: FromSample<F>,

fn into_sample(self) -> T

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> PolicyExt for T where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P> where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P> where T: Policy<B, E>, P: Policy<B, E>,

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> ErasedDestructor for T where T: 'static,

Struct ExpertStack

impl<B> Freeze for ExpertStack<B>
where <B as Backend>::QuantStore: Freeze,

impl<B> Unpin for ExpertStack<B>
where <B as Backend>::QuantStore: Unpin,

impl<B> UnsafeUnpin for ExpertStack<B>
where <B as Backend>::QuantStore: UnsafeUnpin,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<F, T> IntoSample<T> for F
where T: FromSample<F>,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> ErasedDestructor for T
where T: 'static,