use crate::backend::Backend;
use crate::Linear;
use ferrum_types::Result;
use std::sync::Arc;
/// Interface implemented by backend-specific Marlin expert stacks.
///
/// The trait is generic over the [`Backend`] `B`; every device-facing method
/// takes that backend's `Context` and `Buffer` types. Implementors must be
/// `Send + Sync`.
pub trait MarlinExpertStack<B: Backend>: Send + Sync {
    /// Returns the per-expert `n` size.
    ///
    /// NOTE(review): presumably the number of output columns owned by each
    /// expert — confirm against an implementor.
    fn n_per_expert(&self) -> usize;

    /// Returns the stack's `k` size.
    ///
    /// NOTE(review): presumably the shared GEMM reduction dimension — confirm.
    fn k(&self) -> usize;

    /// Returns how many experts this stack holds.
    fn num_experts(&self) -> usize;

    /// Exposes `self` as `&dyn Any` so callers can attempt a downcast to the
    /// concrete implementing type.
    fn as_any(&self) -> &dyn std::any::Any;

    /// Zeroes the stack's workspace using the given backend context.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementing backend reports.
    fn zero_workspace(&self, ctx: &mut B::Context) -> Result<()>;

    /// Runs the batched GEMM phase, reading from `input` and writing into
    /// `output`.
    ///
    /// * `ctx` - backend context the work is issued on.
    /// * `input` - source buffer.
    /// * `dispatches` - one 4-tuple of `usize` per dispatch. NOTE(review): the
    ///   meaning of each field is defined by the implementors and is not
    ///   visible here — confirm before relying on an ordering.
    /// * `output` - destination buffer, mutated in place.
    /// * `k` - size passed alongside the buffers. NOTE(review): presumably
    ///   matches [`MarlinExpertStack::k`] — confirm.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementing backend reports.
    // Suppresses clippy's argument-count lint for this signature.
    #[allow(clippy::too_many_arguments)]
    fn gemm_phase_batched(
        &self,
        ctx: &mut B::Context,
        input: &B::Buffer,
        dispatches: &[(usize, usize, usize, usize)],
        output: &mut B::Buffer,
        k: usize,
    ) -> Result<()>;

    /// Optional GEMM phase variant taking pre-sorted routing buffers.
    ///
    /// The provided default ignores every argument and returns an
    /// "unsupported" error; implementors that offer this path override it.
    ///
    /// NOTE(review): the buffer and size parameters appear to mirror the
    /// inputs of a vLLM-style fused-MoE kernel (sorted token ids, expert ids,
    /// padded token count, block size, top-k) — confirm their exact layout
    /// with an overriding implementation.
    ///
    /// # Errors
    ///
    /// The default implementation always fails with
    /// `FerrumError::unsupported`.
    // Suppresses clippy's argument-count lint for this signature.
    #[allow(clippy::too_many_arguments)]
    fn gemm_phase_vllm(
        &self,
        _ctx: &mut B::Context,
        _input: &B::Buffer,
        _sorted_token_ids: &B::Buffer,
        _expert_ids: &B::Buffer,
        _num_tokens_past_padded: &B::Buffer,
        _output: &mut B::Buffer,
        _prob_m: usize,
        _moe_block_size: usize,
        _top_k: usize,
    ) -> Result<()> {
        // No generic fallback: report the operation as unsupported.
        let reason = "MarlinExpertStack::gemm_phase_vllm not implemented for this backend";
        Err(ferrum_types::FerrumError::unsupported(reason))
    }

    /// Builds a boxed [`Linear`] layer from this stack, consuming one
    /// `Arc` handle to it.
    ///
    /// * `expert_offset`, `expert_n` - NOTE(review): presumably select which
    ///   slice of the stack the returned layer covers — confirm with an
    ///   implementor.
    /// * `bias_host` - optional `f32` bias values. NOTE(review): the name
    ///   suggests they reside in host memory — confirm.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementing backend reports.
    fn make_expert_linear(
        self: Arc<Self>,
        expert_offset: usize,
        expert_n: usize,
        bias_host: Option<&[f32]>,
    ) -> Result<Box<dyn Linear<B> + Send + Sync>>;
}