use ferrum_types::{FerrumError, Result};
use super::traits::Backend;
use super::types::{GgufQuantType, MoeRouting, ReduceOp};
/// Graph capture and replay hooks layered on top of `Backend`.
///
/// Every method has a default body that models a backend without graph
/// support: the state setters and resets do nothing, the capture calls fail
/// with an `unsupported` error, and replay answers `Ok(false)`.
pub trait BackendGraph: Backend {
    /// Receives the current token and step. The default discards both.
    fn set_decode_state(_ctx: &mut Self::Context, _token: u32, _step: u32) {}

    /// Switches "dev state" mode on or off (the exact meaning is
    /// backend-defined and not visible from this trait). The default ignores
    /// the request.
    fn set_dev_state_mode(_ctx: &mut Self::Context, _enable: bool) {}

    /// Begins a graph capture.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn begin_graph_capture(_ctx: &mut Self::Context) -> Result<()> {
        Err(FerrumError::unsupported("graph capture not supported"))
    }

    /// Ends a graph capture, associating it with `_key`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn end_graph_capture(_ctx: &mut Self::Context, _key: u64) -> Result<()> {
        Err(FerrumError::unsupported("graph capture not supported"))
    }

    /// Attempts to replay the graph registered under `_key`.
    ///
    /// The default returns `Ok(false)` — presumably meaning "nothing was
    /// replayed, run the normal path"; confirm against callers.
    fn replay_graph(_ctx: &mut Self::Context, _key: u64) -> Result<bool> {
        Ok(false)
    }

    /// Resets the graph registered under `_key`. No-op by default.
    fn reset_graph(_ctx: &mut Self::Context, _key: u64) {}

    /// Resets every graph held by the context. No-op by default.
    fn reset_all_graphs(_ctx: &mut Self::Context) {}
}
/// Collective-communication hooks layered on top of `Backend`.
///
/// The defaults report a world of size 1 with this process at rank 0, and
/// every collective operation is a no-op that leaves its buffers untouched.
pub trait BackendCollective: Backend {
    /// Number of participating ranks. Defaults to `1`.
    fn world_size(_ctx: &Self::Context) -> usize {
        1
    }

    /// Rank of this participant. Defaults to `0`.
    fn rank(_ctx: &Self::Context) -> usize {
        0
    }

    /// All-reduce over `_len` elements of `_buf` using `_op`.
    ///
    /// The default does nothing.
    fn all_reduce(
        _ctx: &mut Self::Context,
        _buf: &mut Self::Buffer,
        _len: usize,
        _op: ReduceOp,
    ) {
    }

    /// All-gather of `_local_len` elements from `_local` into `_global`.
    ///
    /// The default does nothing; in particular `_global` is NOT written.
    fn all_gather(
        _ctx: &mut Self::Context,
        _local: &Self::Buffer,
        _global: &mut Self::Buffer,
        _local_len: usize,
    ) {
    }

    /// Broadcast of `_len` elements of `_buf` from `_src_rank`.
    ///
    /// The default does nothing.
    fn broadcast(
        _ctx: &mut Self::Context,
        _buf: &mut Self::Buffer,
        _len: usize,
        _src_rank: usize,
    ) {
    }
}
/// GPTQ / Marlin quantized-weight loading hooks layered on top of `Backend`.
///
/// Both loaders fail with an `unsupported` error unless overridden, and the
/// scratch pre-grow hook is a no-op.
pub trait BackendQuantMarlin: Backend {
    /// Builds a linear layer from GPTQ tensors supplied as host slices.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // The loader takes one argument per GPTQ tensor/shape field.
    #[allow(clippy::too_many_arguments)]
    fn load_gptq(
        _qweight: &[i32],
        _scales: &[f32],
        _qzeros: &[i32],
        _g_idx: Option<&[i32]>,
        _bias_host: Option<&[f32]>,
        _bits: u32,
        _group_size: usize,
        _k: usize,
        _n: usize,
    ) -> Result<Box<dyn crate::Linear<Self> + Send + Sync>> {
        Err(FerrumError::unsupported(
            "load_gptq not implemented for this backend",
        ))
    }

    /// Builds a stacked expert container from per-expert GPTQ tensors (one
    /// slice per expert in `_qweights`, `_scales` and `_qzeros`).
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // The loader takes one argument per GPTQ tensor/shape field.
    #[allow(clippy::too_many_arguments)]
    fn load_gptq_stacked(
        _qweights: &[&[i32]],
        _scales: &[&[f32]],
        _qzeros: &[&[i32]],
        _g_idx: Option<&[i32]>,
        _bits: u32,
        _group_size: usize,
        _k: usize,
        _n_per_expert: usize,
    ) -> Result<std::sync::Arc<dyn crate::MarlinExpertStack<Self>>> {
        Err(FerrumError::unsupported(
            "load_gptq_stacked not implemented for this backend",
        ))
    }

    /// Hook for growing a Marlin gather scratch area to `_required` ahead of
    /// time (units are backend-defined — confirm with the implementor).
    ///
    /// The default does nothing.
    fn pregrow_marlin_gather_scratch(_ctx: &mut Self::Context, _required: usize) {}
}
/// GGUF quantized-weight loading hooks layered on top of `Backend`.
///
/// Every loader fails with an `unsupported` error unless the backend
/// overrides it.
pub trait BackendQuantGguf: Backend {
    /// Builds a linear layer from raw quantized `_bytes` of type `_kind`
    /// with the given row/column counts.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn load_quant(
        _kind: GgufQuantType,
        _bytes: &[u8],
        _n_rows: usize,
        _n_cols: usize,
    ) -> Result<Box<dyn crate::Linear<Self> + Send + Sync>> {
        Err(FerrumError::unsupported(
            "load_quant not implemented for this backend",
        ))
    }

    /// Builds one linear layer from several quantized parts sharing
    /// `_n_cols`. Each part is a `(type, bytes, usize)` tuple; the trailing
    /// `usize` is presumably that part's row count — confirm with callers.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn load_quant_fused(
        _parts: &[(GgufQuantType, &[u8], usize)],
        _n_cols: usize,
    ) -> Result<Box<dyn crate::Linear<Self> + Send + Sync>> {
        Err(FerrumError::unsupported(
            "load_quant_fused not implemented for this backend",
        ))
    }

    /// Builds a stacked-expert linear from raw quantized `_bytes` covering
    /// `_num_experts` experts.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn load_quant_experts(
        _kind: GgufQuantType,
        _bytes: &[u8],
        _num_experts: usize,
        _n_rows: usize,
        _n_cols: usize,
    ) -> Result<Box<dyn crate::StackedExpertGgufLinear<Self>>> {
        Err(FerrumError::unsupported(
            "load_quant_experts not implemented for this backend",
        ))
    }
}
/// Fused mixture-of-experts (MoE) kernels layered on top of `Backend`.
///
/// Unless stated otherwise, each method's default body rejects the call with
/// an `unsupported` error naming the method, the `supports_*` capability
/// probes answer `false`, and [`moe_combine`](Self::moe_combine) provides a
/// host-side fallback implementation.
pub trait BackendMoeFused: Backend {
    /// Uploads host-side routing tables and returns them as a `MoeRouting`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn upload_moe_routing(
        _ctx: &mut Self::Context,
        _sorted_token_ids: &[i32],
        _expert_ids: &[i32],
        _num_tokens_past_padded: &[i32],
    ) -> Result<MoeRouting<Self>> {
        Err(FerrumError::unsupported(
            "upload_moe_routing not implemented for this backend",
        ))
    }

    /// Top-k softmax routing from `_logits` into `_out_ids` / `_out_weights`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn route_topk_softmax(
        _ctx: &mut Self::Context,
        _logits: &Self::Buffer,
        _out_ids: &mut Self::Buffer,
        _out_weights: &mut Self::Buffer,
        _batch: usize,
        _num_experts: usize,
        _top_k: usize,
        _norm_topk_prob: bool,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "route_topk_softmax not implemented for this backend",
        ))
    }

    /// Top-k routing whose results land in host vectors rather than backend
    /// buffers.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn try_gpu_route_topk_into_host(
        _ctx: &mut Self::Context,
        _logits_dev: &Self::Buffer,
        _out_ids_host: &mut Vec<u32>,
        _out_weights_host: &mut Vec<f32>,
        _batch: usize,
        _num_experts: usize,
        _top_k: usize,
        _norm_topk_prob: bool,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "try_gpu_route_topk_into_host not implemented for this backend",
        ))
    }

    /// Builds the pairs-by-token, packed-token-index and expert-offset
    /// tables from `_expert_ids`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn moe_build_pairs_by_token(
        _ctx: &mut Self::Context,
        _expert_ids: &Self::Buffer,
        _pairs_by_token: &mut Self::Buffer,
        _packed_token_idx: &mut Self::Buffer,
        _expert_offsets: &mut Self::Buffer,
        _batch_x_topk: usize,
        _num_experts: usize,
        _top_k: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "moe_build_pairs_by_token not implemented for this backend",
        ))
    }

    /// Block-size alignment of per-pair expert ids into sorted token ids,
    /// block ids and a post-padding token total.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn moe_align_block_size(
        _ctx: &mut Self::Context,
        _expert_ids_per_pair: &Self::Buffer,
        _sorted_token_ids: &mut Self::Buffer,
        _block_ids: &mut Self::Buffer,
        _total_tokens_post_pad: &mut Self::Buffer,
        _batch_x_topk: usize,
        _num_experts: usize,
        _block_size: usize,
        _sorted_max_size: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "moe_align_block_size not implemented for this backend",
        ))
    }

    /// Variant of `moe_align_block_size` with an identical signature; going
    /// by the name it emits pair ids — confirm with the implementing backend.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn moe_align_block_size_pair_ids(
        _ctx: &mut Self::Context,
        _expert_ids_per_pair: &Self::Buffer,
        _sorted_token_ids: &mut Self::Buffer,
        _block_ids: &mut Self::Buffer,
        _total_tokens_post_pad: &mut Self::Buffer,
        _batch_x_topk: usize,
        _num_experts: usize,
        _block_size: usize,
        _sorted_max_size: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "moe_align_block_size_pair_ids not implemented for this backend",
        ))
    }

    /// Derives the `_tpe`, `_ids`, `_gate_up_args` and `_down_args` buffers
    /// from `_selected_ids`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn compute_ids_tpe_gpu(
        _ctx: &mut Self::Context,
        _selected_ids: &Self::Buffer,
        _tpe: &mut Self::Buffer,
        _ids: &mut Self::Buffer,
        _gate_up_args: &mut Self::Buffer,
        _down_args: &mut Self::Buffer,
        _batch: usize,
        _num_experts: usize,
        _top_k: usize,
        _m_gate_up: usize,
        _m_down: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "compute_ids_tpe_gpu not implemented for this backend",
        ))
    }

    /// Batched SiLU-multiply of `_gate` and `_up` into `_out`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn silu_mul_batched(
        _ctx: &mut Self::Context,
        _gate: &Self::Buffer,
        _up: &Self::Buffer,
        _out: &mut Self::Buffer,
        _total_pairs: usize,
        _ffn: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "silu_mul_batched not implemented for this backend",
        ))
    }

    /// Weighted sum over stacked slots combined with `_residual`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn weighted_sum_residual_stacked(
        _ctx: &mut Self::Context,
        _slots: &Self::Buffer,
        _weights: &Self::Buffer,
        _residual: &mut Self::Buffer,
        _n_slots: usize,
        _hidden: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "weighted_sum_residual_stacked not implemented for this backend",
        ))
    }

    /// Like `weighted_sum_residual_stacked`, additionally taking a norm
    /// weight, an epsilon and a `_normed_out` destination.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn weighted_sum_residual_norm_stacked(
        _ctx: &mut Self::Context,
        _slots: &Self::Buffer,
        _weights: &Self::Buffer,
        _residual: &mut Self::Buffer,
        _next_norm_w: &Self::Buffer,
        _normed_out: &mut Self::Buffer,
        _n_slots: usize,
        _hidden: usize,
        _eps: f32,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "weighted_sum_residual_norm_stacked not implemented for this backend",
        ))
    }

    /// Batched weighted sum of `_slots` by `_weights` into `_out`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn weighted_sum_batched(
        _ctx: &mut Self::Context,
        _slots: &Self::Buffer,
        _weights: &Self::Buffer,
        _out: &mut Self::Buffer,
        _batch: usize,
        _top_k: usize,
        _hidden: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "weighted_sum_batched not implemented for this backend",
        ))
    }

    /// `weighted_sum_batched` with extra offsets for the weights and output
    /// buffers.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn weighted_sum_batched_offset(
        ctx: &mut Self::Context,
        slots: &Self::Buffer,
        weights: &Self::Buffer,
        weights_offset: usize,
        out: &mut Self::Buffer,
        out_offset: usize,
        batch: usize,
        top_k: usize,
        hidden: usize,
    ) -> Result<()> {
        // The parameters keep their plain names in the signature, so they are
        // discarded explicitly here to avoid unused-variable warnings.
        let _ = (
            ctx,
            slots,
            weights,
            weights_offset,
            out,
            out_offset,
            batch,
            top_k,
            hidden,
        );
        Err(FerrumError::unsupported(
            "weighted_sum_batched_offset not implemented for this backend",
        ))
    }

    /// SiLU-multiply of `_gate` and `_up` into `_out` over stacked slots.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn silu_mul_stacked(
        _ctx: &mut Self::Context,
        _gate: &Self::Buffer,
        _up: &Self::Buffer,
        _out: &mut Self::Buffer,
        _n_slots: usize,
        _ffn: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "silu_mul_stacked not implemented for this backend",
        ))
    }

    /// Capability probe; `false` unless the backend overrides it.
    fn supports_fused_moe_gate_up_silu() -> bool {
        false
    }

    /// Capability probe; `false` unless the backend overrides it.
    fn supports_batched_moe_gemv() -> bool {
        false
    }

    /// Capability probe; `false` unless the backend overrides it.
    fn supports_batched_moe_gate_up_silu() -> bool {
        false
    }

    /// Weighted sum of stacked `_slots` by `_weights` into `_out`.
    ///
    /// # Errors
    /// The default implementation always returns an `unsupported` error.
    fn weighted_sum_stacked(
        _ctx: &mut Self::Context,
        _slots: &Self::Buffer,
        _weights: &Self::Buffer,
        _out: &mut Self::Buffer,
        _n_slots: usize,
        _hidden: usize,
    ) -> Result<()> {
        Err(FerrumError::unsupported(
            "weighted_sum_stacked not implemented for this backend",
        ))
    }

    /// Host-side fallback that combines packed expert outputs per token.
    ///
    /// The three input buffers are copied out with `Self::to_vec`. For each
    /// token `b` in `0..batch` and each slot `k` in `0..top_k`, the entry at
    /// `b * top_k + k` of `pairs_by_token` names a row of `packed_down`
    /// (`hidden` values per row). That row is scaled by the matching entry of
    /// `pair_weights` and accumulated into token `b`'s output row. Entries
    /// whose row index is negative are skipped. Finally `*out` is replaced by
    /// a fresh buffer built with `Self::from_slice` from the `batch * hidden`
    /// accumulated values.
    ///
    /// `ctx` is not used by this default.
    ///
    /// # Panics
    /// Panics if a non-negative row index addresses data beyond what
    /// `to_vec` returned for `packed_down`, or if either per-pair vector is
    /// shorter than `batch * top_k`.
    // One argument per buffer/shape field of the kernel.
    #[allow(clippy::too_many_arguments)]
    fn moe_combine(
        ctx: &mut Self::Context,
        packed_down: &Self::Buffer,
        pairs_by_token: &Self::Buffer,
        pair_weights: &Self::Buffer,
        out: &mut Self::Buffer,
        batch: usize,
        hidden: usize,
        top_k: usize,
        total_pairs: usize,
    ) {
        let _ = ctx;

        // Pull everything onto the host before accumulating.
        let packed_rows = Self::to_vec(packed_down, total_pairs * hidden);
        let pair_count = batch * top_k;
        let row_bits = Self::to_vec(pairs_by_token, pair_count);
        let row_weights = Self::to_vec(pair_weights, pair_count);

        let mut combined = vec![0.0f32; batch * hidden];
        for token in 0..batch {
            for slot in 0..top_k {
                let pair = token * top_k + slot;
                // Row indices travel as raw i32 bit patterns inside f32
                // elements, so reinterpret the bits instead of converting
                // the numeric value.
                let signed_row = row_bits[pair].to_bits() as i32;
                if signed_row >= 0 {
                    let weight = row_weights[pair];
                    let row = signed_row as usize;
                    let src = &packed_rows[row * hidden..(row + 1) * hidden];
                    let dst = &mut combined[token * hidden..(token + 1) * hidden];
                    // Both slices hold exactly `hidden` elements, so the zip
                    // visits every lane in the same order as an index loop.
                    for (acc, &value) in dst.iter_mut().zip(src) {
                        *acc += weight * value;
                    }
                }
            }
        }

        *out = Self::from_slice(&combined);
    }
}