use crate::error::Result;
use crate::gguf::quantized::QuantizedTensorRef;
use crate::gguf::GGUFModel;
use crate::gguf::QuantizedGGUFTransformer;
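/// Tensor references for one quantized Qwen3-MoE FFN layer: the router
/// projection plus the stacked per-expert gate/up/down weight tensors.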
#[derive(Debug, Clone)]
pub struct Qwen3MoeQuantizedLayer {
pub router: QuantizedTensorRef,
pub gate_exps: QuantizedTensorRef,
pub up_exps: QuantizedTensorRef,
pub down_exps: QuantizedTensorRef,
}
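/// Resolves the four MoE FFN tensor references for layer `layer_idx` using the
/// GGUF names `blk.{layer_idx}.ffn_{gate_inp,gate_exps,up_exps,down_exps}.weight`.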
pub fn load_qwen3_moe_layer(
model: &GGUFModel,
data: &[u8],
layer_idx: usize,
) -> Result<Qwen3MoeQuantizedLayer> {
let prefix = format!("blk.{layer_idx}");
Ok(Qwen3MoeQuantizedLayer {
router: QuantizedGGUFTransformer::get_tensor_ref(
model,
data,
&format!("{prefix}.ffn_gate_inp.weight"),
)?,
gate_exps: QuantizedGGUFTransformer::get_tensor_ref(
model,
data,
&format!("{prefix}.ffn_gate_exps.weight"),
)?,
up_exps: QuantizedGGUFTransformer::get_tensor_ref(
model,
data,
&format!("{prefix}.ffn_up_exps.weight"),
)?,
down_exps: QuantizedGGUFTransformer::get_tensor_ref(
model,
data,
&format!("{prefix}.ffn_down_exps.weight"),
)?,
})
}
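/// Returns the byte sub-slice for a single expert within a stacked expert
/// tensor, assuming experts are contiguous along the outermost dimension so
/// that each expert owns exactly `byte_size / num_experts` bytes.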
pub fn expert_byte_slice<'a>(
tensor: &QuantizedTensorRef,
data: &'a [u8],
expert_id: usize,
num_experts: usize,
) -> Result<&'a [u8]> {
use crate::error::RealizarError;
if num_experts == 0 {
return Err(RealizarError::InvalidShape {
reason: "expert_byte_slice: num_experts must be > 0".to_string(),
});
}
if expert_id >= num_experts {
return Err(RealizarError::InvalidShape {
reason: format!(
"expert_byte_slice: expert_id {expert_id} out of range \
(num_experts = {num_experts})"
),
});
}
if tensor.byte_size % num_experts != 0 {
return Err(RealizarError::InvalidShape {
reason: format!(
"expert_byte_slice: tensor byte_size {} not divisible by num_experts {} \
— stacking invariant violated. Layout mismatch (LAZY-FUSED-MATVEC \
expects [num_experts, ...] outermost dim contiguous)",
tensor.byte_size, num_experts
),
});
}
let per_expert_bytes = tensor.byte_size / num_experts;
let start = tensor.offset + expert_id * per_expert_bytes;
let end = start + per_expert_bytes;
if end > data.len() {
return Err(RealizarError::InvalidShape {
reason: format!(
"expert_byte_slice: slice range [{start}, {end}) exceeds file size {}",
data.len()
),
});
}
Ok(&data[start..end])
}
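/// Computes one expert's SwiGLU FFN, `down(silu(gate(x)) * up(x))`, working
/// directly on the expert's quantized weight slices.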
pub fn expert_swiglu_quantized(
hidden: &[f32],
layer: &Qwen3MoeQuantizedLayer,
expert_id: usize,
num_experts: usize,
intermediate: usize,
hidden_dim: usize,
data: &[u8],
) -> Result<Vec<f32>> {
use crate::error::RealizarError;
if hidden.len() != hidden_dim {
return Err(RealizarError::InvalidShape {
reason: format!(
"expert_swiglu_quantized: hidden.len() = {} but hidden_dim = {}",
hidden.len(),
hidden_dim
),
});
}
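    // Slice each stacked expert tensor down to this expert's quantized bytes.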
let gate_bytes = expert_byte_slice(&layer.gate_exps, data, expert_id, num_experts)?;
let up_bytes = expert_byte_slice(&layer.up_exps, data, expert_id, num_experts)?;
let down_bytes = expert_byte_slice(&layer.down_exps, data, expert_id, num_experts)?;
let gate_out = matvec_for_qtype(
layer.gate_exps.qtype,
gate_bytes,
hidden,
hidden_dim,
intermediate,
)?;
let up_out = matvec_for_qtype(
layer.up_exps.qtype,
up_bytes,
hidden,
hidden_dim,
intermediate,
)?;
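    // SwiGLU: silu(gate) * up, with silu(x) = x / (1 + e^(-x)).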
let mut ffn_hidden = vec![0.0f32; intermediate];
for i in 0..intermediate {
let g = gate_out[i];
let silu = g / (1.0 + (-g).exp());
ffn_hidden[i] = silu * up_out[i];
}
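    // Down projection from intermediate back to hidden_dim.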
let result = matvec_for_qtype(
layer.down_exps.qtype,
down_bytes,
&ffn_hidden,
intermediate,
hidden_dim,
)?;
Ok(result)
}
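/// Dispatches a fused quantized matrix-vector product by GGUF qtype. Currently
/// supports Q4_K (12) and Q6_K (14), the quantizations used by Qwen3-Coder
/// Q4_K_M expert tensors.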
fn matvec_for_qtype(
qtype: u32,
weight_data: &[u8],
activations: &[f32],
in_dim: usize,
out_dim: usize,
) -> Result<Vec<f32>> {
use crate::error::RealizarError;
use crate::gguf::types::{GGUF_TYPE_Q4_K, GGUF_TYPE_Q6_K};
use crate::quantize::{fused_q4k_parallel_matvec, fused_q6k_parallel_matvec};
match qtype {
GGUF_TYPE_Q4_K => fused_q4k_parallel_matvec(weight_data, activations, in_dim, out_dim),
GGUF_TYPE_Q6_K => fused_q6k_parallel_matvec(weight_data, activations, in_dim, out_dim),
other => Err(RealizarError::UnsupportedOperation {
operation: "moe_expert_matvec".to_string(),
reason: format!(
"MoE expert tensor qtype {other} not supported. Qwen3-Coder Q4_K_M uses \
Q4_K (12) and Q6_K (14) — caller must extend matvec_for_qtype for other \
quantizations."
),
}),
}
}
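/// MoE FFN forward pass for one token: route, select the top-k experts, and
/// return the weighted sum of their outputs. Thin wrapper over
/// [`moe_ffn_forward_layer_with_router`] that discards the router weights, so
/// both paths share a single routing and expert implementation.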
#[allow(clippy::too_many_arguments)]
pub fn moe_ffn_forward_layer(
    hidden: &[f32],
    layer: &Qwen3MoeQuantizedLayer,
    num_experts: usize,
    num_experts_per_tok: usize,
    intermediate: usize,
    hidden_dim: usize,
    data: &[u8],
) -> Result<Vec<f32>> {
    let (output, _router_top_k_weights) = moe_ffn_forward_layer_with_router(
        hidden,
        layer,
        num_experts,
        num_experts_per_tok,
        intermediate,
        hidden_dim,
        data,
    )?;
    Ok(output)
}
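/// Like [`moe_ffn_forward_layer`], but also returns the renormalized top-k
/// router weights so callers can inspect routing decisions.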
#[allow(clippy::too_many_arguments)]
pub fn moe_ffn_forward_layer_with_router(
hidden: &[f32],
layer: &Qwen3MoeQuantizedLayer,
num_experts: usize,
num_experts_per_tok: usize,
intermediate: usize,
hidden_dim: usize,
data: &[u8],
) -> Result<(Vec<f32>, Vec<f32>)> {
use crate::error::RealizarError;
if hidden.len() != hidden_dim {
return Err(RealizarError::InvalidShape {
reason: format!(
"moe_ffn_forward_layer_with_router: hidden.len() = {} but hidden_dim = {}",
hidden.len(),
hidden_dim
),
});
}
    if layer.router.qtype != crate::gguf::types::GGUF_TYPE_F32 {
        return Err(RealizarError::UnsupportedOperation {
            operation: "moe_router_quantized_read".to_string(),
            reason: format!(
                "moe_ffn_forward_layer_with_router: router qtype = {} (not F32). \
                 Quantized router not yet wired — Qwen3-Coder-30B uses an F32 router \
                 so this is fine for it; other Qwen3-MoE variants needing a quantized \
                 router are an M32 follow-up.",
                layer.router.qtype
            ),
        });
    }
    // Guard the router slice so a truncated file yields an error, not a panic.
    let router_end = layer.router.offset + layer.router.byte_size;
    if router_end > data.len() {
        return Err(RealizarError::InvalidShape {
            reason: format!(
                "moe_ffn_forward_layer_with_router: router range [{}, {router_end}) \
                 exceeds file size {}",
                layer.router.offset,
                data.len()
            ),
        });
    }
    let router_bytes = &data[layer.router.offset..router_end];
let expected_bytes = num_experts * hidden_dim * 4;
if router_bytes.len() != expected_bytes {
return Err(RealizarError::InvalidShape {
reason: format!(
"moe_ffn_forward_layer_with_router: router byte_size {} != expected {} \
(num_experts {} × hidden_dim {} × 4)",
router_bytes.len(),
expected_bytes,
num_experts,
hidden_dim
),
});
}
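    // Router logits: dense F32 matvec over the row-major [num_experts, hidden_dim] weights.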
let mut logits = vec![0.0f32; num_experts];
for e in 0..num_experts {
let row_off = e * hidden_dim * 4;
let mut sum = 0.0f32;
for j in 0..hidden_dim {
let b = row_off + j * 4;
let w = f32::from_le_bytes([
router_bytes[b],
router_bytes[b + 1],
router_bytes[b + 2],
router_bytes[b + 3],
]);
sum += w * hidden[j];
}
logits[e] = sum;
}
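    // Numerically stable softmax over the expert logits (max-subtracted before exp).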
let max_l = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let mut probs: Vec<f32> = logits.iter().map(|&l| (l - max_l).exp()).collect();
let psum: f32 = probs.iter().sum();
if psum > 0.0 {
for p in &mut probs {
*p /= psum;
}
}
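    // Keep the top-k experts by probability and renormalize their weights to
    // sum to 1 (uniform fallback if the retained mass underflows to zero).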
let mut indexed: Vec<(usize, f32)> = probs.iter().copied().enumerate().collect();
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let topk = &indexed[..num_experts_per_tok.min(num_experts)];
let topk_sum: f32 = topk.iter().map(|(_, w)| w).sum();
let topk_renorm: Vec<(usize, f32)> = if topk_sum > 0.0 {
topk.iter().map(|(i, w)| (*i, w / topk_sum)).collect()
} else {
let n = topk.len();
topk.iter().map(|(i, _)| (*i, 1.0 / n as f32)).collect()
};
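    // Evaluate each selected expert's SwiGLU FFN in parallel via rayon.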
use rayon::prelude::*;
let expert_outputs: Vec<(f32, Vec<f32>)> = topk_renorm
.par_iter()
.map(|(expert_id, weight)| {
let expert_out = expert_swiglu_quantized(
hidden,
layer,
*expert_id,
num_experts,
intermediate,
hidden_dim,
data,
)?;
Ok::<_, RealizarError>((*weight, expert_out))
})
.collect::<Result<Vec<_>>>()?;
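    // Weighted sum of expert outputs back in the hidden dimension.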
let mut output = vec![0.0f32; hidden_dim];
for (weight, expert_out) in &expert_outputs {
for i in 0..hidden_dim {
output[i] += weight * expert_out[i];
}
}
let router_top_k_weights: Vec<f32> = topk_renorm.iter().map(|(_, w)| *w).collect();
Ok((output, router_top_k_weights))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn qwen3_moe_quantized_layer_is_clone_and_debug() {
let dummy = QuantizedTensorRef {
offset: 0,
byte_size: 0,
num_elements: 0,
qtype: 0,
};
let layer = Qwen3MoeQuantizedLayer {
router: dummy.clone(),
gate_exps: dummy.clone(),
up_exps: dummy.clone(),
down_exps: dummy,
};
let cloned = layer.clone();
assert_eq!(cloned.router.offset, layer.router.offset);
assert!(format!("{layer:?}").contains("Qwen3MoeQuantizedLayer"));
}
#[test]
fn expert_byte_slice_partitions_evenly() {
let data: Vec<u8> = (0..128).collect();
let tensor = QuantizedTensorRef {
offset: 0,
byte_size: 128,
            num_elements: 128 * 2,
            qtype: 12,
        };
for e in 0..4 {
let slice = expert_byte_slice(&tensor, &data, e, 4).unwrap();
assert_eq!(slice.len(), 32, "expert {e} slice length");
assert_eq!(slice[0], (e * 32) as u8, "expert {e} first byte");
}
}
#[test]
fn expert_byte_slice_rejects_out_of_range_expert_id() {
let data = vec![0u8; 64];
let tensor = QuantizedTensorRef {
offset: 0,
byte_size: 64,
num_elements: 0,
qtype: 0,
};
let err = expert_byte_slice(&tensor, &data, 4, 4).unwrap_err();
assert!(format!("{err}").contains("expert_id 4 out of range"));
}
#[test]
fn expert_byte_slice_rejects_zero_num_experts() {
let data = vec![0u8; 64];
let tensor = QuantizedTensorRef {
offset: 0,
byte_size: 64,
num_elements: 0,
qtype: 0,
};
let err = expert_byte_slice(&tensor, &data, 0, 0).unwrap_err();
assert!(format!("{err}").contains("num_experts must be > 0"));
}
#[test]
fn expert_byte_slice_rejects_uneven_stacking() {
let data = vec![0u8; 100];
let tensor = QuantizedTensorRef {
offset: 0,
byte_size: 100,
num_elements: 0,
qtype: 0,
};
let err = expert_byte_slice(&tensor, &data, 0, 3).unwrap_err();
assert!(format!("{err}").contains("stacking invariant violated"));
}
#[test]
fn expert_byte_slice_rejects_overrun() {
let data = vec![0u8; 32];
let tensor = QuantizedTensorRef {
offset: 0,
            byte_size: 64,
            num_elements: 0,
qtype: 0,
};
let err = expert_byte_slice(&tensor, &data, 1, 2).unwrap_err();
assert!(format!("{err}").contains("exceeds file size"));
}
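    // expert_swiglu_quantized validates the activation length before touching
    // any tensor data, so a dummy layer with empty tensors suffices here.
    #[test]
    fn expert_swiglu_quantized_rejects_hidden_dim_mismatch() {
        let dummy = QuantizedTensorRef {
            offset: 0,
            byte_size: 0,
            num_elements: 0,
            qtype: 0,
        };
        let layer = Qwen3MoeQuantizedLayer {
            router: dummy.clone(),
            gate_exps: dummy.clone(),
            up_exps: dummy.clone(),
            down_exps: dummy,
        };
        let hidden = vec![0.0f32; 4];
        let err = expert_swiglu_quantized(&hidden, &layer, 0, 2, 8, 8, &[]).unwrap_err();
        assert!(
            format!("{err}").contains("hidden.len() = 4 but hidden_dim = 8"),
            "expected hidden_dim mismatch error, got: {err}"
        );
    }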
#[test]
fn moe_ffn_forward_layer_with_router_rejects_hidden_dim_mismatch() {
let dummy = QuantizedTensorRef {
offset: 0,
byte_size: 0,
num_elements: 0,
qtype: crate::gguf::types::GGUF_TYPE_F32,
};
let layer = Qwen3MoeQuantizedLayer {
router: dummy.clone(),
gate_exps: dummy.clone(),
up_exps: dummy.clone(),
down_exps: dummy,
};
let hidden = vec![0.0f32; 8];
let data = vec![0u8; 16];
let err = moe_ffn_forward_layer_with_router(
&hidden, &layer, 4, 2, 16, 16, &data,
)
.unwrap_err();
assert!(
format!("{err}").contains("hidden.len() = 8 but hidden_dim = 16"),
"expected hidden_dim mismatch error, got: {err}"
);
}
#[test]
fn moe_ffn_forward_layer_with_router_rejects_non_f32_router() {
let dummy = QuantizedTensorRef {
offset: 0,
byte_size: 0,
num_elements: 0,
            qtype: crate::gguf::types::GGUF_TYPE_Q4_K,
        };
let layer = Qwen3MoeQuantizedLayer {
router: dummy.clone(),
gate_exps: dummy.clone(),
up_exps: dummy.clone(),
down_exps: dummy,
};
let hidden = vec![0.0f32; 16];
let data = vec![0u8; 16];
let err =
moe_ffn_forward_layer_with_router(&hidden, &layer, 4, 2, 16, 16, &data).unwrap_err();
assert!(
format!("{err}").contains("router qtype") && format!("{err}").contains("not F32"),
"expected non-F32 router error, got: {err}"
);
}
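    // matvec_for_qtype must reject qtypes other than Q4_K/Q6_K with a
    // descriptive error rather than attempting to decode unknown weight data.
    #[test]
    fn matvec_for_qtype_rejects_unsupported_qtype() {
        let activations = vec![0.0f32; 4];
        let err = matvec_for_qtype(999, &[], &activations, 4, 4).unwrap_err();
        let msg = format!("{err}");
        assert!(
            msg.contains("qtype 999 not supported"),
            "expected unsupported-qtype error, got: {msg}"
        );
    }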
}