use ferrum_quantization::gguf::GgufFile;
use ferrum_types::{FerrumError, Result};
use crate::models::llama_family::LlamaFamilyConfig;
use crate::moe_config::Qwen3MoeConfig;
/// `general.architecture` values that are treated as mixture-of-experts models.
///
/// `LlamaFamilyConfig::from_gguf` refuses any architecture in this list and
/// tells the caller to use an MoE-specific config builder instead.
const KNOWN_MOE_ARCHS: &[&str] = &["qwen3moe", "mixtral", "deepseek2"];
impl LlamaFamilyConfig {
    /// Builds a dense Llama-family configuration from GGUF metadata.
    ///
    /// Every key is looked up as `<arch>.<suffix>`, where `<arch>` is the value
    /// returned by `gguf.architecture()`.
    ///
    /// Required suffixes (a failed read aborts with an error):
    /// `block_count`, `embedding_length`, `feed_forward_length`,
    /// `attention.head_count`, `attention.layer_norm_rms_epsilon`.
    ///
    /// Optional suffixes and the value used when the read fails:
    /// - `attention.head_count_kv` → same as `attention.head_count`
    /// - `context_length` → `4096`
    /// - `rope.freq_base` → a per-architecture default (see the body)
    /// - `attention.sliding_window` → `0`
    /// - `vocab_size` → inferred from the `token_embd.weight` tensor shape
    /// - `attention.key_length` → `embedding_length / head_count`
    ///
    /// # Errors
    ///
    /// Returns a `FerrumError::model` error when:
    /// - the architecture string cannot be read;
    /// - the architecture is listed in `KNOWN_MOE_ARCHS`;
    /// - any required key cannot be read;
    /// - `vocab_size` is unreadable and cannot be inferred from the embedding tensor;
    /// - `attention.key_length` is unreadable and `head_count` is zero or does
    ///   not evenly divide `embedding_length`.
    pub fn from_gguf(gguf: &GgufFile) -> Result<Self> {
        let arch = gguf
            .architecture()
            .map_err(|e| FerrumError::model(format!("read general.architecture: {e}")))?
            .to_string();

        // MoE architectures have their own config builders; reject them here.
        if KNOWN_MOE_ARCHS.contains(&arch.as_str()) {
            return Err(FerrumError::model(format!(
                "GGUF arch '{arch}' is MoE — use Qwen3MoeConfig::from_gguf or the matching MoE config builder, not LlamaFamilyConfig::from_gguf"
            )));
        }

        // All remaining keys are namespaced under the architecture name.
        let key = |suffix: &str| format!("{arch}.{suffix}");
        // Reads a u32 metadata value and widens it to usize.
        let usize_at = |suffix: &str| read_u32(gguf, &key(suffix)).map(|v| v as usize);

        // NOTE: the order of the fallible reads below determines which error a
        // caller sees first when several keys are missing; keep it stable.
        let num_layers = usize_at("block_count")?;
        let hidden_size = usize_at("embedding_length")?;
        let intermediate_size = usize_at("feed_forward_length")?;
        let num_heads = usize_at("attention.head_count")?;
        // Without an explicit KV head count, use one KV head per query head.
        let num_kv_heads = usize_at("attention.head_count_kv").unwrap_or(num_heads);
        let rms_norm_eps = read_f32(gguf, &key("attention.layer_norm_rms_epsilon"))?;
        let max_seq_len = usize_at("context_length").unwrap_or(4096);

        // Fallback RoPE base used only when the file carries no `rope.freq_base`.
        // NOTE(review): confirm these per-architecture defaults (in particular
        // the "mistral" value) against the reference model configs.
        let default_rope_theta: f64 = match arch.as_str() {
            "qwen3" | "qwen2" => 1_000_000.0,
            "llama" => 500_000.0,
            "mistral" => 10_000_000.0,
            _ => 10_000.0,
        };
        let rope_theta = read_f32(gguf, &key("rope.freq_base"))
            .map(f64::from)
            .unwrap_or(default_rope_theta);

        let has_qk_norm = arch == "qwen3";
        let sliding_window = usize_at("attention.sliding_window").unwrap_or(0);

        let vocab_size = match usize_at("vocab_size") {
            Ok(v) => v,
            Err(_) => infer_vocab_from_embed(gguf)?,
        };

        let head_dim = match usize_at("attention.key_length") {
            Ok(v) => v,
            // The derived value is only meaningful (and the division only safe)
            // when num_heads is non-zero and divides hidden_size exactly.
            Err(_) if num_heads == 0 || hidden_size % num_heads != 0 => {
                return Err(FerrumError::model(format!(
                    "GGUF config: head_dim missing AND hidden_size {hidden_size} not divisible by num_heads {num_heads}"
                )));
            }
            Err(_) => hidden_size / num_heads,
        };

        Ok(Self {
            hidden_size,
            intermediate_size,
            num_heads,
            num_kv_heads,
            head_dim,
            num_layers,
            vocab_size,
            max_seq_len,
            rms_norm_eps,
            rope_theta,
            has_qk_norm,
            sliding_window,
        })
    }
}
impl Qwen3MoeConfig {
    /// Builds a Qwen3-MoE configuration from GGUF metadata.
    ///
    /// The file's architecture must be exactly `qwen3moe`; all keys are read
    /// under the `qwen3moe.` prefix.
    ///
    /// Required keys (a failed read aborts with an error):
    /// `block_count`, `embedding_length`, `attention.head_count`,
    /// `attention.layer_norm_rms_epsilon`, `expert_count`,
    /// `expert_used_count`, `expert_feed_forward_length`.
    ///
    /// Optional keys and the value used when the read fails:
    /// - `attention.head_count_kv` → same as `attention.head_count`
    /// - `context_length` → `32768`
    /// - `rope.freq_base` → `1_000_000.0`
    /// - `vocab_size` → inferred from the `token_embd.weight` tensor shape
    /// - `attention.key_length` → `embedding_length / head_count`
    /// - `expert_norm_topk_prob` → `true`
    ///
    /// The embedded base `LlamaFamilyConfig` is created with
    /// `has_qk_norm = true`, `sliding_window = 0`, and `intermediate_size` set
    /// to the expert feed-forward length.
    ///
    /// # Errors
    ///
    /// Returns a `FerrumError::model` error when:
    /// - the architecture string cannot be read or is not `qwen3moe`;
    /// - any required key cannot be read;
    /// - `vocab_size` is unreadable and cannot be inferred from the embedding tensor;
    /// - `attention.key_length` is unreadable and `head_count` is zero or does
    ///   not evenly divide `embedding_length`;
    /// - `expert_used_count` is zero or exceeds `expert_count`.
    pub fn from_gguf(gguf: &GgufFile) -> Result<Self> {
        let arch = gguf
            .architecture()
            .map_err(|e| FerrumError::model(format!("read general.architecture: {e}")))?
            .to_string();
        if arch != "qwen3moe" {
            return Err(FerrumError::model(format!(
                "Qwen3MoeConfig::from_gguf: expected arch 'qwen3moe', got '{arch}'"
            )));
        }

        // Reads a u32 metadata value and widens it to usize.
        let usize_at = |key: &str| read_u32(gguf, key).map(|v| v as usize);

        // NOTE: the order of the fallible reads below determines which error a
        // caller sees first when several keys are missing; keep it stable.
        let num_layers = usize_at("qwen3moe.block_count")?;
        let hidden_size = usize_at("qwen3moe.embedding_length")?;
        let num_heads = usize_at("qwen3moe.attention.head_count")?;
        // Without an explicit KV head count, use one KV head per query head.
        let num_kv_heads = usize_at("qwen3moe.attention.head_count_kv").unwrap_or(num_heads);
        let rms_norm_eps = read_f32(gguf, "qwen3moe.attention.layer_norm_rms_epsilon")?;
        let max_seq_len = usize_at("qwen3moe.context_length").unwrap_or(32768);
        let rope_theta = read_f32(gguf, "qwen3moe.rope.freq_base")
            .map(f64::from)
            .unwrap_or(1_000_000.0);

        let vocab_size = match usize_at("qwen3moe.vocab_size") {
            Ok(v) => v,
            Err(_) => infer_vocab_from_embed(gguf)?,
        };

        let head_dim = match usize_at("qwen3moe.attention.key_length") {
            Ok(v) => v,
            // The derived value is only meaningful (and the division only safe)
            // when num_heads is non-zero and divides hidden_size exactly.
            Err(_) if num_heads == 0 || hidden_size % num_heads != 0 => {
                return Err(FerrumError::model(format!(
                    "GGUF Qwen3-MoE: head_dim missing AND hidden_size {hidden_size} not divisible by num_heads {num_heads}"
                )));
            }
            Err(_) => hidden_size / num_heads,
        };

        // Expert routing parameters.
        let num_experts = usize_at("qwen3moe.expert_count")?;
        let num_experts_per_tok = usize_at("qwen3moe.expert_used_count")?;
        let expert_intermediate_size = usize_at("qwen3moe.expert_feed_forward_length")?;
        let norm_topk_prob = gguf
            .metadata_bool("qwen3moe.expert_norm_topk_prob")
            .unwrap_or(true);

        // Each token must be routed to at least one expert and to no more
        // experts than exist.
        if num_experts_per_tok == 0 || num_experts_per_tok > num_experts {
            return Err(FerrumError::model(format!(
                "GGUF Qwen3-MoE: invalid expert_used_count {num_experts_per_tok} (num_experts={num_experts})"
            )));
        }

        let base = LlamaFamilyConfig {
            hidden_size,
            // The base config's feed-forward width is set to the per-expert width.
            intermediate_size: expert_intermediate_size,
            num_heads,
            num_kv_heads,
            head_dim,
            num_layers,
            vocab_size,
            max_seq_len,
            rms_norm_eps,
            rope_theta,
            has_qk_norm: true,
            sliding_window: 0,
        };

        Ok(Self::from_base(
            base,
            num_experts,
            num_experts_per_tok,
            expert_intermediate_size,
            norm_topk_prob,
        ))
    }
}
/// Reads the `u32` metadata value stored under `key`.
///
/// # Errors
///
/// Any failure reported by `GgufFile::metadata_u32` is converted into a
/// `FerrumError::model` whose message is prefixed with the offending key.
fn read_u32(gguf: &GgufFile, key: &str) -> Result<u32> {
    match gguf.metadata_u32(key) {
        Ok(value) => Ok(value),
        Err(e) => Err(FerrumError::model(format!("GGUF {key}: {e}"))),
    }
}
/// Reads the `f32` metadata value stored under `key`.
///
/// # Errors
///
/// Any failure reported by `GgufFile::metadata_f32` is converted into a
/// `FerrumError::model` whose message is prefixed with the offending key.
fn read_f32(gguf: &GgufFile, key: &str) -> Result<f32> {
    match gguf.metadata_f32(key) {
        Ok(value) => Ok(value),
        Err(e) => Err(FerrumError::model(format!("GGUF {key}: {e}"))),
    }
}
/// Derives the vocabulary size from the `token_embd.weight` tensor.
///
/// Returns the first entry of the tensor's shape as reported by
/// `info.shape.dims()`.
///
/// # Errors
///
/// Returns a `FerrumError::model` error when the tensor is not present in the
/// file or when its shape has no dimensions.
fn infer_vocab_from_embed(gguf: &GgufFile) -> Result<usize> {
    let info = match gguf.tensor_info("token_embd.weight") {
        Some(found) => found,
        None => {
            return Err(FerrumError::model(
                "GGUF: cannot infer vocab — neither <arch>.vocab_size nor token_embd.weight present",
            ));
        }
    };

    let dims = info.shape.dims();
    if !dims.is_empty() {
        // NOTE(review): assumes dims() lists the vocabulary axis first — confirm
        // against the GGUF reader's dimension ordering.
        Ok(dims[0])
    } else {
        Err(FerrumError::model(
            "GGUF: token_embd.weight has empty shape",
        ))
    }
}