ferrum-models 0.7.5

Model architectures (LLaMA, Qwen, BERT) for Ferrum inference
Documentation
//! Build a `LlamaFamilyConfig` from a GGUF file's metadata.
//!
//! GGUF model files store the architecture as `general.architecture` (string)
//! and namespace the actual hyperparameters under that prefix:
//!   `qwen3.block_count`, `qwen3.embedding_length`, `qwen3.attention.head_count`, …
//!   `llama.block_count`, `llama.attention.head_count_kv`, …
//!
//! This helper reads those metadata fields and produces the same
//! `LlamaFamilyConfig` you would otherwise build via `qwen3_from_def` /
//! `llama_from_def` from a HuggingFace `config.json`. Returning the same
//! type means downstream model construction (`LlamaFamilyModel::load`) is
//! unchanged regardless of source format.
//!
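//! Phase 1C scope: dense Llama-family architectures (qwen3 / qwen2 / llama /
//! mistral / tinyllama) via `LlamaFamilyConfig::from_gguf`. MoE-specific fields
//! (`expert_count`, `expert_used_count`) are read only by
//! `Qwen3MoeConfig::from_gguf` for the `qwen3moe` architecture; the dense path
//! rejects known MoE architectures instead of silently dropping those fields.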
//!
//! Architecture-specific notes:
//!   - **qwen3**: has QK-norm by convention; rope_theta default 1e6.
//!   - **qwen2 / qwen2.5**: no QK-norm; rope_theta default 1e6.
//!   - **llama**: no QK-norm; rope_theta default 5e5 (Llama-3.x).
//!   - **mistral**: no QK-norm; rope_theta default 1e7; reads
//!     `mistral.attention.sliding_window` if present.

use ferrum_quantization::gguf::GgufFile;
use ferrum_types::{FerrumError, Result};

use crate::models::llama_family::{LlamaFamilyConfig, RopeScalingConfig};
use crate::moe_config::Qwen3MoeConfig;

/// Architectures that are known MoE — `LlamaFamilyConfig::from_gguf` rejects
/// them with a pointer to the appropriate MoE constructor rather than
/// silently lowering them to a dense config.
///
/// Entries are compared for exact equality against the GGUF
/// `general.architecture` string. Of the names listed, only `qwen3moe` is
/// accepted by `Qwen3MoeConfig::from_gguf`; no builder for the other two is
/// visible in this module.
const KNOWN_MOE_ARCHS: &[&str] = &["qwen3moe", "mixtral", "deepseek2"];

impl LlamaFamilyConfig {
    /// Parse a dense `LlamaFamilyConfig` out of a GGUF file's metadata.
    ///
    /// Every hyperparameter is looked up under the prefix named by
    /// `general.architecture` (for example `qwen3.block_count`).
    ///
    /// Required keys: `block_count`, `embedding_length`,
    /// `feed_forward_length`, `attention.head_count` and
    /// `attention.layer_norm_rms_epsilon`.
    ///
    /// Optional keys — when the read fails for any reason the fallback is used:
    ///   - `attention.head_count_kv` → `attention.head_count`
    ///   - `context_length` → 4096
    ///   - `rope.freq_base` → 1e6 (qwen3 / qwen2), 5e5 (llama), 1e7 (mistral),
    ///     1e4 for any other architecture string
    ///   - `attention.sliding_window` → 0 (disabled)
    ///   - `vocab_size` → first dimension of the `token_embd.weight` tensor
    ///   - `attention.key_length` → `embedding_length / attention.head_count`
    ///
    /// An architecture string that is not a known MoE arch is *not* rejected
    /// by name: its keys are simply read under that prefix.
    ///
    /// # Errors
    ///
    /// Returns an error if `general.architecture` cannot be read, if the
    /// architecture is a known MoE variant (those go through
    /// [`Qwen3MoeConfig::from_gguf`] instead so the MoE-specific
    /// hyperparameters aren't silently dropped), if a required key cannot be
    /// read, if the vocab size can be neither read nor inferred, or if
    /// `attention.key_length` is absent and the head count is zero or does
    /// not divide the hidden size.
    pub fn from_gguf(gguf: &GgufFile) -> Result<Self> {
        let arch_name = gguf
            .architecture()
            .map_err(|e| FerrumError::model(format!("read general.architecture: {e}")))?
            .to_string();
        let arch = arch_name.as_str();

        if KNOWN_MOE_ARCHS.contains(&arch) {
            return Err(FerrumError::model(format!(
                "GGUF arch '{arch}' is MoE — use Qwen3MoeConfig::from_gguf or the matching MoE config builder, not LlamaFamilyConfig::from_gguf"
            )));
        }

        // All hyperparameters live under `<arch>.`; build each key in one place.
        let key = |suffix: &str| format!("{arch}.{suffix}");

        let block_count = read_u32(gguf, &key("block_count"))? as usize;
        let hidden_size = read_u32(gguf, &key("embedding_length"))? as usize;
        let intermediate_size = read_u32(gguf, &key("feed_forward_length"))? as usize;
        let num_heads = read_u32(gguf, &key("attention.head_count"))? as usize;
        // GQA models put kv-head count here; older ones omit it (= num_heads).
        let num_kv_heads =
            read_u32(gguf, &key("attention.head_count_kv")).map_or(num_heads, |v| v as usize);
        // `read_f32` already yields an `f32`; no cast needed.
        let rms_norm_eps = read_f32(gguf, &key("attention.layer_norm_rms_epsilon"))?;
        // Some GGUFs store context length, some don't — fall back to a sane
        // default rather than failing the whole config.
        let max_seq_len = read_u32(gguf, &key("context_length")).map_or(4096, |v| v as usize);

        // rope_theta: optional; per-arch defaults.
        let default_rope = match arch {
            "qwen3" | "qwen2" => 1_000_000.0_f64,
            "llama" => 500_000.0,
            "mistral" => 10_000_000.0,
            _ => 10_000.0,
        };
        let rope_theta = read_f32(gguf, &key("rope.freq_base")).map_or(default_rope, f64::from);
        let rope_scaling = infer_llama3_rope_scaling(gguf, arch, max_seq_len, rope_theta);

        // QK-norm: only Qwen3 has it among supported architectures.
        let has_qk_norm = matches!(arch, "qwen3");
        let rope_interleaved = matches!(arch, "llama");

        // Sliding window: only Mistral v0.1 sets it; missing → 0 (disabled).
        let sliding_window =
            read_u32(gguf, &key("attention.sliding_window")).map_or(0, |v| v as usize);

        // Vocab size: prefer arch-scoped, fall back to embed-table row count.
        let vocab_size = read_u32(gguf, &key("vocab_size"))
            .map(|v| v as usize)
            .or_else(|_| infer_vocab_from_embed(gguf))?;

        // head_dim: prefer the explicit `<arch>.attention.key_length` key
        // when present — Qwen3-MoE / Qwen3-30B-A3B has hidden_size=2048
        // but num_heads*head_dim=4096 (head_dim=128), so the
        // `hidden_size / num_heads` shortcut is WRONG for that family.
        // Fall back to the divide only for older GGUFs that omit the key.
        let head_dim = match read_u32(gguf, &key("attention.key_length")) {
            Ok(v) => v as usize,
            Err(_) => {
                // Guard the division: zero heads or a non-integral quotient
                // cannot produce a usable head_dim.
                if num_heads == 0 || hidden_size % num_heads != 0 {
                    return Err(FerrumError::model(format!(
                        "GGUF config: head_dim missing AND hidden_size {hidden_size} not divisible by num_heads {num_heads}"
                    )));
                }
                hidden_size / num_heads
            }
        };

        Ok(LlamaFamilyConfig {
            hidden_size,
            intermediate_size,
            num_heads,
            num_kv_heads,
            head_dim,
            num_layers: block_count,
            vocab_size,
            max_seq_len,
            rms_norm_eps,
            rope_theta,
            rope_scaling,
            rope_interleaved,
            has_qk_norm,
            sliding_window,
        })
    }
}

impl Qwen3MoeConfig {
    /// Parse a `Qwen3MoeConfig` out of a GGUF file's metadata.
    ///
    /// Expects `general.architecture == "qwen3moe"`. Reads the shared
    /// transformer dimensions from the `qwen3moe.*` namespace plus the
    /// MoE-specific extras, then assembles the result via `Self::from_base`.
    ///
    /// Required keys: `block_count`, `embedding_length`,
    /// `attention.head_count`, `attention.layer_norm_rms_epsilon`,
    /// `expert_count`, `expert_used_count` and `expert_feed_forward_length`.
    ///
    /// Optional keys — when the read fails for any reason the fallback is used:
    ///   - `attention.head_count_kv` → `attention.head_count`
    ///   - `context_length` → 32768
    ///   - `rope.freq_base` → 1e6
    ///   - `vocab_size` → first dimension of the `token_embd.weight` tensor
    ///   - `attention.key_length` → `embedding_length / attention.head_count`
    ///   - `expert_norm_topk_prob` → `true`
    ///
    /// The base config always has **QK-norm** enabled, no RoPE scaling,
    /// non-interleaved RoPE and no sliding window. Its `intermediate_size`
    /// mirrors the per-expert FFN length.
    ///
    /// # Errors
    ///
    /// Returns an error if `general.architecture` cannot be read or is not
    /// `"qwen3moe"`, if a required key cannot be read, if the vocab size can
    /// be neither read nor inferred, if `attention.key_length` is absent and
    /// the head count is zero or does not divide the hidden size, or if
    /// `expert_used_count` is zero or exceeds `expert_count`.
    pub fn from_gguf(gguf: &GgufFile) -> Result<Self> {
        let arch = gguf
            .architecture()
            .map_err(|e| FerrumError::model(format!("read general.architecture: {e}")))?
            .to_string();
        if arch != "qwen3moe" {
            return Err(FerrumError::model(format!(
                "Qwen3MoeConfig::from_gguf: expected arch 'qwen3moe', got '{arch}'"
            )));
        }

        // Reuse the same key conventions as the dense path — qwen3moe.*
        // mirrors qwen3.* exactly for the shared transformer dims.
        let num_layers = read_u32(gguf, "qwen3moe.block_count")? as usize;
        let hidden_size = read_u32(gguf, "qwen3moe.embedding_length")? as usize;
        let num_heads = read_u32(gguf, "qwen3moe.attention.head_count")? as usize;
        let num_kv_heads =
            read_u32(gguf, "qwen3moe.attention.head_count_kv").map_or(num_heads, |v| v as usize);
        let rms_norm_eps = read_f32(gguf, "qwen3moe.attention.layer_norm_rms_epsilon")?;
        let max_seq_len = read_u32(gguf, "qwen3moe.context_length").map_or(32768, |v| v as usize);
        let rope_theta = read_f32(gguf, "qwen3moe.rope.freq_base").map_or(1_000_000.0, f64::from);
        let vocab_size = read_u32(gguf, "qwen3moe.vocab_size")
            .map(|v| v as usize)
            .or_else(|_| infer_vocab_from_embed(gguf))?;

        // Prefer the explicit `qwen3moe.attention.key_length` GGUF key. For
        // Qwen3-30B-A3B specifically, hidden=2048 but head_dim=128
        // (num_heads*head_dim=4096 ≠ hidden), so the shortcut divide
        // gives 64 → wrong q_dim → garbage attention output.
        let head_dim = match read_u32(gguf, "qwen3moe.attention.key_length") {
            Ok(v) => v as usize,
            Err(_) => {
                // Guard the division: zero heads or a non-integral quotient
                // cannot produce a usable head_dim.
                if num_heads == 0 || hidden_size % num_heads != 0 {
                    return Err(FerrumError::model(format!(
                        "GGUF Qwen3-MoE: head_dim missing AND hidden_size {hidden_size} not divisible by num_heads {num_heads}"
                    )));
                }
                hidden_size / num_heads
            }
        };

        // MoE-specific keys.
        let num_experts = read_u32(gguf, "qwen3moe.expert_count")? as usize;
        let num_experts_per_tok = read_u32(gguf, "qwen3moe.expert_used_count")? as usize;
        // Per-expert FFN length — distinct from the legacy `feed_forward_length`
        // (which most qwen3moe GGUFs leave as the dense reference value).
        let expert_intermediate_size =
            read_u32(gguf, "qwen3moe.expert_feed_forward_length")? as usize;
        // Whether the router normalises the top-K logits before combining.
        // Qwen3-MoE: yes. Some legacy GGUFs omit this key.
        let norm_topk_prob = gguf
            .metadata_bool("qwen3moe.expert_norm_topk_prob")
            .unwrap_or(true);

        // Routing to zero experts, or to more experts than exist, is invalid.
        // This also rejects `expert_count == 0`.
        if num_experts_per_tok == 0 || num_experts_per_tok > num_experts {
            return Err(FerrumError::model(format!(
                "GGUF Qwen3-MoE: invalid expert_used_count {num_experts_per_tok} (num_experts={num_experts})"
            )));
        }

        let base = LlamaFamilyConfig {
            hidden_size,
            // Qwen3-MoE has no shared dense FFN; mirror expert size into
            // base for any code that reads `intermediate_size`.
            intermediate_size: expert_intermediate_size,
            num_heads,
            num_kv_heads,
            head_dim,
            num_layers,
            vocab_size,
            max_seq_len,
            rms_norm_eps,
            rope_theta,
            rope_scaling: None,
            rope_interleaved: false,
            // Qwen3-MoE uses QK-norm exactly like dense Qwen3.
            has_qk_norm: true,
            // No sliding window in Qwen3-MoE.
            sliding_window: 0,
        };

        Ok(Self::from_base(
            base,
            num_experts,
            num_experts_per_tok,
            expert_intermediate_size,
            norm_topk_prob,
        ))
    }
}

/// Read the `u32` metadata value stored under `key`.
///
/// Any failure reported by `GgufFile::metadata_u32` is re-wrapped as a
/// model error whose message is prefixed with the offending key.
fn read_u32(gguf: &GgufFile, key: &str) -> Result<u32> {
    match gguf.metadata_u32(key) {
        Ok(value) => Ok(value),
        Err(e) => Err(FerrumError::model(format!("GGUF {key}: {e}"))),
    }
}

/// Read the `f32` metadata value stored under `key`.
///
/// Any failure reported by `GgufFile::metadata_f32` is re-wrapped as a
/// model error whose message is prefixed with the offending key.
fn read_f32(gguf: &GgufFile, key: &str) -> Result<f32> {
    match gguf.metadata_f32(key) {
        Ok(value) => Ok(value),
        Err(e) => Err(FerrumError::model(format!("GGUF {key}: {e}"))),
    }
}

/// Decide whether the default Llama-3 RoPE scaling should be attached.
///
/// Returns `None` unless all of the following hold:
///   - `arch` is exactly `"llama"`,
///   - `max_seq_len` is at least 65_536,
///   - `rope_theta` is not more than 1.0 away from 500_000,
///   - the lower-cased, space-joined text of the readable `general.*`
///     string fields listed below mentions Llama 3.1, 3.2 or 3.3.
///
/// When they do, the result is `RopeScalingConfig::llama3_default()`.
fn infer_llama3_rope_scaling(
    gguf: &GgufFile,
    arch: &str,
    max_seq_len: usize,
    rope_theta: f64,
) -> Option<RopeScalingConfig> {
    // Free-text metadata fields that may carry the model family name.
    const NAME_KEYS: [&str; 4] = [
        "general.basename",
        "general.name",
        "general.license",
        "general.finetune",
    ];
    // Spellings (already lower-case) that identify a Llama 3.1/3.2/3.3 model.
    const LONG_CONTEXT_TAGS: [&str; 9] = [
        "llama-3.1",
        "llama 3.1",
        "llama3.1",
        "llama-3.2",
        "llama 3.2",
        "llama3.2",
        "llama-3.3",
        "llama 3.3",
        "llama3.3",
    ];

    // Cheap numeric/arch gates first; metadata strings are only read when
    // the file already looks like a long-context Llama-3 checkpoint.
    let wrong_arch = arch != "llama";
    let short_context = max_seq_len < 65_536;
    let theta_mismatch = (rope_theta - 500_000.0).abs() > 1.0;
    if wrong_arch || short_context || theta_mismatch {
        return None;
    }

    // Gather whichever fields are readable; unreadable ones are skipped.
    let mut fields = Vec::with_capacity(NAME_KEYS.len());
    for key in &NAME_KEYS {
        if let Ok(text) = gguf.metadata_string(key) {
            fields.push(text);
        }
    }
    let haystack = fields.join(" ").to_ascii_lowercase();

    let tagged = LONG_CONTEXT_TAGS.iter().any(|tag| haystack.contains(tag));
    tagged.then(RopeScalingConfig::llama3_default)
}

/// Fallback vocab-size lookup for GGUF dumps that do not record
/// `<arch>.vocab_size` in metadata.
///
/// Returns the first dimension of the `token_embd.weight` tensor shape,
/// which is taken to be the number of rows of the embedding table.
///
/// # Errors
///
/// Fails if the `token_embd.weight` tensor is absent or its shape has no
/// dimensions.
fn infer_vocab_from_embed(gguf: &GgufFile) -> Result<usize> {
    let embed = match gguf.tensor_info("token_embd.weight") {
        Some(info) => info,
        None => {
            return Err(FerrumError::model(
                "GGUF: cannot infer vocab — neither <arch>.vocab_size nor token_embd.weight present",
            ));
        }
    };

    match embed.shape.dims().first() {
        Some(&rows) => Ok(rows),
        None => Err(FerrumError::model(
            "GGUF: token_embd.weight has empty shape",
        )),
    }
}