aprender-core 0.34.0

impl Architecture {
    /// Map a source tensor name to APR canonical name
    #[must_use]
    pub fn map_name(&self, source_name: &str) -> String {
        match self {
            Self::Auto => Self::auto_map_name(source_name),
            Self::Whisper => Self::whisper_map_name(source_name),
            Self::Llama => Self::llama_map_name(source_name),
            Self::Bert => Self::bert_map_name(source_name),
            Self::Qwen2 => Self::qwen2_map_name(source_name),
            Self::Qwen3 => Self::qwen2_map_name(source_name), // Qwen3 uses same GGUF naming as Qwen2
            Self::Qwen3_5 => Self::qwen2_map_name(source_name), // Qwen3.5 uses same tensor naming as Qwen2
            Self::Gpt2 => Self::gpt2_map_name(source_name),
            Self::Phi => Self::llama_map_name(source_name), // Phi uses HuggingFace model.layers naming
            Self::GptNeoX => Self::gpt_neox_map_name(source_name),
            Self::Opt => Self::opt_map_name(source_name),
            // PMAT-526: New architectures — use LLaMA-like naming (HuggingFace model.layers)
            Self::DeepSeek => Self::llama_map_name(source_name),
            Self::Gemma => Self::llama_map_name(source_name),
            Self::Mistral => Self::llama_map_name(source_name),
            // PMAT-546: Model-family parity — new architectures
            Self::FalconH1 => Self::llama_map_name(source_name), // HuggingFace model.layers naming
            Self::OpenElm => Self::llama_map_name(source_name),  // HuggingFace model.layers naming
            Self::Moonshine => Self::whisper_map_name(source_name), // Audio model, strip model. prefix
            Self::Mamba => Self::auto_map_name(source_name), // SSM: mixer.* naming, passthrough
            Self::Rwkv7 => Self::auto_map_name(source_name), // Recurrence: rwkv.blocks.* naming, passthrough
            // GH-1587: Falcon classic uses `transformer.h.N.*` naming with
            // fused QKV (single MQ head in 7B, MGQA in 40B+).
            Self::FalconClassic => Self::falcon_classic_map_name(source_name),
            // GH-1589: InternLM2 uses LLaMA-style `model.layers.N.*` but with
            // distinct subtree names (attention.wqkv / wo / feed_forward.w1/w2/w3 /
            // attention_norm / ffn_norm / tok_embeddings / output).
            Self::InternLm2 => Self::internlm2_map_name(source_name),
            // GH-1586: BLOOM's HuggingFace names diverge from the LLaMA pattern
            // (word_embeddings vs embed_tokens; h.N.* vs model.layers.N.*).
            Self::Bloom => Self::bloom_map_name(source_name),
        }
    }

    /// PMAT-224: Report whether inference is verified for this architecture.
    ///
    /// `true` is returned only for architectures whose tensor name mapping has
    /// been tested and whose realizar inference compatibility is confirmed.
    ///
    /// `Bert` joined the verified set in GH-326 Phase 4b, once end-to-end
    /// numerical parity with the HuggingFace reference was shown for
    /// `cross-encoder/ms-marco-MiniLM-L-6-v2`:
    ///
    /// | pair | HF score | apr score | input_ids match |
    /// |---|---|---|---|
    /// | France/Paris | 0.999805 | 0.999805 | ✅ exact |
    /// | France/Cats  | 0.000015 | 0.000015 | ✅ exact |
    /// | ML/neural    | 0.000020 | 0.000020 | ✅ exact |
    ///
    /// Raw logits differ by < 4e-4 (f32 round-off), sigmoid scores agree to
    /// 6 decimal places, and WordPiece tokenization is bit-identical.
    #[must_use]
    pub fn is_inference_verified(&self) -> bool {
        matches!(
            self,
            Self::Llama | Self::Phi | Self::Bert | Self::Qwen2 | Self::Qwen3 | Self::Qwen3_5
        )
    }

    /// PMAT-526: Whether this architecture is a decoder-only LLM that uses a BPE
    /// tokenizer and supports chat templates.
    ///
    /// Audio models (Whisper, Moonshine), encoder-only models (BERT) and `Auto`
    /// (indeterminate) yield `false`.
    #[must_use]
    pub fn is_llm(&self) -> bool {
        matches!(
            self,
            // LLaMA and the families mapped with LLaMA-style naming
            Self::Llama
                | Self::Mistral
                | Self::Gemma
                | Self::DeepSeek
                | Self::Phi
                | Self::OpenElm
                | Self::InternLm2
                // Qwen family
                | Self::Qwen2
                | Self::Qwen3
                | Self::Qwen3_5
                // GPT-style families
                | Self::Gpt2
                | Self::GptNeoX
                | Self::Opt
                | Self::Bloom
                // Falcon family
                | Self::FalconClassic
                | Self::FalconH1
                // SSM / recurrence models
                | Self::Mamba
                | Self::Rwkv7
        )
    }

    /// GH-279: Key used to look this architecture up in
    /// `enforce_architecture_completeness()`.
    ///
    /// The returned lowercase key matches the architecture match table in
    /// `layout_contract_part_03.rs::enforce_architecture_completeness()`.
    /// `None` means completeness checking does not apply to this architecture
    /// (e.g., Whisper, BERT, GPT-2 have different tensor naming).
    #[must_use]
    pub fn completeness_key(&self) -> Option<&'static str> {
        let key = match self {
            Self::Llama => "llama",
            Self::Qwen2 => "qwen2",
            Self::Qwen3 => "qwen3",
            // Kept separate from "qwen3": Qwen3.5 has no QK norm.
            Self::Qwen3_5 => "qwen3_5",
            Self::Phi => "phi",
            // Every other architecture is exempt from the completeness check.
            _ => return None,
        };
        Some(key)
    }

    /// PMAT-224: Human-readable architecture name, intended for warning messages.
    #[must_use]
    pub fn display_name(&self) -> &'static str {
        match self {
            Self::Auto => "auto-detected",

            // Audio models
            Self::Whisper => "Whisper",
            Self::Moonshine => "Moonshine",

            // Encoder-only
            Self::Bert => "BERT",

            // LLaMA and LLaMA-style families
            Self::Llama => "LLaMA",
            Self::Mistral => "Mistral",
            Self::Gemma => "Gemma",
            Self::DeepSeek => "DeepSeek",
            Self::Phi => "Phi",
            Self::OpenElm => "OpenELM",
            Self::InternLm2 => "InternLM2",

            // Qwen family
            Self::Qwen2 => "Qwen2",
            Self::Qwen3 => "Qwen3",
            Self::Qwen3_5 => "Qwen3.5",

            // GPT-style families
            Self::Gpt2 => "GPT-2",
            Self::GptNeoX => "GPT-NeoX",
            Self::Opt => "OPT",
            Self::Bloom => "BLOOM",

            // Falcon family
            Self::FalconClassic => "Falcon",
            Self::FalconH1 => "Falcon-H1",

            // SSM / recurrence models
            Self::Mamba => "Mamba",
            Self::Rwkv7 => "RWKV-7",
        }
    }

    /// Resolve a `model_type` string (from config.json or GGUF metadata) to an
    /// `Architecture`.
    ///
    /// Matching is done on the lowercased input. Unrecognized types give `None`.
    /// This is the single mapping shared by `infer_architecture()` (import.rs)
    /// and `detect_gguf_architecture()` (export.rs).
    #[must_use]
    pub fn from_model_type(model_type: &str) -> Option<Self> {
        let normalized = model_type.to_lowercase();

        let arch = match normalized.as_str() {
            "qwen2" | "qwen" | "qwen2.5" => Self::Qwen2,
            "qwen3" => Self::Qwen3,
            "qwen3_5" | "qwen3.5" => Self::Qwen3_5,

            // LLaMA itself plus derivatives that use LLaMA tensor naming
            // (GH-1591 OLMo / GH-1592 StableLM are treated as Llama-family).
            "llama" | "llama2" | "llama3" | "smollm" | "smollm2" | "granite" | "granite3"
            | "nemotron" | "olmo" | "olmo2" | "stablelm" | "stablelm_epoch"
            | "stablelm_alpha" => Self::Llama,

            "whisper" => Self::Whisper,
            "bert" => Self::Bert,

            // GPT-2, plus StarCoder / GPTBigCode which reuse GPT-2 tensor
            // naming (GH-311 / GH-1594).
            "gpt2" | "starcoder" | "starcoder2" | "bigcode" | "gpt_bigcode" | "gpt-bigcode" => {
                Self::Gpt2
            }

            "phi" | "phi3" | "phi4" => Self::Phi,

            // GH-311: GPT-NeoX family (EleutherAI)
            "gpt-neox" | "gpt_neox" | "gptneox" | "pythia" => Self::GptNeoX,
            // GH-311: Meta OPT family
            "opt" | "galactica" => Self::Opt,

            // PMAT-526: dedicated variants for major model families
            "deepseek" | "deepseek_v2" | "deepseek-v2" => Self::DeepSeek,
            "gemma" | "gemma2" | "gemma3" => Self::Gemma,
            "mistral" | "mixtral" => Self::Mistral,

            // PMAT-546: model-family parity variants
            "falcon_h1" | "falcon-h1" | "falconh1" | "falcon3" => Self::FalconH1,
            "mamba" | "mamba2" => Self::Mamba,
            "moonshine" => Self::Moonshine,
            "openelm" => Self::OpenElm,
            "rwkv" | "rwkv7" | "rwkv-7" => Self::Rwkv7,

            // GH-1587: Falcon classic — distinct from FalconH1 (hybrid SSM).
            "falcon" | "falcon7b" | "falcon40b" | "falcon11b" | "refinedweb" => {
                Self::FalconClassic
            }
            // GH-1589: InternLM2 / InternLM2.5 — distinct from LLaMA (renamed
            // subtrees: wqkv/wo/w1/w2/w3 vs q/k/v/o + gate/up/down).
            "internlm2" | "internlm2_5" | "internlm2.5" => Self::InternLm2,
            // GH-1586: BLOOM — distinct from LLaMA (ALiBi position bias, fused QKV).
            "bloom" | "bloomz" => Self::Bloom,

            _ => return None,
        };

        Some(arch)
    }

    /// Passthrough mapping: returns an owned copy of `name`, unchanged.
    ///
    /// PMAT-099: original tensor names are kept for AprTransformer compatibility;
    /// `AprTransformer::from_apr_bytes` expects `model.*` prefixes for
    /// HuggingFace models.
    fn auto_map_name(name: &str) -> String {
        name.to_owned()
    }

    /// Drop one leading `model.` from `name`; names without it are returned unchanged.
    ///
    /// GH-577: needed for whisper-apr compatibility. whisper-apr's
    /// `load_from_apr()` expects `encoder.` / `decoder.` prefixes with no
    /// `model.` in front (matching whisper-apr's own `map_tensor_name`).
    fn whisper_map_name(name: &str) -> String {
        name.strip_prefix("model.").unwrap_or(name).to_owned()
    }

    /// LLaMA-style mapping: the name is returned unchanged.
    ///
    /// PMAT-099: the `model.` prefix is deliberately preserved for LLaMA.
    fn llama_map_name(name: &str) -> String {
        String::from(name)
    }

    /// BERT mapping: the name is returned unchanged, so the `bert.` prefix
    /// used by BERT tensors is preserved.
    fn bert_map_name(name: &str) -> String {
        name.into()
    }

    /// Map GGUF-style Qwen2 tensor names to the `model.`-prefixed names that
    /// match the SafeTensors convention.
    ///
    /// Example: `blk.N.attn_q.weight` → `model.layers.N.self_attn.q_proj.weight`.
    ///
    /// Prefix history: PMAT-113 added the `model.` prefix, PMAT-205 (GH-190)
    /// switched to bare names, and PMAT-222 restored the prefix because
    /// realizar's AprTransformer looks tensors up as `model.layers.N.*` /
    /// `model.embed_tokens.weight`. The current behavior is the PMAT-222 one.
    ///
    /// Names that are not recognized are returned unchanged; unknown per-layer
    /// suffixes are kept as-is under `model.layers.N.`.
    fn qwen2_map_name(name: &str) -> String {
        // Per-layer tensors look like `blk.<N>.<suffix>`; split at the first dot
        // after the prefix. A `blk.` name with no further dot is not treated as
        // a layer tensor and falls through to the table below.
        let per_layer = name
            .strip_prefix("blk.")
            .and_then(|tail| tail.split_once('.'));

        if let Some((layer_idx, gguf_suffix)) = per_layer {
            let hf_suffix = match gguf_suffix {
                // Attention projections
                "attn_q.weight" => "self_attn.q_proj.weight",
                "attn_q.bias" => "self_attn.q_proj.bias",
                "attn_k.weight" => "self_attn.k_proj.weight",
                "attn_k.bias" => "self_attn.k_proj.bias",
                "attn_v.weight" => "self_attn.v_proj.weight",
                "attn_v.bias" => "self_attn.v_proj.bias",
                "attn_output.weight" => "self_attn.o_proj.weight",
                "attn_output.bias" => "self_attn.o_proj.bias",
                // GH-279: Qwen3 QK normalization tensors
                "attn_q_norm.weight" => "self_attn.q_norm.weight",
                "attn_k_norm.weight" => "self_attn.k_norm.weight",
                // Norms
                "attn_norm.weight" => "input_layernorm.weight",
                "ffn_norm.weight" => "post_attention_layernorm.weight",
                // MLP
                "ffn_gate.weight" => "mlp.gate_proj.weight",
                "ffn_up.weight" => "mlp.up_proj.weight",
                "ffn_down.weight" => "mlp.down_proj.weight",
                // Unknown suffixes are preserved.
                unknown => unknown,
            };

            return format!("model.layers.{layer_idx}.{hf_suffix}");
        }

        // Non-layer tensors.
        let mapped = match name {
            "token_embd.weight" => "model.embed_tokens.weight",
            "output.weight" => "lm_head.weight",
            "output_norm.weight" => "model.norm.weight",
            // Unknown names are preserved.
            unknown => unknown,
        };
        mapped.to_owned()
    }

    /// GH-311: Translate GPT-NeoX tensor names into APR canonical names.
    ///
    /// Per-layer tensors named `gpt_neox.layers.N.*` become `model.layers.N.*`.
    /// The fused `query_key_value` tensor keeps its fused form here (only the
    /// `attention.` subtree is renamed to `self_attn.`); it is split afterwards
    /// by `split_neox_fused_qkv()`.
    ///
    /// Anything not listed in the tables below is passed through unchanged.
    fn gpt_neox_map_name(name: &str) -> String {
        // `gpt_neox.layers.<N>.<suffix>`, split at the first dot after the prefix.
        let per_layer = name
            .strip_prefix("gpt_neox.layers.")
            .and_then(|tail| tail.split_once('.'));

        if let Some((layer_idx, neox_suffix)) = per_layer {
            let canonical = match neox_suffix {
                // Fused QKV — preserved, split later by split_neox_fused_qkv()
                "attention.query_key_value.weight" => "self_attn.query_key_value.weight",
                "attention.query_key_value.bias" => "self_attn.query_key_value.bias",
                "attention.dense.weight" => "self_attn.o_proj.weight",
                "attention.dense.bias" => "self_attn.o_proj.bias",
                // GPT-NeoX MLP (no gate projection — uses dense_h_to_4h / dense_4h_to_h)
                "mlp.dense_h_to_4h.weight" => "mlp.up_proj.weight",
                "mlp.dense_h_to_4h.bias" => "mlp.up_proj.bias",
                "mlp.dense_4h_to_h.weight" => "mlp.down_proj.weight",
                "mlp.dense_4h_to_h.bias" => "mlp.down_proj.bias",
                // `input_layernorm.{weight,bias}` and
                // `post_attention_layernorm.{weight,bias}` already have their
                // canonical names; they and any unknown suffix pass through.
                unchanged => unchanged,
            };

            return format!("model.layers.{layer_idx}.{canonical}");
        }

        // Non-layer tensors.
        let mapped = match name {
            "gpt_neox.embed_in.weight" => "model.embed_tokens.weight",
            "gpt_neox.final_layer_norm.weight" => "model.norm.weight",
            "gpt_neox.final_layer_norm.bias" => "model.norm.bias",
            "embed_out.weight" => "lm_head.weight",
            unchanged => unchanged,
        };
        mapped.to_owned()
    }

    /// GH-311: Translate OPT tensor names into APR canonical names.
    ///
    /// OPT names its per-layer tensors `model.decoder.layers.N.*` and keeps
    /// separate Q/K/V projections (no fusion). Per-layer tensors are emitted as
    /// `model.layers.N.*`; anything not listed below passes through unchanged.
    fn opt_map_name(name: &str) -> String {
        // `model.decoder.layers.<N>.<suffix>`, split at the first dot after the prefix.
        let per_layer = name
            .strip_prefix("model.decoder.layers.")
            .and_then(|tail| tail.split_once('.'));

        if let Some((layer_idx, opt_suffix)) = per_layer {
            let canonical = match opt_suffix {
                "self_attn_layer_norm.weight" => "input_layernorm.weight",
                "self_attn_layer_norm.bias" => "input_layernorm.bias",
                "final_layer_norm.weight" => "post_attention_layernorm.weight",
                "final_layer_norm.bias" => "post_attention_layernorm.bias",
                "self_attn.out_proj.weight" => "self_attn.o_proj.weight",
                "self_attn.out_proj.bias" => "self_attn.o_proj.bias",
                // OPT MLP (fc1 = up, fc2 = down, no gate)
                "fc1.weight" => "mlp.up_proj.weight",
                "fc1.bias" => "mlp.up_proj.bias",
                "fc2.weight" => "mlp.down_proj.weight",
                "fc2.bias" => "mlp.down_proj.bias",
                // `self_attn.{q,k,v}_proj.{weight,bias}` are already canonical;
                // they and any unknown suffix pass through.
                unchanged => unchanged,
            };

            return format!("model.layers.{layer_idx}.{canonical}");
        }

        // Non-layer tensors. `lm_head.weight` is already canonical and is
        // covered by the passthrough arm.
        let mapped = match name {
            "model.decoder.embed_tokens.weight" => "model.embed_tokens.weight",
            "model.decoder.embed_positions.weight" => "model.position_embedding.weight",
            "model.decoder.final_layer_norm.weight" => "model.norm.weight",
            "model.decoder.final_layer_norm.bias" => "model.norm.bias",
            unchanged => unchanged,
        };
        mapped.to_owned()
    }

    /// GH-1587: Translate Falcon classic tensor names into APR canonical names.
    ///
    /// Falcon (`FalconForCausalLM`; 7B/40B/11B and the RefinedWeb variants)
    /// uses HuggingFace `transformer.h.N.*` naming with:
    /// - Fused QKV (`self_attention.query_key_value`): MQA (single K/V head)
    ///   in 7B, MGQA (8 K/V groups) in 40B
    /// - RoPE position encoding (no positional-embedding tensor)
    /// - Parallel attn+mlp residuals (40B+ has separate norms; 7B uses one)
    /// - LayerNorm: a single `input_layernorm` per block in Falcon-7B; both
    ///   `ln_attn` and `ln_mlp` in 40B+
    ///
    /// HuggingFace → APR mapping:
    /// - `transformer.word_embeddings.weight`           → `model.embed_tokens.weight`
    /// - `transformer.h.N.input_layernorm.{w,b}`        → `model.layers.N.input_layernorm.{w,b}`
    /// - `transformer.h.N.ln_attn.{w,b}` (40B)          → `model.layers.N.input_layernorm.{w,b}`
    /// - `transformer.h.N.ln_mlp.{w,b}` (40B)           → `model.layers.N.post_attention_layernorm.{w,b}`
    /// - `transformer.h.N.self_attention.query_key_value.{w,b}` → `model.layers.N.self_attn.qkv_proj.{w,b}` (fused)
    /// - `transformer.h.N.self_attention.dense.{w,b}`   → `model.layers.N.self_attn.o_proj.{w,b}`
    /// - `transformer.h.N.mlp.dense_h_to_4h.{w,b}`      → `model.layers.N.mlp.up_proj.{w,b}`
    /// - `transformer.h.N.mlp.dense_4h_to_h.{w,b}`      → `model.layers.N.mlp.down_proj.{w,b}`
    /// - `transformer.ln_f.{w,b}`                       → `model.norm.{w,b}`
    /// - `lm_head.weight`                               → `lm_head.weight`
    ///
    /// Names outside these tables are passed through unchanged.
    fn falcon_classic_map_name(name: &str) -> String {
        // `transformer.h.<N>.<suffix>`, split at the first dot after the prefix.
        let per_layer = name
            .strip_prefix("transformer.h.")
            .and_then(|tail| tail.split_once('.'));

        if let Some((layer_idx, falcon_suffix)) = per_layer {
            let canonical = match falcon_suffix {
                // Falcon-40B: separate attn + mlp layernorms
                "ln_attn.weight" => "input_layernorm.weight",
                "ln_attn.bias" => "input_layernorm.bias",
                "ln_mlp.weight" => "post_attention_layernorm.weight",
                "ln_mlp.bias" => "post_attention_layernorm.bias",
                // Fused QKV — kept fused; the splitter at the conversion layer
                // handles the MQA/MGQA-specific Q/K/V layout.
                "self_attention.query_key_value.weight" => "self_attn.qkv_proj.weight",
                "self_attention.query_key_value.bias" => "self_attn.qkv_proj.bias",
                "self_attention.dense.weight" => "self_attn.o_proj.weight",
                "self_attention.dense.bias" => "self_attn.o_proj.bias",
                "mlp.dense_h_to_4h.weight" => "mlp.up_proj.weight",
                "mlp.dense_h_to_4h.bias" => "mlp.up_proj.bias",
                "mlp.dense_4h_to_h.weight" => "mlp.down_proj.weight",
                "mlp.dense_4h_to_h.bias" => "mlp.down_proj.bias",
                // `input_layernorm.{weight,bias}` (Falcon-7B single norm) and
                // `post_attention_layernorm.{weight,bias}` (some variants) are
                // already canonical; they and any unknown suffix pass through.
                unchanged => unchanged,
            };

            return format!("model.layers.{layer_idx}.{canonical}");
        }

        // Non-layer tensors. `lm_head.weight` is already canonical and is
        // covered by the passthrough arm.
        let mapped = match name {
            "transformer.word_embeddings.weight" => "model.embed_tokens.weight",
            "transformer.ln_f.weight" => "model.norm.weight",
            "transformer.ln_f.bias" => "model.norm.bias",
            unchanged => unchanged,
        };
        mapped.to_owned()
    }

    /// GH-1589: Translate InternLM2 / InternLM2.5 tensor names into APR canonical names.
    ///
    /// InternLM2 (`InternLM2ForCausalLM`) is a LLaMA derivative with renamed
    /// subtrees:
    /// - `model.tok_embeddings.weight`                  → `model.embed_tokens.weight`
    /// - `model.layers.N.attention.wqkv.weight`         → `model.layers.N.self_attn.qkv_proj.weight` (fused)
    /// - `model.layers.N.attention.wo.weight`           → `model.layers.N.self_attn.o_proj.weight`
    /// - `model.layers.N.feed_forward.w1.weight`        → `model.layers.N.mlp.gate_proj.weight`
    /// - `model.layers.N.feed_forward.w2.weight`        → `model.layers.N.mlp.down_proj.weight`
    /// - `model.layers.N.feed_forward.w3.weight`        → `model.layers.N.mlp.up_proj.weight`
    /// - `model.layers.N.attention_norm.weight`         → `model.layers.N.input_layernorm.weight`
    /// - `model.layers.N.ffn_norm.weight`               → `model.layers.N.post_attention_layernorm.weight`
    /// - `model.norm.weight`                            → `model.norm.weight` (passthrough)
    /// - `output.weight`                                → `lm_head.weight`
    ///
    /// The fused QKV tensor stays fused here. The splitter must run at the
    /// conversion layer because InternLM2 packs Q/K/V interleaved per GQA group
    /// (8 K/V groups in both the 7B/2.5-7B and 20B models).
    ///
    /// Names outside these tables are passed through unchanged.
    fn internlm2_map_name(name: &str) -> String {
        // `model.layers.<N>.<suffix>`, split at the first dot after the prefix.
        let per_layer = name
            .strip_prefix("model.layers.")
            .and_then(|tail| tail.split_once('.'));

        if let Some((layer_idx, intern_suffix)) = per_layer {
            let canonical = match intern_suffix {
                "attention.wqkv.weight" => "self_attn.qkv_proj.weight",
                "attention.wqkv.bias" => "self_attn.qkv_proj.bias",
                "attention.wo.weight" => "self_attn.o_proj.weight",
                "attention.wo.bias" => "self_attn.o_proj.bias",
                "attention_norm.weight" => "input_layernorm.weight",
                "ffn_norm.weight" => "post_attention_layernorm.weight",
                // InternLM2 MLP: w1=gate, w2=down, w3=up (SwiGLU)
                "feed_forward.w1.weight" => "mlp.gate_proj.weight",
                "feed_forward.w2.weight" => "mlp.down_proj.weight",
                "feed_forward.w3.weight" => "mlp.up_proj.weight",
                unchanged => unchanged,
            };

            return format!("model.layers.{layer_idx}.{canonical}");
        }

        // Non-layer tensors. `model.norm.weight` is already canonical and is
        // covered by the passthrough arm.
        let mapped = match name {
            "model.tok_embeddings.weight" => "model.embed_tokens.weight",
            "output.weight" => "lm_head.weight",
            unchanged => unchanged,
        };
        mapped.to_owned()
    }

    /// GH-1586: Translate BLOOM tensor names into APR canonical names.
    ///
    /// BLOOM (`BloomForCausalLM`) uses HuggingFace `h.N.*` naming, with fused
    /// QKV in `self_attention.query_key_value` and ALiBi position bias (no
    /// positional-embedding tensor). The fused QKV tensor is RENAMED here but
    /// not split — the splitter must run at the conversion layer because BLOOM
    /// packs Q/K/V interleaved per head, not concatenated.
    ///
    /// HuggingFace → APR mapping:
    /// - `word_embeddings.weight`                       → `model.embed_tokens.weight`
    /// - `word_embeddings_layernorm.weight/bias`        → `model.embed_norm.{w,b}`
    /// - `h.N.input_layernorm.{w,b}`                    → `model.layers.N.input_layernorm.{w,b}`
    /// - `h.N.self_attention.query_key_value.{w,b}`     → `model.layers.N.self_attn.qkv_proj.{w,b}` (fused)
    /// - `h.N.self_attention.dense.{w,b}`               → `model.layers.N.self_attn.o_proj.{w,b}`
    /// - `h.N.post_attention_layernorm.{w,b}`           → `model.layers.N.post_attention_layernorm.{w,b}`
    /// - `h.N.mlp.dense_h_to_4h.{w,b}`                  → `model.layers.N.mlp.up_proj.{w,b}`
    /// - `h.N.mlp.dense_4h_to_h.{w,b}`                  → `model.layers.N.mlp.down_proj.{w,b}`
    /// - `ln_f.{w,b}`                                   → `model.norm.{w,b}`
    ///
    /// Names outside these tables are passed through unchanged.
    fn bloom_map_name(name: &str) -> String {
        // `h.<N>.<suffix>`, split at the first dot after the prefix.
        let per_layer = name
            .strip_prefix("h.")
            .and_then(|tail| tail.split_once('.'));

        if let Some((layer_idx, bloom_suffix)) = per_layer {
            let canonical = match bloom_suffix {
                // Fused QKV — kept fused here; splitter runs at conversion layer.
                "self_attention.query_key_value.weight" => "self_attn.qkv_proj.weight",
                "self_attention.query_key_value.bias" => "self_attn.qkv_proj.bias",
                "self_attention.dense.weight" => "self_attn.o_proj.weight",
                "self_attention.dense.bias" => "self_attn.o_proj.bias",
                "mlp.dense_h_to_4h.weight" => "mlp.up_proj.weight",
                "mlp.dense_h_to_4h.bias" => "mlp.up_proj.bias",
                "mlp.dense_4h_to_h.weight" => "mlp.down_proj.weight",
                "mlp.dense_4h_to_h.bias" => "mlp.down_proj.bias",
                // `input_layernorm.{weight,bias}` and
                // `post_attention_layernorm.{weight,bias}` are already
                // canonical; they and any unknown suffix pass through.
                unchanged => unchanged,
            };

            return format!("model.layers.{layer_idx}.{canonical}");
        }

        // Non-layer tensors. BLOOM ties embeddings → lm_head; a separate
        // `lm_head.weight`, if present, is preserved by the passthrough arm.
        let mapped = match name {
            "word_embeddings.weight" => "model.embed_tokens.weight",
            "word_embeddings_layernorm.weight" => "model.embed_norm.weight",
            "word_embeddings_layernorm.bias" => "model.embed_norm.bias",
            "ln_f.weight" => "model.norm.weight",
            "ln_f.bias" => "model.norm.bias",
            unchanged => unchanged,
        };
        mapped.to_owned()
    }

    /// GH-311: Split GPT-NeoX fused QKV tensors into separate Q, K, V projections.
    ///
    /// Each entry whose name ends in `self_attn.query_key_value.weight` or
    /// `self_attn.query_key_value.bias` (the names `gpt_neox_map_name` emits
    /// for the fused tensor) is removed and replaced by three entries named
    /// `<prefix>self_attn.{q,k,v}_proj.{weight,bias}`. The fused data is cut
    /// into three equal contiguous thirds along dim 0, taken in Q, K, V order.
    ///
    /// Expected shapes: `[3*hidden, hidden]` for weights (each part becomes
    /// `[rows / 3, cols]`) and `[3*hidden]` for biases (each part becomes
    /// `[len / 3]`).
    ///
    /// A fused entry is left in the map untouched when it cannot be split
    /// consistently:
    /// - bias: the element count is not a multiple of 3;
    /// - weight: the shape is not 2-D, the row count is not a multiple of 3, or
    ///   the element count does not equal `rows * cols`. Without that last
    ///   check a short buffer would panic on slicing and a long one would
    ///   yield a mis-sized V projection.
    ///
    /// Names that merely contain `self_attn.query_key_value.` but end in
    /// something other than `weight` / `bias` are not fused projections and are
    /// ignored.
    ///
    /// NOTE(review): the split assumes the fused tensor is already laid out as
    /// contiguous `[Q; K; V]` blocks along dim 0. A checkpoint that interleaves
    /// Q/K/V per attention head would have to be reordered before this runs —
    /// confirm against the import path that feeds this function.
    pub fn split_neox_fused_qkv(tensors: &mut BTreeMap<String, (Vec<f32>, Vec<usize>)>) {
        const FUSED_WEIGHT: &str = "self_attn.query_key_value.weight";
        const FUSED_BIAS: &str = "self_attn.query_key_value.bias";
        const PROJECTIONS: [&str; 3] = ["q_proj", "k_proj", "v_proj"];

        // Snapshot the candidate names first: the map is mutated inside the loop.
        let fused_keys: Vec<String> = tensors
            .keys()
            .filter(|k| k.ends_with(FUSED_WEIGHT) || k.ends_with(FUSED_BIAS))
            .cloned()
            .collect();

        for fused_name in fused_keys {
            // `base` is everything before the fused suffix (e.g. `model.layers.3.`).
            let (base, kind) = if let Some(base) = fused_name.strip_suffix(FUSED_BIAS) {
                (base, "bias")
            } else if let Some(base) = fused_name.strip_suffix(FUSED_WEIGHT) {
                (base, "weight")
            } else {
                continue;
            };

            // Validate while the entry is still in the map, so an unsplittable
            // tensor never has to be removed and re-inserted.
            let Some((data, shape)) = tensors.get(&fused_name) else {
                continue;
            };
            let part_shape = if kind == "bias" {
                (data.len() % 3 == 0).then(|| vec![data.len() / 3])
            } else {
                match shape.as_slice() {
                    &[rows, cols]
                        if rows % 3 == 0 && rows.checked_mul(cols) == Some(data.len()) =>
                    {
                        Some(vec![rows / 3, cols])
                    }
                    _ => None,
                }
            };
            let Some(part_shape) = part_shape else {
                continue;
            };

            let Some((data, _)) = tensors.remove(&fused_name) else {
                continue;
            };
            // The checks above guarantee `data.len()` is a multiple of 3, so
            // the three slices below tile the buffer exactly.
            let chunk = data.len() / 3;
            for (i, proj) in PROJECTIONS.iter().enumerate() {
                tensors.insert(
                    format!("{base}self_attn.{proj}.{kind}"),
                    (data[i * chunk..(i + 1) * chunk].to_vec(), part_shape.clone()),
                );
            }
        }
    }

    /// GH-311: Split GPT-NeoX fused QKV tensors (raw/quantized version).
    ///
    /// Like `split_neox_fused_qkv()` but works with raw quantized bytes
    /// (`GgufRawTensor`). Each entry whose key ends in
    /// `self_attn.query_key_value.weight` or `self_attn.query_key_value.bias` is
    /// removed and replaced by `q_proj`, `k_proj` and `v_proj` entries under the
    /// same key prefix. The raw bytes are divided into 3 equal parts (Q, K, V in
    /// that order), dim 0 of the shape is divided by 3, and the dtype is carried
    /// over unchanged.
    ///
    /// A fused tensor is left in the map untouched when it cannot be split
    /// evenly: its byte length or dim 0 is not a multiple of 3, a bias is not 1D,
    /// or a weight is not 2D. Keys that merely contain
    /// `self_attn.query_key_value.` but end in some other suffix are ignored, so
    /// they are never re-inserted under a malformed name.
    pub fn split_neox_fused_qkv_raw(
        tensors: &mut BTreeMap<String, crate::format::gguf::GgufRawTensor>,
    ) {
        const FUSED_WEIGHT: &str = "self_attn.query_key_value.weight";
        const FUSED_BIAS: &str = "self_attn.query_key_value.bias";

        // Snapshot the keys first: the map is mutated while we walk them.
        let fused_keys: Vec<String> = tensors
            .keys()
            .filter(|k| k.ends_with(FUSED_WEIGHT) || k.ends_with(FUSED_BIAS))
            .cloned()
            .collect();

        for fused_name in fused_keys {
            let Some(tensor) = tensors.remove(&fused_name) else {
                continue;
            };

            // Tensor names are not filesystem paths, so classify by exact suffix.
            let is_bias = fused_name.ends_with(FUSED_BIAS);
            let (fused_suffix, kind, expected_rank) = if is_bias {
                (FUSED_BIAS, "bias", 1)
            } else {
                (FUSED_WEIGHT, "weight", 2)
            };

            // `shape[0]` is only read once the rank check has passed.
            let splittable = tensor.shape.len() == expected_rank
                && tensor.shape[0] % 3 == 0
                && tensor.data.len() % 3 == 0;
            if !splittable {
                tensors.insert(fused_name, tensor);
                continue;
            }

            let base = fused_name
                .strip_suffix(fused_suffix)
                .unwrap_or(fused_name.as_str());
            let byte_chunk = tensor.data.len() / 3;

            // Every projection gets the fused shape with dim 0 cut to a third.
            let mut part_shape = tensor.shape.clone();
            part_shape[0] /= 3;

            for (index, proj) in ["q_proj", "k_proj", "v_proj"].iter().enumerate() {
                let start = index * byte_chunk;
                tensors.insert(
                    format!("{base}self_attn.{proj}.{kind}"),
                    crate::format::gguf::GgufRawTensor {
                        data: tensor.data[start..start + byte_chunk].to_vec(),
                        shape: part_shape.clone(),
                        dtype: tensor.dtype,
                    },
                );
            }
        }
    }

    /// GH-233: Translate a GPT-2 tensor name into the APR canonical name.
    ///
    /// Per-layer tensors (`transformer.h.N.<suffix>` or `h.N.<suffix>`) become
    /// `model.layers.N.<mapped suffix>`; suffixes without a known mapping are
    /// carried over unchanged. The fused `c_attn` tensor keeps its fused form
    /// here and is split by `split_gpt2_fused_qkv()` after mapping.
    ///
    /// Model-level tensors (`wte`, `wpe`, `ln_f`) are recognised with or without
    /// the `transformer.` prefix. Any other name is returned exactly as given.
    fn gpt2_map_name(name: &str) -> String {
        // Per-layer suffix renames (GPT-2 name, APR name).
        const LAYER_RENAMES: [(&str, &str); 12] = [
            ("ln_1.weight", "input_layernorm.weight"),
            ("ln_1.bias", "input_layernorm.bias"),
            ("ln_2.weight", "post_attention_layernorm.weight"),
            ("ln_2.bias", "post_attention_layernorm.bias"),
            ("attn.c_attn.weight", "self_attn.c_attn.weight"),
            ("attn.c_attn.bias", "self_attn.c_attn.bias"),
            ("attn.c_proj.weight", "self_attn.o_proj.weight"),
            ("attn.c_proj.bias", "self_attn.o_proj.bias"),
            ("mlp.c_fc.weight", "mlp.up_proj.weight"),
            ("mlp.c_fc.bias", "mlp.up_proj.bias"),
            ("mlp.c_proj.weight", "mlp.down_proj.weight"),
            ("mlp.c_proj.bias", "mlp.down_proj.bias"),
        ];
        // Model-level renames, keyed without the optional "transformer." prefix.
        const GLOBAL_RENAMES: [(&str, &str); 4] = [
            ("wte.weight", "model.embed_tokens.weight"),
            ("wpe.weight", "model.position_embedding.weight"),
            ("ln_f.weight", "model.norm.weight"),
            ("ln_f.bias", "model.norm.bias"),
        ];

        // GH-255: layer tensors show up as "transformer.h.N.*" (PyTorch) or
        // "h.N.*" (SafeTensors). Cut at the first dot after the prefix to get
        // the layer number and the remaining suffix.
        let layer_parts = name
            .strip_prefix("transformer.h.")
            .or_else(|| name.strip_prefix("h."))
            .and_then(|rest| rest.split_once('.'));

        if let Some((layer_num, suffix)) = layer_parts {
            let apr_suffix = LAYER_RENAMES
                .iter()
                .find(|(gpt2, _)| *gpt2 == suffix)
                .map_or(suffix, |(_, apr)| *apr);
            return format!("model.layers.{layer_num}.{apr_suffix}");
        }

        // Not a layer tensor: look it up without the prefix, but fall back to
        // the untouched input name when nothing matches.
        let bare = name.strip_prefix("transformer.").unwrap_or(name);
        GLOBAL_RENAMES
            .iter()
            .find(|(gpt2, _)| *gpt2 == bare)
            .map_or_else(|| name.to_string(), |(_, apr)| (*apr).to_string())
    }

    /// GH-233/GH-255: Split GPT-2 fused QKV tensors into separate Q, K, V projections.
    ///
    /// Each entry whose key ends in `self_attn.c_attn.weight` or
    /// `self_attn.c_attn.bias` is removed and replaced by `q_proj`, `k_proj` and
    /// `v_proj` entries under the same key prefix.
    ///
    /// * Bias: the data is cut into three equal contiguous chunks.
    /// * Weight `[hidden, 3*hidden]` (SafeTensors/HF): columns are split, so each
    ///   projection gets a `[hidden, hidden]` tensor assembled row by row.
    /// * Weight `[3*hidden, hidden]` (GGUF): rows are split, so each projection
    ///   is one contiguous third of the data.
    ///
    /// A fused tensor is left in the map untouched when it cannot be split: a
    /// bias whose length is not a multiple of 3, a weight that is not 2D, a
    /// weight whose data length does not equal `shape[0] * shape[1]`, or a weight
    /// matching neither layout. Keys that merely contain `self_attn.c_attn.` but
    /// end in some other suffix are ignored.
    ///
    /// Call this AFTER `map_tensor_names()` when architecture is `Gpt2`.
    pub fn split_gpt2_fused_qkv(tensors: &mut BTreeMap<String, (Vec<f32>, Vec<usize>)>) {
        const FUSED_WEIGHT: &str = "self_attn.c_attn.weight";
        const FUSED_BIAS: &str = "self_attn.c_attn.bias";

        // Q, K, V data (in that order) plus the shape shared by all three.
        type QkvParts = ([Vec<f32>; 3], Vec<usize>);

        // Cut `data` into three contiguous pieces of `chunk` elements each.
        // Callers guarantee `data.len() == 3 * chunk`.
        fn thirds(data: &[f32], chunk: usize) -> [Vec<f32>; 3] {
            [
                data[..chunk].to_vec(),
                data[chunk..2 * chunk].to_vec(),
                data[2 * chunk..].to_vec(),
            ]
        }

        // Bias: 1D tensor of 3*hidden values.
        fn split_bias(data: &[f32]) -> Option<QkvParts> {
            if data.len() % 3 != 0 {
                return None;
            }
            let chunk = data.len() / 3;
            Some((thirds(data, chunk), vec![chunk]))
        }

        // Weight: 2D tensor; the fused dimension is detected from the shape.
        fn split_weight(data: &[f32], shape: &[usize]) -> Option<QkvParts> {
            let (dim0, dim1) = match *shape {
                [dim0, dim1] => (dim0, dim1),
                _ => return None,
            };
            // All slicing below is derived from the shape, so a buffer that
            // disagrees with it must be rejected instead of indexed out of bounds.
            if dim0.checked_mul(dim1) != Some(data.len()) {
                return None;
            }

            if dim0.checked_mul(3) == Some(dim1) {
                // GH-255: SafeTensors shape [hidden, 3*hidden] — split columns.
                let hidden = dim0;
                let mut q_data = Vec::with_capacity(dim0 * hidden);
                let mut k_data = Vec::with_capacity(dim0 * hidden);
                let mut v_data = Vec::with_capacity(dim0 * hidden);
                // `chunks_exact(0)` panics; an empty tensor has nothing to copy.
                if dim1 > 0 {
                    for row in data.chunks_exact(dim1) {
                        let (q_row, rest) = row.split_at(hidden);
                        let (k_row, v_row) = rest.split_at(hidden);
                        q_data.extend_from_slice(q_row);
                        k_data.extend_from_slice(k_row);
                        v_data.extend_from_slice(v_row);
                    }
                }
                Some(([q_data, k_data, v_data], vec![dim0, hidden]))
            } else if dim0 % 3 == 0 {
                // GGUF shape [3*hidden, hidden] — split rows (dim 0).
                let rows_per_proj = dim0 / 3;
                Some((thirds(data, rows_per_proj * dim1), vec![rows_per_proj, dim1]))
            } else {
                None
            }
        }

        // Snapshot the keys first: the map is mutated while we walk them.
        let fused_keys: Vec<String> = tensors
            .keys()
            .filter(|k| k.ends_with(FUSED_WEIGHT) || k.ends_with(FUSED_BIAS))
            .cloned()
            .collect();

        for fused_name in fused_keys {
            let Some((data, shape)) = tensors.remove(&fused_name) else {
                continue;
            };

            let is_bias = fused_name.ends_with(FUSED_BIAS);
            let (fused_suffix, kind) = if is_bias {
                (FUSED_BIAS, "bias")
            } else {
                (FUSED_WEIGHT, "weight")
            };

            let split = if is_bias {
                split_bias(&data)
            } else {
                split_weight(&data, &shape)
            };
            let Some((parts, part_shape)) = split else {
                // Can't split — put it back
                tensors.insert(fused_name, (data, shape));
                continue;
            };

            let base = fused_name
                .strip_suffix(fused_suffix)
                .unwrap_or(fused_name.as_str());
            for (proj, part) in ["q_proj", "k_proj", "v_proj"].iter().zip(parts) {
                tensors.insert(
                    format!("{base}self_attn.{proj}.{kind}"),
                    (part, part_shape.clone()),
                );
            }

            eprintln!(
                "[GH-233] Split fused c_attn tensor: {} → q_proj + k_proj + v_proj",
                fused_name
            );
        }
    }

    /// GH-241: Split GPT-2 fused QKV tensors (raw/quantized version).
    ///
    /// Like `split_gpt2_fused_qkv()` but works with raw quantized bytes
    /// (`GgufRawTensor`) instead of f32 data. Splits by dividing raw bytes
    /// into 3 equal parts — valid because GGUF row-major storage means
    /// each projection's quantization blocks are contiguous.
    ///
    /// Each entry whose key ends in `self_attn.c_attn.weight` or
    /// `self_attn.c_attn.bias` is removed and replaced by `q_proj`, `k_proj` and
    /// `v_proj` entries under the same key prefix (Q, K, V in that order). Dim 0
    /// of the shape is divided by 3 and the dtype is carried over unchanged.
    /// Only the `[3*hidden, hidden]` weight layout (split along dim 0) is
    /// handled here.
    ///
    /// A fused tensor is left in the map untouched when it cannot be split
    /// evenly: its byte length or dim 0 is not a multiple of 3, a bias is not 1D,
    /// or a weight is not 2D. Keys that merely contain `self_attn.c_attn.` but
    /// end in some other suffix are ignored, so they are never re-inserted under
    /// a malformed name.
    pub fn split_gpt2_fused_qkv_raw(
        tensors: &mut BTreeMap<String, crate::format::gguf::GgufRawTensor>,
    ) {
        const FUSED_WEIGHT: &str = "self_attn.c_attn.weight";
        const FUSED_BIAS: &str = "self_attn.c_attn.bias";

        // Snapshot the keys first: the map is mutated while we walk them.
        let fused_keys: Vec<String> = tensors
            .keys()
            .filter(|k| k.ends_with(FUSED_WEIGHT) || k.ends_with(FUSED_BIAS))
            .cloned()
            .collect();

        for fused_name in fused_keys {
            let Some(tensor) = tensors.remove(&fused_name) else {
                continue;
            };

            // Bias: 1D shape [3*hidden]. Weight: 2D shape [3*hidden, hidden].
            let is_bias = fused_name.ends_with(FUSED_BIAS);
            let (fused_suffix, kind, expected_rank) = if is_bias {
                (FUSED_BIAS, "bias", 1)
            } else {
                (FUSED_WEIGHT, "weight", 2)
            };

            // `shape[0]` is only read once the rank check has passed.
            let splittable = tensor.shape.len() == expected_rank
                && tensor.shape[0] % 3 == 0
                && tensor.data.len() % 3 == 0;
            if !splittable {
                tensors.insert(fused_name, tensor);
                continue;
            }

            let base = fused_name
                .strip_suffix(fused_suffix)
                .unwrap_or(fused_name.as_str());
            let byte_chunk = tensor.data.len() / 3;

            // Every projection gets the fused shape with dim 0 cut to a third.
            let mut part_shape = tensor.shape.clone();
            part_shape[0] /= 3;

            for (index, proj) in ["q_proj", "k_proj", "v_proj"].iter().enumerate() {
                let start = index * byte_chunk;
                tensors.insert(
                    format!("{base}self_attn.{proj}.{kind}"),
                    crate::format::gguf::GgufRawTensor {
                        data: tensor.data[start..start + byte_chunk].to_vec(),
                        shape: part_shape.clone(),
                        dtype: tensor.dtype,
                    },
                );
            }

            eprintln!(
                "[GH-241] Split fused c_attn tensor (raw): {} → q_proj + k_proj + v_proj",
                fused_name
            );
        }
    }
}

// ============================================================================
// Tensor Expectations
// ============================================================================

/// Expected statistics for a tensor type.
///
/// Bundles the ranges a tensor's mean and standard deviation are expected to
/// lie in, together with a description used in error messages.
#[derive(Debug, Clone)]
pub struct TensorExpectation {
    /// Expected mean range as `(min, max)`.
    pub mean_range: (f32, f32),
    /// Expected standard-deviation range as `(min, max)`, if any.
    ///
    /// NOTE(review): `None` presumably means the standard deviation is not
    /// checked — confirm against the code that consumes this struct.
    pub std_range: Option<(f32, f32)>,
    /// Description for error messages.
    pub description: &'static str,
}