/// Resolves the tokenizer for a GGUF model: prefer the tokenizer embedded in
/// the GGUF metadata, then an explicitly provided external tokenizer.json,
/// and otherwise fail with the available workarounds listed in the error.
fn resolve_gguf_tokenizer(
embedded: &GgufTokenizer,
gguf_path: &Path,
external_path: Option<&Path>,
) -> Result<GgufTokenizer> {
if !embedded.vocabulary.is_empty() {
if embedded.merges.is_empty() {
eprintln!(
"[PMAT-232] WARNING: GGUF file has vocabulary but no BPE merges. \
Text encoding may fail for multi-character tokens."
);
} else {
eprintln!(
"[PMAT-232] Tokenizer validated: {} vocab tokens, {} merge rules",
embedded.vocabulary.len(),
embedded.merges.len()
);
}
return Ok(embedded.clone());
}
if let Some(tokenizer_path) = external_path {
eprintln!(
"[PMAT-232] GGUF has no embedded tokenizer, trying external: {}",
tokenizer_path.display()
);
return load_tokenizer_from_explicit_path(tokenizer_path).ok_or_else(|| {
AprenderError::FormatError {
message: format!(
"Failed to load external tokenizer from '{}'. \
Ensure the file is a valid HuggingFace tokenizer.json.",
tokenizer_path.display()
),
}
});
}
Err(AprenderError::FormatError {
message: format!(
"GGUF file '{}' has no embedded tokenizer vocabulary. \
Solutions: (1) Use a GGUF with embedded tokenizer, or \
(2) Provide --tokenizer /path/to/tokenizer.json, or \
(3) Use SafeTensors format with sibling tokenizer.json, or \
(4) Import from HuggingFace source: apr import hf://ORG/REPO -o model.apr",
gguf_path.display()
),
})
}
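/// Returns the XDG cache base: `$XDG_CACHE_HOME`, falling back to
/// `$HOME/.cache` (or a relative `.cache` if `$HOME` is unset).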
fn get_xdg_cache_dir() -> PathBuf {
std::env::var("XDG_CACHE_HOME")
.ok()
.map(PathBuf::from)
.unwrap_or_else(|| {
std::env::var("HOME")
.map(|h| PathBuf::from(h).join(".cache"))
.unwrap_or_else(|_| PathBuf::from(".cache"))
})
}
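/// Returns the Hugging Face cache base: `$HF_HOME`, falling back to
/// `$HOME/.cache/huggingface` (or a relative `.cache/huggingface`).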
fn get_hf_cache_dir() -> PathBuf {
std::env::var("HF_HOME")
.ok()
.map(PathBuf::from)
.unwrap_or_else(|| {
std::env::var("HOME")
.map(|h| PathBuf::from(h).join(".cache").join("huggingface"))
.unwrap_or_else(|_| PathBuf::from(".cache").join("huggingface"))
})
}
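/// Looks up `<cache_base>/aprender/hf/<org>/<repo>/<filename>` in aprender's
/// own cache layout.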
fn find_in_aprender_cache(
cache_base: &Path,
org: &str,
repo: &str,
filename: &str,
) -> Option<PathBuf> {
let apr_cache = cache_base
.join("aprender")
.join("hf")
.join(org)
.join(repo)
.join(filename);
apr_cache.exists().then_some(apr_cache)
}
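/// Looks up a file in the HF hub cache layout
/// (`<cache_base>/hub/models--<org>--<repo>/snapshots/<rev>/<filename>`),
/// returning the first snapshot revision that contains it.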
fn find_in_hf_hub_cache(
cache_base: &Path,
org: &str,
repo: &str,
filename: &str,
) -> Option<PathBuf> {
let hf_cache = cache_base
.join("hub")
.join(format!("models--{org}--{repo}"));
if !hf_cache.exists() {
return None;
}
let snapshot_dir = hf_cache.join("snapshots");
let entries = fs::read_dir(&snapshot_dir).ok()?;
for entry in entries.flatten() {
let file_path = entry.path().join(filename);
if file_path.exists() {
return Some(file_path);
}
}
None
}
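/// Returns the `~/.apr/cache/hf` cache base, if `$HOME` is set.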
fn get_apr_cache_dir() -> Option<PathBuf> {
std::env::var("HOME")
.ok()
.map(|h| PathBuf::from(h).join(".apr").join("cache").join("hf"))
}
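/// Searches all known cache locations for `<org>/<repo>/<filename>`: the
/// aprender and HF hub layouts under both the XDG and HF cache bases, then
/// the `~/.apr/cache/hf` fallback.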
fn find_in_cache(org: &str, repo: &str, filename: &str) -> Option<PathBuf> {
let cache_paths = [get_xdg_cache_dir(), get_hf_cache_dir()];
for cache_base in &cache_paths {
if let Some(path) = find_in_aprender_cache(cache_base, org, repo, filename) {
return Some(path);
}
if let Some(path) = find_in_hf_hub_cache(cache_base, org, repo, filename) {
return Some(path);
}
}
if let Some(apr_cache_base) = get_apr_cache_dir() {
let apr_path = apr_cache_base.join(org).join(repo).join(filename);
if apr_path.exists() {
return Some(apr_path);
}
}
None
}
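/// Downloads `filename` from the given HF repo via the `hf-hub` crate,
/// passing `$HF_TOKEN` (when set) for gated or private repos.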
#[cfg(feature = "hf-hub-integration")]
fn download_from_hf(repo_id: &str, filename: &str) -> Result<PathBuf> {
use hf_hub::api::sync::ApiBuilder;
let token = std::env::var("HF_TOKEN").ok();
let mut builder = ApiBuilder::new();
if let Some(t) = token {
builder = builder.with_token(Some(t));
}
let api = builder.build().map_err(|e| {
let resource = format!("{repo_id}/{filename}");
let err = parse_import_error(&e.to_string(), &resource);
AprenderError::from(err)
})?;
let repo = api.model(repo_id.to_string());
let path = repo.get(filename).map_err(|e| {
let resource = format!("{repo_id}/{filename}");
let err = parse_import_error(&e.to_string(), &resource);
AprenderError::from(err)
})?;
Ok(path)
}
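/// Aggregated output of loading a model source: f32 tensors (data, shape),
/// raw f16 tensors (bytes, shape, flag), plus any tokenizer, model config,
/// and user metadata found alongside the weights.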
#[derive(Debug)]
pub(crate) struct SourceLoadResult {
pub(crate) tensors: BTreeMap<String, (Vec<f32>, Vec<usize>)>,
pub(crate) f16_raw_tensors: BTreeMap<String, (Vec<u8>, Vec<usize>, bool)>,
pub(crate) tokenizer: Option<GgufTokenizer>,
pub(crate) model_config: Option<GgufModelConfig>,
pub(crate) user_metadata: UserMetadata,
}
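/// Replaces the non-standard JSON literals `Infinity`, `-Infinity`, and `NaN`
/// that some HuggingFace configs emit with values `serde_json` can parse.
/// Note: this is a plain substring replacement, so the same sequences inside
/// string values are rewritten as well.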
pub fn sanitize_hf_json(content: &str) -> String {
content
.replace("-Infinity", "-1e308")
.replace("Infinity", "1e308")
.replace("NaN", "null")
}
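/// Returns the first of `keys` present in `json`, as a `usize`.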
fn json_usize_with_aliases(json: &serde_json::Value, keys: &[&str]) -> Option<usize> {
keys.iter()
.find_map(|&k| json.get(k))
.and_then(serde_json::Value::as_u64)
.map(|v| v as usize)
}
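/// Returns the first of `keys` present in `json` as an `f64`, or `default`.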
fn json_f64_with_aliases(json: &serde_json::Value, keys: &[&str], default: f64) -> f64 {
keys.iter()
.find_map(|&k| json.get(k))
.and_then(serde_json::Value::as_f64)
.unwrap_or(default)
}
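// config.json key aliases: LLaMA-, GPT-2-, and GPT-NeoX-style configs use
// different names for the same hyperparameters.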
const CONFIG_ALIASES_HIDDEN_SIZE: &[&str] = &["hidden_size", "n_embd", "n_embed", "d_model"];
const CONFIG_ALIASES_NUM_LAYERS: &[&str] = &["num_hidden_layers", "n_layer", "num_layers"];
const CONFIG_ALIASES_NUM_HEADS: &[&str] = &["num_attention_heads", "n_head", "num_heads"];
const CONFIG_ALIASES_INTERMEDIATE: &[&str] = &["intermediate_size", "n_inner", "ffn_dim"];
const CONFIG_ALIASES_MAX_POS: &[&str] = &["max_position_embeddings", "n_positions", "n_ctx"];
const CONFIG_ALIASES_NORM_EPS: &[&str] = &["rms_norm_eps", "layer_norm_epsilon", "layer_norm_eps"];
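/// Reads the sibling config.json next to `model_path` (if present) and maps
/// its alias-resolved hyperparameters onto a `GgufModelConfig`.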
pub(crate) fn load_model_config_from_json(model_path: &Path) -> Option<GgufModelConfig> {
let config_path = model_path.with_file_name("config.json");
if !config_path.exists() {
return None;
}
let content = fs::read_to_string(&config_path).ok()?;
let sanitized = sanitize_hf_json(&content);
let json: serde_json::Value = serde_json::from_str(&sanitized).ok()?;
let hidden_size = json_usize_with_aliases(&json, CONFIG_ALIASES_HIDDEN_SIZE);
let num_layers = json_usize_with_aliases(&json, CONFIG_ALIASES_NUM_LAYERS);
let num_heads = json_usize_with_aliases(&json, CONFIG_ALIASES_NUM_HEADS);
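    // GQA checkpoints set num_key_value_heads; otherwise assume MHA (kv heads == heads).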
    let num_kv_heads = json_usize_with_aliases(&json, &["num_key_value_heads"]).or(num_heads);
let vocab_size = json_usize_with_aliases(&json, &["vocab_size"]);
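    // Default to the common 4 * hidden_size MLP expansion when unspecified.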
let intermediate_size = json_usize_with_aliases(&json, CONFIG_ALIASES_INTERMEDIATE)
.or_else(|| hidden_size.map(|h| 4 * h));
let max_position_embeddings = json_usize_with_aliases(&json, CONFIG_ALIASES_MAX_POS);
let rope_theta = json_f64_with_aliases(&json, &["rope_theta"], 10000.0);
let rms_norm_eps = json_f64_with_aliases(&json, CONFIG_ALIASES_NORM_EPS, 1e-6);
let architecture = json
.get("model_type")
.and_then(|v| v.as_str())
.map(ToString::to_string);
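    // RoPE style keyed off HF model_type; the codes appear to follow the
    // GGUF/llama.cpp convention (0 = normal, 2 = NEOX-style).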
let rope_type = match architecture.as_deref() {
Some("qwen2" | "qwen2.5" | "qwen" | "qwen3" | "qwen3_5" | "qwen3.5") => Some(2),
Some("phi" | "phi3" | "phi4") => Some(2),
Some("gpt-neox" | "gpt_neox" | "gptneox" | "pythia") => Some(2),
_ => Some(0),
};
Some(GgufModelConfig {
architecture,
hidden_size,
num_layers,
num_heads,
num_kv_heads,
vocab_size,
intermediate_size,
max_position_embeddings,
rope_theta: Some(rope_theta as f32),
rms_norm_eps: Some(rms_norm_eps as f32),
rope_type,
head_dim: None,
num_experts: None,
num_experts_per_tok: None,
moe_intermediate_size: None,
})
}
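/// Parses a HuggingFace tokenizer.json (plus an optional config.json) into a
/// `GgufTokenizer`: vocabulary, BPE merges, model type, and BOS/EOS ids.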
pub(crate) fn parse_tokenizer_json(
json: &serde_json::Value,
config_json: Option<&serde_json::Value>,
) -> Option<GgufTokenizer> {
let (token_to_id, base_vocab_len) = parse_vocab_from_model(json)?;
let added_count = json
.get("added_tokens")
.and_then(|v| v.as_array())
.map(|a| a.len())
.unwrap_or(0);
let expected_vocab_size = config_json
.and_then(|cfg| cfg.get("vocab_size").and_then(|v| v.as_u64()))
.map(|v| v as u32)
.unwrap_or(0);
let vocabulary = build_vocab_vector(&token_to_id, expected_vocab_size);
eprintln!(
"[BUG-EXPORT-004] Vocab: base={}, added={}, expected={}, final={}",
base_vocab_len,
added_count,
expected_vocab_size,
vocabulary.len()
);
if vocabulary.is_empty() {
return None;
}
let (bos_token_id, eos_token_id) = parse_special_tokens(json, config_json);
Some(GgufTokenizer {
vocabulary,
merges: parse_merges(json),
model_type: extract_model_type(json),
bos_token_id,
eos_token_id,
architecture: None,
model_name: None,
..Default::default()
})
}
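/// Extracts the tokenizer model type (e.g. "BPE") from `model.type`.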
fn extract_model_type(json: &serde_json::Value) -> Option<String> {
json.get("model")
.and_then(|m| m.get("type"))
.and_then(|t| t.as_str())
.map(String::from)
}
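/// Builds an id-to-token map from `model.vocab`, overlays `added_tokens`,
/// and returns it along with the base (pre-overlay) vocabulary size.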
fn parse_vocab_from_model(
json: &serde_json::Value,
) -> Option<(std::collections::BTreeMap<u32, String>, usize)> {
let vocab_obj = json.get("model")?.get("vocab")?;
let vocab_map = vocab_obj.as_object()?;
let base_vocab_len = vocab_map.len();
let mut token_to_id: std::collections::BTreeMap<u32, String> = vocab_map
.iter()
.filter_map(|(token, id)| Some((id.as_u64()? as u32, token.clone())))
.collect();
if let Some(added) = json.get("added_tokens").and_then(|v| v.as_array()) {
for token in added {
if let (Some(content), Some(id)) = (
token.get("content").and_then(|v| v.as_str()),
token.get("id").and_then(|v| v.as_u64()),
) {
token_to_id.insert(id as u32, content.to_string());
}
}
}
Some((token_to_id, base_vocab_len))
}
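/// Determines BOS/EOS token ids, preferring explicit values from config.json
/// and falling back to inference from the tokenizer's `added_tokens`.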
fn parse_special_tokens(
json: &serde_json::Value,
config_json: Option<&serde_json::Value>,
) -> (Option<u32>, Option<u32>) {
let mut bos_token_id = config_json
.and_then(|cfg| cfg.get("bos_token_id"))
.and_then(|v| v.as_u64())
.map(|v| v as u32);
let mut eos_token_id = config_json
.and_then(|cfg| cfg.get("eos_token_id"))
.and_then(|v| v.as_u64())
.map(|v| v as u32);
if config_json.is_some() && (bos_token_id.is_some() || eos_token_id.is_some()) {
eprintln!(
"[BUG-EXPORT-004] Read BOS/EOS from config.json: bos={:?}, eos={:?}",
bos_token_id, eos_token_id
);
}
if bos_token_id.is_none() || eos_token_id.is_none() {
if let Some(added_tokens) = json.get("added_tokens").and_then(|v| v.as_array()) {
(bos_token_id, eos_token_id) =
infer_bos_eos_from_added_tokens(added_tokens, bos_token_id, eos_token_id);
}
}
(bos_token_id, eos_token_id)
}
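/// Extracts BPE merge rules from `model.merges`, accepting both the plain
/// string form ("a b") and the array-of-pairs form (["a", "b"]) emitted by
/// e.g. Qwen3 tokenizers.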
fn parse_merges(json: &serde_json::Value) -> Vec<String> {
json.get("model")
.and_then(|m| m.get("merges"))
.and_then(|m| m.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| {
if let Some(s) = v.as_str() {
return Some(s.to_string());
}
if let Some(pair) = v.as_array() {
if let (Some(a), Some(b)) = (pair.first(), pair.get(1)) {
if let (Some(a_str), Some(b_str)) = (a.as_str(), b.as_str()) {
return Some(format!("{a_str} {b_str}"));
}
}
}
None
})
.collect()
})
.unwrap_or_default()
}
#[cfg(test)]
mod parse_merges_tests {
use super::*;
#[test]
fn parse_merges_string_format() {
let json: serde_json::Value = serde_json::json!({
"model": {
"type": "BPE",
"vocab": {},
"merges": ["Ġ t", "i n", "e r"]
}
});
let merges = parse_merges(&json);
assert_eq!(merges, vec!["Ġ t", "i n", "e r"]);
}
#[test]
fn parse_merges_array_format_qwen3() {
let json: serde_json::Value = serde_json::json!({
"model": {
"type": "BPE",
"vocab": {},
"merges": [["Ġ", "Ġ"], ["ĠĠ", "ĠĠ"], ["i", "n"]]
}
});
let merges = parse_merges(&json);
assert_eq!(merges, vec!["Ġ Ġ", "ĠĠ ĠĠ", "i n"]);
}
#[test]
fn parse_merges_empty() {
let json: serde_json::Value = serde_json::json!({
"model": { "type": "BPE", "vocab": {}, "merges": [] }
});
assert!(parse_merges(&json).is_empty());
}
#[test]
fn parse_merges_missing() {
let json: serde_json::Value = serde_json::json!({
"model": { "type": "BPE", "vocab": {} }
});
assert!(parse_merges(&json).is_empty());
}
}
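// Minimal sanity checks for the JSON helpers above (sanitize_hf_json and
// json_usize_with_aliases); illustrative additions, not exhaustive coverage.
#[cfg(test)]
mod json_helper_tests {
    use super::*;
    #[test]
    fn sanitize_replaces_non_finite_literals() {
        let raw = r#"{"a": Infinity, "b": -Infinity, "c": NaN}"#;
        let json: serde_json::Value =
            serde_json::from_str(&sanitize_hf_json(raw)).expect("sanitized JSON parses");
        assert_eq!(json["a"].as_f64(), Some(1e308));
        assert_eq!(json["b"].as_f64(), Some(-1e308));
        assert!(json["c"].is_null());
    }
    #[test]
    fn usize_alias_falls_through_to_later_key() {
        // "hidden_size" is absent, so the GPT-2-style alias "n_embd" matches.
        let json = serde_json::json!({ "n_embd": 768 });
        assert_eq!(
            json_usize_with_aliases(&json, CONFIG_ALIASES_HIDDEN_SIZE),
            Some(768)
        );
    }
}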