/// Resolves the tokenizer for a GGUF model: prefer the tokenizer embedded in
/// the GGUF metadata, then an explicitly provided external tokenizer.json,
/// and otherwise fail with the available workarounds listed in the error.
fn resolve_gguf_tokenizer(
embedded: &GgufTokenizer,
gguf_path: &Path,
external_path: Option<&Path>,
) -> Result<GgufTokenizer> {
if !embedded.vocabulary.is_empty() {
if embedded.merges.is_empty() {
eprintln!(
"[PMAT-232] WARNING: GGUF file has vocabulary but no BPE merges. \
Text encoding may fail for multi-character tokens."
);
} else {
eprintln!(
"[PMAT-232] Tokenizer validated: {} vocab tokens, {} merge rules",
embedded.vocabulary.len(),
embedded.merges.len()
);
}
return Ok(embedded.clone());
}
if let Some(tokenizer_path) = external_path {
eprintln!(
"[PMAT-232] GGUF has no embedded tokenizer, trying external: {}",
tokenizer_path.display()
);
return load_tokenizer_from_explicit_path(tokenizer_path).ok_or_else(|| {
AprenderError::FormatError {
message: format!(
"Failed to load external tokenizer from '{}'. \
Ensure the file is a valid HuggingFace tokenizer.json.",
tokenizer_path.display()
),
}
});
}
Err(AprenderError::FormatError {
message: format!(
"GGUF file '{}' has no embedded tokenizer vocabulary. \
Solutions: (1) Use a GGUF with embedded tokenizer, or \
(2) Provide --tokenizer /path/to/tokenizer.json, or \
(3) Use SafeTensors format with sibling tokenizer.json, or \
(4) Import from HuggingFace source: apr import hf://ORG/REPO -o model.apr",
gguf_path.display()
),
})
}
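/// Returns the XDG cache base: `$XDG_CACHE_HOME`, falling back to
/// `$HOME/.cache` (or a relative `.cache` if `$HOME` is unset).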
fn get_xdg_cache_dir() -> PathBuf {
std::env::var("XDG_CACHE_HOME")
.ok()
.map(PathBuf::from)
.unwrap_or_else(|| {
std::env::var("HOME")
.map(|h| PathBuf::from(h).join(".cache"))
.unwrap_or_else(|_| PathBuf::from(".cache"))
})
}
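/// Returns the Hugging Face cache base: `$HF_HOME`, falling back to
/// `$HOME/.cache/huggingface` (or a relative `.cache/huggingface`).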
fn get_hf_cache_dir() -> PathBuf {
std::env::var("HF_HOME")
.ok()
.map(PathBuf::from)
.unwrap_or_else(|| {
std::env::var("HOME")
.map(|h| PathBuf::from(h).join(".cache").join("huggingface"))
.unwrap_or_else(|_| PathBuf::from(".cache").join("huggingface"))
})
}
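/// Looks up `<cache_base>/aprender/hf/<org>/<repo>/<filename>` in aprender's
/// own cache layout.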
fn find_in_aprender_cache(
cache_base: &Path,
org: &str,
repo: &str,
filename: &str,
) -> Option<PathBuf> {
let apr_cache = cache_base
.join("aprender")
.join("hf")
.join(org)
.join(repo)
.join(filename);
apr_cache.exists().then_some(apr_cache)
}
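/// Looks up a file in the HF hub cache layout
/// (`<cache_base>/hub/models--<org>--<repo>/snapshots/<rev>/<filename>`),
/// returning the first snapshot revision that contains it.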
fn find_in_hf_hub_cache(
cache_base: &Path,
org: &str,
repo: &str,
filename: &str,
) -> Option<PathBuf> {
let hf_cache = cache_base
.join("hub")
.join(format!("models--{org}--{repo}"));
if !hf_cache.exists() {
return None;
}
let snapshot_dir = hf_cache.join("snapshots");
let entries = fs::read_dir(&snapshot_dir).ok()?;
for entry in entries.flatten() {
let file_path = entry.path().join(filename);
if file_path.exists() {
return Some(file_path);
}
}
None
}
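/// Returns the `~/.apr/cache/hf` cache base, if `$HOME` is set.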
fn get_apr_cache_dir() -> Option<PathBuf> {
std::env::var("HOME")
.ok()
.map(|h| PathBuf::from(h).join(".apr").join("cache").join("hf"))
}
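/// Searches all known cache locations for `<org>/<repo>/<filename>`: the
/// aprender and HF hub layouts under both the XDG and HF cache bases, then
/// the `~/.apr/cache/hf` fallback.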
fn find_in_cache(org: &str, repo: &str, filename: &str) -> Option<PathBuf> {
let cache_paths = [get_xdg_cache_dir(), get_hf_cache_dir()];
for cache_base in &cache_paths {
if let Some(path) = find_in_aprender_cache(cache_base, org, repo, filename) {
return Some(path);
}
if let Some(path) = find_in_hf_hub_cache(cache_base, org, repo, filename) {
return Some(path);
}
}
if let Some(apr_cache_base) = get_apr_cache_dir() {
let apr_path = apr_cache_base.join(org).join(repo).join(filename);
if apr_path.exists() {
return Some(apr_path);
}
}
None
}
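/// Downloads `filename` from the given HF repo via the `hf-hub` crate,
/// passing `$HF_TOKEN` (when set) for gated or private repos.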
#[cfg(feature = "hf-hub-integration")]
fn download_from_hf(repo_id: &str, filename: &str) -> Result<PathBuf> {
use hf_hub::api::sync::ApiBuilder;
let token = std::env::var("HF_TOKEN").ok();
let mut builder = ApiBuilder::new();
if let Some(t) = token {
builder = builder.with_token(Some(t));
}
let api = builder.build().map_err(|e| {
let resource = format!("{repo_id}/{filename}");
let err = parse_import_error(&e.to_string(), &resource);
AprenderError::from(err)
})?;
let repo = api.model(repo_id.to_string());
let path = repo.get(filename).map_err(|e| {
let resource = format!("{repo_id}/{filename}");
let err = parse_import_error(&e.to_string(), &resource);
AprenderError::from(err)
})?;
Ok(path)
}
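/// Aggregated output of loading a model source: f32 tensors (data, shape),
/// raw f16 tensors (bytes, shape, flag), plus any tokenizer, model config,
/// and user metadata found alongside the weights.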
#[derive(Debug)]
pub(crate) struct SourceLoadResult {
pub(crate) tensors: BTreeMap<String, (Vec<f32>, Vec<usize>)>,
pub(crate) f16_raw_tensors: BTreeMap<String, (Vec<u8>, Vec<usize>, bool)>,
pub(crate) tokenizer: Option<GgufTokenizer>,
pub(crate) model_config: Option<GgufModelConfig>,
pub(crate) user_metadata: UserMetadata,
}
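/// Replaces the non-standard JSON literals `Infinity`, `-Infinity`, and `NaN`
/// that some HuggingFace configs emit with values `serde_json` can parse.
/// Note: this is a plain substring replacement, so the same sequences inside
/// string values are rewritten as well.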
pub fn sanitize_hf_json(content: &str) -> String {
content
.replace("-Infinity", "-1e308")
.replace("Infinity", "1e308")
.replace("NaN", "null")
}
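/// Returns the first of `keys` present in `json`, as a `usize`.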
fn json_usize_with_aliases(json: &serde_json::Value, keys: &[&str]) -> Option<usize> {
keys.iter()
.find_map(|&k| json.get(k))
.and_then(serde_json::Value::as_u64)
.map(|v| v as usize)
}
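/// Returns the first of `keys` present in `json` as an `f64`, or `default`.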
fn json_f64_with_aliases(json: &serde_json::Value, keys: &[&str], default: f64) -> f64 {
keys.iter()
.find_map(|&k| json.get(k))
.and_then(serde_json::Value::as_f64)
.unwrap_or(default)
}
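// config.json key aliases: LLaMA-, GPT-2-, and GPT-NeoX-style configs use
// different names for the same hyperparameters.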
const CONFIG_ALIASES_HIDDEN_SIZE: &[&str] = &["hidden_size", "n_embd", "n_embed", "d_model"];
const CONFIG_ALIASES_NUM_LAYERS: &[&str] = &["num_hidden_layers", "n_layer", "num_layers"];
const CONFIG_ALIASES_NUM_HEADS: &[&str] = &["num_attention_heads", "n_head", "num_heads"];
const CONFIG_ALIASES_INTERMEDIATE: &[&str] = &["intermediate_size", "n_inner", "ffn_dim"];
const CONFIG_ALIASES_MAX_POS: &[&str] = &["max_position_embeddings", "n_positions", "n_ctx"];
const CONFIG_ALIASES_NORM_EPS: &[&str] = &["rms_norm_eps", "layer_norm_epsilon", "layer_norm_eps"];
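/// Reads the sibling config.json next to `model_path` (if present) and maps
/// its alias-resolved hyperparameters onto a `GgufModelConfig`.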
pub(crate) fn load_model_config_from_json(model_path: &Path) -> Option<GgufModelConfig> {
let config_path = model_path.with_file_name("config.json");
if !config_path.exists() {
return None;
}
let content = fs::read_to_string(&config_path).ok()?;
let sanitized = sanitize_hf_json(&content);
let json: serde_json::Value = serde_json::from_str(&sanitized).ok()?;
let hidden_size = json_usize_with_aliases(&json, CONFIG_ALIASES_HIDDEN_SIZE);
let num_layers = json_usize_with_aliases(&json, CONFIG_ALIASES_NUM_LAYERS);
let num_heads = json_usize_with_aliases(&json, CONFIG_ALIASES_NUM_HEADS);
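    // GQA checkpoints set num_key_value_heads; otherwise assume MHA (kv heads == heads).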
    let num_kv_heads = json_usize_with_aliases(&json, &["num_key_value_heads"]).or(num_heads);
let vocab_size = json_usize_with_aliases(&json, &["vocab_size"]);
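    // Default to the common 4 * hidden_size MLP expansion when unspecified.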
let intermediate_size = json_usize_with_aliases(&json, CONFIG_ALIASES_INTERMEDIATE)
.or_else(|| hidden_size.map(|h| 4 * h));
let max_position_embeddings = json_usize_with_aliases(&json, CONFIG_ALIASES_MAX_POS);
let rope_theta = json_f64_with_aliases(&json, &["rope_theta"], 10000.0);
let rms_norm_eps = json_f64_with_aliases(&json, CONFIG_ALIASES_NORM_EPS, 1e-6);
let architecture = json
.get("model_type")
.and_then(|v| v.as_str())
.map(ToString::to_string);
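    // RoPE style keyed off HF model_type; the codes appear to follow the
    // GGUF/llama.cpp convention (0 = normal, 2 = NEOX-style).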
let rope_type = match architecture.as_deref() {
Some("qwen2" | "qwen2.5" | "qwen" | "qwen3" | "qwen3_5" | "qwen3.5") => Some(2),
Some("phi" | "phi3" | "phi4") => Some(2),
Some("gpt-neox" | "gpt_neox" | "gptneox" | "pythia") => Some(2),
_ => Some(0),
};
Some(GgufModelConfig {
architecture,
hidden_size,
num_layers,
num_heads,
num_kv_heads,
vocab_size,
intermediate_size,
max_position_embeddings,
rope_theta: Some(rope_theta as f32),
rms_norm_eps: Some(rms_norm_eps as f32),
rope_type,
head_dim: None,
num_experts: None,
num_experts_per_tok: None,
moe_intermediate_size: None,
})
}
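/// Parses a HuggingFace tokenizer.json (plus an optional config.json) into a
/// `GgufTokenizer`: vocabulary, BPE merges, model type, and BOS/EOS ids.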
pub(crate) fn parse_tokenizer_json(
json: &serde_json::Value,
config_json: Option<&serde_json::Value>,
) -> Option<GgufTokenizer> {
let (token_to_id, base_vocab_len) = parse_vocab_from_model(json)?;
let added_count = json
.get("added_tokens")
.and_then(|v| v.as_array())
.map(|a| a.len())
.unwrap_or(0);
let expected_vocab_size = config_json
.and_then(|cfg| cfg.get("vocab_size").and_then(|v| v.as_u64()))
.map(|v| v as u32)
.unwrap_or(0);
let vocabulary = build_vocab_vector(&token_to_id, expected_vocab_size);
eprintln!(
"[BUG-EXPORT-004] Vocab: base={}, added={}, expected={}, final={}",
base_vocab_len,
added_count,
expected_vocab_size,
vocabulary.len()
);
if vocabulary.is_empty() {
return None;
}
let (bos_token_id, eos_token_id) = parse_special_tokens(json, config_json);
Some(GgufTokenizer {
vocabulary,
merges: parse_merges(json),
model_type: extract_model_type(json),
bos_token_id,
eos_token_id,
architecture: None,
model_name: None,
..Default::default()
})
}
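/// Extracts the tokenizer model type (e.g. "BPE") from `model.type`.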
fn extract_model_type(json: &serde_json::Value) -> Option<String> {
json.get("model")
.and_then(|m| m.get("type"))
.and_then(|t| t.as_str())
.map(String::from)
}
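/// Builds an id-to-token map from `model.vocab`, overlays `added_tokens`,
/// and returns it along with the base (pre-overlay) vocabulary size.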
fn parse_vocab_from_model(
json: &serde_json::Value,
) -> Option<(std::collections::BTreeMap<u32, String>, usize)> {
let vocab_obj = json.get("model")?.get("vocab")?;
let vocab_map = vocab_obj.as_object()?;
let base_vocab_len = vocab_map.len();
let mut token_to_id: std::collections::BTreeMap<u32, String> = vocab_map
.iter()
.filter_map(|(token, id)| Some((id.as_u64()? as u32, token.clone())))
.collect();
if let Some(added) = json.get("added_tokens").and_then(|v| v.as_array()) {
for token in added {
if let (Some(content), Some(id)) = (
token.get("content").and_then(|v| v.as_str()),
token.get("id").and_then(|v| v.as_u64()),
) {
token_to_id.insert(id as u32, content.to_string());
}
}
}
Some((token_to_id, base_vocab_len))
}
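/// Determines BOS/EOS token ids, preferring explicit values from config.json
/// and falling back to inference from the tokenizer's `added_tokens`.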
fn parse_special_tokens(
json: &serde_json::Value,
config_json: Option<&serde_json::Value>,
) -> (Option<u32>, Option<u32>) {
let mut bos_token_id = config_json
.and_then(|cfg| cfg.get("bos_token_id"))
.and_then(|v| v.as_u64())
.map(|v| v as u32);
let mut eos_token_id = config_json
.and_then(|cfg| cfg.get("eos_token_id"))
.and_then(|v| v.as_u64())
.map(|v| v as u32);
if config_json.is_some() && (bos_token_id.is_some() || eos_token_id.is_some()) {
eprintln!(
"[BUG-EXPORT-004] Read BOS/EOS from config.json: bos={:?}, eos={:?}",
bos_token_id, eos_token_id
);
}
if bos_token_id.is_none() || eos_token_id.is_none() {
if let Some(added_tokens) = json.get("added_tokens").and_then(|v| v.as_array()) {
(bos_token_id, eos_token_id) =
infer_bos_eos_from_added_tokens(added_tokens, bos_token_id, eos_token_id);
}
}
(bos_token_id, eos_token_id)
}
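/// Extracts BPE merge rules from `model.merges`, accepting both the plain
/// string form ("a b") and the array-of-pairs form (["a", "b"]) emitted by
/// e.g. Qwen3 tokenizers.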
fn parse_merges(json: &serde_json::Value) -> Vec<String> {
json.get("model")
.and_then(|m| m.get("merges"))
.and_then(|m| m.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| {
if let Some(s) = v.as_str() {
return Some(s.to_string());
}
if let Some(pair) = v.as_array() {
if let (Some(a), Some(b)) = (pair.first(), pair.get(1)) {
if let (Some(a_str), Some(b_str)) = (a.as_str(), b.as_str()) {
return Some(format!("{a_str} {b_str}"));
}
}
}
None
})
.collect()
})
.unwrap_or_default()
}
#[cfg(test)]
mod parse_merges_tests {
use super::*;
#[test]
fn parse_merges_string_format() {
let json: serde_json::Value = serde_json::json!({
"model": {
"type": "BPE",
"vocab": {},
"merges": ["Ġ t", "i n", "e r"]
}
});
let merges = parse_merges(&json);
assert_eq!(merges, vec!["Ġ t", "i n", "e r"]);
}
#[test]
fn parse_merges_array_format_qwen3() {
let json: serde_json::Value = serde_json::json!({
"model": {
"type": "BPE",
"vocab": {},
"merges": [["Ġ", "Ġ"], ["ĠĠ", "ĠĠ"], ["i", "n"]]
}
});
let merges = parse_merges(&json);
assert_eq!(merges, vec!["Ġ Ġ", "ĠĠ ĠĠ", "i n"]);
}
#[test]
fn parse_merges_empty() {
let json: serde_json::Value = serde_json::json!({
"model": { "type": "BPE", "vocab": {}, "merges": [] }
});
assert!(parse_merges(&json).is_empty());
}
#[test]
fn parse_merges_missing() {
let json: serde_json::Value = serde_json::json!({
"model": { "type": "BPE", "vocab": {} }
});
assert!(parse_merges(&json).is_empty());
}
}
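// Minimal sanity checks for the JSON helpers above (sanitize_hf_json and
// json_usize_with_aliases); illustrative additions, not exhaustive coverage.
#[cfg(test)]
mod json_helper_tests {
    use super::*;
    #[test]
    fn sanitize_replaces_non_finite_literals() {
        let raw = r#"{"a": Infinity, "b": -Infinity, "c": NaN}"#;
        let json: serde_json::Value =
            serde_json::from_str(&sanitize_hf_json(raw)).expect("sanitized JSON parses");
        assert_eq!(json["a"].as_f64(), Some(1e308));
        assert_eq!(json["b"].as_f64(), Some(-1e308));
        assert!(json["c"].is_null());
    }
    #[test]
    fn usize_alias_falls_through_to_later_key() {
        // "hidden_size" is absent, so the GPT-2-style alias "n_embd" matches.
        let json = serde_json::json!({ "n_embd": 768 });
        assert_eq!(
            json_usize_with_aliases(&json, CONFIG_ALIASES_HIDDEN_SIZE),
            Some(768)
        );
    }
}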