pub(crate) fn load_tokenizer_from_json(model_path: &Path) -> Option<GgufTokenizer> {
let standard_path = model_path.with_file_name("tokenizer.json");
let stem = model_path.file_stem()?.to_str()?;
let base_stem = stem.split('.').next().unwrap_or(stem);
let pacha_path = model_path.with_file_name(format!("{}.tokenizer.json", base_stem));
let parent_path = model_path
.parent()
.and_then(|p| p.parent())
.map(|p| p.join("tokenizer.json"));
let safetensors_sibling_path = model_path
.parent()
.and_then(|p| p.parent())
.map(|p| p.join("safetensors").join("tokenizer.json"));
let tokenizer_path = if standard_path.exists() {
standard_path
} else if pacha_path.exists() {
eprintln!(
"[BUG-TOK-002] Found tokenizer at Pacha cache path: {}",
pacha_path.display()
);
pacha_path
} else if parent_path.as_ref().is_some_and(|p| p.exists()) {
let p = parent_path.expect("checked above");
eprintln!(
"[GH-226] Found tokenizer in parent directory: {}",
p.display()
);
p
} else if safetensors_sibling_path
.as_ref()
.is_some_and(|p| p.exists())
{
let p = safetensors_sibling_path.expect("checked above");
eprintln!(
"[GH-226] Found tokenizer in sibling safetensors/: {}",
p.display()
);
p
} else {
return load_tokenizer_from_sentencepiece(model_path);
};
let content = fs::read_to_string(&tokenizer_path).ok()?;
let json: serde_json::Value = serde_json::from_str(&content).ok()?;
let config_path = tokenizer_path.with_file_name("config.json");
let pacha_config_path = tokenizer_path
.file_stem()
.and_then(|s| s.to_str())
.map(|s| s.split('.').next().unwrap_or(s))
.map(|stem| tokenizer_path.with_file_name(format!("{stem}.config.json")));
let config_json = config_path
.exists()
.then(|| fs::read_to_string(&config_path).ok())
.flatten()
.or_else(|| {
pacha_config_path
.as_ref()
.filter(|p| p.exists())
.and_then(|p| fs::read_to_string(p).ok())
})
.and_then(|s| {
let sanitized = sanitize_hf_json(&s);
serde_json::from_str::<serde_json::Value>(&sanitized).ok()
});
parse_tokenizer_json(&json, config_json.as_ref())
}
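/// Loads and parses a `config.json` sitting next to `path`, sanitizing
/// HuggingFace-style JSON before deserializing. Returns `None` if the file is
/// missing or unparseable.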
fn load_sibling_config(path: &Path) -> Option<serde_json::Value> {
let config_path = path.with_file_name("config.json");
config_path
.exists()
.then(|| fs::read_to_string(&config_path).ok())
.flatten()
.and_then(|s| {
let sanitized = sanitize_hf_json(&s);
serde_json::from_str::<serde_json::Value>(&sanitized).ok()
})
}
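/// Builds a dense, id-indexed vocabulary from a sparse id-to-token map, sized
/// to the larger of `expected_vocab_size` and the highest token id seen; any
/// gaps are filled with `"<unk>"`.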
fn build_vocab_vector(
token_to_id: &std::collections::BTreeMap<u32, String>,
expected_vocab_size: u32,
) -> Vec<String> {
let max_id = token_to_id.keys().max().copied().unwrap_or(0);
let final_size = (expected_vocab_size.max(max_id + 1)) as usize;
let mut vocabulary: Vec<String> = vec!["<unk>".to_string(); final_size];
for (id, token) in token_to_id {
if (*id as usize) < vocabulary.len() {
vocabulary[*id as usize] = token.clone();
}
}
vocabulary
}
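/// Scans `added_tokens` entries for BOS/EOS-looking token contents, filling in
/// only the ids that are still unknown.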
fn infer_bos_eos_from_added_tokens(
added_tokens: &[serde_json::Value],
mut bos: Option<u32>,
mut eos: Option<u32>,
) -> (Option<u32>, Option<u32>) {
for token in added_tokens {
let content = token.get("content").and_then(|v| v.as_str());
let id = token.get("id").and_then(|v| v.as_u64()).map(|v| v as u32);
let (Some(content), Some(id)) = (content, id) else {
continue;
};
if bos.is_none() && is_bos_token(content) {
bos = Some(id);
}
if eos.is_none() && is_eos_token(content) {
eos = Some(id);
}
}
(bos, eos)
}
fn is_bos_token(content: &str) -> bool {
content.contains("bos") || content == "<s>" || content == "<|startoftext|>"
}
fn is_eos_token(content: &str) -> bool {
content.contains("eos") || content == "</s>" || content == "<|eot_id|>"
}
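/// Loads a tokenizer from an explicitly supplied `tokenizer.json` path
/// (PMAT-232), overlaying `added_tokens` onto the vocabulary and pulling
/// BOS/EOS ids from a sibling `config.json` when one is present.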
pub(crate) fn load_tokenizer_from_explicit_path(tokenizer_path: &Path) -> Option<GgufTokenizer> {
contract_pre_identity!();
if !tokenizer_path.exists() {
eprintln!(
"[PMAT-232] External tokenizer not found: {}",
tokenizer_path.display()
);
return None;
}
let content = fs::read_to_string(tokenizer_path).ok()?;
let json: serde_json::Value = serde_json::from_str(&content).ok()?;
let token_to_id = extract_vocab_with_added_tokens(&json)?;
let sibling_config = load_sibling_config(tokenizer_path);
let expected_vocab_size = get_config_u32(sibling_config.as_ref(), "vocab_size");
let vocabulary = build_vocab_vector(&token_to_id, expected_vocab_size);
eprintln!(
"[PMAT-232] External tokenizer loaded: {} vocab tokens from {}",
vocabulary.len(),
tokenizer_path.display()
);
if vocabulary.is_empty() {
return None;
}
let (bos_token_id, eos_token_id) =
resolve_bos_eos(&json, sibling_config.as_ref());
Some(GgufTokenizer {
vocabulary,
merges: parse_merges(&json),
model_type: extract_model_type(&json),
bos_token_id,
eos_token_id,
architecture: None,
model_name: None,
..Default::default()
})
}
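/// Extracts the `model.vocab` map from `tokenizer.json` as an id-to-token map,
/// then overlays any `added_tokens` entries on top.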
fn extract_vocab_with_added_tokens(
json: &serde_json::Value,
) -> Option<std::collections::BTreeMap<u32, String>> {
let vocab_obj = json.get("model")?.get("vocab")?;
let vocab_map = vocab_obj.as_object()?;
let mut token_to_id: std::collections::BTreeMap<u32, String> = vocab_map
.iter()
.filter_map(|(token, id)| Some((id.as_u64()? as u32, token.clone())))
.collect();
if let Some(added) = json.get("added_tokens").and_then(|v| v.as_array()) {
for token in added {
if let (Some(content), Some(id)) = (
token.get("content").and_then(|v| v.as_str()),
token.get("id").and_then(|v| v.as_u64()),
) {
token_to_id.insert(id as u32, content.to_string());
}
}
}
Some(token_to_id)
}
fn get_config_u32(config: Option<&serde_json::Value>, key: &str) -> u32 {
config
.and_then(|cfg| cfg.get(key).and_then(|v| v.as_u64()))
.map(|v| v as u32)
.unwrap_or(0)
}
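/// Resolves BOS/EOS token ids, preferring the sibling `config.json` and
/// falling back to the tokenizer's `added_tokens` list.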
fn resolve_bos_eos(
json: &serde_json::Value,
sibling_config: Option<&serde_json::Value>,
) -> (Option<u32>, Option<u32>) {
let mut bos = sibling_config
.and_then(|cfg| cfg.get("bos_token_id").and_then(|v| v.as_u64()))
.map(|v| v as u32);
let mut eos = sibling_config
.and_then(|cfg| cfg.get("eos_token_id").and_then(|v| v.as_u64()))
.map(|v| v as u32);
if bos.is_none() || eos.is_none() {
if let Some(added_tokens) = json.get("added_tokens").and_then(|v| v.as_array()) {
(bos, eos) = infer_bos_eos_from_added_tokens(added_tokens, bos, eos);
}
}
(bos, eos)
}
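/// Locates the token-embedding tensor by name and returns its two dimensions
/// ordered as `(vocab_size, hidden_size)`, assuming the vocabulary dimension
/// is the larger of the two.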
fn infer_embedding_dims(
tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
) -> Option<(usize, usize)> {
    tensors
        .iter()
        .find(|(name, _)| {
            name.contains("embed_tokens")
                || name.contains("wte")
                || name.contains("word_embeddings")
                || name.contains("token_embd")
        })
        .and_then(|(_, (_, shape))| {
            if shape.len() == 2 {
                let (dim0, dim1) = (shape[0], shape[1]);
                if dim0 > dim1 {
                    Some((dim0, dim1))
                } else {
                    Some((dim1, dim0))
                }
            } else {
                None
            }
        })
}
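/// Parses the numeric layer index that follows `prefix` in a tensor name,
/// e.g. `"blk.12.attn_q.weight"` with prefix `"blk."` yields `12`.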
fn extract_layer_index(name: &str, prefix: &str) -> Option<usize> {
let start = name.find(prefix)?;
let rest = &name[start + prefix.len()..];
let end = rest.find('.')?;
rest[..end].parse::<usize>().ok()
}
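/// Counts transformer layers by taking the highest layer index found under any
/// known prefix (`blk.`, `layers.`, `h.`, `blocks.`) plus one, or 0 if none match.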
fn count_transformer_layers(tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>) -> usize {
tensors
.keys()
.filter_map(|name| {
if let Some(n) = extract_layer_index(name, "blk.") {
return Some(n);
}
for prefix in &["layers.", "h.", "blocks."] {
if let Some(n) = extract_layer_index(name, prefix) {
return Some(n);
}
}
None
})
.max()
.map(|n| n + 1)
.unwrap_or(0)
}
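/// Returns the smaller dimension of the first 2-D tensor whose name matches
/// any of `name_patterns`, regardless of storage orientation.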
fn find_projection_dim(
tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
name_patterns: &[&str],
) -> Option<usize> {
tensors
.iter()
.find(|(name, _)| name_patterns.iter().any(|p| name.contains(p)))
.and_then(|(_, (_, shape))| {
if shape.len() == 2 {
Some(shape[0].min(shape[1]))
} else {
None
}
})
}
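/// Infers `(n_heads, n_kv_heads)` from the Q and K/V projection widths: a
/// smaller K/V width implies grouped-query attention, while a Q width equal to
/// `hidden_size` implies standard multi-head attention.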
fn infer_head_counts(
q_dim: Option<usize>,
kv_dim: Option<usize>,
hidden_size: usize,
) -> (Option<usize>, Option<usize>) {
match (q_dim, kv_dim) {
(Some(q), Some(kv)) if kv < q => infer_gqa_heads(q, kv),
(Some(q), _) if q == hidden_size => infer_mha_heads(hidden_size),
_ => (None, None),
}
}
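/// Tries common head dimensions (64, 128, 96, 80) to split the Q and K/V
/// widths into head counts for a grouped-query attention layout.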
fn infer_gqa_heads(q: usize, kv: usize) -> (Option<usize>, Option<usize>) {
const HEAD_DIMS: [usize; 4] = [64, 128, 96, 80];
for head_dim in HEAD_DIMS {
if kv.is_multiple_of(head_dim) && q.is_multiple_of(head_dim) {
let n_kv = kv / head_dim;
let n_heads = q / head_dim;
if n_heads >= n_kv && n_kv > 0 {
return (Some(n_heads), Some(n_kv));
}
}
}
(None, None)
}
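/// Derives a head count for standard multi-head attention from the first
/// common head dimension that divides `hidden_size` evenly.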
fn infer_mha_heads(hidden_size: usize) -> (Option<usize>, Option<usize>) {
const HEAD_DIMS: [usize; 4] = [64, 128, 96, 80];
for head_dim in HEAD_DIMS {
if hidden_size.is_multiple_of(head_dim) {
let n_heads = hidden_size / head_dim;
return (Some(n_heads), Some(n_heads));
}
}
(None, None)
}
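/// Infers the FFN intermediate size from the larger dimension of a gate/up
/// projection tensor (`gate_proj`, `up_proj`, `fc1`, `ffn_gate`, `ffn_up`).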
fn infer_intermediate_size_from_tensors(
tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
) -> Option<usize> {
tensors
.iter()
.find(|(name, _)| {
name.contains("gate_proj")
|| name.contains("up_proj")
|| name.contains("fc1")
|| name.contains("ffn_gate")
|| name.contains("ffn_up")
})
.and_then(|(_, (_, shape))| {
if shape.len() == 2 {
Some(shape[0].max(shape[1]))
} else {
None
}
})
}
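/// Heuristically identifies the model architecture from tensor naming
/// conventions: `gpt_neox.*`, `model.decoder.layers.*` (OPT), `bert.*`,
/// `model.layers.*` (LLaMA family, refined to Qwen2/Qwen3 by attention bias,
/// fused QKV, or QK-norm tensors), and `transformer.h.*` (GPT-2); anything
/// else is reported as `"unknown"`.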
fn infer_architecture_from_names(
tensors: &BTreeMap<String, (Vec<f32>, Vec<usize>)>,
) -> Option<String> {
let has_model_layers = tensors.keys().any(|k| k.contains("model.layers"));
let has_transformer_h = tensors.keys().any(|k| k.contains("transformer.h"))
|| tensors
.keys()
.any(|k| k.starts_with("h.") && k.contains(".attn."));
let has_gpt_neox = tensors.keys().any(|k| k.starts_with("gpt_neox."));
if has_gpt_neox {
return Some("gpt-neox".to_string());
}
let has_opt_decoder = tensors
.keys()
.any(|k| k.starts_with("model.decoder.layers."));
if has_opt_decoder {
return Some("opt".to_string());
}
let has_bert = tensors.keys().any(|k| k.starts_with("bert."));
if has_bert {
return Some("bert".to_string());
}
if has_model_layers {
let has_qk_norm = tensors
.keys()
.any(|k| k.contains("self_attn.q_norm.weight"));
if has_qk_norm {
return Some("qwen3".to_string());
}
let has_attn_bias = tensors.keys().any(|k| k.contains("self_attn.q_proj.bias"));
let has_fused_qkv = tensors.keys().any(|k| k.contains("qkv_proj.weight"));
if has_attn_bias || has_fused_qkv {
Some("qwen2".to_string())
} else {
Some("llama".to_string())
}
} else if has_transformer_h {
Some("gpt2".to_string())
    } else {
        // GGUF-style "blk." names and any other unrecognized layout cannot be
        // distinguished further here.
        Some("unknown".to_string())
    }
}
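/// Fallback tokenizer loader (GH-366): locates a SentencePiece
/// `tokenizer.model`, parses its protobuf, and defaults BOS/EOS to ids 1/2
/// when no sibling config provides them.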
fn load_tokenizer_from_sentencepiece(model_path: &Path) -> Option<GgufTokenizer> {
let sp_path = find_sentencepiece_model(model_path)?;
let data = fs::read(&sp_path).ok()?;
let (vocab, scores) = parse_sentencepiece_protobuf(&data)?;
eprintln!(
"[GH-366] Loaded SentencePiece tokenizer: {} vocab tokens from {}",
vocab.len(),
sp_path.display()
);
if vocab.is_empty() {
return None;
}
    let config_json = load_sibling_config(&sp_path).or_else(|| {
        let tc_path = sp_path.with_file_name("tokenizer_config.json");
        tc_path
            .exists()
            .then(|| fs::read_to_string(&tc_path).ok())
            .flatten()
            .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
    });
    let bos = config_json
        .as_ref()
        .and_then(|cfg| cfg.get("bos_token_id").and_then(|v| v.as_u64()))
        .map(|v| v as u32)
        .or(Some(1));
    let eos = config_json
        .as_ref()
        .and_then(|cfg| cfg.get("eos_token_id").and_then(|v| v.as_u64()))
        .map(|v| v as u32)
        .or(Some(2));
Some(GgufTokenizer {
vocabulary: vocab,
scores,
model_type: Some("unigram".to_string()),
bos_token_id: bos,
eos_token_id: eos,
..Default::default()
})
}
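/// Searches for `tokenizer.model` next to the model file, under the Pacha
/// cache naming scheme (`<stem>.tokenizer.model`), one directory above the
/// model's directory, and in a sibling `safetensors/` directory.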
fn find_sentencepiece_model(model_path: &Path) -> Option<PathBuf> {
let standard = model_path.with_file_name("tokenizer.model");
if standard.exists() {
return Some(standard);
}
let stem = model_path.file_stem()?.to_str()?;
let base = stem.split('.').next().unwrap_or(stem);
let pacha = model_path.with_file_name(format!("{base}.tokenizer.model"));
if pacha.exists() {
return Some(pacha);
}
let parent = model_path.parent()?.parent()?.join("tokenizer.model");
if parent.exists() {
return Some(parent);
}
let sibling = model_path.parent()?.parent()?.join("safetensors").join("tokenizer.model");
if sibling.exists() {
return Some(sibling);
}
None
}
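/// Skips over a protobuf field of the given wire type: length-delimited (2),
/// varint (0), 64-bit (1), or 32-bit (5); unknown wire types abort parsing.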
fn skip_proto_field(data: &[u8], pos: usize, wire_type: u64) -> Option<usize> {
match wire_type {
2 => {
let (len, new_pos) = read_varint(data, pos)?;
Some(new_pos + len as usize)
}
0 => {
let (_, new_pos) = read_varint(data, pos)?;
Some(new_pos)
}
1 => Some(pos + 8),
5 => Some(pos + 4),
_ => None,
}
}
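/// Minimal parser for the SentencePiece model protobuf: walks the top-level
/// message and collects each length-delimited `pieces` entry (field 1) into
/// parallel vocab/score vectors, skipping all other fields.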
fn parse_sentencepiece_protobuf(data: &[u8]) -> Option<(Vec<String>, Vec<f32>)> {
let mut vocab = Vec::new();
let mut scores = Vec::new();
let mut pos = 0;
while pos < data.len() {
let (tag, new_pos) = read_varint(data, pos)?;
pos = new_pos;
let field_number = tag >> 3;
let wire_type = tag & 0x7;
match (field_number, wire_type) {
(1, 2) => {
let (len, new_pos) = read_varint(data, pos)?;
pos = new_pos;
let len = len as usize;
if pos + len > data.len() {
return None;
}
let (piece, score) = parse_sentencepiece_entry(&data[pos..pos + len])?;
vocab.push(piece);
scores.push(score);
pos += len;
}
_ => {
pos = skip_proto_field(data, pos, wire_type)?;
}
}
}
if vocab.is_empty() { None } else { Some((vocab, scores)) }
}
fn read_proto_string(data: &[u8], pos: usize) -> Option<(String, usize)> {
let (len, new_pos) = read_varint(data, pos)?;
let len = len as usize;
if new_pos + len > data.len() {
return None;
}
let s = String::from_utf8_lossy(&data[new_pos..new_pos + len]).into_owned();
Some((s, new_pos + len))
}
fn read_proto_f32(data: &[u8], pos: usize) -> Option<(f32, usize)> {
if pos + 4 > data.len() {
return None;
}
let val = f32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]);
Some((val, pos + 4))
}
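/// Parses one SentencePiece entry: field 1 is the piece string and field 2 is
/// its float score; unrecognized fields are skipped.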
fn parse_sentencepiece_entry(data: &[u8]) -> Option<(String, f32)> {
let mut piece = String::new();
let mut score: f32 = 0.0;
let mut pos = 0;
while pos < data.len() {
let (tag, new_pos) = read_varint(data, pos)?;
pos = new_pos;
let field_number = tag >> 3;
let wire_type = tag & 0x7;
match (field_number, wire_type) {
(1, 2) => {
let (s, new_pos) = read_proto_string(data, pos)?;
piece = s;
pos = new_pos;
}
(2, 5) => {
let (v, new_pos) = read_proto_f32(data, pos)?;
score = v;
pos = new_pos;
}
_ => {
pos = skip_proto_field(data, pos, wire_type)?;
}
}
}
Some((piece, score))
}
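/// Reads a protobuf base-128 varint starting at `start`, returning the value
/// and the position just past it; fails on truncated input or varints that
/// run past 64 bits.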
fn read_varint(data: &[u8], start: usize) -> Option<(u64, usize)> {
let mut result: u64 = 0;
let mut shift = 0;
let mut pos = start;
loop {
if pos >= data.len() {
return None;
}
let byte = data[pos];
result |= u64::from(byte & 0x7F) << shift;
pos += 1;
if byte & 0x80 == 0 {
return Some((result, pos));
}
shift += 7;
if shift >= 64 {
return None;
}
}
}