use std::path::Path;
use std::sync::LazyLock;
use anyhow::{Context, Result, anyhow};
use llm_multimodal::{
ImagePreProcessor, ImageProcessorRegistry, ModelMetadata, ModelRegistry, PreProcessorConfig,
};
use llm_tokenizer::traits::Tokenizer;
use llm_tokenizer::{Decoder, Encoder, Encoding, HuggingFaceTokenizer, SpecialTokens};
use crate::protocols::TokenIdType;
/// Stand-in tokenizer with an empty vocabulary: it encodes everything to an
/// empty `Encoding::Plain`, decodes everything to an empty string, and knows
/// no tokens. `resolve_image_token_id_with_config` falls back to it when
/// `tokenizer.json` cannot be loaded.
struct NullTokenizer;
/// Encoding side of the null tokenizer: every input maps to an empty plain encoding.
impl Encoder for NullTokenizer {
    /// Ignores the text and returns an `Encoding::Plain` holding no token ids.
    fn encode(&self, _input: &str, _add_special_tokens: bool) -> anyhow::Result<Encoding> {
        let no_tokens = Vec::new();
        Ok(Encoding::Plain(no_tokens))
    }

    /// Returns one empty encoding per input, in input order.
    fn encode_batch(
        &self,
        inputs: &[&str],
        add_special_tokens: bool,
    ) -> anyhow::Result<Vec<Encoding>> {
        // `encode` never fails, so collecting into a `Result` always yields `Ok`.
        inputs
            .iter()
            .map(|text| self.encode(text, add_special_tokens))
            .collect()
    }
}
/// Decoding side of the null tokenizer.
impl Decoder for NullTokenizer {
    /// Ignores the token ids and always succeeds with an empty string.
    fn decode(&self, _ids: &[u32], _skip_special_tokens: bool) -> anyhow::Result<String> {
        let decoded = String::new();
        Ok(decoded)
    }
}
/// Vocabulary queries for the null tokenizer: the vocabulary is empty, so
/// every lookup misses.
impl Tokenizer for NullTokenizer {
    /// The null tokenizer has no vocabulary entries.
    fn vocab_size(&self) -> usize {
        0
    }

    /// Returns a shared, lazily created default `SpecialTokens` value.
    fn get_special_tokens(&self) -> &SpecialTokens {
        static NO_SPECIAL_TOKENS: LazyLock<SpecialTokens> =
            LazyLock::new(SpecialTokens::default);
        LazyLock::force(&NO_SPECIAL_TOKENS)
    }

    /// No token string maps to an id.
    fn token_to_id(&self, _token: &str) -> Option<u32> {
        None
    }

    /// No id maps to a token string.
    fn id_to_token(&self, _id: u32) -> Option<String> {
        None
    }

    /// Exposes `self` as `Any` for downcasting.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
/// Process-wide image-processor registry, built on first access via
/// `ImageProcessorRegistry::with_defaults`. `LightseekMmCounter::try_new`
/// looks processors up here by model id / model type.
static REGISTRY: LazyLock<ImageProcessorRegistry> =
LazyLock::new(ImageProcessorRegistry::with_defaults);
/// Process-wide model registry, built on first access via `ModelRegistry::new`.
/// `resolve_image_token_id_with_config` queries it for the spec matching a model.
static MODEL_REGISTRY: LazyLock<ModelRegistry> = LazyLock::new(ModelRegistry::new);
/// Image token counter for a single model: pairs the image preprocessor found
/// in `REGISTRY` with the settings parsed from the model's
/// `preprocessor_config.json`.
pub struct LightseekMmCounter {
// Preprocessor selected from `REGISTRY` for this model.
processor: &'static dyn ImagePreProcessor,
// Settings parsed from `preprocessor_config.json` in the model directory.
config: PreProcessorConfig,
// Model identifier this counter was constructed with.
model_id: String,
}
impl LightseekMmCounter {
    /// Builds a counter for `model_id` from the files in `model_dir`.
    ///
    /// # Errors
    ///
    /// Fails, in this order, when `preprocessor_config.json` cannot be read,
    /// when it cannot be parsed, or when `REGISTRY` has no image processor for
    /// the given `model_id` / `model_type`.
    pub fn try_new(model_id: &str, model_type: Option<&str>, model_dir: &Path) -> Result<Self> {
        let cfg_path = model_dir.join("preprocessor_config.json");

        let raw_config = std::fs::read_to_string(&cfg_path).with_context(|| {
            format!(
                "lightseek: failed to read preprocessor_config.json at {}",
                cfg_path.display()
            )
        })?;

        let config = PreProcessorConfig::from_json(&raw_config).with_context(|| {
            format!(
                "lightseek: failed to parse preprocessor_config.json at {}",
                cfg_path.display()
            )
        })?;

        // The registry lookup deliberately comes after the config is loaded so
        // that config errors are reported first.
        let Some(processor) = REGISTRY.find(model_id, model_type) else {
            return Err(anyhow!(
                "lightseek: no image processor registered for model_id={:?} model_type={:?}",
                model_id,
                model_type
            ));
        };

        Ok(Self {
            model_id: model_id.to_owned(),
            config,
            processor,
        })
    }

    /// Number of tokens the processor reports for an image of `width` x `height`
    /// under this model's preprocessor configuration.
    pub fn count_tokens(&self, width: u32, height: u32) -> usize {
        let Self {
            processor, config, ..
        } = self;
        processor.calculate_num_tokens(width, height, config)
    }

    /// Model identifier this counter was created for.
    pub fn model_id(&self) -> &str {
        self.model_id.as_str()
    }
}
/// Resolves the image-placeholder token id for `model_id` using the files in
/// `model_dir`.
///
/// Returns `None` when `config.json` cannot be loaded or when the id cannot
/// be resolved from it.
pub fn resolve_image_token_id(model_id: &str, model_dir: &Path) -> Option<TokenIdType> {
    read_json(model_dir, "config.json")
        .and_then(|config| resolve_image_token_id_with_config(model_id, model_dir, &config))
}
fn resolve_image_token_id_with_config(
model_id: &str,
model_dir: &Path,
config: &serde_json::Value,
) -> Option<TokenIdType> {
let tokenizer_path = model_dir.join("tokenizer.json");
let hf_tokenizer =
tokenizer_path
.to_str()
.and_then(|p| match HuggingFaceTokenizer::from_file(p) {
Ok(t) => Some(t),
Err(e) => {
tracing::debug!(
target: "mm_routing",
model_dir = %model_dir.display(),
err = %e,
"lightseek: tokenizer.json not loaded; falling back to NullTokenizer"
);
None
}
});
let null_tokenizer = NullTokenizer;
let tokenizer: &dyn Tokenizer = match hf_tokenizer.as_ref() {
Some(t) => t,
None => &null_tokenizer,
};
let metadata = ModelMetadata {
model_id,
tokenizer,
config,
};
let spec = MODEL_REGISTRY.lookup(&metadata)?;
let id = spec
.placeholder_token_id(&metadata)
.map_err(|e| {
tracing::warn!(
target: "mm_routing",
model_id = %model_id,
err = %e,
"lightseek: ModelProcessorSpec could not resolve placeholder_token_id"
);
e
})
.ok()?;
tracing::debug!(
target: "mm_routing",
model_id = %model_id,
image_token_id = id,
spec = spec.name(),
"resolved image-placeholder token id"
);
Some(id as TokenIdType)
}
/// Token ids and strings resolved from a model directory by
/// `resolve_routing_tokens`. Every field is `None` when the corresponding
/// file or key is unavailable.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RoutingTokens {
    /// Image-placeholder token id resolved through the model registry.
    pub image_token_id: Option<TokenIdType>,
    /// `image_token_id` read directly from `config.json`; falls back to
    /// [`RoutingTokens::image_token_id`] when that key is absent or invalid.
    pub chat_placeholder_token_id: Option<TokenIdType>,
    /// `bos_token` from `tokenizer_config.json`, populated only when that file
    /// sets `add_bos_token` to `true`.
    pub bos_token_string: Option<String>,
}
/// Collects every routing-related token for `model_id` from `model_dir`.
///
/// Reads `config.json` and `tokenizer_config.json` (either may be missing) and
/// never fails: anything that cannot be resolved is left as `None`.
pub fn resolve_routing_tokens(model_id: &str, model_dir: &Path) -> RoutingTokens {
    // Both files are read up front, before any resolution work.
    let model_config = read_json(model_dir, "config.json");
    let tokenizer_config = read_json(model_dir, "tokenizer_config.json");

    let image_token_id = match &model_config {
        Some(cfg) => resolve_image_token_id_with_config(model_id, model_dir, cfg),
        None => None,
    };

    // An explicit `image_token_id` in config.json wins; otherwise reuse the
    // registry-resolved id.
    let explicit_placeholder = model_config
        .as_ref()
        .and_then(extract_chat_placeholder_from_config);

    RoutingTokens {
        image_token_id,
        chat_placeholder_token_id: explicit_placeholder.or(image_token_id),
        bos_token_string: tokenizer_config
            .as_ref()
            .and_then(extract_bos_token_from_tokenizer_config),
    }
}
/// Loads `filename` from `model_dir` and parses it as JSON.
///
/// Returns `None` on any failure. A missing file is silent; other read errors
/// and parse errors are logged at warn level on the `mm_routing` target.
fn read_json(model_dir: &Path, filename: &str) -> Option<serde_json::Value> {
    let path = model_dir.join(filename);

    let contents = std::fs::read_to_string(&path)
        .inspect_err(|io_err| {
            // Absence of the file is an expected condition and is not logged.
            if io_err.kind() != std::io::ErrorKind::NotFound {
                tracing::warn!(
                    target: "mm_routing",
                    path = %path.display(),
                    err = %io_err,
                    "lightseek: failed to read {filename}"
                );
            }
        })
        .ok()?;

    serde_json::from_str::<serde_json::Value>(&contents)
        .inspect_err(|parse_err| {
            tracing::warn!(
                target: "mm_routing",
                path = %path.display(),
                err = %parse_err,
                "lightseek: failed to parse {filename}"
            );
        })
        .ok()
}
/// Reads the top-level `image_token_id` key of a parsed `config.json`.
///
/// Returns `None` when the key is missing, is not an unsigned integer, or
/// does not fit in a `u32`.
fn extract_chat_placeholder_from_config(config: &serde_json::Value) -> Option<TokenIdType> {
    let raw_id = config.get("image_token_id")?.as_u64()?;
    u32::try_from(raw_id).ok()
}
/// Extracts the BOS token string from a parsed `tokenizer_config.json`.
///
/// Yields a value only when `add_bos_token` is the boolean `true`. The
/// `bos_token` entry may be a plain string or an object whose `content` field
/// is a string; any other shape gives `None`.
fn extract_bos_token_from_tokenizer_config(cfg: &serde_json::Value) -> Option<String> {
    // A missing or non-boolean `add_bos_token` counts as "do not add".
    let adds_bos = cfg
        .get("add_bos_token")
        .and_then(serde_json::Value::as_bool)
        == Some(true);
    if !adds_bos {
        return None;
    }

    match cfg.get("bos_token")? {
        serde_json::Value::String(text) => Some(text.clone()),
        serde_json::Value::Object(fields) => {
            fields.get("content")?.as_str().map(str::to_owned)
        }
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A Qwen3-VL processor is found both for a hub-style id and for a local path.
    #[test]
    fn image_processor_registry_resolves_qwen3vl_via_path_substring() {
        let hub_id = "Qwen/Qwen3-VL-2B-Instruct";
        let local_path = "/models/Qwen3-VL-2B/";
        assert!(REGISTRY.find(hub_id, None).is_some());
        assert!(REGISTRY.find(local_path, None).is_some());
    }

    /// An unrecognised path resolves only once a model type is supplied.
    #[test]
    fn image_processor_registry_uses_model_type_fallback() {
        let finetune_path = "/models/my-finetune";
        assert!(REGISTRY.find(finetune_path, None).is_none());
        assert!(REGISTRY.find(finetune_path, Some("qwen3_vl")).is_some());
    }

    /// Every listed family must resolve by model id or by model type.
    #[test]
    fn image_processor_registry_covers_documented_families() {
        // (family label, model id, model type)
        const FAMILIES: &[(&str, &str, &str)] = &[
            ("Qwen3-VL", "Qwen/Qwen3-VL-2B-Instruct", "qwen3_vl"),
            ("Qwen2-VL", "Qwen/Qwen2-VL-7B-Instruct", "qwen2_vl"),
            ("Qwen2.5-VL", "Qwen/Qwen2.5-VL-7B-Instruct", "qwen2_5_vl"),
            (
                "LLaVA-NeXT",
                "llava-hf/llava-v1.6-mistral-7b-hf",
                "llava_next",
            ),
            ("LLaVA-1.5", "llava-hf/llava-1.5-7b-hf", "llava"),
            (
                "Phi-3-vision",
                "microsoft/Phi-3-vision-128k-instruct",
                "phi3_v",
            ),
            ("Llama-4", "meta-llama/Llama-4-Scout-17B-16E", "llama4"),
            ("Kimi-K2.5", "moonshotai/Kimi-K2.5-Instruct", "kimi_k2_5"),
        ];

        let missing: Vec<&str> = FAMILIES
            .iter()
            .filter_map(|(family, hf_id, model_type)| {
                // Both lookups run for every family, regardless of the first result.
                let found_by_id = REGISTRY.find(hf_id, None).is_some();
                let found_by_type = REGISTRY.find("/local/finetune", Some(model_type)).is_some();
                (!found_by_id && !found_by_type).then_some(*family)
            })
            .collect();

        assert!(
            missing.is_empty(),
            "lightseek registry has no processor for: {:?}. \
             Either pick up an smg release that registers these, or trim \
             the supported-families list in docs.",
            missing
        );
    }
}