// batuta/serve/banco/model_slot.rs
//! Model slot — tracks which model is loaded in Banco.
//!
//! Phase 2a: metadata-only (path, format, size, loaded_at).
//! Phase 2b: GGUF metadata extraction via realizar (behind `inference` feature).
5
6use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8use std::sync::{Arc, RwLock};
9use std::time::Instant;
10
/// Detected model format.
///
/// Serialized in lowercase (e.g. `"gguf"`) via the `rename_all` attribute.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ModelFormat {
    /// `.gguf` file.
    Gguf,
    /// `.apr` file.
    Apr,
    /// `.safetensors` file.
    SafeTensors,
    /// Extension missing, non-UTF-8, or unrecognized.
    Unknown,
}
20
21impl ModelFormat {
22    /// Detect format from file extension.
23    #[must_use]
24    pub fn from_path(path: &Path) -> Self {
25        match path.extension().and_then(|e| e.to_str()) {
26            Some("gguf") => Self::Gguf,
27            Some("apr") => Self::Apr,
28            Some("safetensors") => Self::SafeTensors,
29            _ => Self::Unknown,
30        }
31    }
32}
33
/// Metadata about a loaded model.
///
/// Optional fields are omitted from JSON output (`skip_serializing_if`)
/// rather than serialized as `null`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelSlotInfo {
    /// Identifier derived from the file stem ("unknown" if the stem is unreadable).
    pub model_id: String,
    /// Path exactly as passed to `ModelSlot::load`.
    pub path: String,
    /// Format detected from the file extension.
    pub format: ModelFormat,
    /// File size in bytes (0 when file metadata could not be read).
    pub size_bytes: u64,
    /// Unix timestamp (seconds) at load time.
    pub loaded_at_secs: u64,
    /// Architecture name (e.g., "llama", "phi2", "qwen2"). Available when inference feature enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub architecture: Option<String>,
    /// Vocabulary size. Available when inference feature enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vocab_size: Option<usize>,
    /// Hidden dimension. Available when inference feature enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hidden_dim: Option<usize>,
    /// Number of transformer layers. Available when inference feature enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_layers: Option<usize>,
    /// Context length. Available when inference feature enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_length: Option<usize>,
    /// Number of tensors in the model file.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tensor_count: Option<usize>,
}
61
/// Model slot — holds the currently loaded model (or None).
///
/// Each field is guarded by its own `RwLock`; `load`/`unload` take the write
/// side, accessors take the read side and degrade gracefully on poisoning.
pub struct ModelSlot {
    /// Metadata for the loaded model; `None` when the slot is empty.
    info: RwLock<Option<ModelSlotInfo>>,
    /// Monotonic load instant backing `uptime_secs`; `None` when empty.
    loaded_at: RwLock<Option<Instant>>,
    /// The actual quantized model for inference (behind inference feature).
    #[cfg(feature = "realizar")]
    quantized_model: RwLock<Option<Arc<realizar::gguf::OwnedQuantizedModel>>>,
    /// Vocabulary tokens for encoding/decoding.
    #[cfg(feature = "realizar")]
    vocab: RwLock<Vec<String>>,
    /// Proper BPE tokenizer from aprender (behind ml feature).
    /// Uses merge rules for correct tokenization instead of greedy longest-match.
    #[cfg(feature = "aprender")]
    bpe_tokenizer: RwLock<Option<aprender::text::bpe::BpeTokenizer>>,
}
77
78impl ModelSlot {
79    /// Create an empty slot.
80    #[must_use]
81    pub fn empty() -> Self {
82        Self {
83            info: RwLock::new(None),
84            loaded_at: RwLock::new(None),
85            #[cfg(feature = "realizar")]
86            quantized_model: RwLock::new(None),
87            #[cfg(feature = "realizar")]
88            vocab: RwLock::new(Vec::new()),
89            #[cfg(feature = "aprender")]
90            bpe_tokenizer: RwLock::new(None),
91        }
92    }
93
94    /// Load a model from a path.
95    ///
96    /// With `inference` feature: parses GGUF metadata (architecture, vocab, layers).
97    /// Without: records file metadata only.
98    pub fn load(&self, path: &str) -> Result<ModelSlotInfo, ModelSlotError> {
99        let pb = PathBuf::from(path);
100
101        let model_id = pb.file_stem().and_then(|s| s.to_str()).unwrap_or("unknown").to_string();
102        let format = ModelFormat::from_path(&pb);
103        let size_bytes = std::fs::metadata(&pb).map(|m| m.len()).unwrap_or(0);
104
105        // Extract model metadata + quantized model when inference feature is available
106        let gguf_meta = extract_model_metadata(&pb, format);
107
108        let info = ModelSlotInfo {
109            model_id,
110            path: path.to_string(),
111            format,
112            size_bytes,
113            loaded_at_secs: epoch_secs(),
114            architecture: gguf_meta.as_ref().map(|m| m.architecture.clone()),
115            vocab_size: gguf_meta.as_ref().map(|m| m.vocab_size),
116            hidden_dim: gguf_meta.as_ref().map(|m| m.hidden_dim),
117            num_layers: gguf_meta.as_ref().map(|m| m.num_layers),
118            context_length: gguf_meta.as_ref().map(|m| m.context_length),
119            tensor_count: gguf_meta.as_ref().map(|m| m.tensor_count),
120        };
121
122        // Store the quantized model when inference feature is enabled
123        #[cfg(feature = "realizar")]
124        if let Some(ref meta) = gguf_meta {
125            if let Ok(mut m) = self.quantized_model.write() {
126                *m = meta.model.clone();
127            }
128            if let Ok(mut v) = self.vocab.write() {
129                *v = meta.vocab.clone();
130            }
131        }
132
133        // Try to load a proper BPE tokenizer (correct merge rules vs greedy)
134        #[cfg(feature = "aprender")]
135        {
136            let bpe = load_bpe_tokenizer(&pb);
137            if let Ok(mut t) = self.bpe_tokenizer.write() {
138                *t = bpe;
139            }
140        }
141
142        if let Ok(mut slot) = self.info.write() {
143            *slot = Some(info.clone());
144        }
145        if let Ok(mut t) = self.loaded_at.write() {
146            *t = Some(Instant::now());
147        }
148
149        Ok(info)
150    }
151
152    /// Unload the current model, freeing the quantized model and vocabulary.
153    pub fn unload(&self) -> Result<(), ModelSlotError> {
154        let had_model = self.info.write().map(|mut s| s.take().is_some()).unwrap_or(false);
155        if let Ok(mut t) = self.loaded_at.write() {
156            *t = None;
157        }
158        #[cfg(feature = "realizar")]
159        {
160            if let Ok(mut m) = self.quantized_model.write() {
161                *m = None;
162            }
163            if let Ok(mut v) = self.vocab.write() {
164                v.clear();
165            }
166        }
167        #[cfg(feature = "aprender")]
168        {
169            if let Ok(mut t) = self.bpe_tokenizer.write() {
170                *t = None;
171            }
172        }
173        if had_model {
174            Ok(())
175        } else {
176            Err(ModelSlotError::NoModelLoaded)
177        }
178    }
179
180    /// Get current model info (None if empty).
181    #[must_use]
182    pub fn info(&self) -> Option<ModelSlotInfo> {
183        self.info.read().ok()?.clone()
184    }
185
186    /// Check if a model is loaded.
187    #[must_use]
188    pub fn is_loaded(&self) -> bool {
189        self.info.read().map(|s| s.is_some()).unwrap_or(false)
190    }
191
192    /// Get the quantized model for inference (None if not loaded or inference feature disabled).
193    #[cfg(feature = "realizar")]
194    #[must_use]
195    pub fn quantized_model(&self) -> Option<Arc<realizar::gguf::OwnedQuantizedModel>> {
196        self.quantized_model.read().ok()?.clone()
197    }
198
199    /// Get the vocabulary tokens.
200    #[cfg(feature = "realizar")]
201    #[must_use]
202    pub fn vocabulary(&self) -> Vec<String> {
203        self.vocab.read().map(|v| v.clone()).unwrap_or_default()
204    }
205
206    /// Check if inference-capable model is loaded (not just metadata).
207    #[cfg(feature = "realizar")]
208    #[must_use]
209    pub fn has_inference_model(&self) -> bool {
210        self.quantized_model.read().map(|m| m.is_some()).unwrap_or(false)
211    }
212
213    /// Encode text to token IDs using proper BPE when available, else greedy fallback.
214    ///
215    /// Priority: BPE tokenizer (correct merge rules) → greedy longest-match (approximate).
216    #[cfg(feature = "realizar")]
217    #[must_use]
218    pub fn encode_text(&self, text: &str) -> Vec<u32> {
219        if text.is_empty() {
220            return Vec::new();
221        }
222
223        // Try BPE tokenizer first (correct tokenization)
224        #[cfg(feature = "aprender")]
225        if let Ok(guard) = self.bpe_tokenizer.read() {
226            if let Some(ref bpe) = *guard {
227                return bpe.encode(text);
228            }
229        }
230
231        // Fall back to greedy longest-match (approximate)
232        let vocab = self.vocabulary();
233        super::inference::encode_prompt(&vocab, text)
234    }
235
236    /// Check if a proper BPE tokenizer is loaded (not just greedy fallback).
237    #[cfg(feature = "aprender")]
238    #[must_use]
239    pub fn has_bpe_tokenizer(&self) -> bool {
240        self.bpe_tokenizer.read().map(|t| t.is_some()).unwrap_or(false)
241    }
242
243    /// How long the model has been loaded.
244    #[must_use]
245    pub fn uptime_secs(&self) -> u64 {
246        self.loaded_at.read().ok().and_then(|t| t.map(|i| i.elapsed().as_secs())).unwrap_or(0)
247    }
248}
249
/// Model slot errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ModelSlotError {
    /// `unload` was called while the slot was already empty.
    NoModelLoaded,
}

impl std::fmt::Display for ModelSlotError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::NoModelLoaded => "No model loaded",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ModelSlotError {}
265
/// Seconds since the Unix epoch; 0 if the system clock reads before the epoch.
fn epoch_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
269
/// Extracted GGUF metadata (architecture, vocab, etc.).
///
/// Produced by `extract_model_metadata` for both GGUF and APR files.
/// The inference-only fields exist only with the `realizar` feature.
struct GgufMeta {
    /// Architecture name reported by the model file (e.g. "llama").
    architecture: String,
    /// Vocabulary size.
    vocab_size: usize,
    /// Hidden dimension.
    hidden_dim: usize,
    /// Number of transformer layers.
    num_layers: usize,
    /// Context length.
    context_length: usize,
    /// Number of tensors in the model file.
    tensor_count: usize,
    /// The quantized model for inference (only with inference feature).
    /// `None` when metadata parsed but the model build failed.
    #[cfg(feature = "realizar")]
    model: Option<Arc<realizar::gguf::OwnedQuantizedModel>>,
    /// Vocabulary tokens.
    #[cfg(feature = "realizar")]
    vocab: Vec<String>,
}
285
/// Extract model metadata + quantized model from GGUF or APR file.
///
/// Only GGUF and APR are introspectable; other formats yield `None`.
#[cfg(feature = "realizar")]
fn extract_model_metadata(path: &Path, format: ModelFormat) -> Option<GgufMeta> {
    match format {
        ModelFormat::Gguf => extract_gguf_metadata(path),
        ModelFormat::Apr => extract_apr_metadata(path),
        ModelFormat::SafeTensors | ModelFormat::Unknown => None,
    }
}
295
/// Extract GGUF metadata + model from a .gguf file.
///
/// Returns `None` when the path is not valid UTF-8 or the file fails to
/// map/parse. A failed quantized-model build still yields metadata, with
/// `model: None` (inference disabled for that model).
#[cfg(feature = "realizar")]
fn extract_gguf_metadata(path: &Path) -> Option<GgufMeta> {
    // Memory-map for efficient loading
    let path_str = path.to_str()?;
    let mapped = realizar::gguf::MappedGGUFModel::from_path(path_str).ok()?;
    let config = realizar::gguf::GGUFConfig::from_gguf(&mapped.model).ok()?;

    // Extract vocabulary; synthesize placeholder tokens when absent.
    let vocab = match mapped.model.vocabulary() {
        Some(tokens) => tokens,
        None => (0..config.vocab_size).map(|i| format!("token{i}")).collect(),
    };

    // Build quantized model for inference; metadata survives a build failure.
    let quantized = realizar::gguf::OwnedQuantizedModel::from_mapped(&mapped)
        .map(|m| {
            eprintln!("[banco] Quantized model loaded successfully");
            m
        })
        .map_err(|e| {
            eprintln!("[banco] WARNING: Failed to build quantized model: {e}");
            eprintln!("[banco] Metadata available but inference disabled for this model");
        })
        .ok();

    Some(GgufMeta {
        architecture: config.architecture.clone(),
        vocab_size: config.vocab_size,
        hidden_dim: config.hidden_dim,
        num_layers: config.num_layers,
        context_length: config.context_length,
        tensor_count: mapped.model.tensors.len(),
        model: quantized.map(Arc::new),
        vocab,
    })
}
333
/// Extract APR metadata + model from a .apr file.
///
/// Missing metadata fields fall back to 0 / "unknown" / 2048 context length.
/// A failed model build still yields metadata with `model: None`.
#[cfg(feature = "realizar")]
fn extract_apr_metadata(path: &Path) -> Option<GgufMeta> {
    let apr = realizar::apr::MappedAprModel::from_path(path).ok()?;
    let meta = &apr.metadata;

    // NOTE: these binding names are captured by name in the log line below.
    let architecture = meta.architecture.clone().unwrap_or_else(|| "unknown".to_string());
    let hidden_dim = meta.hidden_size.unwrap_or(0);
    let num_layers = meta.num_layers.unwrap_or(0);
    let vocab_size = meta.vocab_size.unwrap_or(0);
    let context_length = meta.max_position_embeddings.unwrap_or(2048);
    let tensor_count = apr.tensor_count();

    eprintln!(
        "[banco] APR model: {architecture} | {num_layers} layers | {hidden_dim}d | {vocab_size} vocab | {tensor_count} tensors"
    );

    // Placeholder vocabulary ("token{i}"); empty when vocab size is unknown.
    let vocab: Vec<String> = if vocab_size == 0 {
        Vec::new()
    } else {
        (0..vocab_size).map(|i| format!("token{i}")).collect()
    };

    // Build quantized model for inference via realizar
    let quantized = realizar::gguf::OwnedQuantizedModel::from_apr(&apr)
        .map(|m| {
            eprintln!("[banco] APR quantized model loaded successfully");
            m
        })
        .map_err(|e| {
            eprintln!("[banco] WARNING: Failed to build quantized model from APR: {e}");
        })
        .ok();

    Some(GgufMeta {
        architecture,
        vocab_size,
        hidden_dim,
        num_layers,
        context_length,
        tensor_count,
        model: quantized.map(Arc::new),
        vocab,
    })
}
381
/// Stub when inference feature is not enabled.
///
/// Keeps `ModelSlot::load` feature-agnostic: without `realizar`, every load
/// records file metadata only and all model-introspection fields stay `None`.
#[cfg(not(feature = "realizar"))]
fn extract_model_metadata(_path: &Path, _format: ModelFormat) -> Option<GgufMeta> {
    None
}
387
/// Load a proper BPE tokenizer for a model file.
///
/// Search order (same as apr-cli):
/// 1. Sibling `{stem}.tokenizer.json` (e.g., `model.tokenizer.json`)
/// 2. `tokenizer.json` in the same directory
///
/// Returns `None` if no tokenizer.json found — caller falls back to greedy.
#[cfg(feature = "aprender")]
fn load_bpe_tokenizer(model_path: &Path) -> Option<aprender::text::bpe::BpeTokenizer> {
    let stem = model_path.file_stem()?.to_string_lossy();

    // Candidate paths, in priority order. The two branches previously
    // duplicated the load+log logic; `try_load_tokenizer` now owns it.
    let mut candidates = vec![model_path.with_file_name(format!("{stem}.tokenizer.json"))];
    if let Some(parent) = model_path.parent() {
        candidates.push(parent.join("tokenizer.json"));
    }

    for candidate in &candidates {
        if let Some(tok) = try_load_tokenizer(candidate) {
            return Some(tok);
        }
    }

    eprintln!(
        "[banco] No tokenizer.json found for '{}' — using greedy tokenization",
        model_path.display()
    );
    None
}

/// Attempt to load a single tokenizer.json candidate, logging the outcome.
/// Returns `None` when the file is absent or fails to parse (parse failure
/// is non-fatal: the caller moves on to the next candidate).
#[cfg(feature = "aprender")]
fn try_load_tokenizer(path: &Path) -> Option<aprender::text::bpe::BpeTokenizer> {
    if !path.exists() {
        return None;
    }
    match aprender::text::bpe::BpeTokenizer::from_huggingface(path) {
        Ok(tok) => {
            eprintln!("[banco] BPE tokenizer loaded from {}", path.display());
            Some(tok)
        }
        Err(e) => {
            eprintln!("[banco] WARNING: Failed to load tokenizer from {}: {e}", path.display());
            None
        }
    }
}