realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
//! High-level inference API for CLI tools
//!
//! This module provides a simple, high-level API for running inference
//! that can be used by CLI tools like `apr run` and `apr chat`.
//!
//! # Architecture (APR-CLI-DELEGATE-001)
//!
//! ```text
//! ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
//! │  apr-cli    │ --> │  realizar   │ --> │   trueno    │
//! │  (100 LOC)  │     │   infer.rs  │     │   SIMD/GPU  │
//! └─────────────┘     └─────────────┘     └─────────────┘
//! ```
//!
//! The `apr run` command delegates ALL inference to this module.
//! This eliminates ~1800 lines of duplicated code in apr-cli.
//!
//! # Example
//!
//! ```rust,ignore
//! use realizar::infer::{InferenceConfig, run_inference};
//!
//! let config = InferenceConfig::new("model.gguf")
//!     .with_prompt("Hello, world!")
//!     .with_max_tokens(32);
//!
//! let result = run_inference(config)?;
//! println!("{}", result.text);
//! ```

use crate::error::{RealizarError, Result};
use crate::format::{detect_format, ModelFormat};
use std::path::PathBuf;
use std::time::Instant;

/// PMAT-173 / GH-321: Convert GGML quantization type to human-readable string.
/// Uses unified `GgmlQuantType` enum — single source of truth.
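///
/// A hedged illustration (the exact strings come from `GgmlQuantType::as_str`; only the
/// unknown-id fallback is guaranteed by this wrapper):
///
/// ```rust,ignore
/// // Ids that GgmlQuantType does not recognize map to "Unknown".
/// assert_eq!(qtype_to_dtype_str(u32::MAX), "Unknown");
/// ```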
pub(crate) fn qtype_to_dtype_str(qtype: u32) -> &'static str {
    crate::gguf::GgmlQuantType::from_id(qtype).map_or("Unknown", crate::gguf::GgmlQuantType::as_str)
}

/// Configuration for inference
#[derive(Debug, Clone)]
pub struct InferenceConfig {
    /// Path to model file (GGUF, APR, or SafeTensors)
    pub model_path: PathBuf,
    /// Text prompt for generation
    pub prompt: Option<String>,
    /// Token IDs for generation (alternative to prompt)
    pub input_tokens: Option<Vec<u32>>,
    /// Maximum tokens to generate
    pub max_tokens: usize,
    /// Temperature for sampling (0.0 = greedy)
    pub temperature: f32,
    /// Top-k sampling (0 = disabled)
    pub top_k: usize,
    /// Disable GPU acceleration
    pub no_gpu: bool,
    /// Enable inference tracing (APR-TRACE-001)
    pub trace: bool,
    /// Verbose tracing output
    pub trace_verbose: bool,
    /// Trace output file path
    pub trace_output: Option<PathBuf>,
    /// Specific trace steps to capture
    pub trace_steps: Option<Vec<String>>,
    /// Show verbose loading/progress output
    pub verbose: bool,
    /// Stop token IDs for early termination (GH-373)
    pub stop_tokens: Vec<u32>,
    /// INTERNAL: Use mock backend for testing (PMAT-COV-95)
    #[doc(hidden)]
    pub use_mock_backend: bool,
}

impl InferenceConfig {
    /// Create a new inference config for a model file
    #[must_use]
    pub fn new(model_path: impl Into<PathBuf>) -> Self {
        Self {
            model_path: model_path.into(),
            prompt: None,
            input_tokens: None,
            max_tokens: 32,
            temperature: 0.0, // Greedy by default
            top_k: 1,
            no_gpu: false,
            trace: false,
            trace_verbose: false,
            trace_output: None,
            trace_steps: None,
            verbose: false,
            stop_tokens: Vec::new(),
            use_mock_backend: false,
        }
    }

    /// Set the text prompt
    #[must_use]
    pub fn with_prompt(mut self, prompt: impl Into<String>) -> Self {
        self.prompt = Some(prompt.into());
        self
    }

    /// Set input tokens directly
    #[must_use]
    pub fn with_input_tokens(mut self, tokens: Vec<u32>) -> Self {
        self.input_tokens = Some(tokens);
        self
    }

    /// Set maximum tokens to generate
    #[must_use]
    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
        self.max_tokens = max_tokens;
        self
    }

    /// Set temperature (0.0 = greedy)
    #[must_use]
    pub fn with_temperature(mut self, temperature: f32) -> Self {
        self.temperature = temperature;
        self
    }

    /// Set top-k sampling
    #[must_use]
    pub fn with_top_k(mut self, top_k: usize) -> Self {
        self.top_k = top_k;
        self
    }

    /// Disable GPU acceleration
    #[must_use]
    pub fn without_gpu(mut self) -> Self {
        self.no_gpu = true;
        self
    }

    /// Enable verbose output
    #[must_use]
    pub fn with_verbose(mut self, verbose: bool) -> Self {
        self.verbose = verbose;
        self
    }

    /// Enable inference tracing
    #[must_use]
    pub fn with_trace(mut self, trace: bool) -> Self {
        self.trace = trace;
        self
    }

    /// Set trace output file path
    #[must_use]
    pub fn with_trace_output(mut self, path: impl Into<PathBuf>) -> Self {
        self.trace_output = Some(path.into());
        self
    }

    /// Set stop token IDs for early termination (GH-373)
    #[must_use]
    pub fn with_stop_tokens(mut self, stop_tokens: Vec<u32>) -> Self {
        self.stop_tokens = stop_tokens;
        self
    }
}

// ============================================================================
// PreparedTokens - Compile-time chat template enforcement (PMAT-236)
// ============================================================================

/// Tokenized input that has been processed through chat template formatting.
///
/// # Compile-time enforcement (Poka-Yoke)
///
/// The inner `Vec<u32>` is **private** - the only way to construct `PreparedTokens`
/// is via `prepare_tokens()`, which ALWAYS applies chat template formatting for
/// instruct models. This makes it a **compile error** to pass raw tokens to
/// inference functions, preventing the bug where SafeTensors inference skipped
/// chat template application (producing "4" then garbage).
///
/// # Theoretical basis
///
/// Shingo, S. (1986). *Zero Quality Control: Source Inspection and the Poka-Yoke System*.
/// Brady, E. (2017). *Type-Driven Development with Idris*.
///
/// # References
///
/// - PMAT-236: Chat template enforcement for multi-format inference
/// - GH-205: SafeTensors inference garbage root cause
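///
/// # Example
///
/// A minimal sketch (not run as a doctest) of the intended flow; it assumes
/// `detect_format(&path)` yields the `ModelFormat` for the file (treated here as a
/// `Result`), as the high-level `run_inference` path does:
///
/// ```rust,ignore
/// use realizar::format::detect_format;
/// use realizar::infer::{prepare_tokens, InferenceConfig};
///
/// let config = InferenceConfig::new("qwen2-instruct.gguf").with_prompt("Hello");
/// let format = detect_format(&config.model_path)?;
/// // `tokens` is private, so `PreparedTokens { .. }` cannot be built here;
/// // going through prepare_tokens() guarantees the chat template (if any) was applied.
/// let prepared = prepare_tokens(&config, &format)?;
/// assert_eq!(prepared.input_count(), prepared.tokens().len());
/// ```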
#[derive(Debug, Clone)]
pub struct PreparedTokens {
    /// Tokenized input (PRIVATE - enforces construction via prepare_tokens only)
    tokens: Vec<u32>,
    /// Number of input tokens (for separating prefill from generated tokens)
    input_count: usize,
}

impl PreparedTokens {
    /// Access the prepared token IDs (read-only).
    #[must_use]
    pub fn tokens(&self) -> &[u32] {
        &self.tokens
    }

    /// Number of input tokens.
    #[must_use]
    pub fn input_count(&self) -> usize {
        self.input_count
    }
}

/// Prepare tokens for inference, applying chat template for instruct models.
///
/// This is the ONLY way to create `PreparedTokens`. It handles:
/// 1. Format detection (GGUF vs SafeTensors vs APR)
/// 2. Architecture detection (Qwen2, LLaMA, Phi, etc.)
/// 3. Chat template application for instruct models
/// 4. Tokenization using the appropriate tokenizer
///
/// # Chat Template Rules
///
/// - If model name/architecture contains "instruct", chat template is applied
/// - GGUF: uses embedded tokenizer + architecture from metadata
/// - SafeTensors: uses sibling tokenizer.json + config.json architecture
/// - APR: uses sibling tokenizer.json + model metadata
///
/// # Errors
///
/// Returns error if the model cannot be read or tokenization fails.
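///
/// # Example
///
/// A hedged sketch of the raw-token bypass: when `input_tokens` is set, the tokens are
/// passed through unchanged (the caller is trusted), so no chat template is applied.
/// The token IDs below are arbitrary illustrative values, and `ModelFormat::Gguf` is
/// assumed to be constructible directly.
///
/// ```rust,ignore
/// use realizar::format::ModelFormat;
/// use realizar::infer::{prepare_tokens, InferenceConfig};
///
/// let config = InferenceConfig::new("model.gguf").with_input_tokens(vec![1, 15043, 3186]);
/// let prepared = prepare_tokens(&config, &ModelFormat::Gguf)?;
/// assert_eq!(prepared.tokens(), &[1, 15043, 3186]);
/// assert_eq!(prepared.input_count(), 3);
/// ```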
pub fn prepare_tokens(config: &InferenceConfig, format: &ModelFormat) -> Result<PreparedTokens> {
    // If raw token IDs are provided, use them directly (user knows what they're doing)
    if let Some(ref tokens) = config.input_tokens {
        return Ok(PreparedTokens {
            input_count: tokens.len(),
            tokens: tokens.clone(),
        });
    }

    let prompt = match config.prompt {
        Some(ref p) => p.clone(),
        None => {
            return Ok(PreparedTokens {
                tokens: vec![1u32],
                input_count: 1,
            })
        },
    };

    match format {
        ModelFormat::Gguf => prepare_tokens_gguf(config, &prompt),
        ModelFormat::SafeTensors => prepare_tokens_safetensors(config, &prompt),
        ModelFormat::Apr => prepare_tokens_apr(config, &prompt),
    }
}

/// Prepare tokens for GGUF format (chat template from GGUF metadata)
///
/// GH-278: Only apply chat template when the GGUF actually contains one in its
/// metadata (`tokenizer.chat_template`). Previously, ALL models with known
/// architectures (llama, qwen2, etc.) got chat-template wrapping even if they
/// were base completion models, causing complete output divergence vs llama.cpp.
///
/// BOS token: Prepend BOS when the model metadata explicitly sets `add_bos_token = true`.
/// When the flag is absent, the default is derived from the architecture's positional
/// encoding (GH-326): RoPE models (LLaMA, Qwen, Mistral) get a BOS token, absolute-position
/// models (GPT-2, BERT) do not. BOS is only inserted if the model defines a BOS token ID
/// and it is not already the first token. This matches llama.cpp behavior for
/// LLaMA-family models.
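///
/// A hedged sketch of the fallback decision (no explicit `add_bos_token` key), assuming
/// `ArchConstraints::from_architecture` and `PositionalEncoding` are exported from the
/// crate's `gguf` module as used below:
///
/// ```rust,ignore
/// use realizar::gguf::{ArchConstraints, PositionalEncoding};
///
/// // RoPE architectures (llama, qwen2, mistral) default to prepending BOS;
/// // absolute-position architectures (gpt2, bert) do not.
/// let constraints = ArchConstraints::from_architecture("llama");
/// let add_bos = constraints.positional_encoding != PositionalEncoding::Absolute;
/// assert!(add_bos);
/// ```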
fn prepare_tokens_gguf(config: &InferenceConfig, prompt: &str) -> Result<PreparedTokens> {
    use crate::chat_template::{format_messages, ChatMessage};
    use crate::gguf::{GGUFValue, MappedGGUFModel};

    let mapped = MappedGGUFModel::from_path(&config.model_path)?;
    let gguf_arch = mapped.model.architecture().unwrap_or("transformer");

    // GH-278: Check if model actually has a chat template in its GGUF metadata.
    // Base models (SmolLM-135M, GPT-2) don't have one — only instruct/chat models do.
    let has_chat_template = mapped
        .model
        .metadata
        .get("tokenizer.chat_template")
        .is_some_and(|v| matches!(v, GGUFValue::String(s) if !s.is_empty()));

    let model_name = config
        .model_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("");
    let filename_instruct = model_name.to_lowercase().contains("instruct")
        || model_name.to_lowercase().contains("-chat");

    // Only apply chat template if the model actually has one, or filename says instruct
    let formatted_prompt = if has_chat_template || filename_instruct {
        let template_hint = apr_arch_to_template_hint(gguf_arch, model_name);
        let messages = vec![ChatMessage::user(prompt)];
        format_messages(&messages, Some(template_hint)).unwrap_or_else(|_| prompt.to_string())
    } else {
        prompt.to_string()
    };

    if config.verbose {
        eprintln!(
            "[DEBUG] has_chat_template={}, filename_instruct={}",
            has_chat_template, filename_instruct
        );
        eprintln!(
            "[DEBUG] formatted_prompt={:?}",
            &formatted_prompt[..formatted_prompt.len().min(200)]
        );
    }

    let mut tokens = mapped.model.encode(&formatted_prompt).ok_or_else(|| {
        RealizarError::InferenceError(format!(
            "Tokenizer encode failed for GGUF model (no tokenizer data in GGUF file?). \
                 Prompt length: {} chars",
            formatted_prompt.len()
        ))
    })?;

    // GH-278: Prepend BOS token to match llama.cpp behavior.
    // llama.cpp adds BOS when add_bos_token is true (default for LLaMA-family).
    // Only add if not already present AND model has a BOS token defined.
    let add_bos = match mapped
        .model
        .metadata
        .get(crate::gguf::keys::TOKENIZER_ADD_BOS)
    {
        Some(GGUFValue::Bool(b)) => *b,
        // GH-326: Derive BOS default from architecture constraints, not hardcoded string.
        // Models with absolute position embeddings (GPT-2, BERT) use BPE → no BOS.
        // Models with RoPE (LLaMA, Qwen, Mistral) use SentencePiece → add BOS.
        _ => {
            let arch = mapped
                .model
                .metadata
                .get(crate::gguf::keys::GENERAL_ARCHITECTURE)
                .and_then(|v| {
                    if let GGUFValue::String(s) = v {
                        Some(s.as_str())
                    } else {
                        None
                    }
                })
                // R-01 (Meyer DbC): "unknown" — don't pretend unidentified model is LLaMA.
                .unwrap_or("unknown");
            let constraints = crate::gguf::ArchConstraints::from_architecture(arch);
            constraints.positional_encoding != crate::gguf::PositionalEncoding::Absolute
        },
    };

    if add_bos {
        if let Some(bos_id) = mapped.model.bos_token_id() {
            if tokens.first() != Some(&bos_id) {
                tokens.insert(0, bos_id);
            }
        }
    }

    if config.verbose {
        eprintln!(
            "[DEBUG] add_bos={}, encoded {} tokens: {:?}",
            add_bos,
            tokens.len(),
            &tokens[..tokens.len().min(30)]
        );
    }

    Ok(PreparedTokens {
        input_count: tokens.len(),
        tokens,
    })
}

/// Prepare tokens for SafeTensors format (chat template from config.json)
fn prepare_tokens_safetensors(config: &InferenceConfig, prompt: &str) -> Result<PreparedTokens> {
    use crate::apr::AprV2Model;
    use crate::chat_template::{format_messages, ChatMessage};
    use crate::safetensors::SafetensorsConfig;

    // Load config.json for architecture detection
    let st_config = SafetensorsConfig::load_from_sibling(&config.model_path);
    let architecture = st_config
        .as_ref()
        .map(SafetensorsConfig::architecture)
        .unwrap_or_default();

    let model_name = config
        .model_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("");

    // Detect instruct model from architecture or filename
    let arch_lower = architecture.to_lowercase();
    let is_instruct = arch_lower.contains("instruct")
        || model_name.to_lowercase().contains("instruct")
        || matches!(
            arch_lower.as_str(),
            "qwen2forcausallm" | "llamaforcausallm" | "mistralforcausallm" | "phiforcausallm"
        );

    let formatted_prompt = if is_instruct {
        let template_hint = safetensors_arch_to_template_hint(&architecture, model_name);
        let messages = vec![ChatMessage::user(prompt)];
        format_messages(&messages, Some(template_hint)).unwrap_or_else(|_| prompt.to_string())
    } else {
        prompt.to_string()
    };

    let tokens =
        AprV2Model::encode_text(&config.model_path, &formatted_prompt).ok_or_else(|| {
            RealizarError::InferenceError(format!(
                "Tokenizer encode failed for SafeTensors model (no tokenizer.json sibling?). \
                 Prompt length: {} chars",
                formatted_prompt.len()
            ))
        })?;

    Ok(PreparedTokens {
        input_count: tokens.len(),
        tokens,
    })
}

/// Prepare tokens for APR format (chat template from model metadata)
fn prepare_tokens_apr(config: &InferenceConfig, prompt: &str) -> Result<PreparedTokens> {
    use crate::apr::AprV2Model;
    use crate::chat_template::{format_messages, ChatMessage};

    let model_name = config
        .model_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("");

    // PMAT-237: Detect instruct from MODEL DATA, not filename.
    // Filename heuristic silently skips chat template for hash-named APR files.
    // Three-tier detection: architecture metadata > vocab special tokens > filename fallback.
    let (apr_arch, has_chatml_tokens) =
        if config.model_path.extension().is_some_and(|e| e == "apr") {
            match AprV2Model::load(&config.model_path) {
                Ok(model) => {
                    let arch = model.metadata().architecture.clone().unwrap_or_default();
                    let has_chatml = model.metadata().get_embedded_vocabulary().is_some_and(
                        |vocab: Vec<String>| vocab.iter().any(|t| t == "<|im_start|>"),
                    );
                    (arch, has_chatml)
                },
                Err(_) => (String::new(), false),
            }
        } else {
            (String::new(), false)
        };

    let is_instruct_arch = matches!(
        apr_arch.to_lowercase().as_str(),
        "qwen2" | "qwen" | "llama" | "mistral" | "phi" | "phi3"
    );
    let filename_instruct = model_name.to_lowercase().contains("instruct");

    let is_instruct = is_instruct_arch || has_chatml_tokens || filename_instruct;

    let formatted_prompt = if is_instruct {
        let template_hint = apr_arch_to_template_hint(&apr_arch, model_name);
        let messages = vec![ChatMessage::user(prompt)];
        format_messages(&messages, Some(template_hint)).unwrap_or_else(|_| prompt.to_string())
    } else {
        prompt.to_string()
    };

    let tokens =
        AprV2Model::encode_text(&config.model_path, &formatted_prompt).ok_or_else(|| {
            RealizarError::InferenceError(format!(
                "Tokenizer encode failed for APR model (no tokenizer in APR metadata?). \
                 Prompt length: {} chars",
                formatted_prompt.len()
            ))
        })?;

    Ok(PreparedTokens {
        input_count: tokens.len(),
        tokens,
    })
}

/// Map SafeTensors architecture string to chat template hint.
///
/// GH-317/318: Contract-driven — uses normalize_architecture() from
/// tensor-names-v1.yaml. No contains() heuristics, no model_name fallback.
fn safetensors_arch_to_template_hint(architecture: &str, _model_name: &str) -> &'static str {
    crate::tensor_names::normalize_architecture(architecture)
}

include!("inference_result.rs");
include!("gguf_gpu_generate.rs");
include!("mod_log_transformer_eos.rs");
include!("mod_05.rs");
include!("batch.rs");