// =============================================================================
// ChatSession with realizar (Y13/Y14: architecture and format agnostic)
// =============================================================================
#[cfg(feature = "inference")]
mod realizar_chat {
    use super::*;
    use aprender::text::bpe::Qwen2BpeTokenizer;
    use std::fs::File;
    use std::io::Read;

    /// Chat session using realizar for high-performance inference.
    ///
    /// Y13: Architecture-agnostic (the architecture is detected from model metadata).
    /// Y14: Format-agnostic (APR, GGUF, SafeTensors).
    ///
    /// PMAT-108: ALL inference is delegated to the realizar engine;
    /// `aprender::models` is NOT used for inference (only training).
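    ///
    /// A minimal usage sketch (illustrative only: the constructor and
    /// generation methods live in the included `chat_*` files, so the exact
    /// names and signatures below are assumptions, not the real API):
    ///
    /// ```ignore
    /// let mut session = ChatSession::load("model.gguf")?;
    /// session.push_user("Hello!");
    /// let reply = session.generate()?;
    /// println!("{reply}");
    /// ```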
    pub struct ChatSession {
        /// Model bytes (kept for regeneration if needed)
        model_bytes: Vec<u8>,
        /// Model path (for mmap-based loading)
        model_path: std::path::PathBuf,
        /// Detected model format (a byte-level sniffing sketch follows this struct)
        format: ModelFormat,
        /// Conversation history as ChatMessage objects
        history: Vec<ChatMessage>,
        /// Chat template engine (Toyota Way: Standardized Work)
        chat_template: Box<dyn ChatTemplateEngine + Send + Sync>,
        /// Detected template format name (for display)
        template_format: TemplateFormat,
        /// LLaMA tokenizer (for the GGUF format)
        llama_tokenizer: Option<LlamaTokenizer>,
        /// Qwen2 BPE tokenizer (for the SafeTensors/APR formats)
        qwen_tokenizer: Option<Qwen2BpeTokenizer>,
        /// GH-224: Cached GGUF mmap model (for tokenizer encode/decode across messages)
        cached_gguf_mapped: Option<realizar::gguf::MappedGGUFModel>,
        /// GH-224: Cached GGUF CUDA model (avoids re-uploading weights per message)
        #[cfg(feature = "cuda")]
        cached_gguf_cuda: Option<realizar::gguf::OwnedQuantizedModelCuda>,
        /// GH-224: Cached APR CUDA model (avoids re-uploading weights per message)
        #[cfg(feature = "cuda")]
        cached_apr_cuda: Option<realizar::apr::AprV2ModelCuda>,
        /// GH-224: Cached SafeTensors CUDA model (avoids re-loading per message)
        #[cfg(feature = "cuda")]
        cached_safetensors_cuda: Option<realizar::safetensors_cuda::SafeTensorsCudaModel>,
        /// GH-224: Whether CUDA init was attempted and failed, so later
        /// messages skip the retry (see the lazy-init sketch at the end of
        /// this module)
        #[cfg(feature = "cuda")]
        cuda_init_failed: bool,
    }
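
    /// Format sniffing sketch (illustrative; the real detection lives behind
    /// `ModelFormat` elsewhere in this crate): GGUF files begin with the
    /// ASCII magic `GGUF`, and SafeTensors files begin with a little-endian
    /// u64 header length followed by a JSON object. No APR magic is assumed
    /// here.
    #[allow(dead_code)]
    fn sniff_format(bytes: &[u8]) -> Option<&'static str> {
        match bytes {
            // GGUF: 4-byte ASCII magic at offset 0.
            [b'G', b'G', b'U', b'F', ..] => Some("gguf"),
            // SafeTensors: 8-byte LE header length, then a JSON object.
            [_, _, _, _, _, _, _, _, b'{', ..] => Some("safetensors"),
            // APR (and anything else): left to the real detector.
            _ => None,
        }
    }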
include!("chat_load_tokenizers.rs");
include!("chat_session_02.rs");
include!("chat_generate_session_02.rs");
include!("chat_generate_safetensors.rs");
}
#[cfg(feature = "inference")]
use realizar_chat::ChatSession;