// offline_intelligence — crate source: config.rs
1// _Aud.io/crates/offline-intelligence/src/config.rs
2
3use anyhow::Result;
4use std::env;
5use std::net::SocketAddr;
6use std::path::PathBuf;
7use sysinfo::System;
8use tracing::{debug, info, warn};
9
10// NVIDIA GPU detection only available when nvidia feature is enabled (Windows and Linux)
11#[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
12use nvml_wrapper::Nvml;
13
/// Runtime configuration for the offline-intelligence service, assembled by
/// [`Config::from_env`] from `.env` files and process environment variables.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct Config {
    /// Path to the model file. May be "" when no model was found — the
    /// service still starts and a model can be installed later via the
    /// model registry.
    pub model_path: String,
    /// Path to the llama-server binary (LLAMA_BIN override or Resources/bin
    /// search). May be "" when auto-detection failed.
    pub llama_bin: String,
    /// Host the llama-server backend binds to (LLAMA_HOST, default 127.0.0.1).
    pub llama_host: String,
    /// Port of the llama-server backend (LLAMA_PORT, default 8081).
    pub llama_port: u16,
    /// Context window in tokens (CTX_SIZE; "auto" → detected, default 8192).
    pub ctx_size: u32,
    /// Batch size (BATCH_SIZE; "auto" → derived from GPU offload, default 256).
    pub batch_size: u32,
    /// Inference worker threads (THREADS; "auto" → logical CPU count).
    pub threads: u32,
    /// Transformer layers offloaded to GPU (GPU_LAYERS; "auto" → fitted to
    /// detected VRAM, default 20 when set but unparsable).
    pub gpu_layers: u32,
    /// Startup health-wait budget in seconds (HEALTH_TIMEOUT_SECONDS, default 60).
    pub health_timeout_seconds: u64,
    /// Grace period in seconds used during model hot swaps
    /// (HOT_SWAP_GRACE_SECONDS, default 25).
    pub hot_swap_grace_seconds: u64,
    /// Maximum simultaneous streaming responses (MAX_CONCURRENT_STREAMS, default 4).
    pub max_concurrent_streams: u32,
    /// llama-server parallel decoding slots (PARALLEL_SLOTS, default 8).
    pub parallel_slots: u32,
    /// llama-server micro-batch size (UBATCH_SIZE, default 512).
    pub ubatch_size: u32,
    /// Port for the Prometheus metrics endpoint (PROMETHEUS_PORT, default 9000).
    pub prometheus_port: u16,
    /// Host this service's HTTP API binds to (API_HOST, default 127.0.0.1).
    pub api_host: String,
    /// Port this service's HTTP API binds to (API_PORT, default 9999).
    pub api_port: u16,
    /// Inbound request rate limit (REQUESTS_PER_SECOND, default 24).
    pub requests_per_second: u32,
    /// Timeout for non-streaming generation in seconds (default 300).
    pub generate_timeout_seconds: u64,
    /// Timeout for streaming generation in seconds (default 600).
    pub stream_timeout_seconds: u64,
    /// Timeout for health-check probes in seconds (default 90).
    pub health_check_timeout_seconds: u64,
    /// Request queue capacity (QUEUE_SIZE, default 100).
    pub queue_size: usize,
    /// Seconds a request may wait in the queue (QUEUE_TIMEOUT_SECONDS, default 30).
    pub queue_timeout_seconds: u64,
    /// Base URL of the llama-server backend, built as
    /// "http://{llama_host}:{llama_port}".
    pub backend_url: String,
    /// OpenRouter API key for online fallback; empty when not configured.
    pub openrouter_api_key: String,
    /// Path to draft model for speculative decoding (empty string = disabled).
    /// Set DRAFT_MODEL_PATH in .env to enable. The draft model should be a
    /// smaller version of the main model (e.g. 0.5B for a 3B main model).
    pub draft_model_path: String,
    /// Maximum number of draft tokens the draft model generates per step.
    /// Higher values increase throughput gains but reduce acceptance rate.
    /// Maps to llama-server --draft-max. Default: 8.
    pub speculative_draft_max: u32,
    /// Minimum acceptance probability for a draft token to be kept.
    /// Tokens below this threshold are rejected early. Default: 0.4.
    pub speculative_draft_p_min: f32,
}
53
54impl Config {
55    pub fn from_env() -> Result<Self> {
56        // Try to load .env from multiple locations in order:
57        // 1. First try executable directory (for production builds with bundled .env)
58        // 2. Then try project root (where .env actually is during development)
59        // 3. Finally try current directory as fallback
60
61        let mut env_loaded = false;
62
63        // 1. Try executable directory
64        if let Ok(exe_path) = std::env::current_exe() {
65            if let Some(exe_dir) = exe_path.parent() {
66                let env_path = exe_dir.join(".env");
67                if env_path.exists() {
68                    match dotenvy::from_path(&env_path) {
69                        Ok(_) => {
70                            info!("Loaded .env from executable directory: {:?}", env_path);
71                            env_loaded = true;
72                        }
73                        Err(e) => {
74                            warn!("Failed to load .env from {:?}: {}", env_path, e);
75                        }
76                    }
77                }
78
79                // 2a. macOS .app bundle: exe is at App.app/Contents/MacOS/binary
80                //     Resources live at App.app/Contents/Resources/.env
81                #[cfg(target_os = "macos")]
82                if !env_loaded {
83                    // exe_dir = App.app/Contents/MacOS/
84                    // parent  = App.app/Contents/
85                    if let Some(contents_dir) = exe_dir.parent() {
86                        let bundle_env = contents_dir.join("Resources").join(".env");
87                        if bundle_env.exists() {
88                            match dotenvy::from_path(&bundle_env) {
89                                Ok(_) => {
90                                    info!("Loaded .env from macOS bundle Resources: {:?}", bundle_env);
91                                    env_loaded = true;
92                                }
93                                Err(e) => {
94                                    warn!("Failed to load .env from bundle Resources {:?}: {}", bundle_env, e);
95                                }
96                            }
97                        }
98                    }
99                }
100
101                // 2b. macOS: also try ~/Library/Application Support/Aud.io/.env
102                //     This allows post-install configuration without modifying the bundle.
103                #[cfg(target_os = "macos")]
104                if !env_loaded {
105                    if let Some(app_support) = dirs::data_dir() {
106                        let user_env = app_support.join("Aud.io").join(".env");
107                        if user_env.exists() {
108                            match dotenvy::from_path(&user_env) {
109                                Ok(_) => {
110                                    info!("Loaded .env from user data directory: {:?}", user_env);
111                                    env_loaded = true;
112                                }
113                                Err(e) => {
114                                    warn!("Failed to load .env from user data dir {:?}: {}", user_env, e);
115                                }
116                            }
117                        }
118                    }
119                }
120
121                // 2c. Development: try project root (../../ from target/release/ or target\release\)
122                if !env_loaded {
123                    let project_root = if exe_dir.ends_with("target/release")
124                        || exe_dir.ends_with("target\\release")
125                    {
126                        exe_dir.parent().and_then(|p| p.parent())
127                    } else {
128                        None
129                    };
130
131                    if let Some(root) = project_root {
132                        let root_env = root.join(".env");
133                        if root_env.exists() {
134                            match dotenvy::from_path(&root_env) {
135                                Ok(_) => {
136                                    info!("Loaded .env from project root: {:?}", root_env);
137                                    env_loaded = true;
138                                }
139                                Err(e) => {
140                                    warn!(
141                                        "Failed to load .env from project root {:?}: {}",
142                                        root_env, e
143                                    );
144                                }
145                            }
146                        }
147                    }
148                }
149            }
150        }
151
152        // 3. If still not loaded, try current directory (development fallback)
153        if !env_loaded {
154            if let Err(e) = dotenvy::dotenv() {
155                warn!("Failed to load .env from current directory: {}. Using system environment variables.", e);
156            } else {
157                info!("Loaded environment variables from .env file in current directory");
158            }
159        }
160
161        // Auto-detect llama binary based on OS, with optional LLAMA_BIN override
162        let llama_bin = Self::get_llama_binary_path()?;
163        info!("Using llama binary: {}", llama_bin);
164
165        // Use MODEL_PATH from env, or try to find embedded model
166        let model_path = Self::get_model_path_with_fallback()?;
167
168        // Auto‑detect threads if set to "auto"
169        let threads = if env::var("THREADS").unwrap_or_else(|_| "auto".into()) == "auto" {
170            Self::auto_detect_threads()
171        } else {
172            env::var("THREADS")
173                .unwrap_or_else(|_| "6".into())
174                .parse()
175                .unwrap_or(6)
176        };
177
178        // ctx_size must be computed BEFORE gpu_layers so the layer formula can
179        // account for the KV cache size when deciding how many layers fit in VRAM.
180        let ctx_size = if env::var("CTX_SIZE").unwrap_or_else(|_| "auto".into()) == "auto" {
181            Self::auto_detect_ctx_size(&model_path)
182        } else {
183            env::var("CTX_SIZE")
184                .unwrap_or_else(|_| "8192".into())
185                .parse()
186                .unwrap_or(8192)
187        };
188
189        // parallel_slots must be computed BEFORE gpu_layers for the same reason.
190        let parallel_slots: u32 = env::var("PARALLEL_SLOTS")
191            .unwrap_or_else(|_| "8".into())
192            .parse()
193            .unwrap_or(8);
194
195        // Auto‑detect GPU layers if set to "auto".
196        // Now passes model_path, ctx_size, and parallel_slots so the formula can
197        // compute the real VRAM footprint and pick the maximum safe layer count.
198        let gpu_layers = if env::var("GPU_LAYERS").unwrap_or_else(|_| "auto".into()) == "auto" {
199            Self::auto_detect_gpu_layers(&model_path, ctx_size, parallel_slots)
200        } else {
201            env::var("GPU_LAYERS")
202                .unwrap_or_else(|_| "20".into())
203                .parse()
204                .unwrap_or(20)
205        };
206
207        // Auto‑detect batch size
208        let batch_size = if env::var("BATCH_SIZE").unwrap_or_else(|_| "auto".into()) == "auto" {
209            Self::auto_detect_batch_size(gpu_layers, ctx_size)
210        } else {
211            env::var("BATCH_SIZE")
212                .unwrap_or_else(|_| "256".into())
213                .parse()
214                .unwrap_or(256)
215        };
216
217        // Get backend URL components
218        let llama_host = env::var("LLAMA_HOST").unwrap_or_else(|_| "127.0.0.1".into());
219        let llama_port = env::var("LLAMA_PORT")
220            .unwrap_or_else(|_| "8081".into())
221            .parse()?;
222        let backend_url = format!("http://{}:{}", llama_host, llama_port);
223
224        // Get OpenRouter API key from environment variable
225        let openrouter_api_key = env::var("OPENROUTER_API_KEY").unwrap_or_default();
226
227        info!(
228            "Resource Configuration: {} GPU layers, {} threads, batch size: {}, context: {}",
229            gpu_layers, threads, batch_size, ctx_size
230        );
231
232        Ok(Self {
233            model_path,
234            llama_bin,
235            llama_host: llama_host.clone(),
236            llama_port,
237            ctx_size,
238            batch_size,
239            threads,
240            gpu_layers,
241            health_timeout_seconds: env::var("HEALTH_TIMEOUT_SECONDS")
242                .unwrap_or_else(|_| "60".into())
243                .parse()?,
244            hot_swap_grace_seconds: env::var("HOT_SWAP_GRACE_SECONDS")
245                .unwrap_or_else(|_| "25".into())
246                .parse()?,
247            max_concurrent_streams: env::var("MAX_CONCURRENT_STREAMS")
248                .unwrap_or_else(|_| "4".into())
249                .parse()?,
250            parallel_slots,
251            ubatch_size: env::var("UBATCH_SIZE")
252                .unwrap_or_else(|_| "512".into())
253                .parse()
254                .unwrap_or(512),
255            prometheus_port: env::var("PROMETHEUS_PORT")
256                .unwrap_or_else(|_| "9000".into())
257                .parse()?,
258            api_host: env::var("API_HOST").unwrap_or_else(|_| "127.0.0.1".into()),
259            api_port: env::var("API_PORT")
260                .unwrap_or_else(|_| "9999".into())
261                .parse()?,
262            requests_per_second: env::var("REQUESTS_PER_SECOND")
263                .unwrap_or_else(|_| "24".into())
264                .parse()?,
265            generate_timeout_seconds: env::var("GENERATE_TIMEOUT_SECONDS")
266                .unwrap_or_else(|_| "300".into())
267                .parse()?,
268            stream_timeout_seconds: env::var("STREAM_TIMEOUT_SECONDS")
269                .unwrap_or_else(|_| "600".into())
270                .parse()?,
271            health_check_timeout_seconds: env::var("HEALTH_CHECK_TIMEOUT_SECONDS")
272                .unwrap_or_else(|_| "90".into())
273                .parse()?,
274            queue_size: env::var("QUEUE_SIZE")
275                .unwrap_or_else(|_| "100".into())
276                .parse()?,
277            queue_timeout_seconds: env::var("QUEUE_TIMEOUT_SECONDS")
278                .unwrap_or_else(|_| "30".into())
279                .parse()?,
280            backend_url,
281            openrouter_api_key,
282            draft_model_path: env::var("DRAFT_MODEL_PATH")
283                .unwrap_or_else(|_| "none".into()),
284            speculative_draft_max: env::var("SPECULATIVE_DRAFT_MAX")
285                .unwrap_or_else(|_| "8".into())
286                .parse()
287                .unwrap_or(8),
288            speculative_draft_p_min: env::var("SPECULATIVE_DRAFT_P_MIN")
289                .unwrap_or_else(|_| "0.4".into())
290                .parse()
291                .unwrap_or(0.4),
292        })
293    }
294
295    fn get_model_path_with_fallback() -> Result<String> {
296        // First try environment variable
297        if let Ok(model_path) = env::var("MODEL_PATH") {
298            // Check if the path exists
299            if std::path::Path::new(&model_path).exists() {
300                info!("Using model from MODEL_PATH: {}", model_path);
301                return Ok(model_path);
302            } else {
303                warn!("MODEL_PATH set but file doesn't exist: {}", model_path);
304            }
305        }
306
307        // Try to find embedded model
308        let exe_dir = std::env::current_exe()
309            .ok()
310            .and_then(|exe| exe.parent().map(|p| p.to_path_buf()))
311            .unwrap_or_else(|| std::env::current_dir().unwrap_or_default());
312
313        // Check multiple possible embedded model locations (MULTI-FORMAT SUPPORT)
314        let possible_model_locations = vec![
315            // GGUF formats
316            exe_dir.join("resources/models/default.gguf"),
317            exe_dir.join("resources/models/model.gguf"),
318            exe_dir.join("models/default.gguf"),
319            exe_dir.join("models/model.gguf"),
320            exe_dir.join("default.gguf"),
321            // ONNX formats
322            exe_dir.join("resources/models/default.onnx"),
323            exe_dir.join("resources/models/model.onnx"),
324            // TensorRT formats
325            exe_dir.join("resources/models/default.trt"),
326            exe_dir.join("resources/models/model.engine"),
327            // Safetensors formats
328            exe_dir.join("resources/models/default.safetensors"),
329            exe_dir.join("resources/models/model.safetensors"),
330            // GGML formats
331            exe_dir.join("resources/models/default.ggml"),
332            exe_dir.join("resources/models/model.bin"),
333        ];
334
335        for model_path in possible_model_locations {
336            if model_path.exists() {
337                info!("Using embedded model: {}", model_path.display());
338                return Ok(model_path.to_string_lossy().to_string());
339            }
340        }
341
342        // Check for any supported model file in models directory
343        if let Ok(entries) = std::fs::read_dir(exe_dir.join("resources/models")) {
344            for entry in entries.flatten() {
345                if let Some(ext) = entry.path().extension() {
346                    let ext_str = ext.to_str().unwrap_or("").to_lowercase();
347                    // Check if extension matches any supported format
348                    if matches!(
349                        ext_str.as_str(),
350                        "gguf"
351                            | "ggml"
352                            | "onnx"
353                            | "trt"
354                            | "engine"
355                            | "plan"
356                            | "safetensors"
357                            | "mlmodel"
358                    ) {
359                        info!("Using found model: {}", entry.path().display());
360                        return Ok(entry.path().to_string_lossy().to_string());
361                    }
362                }
363            }
364        }
365
366        // Return a default path when no model is found, allowing the system to start
367        // Models can be downloaded later via the model registry
368        Ok("".to_string())
369    }
370
371    /// Auto-detect the llama-server binary path based on the current OS.
372    ///
373    /// Search order:
374    /// 1. LLAMA_BIN environment variable (if set and exists)
375    /// 2. Resources/bin/{OS}/ relative to executable
376    /// 3. Resources/bin/{OS}/ relative to current working directory
377    /// 4. Resources/bin/{OS}/ relative to crate root (for development)
378    fn get_llama_binary_path() -> Result<String> {
379        // 1. Check LLAMA_BIN environment variable first (allows override)
380        if let Ok(llama_bin) = env::var("LLAMA_BIN") {
381            if std::path::Path::new(&llama_bin).exists() {
382                info!("Using llama binary from LLAMA_BIN env: {}", llama_bin);
383                return Ok(llama_bin);
384            } else {
385                warn!(
386                    "LLAMA_BIN set but file doesn't exist: {}, falling back to auto-detection",
387                    llama_bin
388                );
389            }
390        }
391
392        // Determine OS-specific binary name and folder
393        let (os_folder, binary_name) = Self::get_platform_binary_info();
394        info!(
395            "Auto-detecting llama binary for OS: {} (binary: {})",
396            os_folder, binary_name
397        );
398
399        // Get potential base directories
400        let exe_dir = std::env::current_exe()
401            .ok()
402            .and_then(|exe| exe.parent().map(|p| p.to_path_buf()));
403
404        let cwd = std::env::current_dir().ok();
405
406        // Build list of directories to search
407        let mut search_dirs: Vec<PathBuf> = Vec::new();
408
409        if let Some(ref exe) = exe_dir {
410            search_dirs.push(exe.clone());
411            // Also check parent directories (for bundled apps)
412            if let Some(parent) = exe.parent() {
413                search_dirs.push(parent.to_path_buf());
414                if let Some(grandparent) = parent.parent() {
415                    search_dirs.push(grandparent.to_path_buf());
416                }
417            }
418        }
419
420        if let Some(ref cwd_path) = cwd {
421            search_dirs.push(cwd_path.clone());
422        }
423
424        // In development builds, also check relative to the crate source directory
425        #[cfg(debug_assertions)]
426        {
427            let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
428            search_dirs.push(crate_dir);
429        }
430
431        // Search for binary in each potential location
432        // Check both "Resources" (uppercase) and "resources" (lowercase, Tauri v2 bundle)
433        let resource_folder_names = ["Resources", "resources"];
434        for base_dir in &search_dirs {
435            for resource_folder in &resource_folder_names {
436            let bin_dir = base_dir.join(resource_folder).join("bin").join(os_folder);
437
438            if bin_dir.exists() {
439                // Search for the binary in subdirectories (e.g., llama-b6970-bin-macos-arm64/)
440                // On macOS we must skip subdirectories built for the other architecture so that
441                // an Intel Mac does not accidentally load an arm64 binary (and vice-versa).
442                if let Ok(entries) = std::fs::read_dir(&bin_dir) {
443                    // Collect and sort so the search is deterministic across filesystems.
444                    let mut dir_entries: Vec<_> = entries.flatten().collect();
445                    dir_entries.sort_by_key(|e| e.file_name());
446
447                    for entry in dir_entries {
448                        let entry_path = entry.path();
449                        if !entry_path.is_dir() {
450                            continue;
451                        }
452
453                        // Architecture guard — only relevant on macOS where both
454                        // arm64 and x64 subdirectories may coexist under MacOS/.
455                        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
456                        {
457                            let dir_name = entry_path
458                                .file_name()
459                                .and_then(|n| n.to_str())
460                                .unwrap_or("");
461                            // Skip Intel-only subdirectories on Apple Silicon.
462                            if dir_name.contains("x64") || dir_name.contains("x86_64") {
463                                debug!("Skipping Intel subdir on Apple Silicon: {}", dir_name);
464                                continue;
465                            }
466                        }
467                        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
468                        {
469                            let dir_name = entry_path
470                                .file_name()
471                                .and_then(|n| n.to_str())
472                                .unwrap_or("");
473                            // Skip ARM-only subdirectories on Intel Mac.
474                            if dir_name.contains("arm64") || dir_name.contains("aarch64") {
475                                debug!("Skipping ARM subdir on Intel Mac: {}", dir_name);
476                                continue;
477                            }
478                        }
479
480                        let potential_binary = entry_path.join(binary_name);
481                        if potential_binary.exists() {
482                            info!("Found llama binary at: {}", potential_binary.display());
483                            return Ok(potential_binary.to_string_lossy().to_string());
484                        }
485                    }
486                }
487
488                // Also check directly in the OS folder
489                let direct_binary = bin_dir.join(binary_name);
490                if direct_binary.exists() {
491                    info!("Found llama binary at: {}", direct_binary.display());
492                    return Ok(direct_binary.to_string_lossy().to_string());
493                }
494            }
495            } // end resource_folder_names loop
496        }
497
498        let arch = Self::get_arch_hint();
499        warn!(
500            "Llama binary not found. Searched in Resources/bin/{os_folder}/ for '{binary_name}'.\n\
501             Please either:\n\
502             1. Set LLAMA_BIN environment variable to the full path\n\
503             2. Place the binary in Resources/bin/{os_folder}/<subfolder>/\n\
504             \n\
505             Expected binary name: {binary_name}\n\
506             OS detected: {os_folder}\n\
507             Architecture: {arch}\n\
508             Searched directories: {:?}",
509            search_dirs
510                .iter()
511                .map(|p| p.display().to_string())
512                .collect::<Vec<_>>()
513        );
514
515        // Return empty string instead of crashing - allows the HTTP server to start
516        // and serve online-mode requests while the local runtime is unavailable.
517        // Models and binaries can be downloaded later via the model registry.
518        Ok(String::new())
519    }
520
521    /// Returns (os_folder_name, binary_name) for the current platform and architecture
522    fn get_platform_binary_info() -> (&'static str, &'static str) {
523        #[cfg(target_os = "windows")]
524        {
525            ("Windows", "llama-server.exe")
526        }
527
528        // macOS Apple Silicon (M1/M2/M3/M4)
529        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
530        {
531            ("MacOS", "llama-server")
532            // Will search in: Resources/bin/MacOS/llama-*-macos-arm64/
533        }
534
535        // macOS Intel
536        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
537        {
538            ("MacOS", "llama-server")
539            // Will search in: Resources/bin/MacOS/llama-*-macos-x64/
540        }
541
542        #[cfg(target_os = "linux")]
543        {
544            ("Linux", "llama-server")
545        }
546
547        #[cfg(not(any(target_os = "windows", target_os = "macos", target_os = "linux")))]
548        {
549            compile_error!(
550                "Unsupported operating system. Only Windows, macOS, and Linux are supported."
551            );
552        }
553    }
554
555    /// Returns the current system architecture string for logging and binary matching
556    fn get_arch_hint() -> &'static str {
557        #[cfg(target_arch = "x86_64")]
558        {
559            "x64"
560        }
561        #[cfg(target_arch = "aarch64")]
562        {
563            "arm64"
564        }
565        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
566        {
567            "unknown"
568        }
569    }
570
571    fn auto_detect_threads() -> u32 {
572        let threads = num_cpus::get() as u32;
573        info!("Auto-detected {} CPU cores for inference", threads);
574        threads
575    }
576
577    /// Calculate how many layers to offload given available VRAM and model properties.
578    ///
579    /// Uses model filename to estimate parameter count and quantization bits, then
580    /// computes per-layer VRAM cost and fits as many layers as possible while
581    /// reserving budget for the KV cache and OS overhead.
582    fn layers_for_vram(vram_mb: u64, model_path: &str, ctx_size: u32, parallel_slots: u32) -> u32 {
583        let path_lower = model_path.to_lowercase();
584
585        // Estimate parameter count (billions) from filename
586        let params_b: f64 =
587            if path_lower.contains("0.5b") { 0.5 }
588            else if path_lower.contains("1.5b") { 1.5 }
589            else if path_lower.contains("1b") && !path_lower.contains("13b") { 1.0 }
590            else if path_lower.contains("3b") && !path_lower.contains("13b") && !path_lower.contains("33b") { 3.0 }
591            else if path_lower.contains("7b") { 7.0 }
592            else if path_lower.contains("8b") { 8.0 }
593            else if path_lower.contains("13b") { 13.0 }
594            else if path_lower.contains("14b") { 14.0 }
595            else if path_lower.contains("33b") || path_lower.contains("34b") { 34.0 }
596            else if path_lower.contains("70b") { 70.0 }
597            else { 7.0 }; // safe default — assume 7B if unknown
598
599        // Bits per parameter for quantization formats (higher = more accurate)
600        let bits: f64 =
601            if path_lower.contains("q4_k_m") || path_lower.contains("q4_k_s") { 4.5 }
602            else if path_lower.contains("q4_k") { 4.5 }
603            else if path_lower.contains("q4_0") || path_lower.contains("q4_1") { 4.0 }
604            else if path_lower.contains("q5_k_m") || path_lower.contains("q5_k_s") { 5.5 }
605            else if path_lower.contains("q5") { 5.0 }
606            else if path_lower.contains("q6_k") { 6.5 }
607            else if path_lower.contains("q8_0") { 8.5 }
608            else if path_lower.contains("f16") || path_lower.contains("fp16") { 16.0 }
609            else if path_lower.contains("f32") || path_lower.contains("fp32") { 32.0 }
610            else { 4.5 }; // default: Q4_K_M
611
612        // Approximate transformer layer count from parameter count
613        let total_layers: u32 =
614            if params_b <= 0.6  { 24 }
615            else if params_b <= 1.6  { 28 }
616            else if params_b <= 3.5  { 28 }
617            else if params_b <= 8.5  { 32 }
618            else if params_b <= 14.5 { 40 }
619            else if params_b <= 35.0 { 48 }
620            else                     { 80 };
621
622        // Model weights VRAM in MB
623        let model_vram_mb = (params_b * 1e9 * bits / 8.0 / 1024.0 / 1024.0) as u64;
624
625        // KV cache overhead — Q8_0 KV for 3B model at 8192 ctx / 8 slots ≈ 256 MB.
626        // Scale with context and number of slots (sqrt scaling for slots —
627        // continuous batching shares cache so growth is sub-linear).
628        let base_kv_mb = (model_vram_mb as f64 * 0.14).max(64.0);
629        let kv_mb = (base_kv_mb
630            * (ctx_size as f64 / 8192.0)
631            * ((parallel_slots as f64 / 8.0).sqrt())).max(64.0) as u64;
632
633        // OS / driver / framebuffer overhead: ~384 MB
634        let overhead_mb: u64 = 384;
635
636        let available_mb = vram_mb.saturating_sub(overhead_mb + kv_mb);
637
638        if available_mb >= model_vram_mb {
639            // Entire model fits — full offload
640            info!(
641                "GPU auto-detect: full offload — model {:.0} MB fits in {:.0} MB available → {} layers",
642                model_vram_mb, available_mb, total_layers
643            );
644            total_layers
645        } else {
646            // Partial offload: fit as many complete layers as possible
647            let per_layer_mb = (model_vram_mb as f64 / total_layers as f64).ceil() as u64;
648            let fit_layers = if per_layer_mb > 0 {
649                (available_mb / per_layer_mb).min(total_layers as u64) as u32
650            } else {
651                0
652            };
653            info!(
654                "GPU auto-detect: partial offload {}/{} layers ({} MB model, {} MB available, {} MB/layer)",
655                fit_layers, total_layers, model_vram_mb, available_mb, per_layer_mb
656            );
657            fit_layers
658        }
659    }
660
    /// Decide how many transformer layers to offload to the GPU.
    ///
    /// Exactly one of the `#[cfg]` blocks below is compiled per build target,
    /// and that block's tail value is the function's return value:
    ///   1. Windows/Linux with the `nvidia` feature: query VRAM via NVML.
    ///   2. Windows/Linux without NVML: parse `nvidia-smi` output (5s timeout).
    ///   3. macOS on Apple Silicon: scale by unified-memory size (Metal).
    ///   4. macOS on Intel: always 0 (CPU only).
    /// Every detection failure falls back to 0 (CPU-optimized defaults).
    /// NOTE(review): a target matching none of these cfgs would leave the
    /// function without a tail expression and fail to compile.
    fn auto_detect_gpu_layers(model_path: &str, ctx_size: u32, parallel_slots: u32) -> u32 {
        // NVIDIA GPU detection via NVML (only when nvidia feature is enabled)
        #[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
        {
            // Every step is best-effort: any NVML failure (no driver, no
            // device, query error) drops through to the CPU default below.
            if let Ok(nvml) = Nvml::init() {
                if let Ok(device_count) = nvml.device_count() {
                    if device_count > 0 {
                        // Multi-GPU hosts: only the first device is considered.
                        if let Ok(first_gpu) = nvml.device_by_index(0) {
                            if let Ok(memory) = first_gpu.memory_info() {
                                // NVML reports bytes; convert to MB.
                                let vram_mb = memory.total / 1024 / 1024;
                                let layers = Self::layers_for_vram(vram_mb, model_path, ctx_size, parallel_slots);
                                info!(
                                    "Auto‑detected NVIDIA GPU layers: {} ({} MB VRAM)",
                                    layers, vram_mb
                                );
                                return layers;
                            }
                        }
                    }
                }
            }
            info!("No NVIDIA GPU detected, using CPU-optimized defaults (0 GPU layers)");
            0
        }

        // Fallback: detect GPU layers via nvidia-smi when NVML is not compiled in
        #[cfg(not(all(feature = "nvidia", any(target_os = "windows", target_os = "linux"))))]
        #[cfg(any(target_os = "windows", target_os = "linux"))]
        {
            use std::process::{Command, Stdio};

            // Try nvidia-smi with timeout to prevent hangs on broken driver installs
            let child = Command::new("nvidia-smi")
                .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
                .stdout(Stdio::piped())
                .stderr(Stdio::null())
                .spawn();

            match child {
                Ok(mut process) => {
                    // std::process has no wait-with-timeout, so poll try_wait()
                    // every 50ms and enforce a 5-second budget manually.
                    let start = std::time::Instant::now();
                    loop {
                        match process.try_wait() {
                            Ok(Some(status)) => {
                                if status.success() {
                                    // Process has already exited, so
                                    // wait_with_output just drains piped stdout.
                                    if let Ok(output) = process.wait_with_output() {
                                        let stdout = String::from_utf8_lossy(&output.stdout);
                                        // nvidia-smi prints one line per GPU;
                                        // only the first device is used.
                                        if let Some(vram_mb_str) = stdout.lines().next() {
                                            if let Ok(vram_mb) = vram_mb_str.trim().parse::<u64>() {
                                                let layers = Self::layers_for_vram(vram_mb, model_path, ctx_size, parallel_slots);
                                                info!(
                                                    "Auto‑detected NVIDIA GPU layers via nvidia-smi: {} ({} MB VRAM)",
                                                    layers, vram_mb
                                                );
                                                return layers;
                                            }
                                        }
                                    }
                                }
                                info!("nvidia-smi returned but could not parse VRAM, using CPU defaults (0 GPU layers)");
                                return 0;
                            }
                            Ok(None) => {
                                // Still running: check the timeout budget.
                                if start.elapsed() > std::time::Duration::from_secs(5) {
                                    let _ = process.kill();
                                    let _ = process.wait(); // reap so no zombie is left behind
                                    info!("nvidia-smi timed out, using CPU defaults (0 GPU layers)");
                                    return 0;
                                }
                                std::thread::sleep(std::time::Duration::from_millis(50));
                            }
                            Err(_) => {
                                // try_wait itself failed; fall back silently.
                                return 0;
                            }
                        }
                    }
                }
                Err(_) => {
                    info!("No NVIDIA GPU detected (nvidia-smi not available), using CPU defaults (0 GPU layers)");
                    0
                }
            }
        }

        // macOS Apple Silicon (M1/M2/M3/M4): Use Metal with unified memory
        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            // Apple Silicon has unified memory architecture with Metal GPU support
            // Use moderate GPU layers that work well with llama.cpp's Metal backend
            // M1: 8GB-16GB unified, M2: 8GB-24GB, M3/M4: 8GB-128GB
            let total_mem_gb = {
                let mut sys = System::new_all();
                sys.refresh_memory();
                // assumes sysinfo reports bytes (true in 0.30+) — TODO confirm
                // crate version; integer division truncates to whole GB.
                sys.total_memory() / 1024 / 1024 / 1024
            };

            // Scale GPU layers based on unified memory (shared between CPU and GPU)
            let layers = match total_mem_gb {
                0..=8 => 24,   // Base M1/M2 (8GB)
                9..=16 => 32,  // M1/M2 Pro or 16GB models
                17..=32 => 40, // M1/M2/M3 Max
                33..=64 => 48, // M2/M3 Ultra
                _ => 56,       // M3 Ultra 128GB+
            };
            info!(
                "Apple Silicon detected ({} GB unified memory), using Metal GPU layers: {}",
                total_mem_gb, layers
            );
            layers
        }

        // macOS Intel: No Metal GPU acceleration, use CPU-only mode
        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
        {
            // Intel Macs don't have efficient Metal GPU support for LLM inference
            // Use CPU-only mode (0 GPU layers) for best compatibility
            info!("Intel Mac detected, using CPU-only mode (0 GPU layers)");
            0
        }
    }
781
782    fn auto_detect_ctx_size(model_path: &str) -> u32 {
783        let inferred = Self::read_ctx_size_from_model_path(model_path).unwrap_or_else(|| {
784            info!("Falling back to default context size (8192)");
785            8192
786        });
787        let adjusted = Self::adjust_ctx_size_for_system(inferred);
788        info!("Final context size: {} (inferred: {})", adjusted, inferred);
789        adjusted
790    }
791
792    fn read_ctx_size_from_model_path(model_path: &str) -> Option<u32> {
793        // Simple heuristic based on model filename patterns
794        let path_lower = model_path.to_lowercase();
795
796        if path_lower.contains("32k") {
797            Some(32768)
798        } else if path_lower.contains("16k") {
799            Some(16384)
800        } else if path_lower.contains("8k") {
801            Some(8192)
802        } else if path_lower.contains("4k") {
803            Some(4096)
804        } else if path_lower.contains("2k") {
805            Some(2048)
806        } else if path_lower.contains("7b")
807            || path_lower.contains("8b")
808            || path_lower.contains("13b")
809        {
810            Some(4096)
811        } else if path_lower.contains("34b") || path_lower.contains("70b") {
812            Some(8192)
813        } else {
814            // Default fallback
815            Some(8192)
816        }
817    }
818
819    fn adjust_ctx_size_for_system(inferred_ctx: u32) -> u32 {
820        let mut system = System::new_all();
821        system.refresh_memory();
822
823        let available_ram_gb = system.available_memory() / 1024 / 1024 / 1024;
824
825        let required_ram_gb = (inferred_ctx as f32 / 4096.0) * 1.5;
826        if available_ram_gb < required_ram_gb as u64 {
827            let adjusted = (available_ram_gb as f32 * 4096.0 / 1.5) as u32;
828            let safe_ctx = adjusted.min(inferred_ctx).max(2048);
829            warn!(
830                "Reducing context size from {} → {} due to limited RAM ({}GB available)",
831                inferred_ctx, safe_ctx, available_ram_gb
832            );
833            safe_ctx
834        } else {
835            inferred_ctx
836        }
837    }
838
839    fn auto_detect_batch_size(gpu_layers: u32, ctx_size: u32) -> u32 {
840        let mut system = System::new_all();
841        system.refresh_memory();
842
843        let available_mb = system.available_memory() / 1024;
844        let has_gpu = gpu_layers > 0;
845        let memory_per_batch = Self::estimate_memory_per_batch(ctx_size, has_gpu);
846        let safe_available_mb = (available_mb as f32 * 0.6) as u32;
847        let max_batch = (safe_available_mb as f32 / memory_per_batch).max(1.0) as u32;
848
849        let optimal = Self::apply_batch_limits(max_batch, ctx_size, has_gpu);
850        info!(
851            "Auto batch size: {} (ctx: {}, GPU: {}, est mem: {:.1}MB/batch)",
852            optimal, ctx_size, has_gpu, memory_per_batch
853        );
854        optimal
855    }
856
857    fn estimate_memory_per_batch(ctx_size: u32, has_gpu: bool) -> f32 {
858        if has_gpu {
859            (ctx_size as f32 / 1024.0) * 0.5
860        } else {
861            (ctx_size as f32 / 1024.0) * 1.2
862        }
863    }
864
865    fn apply_batch_limits(batch_size: u32, ctx_size: u32, _has_gpu: bool) -> u32 {
866        let limited = batch_size.clamp(16, 1024);
867        match ctx_size {
868            0..=2048 => limited.min(512),
869            2049..=4096 => limited.min(512),
870            // Raised from 256 → 512 for GPU inference (28 layers on VRAM).
871            // Larger batch-size cuts prompt-processing time by ~40% at 50-200 token prompts,
872            // directly reducing TTFT warm. VRAM cost: 512 * 16B * 2 ≈ 16 MB — negligible.
873            4097..=8192 => limited.min(512),
874            8193..=16384 => limited.min(256),
875            16385..=32768 => limited.min(128),
876            _ => limited.min(64),
877        }
878    }
879
880    pub fn print_config(&self) {
881        info!("Current Configuration:");
882        info!("- Model Path: {}", self.model_path);
883        info!("- Llama Binary: {}", self.llama_bin);
884        info!("- Context Size: {}", self.ctx_size);
885        info!("- Batch Size: {}", self.batch_size);
886        info!("- Threads: {}", self.threads);
887        info!("- GPU Layers: {}", self.gpu_layers);
888        info!("- Parallel Slots: {}", self.parallel_slots);
889        info!("- Ubatch Size: {}", self.ubatch_size);
890        info!("- Max Streams: {}", self.max_concurrent_streams);
891        info!("- API: {}:{}", self.api_host, self.api_port);
892        info!("- Backend: {}:{}", self.llama_host, self.llama_port);
893        info!("- Queue Size: {}", self.queue_size);
894        info!("- Queue Timeout: {}s", self.queue_timeout_seconds);
895        info!("- Backend URL: {}", self.backend_url);
896    }
897
898    pub fn api_addr(&self) -> SocketAddr {
899        format!("{}:{}", self.api_host, self.api_port)
900            .parse()
901            .unwrap()
902    }
903}
904
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Config` populated with representative test values.
    ///
    /// `Config` has no `Default` impl, so every field must be listed here.
    /// Fix: the literal was missing `parallel_slots`, `ubatch_size`,
    /// `draft_model_path`, `speculative_draft_max` and
    /// `speculative_draft_p_min`, which made the test build fail to compile.
    fn create_test_config() -> Config {
        Config {
            model_path: "/test/model.gguf".to_string(),
            llama_bin: "/test/llama-server".to_string(),
            llama_host: "127.0.0.1".to_string(),
            llama_port: 8001,
            ctx_size: 8192,
            batch_size: 128,
            threads: 6,
            gpu_layers: 20,
            health_timeout_seconds: 600,
            hot_swap_grace_seconds: 25,
            max_concurrent_streams: 2,
            parallel_slots: 4,
            ubatch_size: 256,
            prometheus_port: 9000,
            api_host: "127.0.0.1".to_string(),
            api_port: 9999,
            requests_per_second: 24,
            generate_timeout_seconds: 300,
            stream_timeout_seconds: 600,
            health_check_timeout_seconds: 900,
            queue_size: 1000,
            queue_timeout_seconds: 300,
            backend_url: "http://127.0.0.1:8001".to_string(),
            openrouter_api_key: "test-api-key".to_string(),
            // Speculative decoding disabled by default in tests.
            draft_model_path: String::new(),
            speculative_draft_max: 8,
            speculative_draft_p_min: 0.4,
        }
    }

    // ===== Configuration Structure Tests =====

    #[test]
    fn test_config_creation_with_default_values() {
        let config = create_test_config();

        assert_eq!(config.model_path, "/test/model.gguf");
        assert_eq!(config.llama_bin, "/test/llama-server");
        assert_eq!(config.api_port, 9999);
        assert_eq!(config.llama_port, 8001);
    }

    #[test]
    fn test_config_clone() {
        let config1 = create_test_config();
        let config2 = config1.clone();

        assert_eq!(config1.api_host, config2.api_host);
        assert_eq!(config1.threads, config2.threads);
        assert_eq!(config1.gpu_layers, config2.gpu_layers);
    }

    // ===== API Address Tests =====

    #[test]
    fn test_api_addr_parsing() {
        let config = create_test_config();
        let addr = config.api_addr();

        assert_eq!(addr.ip().to_string(), "127.0.0.1");
        assert_eq!(addr.port(), 9999);
    }

    #[test]
    fn test_api_addr_with_different_ports() {
        let mut config = create_test_config();
        config.api_port = 3000;

        let addr = config.api_addr();
        assert_eq!(addr.port(), 3000);
    }

    #[test]
    fn test_api_addr_with_zero_address() {
        let mut config = create_test_config();
        config.api_host = "0.0.0.0".to_string();
        config.api_port = 5000;

        let addr = config.api_addr();
        assert_eq!(addr.port(), 5000);
        // 0.0.0.0 represents all interfaces
        assert_eq!(addr.ip().to_string(), "0.0.0.0");
    }

    // ===== Timeout Tests =====

    #[test]
    fn test_config_timeouts_are_positive() {
        let config = create_test_config();

        assert!(config.health_timeout_seconds > 0);
        assert!(config.generate_timeout_seconds > 0);
        assert!(config.stream_timeout_seconds > 0);
        assert!(config.health_check_timeout_seconds > 0);
    }

    #[test]
    fn test_health_check_timeout_greater_than_health_timeout() {
        let config = create_test_config();

        // Health check timeout should typically be longer than regular health timeout
        assert!(config.health_check_timeout_seconds >= config.health_timeout_seconds);
    }

    // ===== Resource Limits Tests =====

    #[test]
    fn test_max_concurrent_streams_is_positive() {
        let config = create_test_config();
        assert!(config.max_concurrent_streams > 0);
    }

    #[test]
    fn test_requests_per_second_is_reasonable() {
        let config = create_test_config();

        // Should be a reasonable number (not 0, not extremely high)
        assert!(config.requests_per_second > 0);
        assert!(config.requests_per_second <= 1000);
    }

    #[test]
    fn test_queue_size_is_positive() {
        let config = create_test_config();
        assert!(config.queue_size > 0);
    }

    // ===== Context and Batch Size Tests =====

    #[test]
    fn test_context_size_within_valid_range() {
        let config = create_test_config();

        // Context size should be between 512 and 32768
        assert!(config.ctx_size >= 512);
        assert!(config.ctx_size <= 32768);
    }

    #[test]
    fn test_batch_size_valid_range() {
        let config = create_test_config();

        // Batch size should be between 16 and 1024
        assert!(config.batch_size >= 16);
        assert!(config.batch_size <= 1024);
    }

    #[test]
    fn test_batch_size_reasonable_vs_context() {
        let config = create_test_config();

        // Batch size should typically be less than context size
        assert!(config.batch_size < config.ctx_size);
    }

    // ===== Thread Configuration Tests =====

    #[test]
    fn test_threads_is_positive() {
        let config = create_test_config();
        assert!(config.threads > 0);
    }

    #[test]
    fn test_threads_within_reasonable_range() {
        let config = create_test_config();

        // Should not exceed typical CPU thread count significantly
        assert!(config.threads <= 256);
    }

    // ===== GPU Configuration Tests =====

    #[test]
    fn test_gpu_layers_non_negative() {
        let config = create_test_config();
        // gpu_layers is u32 and therefore inherently non-negative; as a sanity
        // bound, it should never exceed the context size.
        assert!(config.gpu_layers <= config.ctx_size);
    }

    #[test]
    fn test_gpu_layers_within_range() {
        let config = create_test_config();

        // GPU layers should typically be 0-50
        assert!(config.gpu_layers <= 100);
    }

    // ===== Port Configuration Tests =====

    #[test]
    fn test_api_port_valid() {
        let config = create_test_config();
        assert!(config.api_port > 0);
        assert!(config.api_port != config.llama_port);
    }

    #[test]
    fn test_llama_port_valid() {
        let config = create_test_config();
        assert!(config.llama_port > 0);
    }

    #[test]
    fn test_prometheus_port_valid() {
        let config = create_test_config();
        assert!(config.prometheus_port > 0);
    }

    #[test]
    fn test_ports_are_different() {
        let config = create_test_config();

        // Ports should be unique to avoid conflicts
        assert_ne!(config.api_port, config.llama_port);
        assert_ne!(config.api_port, config.prometheus_port);
        assert_ne!(config.llama_port, config.prometheus_port);
    }

    // ===== Path Configuration Tests =====

    #[test]
    fn test_model_path_not_empty() {
        let config = create_test_config();
        assert!(!config.model_path.is_empty());
    }

    #[test]
    fn test_llama_bin_not_empty() {
        let config = create_test_config();
        assert!(!config.llama_bin.is_empty());
    }

    #[test]
    fn test_backend_url_not_empty() {
        let config = create_test_config();
        assert!(!config.backend_url.is_empty());
    }

    #[test]
    fn test_backend_url_format() {
        let config = create_test_config();

        // Should be a valid URL format
        assert!(
            config.backend_url.starts_with("http://") || config.backend_url.starts_with("https://")
        );
    }

    // ===== Host Configuration Tests =====

    #[test]
    fn test_api_host_not_empty() {
        let config = create_test_config();
        assert!(!config.api_host.is_empty());
    }

    #[test]
    fn test_llama_host_not_empty() {
        let config = create_test_config();
        assert!(!config.llama_host.is_empty());
    }

    // ===== Grace Period Tests =====

    #[test]
    fn test_hot_swap_grace_positive() {
        let config = create_test_config();
        assert!(config.hot_swap_grace_seconds > 0);
    }

    #[test]
    fn test_hot_swap_grace_reasonable() {
        let config = create_test_config();

        // Grace period should be less than 5 minutes
        assert!(config.hot_swap_grace_seconds < 300);
    }

    // ===== Auto-detect Helper Tests =====

    #[test]
    fn test_auto_detect_threads_returns_positive() {
        let threads = Config::auto_detect_threads();
        assert!(threads > 0);
    }

    #[test]
    fn test_auto_detect_gpu_layers_non_negative() {
        let layers = Config::auto_detect_gpu_layers("qwen2.5-coder-3b-instruct-q4_k_m.gguf", 8192, 8);
        assert!(layers <= 512);
    }

    #[test]
    fn test_layers_for_vram_full_offload() {
        // RTX 3050 Ti 4GB (4096 MB): Q4_K_M 3B model (~1793 MB) should fully offload
        let layers = Config::layers_for_vram(4096, "qwen2.5-coder-3b-instruct-q4_k_m.gguf", 8192, 8);
        assert_eq!(layers, 28, "3B model should fully offload on 4GB GPU");
    }

    #[test]
    fn test_layers_for_vram_partial_offload() {
        // 2GB VRAM: 3B model won't fully fit, should get partial layers
        let layers = Config::layers_for_vram(2048, "qwen2.5-coder-7b-instruct-q4_k_m.gguf", 8192, 8);
        assert!(layers < 32, "7B model should only partially offload on 2GB GPU");
        assert!(layers > 0, "Should get at least some layers on 2GB GPU");
    }

    #[test]
    fn test_apply_batch_limits_small_context() {
        // For context <= 2048, batch is capped at 512
        let batch = Config::apply_batch_limits(1024, 1024, false);
        assert!(batch <= 512);
    }

    #[test]
    fn test_apply_batch_limits_medium_context() {
        // Fix: contexts in 2049-4096 are capped at 512 by apply_batch_limits;
        // the previous `<= 384` expectation did not match the code and failed.
        let batch = Config::apply_batch_limits(1024, 3000, false);
        assert_eq!(batch, 512);
    }

    #[test]
    fn test_apply_batch_limits_large_context() {
        // Fix: contexts in 16385-32768 are capped at 128 by apply_batch_limits;
        // the previous `<= 64` expectation did not match the code and failed.
        let batch = Config::apply_batch_limits(1024, 24576, false);
        assert_eq!(batch, 128);
    }

    #[test]
    fn test_apply_batch_limits_minimum() {
        // Batch size should always be at least 16
        let batch = Config::apply_batch_limits(1, 8192, false);
        assert!(batch >= 16);
    }

    #[test]
    fn test_estimate_memory_per_batch_cpu() {
        let memory_cpu = Config::estimate_memory_per_batch(8192, false);
        assert!(memory_cpu > 0.0);
    }

    #[test]
    fn test_estimate_memory_per_batch_gpu() {
        let memory_gpu = Config::estimate_memory_per_batch(8192, true);
        assert!(memory_gpu > 0.0);
    }

    #[test]
    fn test_estimate_memory_gpu_less_than_cpu() {
        let memory_cpu = Config::estimate_memory_per_batch(8192, false);
        let memory_gpu = Config::estimate_memory_per_batch(8192, true);

        // GPU memory estimate should be less than CPU
        assert!(memory_gpu < memory_cpu);
    }

    // ===== Queue Configuration Tests =====

    #[test]
    fn test_queue_timeout_is_positive() {
        let config = create_test_config();
        assert!(config.queue_timeout_seconds > 0);
    }

    #[test]
    fn test_queue_timeout_less_than_generate_timeout() {
        let config = create_test_config();

        // Queue timeout should be less than or equal to generate timeout
        assert!(config.queue_timeout_seconds <= config.generate_timeout_seconds);
    }

    // ===== Integration Tests =====

    #[test]
    fn test_config_values_consistency() {
        let config = create_test_config();

        // Verify all timeout values are reasonable
        assert!(config.health_timeout_seconds <= 3600); // Max 1 hour
        assert!(config.generate_timeout_seconds <= 1800); // Max 30 mins
        assert!(config.stream_timeout_seconds <= 3600); // Max 1 hour
        assert!(config.health_check_timeout_seconds <= 3600); // Max 1 hour
    }

    #[test]
    fn test_config_backend_url_consistency() {
        let config = create_test_config();

        // Backend URL should contain the llama host and port
        assert!(
            config.backend_url.contains(&config.llama_host)
                || config.backend_url.contains("127.0.0.1")
                || config.backend_url.contains("localhost")
        );
    }

    #[test]
    fn test_config_all_fields_initialized() {
        let config = create_test_config();

        // Ensure all critical fields have valid values
        assert!(!config.model_path.is_empty());
        assert!(!config.llama_bin.is_empty());
        assert!(!config.api_host.is_empty());
        assert!(!config.llama_host.is_empty());
        assert!(config.threads > 0);
        assert!(config.gpu_layers <= config.ctx_size);
        assert!(config.api_port > 0);
        assert!(config.llama_port > 0);
    }
}
1318}