Skip to main content

offline_intelligence/
config.rs

1// _Aud.io/crates/offline-intelligence/src/config.rs
2
3use anyhow::Result;
4use std::env;
5use std::net::SocketAddr;
6use std::path::PathBuf;
7use sysinfo::System;
8use tracing::{debug, info, warn};
9
10// NVIDIA GPU detection only available when nvidia feature is enabled (Windows and Linux)
11#[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
12use nvml_wrapper::Nvml;
13
/// Runtime configuration for the offline-intelligence service, assembled from
/// environment variables (and an optional `.env` file) by [`Config::from_env`].
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct Config {
    /// Filesystem path to the model file; empty string when no model was found
    /// (the service can still start and install one later via the registry).
    pub model_path: String,
    /// Path to the llama-server executable; empty string when auto-detection
    /// found nothing.
    pub llama_bin: String,
    /// Host the local llama-server backend binds to (default "127.0.0.1").
    pub llama_host: String,
    /// Port of the local llama-server backend (default 8081).
    pub llama_port: u16,
    /// Context window size in tokens ("auto" → inferred from model name + RAM).
    pub ctx_size: u32,
    /// Prompt-processing batch size ("auto" → derived from free RAM / GPU use).
    pub batch_size: u32,
    /// Inference thread count ("auto" → number of logical CPU cores).
    pub threads: u32,
    /// Model layers offloaded to the GPU ("auto" → VRAM / unified-memory based).
    pub gpu_layers: u32,
    /// HEALTH_TIMEOUT_SECONDS (default 60).
    pub health_timeout_seconds: u64,
    /// HOT_SWAP_GRACE_SECONDS (default 25).
    pub hot_swap_grace_seconds: u64,
    /// MAX_CONCURRENT_STREAMS (default 4).
    pub max_concurrent_streams: u32,
    /// PROMETHEUS_PORT (default 9000).
    pub prometheus_port: u16,
    /// API_HOST (default "127.0.0.1").
    pub api_host: String,
    /// API_PORT (default 9999).
    pub api_port: u16,
    /// REQUESTS_PER_SECOND (default 24).
    pub requests_per_second: u32,
    /// GENERATE_TIMEOUT_SECONDS (default 300).
    pub generate_timeout_seconds: u64,
    /// STREAM_TIMEOUT_SECONDS (default 600).
    pub stream_timeout_seconds: u64,
    /// HEALTH_CHECK_TIMEOUT_SECONDS (default 90).
    pub health_check_timeout_seconds: u64,
    /// QUEUE_SIZE (default 100).
    pub queue_size: usize,
    /// QUEUE_TIMEOUT_SECONDS (default 30).
    pub queue_timeout_seconds: u64,
    /// Derived: "http://{llama_host}:{llama_port}".
    pub backend_url: String,
    /// OPENROUTER_API_KEY; empty string when unset.
    pub openrouter_api_key: String,
}
40
41impl Config {
    /// Builds the full service configuration from environment variables.
    ///
    /// A `.env` file is loaded from the first location that provides one:
    /// 1. The executable's directory (production builds with a bundled .env).
    /// 2. macOS only: the app bundle's Contents/Resources/, then the user's
    ///    data directory (~/Library/Application Support/Aud.io/.env).
    /// 3. The project root, when running from target/release (development).
    /// 4. The current working directory (final fallback).
    ///
    /// THREADS, GPU_LAYERS, CTX_SIZE and BATCH_SIZE all default to "auto",
    /// which triggers hardware-based detection; every other numeric variable
    /// has a fixed default.
    ///
    /// # Errors
    /// Fails when a port/timeout/queue variable is set to a non-numeric value
    /// (those use `parse()?` with no lenient fallback).
    pub fn from_env() -> Result<Self> {
        // Try to load .env from multiple locations in order:
        // 1. First try executable directory (for production builds with bundled .env)
        // 2. Then try project root (where .env actually is during development)
        // 3. Finally try current directory as fallback

        let mut env_loaded = false;

        // 1. Try executable directory
        if let Ok(exe_path) = std::env::current_exe() {
            if let Some(exe_dir) = exe_path.parent() {
                let env_path = exe_dir.join(".env");
                if env_path.exists() {
                    match dotenvy::from_path(&env_path) {
                        Ok(_) => {
                            info!("Loaded .env from executable directory: {:?}", env_path);
                            env_loaded = true;
                        }
                        Err(e) => {
                            // Non-fatal: keep trying the other candidate locations.
                            warn!("Failed to load .env from {:?}: {}", env_path, e);
                        }
                    }
                }

                // 2a. macOS .app bundle: exe is at App.app/Contents/MacOS/binary
                //     Resources live at App.app/Contents/Resources/.env
                #[cfg(target_os = "macos")]
                if !env_loaded {
                    // exe_dir = App.app/Contents/MacOS/
                    // parent  = App.app/Contents/
                    if let Some(contents_dir) = exe_dir.parent() {
                        let bundle_env = contents_dir.join("Resources").join(".env");
                        if bundle_env.exists() {
                            match dotenvy::from_path(&bundle_env) {
                                Ok(_) => {
                                    info!("Loaded .env from macOS bundle Resources: {:?}", bundle_env);
                                    env_loaded = true;
                                }
                                Err(e) => {
                                    warn!("Failed to load .env from bundle Resources {:?}: {}", bundle_env, e);
                                }
                            }
                        }
                    }
                }

                // 2b. macOS: also try ~/Library/Application Support/Aud.io/.env
                //     This allows post-install configuration without modifying the bundle.
                #[cfg(target_os = "macos")]
                if !env_loaded {
                    if let Some(app_support) = dirs::data_dir() {
                        let user_env = app_support.join("Aud.io").join(".env");
                        if user_env.exists() {
                            match dotenvy::from_path(&user_env) {
                                Ok(_) => {
                                    info!("Loaded .env from user data directory: {:?}", user_env);
                                    env_loaded = true;
                                }
                                Err(e) => {
                                    warn!("Failed to load .env from user data dir {:?}: {}", user_env, e);
                                }
                            }
                        }
                    }
                }

                // 2c. Development: try project root (../../ from target/release/ or target\release\)
                // NOTE(review): only release builds are recognized here; a debug
                // build (target/debug) falls through to the cwd fallback below.
                if !env_loaded {
                    let project_root = if exe_dir.ends_with("target/release")
                        || exe_dir.ends_with("target\\release")
                    {
                        exe_dir.parent().and_then(|p| p.parent())
                    } else {
                        None
                    };

                    if let Some(root) = project_root {
                        let root_env = root.join(".env");
                        if root_env.exists() {
                            match dotenvy::from_path(&root_env) {
                                Ok(_) => {
                                    info!("Loaded .env from project root: {:?}", root_env);
                                    env_loaded = true;
                                }
                                Err(e) => {
                                    warn!(
                                        "Failed to load .env from project root {:?}: {}",
                                        root_env, e
                                    );
                                }
                            }
                        }
                    }
                }
            }
        }

        // 3. If still not loaded, try current directory (development fallback)
        if !env_loaded {
            if let Err(e) = dotenvy::dotenv() {
                warn!("Failed to load .env from current directory: {}. Using system environment variables.", e);
            } else {
                info!("Loaded environment variables from .env file in current directory");
            }
        }

        // Auto-detect llama binary based on OS, with optional LLAMA_BIN override
        let llama_bin = Self::get_llama_binary_path()?;
        info!("Using llama binary: {}", llama_bin);

        // Use MODEL_PATH from env, or try to find embedded model
        let model_path = Self::get_model_path_with_fallback()?;

        // Auto‑detect threads if set to "auto"
        // (an unset THREADS behaves like "auto"; a non-numeric value falls back to 6)
        let threads = if env::var("THREADS").unwrap_or_else(|_| "auto".into()) == "auto" {
            Self::auto_detect_threads()
        } else {
            env::var("THREADS")
                .unwrap_or_else(|_| "6".into())
                .parse()
                .unwrap_or(6)
        };

        // Auto‑detect GPU layers if set to "auto" (non-numeric values fall back to 20)
        let gpu_layers = if env::var("GPU_LAYERS").unwrap_or_else(|_| "auto".into()) == "auto" {
            Self::auto_detect_gpu_layers()
        } else {
            env::var("GPU_LAYERS")
                .unwrap_or_else(|_| "20".into())
                .parse()
                .unwrap_or(20)
        };

        // Context size: "auto" infers from the model filename, then clamps to system RAM.
        let ctx_size = if env::var("CTX_SIZE").unwrap_or_else(|_| "auto".into()) == "auto" {
            Self::auto_detect_ctx_size(&model_path)
        } else {
            env::var("CTX_SIZE")
                .unwrap_or_else(|_| "8192".into())
                .parse()
                .unwrap_or(8192)
        };

        // Auto‑detect batch size (depends on gpu_layers and ctx_size resolved above)
        let batch_size = if env::var("BATCH_SIZE").unwrap_or_else(|_| "auto".into()) == "auto" {
            Self::auto_detect_batch_size(gpu_layers, ctx_size)
        } else {
            env::var("BATCH_SIZE")
                .unwrap_or_else(|_| "256".into())
                .parse()
                .unwrap_or(256)
        };

        // Get backend URL components
        let llama_host = env::var("LLAMA_HOST").unwrap_or_else(|_| "127.0.0.1".into());
        let llama_port = env::var("LLAMA_PORT")
            .unwrap_or_else(|_| "8081".into())
            .parse()?;
        let backend_url = format!("http://{}:{}", llama_host, llama_port);

        // Get OpenRouter API key from environment variable
        // (missing key yields an empty string; presumably disables the online
        // OpenRouter path — verify at the call site)
        let openrouter_api_key = env::var("OPENROUTER_API_KEY").unwrap_or_default();

        info!(
            "Resource Configuration: {} GPU layers, {} threads, batch size: {}, context: {}",
            gpu_layers, threads, batch_size, ctx_size
        );

        Ok(Self {
            model_path,
            llama_bin,
            // NOTE(review): clone is unnecessary (format! above only borrows
            // llama_host) but harmless.
            llama_host: llama_host.clone(),
            llama_port,
            ctx_size,
            batch_size,
            threads,
            gpu_layers,
            health_timeout_seconds: env::var("HEALTH_TIMEOUT_SECONDS")
                .unwrap_or_else(|_| "60".into())
                .parse()?,
            hot_swap_grace_seconds: env::var("HOT_SWAP_GRACE_SECONDS")
                .unwrap_or_else(|_| "25".into())
                .parse()?,
            max_concurrent_streams: env::var("MAX_CONCURRENT_STREAMS")
                .unwrap_or_else(|_| "4".into())
                .parse()?,
            prometheus_port: env::var("PROMETHEUS_PORT")
                .unwrap_or_else(|_| "9000".into())
                .parse()?,
            api_host: env::var("API_HOST").unwrap_or_else(|_| "127.0.0.1".into()),
            api_port: env::var("API_PORT")
                .unwrap_or_else(|_| "9999".into())
                .parse()?,
            requests_per_second: env::var("REQUESTS_PER_SECOND")
                .unwrap_or_else(|_| "24".into())
                .parse()?,
            generate_timeout_seconds: env::var("GENERATE_TIMEOUT_SECONDS")
                .unwrap_or_else(|_| "300".into())
                .parse()?,
            stream_timeout_seconds: env::var("STREAM_TIMEOUT_SECONDS")
                .unwrap_or_else(|_| "600".into())
                .parse()?,
            health_check_timeout_seconds: env::var("HEALTH_CHECK_TIMEOUT_SECONDS")
                .unwrap_or_else(|_| "90".into())
                .parse()?,
            queue_size: env::var("QUEUE_SIZE")
                .unwrap_or_else(|_| "100".into())
                .parse()?,
            queue_timeout_seconds: env::var("QUEUE_TIMEOUT_SECONDS")
                .unwrap_or_else(|_| "30".into())
                .parse()?,
            backend_url,
            openrouter_api_key,
        })
    }
256
257    fn get_model_path_with_fallback() -> Result<String> {
258        // First try environment variable
259        if let Ok(model_path) = env::var("MODEL_PATH") {
260            // Check if the path exists
261            if std::path::Path::new(&model_path).exists() {
262                info!("Using model from MODEL_PATH: {}", model_path);
263                return Ok(model_path);
264            } else {
265                warn!("MODEL_PATH set but file doesn't exist: {}", model_path);
266            }
267        }
268
269        // Try to find embedded model
270        let exe_dir = std::env::current_exe()
271            .ok()
272            .and_then(|exe| exe.parent().map(|p| p.to_path_buf()))
273            .unwrap_or_else(|| std::env::current_dir().unwrap_or_default());
274
275        // Check multiple possible embedded model locations (MULTI-FORMAT SUPPORT)
276        let possible_model_locations = vec![
277            // GGUF formats
278            exe_dir.join("resources/models/default.gguf"),
279            exe_dir.join("resources/models/model.gguf"),
280            exe_dir.join("models/default.gguf"),
281            exe_dir.join("models/model.gguf"),
282            exe_dir.join("default.gguf"),
283            // ONNX formats
284            exe_dir.join("resources/models/default.onnx"),
285            exe_dir.join("resources/models/model.onnx"),
286            // TensorRT formats
287            exe_dir.join("resources/models/default.trt"),
288            exe_dir.join("resources/models/model.engine"),
289            // Safetensors formats
290            exe_dir.join("resources/models/default.safetensors"),
291            exe_dir.join("resources/models/model.safetensors"),
292            // GGML formats
293            exe_dir.join("resources/models/default.ggml"),
294            exe_dir.join("resources/models/model.bin"),
295        ];
296
297        for model_path in possible_model_locations {
298            if model_path.exists() {
299                info!("Using embedded model: {}", model_path.display());
300                return Ok(model_path.to_string_lossy().to_string());
301            }
302        }
303
304        // Check for any supported model file in models directory
305        if let Ok(entries) = std::fs::read_dir(exe_dir.join("resources/models")) {
306            for entry in entries.flatten() {
307                if let Some(ext) = entry.path().extension() {
308                    let ext_str = ext.to_str().unwrap_or("").to_lowercase();
309                    // Check if extension matches any supported format
310                    if matches!(
311                        ext_str.as_str(),
312                        "gguf"
313                            | "ggml"
314                            | "onnx"
315                            | "trt"
316                            | "engine"
317                            | "plan"
318                            | "safetensors"
319                            | "mlmodel"
320                    ) {
321                        info!("Using found model: {}", entry.path().display());
322                        return Ok(entry.path().to_string_lossy().to_string());
323                    }
324                }
325            }
326        }
327
328        // Return a default path when no model is found, allowing the system to start
329        // Models can be downloaded later via the model registry
330        Ok("".to_string())
331    }
332
    /// Auto-detect the llama-server binary path based on the current OS.
    ///
    /// Search order:
    /// 1. LLAMA_BIN environment variable (if set and exists)
    /// 2. Resources/bin/{OS}/ relative to executable
    /// 3. Resources/bin/{OS}/ relative to current working directory
    /// 4. Resources/bin/{OS}/ relative to crate root (for development)
    ///
    /// Returns `Ok("")` (not an error) when nothing is found, so the HTTP
    /// server can still start and serve online-mode requests.
    fn get_llama_binary_path() -> Result<String> {
        // 1. Check LLAMA_BIN environment variable first (allows override)
        if let Ok(llama_bin) = env::var("LLAMA_BIN") {
            if std::path::Path::new(&llama_bin).exists() {
                info!("Using llama binary from LLAMA_BIN env: {}", llama_bin);
                return Ok(llama_bin);
            } else {
                warn!(
                    "LLAMA_BIN set but file doesn't exist: {}, falling back to auto-detection",
                    llama_bin
                );
            }
        }

        // Determine OS-specific binary name and folder
        let (os_folder, binary_name) = Self::get_platform_binary_info();
        info!(
            "Auto-detecting llama binary for OS: {} (binary: {})",
            os_folder, binary_name
        );

        // Get potential base directories
        let exe_dir = std::env::current_exe()
            .ok()
            .and_then(|exe| exe.parent().map(|p| p.to_path_buf()));

        let cwd = std::env::current_dir().ok();

        // Build list of directories to search
        let mut search_dirs: Vec<PathBuf> = Vec::new();

        if let Some(ref exe) = exe_dir {
            search_dirs.push(exe.clone());
            // Also check parent directories (for bundled apps)
            if let Some(parent) = exe.parent() {
                search_dirs.push(parent.to_path_buf());
                if let Some(grandparent) = parent.parent() {
                    search_dirs.push(grandparent.to_path_buf());
                }
            }
        }

        if let Some(ref cwd_path) = cwd {
            search_dirs.push(cwd_path.clone());
        }

        // In development builds, also check relative to the crate source directory
        #[cfg(debug_assertions)]
        {
            let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
            search_dirs.push(crate_dir);
        }

        // Search for binary in each potential location
        // Check both "Resources" (uppercase) and "resources" (lowercase, Tauri v2 bundle)
        let resource_folder_names = ["Resources", "resources"];
        for base_dir in &search_dirs {
            for resource_folder in &resource_folder_names {
            let bin_dir = base_dir.join(resource_folder).join("bin").join(os_folder);

            if bin_dir.exists() {
                // Search for the binary in subdirectories (e.g., llama-b6970-bin-macos-arm64/)
                // On macOS we must skip subdirectories built for the other architecture so that
                // an Intel Mac does not accidentally load an arm64 binary (and vice-versa).
                if let Ok(entries) = std::fs::read_dir(&bin_dir) {
                    // Collect and sort so the search is deterministic across filesystems.
                    let mut dir_entries: Vec<_> = entries.flatten().collect();
                    dir_entries.sort_by_key(|e| e.file_name());

                    for entry in dir_entries {
                        let entry_path = entry.path();
                        if !entry_path.is_dir() {
                            continue;
                        }

                        // Architecture guard — only relevant on macOS where both
                        // arm64 and x64 subdirectories may coexist under MacOS/.
                        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
                        {
                            let dir_name = entry_path
                                .file_name()
                                .and_then(|n| n.to_str())
                                .unwrap_or("");
                            // Skip Intel-only subdirectories on Apple Silicon.
                            if dir_name.contains("x64") || dir_name.contains("x86_64") {
                                debug!("Skipping Intel subdir on Apple Silicon: {}", dir_name);
                                continue;
                            }
                        }
                        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
                        {
                            let dir_name = entry_path
                                .file_name()
                                .and_then(|n| n.to_str())
                                .unwrap_or("");
                            // Skip ARM-only subdirectories on Intel Mac.
                            if dir_name.contains("arm64") || dir_name.contains("aarch64") {
                                debug!("Skipping ARM subdir on Intel Mac: {}", dir_name);
                                continue;
                            }
                        }

                        // First matching subdirectory (sorted order) wins.
                        let potential_binary = entry_path.join(binary_name);
                        if potential_binary.exists() {
                            info!("Found llama binary at: {}", potential_binary.display());
                            return Ok(potential_binary.to_string_lossy().to_string());
                        }
                    }
                }

                // Also check directly in the OS folder
                let direct_binary = bin_dir.join(binary_name);
                if direct_binary.exists() {
                    info!("Found llama binary at: {}", direct_binary.display());
                    return Ok(direct_binary.to_string_lossy().to_string());
                }
            }
            } // end resource_folder_names loop
        }

        let arch = Self::get_arch_hint();
        warn!(
            "Llama binary not found. Searched in Resources/bin/{os_folder}/ for '{binary_name}'.\n\
             Please either:\n\
             1. Set LLAMA_BIN environment variable to the full path\n\
             2. Place the binary in Resources/bin/{os_folder}/<subfolder>/\n\
             \n\
             Expected binary name: {binary_name}\n\
             OS detected: {os_folder}\n\
             Architecture: {arch}\n\
             Searched directories: {:?}",
            search_dirs
                .iter()
                .map(|p| p.display().to_string())
                .collect::<Vec<_>>()
        );

        // Return empty string instead of crashing - allows the HTTP server to start
        // and serve online-mode requests while the local runtime is unavailable.
        // Models and binaries can be downloaded later via the model registry.
        Ok(String::new())
    }
482
483    /// Returns (os_folder_name, binary_name) for the current platform and architecture
484    fn get_platform_binary_info() -> (&'static str, &'static str) {
485        #[cfg(target_os = "windows")]
486        {
487            ("Windows", "llama-server.exe")
488        }
489
490        // macOS Apple Silicon (M1/M2/M3/M4)
491        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
492        {
493            ("MacOS", "llama-server")
494            // Will search in: Resources/bin/MacOS/llama-*-macos-arm64/
495        }
496
497        // macOS Intel
498        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
499        {
500            ("MacOS", "llama-server")
501            // Will search in: Resources/bin/MacOS/llama-*-macos-x64/
502        }
503
504        #[cfg(target_os = "linux")]
505        {
506            ("Linux", "llama-server")
507        }
508
509        #[cfg(not(any(target_os = "windows", target_os = "macos", target_os = "linux")))]
510        {
511            compile_error!(
512                "Unsupported operating system. Only Windows, macOS, and Linux are supported."
513            );
514        }
515    }
516
517    /// Returns the current system architecture string for logging and binary matching
518    fn get_arch_hint() -> &'static str {
519        #[cfg(target_arch = "x86_64")]
520        {
521            "x64"
522        }
523        #[cfg(target_arch = "aarch64")]
524        {
525            "arm64"
526        }
527        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
528        {
529            "unknown"
530        }
531    }
532
533    fn auto_detect_threads() -> u32 {
534        let threads = num_cpus::get() as u32;
535        info!("Auto-detected {} CPU cores for inference", threads);
536        threads
537    }
538
    /// Picks how many model layers to offload to the GPU.
    ///
    /// Exactly one of the cfg'd blocks below is compiled per platform/feature
    /// combination and provides the function's value:
    /// - NVML probe (windows/linux with the "nvidia" feature)
    /// - nvidia-smi probe with a 5 s timeout (windows/linux without it)
    /// - unified-memory heuristic (macOS Apple Silicon)
    /// - CPU-only, 0 layers (macOS Intel)
    fn auto_detect_gpu_layers() -> u32 {
        // NVIDIA GPU detection via NVML (only when nvidia feature is enabled)
        #[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
        {
            if let Ok(nvml) = Nvml::init() {
                if let Ok(device_count) = nvml.device_count() {
                    if device_count > 0 {
                        // Only the first GPU is consulted; multi-GPU setups are
                        // sized by device 0's VRAM.
                        if let Ok(first_gpu) = nvml.device_by_index(0) {
                            if let Ok(memory) = first_gpu.memory_info() {
                                // NVML reports bytes; convert to whole GB.
                                let vram_gb = memory.total / 1024 / 1024 / 1024;
                                let layers = match vram_gb {
                                    0..=4  => 12, // partial offload for small VRAM
                                    5..=8  => 20, // full 7B Q4 model
                                    9..=12 => 32, // full 13B Q4 model
                                    13..=16 => 50,
                                    _ => 50,
                                };
                                info!(
                                    "Auto‑detected NVIDIA GPU layers: {} ({} GB VRAM)",
                                    layers, vram_gb
                                );
                                return layers;
                            }
                        }
                    }
                }
            }
            // Any NVML failure above falls through to CPU-only.
            info!("No NVIDIA GPU detected, using CPU-optimized defaults (0 GPU layers)");
            0
        }

        // Fallback: detect GPU layers via nvidia-smi when NVML is not compiled in
        // (the two stacked cfg attributes AND together: this block is only
        // compiled on windows/linux builds *without* the "nvidia" feature)
        #[cfg(not(all(feature = "nvidia", any(target_os = "windows", target_os = "linux"))))]
        #[cfg(any(target_os = "windows", target_os = "linux"))]
        {
            use std::process::{Command, Stdio};

            // Try nvidia-smi with timeout to prevent hangs on broken driver installs
            let child = Command::new("nvidia-smi")
                .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
                .stdout(Stdio::piped())
                .stderr(Stdio::null())
                .spawn();

            match child {
                Ok(mut process) => {
                    let start = std::time::Instant::now();
                    loop {
                        match process.try_wait() {
                            Ok(Some(status)) => {
                                if status.success() {
                                    if let Ok(output) = process.wait_with_output() {
                                        let stdout = String::from_utf8_lossy(&output.stdout);
                                        // nvidia-smi prints one line per GPU; only the
                                        // first GPU's memory is used for sizing.
                                        if let Some(vram_mb_str) = stdout.lines().next() {
                                            if let Ok(vram_mb) = vram_mb_str.trim().parse::<u64>() {
                                                let vram_gb = vram_mb / 1024;
                                                let layers = match vram_gb {
                                                    0..=4  => 12,
                                                    5..=8  => 20,
                                                    9..=12 => 32,
                                                    13..=16 => 50,
                                                    _ => 50,
                                                };
                                                info!(
                                                    "Auto‑detected NVIDIA GPU layers via nvidia-smi: {} ({} GB VRAM)",
                                                    layers, vram_gb
                                                );
                                                return layers;
                                            }
                                        }
                                    }
                                }
                                info!("nvidia-smi returned but could not parse VRAM, using CPU defaults (0 GPU layers)");
                                return 0;
                            }
                            Ok(None) => {
                                // Child still running: poll every 50 ms, give up after 5 s.
                                if start.elapsed() > std::time::Duration::from_secs(5) {
                                    let _ = process.kill();
                                    let _ = process.wait();
                                    info!("nvidia-smi timed out, using CPU defaults (0 GPU layers)");
                                    return 0;
                                }
                                std::thread::sleep(std::time::Duration::from_millis(50));
                            }
                            Err(_) => {
                                // Could not query the child process; assume no GPU.
                                return 0;
                            }
                        }
                    }
                }
                Err(_) => {
                    info!("No NVIDIA GPU detected (nvidia-smi not available), using CPU defaults (0 GPU layers)");
                    0
                }
            }
        }

        // macOS Apple Silicon (M1/M2/M3/M4): Use Metal with unified memory
        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            // Apple Silicon has unified memory architecture with Metal GPU support
            // Use moderate GPU layers that work well with llama.cpp's Metal backend
            // M1: 8GB-16GB unified, M2: 8GB-24GB, M3/M4: 8GB-128GB
            let total_mem_gb = {
                let mut sys = System::new_all();
                sys.refresh_memory();
                // assumes sysinfo returns bytes (true for sysinfo >= 0.26), so
                // this yields whole GB — TODO confirm the pinned crate version
                sys.total_memory() / 1024 / 1024 / 1024
            };

            // Scale GPU layers based on unified memory (shared between CPU and GPU)
            let layers = match total_mem_gb {
                0..=8 => 24,   // Base M1/M2 (8GB)
                9..=16 => 32,  // M1/M2 Pro or 16GB models
                17..=32 => 40, // M1/M2/M3 Max
                33..=64 => 48, // M2/M3 Ultra
                _ => 56,       // M3 Ultra 128GB+
            };
            info!(
                "Apple Silicon detected ({} GB unified memory), using Metal GPU layers: {}",
                total_mem_gb, layers
            );
            layers
        }

        // macOS Intel: No Metal GPU acceleration, use CPU-only mode
        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
        {
            // Intel Macs don't have efficient Metal GPU support for LLM inference
            // Use CPU-only mode (0 GPU layers) for best compatibility
            info!("Intel Mac detected, using CPU-only mode (0 GPU layers)");
            0
        }
    }
672
673    fn auto_detect_ctx_size(model_path: &str) -> u32 {
674        let inferred = Self::read_ctx_size_from_model_path(model_path).unwrap_or_else(|| {
675            info!("Falling back to default context size (8192)");
676            8192
677        });
678        let adjusted = Self::adjust_ctx_size_for_system(inferred);
679        info!("Final context size: {} (inferred: {})", adjusted, inferred);
680        adjusted
681    }
682
683    fn read_ctx_size_from_model_path(model_path: &str) -> Option<u32> {
684        // Simple heuristic based on model filename patterns
685        let path_lower = model_path.to_lowercase();
686
687        if path_lower.contains("32k") {
688            Some(32768)
689        } else if path_lower.contains("16k") {
690            Some(16384)
691        } else if path_lower.contains("8k") {
692            Some(8192)
693        } else if path_lower.contains("4k") {
694            Some(4096)
695        } else if path_lower.contains("2k") {
696            Some(2048)
697        } else if path_lower.contains("7b")
698            || path_lower.contains("8b")
699            || path_lower.contains("13b")
700        {
701            Some(4096)
702        } else if path_lower.contains("34b") || path_lower.contains("70b") {
703            Some(8192)
704        } else {
705            // Default fallback
706            Some(8192)
707        }
708    }
709
710    fn adjust_ctx_size_for_system(inferred_ctx: u32) -> u32 {
711        let mut system = System::new_all();
712        system.refresh_memory();
713
714        let available_ram_gb = system.available_memory() / 1024 / 1024 / 1024;
715
716        let required_ram_gb = (inferred_ctx as f32 / 4096.0) * 1.5;
717        if available_ram_gb < required_ram_gb as u64 {
718            let adjusted = (available_ram_gb as f32 * 4096.0 / 1.5) as u32;
719            let safe_ctx = adjusted.min(inferred_ctx).max(2048);
720            warn!(
721                "Reducing context size from {} → {} due to limited RAM ({}GB available)",
722                inferred_ctx, safe_ctx, available_ram_gb
723            );
724            safe_ctx
725        } else {
726            inferred_ctx
727        }
728    }
729
730    fn auto_detect_batch_size(gpu_layers: u32, ctx_size: u32) -> u32 {
731        let mut system = System::new_all();
732        system.refresh_memory();
733
734        let available_mb = system.available_memory() / 1024;
735        let has_gpu = gpu_layers > 0;
736        let memory_per_batch = Self::estimate_memory_per_batch(ctx_size, has_gpu);
737        let safe_available_mb = (available_mb as f32 * 0.6) as u32;
738        let max_batch = (safe_available_mb as f32 / memory_per_batch).max(1.0) as u32;
739
740        let optimal = Self::apply_batch_limits(max_batch, ctx_size, has_gpu);
741        info!(
742            "Auto batch size: {} (ctx: {}, GPU: {}, est mem: {:.1}MB/batch)",
743            optimal, ctx_size, has_gpu, memory_per_batch
744        );
745        optimal
746    }
747
748    fn estimate_memory_per_batch(ctx_size: u32, has_gpu: bool) -> f32 {
749        if has_gpu {
750            (ctx_size as f32 / 1024.0) * 0.5
751        } else {
752            (ctx_size as f32 / 1024.0) * 1.2
753        }
754    }
755
756    fn apply_batch_limits(batch_size: u32, ctx_size: u32, _has_gpu: bool) -> u32 {
757        let limited = batch_size.clamp(16, 1024);
758        match ctx_size {
759            0..=2048 => limited.min(512),
760            2049..=4096 => limited.min(512),
761            4097..=8192 => limited.min(256),
762            8193..=16384 => limited.min(128),
763            16385..=32768 => limited.min(64),
764            _ => limited.min(32),
765        }
766    }
767
    /// Log the active configuration at `info` level, one field per line.
    ///
    /// Not every field is printed: `openrouter_api_key` is omitted
    /// (presumably to keep the secret out of logs — confirm intent), as are
    /// the prometheus port and the individual timeout fields.
    pub fn print_config(&self) {
        info!("Current Configuration:");
        info!("- Model Path: {}", self.model_path);
        info!("- Llama Binary: {}", self.llama_bin);
        info!("- Context Size: {}", self.ctx_size);
        info!("- Batch Size: {}", self.batch_size);
        info!("- Threads: {}", self.threads);
        info!("- GPU Layers: {}", self.gpu_layers);
        info!("- Max Streams: {}", self.max_concurrent_streams);
        // "API" is this service's public listener.
        info!("- API: {}:{}", self.api_host, self.api_port);
        // "Backend" is the llama.cpp server host/port pair.
        info!("- Backend: {}:{}", self.llama_host, self.llama_port);
        info!("- Queue Size: {}", self.queue_size);
        info!("- Queue Timeout: {}s", self.queue_timeout_seconds);
        info!("- Backend URL: {}", self.backend_url);
    }
783
784    pub fn api_addr(&self) -> SocketAddr {
785        format!("{}:{}", self.api_host, self.api_port)
786            .parse()
787            .unwrap()
788    }
789}
790
#[cfg(test)]
mod tests {
    use super::*;

    /// Helper function to create a test Config with representative values.
    fn create_test_config() -> Config {
        Config {
            model_path: "/test/model.gguf".to_string(),
            llama_bin: "/test/llama-server".to_string(),
            llama_host: "127.0.0.1".to_string(),
            llama_port: 8001,
            ctx_size: 8192,
            batch_size: 128,
            threads: 6,
            gpu_layers: 20,
            health_timeout_seconds: 600,
            hot_swap_grace_seconds: 25,
            max_concurrent_streams: 2,
            prometheus_port: 9000,
            api_host: "127.0.0.1".to_string(),
            api_port: 9999,
            requests_per_second: 24,
            generate_timeout_seconds: 300,
            stream_timeout_seconds: 600,
            health_check_timeout_seconds: 900,
            queue_size: 1000,
            queue_timeout_seconds: 300,
            backend_url: "http://127.0.0.1:8001".to_string(),
            openrouter_api_key: "test-api-key".to_string(),
        }
    }

    // ===== Configuration Structure Tests =====

    #[test]
    fn test_config_creation_with_default_values() {
        let config = create_test_config();

        assert_eq!(config.model_path, "/test/model.gguf");
        assert_eq!(config.llama_bin, "/test/llama-server");
        assert_eq!(config.api_port, 9999);
        assert_eq!(config.llama_port, 8001);
    }

    #[test]
    fn test_config_clone() {
        let config1 = create_test_config();
        let config2 = config1.clone();

        assert_eq!(config1.api_host, config2.api_host);
        assert_eq!(config1.threads, config2.threads);
        assert_eq!(config1.gpu_layers, config2.gpu_layers);
    }

    // ===== API Address Tests =====

    #[test]
    fn test_api_addr_parsing() {
        let config = create_test_config();
        let addr = config.api_addr();

        assert_eq!(addr.ip().to_string(), "127.0.0.1");
        assert_eq!(addr.port(), 9999);
    }

    #[test]
    fn test_api_addr_with_different_ports() {
        let mut config = create_test_config();
        config.api_port = 3000;

        let addr = config.api_addr();
        assert_eq!(addr.port(), 3000);
    }

    #[test]
    fn test_api_addr_with_zero_address() {
        let mut config = create_test_config();
        config.api_host = "0.0.0.0".to_string();
        config.api_port = 5000;

        let addr = config.api_addr();
        assert_eq!(addr.port(), 5000);
        // 0.0.0.0 represents all interfaces
        assert_eq!(addr.ip().to_string(), "0.0.0.0");
    }

    // ===== Timeout Tests =====

    #[test]
    fn test_config_timeouts_are_positive() {
        let config = create_test_config();

        assert!(config.health_timeout_seconds > 0);
        assert!(config.generate_timeout_seconds > 0);
        assert!(config.stream_timeout_seconds > 0);
        assert!(config.health_check_timeout_seconds > 0);
    }

    #[test]
    fn test_health_check_timeout_greater_than_health_timeout() {
        let config = create_test_config();

        // Health check timeout should typically be longer than regular health timeout
        assert!(config.health_check_timeout_seconds >= config.health_timeout_seconds);
    }

    // ===== Resource Limits Tests =====

    #[test]
    fn test_max_concurrent_streams_is_positive() {
        let config = create_test_config();
        assert!(config.max_concurrent_streams > 0);
    }

    #[test]
    fn test_requests_per_second_is_reasonable() {
        let config = create_test_config();

        // Should be a reasonable number (not 0, not extremely high)
        assert!(config.requests_per_second > 0);
        assert!(config.requests_per_second <= 1000);
    }

    #[test]
    fn test_queue_size_is_positive() {
        let config = create_test_config();
        assert!(config.queue_size > 0);
    }

    // ===== Context and Batch Size Tests =====

    #[test]
    fn test_context_size_within_valid_range() {
        let config = create_test_config();

        // Context size should be between 512 and 32768
        assert!(config.ctx_size >= 512);
        assert!(config.ctx_size <= 32768);
    }

    #[test]
    fn test_batch_size_valid_range() {
        let config = create_test_config();

        // Batch size should be between 16 and 1024
        assert!(config.batch_size >= 16);
        assert!(config.batch_size <= 1024);
    }

    #[test]
    fn test_batch_size_reasonable_vs_context() {
        let config = create_test_config();

        // Batch size should typically be less than context size
        assert!(config.batch_size < config.ctx_size);
    }

    // ===== Thread Configuration Tests =====

    #[test]
    fn test_threads_is_positive() {
        let config = create_test_config();
        assert!(config.threads > 0);
    }

    #[test]
    fn test_threads_within_reasonable_range() {
        let config = create_test_config();

        // Should not exceed typical CPU thread count significantly
        assert!(config.threads <= 256);
    }

    // ===== GPU Configuration Tests =====

    #[test]
    fn test_gpu_layers_non_negative() {
        let config = create_test_config();
        // gpu_layers is u32, so non-negativity is guaranteed by the type.
        // The old assertion compared gpu_layers against ctx_size, which
        // tested nothing meaningful; check the sane upper bound instead.
        assert!(config.gpu_layers <= 100);
    }

    #[test]
    fn test_gpu_layers_within_range() {
        let config = create_test_config();

        // GPU layers should typically be 0-50
        assert!(config.gpu_layers <= 100);
    }

    // ===== Port Configuration Tests =====

    #[test]
    fn test_api_port_valid() {
        let config = create_test_config();
        assert!(config.api_port > 0);
        assert!(config.api_port != config.llama_port);
    }

    #[test]
    fn test_llama_port_valid() {
        let config = create_test_config();
        assert!(config.llama_port > 0);
    }

    #[test]
    fn test_prometheus_port_valid() {
        let config = create_test_config();
        assert!(config.prometheus_port > 0);
    }

    #[test]
    fn test_ports_are_different() {
        let config = create_test_config();

        // Ports should be unique to avoid conflicts
        assert_ne!(config.api_port, config.llama_port);
        assert_ne!(config.api_port, config.prometheus_port);
        assert_ne!(config.llama_port, config.prometheus_port);
    }

    // ===== Path Configuration Tests =====

    #[test]
    fn test_model_path_not_empty() {
        let config = create_test_config();
        assert!(!config.model_path.is_empty());
    }

    #[test]
    fn test_llama_bin_not_empty() {
        let config = create_test_config();
        assert!(!config.llama_bin.is_empty());
    }

    #[test]
    fn test_backend_url_not_empty() {
        let config = create_test_config();
        assert!(!config.backend_url.is_empty());
    }

    #[test]
    fn test_backend_url_format() {
        let config = create_test_config();

        // Should be a valid URL format
        assert!(
            config.backend_url.starts_with("http://") || config.backend_url.starts_with("https://")
        );
    }

    // ===== Host Configuration Tests =====

    #[test]
    fn test_api_host_not_empty() {
        let config = create_test_config();
        assert!(!config.api_host.is_empty());
    }

    #[test]
    fn test_llama_host_not_empty() {
        let config = create_test_config();
        assert!(!config.llama_host.is_empty());
    }

    // ===== Grace Period Tests =====

    #[test]
    fn test_hot_swap_grace_positive() {
        let config = create_test_config();
        assert!(config.hot_swap_grace_seconds > 0);
    }

    #[test]
    fn test_hot_swap_grace_reasonable() {
        let config = create_test_config();

        // Grace period should be less than 5 minutes
        assert!(config.hot_swap_grace_seconds < 300);
    }

    // ===== Auto-detect Helper Tests =====

    #[test]
    fn test_auto_detect_threads_returns_positive() {
        let threads = Config::auto_detect_threads();
        assert!(threads > 0);
    }

    #[test]
    fn test_auto_detect_gpu_layers_non_negative() {
        // u32 guarantees non-negativity; 512 is an upper sanity bound.
        let layers = Config::auto_detect_gpu_layers();
        assert!(layers <= 512);
    }

    #[test]
    fn test_apply_batch_limits_small_context() {
        // For context <= 2048, batch should be limited to 512
        let batch = Config::apply_batch_limits(1024, 1024, false);
        assert!(batch <= 512);
    }

    #[test]
    fn test_apply_batch_limits_medium_context() {
        // Intent: contexts in 2049..=4096 cap the batch at 384
        // (requires the 2049..=4096 arm of apply_batch_limits to use 384).
        let batch = Config::apply_batch_limits(1024, 3000, false);
        assert!(batch <= 384);
    }

    #[test]
    fn test_apply_batch_limits_large_context() {
        // For context 16385-32768, batch should be limited to 64
        let batch = Config::apply_batch_limits(1024, 24576, false);
        assert!(batch <= 64);
    }

    #[test]
    fn test_apply_batch_limits_huge_context() {
        // Contexts above 32768 fall into the catch-all arm: cap at 32
        let batch = Config::apply_batch_limits(1024, 65536, false);
        assert!(batch <= 32);
    }

    #[test]
    fn test_apply_batch_limits_minimum() {
        // Batch size should always be at least 16
        let batch = Config::apply_batch_limits(1, 8192, false);
        assert!(batch >= 16);
    }

    #[test]
    fn test_estimate_memory_per_batch_cpu() {
        let memory_cpu = Config::estimate_memory_per_batch(8192, false);
        assert!(memory_cpu > 0.0);
    }

    #[test]
    fn test_estimate_memory_per_batch_gpu() {
        let memory_gpu = Config::estimate_memory_per_batch(8192, true);
        assert!(memory_gpu > 0.0);
    }

    #[test]
    fn test_estimate_memory_gpu_less_than_cpu() {
        let memory_cpu = Config::estimate_memory_per_batch(8192, false);
        let memory_gpu = Config::estimate_memory_per_batch(8192, true);

        // GPU memory estimate should be less than CPU
        assert!(memory_gpu < memory_cpu);
    }

    // ===== Queue Configuration Tests =====

    #[test]
    fn test_queue_timeout_is_positive() {
        let config = create_test_config();
        assert!(config.queue_timeout_seconds > 0);
    }

    #[test]
    fn test_queue_timeout_less_than_generate_timeout() {
        let config = create_test_config();

        // Queue timeout should be less than or equal to generate timeout
        assert!(config.queue_timeout_seconds <= config.generate_timeout_seconds);
    }

    // ===== Integration Tests =====

    #[test]
    fn test_config_values_consistency() {
        let config = create_test_config();

        // Verify all timeout values are reasonable
        assert!(config.health_timeout_seconds <= 3600); // Max 1 hour
        assert!(config.generate_timeout_seconds <= 1800); // Max 30 mins
        assert!(config.stream_timeout_seconds <= 3600); // Max 1 hour
        assert!(config.health_check_timeout_seconds <= 3600); // Max 1 hour
    }

    #[test]
    fn test_config_backend_url_consistency() {
        let config = create_test_config();

        // Backend URL should contain the llama host and port
        assert!(
            config.backend_url.contains(&config.llama_host)
                || config.backend_url.contains("127.0.0.1")
                || config.backend_url.contains("localhost")
        );
    }

    #[test]
    fn test_config_all_fields_initialized() {
        let config = create_test_config();

        // Ensure all critical fields have valid values
        assert!(!config.model_path.is_empty());
        assert!(!config.llama_bin.is_empty());
        assert!(!config.api_host.is_empty());
        assert!(!config.llama_host.is_empty());
        assert!(config.threads > 0);
        // (The old `gpu_layers <= ctx_size` comparison was meaningless —
        // those two fields are unrelated; check ctx_size directly.)
        assert!(config.ctx_size > 0);
        assert!(config.api_port > 0);
        assert!(config.llama_port > 0);
    }
}