1use anyhow::Result;
4use std::env;
5use std::net::SocketAddr;
6use std::path::PathBuf;
7use sysinfo::System;
8use tracing::{debug, info, warn};
9
10#[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
12use nvml_wrapper::Nvml;
13
/// Runtime configuration for the llama.cpp server supervisor.
///
/// Populated from environment variables (with `.env` file discovery) via
/// [`Config::from_env`]; several fields support `"auto"` hardware detection.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct Config {
    /// Filesystem path to the model file (e.g. a `.gguf`); may be empty if
    /// no model was found.
    pub model_path: String,
    /// Path to the `llama-server` binary; may be empty if auto-detection failed.
    pub llama_bin: String,
    /// Host the llama.cpp backend listens on.
    pub llama_host: String,
    /// Port the llama.cpp backend listens on.
    pub llama_port: u16,
    /// Context window size in tokens.
    pub ctx_size: u32,
    /// Prompt-processing batch size.
    pub batch_size: u32,
    /// CPU threads used for inference.
    pub threads: u32,
    /// Number of transformer layers offloaded to the GPU (0 = CPU only).
    pub gpu_layers: u32,
    /// Seconds to wait for the backend to report healthy at startup.
    pub health_timeout_seconds: u64,
    /// Grace period (seconds) when hot-swapping the backend process.
    pub hot_swap_grace_seconds: u64,
    /// Maximum number of simultaneous streaming responses.
    pub max_concurrent_streams: u32,
    /// Number of parallel server slots (`--parallel`).
    pub parallel_slots: u32,
    /// Physical micro-batch size (`--ubatch-size`).
    pub ubatch_size: u32,
    /// Port for the Prometheus metrics endpoint.
    pub prometheus_port: u16,
    /// Host the public API binds to.
    pub api_host: String,
    /// Port the public API binds to.
    pub api_port: u16,
    /// Rate limit applied to incoming requests.
    pub requests_per_second: u32,
    /// Timeout for non-streaming generation requests.
    pub generate_timeout_seconds: u64,
    /// Timeout for streaming generation requests.
    pub stream_timeout_seconds: u64,
    /// Timeout for health-check probes.
    pub health_check_timeout_seconds: u64,
    /// Maximum number of queued requests.
    pub queue_size: usize,
    /// Seconds a request may wait in the queue before rejection.
    pub queue_timeout_seconds: u64,
    /// Base URL of the llama.cpp backend (derived from host/port).
    pub backend_url: String,
    /// OpenRouter API key; empty when unset.
    pub openrouter_api_key: String,
    /// Path to a draft model for speculative decoding; "none" disables it.
    pub draft_model_path: String,
    /// Maximum draft tokens per speculative step.
    pub speculative_draft_max: u32,
    /// Minimum draft-token probability for speculative acceptance.
    pub speculative_draft_p_min: f32,
}
53
54impl Config {
55 pub fn from_env() -> Result<Self> {
56 let mut env_loaded = false;
62
63 if let Ok(exe_path) = std::env::current_exe() {
65 if let Some(exe_dir) = exe_path.parent() {
66 let env_path = exe_dir.join(".env");
67 if env_path.exists() {
68 match dotenvy::from_path(&env_path) {
69 Ok(_) => {
70 info!("Loaded .env from executable directory: {:?}", env_path);
71 env_loaded = true;
72 }
73 Err(e) => {
74 warn!("Failed to load .env from {:?}: {}", env_path, e);
75 }
76 }
77 }
78
79 #[cfg(target_os = "macos")]
82 if !env_loaded {
83 if let Some(contents_dir) = exe_dir.parent() {
86 let bundle_env = contents_dir.join("Resources").join(".env");
87 if bundle_env.exists() {
88 match dotenvy::from_path(&bundle_env) {
89 Ok(_) => {
90 info!("Loaded .env from macOS bundle Resources: {:?}", bundle_env);
91 env_loaded = true;
92 }
93 Err(e) => {
94 warn!("Failed to load .env from bundle Resources {:?}: {}", bundle_env, e);
95 }
96 }
97 }
98 }
99 }
100
101 #[cfg(target_os = "macos")]
104 if !env_loaded {
105 if let Some(app_support) = dirs::data_dir() {
106 let user_env = app_support.join("Aud.io").join(".env");
107 if user_env.exists() {
108 match dotenvy::from_path(&user_env) {
109 Ok(_) => {
110 info!("Loaded .env from user data directory: {:?}", user_env);
111 env_loaded = true;
112 }
113 Err(e) => {
114 warn!("Failed to load .env from user data dir {:?}: {}", user_env, e);
115 }
116 }
117 }
118 }
119 }
120
121 if !env_loaded {
123 let project_root = if exe_dir.ends_with("target/release")
124 || exe_dir.ends_with("target\\release")
125 {
126 exe_dir.parent().and_then(|p| p.parent())
127 } else {
128 None
129 };
130
131 if let Some(root) = project_root {
132 let root_env = root.join(".env");
133 if root_env.exists() {
134 match dotenvy::from_path(&root_env) {
135 Ok(_) => {
136 info!("Loaded .env from project root: {:?}", root_env);
137 env_loaded = true;
138 }
139 Err(e) => {
140 warn!(
141 "Failed to load .env from project root {:?}: {}",
142 root_env, e
143 );
144 }
145 }
146 }
147 }
148 }
149 }
150 }
151
152 if !env_loaded {
154 if let Err(e) = dotenvy::dotenv() {
155 warn!("Failed to load .env from current directory: {}. Using system environment variables.", e);
156 } else {
157 info!("Loaded environment variables from .env file in current directory");
158 }
159 }
160
161 let llama_bin = Self::get_llama_binary_path()?;
163 info!("Using llama binary: {}", llama_bin);
164
165 let model_path = Self::get_model_path_with_fallback()?;
167
168 let threads = if env::var("THREADS").unwrap_or_else(|_| "auto".into()) == "auto" {
170 Self::auto_detect_threads()
171 } else {
172 env::var("THREADS")
173 .unwrap_or_else(|_| "6".into())
174 .parse()
175 .unwrap_or(6)
176 };
177
178 let ctx_size = if env::var("CTX_SIZE").unwrap_or_else(|_| "auto".into()) == "auto" {
181 Self::auto_detect_ctx_size(&model_path)
182 } else {
183 env::var("CTX_SIZE")
184 .unwrap_or_else(|_| "8192".into())
185 .parse()
186 .unwrap_or(8192)
187 };
188
189 let parallel_slots: u32 = env::var("PARALLEL_SLOTS")
191 .unwrap_or_else(|_| "8".into())
192 .parse()
193 .unwrap_or(8);
194
195 let gpu_layers = if env::var("GPU_LAYERS").unwrap_or_else(|_| "auto".into()) == "auto" {
199 Self::auto_detect_gpu_layers(&model_path, ctx_size, parallel_slots)
200 } else {
201 env::var("GPU_LAYERS")
202 .unwrap_or_else(|_| "20".into())
203 .parse()
204 .unwrap_or(20)
205 };
206
207 let batch_size = if env::var("BATCH_SIZE").unwrap_or_else(|_| "auto".into()) == "auto" {
209 Self::auto_detect_batch_size(gpu_layers, ctx_size)
210 } else {
211 env::var("BATCH_SIZE")
212 .unwrap_or_else(|_| "256".into())
213 .parse()
214 .unwrap_or(256)
215 };
216
217 let llama_host = env::var("LLAMA_HOST").unwrap_or_else(|_| "127.0.0.1".into());
219 let llama_port = env::var("LLAMA_PORT")
220 .unwrap_or_else(|_| "8081".into())
221 .parse()?;
222 let backend_url = format!("http://{}:{}", llama_host, llama_port);
223
224 let openrouter_api_key = env::var("OPENROUTER_API_KEY").unwrap_or_default();
226
227 info!(
228 "Resource Configuration: {} GPU layers, {} threads, batch size: {}, context: {}",
229 gpu_layers, threads, batch_size, ctx_size
230 );
231
232 Ok(Self {
233 model_path,
234 llama_bin,
235 llama_host: llama_host.clone(),
236 llama_port,
237 ctx_size,
238 batch_size,
239 threads,
240 gpu_layers,
241 health_timeout_seconds: env::var("HEALTH_TIMEOUT_SECONDS")
242 .unwrap_or_else(|_| "60".into())
243 .parse()?,
244 hot_swap_grace_seconds: env::var("HOT_SWAP_GRACE_SECONDS")
245 .unwrap_or_else(|_| "25".into())
246 .parse()?,
247 max_concurrent_streams: env::var("MAX_CONCURRENT_STREAMS")
248 .unwrap_or_else(|_| "4".into())
249 .parse()?,
250 parallel_slots,
251 ubatch_size: env::var("UBATCH_SIZE")
252 .unwrap_or_else(|_| "512".into())
253 .parse()
254 .unwrap_or(512),
255 prometheus_port: env::var("PROMETHEUS_PORT")
256 .unwrap_or_else(|_| "9000".into())
257 .parse()?,
258 api_host: env::var("API_HOST").unwrap_or_else(|_| "127.0.0.1".into()),
259 api_port: env::var("API_PORT")
260 .unwrap_or_else(|_| "9999".into())
261 .parse()?,
262 requests_per_second: env::var("REQUESTS_PER_SECOND")
263 .unwrap_or_else(|_| "24".into())
264 .parse()?,
265 generate_timeout_seconds: env::var("GENERATE_TIMEOUT_SECONDS")
266 .unwrap_or_else(|_| "300".into())
267 .parse()?,
268 stream_timeout_seconds: env::var("STREAM_TIMEOUT_SECONDS")
269 .unwrap_or_else(|_| "600".into())
270 .parse()?,
271 health_check_timeout_seconds: env::var("HEALTH_CHECK_TIMEOUT_SECONDS")
272 .unwrap_or_else(|_| "90".into())
273 .parse()?,
274 queue_size: env::var("QUEUE_SIZE")
275 .unwrap_or_else(|_| "100".into())
276 .parse()?,
277 queue_timeout_seconds: env::var("QUEUE_TIMEOUT_SECONDS")
278 .unwrap_or_else(|_| "30".into())
279 .parse()?,
280 backend_url,
281 openrouter_api_key,
282 draft_model_path: env::var("DRAFT_MODEL_PATH")
283 .unwrap_or_else(|_| "none".into()),
284 speculative_draft_max: env::var("SPECULATIVE_DRAFT_MAX")
285 .unwrap_or_else(|_| "8".into())
286 .parse()
287 .unwrap_or(8),
288 speculative_draft_p_min: env::var("SPECULATIVE_DRAFT_P_MIN")
289 .unwrap_or_else(|_| "0.4".into())
290 .parse()
291 .unwrap_or(0.4),
292 })
293 }
294
295 fn get_model_path_with_fallback() -> Result<String> {
296 if let Ok(model_path) = env::var("MODEL_PATH") {
298 if std::path::Path::new(&model_path).exists() {
300 info!("Using model from MODEL_PATH: {}", model_path);
301 return Ok(model_path);
302 } else {
303 warn!("MODEL_PATH set but file doesn't exist: {}", model_path);
304 }
305 }
306
307 let exe_dir = std::env::current_exe()
309 .ok()
310 .and_then(|exe| exe.parent().map(|p| p.to_path_buf()))
311 .unwrap_or_else(|| std::env::current_dir().unwrap_or_default());
312
313 let possible_model_locations = vec![
315 exe_dir.join("resources/models/default.gguf"),
317 exe_dir.join("resources/models/model.gguf"),
318 exe_dir.join("models/default.gguf"),
319 exe_dir.join("models/model.gguf"),
320 exe_dir.join("default.gguf"),
321 exe_dir.join("resources/models/default.onnx"),
323 exe_dir.join("resources/models/model.onnx"),
324 exe_dir.join("resources/models/default.trt"),
326 exe_dir.join("resources/models/model.engine"),
327 exe_dir.join("resources/models/default.safetensors"),
329 exe_dir.join("resources/models/model.safetensors"),
330 exe_dir.join("resources/models/default.ggml"),
332 exe_dir.join("resources/models/model.bin"),
333 ];
334
335 for model_path in possible_model_locations {
336 if model_path.exists() {
337 info!("Using embedded model: {}", model_path.display());
338 return Ok(model_path.to_string_lossy().to_string());
339 }
340 }
341
342 if let Ok(entries) = std::fs::read_dir(exe_dir.join("resources/models")) {
344 for entry in entries.flatten() {
345 if let Some(ext) = entry.path().extension() {
346 let ext_str = ext.to_str().unwrap_or("").to_lowercase();
347 if matches!(
349 ext_str.as_str(),
350 "gguf"
351 | "ggml"
352 | "onnx"
353 | "trt"
354 | "engine"
355 | "plan"
356 | "safetensors"
357 | "mlmodel"
358 ) {
359 info!("Using found model: {}", entry.path().display());
360 return Ok(entry.path().to_string_lossy().to_string());
361 }
362 }
363 }
364 }
365
366 Ok("".to_string())
369 }
370
    /// Locates the `llama-server` binary.
    ///
    /// Order: the `LLAMA_BIN` env var (when the file exists), then a search
    /// for `Resources|resources/bin/<OS>/[<subdir>/]<binary>` under the
    /// executable directory, its two parent directories, the current
    /// working directory and (debug builds only) the crate root.
    /// Returns an empty string when nothing is found; the caller must
    /// handle that case.
    fn get_llama_binary_path() -> Result<String> {
        if let Ok(llama_bin) = env::var("LLAMA_BIN") {
            if std::path::Path::new(&llama_bin).exists() {
                info!("Using llama binary from LLAMA_BIN env: {}", llama_bin);
                return Ok(llama_bin);
            } else {
                warn!(
                    "LLAMA_BIN set but file doesn't exist: {}, falling back to auto-detection",
                    llama_bin
                );
            }
        }

        let (os_folder, binary_name) = Self::get_platform_binary_info();
        info!(
            "Auto-detecting llama binary for OS: {} (binary: {})",
            os_folder, binary_name
        );

        let exe_dir = std::env::current_exe()
            .ok()
            .and_then(|exe| exe.parent().map(|p| p.to_path_buf()));

        let cwd = std::env::current_dir().ok();

        // Candidate base directories, most specific first.
        let mut search_dirs: Vec<PathBuf> = Vec::new();

        if let Some(ref exe) = exe_dir {
            search_dirs.push(exe.clone());
            // Also try one and two levels up (bundle / target layouts).
            if let Some(parent) = exe.parent() {
                search_dirs.push(parent.to_path_buf());
                if let Some(grandparent) = parent.parent() {
                    search_dirs.push(grandparent.to_path_buf());
                }
            }
        }

        if let Some(ref cwd_path) = cwd {
            search_dirs.push(cwd_path.clone());
        }

        #[cfg(debug_assertions)]
        {
            // In debug builds (cargo run) also look in the crate root.
            let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
            search_dirs.push(crate_dir);
        }

        let resource_folder_names = ["Resources", "resources"];
        for base_dir in &search_dirs {
            for resource_folder in &resource_folder_names {
                let bin_dir = base_dir.join(resource_folder).join("bin").join(os_folder);

                if bin_dir.exists() {
                    if let Ok(entries) = std::fs::read_dir(&bin_dir) {
                        // Sort entries so subdirectory preference is
                        // deterministic across runs.
                        let mut dir_entries: Vec<_> = entries.flatten().collect();
                        dir_entries.sort_by_key(|e| e.file_name());

                        for entry in dir_entries {
                            let entry_path = entry.path();
                            if !entry_path.is_dir() {
                                continue;
                            }

                            // On macOS, skip subdirectories that were built
                            // for the other CPU architecture.
                            #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
                            {
                                let dir_name = entry_path
                                    .file_name()
                                    .and_then(|n| n.to_str())
                                    .unwrap_or("");
                                if dir_name.contains("x64") || dir_name.contains("x86_64") {
                                    debug!("Skipping Intel subdir on Apple Silicon: {}", dir_name);
                                    continue;
                                }
                            }
                            #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
                            {
                                let dir_name = entry_path
                                    .file_name()
                                    .and_then(|n| n.to_str())
                                    .unwrap_or("");
                                if dir_name.contains("arm64") || dir_name.contains("aarch64") {
                                    debug!("Skipping ARM subdir on Intel Mac: {}", dir_name);
                                    continue;
                                }
                            }

                            let potential_binary = entry_path.join(binary_name);
                            if potential_binary.exists() {
                                info!("Found llama binary at: {}", potential_binary.display());
                                return Ok(potential_binary.to_string_lossy().to_string());
                            }
                        }
                    }

                    // Also accept a binary placed directly in bin/<OS>/.
                    let direct_binary = bin_dir.join(binary_name);
                    if direct_binary.exists() {
                        info!("Found llama binary at: {}", direct_binary.display());
                        return Ok(direct_binary.to_string_lossy().to_string());
                    }
                }
            }
        }

        // Nothing found: log detailed remediation guidance, but do not fail
        // hard here — the caller decides how to react to an empty path.
        let arch = Self::get_arch_hint();
        warn!(
            "Llama binary not found. Searched in Resources/bin/{os_folder}/ for '{binary_name}'.\n\
            Please either:\n\
            1. Set LLAMA_BIN environment variable to the full path\n\
            2. Place the binary in Resources/bin/{os_folder}/<subfolder>/\n\
            \n\
            Expected binary name: {binary_name}\n\
            OS detected: {os_folder}\n\
            Architecture: {arch}\n\
            Searched directories: {:?}",
            search_dirs
                .iter()
                .map(|p| p.display().to_string())
                .collect::<Vec<_>>()
        );

        Ok(String::new())
    }
520
521 fn get_platform_binary_info() -> (&'static str, &'static str) {
523 #[cfg(target_os = "windows")]
524 {
525 ("Windows", "llama-server.exe")
526 }
527
528 #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
530 {
531 ("MacOS", "llama-server")
532 }
534
535 #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
537 {
538 ("MacOS", "llama-server")
539 }
541
542 #[cfg(target_os = "linux")]
543 {
544 ("Linux", "llama-server")
545 }
546
547 #[cfg(not(any(target_os = "windows", target_os = "macos", target_os = "linux")))]
548 {
549 compile_error!(
550 "Unsupported operating system. Only Windows, macOS, and Linux are supported."
551 );
552 }
553 }
554
555 fn get_arch_hint() -> &'static str {
557 #[cfg(target_arch = "x86_64")]
558 {
559 "x64"
560 }
561 #[cfg(target_arch = "aarch64")]
562 {
563 "arm64"
564 }
565 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
566 {
567 "unknown"
568 }
569 }
570
571 fn auto_detect_threads() -> u32 {
572 let threads = num_cpus::get() as u32;
573 info!("Auto-detected {} CPU cores for inference", threads);
574 threads
575 }
576
    /// Estimates how many transformer layers fit in `vram_mb`, using
    /// filename heuristics for parameter count and quantisation.
    ///
    /// The per-model numbers below (parameter counts, bits per weight,
    /// layer counts) are rough heuristics keyed off common GGUF naming
    /// conventions, not exact measurements.
    fn layers_for_vram(vram_mb: u64, model_path: &str, ctx_size: u32, parallel_slots: u32) -> u32 {
        let path_lower = model_path.to_lowercase();

        // Parameter count in billions, guessed from the filename. Ambiguous
        // substrings ("1b" vs "13b", "3b" vs "33b") are disambiguated
        // explicitly; unknown names assume a 7B model.
        let params_b: f64 =
            if path_lower.contains("0.5b") { 0.5 }
            else if path_lower.contains("1.5b") { 1.5 }
            else if path_lower.contains("1b") && !path_lower.contains("13b") { 1.0 }
            else if path_lower.contains("3b") && !path_lower.contains("13b") && !path_lower.contains("33b") { 3.0 }
            else if path_lower.contains("7b") { 7.0 }
            else if path_lower.contains("8b") { 8.0 }
            else if path_lower.contains("13b") { 13.0 }
            else if path_lower.contains("14b") { 14.0 }
            else if path_lower.contains("33b") || path_lower.contains("34b") { 34.0 }
            else if path_lower.contains("70b") { 70.0 }
            else { 7.0 };

        // Effective bits per weight for the quantisation named in the path
        // (k-quants carry some overhead, hence the .5 fractions); unknown
        // quantisations assume Q4_K-class.
        let bits: f64 =
            if path_lower.contains("q4_k_m") || path_lower.contains("q4_k_s") { 4.5 }
            else if path_lower.contains("q4_k") { 4.5 }
            else if path_lower.contains("q4_0") || path_lower.contains("q4_1") { 4.0 }
            else if path_lower.contains("q5_k_m") || path_lower.contains("q5_k_s") { 5.5 }
            else if path_lower.contains("q5") { 5.0 }
            else if path_lower.contains("q6_k") { 6.5 }
            else if path_lower.contains("q8_0") { 8.5 }
            else if path_lower.contains("f16") || path_lower.contains("fp16") { 16.0 }
            else if path_lower.contains("f32") || path_lower.contains("fp32") { 32.0 }
            else { 4.5 };

        // Typical layer counts for each model size class.
        let total_layers: u32 =
            if params_b <= 0.6 { 24 }
            else if params_b <= 1.6 { 28 }
            else if params_b <= 3.5 { 28 }
            else if params_b <= 8.5 { 32 }
            else if params_b <= 14.5 { 40 }
            else if params_b <= 35.0 { 48 }
            else { 80 };

        // Estimated weight footprint in MiB.
        let model_vram_mb = (params_b * 1e9 * bits / 8.0 / 1024.0 / 1024.0) as u64;

        // KV-cache estimate: ~14% of the weights at ctx=8192 / 8 slots,
        // scaled linearly with context and sub-linearly with slot count.
        let base_kv_mb = (model_vram_mb as f64 * 0.14).max(64.0);
        let kv_mb = (base_kv_mb
            * (ctx_size as f64 / 8192.0)
            * ((parallel_slots as f64 / 8.0).sqrt())).max(64.0) as u64;

        // Fixed allowance for CUDA/Metal runtime buffers.
        let overhead_mb: u64 = 384;

        let available_mb = vram_mb.saturating_sub(overhead_mb + kv_mb);

        if available_mb >= model_vram_mb {
            info!(
                "GPU auto-detect: full offload — model {:.0} MB fits in {:.0} MB available → {} layers",
                model_vram_mb, available_mb, total_layers
            );
            total_layers
        } else {
            // Partial offload: fit as many whole layers as the remaining
            // VRAM allows.
            let per_layer_mb = (model_vram_mb as f64 / total_layers as f64).ceil() as u64;
            let fit_layers = if per_layer_mb > 0 {
                (available_mb / per_layer_mb).min(total_layers as u64) as u32
            } else {
                0
            };
            info!(
                "GPU auto-detect: partial offload {}/{} layers ({} MB model, {} MB available, {} MB/layer)",
                fit_layers, total_layers, model_vram_mb, available_mb, per_layer_mb
            );
            fit_layers
        }
    }
660
    /// Determines how many layers to offload to the GPU.
    ///
    /// Strategy by platform/build:
    /// - `nvidia` feature on Windows/Linux: query VRAM via NVML.
    /// - Windows/Linux without the feature: shell out to `nvidia-smi`
    ///   (with a 5-second timeout) and parse total VRAM.
    /// - Apple Silicon: scale the Metal layer count with unified memory.
    /// - Intel macOS: CPU-only (0 layers).
    fn auto_detect_gpu_layers(model_path: &str, ctx_size: u32, parallel_slots: u32) -> u32 {
        #[cfg(all(feature = "nvidia", any(target_os = "windows", target_os = "linux")))]
        {
            // Probe the first NVIDIA device; any failure along the chain
            // falls through to the CPU default below.
            if let Ok(nvml) = Nvml::init() {
                if let Ok(device_count) = nvml.device_count() {
                    if device_count > 0 {
                        if let Ok(first_gpu) = nvml.device_by_index(0) {
                            if let Ok(memory) = first_gpu.memory_info() {
                                let vram_mb = memory.total / 1024 / 1024;
                                let layers = Self::layers_for_vram(vram_mb, model_path, ctx_size, parallel_slots);
                                info!(
                                    "Auto‑detected NVIDIA GPU layers: {} ({} MB VRAM)",
                                    layers, vram_mb
                                );
                                return layers;
                            }
                        }
                    }
                }
            }
            info!("No NVIDIA GPU detected, using CPU-optimized defaults (0 GPU layers)");
            0
        }

        #[cfg(not(all(feature = "nvidia", any(target_os = "windows", target_os = "linux"))))]
        #[cfg(any(target_os = "windows", target_os = "linux"))]
        {
            use std::process::{Command, Stdio};

            // No NVML linked in: ask nvidia-smi for total VRAM instead.
            let child = Command::new("nvidia-smi")
                .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
                .stdout(Stdio::piped())
                .stderr(Stdio::null())
                .spawn();

            match child {
                Ok(mut process) => {
                    // Poll with a 5-second deadline so a hung nvidia-smi
                    // cannot stall startup.
                    let start = std::time::Instant::now();
                    loop {
                        match process.try_wait() {
                            Ok(Some(status)) => {
                                if status.success() {
                                    if let Ok(output) = process.wait_with_output() {
                                        let stdout = String::from_utf8_lossy(&output.stdout);
                                        // First line = total VRAM (MB) of GPU 0.
                                        if let Some(vram_mb_str) = stdout.lines().next() {
                                            if let Ok(vram_mb) = vram_mb_str.trim().parse::<u64>() {
                                                let layers = Self::layers_for_vram(vram_mb, model_path, ctx_size, parallel_slots);
                                                info!(
                                                    "Auto‑detected NVIDIA GPU layers via nvidia-smi: {} ({} MB VRAM)",
                                                    layers, vram_mb
                                                );
                                                return layers;
                                            }
                                        }
                                    }
                                }
                                info!("nvidia-smi returned but could not parse VRAM, using CPU defaults (0 GPU layers)");
                                return 0;
                            }
                            Ok(None) => {
                                // Still running: enforce the timeout, then
                                // sleep briefly before polling again.
                                if start.elapsed() > std::time::Duration::from_secs(5) {
                                    let _ = process.kill();
                                    let _ = process.wait();
                                    info!("nvidia-smi timed out, using CPU defaults (0 GPU layers)");
                                    return 0;
                                }
                                std::thread::sleep(std::time::Duration::from_millis(50));
                            }
                            Err(_) => {
                                return 0;
                            }
                        }
                    }
                }
                Err(_) => {
                    info!("No NVIDIA GPU detected (nvidia-smi not available), using CPU defaults (0 GPU layers)");
                    0
                }
            }
        }

        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            // Unified memory: size the Metal offload by total system RAM.
            let total_mem_gb = {
                let mut sys = System::new_all();
                sys.refresh_memory();
                sys.total_memory() / 1024 / 1024 / 1024
            };

            let layers = match total_mem_gb {
                0..=8 => 24,
                9..=16 => 32,
                17..=32 => 40,
                33..=64 => 48,
                _ => 56,
            };
            info!(
                "Apple Silicon detected ({} GB unified memory), using Metal GPU layers: {}",
                total_mem_gb, layers
            );
            layers
        }

        #[cfg(all(target_os = "macos", target_arch = "x86_64"))]
        {
            info!("Intel Mac detected, using CPU-only mode (0 GPU layers)");
            0
        }
    }
781
782 fn auto_detect_ctx_size(model_path: &str) -> u32 {
783 let inferred = Self::read_ctx_size_from_model_path(model_path).unwrap_or_else(|| {
784 info!("Falling back to default context size (8192)");
785 8192
786 });
787 let adjusted = Self::adjust_ctx_size_for_system(inferred);
788 info!("Final context size: {} (inferred: {})", adjusted, inferred);
789 adjusted
790 }
791
792 fn read_ctx_size_from_model_path(model_path: &str) -> Option<u32> {
793 let path_lower = model_path.to_lowercase();
795
796 if path_lower.contains("32k") {
797 Some(32768)
798 } else if path_lower.contains("16k") {
799 Some(16384)
800 } else if path_lower.contains("8k") {
801 Some(8192)
802 } else if path_lower.contains("4k") {
803 Some(4096)
804 } else if path_lower.contains("2k") {
805 Some(2048)
806 } else if path_lower.contains("7b")
807 || path_lower.contains("8b")
808 || path_lower.contains("13b")
809 {
810 Some(4096)
811 } else if path_lower.contains("34b") || path_lower.contains("70b") {
812 Some(8192)
813 } else {
814 Some(8192)
816 }
817 }
818
    /// Caps the inferred context size when the machine has too little free
    /// RAM, using a rule of thumb of ~1.5 GB per 4096 tokens of context.
    /// Never reduces below 2048 and never increases above `inferred_ctx`.
    fn adjust_ctx_size_for_system(inferred_ctx: u32) -> u32 {
        let mut system = System::new_all();
        system.refresh_memory();

        // NOTE(review): assumes sysinfo reports memory in bytes (true for
        // sysinfo >= 0.26), hence the three divisions to reach GB — confirm
        // against the pinned crate version.
        let available_ram_gb = system.available_memory() / 1024 / 1024 / 1024;

        // ~1.5 GB of RAM per 4096 tokens of context.
        let required_ram_gb = (inferred_ctx as f32 / 4096.0) * 1.5;
        if available_ram_gb < required_ram_gb as u64 {
            // Scale the context down to what the available RAM supports,
            // clamped to [2048, inferred_ctx].
            let adjusted = (available_ram_gb as f32 * 4096.0 / 1.5) as u32;
            let safe_ctx = adjusted.min(inferred_ctx).max(2048);
            warn!(
                "Reducing context size from {} → {} due to limited RAM ({}GB available)",
                inferred_ctx, safe_ctx, available_ram_gb
            );
            safe_ctx
        } else {
            inferred_ctx
        }
    }
838
839 fn auto_detect_batch_size(gpu_layers: u32, ctx_size: u32) -> u32 {
840 let mut system = System::new_all();
841 system.refresh_memory();
842
843 let available_mb = system.available_memory() / 1024;
844 let has_gpu = gpu_layers > 0;
845 let memory_per_batch = Self::estimate_memory_per_batch(ctx_size, has_gpu);
846 let safe_available_mb = (available_mb as f32 * 0.6) as u32;
847 let max_batch = (safe_available_mb as f32 / memory_per_batch).max(1.0) as u32;
848
849 let optimal = Self::apply_batch_limits(max_batch, ctx_size, has_gpu);
850 info!(
851 "Auto batch size: {} (ctx: {}, GPU: {}, est mem: {:.1}MB/batch)",
852 optimal, ctx_size, has_gpu, memory_per_batch
853 );
854 optimal
855 }
856
857 fn estimate_memory_per_batch(ctx_size: u32, has_gpu: bool) -> f32 {
858 if has_gpu {
859 (ctx_size as f32 / 1024.0) * 0.5
860 } else {
861 (ctx_size as f32 / 1024.0) * 1.2
862 }
863 }
864
865 fn apply_batch_limits(batch_size: u32, ctx_size: u32, _has_gpu: bool) -> u32 {
866 let limited = batch_size.clamp(16, 1024);
867 match ctx_size {
868 0..=2048 => limited.min(512),
869 2049..=4096 => limited.min(512),
870 4097..=8192 => limited.min(512),
874 8193..=16384 => limited.min(256),
875 16385..=32768 => limited.min(128),
876 _ => limited.min(64),
877 }
878 }
879
    /// Logs the effective configuration at info level for startup diagnostics.
    pub fn print_config(&self) {
        info!("Current Configuration:");
        info!("- Model Path: {}", self.model_path);
        info!("- Llama Binary: {}", self.llama_bin);
        info!("- Context Size: {}", self.ctx_size);
        info!("- Batch Size: {}", self.batch_size);
        info!("- Threads: {}", self.threads);
        info!("- GPU Layers: {}", self.gpu_layers);
        info!("- Parallel Slots: {}", self.parallel_slots);
        info!("- Ubatch Size: {}", self.ubatch_size);
        info!("- Max Streams: {}", self.max_concurrent_streams);
        info!("- API: {}:{}", self.api_host, self.api_port);
        info!("- Backend: {}:{}", self.llama_host, self.llama_port);
        info!("- Queue Size: {}", self.queue_size);
        info!("- Queue Timeout: {}s", self.queue_timeout_seconds);
        info!("- Backend URL: {}", self.backend_url);
    }
897
898 pub fn api_addr(&self) -> SocketAddr {
899 format!("{}:{}", self.api_host, self.api_port)
900 .parse()
901 .unwrap()
902 }
903}
904
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fully-populated `Config` with deterministic values for unit
    /// tests (no environment access, no hardware detection).
    ///
    /// Note: the previous version omitted `parallel_slots`, `ubatch_size`,
    /// `draft_model_path`, `speculative_draft_max` and
    /// `speculative_draft_p_min`, so the struct literal did not compile.
    fn create_test_config() -> Config {
        Config {
            model_path: "/test/model.gguf".to_string(),
            llama_bin: "/test/llama-server".to_string(),
            llama_host: "127.0.0.1".to_string(),
            llama_port: 8001,
            ctx_size: 8192,
            batch_size: 128,
            threads: 6,
            gpu_layers: 20,
            health_timeout_seconds: 600,
            hot_swap_grace_seconds: 25,
            max_concurrent_streams: 2,
            parallel_slots: 8,
            ubatch_size: 512,
            prometheus_port: 9000,
            api_host: "127.0.0.1".to_string(),
            api_port: 9999,
            requests_per_second: 24,
            generate_timeout_seconds: 300,
            stream_timeout_seconds: 600,
            health_check_timeout_seconds: 900,
            queue_size: 1000,
            queue_timeout_seconds: 300,
            backend_url: "http://127.0.0.1:8001".to_string(),
            openrouter_api_key: "test-api-key".to_string(),
            draft_model_path: "none".to_string(),
            speculative_draft_max: 8,
            speculative_draft_p_min: 0.4,
        }
    }

    #[test]
    fn test_config_creation_with_default_values() {
        let config = create_test_config();

        assert_eq!(config.model_path, "/test/model.gguf");
        assert_eq!(config.llama_bin, "/test/llama-server");
        assert_eq!(config.api_port, 9999);
        assert_eq!(config.llama_port, 8001);
    }

    #[test]
    fn test_config_clone() {
        let config1 = create_test_config();
        let config2 = config1.clone();

        assert_eq!(config1.api_host, config2.api_host);
        assert_eq!(config1.threads, config2.threads);
        assert_eq!(config1.gpu_layers, config2.gpu_layers);
    }

    #[test]
    fn test_api_addr_parsing() {
        let config = create_test_config();
        let addr = config.api_addr();

        assert_eq!(addr.ip().to_string(), "127.0.0.1");
        assert_eq!(addr.port(), 9999);
    }

    #[test]
    fn test_api_addr_with_different_ports() {
        let mut config = create_test_config();
        config.api_port = 3000;

        let addr = config.api_addr();
        assert_eq!(addr.port(), 3000);
    }

    #[test]
    fn test_api_addr_with_zero_address() {
        let mut config = create_test_config();
        config.api_host = "0.0.0.0".to_string();
        config.api_port = 5000;

        let addr = config.api_addr();
        assert_eq!(addr.port(), 5000);
        assert_eq!(addr.ip().to_string(), "0.0.0.0");
    }

    #[test]
    fn test_config_timeouts_are_positive() {
        let config = create_test_config();

        assert!(config.health_timeout_seconds > 0);
        assert!(config.generate_timeout_seconds > 0);
        assert!(config.stream_timeout_seconds > 0);
        assert!(config.health_check_timeout_seconds > 0);
    }

    #[test]
    fn test_health_check_timeout_greater_than_health_timeout() {
        let config = create_test_config();

        assert!(config.health_check_timeout_seconds >= config.health_timeout_seconds);
    }

    #[test]
    fn test_max_concurrent_streams_is_positive() {
        let config = create_test_config();
        assert!(config.max_concurrent_streams > 0);
    }

    #[test]
    fn test_requests_per_second_is_reasonable() {
        let config = create_test_config();

        assert!(config.requests_per_second > 0);
        assert!(config.requests_per_second <= 1000);
    }

    #[test]
    fn test_queue_size_is_positive() {
        let config = create_test_config();
        assert!(config.queue_size > 0);
    }

    #[test]
    fn test_context_size_within_valid_range() {
        let config = create_test_config();

        assert!(config.ctx_size >= 512);
        assert!(config.ctx_size <= 32768);
    }

    #[test]
    fn test_batch_size_valid_range() {
        let config = create_test_config();

        assert!(config.batch_size >= 16);
        assert!(config.batch_size <= 1024);
    }

    #[test]
    fn test_batch_size_reasonable_vs_context() {
        let config = create_test_config();

        assert!(config.batch_size < config.ctx_size);
    }

    #[test]
    fn test_threads_is_positive() {
        let config = create_test_config();
        assert!(config.threads > 0);
    }

    #[test]
    fn test_threads_within_reasonable_range() {
        let config = create_test_config();

        assert!(config.threads <= 256);
    }

    #[test]
    fn test_gpu_layers_non_negative() {
        let config = create_test_config();
        assert!(config.gpu_layers <= config.ctx_size);
    }

    #[test]
    fn test_gpu_layers_within_range() {
        let config = create_test_config();

        assert!(config.gpu_layers <= 100);
    }

    #[test]
    fn test_api_port_valid() {
        let config = create_test_config();
        assert!(config.api_port > 0);
        assert!(config.api_port != config.llama_port);
    }

    #[test]
    fn test_llama_port_valid() {
        let config = create_test_config();
        assert!(config.llama_port > 0);
    }

    #[test]
    fn test_prometheus_port_valid() {
        let config = create_test_config();
        assert!(config.prometheus_port > 0);
    }

    #[test]
    fn test_ports_are_different() {
        let config = create_test_config();

        assert_ne!(config.api_port, config.llama_port);
        assert_ne!(config.api_port, config.prometheus_port);
        assert_ne!(config.llama_port, config.prometheus_port);
    }

    #[test]
    fn test_model_path_not_empty() {
        let config = create_test_config();
        assert!(!config.model_path.is_empty());
    }

    #[test]
    fn test_llama_bin_not_empty() {
        let config = create_test_config();
        assert!(!config.llama_bin.is_empty());
    }

    #[test]
    fn test_backend_url_not_empty() {
        let config = create_test_config();
        assert!(!config.backend_url.is_empty());
    }

    #[test]
    fn test_backend_url_format() {
        let config = create_test_config();

        assert!(
            config.backend_url.starts_with("http://") || config.backend_url.starts_with("https://")
        );
    }

    #[test]
    fn test_api_host_not_empty() {
        let config = create_test_config();
        assert!(!config.api_host.is_empty());
    }

    #[test]
    fn test_llama_host_not_empty() {
        let config = create_test_config();
        assert!(!config.llama_host.is_empty());
    }

    #[test]
    fn test_hot_swap_grace_positive() {
        let config = create_test_config();
        assert!(config.hot_swap_grace_seconds > 0);
    }

    #[test]
    fn test_hot_swap_grace_reasonable() {
        let config = create_test_config();

        assert!(config.hot_swap_grace_seconds < 300);
    }

    #[test]
    fn test_auto_detect_threads_returns_positive() {
        let threads = Config::auto_detect_threads();
        assert!(threads > 0);
    }

    #[test]
    fn test_auto_detect_gpu_layers_non_negative() {
        let layers = Config::auto_detect_gpu_layers("qwen2.5-coder-3b-instruct-q4_k_m.gguf", 8192, 8);
        assert!(layers <= 512);
    }

    #[test]
    fn test_layers_for_vram_full_offload() {
        let layers = Config::layers_for_vram(4096, "qwen2.5-coder-3b-instruct-q4_k_m.gguf", 8192, 8);
        assert_eq!(layers, 28, "3B model should fully offload on 4GB GPU");
    }

    #[test]
    fn test_layers_for_vram_partial_offload() {
        let layers = Config::layers_for_vram(2048, "qwen2.5-coder-7b-instruct-q4_k_m.gguf", 8192, 8);
        assert!(layers < 32, "7B model should only partially offload on 2GB GPU");
        assert!(layers > 0, "Should get at least some layers on 2GB GPU");
    }

    #[test]
    fn test_apply_batch_limits_small_context() {
        let batch = Config::apply_batch_limits(1024, 1024, false);
        assert!(batch <= 512);
    }

    #[test]
    fn test_apply_batch_limits_medium_context() {
        let batch = Config::apply_batch_limits(1024, 3000, false);
        assert!(batch <= 384);
    }

    #[test]
    fn test_apply_batch_limits_large_context() {
        let batch = Config::apply_batch_limits(1024, 24576, false);
        assert!(batch <= 64);
    }

    #[test]
    fn test_apply_batch_limits_minimum() {
        let batch = Config::apply_batch_limits(1, 8192, false);
        assert!(batch >= 16);
    }

    #[test]
    fn test_estimate_memory_per_batch_cpu() {
        let memory_cpu = Config::estimate_memory_per_batch(8192, false);
        assert!(memory_cpu > 0.0);
    }

    #[test]
    fn test_estimate_memory_per_batch_gpu() {
        let memory_gpu = Config::estimate_memory_per_batch(8192, true);
        assert!(memory_gpu > 0.0);
    }

    #[test]
    fn test_estimate_memory_gpu_less_than_cpu() {
        let memory_cpu = Config::estimate_memory_per_batch(8192, false);
        let memory_gpu = Config::estimate_memory_per_batch(8192, true);

        assert!(memory_gpu < memory_cpu);
    }

    #[test]
    fn test_queue_timeout_is_positive() {
        let config = create_test_config();
        assert!(config.queue_timeout_seconds > 0);
    }

    #[test]
    fn test_queue_timeout_less_than_generate_timeout() {
        let config = create_test_config();

        assert!(config.queue_timeout_seconds <= config.generate_timeout_seconds);
    }

    #[test]
    fn test_config_values_consistency() {
        let config = create_test_config();

        assert!(config.health_timeout_seconds <= 3600);
        assert!(config.generate_timeout_seconds <= 1800);
        assert!(config.stream_timeout_seconds <= 3600);
        assert!(config.health_check_timeout_seconds <= 3600);
    }

    #[test]
    fn test_config_backend_url_consistency() {
        let config = create_test_config();

        assert!(
            config.backend_url.contains(&config.llama_host)
                || config.backend_url.contains("127.0.0.1")
                || config.backend_url.contains("localhost")
        );
    }

    #[test]
    fn test_config_all_fields_initialized() {
        let config = create_test_config();

        assert!(!config.model_path.is_empty());
        assert!(!config.llama_bin.is_empty());
        assert!(!config.api_host.is_empty());
        assert!(!config.llama_host.is_empty());
        assert!(config.threads > 0);
        assert!(config.gpu_layers <= config.ctx_size);
        assert!(config.api_port > 0);
        assert!(config.llama_port > 0);
    }
}