1use std::fmt::Display;
2use std::process::Stdio;
3use tokio::io::{AsyncBufReadExt, BufReader};
4use tokio::process::Command;
5use tokio::sync::mpsc;
6use tracing::{info, warn};
7
8use crate::config::{Config, DEFAULT_SYSTEM_PROMPT};
9use crate::models::{
10 DiscoveredModel, ModelSettings, RopeScaling, ServerMetrics, clean_host, strip_gguf,
11};
12
/// Handle to a backend process started by `spawn_server`.
#[derive(Clone)]
pub struct ServerHandle {
    /// Port recorded for the process; `spawn_server` stores `0` here in bench mode.
    pub port: u16,
    /// Host string copied verbatim from the model settings.
    pub host: String,
    /// OS process id of the child, or `0` when it was not available at spawn time.
    pub pid: u32,
    /// Sending `()` on this channel asks the monitoring task to kill the child.
    pub kill_tx: mpsc::Sender<()>,
}
21
22fn push_arg(cmd: &mut Command, parts: &mut Vec<String>, name: &str, value: impl Display) {
24 let val_str = value.to_string();
25 cmd.arg(name).arg(&val_str);
26 parts.push(name.to_string());
27 parts.push(val_str);
28}
29
30fn push_flag(cmd: &mut Command, parts: &mut Vec<String>, name: &str) {
32 cmd.arg(name);
33 parts.push(name.to_string());
34}
35
36fn push_gpu_layers(cmd: &mut Command, parts: &mut Vec<String>, settings: &ModelSettings) {
37 match settings.gpu_layers_mode {
38 crate::models::GpuLayersMode::Specific(n) => push_arg(cmd, parts, "-ngl", n),
39 crate::models::GpuLayersMode::All => push_arg(cmd, parts, "-ngl", "999"),
40 crate::models::GpuLayersMode::Auto => {}
41 }
42}
43
44fn push_spec_decoding(cmd: &mut Command, parts: &mut Vec<String>, settings: &ModelSettings) {
45 if !settings.spec_type.is_empty() {
46 push_arg(cmd, parts, "--spec-type", &settings.spec_type);
47 if settings.draft_tokens > 0 {
48 push_arg(cmd, parts, "--spec-draft-n-max", settings.draft_tokens);
49 }
50 }
51}
52
53pub fn build_server_cmd(
56 binary: &std::path::Path,
57 model: Option<&DiscoveredModel>,
58 settings: &ModelSettings,
59 config: &Config,
60 server_mode: crate::models::ServerMode,
61 router_max_models: u32,
62) -> (Command, String) {
63 let mut cmd = Command::new(binary);
64 let mut parts: Vec<String> = vec![binary.display().to_string()];
65
66 match server_mode {
68 crate::models::ServerMode::Normal => {
69 if let Some(model) = model {
70 push_arg(&mut cmd, &mut parts, "-m", model.path.display());
71 push_arg(&mut cmd, &mut parts, "--alias", &model.display_name);
73 }
74 }
75 crate::models::ServerMode::Router => {
76 if router_max_models > 0 {
78 push_arg(&mut cmd, &mut parts, "--models-max", router_max_models);
79 }
80 if let Some(dir) = config.models_dirs.first() {
82 push_arg(&mut cmd, &mut parts, "--models-dir", dir.display());
83 }
84 }
85 crate::models::ServerMode::Bench => {
86 }
88 crate::models::ServerMode::BenchTune => {
89 }
91 }
92
93 let gguf_meta = model
95 .map(|m| crate::models::GgufMetadata::from_path(&m.path))
96 .transpose();
97
98 push_arg(&mut cmd, &mut parts, "--threads", settings.threads);
100 push_arg(
101 &mut cmd,
102 &mut parts,
103 "--threads-batch",
104 settings.threads_batch,
105 );
106 let effective_ctx = (settings.context_length as f32 * settings.rope_scale) as u32;
107 push_arg(&mut cmd, &mut parts, "--ctx-size", effective_ctx);
108 push_arg(&mut cmd, &mut parts, "--ubatch-size", settings.ubatch_size);
109 if let Some(n) = settings.max_concurrent_predictions {
110 push_arg(&mut cmd, &mut parts, "--parallel", n);
111 }
112
113 push_flag(&mut cmd, &mut parts, "--no-warmup");
114
115 push_spec_decoding(&mut cmd, &mut parts, settings);
116
117 if let Some(cache_k) = settings.cache_type_k {
118 push_arg(&mut cmd, &mut parts, "--cache-type-k", cache_k);
119 }
120 if let Some(cache_v) = settings.cache_type_v {
121 push_arg(&mut cmd, &mut parts, "--cache-type-v", cache_v);
122 }
123
124 if settings.keep != 0 {
125 push_arg(&mut cmd, &mut parts, "--keep", settings.keep);
126 }
127 if settings.swa_full {
128 push_flag(&mut cmd, &mut parts, "--swa-full");
129 }
130 if settings.mlock {
131 push_flag(&mut cmd, &mut parts, "--mlock");
132 }
133 if !settings.mmap {
134 push_flag(&mut cmd, &mut parts, "--no-mmap");
135 }
136 if settings.numa != Default::default() {
137 push_arg(&mut cmd, &mut parts, "--numa", settings.numa.to_string());
138 }
139 if settings.kv_cache_offload {
140 push_flag(&mut cmd, &mut parts, "--kv-offload");
141 }
142
143 push_gpu_layers(&mut cmd, &mut parts, settings);
145
146 if settings.split_mode != Default::default() {
147 push_arg(
148 &mut cmd,
149 &mut parts,
150 "--split-mode",
151 settings.split_mode.to_string(),
152 );
153 }
154 if !settings.tensor_split.is_empty() {
155 push_arg(
156 &mut cmd,
157 &mut parts,
158 "--tensor-split",
159 &settings.tensor_split,
160 );
161 }
162 if settings.main_gpu != 0 {
163 push_arg(&mut cmd, &mut parts, "--main-gpu", settings.main_gpu);
164 }
165 if settings.fit {
166 push_arg(&mut cmd, &mut parts, "--fit", "on");
167 }
168
169 if let Some(ref lora) = settings.lora {
170 push_arg(&mut cmd, &mut parts, "--lora", lora.display());
171 }
172 if let Some((ref lora, scale)) = settings.lora_scaled {
173 push_arg(
174 &mut cmd,
175 &mut parts,
176 "--lora-scaled",
177 format!("{}:{}", lora.display(), scale),
178 );
179 }
180
181 let mut rpc_list = Vec::new();
182 if !settings.rpc.is_empty() {
183 rpc_list.push(settings.rpc.clone());
184 }
185 for worker in &config.rpc_workers {
186 if worker.selected {
187 rpc_list.push(format!("{}:{}", worker.ip, worker.port));
188 }
189 }
190
191 if !rpc_list.is_empty() {
192 let joined_rpc = rpc_list.join(",");
193 push_arg(&mut cmd, &mut parts, "--rpc", joined_rpc);
194 }
195
196 if settings.embedding {
197 push_flag(&mut cmd, &mut parts, "--embedding");
198 }
199
200 if settings.expert_count > 0 {
201 let arch = gguf_meta
202 .as_ref()
203 .ok()
204 .and_then(|opt| opt.as_ref())
205 .map(|m| m.arch.as_str())
206 .unwrap_or("llama");
207 push_arg(
208 &mut cmd,
209 &mut parts,
210 "--override-kv",
211 format!("{}.expert_used_count=int:int:{}", arch, settings.expert_count),
212 );
213 }
214
215 push_arg(
216 &mut cmd,
217 &mut parts,
218 "-fa",
219 if settings.flash_attn { "on" } else { "off" },
220 );
221
222 if settings.jinja {
223 push_flag(&mut cmd, &mut parts, "--jinja");
224 }
225
226 if let Some(ref template) = settings.chat_template {
227 push_arg(&mut cmd, &mut parts, "--chat-template", template);
228 }
229
230 if settings.system_prompt != DEFAULT_SYSTEM_PROMPT {
232 let mut merged = serde_json::Map::new();
233 if let Some(ref kwargs) = settings.chat_template_kwargs
234 && let Ok(obj) = serde_json::from_str::<serde_json::Value>(kwargs)
235 && let serde_json::Value::Object(map) = obj {
236 for (k, v) in map {
237 merged.insert(k, v);
238 }
239 }
240 merged.insert(
241 "system_prompt".to_string(),
242 serde_json::Value::String(settings.system_prompt.clone()),
243 );
244 push_arg(
245 &mut cmd,
246 &mut parts,
247 "--chat-template-kwargs",
248 serde_json::to_string(&merged).unwrap(),
249 );
250 } else if let Some(ref kwargs) = settings.chat_template_kwargs {
251 push_arg(&mut cmd, &mut parts, "--chat-template-kwargs", kwargs);
252 }
253
254 if settings.seed != -1 {
256 push_arg(&mut cmd, &mut parts, "--seed", settings.seed);
257 }
258 if let Some(max_tokens) = settings.max_tokens {
259 push_arg(&mut cmd, &mut parts, "--n-predict", max_tokens);
260 }
261 push_arg(
262 &mut cmd,
263 &mut parts,
264 "--temp",
265 format!("{:.2}", settings.temperature),
266 );
267
268 push_arg(&mut cmd, &mut parts, "--top-k", settings.top_k);
269
270 push_arg(
271 &mut cmd,
272 &mut parts,
273 "--top-p",
274 format!("{:.2}", settings.top_p),
275 );
276
277 push_arg(
278 &mut cmd,
279 &mut parts,
280 "--min-p",
281 format!("{:.2}", settings.min_p),
282 );
283
284 push_arg(
285 &mut cmd,
286 &mut parts,
287 "--typical",
288 format!("{:.2}", settings.typical_p),
289 );
290
291 if settings.mirostat != Default::default() {
292 push_arg(
293 &mut cmd,
294 &mut parts,
295 "--mirostat",
296 settings.mirostat.to_string(),
297 );
298 push_arg(
299 &mut cmd,
300 &mut parts,
301 "--mirostat-lr",
302 format!("{:.2}", settings.mirostat_lr),
303 );
304 push_arg(
305 &mut cmd,
306 &mut parts,
307 "--mirostat-ent",
308 format!("{:.2}", settings.mirostat_ent),
309 );
310 }
311
312 if settings.ignore_eos {
313 push_flag(&mut cmd, &mut parts, "--ignore-eos");
314 }
315
316 if !settings.samplers.0.is_empty() {
317 push_arg(
318 &mut cmd,
319 &mut parts,
320 "--samplers",
321 settings.samplers.to_string(),
322 );
323 }
324
325 if let Some(frequency) = settings.frequency_penalty {
326 push_arg(
327 &mut cmd,
328 &mut parts,
329 "--frequency-penalty",
330 format!("{:.2}", frequency),
331 );
332 }
333
334 if settings.dry_multiplier != 0.0 {
335 push_arg(
336 &mut cmd,
337 &mut parts,
338 "--dry-multiplier",
339 format!("{:.2}", settings.dry_multiplier),
340 );
341 push_arg(
342 &mut cmd,
343 &mut parts,
344 "--dry-base",
345 format!("{:.2}", settings.dry_base),
346 );
347 push_arg(
348 &mut cmd,
349 &mut parts,
350 "--dry-allowed-length",
351 settings.dry_allowed_length,
352 );
353 push_arg(
354 &mut cmd,
355 &mut parts,
356 "--dry-penalty-last-n",
357 settings.dry_penalty_last_n,
358 );
359 }
360
361 let rope_scaling = if settings.rope_yarn_enabled {
363 RopeScaling::Yarn
364 } else {
365 settings.rope_scaling
366 };
367 if rope_scaling != Default::default() {
368 push_arg(
369 &mut cmd,
370 &mut parts,
371 "--rope-scaling",
372 rope_scaling.to_string(),
373 );
374 }
375 if settings.rope_scale != 1.0 {
376 push_arg(
377 &mut cmd,
378 &mut parts,
379 "--rope-scale",
380 format!("{:.2}", settings.rope_scale),
381 );
382 }
383 if settings.rope_freq_base != 0.0 {
384 push_arg(
385 &mut cmd,
386 &mut parts,
387 "--rope-freq-base",
388 format!("{:.2}", settings.rope_freq_base),
389 );
390 }
391 if settings.rope_freq_scale != 1.0 {
392 push_arg(
393 &mut cmd,
394 &mut parts,
395 "--rope-freq-scale",
396 format!("{:.2}", settings.rope_freq_scale),
397 );
398 }
399
400 if settings.rope_yarn_enabled && settings.rope_scale > 1.0 {
401 if let Some(ref meta) = gguf_meta.as_ref().ok().and_then(|x| x.as_ref()) {
402 push_arg(
403 &mut cmd,
404 &mut parts,
405 "--override-kv",
406 format!("{}.context_length=int:{}", meta.arch, effective_ctx),
407 );
408 let orig_ctx = meta.n_ctx_train;
409 push_arg(
410 &mut cmd,
411 &mut parts,
412 "--yarn-orig-ctx",
413 orig_ctx,
414 );
415 }
416 }
417
418 let resolved_host = clean_host(&settings.host);
419 push_arg(&mut cmd, &mut parts, "--host", resolved_host);
420 push_arg(&mut cmd, &mut parts, "--port", settings.port);
421 push_arg(&mut cmd, &mut parts, "--timeout", settings.timeout);
422
423 push_flag(&mut cmd, &mut parts, "--metrics");
424 if !settings.cache_prompt {
425 push_flag(&mut cmd, &mut parts, "--no-cache-prompt");
426 }
427 if settings.cache_reuse != 0 {
428 push_arg(&mut cmd, &mut parts, "--cache-reuse", settings.cache_reuse);
429 }
430 if !settings.webui {
431 push_flag(&mut cmd, &mut parts, "--no-webui");
432 }
433
434 let display = parts.join(" ");
437 (cmd, display)
438}
439
440pub fn build_bench_cmd(
442 binary: &std::path::Path,
443 model: &DiscoveredModel,
444 settings: &ModelSettings,
445) -> (Command, String) {
446 let mut cmd = Command::new(binary);
447 let mut parts: Vec<String> = vec![binary.display().to_string()];
448
449 push_arg(&mut cmd, &mut parts, "-m", model.path.display());
450 push_arg(&mut cmd, &mut parts, "-t", settings.threads);
451 push_arg(&mut cmd, &mut parts, "-b", settings.batch_size);
452
453 push_gpu_layers(&mut cmd, &mut parts, settings);
454
455 if settings.flash_attn {
456 push_arg(&mut cmd, &mut parts, "-fa", "1");
457 }
458
459 push_spec_decoding(&mut cmd, &mut parts, settings);
460
461 push_flag(&mut cmd, &mut parts, "--progress");
462
463 let display = parts.join(" ");
464 (cmd, display)
465}
466
/// Everything `spawn_server` needs to resolve, configure and launch a backend
/// process.
pub struct SpawnServerRequest<'a> {
    /// Application configuration, forwarded to `build_server_cmd`.
    pub config: &'a Config,
    /// Model to serve or benchmark; required in bench mode.
    pub model: Option<&'a DiscoveredModel>,
    /// Per-model settings (host, port, backend selection, sampling, ...).
    pub settings: &'a ModelSettings,
    /// Receives status messages and every output line of the child process.
    pub log_tx: mpsc::Sender<String>,
    /// Optional progress channel handed to the backend binary resolver.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<crate::models::DownloadState>>,
    /// Which kind of process to launch.
    pub server_mode: crate::models::ServerMode,
    /// Forwarded to `build_server_cmd` as the router model limit.
    pub router_max_models: u32,
    /// Notified with `()` once the child process has exited.
    pub exit_tx: mpsc::Sender<()>,
}
479
480pub async fn spawn_server(
481 req: SpawnServerRequest<'_>,
482) -> Result<(ServerHandle, String), String> {
483 let SpawnServerRequest {
484 config,
485 model,
486 settings,
487 log_tx,
488 progress_tx,
489 server_mode,
490 router_max_models,
491 exit_tx,
492 } = req;
493 if server_mode != crate::models::ServerMode::Bench
494 && server_mode != crate::models::ServerMode::BenchTune
495 {
496 let port = settings.port;
497 let resolved_host = clean_host(&settings.host);
499 if std::net::TcpListener::bind(format!("{}:{}", resolved_host, port)).is_err() {
500 return Err(format!("Port {} is already in use", port));
501 }
502 }
503
504 if server_mode == crate::models::ServerMode::BenchTune {
507 unreachable!("BenchTune mode must be handled before calling spawn_server")
508 }
509
510 let backend_name = if server_mode == crate::models::ServerMode::Bench {
512 "llama-bench"
513 } else {
514 "llama-server"
515 };
516 let version_display = settings.get_active_backend_version_display();
517 info!(
518 "spawn_server: backend={}, requested_version={:?}, version_display={}",
519 settings.backend,
520 settings.get_active_backend_version(),
521 version_display
522 );
523 log_tx
524 .send(format!(
525 "Resolving {} (v{}) binary...",
526 backend_name, version_display
527 ))
528 .await
529 .ok();
530 let version_param = settings.get_active_backend_version().map(|s| s.as_str());
531
532 let server_binary = match crate::backend::hub::resolve_backend_binary(
533 settings.backend,
534 version_param,
535 Some(log_tx.clone()),
536 progress_tx,
537 )
538 .await
539 {
540 Ok(path) => {
541 info!("spawn_server: resolved binary path={}", path.display());
542 path
543 }
544 Err(e) => {
545 return Err(format!("Failed to resolve backend binary: {}", e));
546 }
547 };
548
549 let binary = if server_mode == crate::models::ServerMode::Bench {
550 server_binary.parent().unwrap().join("llama-bench")
551 } else {
552 server_binary
553 };
554
555 if !binary.exists() {
556 return Err(format!("Binary not found at: {}", binary.display()));
557 }
558 #[cfg(unix)]
559 {
560 use std::os::unix::fs::PermissionsExt;
561 if let Ok(metadata) = binary.metadata()
562 && metadata.permissions().mode() & 0o111 == 0
563 {
564 return Err(format!("Binary is not executable: {}", binary.display()));
565 }
566 }
567
568 let (mut cmd, cmd_string) = if server_mode == crate::models::ServerMode::Bench {
569 if let Some(m) = model {
570 build_bench_cmd(&binary, m, settings)
571 } else {
572 return Err("Model required for benchmark".to_string());
573 }
574 } else {
575 build_server_cmd(
576 &binary,
577 model,
578 settings,
579 config,
580 server_mode,
581 router_max_models,
582 )
583 };
584
585 cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
586
587 let bin_dir = binary.parent().unwrap();
589 match std::env::consts::OS {
590 "windows" => {
591 if let Ok(current) = std::env::var("PATH") {
593 cmd.env("PATH", format!("{};{}", bin_dir.display(), current));
594 } else {
595 cmd.env("PATH", bin_dir);
596 }
597 }
598 "macos" => {
599 if let Ok(current) = std::env::var("DYLD_LIBRARY_PATH") {
601 cmd.env(
602 "DYLD_LIBRARY_PATH",
603 format!("{}:{}", bin_dir.display(), current),
604 );
605 } else {
606 cmd.env("DYLD_LIBRARY_PATH", bin_dir);
607 }
608 }
609 _ => {
610 if let Ok(current) = std::env::var("LD_LIBRARY_PATH") {
612 cmd.env(
613 "LD_LIBRARY_PATH",
614 format!("{}:{}", bin_dir.display(), current),
615 );
616 } else {
617 cmd.env("LD_LIBRARY_PATH", bin_dir);
618 }
619 }
620 }
621
622 info!("Spawning: {}", cmd_string);
623 let _ = log_tx
624 .send(format!("{}: {}", backend_name, cmd_string))
625 .await;
626 let mut child = cmd
627 .spawn()
628 .map_err(|e| format!("Failed to spawn process: {}", e))?;
629 let pid = child.id().unwrap_or(0);
630
631 let (kill_tx, mut kill_rx) = mpsc::channel(1);
632
633 let log_tx_inner = log_tx.clone();
636 let exit_tx_inner = exit_tx.clone();
637 let backend_name_upper = backend_name.to_uppercase();
638 tokio::spawn(async move {
639 let stdout = child.stdout.take().unwrap();
640 let stderr = child.stderr.take().unwrap();
641
642 let (stdout_tx, mut stdout_rx) = mpsc::channel::<String>(64);
643 let (stderr_tx, mut stderr_rx) = mpsc::channel::<String>(64);
644
645 let mut std_out = Some(tokio::spawn(async move {
647 let reader = BufReader::new(stdout).lines();
648 tokio::pin!(reader);
649 while let Ok(Some(line)) = reader.next_line().await {
650 if stdout_tx.send(line).await.is_err() {
651 break;
652 }
653 }
654 }));
655
656 let mut std_err = Some(tokio::spawn(async move {
657 let reader = BufReader::new(stderr).lines();
658 tokio::pin!(reader);
659 while let Ok(Some(line)) = reader.next_line().await {
660 if stderr_tx.send(line).await.is_err() {
661 break;
662 }
663 }
664 }));
665
666 loop {
669 tokio::select! {
670 _ = kill_rx.recv() => {
671 let _ = child.kill().await;
672 if let Some(h) = std_out.take() { let _ = h.await; }
673 if let Some(h) = std_err.take() { let _ = h.await; }
674 break;
675 }
676 line = stdout_rx.recv() => {
677 if let Some(line) = line { let _ = log_tx_inner.send(line).await; } else { break; }
678 }
679 line = stderr_rx.recv() => {
680 if let Some(line) = line { let _ = log_tx_inner.send(line).await; } else { break; }
681 }
682 else => break,
683 }
684 }
685
686 if let Some(h) = std_out.take() {
688 let _ = h.await;
689 }
690 if let Some(h) = std_err.take() {
691 let _ = h.await;
692 }
693
694 let exit_code = child.wait().await.ok().and_then(|s| s.code());
695 let _ = exit_tx_inner.send(()).await;
696 let _ = log_tx_inner
697 .send(format!(
698 "{} exited with code {:?}",
699 backend_name_upper, exit_code
700 ))
701 .await;
702 });
703
704 Ok((
705 ServerHandle {
706 port: if server_mode == crate::models::ServerMode::Bench {
707 0
708 } else {
709 settings.port
710 },
711 host: settings.host.clone(),
712 pid,
713 kill_tx,
714 },
715 cmd_string,
716 ))
717}
718
719pub async fn check_health(host: &str, port: u16) -> bool {
721 let host = clean_host(host);
722 let url = format!("http://{}:{}/health", host, port);
723 let client = reqwest::Client::builder()
724 .timeout(std::time::Duration::from_secs(1))
725 .build()
726 .unwrap_or_default();
727
728 match client.get(&url).send().await {
729 Ok(resp) => resp.status().is_success(),
730 Err(_) => false,
731 }
732}
733
734pub async fn kill_server(handle: ServerHandle) -> Result<(), String> {
736 handle
737 .kill_tx
738 .send(())
739 .await
740 .map_err(|_| "Server already stopped".to_string())
741}
742
743pub async fn get_metrics(
745 host: &str,
746 port: u16,
747 model_name: Option<&str>,
748 pid: Option<u32>,
749) -> Result<ServerMetrics, String> {
750 let host = clean_host(host);
751 let mut url = if let Some(model) = model_name {
754 let name = strip_gguf(model);
755 format!("http://{}:{}/metrics?model={}", host, port, name)
756 } else {
757 format!("http://{}:{}/metrics", host, port)
758 };
759
760 let mut resp = reqwest::get(&url)
761 .await
762 .map_err(|e| format!("Failed to get metrics: {}", e))?;
763
764 if (resp.status() == reqwest::StatusCode::NOT_FOUND
766 || resp.status() == reqwest::StatusCode::BAD_REQUEST)
767 && model_name.is_some()
768 {
769 url = format!("http://{}:{}/metrics", host, port);
770 resp = reqwest::get(&url)
771 .await
772 .map_err(|e| format!("Failed to get metrics: {}", e))?;
773 }
774
775 if !resp.status().is_success() {
776 return Err(format!("Server returned {}", resp.status()));
777 }
778
779 let text = resp
780 .text()
781 .await
782 .map_err(|e| format!("Failed to read metrics: {}", e))?;
783
784 let mut m = ServerMetrics { loaded: true, ..Default::default() };
785
786 let mut ctx_max_slots = 0u32;
787 let mut ctx_used_slots = 0u32;
788 let mut ctx_used_global = 0u32;
789 let mut ctx_max_global = 0u32;
790
791 let mut vram_used_slots = 0u64;
792 let mut vram_total_slots = 0u64;
793 let mut vram_used_global = 0u64;
794 let mut vram_total_global = 0u64;
795
796 for line in text.lines() {
797 if line.starts_with('#') || line.is_empty() {
798 continue;
799 }
800
801 let parts: Vec<&str> = line.split_whitespace().collect();
802 if parts.len() < 2 {
803 continue;
804 }
805
806 let name_with_labels = parts[0];
807 let mut val = 0.0;
808 for part in parts.iter().skip(1) {
809 if let Ok(v) = part.parse::<f64>() {
810 val = v;
811 break;
812 }
813 }
814
815 let is_slot = name_with_labels.contains("slot=\"") || name_with_labels.contains("pool=\"");
816 let name = name_with_labels
817 .split('{')
818 .next()
819 .unwrap_or(name_with_labels);
820
821 match name {
822 "llama_kv_cache_usage_bytes"
823 | "kv_cache_usage_bytes"
824 | "llama_server_kv_cache_usage_bytes"
825 | "llama_server_kv_cache_used_bytes"
826 | "llama_server_vram_used_bytes" => {
827 if is_slot {
828 vram_used_slots += val as u64;
829 } else {
830 vram_used_global = vram_used_global.max(val as u64);
831 }
832 }
833 "llama_kv_cache_total_bytes"
834 | "kv_cache_total_bytes"
835 | "llama_server_kv_cache_total_bytes"
836 | "llama_server_vram_total_bytes" => {
837 if is_slot {
838 vram_total_slots += val as u64;
839 } else {
840 vram_total_global = vram_total_global.max(val as u64);
841 }
842 }
843 "llama_model_memory_usage_bytes"
844 | "model_memory_usage_bytes"
845 | "llama_server_model_memory_usage_bytes"
846 | "llama_server_memory_usage_bytes"
847 | "llama_server_ram_usage_bytes"
848 | "llama_server_mem_used_bytes" => {
849 m.ram_used = m.ram_used.max(val as u64);
850 }
851 "llama_kv_cache_tokens_used"
852 | "kv_cache_usage_tokens"
853 | "kv_cache_tokens_used"
854 | "llama_server_kv_cache_tokens_used"
855 | "llamacpp:n_tokens_used"
856 | "llama_server_n_tokens_used"
857 | "llama_server_n_past"
858 | "llamacpp:n_past" => {
859 if is_slot {
860 ctx_used_slots += val as u32;
861 } else {
862 ctx_used_global = ctx_used_global.max(val as u32);
863 }
864 }
865 "llama_kv_cache_tokens_total"
866 | "kv_cache_total_tokens"
867 | "kv_cache_tokens_total"
868 | "llama_server_kv_cache_tokens_total"
869 | "llamacpp:n_ctx"
870 | "llamacpp:n_tokens_max"
871 | "llama_server_n_ctx"
872 | "llama_server_n_tokens_max" => {
873 if is_slot {
874 ctx_max_slots += val as u32;
875 } else {
876 ctx_max_global = ctx_max_global.max(val as u32);
877 }
878 }
879 "llama_server_cpu_usage_percentage"
880 | "cpu_usage_percentage"
881 | "llama_server_cpu_usage"
882 | "llama_server_cpu_percent" => {
883 m.cpu_usage = m.cpu_usage.max(val);
884 }
885 "llamacpp:predicted_tokens_seconds"
886 | "llama_server_predicted_tokens_seconds"
887 | "llama_server_tps" => {
888 m.tps += val;
889 }
890 "llamacpp:prompt_tokens_seconds"
891 | "llama_server_prompt_tokens_seconds"
892 | "llama_server_prompt_tps" => {
893 m.prompt_tps += val;
894 }
895 "llamacpp:kv_cache_usage_ratio" | "llama_server_kv_cache_usage_ratio" => {
896 if !is_slot && ctx_max_global > 0 {
897 ctx_used_global = ctx_used_global.max((val * ctx_max_global as f64) as u32);
898 }
899 }
900 _ => {}
901 }
902 }
903
904 m.gpu_mem_used = if vram_used_global > 0 {
906 vram_used_global
907 } else if vram_used_slots > 0 {
908 vram_used_slots
909 } else {
910 0
911 };
912 m.gpu_mem_total = if vram_total_global > 0 {
913 vram_total_global
914 } else if vram_total_slots > 0 {
915 vram_total_slots
916 } else {
917 0
918 };
919
920 m.ctx_used = if ctx_used_slots > 0 {
923 ctx_used_slots
924 } else {
925 ctx_used_global
926 };
927 m.ctx_max = if ctx_max_slots > 0 {
928 ctx_max_slots
929 } else {
930 ctx_max_global
931 };
932 if model_name.is_none() {
939 let set_if_better = |out: &mut ServerMetrics, used: u64, total: u64| {
943 if out.gpu_mem_used == 0 || used > out.gpu_mem_used {
944 out.gpu_mem_used = used;
945 out.gpu_mem_total = total;
946 }
947 };
948
949 let (nv_used, nv_total) = get_nvidia_vram_metrics().unwrap_or((0, 0));
950 set_if_better(&mut m, nv_used, nv_total);
951
952 if m.gpu_mem_total == 0 {
953 let (amd_used, amd_total) = get_amdgpu_vram_metrics().unwrap_or((0, 0));
955 set_if_better(&mut m, amd_used, amd_total);
956 }
957 } else if m.gpu_mem_used == 0 {
958 if let Ok((used, total)) = get_nvidia_vram_metrics() {
960 m.gpu_mem_used = used;
961 m.gpu_mem_total = total;
962 } else if let Ok((used, total)) = get_amdgpu_vram_metrics() {
963 m.gpu_mem_used = used;
964 m.gpu_mem_total = total;
965 }
966 }
967
968 if let Some(p) = pid {
970 if let Ok((ram, cpu)) = get_process_metrics(p) {
971 if m.ram_used == 0 {
972 m.ram_used = ram;
973 }
974 if m.cpu_usage == 0.0 {
975 m.cpu_usage = cpu;
976 }
977 }
978 }
979
980 Ok(m)
981}
982
983fn get_nvidia_vram_metrics() -> Result<(u64, u64), String> {
985 let output = std::process::Command::new("nvidia-smi")
986 .args([
987 "--query-gpu=memory.used,memory.total",
988 "--format=csv,noheader,nounits",
989 ])
990 .output()
991 .map_err(|e| e.to_string())?;
992
993 if !output.status.success() {
994 return Err("nvidia-smi failed".to_string());
995 }
996
997 let stdout = String::from_utf8_lossy(&output.stdout);
998 let mut total_used: u64 = 0;
999 let mut total_total: u64 = 0;
1000 for line in stdout.lines() {
1001 let parts: Vec<&str> = line.split(',').collect();
1002 if parts.len() >= 2 {
1003 let used = match parts[0].trim().parse::<u64>() {
1004 Ok(v) => v,
1005 Err(_) => {
1006 warn!("nvidia-smi: failed to parse used memory from '{}'", parts[0]);
1007 continue;
1008 }
1009 } * 1024 * 1024;
1010 let total = match parts[1].trim().parse::<u64>() {
1011 Ok(v) => v,
1012 Err(_) => {
1013 warn!("nvidia-smi: failed to parse total memory from '{}'", parts[1]);
1014 continue;
1015 }
1016 } * 1024 * 1024;
1017 total_used += used;
1018 total_total += total;
1019 }
1020 }
1021 if total_total > 0 {
1022 return Ok((total_used, total_total));
1023 }
1024
1025 Err("Invalid output from nvidia-smi".to_string())
1026}
1027
1028fn get_amdgpu_vram_metrics() -> Result<(u64, u64), String> {
1030 let output = std::process::Command::new("amdgpu_top")
1031 .args(["-d", "--json"])
1032 .output()
1033 .map_err(|e| e.to_string())?;
1034
1035 if !output.status.success() {
1036 return Err("amdgpu_top failed".to_string());
1037 }
1038
1039 let json: serde_json::Value =
1040 serde_json::from_slice(&output.stdout).map_err(|e| e.to_string())?;
1041
1042 let devices = if json.is_array() {
1044 json.as_array()
1045 } else {
1046 json.get("devices").and_then(|d| d.as_array())
1047 };
1048
1049 if let Some(devices) = devices
1050 && let Some(device) = devices.first()
1051 {
1052 let root_used = device.get("VRAM Usage Size").and_then(|v| v.as_u64());
1055 let root_total = device.get("VRAM Size").and_then(|v| v.as_u64());
1056
1057 if let (Some(used), Some(total)) = (root_used, root_total)
1058 && total > 0
1059 {
1060 return Ok((used, total));
1061 }
1062
1063 let vram_obj = device.get("VRAM");
1065 if let Some(vram) = vram_obj {
1066 let nested_used = vram
1068 .get("Total VRAM Usage")
1069 .and_then(|v| v.get("value").or(Some(v)))
1070 .and_then(|v| v.as_u64());
1071 let nested_total = vram
1072 .get("Total VRAM")
1073 .and_then(|v| v.get("value").or(Some(v)))
1074 .and_then(|v| v.as_u64());
1075
1076 if let (Some(used), Some(total)) = (nested_used, nested_total) {
1077 let multiplier = if vram.get("Total VRAM").and_then(|v| v.get("unit")).is_some() {
1079 1024 * 1024
1080 } else {
1081 1
1082 };
1083 if total > 0 {
1084 return Ok((used * multiplier, total * multiplier));
1085 }
1086 }
1087 }
1088
1089 let vram_usage = device.get("vram_usage");
1091 if let Some(vram) = vram_usage {
1092 let used = vram
1093 .get("VRAM")
1094 .or_else(|| vram.get("usage"))
1095 .and_then(|v| v.get("value").or(Some(v)))
1096 .and_then(|v| v.as_u64())
1097 .unwrap_or(0);
1098 let total = vram
1099 .get("TotalVRAM")
1100 .or_else(|| vram.get("total"))
1101 .and_then(|v| v.get("value").or(Some(v)))
1102 .and_then(|v| v.as_u64())
1103 .unwrap_or(0);
1104
1105 if total > 0 {
1106 return Ok((used * 1024 * 1024, total * 1024 * 1024));
1107 }
1108 }
1109 }
1110
1111 Err("Could not find VRAM info in amdgpu_top output".to_string())
1112}
1113
1114fn get_process_metrics(
1118 pid: u32,
1119) -> Result<(u64, f64), String> {
1120 use std::sync::{LazyLock, Mutex};
1121 use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};
1122
1123 static SYS: LazyLock<Mutex<System>> = LazyLock::new(|| {
1124 Mutex::new(System::new_with_specifics(
1125 RefreshKind::everything().with_processes(ProcessRefreshKind::nothing().with_cpu().with_memory()),
1126 ))
1127 });
1128
1129 let mut sys = SYS.lock().unwrap();
1130 let pids = [Pid::from(pid as usize)];
1131 sys.refresh_processes_specifics(
1132 ProcessesToUpdate::Some(&pids),
1133 true,
1134 ProcessRefreshKind::nothing().with_cpu().with_memory(),
1135 );
1136
1137 let sys_pid = Pid::from(pid as usize);
1138
1139 if let Some(process) = sys.process(sys_pid) {
1140 let ram = process.memory(); let cpu = process.cpu_usage() as f64; return Ok((ram, cpu));
1143 }
1144
1145 Err(format!("Process not found: pid={}", pid))
1146}
1147
1148pub async fn load_model(
1150 host: &str,
1151 port: u16,
1152 model_id: &str,
1153 model_path: Option<&str>,
1154) -> Result<(), String> {
1155 let client = reqwest::Client::new();
1156 let host = clean_host(host);
1157
1158 let endpoints = ["/models/load", "/v1/models/load"];
1160
1161 let mut variants = Vec::new();
1163
1164 variants.push(model_id.to_string());
1166 variants.push(strip_gguf(model_id).to_string());
1167
1168 if let Some(filename) = std::path::Path::new(model_id)
1170 .file_name()
1171 .and_then(|f| f.to_str())
1172 {
1173 variants.push(filename.to_string());
1174 variants.push(strip_gguf(filename).to_string());
1175 }
1176
1177 if let Some(path) = model_path {
1179 variants.push(path.to_string());
1180 }
1181
1182 let mut last_status = reqwest::StatusCode::OK;
1183 let mut last_error = String::new();
1184
1185 for endpoint in endpoints {
1186 let url = format!("http://{}:{}{}", host, port, endpoint);
1187 for variant in &variants {
1188 let bodies = vec![
1190 serde_json::json!({ "model": variant }),
1191 serde_json::json!({ "alias": variant }),
1192 ];
1193
1194 for body in bodies {
1195 match client.post(&url).json(&body).send().await {
1196 Ok(res) => {
1197 if res.status().is_success() {
1198 return Ok(());
1199 }
1200 last_status = res.status();
1201 last_error = res
1202 .text()
1203 .await
1204 .unwrap_or_else(|_| "Unknown error".to_string());
1205 }
1206 Err(e) => {
1207 last_error = e.to_string();
1208 }
1209 }
1210 }
1211 }
1212 }
1213
1214 Err(format!(
1215 "Failed to load model (tried {} requests). Last status {}: {}",
1216 endpoints.len() * variants.len() * 2,
1217 last_status,
1218 last_error
1219 ))
1220}
1221
1222pub async fn list_models(
1224 host: &str,
1225 port: u16,
1226) -> Result<Vec<(String, String, Option<String>)>, String> {
1227 let client = reqwest::Client::new();
1228 let host = clean_host(host);
1229 let url = format!("http://{}:{}/models", host, port);
1230
1231 let res = client
1232 .get(&url)
1233 .send()
1234 .await
1235 .map_err(|e| format!("Failed to list models: {}", e))?;
1236
1237 if !res.status().is_success() {
1238 return Err(format!("Server returned error {}", res.status()));
1239 }
1240
1241 let json: serde_json::Value = res
1242 .json()
1243 .await
1244 .map_err(|e| format!("Invalid JSON: {}", e))?;
1245
1246 let mut results = Vec::new();
1247 if let Some(data) = json.get("data").and_then(|d| d.as_array()) {
1248 for model in data {
1249 let id = model
1250 .get("id")
1251 .and_then(|v| v.as_str())
1252 .unwrap_or_default()
1253 .to_string();
1254 let status = model
1256 .get("status")
1257 .and_then(|s| s.get("value").or(Some(s)))
1258 .and_then(|v| v.as_str())
1259 .unwrap_or("unloaded")
1260 .to_string();
1261 let path = model
1262 .get("path")
1263 .or_else(|| model.get("filename"))
1264 .and_then(|v| v.as_str())
1265 .map(|s| s.to_string());
1266
1267 results.push((id, status, path));
1268 }
1269 }
1270
1271 Ok(results)
1272}
1273
1274pub async fn unload_model(
1276 host: &str,
1277 port: u16,
1278 model_id: &str,
1279 model_path: Option<&str>,
1280) -> Result<(), String> {
1281 let client = reqwest::Client::new();
1282 let host = clean_host(host);
1283
1284 let endpoints = ["/models/unload", "/v1/models/unload"];
1285 let stripped = strip_gguf(model_id);
1286 let mut variants = vec![model_id.to_string(), stripped.to_string()];
1287 if let Some(path) = model_path {
1288 variants.push(path.to_string());
1289 }
1290
1291 let mut last_status = reqwest::StatusCode::OK;
1292 let mut last_error = String::new();
1293
1294 for endpoint in endpoints {
1295 let url = format!("http://{}:{}{}", host, port, endpoint);
1296 for variant in &variants {
1297 let body = serde_json::json!({
1298 "model": variant
1299 });
1300
1301 if let Ok(res) = client.post(&url).json(&body).send().await {
1302 if res.status().is_success() {
1303 return Ok(());
1304 }
1305 last_status = res.status();
1306 last_error = res
1307 .text()
1308 .await
1309 .unwrap_or_else(|_| "Unknown error".to_string());
1310 }
1311 }
1312 }
1313
1314 tracing::debug!(
1315 "Model unload failed (status {}, error: {}): this is expected if model was already unloaded",
1316 last_status,
1317 last_error
1318 );
1319 Ok(())
1320}