1use std::fmt::Display;
2use std::process::Stdio;
3use tokio::io::{AsyncBufReadExt, BufReader};
4use tokio::process::Command;
5use tokio::sync::mpsc;
6use tracing::info;
7
8use crate::config::Config;
9use crate::models::{
10 DiscoveredModel, ModelSettings, RopeScaling, ServerMetrics, clean_host, strip_gguf,
11};
12
/// Handle to a child process started by `spawn_server`.
///
/// Clones share the same kill channel, so any clone can request termination.
#[derive(Clone)]
pub struct ServerHandle {
    /// Port taken from the launch settings; `spawn_server` stores `0` for benchmark runs.
    pub port: u16,
    /// Host string copied from the launch settings.
    pub host: String,
    /// OS process id of the child, or `0` when it was not available at spawn time.
    pub pid: u32,
    /// Sending `()` asks the supervising task to kill the child process.
    pub kill_tx: mpsc::Sender<()>,
}
21
22fn push_arg(cmd: &mut Command, parts: &mut Vec<String>, name: &str, value: impl Display) {
24 let val_str = value.to_string();
25 cmd.arg(name).arg(&val_str);
26 parts.push(name.to_string());
27 parts.push(val_str);
28}
29
30fn push_flag(cmd: &mut Command, parts: &mut Vec<String>, name: &str) {
32 cmd.arg(name);
33 parts.push(name.to_string());
34}
35
36fn push_gpu_layers(cmd: &mut Command, parts: &mut Vec<String>, settings: &ModelSettings) {
37 match settings.gpu_layers_mode {
38 crate::models::GpuLayersMode::Specific(n) => push_arg(cmd, parts, "-ngl", n),
39 crate::models::GpuLayersMode::All => push_arg(cmd, parts, "-ngl", "999"),
40 crate::models::GpuLayersMode::Auto => {}
41 }
42}
43
44fn push_spec_decoding(cmd: &mut Command, parts: &mut Vec<String>, settings: &ModelSettings) {
45 if !settings.spec_type.is_empty() {
46 push_arg(cmd, parts, "--spec-type", &settings.spec_type);
47 if settings.draft_tokens > 0 {
48 push_arg(cmd, parts, "--spec-draft-n-max", settings.draft_tokens);
49 }
50 }
51}
52
53pub fn build_server_cmd(
56 binary: &std::path::Path,
57 model: Option<&DiscoveredModel>,
58 settings: &ModelSettings,
59 config: &Config,
60 server_mode: crate::models::ServerMode,
61 router_max_models: u32,
62) -> (Command, String) {
63 let mut cmd = Command::new(binary);
64 let mut parts: Vec<String> = vec![binary.display().to_string()];
65
66 match server_mode {
68 crate::models::ServerMode::Normal => {
69 if let Some(model) = model {
70 push_arg(&mut cmd, &mut parts, "-m", model.path.display());
71 push_arg(&mut cmd, &mut parts, "--alias", &model.display_name);
73 }
74 }
75 crate::models::ServerMode::Router => {
76 if router_max_models > 0 {
78 push_arg(&mut cmd, &mut parts, "--models-max", router_max_models);
79 }
80 if let Some(dir) = config.models_dirs.first() {
82 push_arg(&mut cmd, &mut parts, "--models-dir", dir.display());
83 }
84 }
85 crate::models::ServerMode::Bench => {
86 }
88 crate::models::ServerMode::BenchTune => {
89 }
91 }
92
93 push_arg(&mut cmd, &mut parts, "--threads", settings.threads);
95 push_arg(
96 &mut cmd,
97 &mut parts,
98 "--threads-batch",
99 settings.threads_batch,
100 );
101 let effective_ctx = (settings.context_length as f32 * settings.rope_scale) as u32;
102 push_arg(&mut cmd, &mut parts, "--ctx-size", effective_ctx);
103 push_arg(&mut cmd, &mut parts, "--ubatch-size", settings.ubatch_size);
104 if let Some(n) = settings.max_concurrent_predictions {
105 push_arg(&mut cmd, &mut parts, "--parallel", n);
106 }
107
108 push_flag(&mut cmd, &mut parts, "--no-warmup");
109
110 push_spec_decoding(&mut cmd, &mut parts, settings);
111
112 if let Some(cache_k) = settings.cache_type_k {
113 push_arg(&mut cmd, &mut parts, "--cache-type-k", cache_k);
114 }
115 if let Some(cache_v) = settings.cache_type_v {
116 push_arg(&mut cmd, &mut parts, "--cache-type-v", cache_v);
117 }
118
119 if settings.keep != 0 {
120 push_arg(&mut cmd, &mut parts, "--keep", settings.keep);
121 }
122 if settings.swa_full {
123 push_flag(&mut cmd, &mut parts, "--swa-full");
124 }
125 if settings.mlock {
126 push_flag(&mut cmd, &mut parts, "--mlock");
127 }
128 if !settings.mmap {
129 push_flag(&mut cmd, &mut parts, "--no-mmap");
130 }
131 if settings.numa != Default::default() {
132 push_arg(&mut cmd, &mut parts, "--numa", settings.numa.to_string());
133 }
134 if settings.kv_cache_offload {
135 push_flag(&mut cmd, &mut parts, "--kv-offload");
136 }
137
138 push_gpu_layers(&mut cmd, &mut parts, settings);
140
141 if settings.split_mode != Default::default() {
142 push_arg(
143 &mut cmd,
144 &mut parts,
145 "--split-mode",
146 settings.split_mode.to_string(),
147 );
148 }
149 if !settings.tensor_split.is_empty() {
150 push_arg(
151 &mut cmd,
152 &mut parts,
153 "--tensor-split",
154 &settings.tensor_split,
155 );
156 }
157 if settings.main_gpu != 0 {
158 push_arg(&mut cmd, &mut parts, "--main-gpu", settings.main_gpu);
159 }
160 if settings.fit {
161 push_arg(&mut cmd, &mut parts, "--fit", "on");
162 } else {
163 push_arg(&mut cmd, &mut parts, "--fit", "off");
164 }
165
166 if let Some(ref lora) = settings.lora {
167 push_arg(&mut cmd, &mut parts, "--lora", lora.display());
168 }
169 if let Some((ref lora, scale)) = settings.lora_scaled {
170 push_arg(
171 &mut cmd,
172 &mut parts,
173 "--lora-scaled",
174 format!("{}:{}", lora.display(), scale),
175 );
176 }
177
178 let mut rpc_list = Vec::new();
179 if !settings.rpc.is_empty() {
180 rpc_list.push(settings.rpc.clone());
181 }
182 for worker in &config.rpc_workers {
183 if worker.selected {
184 rpc_list.push(format!("{}:{}", worker.ip, worker.port));
185 }
186 }
187
188 if !rpc_list.is_empty() {
189 let joined_rpc = rpc_list.join(",");
190 push_arg(&mut cmd, &mut parts, "--rpc", joined_rpc);
191 }
192
193 if settings.embedding {
194 push_flag(&mut cmd, &mut parts, "--embedding");
195 }
196
197 if settings.expert_count > 0 {
198 push_arg(
199 &mut cmd,
200 &mut parts,
201 "--override-kv",
202 format!("llama.expert_used_count=int:int:{}", settings.expert_count),
203 );
204 }
205
206 push_arg(
207 &mut cmd,
208 &mut parts,
209 "-fa",
210 if settings.flash_attn { "on" } else { "off" },
211 );
212
213 if settings.jinja {
214 push_flag(&mut cmd, &mut parts, "--jinja");
215 }
216
217 if let Some(ref template) = settings.chat_template {
218 push_arg(&mut cmd, &mut parts, "--chat-template", template);
219 }
220
221 if settings.system_prompt != "You are a helpful assistant." {
223 let escaped = settings
224 .system_prompt
225 .replace('\\', "\\\\")
226 .replace('"', "\\\"");
227 let mut merged = serde_json::Map::new();
228 if let Some(ref kwargs) = settings.chat_template_kwargs
229 && let Ok(obj) = serde_json::from_str::<serde_json::Value>(kwargs)
230 && let serde_json::Value::Object(map) = obj {
231 for (k, v) in map {
232 merged.insert(k, v);
233 }
234 }
235 merged.insert(
236 "system_prompt".to_string(),
237 serde_json::Value::String(escaped),
238 );
239 push_arg(
240 &mut cmd,
241 &mut parts,
242 "--chat-template-kwargs",
243 serde_json::to_string(&merged).unwrap(),
244 );
245 } else if let Some(ref kwargs) = settings.chat_template_kwargs {
246 push_arg(&mut cmd, &mut parts, "--chat-template-kwargs", kwargs);
247 }
248
249 if settings.seed != -1 {
251 push_arg(&mut cmd, &mut parts, "--seed", settings.seed);
252 }
253 if let Some(max_tokens) = settings.max_tokens {
254 push_arg(&mut cmd, &mut parts, "--n-predict", max_tokens);
255 }
256 push_arg(
257 &mut cmd,
258 &mut parts,
259 "--temp",
260 format!("{:.2}", settings.temperature),
261 );
262
263 push_arg(&mut cmd, &mut parts, "--top-k", settings.top_k);
264
265 push_arg(
266 &mut cmd,
267 &mut parts,
268 "--top-p",
269 format!("{:.2}", settings.top_p),
270 );
271
272 push_arg(
273 &mut cmd,
274 &mut parts,
275 "--min-p",
276 format!("{:.2}", settings.min_p),
277 );
278
279 push_arg(
280 &mut cmd,
281 &mut parts,
282 "--typical",
283 format!("{:.2}", settings.typical_p),
284 );
285
286 if settings.mirostat != Default::default() {
287 push_arg(
288 &mut cmd,
289 &mut parts,
290 "--mirostat",
291 settings.mirostat.to_string(),
292 );
293 push_arg(
294 &mut cmd,
295 &mut parts,
296 "--mirostat-lr",
297 format!("{:.2}", settings.mirostat_lr),
298 );
299 push_arg(
300 &mut cmd,
301 &mut parts,
302 "--mirostat-ent",
303 format!("{:.2}", settings.mirostat_ent),
304 );
305 }
306
307 if settings.ignore_eos {
308 push_flag(&mut cmd, &mut parts, "--ignore-eos");
309 }
310
311 if !settings.samplers.0.is_empty() {
312 push_arg(
313 &mut cmd,
314 &mut parts,
315 "--samplers",
316 settings.samplers.to_string(),
317 );
318 }
319
320 if let Some(frequency) = settings.frequency_penalty {
321 push_arg(
322 &mut cmd,
323 &mut parts,
324 "--frequency-penalty",
325 format!("{:.2}", frequency),
326 );
327 }
328
329 if settings.dry_multiplier != 0.0 {
330 push_arg(
331 &mut cmd,
332 &mut parts,
333 "--dry-multiplier",
334 format!("{:.2}", settings.dry_multiplier),
335 );
336 push_arg(
337 &mut cmd,
338 &mut parts,
339 "--dry-base",
340 format!("{:.2}", settings.dry_base),
341 );
342 push_arg(
343 &mut cmd,
344 &mut parts,
345 "--dry-allowed-length",
346 settings.dry_allowed_length,
347 );
348 push_arg(
349 &mut cmd,
350 &mut parts,
351 "--dry-penalty-last-n",
352 settings.dry_penalty_last_n,
353 );
354 }
355
356 let rope_scaling = if settings.rope_yarn_enabled {
358 RopeScaling::Yarn
359 } else {
360 settings.rope_scaling
361 };
362 if rope_scaling != Default::default() {
363 push_arg(
364 &mut cmd,
365 &mut parts,
366 "--rope-scaling",
367 rope_scaling.to_string(),
368 );
369 }
370 if settings.rope_scale != 1.0 {
371 push_arg(
372 &mut cmd,
373 &mut parts,
374 "--rope-scale",
375 format!("{:.2}", settings.rope_scale),
376 );
377 }
378 if settings.rope_freq_base != 0.0 {
379 push_arg(
380 &mut cmd,
381 &mut parts,
382 "--rope-freq-base",
383 format!("{:.2}", settings.rope_freq_base),
384 );
385 }
386 if settings.rope_freq_scale != 1.0 {
387 push_arg(
388 &mut cmd,
389 &mut parts,
390 "--rope-freq-scale",
391 format!("{:.2}", settings.rope_freq_scale),
392 );
393 }
394
395 let resolved_host = clean_host(&settings.host);
396 push_arg(&mut cmd, &mut parts, "--host", resolved_host);
397 push_arg(&mut cmd, &mut parts, "--port", settings.port);
398 push_arg(&mut cmd, &mut parts, "--timeout", settings.timeout);
399
400 push_flag(&mut cmd, &mut parts, "--metrics");
401 if !settings.cache_prompt {
402 push_flag(&mut cmd, &mut parts, "--no-cache-prompt");
403 }
404 if settings.cache_reuse != 0 {
405 push_arg(&mut cmd, &mut parts, "--cache-reuse", settings.cache_reuse);
406 }
407 if !settings.webui {
408 push_flag(&mut cmd, &mut parts, "--no-webui");
409 }
410
411 let display = parts.join(" ");
414 (cmd, display)
415}
416
417pub fn build_bench_cmd(
419 binary: &std::path::Path,
420 model: &DiscoveredModel,
421 settings: &ModelSettings,
422) -> (Command, String) {
423 let mut cmd = Command::new(binary);
424 let mut parts: Vec<String> = vec![binary.display().to_string()];
425
426 push_arg(&mut cmd, &mut parts, "-m", model.path.display());
427 push_arg(&mut cmd, &mut parts, "-t", settings.threads);
428 push_arg(&mut cmd, &mut parts, "-b", settings.batch_size);
429
430 push_gpu_layers(&mut cmd, &mut parts, settings);
431
432 if settings.flash_attn {
433 push_arg(&mut cmd, &mut parts, "-fa", "1");
434 }
435
436 push_spec_decoding(&mut cmd, &mut parts, settings);
437
438 push_flag(&mut cmd, &mut parts, "--progress");
439
440 let display = parts.join(" ");
441 (cmd, display)
442}
443
/// Everything `spawn_server` needs to resolve, configure and launch a backend process.
pub struct SpawnServerRequest<'a> {
    /// Application configuration, passed through to `build_server_cmd`.
    pub config: &'a Config,
    /// Model to run; `spawn_server` returns an error if this is `None` in `Bench` mode.
    pub model: Option<&'a DiscoveredModel>,
    /// Launch settings (backend, version, port, host and all command-line options).
    pub settings: &'a ModelSettings,
    /// Receives status messages and every stdout/stderr line of the child process.
    pub log_tx: mpsc::Sender<String>,
    /// Optional progress channel handed to the backend binary resolver.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<crate::models::DownloadState>>,
    /// Selects between server and benchmark behaviour.
    pub server_mode: crate::models::ServerMode,
    /// Forwarded to `build_server_cmd` as the router model limit.
    pub router_max_models: u32,
    /// Receives `()` once the supervising task has observed the child exiting.
    pub exit_tx: mpsc::Sender<()>,
}
456
457pub async fn spawn_server(
458 req: SpawnServerRequest<'_>,
459) -> Result<(ServerHandle, String), String> {
460 let SpawnServerRequest {
461 config,
462 model,
463 settings,
464 log_tx,
465 progress_tx,
466 server_mode,
467 router_max_models,
468 exit_tx,
469 } = req;
470 if server_mode != crate::models::ServerMode::Bench
471 && server_mode != crate::models::ServerMode::BenchTune
472 {
473 let port = settings.port;
474 if std::net::TcpListener::bind(("127.0.0.1", port)).is_err() {
476 return Err(format!("Port {} is already in use", port));
477 }
478 }
479
480 if server_mode == crate::models::ServerMode::BenchTune {
483 unreachable!("BenchTune mode must be handled before calling spawn_server")
484 }
485
486 let backend_name = if server_mode == crate::models::ServerMode::Bench {
488 "llama-bench"
489 } else {
490 "llama-server"
491 };
492 let version_display = settings.get_active_backend_version_display();
493 info!(
494 "spawn_server: backend={}, requested_version={:?}, version_display={}",
495 settings.backend,
496 settings.get_active_backend_version(),
497 version_display
498 );
499 log_tx
500 .send(format!(
501 "Resolving {} (v{}) binary...",
502 backend_name, version_display
503 ))
504 .await
505 .ok();
506 let version_param = settings.get_active_backend_version().map(|s| s.as_str());
507
508 let server_binary = match crate::backend::hub::resolve_backend_binary(
509 settings.backend,
510 version_param,
511 Some(log_tx.clone()),
512 progress_tx,
513 )
514 .await
515 {
516 Ok(path) => {
517 info!("spawn_server: resolved binary path={}", path.display());
518 path
519 }
520 Err(e) => {
521 return Err(format!("Failed to resolve backend binary: {}", e));
522 }
523 };
524
525 let binary = if server_mode == crate::models::ServerMode::Bench {
526 server_binary.parent().unwrap().join("llama-bench")
527 } else {
528 server_binary
529 };
530
531 if !binary.exists() {
532 return Err(format!("Binary not found at: {}", binary.display()));
533 }
534 #[cfg(unix)]
535 {
536 use std::os::unix::fs::PermissionsExt;
537 if let Ok(metadata) = binary.metadata()
538 && metadata.permissions().mode() & 0o111 == 0
539 {
540 return Err(format!("Binary is not executable: {}", binary.display()));
541 }
542 }
543
544 let (mut cmd, cmd_string) = if server_mode == crate::models::ServerMode::Bench {
545 if let Some(m) = model {
546 build_bench_cmd(&binary, m, settings)
547 } else {
548 return Err("Model required for benchmark".to_string());
549 }
550 } else {
551 build_server_cmd(
552 &binary,
553 model,
554 settings,
555 config,
556 server_mode,
557 router_max_models,
558 )
559 };
560
561 cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
562
563 let bin_dir = binary.parent().unwrap();
565 match std::env::consts::OS {
566 "windows" => {
567 if let Ok(current) = std::env::var("PATH") {
569 cmd.env("PATH", format!("{};{}", bin_dir.display(), current));
570 } else {
571 cmd.env("PATH", bin_dir);
572 }
573 }
574 "macos" => {
575 if let Ok(current) = std::env::var("DYLD_LIBRARY_PATH") {
577 cmd.env(
578 "DYLD_LIBRARY_PATH",
579 format!("{}:{}", bin_dir.display(), current),
580 );
581 } else {
582 cmd.env("DYLD_LIBRARY_PATH", bin_dir);
583 }
584 }
585 _ => {
586 if let Ok(current) = std::env::var("LD_LIBRARY_PATH") {
588 cmd.env(
589 "LD_LIBRARY_PATH",
590 format!("{}:{}", bin_dir.display(), current),
591 );
592 } else {
593 cmd.env("LD_LIBRARY_PATH", bin_dir);
594 }
595 }
596 }
597
598 info!("Spawning: {}", cmd_string);
599 let _ = log_tx
600 .send(format!("{}: {}", backend_name, cmd_string))
601 .await;
602 let mut child = cmd
603 .spawn()
604 .map_err(|e| format!("Failed to spawn process: {}", e))?;
605 let pid = child.id().unwrap_or(0);
606
607 let (kill_tx, mut kill_rx) = mpsc::channel(1);
608
609 let log_tx_inner = log_tx.clone();
612 let exit_tx_inner = exit_tx.clone();
613 let backend_name_upper = backend_name.to_uppercase();
614 tokio::spawn(async move {
615 let stdout = child.stdout.take().unwrap();
616 let stderr = child.stderr.take().unwrap();
617
618 let (stdout_tx, mut stdout_rx) = mpsc::channel::<String>(64);
619 let (stderr_tx, mut stderr_rx) = mpsc::channel::<String>(64);
620
621 let mut std_out = Some(tokio::spawn(async move {
623 let reader = BufReader::new(stdout).lines();
624 tokio::pin!(reader);
625 while let Ok(Some(line)) = reader.next_line().await {
626 if stdout_tx.send(line).await.is_err() {
627 break;
628 }
629 }
630 }));
631
632 let mut std_err = Some(tokio::spawn(async move {
633 let reader = BufReader::new(stderr).lines();
634 tokio::pin!(reader);
635 while let Ok(Some(line)) = reader.next_line().await {
636 if stderr_tx.send(line).await.is_err() {
637 break;
638 }
639 }
640 }));
641
642 loop {
645 tokio::select! {
646 _ = kill_rx.recv() => {
647 let _ = child.kill().await;
648 if let Some(h) = std_out.take() { let _ = h.await; }
649 if let Some(h) = std_err.take() { let _ = h.await; }
650 break;
651 }
652 line = stdout_rx.recv() => {
653 if let Some(line) = line { let _ = log_tx_inner.send(line).await; } else { break; }
654 }
655 line = stderr_rx.recv() => {
656 if let Some(line) = line { let _ = log_tx_inner.send(line).await; } else { break; }
657 }
658 else => break,
659 }
660 }
661
662 if let Some(h) = std_out.take() {
664 let _ = h.await;
665 }
666 if let Some(h) = std_err.take() {
667 let _ = h.await;
668 }
669
670 let exit_code = child.wait().await.ok().and_then(|s| s.code());
671 let _ = exit_tx_inner.send(()).await;
672 let _ = log_tx_inner
673 .send(format!(
674 "{} exited with code {:?}",
675 backend_name_upper, exit_code
676 ))
677 .await;
678 });
679
680 Ok((
681 ServerHandle {
682 port: if server_mode == crate::models::ServerMode::Bench {
683 0
684 } else {
685 settings.port
686 },
687 host: settings.host.clone(),
688 pid,
689 kill_tx,
690 },
691 cmd_string,
692 ))
693}
694
695pub async fn check_health(host: &str, port: u16) -> bool {
697 let host = clean_host(host);
698 let url = format!("http://{}:{}/health", host, port);
699 let client = reqwest::Client::builder()
700 .timeout(std::time::Duration::from_secs(1))
701 .build()
702 .unwrap_or_default();
703
704 match client.get(&url).send().await {
705 Ok(resp) => resp.status().is_success(),
706 Err(_) => false,
707 }
708}
709
710pub async fn kill_server(handle: ServerHandle) -> Result<(), String> {
712 handle
713 .kill_tx
714 .send(())
715 .await
716 .map_err(|_| "Server already stopped".to_string())
717}
718
719pub async fn get_metrics(
721 host: &str,
722 port: u16,
723 model_name: Option<&str>,
724 pid: Option<u32>,
725) -> Result<ServerMetrics, String> {
726 let host = clean_host(host);
727 let mut url = if let Some(model) = model_name {
730 let name = strip_gguf(model);
731 format!("http://{}:{}/metrics?model={}", host, port, name)
732 } else {
733 format!("http://{}:{}/metrics", host, port)
734 };
735
736 let mut resp = reqwest::get(&url)
737 .await
738 .map_err(|e| format!("Failed to get metrics: {}", e))?;
739
740 if (resp.status() == reqwest::StatusCode::NOT_FOUND
742 || resp.status() == reqwest::StatusCode::BAD_REQUEST)
743 && model_name.is_some()
744 {
745 url = format!("http://{}:{}/metrics", host, port);
746 resp = reqwest::get(&url)
747 .await
748 .map_err(|e| format!("Failed to get metrics: {}", e))?;
749 }
750
751 if !resp.status().is_success() {
752 return Err(format!("Server returned {}", resp.status()));
753 }
754
755 let text = resp
756 .text()
757 .await
758 .map_err(|e| format!("Failed to read metrics: {}", e))?;
759
760 let mut m = ServerMetrics { loaded: true, ..Default::default() };
761
762 let mut ctx_max_slots = 0u32;
763 let mut ctx_used_slots = 0u32;
764 let mut ctx_used_global = 0u32;
765 let mut ctx_max_global = 0u32;
766
767 let mut vram_used_slots = 0u64;
768 let mut vram_total_slots = 0u64;
769 let mut vram_used_global = 0u64;
770 let mut vram_total_global = 0u64;
771
772 for line in text.lines() {
773 if line.starts_with('#') || line.is_empty() {
774 continue;
775 }
776
777 let parts: Vec<&str> = line.split_whitespace().collect();
778 if parts.len() < 2 {
779 continue;
780 }
781
782 let name_with_labels = parts[0];
783 let mut val = 0.0;
784 for part in parts.iter().skip(1) {
785 if let Ok(v) = part.parse::<f64>() {
786 val = v;
787 break;
788 }
789 }
790
791 let is_slot = name_with_labels.contains("slot=\"") || name_with_labels.contains("pool=\"");
792 let name = name_with_labels
793 .split('{')
794 .next()
795 .unwrap_or(name_with_labels);
796
797 match name {
798 "llama_kv_cache_usage_bytes"
799 | "kv_cache_usage_bytes"
800 | "llama_server_kv_cache_usage_bytes"
801 | "llama_server_kv_cache_used_bytes"
802 | "llama_server_vram_used_bytes" => {
803 if is_slot {
804 vram_used_slots += val as u64;
805 } else {
806 vram_used_global = vram_used_global.max(val as u64);
807 }
808 }
809 "llama_kv_cache_total_bytes"
810 | "kv_cache_total_bytes"
811 | "llama_server_kv_cache_total_bytes"
812 | "llama_server_vram_total_bytes" => {
813 if is_slot {
814 vram_total_slots += val as u64;
815 } else {
816 vram_total_global = vram_total_global.max(val as u64);
817 }
818 }
819 "llama_model_memory_usage_bytes"
820 | "model_memory_usage_bytes"
821 | "llama_server_model_memory_usage_bytes"
822 | "llama_server_memory_usage_bytes"
823 | "llama_server_ram_usage_bytes"
824 | "llama_server_mem_used_bytes" => {
825 m.ram_used = m.ram_used.max(val as u64);
826 }
827 "llama_kv_cache_tokens_used"
828 | "kv_cache_usage_tokens"
829 | "kv_cache_tokens_used"
830 | "llama_server_kv_cache_tokens_used"
831 | "llamacpp:n_tokens_used"
832 | "llama_server_n_tokens_used"
833 | "llama_server_n_past"
834 | "llamacpp:n_past" => {
835 if is_slot {
836 ctx_used_slots += val as u32;
837 } else {
838 ctx_used_global = ctx_used_global.max(val as u32);
839 }
840 }
841 "llama_kv_cache_tokens_total"
842 | "kv_cache_total_tokens"
843 | "kv_cache_tokens_total"
844 | "llama_server_kv_cache_tokens_total"
845 | "llamacpp:n_ctx"
846 | "llamacpp:n_tokens_max"
847 | "llama_server_n_ctx"
848 | "llama_server_n_tokens_max" => {
849 if is_slot {
850 ctx_max_slots += val as u32;
851 } else {
852 ctx_max_global = ctx_max_global.max(val as u32);
853 }
854 }
855 "llama_server_cpu_usage_percentage"
856 | "cpu_usage_percentage"
857 | "llama_server_cpu_usage"
858 | "llama_server_cpu_percent" => {
859 m.cpu_usage = m.cpu_usage.max(val);
860 }
861 "llamacpp:predicted_tokens_seconds"
862 | "llama_server_predicted_tokens_seconds"
863 | "llama_server_tps" => {
864 m.tps += val;
865 }
866 "llamacpp:prompt_tokens_seconds"
867 | "llama_server_prompt_tokens_seconds"
868 | "llama_server_prompt_tps" => {
869 m.prompt_tps += val;
870 }
871 "llamacpp:kv_cache_usage_ratio" | "llama_server_kv_cache_usage_ratio" => {
872 if !is_slot && ctx_max_global > 0 {
873 ctx_used_global = ctx_used_global.max((val * ctx_max_global as f64) as u32);
874 }
875 }
876 _ => {}
877 }
878 }
879
880 m.gpu_mem_used = if vram_used_slots > 0 {
881 vram_used_slots
882 } else {
883 vram_used_global
884 };
885 m.gpu_mem_total = if vram_total_slots > 0 {
886 vram_total_slots
887 } else {
888 vram_total_global
889 };
890
891 m.ctx_used = if ctx_used_slots > 0 {
894 ctx_used_slots
895 } else {
896 ctx_used_global
897 };
898 m.ctx_max = if ctx_max_slots > 0 {
899 ctx_max_slots
900 } else {
901 ctx_max_global
902 };
903 if model_name.is_none() {
910 let set_if_better = |out: &mut ServerMetrics, used: u64, total: u64| {
914 if out.gpu_mem_used == 0 || used > out.gpu_mem_used {
915 out.gpu_mem_used = used;
916 out.gpu_mem_total = total;
917 }
918 };
919
920 let (nv_used, nv_total) = get_nvidia_vram_metrics().unwrap_or((0, 0));
921 set_if_better(&mut m, nv_used, nv_total);
922
923 if m.gpu_mem_total == 0 {
924 let (amd_used, amd_total) = get_amdgpu_vram_metrics().unwrap_or((0, 0));
926 set_if_better(&mut m, amd_used, amd_total);
927 }
928 } else if m.gpu_mem_used == 0 {
929 if let Ok((used, total)) = get_nvidia_vram_metrics() {
931 m.gpu_mem_used = used;
932 m.gpu_mem_total = total;
933 } else if let Ok((used, total)) = get_amdgpu_vram_metrics() {
934 m.gpu_mem_used = used;
935 m.gpu_mem_total = total;
936 }
937 }
938
939 if let Some(p) = pid {
941 if let Ok((ram, cpu)) = get_process_metrics(p) {
942 if m.ram_used == 0 {
943 m.ram_used = ram;
944 }
945 if m.cpu_usage == 0.0 {
946 m.cpu_usage = cpu;
947 }
948 }
949 }
950
951 Ok(m)
952}
953
/// Queries `nvidia-smi` for the used and total memory of the first listed GPU.
///
/// # Returns
/// `(used_bytes, total_bytes)`; `nvidia-smi` is asked for unit-less values, which are
/// scaled here by 1024 * 1024.
///
/// # Errors
/// Returns a message when `nvidia-smi` cannot be run, exits unsuccessfully, prints
/// nothing, or prints a line whose two leading fields are not non-negative integers
/// (for example `[N/A]`). Reporting such output as an error rather than as `(0, 0)`
/// lets callers fall back to another source.
fn get_nvidia_vram_metrics() -> Result<(u64, u64), String> {
    let output = std::process::Command::new("nvidia-smi")
        .args([
            "--query-gpu=memory.used,memory.total",
            "--format=csv,noheader,nounits",
        ])
        .output()
        .map_err(|e| e.to_string())?;

    if !output.status.success() {
        return Err("nvidia-smi failed".to_string());
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    // Only the first line (first GPU) is considered.
    let line = stdout.lines().next().ok_or("No output from nvidia-smi")?;
    parse_nvidia_smi_memory_line(line)
}

/// Parses one `used, total` CSV line of unit-less `nvidia-smi` output into bytes.
fn parse_nvidia_smi_memory_line(line: &str) -> Result<(u64, u64), String> {
    const MIB: u64 = 1024 * 1024;

    let mut fields = line.split(',').map(str::trim);
    let (Some(used), Some(total)) = (fields.next(), fields.next()) else {
        return Err("Invalid output from nvidia-smi".to_string());
    };

    let to_bytes = |field: &str| -> Result<u64, String> {
        field
            .parse::<u64>()
            .ok()
            .and_then(|mib| mib.checked_mul(MIB))
            .ok_or_else(|| format!("Invalid memory value from nvidia-smi: {:?}", field))
    };

    Ok((to_bytes(used)?, to_bytes(total)?))
}
979
980fn get_amdgpu_vram_metrics() -> Result<(u64, u64), String> {
982 let output = std::process::Command::new("amdgpu_top")
983 .args(["-d", "--json"])
984 .output()
985 .map_err(|e| e.to_string())?;
986
987 if !output.status.success() {
988 return Err("amdgpu_top failed".to_string());
989 }
990
991 let json: serde_json::Value =
992 serde_json::from_slice(&output.stdout).map_err(|e| e.to_string())?;
993
994 let devices = if json.is_array() {
996 json.as_array()
997 } else {
998 json.get("devices").and_then(|d| d.as_array())
999 };
1000
1001 if let Some(devices) = devices
1002 && let Some(device) = devices.first()
1003 {
1004 let root_used = device.get("VRAM Usage Size").and_then(|v| v.as_u64());
1007 let root_total = device.get("VRAM Size").and_then(|v| v.as_u64());
1008
1009 if let (Some(used), Some(total)) = (root_used, root_total)
1010 && total > 0
1011 {
1012 return Ok((used, total));
1013 }
1014
1015 let vram_obj = device.get("VRAM");
1017 if let Some(vram) = vram_obj {
1018 let nested_used = vram
1020 .get("Total VRAM Usage")
1021 .and_then(|v| v.get("value").or(Some(v)))
1022 .and_then(|v| v.as_u64());
1023 let nested_total = vram
1024 .get("Total VRAM")
1025 .and_then(|v| v.get("value").or(Some(v)))
1026 .and_then(|v| v.as_u64());
1027
1028 if let (Some(used), Some(total)) = (nested_used, nested_total) {
1029 let multiplier = if vram.get("Total VRAM").and_then(|v| v.get("unit")).is_some() {
1031 1024 * 1024
1032 } else {
1033 1
1034 };
1035 if total > 0 {
1036 return Ok((used * multiplier, total * multiplier));
1037 }
1038 }
1039 }
1040
1041 let vram_usage = device.get("vram_usage");
1043 if let Some(vram) = vram_usage {
1044 let used = vram
1045 .get("VRAM")
1046 .or_else(|| vram.get("usage"))
1047 .and_then(|v| v.get("value").or(Some(v)))
1048 .and_then(|v| v.as_u64())
1049 .unwrap_or(0);
1050 let total = vram
1051 .get("TotalVRAM")
1052 .or_else(|| vram.get("total"))
1053 .and_then(|v| v.get("value").or(Some(v)))
1054 .and_then(|v| v.as_u64())
1055 .unwrap_or(0);
1056
1057 if total > 0 {
1058 return Ok((used * 1024 * 1024, total * 1024 * 1024));
1059 }
1060 }
1061 }
1062
1063 Err("Could not find VRAM info in amdgpu_top output".to_string())
1064}
1065
1066fn get_process_metrics(
1070 pid: u32,
1071) -> Result<(u64, f64), String> {
1072 use std::sync::{LazyLock, Mutex};
1073 use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System};
1074
1075 static SYS: LazyLock<Mutex<System>> = LazyLock::new(|| {
1076 Mutex::new(System::new_with_specifics(
1077 RefreshKind::everything().with_processes(ProcessRefreshKind::nothing().with_cpu().with_memory()),
1078 ))
1079 });
1080
1081 let mut sys = SYS.lock().unwrap();
1082 let pids = [Pid::from(pid as usize)];
1083 sys.refresh_processes_specifics(
1084 ProcessesToUpdate::Some(&pids),
1085 true,
1086 ProcessRefreshKind::nothing().with_cpu().with_memory(),
1087 );
1088
1089 let sys_pid = Pid::from(pid as usize);
1090
1091 if let Some(process) = sys.process(sys_pid) {
1092 let ram = process.memory(); let cpu = process.cpu_usage() as f64; return Ok((ram, cpu));
1095 }
1096
1097 Err(format!("Process not found: pid={}", pid))
1098}
1099
1100pub async fn load_model(
1102 host: &str,
1103 port: u16,
1104 model_id: &str,
1105 model_path: Option<&str>,
1106) -> Result<(), String> {
1107 let client = reqwest::Client::new();
1108 let host = clean_host(host);
1109
1110 let endpoints = ["/models/load", "/v1/models/load"];
1112
1113 let mut variants = Vec::new();
1115
1116 variants.push(model_id.to_string());
1118 variants.push(strip_gguf(model_id).to_string());
1119
1120 if let Some(filename) = std::path::Path::new(model_id)
1122 .file_name()
1123 .and_then(|f| f.to_str())
1124 {
1125 variants.push(filename.to_string());
1126 variants.push(strip_gguf(filename).to_string());
1127 }
1128
1129 if let Some(path) = model_path {
1131 variants.push(path.to_string());
1132 }
1133
1134 let mut last_status = reqwest::StatusCode::OK;
1135 let mut last_error = String::new();
1136
1137 for endpoint in endpoints {
1138 let url = format!("http://{}:{}{}", host, port, endpoint);
1139 for variant in &variants {
1140 let bodies = vec![
1142 serde_json::json!({ "model": variant }),
1143 serde_json::json!({ "alias": variant }),
1144 ];
1145
1146 for body in bodies {
1147 match client.post(&url).json(&body).send().await {
1148 Ok(res) => {
1149 if res.status().is_success() {
1150 return Ok(());
1151 }
1152 last_status = res.status();
1153 last_error = res
1154 .text()
1155 .await
1156 .unwrap_or_else(|_| "Unknown error".to_string());
1157 }
1158 Err(e) => {
1159 last_error = e.to_string();
1160 }
1161 }
1162 }
1163 }
1164 }
1165
1166 Err(format!(
1167 "Failed to load model (tried {} variants). Last status {}: {}",
1168 variants.len() * 2,
1169 last_status,
1170 last_error
1171 ))
1172}
1173
1174pub async fn list_models(
1176 host: &str,
1177 port: u16,
1178) -> Result<Vec<(String, String, Option<String>)>, String> {
1179 let client = reqwest::Client::new();
1180 let host = clean_host(host);
1181 let url = format!("http://{}:{}/models", host, port);
1182
1183 let res = client
1184 .get(&url)
1185 .send()
1186 .await
1187 .map_err(|e| format!("Failed to list models: {}", e))?;
1188
1189 if !res.status().is_success() {
1190 return Err(format!("Server returned error {}", res.status()));
1191 }
1192
1193 let json: serde_json::Value = res
1194 .json()
1195 .await
1196 .map_err(|e| format!("Invalid JSON: {}", e))?;
1197
1198 let mut results = Vec::new();
1199 if let Some(data) = json.get("data").and_then(|d| d.as_array()) {
1200 for model in data {
1201 let id = model
1202 .get("id")
1203 .and_then(|v| v.as_str())
1204 .unwrap_or_default()
1205 .to_string();
1206 let status = model
1208 .get("status")
1209 .and_then(|s| s.get("value").or(Some(s)))
1210 .and_then(|v| v.as_str())
1211 .unwrap_or("unloaded")
1212 .to_string();
1213 let path = model
1214 .get("path")
1215 .or_else(|| model.get("filename"))
1216 .and_then(|v| v.as_str())
1217 .map(|s| s.to_string());
1218
1219 results.push((id, status, path));
1220 }
1221 }
1222
1223 Ok(results)
1224}
1225
1226pub async fn unload_model(
1228 host: &str,
1229 port: u16,
1230 model_id: &str,
1231 model_path: Option<&str>,
1232) -> Result<(), String> {
1233 let client = reqwest::Client::new();
1234 let host = clean_host(host);
1235
1236 let endpoints = ["/models/unload", "/v1/models/unload"];
1237 let stripped = strip_gguf(model_id);
1238 let mut variants = vec![model_id.to_string(), stripped.to_string()];
1239 if let Some(path) = model_path {
1240 variants.push(path.to_string());
1241 }
1242
1243 for endpoint in endpoints {
1244 let url = format!("http://{}:{}{}", host, port, endpoint);
1245 for variant in &variants {
1246 let body = serde_json::json!({
1247 "model": variant
1248 });
1249
1250 if let Ok(res) = client.post(&url).json(&body).send().await
1251 && res.status().is_success()
1252 {
1253 return Ok(());
1254 }
1255 }
1256 }
1257
1258 Ok(()) }