use std::path::PathBuf;
use clap::Subcommand;
use swarm_engine_core::config::PathResolver;
// Subcommands for managing a local llama-server process.
// NOTE: plain `//` comments are used deliberately — `///` doc comments on
// clap-derive items become user-visible help text and would change CLI output.
#[derive(Subcommand)]
pub enum LlamaAction {
// Launch llama-server in the background with the given model/options.
Start {
// Path to the GGUF model file (required).
#[arg(short, long)]
model: PathBuf,
// Optional LoRA adapter to apply on top of the base model.
#[arg(long)]
lora: Option<PathBuf>,
// Interface to bind; loopback by default.
#[arg(long, default_value = "127.0.0.1")]
host: String,
// TCP port for the HTTP endpoint.
#[arg(short, long, default_value = "8080")]
port: u16,
// Number of layers to offload to the GPU (99 ~ "everything").
#[arg(long, default_value = "99")]
n_gpu_layers: u32,
// Total context size in tokens; auto-derived from `parallel` when omitted.
#[arg(long)]
ctx_size: Option<u32>,
// Number of parallel request slots (continuous batching).
#[arg(long, default_value = "4")]
parallel: u32,
},
// Stop the running llama-server (via the recorded PID).
Stop,
// Show the server log file.
Logs {
// How many trailing lines to print.
#[arg(short = 'n', long, default_value = "50")]
lines: usize,
// Keep streaming new lines (tail -f).
#[arg(short, long)]
follow: bool,
},
// Report whether the server is running and responding.
Status,
}
/// Location of the PID file that tracks the running llama-server instance.
fn llama_pid_file() -> PathBuf {
    let mut path = PathResolver::user_data_dir();
    path.push("llama-server.pid");
    path
}
/// Location of the log file that captures llama-server stdout/stderr.
fn llama_log_file() -> PathBuf {
    let mut path = PathResolver::user_data_dir();
    path.push("llama-server.log");
    path
}
pub fn cmd_llama(action: LlamaAction) {
match action {
LlamaAction::Start {
model,
lora,
host,
port,
n_gpu_layers,
ctx_size,
parallel,
} => cmd_llama_start(model, lora, host, port, n_gpu_layers, ctx_size, parallel),
LlamaAction::Stop => cmd_llama_stop(),
LlamaAction::Logs { lines, follow } => cmd_llama_logs(lines, follow),
LlamaAction::Status => cmd_llama_status(),
}
}
/// Launch `llama-server` as a detached background process.
///
/// Validates the model (and optional LoRA) paths, refuses to start when a
/// previously recorded PID is still alive, then spawns `llama-server` with
/// stdout/stderr redirected to the log file and records the new PID.
///
/// Fixes vs. previous revision:
/// - restored the garbled `&parallel.to_string()` argument (was a mojibake
///   token `¶llel...` that did not compile);
/// - the LoRA path is validated up front, before any output or log-file
///   creation, matching the model-path check;
/// - the "auto-calculated" context-size note prints next to the
///   "Context size" line instead of before all other output.
fn cmd_llama_start(
    model: PathBuf,
    lora: Option<PathBuf>,
    host: String,
    port: u16,
    n_gpu_layers: u32,
    ctx_size: Option<u32>,
    parallel: u32,
) {
    // Total context is shared across slots; default to 1024 tokens per slot.
    let auto_ctx = ctx_size.is_none();
    let ctx_size = ctx_size.unwrap_or(1024 * parallel);

    // Fail fast on bad paths before announcing anything or touching files.
    if !model.exists() {
        eprintln!("Model file not found: {}", model.display());
        std::process::exit(1);
    }
    if let Some(ref lora_path) = lora {
        if !lora_path.exists() {
            eprintln!("LoRA adapter not found: {}", lora_path.display());
            std::process::exit(1);
        }
    }

    // Refuse to start a second instance while the recorded PID is alive.
    let pid_file = llama_pid_file();
    if pid_file.exists() {
        if let Ok(pid_str) = std::fs::read_to_string(&pid_file) {
            if let Ok(pid) = pid_str.trim().parse::<u32>() {
                // `kill -0` probes process existence without sending a signal.
                let check = std::process::Command::new("kill")
                    .args(["-0", &pid.to_string()])
                    .status();
                if check.map(|s| s.success()).unwrap_or(false) {
                    eprintln!("llama-server is already running (PID: {})", pid);
                    eprintln!("Use 'swarm-engine llama stop' to stop it first");
                    std::process::exit(1);
                }
            }
        }
    }

    let data_dir = PathResolver::user_data_dir();
    if !data_dir.exists() {
        std::fs::create_dir_all(&data_dir).expect("Failed to create data directory");
    }
    let log_file = llama_log_file();

    println!("Starting llama-server...");
    println!(" Model: {}", model.display());
    if let Some(ref lora_path) = lora {
        println!(" LoRA: {}", lora_path.display());
    }
    println!(" Host: {}:{}", host, port);
    println!(" GPU layers: {}", n_gpu_layers);
    println!(" Context size: {}", ctx_size);
    if auto_ctx {
        println!(" (auto-calculated: 1024 * {} slots)", parallel);
    }
    println!(" Parallel slots: {}", parallel);
    println!(" Continuous batching: enabled");
    println!(" Log file: {}", log_file.display());

    let mut cmd = std::process::Command::new("llama-server");
    cmd.args([
        "-m",
        model.to_str().expect("model path is not valid UTF-8"),
        "--host",
        &host,
        "--port",
        &port.to_string(),
        "-ngl",
        &n_gpu_layers.to_string(),
        "-c",
        &ctx_size.to_string(),
        "-np",
        &parallel.to_string(),
        "--cont-batching",
    ]);
    if let Some(lora_path) = lora {
        cmd.args([
            "--lora",
            lora_path.to_str().expect("LoRA path is not valid UTF-8"),
        ]);
    }

    // Redirect both streams to the log file so the server can run detached.
    let log = std::fs::File::create(&log_file).expect("Failed to create log file");
    let log_err = log.try_clone().expect("Failed to clone log file");
    cmd.stdout(std::process::Stdio::from(log));
    cmd.stderr(std::process::Stdio::from(log_err));

    match cmd.spawn() {
        Ok(child) => {
            let pid = child.id();
            std::fs::write(&pid_file, pid.to_string()).expect("Failed to write PID file");
            println!("\nllama-server started (PID: {})", pid);
            println!("Endpoint: http://{}:{}", host, port);
            println!("\nUse 'swarm-engine llama logs -f' to follow the logs");
            println!("Use 'swarm-engine llama status' to check status");
        }
        Err(e) => {
            eprintln!("Failed to start llama-server: {}", e);
            eprintln!("\nMake sure llama-server is installed and in your PATH.");
            eprintln!("Install: brew install llama.cpp (macOS) or build from source");
            std::process::exit(1);
        }
    }
}
/// Stop the llama-server tracked by the PID file, then clean the file up.
fn cmd_llama_stop() {
    let pid_file = llama_pid_file();
    if !pid_file.exists() {
        println!("llama-server is not running (no PID file found)");
        return;
    }

    // Read and parse the recorded PID; a corrupt file is removed and reported.
    let pid_str = std::fs::read_to_string(&pid_file).unwrap_or_else(|e| {
        eprintln!("Failed to read PID file: {}", e);
        std::process::exit(1);
    });
    let Ok(pid) = pid_str.trim().parse::<u32>() else {
        eprintln!("Invalid PID in file: {}", pid_str);
        let _ = std::fs::remove_file(&pid_file);
        std::process::exit(1);
    };

    println!("Stopping llama-server (PID: {})...", pid);
    // Send SIGTERM via `kill`; a non-zero exit means the process was gone.
    match std::process::Command::new("kill")
        .arg(pid.to_string())
        .status()
    {
        Ok(status) => {
            let _ = std::fs::remove_file(&pid_file);
            if status.success() {
                println!("llama-server stopped");
            } else {
                println!("llama-server was not running");
            }
        }
        Err(e) => {
            eprintln!("Failed to stop llama-server: {}", e);
            std::process::exit(1);
        }
    }
}
/// Print the tail of the llama-server log, optionally streaming (`tail -f`).
fn cmd_llama_logs(lines: usize, follow: bool) {
    let log_file = llama_log_file();
    if !log_file.exists() {
        println!("No log file found. Start llama-server first.");
        return;
    }

    let count = lines.to_string();
    if follow {
        // Inherit stdout so `tail -f` streams until the user interrupts.
        if let Err(e) = std::process::Command::new("tail")
            .args(["-f", "-n", &count])
            .arg(&log_file)
            .status()
        {
            eprintln!("Failed to tail log file: {}", e);
        }
    } else {
        // One-shot: capture tail's output and forward it verbatim.
        match std::process::Command::new("tail")
            .args(["-n", &count])
            .arg(&log_file)
            .output()
        {
            Ok(o) => print!("{}", String::from_utf8_lossy(&o.stdout)),
            Err(e) => eprintln!("Failed to read log file: {}", e),
        }
    }
}
// Report whether the tracked llama-server process is alive and, if so,
// whether its HTTP /health endpoint responds.
fn cmd_llama_status() {
let pid_file = llama_pid_file();
if !pid_file.exists() {
println!("llama-server: stopped (no PID file)");
return;
}
// Unreadable or corrupt PID files are reported but not removed here.
let pid_str = match std::fs::read_to_string(&pid_file) {
Ok(s) => s,
Err(_) => {
println!("llama-server: unknown (failed to read PID file)");
return;
}
};
let pid: u32 = match pid_str.trim().parse() {
Ok(p) => p,
Err(_) => {
println!("llama-server: unknown (invalid PID)");
return;
}
};
// `kill -0` probes process existence without delivering a signal.
let check = std::process::Command::new("kill")
.args(["-0", &pid.to_string()])
.status();
if check.map(|s| s.success()).unwrap_or(false) {
println!("llama-server: running (PID: {})", pid);
// Blocking wrapper around a single async health request.
let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime");
let healthy = rt.block_on(async {
let client = reqwest::Client::new();
// NOTE(review): the probe URL hard-codes localhost:8080, but Start
// accepts arbitrary --host/--port; a server started elsewhere will
// report "not responding" even when healthy. Fixing this requires
// persisting the endpoint alongside the PID — confirm and follow up.
match client
.get("http://localhost:8080/health")
.timeout(std::time::Duration::from_secs(2))
.send()
.await
{
Ok(resp) => resp.status().is_success(),
Err(_) => false,
}
});
if healthy {
println!("Health: OK (http://localhost:8080)");
} else {
// The server keeps loading large models after the process starts.
println!("Health: not responding (might still be loading model)");
}
let log_file = llama_log_file();
if log_file.exists() {
println!("Log file: {}", log_file.display());
}
} else {
// Process is gone; remove the stale PID file (best-effort).
println!("llama-server: stopped (process not found)");
let _ = std::fs::remove_file(&pid_file);
}
}