use clap::{Parser, Subcommand};
use colored::Colorize;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
mod commands;
mod models;
use commands::{benchmark, chat, download, info, list, quantize, serve};
// Top-level command-line definition for the `ruvllm` binary.
//
// Plain `//` comments are used here on purpose: clap's derive macro turns `///`
// doc comments into help text, which would change the program's `--help` output.
#[derive(Parser)]
#[command(
    name = "ruvllm",
    author,
    version,
    about,
    long_about = None,
    propagate_version = true
)]
struct Cli {
    // `-v` / `--verbose`, accepted at any subcommand level (global).
    #[arg(short = 'v', long = "verbose", global = true)]
    verbose: bool,

    // `--no-color`, accepted at any subcommand level (global).
    #[arg(long = "no-color", global = true)]
    no_color: bool,

    // `--cache-dir <DIR>`; falls back to the `RUVLLM_CACHE_DIR` environment variable.
    #[arg(long = "cache-dir", global = true, env = "RUVLLM_CACHE_DIR")]
    cache_dir: Option<String>,

    // The subcommand selected by the user.
    #[command(subcommand)]
    command: Commands,
}
// Subcommands of the CLI. Each variant's fields are forwarded to the matching
// `commands::<name>::run` function by `main`.
//
// Plain `//` comments are used here on purpose: clap's derive macro turns `///`
// doc comments into help text, which would change the program's `--help` output.
#[derive(Subcommand)]
enum Commands {
    // `download` (alias `dl`).
    #[command(alias = "dl")]
    Download {
        // Positional model argument.
        model: String,
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
        #[arg(short, long)]
        force: bool,
        #[arg(long)]
        revision: Option<String>,
    },

    // `list` (alias `ls`).
    #[command(alias = "ls")]
    List {
        #[arg(short, long)]
        downloaded: bool,
        #[arg(short, long)]
        long: bool,
    },

    // `info`.
    Info {
        // Positional model argument.
        model: String,
    },

    // `serve`.
    Serve {
        // Positional model argument.
        model: String,
        // Long-only on purpose: a short form would collide with clap's `-h` help flag.
        #[arg(long, default_value = "127.0.0.1")]
        host: String,
        #[arg(short, long, default_value_t = 8080)]
        port: u16,
        #[arg(long, default_value_t = 4)]
        max_concurrent: usize,
        #[arg(long, default_value_t = 4096)]
        max_context: usize,
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
    },

    // `chat`.
    Chat {
        // Positional model argument.
        model: String,
        #[arg(short, long)]
        system: Option<String>,
        #[arg(long, default_value_t = 512)]
        max_tokens: usize,
        #[arg(short, long, default_value_t = 0.7)]
        temperature: f32,
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
        #[arg(long)]
        speculative: Option<String>,
        #[arg(long, default_value_t = 4)]
        speculative_lookahead: usize,
    },

    // `benchmark` (alias `bench`).
    #[command(alias = "bench")]
    Benchmark {
        // Positional model argument.
        model: String,
        #[arg(long, default_value_t = 3)]
        warmup: usize,
        #[arg(short, long, default_value_t = 10)]
        iterations: usize,
        #[arg(long, default_value_t = 128)]
        prompt_length: usize,
        #[arg(long, default_value_t = 64)]
        gen_length: usize,
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
        #[arg(long, default_value = "text")]
        format: String,
    },

    // `quantize` (alias `quant`). Unlike the other subcommands, the model is
    // passed as an option (`-m` / `--model`) rather than positionally.
    #[command(alias = "quant")]
    Quantize {
        #[arg(short, long)]
        model: String,
        // Defaults to the empty string when not given.
        #[arg(short, long, default_value = "")]
        output: String,
        #[arg(short, long, default_value = "q4_k_m")]
        quant: String,

        // The three switches below default to `true`. With clap's implicit
        // `SetTrue` action for `bool` fields a default of `true` can never be
        // turned off, so they use `ArgAction::Set` with an optional value:
        //   (absent)        -> true
        //   --flag          -> true   (kept for backward compatibility)
        //   --flag=false    -> false
        //   --flag false    -> false
        #[arg(
            long,
            action = clap::ArgAction::Set,
            num_args = 0..=1,
            default_value_t = true,
            default_missing_value = "true"
        )]
        ane_optimize: bool,
        #[arg(
            long,
            action = clap::ArgAction::Set,
            num_args = 0..=1,
            default_value_t = true,
            default_missing_value = "true"
        )]
        keep_embed_fp16: bool,
        #[arg(
            long,
            action = clap::ArgAction::Set,
            num_args = 0..=1,
            default_value_t = true,
            default_missing_value = "true"
        )]
        keep_output_fp16: bool,

        // NOTE(review): this shares its name and `--verbose` long flag with the
        // global `verbose` option on `Cli`; confirm the two interact as intended.
        #[arg(long)]
        verbose: bool,
    },
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
let log_level = if cli.verbose { "debug" } else { "info" };
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| log_level.into()),
)
.with(tracing_subscriber::fmt::layer().with_target(false))
.init();
if cli.no_color {
colored::control::set_override(false);
}
let cache_dir = cli.cache_dir.unwrap_or_else(|| {
dirs::cache_dir()
.unwrap_or_else(|| std::path::PathBuf::from("."))
.join("ruvllm")
.to_string_lossy()
.to_string()
});
let result = match cli.command {
Commands::Download {
model,
quantization,
force,
revision,
} => {
download::run(
&model,
&quantization,
force,
revision.as_deref(),
&cache_dir,
)
.await
}
Commands::List { downloaded, long } => list::run(downloaded, long, &cache_dir).await,
Commands::Info { model } => info::run(&model, &cache_dir).await,
Commands::Serve {
model,
host,
port,
max_concurrent,
max_context,
quantization,
} => {
serve::run(
&model,
&host,
port,
max_concurrent,
max_context,
&quantization,
&cache_dir,
)
.await
}
Commands::Chat {
model,
system,
max_tokens,
temperature,
quantization,
speculative,
speculative_lookahead,
} => {
chat::run(
&model,
system.as_deref(),
max_tokens,
temperature,
&quantization,
&cache_dir,
speculative.as_deref(),
speculative_lookahead,
)
.await
}
Commands::Benchmark {
model,
warmup,
iterations,
prompt_length,
gen_length,
quantization,
format,
} => {
benchmark::run(
&model,
warmup,
iterations,
prompt_length,
gen_length,
&quantization,
&format,
&cache_dir,
)
.await
}
Commands::Quantize {
model,
output,
quant,
ane_optimize,
keep_embed_fp16,
keep_output_fp16,
verbose,
} => {
quantize::run(
&model,
&output,
&quant,
ane_optimize,
keep_embed_fp16,
keep_output_fp16,
verbose,
&cache_dir,
)
.await
}
};
if let Err(e) = result {
eprintln!("{} {}", "Error:".red().bold(), e);
std::process::exit(1);
}
Ok(())
}