//! apr_cli/serve_commands.rs
2/// Inference server subcommands (plan/run).
3///
4/// `apr serve plan` computes VRAM budget, throughput estimates, and contract
5/// verification before starting a server. `apr serve run` launches the server.
6#[derive(Subcommand, Debug)]
7pub enum ServeCommands {
8 /// Pre-flight inference capacity plan (VRAM budget, roofline, contracts)
9 ///
10 /// Inspects model metadata, detects GPU hardware, and produces a capacity
11 /// plan showing whether the model fits in VRAM with the requested batch size.
12 /// No weights are loaded — header-only inspection.
13 ///
14 /// Accepts local files (.gguf, .apr, .safetensors) or HuggingFace repo IDs
15 /// (hf://org/repo or org/repo). For HF repos, only the ~2KB config.json is
16 /// fetched — no weight download needed.
17 Plan {
18 /// Model source: local path or HuggingFace repo (hf://org/repo, org/repo)
19 #[arg(value_name = "MODEL")]
20 model: String,
21 /// Detect GPU via nvidia-smi for VRAM budget
22 #[arg(long)]
23 gpu: bool,
24 /// Target batch size for throughput estimation
25 #[arg(long, default_value = "1")]
26 batch_size: usize,
27 /// Sequence length for KV cache estimation
28 #[arg(long, default_value = "4096")]
29 seq_len: usize,
30 /// Output format: text, json, yaml
31 #[arg(long, default_value = "text")]
32 format: String,
33 /// Quantization override for HF models (e.g., Q4_K_M, Q6_K, F16)
34 #[arg(long)]
35 quant: Option<String>,
36 },
37 /// Start inference server (REST API, streaming, metrics)
38 Run {
39 /// Path to model file
40 #[arg(value_name = "FILE")]
41 file: PathBuf,
42 /// Port to listen on
43 #[arg(short, long, default_value = "8080")]
44 port: u16,
45 /// Host to bind to
46 #[arg(long, default_value = "127.0.0.1")]
47 host: String,
48 /// Disable CORS
49 #[arg(long)]
50 no_cors: bool,
51 /// Disable Prometheus metrics endpoint
52 #[arg(long)]
53 no_metrics: bool,
54 /// Disable GPU acceleration
55 #[arg(long)]
56 no_gpu: bool,
57 /// Force GPU acceleration (requires CUDA)
58 #[arg(long)]
59 gpu: bool,
60 /// Enable batched GPU inference for 2X+ throughput
61 #[arg(long)]
62 batch: bool,
63 /// Enable inference tracing (PMAT-SHOWCASE-METHODOLOGY-001)
64 #[arg(long)]
65 trace: bool,
66 /// Trace detail level (none, basic, layer)
67 #[arg(long, value_name = "LEVEL", default_value = "basic")]
68 trace_level: String,
69 /// Enable inline Roofline profiling (adds X-Profile headers)
70 #[arg(long)]
71 profile: bool,
72 },
73}