//! apr_cli/serve_commands.rs — `apr serve` subcommand definitions.
1
2/// Inference server subcommands (plan/run).
3///
4/// `apr serve plan` computes VRAM budget, throughput estimates, and contract
5/// verification before starting a server. `apr serve run` launches the server.
6#[derive(Subcommand, Debug)]
7pub enum ServeCommands {
8    /// Pre-flight inference capacity plan (VRAM budget, roofline, contracts)
9    ///
10    /// Inspects model metadata, detects GPU hardware, and produces a capacity
11    /// plan showing whether the model fits in VRAM with the requested batch size.
12    /// No weights are loaded — header-only inspection.
13    ///
14    /// Accepts local files (.gguf, .apr, .safetensors) or HuggingFace repo IDs
15    /// (hf://org/repo or org/repo). For HF repos, only the ~2KB config.json is
16    /// fetched — no weight download needed.
17    Plan {
18        /// Model source: local path or HuggingFace repo (hf://org/repo, org/repo)
19        #[arg(value_name = "MODEL")]
20        model: String,
21        /// Detect GPU via nvidia-smi for VRAM budget
22        #[arg(long)]
23        gpu: bool,
24        /// Target batch size for throughput estimation
25        #[arg(long, default_value = "1")]
26        batch_size: usize,
27        /// Sequence length for KV cache estimation
28        #[arg(long, default_value = "4096")]
29        seq_len: usize,
30        /// Output format: text, json, yaml
31        #[arg(long, default_value = "text")]
32        format: String,
33        /// Quantization override for HF models (e.g., Q4_K_M, Q6_K, F16)
34        #[arg(long)]
35        quant: Option<String>,
36    },
37    /// Start inference server (REST API, streaming, metrics)
38    Run {
39        /// Path to model file
40        #[arg(value_name = "FILE")]
41        file: PathBuf,
42        /// Port to listen on
43        #[arg(short, long, default_value = "8080")]
44        port: u16,
45        /// Host to bind to
46        #[arg(long, default_value = "127.0.0.1")]
47        host: String,
48        /// Disable CORS
49        #[arg(long)]
50        no_cors: bool,
51        /// Disable Prometheus metrics endpoint
52        #[arg(long)]
53        no_metrics: bool,
54        /// Disable GPU acceleration
55        #[arg(long)]
56        no_gpu: bool,
57        /// Force GPU acceleration (requires CUDA)
58        #[arg(long)]
59        gpu: bool,
60        /// Enable batched GPU inference for 2X+ throughput
61        #[arg(long)]
62        batch: bool,
63        /// Enable inference tracing (PMAT-SHOWCASE-METHODOLOGY-001)
64        #[arg(long)]
65        trace: bool,
66        /// Trace detail level (none, basic, layer)
67        #[arg(long, value_name = "LEVEL", default_value = "basic")]
68        trace_level: String,
69        /// Enable inline Roofline profiling (adds X-Profile headers)
70        #[arg(long)]
71        profile: bool,
72    },
73}