/// Inference server subcommands (plan/run).
///
/// `apr serve plan` computes a VRAM budget and throughput estimates and runs
/// contract verification before starting a server. `apr serve run` launches
/// the server.
#[derive(Subcommand, Debug)]
pub enum ServeCommands {
    /// Pre-flight inference capacity plan (VRAM budget, roofline, contracts)
    ///
    /// Inspects model metadata, detects GPU hardware, and produces a capacity
    /// plan showing whether the model fits in VRAM with the requested batch size.
    /// No weights are loaded; inspection is header-only.
    ///
    /// Accepts local files (.gguf, .apr, .safetensors) or HuggingFace repo IDs
    /// (hf://org/repo or org/repo). For HF repos, only the ~2 KB config.json is
    /// fetched; no weight download is needed.
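    ///
    /// Illustrative invocation (repo ID is a placeholder; flags as defined below):
    /// apr serve plan hf://org/repo --gpu --batch-size 8 --format json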
    Plan {
        /// Model source: local path or HuggingFace repo (hf://org/repo, org/repo)
        #[arg(value_name = "MODEL")]
        model: String,
        /// Detect GPU via nvidia-smi for VRAM budget
        #[arg(long)]
        gpu: bool,
        /// Target batch size for throughput estimation
        #[arg(long, default_value = "1")]
        batch_size: usize,
        /// Sequence length for KV cache estimation
        #[arg(long, default_value = "4096")]
        seq_len: usize,
        /// Output format: text, json, yaml
        #[arg(long, default_value = "text")]
        format: String,
        /// Quantization override for HF models (e.g., Q4_K_M, Q6_K, F16)
        #[arg(long)]
        quant: Option<String>,
    },
    /// Start inference server (REST API, streaming, metrics)
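    ///
    /// Illustrative invocation (model filename is a placeholder; flags as defined below):
    /// apr serve run model.gguf --gpu --batch --port 8080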
    Run {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Port to listen on
        #[arg(short, long, default_value = "8080")]
        port: u16,
        /// Host to bind to
        #[arg(long, default_value = "127.0.0.1")]
        host: String,
        /// Disable CORS
        #[arg(long)]
        no_cors: bool,
        /// Disable Prometheus metrics endpoint
        #[arg(long)]
        no_metrics: bool,
        /// Disable GPU acceleration
        #[arg(long)]
        no_gpu: bool,
        /// Force GPU acceleration (requires CUDA)
        #[arg(long)]
        gpu: bool,
        /// Enable batched GPU inference for 2X+ throughput
        #[arg(long)]
        batch: bool,
        /// Enable inference tracing (PMAT-SHOWCASE-METHODOLOGY-001)
        #[arg(long)]
        trace: bool,
        /// Trace detail level (none, basic, layer)
        #[arg(long, value_name = "LEVEL", default_value = "basic")]
        trace_level: String,
        /// Enable inline Roofline profiling (adds X-Profile headers)
        #[arg(long)]
        profile: bool,
        /// PMAT-332: Compute backend override (cuda, cpu, wgpu)
        #[arg(long, value_name = "BACKEND")]
        backend: Option<String>,
        /// PMAT-485: OTLP endpoint for distributed tracing export (Jaeger/Tempo)
        ///
        /// When set, inference spans (W3C Trace Context) are exported via OTLP.
        /// Each request = parent span, each layer = child span with TensorStats.
        /// Example: --otlp-endpoint http://localhost:4317
        #[arg(long, value_name = "URL")]
        otlp_endpoint: Option<String>,
        /// GH-286: Max context/sequence length for KV cache. Lower = less RSS.
        #[arg(long, default_value = "4096")]
        context_length: usize,
        /// GH-286: Skip FP8 weight cache warmup. Saves ~1.5 GB RSS.
        #[arg(long)]
        no_fp8_cache: bool,
    },
}
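
// A minimal sketch, not part of the original source: it shows how `ServeCommands`
// could be exercised through clap's `try_parse_from`. The wrapper struct
// `ServeCliSketch` and the program name "apr-serve" are hypothetical; the real
// binary wires `ServeCommands` into its own top-level parser.
#[cfg(test)]
mod serve_commands_sketch {
    use super::*;
    use clap::Parser;

    /// Hypothetical top-level parser used only for this sketch.
    #[derive(Parser, Debug)]
    struct ServeCliSketch {
        #[command(subcommand)]
        command: ServeCommands,
    }

    #[test]
    fn plan_parses_hf_repo_with_overrides() {
        // Parse a `plan` invocation against a placeholder HF repo ID.
        let cli = ServeCliSketch::try_parse_from([
            "apr-serve", "plan", "hf://org/repo",
            "--gpu", "--batch-size", "8", "--format", "json",
        ])
        .expect("plan args should parse");

        match cli.command {
            ServeCommands::Plan { model, gpu, batch_size, seq_len, format, .. } => {
                assert_eq!(model, "hf://org/repo");
                assert!(gpu);
                assert_eq!(batch_size, 8);
                assert_eq!(seq_len, 4096); // default from the arg definition
                assert_eq!(format, "json");
            }
            other => panic!("expected Plan, got {other:?}"),
        }
    }
}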