Enum ServeCommands

Source

/// Inference server subcommands (plan/run).
///
/// apr serve plan computes VRAM budget, throughput estimates, and contract
/// verification before starting a server. apr serve run launches the server.
pub enum ServeCommands {
    /// Pre-flight inference capacity plan (VRAM budget, roofline, contracts)
    ///
    /// Inspects model metadata, detects GPU hardware, and produces a capacity
    /// plan showing whether the model fits in VRAM with the requested batch
    /// size. No weights are loaded — header-only inspection.
    ///
    /// Accepts local files (.gguf, .apr, .safetensors) or HuggingFace repo IDs
    /// (hf://org/repo or org/repo). For HF repos, only the ~2KB config.json is
    /// fetched — no weight download needed.
    Plan {
        /// Model source: local path or HuggingFace repo (hf://org/repo, org/repo)
        model: String,
        /// Detect GPU via nvidia-smi for VRAM budget
        gpu: bool,
        /// Target batch size for throughput estimation
        batch_size: usize,
        /// Sequence length for KV cache estimation
        seq_len: usize,
        /// Output format: text, json, yaml
        format: String,
        /// Quantization override for HF models (e.g., Q4_K_M, Q6_K, F16)
        quant: Option<String>,
    },
    /// Start inference server (REST API, streaming, metrics)
    Run {
        /// Path to model file
        file: PathBuf,
        /// Port to listen on
        port: u16,
        /// Host to bind to
        host: String,
        /// Disable CORS
        no_cors: bool,
        /// Disable Prometheus metrics endpoint
        no_metrics: bool,
        /// Disable GPU acceleration
        no_gpu: bool,
        /// Force GPU acceleration (requires CUDA)
        gpu: bool,
        /// Enable batched GPU inference for 2X+ throughput
        batch: bool,
        /// Enable inference tracing (PMAT-SHOWCASE-METHODOLOGY-001)
        trace: bool,
        /// Trace detail level (none, basic, layer)
        trace_level: String,
        /// Enable inline Roofline profiling (adds X-Profile headers)
        profile: bool,
        /// PMAT-332: Compute backend override (cuda, cpu, wgpu)
        backend: Option<String>,
        /// PMAT-485: OTLP endpoint for distributed tracing export (Jaeger/Tempo)
        ///
        /// When set, inference spans (W3C Trace Context) are exported via OTLP.
        /// Each request = parent span, each layer = child span with TensorStats.
        /// Example: --otlp-endpoint http://localhost:4317
        otlp_endpoint: Option<String>,
        /// GH-286: Max context/sequence length for KV cache. Lower = less RSS.
        context_length: usize,
        /// GH-286: Skip FP8 weight cache warmup. Saves ~1.5 GB RSS.
        no_fp8_cache: bool,
    },
}

Expand description

Inference server subcommands (plan/run).

`apr serve plan` computes VRAM budget, throughput estimates, and contract verification before starting a server. `apr serve run` launches the server.

Variants§

§

Plan

Pre-flight inference capacity plan (VRAM budget, roofline, contracts)

Inspects model metadata, detects GPU hardware, and produces a capacity plan showing whether the model fits in VRAM with the requested batch size. No weights are loaded — header-only inspection.

Accepts local files (.gguf, .apr, .safetensors) or HuggingFace repo IDs (hf://org/repo or org/repo). For HF repos, only the ~2KB config.json is fetched — no weight download needed.

Fields

§model: String

Model source: local path or HuggingFace repo (hf://org/repo, org/repo)

§gpu: bool

Detect GPU via nvidia-smi for VRAM budget

§batch_size: usize

Target batch size for throughput estimation

§seq_len: usize

Sequence length for KV cache estimation

§format: String

Output format: text, json, yaml

§quant: Option<String>

Quantization override for HF models (e.g., Q4_K_M, Q6_K, F16)

§

Run

Start inference server (REST API, streaming, metrics)

Fields

§file: PathBuf

Path to model file

§port: u16

Port to listen on

§host: String

Host to bind to

§no_cors: bool

Disable CORS

§no_metrics: bool

Disable Prometheus metrics endpoint

§no_gpu: bool

Disable GPU acceleration

§gpu: bool

Force GPU acceleration (requires CUDA)

§batch: bool

Enable batched GPU inference for 2X+ throughput

§trace: bool

Enable inference tracing (PMAT-SHOWCASE-METHODOLOGY-001)

§trace_level: String

Trace detail level (none, basic, layer)

§profile: bool

Enable inline Roofline profiling (adds X-Profile headers)

§backend: Option<String>

PMAT-332: Compute backend override (cuda, cpu, wgpu)

§otlp_endpoint: Option<String>

PMAT-485: OTLP endpoint for distributed tracing export (Jaeger/Tempo)

When set, inference spans (W3C Trace Context) are exported via OTLP. Each request = parent span, each layer = child span with TensorStats. Example: `--otlp-endpoint http://localhost:4317`

§context_length: usize

GH-286: Max context/sequence length for KV cache. Lower = less RSS.

§no_fp8_cache: bool

GH-286: Skip FP8 weight cache warmup. Saves ~1.5 GB RSS.

ServeCommands

Enum ServeCommands

Variants§

Plan

Fields

Run

Fields

Trait Implementations§

impl Debug for ServeCommands

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl FromArgMatches for ServeCommands

fn from_arg_matches(__clap_arg_matches: &ArgMatches) -> Result<Self, Error>

fn from_arg_matches_mut(__clap_arg_matches: &mut ArgMatches) -> Result<Self, Error>

fn update_from_arg_matches(&mut self, __clap_arg_matches: &ArgMatches) -> Result<(), Error>

fn update_from_arg_matches_mut<'b>(&mut self, __clap_arg_matches: &mut ArgMatches) -> Result<(), Error>

impl Subcommand for ServeCommands

fn augment_subcommands<'b>(__clap_app: Command) -> Command

fn augment_subcommands_for_update<'b>(__clap_app: Command) -> Command

fn has_subcommand(__clap_name: &str) -> bool

Auto Trait Implementations§

impl Freeze for ServeCommands

impl RefUnwindSafe for ServeCommands

impl Send for ServeCommands

impl Sync for ServeCommands

impl Unpin for ServeCommands

impl UnsafeUnpin for ServeCommands

impl UnwindSafe for ServeCommands

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> PolicyExt for T where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P> where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P> where T: Policy<B, E>, P: Policy<B, E>,

impl<T> Same for T

type Output = T

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> Allocation for T where T: RefUnwindSafe + Send + Sync,

Enum ServeCommands

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,