pub trait ModelRunner: Send + Sync {
// Required methods
fn execute<'life0, 'async_trait>(
&'life0 mut self,
batch: ExecuteBatch,
) -> Pin<Box<dyn Future<Output = InferenceResult<RunHandle>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait;
fn rebuild_session<'life0, 'async_trait>(
&'life0 mut self,
cause: SessionRebuildCause,
) -> Pin<Box<dyn Future<Output = InferenceResult<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait;
fn runtime_kind(&self) -> RuntimeKind;
fn transport_kind(&self) -> TransportKind;
// Provided methods
fn load_weights<'life0, 'life1, 'async_trait>(
&'life0 mut self,
_ctx: Option<&'life1 CudaContextHandle>,
_source: WeightSource,
) -> Pin<Box<dyn Future<Output = InferenceResult<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn gil_pinned(&self) -> bool { ... }
fn rate_limits(&self) -> Option<&RateLimits> { ... }
fn estimate_cost_usd(&self, _batch: &ExecuteBatch) -> f64 { ... }
}

Required Methods
fn execute<'life0, 'async_trait>(
    &'life0 mut self,
    batch: ExecuteBatch,
) -> Pin<Box<dyn Future<Output = InferenceResult<RunHandle>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
Run an inference. For local runtimes, dispatches kernels; for
remote runtimes, sends an HTTP request. Returns immediately;
completion is observed via the returned `RunHandle` stream.
fn rebuild_session<'life0, 'async_trait>(
    &'life0 mut self,
    cause: SessionRebuildCause,
) -> Pin<Box<dyn Future<Output = InferenceResult<()>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
Local runtimes rebuild after CUDA context poison; remote runtimes rebuild after auth failure or config change.
fn runtime_kind(&self) -> RuntimeKind
fn transport_kind(&self) -> TransportKind
Provided Methods
fn load_weights<'life0, 'life1, 'async_trait>(
    &'life0 mut self,
    _ctx: Option<&'life1 CudaContextHandle>,
    _source: WeightSource,
) -> Pin<Box<dyn Future<Output = InferenceResult<()>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
Local runtimes load weights to GPU; remote runtimes default to a no-op.
fn gil_pinned(&self) -> bool
fn rate_limits(&self) -> Option<&RateLimits>
Rate-limit metadata. Returns `None` for local runtimes; remote
runtimes return their configured limits so the
`RateLimiterActor` can be initialized at deploy time.
fn estimate_cost_usd(&self, _batch: &ExecuteBatch) -> f64
Best-effort cost estimate for the given batch (USD). Used by
TieredRouter-style actors and budget enforcement. Local
runtimes default to 0 (compute cost is amortized).