// atomr_infer_core/deployment.rs
//! `Deployment` value object — the shared declarative surface for every
//! local-GPU and remote-network backend (doc §11.1, §11.3).
4use std::time::Duration;
5
6use serde::{Deserialize, Serialize};
7
8use crate::runtime::{humantime_serde_ms, CircuitBreakerConfig, JitterKind, RuntimeConfig, RuntimeKind};
9
/// A model deployment. The `runtime` field selects the backend; every
/// other field has a runtime-agnostic interpretation. Local deployments
/// fill `gpus`; remote deployments leave it `None` and use `serving`'s
/// `max_concurrent` instead.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Deployment {
    /// Deployment identifier; `validate` rejects an empty name.
    pub name: String,
    /// Model name; when `runtime` is omitted, `effective_runtime` uses
    /// this string to infer a backend. `validate` rejects an empty model.
    pub model: String,
    /// Optional explicit runtime. When omitted, `infer_runtime` picks
    /// based on the model name (doc §3.2).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub runtime: Option<RuntimeKind>,
    /// Backend-specific configuration. When omitted, defaults are used.
    /// Also consulted by `effective_runtime` (after `runtime`, before
    /// model-name inference).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub runtime_config: Option<RuntimeConfig>,
    /// Local-only: number of GPUs per replica.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpus: Option<u32>,
    /// Number of replicas (local: HA + scale-out; remote: independent
    /// worker pools, possibly different API keys). Deserializes to 1
    /// when absent; `validate` rejects 0.
    #[serde(default = "default_replicas")]
    pub replicas: u32,
    /// Serving capacity; falls back to `Serving::default()` when absent.
    #[serde(default)]
    pub serving: Serving,
    /// Optional spend ceilings (doc §11.6).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub budget: Option<Budget>,
    /// True for normal LLM inference; false to disable retries on
    /// non-idempotent stateful APIs (doc §12.3). Deserializes to true
    /// when absent.
    #[serde(default = "default_idempotent")]
    pub idempotent: bool,
}
41
/// Serde default for `Deployment::replicas`: a single replica.
fn default_replicas() -> u32 { 1 }
/// Serde default for `Deployment::idempotent`: retries allowed.
fn default_idempotent() -> bool { true }
48
49impl Deployment {
50    /// Effective runtime kind: explicit override wins, otherwise infer
51    /// from the model name (doc §3.2).
52    pub fn effective_runtime(&self) -> RuntimeKind {
53        self.runtime
54            .clone()
55            .or_else(|| self.runtime_config.as_ref().map(RuntimeConfig::runtime_kind))
56            .unwrap_or_else(|| crate::registry::infer_runtime(&self.model))
57    }
58
59    /// Cheap structural validation done at deploy time. Heavier checks
60    /// (provider tier limits, network egress) live in `inference-runtime`
61    /// where we can perform IO.
62    pub fn validate(&self) -> Result<(), DeploymentValidationError> {
63        if self.name.is_empty() {
64            return Err(DeploymentValidationError::EmptyName);
65        }
66        if self.model.is_empty() {
67            return Err(DeploymentValidationError::EmptyModel);
68        }
69        if self.replicas == 0 {
70            return Err(DeploymentValidationError::ZeroReplicas);
71        }
72        Ok(())
73    }
74}
75
/// Per-deployment serving capacity (doc §11.2).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Serving {
    /// For remote: worker pool size; for local: maximum in-flight
    /// requests on the engine. Doc §3.5 (capacity bottleneck).
    pub max_concurrent: u32,
    /// Policy chosen when `max_concurrent` is fully in use; see
    /// `CapacityPolicy`. Required on the wire (no serde default here —
    /// defaulting happens at the `Serving` level via `Default`).
    pub on_capacity_exhausted: CapacityPolicy,
}
84
85impl Default for Serving {
86    fn default() -> Self {
87        Self {
88            max_concurrent: 32,
89            on_capacity_exhausted: CapacityPolicy::Queue,
90        }
91    }
92}
93
/// What to do when a deployment's serving capacity is exhausted.
/// Serialized in snake_case (`"reject"` / `"queue"` / `"fallback"`).
/// NOTE(review): enforcement is not in this module — semantics below
/// are inferred from variant names; confirm against the runtime.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CapacityPolicy {
    /// `"reject"` — presumably turn the request away immediately.
    Reject,
    /// `"queue"` — wait for capacity; this is the `Serving::default` choice.
    Queue,
    /// `"fallback"` — presumably route to an alternative target (target
    /// resolution not visible from this module).
    Fallback,
}
101
/// Replica metadata (doc §7.2). Owned by the `DeploymentManagerActor`;
/// the `Deployment` itself doesn't carry placements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Replica {
    /// Name of the owning `Deployment`.
    pub deployment: String,
    /// Index within the deployment's replica set.
    pub replica_index: u32,
    /// Placement node, when assigned. `None` presumably means
    /// not-yet-placed or a remote backend — confirm with the manager.
    pub node: Option<String>,
    /// GPU indices assigned to this replica; likely empty for remote
    /// backends (which don't use `Deployment::gpus`).
    pub gpu_indices: Vec<u32>,
}
111
/// Provider-imposed rate limits (doc §3.5). Per `(provider, api_key,
/// model)`. Cluster-distributed via `RateLimiterActor`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RateLimits {
    /// Requests-per-minute ceiling; `None` = unlimited/unconfigured.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub requests_per_minute: Option<u64>,
    /// Tokens-per-minute ceiling; `None` = unlimited/unconfigured.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokens_per_minute: Option<u64>,
    /// Concurrent-request ceiling; `None` = unlimited/unconfigured.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrent_requests: Option<u32>,
    /// True selects `StrictRateLimiterActor` (cluster singleton, exact
    /// accounting). False selects the approximate CRDT-backed variant.
    /// Doc §12.1. Deserializes to false when absent.
    #[serde(default)]
    pub strict: bool,
}
128
/// Retry policy applied inside `RemoteWorkerActor` (doc §3.5, §12.3).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryPolicy {
    /// Maximum number of retries (exact attempt accounting lives in the
    /// worker, not here).
    pub max_retries: u32,
    /// Delay before the first retry. Serialized via `humantime_serde_ms`
    /// — presumably milliseconds on the wire; confirm in `runtime`.
    #[serde(with = "humantime_serde_ms")]
    pub initial_backoff: Duration,
    /// Upper bound on any single backoff delay.
    #[serde(with = "humantime_serde_ms")]
    pub max_backoff: Duration,
    /// Factor applied to the delay between consecutive attempts.
    pub backoff_multiplier: f64,
    /// Jitter strategy applied to computed delays.
    pub jitter: JitterKind,
    /// Whether provider-supplied retry-after hints take precedence over
    /// the computed backoff — assumed from the name; honoring logic is
    /// in the worker.
    pub respect_retry_after: bool,
}
141
142impl Default for RetryPolicy {
143    fn default() -> Self {
144        Self {
145            max_retries: 3,
146            initial_backoff: Duration::from_millis(1_000),
147            max_backoff: Duration::from_millis(60_000),
148            backoff_multiplier: 2.0,
149            jitter: JitterKind::Equal,
150            respect_retry_after: true,
151        }
152    }
153}
154
/// Per-request network timeouts. Both fields serialize via
/// `humantime_serde_ms`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Timeouts {
    /// Time from send to first byte received.
    #[serde(with = "humantime_serde_ms")]
    pub request_timeout: Duration,
    /// For streaming responses, time between consecutive bytes.
    #[serde(with = "humantime_serde_ms")]
    pub read_timeout: Duration,
}
164
165impl Default for Timeouts {
166    fn default() -> Self {
167        Self {
168            request_timeout: Duration::from_millis(30_000),
169            read_timeout: Duration::from_millis(10_000),
170        }
171    }
172}
173
/// Spend ceilings; enforced by `MetricsActor` + `RemoteEngineCoreActor`
/// (doc §11.6, §12.4).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Budget {
    /// Hourly spend ceiling in USD; `None` = no hourly cap.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_spend_per_hour_usd: Option<f64>,
    /// Daily spend ceiling in USD; `None` = no daily cap.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_spend_per_day_usd: Option<f64>,
    /// Action when a ceiling is crossed. Required on the wire — no
    /// serde default.
    pub on_exceeded: BudgetAction,
}
184
/// What to do when a `Budget` ceiling is exceeded. Serialized in
/// snake_case. NOTE(review): enforcement lives in the actors named on
/// `Budget`; variant glosses below are inferred from the names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BudgetAction {
    /// `"reject"` — presumably refuse further requests.
    Reject,
    /// `"warn"` — presumably keep serving but surface a warning.
    Warn,
    /// `"throttle"` — presumably keep serving at a reduced rate.
    Throttle,
}
192
/// Re-export so callers that only depend on `inference-core` can also
/// see the circuit-breaker config without reaching into `runtime`.
/// Aliased because the plain name is already imported privately at the
/// top of this file; re-exporting it unaliased would clash with that
/// import.
pub use crate::runtime::CircuitBreakerConfig as CircuitBreakerConfigAlias;
196
/// Structural problems with a `Deployment`. The first three variants
/// are produced by `Deployment::validate`; `RateLimitTooHigh` is not
/// constructed anywhere in this module — presumably reserved for the
/// heavier IO-capable checks in `inference-runtime`.
#[derive(Debug, thiserror::Error)]
pub enum DeploymentValidationError {
    #[error("deployment name must not be empty")]
    EmptyName,
    #[error("deployment model must not be empty")]
    EmptyModel,
    #[error("deployment must have at least one replica")]
    ZeroReplicas,
    /// Carries a human-readable description of the offending limit.
    #[error("rate limits exceed known provider tier: {0}")]
    RateLimitTooHigh(String),
}
208
// NOTE(review): this no-op does not make `CircuitBreakerConfig`
// importable (only a `pub use` can do that); what it actually does is
// consume the file-level private import of `CircuitBreakerConfig`,
// which would otherwise be unused — the re-export above names the type
// by full path. The doc's examples (e.g. §11.2) put
// `CircuitBreakerConfig` next to `RetryPolicy`.
#[allow(dead_code)]
fn _ensure_cb_visible(_c: CircuitBreakerConfig) {}