1use std::time::Duration;
5
6use serde::{Deserialize, Serialize};
7
8use crate::runtime::{humantime_serde_ms, CircuitBreakerConfig, JitterKind, RuntimeConfig, RuntimeKind};
9
/// Declarative description of a single model deployment.
///
/// Derives serde `Serialize`/`Deserialize`. Optional fields are omitted from the
/// serialized form when they are `None`; `Deployment::validate` checks the
/// invariants that serde itself does not enforce.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Deployment {
    /// Deployment name. `validate` rejects an empty string.
    pub name: String,
    /// Model identifier. `validate` rejects an empty string, and
    /// `effective_runtime` passes it to `crate::registry::infer_runtime` when
    /// neither `runtime` nor `runtime_config` is set.
    pub model: String,
    /// Explicit runtime choice; takes precedence in `effective_runtime`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub runtime: Option<RuntimeKind>,
    /// Runtime-specific configuration. When `runtime` is `None`,
    /// `effective_runtime` uses `RuntimeConfig::runtime_kind` of this value.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub runtime_config: Option<RuntimeConfig>,
    /// NOTE(review): presumably the number of GPUs requested; whether this is
    /// per replica or in total is not established here — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpus: Option<u32>,
    /// Replica count. Defaults to `default_replicas()` (1) when absent from the
    /// input; `validate` rejects 0.
    #[serde(default = "default_replicas")]
    pub replicas: u32,
    /// Serving settings; `Serving::default()` is used when the key is absent
    /// from the input.
    #[serde(default)]
    pub serving: Serving,
    /// Optional spend budget for this deployment.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub budget: Option<Budget>,
    /// Defaults to `default_idempotent()` (`true`) when absent from the input.
    /// NOTE(review): presumably marks requests as safe to retry — confirm
    /// against the code that consumes this flag.
    #[serde(default = "default_idempotent")]
    pub idempotent: bool,
}
41
/// Serde fallback for `Deployment::replicas`: a deployment that does not
/// specify a replica count gets exactly one replica.
fn default_replicas() -> u32 {
    const SINGLE_REPLICA: u32 = 1;
    SINGLE_REPLICA
}
/// Serde fallback for `Deployment::idempotent`: the flag is `true` unless the
/// input explicitly says otherwise.
fn default_idempotent() -> bool {
    const IDEMPOTENT_UNLESS_STATED: bool = true;
    IDEMPOTENT_UNLESS_STATED
}
48
49impl Deployment {
50 pub fn effective_runtime(&self) -> RuntimeKind {
53 self.runtime
54 .clone()
55 .or_else(|| self.runtime_config.as_ref().map(RuntimeConfig::runtime_kind))
56 .unwrap_or_else(|| crate::registry::infer_runtime(&self.model))
57 }
58
59 pub fn validate(&self) -> Result<(), DeploymentValidationError> {
63 if self.name.is_empty() {
64 return Err(DeploymentValidationError::EmptyName);
65 }
66 if self.model.is_empty() {
67 return Err(DeploymentValidationError::EmptyModel);
68 }
69 if self.replicas == 0 {
70 return Err(DeploymentValidationError::ZeroReplicas);
71 }
72 Ok(())
73 }
74}
75
76#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct Serving {
79 pub max_concurrent: u32,
82 pub on_capacity_exhausted: CapacityPolicy,
83}
84
85impl Default for Serving {
86 fn default() -> Self {
87 Self {
88 max_concurrent: 32,
89 on_capacity_exhausted: CapacityPolicy::Queue,
90 }
91 }
92}
93
/// Policy stored in `Serving::on_capacity_exhausted`.
///
/// Serialized in `snake_case`: `"reject"`, `"queue"`, `"fallback"`.
///
/// NOTE(review): the behavior behind each variant is implemented outside this
/// block; the names suggest refuse / enqueue / reroute — confirm with the
/// code that matches on this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CapacityPolicy {
    Reject,
    Queue,
    Fallback,
}
101
/// Record describing one replica: a deployment name, an index, an optional
/// node, and a list of GPU indices.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Replica {
    /// NOTE(review): presumably matches `Deployment::name` — confirm.
    pub deployment: String,
    /// Index of this replica.
    /// NOTE(review): presumably in `0..Deployment::replicas` — confirm.
    pub replica_index: u32,
    /// Node the replica is associated with, if any. No `skip_serializing_if`
    /// is set, so `None` is still written out on serialization.
    pub node: Option<String>,
    /// GPU indices associated with this replica.
    /// NOTE(review): presumably device indices local to `node` — confirm.
    pub gpu_indices: Vec<u32>,
}
111
/// Optional rate-limit settings.
///
/// Every field may be omitted from the input; the derived `Default` leaves all
/// limits as `None` and `strict` as `false`. Limits that are `None` are
/// omitted from the serialized form.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RateLimits {
    /// Request limit per minute, if set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub requests_per_minute: Option<u64>,
    /// Token limit per minute, if set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokens_per_minute: Option<u64>,
    /// Limit on concurrent requests, if set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrent_requests: Option<u32>,
    /// Defaults to `false` when absent from the input.
    /// NOTE(review): what "strict" enforcement changes is not visible in this
    /// block — confirm with the limiter that reads it.
    #[serde(default)]
    pub strict: bool,
}
128
/// Retry settings: attempt count, backoff bounds and growth, jitter, and
/// whether to respect a retry-after hint.
///
/// No field carries a serde default, so every field must be present when
/// deserializing. Both durations are (de)serialized through
/// `crate::runtime::humantime_serde_ms`.
/// NOTE(review): the wire format of that helper (presumably milliseconds) is
/// defined outside this block — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryPolicy {
    /// Maximum number of retries.
    pub max_retries: u32,
    /// Initial backoff duration.
    #[serde(with = "humantime_serde_ms")]
    pub initial_backoff: Duration,
    /// Upper bound for the backoff duration.
    #[serde(with = "humantime_serde_ms")]
    pub max_backoff: Duration,
    /// NOTE(review): presumably the factor applied to the backoff after each
    /// retry — confirm with the retry loop.
    pub backoff_multiplier: f64,
    /// Jitter strategy, as defined by `crate::runtime::JitterKind`.
    pub jitter: JitterKind,
    /// NOTE(review): presumably whether a server-provided retry-after value
    /// overrides the computed backoff — confirm.
    pub respect_retry_after: bool,
}
141
142impl Default for RetryPolicy {
143 fn default() -> Self {
144 Self {
145 max_retries: 3,
146 initial_backoff: Duration::from_millis(1_000),
147 max_backoff: Duration::from_millis(60_000),
148 backoff_multiplier: 2.0,
149 jitter: JitterKind::Equal,
150 respect_retry_after: true,
151 }
152 }
153}
154
/// Timeout settings.
///
/// No field carries a serde default, so both must be present when
/// deserializing. Both durations are (de)serialized through
/// `crate::runtime::humantime_serde_ms`.
/// NOTE(review): the wire format of that helper (presumably milliseconds) is
/// defined outside this block — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Timeouts {
    /// NOTE(review): presumably the budget for a whole request — confirm.
    #[serde(with = "humantime_serde_ms")]
    pub request_timeout: Duration,
    /// NOTE(review): presumably the budget for a single read — confirm.
    #[serde(with = "humantime_serde_ms")]
    pub read_timeout: Duration,
}
164
165impl Default for Timeouts {
166 fn default() -> Self {
167 Self {
168 request_timeout: Duration::from_millis(30_000),
169 read_timeout: Duration::from_millis(10_000),
170 }
171 }
172}
173
/// Spend budget attached to a deployment through `Deployment::budget`.
///
/// Both caps are optional and omitted from the serialized form when `None`;
/// `on_exceeded` has no serde default and must be present in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Budget {
    /// Hourly spend cap in USD, if set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_spend_per_hour_usd: Option<f64>,
    /// Daily spend cap in USD, if set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_spend_per_day_usd: Option<f64>,
    /// Action to take once a cap is exceeded.
    pub on_exceeded: BudgetAction,
}
184
/// Action stored in `Budget::on_exceeded`.
///
/// Serialized in `snake_case`: `"reject"`, `"warn"`, `"throttle"`.
///
/// NOTE(review): the behavior behind each variant is implemented outside this
/// block — confirm with the code that matches on this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BudgetAction {
    Reject,
    Warn,
    Throttle,
}
192
/// Re-export of `crate::runtime::CircuitBreakerConfig` from this module under
/// the name `CircuitBreakerConfigAlias`.
pub use crate::runtime::CircuitBreakerConfig as CircuitBreakerConfigAlias;
196
/// Errors describing an invalid `Deployment`.
///
/// `Display` text comes from the `#[error(...)]` attributes (via `thiserror`).
#[derive(Debug, thiserror::Error)]
pub enum DeploymentValidationError {
    /// `Deployment::name` is empty.
    #[error("deployment name must not be empty")]
    EmptyName,
    /// `Deployment::model` is empty.
    #[error("deployment model must not be empty")]
    EmptyModel,
    /// `Deployment::replicas` is 0.
    #[error("deployment must have at least one replica")]
    ZeroReplicas,
    /// Carries a message that is interpolated into the error text.
    /// NOTE(review): `Deployment::validate` as written here never returns this
    /// variant; presumably it is produced by a rate-limit check elsewhere —
    /// confirm.
    #[error("rate limits exceed known provider tier: {0}")]
    RateLimitTooHigh(String),
}
208
// No-op that accepts a `CircuitBreakerConfig` by value; it is not called in the
// visible code, hence `allow(dead_code)`.
// NOTE(review): presumably it exists only so the `CircuitBreakerConfig` import
// counts as used — confirm before removing either one.
#[allow(dead_code)]
fn _ensure_cb_visible(_c: CircuitBreakerConfig) {}