use std::time::Duration;
use serde::{Deserialize, Serialize};
use crate::runtime::{humantime_serde_ms, CircuitBreakerConfig, JitterKind, RuntimeConfig, RuntimeKind};
/// Declarative description of one model deployment.
///
/// Optional fields are left out of the serialized form when they are `None`, and every field
/// other than `name` and `model` may be omitted from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Deployment {
    /// Deployment name; [`Deployment::validate`] rejects an empty string.
    pub name: String,

    /// Model identifier; [`Deployment::validate`] rejects an empty string, and
    /// [`Deployment::effective_runtime`] uses it as the last-resort input for runtime inference.
    pub model: String,

    /// Explicit runtime selection. Takes precedence over everything else in
    /// [`Deployment::effective_runtime`].
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub runtime: Option<RuntimeKind>,

    /// Runtime-specific configuration. When `runtime` is unset, its kind decides the runtime.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub runtime_config: Option<RuntimeConfig>,

    /// Optional GPU count.
    // NOTE(review): whether this is per replica or for the whole deployment is not visible
    // here — confirm against the consumer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gpus: Option<u32>,

    /// Number of replicas. Defaults to 1 when absent; [`Deployment::validate`] rejects 0.
    #[serde(default = "default_replicas")]
    pub replicas: u32,

    /// Serving settings; `Serving::default()` is used when the key is absent.
    #[serde(default)]
    pub serving: Serving,

    /// Optional spend budget.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub budget: Option<Budget>,

    /// Idempotency flag. Defaults to `true` when absent.
    // NOTE(review): presumably controls whether requests may be safely retried — confirm.
    #[serde(default = "default_idempotent")]
    pub idempotent: bool,
}
/// Serde default for `Deployment::replicas`: a single replica.
fn default_replicas() -> u32 {
    const DEFAULT_REPLICAS: u32 = 1;
    DEFAULT_REPLICAS
}
/// Serde default for `Deployment::idempotent`: deployments are idempotent unless stated otherwise.
fn default_idempotent() -> bool {
    const DEFAULT_IDEMPOTENT: bool = true;
    DEFAULT_IDEMPOTENT
}
impl Deployment {
    /// Resolves the runtime this deployment runs on.
    ///
    /// Precedence, highest first:
    /// 1. the explicit `runtime` field,
    /// 2. the kind reported by `runtime_config`,
    /// 3. `crate::registry::infer_runtime` applied to `model`.
    pub fn effective_runtime(&self) -> RuntimeKind {
        if let Some(explicit) = &self.runtime {
            return explicit.clone();
        }
        match &self.runtime_config {
            Some(config) => RuntimeConfig::runtime_kind(config),
            None => crate::registry::infer_runtime(&self.model),
        }
    }

    /// Checks the structural invariants of the deployment.
    ///
    /// # Errors
    ///
    /// Returns the first violation found, checked in this order:
    /// - [`DeploymentValidationError::EmptyName`] when `name` is empty,
    /// - [`DeploymentValidationError::EmptyModel`] when `model` is empty,
    /// - [`DeploymentValidationError::ZeroReplicas`] when `replicas` is 0.
    pub fn validate(&self) -> Result<(), DeploymentValidationError> {
        if self.name.is_empty() {
            Err(DeploymentValidationError::EmptyName)
        } else if self.model.is_empty() {
            Err(DeploymentValidationError::EmptyModel)
        } else if self.replicas == 0 {
            Err(DeploymentValidationError::ZeroReplicas)
        } else {
            Ok(())
        }
    }
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Serving {
pub max_concurrent: u32,
pub on_capacity_exhausted: CapacityPolicy,
}
impl Default for Serving {
fn default() -> Self {
Self {
max_concurrent: 32,
on_capacity_exhausted: CapacityPolicy::Queue,
}
}
}
/// What to do with a request once serving capacity is exhausted.
///
/// Serialized in `snake_case`: `reject`, `queue`, `fallback`.
// NOTE(review): the per-variant descriptions below are inferred from the variant names; the
// code that acts on them is not in this file — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CapacityPolicy {
    /// Presumably refuses the request.
    Reject,
    /// Presumably holds the request until capacity frees up. This is the `Serving` default.
    Queue,
    /// Presumably routes the request to an alternative target.
    Fallback,
}
/// One replica of a deployment.
// NOTE(review): field meanings below are inferred from their names; nothing in this file
// constructs or reads a `Replica` — confirm against the code that does.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Replica {
    /// Deployment identifier (presumably `Deployment::name`).
    pub deployment: String,

    /// Index of this replica.
    pub replica_index: u32,

    /// Node for the replica, if any.
    pub node: Option<String>,

    /// GPU indices associated with the replica.
    pub gpu_indices: Vec<u32>,
}
/// Optional rate limits.
///
/// Every field may be omitted from the input; unset limits are `None` and are skipped when
/// serializing. The derived `Default` has no limits set and `strict == false`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RateLimits {
    /// Requests-per-minute limit, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub requests_per_minute: Option<u64>,

    /// Tokens-per-minute limit, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokens_per_minute: Option<u64>,

    /// Limit on concurrent requests, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrent_requests: Option<u32>,

    /// Strictness flag; `false` when absent from the input.
    // NOTE(review): what "strict" enforces is decided by the consumer, not visible here.
    #[serde(default)]
    pub strict: bool,
}
/// Retry and backoff settings.
///
/// The container-level `#[serde(default)]` lets a partially specified policy deserialize: any
/// field missing from the input is taken from this type's `Default` implementation instead of
/// failing with a "missing field" error. Fully specified inputs deserialize exactly as before,
/// and serialization is unchanged.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct RetryPolicy {
    /// Maximum number of retries.
    pub max_retries: u32,

    /// Backoff before the first retry. (De)serialized through `humantime_serde_ms`.
    #[serde(with = "humantime_serde_ms")]
    pub initial_backoff: Duration,

    /// Upper bound on the backoff. (De)serialized through `humantime_serde_ms`.
    #[serde(with = "humantime_serde_ms")]
    pub max_backoff: Duration,

    /// Backoff multiplier.
    // NOTE(review): presumably applied between successive attempts (exponential backoff) —
    // confirm against the retry loop.
    pub backoff_multiplier: f64,

    /// Jitter strategy.
    pub jitter: JitterKind,

    /// Flag for respecting a retry-after hint.
    // NOTE(review): presumably a server-provided `Retry-After` — confirm.
    pub respect_retry_after: bool,
}
impl Default for RetryPolicy {
fn default() -> Self {
Self {
max_retries: 3,
initial_backoff: Duration::from_millis(1_000),
max_backoff: Duration::from_millis(60_000),
backoff_multiplier: 2.0,
jitter: JitterKind::Equal,
respect_retry_after: true,
}
}
}
/// Request timeout settings.
///
/// The container-level `#[serde(default)]` lets a partially specified table deserialize: any
/// field missing from the input is taken from this type's `Default` implementation instead of
/// failing with a "missing field" error. Fully specified inputs deserialize exactly as before,
/// and serialization is unchanged.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct Timeouts {
    /// Request timeout. (De)serialized through `humantime_serde_ms`.
    // NOTE(review): presumably the budget for a whole request — confirm against the client.
    #[serde(with = "humantime_serde_ms")]
    pub request_timeout: Duration,

    /// Read timeout. (De)serialized through `humantime_serde_ms`.
    // NOTE(review): presumably the maximum wait for a single read — confirm against the client.
    #[serde(with = "humantime_serde_ms")]
    pub read_timeout: Duration,
}
impl Default for Timeouts {
fn default() -> Self {
Self {
request_timeout: Duration::from_millis(30_000),
read_timeout: Duration::from_millis(10_000),
}
}
}
/// Spend budget.
///
/// Both caps are optional and skipped when serializing if unset; `on_exceeded` is required.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Budget {
    /// Hourly spend cap in USD, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_spend_per_hour_usd: Option<f64>,

    /// Daily spend cap in USD, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_spend_per_day_usd: Option<f64>,

    /// Action to take when the budget is exceeded.
    pub on_exceeded: BudgetAction,
}
/// Action taken when a `Budget` is exceeded.
///
/// Serialized in `snake_case`: `reject`, `warn`, `throttle`.
// NOTE(review): the per-variant descriptions below are inferred from the variant names; the
// enforcement code is not in this file — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BudgetAction {
    /// Presumably refuses further requests.
    Reject,
    /// Presumably only reports the overrun.
    Warn,
    /// Presumably slows request handling down.
    Throttle,
}
pub use crate::runtime::CircuitBreakerConfig as CircuitBreakerConfigAlias;
/// Reasons a deployment definition can be rejected.
#[derive(Debug, thiserror::Error)]
pub enum DeploymentValidationError {
    /// `Deployment::name` is empty.
    #[error("deployment name must not be empty")]
    EmptyName,

    /// `Deployment::model` is empty.
    #[error("deployment model must not be empty")]
    EmptyModel,

    /// `Deployment::replicas` is 0.
    #[error("deployment must have at least one replica")]
    ZeroReplicas,

    /// Rate limits exceed a known provider tier; the payload is appended to the message.
    /// `Deployment::validate` as written in this file does not return this variant.
    #[error("rate limits exceed known provider tier: {0}")]
    RateLimitTooHigh(String),
}
/// No-op that mentions `CircuitBreakerConfig` in a signature.
// NOTE(review): presumably exists only so the `CircuitBreakerConfig` import counts as used;
// it has no body and `dead_code` is allowed on it.
#[allow(dead_code)]
fn _ensure_cb_visible(_: CircuitBreakerConfig) {}