Skip to main content

rust_supervisor/spec/
supervisor.rs

1//! Supervisor declaration model.
2//!
3//! This module owns the root and nested supervisor specification shape used by
4//! tree construction and runtime startup.
5
6use crate::error::types::SupervisorError;
7use crate::id::types::{ChildId, SupervisorPath};
8use crate::spec::child::{BackoffPolicy, ChildSpec, HealthPolicy, RestartPolicy, ShutdownPolicy};
9use schemars::JsonSchema;
10use serde::{Deserialize, Serialize};
11use std::collections::HashSet;
12use std::time::Duration;
13
/// Strategy used when a child exits and a restart scope is needed.
///
/// The value is carried by [`SupervisorSpec`], [`GroupStrategy`], and
/// [`ChildStrategyOverride`]. [`SupervisorSpec::root`] selects
/// [`SupervisionStrategy::OneForOne`] as the supervisor-level default.
/// The type derives `Serialize`, `Deserialize`, and `JsonSchema`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
pub enum SupervisionStrategy {
    /// Restart only the failed child.
    OneForOne,
    /// Restart every child under the same supervisor.
    OneForAll,
    /// Restart the failed child and all children declared after it.
    RestForOne,
}
24
/// Policy used when a restart scope cannot remain local.
///
/// The policy is always optional where it is stored: it appears as an
/// `Option<EscalationPolicy>` on the supervisor specification, on group
/// strategies, on child overrides, and on the merged execution plan.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EscalationPolicy {
    /// Escalate the failure to the parent supervisor.
    EscalateToParent,
    /// Shut down the current supervisor tree.
    ShutdownTree,
    /// Quarantine the selected restart scope.
    QuarantineScope,
}
35
/// Restart allowance that can be attached to a supervisor, a group, or a
/// single child override.
///
/// The constructor performs no checks; a budget with a zero count or a zero
/// window is only rejected later, when the owning supervisor specification is
/// validated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RestartBudget {
    /// Upper bound on restarts counted inside one window.
    pub max_restarts: u32,
    /// Length of the window over which restarts are counted.
    pub window: Duration,
}

impl RestartBudget {
    /// Builds a restart budget from its two components.
    ///
    /// # Arguments
    ///
    /// - `max_restarts`: Number of restarts tolerated within `window`.
    /// - `window`: Span of time over which restarts are counted.
    ///
    /// # Returns
    ///
    /// Returns a [`RestartBudget`] holding exactly the supplied values.
    pub fn new(max_restarts: u32, window: Duration) -> Self {
        RestartBudget {
            max_restarts,
            window,
        }
    }
}
63
64/// Strategy and governance overrides for a named child group.
65#[derive(Debug, Clone, PartialEq, Eq)]
66pub struct GroupStrategy {
67    /// Low-cardinality group tag shared by children.
68    pub group: String,
69    /// Restart strategy applied inside the group.
70    pub strategy: SupervisionStrategy,
71    /// Optional restart budget for this group.
72    pub restart_budget: Option<RestartBudget>,
73    /// Optional escalation policy for this group.
74    pub escalation_policy: Option<EscalationPolicy>,
75}
76
77impl GroupStrategy {
78    /// Creates a group strategy.
79    ///
80    /// # Arguments
81    ///
82    /// - `group`: Child tag that identifies the restart group.
83    /// - `strategy`: Restart strategy applied to the group.
84    ///
85    /// # Returns
86    ///
87    /// Returns a [`GroupStrategy`] with no budget or escalation override.
88    pub fn new(group: impl Into<String>, strategy: SupervisionStrategy) -> Self {
89        Self {
90            group: group.into(),
91            strategy,
92            restart_budget: None,
93            escalation_policy: None,
94        }
95    }
96}
97
98/// Per-child strategy and governance override.
99#[derive(Debug, Clone, PartialEq, Eq)]
100pub struct ChildStrategyOverride {
101    /// Child identifier that owns the override.
102    pub child_id: ChildId,
103    /// Restart strategy used when this child fails.
104    pub strategy: SupervisionStrategy,
105    /// Optional restart budget for this child.
106    pub restart_budget: Option<RestartBudget>,
107    /// Optional escalation policy for this child.
108    pub escalation_policy: Option<EscalationPolicy>,
109}
110
111impl ChildStrategyOverride {
112    /// Creates a child strategy override.
113    ///
114    /// # Arguments
115    ///
116    /// - `child_id`: Child identifier that owns the override.
117    /// - `strategy`: Restart strategy used for the child.
118    ///
119    /// # Returns
120    ///
121    /// Returns a [`ChildStrategyOverride`] value.
122    pub fn new(child_id: ChildId, strategy: SupervisionStrategy) -> Self {
123        Self {
124            child_id,
125            strategy,
126            restart_budget: None,
127            escalation_policy: None,
128        }
129    }
130}
131
/// Policy that governs adding children to a supervisor at runtime.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DynamicSupervisorPolicy {
    /// Master switch for runtime child additions.
    pub enabled: bool,
    /// Cap on declared plus dynamic children; `None` means no cap.
    pub child_limit: Option<usize>,
}

impl DynamicSupervisorPolicy {
    /// Builds an enabled policy with no child cap.
    ///
    /// # Arguments
    ///
    /// This function has no arguments.
    ///
    /// # Returns
    ///
    /// Returns a policy with `enabled == true` and `child_limit == None`.
    pub fn unbounded() -> Self {
        DynamicSupervisorPolicy {
            enabled: true,
            child_limit: None,
        }
    }

    /// Builds an enabled policy with a child cap.
    ///
    /// # Arguments
    ///
    /// - `child_limit`: Largest allowed total of declared and dynamic children.
    ///
    /// # Returns
    ///
    /// Returns a policy with `enabled == true` and the given cap.
    pub fn limited(child_limit: usize) -> Self {
        Self {
            child_limit: Some(child_limit),
            ..Self::unbounded()
        }
    }

    /// Decides whether one more dynamic child may be added.
    ///
    /// # Arguments
    ///
    /// - `current_child_count`: Declared plus dynamic children present now.
    ///
    /// # Returns
    ///
    /// Returns `true` when the policy is enabled and either has no cap or
    /// the current count is strictly below the cap.
    pub fn allows_addition(&self, current_child_count: usize) -> bool {
        // A disabled policy refuses every addition regardless of the cap.
        if !self.enabled {
            return false;
        }
        match self.child_limit {
            Some(limit) => current_child_count < limit,
            None => true,
        }
    }
}
190
/// Restart plan selected after strategy, group, and child overrides are merged.
///
/// This is a plain data carrier: every field is public and the type has no
/// inherent methods in this module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StrategyExecutionPlan {
    /// Child whose exit triggered the plan.
    pub failed_child: ChildId,
    /// Strategy selected for this execution.
    pub strategy: SupervisionStrategy,
    /// Child identifiers selected for restart.
    pub scope: Vec<ChildId>,
    /// Optional group that constrained the scope.
    pub group: Option<String>,
    /// Optional restart budget selected for the plan.
    pub restart_budget: Option<RestartBudget>,
    /// Optional escalation policy selected for the plan.
    pub escalation_policy: Option<EscalationPolicy>,
    /// Whether dynamic supervisor additions are allowed.
    pub dynamic_supervisor_enabled: bool,
}
209
/// Declarative specification for one supervisor node.
///
/// Use [`SupervisorSpec::root`] to obtain a specification with default
/// settings and [`SupervisorSpec::validate`] to check a declaration before
/// it is used.
#[derive(Debug, Clone)]
pub struct SupervisorSpec {
    /// Stable path for this supervisor.
    pub path: SupervisorPath,
    /// Restart scope strategy for child exits.
    pub strategy: SupervisionStrategy,
    /// Children in declaration order.
    pub children: Vec<ChildSpec>,
    /// Configuration version that produced this declaration.
    ///
    /// Validation rejects a value that is empty after trimming whitespace.
    pub config_version: String,
    /// Restart policy inherited by children that do not override it.
    pub default_restart_policy: RestartPolicy,
    /// Backoff policy inherited by children that do not override it.
    pub default_backoff_policy: BackoffPolicy,
    /// Health policy inherited by children that do not override it.
    pub default_health_policy: HealthPolicy,
    /// Shutdown policy inherited by children that do not override it.
    pub default_shutdown_policy: ShutdownPolicy,
    /// Maximum supervisor failures before parent escalation.
    ///
    /// Validation requires a value greater than zero.
    pub supervisor_failure_limit: u32,
    /// Optional supervisor-level restart budget.
    ///
    /// When present, validation requires a non-zero count and window.
    pub restart_budget: Option<RestartBudget>,
    /// Optional supervisor-level escalation policy.
    pub escalation_policy: Option<EscalationPolicy>,
    /// Group-level strategy overrides.
    ///
    /// Validation requires unique, non-blank group names, each used as a tag
    /// by at least one child, and at most one configured group per child.
    pub group_strategies: Vec<GroupStrategy>,
    /// Child-level strategy overrides.
    ///
    /// Validation requires each override to target a declared child, with at
    /// most one override per child.
    pub child_strategy_overrides: Vec<ChildStrategyOverride>,
    /// Runtime policy for dynamic child additions.
    ///
    /// Validation rejects a child limit of zero.
    pub dynamic_supervisor_policy: DynamicSupervisorPolicy,
    /// Control command channel capacity.
    ///
    /// Validation requires a value greater than zero.
    pub control_channel_capacity: usize,
    /// Event broadcast channel capacity.
    ///
    /// Validation requires a value greater than zero.
    pub event_channel_capacity: usize,
}
246
247impl SupervisorSpec {
248    /// Creates a root supervisor specification.
249    ///
250    /// # Arguments
251    ///
252    /// - `children`: Children declared under the root supervisor.
253    ///
254    /// # Returns
255    ///
256    /// Returns a root [`SupervisorSpec`] with declaration-order children.
257    ///
258    /// # Examples
259    ///
260    /// ```
261    /// let spec = rust_supervisor::spec::supervisor::SupervisorSpec::root(Vec::new());
262    /// assert_eq!(spec.path.to_string(), "/");
263    /// ```
264    pub fn root(children: Vec<ChildSpec>) -> Self {
265        let channel_capacity = channel_capacity_for_children(children.len());
266        Self {
267            path: SupervisorPath::root(),
268            strategy: SupervisionStrategy::OneForOne,
269            children,
270            config_version: String::from("unversioned"),
271            default_restart_policy: RestartPolicy::Transient,
272            default_backoff_policy: BackoffPolicy::new(
273                Duration::from_millis(10),
274                Duration::from_secs(1),
275                0.0,
276            ),
277            default_health_policy: HealthPolicy::new(
278                Duration::from_secs(1),
279                Duration::from_secs(3),
280            ),
281            default_shutdown_policy: ShutdownPolicy::new(
282                Duration::from_secs(5),
283                Duration::from_secs(1),
284            ),
285            supervisor_failure_limit: 1,
286            restart_budget: None,
287            escalation_policy: None,
288            group_strategies: Vec::new(),
289            child_strategy_overrides: Vec::new(),
290            dynamic_supervisor_policy: DynamicSupervisorPolicy::unbounded(),
291            control_channel_capacity: channel_capacity,
292            event_channel_capacity: channel_capacity.saturating_mul(2),
293        }
294    }
295
296    /// Validates this supervisor and its direct children.
297    ///
298    /// # Arguments
299    ///
300    /// This function has no arguments.
301    ///
302    /// # Returns
303    ///
304    /// Returns `Ok(())` when the supervisor declaration is usable.
305    pub fn validate(&self) -> Result<(), SupervisorError> {
306        if self.config_version.trim().is_empty() {
307            return Err(SupervisorError::fatal_config(
308                "config version must not be empty",
309            ));
310        }
311        if self.supervisor_failure_limit == 0 {
312            return Err(SupervisorError::fatal_config(
313                "supervisor failure limit must be greater than zero",
314            ));
315        }
316        if self.control_channel_capacity == 0 {
317            return Err(SupervisorError::fatal_config(
318                "control channel capacity must be greater than zero",
319            ));
320        }
321        if self.event_channel_capacity == 0 {
322            return Err(SupervisorError::fatal_config(
323                "event channel capacity must be greater than zero",
324            ));
325        }
326        for child in &self.children {
327            child.validate()?;
328        }
329        validate_restart_budget(self.restart_budget)?;
330        validate_group_strategies(&self.group_strategies, &self.children)?;
331        validate_child_strategy_overrides(self)?;
332        validate_dynamic_policy(self.dynamic_supervisor_policy)?;
333        Ok(())
334    }
335}
336
337/// Validates an optional restart budget.
338///
339/// # Arguments
340///
341/// - `budget`: Optional restart budget to validate.
342///
343/// # Returns
344///
345/// Returns `Ok(())` when the budget is absent or valid.
346fn validate_restart_budget(budget: Option<RestartBudget>) -> Result<(), SupervisorError> {
347    let Some(budget) = budget else {
348        return Ok(());
349    };
350    if budget.max_restarts == 0 {
351        return Err(SupervisorError::fatal_config(
352            "restart budget max_restarts must be greater than zero",
353        ));
354    }
355    if budget.window.is_zero() {
356        return Err(SupervisorError::fatal_config(
357            "restart budget window must be greater than zero",
358        ));
359    }
360    Ok(())
361}
362
363/// Validates group strategy declarations.
364///
365/// # Arguments
366///
367/// - `strategies`: Group strategies declared on the supervisor.
368///
369/// # Returns
370///
371/// Returns `Ok(())` when group names are unique and valid.
372fn validate_group_strategies(
373    strategies: &[GroupStrategy],
374    children: &[ChildSpec],
375) -> Result<(), SupervisorError> {
376    let mut groups = HashSet::new();
377    for strategy in strategies {
378        if strategy.group.trim().is_empty() {
379            return Err(SupervisorError::fatal_config(
380                "group strategy group must not be empty",
381            ));
382        }
383        if !groups.insert(strategy.group.clone()) {
384            return Err(SupervisorError::fatal_config(format!(
385                "duplicate group strategy: {}",
386                strategy.group
387            )));
388        }
389        validate_restart_budget(strategy.restart_budget)?;
390    }
391    validate_group_membership(strategies, children)?;
392    Ok(())
393}
394
395/// Validates child membership against configured restart groups.
396///
397/// # Arguments
398///
399/// - `strategies`: Group strategies declared on the supervisor.
400/// - `children`: Children declared under the supervisor.
401///
402/// # Returns
403///
404/// Returns `Ok(())` when every configured group is used without ambiguity.
405fn validate_group_membership(
406    strategies: &[GroupStrategy],
407    children: &[ChildSpec],
408) -> Result<(), SupervisorError> {
409    let groups = strategies
410        .iter()
411        .map(|strategy| strategy.group.clone())
412        .collect::<HashSet<_>>();
413    for strategy in strategies {
414        if !children
415            .iter()
416            .any(|child| child.tags.contains(&strategy.group))
417        {
418            return Err(SupervisorError::fatal_config(format!(
419                "group strategy references unused group: {}",
420                strategy.group
421            )));
422        }
423    }
424    for child in children {
425        let configured_group_count = child
426            .tags
427            .iter()
428            .filter(|tag| groups.contains(*tag))
429            .count();
430        if configured_group_count > 1 {
431            return Err(SupervisorError::fatal_config(format!(
432                "child strategy groups are ambiguous for child: {}",
433                child.id
434            )));
435        }
436    }
437    Ok(())
438}
439
440/// Validates child strategy overrides.
441///
442/// # Arguments
443///
444/// - `spec`: Supervisor specification that owns children and overrides.
445///
446/// # Returns
447///
448/// Returns `Ok(())` when every override targets a known child once.
449fn validate_child_strategy_overrides(spec: &SupervisorSpec) -> Result<(), SupervisorError> {
450    let child_ids = spec
451        .children
452        .iter()
453        .map(|child| child.id.clone())
454        .collect::<HashSet<_>>();
455    let mut overrides = HashSet::new();
456    for strategy in &spec.child_strategy_overrides {
457        if !child_ids.contains(&strategy.child_id) {
458            return Err(SupervisorError::fatal_config(format!(
459                "child strategy override references unknown child: {}",
460                strategy.child_id
461            )));
462        }
463        if !overrides.insert(strategy.child_id.clone()) {
464            return Err(SupervisorError::fatal_config(format!(
465                "duplicate child strategy override: {}",
466                strategy.child_id
467            )));
468        }
469        validate_restart_budget(strategy.restart_budget)?;
470    }
471    Ok(())
472}
473
474/// Validates dynamic supervisor policy.
475///
476/// # Arguments
477///
478/// - `policy`: Dynamic supervisor policy to validate.
479///
480/// # Returns
481///
482/// Returns `Ok(())` when the policy limit is coherent.
483fn validate_dynamic_policy(policy: DynamicSupervisorPolicy) -> Result<(), SupervisorError> {
484    if policy.child_limit == Some(0) {
485        return Err(SupervisorError::fatal_config(
486            "dynamic supervisor child_limit must be greater than zero",
487        ));
488    }
489    Ok(())
490}
491
/// Computes a channel capacity from the number of declared children.
///
/// # Arguments
///
/// - `child_count`: Number of children declared under the supervisor.
///
/// # Returns
///
/// Returns `child_count + 1`, clamped to `usize::MAX`, so the result is
/// never zero.
fn channel_capacity_for_children(child_count: usize) -> usize {
    // One extra slot keeps the capacity non-zero for an empty supervisor.
    child_count.checked_add(1).unwrap_or(usize::MAX)
}