Skip to main content

rust_supervisor/config/
policy.rs

//! YAML-friendly policy configuration models.
//!
//! This module owns configuration input structs for lower-level supervision
//! policy objects whose runtime form uses `Duration`, `ChildId`, or other
//! strongly typed values.
6
7use confique::Config;
8use schemars::JsonSchema;
9use serde::{Deserialize, Serialize};
10use std::time::Duration;
11
12use crate::id::types::ChildId;
13use crate::policy::budget as runtime_budget;
14use crate::policy::failure_window as runtime_failure_window;
15use crate::policy::group::{GroupDependencyEdge, PropagationPolicy};
16use crate::policy::meltdown::MeltdownPolicy;
17use crate::policy::task_role_defaults::{SeverityClass, TaskRole};
18use crate::spec::supervisor::{
19    ChildStrategyOverride, DynamicSupervisorPolicy, EscalationPolicy,
20    GroupConfig as RuntimeGroupConfig, GroupStrategy, RestartLimit, SupervisionStrategy,
21};
22
23/// Restart budget configuration loaded from YAML.
24#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Config, JsonSchema)]
25pub struct RestartBudgetConfig {
26    /// Sliding window duration in seconds.
27    #[config(default = 60)]
28    #[serde(default = "default_restart_budget_window_secs")]
29    pub window_secs: u64,
30    /// Maximum burst failures allowed within the window.
31    #[config(default = 10)]
32    #[serde(default = "default_restart_budget_max_burst")]
33    pub max_burst: u32,
34    /// Token recovery rate per second.
35    #[config(default = 0.5)]
36    #[serde(default = "default_restart_budget_recovery_rate")]
37    pub recovery_rate_per_sec: f64,
38}
39
40impl RestartBudgetConfig {
41    /// Converts this YAML-friendly config into the runtime restart budget.
42    ///
43    /// # Arguments
44    ///
45    /// This function has no arguments.
46    ///
47    /// # Returns
48    ///
49    /// Returns a [`runtime_budget::RestartBudgetConfig`] value.
50    pub fn to_runtime(&self) -> runtime_budget::RestartBudgetConfig {
51        runtime_budget::RestartBudgetConfig::new(
52            Duration::from_secs(self.window_secs),
53            self.max_burst,
54            self.recovery_rate_per_sec,
55        )
56    }
57}
58
59impl Default for RestartBudgetConfig {
60    /// Returns the default restart budget configuration.
61    fn default() -> Self {
62        Self {
63            window_secs: default_restart_budget_window_secs(),
64            max_burst: default_restart_budget_max_burst(),
65            recovery_rate_per_sec: default_restart_budget_recovery_rate(),
66        }
67    }
68}
69
70/// Failure window mode loaded from YAML.
71#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
72#[serde(rename_all = "snake_case")]
73pub enum FailureWindowMode {
74    /// Count failures that occur inside a time window.
75    TimeSliding,
76    /// Keep the most recent failure samples by count.
77    CountSliding,
78}
79
80impl Default for FailureWindowMode {
81    /// Returns the default time-sliding mode.
82    fn default() -> Self {
83        Self::TimeSliding
84    }
85}
86
87/// Failure window configuration loaded from YAML.
88#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
89pub struct FailureWindowConfig {
90    /// Window mode selection.
91    #[config(default = "time_sliding")]
92    #[serde(default)]
93    pub mode: FailureWindowMode,
94    /// Time window width in seconds for `time_sliding` mode.
95    #[config(default = 60)]
96    #[serde(default = "default_failure_window_secs")]
97    pub window_secs: u64,
98    /// Maximum retained failure count for `count_sliding` mode.
99    #[config(default = 5)]
100    #[serde(default = "default_failure_window_max_count")]
101    pub max_count: usize,
102    /// Failure threshold at which the window is considered exhausted.
103    #[config(default = 5)]
104    #[serde(default = "default_failure_window_threshold")]
105    pub threshold: usize,
106}
107
108impl FailureWindowConfig {
109    /// Converts this YAML-friendly config into the runtime failure window.
110    ///
111    /// # Arguments
112    ///
113    /// This function has no arguments.
114    ///
115    /// # Returns
116    ///
117    /// Returns a [`runtime_failure_window::FailureWindowConfig`] value.
118    pub fn to_runtime(&self) -> runtime_failure_window::FailureWindowConfig {
119        match self.mode {
120            FailureWindowMode::TimeSliding => {
121                runtime_failure_window::FailureWindowConfig::time_sliding(
122                    self.window_secs,
123                    self.threshold,
124                )
125            }
126            FailureWindowMode::CountSliding => {
127                runtime_failure_window::FailureWindowConfig::count_sliding(
128                    self.max_count,
129                    self.threshold,
130                )
131            }
132        }
133    }
134}
135
136impl Default for FailureWindowConfig {
137    /// Returns the default failure window configuration.
138    fn default() -> Self {
139        Self {
140            mode: FailureWindowMode::default(),
141            window_secs: default_failure_window_secs(),
142            max_count: default_failure_window_max_count(),
143            threshold: default_failure_window_threshold(),
144        }
145    }
146}
147
148/// Meltdown fuse configuration loaded from YAML.
149#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
150pub struct MeltdownConfig {
151    /// Maximum restarts allowed for one child inside the child window.
152    #[config(default = 3)]
153    #[serde(default = "default_meltdown_child_max_restarts")]
154    pub child_max_restarts: u32,
155    /// Window used to count child restarts, in seconds.
156    #[config(default = 10)]
157    #[serde(default = "default_meltdown_child_window_secs")]
158    pub child_window_secs: u64,
159    /// Maximum failures allowed for one group inside the group window.
160    #[config(default = 5)]
161    #[serde(default = "default_meltdown_group_max_failures")]
162    pub group_max_failures: u32,
163    /// Window used to count group failures, in seconds.
164    #[config(default = 30)]
165    #[serde(default = "default_meltdown_group_window_secs")]
166    pub group_window_secs: u64,
167    /// Maximum failures allowed for the supervisor inside the supervisor window.
168    #[config(default = 10)]
169    #[serde(default = "default_meltdown_supervisor_max_failures")]
170    pub supervisor_max_failures: u32,
171    /// Window used to count supervisor failures, in seconds.
172    #[config(default = 60)]
173    #[serde(default = "default_meltdown_supervisor_window_secs")]
174    pub supervisor_window_secs: u64,
175    /// Stable duration after which recorded counters may be cleared, in seconds.
176    #[config(default = 120)]
177    #[serde(default = "default_meltdown_reset_after_secs")]
178    pub reset_after_secs: u64,
179}
180
181impl MeltdownConfig {
182    /// Converts this YAML-friendly config into the runtime meltdown policy.
183    ///
184    /// # Arguments
185    ///
186    /// This function has no arguments.
187    ///
188    /// # Returns
189    ///
190    /// Returns a [`MeltdownPolicy`] value.
191    pub fn to_runtime(&self) -> MeltdownPolicy {
192        MeltdownPolicy::new(
193            self.child_max_restarts,
194            Duration::from_secs(self.child_window_secs),
195            self.group_max_failures,
196            Duration::from_secs(self.group_window_secs),
197            self.supervisor_max_failures,
198            Duration::from_secs(self.supervisor_window_secs),
199            Duration::from_secs(self.reset_after_secs),
200        )
201    }
202}
203
204impl Default for MeltdownConfig {
205    /// Returns the default meltdown fuse configuration.
206    fn default() -> Self {
207        Self {
208            child_max_restarts: default_meltdown_child_max_restarts(),
209            child_window_secs: default_meltdown_child_window_secs(),
210            group_max_failures: default_meltdown_group_max_failures(),
211            group_window_secs: default_meltdown_group_window_secs(),
212            supervisor_max_failures: default_meltdown_supervisor_max_failures(),
213            supervisor_window_secs: default_meltdown_supervisor_window_secs(),
214            reset_after_secs: default_meltdown_reset_after_secs(),
215        }
216    }
217}
218
219/// Supervision pipeline capacities loaded from YAML.
220#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
221pub struct SupervisionPipelineConfig {
222    /// Event journal capacity used by the supervision pipeline.
223    #[config(default = 100)]
224    #[serde(default = "default_pipeline_journal_capacity")]
225    pub journal_capacity: usize,
226    /// Subscriber queue capacity used by the supervision pipeline.
227    #[config(default = 10)]
228    #[serde(default = "default_pipeline_subscriber_capacity")]
229    pub subscriber_capacity: usize,
230    /// Maximum concurrent restarts allowed for one supervisor instance.
231    #[config(default = 5)]
232    #[serde(default = "default_concurrent_restart_limit")]
233    pub concurrent_restart_limit: u32,
234}
235
236impl Default for SupervisionPipelineConfig {
237    /// Returns the default supervision pipeline capacities.
238    fn default() -> Self {
239        Self {
240            journal_capacity: default_pipeline_journal_capacity(),
241            subscriber_capacity: default_pipeline_subscriber_capacity(),
242            concurrent_restart_limit: default_concurrent_restart_limit(),
243        }
244    }
245}
246
247/// Dynamic child acceptance policy loaded from YAML.
248#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
249pub struct DynamicSupervisorConfig {
250    /// Whether runtime child additions are accepted.
251    #[config(default = true)]
252    #[serde(default = "default_true")]
253    pub enabled: bool,
254    /// Optional maximum number of declared and dynamic children.
255    #[serde(default)]
256    pub child_limit: Option<usize>,
257}
258
259impl DynamicSupervisorConfig {
260    /// Converts this YAML-friendly config into the runtime dynamic policy.
261    ///
262    /// # Arguments
263    ///
264    /// This function has no arguments.
265    ///
266    /// # Returns
267    ///
268    /// Returns a [`DynamicSupervisorPolicy`] value.
269    pub fn to_runtime(&self) -> DynamicSupervisorPolicy {
270        DynamicSupervisorPolicy {
271            enabled: self.enabled,
272            child_limit: self.child_limit,
273        }
274    }
275}
276
277impl Default for DynamicSupervisorConfig {
278    /// Returns the default dynamic supervisor policy.
279    fn default() -> Self {
280        Self {
281            enabled: true,
282            child_limit: None,
283        }
284    }
285}
286
287/// Restart limit configuration loaded from YAML.
288#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
289pub struct RestartLimitConfig {
290    /// Maximum restart count inside the configured window.
291    pub max_restarts: u32,
292    /// Window used to count restarts, in milliseconds.
293    pub window_ms: u64,
294}
295
296impl RestartLimitConfig {
297    /// Converts this YAML-friendly config into a runtime restart limit.
298    ///
299    /// # Arguments
300    ///
301    /// This function has no arguments.
302    ///
303    /// # Returns
304    ///
305    /// Returns a [`RestartLimit`] value.
306    pub fn to_runtime(&self) -> RestartLimit {
307        RestartLimit::new(self.max_restarts, Duration::from_millis(self.window_ms))
308    }
309}
310
311/// Group-level configuration loaded from YAML.
312#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Config, JsonSchema)]
313pub struct GroupConfig {
314    /// Low-cardinality group name shared by member children.
315    pub name: String,
316    /// Child names that belong to this group.
317    #[config(default = [])]
318    #[serde(default)]
319    pub children: Vec<String>,
320    /// Optional group-specific restart budget override.
321    #[serde(default)]
322    pub budget: Option<RestartBudgetConfig>,
323}
324
325impl GroupConfig {
326    /// Converts this YAML-friendly config into a runtime group config.
327    ///
328    /// # Arguments
329    ///
330    /// This function has no arguments.
331    ///
332    /// # Returns
333    ///
334    /// Returns a [`RuntimeGroupConfig`] value.
335    pub fn to_runtime(&self) -> RuntimeGroupConfig {
336        RuntimeGroupConfig::new(
337            self.name.clone(),
338            self.children.iter().map(ChildId::new).collect(),
339            self.budget.as_ref().map(RestartBudgetConfig::to_runtime),
340        )
341    }
342}
343
344/// Group strategy override loaded from YAML.
345#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Config, JsonSchema)]
346pub struct GroupStrategyConfig {
347    /// Group name that owns the strategy override.
348    pub group: String,
349    /// Restart strategy used when a member child fails.
350    pub strategy: SupervisionStrategy,
351    /// Optional group-level restart limit.
352    #[serde(default)]
353    pub restart_limit: Option<RestartLimitConfig>,
354    /// Optional escalation policy for this group.
355    #[serde(default)]
356    pub escalation_policy: Option<EscalationPolicy>,
357}
358
359impl GroupStrategyConfig {
360    /// Converts this YAML-friendly config into a runtime group strategy.
361    ///
362    /// # Arguments
363    ///
364    /// This function has no arguments.
365    ///
366    /// # Returns
367    ///
368    /// Returns a [`GroupStrategy`] value.
369    pub fn to_runtime(&self) -> GroupStrategy {
370        let mut strategy = GroupStrategy::new(self.group.clone(), self.strategy);
371        strategy.restart_limit = self
372            .restart_limit
373            .as_ref()
374            .map(RestartLimitConfig::to_runtime);
375        strategy.escalation_policy = self.escalation_policy;
376        strategy
377    }
378}
379
380/// Child strategy override loaded from YAML.
381#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Config, JsonSchema)]
382pub struct ChildStrategyOverrideConfig {
383    /// Child name that owns the strategy override.
384    pub child_id: String,
385    /// Restart strategy used for this child.
386    pub strategy: SupervisionStrategy,
387    /// Optional child-level restart limit.
388    #[serde(default)]
389    pub restart_limit: Option<RestartLimitConfig>,
390    /// Optional escalation policy for this child.
391    #[serde(default)]
392    pub escalation_policy: Option<EscalationPolicy>,
393}
394
395impl ChildStrategyOverrideConfig {
396    /// Converts this YAML-friendly config into a runtime child strategy override.
397    ///
398    /// # Arguments
399    ///
400    /// This function has no arguments.
401    ///
402    /// # Returns
403    ///
404    /// Returns a [`ChildStrategyOverride`] value.
405    pub fn to_runtime(&self) -> ChildStrategyOverride {
406        let mut override_config =
407            ChildStrategyOverride::new(ChildId::new(&self.child_id), self.strategy);
408        override_config.restart_limit = self
409            .restart_limit
410            .as_ref()
411            .map(RestartLimitConfig::to_runtime);
412        override_config.escalation_policy = self.escalation_policy;
413        override_config
414    }
415}
416
417/// Group dependency edge loaded from YAML.
418#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
419pub struct GroupDependencyConfig {
420    /// Group that depends on another group.
421    pub from_group: String,
422    /// Group that is depended on.
423    pub to_group: String,
424    /// Failure propagation policy.
425    pub propagation: PropagationPolicy,
426}
427
428impl GroupDependencyConfig {
429    /// Converts this YAML-friendly config into a runtime dependency edge.
430    ///
431    /// # Arguments
432    ///
433    /// This function has no arguments.
434    ///
435    /// # Returns
436    ///
437    /// Returns a [`GroupDependencyEdge`] value.
438    pub fn to_runtime(&self) -> GroupDependencyEdge {
439        GroupDependencyEdge {
440            from_group: self.from_group.clone(),
441            to_group: self.to_group.clone(),
442            propagation: self.propagation,
443        }
444    }
445}
446
447/// Severity default loaded from YAML.
448#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Config, JsonSchema)]
449pub struct SeverityDefaultConfig {
450    /// Task role that receives this default severity.
451    pub task_role: TaskRole,
452    /// Severity assigned to the task role.
453    pub severity: SeverityClass,
454}
455
/// Serde default for `RestartBudgetConfig::window_secs`: 60 seconds.
fn default_restart_budget_window_secs() -> u64 {
    const WINDOW_SECS: u64 = 60;
    WINDOW_SECS
}
460
/// Serde default for `RestartBudgetConfig::max_burst`: 10 failures.
fn default_restart_budget_max_burst() -> u32 {
    const MAX_BURST: u32 = 10;
    MAX_BURST
}
465
/// Serde default for `RestartBudgetConfig::recovery_rate_per_sec`: 0.5 per second.
fn default_restart_budget_recovery_rate() -> f64 {
    const RECOVERY_RATE_PER_SEC: f64 = 0.5;
    RECOVERY_RATE_PER_SEC
}
470
/// Serde default for `FailureWindowConfig::window_secs`: 60 seconds.
fn default_failure_window_secs() -> u64 {
    const WINDOW_SECS: u64 = 60;
    WINDOW_SECS
}
475
/// Serde default for `FailureWindowConfig::max_count`: 5 retained failures.
fn default_failure_window_max_count() -> usize {
    const MAX_COUNT: usize = 5;
    MAX_COUNT
}
480
/// Serde default for `FailureWindowConfig::threshold`: 5 failures.
fn default_failure_window_threshold() -> usize {
    const THRESHOLD: usize = 5;
    THRESHOLD
}
485
/// Serde default for `MeltdownConfig::child_max_restarts`: 3 restarts.
fn default_meltdown_child_max_restarts() -> u32 {
    const CHILD_MAX_RESTARTS: u32 = 3;
    CHILD_MAX_RESTARTS
}
490
/// Serde default for `MeltdownConfig::child_window_secs`: 10 seconds.
fn default_meltdown_child_window_secs() -> u64 {
    const CHILD_WINDOW_SECS: u64 = 10;
    CHILD_WINDOW_SECS
}
495
/// Serde default for `MeltdownConfig::group_max_failures`: 5 failures.
fn default_meltdown_group_max_failures() -> u32 {
    const GROUP_MAX_FAILURES: u32 = 5;
    GROUP_MAX_FAILURES
}
500
/// Serde default for `MeltdownConfig::group_window_secs`: 30 seconds.
fn default_meltdown_group_window_secs() -> u64 {
    const GROUP_WINDOW_SECS: u64 = 30;
    GROUP_WINDOW_SECS
}
505
/// Serde default for `MeltdownConfig::supervisor_max_failures`: 10 failures.
fn default_meltdown_supervisor_max_failures() -> u32 {
    const SUPERVISOR_MAX_FAILURES: u32 = 10;
    SUPERVISOR_MAX_FAILURES
}
510
/// Serde default for `MeltdownConfig::supervisor_window_secs`: 60 seconds.
fn default_meltdown_supervisor_window_secs() -> u64 {
    const SUPERVISOR_WINDOW_SECS: u64 = 60;
    SUPERVISOR_WINDOW_SECS
}
515
/// Serde default for `MeltdownConfig::reset_after_secs`: 120 seconds.
fn default_meltdown_reset_after_secs() -> u64 {
    const RESET_AFTER_SECS: u64 = 120;
    RESET_AFTER_SECS
}
520
/// Serde default for `SupervisionPipelineConfig::journal_capacity`: 100 entries.
fn default_pipeline_journal_capacity() -> usize {
    const JOURNAL_CAPACITY: usize = 100;
    JOURNAL_CAPACITY
}
525
/// Serde default for `SupervisionPipelineConfig::subscriber_capacity`: 10 entries.
fn default_pipeline_subscriber_capacity() -> usize {
    const SUBSCRIBER_CAPACITY: usize = 10;
    SUBSCRIBER_CAPACITY
}
530
/// Serde default for `SupervisionPipelineConfig::concurrent_restart_limit`: 5.
fn default_concurrent_restart_limit() -> u32 {
    const CONCURRENT_RESTART_LIMIT: u32 = 5;
    CONCURRENT_RESTART_LIMIT
}
535
/// Serde default helper for boolean fields that are on unless disabled.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}