Skip to main content

rust_supervisor/spec/
child.rs

1//! Child declaration model.
2//!
3//! This module owns declarative child specifications and validates local child
4//! invariants before the runtime registers or starts work.
5
6use crate::error::types::SupervisorError;
7use crate::id::types::ChildId;
8use crate::policy::task_role_defaults::{SeverityClass, SidecarConfig, TaskRole};
9use crate::readiness::signal::ReadinessPolicy;
10use crate::task::factory::TaskFactory;
11use schemars::JsonSchema;
12use serde::{Deserialize, Serialize};
13use std::fmt::{Debug, Formatter};
14use std::sync::Arc;
15use std::time::Duration;
16
17/// Kind of task represented by a child declaration.
18#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
19#[serde(rename_all = "snake_case")]
20pub enum TaskKind {
21    /// Asynchronous worker that can be cancelled through its context.
22    AsyncWorker,
23    /// Blocking worker with explicit shutdown and escalation boundaries.
24    BlockingWorker,
25    /// Nested supervisor node.
26    Supervisor,
27}
28
29impl Default for TaskKind {
30    /// Returns the default task kind: [`AsyncWorker`](TaskKind::AsyncWorker).
31    fn default() -> Self {
32        Self::AsyncWorker
33    }
34}
35
36/// Importance of a child to its parent supervisor.
37#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
38#[serde(rename_all = "snake_case")]
39pub enum Criticality {
40    /// The child is required for the supervisor to remain healthy.
41    Critical,
42    /// The child can fail without forcing parent shutdown.
43    Optional,
44}
45
46impl Default for Criticality {
47    /// Returns the default criticality: [`Optional`](Criticality::Optional).
48    fn default() -> Self {
49        Self::Optional
50    }
51}
52
53/// Restart behavior attached to a child.
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
55#[serde(rename_all = "snake_case")]
56pub enum RestartPolicy {
57    /// Restart regardless of the exit result.
58    Permanent,
59    /// Restart only when the task failed.
60    Transient,
61    /// Do not restart after any exit.
62    Temporary,
63}
64
65impl Default for RestartPolicy {
66    /// Returns the default restart policy: [`Permanent`](RestartPolicy::Permanent).
67    fn default() -> Self {
68        Self::Permanent
69    }
70}
71
72/// Shutdown behavior attached to a child.
73#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
74pub struct ShutdownPolicy {
75    /// Graceful stop budget for cooperative shutdown.
76    pub graceful_timeout: Duration,
77    /// Wait budget after an abort request.
78    pub abort_wait: Duration,
79}
80
81impl ShutdownPolicy {
82    /// Creates a shutdown policy.
83    ///
84    /// # Arguments
85    ///
86    /// - `graceful_timeout`: Cooperative shutdown budget.
87    /// - `abort_wait`: Wait budget after abort escalation.
88    ///
89    /// # Returns
90    ///
91    /// Returns a [`ShutdownPolicy`] value.
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// let policy = rust_supervisor::spec::child::ShutdownPolicy::new(
97    ///     std::time::Duration::from_secs(1),
98    ///     std::time::Duration::from_millis(100),
99    /// );
100    /// assert_eq!(policy.graceful_timeout.as_secs(), 1);
101    /// ```
102    pub fn new(graceful_timeout: Duration, abort_wait: Duration) -> Self {
103        Self {
104            graceful_timeout,
105            abort_wait,
106        }
107    }
108}
109
110/// Health behavior attached to a child.
111#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
112pub struct HealthPolicy {
113    /// Expected heartbeat interval.
114    pub heartbeat_interval: Duration,
115    /// Maximum age for the last heartbeat before the child is stale.
116    pub stale_after: Duration,
117}
118
119impl HealthPolicy {
120    /// Creates a health policy.
121    ///
122    /// # Arguments
123    ///
124    /// - `heartbeat_interval`: Expected heartbeat interval.
125    /// - `stale_after`: Maximum heartbeat age.
126    ///
127    /// # Returns
128    ///
129    /// Returns a [`HealthPolicy`] value.
130    pub fn new(heartbeat_interval: Duration, stale_after: Duration) -> Self {
131        Self {
132            heartbeat_interval,
133            stale_after,
134        }
135    }
136}
137
138/// Health check configuration for a child declaration.
139#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
140pub struct HealthCheckConfig {
141    /// Interval between health checks in seconds.
142    pub check_interval_secs: u64,
143    /// Timeout for each health check in seconds.
144    pub timeout_secs: u64,
145    /// Maximum retries before marking the child as unhealthy.
146    pub max_retries: u32,
147}
148
149impl Default for HealthCheckConfig {
150    /// Returns the default health check config: 10s interval, 5s timeout, 3 retries.
151    fn default() -> Self {
152        Self {
153            check_interval_secs: 10,
154            timeout_secs: 5,
155            max_retries: 3,
156        }
157    }
158}
159
160/// Readiness check configuration for a child declaration.
161#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
162pub struct ReadinessConfig {
163    /// Interval between readiness checks in seconds.
164    pub check_interval_secs: u64,
165    /// Timeout for each readiness check in seconds.
166    pub timeout_secs: u64,
167}
168
169impl Default for ReadinessConfig {
170    /// Returns the default readiness config: 5s interval, 3s timeout.
171    fn default() -> Self {
172        Self {
173            check_interval_secs: 5,
174            timeout_secs: 3,
175        }
176    }
177}
178
179/// Resource limits for a child process.
180#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
181pub struct ResourceLimits {
182    /// Maximum memory in megabytes.
183    pub max_memory_mb: Option<u64>,
184    /// Maximum CPU usage as a percentage.
185    pub max_cpu_percent: Option<u8>,
186    /// Maximum number of open file descriptors.
187    pub max_file_descriptors: Option<u64>,
188}
189
190/// Command permissions granted to a child.
191#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
192pub struct CommandPermissions {
193    /// Whether the child may trigger supervisor shutdown.
194    pub allow_shutdown: bool,
195    /// Whether the child may request its own restart.
196    pub allow_restart: bool,
197    /// Signals the child is allowed to send.
198    pub allowed_signals: Vec<String>,
199}
200
201impl Default for CommandPermissions {
202    /// Returns the default command permissions: no shutdown, no restart, SIGTERM only.
203    fn default() -> Self {
204        Self {
205            allow_shutdown: false,
206            allow_restart: false,
207            allowed_signals: vec!["SIGTERM".to_string()],
208        }
209    }
210}
211
212/// Environment variable for a child.
213#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
214pub struct EnvVar {
215    /// Environment variable name.
216    pub name: String,
217    /// Plain-text value (mutually exclusive with secret_ref).
218    pub value: Option<String>,
219    /// Secret reference in `${SECRET_NAME}` format (mutually exclusive with value).
220    pub secret_ref: Option<String>,
221}
222
223/// Secret reference for a child.
224#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
225pub struct SecretRef {
226    /// Secret name used as an identifier.
227    pub name: String,
228    /// Key path within the vault.
229    pub key: String,
230    /// Whether the secret is required (vault offline treated as rejection when true).
231    pub required: bool,
232}
233
234/// Backoff behavior attached to a child.
235#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, JsonSchema)]
236pub struct BackoffPolicy {
237    /// Initial delay before the first restart.
238    pub initial_delay: Duration,
239    /// Maximum restart delay.
240    pub max_delay: Duration,
241    /// Jitter ratio between zero and one.
242    pub jitter_ratio: f64,
243}
244
245impl BackoffPolicy {
246    /// Creates a backoff policy.
247    ///
248    /// # Arguments
249    ///
250    /// - `initial_delay`: Initial restart delay.
251    /// - `max_delay`: Maximum restart delay.
252    /// - `jitter_ratio`: Jitter ratio between zero and one.
253    ///
254    /// # Returns
255    ///
256    /// Returns a [`BackoffPolicy`] value.
257    pub fn new(initial_delay: Duration, max_delay: Duration, jitter_ratio: f64) -> Self {
258        Self {
259            initial_delay,
260            max_delay,
261            jitter_ratio,
262        }
263    }
264}
265
266/// Declarative specification for a child task or nested supervisor.
267#[derive(Clone, Serialize, Deserialize, JsonSchema)]
268pub struct ChildSpec {
269    /// Stable child identifier.
270    pub id: ChildId,
271    /// Human-readable child name.
272    pub name: String,
273    /// Child task kind.
274    pub kind: TaskKind,
275    /// Optional factory for worker children.
276    #[serde(skip)]
277    #[schemars(skip)]
278    pub factory: Option<Arc<dyn TaskFactory>>,
279    /// Restart policy for this child.
280    pub restart_policy: RestartPolicy,
281    /// Shutdown policy for this child.
282    pub shutdown_policy: ShutdownPolicy,
283    /// Health policy for this child.
284    pub health_policy: HealthPolicy,
285    /// Readiness policy for this child.
286    pub readiness_policy: ReadinessPolicy,
287    /// Backoff policy for this child.
288    pub backoff_policy: BackoffPolicy,
289    /// Child identifiers that must become ready before this child starts.
290    pub dependencies: Vec<ChildId>,
291    /// Low-cardinality tags used for grouping and diagnostics.
292    pub tags: Vec<String>,
293    /// Criticality used by parent policy decisions.
294    pub criticality: Criticality,
295    /// Optional role that selects default lifecycle policy semantics.
296    #[serde(default)]
297    pub task_role: Option<TaskRole>,
298    /// Optional sidecar binding used when the role is [`TaskRole::Sidecar`].
299    #[serde(default)]
300    pub sidecar_config: Option<SidecarConfig>,
301    /// Optional explicit severity classification that overrides the role default (US3).
302    #[serde(default)]
303    pub severity: Option<SeverityClass>,
304    /// Optional group name for group-level isolation and budget tracking (US2).
305    #[serde(default)]
306    pub group: Option<String>,
307    /// Optional health check configuration.
308    #[serde(default)]
309    pub health_check: Option<HealthCheckConfig>,
310    /// Optional readiness check configuration.
311    #[serde(default)]
312    pub readiness: Option<ReadinessConfig>,
313    /// Optional resource limits.
314    #[serde(default)]
315    pub resource_limits: Option<ResourceLimits>,
316    /// Command permissions granted to this child.
317    #[serde(default)]
318    pub command_permissions: CommandPermissions,
319    /// Environment variables for this child.
320    #[serde(default)]
321    pub environment: Vec<EnvVar>,
322    /// Secret references for this child.
323    #[serde(default)]
324    pub secrets: Vec<SecretRef>,
325}
326
327impl Debug for ChildSpec {
328    /// Formats the child specification without printing the task factory.
329    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
330        formatter
331            .debug_struct("ChildSpec")
332            .field("id", &self.id)
333            .field("name", &self.name)
334            .field("kind", &self.kind)
335            .field("restart_policy", &self.restart_policy)
336            .field("shutdown_policy", &self.shutdown_policy)
337            .field("health_policy", &self.health_policy)
338            .field("readiness_policy", &self.readiness_policy)
339            .field("backoff_policy", &self.backoff_policy)
340            .field("dependencies", &self.dependencies)
341            .field("tags", &self.tags)
342            .field("criticality", &self.criticality)
343            .field("task_role", &self.task_role)
344            .field("sidecar_config", &self.sidecar_config)
345            .field("severity", &self.severity)
346            .field("group", &self.group)
347            .field("health_check", &self.health_check)
348            .field("readiness", &self.readiness)
349            .field("resource_limits", &self.resource_limits)
350            .field("command_permissions", &self.command_permissions)
351            .field("environment", &self.environment)
352            .field("secrets", &self.secrets)
353            .finish()
354    }
355}
356
357impl ChildSpec {
358    /// Creates a worker child specification.
359    ///
360    /// # Arguments
361    ///
362    /// - `id`: Stable child identifier.
363    /// - `name`: Human-readable child name.
364    /// - `kind`: Worker task kind.
365    /// - `factory`: Task factory used to build each child_start_count.
366    ///
367    /// # Returns
368    ///
369    /// Returns a [`ChildSpec`] with conservative policy values.
370    ///
371    /// # Examples
372    ///
373    /// ```
374    /// let factory = rust_supervisor::task::factory::service_fn(|_ctx| async {
375    ///     rust_supervisor::task::factory::TaskResult::Succeeded
376    /// });
377    /// let spec = rust_supervisor::spec::child::ChildSpec::worker(
378    ///     rust_supervisor::id::types::ChildId::new("worker"),
379    ///     "worker",
380    ///     rust_supervisor::spec::child::TaskKind::AsyncWorker,
381    ///     std::sync::Arc::new(factory),
382    /// );
383    /// assert_eq!(spec.name, "worker");
384    /// ```
385    pub fn worker(
386        id: ChildId,
387        name: impl Into<String>,
388        kind: TaskKind,
389        factory: Arc<dyn TaskFactory>,
390    ) -> Self {
391        Self {
392            id,
393            name: name.into(),
394            kind,
395            factory: Some(factory),
396            restart_policy: RestartPolicy::Transient,
397            shutdown_policy: ShutdownPolicy::new(Duration::from_secs(5), Duration::from_secs(1)),
398            health_policy: HealthPolicy::new(Duration::from_secs(1), Duration::from_secs(3)),
399            readiness_policy: ReadinessPolicy::Immediate,
400            backoff_policy: BackoffPolicy::new(
401                Duration::from_millis(10),
402                Duration::from_secs(1),
403                0.0,
404            ),
405            dependencies: Vec::new(),
406            tags: Vec::new(),
407            criticality: Criticality::Critical,
408            task_role: Some(TaskRole::Worker),
409            sidecar_config: None,
410            severity: None,
411            group: None,
412            health_check: None,
413            readiness: None,
414            resource_limits: None,
415            command_permissions: CommandPermissions::default(),
416            environment: Vec::new(),
417            secrets: Vec::new(),
418        }
419    }
420
421    /// Validates local child specification invariants.
422    ///
423    /// # Arguments
424    ///
425    /// This function has no arguments.
426    ///
427    /// # Returns
428    ///
429    /// Returns `Ok(())` when the child can be registered.
430    pub fn validate(&self) -> Result<(), SupervisorError> {
431        validate_non_empty(&self.id.value, "child id")?;
432        validate_non_empty(&self.name, "child name")?;
433        validate_tags(&self.tags)?;
434        validate_backoff(self.backoff_policy)?;
435        validate_factory(self.kind, self.factory.is_some())?;
436        validate_sidecar_local(self)
437    }
438}
439
440/// Validates a non-empty string invariant.
441///
442/// # Arguments
443///
444/// - `value`: String value being validated.
445/// - `label`: Field label used in the error message.
446///
447/// # Returns
448///
449/// Returns `Ok(())` when the string is not empty.
450fn validate_non_empty(value: &str, label: &str) -> Result<(), SupervisorError> {
451    if value.trim().is_empty() {
452        Err(SupervisorError::fatal_config(format!(
453            "{label} must not be empty"
454        )))
455    } else {
456        Ok(())
457    }
458}
459
460/// Validates tag invariants.
461///
462/// # Arguments
463///
464/// - `tags`: Tags attached to the child.
465///
466/// # Returns
467///
468/// Returns `Ok(())` when every tag is non-empty.
469fn validate_tags(tags: &[String]) -> Result<(), SupervisorError> {
470    for tag in tags {
471        validate_non_empty(tag, "child tag")?;
472    }
473    Ok(())
474}
475
476/// Validates backoff invariants.
477///
478/// # Arguments
479///
480/// - `policy`: Backoff policy attached to the child.
481///
482/// # Returns
483///
484/// Returns `Ok(())` when delay and jitter values are valid.
485fn validate_backoff(policy: BackoffPolicy) -> Result<(), SupervisorError> {
486    if policy.initial_delay > policy.max_delay {
487        return Err(SupervisorError::fatal_config(
488            "initial backoff must not exceed max backoff",
489        ));
490    }
491    if !(0.0..=1.0).contains(&policy.jitter_ratio) {
492        return Err(SupervisorError::fatal_config(
493            "jitter ratio must be between zero and one",
494        ));
495    }
496    Ok(())
497}
498
499/// Validates factory presence for the child kind.
500///
501/// # Arguments
502///
503/// - `kind`: Child task kind.
504/// - `has_factory`: Whether a factory was supplied.
505///
506/// # Returns
507///
508/// Returns `Ok(())` when factory presence matches the task kind.
509fn validate_factory(kind: TaskKind, has_factory: bool) -> Result<(), SupervisorError> {
510    match (kind, has_factory) {
511        (TaskKind::Supervisor, true) => Err(SupervisorError::fatal_config(
512            "supervisor child must not own a task factory",
513        )),
514        (TaskKind::AsyncWorker | TaskKind::BlockingWorker, false) => Err(
515            SupervisorError::fatal_config("worker child requires a task factory"),
516        ),
517        _ => Ok(()),
518    }
519}
520
521/// Validates local sidecar fields without inspecting sibling children.
522///
523/// # Arguments
524///
525/// - `child`: Child specification to validate.
526///
527/// # Returns
528///
529/// Returns `Ok(())` when the local sidecar declaration is coherent.
530fn validate_sidecar_local(child: &ChildSpec) -> Result<(), SupervisorError> {
531    match (child.task_role, child.sidecar_config.as_ref()) {
532        (Some(TaskRole::Sidecar), None) => Err(SupervisorError::fatal_config(
533            "sidecar task_role requires sidecar_config",
534        )),
535        (role, Some(_)) if role != Some(TaskRole::Sidecar) => Err(SupervisorError::fatal_config(
536            "sidecar_config requires sidecar task_role",
537        )),
538        _ => Ok(()),
539    }
540}