Skip to main content

rust_supervisor/event/
payload.rs

//! Lifecycle event payloads and event envelopes.
//!
//! This module owns the observable shape of supervisor lifecycle facts. It keeps
//! payloads typed so state, journal, metrics, and tests do not infer behavior
//! from strings.
6
7use crate::child_runner::run_exit::TaskExit;
8use crate::control::outcome::{
9    ChildAttemptStatus, ChildControlFailurePhase, ChildControlOperation, ChildControlResult,
10    ChildStopState, RestartLimitState, StaleReportHandling,
11};
12use crate::error::types::TaskFailure;
13use crate::event::time::{CorrelationId, EventSequence, When};
14use crate::id::types::{ChildId, ChildStartCount, Generation, SupervisorPath};
15use crate::policy::task_role_defaults::{PolicySource, TaskRole};
16use serde::{Deserialize, Serialize};
17use uuid::Uuid;
18
19/// Wrapper around [`f64`] that implements [`Eq`] via bit comparison.
20///
21/// NaN is disallowed. If a NaN value is constructed at runtime, equality
22/// panics. This type exists solely to satisfy the `Eq` bound on the `What`
23/// enum and should not be used outside this module.
24#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
25#[serde(transparent)]
26pub struct FiniteF64(#[serde(with = "finite_f64_serde")] f64);
27
28impl Eq for FiniteF64 {}
29
30impl FiniteF64 {
31    /// Creates a `FiniteF64` from a raw `f64`.
32    ///
33    /// # Panics
34    ///
35    /// Panics if `value` is NaN.
36    pub fn new(value: f64) -> Self {
37        assert!(!value.is_nan(), "FiniteF64 does not support NaN");
38        Self(value)
39    }
40
41    /// Returns the inner `f64` value.
42    pub fn into_inner(self) -> f64 {
43        self.0
44    }
45}
46
47impl From<f64> for FiniteF64 {
48    /// Creates a `FiniteF64` from a raw `f64`.
49    ///
50    /// # Panics
51    ///
52    /// Panics if `value` is NaN.
53    fn from(value: f64) -> Self {
54        Self::new(value)
55    }
56}
57
58/// Serde helper that serializes `FiniteF64` as a plain JSON number.
59mod finite_f64_serde {
60    use serde::{Deserialize, Deserializer, Serialize, Serializer};
61
62    /// Serializes an `f64` as a plain JSON number.
63    pub fn serialize<S: Serializer>(value: &f64, serializer: S) -> Result<S::Ok, S::Error> {
64        value.serialize(serializer)
65    }
66
67    /// Deserializes an `f64` from a JSON number, rejecting NaN.
68    pub fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result<f64, D::Error> {
69        let value = f64::deserialize(deserializer)?;
70        if value.is_nan() {
71            return Err(serde::de::Error::custom("FiniteF64 does not support NaN"));
72        }
73        Ok(value)
74    }
75}
76
77/// Meltdown scope identifier for failure tracking.
78#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
79pub enum MeltdownScope {
80    /// Child-level scope bound to a specific child identifier.
81    Child,
82    /// Group-level scope bound to a restart execution plan group.
83    Group,
84    /// Supervisor-level scope bound to the supervisor instance boundary.
85    Supervisor,
86}
87
88impl std::fmt::Display for MeltdownScope {
89    /// Formats the meltdown scope as a string.
90    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91        match self {
92            Self::Child => write!(f, "child"),
93            Self::Group => write!(f, "group"),
94            Self::Supervisor => write!(f, "supervisor"),
95        }
96    }
97}
98
99/// Protection restrictiveness ladder defining escalation severity levels.
100///
101/// This enum defines six protection tiers from least to most restrictive.
102#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
103pub enum ProtectionAction {
104    /// Restart is allowed without restrictions.
105    RestartAllowed,
106    /// Restart is queued behind concurrency throttle gates.
107    RestartQueued,
108    /// Restart is denied due to policy limits.
109    RestartDenied,
110    /// Supervision is paused temporarily.
111    SupervisionPaused,
112    /// Failure is escalated to parent supervisor.
113    Escalated,
114    /// Supervised stop is enforced for the child.
115    SupervisedStop,
116}
117
118impl std::fmt::Display for ProtectionAction {
119    /// Formats the protection action as a string.
120    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
121        match self {
122            Self::RestartAllowed => write!(f, "restart_allowed"),
123            Self::RestartQueued => write!(f, "restart_queued"),
124            Self::RestartDenied => write!(f, "restart_denied"),
125            Self::SupervisionPaused => write!(f, "supervision_paused"),
126            Self::Escalated => write!(f, "escalated"),
127            Self::SupervisedStop => write!(f, "supervised_stop"),
128        }
129    }
130}
131
132/// Reason for cold start budget triggering or exhaustion.
133#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
134pub enum ColdStartReason {
135    /// Cold start budget has not been triggered.
136    NotApplicable,
137    /// Initial startup within cold start window.
138    InitialStartup,
139    /// Cold start budget exhausted within time window.
140    BudgetExhausted,
141    /// Too many restarts during cold start period.
142    ExcessiveRestarts,
143}
144
145impl std::fmt::Display for ColdStartReason {
146    /// Formats the cold start reason as a string.
147    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148        match self {
149            Self::NotApplicable => write!(f, "not_applicable"),
150            Self::InitialStartup => write!(f, "initial_startup"),
151            Self::BudgetExhausted => write!(f, "budget_exhausted"),
152            Self::ExcessiveRestarts => write!(f, "excessive_restarts"),
153        }
154    }
155}
156
157/// Reason for hot loop detection triggering.
158#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
159pub enum HotLoopReason {
160    /// Hot loop detection has not been triggered.
161    NotApplicable,
162    /// Rapid crash detected within sliding time window.
163    RapidCrashDetected,
164    /// Crash-restart cycle exceeded threshold frequency.
165    CycleThresholdExceeded,
166    /// Insufficient stable runtime between restarts.
167    InsufficientStableRuntime,
168}
169
170impl std::fmt::Display for HotLoopReason {
171    /// Formats the hot loop reason as a string.
172    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173        match self {
174            Self::NotApplicable => write!(f, "not_applicable"),
175            Self::RapidCrashDetected => write!(f, "rapid_crash_detected"),
176            Self::CycleThresholdExceeded => write!(f, "cycle_threshold_exceeded"),
177            Self::InsufficientStableRuntime => write!(f, "insufficient_stable_runtime"),
178        }
179    }
180}
181
182/// Ownership of the throttle gate that limited concurrent restarts.
183#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
184pub enum ThrottleGateOwner {
185    /// No throttle gate was active.
186    None,
187    /// Instance-global supervisor throttle gate.
188    SupervisorInstance,
189    /// Group-level throttle gate with group identifier.
190    Group(String),
191}
192
193impl std::fmt::Display for ThrottleGateOwner {
194    /// Formats the throttle gate owner as a string.
195    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
196        match self {
197            Self::None => write!(f, "none"),
198            Self::SupervisorInstance => write!(f, "supervisor_global"),
199            Self::Group(group) => write!(f, "group:{}", group),
200        }
201    }
202}
203
204/// Location data attached to a supervisor event.
205#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
206pub struct Where {
207    /// Stable supervisor path that owns the fact.
208    pub supervisor_path: SupervisorPath,
209    /// Parent child identifier when the fact belongs to a nested node.
210    pub parent_id: Option<ChildId>,
211    /// Child identifier related to the fact.
212    pub child_id: Option<ChildId>,
213    /// Human-readable child name.
214    pub child_name: Option<String>,
215    /// Tokio task identifier when it is available.
216    pub tokio_task_id: Option<String>,
217    /// Host name reported by the runtime.
218    pub host: Option<String>,
219    /// Process identifier that emitted the event.
220    pub pid: u32,
221    /// Current thread name when available.
222    pub thread_name: Option<String>,
223    /// Rust module path that emitted the event.
224    pub module_path: Option<String>,
225    /// Source file that emitted the event.
226    pub source_file: Option<String>,
227    /// Source line that emitted the event.
228    pub source_line: Option<u32>,
229}
230
231impl Where {
232    /// Creates a location for a supervisor path.
233    ///
234    /// # Arguments
235    ///
236    /// - `supervisor_path`: Path that owns this lifecycle fact.
237    ///
238    /// # Returns
239    ///
240    /// Returns a [`Where`] value with process and thread defaults.
241    ///
242    /// # Examples
243    ///
244    /// ```
245    /// let location = rust_supervisor::event::payload::Where::new(
246    ///     rust_supervisor::id::types::SupervisorPath::root(),
247    /// );
248    /// assert_eq!(location.supervisor_path.to_string(), "/");
249    /// ```
250    pub fn new(supervisor_path: SupervisorPath) -> Self {
251        Self {
252            supervisor_path,
253            parent_id: None,
254            child_id: None,
255            child_name: None,
256            tokio_task_id: None,
257            host: None,
258            pid: std::process::id(),
259            thread_name: std::thread::current().name().map(ToOwned::to_owned),
260            module_path: None,
261            source_file: None,
262            source_line: None,
263        }
264    }
265
266    /// Adds child identity to the location.
267    ///
268    /// # Arguments
269    ///
270    /// - `child_id`: Stable child identifier.
271    /// - `child_name`: Human-readable child name.
272    ///
273    /// # Returns
274    ///
275    /// Returns the updated [`Where`] value.
276    pub fn with_child(mut self, child_id: ChildId, child_name: impl Into<String>) -> Self {
277        self.child_id = Some(child_id);
278        self.child_name = Some(child_name.into());
279        self
280    }
281}
282
283/// State transition recorded by an event payload.
284#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
285pub struct StateTransition {
286    /// State before the transition.
287    pub from: String,
288    /// State after the transition.
289    pub to: String,
290}
291
292impl StateTransition {
293    /// Creates a state transition description.
294    ///
295    /// # Arguments
296    ///
297    /// - `from`: Previous state name.
298    /// - `to`: New state name.
299    ///
300    /// # Returns
301    ///
302    /// Returns a [`StateTransition`].
303    pub fn new(from: impl Into<String>, to: impl Into<String>) -> Self {
304        Self {
305            from: from.into(),
306            to: to.into(),
307        }
308    }
309}
310
311/// Policy decision data stored with an event.
312#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
313pub struct PolicyDecision {
314    /// Low-cardinality decision name.
315    pub decision: String,
316    /// Delay in milliseconds when restart is delayed.
317    pub delay_ms: Option<u64>,
318    /// Human-readable reason for diagnostics.
319    pub reason: Option<String>,
320}
321
322impl PolicyDecision {
323    /// Creates a policy decision value.
324    ///
325    /// # Arguments
326    ///
327    /// - `decision`: Low-cardinality decision name.
328    /// - `delay_ms`: Optional delay in milliseconds.
329    /// - `reason`: Optional diagnostic reason.
330    ///
331    /// # Returns
332    ///
333    /// Returns a [`PolicyDecision`].
334    pub fn new(decision: impl Into<String>, delay_ms: Option<u64>, reason: Option<String>) -> Self {
335        Self {
336            decision: decision.into(),
337            delay_ms,
338            reason,
339        }
340    }
341}
342
343/// Command audit data attached to command lifecycle events.
344#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
345pub struct CommandAudit {
346    /// Stable command identifier.
347    pub command_id: String,
348    /// Caller that requested the command.
349    pub requested_by: String,
350    /// Operator-provided reason.
351    pub reason: String,
352    /// Target path for the command.
353    pub target_path: SupervisorPath,
354    /// Accepted time in nanoseconds since the Unix epoch.
355    pub accepted_at_unix_nanos: u128,
356    /// Command result summary.
357    pub result: String,
358}
359
360/// Typed payload for supervisor lifecycle events.
361#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
362#[serde(tag = "type", content = "payload", rename_all = "snake_case")]
363pub enum What {
364    /// Child is being started.
365    ChildStarting {
366        /// Optional state transition carried by this event.
367        transition: Option<StateTransition>,
368    },
369    /// Child is running.
370    ChildRunning {
371        /// Optional state transition carried by this event.
372        transition: Option<StateTransition>,
373    },
374    /// Child is ready.
375    ChildReady {
376        /// Optional state transition carried by this event.
377        transition: Option<StateTransition>,
378    },
379    /// Child emitted a heartbeat.
380    ChildHeartbeat {
381        /// Heartbeat age in milliseconds.
382        age_ms: u64,
383    },
384    /// Child failed with a typed failure.
385    ChildFailed {
386        /// Failure payload reported by the task.
387        failure: TaskFailure,
388    },
389    /// Child panicked.
390    ChildPanicked {
391        /// Panic category used for metrics.
392        category: String,
393    },
394    /// Restart backoff was scheduled.
395    BackoffScheduled {
396        /// Backoff delay in milliseconds.
397        delay_ms: u64,
398    },
399    /// Child is restarting.
400    ChildRestarting {
401        /// Restart generation after the transition.
402        generation: u64,
403    },
404    /// Child restarted.
405    ChildRestarted {
406        /// Restart count for the child window.
407        restart_count: u64,
408    },
409    /// Child was quarantined.
410    ChildQuarantined {
411        /// Quarantine reason.
412        reason: String,
413    },
414    /// Child stopped.
415    ChildStopped {
416        /// Exit reason.
417        reason: String,
418    },
419    /// Child became unhealthy.
420    ChildUnhealthy {
421        /// Unhealthy reason.
422        reason: String,
423    },
424    /// Meltdown fuse was tripped.
425    Meltdown {
426        /// Scope that tripped the fuse.
427        scope: String,
428    },
429    /// Shutdown was requested.
430    ShutdownRequested {
431        /// Shutdown cause.
432        cause: String,
433    },
434    /// Shutdown phase changed.
435    ShutdownPhaseChanged {
436        /// Previous phase name.
437        from: String,
438        /// New phase name.
439        to: String,
440    },
441    /// Shutdown completed.
442    ShutdownCompleted {
443        /// Final shutdown phase.
444        phase: String,
445        /// Shutdown result summary.
446        result: String,
447        /// Full pipeline duration in milliseconds.
448        duration_ms: u64,
449    },
450    /// Child shutdown cancel delivered for one supervised child_start_count during shutdown draining.
451    ChildShutdownCancelDelivered {
452        /// Child that received cancellation.
453        child_id: ChildId,
454        /// Generation associated with the child child_start_count.
455        generation: Generation,
456        /// ChildStartCount associated with the child run.
457        child_start_count: ChildStartCount,
458        /// Shutdown phase that delivered cancellation.
459        phase: String,
460    },
461    /// Child finished during graceful shutdown draining.
462    ChildShutdownGraceful {
463        /// Child that completed gracefully.
464        child_id: ChildId,
465        /// Generation associated with the child child_start_count.
466        generation: Generation,
467        /// ChildStartCount associated with the child run.
468        child_start_count: ChildStartCount,
469        /// Shutdown phase that recorded the outcome.
470        phase: String,
471        /// Exit classification reported by the child.
472        exit: String,
473    },
474    /// Child was aborted during shutdown.
475    ChildShutdownAborted {
476        /// Child that was aborted.
477        child_id: ChildId,
478        /// Generation associated with the child child_start_count.
479        generation: Generation,
480        /// ChildStartCount associated with the child run.
481        child_start_count: ChildStartCount,
482        /// Shutdown phase that recorded the outcome.
483        phase: String,
484        /// Low-cardinality abort result.
485        result: String,
486        /// Human-readable abort reason.
487        reason: String,
488    },
489    /// Child reported after its normal shutdown accounting window.
490    ChildShutdownLateReport {
491        /// Child that produced a late report.
492        child_id: ChildId,
493        /// Generation associated with the child child_start_count.
494        generation: Generation,
495        /// ChildStartCount associated with the child run.
496        child_start_count: ChildStartCount,
497        /// Shutdown phase that received the late report.
498        phase: String,
499        /// Exit classification reported by the child.
500        exit: String,
501    },
502    /// Generation fence engaged for an accepted manual restart waiting for an old attempt to stop.
503    ChildRestartFenceEntered {
504        /// Child awaiting restart isolation.
505        child_id: ChildId,
506        /// Old generation pinned until the fence releases.
507        old_generation: Generation,
508        /// Old attempt pinned until the fence releases.
509        old_attempt: ChildStartCount,
510        /// Target generation queued after the old attempt completes.
511        target_generation: Generation,
512        /// Command identifier tying this fence to auditing metadata.
513        command_id: String,
514        /// Restart requester captured from command metadata.
515        requested_by: String,
516        /// Restart reason captured from command metadata.
517        reason: String,
518        /// Deadline for cooperative stop before escalation to abort paths.
519        stop_deadline_at_unix_nanos: u128,
520    },
521    /// Runtime escalated restart isolation to abort the old attempt after the cooperative deadline elapsed.
522    ChildRestartFenceAbortRequested {
523        /// Child awaiting restart isolation.
524        child_id: ChildId,
525        /// Old generation that failed to exit before the graceful deadline expired.
526        old_generation: Generation,
527        /// Old attempt that failed to exit before the graceful deadline expired.
528        old_attempt: ChildStartCount,
529        /// Target generation queued for start after isolation completes.
530        target_generation: Generation,
531        /// Command identifier tied to the pending restart bookkeeping.
532        command_id: String,
533        /// Deadline that triggered the abort escalation.
534        deadline_unix_nanos: u128,
535    },
536    /// Old attempt completed and a new generation may start under the pending restart request.
537    ChildRestartFenceReleased {
538        /// Child whose fence released.
539        child_id: ChildId,
540        /// Old generation that fully stopped.
541        old_generation: Generation,
542        /// Old attempt that fully stopped.
543        old_attempt: ChildStartCount,
544        /// Target generation allowed to start after this release.
545        target_generation: Generation,
546        /// Exit classification reported for the old attempt.
547        exit_kind: TaskExit,
548    },
549    /// Conflicting restart intent that was merged, rejected, or superseded by policy.
550    ChildRestartConflict {
551        /// Child identifier for the fencing scope.
552        child_id: ChildId,
553        /// Generation that was active or pinned when the conflict was classified.
554        current_generation: Option<Generation>,
555        /// Attempt counter that was active or pinned when the conflict was classified.
556        current_attempt: Option<ChildStartCount>,
557        /// Generation the caller wanted to reach, if applicable.
558        target_generation: Option<Generation>,
559        /// Command identifier supplied by the caller when present.
560        command_id: String,
561        /// Low-cardinality conflict classifier (`already_pending`, `rejected`, ...).
562        decision: String,
563        /// Human-readable reason for observability dumps.
564        reason: String,
565    },
566    /// Stale completion triple observed after authoritative state moved forward.
567    ChildAttemptStaleReport {
568        /// Child identifier tied to the completion triple.
569        child_id: ChildId,
570        /// Generation carried by the stale completion report.
571        reported_generation: Generation,
572        /// Attempt counter carried by the stale completion report.
573        reported_attempt: ChildStartCount,
574        /// Generation considered authoritative when the stale report arrived.
575        current_generation: Option<Generation>,
576        /// Attempt counter considered authoritative when the stale report arrived.
577        current_attempt: Option<ChildStartCount>,
578        /// Exit classification supplied by the stale report.
579        exit_kind: TaskExit,
580        /// Runtime-selected handling bucket for metrics and audits.
581        handled_as: StaleReportHandling,
582    },
583    /// Pending restart bookkeeping drained because the pinned old attempt exited.
584    ChildRestartFencePendingDrained {
585        /// Child whose pending restart advanced past the cooperative stop barrier.
586        child_id: ChildId,
587    },
588    /// Child control command delivered cancellation.
589    ChildControlCancelDelivered {
590        /// Child that received cancellation.
591        child_id: ChildId,
592        /// Generation that received cancellation.
593        generation: Generation,
594        /// Attempt that received cancellation.
595        attempt: ChildStartCount,
596        /// Control command name.
597        command: String,
598        /// Control command identifier.
599        command_id: String,
600    },
601    /// Child control stop completed.
602    ChildControlStopCompleted {
603        /// Child that completed stopping.
604        child_id: ChildId,
605        /// Generation that completed stopping.
606        generation: Generation,
607        /// Attempt that completed stopping.
608        attempt: ChildStartCount,
609        /// Child exit classification.
610        exit_kind: TaskExit,
611    },
612    /// Child control stop failed.
613    ChildControlStopFailed {
614        /// Child that failed to stop.
615        child_id: ChildId,
616        /// Generation that failed to stop.
617        generation: Generation,
618        /// Attempt that failed to stop.
619        attempt: ChildStartCount,
620        /// Current attempt status.
621        status: ChildAttemptStatus,
622        /// Current stop progress.
623        stop_state: ChildStopState,
624        /// Control failure phase.
625        phase: ChildControlFailurePhase,
626        /// Human-readable failure reason.
627        reason: String,
628        /// Whether callers can retry to recover.
629        recoverable: bool,
630    },
631    /// Child control operation changed.
632    ChildControlOperationChanged {
633        /// Child whose operation changed.
634        child_id: ChildId,
635        /// Previous operation.
636        from: ChildControlOperation,
637        /// New operation.
638        to: ChildControlOperation,
639        /// Control command name.
640        command: String,
641        /// Control command identifier.
642        command_id: String,
643    },
644    /// Child control command completed with a full outcome.
645    ChildControlCommandCompleted {
646        /// Child that the command targeted.
647        child_id: ChildId,
648        /// Stable control command name.
649        command: String,
650        /// Control command identifier.
651        command_id: String,
652        /// Caller that requested the command.
653        requested_by: String,
654        /// Operator-provided reason.
655        reason: String,
656        /// Low-cardinality command result.
657        result: String,
658        /// Full control outcome.
659        outcome: Box<ChildControlResult>,
660    },
661    /// Child restart limit accounting was refreshed.
662    ChildRuntimeRestartLimitUpdated {
663        /// Child whose restart limit accounting changed.
664        child_id: ChildId,
665        /// Updated restart limit state.
666        restart_limit: RestartLimitState,
667    },
668    /// Child runtime state record was removed.
669    ChildRuntimeStateRemoved {
670        /// Removed child.
671        child_id: ChildId,
672        /// Child path in the supervisor tree.
673        path: SupervisorPath,
674        /// Final attempt status.
675        final_status: Option<ChildAttemptStatus>,
676    },
677    /// Child heartbeat became stale.
678    ChildHeartbeatStale {
679        /// Child with a stale heartbeat.
680        child_id: ChildId,
681        /// Attempt with a stale heartbeat.
682        attempt: ChildStartCount,
683        /// Last heartbeat timestamp in Unix epoch nanoseconds.
684        since_unix_nanos: u128,
685    },
686    /// Control command was accepted.
687    CommandAccepted {
688        /// Command audit payload.
689        audit: CommandAudit,
690    },
691    /// Control command completed.
692    CommandCompleted {
693        /// Command audit payload.
694        audit: CommandAudit,
695    },
696    /// Runtime control loop started.
697    RuntimeControlLoopStarted {
698        /// Startup phase label.
699        phase: String,
700        /// Startup time in Unix epoch nanoseconds.
701        started_at_unix_nanos: u128,
702    },
703    /// Runtime control loop shutdown was requested.
704    RuntimeControlLoopShutdownRequested {
705        /// Stable command identifier.
706        command_id: String,
707        /// Caller that requested shutdown.
708        requested_by: String,
709        /// Operator-provided reason.
710        reason: String,
711    },
712    /// Runtime control loop completed normally.
713    RuntimeControlLoopCompleted {
714        /// Completion phase label.
715        phase: String,
716        /// Completion reason.
717        reason: String,
718        /// Completion time in Unix epoch nanoseconds.
719        completed_at_unix_nanos: u128,
720    },
721    /// Runtime control loop failed.
722    RuntimeControlLoopFailed {
723        /// Failure phase label.
724        phase: String,
725        /// Failure reason.
726        reason: String,
727        /// Whether failure came from panic.
728        panic: bool,
729        /// Whether a new supervisor can recover.
730        recoverable: bool,
731    },
732    /// Runtime control loop join completed.
733    RuntimeControlLoopJoinCompleted {
734        /// Stable command identifier.
735        command_id: String,
736        /// Caller that requested join.
737        requested_by: String,
738        /// Final state label.
739        state: String,
740        /// Final phase label.
741        phase: String,
742        /// Final reason.
743        reason: String,
744    },
745    /// Event subscriber lagged.
746    SubscriberLagged {
747        /// Number of missed events.
748        missed: u64,
749    },
750    /// Restart budget exhausted for a child.
751    BudgetExhausted {
752        /// Child whose budget ran out.
753        child_id: ChildId,
754        /// Nanoseconds to wait before retrying.
755        retry_after_ns: u128,
756        /// Source group that triggered the budget check (when applicable).
757        budget_source_group: Option<String>,
758    },
759    /// Group meltdown fuse was triggered.
760    GroupFuseTriggered {
761        /// Group that entered meltdown.
762        group_name: String,
763        /// Group from which the fuse propagated (when applicable).
764        propagated_from_group: Option<String>,
765    },
766    /// Escalation path bifurcation between critical and optional children.
767    EscalationBifurcated {
768        /// Severity classification for the escalation decision.
769        severity: String,
770        /// Budget verdict at the time of escalation (when available).
771        budget_verdict: Option<String>,
772        /// Meltdown outcome at the time of escalation (when available).
773        fuse_outcome: Option<String>,
774        /// Reason for tie-breaking (when applicable).
775        tie_break_reason: Option<String>,
776    },
777    /// Starvation alert emitted by the fairness probe (US1).
778    FairnessProbeStarvation {
779        /// The child that has been starved.
780        starved_child_id: ChildId,
781        /// How many scheduling opportunities were missed.
782        skip_count: u64,
783        /// Start of the probe window (Unix nanos).
784        probe_start_unix_nanos: u128,
785        /// End of the probe window (Unix nanos).
786        probe_end_unix_nanos: u128,
787    },
788    /// Restart budget denied by policy.
789    BudgetDenied {
790        /// Group associated with the budget check.
791        group: Option<String>,
792        /// Reason for the denial.
793        reason: String,
794        /// Remaining budget ratio.
795        budget_remaining: FiniteF64,
796    },
797    /// Generation fence engaged for child restart isolation.
798    GenerationFenced {
799        /// Old generation that was fenced.
800        old_generation: u64,
801        /// New generation that was allowed.
802        new_generation: u64,
803        /// Reason for the fence.
804        reason: String,
805    },
806    /// Health check passed for a child.
807    HealthCheckPassed {
808        /// Time since last check in milliseconds.
809        age_ms: u64,
810        /// Wall clock time when the child became healthy.
811        healthy_since_unix_nanos: u128,
812    },
813    /// Health check failed for a child.
814    HealthCheckFailed {
815        /// Failure reason.
816        reason: String,
817        /// Consecutive failure count.
818        consecutive_failures: u32,
819    },
820    /// Supervision paused for a child or group.
821    Paused {
822        /// Pause reason.
823        reason: String,
824        /// Caller that initiated the pause.
825        paused_by: String,
826    },
827    /// Supervision resumed for a child or group.
828    Resumed {
829        /// Resume reason.
830        reason: String,
831    },
832    /// Child or group was quarantined.
833    Quarantined {
834        /// Meltdown scope that triggered quarantine.
835        scope: MeltdownScope,
836        /// Quarantine reason.
837        reason: String,
838        /// Quarantine duration in seconds.
839        duration_secs: u64,
840    },
841    /// Backpressure alert emitted when subscriber buffer exceeds soft threshold.
842    BackpressureAlert {
843        /// Subscriber name or identifier.
844        subscriber: String,
845        /// Current buffer occupancy percentage.
846        buffer_pct: u8,
847        /// Threshold that triggered the alert.
848        threshold_pct: u8,
849    },
850    /// Backpressure degradation when subscriber buffer exceeds hard threshold.
851    BackpressureDegradation {
852        /// Subscriber name or identifier.
853        subscriber: String,
854        /// Active backpressure strategy.
855        strategy: String,
856        /// Current sampling ratio.
857        sample_ratio: FiniteF64,
858        /// Peak buffer occupancy during the degradation window.
859        buffer_peak_pct: u8,
860        /// Whether the subscriber has recovered.
861        recovered: bool,
862    },
863    /// Audit record for a command or lifecycle event.
864    AuditRecorded {
865        /// Command identifier.
866        command_id: String,
867        /// Event type being audited.
868        event_type: String,
869        /// Sampling ratio in effect when the audit was recorded.
870        sample_ratio: FiniteF64,
871        /// Correlation identifier linking this audit to the event chain.
872        correlation_id: CorrelationId,
873        /// Reason the audit was triggered.
874        trigger_reason: String,
875        /// Number of events discarded by sampling.
876        events_discarded: u64,
877    },
878    /// Child declaration was accepted and committed via add_child.
879    ChildDeclarationAccepted {
880        /// Transaction identifier for audit tracing.
881        transaction_id: Uuid,
882        /// Name of the accepted child.
883        child_name: String,
884        /// Runtime child identifier.
885        child_id: ChildId,
886        /// Supervisor spec hash after this operation.
887        spec_hash: String,
888    },
889    /// Child declaration was rejected via add_child.
890    ChildDeclarationRejected {
891        /// Transaction identifier for audit tracing.
892        transaction_id: Uuid,
893        /// Name of the rejected child.
894        child_name: String,
895        /// Human-readable rejection reason.
896        reason: String,
897        /// Optional JSON Pointer field path pointing to the error source.
898        field_path: Option<String>,
899    },
900}
901
902impl What {
903    /// Returns a low-cardinality event name.
904    ///
905    /// # Arguments
906    ///
907    /// This function has no arguments.
908    ///
909    /// # Returns
910    ///
911    /// Returns the stable event name.
912    ///
913    /// # Examples
914    ///
915    /// ```
916    /// let event = rust_supervisor::event::payload::What::ChildRunning {
917    ///     transition: None,
918    /// };
919    /// assert_eq!(event.name(), "ChildRunning");
920    /// ```
921    pub fn name(&self) -> &'static str {
922        match self {
923            Self::ChildStarting { .. } => "ChildStarting",
924            Self::ChildRunning { .. } => "ChildRunning",
925            Self::ChildReady { .. } => "ChildReady",
926            Self::ChildHeartbeat { .. } => "ChildHeartbeat",
927            Self::ChildFailed { .. } => "ChildFailed",
928            Self::ChildPanicked { .. } => "ChildPanicked",
929            Self::BackoffScheduled { .. } => "BackoffScheduled",
930            Self::ChildRestarting { .. } => "ChildRestarting",
931            Self::ChildRestarted { .. } => "ChildRestarted",
932            Self::ChildQuarantined { .. } => "ChildQuarantined",
933            Self::ChildStopped { .. } => "ChildStopped",
934            Self::ChildUnhealthy { .. } => "ChildUnhealthy",
935            Self::Meltdown { .. } => "Meltdown",
936            Self::ShutdownRequested { .. } => "ShutdownRequested",
937            Self::ShutdownPhaseChanged { .. } => "ShutdownPhaseChanged",
938            Self::ShutdownCompleted { .. } => "ShutdownCompleted",
939            Self::ChildShutdownCancelDelivered { .. } => "ChildShutdownCancelDelivered",
940            Self::ChildShutdownGraceful { .. } => "ChildShutdownGraceful",
941            Self::ChildShutdownAborted { .. } => "ChildShutdownAborted",
942            Self::ChildShutdownLateReport { .. } => "ChildShutdownLateReport",
943            Self::ChildRestartFenceEntered { .. } => "ChildRestartFenceEntered",
944            Self::ChildRestartFenceAbortRequested { .. } => "ChildRestartFenceAbortRequested",
945            Self::ChildRestartFenceReleased { .. } => "ChildRestartFenceReleased",
946            Self::ChildRestartConflict { .. } => "ChildRestartConflict",
947            Self::ChildAttemptStaleReport { .. } => "ChildAttemptStaleReport",
948            Self::ChildRestartFencePendingDrained { .. } => "ChildRestartFencePendingDrained",
949            Self::ChildControlCancelDelivered { .. } => "ChildControlCancelDelivered",
950            Self::ChildControlStopCompleted { .. } => "ChildControlStopCompleted",
951            Self::ChildControlStopFailed { .. } => "ChildControlStopFailed",
952            Self::ChildControlOperationChanged { .. } => "ChildControlOperationChanged",
953            Self::ChildControlCommandCompleted { .. } => "ChildControlCommandCompleted",
954            Self::ChildRuntimeRestartLimitUpdated { .. } => "ChildRuntimeRestartLimitUpdated",
955            Self::ChildRuntimeStateRemoved { .. } => "ChildRuntimeStateRemoved",
956            Self::ChildHeartbeatStale { .. } => "ChildHeartbeatStale",
957            Self::CommandAccepted { .. } => "CommandAccepted",
958            Self::CommandCompleted { .. } => "CommandCompleted",
959            Self::RuntimeControlLoopStarted { .. } => "RuntimeControlLoopStarted",
960            Self::RuntimeControlLoopShutdownRequested { .. } => {
961                "RuntimeControlLoopShutdownRequested"
962            }
963            Self::RuntimeControlLoopCompleted { .. } => "RuntimeControlLoopCompleted",
964            Self::RuntimeControlLoopFailed { .. } => "RuntimeControlLoopFailed",
965            Self::RuntimeControlLoopJoinCompleted { .. } => "RuntimeControlLoopJoinCompleted",
966            Self::SubscriberLagged { .. } => "SubscriberLagged",
967            Self::BudgetExhausted { .. } => "BudgetExhausted",
968            Self::GroupFuseTriggered { .. } => "GroupFuseTriggered",
969            Self::EscalationBifurcated { .. } => "EscalationBifurcated",
970            Self::FairnessProbeStarvation { .. } => "FairnessProbeStarvation",
971            Self::BudgetDenied { .. } => "BudgetDenied",
972            Self::GenerationFenced { .. } => "GenerationFenced",
973            Self::HealthCheckPassed { .. } => "HealthCheckPassed",
974            Self::HealthCheckFailed { .. } => "HealthCheckFailed",
975            Self::Paused { .. } => "Paused",
976            Self::Resumed { .. } => "Resumed",
977            Self::Quarantined { .. } => "Quarantined",
978            Self::BackpressureAlert { .. } => "BackpressureAlert",
979            Self::BackpressureDegradation { .. } => "BackpressureDegradation",
980            Self::AuditRecorded { .. } => "AuditRecorded",
981            Self::ChildDeclarationAccepted { .. } => "ChildDeclarationAccepted",
982            Self::ChildDeclarationRejected { .. } => "ChildDeclarationRejected",
983        }
984    }
985}
986
987/// Complete lifecycle event envelope.
988#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
989pub struct SupervisorEvent {
990    /// Schema version identifier, monotonically increasing.
991    pub schema_id: u64,
992    /// Time information for the lifecycle fact.
993    pub when: When,
994    /// Location information for the lifecycle fact.
995    pub r#where: Where,
996    /// Typed event payload.
997    pub what: What,
998    /// Optional policy decision related to the event.
999    pub policy: Option<PolicyDecision>,
1000    /// Monotonic event sequence.
1001    pub sequence: EventSequence,
1002    /// Correlation identifier shared by related signals.
1003    pub correlation_id: CorrelationId,
1004    /// Configuration version that produced this fact.
1005    pub config_version: u64,
1006    /// List of meltdown scopes that reached or exceeded thresholds in this evaluation round.
1007    pub scopes_triggered: Vec<MeltdownScope>,
1008    /// The dominant attribution scope for the effective meltdown verdict.
1009    pub lead_scope: Option<MeltdownScope>,
1010    /// The effective protection action on the restrictiveness ladder.
1011    pub effective_protective_action: Option<ProtectionAction>,
1012    /// Reason for cold start budget triggering or exhaustion.
1013    pub cold_start_reason: ColdStartReason,
1014    /// Reason for hot loop detection triggering.
1015    pub hot_loop_reason: HotLoopReason,
1016    /// Ownership of the throttle gate that limited concurrent restarts.
1017    pub throttle_gate_owner: ThrottleGateOwner,
1018    /// Effective task role used by the policy decision.
1019    pub task_role: Option<TaskRole>,
1020    /// Whether fallback task role defaults were used.
1021    pub used_fallback_default: bool,
1022    /// Source that produced the effective policy.
1023    pub effective_policy_source: Option<PolicySource>,
1024}
1025
1026impl SupervisorEvent {
1027    /// Creates a supervisor lifecycle event.
1028    ///
1029    /// # Arguments
1030    ///
1031    /// - `when`: Event timing.
1032    /// - `r#where`: Event location.
1033    /// - `what`: Event payload.
1034    /// - `sequence`: Monotonic event sequence.
1035    /// - `correlation_id`: Correlation identifier for related signals.
1036    /// - `config_version`: Configuration version for this event.
1037    ///
1038    /// # Returns
1039    ///
1040    /// Returns a [`SupervisorEvent`].
1041    ///
1042    /// # Examples
1043    ///
1044    /// ```
1045    /// let event = rust_supervisor::event::payload::SupervisorEvent::new(
1046    ///     rust_supervisor::event::time::When::new(
1047    ///         rust_supervisor::event::time::EventTime::deterministic(
1048    ///             1,
1049    ///             1,
1050    ///             0,
1051    ///             rust_supervisor::id::types::Generation::initial(),
1052    ///             rust_supervisor::id::types::ChildStartCount::first(),
1053    ///         ),
1054    ///     ),
1055    ///     rust_supervisor::event::payload::Where::new(
1056    ///         rust_supervisor::id::types::SupervisorPath::root(),
1057    ///     ),
1058    ///     rust_supervisor::event::payload::What::ChildRunning { transition: None },
1059    ///     rust_supervisor::event::time::EventSequence::new(1),
1060    ///     rust_supervisor::event::time::CorrelationId::from_uuid(uuid::Uuid::nil()),
1061    ///     1,
1062    /// );
1063    /// assert_eq!(event.what.name(), "ChildRunning");
1064    /// ```
1065    pub fn new(
1066        when: When,
1067        r#where: Where,
1068        what: What,
1069        sequence: EventSequence,
1070        correlation_id: CorrelationId,
1071        config_version: u64,
1072    ) -> Self {
1073        Self {
1074            schema_id: 1,
1075            when,
1076            r#where,
1077            what,
1078            policy: None,
1079            sequence,
1080            correlation_id,
1081            config_version,
1082            scopes_triggered: Vec::new(),
1083            lead_scope: None,
1084            effective_protective_action: None,
1085            cold_start_reason: ColdStartReason::NotApplicable,
1086            hot_loop_reason: HotLoopReason::NotApplicable,
1087            throttle_gate_owner: ThrottleGateOwner::None,
1088            task_role: None,
1089            used_fallback_default: false,
1090            effective_policy_source: None,
1091        }
1092    }
1093
1094    /// Attaches a policy decision to an event.
1095    ///
1096    /// # Arguments
1097    ///
1098    /// - `policy`: Policy decision produced for this lifecycle fact.
1099    ///
1100    /// # Returns
1101    ///
1102    /// Returns the updated [`SupervisorEvent`].
1103    pub fn with_policy(mut self, policy: PolicyDecision) -> Self {
1104        self.policy = Some(policy);
1105        self
1106    }
1107}