rust_supervisor/event/payload.rs
1//! Lifecycle event payloads and event envelopes.
2//!
3//! This module owns the observable shape of supervisor lifecycle facts. It keeps
4//! payloads typed so state, journal, metrics, and tests do not infer behavior
5//! from strings.
6
7use crate::child_runner::run_exit::TaskExit;
8use crate::control::outcome::{
9 ChildAttemptStatus, ChildControlFailurePhase, ChildControlOperation, ChildControlResult,
10 ChildStopState, RestartLimitState, StaleReportHandling,
11};
12use crate::error::types::TaskFailure;
13use crate::event::time::{CorrelationId, EventSequence, When};
14use crate::id::types::{ChildId, ChildStartCount, Generation, SupervisorPath};
15use crate::policy::task_role_defaults::{PolicySource, TaskRole};
16use serde::{Deserialize, Serialize};
17use uuid::Uuid;
18
/// Wrapper around [`f64`] that can implement [`Eq`] because NaN is rejected
/// on construction.
///
/// [`PartialEq`] is derived, so equality is the ordinary `f64` comparison of
/// the wrapped values. With NaN excluded that comparison is reflexive, which
/// is what the manual [`Eq`] impl relies on. NaN is rejected by
/// [`FiniteF64::new`] (which panics) and by deserialization (which returns an
/// error); comparing two values never panics.
///
/// Despite the name, only NaN is rejected: positive and negative infinity are
/// accepted.
///
/// This type exists solely to satisfy the `Eq` bound on the `What` enum and
/// should not be used outside this module.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(transparent)]
pub struct FiniteF64(#[serde(with = "finite_f64_serde")] f64);

// Marker impl: sound only while every way of building a `FiniteF64` keeps
// rejecting NaN, because `NaN != NaN` would break reflexivity.
impl Eq for FiniteF64 {}
29
30impl FiniteF64 {
31 /// Creates a `FiniteF64` from a raw `f64`.
32 ///
33 /// # Panics
34 ///
35 /// Panics if `value` is NaN.
36 pub fn new(value: f64) -> Self {
37 assert!(!value.is_nan(), "FiniteF64 does not support NaN");
38 Self(value)
39 }
40
41 /// Returns the inner `f64` value.
42 pub fn into_inner(self) -> f64 {
43 self.0
44 }
45}
46
47impl From<f64> for FiniteF64 {
48 /// Creates a `FiniteF64` from a raw `f64`.
49 ///
50 /// # Panics
51 ///
52 /// Panics if `value` is NaN.
53 fn from(value: f64) -> Self {
54 Self::new(value)
55 }
56}
57
58/// Serde helper that serializes `FiniteF64` as a plain JSON number.
59mod finite_f64_serde {
60 use serde::{Deserialize, Deserializer, Serialize, Serializer};
61
62 /// Serializes an `f64` as a plain JSON number.
63 pub fn serialize<S: Serializer>(value: &f64, serializer: S) -> Result<S::Ok, S::Error> {
64 value.serialize(serializer)
65 }
66
67 /// Deserializes an `f64` from a JSON number, rejecting NaN.
68 pub fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result<f64, D::Error> {
69 let value = f64::deserialize(deserializer)?;
70 if value.is_nan() {
71 return Err(serde::de::Error::custom("FiniteF64 does not support NaN"));
72 }
73 Ok(value)
74 }
75}
76
/// Meltdown scope identifier for failure tracking.
///
/// The [`Display`](std::fmt::Display) impl renders each variant as a
/// lowercase label: `child`, `group`, or `supervisor`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum MeltdownScope {
    /// Child-level scope bound to a specific child identifier.
    Child,
    /// Group-level scope bound to a restart execution plan group.
    Group,
    /// Supervisor-level scope bound to the supervisor instance boundary.
    Supervisor,
}
87
88impl std::fmt::Display for MeltdownScope {
89 /// Formats the meltdown scope as a string.
90 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91 match self {
92 Self::Child => write!(f, "child"),
93 Self::Group => write!(f, "group"),
94 Self::Supervisor => write!(f, "supervisor"),
95 }
96 }
97}
98
/// Protection restrictiveness ladder defining escalation severity levels.
///
/// This enum defines six protection tiers from least to most restrictive.
///
/// `PartialOrd` and `Ord` are derived, so comparisons follow declaration
/// order: a variant listed later compares greater than one listed earlier.
/// Keep the variants in least-to-most restrictive order when editing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum ProtectionAction {
    /// Restart is allowed without restrictions.
    RestartAllowed,
    /// Restart is queued behind concurrency throttle gates.
    RestartQueued,
    /// Restart is denied due to policy limits.
    RestartDenied,
    /// Supervision is paused temporarily.
    SupervisionPaused,
    /// Failure is escalated to parent supervisor.
    Escalated,
    /// Supervised stop is enforced for the child.
    SupervisedStop,
}
117
118impl std::fmt::Display for ProtectionAction {
119 /// Formats the protection action as a string.
120 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
121 match self {
122 Self::RestartAllowed => write!(f, "restart_allowed"),
123 Self::RestartQueued => write!(f, "restart_queued"),
124 Self::RestartDenied => write!(f, "restart_denied"),
125 Self::SupervisionPaused => write!(f, "supervision_paused"),
126 Self::Escalated => write!(f, "escalated"),
127 Self::SupervisedStop => write!(f, "supervised_stop"),
128 }
129 }
130}
131
/// Reason for cold start budget triggering or exhaustion.
///
/// The [`Display`](std::fmt::Display) impl renders each variant as a
/// snake_case label (for example `budget_exhausted`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ColdStartReason {
    /// Cold start budget has not been triggered.
    NotApplicable,
    /// Initial startup within cold start window.
    InitialStartup,
    /// Cold start budget exhausted within time window.
    BudgetExhausted,
    /// Too many restarts during cold start period.
    ExcessiveRestarts,
}
144
145impl std::fmt::Display for ColdStartReason {
146 /// Formats the cold start reason as a string.
147 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148 match self {
149 Self::NotApplicable => write!(f, "not_applicable"),
150 Self::InitialStartup => write!(f, "initial_startup"),
151 Self::BudgetExhausted => write!(f, "budget_exhausted"),
152 Self::ExcessiveRestarts => write!(f, "excessive_restarts"),
153 }
154 }
155}
156
/// Reason for hot loop detection triggering.
///
/// The [`Display`](std::fmt::Display) impl renders each variant as a
/// snake_case label (for example `rapid_crash_detected`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HotLoopReason {
    /// Hot loop detection has not been triggered.
    NotApplicable,
    /// Rapid crash detected within sliding time window.
    RapidCrashDetected,
    /// Crash-restart cycle exceeded threshold frequency.
    CycleThresholdExceeded,
    /// Insufficient stable runtime between restarts.
    InsufficientStableRuntime,
}
169
170impl std::fmt::Display for HotLoopReason {
171 /// Formats the hot loop reason as a string.
172 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173 match self {
174 Self::NotApplicable => write!(f, "not_applicable"),
175 Self::RapidCrashDetected => write!(f, "rapid_crash_detected"),
176 Self::CycleThresholdExceeded => write!(f, "cycle_threshold_exceeded"),
177 Self::InsufficientStableRuntime => write!(f, "insufficient_stable_runtime"),
178 }
179 }
180}
181
/// Ownership of the throttle gate that limited concurrent restarts.
///
/// The [`Display`](std::fmt::Display) impl renders `None` as `none`,
/// `SupervisorInstance` as `supervisor_global`, and `Group(id)` as
/// `group:<id>`. Note that the `SupervisorInstance` label does not mirror the
/// variant name.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ThrottleGateOwner {
    /// No throttle gate was active.
    None,
    /// Instance-global supervisor throttle gate.
    SupervisorInstance,
    /// Group-level throttle gate with group identifier.
    Group(String),
}
192
193impl std::fmt::Display for ThrottleGateOwner {
194 /// Formats the throttle gate owner as a string.
195 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
196 match self {
197 Self::None => write!(f, "none"),
198 Self::SupervisorInstance => write!(f, "supervisor_global"),
199 Self::Group(group) => write!(f, "group:{}", group),
200 }
201 }
202}
203
/// Location data attached to a supervisor event.
///
/// [`Where::new`] fills `supervisor_path`, `pid`, and `thread_name`; every
/// other field starts as `None` until a caller sets it (for example through
/// [`Where::with_child`]).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Where {
    /// Stable supervisor path that owns the fact.
    pub supervisor_path: SupervisorPath,
    /// Parent child identifier when the fact belongs to a nested node.
    pub parent_id: Option<ChildId>,
    /// Child identifier related to the fact.
    pub child_id: Option<ChildId>,
    /// Human-readable child name.
    pub child_name: Option<String>,
    /// Tokio task identifier when it is available.
    pub tokio_task_id: Option<String>,
    /// Host name reported by the runtime.
    pub host: Option<String>,
    /// Process identifier that emitted the event.
    pub pid: u32,
    /// Current thread name when available.
    pub thread_name: Option<String>,
    /// Rust module path that emitted the event.
    pub module_path: Option<String>,
    /// Source file that emitted the event.
    pub source_file: Option<String>,
    /// Source line that emitted the event.
    pub source_line: Option<u32>,
}
230
231impl Where {
232 /// Creates a location for a supervisor path.
233 ///
234 /// # Arguments
235 ///
236 /// - `supervisor_path`: Path that owns this lifecycle fact.
237 ///
238 /// # Returns
239 ///
240 /// Returns a [`Where`] value with process and thread defaults.
241 ///
242 /// # Examples
243 ///
244 /// ```
245 /// let location = rust_supervisor::event::payload::Where::new(
246 /// rust_supervisor::id::types::SupervisorPath::root(),
247 /// );
248 /// assert_eq!(location.supervisor_path.to_string(), "/");
249 /// ```
250 pub fn new(supervisor_path: SupervisorPath) -> Self {
251 Self {
252 supervisor_path,
253 parent_id: None,
254 child_id: None,
255 child_name: None,
256 tokio_task_id: None,
257 host: None,
258 pid: std::process::id(),
259 thread_name: std::thread::current().name().map(ToOwned::to_owned),
260 module_path: None,
261 source_file: None,
262 source_line: None,
263 }
264 }
265
266 /// Adds child identity to the location.
267 ///
268 /// # Arguments
269 ///
270 /// - `child_id`: Stable child identifier.
271 /// - `child_name`: Human-readable child name.
272 ///
273 /// # Returns
274 ///
275 /// Returns the updated [`Where`] value.
276 pub fn with_child(mut self, child_id: ChildId, child_name: impl Into<String>) -> Self {
277 self.child_id = Some(child_id);
278 self.child_name = Some(child_name.into());
279 self
280 }
281}
282
/// State transition recorded by an event payload.
///
/// Both endpoints are stored as free-form state names.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StateTransition {
    /// State before the transition.
    pub from: String,
    /// State after the transition.
    pub to: String,
}
291
292impl StateTransition {
293 /// Creates a state transition description.
294 ///
295 /// # Arguments
296 ///
297 /// - `from`: Previous state name.
298 /// - `to`: New state name.
299 ///
300 /// # Returns
301 ///
302 /// Returns a [`StateTransition`].
303 pub fn new(from: impl Into<String>, to: impl Into<String>) -> Self {
304 Self {
305 from: from.into(),
306 to: to.into(),
307 }
308 }
309}
310
/// Policy decision data stored with an event.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PolicyDecision {
    /// Low-cardinality decision name.
    pub decision: String,
    /// Delay in milliseconds when restart is delayed; `None` otherwise.
    pub delay_ms: Option<u64>,
    /// Human-readable reason for diagnostics.
    pub reason: Option<String>,
}
321
322impl PolicyDecision {
323 /// Creates a policy decision value.
324 ///
325 /// # Arguments
326 ///
327 /// - `decision`: Low-cardinality decision name.
328 /// - `delay_ms`: Optional delay in milliseconds.
329 /// - `reason`: Optional diagnostic reason.
330 ///
331 /// # Returns
332 ///
333 /// Returns a [`PolicyDecision`].
334 pub fn new(decision: impl Into<String>, delay_ms: Option<u64>, reason: Option<String>) -> Self {
335 Self {
336 decision: decision.into(),
337 delay_ms,
338 reason,
339 }
340 }
341}
342
/// Command audit data attached to command lifecycle events.
///
/// Carried by the `CommandAccepted` and `CommandCompleted` variants of
/// [`What`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CommandAudit {
    /// Stable command identifier.
    pub command_id: String,
    /// Caller that requested the command.
    pub requested_by: String,
    /// Operator-provided reason.
    pub reason: String,
    /// Target path for the command.
    pub target_path: SupervisorPath,
    /// Accepted time in nanoseconds since the Unix epoch.
    pub accepted_at_unix_nanos: u128,
    /// Command result summary.
    pub result: String,
}
359
/// Typed payload for supervisor lifecycle events.
///
/// Serde representation (from the attribute below): adjacently tagged, with
/// the variant name rendered in `snake_case` under `"type"` and the variant's
/// fields under `"payload"`. [`What::name`] returns the Rust variant
/// identifier instead, so the two spellings differ.
///
/// The enum derives [`Eq`]; floating-point fields therefore use
/// [`FiniteF64`] rather than a bare `f64`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "payload", rename_all = "snake_case")]
pub enum What {
    /// Child is being started.
    ChildStarting {
        /// Optional state transition carried by this event.
        transition: Option<StateTransition>,
    },
    /// Child is running.
    ChildRunning {
        /// Optional state transition carried by this event.
        transition: Option<StateTransition>,
    },
    /// Child is ready.
    ChildReady {
        /// Optional state transition carried by this event.
        transition: Option<StateTransition>,
    },
    /// Child emitted a heartbeat.
    ChildHeartbeat {
        /// Heartbeat age in milliseconds.
        age_ms: u64,
    },
    /// Child failed with a typed failure.
    ChildFailed {
        /// Failure payload reported by the task.
        failure: TaskFailure,
    },
    /// Child panicked.
    ChildPanicked {
        /// Panic category used for metrics.
        category: String,
    },
    /// Restart backoff was scheduled.
    BackoffScheduled {
        /// Backoff delay in milliseconds.
        delay_ms: u64,
    },
    /// Child is restarting.
    ChildRestarting {
        /// Restart generation after the transition.
        generation: u64,
    },
    /// Child restarted.
    ChildRestarted {
        /// Restart count for the child window.
        restart_count: u64,
    },
    /// Child was quarantined.
    ChildQuarantined {
        /// Quarantine reason.
        reason: String,
    },
    /// Child stopped.
    ChildStopped {
        /// Exit reason.
        reason: String,
    },
    /// Child became unhealthy.
    ChildUnhealthy {
        /// Unhealthy reason.
        reason: String,
    },
    /// Meltdown fuse was tripped.
    Meltdown {
        /// Scope that tripped the fuse, as a free-form string (the
        /// `Quarantined` variant carries a typed [`MeltdownScope`] instead).
        scope: String,
    },
    /// Shutdown was requested.
    ShutdownRequested {
        /// Shutdown cause.
        cause: String,
    },
    /// Shutdown phase changed.
    ShutdownPhaseChanged {
        /// Previous phase name.
        from: String,
        /// New phase name.
        to: String,
    },
    /// Shutdown completed.
    ShutdownCompleted {
        /// Final shutdown phase.
        phase: String,
        /// Shutdown result summary.
        result: String,
        /// Full pipeline duration in milliseconds.
        duration_ms: u64,
    },
    /// Cancellation was delivered to one supervised child run during shutdown draining.
    ChildShutdownCancelDelivered {
        /// Child that received cancellation.
        child_id: ChildId,
        /// Generation of the child run.
        generation: Generation,
        /// Start count of the child run.
        child_start_count: ChildStartCount,
        /// Shutdown phase that delivered cancellation.
        phase: String,
    },
    /// Child finished during graceful shutdown draining.
    ChildShutdownGraceful {
        /// Child that completed gracefully.
        child_id: ChildId,
        /// Generation of the child run.
        generation: Generation,
        /// Start count of the child run.
        child_start_count: ChildStartCount,
        /// Shutdown phase that recorded the outcome.
        phase: String,
        /// Exit classification reported by the child.
        exit: String,
    },
    /// Child was aborted during shutdown.
    ChildShutdownAborted {
        /// Child that was aborted.
        child_id: ChildId,
        /// Generation of the child run.
        generation: Generation,
        /// Start count of the child run.
        child_start_count: ChildStartCount,
        /// Shutdown phase that recorded the outcome.
        phase: String,
        /// Low-cardinality abort result.
        result: String,
        /// Human-readable abort reason.
        reason: String,
    },
    /// Child reported after its normal shutdown accounting window.
    ChildShutdownLateReport {
        /// Child that produced a late report.
        child_id: ChildId,
        /// Generation of the child run.
        generation: Generation,
        /// Start count of the child run.
        child_start_count: ChildStartCount,
        /// Shutdown phase that received the late report.
        phase: String,
        /// Exit classification reported by the child.
        exit: String,
    },
    /// Generation fence engaged for an accepted manual restart waiting for an old attempt to stop.
    ChildRestartFenceEntered {
        /// Child awaiting restart isolation.
        child_id: ChildId,
        /// Old generation pinned until the fence releases.
        old_generation: Generation,
        /// Old attempt pinned until the fence releases.
        old_attempt: ChildStartCount,
        /// Target generation queued after the old attempt completes.
        target_generation: Generation,
        /// Command identifier tying this fence to auditing metadata.
        command_id: String,
        /// Restart requester captured from command metadata.
        requested_by: String,
        /// Restart reason captured from command metadata.
        reason: String,
        /// Deadline for cooperative stop before escalation to abort paths,
        /// in nanoseconds since the Unix epoch.
        stop_deadline_at_unix_nanos: u128,
    },
    /// Runtime escalated restart isolation to abort the old attempt after the cooperative deadline elapsed.
    ChildRestartFenceAbortRequested {
        /// Child awaiting restart isolation.
        child_id: ChildId,
        /// Old generation that failed to exit before the graceful deadline expired.
        old_generation: Generation,
        /// Old attempt that failed to exit before the graceful deadline expired.
        old_attempt: ChildStartCount,
        /// Target generation queued for start after isolation completes.
        target_generation: Generation,
        /// Command identifier tied to the pending restart bookkeeping.
        command_id: String,
        /// Deadline that triggered the abort escalation, in nanoseconds
        /// since the Unix epoch.
        deadline_unix_nanos: u128,
    },
    /// Old attempt completed and a new generation may start under the pending restart request.
    ChildRestartFenceReleased {
        /// Child whose fence released.
        child_id: ChildId,
        /// Old generation that fully stopped.
        old_generation: Generation,
        /// Old attempt that fully stopped.
        old_attempt: ChildStartCount,
        /// Target generation allowed to start after this release.
        target_generation: Generation,
        /// Exit classification reported for the old attempt.
        exit_kind: TaskExit,
    },
    /// Conflicting restart intent that was merged, rejected, or superseded by policy.
    ChildRestartConflict {
        /// Child identifier for the fencing scope.
        child_id: ChildId,
        /// Generation that was active or pinned when the conflict was classified.
        current_generation: Option<Generation>,
        /// Attempt counter that was active or pinned when the conflict was classified.
        current_attempt: Option<ChildStartCount>,
        /// Generation the caller wanted to reach, if applicable.
        target_generation: Option<Generation>,
        /// Command identifier supplied by the caller when present.
        command_id: String,
        /// Low-cardinality conflict classifier (`already_pending`, `rejected`, ...).
        decision: String,
        /// Human-readable reason for observability dumps.
        reason: String,
    },
    /// Stale completion triple observed after authoritative state moved forward.
    ChildAttemptStaleReport {
        /// Child identifier tied to the completion triple.
        child_id: ChildId,
        /// Generation carried by the stale completion report.
        reported_generation: Generation,
        /// Attempt counter carried by the stale completion report.
        reported_attempt: ChildStartCount,
        /// Generation considered authoritative when the stale report arrived.
        current_generation: Option<Generation>,
        /// Attempt counter considered authoritative when the stale report arrived.
        current_attempt: Option<ChildStartCount>,
        /// Exit classification supplied by the stale report.
        exit_kind: TaskExit,
        /// Runtime-selected handling bucket for metrics and audits.
        handled_as: StaleReportHandling,
    },
    /// Pending restart bookkeeping drained because the pinned old attempt exited.
    ChildRestartFencePendingDrained {
        /// Child whose pending restart advanced past the cooperative stop barrier.
        child_id: ChildId,
    },
    /// Child control command delivered cancellation.
    ChildControlCancelDelivered {
        /// Child that received cancellation.
        child_id: ChildId,
        /// Generation that received cancellation.
        generation: Generation,
        /// Attempt that received cancellation.
        attempt: ChildStartCount,
        /// Control command name.
        command: String,
        /// Control command identifier.
        command_id: String,
    },
    /// Child control stop completed.
    ChildControlStopCompleted {
        /// Child that completed stopping.
        child_id: ChildId,
        /// Generation that completed stopping.
        generation: Generation,
        /// Attempt that completed stopping.
        attempt: ChildStartCount,
        /// Child exit classification.
        exit_kind: TaskExit,
    },
    /// Child control stop failed.
    ChildControlStopFailed {
        /// Child that failed to stop.
        child_id: ChildId,
        /// Generation that failed to stop.
        generation: Generation,
        /// Attempt that failed to stop.
        attempt: ChildStartCount,
        /// Current attempt status.
        status: ChildAttemptStatus,
        /// Current stop progress.
        stop_state: ChildStopState,
        /// Control failure phase.
        phase: ChildControlFailurePhase,
        /// Human-readable failure reason.
        reason: String,
        /// Whether callers can retry to recover.
        recoverable: bool,
    },
    /// Child control operation changed.
    ChildControlOperationChanged {
        /// Child whose operation changed.
        child_id: ChildId,
        /// Previous operation.
        from: ChildControlOperation,
        /// New operation.
        to: ChildControlOperation,
        /// Control command name.
        command: String,
        /// Control command identifier.
        command_id: String,
    },
    /// Child control command completed with a full outcome.
    ChildControlCommandCompleted {
        /// Child that the command targeted.
        child_id: ChildId,
        /// Stable control command name.
        command: String,
        /// Control command identifier.
        command_id: String,
        /// Caller that requested the command.
        requested_by: String,
        /// Operator-provided reason.
        reason: String,
        /// Low-cardinality command result.
        result: String,
        /// Full control outcome, stored behind a `Box`.
        outcome: Box<ChildControlResult>,
    },
    /// Child restart limit accounting was refreshed.
    ChildRuntimeRestartLimitUpdated {
        /// Child whose restart limit accounting changed.
        child_id: ChildId,
        /// Updated restart limit state.
        restart_limit: RestartLimitState,
    },
    /// Child runtime state record was removed.
    ChildRuntimeStateRemoved {
        /// Removed child.
        child_id: ChildId,
        /// Child path in the supervisor tree.
        path: SupervisorPath,
        /// Final attempt status.
        final_status: Option<ChildAttemptStatus>,
    },
    /// Child heartbeat became stale.
    ChildHeartbeatStale {
        /// Child with a stale heartbeat.
        child_id: ChildId,
        /// Attempt with a stale heartbeat.
        attempt: ChildStartCount,
        /// Last heartbeat timestamp in Unix epoch nanoseconds.
        since_unix_nanos: u128,
    },
    /// Control command was accepted.
    CommandAccepted {
        /// Command audit payload.
        audit: CommandAudit,
    },
    /// Control command completed.
    CommandCompleted {
        /// Command audit payload.
        audit: CommandAudit,
    },
    /// Runtime control loop started.
    RuntimeControlLoopStarted {
        /// Startup phase label.
        phase: String,
        /// Startup time in Unix epoch nanoseconds.
        started_at_unix_nanos: u128,
    },
    /// Runtime control loop shutdown was requested.
    RuntimeControlLoopShutdownRequested {
        /// Stable command identifier.
        command_id: String,
        /// Caller that requested shutdown.
        requested_by: String,
        /// Operator-provided reason.
        reason: String,
    },
    /// Runtime control loop completed normally.
    RuntimeControlLoopCompleted {
        /// Completion phase label.
        phase: String,
        /// Completion reason.
        reason: String,
        /// Completion time in Unix epoch nanoseconds.
        completed_at_unix_nanos: u128,
    },
    /// Runtime control loop failed.
    RuntimeControlLoopFailed {
        /// Failure phase label.
        phase: String,
        /// Failure reason.
        reason: String,
        /// Whether failure came from panic.
        panic: bool,
        /// Whether a new supervisor can recover.
        recoverable: bool,
    },
    /// Runtime control loop join completed.
    RuntimeControlLoopJoinCompleted {
        /// Stable command identifier.
        command_id: String,
        /// Caller that requested join.
        requested_by: String,
        /// Final state label.
        state: String,
        /// Final phase label.
        phase: String,
        /// Final reason.
        reason: String,
    },
    /// Event subscriber lagged.
    SubscriberLagged {
        /// Number of missed events.
        missed: u64,
    },
    /// Restart budget exhausted for a child.
    BudgetExhausted {
        /// Child whose budget ran out.
        child_id: ChildId,
        /// Nanoseconds to wait before retrying.
        retry_after_ns: u128,
        /// Source group that triggered the budget check (when applicable).
        budget_source_group: Option<String>,
    },
    /// Group meltdown fuse was triggered.
    GroupFuseTriggered {
        /// Group that entered meltdown.
        group_name: String,
        /// Group from which the fuse propagated (when applicable).
        propagated_from_group: Option<String>,
    },
    /// Escalation path bifurcation between critical and optional children.
    EscalationBifurcated {
        /// Severity classification for the escalation decision.
        severity: String,
        /// Budget verdict at the time of escalation (when available).
        budget_verdict: Option<String>,
        /// Meltdown outcome at the time of escalation (when available).
        fuse_outcome: Option<String>,
        /// Reason for tie-breaking (when applicable).
        tie_break_reason: Option<String>,
    },
    /// Starvation alert emitted by the fairness probe (US1).
    FairnessProbeStarvation {
        /// The child that has been starved.
        starved_child_id: ChildId,
        /// How many scheduling opportunities were missed.
        skip_count: u64,
        /// Start of the probe window (Unix nanos).
        probe_start_unix_nanos: u128,
        /// End of the probe window (Unix nanos).
        probe_end_unix_nanos: u128,
    },
    /// Restart budget denied by policy.
    BudgetDenied {
        /// Group associated with the budget check.
        group: Option<String>,
        /// Reason for the denial.
        reason: String,
        /// Remaining budget ratio; never NaN because of [`FiniteF64`].
        budget_remaining: FiniteF64,
    },
    /// Generation fence engaged for child restart isolation.
    GenerationFenced {
        /// Old generation that was fenced.
        old_generation: u64,
        /// New generation that was allowed.
        new_generation: u64,
        /// Reason for the fence.
        reason: String,
    },
    /// Health check passed for a child.
    HealthCheckPassed {
        /// Time since last check in milliseconds.
        age_ms: u64,
        /// Wall clock time when the child became healthy.
        healthy_since_unix_nanos: u128,
    },
    /// Health check failed for a child.
    HealthCheckFailed {
        /// Failure reason.
        reason: String,
        /// Consecutive failure count.
        consecutive_failures: u32,
    },
    /// Supervision paused for a child or group.
    Paused {
        /// Pause reason.
        reason: String,
        /// Caller that initiated the pause.
        paused_by: String,
    },
    /// Supervision resumed for a child or group.
    Resumed {
        /// Resume reason.
        reason: String,
    },
    /// Child or group was quarantined.
    Quarantined {
        /// Meltdown scope that triggered quarantine.
        scope: MeltdownScope,
        /// Quarantine reason.
        reason: String,
        /// Quarantine duration in seconds.
        duration_secs: u64,
    },
    /// Backpressure alert emitted when subscriber buffer exceeds soft threshold.
    BackpressureAlert {
        /// Subscriber name or identifier.
        subscriber: String,
        /// Current buffer occupancy percentage.
        buffer_pct: u8,
        /// Threshold that triggered the alert.
        threshold_pct: u8,
    },
    /// Backpressure degradation when subscriber buffer exceeds hard threshold.
    BackpressureDegradation {
        /// Subscriber name or identifier.
        subscriber: String,
        /// Active backpressure strategy.
        strategy: String,
        /// Current sampling ratio; never NaN because of [`FiniteF64`].
        sample_ratio: FiniteF64,
        /// Peak buffer occupancy during the degradation window.
        buffer_peak_pct: u8,
        /// Whether the subscriber has recovered.
        recovered: bool,
    },
    /// Audit record for a command or lifecycle event.
    AuditRecorded {
        /// Command identifier.
        command_id: String,
        /// Event type being audited.
        event_type: String,
        /// Sampling ratio in effect when the audit was recorded; never NaN
        /// because of [`FiniteF64`].
        sample_ratio: FiniteF64,
        /// Correlation identifier linking this audit to the event chain.
        correlation_id: CorrelationId,
        /// Reason the audit was triggered.
        trigger_reason: String,
        /// Number of events discarded by sampling.
        events_discarded: u64,
    },
    /// Child declaration was accepted and committed via add_child.
    ChildDeclarationAccepted {
        /// Transaction identifier for audit tracing.
        transaction_id: Uuid,
        /// Name of the accepted child.
        child_name: String,
        /// Runtime child identifier.
        child_id: ChildId,
        /// Supervisor spec hash after this operation.
        spec_hash: String,
    },
    /// Child declaration was rejected via add_child.
    ChildDeclarationRejected {
        /// Transaction identifier for audit tracing.
        transaction_id: Uuid,
        /// Name of the rejected child.
        child_name: String,
        /// Human-readable rejection reason.
        reason: String,
        /// Optional JSON Pointer field path pointing to the error source.
        field_path: Option<String>,
    },
}
901
902impl What {
903 /// Returns a low-cardinality event name.
904 ///
905 /// # Arguments
906 ///
907 /// This function has no arguments.
908 ///
909 /// # Returns
910 ///
911 /// Returns the stable event name.
912 ///
913 /// # Examples
914 ///
915 /// ```
916 /// let event = rust_supervisor::event::payload::What::ChildRunning {
917 /// transition: None,
918 /// };
919 /// assert_eq!(event.name(), "ChildRunning");
920 /// ```
921 pub fn name(&self) -> &'static str {
922 match self {
923 Self::ChildStarting { .. } => "ChildStarting",
924 Self::ChildRunning { .. } => "ChildRunning",
925 Self::ChildReady { .. } => "ChildReady",
926 Self::ChildHeartbeat { .. } => "ChildHeartbeat",
927 Self::ChildFailed { .. } => "ChildFailed",
928 Self::ChildPanicked { .. } => "ChildPanicked",
929 Self::BackoffScheduled { .. } => "BackoffScheduled",
930 Self::ChildRestarting { .. } => "ChildRestarting",
931 Self::ChildRestarted { .. } => "ChildRestarted",
932 Self::ChildQuarantined { .. } => "ChildQuarantined",
933 Self::ChildStopped { .. } => "ChildStopped",
934 Self::ChildUnhealthy { .. } => "ChildUnhealthy",
935 Self::Meltdown { .. } => "Meltdown",
936 Self::ShutdownRequested { .. } => "ShutdownRequested",
937 Self::ShutdownPhaseChanged { .. } => "ShutdownPhaseChanged",
938 Self::ShutdownCompleted { .. } => "ShutdownCompleted",
939 Self::ChildShutdownCancelDelivered { .. } => "ChildShutdownCancelDelivered",
940 Self::ChildShutdownGraceful { .. } => "ChildShutdownGraceful",
941 Self::ChildShutdownAborted { .. } => "ChildShutdownAborted",
942 Self::ChildShutdownLateReport { .. } => "ChildShutdownLateReport",
943 Self::ChildRestartFenceEntered { .. } => "ChildRestartFenceEntered",
944 Self::ChildRestartFenceAbortRequested { .. } => "ChildRestartFenceAbortRequested",
945 Self::ChildRestartFenceReleased { .. } => "ChildRestartFenceReleased",
946 Self::ChildRestartConflict { .. } => "ChildRestartConflict",
947 Self::ChildAttemptStaleReport { .. } => "ChildAttemptStaleReport",
948 Self::ChildRestartFencePendingDrained { .. } => "ChildRestartFencePendingDrained",
949 Self::ChildControlCancelDelivered { .. } => "ChildControlCancelDelivered",
950 Self::ChildControlStopCompleted { .. } => "ChildControlStopCompleted",
951 Self::ChildControlStopFailed { .. } => "ChildControlStopFailed",
952 Self::ChildControlOperationChanged { .. } => "ChildControlOperationChanged",
953 Self::ChildControlCommandCompleted { .. } => "ChildControlCommandCompleted",
954 Self::ChildRuntimeRestartLimitUpdated { .. } => "ChildRuntimeRestartLimitUpdated",
955 Self::ChildRuntimeStateRemoved { .. } => "ChildRuntimeStateRemoved",
956 Self::ChildHeartbeatStale { .. } => "ChildHeartbeatStale",
957 Self::CommandAccepted { .. } => "CommandAccepted",
958 Self::CommandCompleted { .. } => "CommandCompleted",
959 Self::RuntimeControlLoopStarted { .. } => "RuntimeControlLoopStarted",
960 Self::RuntimeControlLoopShutdownRequested { .. } => {
961 "RuntimeControlLoopShutdownRequested"
962 }
963 Self::RuntimeControlLoopCompleted { .. } => "RuntimeControlLoopCompleted",
964 Self::RuntimeControlLoopFailed { .. } => "RuntimeControlLoopFailed",
965 Self::RuntimeControlLoopJoinCompleted { .. } => "RuntimeControlLoopJoinCompleted",
966 Self::SubscriberLagged { .. } => "SubscriberLagged",
967 Self::BudgetExhausted { .. } => "BudgetExhausted",
968 Self::GroupFuseTriggered { .. } => "GroupFuseTriggered",
969 Self::EscalationBifurcated { .. } => "EscalationBifurcated",
970 Self::FairnessProbeStarvation { .. } => "FairnessProbeStarvation",
971 Self::BudgetDenied { .. } => "BudgetDenied",
972 Self::GenerationFenced { .. } => "GenerationFenced",
973 Self::HealthCheckPassed { .. } => "HealthCheckPassed",
974 Self::HealthCheckFailed { .. } => "HealthCheckFailed",
975 Self::Paused { .. } => "Paused",
976 Self::Resumed { .. } => "Resumed",
977 Self::Quarantined { .. } => "Quarantined",
978 Self::BackpressureAlert { .. } => "BackpressureAlert",
979 Self::BackpressureDegradation { .. } => "BackpressureDegradation",
980 Self::AuditRecorded { .. } => "AuditRecorded",
981 Self::ChildDeclarationAccepted { .. } => "ChildDeclarationAccepted",
982 Self::ChildDeclarationRejected { .. } => "ChildDeclarationRejected",
983 }
984 }
985}
986
987/// Complete lifecycle event envelope.
988#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
989pub struct SupervisorEvent {
990 /// Schema version identifier, monotonically increasing.
991 pub schema_id: u64,
992 /// Time information for the lifecycle fact.
993 pub when: When,
994 /// Location information for the lifecycle fact.
995 pub r#where: Where,
996 /// Typed event payload.
997 pub what: What,
998 /// Optional policy decision related to the event.
999 pub policy: Option<PolicyDecision>,
1000 /// Monotonic event sequence.
1001 pub sequence: EventSequence,
1002 /// Correlation identifier shared by related signals.
1003 pub correlation_id: CorrelationId,
1004 /// Configuration version that produced this fact.
1005 pub config_version: u64,
1006 /// List of meltdown scopes that reached or exceeded thresholds in this evaluation round.
1007 pub scopes_triggered: Vec<MeltdownScope>,
1008 /// The dominant attribution scope for the effective meltdown verdict.
1009 pub lead_scope: Option<MeltdownScope>,
1010 /// The effective protection action on the restrictiveness ladder.
1011 pub effective_protective_action: Option<ProtectionAction>,
1012 /// Reason for cold start budget triggering or exhaustion.
1013 pub cold_start_reason: ColdStartReason,
1014 /// Reason for hot loop detection triggering.
1015 pub hot_loop_reason: HotLoopReason,
1016 /// Ownership of the throttle gate that limited concurrent restarts.
1017 pub throttle_gate_owner: ThrottleGateOwner,
1018 /// Effective task role used by the policy decision.
1019 pub task_role: Option<TaskRole>,
1020 /// Whether fallback task role defaults were used.
1021 pub used_fallback_default: bool,
1022 /// Source that produced the effective policy.
1023 pub effective_policy_source: Option<PolicySource>,
1024}
1025
1026impl SupervisorEvent {
1027 /// Creates a supervisor lifecycle event.
1028 ///
1029 /// # Arguments
1030 ///
1031 /// - `when`: Event timing.
1032 /// - `r#where`: Event location.
1033 /// - `what`: Event payload.
1034 /// - `sequence`: Monotonic event sequence.
1035 /// - `correlation_id`: Correlation identifier for related signals.
1036 /// - `config_version`: Configuration version for this event.
1037 ///
1038 /// # Returns
1039 ///
1040 /// Returns a [`SupervisorEvent`].
1041 ///
1042 /// # Examples
1043 ///
1044 /// ```
1045 /// let event = rust_supervisor::event::payload::SupervisorEvent::new(
1046 /// rust_supervisor::event::time::When::new(
1047 /// rust_supervisor::event::time::EventTime::deterministic(
1048 /// 1,
1049 /// 1,
1050 /// 0,
1051 /// rust_supervisor::id::types::Generation::initial(),
1052 /// rust_supervisor::id::types::ChildStartCount::first(),
1053 /// ),
1054 /// ),
1055 /// rust_supervisor::event::payload::Where::new(
1056 /// rust_supervisor::id::types::SupervisorPath::root(),
1057 /// ),
1058 /// rust_supervisor::event::payload::What::ChildRunning { transition: None },
1059 /// rust_supervisor::event::time::EventSequence::new(1),
1060 /// rust_supervisor::event::time::CorrelationId::from_uuid(uuid::Uuid::nil()),
1061 /// 1,
1062 /// );
1063 /// assert_eq!(event.what.name(), "ChildRunning");
1064 /// ```
1065 pub fn new(
1066 when: When,
1067 r#where: Where,
1068 what: What,
1069 sequence: EventSequence,
1070 correlation_id: CorrelationId,
1071 config_version: u64,
1072 ) -> Self {
1073 Self {
1074 schema_id: 1,
1075 when,
1076 r#where,
1077 what,
1078 policy: None,
1079 sequence,
1080 correlation_id,
1081 config_version,
1082 scopes_triggered: Vec::new(),
1083 lead_scope: None,
1084 effective_protective_action: None,
1085 cold_start_reason: ColdStartReason::NotApplicable,
1086 hot_loop_reason: HotLoopReason::NotApplicable,
1087 throttle_gate_owner: ThrottleGateOwner::None,
1088 task_role: None,
1089 used_fallback_default: false,
1090 effective_policy_source: None,
1091 }
1092 }
1093
1094 /// Attaches a policy decision to an event.
1095 ///
1096 /// # Arguments
1097 ///
1098 /// - `policy`: Policy decision produced for this lifecycle fact.
1099 ///
1100 /// # Returns
1101 ///
1102 /// Returns the updated [`SupervisorEvent`].
1103 pub fn with_policy(mut self, policy: PolicyDecision) -> Self {
1104 self.policy = Some(policy);
1105 self
1106 }
1107}