use std::collections::VecDeque;
use std::time::{Duration, Instant};
/// Maximum number of exit records kept by `RestartHistory`; the oldest record is evicted first.
const RING_BUFFER_CAP: usize = 10;
/// Look-back window used when counting recent non-zero exits to detect flapping.
const FLAP_WINDOW: Duration = Duration::from_secs(60);
/// Number of non-zero exits inside `FLAP_WINDOW` at which an exit is classified as flapping.
const FLAP_THRESHOLD: usize = 3;
/// Number of consecutive flapping classifications at which the supervisor becomes halted.
const HALT_THRESHOLD: usize = 5;
/// Restart delay requested after a transient exit.
const TRANSIENT_DELAY: Duration = Duration::from_secs(2);
/// Restart delay requested after a flapping exit.
const FLAPPING_DELAY: Duration = Duration::from_secs(30);
/// How `CrashPolicy` classifies a single process exit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExitClass {
/// The exit code was zero.
Clean,
/// Non-zero exit with fewer than `FLAP_THRESHOLD` non-zero exits inside `FLAP_WINDOW`.
Transient,
/// Non-zero exit with at least `FLAP_THRESHOLD` non-zero exits inside `FLAP_WINDOW`.
Flapping,
}
/// Health state tracked by `CrashPolicy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SupervisorState {
/// Initial state; also restored by a clean exit, by a transient exit while degraded, or by `reset`.
Healthy,
/// Entered on a flapping exit while the consecutive-flap count is still below `HALT_THRESHOLD`.
Degraded,
/// Entered once the consecutive-flap count reaches `HALT_THRESHOLD`; `CrashPolicy` answers
/// `RestartAction::Halt` while the state remains `Halted`.
Halted,
}
impl SupervisorState {
pub fn as_str(&self) -> &'static str {
match self {
SupervisorState::Healthy => "healthy",
SupervisorState::Degraded => "degraded",
SupervisorState::Halted => "halted",
}
}
}
/// What the caller should do after a process exit, as decided by `CrashPolicy`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RestartAction {
/// Returned for a clean (zero) exit when the policy is not halted.
PromptUser,
/// Returned for transient and flapping exits when the policy is not halted.
RestartAfter {
/// How long to wait before restarting (`TRANSIENT_DELAY` or `FLAPPING_DELAY`).
delay: Duration,
/// Set to `true` by both restart paths in this file; its meaning is defined by the
/// consumer of this action. NOTE(review): presumably "resume the previous session" — confirm.
with_continue: bool,
},
/// Returned whenever the policy state is `SupervisorState::Halted`.
Halt,
}
/// One observed process exit.
#[derive(Debug, Clone)]
pub struct ExitRecord {
/// Exit code of the process; zero is treated as a clean exit.
pub code: i32,
/// Monotonic time at which the exit was recorded.
pub timestamp: Instant,
}
/// Bounded FIFO of the most recent exit records (at most `RING_BUFFER_CAP` entries).
#[derive(Debug)]
pub struct RestartHistory {
// Oldest record at the front, newest at the back.
entries: VecDeque<ExitRecord>,
}
impl RestartHistory {
    /// Creates an empty history with room for `RING_BUFFER_CAP` records.
    pub fn new() -> Self {
        Self {
            entries: VecDeque::with_capacity(RING_BUFFER_CAP),
        }
    }

    /// Appends `record`, evicting the oldest entry first when `RING_BUFFER_CAP` records
    /// are already stored.
    pub fn push(&mut self, record: ExitRecord) {
        if self.entries.len() >= RING_BUFFER_CAP {
            self.entries.pop_front();
        }
        self.entries.push_back(record);
    }

    /// Counts retained records with a non-zero exit code whose timestamp is no more than
    /// `window` before `now`.
    ///
    /// Records stamped exactly `window` ago are included, as are records stamped after `now`.
    pub fn exits_in_window(&self, window: Duration, now: Instant) -> usize {
        // Compare ages instead of computing `now - window`: that subtraction is not always
        // representable (e.g. shortly after the monotonic clock's origin), and falling back
        // to `now` as the cutoff would wrongly exclude every earlier failure.
        self.entries
            .iter()
            .filter(|record| {
                record.code != 0 && now.saturating_duration_since(record.timestamp) <= window
            })
            .count()
    }

    /// Returns the number of records currently retained (never more than `RING_BUFFER_CAP`).
    // Not called by non-test code in this file, hence the lint allowance.
    #[allow(dead_code)]
    pub fn len(&self) -> usize {
        self.entries.len()
    }
}

impl Default for RestartHistory {
    /// Equivalent to [`RestartHistory::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Restart policy state machine: classifies each exit and decides the follow-up action.
#[derive(Debug)]
pub struct CrashPolicy {
/// Current supervisor health; starts as `SupervisorState::Healthy`.
pub state: SupervisorState,
/// Recent exits, consulted to detect flapping.
pub history: RestartHistory,
// Flapping classifications seen since the last clean exit, degraded-to-healthy recovery, or reset.
consecutive_flaps: usize,
}
impl CrashPolicy {
pub fn new() -> Self {
Self {
state: SupervisorState::Healthy,
history: RestartHistory::new(),
consecutive_flaps: 0,
}
}
pub fn on_exit(&mut self, exit_code: i32) -> RestartAction {
self.on_exit_at(exit_code, Instant::now())
}
pub fn on_exit_at(&mut self, exit_code: i32, now: Instant) -> RestartAction {
let record = ExitRecord {
code: exit_code,
timestamp: now,
};
self.history.push(record);
let class = self.classify(exit_code, now);
self.transition(class);
self.action(class)
}
fn classify(&self, exit_code: i32, now: Instant) -> ExitClass {
if exit_code == 0 {
return ExitClass::Clean;
}
if self.history.exits_in_window(FLAP_WINDOW, now) >= FLAP_THRESHOLD {
ExitClass::Flapping
} else {
ExitClass::Transient
}
}
fn transition(&mut self, class: ExitClass) {
match class {
ExitClass::Clean => {
self.state = SupervisorState::Healthy;
self.consecutive_flaps = 0;
}
ExitClass::Transient => {
if self.state == SupervisorState::Degraded {
self.state = SupervisorState::Healthy;
self.consecutive_flaps = 0;
}
}
ExitClass::Flapping => {
self.consecutive_flaps += 1;
if self.consecutive_flaps >= HALT_THRESHOLD {
self.state = SupervisorState::Halted;
} else {
self.state = SupervisorState::Degraded;
}
}
}
}
fn action(&self, class: ExitClass) -> RestartAction {
if self.state == SupervisorState::Halted {
return RestartAction::Halt;
}
match class {
ExitClass::Clean => RestartAction::PromptUser,
ExitClass::Transient => RestartAction::RestartAfter {
delay: TRANSIENT_DELAY,
with_continue: true,
},
ExitClass::Flapping => RestartAction::RestartAfter {
delay: FLAPPING_DELAY,
with_continue: true,
},
}
}
#[allow(dead_code)] pub fn reset(&mut self) {
self.state = SupervisorState::Healthy;
self.history = RestartHistory::new();
self.consecutive_flaps = 0;
}
#[allow(dead_code)] pub fn consecutive_flaps(&self) -> usize {
self.consecutive_flaps
}
}
// Unit tests for the restart history and the crash policy state machine.
// All tests drive the policy through `on_exit_at` with explicit timestamps so that
// window arithmetic is deterministic.
#[cfg(test)]
mod tests {
use super::*;
// Returns `base` shifted forward by `secs` seconds.
fn offset(base: Instant, secs: u64) -> Instant {
base + Duration::from_secs(secs)
}
// A zero exit code yields `PromptUser` and leaves the policy healthy.
#[test]
fn classify_clean_exit() {
let mut policy = CrashPolicy::new();
let action = policy.on_exit_at(0, Instant::now());
assert_eq!(action, RestartAction::PromptUser);
assert_eq!(policy.state, SupervisorState::Healthy);
}
// A lone non-zero exit is transient: short restart delay, state stays healthy.
#[test]
fn classify_single_nonzero_is_transient() {
let mut policy = CrashPolicy::new();
let action = policy.on_exit_at(1, Instant::now());
assert_eq!(
action,
RestartAction::RestartAfter {
delay: TRANSIENT_DELAY,
with_continue: true,
}
);
assert_eq!(policy.state, SupervisorState::Healthy);
}
// The third non-zero exit inside the 60 s window is flapping: long delay, degraded state.
#[test]
fn classify_three_in_60s_is_flapping() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
policy.on_exit_at(1, offset(base, 0));
policy.on_exit_at(1, offset(base, 10));
let action = policy.on_exit_at(1, offset(base, 20));
assert_eq!(
action,
RestartAction::RestartAfter {
delay: FLAPPING_DELAY,
with_continue: true,
}
);
assert_eq!(policy.state, SupervisorState::Degraded);
}
// Pushing more than `RING_BUFFER_CAP` records keeps only `RING_BUFFER_CAP` of them.
#[test]
fn ring_buffer_caps_at_10() {
let mut history = RestartHistory::new();
let base = Instant::now();
for i in 0..15 {
history.push(ExitRecord {
code: 1,
timestamp: offset(base, i),
});
}
assert_eq!(history.len(), RING_BUFFER_CAP);
}
// A failure 120 s before `now` is outside the window; one 10 s before `now` is inside.
#[test]
fn exits_in_window_excludes_old() {
let mut history = RestartHistory::new();
let base = Instant::now();
history.push(ExitRecord {
code: 1,
timestamp: base,
});
let now = offset(base, 120);
history.push(ExitRecord {
code: 1,
timestamp: offset(base, 110),
});
assert_eq!(
history.exits_in_window(FLAP_WINDOW, now),
1,
"only the recent exit should count"
);
}
// Only the third of three rapid failures is flapping, so the flap counter is 1.
#[test]
fn flapping_transitions_to_degraded() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
policy.on_exit_at(1, offset(base, 0));
policy.on_exit_at(1, offset(base, 1));
policy.on_exit_at(1, offset(base, 2));
assert_eq!(policy.state, SupervisorState::Degraded);
assert_eq!(policy.consecutive_flaps(), 1);
}
// Seven rapid failures: exits 3 through 7 are each flapping, reaching `HALT_THRESHOLD`.
#[test]
fn five_consecutive_flaps_halts() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
for i in 0..7 {
policy.on_exit_at(1, offset(base, i));
}
assert_eq!(policy.state, SupervisorState::Halted);
}
// Once halted, a further non-zero exit still yields `Halt`.
#[test]
fn halted_always_returns_halt() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
for i in 0..7 {
policy.on_exit_at(1, offset(base, i));
}
assert_eq!(policy.state, SupervisorState::Halted);
let action = policy.on_exit_at(1, offset(base, 8));
assert_eq!(action, RestartAction::Halt);
}
// A clean exit while degraded restores the healthy state and clears the flap counter.
#[test]
fn clean_exit_resets_consecutive_flaps() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
policy.on_exit_at(1, offset(base, 0));
policy.on_exit_at(1, offset(base, 1));
policy.on_exit_at(1, offset(base, 2));
assert_eq!(policy.state, SupervisorState::Degraded);
assert!(policy.consecutive_flaps() > 0);
policy.on_exit_at(0, offset(base, 3));
assert_eq!(policy.state, SupervisorState::Healthy);
assert_eq!(policy.consecutive_flaps(), 0);
}
// `reset` clears state, flap counter and history even after the policy has halted.
#[test]
fn reset_clears_everything() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
for i in 0..7 {
policy.on_exit_at(1, offset(base, i));
}
assert_eq!(policy.state, SupervisorState::Halted);
policy.reset();
assert_eq!(policy.state, SupervisorState::Healthy);
assert_eq!(policy.consecutive_flaps(), 0);
assert_eq!(policy.history.len(), 0);
}
// Pins the exact strings returned by `SupervisorState::as_str`.
#[test]
fn state_tag_strings_are_stable() {
assert_eq!(SupervisorState::Healthy.as_str(), "healthy");
assert_eq!(SupervisorState::Degraded.as_str(), "degraded");
assert_eq!(SupervisorState::Halted.as_str(), "halted");
}
// After degrading, a failure 120 s later finds the earlier failures outside the window,
// so it is transient and moves the policy back to healthy.
#[test]
fn transient_while_degraded_recovers_to_healthy() {
let mut policy = CrashPolicy::new();
let base = Instant::now();
policy.on_exit_at(1, offset(base, 0));
policy.on_exit_at(1, offset(base, 1));
policy.on_exit_at(1, offset(base, 2));
assert_eq!(policy.state, SupervisorState::Degraded);
let later = offset(base, 120);
let action = policy.on_exit_at(1, later);
assert_eq!(
action,
RestartAction::RestartAfter {
delay: TRANSIENT_DELAY,
with_continue: true,
}
);
assert_eq!(policy.state, SupervisorState::Healthy);
}
}