use std::collections::HashMap;
use std::hash::Hash;
use std::sync::RwLock;
use std::time::{Duration, Instant};
/// Coarse health classification assigned to a tracked node.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum HealthStatus {
/// Operating normally; this is the status of a default `NodeHealth`.
Healthy,
/// Has accumulated failures but is still considered available.
Degraded,
/// Considered down; the only status for which `is_available` returns `false`.
Unhealthy,
/// Was unhealthy and has begun succeeding again; still considered available.
Recovering,
}
impl HealthStatus {
    /// Returns `true` if a node in this state may still be used.
    ///
    /// Every status except [`HealthStatus::Unhealthy`] counts as available.
    /// Usable in `const` contexts, like [`HealthStatus::bit_flag`].
    pub const fn is_available(self) -> bool {
        // Exhaustive on purpose: adding a variant must force an explicit
        // decision here instead of silently defaulting to "unavailable".
        match self {
            Self::Healthy | Self::Degraded | Self::Recovering => true,
            Self::Unhealthy => false,
        }
    }

    /// Returns a distinct single-bit value for this status, so several
    /// statuses can be combined into one `u8` mask with bitwise OR.
    pub const fn bit_flag(self) -> u8 {
        match self {
            Self::Healthy => 0b0001,
            Self::Degraded => 0b0010,
            Self::Unhealthy => 0b0100,
            Self::Recovering => 0b1000,
        }
    }
}
/// Health bookkeeping for a single node: current status plus outcome streaks and timestamps.
#[derive(Debug, Clone)]
pub struct NodeHealth {
/// Current classification of the node.
pub status: HealthStatus,
/// Length of the current failure streak (`ConsecutiveFailurePolicy` zeroes it on success).
pub consecutive_failures: u32,
/// Length of the current success streak (`ConsecutiveFailurePolicy` zeroes it on failure).
pub consecutive_successes: u32,
/// When the most recent failure was recorded, or `None` if none has been.
pub last_failure: Option<Instant>,
/// When the most recent success was recorded, or `None` if none has been.
pub last_success: Option<Instant>,
}
impl Default for NodeHealth {
fn default() -> Self {
Self {
status: HealthStatus::Healthy,
consecutive_failures: 0,
consecutive_successes: 0,
last_failure: None,
last_success: None,
}
}
}
/// Result of a single interaction with a node, as reported to a `HealthPolicy`.
#[derive(Debug, Clone)]
pub enum Outcome {
/// The call succeeded; the default `HealthPolicy::on_outcome` forwards it to `on_success`.
Success,
/// The call failed; the default `HealthPolicy::on_outcome` forwards it to `on_failure`.
Failure,
/// The call was rate limited; the default `HealthPolicy::on_outcome` leaves the node's health untouched.
RateLimited {
/// Optional retry delay (presumably suggested by the node -- confirm against callers).
retry_after: Option<Duration>,
},
}
/// Strategy that decides how reported outcomes change a node's [`NodeHealth`].
///
/// Implementations must be `Send + Sync`.
pub trait HealthPolicy: Send + Sync {
    /// Updates `health` after a successful call.
    fn on_success(&self, health: &mut NodeHealth);

    /// Updates `health` after a failed call.
    fn on_failure(&self, health: &mut NodeHealth);

    /// Returns whether a node in the given state should be probed now.
    fn should_probe(&self, health: &NodeHealth) -> bool;

    /// Routes an [`Outcome`] to the matching callback.
    ///
    /// The default implementation forwards `Success` and `Failure` to
    /// `on_success` / `on_failure` and deliberately does nothing for
    /// `RateLimited`, so a rate-limited call neither extends nor resets
    /// a streak.
    fn on_outcome(&self, health: &mut NodeHealth, outcome: &Outcome) {
        match outcome {
            Outcome::RateLimited { retry_after: _ } => {}
            Outcome::Failure => self.on_failure(health),
            Outcome::Success => self.on_success(health),
        }
    }
}
/// `HealthPolicy` driven by streaks of consecutive failures and successes.
///
/// Constructed through the builder derived by `bon`; fields left unset take the defaults noted below.
#[derive(bon::Builder)]
pub struct ConsecutiveFailurePolicy {
/// Consecutive failures at which a node is marked `Unhealthy` (default 3).
#[builder(default = 3)]
pub failure_threshold: u32,
/// Consecutive-success count a `Recovering` node must reach to be marked `Healthy` (default 2).
#[builder(default = 2)]
pub recovery_successes: u32,
/// Minimum time since the last failure before an `Unhealthy` node should be probed (default 10 s).
#[builder(default = Duration::from_secs(10))]
pub probe_interval: Duration,
}
impl Default for ConsecutiveFailurePolicy {
    /// Builds the policy with every field left at its builder default.
    fn default() -> Self {
        let untouched_builder = Self::builder();
        untouched_builder.build()
    }
}
impl HealthPolicy for ConsecutiveFailurePolicy {
    /// Records a success: clears the failure streak, extends the success
    /// streak, stamps `last_success`, and advances the status.
    ///
    /// Transitions:
    /// - `Unhealthy`: restarts the success streak at 1 and moves to
    ///   `Recovering`, or straight to `Healthy` when a single success
    ///   already satisfies `recovery_successes`.
    /// - `Recovering`: becomes `Healthy` once the streak reaches
    ///   `recovery_successes`.
    /// - `Degraded`: becomes `Healthy` immediately.
    /// - `Healthy`: unchanged.
    fn on_success(&self, health: &mut NodeHealth) {
        health.consecutive_failures = 0;
        // Saturate: a very long streak must not overflow (panic in debug
        // builds, wrap to 0 in release builds).
        health.consecutive_successes = health.consecutive_successes.saturating_add(1);
        health.last_success = Some(Instant::now());

        match health.status {
            HealthStatus::Recovering => {
                if health.consecutive_successes >= self.recovery_successes {
                    health.status = HealthStatus::Healthy;
                }
            }
            HealthStatus::Unhealthy => {
                health.consecutive_successes = 1;
                // Mirror `failure_threshold == 1`, which skips `Degraded`:
                // when one success is enough, do not demand a second one
                // by parking the node in `Recovering`.
                health.status = if health.consecutive_successes >= self.recovery_successes {
                    HealthStatus::Healthy
                } else {
                    HealthStatus::Recovering
                };
            }
            HealthStatus::Degraded => {
                health.status = HealthStatus::Healthy;
            }
            HealthStatus::Healthy => {}
        }
    }

    /// Records a failure: clears the success streak, extends the failure
    /// streak, stamps `last_failure`, and escalates the status.
    ///
    /// The node becomes `Unhealthy` once the streak reaches
    /// `failure_threshold`, and `Degraded` once it reaches half of the
    /// threshold (rounded up). Below that the status is left as it was.
    fn on_failure(&self, health: &mut NodeHealth) {
        health.consecutive_successes = 0;
        // Saturate for the same reason as in `on_success`.
        health.consecutive_failures = health.consecutive_failures.saturating_add(1);
        health.last_failure = Some(Instant::now());

        // NOTE(review): a failure while `Recovering` that stays below the
        // degraded mark only resets the success streak; the node remains
        // `Recovering`. Confirm this is the intended behaviour.
        let streak = health.consecutive_failures;
        if streak >= self.failure_threshold {
            health.status = HealthStatus::Unhealthy;
        } else if self.failure_threshold > 1 && streak >= self.failure_threshold.div_ceil(2) {
            health.status = HealthStatus::Degraded;
        }
    }

    /// Returns `true` only for an `Unhealthy` node whose last failure is at
    /// least `probe_interval` old, or that has no recorded failure time.
    fn should_probe(&self, health: &NodeHealth) -> bool {
        if health.status != HealthStatus::Unhealthy {
            return false;
        }
        match health.last_failure {
            Some(failed_at) => failed_at.elapsed() >= self.probe_interval,
            None => true,
        }
    }
}
/// Tracks a `NodeHealth` per node id and updates it through a boxed `HealthPolicy`.
pub struct HealthTracker<Id: Eq + Hash + Clone> {
/// Per-node state behind an `RwLock`; an entry is created on a node's first report.
states: RwLock<HashMap<Id, NodeHealth>>,
/// Policy that decides how reported outcomes change a node's state.
policy: Box<dyn HealthPolicy>,
}
impl<Id: Eq + Hash + Clone> std::fmt::Debug for HealthTracker<Id> {
    /// Formats the tracker as `HealthTracker { tracked_nodes: N, .. }`.
    ///
    /// Only the number of tracked nodes is shown; the per-node states and
    /// the policy are omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug output is typically produced while logging or while already
        // handling a failure, so it must not panic itself. A poisoned lock
        // still guards a structurally valid map, so read the count from it.
        let tracked_nodes = match self.states.read() {
            Ok(states) => states.len(),
            Err(poisoned) => poisoned.into_inner().len(),
        };
        f.debug_struct("HealthTracker")
            .field("tracked_nodes", &tracked_nodes)
            .finish_non_exhaustive()
    }
}
impl<Id: Eq + Hash + Clone> HealthTracker<Id> {
    /// Creates an empty tracker that applies `policy` to every report.
    pub fn new(policy: impl HealthPolicy + 'static) -> Self {
        Self {
            states: RwLock::new(HashMap::new()),
            policy: Box::new(policy),
        }
    }

    /// Records a successful call to node `id`, creating its entry if needed.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn report_success(&self, id: &Id) {
        self.with_node_mut(id, |health| self.policy.on_success(health));
    }

    /// Records a failed call to node `id`, creating its entry if needed.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn report_failure(&self, id: &Id) {
        self.with_node_mut(id, |health| self.policy.on_failure(health));
    }

    /// Records an arbitrary [`Outcome`] for node `id` via the policy's
    /// `on_outcome`, creating the node's entry if needed.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn report_outcome(&self, id: &Id, outcome: &Outcome) {
        self.with_node_mut(id, |health| self.policy.on_outcome(health, outcome));
    }

    /// Returns the current status of node `id`; a node that has never been
    /// reported on is `Healthy`.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn status(&self, id: &Id) -> HealthStatus {
        self.states
            .read()
            .expect("HealthTracker state lock poisoned")
            .get(id)
            .map_or(HealthStatus::Healthy, |health| health.status)
    }

    /// Returns whether the policy wants node `id` probed; always `false`
    /// for a node that has never been reported on.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn should_probe(&self, id: &Id) -> bool {
        self.states
            .read()
            .expect("HealthTracker state lock poisoned")
            .get(id)
            .is_some_and(|health| self.policy.should_probe(health))
    }

    /// Returns a snapshot of node `id`'s health, or the default state for a
    /// node that has never been reported on.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn get_health(&self, id: &Id) -> NodeHealth {
        self.states
            .read()
            .expect("HealthTracker state lock poisoned")
            .get(id)
            .cloned()
            .unwrap_or_default()
    }

    /// Runs `apply` on node `id`'s state under the write lock, inserting a
    /// default entry first when the node is not yet tracked.
    fn with_node_mut(&self, id: &Id, apply: impl FnOnce(&mut NodeHealth)) {
        let mut states = self
            .states
            .write()
            .expect("HealthTracker state lock poisoned");
        // Look the node up by reference first: `entry` needs an owned key,
        // and cloning the id on every report is wasted work (a heap
        // allocation for `String` ids) once the node is already tracked.
        match states.get_mut(id) {
            Some(health) => apply(health),
            None => apply(states.entry(id.clone()).or_default()),
        }
    }
}
// Unit tests for the policy state machine and the tracker's defaults.
#[cfg(test)]
mod tests {
use super::*;
// With a threshold of 3, the first failure leaves the node Healthy, the
// second degrades it, and the third marks it Unhealthy.
#[test]
fn healthy_to_degraded_to_unhealthy() {
let policy = ConsecutiveFailurePolicy::builder()
.failure_threshold(3)
.build();
let mut health = NodeHealth::default();
policy.on_failure(&mut health);
assert_eq!(health.status, HealthStatus::Healthy);
policy.on_failure(&mut health);
assert_eq!(health.status, HealthStatus::Degraded);
policy.on_failure(&mut health);
assert_eq!(health.status, HealthStatus::Unhealthy);
}
// An Unhealthy node moves to Recovering on its first success and back to
// Healthy once it has two consecutive successes.
#[test]
fn recovery_path() {
let policy = ConsecutiveFailurePolicy::builder()
.failure_threshold(2)
.recovery_successes(2)
.build();
let mut health = NodeHealth::default();
policy.on_failure(&mut health);
policy.on_failure(&mut health);
assert_eq!(health.status, HealthStatus::Unhealthy);
policy.on_success(&mut health);
assert_eq!(health.status, HealthStatus::Recovering);
policy.on_success(&mut health);
assert_eq!(health.status, HealthStatus::Healthy);
}
// A threshold of 1 goes straight from Healthy to Unhealthy on one failure.
#[test]
fn threshold_one_skips_degraded() {
let policy = ConsecutiveFailurePolicy::builder()
.failure_threshold(1)
.build();
let mut health = NodeHealth::default();
policy.on_failure(&mut health);
assert_eq!(health.status, HealthStatus::Unhealthy);
}
// A node the tracker has never seen is reported as Healthy.
#[test]
fn tracker_defaults_to_healthy() {
let tracker = HealthTracker::<String>::new(ConsecutiveFailurePolicy::default());
assert_eq!(tracker.status(&"unknown".to_string()), HealthStatus::Healthy);
}
// A RateLimited outcome between failures neither extends nor resets the
// failure streak.
#[test]
fn rate_limited_does_not_count_as_failure() {
let policy = ConsecutiveFailurePolicy::builder()
.failure_threshold(3)
.build();
let mut health = NodeHealth::default();
policy.on_outcome(&mut health, &Outcome::Failure);
assert_eq!(health.consecutive_failures, 1);
assert_eq!(health.status, HealthStatus::Healthy);
policy.on_outcome(&mut health, &Outcome::RateLimited { retry_after: None });
assert_eq!(health.consecutive_failures, 1);
assert_eq!(health.status, HealthStatus::Healthy);
policy.on_outcome(&mut health, &Outcome::Failure);
assert_eq!(health.status, HealthStatus::Degraded);
policy.on_outcome(&mut health, &Outcome::Failure);
assert_eq!(health.status, HealthStatus::Unhealthy);
}
// A Success outcome clears the failure streak and starts a success streak.
#[test]
fn outcome_success_resets_failures() {
let policy = ConsecutiveFailurePolicy::default();
let mut health = NodeHealth::default();
policy.on_outcome(&mut health, &Outcome::Failure);
assert_eq!(health.consecutive_failures, 1);
policy.on_outcome(&mut health, &Outcome::Success);
assert_eq!(health.consecutive_failures, 0);
assert_eq!(health.consecutive_successes, 1);
}
}