use crate::types::AgentRole;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use uuid::Uuid;
/// Tunable thresholds and intervals that drive agent health evaluation.
///
/// All `*_secs` fields are expressed in whole seconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckConfig {
/// Expected spacing between agent heartbeats.
/// NOTE(review): not read anywhere in this file — presumably consumed by
/// whoever schedules heartbeats; confirm.
pub heartbeat_interval_secs: u64,
/// Age at which the last recorded heartbeat is treated as stale by
/// `HealthChecker::check_all`.
pub heartbeat_timeout_secs: u64,
/// Window used by `HealthChecker::check_all`: a failing probe is reported as a
/// `ProbeFailure` event only if it was last checked within this many seconds.
pub liveness_check_interval_secs: u64,
/// Number of consecutive liveness failures at which an agent is declared dead.
pub max_consecutive_failures: u32,
/// Master switch for `HealthChecker::should_restart`; when `false` it always
/// answers `false`.
pub auto_restart_enabled: bool,
/// Delay before an automatic restart.
/// NOTE(review): not read anywhere in this file — presumably honoured by the
/// component that performs restarts; confirm.
pub auto_restart_delay_secs: u64,
/// Restart budget: `HealthChecker::should_restart` refuses once an agent's
/// `restart_count` reaches this value.
pub max_auto_restarts: u32,
}
impl Default for HealthCheckConfig {
    /// Returns the stock configuration: 30 s heartbeat interval, 90 s heartbeat
    /// timeout, 15 s liveness interval, 3 tolerated consecutive failures, and
    /// automatic restarts enabled with a 5 s delay and a budget of 5 restarts.
    fn default() -> Self {
        // Timing knobs, all in seconds.
        let heartbeat_interval_secs = 30;
        let heartbeat_timeout_secs = 90;
        let liveness_check_interval_secs = 15;
        let auto_restart_delay_secs = 5;

        // Failure and restart budgets.
        let max_consecutive_failures = 3;
        let max_auto_restarts = 5;

        Self {
            heartbeat_interval_secs,
            heartbeat_timeout_secs,
            liveness_check_interval_secs,
            max_consecutive_failures,
            auto_restart_enabled: true,
            auto_restart_delay_secs,
            max_auto_restarts,
        }
    }
}
/// Overall health classification of a single agent.
///
/// Serialized as an internally tagged object: the variant name (snake_case) is
/// stored under the `status` key, next to `reason` for variants that carry one.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum HealthStatus {
/// No failure signal is currently present for the agent.
Healthy,
/// The agent is impaired; set when a liveness or readiness probe is failing
/// while the heartbeat is not stale.
Degraded {
/// Human-readable explanation of the degradation.
reason: String,
},
/// The agent's heartbeat has gone stale.
Unhealthy {
/// Human-readable explanation of why the agent is unhealthy.
reason: String,
},
/// The liveness probe has failed at least `max_consecutive_failures` times
/// in a row.
Dead {
/// Human-readable explanation of why the agent is considered dead.
reason: String,
},
/// No verdict yet; this is the status right after registration and after a
/// recorded restart.
Unknown,
}
impl std::fmt::Display for HealthStatus {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
HealthStatus::Healthy => write!(f, "healthy"),
HealthStatus::Degraded { reason } => write!(f, "degraded: {reason}"),
HealthStatus::Unhealthy { reason } => write!(f, "unhealthy: {reason}"),
HealthStatus::Dead { reason } => write!(f, "dead: {reason}"),
HealthStatus::Unknown => write!(f, "unknown"),
}
}
}
/// Kind of health probe tracked per agent. Serialized in snake_case.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProbeType {
/// Updated by `record_liveness_success` / `record_liveness_failure`; its
/// consecutive-failure count decides the `Dead` status.
Liveness,
/// Updated by `record_readiness`.
Readiness,
/// Updated by `record_heartbeat`, and failed by `check_all` when the
/// heartbeat is stale.
Heartbeat,
}
/// Running statistics for one probe of one agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthProbe {
/// Display name of the probe (e.g. `"liveness"`).
pub name: String,
/// Which kind of probe this record tracks.
pub probe_type: ProbeType,
/// Time of the most recent recorded result (success or failure), if any.
pub last_check: Option<DateTime<Utc>>,
/// Time of the most recent recorded success, if any.
pub last_success: Option<DateTime<Utc>>,
/// Failures recorded since the last success; reset to zero by a success.
pub consecutive_failures: u32,
/// Total number of results recorded (successes plus failures).
pub total_checks: u64,
/// Total number of failures recorded.
pub total_failures: u64,
}
impl HealthProbe {
    /// Creates a probe record with no history: no timestamps and all counters
    /// at zero.
    fn new(name: impl Into<String>, probe_type: ProbeType) -> Self {
        let name = name.into();
        Self {
            name,
            probe_type,
            last_check: None,
            last_success: None,
            consecutive_failures: 0,
            total_checks: 0,
            total_failures: 0,
        }
    }

    /// Records a passing result: stamps both `last_check` and `last_success`
    /// with the current time, clears the consecutive-failure streak and bumps
    /// the total check count.
    fn record_success(&mut self) {
        // One timestamp shared by both fields so they always agree.
        let stamp = Some(Utc::now());
        self.last_check = stamp;
        self.last_success = stamp;
        self.consecutive_failures = 0;
        self.total_checks += 1;
    }

    /// Records a failing result: stamps `last_check` with the current time and
    /// bumps the streak, the total check count and the total failure count.
    /// `last_success` is left untouched.
    fn record_failure(&mut self) {
        let stamp = Utc::now();
        self.last_check = Some(stamp);
        self.consecutive_failures += 1;
        self.total_checks += 1;
        self.total_failures += 1;
    }
}
/// Everything the checker tracks about a single registered agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgentHealthState {
/// Identifier the agent was registered under.
pub agent_id: Uuid,
/// Human-readable agent name supplied at registration.
pub agent_name: String,
/// Role supplied at registration.
pub role: AgentRole,
/// Current health verdict.
pub status: HealthStatus,
/// Per-probe statistics; created with one liveness, one readiness and one
/// heartbeat probe.
pub probes: Vec<HealthProbe>,
/// Time of the most recent heartbeat recorded via
/// `HealthChecker::record_heartbeat`, if any.
pub last_heartbeat: Option<DateTime<Utc>>,
/// Number of restarts recorded via `HealthChecker::record_restart`.
pub restart_count: u32,
/// Seconds elapsed since `started_at`, as of the last `refresh_uptime` call.
pub uptime_secs: u64,
/// Start of the current run: set at registration and reset when a restart
/// is recorded.
pub started_at: DateTime<Utc>,
}
impl AgentHealthState {
    /// Builds the initial state for a newly registered agent: status
    /// `Unknown`, no heartbeat, zero restarts, the uptime clock starting now,
    /// and one fresh probe of each kind (liveness, readiness, heartbeat).
    fn new(agent_id: Uuid, name: String, role: AgentRole) -> Self {
        let started_at = Utc::now();
        let probes = vec![
            HealthProbe::new("liveness", ProbeType::Liveness),
            HealthProbe::new("readiness", ProbeType::Readiness),
            HealthProbe::new("heartbeat", ProbeType::Heartbeat),
        ];
        Self {
            agent_id,
            agent_name: name,
            role,
            status: HealthStatus::Unknown,
            probes,
            last_heartbeat: None,
            restart_count: 0,
            uptime_secs: 0,
            started_at,
        }
    }

    /// Recomputes `uptime_secs` as the whole seconds elapsed since
    /// `started_at`, clamping to zero if `started_at` lies in the future.
    fn refresh_uptime(&mut self) {
        let seconds = Utc::now()
            .signed_duration_since(self.started_at)
            .num_seconds();
        // A negative span (clock moved backwards) fails the conversion and is
        // reported as zero uptime.
        self.uptime_secs = u64::try_from(seconds).unwrap_or(0);
    }

    /// Returns a mutable handle to the first probe of the given kind, if any.
    fn probe_mut(&mut self, pt: &ProbeType) -> Option<&mut HealthProbe> {
        self.probes
            .iter_mut()
            .find(|candidate| &candidate.probe_type == pt)
    }
}
/// Notification describing a health-related occurrence for an agent.
///
/// Serialized as an internally tagged object: the variant name (snake_case) is
/// stored under the `event` key alongside the variant's fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "event", rename_all = "snake_case")]
pub enum HealthEvent {
/// Emitted by `check_all` when an agent's status changes to `Healthy`.
AgentBecameHealthy {
agent_id: Uuid,
agent_name: String,
},
/// Emitted by `check_all` when an agent's status changes to `Degraded`.
AgentBecameDegraded {
agent_id: Uuid,
agent_name: String,
/// Reason carried by the new status.
reason: String,
},
/// Emitted by `check_all` when an agent's status changes to `Unhealthy`.
AgentBecameUnhealthy {
agent_id: Uuid,
agent_name: String,
/// Reason carried by the new status.
reason: String,
},
/// Emitted by `check_all` when an agent's status changes to `Dead`.
AgentDied {
agent_id: Uuid,
agent_name: String,
/// Reason carried by the new status.
reason: String,
},
/// An agent was restarted.
/// NOTE(review): never constructed in this file — presumably emitted by the
/// component that performs restarts; confirm.
AgentRestarted {
agent_id: Uuid,
agent_name: String,
/// Restart count after the restart (assumed; verify against the emitter).
restart_count: u32,
},
/// Emitted by `check_all` whenever it finds an agent's last heartbeat at
/// least `heartbeat_timeout_secs` old.
HeartbeatMissed {
agent_id: Uuid,
agent_name: String,
/// Whole seconds between the last heartbeat and the check.
last_seen_secs_ago: u64,
},
/// Emitted by `check_all` for a probe that is currently failing and was
/// checked recently.
ProbeFailure {
agent_id: Uuid,
/// Name of the failing probe.
probe_name: String,
/// Description of the failure; `check_all` fills in the streak length.
error: String,
},
}
/// Fleet-wide snapshot produced by `HealthChecker::get_summary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthSummary {
/// Number of registered agents.
pub total_agents: usize,
/// Agents whose status is `Healthy`.
pub healthy: usize,
/// Agents whose status is `Degraded`.
pub degraded: usize,
/// Agents whose status is `Unhealthy`.
pub unhealthy: usize,
/// Agents whose status is `Dead`.
pub dead: usize,
/// Agents whose status is `Unknown`.
pub unknown: usize,
/// Sum of `restart_count` over all agents.
pub total_restarts: u32,
/// Events returned by the `check_all` pass that `get_summary` runs first.
pub events: Vec<HealthEvent>,
/// Time at which the summary was assembled.
pub checked_at: DateTime<Utc>,
}
/// Tracks the health of registered agents and derives status transitions and
/// events from recorded heartbeats and probe results.
pub struct HealthChecker {
// Thresholds used when evaluating agents.
config: HealthCheckConfig,
// Per-agent state keyed by agent id, behind an async read/write lock.
agents: Arc<RwLock<HashMap<Uuid, AgentHealthState>>>,
}
impl HealthChecker {
/// Creates a checker with the given configuration and no registered agents.
pub fn new(config: HealthCheckConfig) -> Self {
    let registry = HashMap::new();
    let agents = Arc::new(RwLock::new(registry));
    Self { config, agents }
}
/// Starts tracking an agent with a fresh state (status `Unknown`).
///
/// Registering an id that is already present replaces its existing state.
pub async fn register_agent(&self, agent_id: Uuid, name: String, role: AgentRole) {
    // Build the state before taking the lock so the lock is held only for
    // the insertion itself.
    let fresh = AgentHealthState::new(agent_id, name, role);
    self.agents.write().await.insert(agent_id, fresh);
}
/// Stops tracking an agent. Unknown ids are ignored.
pub async fn unregister_agent(&self, agent_id: Uuid) {
    self.agents.write().await.remove(&agent_id);
}
/// Records a heartbeat from an agent: stamps `last_heartbeat` with the
/// current time and marks the heartbeat probe as succeeded.
///
/// # Errors
/// Returns an error string when `agent_id` is not registered.
pub async fn record_heartbeat(&self, agent_id: Uuid) -> Result<(), String> {
    let mut registry = self.agents.write().await;
    let Some(entry) = registry.get_mut(&agent_id) else {
        return Err(format!("agent {agent_id} not registered"));
    };
    entry.last_heartbeat = Some(Utc::now());
    if let Some(heartbeat_probe) = entry.probe_mut(&ProbeType::Heartbeat) {
        heartbeat_probe.record_success();
    }
    Ok(())
}
/// Records a passing liveness probe for an agent, which clears the probe's
/// consecutive-failure streak. The agent's status itself is not changed here.
///
/// # Errors
/// Returns an error string when `agent_id` is not registered.
pub async fn record_liveness_success(&self, agent_id: Uuid) -> Result<(), String> {
    let mut registry = self.agents.write().await;
    let Some(entry) = registry.get_mut(&agent_id) else {
        return Err(format!("agent {agent_id} not registered"));
    };
    if let Some(liveness_probe) = entry.probe_mut(&ProbeType::Liveness) {
        liveness_probe.record_success();
    }
    Ok(())
}
/// Records a failing liveness probe for an agent.
///
/// Besides updating the probe counters, an agent that is currently `Healthy`
/// or `Unknown` is moved to `Degraded` with `error` as the reason. Agents in
/// any other status keep it.
///
/// # Errors
/// Returns an error string when `agent_id` is not registered.
pub async fn record_liveness_failure(&self, agent_id: Uuid, error: &str) -> Result<(), String> {
    let mut registry = self.agents.write().await;
    let Some(entry) = registry.get_mut(&agent_id) else {
        return Err(format!("agent {agent_id} not registered"));
    };
    if let Some(liveness_probe) = entry.probe_mut(&ProbeType::Liveness) {
        liveness_probe.record_failure();
    }
    // Only downgrade from the "no problem known" states; never overwrite a
    // worse verdict that is already in place.
    if matches!(entry.status, HealthStatus::Healthy | HealthStatus::Unknown) {
        entry.status = HealthStatus::Degraded {
            reason: error.to_string(),
        };
    }
    Ok(())
}
/// Records the outcome of a readiness probe: a success when `ready` is
/// `true`, a failure otherwise. The agent's status is not changed here.
///
/// # Errors
/// Returns an error string when `agent_id` is not registered.
pub async fn record_readiness(&self, agent_id: Uuid, ready: bool) -> Result<(), String> {
    let mut registry = self.agents.write().await;
    let Some(entry) = registry.get_mut(&agent_id) else {
        return Err(format!("agent {agent_id} not registered"));
    };
    match entry.probe_mut(&ProbeType::Readiness) {
        Some(readiness_probe) if ready => readiness_probe.record_success(),
        Some(readiness_probe) => readiness_probe.record_failure(),
        None => {}
    }
    Ok(())
}
/// Records that an agent has been restarted and resets its health tracking.
///
/// Increments `restart_count`, restarts the uptime clock, forgets the previous
/// heartbeat, clears every probe's consecutive-failure streak and timestamps
/// (lifetime totals are kept) and sets the status back to `Unknown`.
///
/// # Errors
/// Returns an error string when `agent_id` is not registered.
pub async fn record_restart(&self, agent_id: Uuid) -> Result<(), String> {
    let mut agents = self.agents.write().await;
    let state = agents
        .get_mut(&agent_id)
        .ok_or_else(|| format!("agent {agent_id} not registered"))?;
    state.restart_count += 1;
    state.started_at = Utc::now();
    state.uptime_secs = 0;
    // The last heartbeat belongs to the previous incarnation. Leaving it in
    // place would make the next `check_all` see a stale heartbeat and flag the
    // freshly restarted agent as unhealthy before it has had a chance to
    // report in, which in turn would make `should_restart` ask for yet
    // another restart.
    state.last_heartbeat = None;
    for probe in &mut state.probes {
        probe.consecutive_failures = 0;
        probe.last_check = None;
        probe.last_success = None;
    }
    state.status = HealthStatus::Unknown;
    Ok(())
}
/// Returns a snapshot of one agent's health state, or `None` when the agent
/// is not registered.
///
/// `uptime_secs` in the snapshot is computed at call time. Only the shared
/// read lock is taken, so concurrent queries do not serialize behind each
/// other; the stored state is not modified.
pub async fn get_health(&self, agent_id: Uuid) -> Option<AgentHealthState> {
    // The read guard is a temporary and is released at the end of this
    // statement, before the snapshot is post-processed.
    let mut snapshot = self.agents.read().await.get(&agent_id).cloned()?;
    snapshot.refresh_uptime();
    Some(snapshot)
}
/// Returns a snapshot of every registered agent's health state, in no
/// particular order.
///
/// `uptime_secs` in each snapshot is computed at call time. Only the shared
/// read lock is taken, so concurrent queries do not serialize behind each
/// other; the stored state is not modified.
pub async fn get_all_health(&self) -> Vec<AgentHealthState> {
    // The read guard is a temporary and is released at the end of this
    // statement, before the snapshots are post-processed.
    let mut snapshots: Vec<AgentHealthState> =
        self.agents.read().await.values().cloned().collect();
    for snapshot in &mut snapshots {
        snapshot.refresh_uptime();
    }
    snapshots
}
pub async fn check_all(&self) -> Vec<HealthEvent> {
let mut events = Vec::new();
let now = Utc::now();
let mut agents = self.agents.write().await;
for state in agents.values_mut() {
state.refresh_uptime();
let previous_status = state.status.clone();
let heartbeat_stale = if let Some(last) = state.last_heartbeat {
let elapsed = now.signed_duration_since(last).num_seconds().max(0) as u64;
if elapsed >= self.config.heartbeat_timeout_secs {
if let Some(probe) = state.probe_mut(&ProbeType::Heartbeat) {
probe.record_failure();
}
events.push(HealthEvent::HeartbeatMissed {
agent_id: state.agent_id,
agent_name: state.agent_name.clone(),
last_seen_secs_ago: elapsed,
});
true
} else {
false
}
} else {
false
};
let liveness_failures = state
.probes
.iter()
.find(|p| p.probe_type == ProbeType::Liveness)
.map_or(0, |p| p.consecutive_failures);
let new_status = if liveness_failures >= self.config.max_consecutive_failures {
HealthStatus::Dead {
reason: format!("liveness probe failed {liveness_failures} consecutive times"),
}
} else if heartbeat_stale && liveness_failures > 0 {
HealthStatus::Unhealthy {
reason: "heartbeat stale and liveness probe failing".to_string(),
}
} else if heartbeat_stale {
HealthStatus::Unhealthy {
reason: "heartbeat timeout exceeded".to_string(),
}
} else if liveness_failures > 0 {
HealthStatus::Degraded {
reason: format!("liveness probe failed {liveness_failures} time(s)"),
}
} else {
let readiness_failures = state
.probes
.iter()
.find(|p| p.probe_type == ProbeType::Readiness)
.map_or(0, |p| p.consecutive_failures);
if readiness_failures > 0 {
HealthStatus::Degraded {
reason: format!("readiness probe failed {readiness_failures} time(s)"),
}
} else if state.last_heartbeat.is_some() || previous_status == HealthStatus::Healthy
{
HealthStatus::Healthy
} else {
previous_status.clone()
}
};
state.status = new_status.clone();
if new_status != previous_status {
match &new_status {
HealthStatus::Healthy => {
events.push(HealthEvent::AgentBecameHealthy {
agent_id: state.agent_id,
agent_name: state.agent_name.clone(),
});
}
HealthStatus::Degraded { reason } => {
events.push(HealthEvent::AgentBecameDegraded {
agent_id: state.agent_id,
agent_name: state.agent_name.clone(),
reason: reason.clone(),
});
}
HealthStatus::Unhealthy { reason } => {
events.push(HealthEvent::AgentBecameUnhealthy {
agent_id: state.agent_id,
agent_name: state.agent_name.clone(),
reason: reason.clone(),
});
}
HealthStatus::Dead { reason } => {
events.push(HealthEvent::AgentDied {
agent_id: state.agent_id,
agent_name: state.agent_name.clone(),
reason: reason.clone(),
});
}
HealthStatus::Unknown => {}
}
}
for probe in &state.probes {
if probe.consecutive_failures > 0 {
if let Some(last_check) = probe.last_check {
let since = now.signed_duration_since(last_check).num_seconds().abs();
if since < self.config.liveness_check_interval_secs as i64 {
events.push(HealthEvent::ProbeFailure {
agent_id: state.agent_id,
probe_name: probe.name.clone(),
error: format!(
"{} consecutive failures",
probe.consecutive_failures
),
});
}
}
}
}
}
events
}
/// Returns snapshots of all agents whose status is `Degraded`, `Unhealthy`
/// or `Dead`, in no particular order.
///
/// Statuses are read as currently stored; no re-evaluation is performed.
/// `uptime_secs` in each snapshot is computed at call time. Only the shared
/// read lock is taken, so concurrent queries do not serialize behind each
/// other; the stored state is not modified.
pub async fn get_unhealthy(&self) -> Vec<AgentHealthState> {
    // The read guard is a temporary and is released at the end of this
    // statement, before the snapshots are post-processed.
    let mut snapshots: Vec<AgentHealthState> = self
        .agents
        .read()
        .await
        .values()
        .filter(|s| {
            matches!(
                s.status,
                HealthStatus::Degraded { .. }
                    | HealthStatus::Unhealthy { .. }
                    | HealthStatus::Dead { .. }
            )
        })
        .cloned()
        .collect();
    for snapshot in &mut snapshots {
        snapshot.refresh_uptime();
    }
    snapshots
}
/// Runs a full `check_all` pass and then tallies the resulting statuses.
///
/// The returned summary carries the events from that pass, per-status agent
/// counts, the sum of all restart counts and the time of assembly.
///
/// Note: `check_all` releases its write lock before the read lock for the
/// tally is taken, so a concurrent update in between can make the counts
/// differ slightly from what the events describe.
pub async fn get_summary(&self) -> HealthSummary {
    let events = self.check_all().await;
    let registry = self.agents.read().await;
    let checked_at = Utc::now();

    let (mut healthy, mut degraded, mut unhealthy, mut dead, mut unknown) =
        (0usize, 0usize, 0usize, 0usize, 0usize);
    let mut total_restarts = 0u32;
    for entry in registry.values() {
        // Pick the counter that matches the agent's status, then bump it.
        let bucket = match entry.status {
            HealthStatus::Healthy => &mut healthy,
            HealthStatus::Degraded { .. } => &mut degraded,
            HealthStatus::Unhealthy { .. } => &mut unhealthy,
            HealthStatus::Dead { .. } => &mut dead,
            HealthStatus::Unknown => &mut unknown,
        };
        *bucket += 1;
        total_restarts += entry.restart_count;
    }

    HealthSummary {
        total_agents: registry.len(),
        healthy,
        degraded,
        unhealthy,
        dead,
        unknown,
        total_restarts,
        events,
        checked_at,
    }
}
/// Decides whether an agent qualifies for an automatic restart.
///
/// Returns `true` only when auto-restart is enabled, the agent is registered,
/// its `restart_count` is still below `max_auto_restarts`, and its stored
/// status is `Unhealthy` or `Dead`.
pub async fn should_restart(&self, agent_id: Uuid) -> bool {
    if !self.config.auto_restart_enabled {
        return false;
    }
    let registry = self.agents.read().await;
    match registry.get(&agent_id) {
        Some(entry) if entry.restart_count < self.config.max_auto_restarts => matches!(
            entry.status,
            HealthStatus::Unhealthy { .. } | HealthStatus::Dead { .. }
        ),
        // Unknown agent, or restart budget exhausted.
        _ => false,
    }
}
}
// Unit tests for `HealthChecker`: registration, probe bookkeeping, status
// transitions produced by `check_all`, restart decisions and serde round-trips.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
use super::*;
// Checker built from `HealthCheckConfig::default()`.
fn default_checker() -> HealthChecker {
HealthChecker::new(HealthCheckConfig::default())
}
// A freshly registered agent is retrievable and starts as `Unknown` with zero restarts.
#[tokio::test]
async fn test_register_and_get_health() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-1".into(), AgentRole::Coder)
.await;
let health = checker.get_health(id).await.unwrap();
assert_eq!(health.agent_id, id);
assert_eq!(health.agent_name, "agent-1");
assert_eq!(health.role, AgentRole::Coder);
assert_eq!(health.status, HealthStatus::Unknown);
assert_eq!(health.restart_count, 0);
}
// `last_heartbeat` is empty until the first heartbeat is recorded.
#[tokio::test]
async fn test_heartbeat_updates_timestamp() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-hb".into(), AgentRole::Tester)
.await;
assert!(checker
.get_health(id)
.await
.unwrap()
.last_heartbeat
.is_none());
checker.record_heartbeat(id).await.unwrap();
let health = checker.get_health(id).await.unwrap();
assert!(health.last_heartbeat.is_some());
}
// With a zero timeout any recorded heartbeat is immediately stale, so
// `check_all` must report `HeartbeatMissed` for the agent.
#[tokio::test]
async fn test_missed_heartbeat_generates_event() {
let checker = HealthChecker::new(HealthCheckConfig {
heartbeat_timeout_secs: 0, ..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-miss".into(), AgentRole::Spec)
.await;
checker.record_heartbeat(id).await.unwrap();
let events = checker.check_all().await;
let missed = events
.iter()
.any(|e| matches!(e, HealthEvent::HeartbeatMissed { agent_id, .. } if *agent_id == id));
assert!(missed, "expected HeartbeatMissed event");
}
// One liveness failure bumps both the streak and the lifetime failure count.
#[tokio::test]
async fn test_liveness_failure_increments_counter() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-lf".into(), AgentRole::Coder)
.await;
checker
.record_liveness_failure(id, "timeout")
.await
.unwrap();
let health = checker.get_health(id).await.unwrap();
let liveness = health
.probes
.iter()
.find(|p| p.probe_type == ProbeType::Liveness)
.unwrap();
assert_eq!(liveness.consecutive_failures, 1);
assert_eq!(liveness.total_failures, 1);
}
// Stale heartbeat plus liveness failures below the dead threshold yields `Unhealthy`.
#[tokio::test]
async fn test_consecutive_failures_transition_to_unhealthy() {
let checker = HealthChecker::new(HealthCheckConfig {
heartbeat_timeout_secs: 0,
max_consecutive_failures: 3,
..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-uh".into(), AgentRole::Reviewer)
.await;
checker.record_heartbeat(id).await.unwrap();
checker.record_liveness_failure(id, "err1").await.unwrap();
checker.record_liveness_failure(id, "err2").await.unwrap();
let events = checker.check_all().await;
let became_unhealthy = events.iter().any(
|e| matches!(e, HealthEvent::AgentBecameUnhealthy { agent_id, .. } if *agent_id == id),
);
assert!(
became_unhealthy,
"expected AgentBecameUnhealthy (heartbeat stale + liveness failures)"
);
}
// Reaching `max_consecutive_failures` liveness failures marks the agent `Dead`.
#[tokio::test]
async fn test_max_failures_transition_to_dead() {
let checker = HealthChecker::new(HealthCheckConfig {
max_consecutive_failures: 2,
..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-dead".into(), AgentRole::DevOps)
.await;
for _ in 0..2 {
checker.record_liveness_failure(id, "crash").await.unwrap();
}
let events = checker.check_all().await;
let died = events
.iter()
.any(|e| matches!(e, HealthEvent::AgentDied { agent_id, .. } if *agent_id == id));
assert!(died, "expected AgentDied event");
let health = checker.get_health(id).await.unwrap();
assert!(matches!(health.status, HealthStatus::Dead { .. }));
}
// Each recorded restart increments the counter and resets the status to `Unknown`.
#[tokio::test]
async fn test_record_restart_increments_counter() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-rs".into(), AgentRole::Coder)
.await;
checker.record_restart(id).await.unwrap();
let health = checker.get_health(id).await.unwrap();
assert_eq!(health.restart_count, 1);
assert_eq!(health.status, HealthStatus::Unknown);
checker.record_restart(id).await.unwrap();
let health = checker.get_health(id).await.unwrap();
assert_eq!(health.restart_count, 2);
}
// An `Unhealthy` agent with restart budget left qualifies for a restart.
// The status is forced directly through the internal map.
#[tokio::test]
async fn test_should_restart_true() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-sr".into(), AgentRole::Tester)
.await;
{
let mut agents = checker.agents.write().await;
let state = agents.get_mut(&id).unwrap();
state.status = HealthStatus::Unhealthy {
reason: "test".into(),
};
}
assert!(checker.should_restart(id).await);
}
// Once `restart_count` reaches `max_auto_restarts`, no further restart is requested.
#[tokio::test]
async fn test_should_restart_false_max_exceeded() {
let checker = HealthChecker::new(HealthCheckConfig {
max_auto_restarts: 2,
..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "agent-nsr".into(), AgentRole::Coder)
.await;
checker.record_restart(id).await.unwrap();
checker.record_restart(id).await.unwrap();
{
let mut agents = checker.agents.write().await;
let state = agents.get_mut(&id).unwrap();
state.status = HealthStatus::Unhealthy {
reason: "stuck".into(),
};
}
assert!(!checker.should_restart(id).await);
}
// `get_unhealthy` returns only the agent with a failing liveness probe.
#[tokio::test]
async fn test_get_unhealthy_filters() {
let checker = default_checker();
let healthy_id = Uuid::new_v4();
let sick_id = Uuid::new_v4();
checker
.register_agent(healthy_id, "healthy".into(), AgentRole::Coder)
.await;
checker
.register_agent(sick_id, "sick".into(), AgentRole::Tester)
.await;
checker.record_heartbeat(healthy_id).await.unwrap();
checker.record_liveness_success(healthy_id).await.unwrap();
checker
.record_liveness_failure(sick_id, "down")
.await
.unwrap();
checker.check_all().await;
let unhealthy = checker.get_unhealthy().await;
assert_eq!(unhealthy.len(), 1);
assert_eq!(unhealthy[0].agent_id, sick_id);
}
// The summary reports the number of agents and the total restart count.
#[tokio::test]
async fn test_summary_aggregates_counts() {
let checker = default_checker();
let id1 = Uuid::new_v4();
let id2 = Uuid::new_v4();
checker
.register_agent(id1, "a1".into(), AgentRole::Coder)
.await;
checker
.register_agent(id2, "a2".into(), AgentRole::Tester)
.await;
checker.record_heartbeat(id1).await.unwrap();
checker.record_liveness_success(id1).await.unwrap();
checker.record_restart(id2).await.unwrap();
let summary = checker.get_summary().await;
assert_eq!(summary.total_agents, 2);
assert_eq!(summary.total_restarts, 1);
}
// With a zero timeout, `check_all` emits at least one `HeartbeatMissed` event.
#[tokio::test]
async fn test_check_all_detects_stale_heartbeats() {
let checker = HealthChecker::new(HealthCheckConfig {
heartbeat_timeout_secs: 0,
..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "stale".into(), AgentRole::Architect)
.await;
checker.record_heartbeat(id).await.unwrap();
let events = checker.check_all().await;
let has_missed = events
.iter()
.any(|e| matches!(e, HealthEvent::HeartbeatMissed { .. }));
assert!(has_missed);
}
// The `HeartbeatMissed` event carries the agent's name and a small elapsed time.
#[tokio::test]
async fn test_check_all_heartbeat_missed_event_fields() {
let checker = HealthChecker::new(HealthCheckConfig {
heartbeat_timeout_secs: 0,
..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "hb-miss".into(), AgentRole::Spec)
.await;
checker.record_heartbeat(id).await.unwrap();
let events = checker.check_all().await;
let missed_event = events.iter().find(
|e| matches!(e, HealthEvent::HeartbeatMissed { agent_id, .. } if *agent_id == id),
);
assert!(missed_event.is_some());
if let Some(HealthEvent::HeartbeatMissed {
agent_name,
last_seen_secs_ago,
..
}) = missed_event
{
assert_eq!(agent_name, "hb-miss");
assert!(*last_seen_secs_ago <= 5);
}
}
// A stale heartbeat alone (no probe failures) is enough to become `Unhealthy`.
#[tokio::test]
async fn test_check_all_agent_became_unhealthy_event() {
let checker = HealthChecker::new(HealthCheckConfig {
heartbeat_timeout_secs: 0,
max_consecutive_failures: 5, ..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "will-degrade".into(), AgentRole::SecurityAuditor)
.await;
checker.record_heartbeat(id).await.unwrap();
let events = checker.check_all().await;
let became_unhealthy = events.iter().any(
|e| matches!(e, HealthEvent::AgentBecameUnhealthy { agent_id, .. } if *agent_id == id),
);
assert!(
became_unhealthy,
"expected AgentBecameUnhealthy due to stale heartbeat"
);
}
// Readiness results are counted: one success, then one failure.
#[tokio::test]
async fn test_readiness_probe_tracking() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "ready-test".into(), AgentRole::DocumentWriter)
.await;
checker.record_readiness(id, true).await.unwrap();
let health = checker.get_health(id).await.unwrap();
let readiness = health
.probes
.iter()
.find(|p| p.probe_type == ProbeType::Readiness)
.unwrap();
assert_eq!(readiness.consecutive_failures, 0);
assert_eq!(readiness.total_checks, 1);
checker.record_readiness(id, false).await.unwrap();
let health = checker.get_health(id).await.unwrap();
let readiness = health
.probes
.iter()
.find(|p| p.probe_type == ProbeType::Readiness)
.unwrap();
assert_eq!(readiness.consecutive_failures, 1);
assert_eq!(readiness.total_failures, 1);
assert_eq!(readiness.total_checks, 2);
}
// After unregistering, the agent can no longer be looked up.
#[tokio::test]
async fn test_unregister_removes_agent() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "ephemeral".into(), AgentRole::Coder)
.await;
assert!(checker.get_health(id).await.is_some());
checker.unregister_agent(id).await;
assert!(checker.get_health(id).await.is_none());
}
// Pins the default configuration values.
#[test]
fn test_default_config_values() {
let cfg = HealthCheckConfig::default();
assert_eq!(cfg.heartbeat_interval_secs, 30);
assert_eq!(cfg.heartbeat_timeout_secs, 90);
assert_eq!(cfg.liveness_check_interval_secs, 15);
assert_eq!(cfg.max_consecutive_failures, 3);
assert!(cfg.auto_restart_enabled);
assert_eq!(cfg.auto_restart_delay_secs, 5);
assert_eq!(cfg.max_auto_restarts, 5);
}
// Every `HealthStatus` variant survives a JSON round-trip unchanged.
#[test]
fn test_health_status_serde() {
let statuses = vec![
HealthStatus::Healthy,
HealthStatus::Degraded {
reason: "slow".into(),
},
HealthStatus::Unhealthy {
reason: "crash".into(),
},
HealthStatus::Dead {
reason: "oom".into(),
},
HealthStatus::Unknown,
];
for status in &statuses {
let json = serde_json::to_string(status).unwrap();
let parsed: HealthStatus = serde_json::from_str(&json).unwrap();
assert_eq!(&parsed, status);
}
}
// Recording a heartbeat for an unknown agent is an error.
#[tokio::test]
async fn test_heartbeat_unregistered_agent_error() {
let checker = default_checker();
let result = checker.record_heartbeat(Uuid::new_v4()).await;
assert!(result.is_err());
}
// With auto-restart disabled, even an `Unhealthy` agent is not restarted.
#[tokio::test]
async fn test_should_restart_disabled() {
let checker = HealthChecker::new(HealthCheckConfig {
auto_restart_enabled: false,
..HealthCheckConfig::default()
});
let id = Uuid::new_v4();
checker
.register_agent(id, "no-restart".into(), AgentRole::Coder)
.await;
{
let mut agents = checker.agents.write().await;
let state = agents.get_mut(&id).unwrap();
state.status = HealthStatus::Unhealthy {
reason: "err".into(),
};
}
assert!(!checker.should_restart(id).await);
}
// A liveness success clears the streak but keeps the lifetime totals.
#[tokio::test]
async fn test_liveness_success_resets_failures() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "recover".into(), AgentRole::Coder)
.await;
checker.record_liveness_failure(id, "blip").await.unwrap();
checker.record_liveness_success(id).await.unwrap();
let health = checker.get_health(id).await.unwrap();
let liveness = health
.probes
.iter()
.find(|p| p.probe_type == ProbeType::Liveness)
.unwrap();
assert_eq!(liveness.consecutive_failures, 0);
assert_eq!(liveness.total_checks, 2);
assert_eq!(liveness.total_failures, 1);
}
// `get_all_health` returns one snapshot per registered agent.
#[tokio::test]
async fn test_get_all_health() {
let checker = default_checker();
for i in 0..4 {
checker
.register_agent(Uuid::new_v4(), format!("a{i}"), AgentRole::Coder)
.await;
}
let all = checker.get_all_health().await;
assert_eq!(all.len(), 4);
}
// A restart clears every probe's streak and last-check timestamp.
#[tokio::test]
async fn test_restart_resets_probes() {
let checker = default_checker();
let id = Uuid::new_v4();
checker
.register_agent(id, "rst".into(), AgentRole::Tester)
.await;
checker.record_liveness_failure(id, "err").await.unwrap();
checker.record_restart(id).await.unwrap();
let health = checker.get_health(id).await.unwrap();
for probe in &health.probes {
assert_eq!(probe.consecutive_failures, 0);
assert!(probe.last_check.is_none());
}
}
}