use serde::{Deserialize, Serialize};
use super::disk::PartitionState;
/// Number of bytes in one mebibyte (1024 × 1024).
pub const MIB: u64 = 1024 * 1024;
/// Default eviction threshold, in MiB. `DiskThresholds::default` multiplies it by `MIB`.
const EVICTION_THRESHOLD_MB: u64 = 500;
/// Default warning threshold, in MiB. `DiskThresholds::default` multiplies it by `MIB`.
const WARNING_THRESHOLD_MB: u64 = 1024;
/// Severity of a single health check, or of the fleet as a whole.
///
/// Variants are serialized in snake_case (`green`, `warning`, `critical`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum HealthLevel {
/// Least severe level; `disk_space_check` reports it when free space is above the warning threshold.
Green,
/// Intermediate level; `disk_space_check` reports it when free space is at or below the warning
/// threshold but still above the eviction threshold.
Warning,
/// Most severe level; `disk_space_check` reports it when free space is at or below the eviction
/// threshold.
Critical,
}
impl HealthLevel {
fn rank(self) -> u8 {
match self {
HealthLevel::Green => 0,
HealthLevel::Warning => 1,
HealthLevel::Critical => 2,
}
}
pub fn worst(self, other: HealthLevel) -> HealthLevel {
if other.rank() > self.rank() {
other
} else {
self
}
}
}
/// Category of a health check, serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum HealthCheckKind {
/// Free-space check on a partition (serialized as `disk_space`). Currently the only kind.
DiskSpace,
}
/// The node that a disk-space check names as the one to evict.
///
/// Built by `disk_space_check` from the value returned by `PartitionState::eviction_candidate`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
pub struct EvictionCandidate {
/// Identifier of the candidate node.
pub node_id: u32,
/// The node's data directory, converted to a string lossily (non-UTF-8 bytes are replaced).
#[schema(value_type = String)]
pub data_dir: String,
/// Size in bytes reported for the node — presumably its on-disk data size; confirm against
/// `NodeDiskUsage`.
pub size_bytes: u64,
}
/// Result of one health check.
///
/// Every `Option` field is left out of the serialized output when it is `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
pub struct HealthCheck {
/// What was checked.
pub kind: HealthCheckKind,
/// Severity of the finding.
pub level: HealthLevel,
/// Human-readable description of the finding.
pub summary: String,
/// Partition the check refers to, rendered as a string. Disk-space checks always set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub partition: Option<String>,
/// Free bytes on the partition when the check was built. Disk-space checks always set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub available_bytes: Option<u64>,
/// Free-space level (in bytes) at or below which the check is critical. Disk-space checks
/// always set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub eviction_threshold_bytes: Option<u64>,
/// Node that would be evicted. Disk-space checks set it only when the level is not green, the
/// partition has at least two nodes, and the partition reports a candidate.
#[serde(skip_serializing_if = "Option::is_none")]
pub candidate: Option<EvictionCandidate>,
}
/// Aggregated health report: the individual checks plus the worst level among them.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
pub struct FleetHealth {
/// Most severe level found in `checks`; `Green` when there are no checks.
pub overall: HealthLevel,
/// Individual check results; `from_partitions` produces one per partition, in input order.
pub checks: Vec<HealthCheck>,
}
impl FleetHealth {
    /// Returns a report with no checks and an overall level of `Green`.
    pub fn healthy() -> Self {
        Self {
            overall: HealthLevel::Green,
            checks: Vec::new(),
        }
    }

    /// Runs the disk-space check on every partition and aggregates the results.
    ///
    /// `checks` holds one entry per partition, in the same order as `partitions`.
    /// `overall` is the most severe level among them, or `Green` for an empty slice.
    pub fn from_partitions(partitions: &[PartitionState], thresholds: &DiskThresholds) -> Self {
        let mut overall = HealthLevel::Green;
        let mut checks = Vec::with_capacity(partitions.len());
        for state in partitions {
            let check = disk_space_check(state, thresholds);
            // Track the running worst level while collecting, instead of a second pass.
            overall = overall.worst(check.level);
            checks.push(check);
        }
        Self { overall, checks }
    }
}
/// Free-space limits, in bytes, that `disk_space_check` uses to classify a partition.
#[derive(Debug, Clone, Copy)]
pub struct DiskThresholds {
/// Free space at or below this many bytes is reported as `Critical`.
pub eviction_bytes: u64,
/// Free space at or below this many bytes (but above `eviction_bytes`) is reported as `Warning`.
pub warning_bytes: u64,
}
impl Default for DiskThresholds {
    /// Builds the default thresholds by converting the module's MiB constants to bytes.
    fn default() -> Self {
        let eviction_bytes = EVICTION_THRESHOLD_MB * MIB;
        let warning_bytes = WARNING_THRESHOLD_MB * MIB;
        Self {
            eviction_bytes,
            warning_bytes,
        }
    }
}
/// Builds the disk-space [`HealthCheck`] for a single partition.
///
/// The level depends only on `p.available_bytes`:
/// * at or below `thresholds.eviction_bytes` → [`HealthLevel::Critical`];
/// * otherwise at or below `thresholds.warning_bytes` → [`HealthLevel::Warning`];
/// * otherwise → [`HealthLevel::Green`].
///
/// A partition is treated as evictable only when it hosts at least two nodes. The
/// eviction candidate is named in the summary and returned in `candidate` only for
/// evictable partitions whose level is not green.
fn disk_space_check(p: &PartitionState, thresholds: &DiskThresholds) -> HealthCheck {
    let free = p.available_bytes;
    let limit = thresholds.eviction_bytes;
    let evictable = p.nodes.len() >= 2;
    // The candidate is always looked up, but it is only honoured when eviction is possible.
    let victim = p.eviction_candidate().filter(|_| evictable);

    let level = if free <= limit {
        HealthLevel::Critical
    } else if free <= thresholds.warning_bytes {
        HealthLevel::Warning
    } else {
        HealthLevel::Green
    };

    let free_txt = fmt_bytes(free);
    let limit_txt = fmt_bytes(limit);

    let summary = match level {
        HealthLevel::Critical if evictable => {
            // Fall back to a generic phrase if the partition reports no candidate.
            let who = match victim {
                Some(c) => format!("node {}", c.node_id),
                None => "a node".to_string(),
            };
            format!(
                "Disk space critical on {}: {} free (≤ {}). Evicting {} to reclaim space.",
                p.partition, free_txt, limit_txt, who,
            )
        }
        // Reached whenever fewer than two nodes are present.
        // NOTE(review): the wording assumes exactly one node; confirm a partition can never
        // be reported with zero nodes.
        HealthLevel::Critical => format!(
            "Disk space critical on {}: {} free (≤ {}), but only one node is running here \
             so it cannot be auto-evicted. Free disk space or reduce node count manually.",
            p.partition, free_txt, limit_txt,
        ),
        HealthLevel::Warning => {
            // NOTE(review): for a non-evictable partition this text still says an eviction
            // may occur, while the critical branch says it cannot — confirm this is intended.
            let who = match victim {
                Some(c) => format!("; node {} would be evicted next", c.node_id),
                None => String::new(),
            };
            format!(
                "Disk space low on {}: {} free. An eviction may occur once it reaches {}{}.",
                p.partition, free_txt, limit_txt, who,
            )
        }
        HealthLevel::Green => format!(
            "Disk space healthy on {}: {} free.",
            p.partition, free_txt
        ),
    };

    // `victim` is already `None` for non-evictable partitions, so only green needs excluding.
    let candidate = match level {
        HealthLevel::Green => None,
        HealthLevel::Warning | HealthLevel::Critical => victim.map(|c| EvictionCandidate {
            node_id: c.node_id,
            data_dir: c.data_dir.to_string_lossy().into_owned(),
            size_bytes: c.size_bytes,
        }),
    };

    HealthCheck {
        kind: HealthCheckKind::DiskSpace,
        level,
        summary,
        partition: Some(p.partition.to_string()),
        available_bytes: Some(free),
        eviction_threshold_bytes: Some(limit),
        candidate,
    }
}
/// Renders a byte count for the human-readable health summaries.
///
/// Values of 1 GiB or more are shown in GiB with two decimals (e.g. `1.50 GiB`).
/// Smaller values are shown as a whole number of MiB, truncated toward zero
/// (e.g. `400 MiB`).
///
/// The MiB figure is truncated rather than rounded so that a value just below
/// 1 GiB never renders as `1024 MiB`, and so that the reported free space is
/// never overstated in the MiB range.
fn fmt_bytes(bytes: u64) -> String {
    const GIB: u64 = 1024 * MIB;
    if bytes >= GIB {
        format!("{:.2} GiB", bytes as f64 / GIB as f64)
    } else {
        // Integer division floors, so the result here is always within 0..=1023.
        format!("{} MiB", bytes / MIB)
    }
}
// Unit tests for the disk-space classification and the fleet-level aggregation.
#[cfg(test)]
mod tests {
use super::*;
use crate::node::daemon::disk::{NodeDiskUsage, PartitionKey, PartitionState};
use std::path::PathBuf;
// Builds a node usage entry with the given id and size, rooted at `/data/node-{id}`.
fn node(id: u32, size: u64) -> NodeDiskUsage {
NodeDiskUsage {
node_id: id,
data_dir: PathBuf::from(format!("/data/node-{id}")),
size_bytes: size,
}
}
// Builds a partition state on the test partition key "p0" with the given free space and nodes.
fn partition(available_bytes: u64, nodes: Vec<NodeDiskUsage>) -> PartitionState {
PartitionState {
partition: PartitionKey::for_test("p0"),
available_bytes,
nodes,
}
}
// Thresholds used by every test: 500 MiB eviction, 1024 MiB warning.
fn thresholds() -> DiskThresholds {
DiskThresholds {
eviction_bytes: 500 * MIB,
warning_bytes: 1024 * MIB,
}
}
// 4 GiB free is above the warning threshold: green, and no candidate is reported.
#[test]
fn green_when_space_comfortable() {
let p = partition(4 * 1024 * MIB, vec![node(1, 100), node(2, 200)]);
let health = FleetHealth::from_partitions(&[p], &thresholds());
assert_eq!(health.overall, HealthLevel::Green);
assert_eq!(health.checks[0].candidate, None);
}
// 800 MiB free lies between the two thresholds: warning, with a candidate reported.
// The expected candidate is node 2, the smaller of the two nodes in this fixture; the
// selection itself is made by `PartitionState::eviction_candidate`, which is not defined here.
#[test]
fn warning_names_candidate_between_thresholds() {
let p = partition(800 * MIB, vec![node(1, 900), node(2, 100)]);
let health = FleetHealth::from_partitions(&[p], &thresholds());
assert_eq!(health.overall, HealthLevel::Warning);
assert_eq!(health.checks[0].candidate.as_ref().unwrap().node_id, 2);
}
// 400 MiB free is below the eviction threshold and two nodes are present: critical, and
// the summary announces the eviction of the candidate.
#[test]
fn critical_when_at_eviction_threshold_multi_node() {
let p = partition(400 * MIB, vec![node(1, 900), node(2, 100)]);
let health = FleetHealth::from_partitions(&[p], &thresholds());
assert_eq!(health.overall, HealthLevel::Critical);
assert_eq!(health.checks[0].candidate.as_ref().unwrap().node_id, 2);
assert!(health.checks[0].summary.contains("Evicting node 2"));
}
// Below the eviction threshold with a single node: still critical, but no candidate is
// reported and the summary explains that the sole node cannot be auto-evicted.
#[test]
fn critical_sole_node_warns_no_candidate() {
let p = partition(400 * MIB, vec![node(7, 100)]);
let health = FleetHealth::from_partitions(&[p], &thresholds());
assert_eq!(health.overall, HealthLevel::Critical);
assert!(health.checks[0].candidate.is_none());
assert!(health.checks[0].summary.contains("only one node"));
}
// The overall level is the most severe one among the partitions, and every partition
// contributes one check.
#[test]
fn overall_is_worst_across_partitions() {
let healthy = partition(4 * 1024 * MIB, vec![node(1, 100), node(2, 100)]);
let warning = partition(800 * MIB, vec![node(3, 100), node(4, 100)]);
let health = FleetHealth::from_partitions(&[healthy, warning], &thresholds());
assert_eq!(health.overall, HealthLevel::Warning);
assert_eq!(health.checks.len(), 2);
}
// With no partitions there are no checks and the overall level defaults to green.
#[test]
fn empty_fleet_is_green() {
let health = FleetHealth::from_partitions(&[], &thresholds());
assert_eq!(health.overall, HealthLevel::Green);
assert!(health.checks.is_empty());
}
}