// ant_core/node/daemon/health.rs

//! Fleet health: an extensible, always-available indicator of node-fleet health.
//!
//! The model is deliberately generic — [`FleetHealth`] is a list of [`HealthCheck`]s plus an
//! `overall` level — so future signals (connectivity, sync lag, reward health, …) can be added as
//! new [`HealthCheckKind`]s without changing the API surface the CLI and GUI consume.
//!
//! The first and only check today is **disk space**. It answers, per partition: are we comfortable
//! (green), is an eviction likely soon (warning, with the candidate named), or is the partition at
//! the eviction threshold (critical)? The candidate it names is computed by [`super::disk`] — the
//! exact same selection the eviction monitor uses, so the warning never points at a different node
//! than the one that actually gets evicted.

use serde::{Deserialize, Serialize};

use super::disk::PartitionState;

/// One mebibyte (1024 × 1024 bytes), in bytes.
pub const MIB: u64 = 1024 * 1024;

/// Fixed free-space floor at which the daemon evicts a node, mirroring the node's own refuse-to-store
/// reserve in `ant-node`'s storage layer. Internal constant — deliberately not user-configurable.
///
/// Despite the `_MB` suffix the unit is mebibytes: the value is multiplied by [`MIB`] when the
/// default thresholds are built.
const EVICTION_THRESHOLD_MB: u64 = 500;

/// Fixed free-space level at which the fleet health turns to `Warning` and names the node that would
/// be evicted next. Internal constant — deliberately not user-configurable.
///
/// Despite the `_MB` suffix the unit is mebibytes: the value is multiplied by [`MIB`] when the
/// default thresholds are built.
const WARNING_THRESHOLD_MB: u64 = 1024;

28/// Severity level for a single check or the fleet as a whole.
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
30#[serde(rename_all = "snake_case")]
31pub enum HealthLevel {
32    /// Everything is comfortable.
33    Green,
34    /// Action may be needed soon (e.g. an eviction is approaching).
35    Warning,
36    /// At or past a hard limit right now (e.g. eviction is imminent, or space is exhausted and the
37    /// daemon cannot help automatically).
38    Critical,
39}
40
41impl HealthLevel {
42    fn rank(self) -> u8 {
43        match self {
44            HealthLevel::Green => 0,
45            HealthLevel::Warning => 1,
46            HealthLevel::Critical => 2,
47        }
48    }
49
50    /// The more severe of two levels.
51    pub fn worst(self, other: HealthLevel) -> HealthLevel {
52        if other.rank() > self.rank() {
53            other
54        } else {
55            self
56        }
57    }
58}
59
60/// Which signal produced a [`HealthCheck`]. Extensible: new variants slot in here.
61#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
62#[serde(rename_all = "snake_case")]
63pub enum HealthCheckKind {
64    /// Free disk space at node data directories.
65    DiskSpace,
66}
67
68/// The node a check has identified as the next eviction candidate.
69#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
70pub struct EvictionCandidate {
71    pub node_id: u32,
72    #[schema(value_type = String)]
73    pub data_dir: String,
74    /// Bytes the candidate's data directory currently occupies (≈ space its eviction would free).
75    pub size_bytes: u64,
76}
77
78/// A single health finding.
79#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
80pub struct HealthCheck {
81    pub kind: HealthCheckKind,
82    pub level: HealthLevel,
83    /// Human-readable, user-facing one-liner.
84    pub summary: String,
85    /// Partition this finding concerns (disk checks only).
86    #[serde(skip_serializing_if = "Option::is_none")]
87    pub partition: Option<String>,
88    /// Free bytes on the partition (disk checks only).
89    #[serde(skip_serializing_if = "Option::is_none")]
90    pub available_bytes: Option<u64>,
91    /// Free-space floor at which an eviction triggers (disk checks only).
92    #[serde(skip_serializing_if = "Option::is_none")]
93    pub eviction_threshold_bytes: Option<u64>,
94    /// The node that would be evicted next, when one applies.
95    #[serde(skip_serializing_if = "Option::is_none")]
96    pub candidate: Option<EvictionCandidate>,
97}
98
99/// Fleet-wide health snapshot: the worst level across all checks, plus the individual findings.
100#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
101pub struct FleetHealth {
102    pub overall: HealthLevel,
103    pub checks: Vec<HealthCheck>,
104}
105
106impl FleetHealth {
107    /// A green snapshot with no findings (e.g. no nodes registered).
108    pub fn healthy() -> Self {
109        FleetHealth {
110            overall: HealthLevel::Green,
111            checks: Vec::new(),
112        }
113    }
114
115    /// Build the fleet health from measured partition states and the configured disk thresholds.
116    pub fn from_partitions(partitions: &[PartitionState], thresholds: &DiskThresholds) -> Self {
117        let checks: Vec<HealthCheck> = partitions
118            .iter()
119            .map(|p| disk_space_check(p, thresholds))
120            .collect();
121        let overall = checks
122            .iter()
123            .fold(HealthLevel::Green, |acc, c| acc.worst(c.level));
124        FleetHealth { overall, checks }
125    }
126}
127
/// Configured free-space thresholds for the disk-space check.
///
/// Both values are byte counts and are compared inclusively ("at or below") against a
/// partition's free space.
#[derive(Debug, Clone, Copy)]
pub struct DiskThresholds {
    /// Evict a node once free space falls to/below this many bytes.
    pub eviction_bytes: u64,
    /// Turn the health to `Warning` once free space falls to/below this many bytes.
    pub warning_bytes: u64,
}

137impl Default for DiskThresholds {
138    fn default() -> Self {
139        DiskThresholds {
140            eviction_bytes: EVICTION_THRESHOLD_MB * MIB,
141            warning_bytes: WARNING_THRESHOLD_MB * MIB,
142        }
143    }
144}
145
146/// Evaluate the disk-space health of a single partition.
147fn disk_space_check(p: &PartitionState, thresholds: &DiskThresholds) -> HealthCheck {
148    let available = p.available_bytes;
149    let candidate = p.eviction_candidate();
150    // Eviction only helps the *other* nodes on the partition, so it is only possible when at least
151    // two nodes share it (one is evicted, at least one remains to benefit). A partition with a
152    // single node therefore cannot be auto-helped — this subsumes the "only one node running" case.
153    let can_evict = p.nodes.len() >= 2;
154
155    let (level, summary) = if available <= thresholds.eviction_bytes {
156        if can_evict {
157            let who = candidate
158                .map(|c| format!("node {}", c.node_id))
159                .unwrap_or_else(|| "a node".to_string());
160            (
161                HealthLevel::Critical,
162                format!(
163                    "Disk space critical on {}: {} free (≤ {}). Evicting {} to reclaim space.",
164                    p.partition,
165                    fmt_bytes(available),
166                    fmt_bytes(thresholds.eviction_bytes),
167                    who,
168                ),
169            )
170        } else {
171            (
172                HealthLevel::Critical,
173                format!(
174                    "Disk space critical on {}: {} free (≤ {}), but only one node is running here \
175                     so it cannot be auto-evicted. Free disk space or reduce node count manually.",
176                    p.partition,
177                    fmt_bytes(available),
178                    fmt_bytes(thresholds.eviction_bytes),
179                ),
180            )
181        }
182    } else if available <= thresholds.warning_bytes {
183        let who = candidate
184            .filter(|_| can_evict)
185            .map(|c| format!("; node {} would be evicted next", c.node_id))
186            .unwrap_or_default();
187        (
188            HealthLevel::Warning,
189            format!(
190                "Disk space low on {}: {} free. An eviction may occur once it reaches {}{}.",
191                p.partition,
192                fmt_bytes(available),
193                fmt_bytes(thresholds.eviction_bytes),
194                who,
195            ),
196        )
197    } else {
198        (
199            HealthLevel::Green,
200            format!(
201                "Disk space healthy on {}: {} free.",
202                p.partition,
203                fmt_bytes(available)
204            ),
205        )
206    };
207
208    // Only surface a candidate when an eviction could actually happen.
209    let candidate_out = if can_evict && level != HealthLevel::Green {
210        candidate.map(|c| EvictionCandidate {
211            node_id: c.node_id,
212            data_dir: c.data_dir.to_string_lossy().into_owned(),
213            size_bytes: c.size_bytes,
214        })
215    } else {
216        None
217    };
218
219    HealthCheck {
220        kind: HealthCheckKind::DiskSpace,
221        level,
222        summary,
223        partition: Some(p.partition.to_string()),
224        available_bytes: Some(available),
225        eviction_threshold_bytes: Some(thresholds.eviction_bytes),
226        candidate: candidate_out,
227    }
228}
229
230/// Format a byte count as a human-friendly string (GiB/MiB).
231fn fmt_bytes(bytes: u64) -> String {
232    const GIB: u64 = 1024 * MIB;
233    if bytes >= GIB {
234        format!("{:.2} GiB", bytes as f64 / GIB as f64)
235    } else {
236        format!("{:.0} MiB", bytes as f64 / MIB as f64)
237    }
238}
239
#[cfg(test)]
mod tests {
    use super::*;
    use crate::node::daemon::disk::{NodeDiskUsage, PartitionKey, PartitionState};
    use std::path::PathBuf;

    /// A node with the given id and on-disk size, rooted at `/data/node-<id>`.
    fn node(id: u32, size: u64) -> NodeDiskUsage {
        NodeDiskUsage {
            node_id: id,
            data_dir: PathBuf::from(format!("/data/node-{id}")),
            size_bytes: size,
        }
    }

    /// A test partition named `p0` with the given free space and nodes.
    fn partition(available_bytes: u64, nodes: Vec<NodeDiskUsage>) -> PartitionState {
        PartitionState {
            partition: PartitionKey::for_test("p0"),
            available_bytes,
            nodes,
        }
    }

    /// Thresholds used throughout these tests: evict at 500 MiB, warn at 1024 MiB.
    fn thresholds() -> DiskThresholds {
        DiskThresholds {
            eviction_bytes: 500 * MIB,
            warning_bytes: 1024 * MIB,
        }
    }

    #[test]
    fn green_when_space_comfortable() {
        let p = partition(4 * 1024 * MIB, vec![node(1, 100), node(2, 200)]);
        let health = FleetHealth::from_partitions(&[p], &thresholds());
        assert_eq!(health.overall, HealthLevel::Green);
        assert_eq!(health.checks[0].candidate, None);
    }

    #[test]
    fn warning_names_candidate_between_thresholds() {
        // 800 MiB free: below warning (1024) but above eviction (500).
        let p = partition(800 * MIB, vec![node(1, 900), node(2, 100)]);
        let health = FleetHealth::from_partitions(&[p], &thresholds());
        assert_eq!(health.overall, HealthLevel::Warning);
        // Smallest node (2) is the candidate.
        assert_eq!(health.checks[0].candidate.as_ref().unwrap().node_id, 2);
    }

    #[test]
    fn critical_when_at_eviction_threshold_multi_node() {
        let p = partition(400 * MIB, vec![node(1, 900), node(2, 100)]);
        let health = FleetHealth::from_partitions(&[p], &thresholds());
        assert_eq!(health.overall, HealthLevel::Critical);
        assert_eq!(health.checks[0].candidate.as_ref().unwrap().node_id, 2);
        assert!(health.checks[0].summary.contains("Evicting node 2"));
    }

    #[test]
    fn critical_sole_node_warns_no_candidate() {
        // Single node on a full partition: cannot auto-evict.
        let p = partition(400 * MIB, vec![node(7, 100)]);
        let health = FleetHealth::from_partitions(&[p], &thresholds());
        assert_eq!(health.overall, HealthLevel::Critical);
        assert!(health.checks[0].candidate.is_none());
        assert!(health.checks[0].summary.contains("only one node"));
    }

    #[test]
    fn overall_is_worst_across_partitions() {
        let healthy = partition(4 * 1024 * MIB, vec![node(1, 100), node(2, 100)]);
        let warning = partition(800 * MIB, vec![node(3, 100), node(4, 100)]);
        let health = FleetHealth::from_partitions(&[healthy, warning], &thresholds());
        assert_eq!(health.overall, HealthLevel::Warning);
        assert_eq!(health.checks.len(), 2);
    }

    #[test]
    fn empty_fleet_is_green() {
        let health = FleetHealth::from_partitions(&[], &thresholds());
        assert_eq!(health.overall, HealthLevel::Green);
        assert!(health.checks.is_empty());
    }
}