1use serde::{Deserialize, Serialize};
14
15use super::disk::PartitionState;
16
/// Number of bytes in one mebibyte (2^20).
pub const MIB: u64 = 1 << 20;

/// Default eviction threshold, expressed in MiB (it is multiplied by [`MIB`]
/// when building the default [`DiskThresholds`]).
const EVICTION_THRESHOLD_MB: u64 = 500;

/// Default warning threshold, expressed in MiB (it is multiplied by [`MIB`]
/// when building the default [`DiskThresholds`]).
const WARNING_THRESHOLD_MB: u64 = 1024;
27
/// Severity attached to a health check and to the fleet as a whole.
///
/// Serialized in `snake_case` (`green`, `warning`, `critical`). Severity
/// ordering is defined by `HealthLevel::rank`, not by a derived `Ord`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum HealthLevel {
    /// Lowest severity (rank 0).
    Green,
    /// Intermediate severity (rank 1).
    Warning,
    /// Highest severity (rank 2).
    Critical,
}
40
41impl HealthLevel {
42 fn rank(self) -> u8 {
43 match self {
44 HealthLevel::Green => 0,
45 HealthLevel::Warning => 1,
46 HealthLevel::Critical => 2,
47 }
48 }
49
50 pub fn worst(self, other: HealthLevel) -> HealthLevel {
52 if other.rank() > self.rank() {
53 other
54 } else {
55 self
56 }
57 }
58}
59
/// Identifies what a [`HealthCheck`] measured.
///
/// Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum HealthCheckKind {
    /// Free-space check for a partition (the kind emitted by `disk_space_check`).
    DiskSpace,
}
67
/// The node a disk-space check names as the eviction candidate for a partition.
///
/// Built in `disk_space_check` from the partition's eviction candidate.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
pub struct EvictionCandidate {
    /// Identifier of the candidate node.
    pub node_id: u32,
    /// The node's data directory as a string (filled via a lossy path-to-string
    /// conversion in `disk_space_check`).
    #[schema(value_type = String)]
    pub data_dir: String,
    /// Size in bytes reported for the candidate node.
    pub size_bytes: u64,
}
77
/// Result of a single health check.
///
/// The optional fields are left out of the serialized output when they are `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
pub struct HealthCheck {
    /// What was checked.
    pub kind: HealthCheckKind,
    /// Severity of the finding.
    pub level: HealthLevel,
    /// Human-readable description of the finding.
    pub summary: String,
    /// Partition the check refers to, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub partition: Option<String>,
    /// Free space on the partition in bytes, if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub available_bytes: Option<u64>,
    /// Eviction threshold (in bytes) the check was evaluated against, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub eviction_threshold_bytes: Option<u64>,
    /// Node named as the eviction candidate, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub candidate: Option<EvictionCandidate>,
}
98
/// Aggregated health report: the individual checks plus an overall level.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, utoipa::ToSchema)]
pub struct FleetHealth {
    /// Overall level; `FleetHealth::from_partitions` sets it to the most severe
    /// level among `checks` (`Green` when there are none).
    pub overall: HealthLevel,
    /// The individual checks that make up the report.
    pub checks: Vec<HealthCheck>,
}
105
106impl FleetHealth {
107 pub fn healthy() -> Self {
109 FleetHealth {
110 overall: HealthLevel::Green,
111 checks: Vec::new(),
112 }
113 }
114
115 pub fn from_partitions(partitions: &[PartitionState], thresholds: &DiskThresholds) -> Self {
117 let checks: Vec<HealthCheck> = partitions
118 .iter()
119 .map(|p| disk_space_check(p, thresholds))
120 .collect();
121 let overall = checks
122 .iter()
123 .fold(HealthLevel::Green, |acc, c| acc.worst(c.level));
124 FleetHealth { overall, checks }
125 }
126}
127
/// Free-space thresholds used to classify a partition in `disk_space_check`.
#[derive(Debug, Clone, Copy)]
pub struct DiskThresholds {
    /// Free space (bytes) at or below which a partition is `Critical`.
    pub eviction_bytes: u64,
    /// Free space (bytes) at or below which a partition is `Warning`
    /// (when it is still above `eviction_bytes`).
    pub warning_bytes: u64,
}
136
137impl Default for DiskThresholds {
138 fn default() -> Self {
139 DiskThresholds {
140 eviction_bytes: EVICTION_THRESHOLD_MB * MIB,
141 warning_bytes: WARNING_THRESHOLD_MB * MIB,
142 }
143 }
144}
145
146fn disk_space_check(p: &PartitionState, thresholds: &DiskThresholds) -> HealthCheck {
148 let available = p.available_bytes;
149 let candidate = p.eviction_candidate();
150 let can_evict = p.nodes.len() >= 2;
154
155 let (level, summary) = if available <= thresholds.eviction_bytes {
156 if can_evict {
157 let who = candidate
158 .map(|c| format!("node {}", c.node_id))
159 .unwrap_or_else(|| "a node".to_string());
160 (
161 HealthLevel::Critical,
162 format!(
163 "Disk space critical on {}: {} free (≤ {}). Evicting {} to reclaim space.",
164 p.partition,
165 fmt_bytes(available),
166 fmt_bytes(thresholds.eviction_bytes),
167 who,
168 ),
169 )
170 } else {
171 (
172 HealthLevel::Critical,
173 format!(
174 "Disk space critical on {}: {} free (≤ {}), but only one node is running here \
175 so it cannot be auto-evicted. Free disk space or reduce node count manually.",
176 p.partition,
177 fmt_bytes(available),
178 fmt_bytes(thresholds.eviction_bytes),
179 ),
180 )
181 }
182 } else if available <= thresholds.warning_bytes {
183 let who = candidate
184 .filter(|_| can_evict)
185 .map(|c| format!("; node {} would be evicted next", c.node_id))
186 .unwrap_or_default();
187 (
188 HealthLevel::Warning,
189 format!(
190 "Disk space low on {}: {} free. An eviction may occur once it reaches {}{}.",
191 p.partition,
192 fmt_bytes(available),
193 fmt_bytes(thresholds.eviction_bytes),
194 who,
195 ),
196 )
197 } else {
198 (
199 HealthLevel::Green,
200 format!(
201 "Disk space healthy on {}: {} free.",
202 p.partition,
203 fmt_bytes(available)
204 ),
205 )
206 };
207
208 let candidate_out = if can_evict && level != HealthLevel::Green {
210 candidate.map(|c| EvictionCandidate {
211 node_id: c.node_id,
212 data_dir: c.data_dir.to_string_lossy().into_owned(),
213 size_bytes: c.size_bytes,
214 })
215 } else {
216 None
217 };
218
219 HealthCheck {
220 kind: HealthCheckKind::DiskSpace,
221 level,
222 summary,
223 partition: Some(p.partition.to_string()),
224 available_bytes: Some(available),
225 eviction_threshold_bytes: Some(thresholds.eviction_bytes),
226 candidate: candidate_out,
227 }
228}
229
230fn fmt_bytes(bytes: u64) -> String {
232 const GIB: u64 = 1024 * MIB;
233 if bytes >= GIB {
234 format!("{:.2} GiB", bytes as f64 / GIB as f64)
235 } else {
236 format!("{:.0} MiB", bytes as f64 / MIB as f64)
237 }
238}
239
#[cfg(test)]
mod tests {
    use super::*;
    use crate::node::daemon::disk::{NodeDiskUsage, PartitionKey, PartitionState};
    use std::path::PathBuf;

    /// Node usage record with a synthetic `/data/node-{id}` data directory.
    fn node(id: u32, size: u64) -> NodeDiskUsage {
        let data_dir = PathBuf::from(format!("/data/node-{id}"));
        NodeDiskUsage {
            node_id: id,
            data_dir,
            size_bytes: size,
        }
    }

    /// Partition "p0" with the given free space and nodes.
    fn partition(available_bytes: u64, nodes: Vec<NodeDiskUsage>) -> PartitionState {
        let partition = PartitionKey::for_test("p0");
        PartitionState {
            partition,
            available_bytes,
            nodes,
        }
    }

    /// Thresholds used by every test: evict at 500 MiB, warn at 1024 MiB.
    fn thresholds() -> DiskThresholds {
        DiskThresholds {
            eviction_bytes: 500 * MIB,
            warning_bytes: 1024 * MIB,
        }
    }

    /// Runs the fleet assessment over `partitions` with the test thresholds.
    fn assess(partitions: &[PartitionState]) -> FleetHealth {
        FleetHealth::from_partitions(partitions, &thresholds())
    }

    #[test]
    fn green_when_space_comfortable() {
        // 4 GiB free is well above both thresholds.
        let health = assess(&[partition(
            4 * 1024 * MIB,
            vec![node(1, 100), node(2, 200)],
        )]);
        assert_eq!(health.overall, HealthLevel::Green);
        assert!(health.checks[0].candidate.is_none());
    }

    #[test]
    fn warning_names_candidate_between_thresholds() {
        // 800 MiB free sits between the eviction and warning thresholds.
        let health = assess(&[partition(800 * MIB, vec![node(1, 900), node(2, 100)])]);
        assert_eq!(health.overall, HealthLevel::Warning);
        // NOTE(review): node 2 is the smaller node; candidate selection lives in
        // `PartitionState::eviction_candidate` and is presumably size-based — confirm.
        let named = health.checks[0].candidate.as_ref().map(|c| c.node_id);
        assert_eq!(named, Some(2));
    }

    #[test]
    fn critical_when_at_eviction_threshold_multi_node() {
        // 400 MiB free is below the 500 MiB eviction threshold.
        let health = assess(&[partition(400 * MIB, vec![node(1, 900), node(2, 100)])]);
        assert_eq!(health.overall, HealthLevel::Critical);
        let check = &health.checks[0];
        assert_eq!(check.candidate.as_ref().map(|c| c.node_id), Some(2));
        assert!(check.summary.contains("Evicting node 2"));
    }

    #[test]
    fn critical_sole_node_warns_no_candidate() {
        // A lone node is never offered as an eviction candidate.
        let health = assess(&[partition(400 * MIB, vec![node(7, 100)])]);
        assert_eq!(health.overall, HealthLevel::Critical);
        let check = &health.checks[0];
        assert_eq!(check.candidate, None);
        assert!(check.summary.contains("only one node"));
    }

    #[test]
    fn overall_is_worst_across_partitions() {
        let fleet = [
            partition(4 * 1024 * MIB, vec![node(1, 100), node(2, 100)]),
            partition(800 * MIB, vec![node(3, 100), node(4, 100)]),
        ];
        let health = assess(&fleet);
        assert_eq!(health.overall, HealthLevel::Warning);
        assert_eq!(health.checks.len(), 2);
    }

    #[test]
    fn empty_fleet_is_green() {
        let health = assess(&[]);
        assert_eq!(health.overall, HealthLevel::Green);
        assert!(health.checks.is_empty());
    }
}