// manta_shared/shared/cluster_status.rs

//! Pure helpers for summarizing node and cluster status.
//!
//! Both the CLI (table rendering) and the server (`service::hardware`,
//! `service::hw_cluster`) call into these. Keeping them in `shared/` means
//! the CLI does not have to import `crate::service::*` for what are really
//! data-only helpers.
7
8use std::collections::HashMap;
9
10use crate::shared::dto::NodeDetails;
11use manta_backend_dispatcher::types::NodeSummary;
12
/// Number of mebibytes in one gibibyte (2^10); divide a MiB quantity by
/// this to express it in GiB.
const MIB_PER_GIB: usize = 1 << 10;
15
16/// Compute a summary status from a list of node details.
17///
18/// Priority order: FAILED > OFF > ON > STANDBY > UNCONFIGURED > OK.
19/// The first matching condition wins, regardless of how many nodes
20/// fall under lower-priority states.
21///
22/// # Examples
23///
24/// One failed node makes the whole cluster `"FAILED"`, even if every
25/// other node is on and configured:
26///
27/// ```
28/// use manta_shared::shared::cluster_status::compute_summary_status;
29/// use manta_shared::shared::dto::NodeDetails;
30///
31/// fn n(power: &str, config: &str) -> NodeDetails {
32///   NodeDetails {
33///     xname: String::new(), nid: String::new(), hsm: String::new(),
34///     power_status: power.into(),
35///     desired_configuration: String::new(),
36///     configuration_status: config.into(),
37///     enabled: String::new(), error_count: String::new(),
38///     boot_image_id: String::new(), boot_configuration: String::new(),
39///     kernel_params: String::new(),
40///   }
41/// }
42///
43/// assert_eq!(
44///   compute_summary_status(&[
45///     n("ON", "failed"),
46///     n("ON", "configured"),
47///   ]),
48///   "FAILED",
49/// );
50/// assert_eq!(
51///   compute_summary_status(&[n("ON", "configured"), n("OFF", "configured")]),
52///   "OFF",
53/// );
54/// assert_eq!(compute_summary_status(&[]), "OK");
55/// ```
56pub fn compute_summary_status(nodes: &[NodeDetails]) -> &'static str {
57  if nodes
58    .iter()
59    .any(|n| n.configuration_status.eq_ignore_ascii_case("failed"))
60  {
61    "FAILED"
62  } else if nodes
63    .iter()
64    .any(|n| n.power_status.eq_ignore_ascii_case("OFF"))
65  {
66    "OFF"
67  } else if nodes
68    .iter()
69    .any(|n| n.power_status.eq_ignore_ascii_case("on"))
70  {
71    "ON"
72  } else if nodes
73    .iter()
74    .any(|n| n.power_status.eq_ignore_ascii_case("standby"))
75  {
76    "STANDBY"
77  } else if nodes
78    .iter()
79    .any(|n| !n.configuration_status.eq_ignore_ascii_case("configured"))
80  {
81    "UNCONFIGURED"
82  } else {
83    "OK"
84  }
85}
86
87/// Aggregate hardware component counts across nodes (summary view).
88///
89/// Counts processors and accelerators by info string, converts
90/// memory from MiB to GiB, and counts HSN NICs.
91pub fn calculate_hsm_hw_component_summary(
92  node_summary_vec: &[NodeSummary],
93) -> HashMap<String, usize> {
94  let mut node_hw_component_summary: HashMap<String, usize> = HashMap::new();
95
96  for node_summary in node_summary_vec {
97    for artifact_summary in &node_summary.processors {
98      if let Some(info) = artifact_summary.info.as_ref() {
99        node_hw_component_summary
100          .entry(info.to_string())
101          .and_modify(|qty| *qty += 1)
102          .or_insert(1);
103      }
104    }
105    for artifact_summary in &node_summary.node_accels {
106      if let Some(info) = artifact_summary.info.as_ref() {
107        node_hw_component_summary
108          .entry(info.to_string())
109          .and_modify(|qty| *qty += 1)
110          .or_insert(1);
111      }
112    }
113    for artifact_summary in &node_summary.memory {
114      let memory_capacity = artifact_summary
115        .info
116        .as_deref()
117        .unwrap_or("ERROR NA")
118        .split(' ')
119        .collect::<Vec<_>>()
120        .first()
121        .copied()
122        .unwrap_or("0")
123        .parse::<usize>()
124        .unwrap_or(0);
125      node_hw_component_summary
126        .entry(artifact_summary.r#type.to_string() + " (GiB)")
127        .and_modify(|qty| *qty += memory_capacity / MIB_PER_GIB)
128        .or_insert(memory_capacity / MIB_PER_GIB);
129    }
130    for artifact_summary in &node_summary.node_hsn_nics {
131      if let Some(info) = artifact_summary.info.as_ref() {
132        node_hw_component_summary
133          .entry(info.to_string())
134          .and_modify(|qty| *qty += 1)
135          .or_insert(1);
136      }
137    }
138  }
139
140  node_hw_component_summary
141}
142
143/// Compute a hardware pattern (component counts with whitespace stripped).
144pub fn get_cluster_hw_pattern(
145  hsm_summary: Vec<NodeSummary>,
146) -> HashMap<String, usize> {
147  let mut hsm_node_hw_component_count_hashmap: HashMap<String, usize> =
148    HashMap::new();
149
150  for node_summary in hsm_summary {
151    for processor in node_summary.processors {
152      if let Some(info) = processor.info {
153        hsm_node_hw_component_count_hashmap
154          .entry(info.chars().filter(|c| !c.is_whitespace()).collect())
155          .and_modify(|qty| *qty += 1)
156          .or_insert(1);
157      }
158    }
159
160    for node_accel in node_summary.node_accels {
161      if let Some(info) = node_accel.info {
162        hsm_node_hw_component_count_hashmap
163          .entry(info.chars().filter(|c| !c.is_whitespace()).collect())
164          .and_modify(|qty| *qty += 1)
165          .or_insert(1);
166      }
167    }
168
169    for memory_dimm in node_summary.memory {
170      let memory_capacity = memory_dimm
171        .info
172        .unwrap_or_else(|| "0".to_string())
173        .split(' ')
174        .next()
175        .unwrap_or("0")
176        .to_string()
177        .parse::<usize>()
178        .unwrap_or(0);
179
180      hsm_node_hw_component_count_hashmap
181        .entry("memory".to_string())
182        .and_modify(|qty| *qty += memory_capacity)
183        .or_insert(memory_capacity);
184    }
185  }
186
187  hsm_node_hw_component_count_hashmap
188}
189
#[cfg(test)]
mod tests {
  use super::*;
  use manta_backend_dispatcher::types::{ArtifactSummary, ArtifactType};

  // ---- fixtures ----

  /// A `NodeDetails` in which only the power and configuration status
  /// are filled in; every other field is an empty string.
  fn details(power_status: &str, configuration_status: &str) -> NodeDetails {
    let blank = String::new;
    NodeDetails {
      xname: blank(),
      nid: blank(),
      hsm: blank(),
      power_status: power_status.to_owned(),
      desired_configuration: blank(),
      configuration_status: configuration_status.to_owned(),
      enabled: blank(),
      error_count: blank(),
      boot_image_id: blank(),
      boot_configuration: blank(),
      kernel_params: blank(),
    }
  }

  /// One hardware artifact of the given type with an optional info string.
  fn part(kind: ArtifactType, info: Option<&str>) -> ArtifactSummary {
    ArtifactSummary {
      xname: String::new(),
      r#type: kind,
      info: info.map(str::to_owned),
    }
  }

  /// A processor artifact carrying `info`.
  fn cpu(info: &str) -> ArtifactSummary {
    part(ArtifactType::Processor, Some(info))
  }

  /// A memory artifact carrying `info`.
  fn dimm(info: &str) -> ArtifactSummary {
    part(ArtifactType::Memory, Some(info))
  }

  /// A `NodeSummary` holding exactly the given artifacts.
  fn hardware(
    processors: Vec<ArtifactSummary>,
    memory: Vec<ArtifactSummary>,
    node_accels: Vec<ArtifactSummary>,
    node_hsn_nics: Vec<ArtifactSummary>,
  ) -> NodeSummary {
    NodeSummary {
      xname: String::new(),
      r#type: String::new(),
      processors,
      memory,
      node_accels,
      node_hsn_nics,
    }
  }

  /// A `NodeSummary` with processors and nothing else.
  fn cpus_only(processors: Vec<ArtifactSummary>) -> NodeSummary {
    hardware(processors, Vec::new(), Vec::new(), Vec::new())
  }

  /// A `NodeSummary` with memory artifacts and nothing else.
  fn memory_only(memory: Vec<ArtifactSummary>) -> NodeSummary {
    hardware(Vec::new(), memory, Vec::new(), Vec::new())
  }

  // ---- compute_summary_status priority ladder ----
  //
  // Priority: FAILED > OFF > ON > STANDBY > UNCONFIGURED > OK
  //
  // Every case pairs a higher-priority node with lower-priority ones so
  // the precedence itself is pinned. Swapping two rungs (say OFF and ON)
  // would change what operators see in `manta get cluster`, and nothing
  // but these tests would notice.

  #[test]
  fn summary_status_failed_beats_everything() {
    let cluster = vec![
      details("ON", "failed"),
      details("OFF", "configured"),
      details("on", "configured"),
    ];
    assert_eq!(compute_summary_status(&cluster), "FAILED");
  }

  #[test]
  fn summary_status_off_beats_on() {
    let cluster =
      vec![details("OFF", "configured"), details("on", "configured")];
    assert_eq!(compute_summary_status(&cluster), "OFF");
  }

  #[test]
  fn summary_status_on_beats_standby() {
    let cluster =
      vec![details("on", "configured"), details("standby", "configured")];
    assert_eq!(compute_summary_status(&cluster), "ON");
  }

  #[test]
  fn summary_status_standby_beats_unconfigured() {
    let cluster =
      vec![details("standby", "configured"), details("ready", "pending")];
    assert_eq!(compute_summary_status(&cluster), "STANDBY");
  }

  #[test]
  fn summary_status_unconfigured_when_only_config_differs() {
    let cluster = vec![details("ready", "pending")];
    assert_eq!(compute_summary_status(&cluster), "UNCONFIGURED");
  }

  #[test]
  fn summary_status_ok_when_all_configured_and_no_known_power_state() {
    let cluster =
      vec![details("ready", "configured"), details("ready", "configured")];
    assert_eq!(compute_summary_status(&cluster), "OK");
  }

  #[test]
  fn summary_status_empty_input_is_ok() {
    // With no nodes nothing can match, so the result falls through to
    // OK. Pinned so callers may rely on it rather than pre-checking.
    let cluster: Vec<NodeDetails> = Vec::new();
    assert_eq!(compute_summary_status(&cluster), "OK");
  }

  #[test]
  fn summary_status_matches_case_insensitively() {
    // Both the power and the configuration comparison ignore ASCII case.
    let lower_off = vec![details("off", "configured")];
    assert_eq!(compute_summary_status(&lower_off), "OFF");

    let upper_configured = vec![details("ON", "CONFIGURED")];
    assert_eq!(compute_summary_status(&upper_configured), "ON");
  }

  // ---- calculate_hsm_hw_component_summary ----

  #[test]
  fn hw_summary_empty_input_is_empty() {
    let counts = calculate_hsm_hw_component_summary(&[]);
    assert!(counts.is_empty());
  }

  #[test]
  fn hw_summary_counts_identical_processors_across_nodes() {
    let two_sockets = cpus_only(vec![cpu("AMD EPYC 7763"), cpu("AMD EPYC 7763")]);
    let one_socket = cpus_only(vec![cpu("AMD EPYC 7763")]);

    let counts = calculate_hsm_hw_component_summary(&[two_sockets, one_socket]);

    assert_eq!(counts.get("AMD EPYC 7763").copied(), Some(3));
  }

  #[test]
  fn hw_summary_converts_memory_mib_to_gib() {
    // 524 288 MiB / 1024 = 512 GiB.
    let counts = calculate_hsm_hw_component_summary(&[memory_only(vec![
      dimm("524288 MiB"),
    ])]);

    assert_eq!(counts.get("Memory (GiB)").copied(), Some(512));
  }

  #[test]
  fn hw_summary_skips_artifacts_with_no_info_field() {
    // A processor, accelerator or NIC whose `info` is `None` must not
    // produce an entry.
    let anonymous = hardware(
      vec![part(ArtifactType::Processor, None)],
      Vec::new(),
      vec![part(ArtifactType::NodeAccel, None)],
      vec![part(ArtifactType::NodeHsnNic, None)],
    );

    let counts = calculate_hsm_hw_component_summary(&[anonymous]);

    assert!(counts.is_empty());
  }

  #[test]
  fn hw_summary_treats_unparseable_memory_as_zero() {
    // A memory info string that does not start with a number fails to
    // parse and is counted as 0, yet the entry is still created. Pinned
    // so that a future "raise on parse error" change is deliberate.
    let counts =
      calculate_hsm_hw_component_summary(&[memory_only(vec![dimm("garbage")])]);

    assert_eq!(counts.get("Memory (GiB)").copied(), Some(0));
  }

  // ---- get_cluster_hw_pattern ----

  #[test]
  fn hw_pattern_empty_input_is_empty() {
    let pattern = get_cluster_hw_pattern(Vec::new());
    assert!(pattern.is_empty());
  }

  #[test]
  fn hw_pattern_strips_whitespace_from_processor_info() {
    let pattern =
      get_cluster_hw_pattern(vec![cpus_only(vec![cpu("AMD EPYC 7763")])]);

    assert_eq!(pattern.get("AMDEPYC7763").copied(), Some(1));
    assert!(
      !pattern.contains_key("AMD EPYC 7763"),
      "whitespace-bearing key must NOT be present"
    );
  }

  #[test]
  fn hw_pattern_aggregates_memory_as_raw_value_not_gib() {
    // Unlike `calculate_hsm_hw_component_summary`, this helper does NOT
    // divide memory by 1024: the raw number is summed under the literal
    // key "memory". Guards against a well-meaning "unify the two
    // helpers" change silently shifting consumers' numbers.
    let pattern =
      get_cluster_hw_pattern(vec![memory_only(vec![dimm("512 MiB")])]);

    assert_eq!(pattern.get("memory").copied(), Some(512));
  }
}