Skip to main content

kanade_shared/wire/
agent_config.rs

1//! Layered fleet configuration that lives in the `agent_config` KV
2//! bucket (Sprint 6).
3//!
4//! Three scopes flow into the agent's effective config, in order of
5//! increasing specificity:
6//!
7//! ```text
8//! built-in default        (compiled in; floor when nothing else is set)
9//!   ↓
10//! agent_config:global     (whole-fleet default)
11//!   ↓
12//! agent_config:groups.<g> (per-group override; one or more apply)
13//!   ↓
14//! agent_config:pcs.<pc>   (per-PC override; final word)
15//! ```
16//!
//! The wire type for every scope is the same — [`ConfigScope`], a
//! struct of `Option<T>` fields. `Some` means "this scope sets this
//! field"; `None` means "fall through to the next layer". A JSON
//! `null` and an absent key both deserialize to `None`: serde maps
//! `null` to `None` for `Option` fields, and the struct-level
//! `default` fills in keys that are missing.
22//!
//! [`resolve`] is the pure functional core that flattens the scope
//! stack into an [`EffectiveConfig`] (concrete values, no Options).
//! When the same field is set by more than one group the PC belongs
//! to, the groups are applied in alphabetical order, so the
//! alphabetically last group wins (CSS-cascade style), and a
//! [`ResolutionWarning::MultiGroupConflict`] is emitted so the
//! caller can log it. This pre-empts the "why does this PC have
//! value X? none of my groups say X" debugging session.
30//!
31//! v0.20.0: `inventory_interval` / `inventory_jitter` /
32//! `inventory_enabled` removed. They were leftovers from the
33//! v0.14-retired hardcoded WMI inventory loop; runtime inventory
34//! now lives in operator-defined probe jobs (`configs/jobs/
35//! inventory-*.yaml`), so the layered config no longer carries
36//! anything about it.
37
38use std::collections::BTreeMap;
39use std::time::Duration;
40
41use serde::{Deserialize, Serialize};
42
43/// Per-scope partial config. Every field is `Option<T>`: `Some` =
44/// set, `None` = inherit from the next-less-specific scope. Serde
45/// `default` + `skip_serializing_if` keeps the wire JSON tight —
46/// unset fields don't appear in the bucket value.
47#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)]
48#[serde(default)]
49pub struct ConfigScope {
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub target_version: Option<String>,
52    /// Random sleep window applied at each agent before it starts
53    /// downloading a new target_version, so a fleet-wide rollout
54    /// doesn't slam the Object Store / broker all at once
55    /// (humantime, e.g. `"30m"`). `"0s"` (or unset) = no jitter.
56    #[serde(skip_serializing_if = "Option::is_none")]
57    pub target_version_jitter: Option<String>,
58    #[serde(skip_serializing_if = "Option::is_none")]
59    pub heartbeat_interval: Option<String>,
60}
61
62impl ConfigScope {
63    pub fn is_empty(&self) -> bool {
64        self.target_version.is_none()
65            && self.target_version_jitter.is_none()
66            && self.heartbeat_interval.is_none()
67    }
68}
69
70/// Concrete config the agent runs against once the scope stack has
71/// been flattened. `target_version` stays `Option` because "no
72/// rollout target set anywhere" is a meaningful state (the agent
73/// just keeps running the version it has); the other fields always
74/// have a value, falling back to [`EffectiveConfig::builtin_defaults`]
75/// when no scope sets them.
76#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
77pub struct EffectiveConfig {
78    pub target_version: Option<String>,
79    pub target_version_jitter: String,
80    pub heartbeat_interval: String,
81}
82
83impl EffectiveConfig {
84    /// Floor values used when no KV scope sets a given field.
85    pub fn builtin_defaults() -> Self {
86        Self {
87            target_version: None,
88            // 0s = "no jitter" = pre-Sprint-11 behaviour. Operators
89            // running ≥ 100-host fleets are expected to bump this
90            // (via `kanade agent rollout … --jitter 30m` or
91            // `kanade config set target_version_jitter=30m`) so the
92            // Object Store fan-out doesn't synchronise. See issue
93            // #26 for the broader "safe-by-default" debate.
94            target_version_jitter: "0s".to_string(),
95            heartbeat_interval: "30s".to_string(),
96        }
97    }
98
99    /// Parsed `heartbeat_interval`, falling back to the built-in
100    /// 30 s default on a malformed string. Logging the parse error
101    /// is the caller's job (so that test code can stay quiet).
102    pub fn heartbeat_duration(&self) -> Duration {
103        humantime::parse_duration(&self.heartbeat_interval).unwrap_or(Duration::from_secs(30))
104    }
105
106    /// Parsed `target_version_jitter`, falling back to zero (= no
107    /// jitter) on a malformed string. Zero means "start downloading
108    /// immediately when target_version drifts" — fine for small
109    /// fleets / canary smoke tests, bad for 3000 hosts.
110    pub fn target_version_jitter_duration(&self) -> Duration {
111        humantime::parse_duration(&self.target_version_jitter).unwrap_or(Duration::ZERO)
112    }
113}
114
115impl Default for EffectiveConfig {
116    fn default() -> Self {
117        Self::builtin_defaults()
118    }
119}
120
/// Non-fatal observations from [`resolve`] that the caller should
/// log.
///
/// There is currently a single kind: two or more of this PC's groups
/// set the same field. It is a pre-emptive debugging signal for
/// canary / wave / dept overlays that accidentally overlap.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResolutionWarning {
    /// The named field is set by more than one of the PC's groups.
    ///
    /// [`resolve`] only checks *whether* each group sets the field;
    /// it does not compare the values. The warning is therefore also
    /// emitted when every group sets the field to the same value.
    MultiGroupConflict {
        /// Name of the overlapping field, e.g. `"heartbeat_interval"`.
        field: &'static str,
        /// Group names that set this field, in alphabetical order
        /// (i.e. the application order — the last name in this list
        /// is the one whose value actually won among the groups).
        groups: Vec<String>,
    },
}
135
136/// Flatten the scope stack into an [`EffectiveConfig`].
137///
138/// * `global` — the `global` key in the `agent_config` bucket
139///   (`None` if no row yet).
140/// * `group_scopes` — every `groups.<name>` row currently in the
141///   bucket (the caller can pass all of them; only the ones whose
142///   name is in `my_groups` are applied).
143/// * `pc_scope` — the `pcs.<pc_id>` row for this agent (`None` if
144///   no row yet).
145/// * `my_groups` — this agent's current memberships (from the
146///   `agent_groups` bucket).
147///
148/// Order of application: built-in default → global → per-group
149/// (alphabetical, last wins) → per-pc. Multi-group conflicts (≥ 2
150/// of `my_groups` setting the same field) are returned as warnings
151/// alongside the resolved config.
152pub fn resolve(
153    global: Option<&ConfigScope>,
154    group_scopes: &BTreeMap<String, ConfigScope>,
155    pc_scope: Option<&ConfigScope>,
156    my_groups: &[String],
157) -> (EffectiveConfig, Vec<ResolutionWarning>) {
158    let mut out = EffectiveConfig::builtin_defaults();
159    let mut warnings = Vec::new();
160
161    if let Some(g) = global {
162        apply_scope(&mut out, g);
163    }
164
165    // Sort + dedup the group list so iteration order is deterministic
166    // and "last wins" is well-defined.
167    let mut sorted_groups: Vec<&str> = my_groups.iter().map(String::as_str).collect();
168    sorted_groups.sort();
169    sorted_groups.dedup();
170
171    // Pass 1: find multi-setter fields so the caller can warn before
172    // pass 2 silently lets the alphabetical-last value win.
173    let mut setters: BTreeMap<&'static str, Vec<String>> = BTreeMap::new();
174    for g in &sorted_groups {
175        let Some(scope) = group_scopes.get(*g) else {
176            continue;
177        };
178        if scope.target_version.is_some() {
179            setters
180                .entry("target_version")
181                .or_default()
182                .push(g.to_string());
183        }
184        if scope.target_version_jitter.is_some() {
185            setters
186                .entry("target_version_jitter")
187                .or_default()
188                .push(g.to_string());
189        }
190        if scope.heartbeat_interval.is_some() {
191            setters
192                .entry("heartbeat_interval")
193                .or_default()
194                .push(g.to_string());
195        }
196    }
197    for (field, groups) in setters {
198        if groups.len() > 1 {
199            warnings.push(ResolutionWarning::MultiGroupConflict { field, groups });
200        }
201    }
202
203    // Pass 2: actually apply, alphabetically. Last-wins by construction.
204    for g in &sorted_groups {
205        if let Some(scope) = group_scopes.get(*g) {
206            apply_scope(&mut out, scope);
207        }
208    }
209
210    if let Some(p) = pc_scope {
211        apply_scope(&mut out, p);
212    }
213
214    (out, warnings)
215}
216
217fn apply_scope(out: &mut EffectiveConfig, s: &ConfigScope) {
218    if let Some(v) = &s.target_version {
219        out.target_version = Some(v.clone());
220    }
221    if let Some(v) = &s.target_version_jitter {
222        out.target_version_jitter = v.clone();
223    }
224    if let Some(v) = &s.heartbeat_interval {
225        out.heartbeat_interval = v.clone();
226    }
227}
228
#[cfg(test)]
mod tests {
    //! Unit tests for the layered resolver, the wire format of
    //! [`ConfigScope`], and the duration helpers on
    //! [`EffectiveConfig`].

    use super::*;

    /// Empty scope, used as the `..` base in struct-update expressions.
    fn scope() -> ConfigScope {
        ConfigScope::default()
    }

    /// Scope that sets only `heartbeat_interval`.
    fn heartbeat(value: &str) -> ConfigScope {
        ConfigScope {
            heartbeat_interval: Some(value.into()),
            ..scope()
        }
    }

    /// Build the `groups.<name>` table from `(name, row)` pairs.
    fn group_map(rows: &[(&str, ConfigScope)]) -> BTreeMap<String, ConfigScope> {
        rows.iter()
            .map(|(name, row)| ((*name).to_owned(), row.clone()))
            .collect()
    }

    /// Build a membership list from string literals.
    fn members(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| (*name).to_owned()).collect()
    }

    #[test]
    fn empty_stack_gives_builtin_defaults() {
        let (eff, warns) = resolve(None, &BTreeMap::new(), None, &[]);
        assert_eq!(eff, EffectiveConfig::builtin_defaults());
        assert!(warns.is_empty());
    }

    #[test]
    fn default_matches_builtin_defaults() {
        assert_eq!(
            EffectiveConfig::default(),
            EffectiveConfig::builtin_defaults()
        );
    }

    #[test]
    fn global_only() {
        let g = heartbeat("60s");
        let (eff, _) = resolve(Some(&g), &BTreeMap::new(), None, &[]);
        assert_eq!(eff.heartbeat_interval, "60s");
        // Unset fields stay at builtin defaults.
        assert_eq!(eff.target_version_jitter, "0s");
        assert!(eff.target_version.is_none());
    }

    #[test]
    fn group_overrides_global() {
        let global = heartbeat("30s");
        let groups = group_map(&[("canary", heartbeat("5s"))]);
        let (eff, warns) = resolve(Some(&global), &groups, None, &members(&["canary"]));
        assert_eq!(eff.heartbeat_interval, "5s");
        assert!(warns.is_empty());
    }

    #[test]
    fn pc_overrides_group() {
        let groups = group_map(&[("wave1", heartbeat("30s"))]);
        let pc = heartbeat("5s");
        let (eff, _) = resolve(None, &groups, Some(&pc), &members(&["wave1"]));
        assert_eq!(eff.heartbeat_interval, "5s");
    }

    #[test]
    fn pc_overrides_global_when_no_group_match() {
        let global = heartbeat("30s");
        let pc = heartbeat("5s");
        let (eff, _) = resolve(Some(&global), &BTreeMap::new(), Some(&pc), &[]);
        assert_eq!(eff.heartbeat_interval, "5s");
    }

    #[test]
    fn partial_override_only_changes_named_fields() {
        let global = ConfigScope {
            target_version_jitter: Some("30m".into()),
            heartbeat_interval: Some("30s".into()),
            ..scope()
        };
        // The pc row intentionally does not touch target_version_jitter.
        let pc = heartbeat("15s");
        let (eff, _) = resolve(Some(&global), &BTreeMap::new(), Some(&pc), &[]);
        assert_eq!(eff.target_version_jitter, "30m"); // from global
        assert_eq!(eff.heartbeat_interval, "15s"); // from pc
    }

    #[test]
    fn multi_group_conflict_emits_warning() {
        let groups = group_map(&[("wave1", heartbeat("5s")), ("dept-eng", heartbeat("60s"))]);
        let (eff, warns) = resolve(None, &groups, None, &members(&["wave1", "dept-eng"]));
        // "dept-eng" sorts before "wave1", so wave1 wins (last alphabetical).
        assert_eq!(eff.heartbeat_interval, "5s");
        assert_eq!(warns.len(), 1);
        match &warns[0] {
            ResolutionWarning::MultiGroupConflict { field, groups } => {
                assert_eq!(*field, "heartbeat_interval");
                assert_eq!(groups, &vec!["dept-eng".to_string(), "wave1".to_string()]);
            }
        }
    }

    #[test]
    fn group_alphabetical_last_wins_no_conflict_when_only_one_sets() {
        let groups = group_map(&[
            ("wave1", heartbeat("5s")),
            (
                "dept-eng",
                ConfigScope {
                    // Different field — doesn't conflict.
                    target_version_jitter: Some("15m".into()),
                    ..scope()
                },
            ),
        ]);
        let (eff, warns) = resolve(None, &groups, None, &members(&["wave1", "dept-eng"]));
        assert_eq!(eff.heartbeat_interval, "5s");
        assert_eq!(eff.target_version_jitter, "15m");
        assert!(warns.is_empty());
    }

    #[test]
    fn unknown_group_is_silently_ignored() {
        // my_groups names a group that has no scope row yet. Common
        // on the first agent that joins a freshly-named group; the
        // resolver should treat it as a no-op, not an error.
        let groups = group_map(&[("canary", heartbeat("5s"))]);
        let (eff, warns) = resolve(None, &groups, None, &members(&["canary", "ghost-group"]));
        assert_eq!(eff.heartbeat_interval, "5s");
        assert!(warns.is_empty());
    }

    #[test]
    fn group_scope_not_applied_when_pc_not_in_group() {
        let groups = group_map(&[(
            "canary",
            ConfigScope {
                target_version: Some("0.3.0".into()),
                ..scope()
            },
        )]);
        let (eff, _) = resolve(None, &groups, None, &members(&["dept-eng"]));
        // PC is NOT in canary, so the rollout target shouldn't apply.
        assert!(eff.target_version.is_none());
    }

    #[test]
    fn duplicate_group_names_dedup_silently() {
        let groups = group_map(&[("wave1", heartbeat("5s"))]);
        // my_groups carries the same name twice — the dedup pass
        // keeps it from looking like a conflict-with-self.
        let (eff, warns) = resolve(None, &groups, None, &members(&["wave1", "wave1"]));
        assert_eq!(eff.heartbeat_interval, "5s");
        assert!(warns.is_empty());
    }

    #[test]
    fn config_scope_serde_round_trip() {
        let s = ConfigScope {
            target_version: Some("0.3.0".into()),
            heartbeat_interval: Some("15s".into()),
            ..scope()
        };
        let json = serde_json::to_string(&s).unwrap();
        // Only set fields appear in JSON.
        assert_eq!(
            json,
            r#"{"target_version":"0.3.0","heartbeat_interval":"15s"}"#
        );
        let back: ConfigScope = serde_json::from_str(&json).unwrap();
        assert_eq!(back, s);
    }

    #[test]
    fn empty_config_scope_round_trips_as_empty_json() {
        let s = ConfigScope::default();
        assert!(s.is_empty());
        let json = serde_json::to_string(&s).unwrap();
        assert_eq!(json, "{}");
        let back: ConfigScope = serde_json::from_str(&json).unwrap();
        assert_eq!(back, s);
    }

    #[test]
    fn is_empty_is_false_when_any_field_is_set() {
        assert!(!heartbeat("5s").is_empty());
        let version_only = ConfigScope {
            target_version: Some("0.3.0".into()),
            ..scope()
        };
        assert!(!version_only.is_empty());
        let jitter_only = ConfigScope {
            target_version_jitter: Some("30m".into()),
            ..scope()
        };
        assert!(!jitter_only.is_empty());
    }

    #[test]
    fn deserialize_tolerates_unknown_fields_for_forward_compat() {
        // Older agent / backend builds should keep parsing in case
        // we add fields later. v0.20 also relies on this so pre-v0.20
        // rows that still have inventory_interval / inventory_jitter
        // / inventory_enabled in the bucket value parse OK as the
        // new (smaller) ConfigScope — the dropped fields just
        // dissolve into "unknown, ignored".
        let json =
            r#"{"target_version":"0.3.0","inventory_interval":"24h","future_knob":"future_value"}"#;
        let s: ConfigScope = serde_json::from_str(json).unwrap();
        assert_eq!(s.target_version.as_deref(), Some("0.3.0"));
    }

    #[test]
    fn pc_does_not_override_other_pcs() {
        // Sanity: pc_scope passed in is by definition the row for THIS
        // pc; the caller is responsible for picking the right one.
        // This test guards against a future refactor that accidentally
        // wires in the wrong scope by ensuring the apply happens last
        // (after global and groups), so the PC value is the visible
        // one, while fields the PC row leaves unset still come from
        // the less specific layers.
        let global = heartbeat("60s");
        let groups = group_map(&[(
            "wave1",
            ConfigScope {
                target_version_jitter: Some("15m".into()),
                heartbeat_interval: Some("30s".into()),
                ..scope()
            },
        )]);
        let pc = heartbeat("5s");
        let (eff, warns) = resolve(Some(&global), &groups, Some(&pc), &members(&["wave1"]));
        assert_eq!(eff.heartbeat_interval, "5s"); // from pc
        assert_eq!(eff.target_version_jitter, "15m"); // from group
        assert!(warns.is_empty());
    }

    #[test]
    fn heartbeat_duration_parses_and_falls_back() {
        let parsed = EffectiveConfig {
            heartbeat_interval: "90s".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(parsed.heartbeat_duration(), Duration::from_secs(90));

        // A malformed string falls back to the built-in 30 s.
        let malformed = EffectiveConfig {
            heartbeat_interval: "garbage".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(malformed.heartbeat_duration(), Duration::from_secs(30));
    }

    #[test]
    fn target_version_jitter_duration_parses_and_falls_back() {
        let parsed = EffectiveConfig {
            target_version_jitter: "30m".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(
            parsed.target_version_jitter_duration(),
            Duration::from_secs(30 * 60)
        );

        // A malformed string falls back to zero, i.e. no jitter.
        let malformed = EffectiveConfig {
            target_version_jitter: "garbage".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(malformed.target_version_jitter_duration(), Duration::ZERO);
    }

    #[test]
    fn builtin_defaults_yield_expected_durations() {
        let eff = EffectiveConfig::builtin_defaults();
        assert_eq!(eff.heartbeat_duration(), Duration::from_secs(30));
        assert_eq!(eff.target_version_jitter_duration(), Duration::ZERO);
    }
}