Skip to main content

kanade_shared/wire/
agent_config.rs

1//! Layered fleet configuration that lives in the `agent_config` KV
2//! bucket (Sprint 6).
3//!
4//! Three scopes flow into the agent's effective config, in order of
5//! increasing specificity:
6//!
7//! ```text
8//! built-in default        (compiled in; floor when nothing else is set)
9//!   ↓
10//! agent_config:global     (whole-fleet default)
11//!   ↓
12//! agent_config:groups.<g> (per-group override; one or more apply)
13//!   ↓
14//! agent_config:pcs.<pc>   (per-PC override; final word)
15//! ```
16//!
//! The wire type for every scope is the same — [`ConfigScope`], a
//! struct of `Option<T>` fields. `Some` means "this scope sets this
//! field"; `None` means "inherit from the next less-specific layer".
//! JSON `null` and an absent field are equivalent: serde deserializes
//! both into `None` for an `Option` field.
22//!
23//! [`resolve`] is the pure functional core that flattens the scope
24//! stack into an [`EffectiveConfig`] (concrete values, no Options).
25//! When the same field is set on more than one group the PC belongs
26//! to, alphabetical group order wins last (CSS-cascade style) and a
27//! [`ResolutionWarning::MultiGroupConflict`] is emitted so the
28//! caller can log it — pre-empts the "why does this PC have value X?
29//! none of my groups say X" debugging session.
30
31use std::collections::BTreeMap;
32use std::time::Duration;
33
34use serde::{Deserialize, Serialize};
35
/// Per-scope partial config. Every field is `Option<T>`: `Some` =
/// set, `None` = inherit from the next-less-specific scope. Serde
/// `default` + `skip_serializing_if` keeps the wire JSON tight —
/// unset fields don't appear in the bucket value.
///
/// The same type is used for the global, per-group and per-PC rows.
/// Duration-like fields are kept as raw strings here; they are only
/// parsed (with humantime) by the `*_duration` helpers on
/// [`EffectiveConfig`].
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)]
#[serde(default)]
pub struct ConfigScope {
    /// Rollout target version for the agents this scope applies to.
    /// `None` leaves the decision to a less specific scope.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_version: Option<String>,
    /// Random sleep window applied at each agent before it starts
    /// downloading a new target_version, so a fleet-wide rollout
    /// doesn't slam the Object Store / broker all at once
    /// (humantime, e.g. `"30m"`). `"0s"` (or unset) = no jitter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_version_jitter: Option<String>,
    /// Inventory interval as a humantime string (e.g. `"24h"`).
    /// NOTE(review): presumably the period between inventory runs —
    /// confirm against the agent's scheduler.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub inventory_interval: Option<String>,
    /// Inventory jitter as a humantime string (e.g. `"10m"`).
    /// NOTE(review): presumably a random spread added to each
    /// inventory run — confirm against the agent's scheduler.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub inventory_jitter: Option<String>,
    /// Inventory on/off switch.
    /// NOTE(review): presumably `false` disables inventory collection
    /// entirely — confirm against the agent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub inventory_enabled: Option<bool>,
    /// Heartbeat interval as a humantime string (e.g. `"30s"`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub heartbeat_interval: Option<String>,
}
60
61impl ConfigScope {
62    pub fn is_empty(&self) -> bool {
63        self.target_version.is_none()
64            && self.target_version_jitter.is_none()
65            && self.inventory_interval.is_none()
66            && self.inventory_jitter.is_none()
67            && self.inventory_enabled.is_none()
68            && self.heartbeat_interval.is_none()
69    }
70}
71
/// Concrete config the agent runs against once the scope stack has
/// been flattened. `target_version` stays `Option` because "no
/// rollout target set anywhere" is a meaningful state (the agent
/// just keeps running the version it has); the other fields always
/// have a value, falling back to [`EffectiveConfig::builtin_defaults`]
/// when no scope sets them.
///
/// Duration-like fields hold the raw humantime string exactly as the
/// winning scope wrote it; use the `*_duration` helpers to get a
/// parsed [`Duration`].
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct EffectiveConfig {
    /// Rollout target version, or `None` when no scope sets one.
    pub target_version: Option<String>,
    /// Raw humantime string; built-in default `"0s"` (no jitter).
    pub target_version_jitter: String,
    /// Raw humantime string; built-in default `"24h"`.
    pub inventory_interval: String,
    /// Raw humantime string; built-in default `"10m"`.
    pub inventory_jitter: String,
    /// Built-in default `true`.
    pub inventory_enabled: bool,
    /// Raw humantime string; built-in default `"30s"`.
    pub heartbeat_interval: String,
}
87
88impl EffectiveConfig {
89    /// Floor values used when no KV scope sets a given field.
90    /// Mirrors the historic agent.toml defaults so unbootstrapped
91    /// fleets keep behaving the way they did pre-Sprint 6.
92    pub fn builtin_defaults() -> Self {
93        Self {
94            target_version: None,
95            // 0s = "no jitter" = pre-Sprint-11 behaviour. Operators
96            // running ≥ 100-host fleets are expected to bump this
97            // (via `kanade agent rollout … --jitter 30m` or
98            // `kanade config set target_version_jitter=30m`) so the
99            // Object Store fan-out doesn't synchronise.
100            target_version_jitter: "0s".to_string(),
101            inventory_interval: "24h".to_string(),
102            inventory_jitter: "10m".to_string(),
103            inventory_enabled: true,
104            heartbeat_interval: "30s".to_string(),
105        }
106    }
107
108    /// Parsed `heartbeat_interval`, falling back to the built-in
109    /// 30 s default on a malformed string. Logging the parse error
110    /// is the caller's job (so that test code can stay quiet).
111    pub fn heartbeat_duration(&self) -> Duration {
112        humantime::parse_duration(&self.heartbeat_interval).unwrap_or(Duration::from_secs(30))
113    }
114
115    pub fn inventory_interval_duration(&self) -> Duration {
116        humantime::parse_duration(&self.inventory_interval)
117            .unwrap_or(Duration::from_secs(24 * 60 * 60))
118    }
119
120    pub fn inventory_jitter_duration(&self) -> Duration {
121        humantime::parse_duration(&self.inventory_jitter).unwrap_or(Duration::from_secs(600))
122    }
123
124    /// Parsed `target_version_jitter`, falling back to zero (= no
125    /// jitter) on a malformed string. Zero means "start downloading
126    /// immediately when target_version drifts" — fine for small
127    /// fleets / canary smoke tests, bad for 3000 hosts.
128    pub fn target_version_jitter_duration(&self) -> Duration {
129        humantime::parse_duration(&self.target_version_jitter).unwrap_or(Duration::ZERO)
130    }
131}
132
133impl Default for EffectiveConfig {
134    fn default() -> Self {
135        Self::builtin_defaults()
136    }
137}
138
/// Non-fatal observations from [`resolve`] that the caller should
/// log. Currently only "two or more of this PC's groups set the same
/// field" — useful pre-emptive debugging signal when canary / wave /
/// dept overlays accidentally overlap.
///
/// The values are not compared: the warning is emitted even when
/// every group sets the field to the same value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResolutionWarning {
    /// More than one of the agent's groups sets `field`.
    MultiGroupConflict {
        /// Wire name of the contested field, e.g.
        /// `"inventory_interval"`.
        field: &'static str,
        /// Group names that set this field, in alphabetical order
        /// (i.e. the application order — the last name in this list
        /// is the one whose value actually won).
        groups: Vec<String>,
    },
}
153
154/// Flatten the scope stack into an [`EffectiveConfig`].
155///
156/// * `global` — the `global` key in the `agent_config` bucket
157///   (`None` if no row yet).
158/// * `group_scopes` — every `groups.<name>` row currently in the
159///   bucket (the caller can pass all of them; only the ones whose
160///   name is in `my_groups` are applied).
161/// * `pc_scope` — the `pcs.<pc_id>` row for this agent (`None` if
162///   no row yet).
163/// * `my_groups` — this agent's current memberships (from the
164///   `agent_groups` bucket).
165///
166/// Order of application: built-in default → global → per-group
167/// (alphabetical, last wins) → per-pc. Multi-group conflicts (≥ 2
168/// of `my_groups` setting the same field) are returned as warnings
169/// alongside the resolved config.
170pub fn resolve(
171    global: Option<&ConfigScope>,
172    group_scopes: &BTreeMap<String, ConfigScope>,
173    pc_scope: Option<&ConfigScope>,
174    my_groups: &[String],
175) -> (EffectiveConfig, Vec<ResolutionWarning>) {
176    let mut out = EffectiveConfig::builtin_defaults();
177    let mut warnings = Vec::new();
178
179    if let Some(g) = global {
180        apply_scope(&mut out, g);
181    }
182
183    // Sort + dedup the group list so iteration order is deterministic
184    // and "last wins" is well-defined.
185    let mut sorted_groups: Vec<&str> = my_groups.iter().map(String::as_str).collect();
186    sorted_groups.sort();
187    sorted_groups.dedup();
188
189    // Pass 1: find multi-setter fields so the caller can warn before
190    // pass 2 silently lets the alphabetical-last value win.
191    let mut setters: BTreeMap<&'static str, Vec<String>> = BTreeMap::new();
192    for g in &sorted_groups {
193        let Some(scope) = group_scopes.get(*g) else {
194            continue;
195        };
196        if scope.target_version.is_some() {
197            setters
198                .entry("target_version")
199                .or_default()
200                .push(g.to_string());
201        }
202        if scope.target_version_jitter.is_some() {
203            setters
204                .entry("target_version_jitter")
205                .or_default()
206                .push(g.to_string());
207        }
208        if scope.inventory_interval.is_some() {
209            setters
210                .entry("inventory_interval")
211                .or_default()
212                .push(g.to_string());
213        }
214        if scope.inventory_jitter.is_some() {
215            setters
216                .entry("inventory_jitter")
217                .or_default()
218                .push(g.to_string());
219        }
220        if scope.inventory_enabled.is_some() {
221            setters
222                .entry("inventory_enabled")
223                .or_default()
224                .push(g.to_string());
225        }
226        if scope.heartbeat_interval.is_some() {
227            setters
228                .entry("heartbeat_interval")
229                .or_default()
230                .push(g.to_string());
231        }
232    }
233    for (field, groups) in setters {
234        if groups.len() > 1 {
235            warnings.push(ResolutionWarning::MultiGroupConflict { field, groups });
236        }
237    }
238
239    // Pass 2: actually apply, alphabetically. Last-wins by construction.
240    for g in &sorted_groups {
241        if let Some(scope) = group_scopes.get(*g) {
242            apply_scope(&mut out, scope);
243        }
244    }
245
246    if let Some(p) = pc_scope {
247        apply_scope(&mut out, p);
248    }
249
250    (out, warnings)
251}
252
253fn apply_scope(out: &mut EffectiveConfig, s: &ConfigScope) {
254    if let Some(v) = &s.target_version {
255        out.target_version = Some(v.clone());
256    }
257    if let Some(v) = &s.target_version_jitter {
258        out.target_version_jitter = v.clone();
259    }
260    if let Some(v) = &s.inventory_interval {
261        out.inventory_interval = v.clone();
262    }
263    if let Some(v) = &s.inventory_jitter {
264        out.inventory_jitter = v.clone();
265    }
266    if let Some(v) = s.inventory_enabled {
267        out.inventory_enabled = v;
268    }
269    if let Some(v) = &s.heartbeat_interval {
270        out.heartbeat_interval = v.clone();
271    }
272}
273
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty scope to use as the base of struct-update expressions.
    fn scope() -> ConfigScope {
        ConfigScope::default()
    }

    /// Scope that sets `inventory_interval` and nothing else.
    fn inventory_every(interval: &str) -> ConfigScope {
        ConfigScope {
            inventory_interval: Some(interval.to_owned()),
            ..scope()
        }
    }

    /// Build the `groups.<name>` rows from `(name, scope)` pairs.
    fn group_rows(rows: &[(&str, ConfigScope)]) -> BTreeMap<String, ConfigScope> {
        rows.iter()
            .map(|(name, row)| ((*name).to_owned(), row.clone()))
            .collect()
    }

    /// Owned membership list in the shape `resolve` takes for
    /// `my_groups`.
    fn member_of(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| (*name).to_owned()).collect()
    }

    #[test]
    fn empty_stack_gives_builtin_defaults() {
        let (eff, warns) = resolve(None, &BTreeMap::new(), None, &[]);
        assert_eq!(eff, EffectiveConfig::builtin_defaults());
        assert!(warns.is_empty());
    }

    #[test]
    fn global_only() {
        let g = ConfigScope {
            inventory_interval: Some("12h".into()),
            heartbeat_interval: Some("60s".into()),
            ..scope()
        };
        let (eff, _) = resolve(Some(&g), &BTreeMap::new(), None, &[]);
        assert_eq!(eff.inventory_interval, "12h");
        assert_eq!(eff.heartbeat_interval, "60s");
        // Unset fields stay at builtin defaults.
        assert_eq!(eff.inventory_jitter, "10m");
        assert_eq!(eff.target_version_jitter, "0s");
        assert!(eff.inventory_enabled);
        assert!(eff.target_version.is_none());
    }

    #[test]
    fn group_overrides_global() {
        let global = inventory_every("24h");
        let groups = group_rows(&[("canary", inventory_every("1h"))]);
        let (eff, warns) = resolve(Some(&global), &groups, None, &member_of(&["canary"]));
        assert_eq!(eff.inventory_interval, "1h");
        assert!(warns.is_empty());
    }

    #[test]
    fn pc_overrides_group() {
        let groups = group_rows(&[("wave1", inventory_every("12h"))]);
        let pc = inventory_every("5m");
        let (eff, _) = resolve(None, &groups, Some(&pc), &member_of(&["wave1"]));
        assert_eq!(eff.inventory_interval, "5m");
    }

    #[test]
    fn pc_overrides_global_when_no_group_match() {
        let global = inventory_every("24h");
        let pc = inventory_every("30m");
        let (eff, _) = resolve(Some(&global), &BTreeMap::new(), Some(&pc), &[]);
        assert_eq!(eff.inventory_interval, "30m");
    }

    #[test]
    fn partial_override_only_changes_named_fields() {
        let global = ConfigScope {
            inventory_interval: Some("24h".into()),
            heartbeat_interval: Some("30s".into()),
            ..scope()
        };
        let pc = ConfigScope {
            heartbeat_interval: Some("15s".into()),
            // intentionally not touching inventory_interval
            ..scope()
        };
        let (eff, _) = resolve(Some(&global), &BTreeMap::new(), Some(&pc), &[]);
        assert_eq!(eff.inventory_interval, "24h"); // from global
        assert_eq!(eff.heartbeat_interval, "15s"); // from pc
    }

    #[test]
    fn multi_group_conflict_emits_warning() {
        let groups = group_rows(&[
            ("wave1", inventory_every("12h")),
            ("dept-eng", inventory_every("24h")),
        ]);
        let (eff, warns) = resolve(None, &groups, None, &member_of(&["wave1", "dept-eng"]));
        // "dept-eng" sorts before "wave1", so wave1 wins (last alphabetical).
        assert_eq!(eff.inventory_interval, "12h");
        assert_eq!(
            warns,
            vec![ResolutionWarning::MultiGroupConflict {
                field: "inventory_interval",
                groups: member_of(&["dept-eng", "wave1"]),
            }]
        );
    }

    #[test]
    fn multi_group_same_value_still_warns() {
        // The resolver counts setters, it does not compare values:
        // two groups agreeing on a value is still reported.
        let groups = group_rows(&[
            ("wave1", inventory_every("12h")),
            ("dept-eng", inventory_every("12h")),
        ]);
        let (eff, warns) = resolve(None, &groups, None, &member_of(&["dept-eng", "wave1"]));
        assert_eq!(eff.inventory_interval, "12h");
        assert_eq!(
            warns,
            vec![ResolutionWarning::MultiGroupConflict {
                field: "inventory_interval",
                groups: member_of(&["dept-eng", "wave1"]),
            }]
        );
    }

    #[test]
    fn conflict_warnings_are_ordered_by_field_name() {
        let both = ConfigScope {
            target_version: Some("0.3.0".into()),
            heartbeat_interval: Some("15s".into()),
            ..scope()
        };
        let groups = group_rows(&[("a", both.clone()), ("b", both)]);
        let (_, warns) = resolve(None, &groups, None, &member_of(&["b", "a"]));
        let fields: Vec<&str> = warns
            .iter()
            .map(|w| match w {
                ResolutionWarning::MultiGroupConflict { field, .. } => *field,
            })
            .collect();
        assert_eq!(fields, vec!["heartbeat_interval", "target_version"]);
    }

    #[test]
    fn group_alphabetical_last_wins_no_conflict_when_only_one_sets() {
        let groups = group_rows(&[
            ("wave1", inventory_every("12h")),
            (
                "dept-eng",
                ConfigScope {
                    // Different field — doesn't conflict.
                    heartbeat_interval: Some("15s".into()),
                    ..scope()
                },
            ),
        ]);
        let (eff, warns) = resolve(None, &groups, None, &member_of(&["wave1", "dept-eng"]));
        assert_eq!(eff.inventory_interval, "12h");
        assert_eq!(eff.heartbeat_interval, "15s");
        assert!(warns.is_empty());
    }

    #[test]
    fn unknown_group_is_silently_ignored() {
        // my_groups names a group that has no scope row yet. Common
        // on the first agent that joins a freshly-named group; the
        // resolver should treat it as a no-op, not an error.
        let groups = group_rows(&[("canary", inventory_every("1h"))]);
        let (eff, warns) = resolve(
            None,
            &groups,
            None,
            &member_of(&["canary", "ghost-group"]),
        );
        assert_eq!(eff.inventory_interval, "1h");
        assert!(warns.is_empty());
    }

    #[test]
    fn group_scope_not_applied_when_pc_not_in_group() {
        let groups = group_rows(&[(
            "canary",
            ConfigScope {
                target_version: Some("0.3.0".into()),
                ..scope()
            },
        )]);
        let (eff, _) = resolve(None, &groups, None, &member_of(&["dept-eng"]));
        // PC is NOT in canary, so the rollout target shouldn't apply.
        assert!(eff.target_version.is_none());
    }

    #[test]
    fn duplicate_group_names_dedup_silently() {
        let groups = group_rows(&[("wave1", inventory_every("12h"))]);
        // my_groups carries the same name twice — the dedup pass
        // keeps it from looking like a conflict-with-self.
        let (eff, warns) = resolve(None, &groups, None, &member_of(&["wave1", "wave1"]));
        assert_eq!(eff.inventory_interval, "12h");
        assert!(warns.is_empty());
    }

    #[test]
    fn config_scope_serde_round_trip() {
        let s = ConfigScope {
            target_version: Some("0.3.0".into()),
            heartbeat_interval: Some("15s".into()),
            ..scope()
        };
        let json = serde_json::to_string(&s).unwrap();
        // Only set fields appear in JSON.
        assert_eq!(
            json,
            r#"{"target_version":"0.3.0","heartbeat_interval":"15s"}"#
        );
        let back: ConfigScope = serde_json::from_str(&json).unwrap();
        assert_eq!(back, s);
    }

    #[test]
    fn empty_config_scope_round_trips_as_empty_json() {
        let s = ConfigScope::default();
        assert!(s.is_empty());
        let json = serde_json::to_string(&s).unwrap();
        assert_eq!(json, "{}");
        let back: ConfigScope = serde_json::from_str(&json).unwrap();
        assert_eq!(back, s);
    }

    #[test]
    fn explicit_null_deserializes_as_unset() {
        // A `null` written by hand (or by another client) must behave
        // exactly like an absent field.
        let s: ConfigScope = serde_json::from_str(r#"{"target_version":null}"#).unwrap();
        assert!(s.is_empty());
    }

    #[test]
    fn deserialize_tolerates_unknown_fields_for_forward_compat() {
        // Sprint 6+ may add fields (log_level, jitter strategy, …);
        // older agent / backend builds should keep parsing.
        let json = r#"{"target_version":"0.3.0","future_knob":"future_value"}"#;
        let s: ConfigScope = serde_json::from_str(json).unwrap();
        assert_eq!(s.target_version.as_deref(), Some("0.3.0"));
    }

    #[test]
    fn pc_does_not_override_other_pcs() {
        // Sanity: pc_scope passed in is by definition the row for THIS
        // pc; the caller is responsible for picking the right one.
        // This test guards against a future refactor that accidentally
        // wires in the wrong scope by ensuring the apply happens last
        // (after global and groups), so the PC value is the visible
        // one — and that a PC without its own row falls back to the
        // group value rather than picking up somebody else's.
        let global = inventory_every("24h");
        let groups = group_rows(&[("wave1", inventory_every("12h"))]);
        let pc = inventory_every("5m");
        let membership = member_of(&["wave1"]);

        let (eff, warns) = resolve(Some(&global), &groups, Some(&pc), &membership);
        assert_eq!(eff.inventory_interval, "5m");
        assert!(warns.is_empty());

        let (eff, _) = resolve(Some(&global), &groups, None, &membership);
        assert_eq!(eff.inventory_interval, "12h");
    }

    #[test]
    fn duration_helpers_parse_and_fall_back_on_malformed_strings() {
        // Well-formed strings are parsed as written.
        let tuned = EffectiveConfig {
            target_version_jitter: "30m".into(),
            inventory_interval: "12h".into(),
            inventory_jitter: "5m".into(),
            heartbeat_interval: "15s".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(
            tuned.target_version_jitter_duration(),
            Duration::from_secs(30 * 60)
        );
        assert_eq!(
            tuned.inventory_interval_duration(),
            Duration::from_secs(12 * 60 * 60)
        );
        assert_eq!(tuned.inventory_jitter_duration(), Duration::from_secs(5 * 60));
        assert_eq!(tuned.heartbeat_duration(), Duration::from_secs(15));

        // Malformed strings fall back to the built-in values.
        let broken = EffectiveConfig {
            target_version_jitter: "soon".into(),
            inventory_interval: "daily".into(),
            inventory_jitter: "".into(),
            heartbeat_interval: "fast".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(broken.target_version_jitter_duration(), Duration::ZERO);
        assert_eq!(
            broken.inventory_interval_duration(),
            Duration::from_secs(24 * 60 * 60)
        );
        assert_eq!(broken.inventory_jitter_duration(), Duration::from_secs(600));
        assert_eq!(broken.heartbeat_duration(), Duration::from_secs(30));
    }
}