Skip to main content

kanade_shared/wire/
agent_config.rs

1//! Layered fleet configuration that lives in the `agent_config` KV
2//! bucket (Sprint 6).
3//!
4//! Three scopes flow into the agent's effective config, in order of
5//! increasing specificity:
6//!
7//! ```text
8//! built-in default        (compiled in; floor when nothing else is set)
9//!   ↓
10//! agent_config:global     (whole-fleet default)
11//!   ↓
12//! agent_config:groups.<g> (per-group override; one or more apply)
13//!   ↓
14//! agent_config:pcs.<pc>   (per-PC override; final word)
15//! ```
16//!
//! The wire type for every scope is the same — [`ConfigScope`], a
//! struct of `Option<T>` fields. `Some` means "this scope sets this
//! field"; `None` means "fall through to the next layer". A JSON
//! `null` and an absent key both deserialize to `None`: `null` via
//! serde's `Option` handling, absence via the struct-level `default`.
22//!
//! [`resolve`] is the pure functional core that flattens the scope
//! stack into an [`EffectiveConfig`] (concrete values; only
//! `target_version` stays an `Option`). When the same field is set
//! by more than one group the PC belongs to, groups are applied in
//! alphabetical order, so the alphabetically last group wins
//! (CSS-cascade style), and a
//! [`ResolutionWarning::MultiGroupConflict`] is emitted so the
//! caller can log it — pre-empts the "why does this PC have value X?
//! none of my groups say X" debugging session.
30//!
31//! v0.20.0: `inventory_interval` / `inventory_jitter` /
32//! `inventory_enabled` removed. They were leftovers from the
33//! v0.14-retired hardcoded WMI inventory loop; runtime inventory
34//! now lives in operator-defined probe jobs (`configs/jobs/
35//! inventory-*.yaml`), so the layered config no longer carries
36//! anything about it.
37
38use std::collections::BTreeMap;
39use std::time::Duration;
40
41use serde::{Deserialize, Serialize};
42
/// Per-scope partial config. Every field is `Option<T>`: `Some` =
/// set, `None` = inherit from the next-less-specific scope. Serde
/// `default` + `skip_serializing_if` keeps the wire JSON tight —
/// unset fields don't appear in the bucket value.
///
/// Unknown JSON keys are ignored on deserialize (there is no
/// `deny_unknown_fields`), and keys serialize in field-declaration
/// order — reordering the fields changes the wire bytes.
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)]
#[serde(default)]
pub struct ConfigScope {
    /// Agent version this scope asks for. `None` = this scope names
    /// no rollout target.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_version: Option<String>,
    /// Random sleep window applied at each agent before it starts
    /// downloading a new target_version, so a fleet-wide rollout
    /// doesn't slam the Object Store / broker all at once
    /// (humantime, e.g. `"30m"`). `"0s"` (or unset) = no jitter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_version_jitter: Option<String>,
    /// Heartbeat cadence as a humantime string (e.g. `"30s"`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub heartbeat_interval: Option<String>,
    /// Cadence for the whole-host perf snapshot loop (`host_perf.<pc_id>`).
    /// Separate from `heartbeat_interval` because the host-wide
    /// sysinfo refresh is slightly heavier than the per-process self-
    /// perf one (memory + disk + network counters in addition to CPU)
    /// and gappier data is acceptable for graphing. Default 60 s.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub host_perf_interval: Option<String>,
}
68
69impl ConfigScope {
70    pub fn is_empty(&self) -> bool {
71        self.target_version.is_none()
72            && self.target_version_jitter.is_none()
73            && self.heartbeat_interval.is_none()
74            && self.host_perf_interval.is_none()
75    }
76}
77
/// Concrete config the agent runs against once the scope stack has
/// been flattened. `target_version` stays `Option` because "no
/// rollout target set anywhere" is a meaningful state (the agent
/// just keeps running the version it has); the other fields always
/// have a value, falling back to [`EffectiveConfig::builtin_defaults`]
/// when no scope sets them.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct EffectiveConfig {
    /// Rollout target, or `None` when no scope sets one.
    pub target_version: Option<String>,
    /// Rollout jitter window as a humantime string; parsed form via
    /// [`EffectiveConfig::target_version_jitter_duration`].
    pub target_version_jitter: String,
    /// Heartbeat cadence as a humantime string; parsed form via
    /// [`EffectiveConfig::heartbeat_duration`].
    pub heartbeat_interval: String,
    /// Host perf snapshot cadence as a humantime string; parsed form
    /// via [`EffectiveConfig::host_perf_duration`].
    pub host_perf_interval: String,
}
91
92impl EffectiveConfig {
93    /// Floor values used when no KV scope sets a given field.
94    pub fn builtin_defaults() -> Self {
95        Self {
96            target_version: None,
97            // 0s = "no jitter" = pre-Sprint-11 behaviour. Operators
98            // running ≥ 100-host fleets are expected to bump this
99            // (via `kanade agent rollout … --jitter 30m` or
100            // `kanade config set target_version_jitter=30m`) so the
101            // Object Store fan-out doesn't synchronise. See issue
102            // #26 for the broader "safe-by-default" debate.
103            target_version_jitter: "0s".to_string(),
104            heartbeat_interval: "30s".to_string(),
105            // 60 s default: 2× the heartbeat cadence so the chart has
106            // a roughly aligned point every other heartbeat, while
107            // keeping the host-wide sysinfo refresh (which on Citrix /
108            // RDS hosts is the heaviest call we make) out of the
109            // tight 30 s loop.
110            host_perf_interval: "60s".to_string(),
111        }
112    }
113
114    /// Parsed `heartbeat_interval`, falling back to the built-in
115    /// 30 s default on a malformed string. Logging the parse error
116    /// is the caller's job (so that test code can stay quiet).
117    pub fn heartbeat_duration(&self) -> Duration {
118        humantime::parse_duration(&self.heartbeat_interval).unwrap_or(Duration::from_secs(30))
119    }
120
121    /// Parsed `host_perf_interval`, falling back to the built-in
122    /// 60 s default on a malformed string.
123    pub fn host_perf_duration(&self) -> Duration {
124        humantime::parse_duration(&self.host_perf_interval).unwrap_or(Duration::from_secs(60))
125    }
126
127    /// Parsed `target_version_jitter`, falling back to zero (= no
128    /// jitter) on a malformed string. Zero means "start downloading
129    /// immediately when target_version drifts" — fine for small
130    /// fleets / canary smoke tests, bad for 3000 hosts.
131    pub fn target_version_jitter_duration(&self) -> Duration {
132        humantime::parse_duration(&self.target_version_jitter).unwrap_or(Duration::ZERO)
133    }
134}
135
136impl Default for EffectiveConfig {
137    fn default() -> Self {
138        Self::builtin_defaults()
139    }
140}
141
/// Non-fatal observations from [`resolve`] that the caller should
/// log. Currently only "two or more of this PC's groups set the
/// same field" — useful pre-emptive debugging signal when
/// canary / wave / dept overlays accidentally overlap.
///
/// [`resolve`] decides this purely on which groups *set* the field;
/// the values are not compared, so groups that happen to agree on
/// the value are reported as well.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ResolutionWarning {
    MultiGroupConflict {
        /// Name of the config field that more than one group sets.
        field: &'static str,
        /// Group names that set this field, in alphabetical order
        /// (i.e. the application order — the last name in this list
        /// is the one whose value actually won).
        groups: Vec<String>,
    },
}
156
157/// Flatten the scope stack into an [`EffectiveConfig`].
158///
159/// * `global` — the `global` key in the `agent_config` bucket
160///   (`None` if no row yet).
161/// * `group_scopes` — every `groups.<name>` row currently in the
162///   bucket (the caller can pass all of them; only the ones whose
163///   name is in `my_groups` are applied).
164/// * `pc_scope` — the `pcs.<pc_id>` row for this agent (`None` if
165///   no row yet).
166/// * `my_groups` — this agent's current memberships (from the
167///   `agent_groups` bucket).
168///
169/// Order of application: built-in default → global → per-group
170/// (alphabetical, last wins) → per-pc. Multi-group conflicts (≥ 2
171/// of `my_groups` setting the same field) are returned as warnings
172/// alongside the resolved config.
173pub fn resolve(
174    global: Option<&ConfigScope>,
175    group_scopes: &BTreeMap<String, ConfigScope>,
176    pc_scope: Option<&ConfigScope>,
177    my_groups: &[String],
178) -> (EffectiveConfig, Vec<ResolutionWarning>) {
179    let mut out = EffectiveConfig::builtin_defaults();
180    let mut warnings = Vec::new();
181
182    if let Some(g) = global {
183        apply_scope(&mut out, g);
184    }
185
186    // Sort + dedup the group list so iteration order is deterministic
187    // and "last wins" is well-defined.
188    let mut sorted_groups: Vec<&str> = my_groups.iter().map(String::as_str).collect();
189    sorted_groups.sort();
190    sorted_groups.dedup();
191
192    // Pass 1: find multi-setter fields so the caller can warn before
193    // pass 2 silently lets the alphabetical-last value win.
194    let mut setters: BTreeMap<&'static str, Vec<String>> = BTreeMap::new();
195    for g in &sorted_groups {
196        let Some(scope) = group_scopes.get(*g) else {
197            continue;
198        };
199        if scope.target_version.is_some() {
200            setters
201                .entry("target_version")
202                .or_default()
203                .push(g.to_string());
204        }
205        if scope.target_version_jitter.is_some() {
206            setters
207                .entry("target_version_jitter")
208                .or_default()
209                .push(g.to_string());
210        }
211        if scope.heartbeat_interval.is_some() {
212            setters
213                .entry("heartbeat_interval")
214                .or_default()
215                .push(g.to_string());
216        }
217        if scope.host_perf_interval.is_some() {
218            setters
219                .entry("host_perf_interval")
220                .or_default()
221                .push(g.to_string());
222        }
223    }
224    for (field, groups) in setters {
225        if groups.len() > 1 {
226            warnings.push(ResolutionWarning::MultiGroupConflict { field, groups });
227        }
228    }
229
230    // Pass 2: actually apply, alphabetically. Last-wins by construction.
231    for g in &sorted_groups {
232        if let Some(scope) = group_scopes.get(*g) {
233            apply_scope(&mut out, scope);
234        }
235    }
236
237    if let Some(p) = pc_scope {
238        apply_scope(&mut out, p);
239    }
240
241    (out, warnings)
242}
243
244fn apply_scope(out: &mut EffectiveConfig, s: &ConfigScope) {
245    if let Some(v) = &s.target_version {
246        out.target_version = Some(v.clone());
247    }
248    if let Some(v) = &s.target_version_jitter {
249        out.target_version_jitter = v.clone();
250    }
251    if let Some(v) = &s.heartbeat_interval {
252        out.heartbeat_interval = v.clone();
253    }
254    if let Some(v) = &s.host_perf_interval {
255        out.host_perf_interval = v.clone();
256    }
257}
258
#[cfg(test)]
mod tests {
    //! Unit tests for the scope cascade ([`resolve`]), the
    //! [`ConfigScope`] wire format, and the [`EffectiveConfig`]
    //! duration helpers.

    use super::*;

    /// Empty scope used as the `..` base in struct-update syntax.
    fn scope() -> ConfigScope {
        ConfigScope::default()
    }

    #[test]
    fn empty_stack_gives_builtin_defaults() {
        let (eff, warns) = resolve(None, &BTreeMap::new(), None, &[]);
        assert_eq!(eff, EffectiveConfig::builtin_defaults());
        assert!(warns.is_empty());
    }

    #[test]
    fn global_only() {
        let g = ConfigScope {
            heartbeat_interval: Some("60s".into()),
            ..scope()
        };
        let (eff, _) = resolve(Some(&g), &BTreeMap::new(), None, &[]);
        assert_eq!(eff.heartbeat_interval, "60s");
        // Unset fields stay at builtin defaults.
        assert_eq!(eff.target_version_jitter, "0s");
        assert!(eff.target_version.is_none());
    }

    #[test]
    fn group_overrides_global() {
        let global = ConfigScope {
            heartbeat_interval: Some("30s".into()),
            ..scope()
        };
        let mut groups = BTreeMap::new();
        groups.insert(
            "canary".into(),
            ConfigScope {
                heartbeat_interval: Some("5s".into()),
                ..scope()
            },
        );
        let (eff, warns) = resolve(Some(&global), &groups, None, &["canary".into()]);
        assert_eq!(eff.heartbeat_interval, "5s");
        assert!(warns.is_empty());
    }

    #[test]
    fn pc_overrides_group() {
        let mut groups = BTreeMap::new();
        groups.insert(
            "wave1".into(),
            ConfigScope {
                heartbeat_interval: Some("30s".into()),
                ..scope()
            },
        );
        let pc = ConfigScope {
            heartbeat_interval: Some("5s".into()),
            ..scope()
        };
        let (eff, _) = resolve(None, &groups, Some(&pc), &["wave1".into()]);
        assert_eq!(eff.heartbeat_interval, "5s");
    }

    #[test]
    fn pc_overrides_global_when_no_group_match() {
        let global = ConfigScope {
            heartbeat_interval: Some("30s".into()),
            ..scope()
        };
        let pc = ConfigScope {
            heartbeat_interval: Some("5s".into()),
            ..scope()
        };
        let (eff, _) = resolve(Some(&global), &BTreeMap::new(), Some(&pc), &[]);
        assert_eq!(eff.heartbeat_interval, "5s");
    }

    #[test]
    fn partial_override_only_changes_named_fields() {
        let global = ConfigScope {
            target_version_jitter: Some("30m".into()),
            heartbeat_interval: Some("30s".into()),
            ..scope()
        };
        let pc = ConfigScope {
            heartbeat_interval: Some("15s".into()),
            // intentionally not touching target_version_jitter
            ..scope()
        };
        let (eff, _) = resolve(Some(&global), &BTreeMap::new(), Some(&pc), &[]);
        assert_eq!(eff.target_version_jitter, "30m"); // from global
        assert_eq!(eff.heartbeat_interval, "15s"); // from pc
    }

    #[test]
    fn multi_group_conflict_emits_warning() {
        let mut groups = BTreeMap::new();
        groups.insert(
            "wave1".into(),
            ConfigScope {
                heartbeat_interval: Some("5s".into()),
                ..scope()
            },
        );
        groups.insert(
            "dept-eng".into(),
            ConfigScope {
                heartbeat_interval: Some("60s".into()),
                ..scope()
            },
        );
        let (eff, warns) = resolve(None, &groups, None, &["wave1".into(), "dept-eng".into()]);
        // "dept-eng" sorts before "wave1", so wave1 wins (last alphabetical).
        assert_eq!(eff.heartbeat_interval, "5s");
        assert_eq!(warns.len(), 1);
        match &warns[0] {
            ResolutionWarning::MultiGroupConflict { field, groups } => {
                assert_eq!(*field, "heartbeat_interval");
                assert_eq!(groups, &vec!["dept-eng".to_string(), "wave1".to_string()]);
            }
        }
    }

    #[test]
    fn groups_agreeing_on_a_value_still_warn() {
        // The resolver looks at which groups *set* a field, not at
        // the values, so two groups that agree are reported too.
        let mut groups = BTreeMap::new();
        for name in ["dept-eng", "wave1"] {
            groups.insert(
                name.to_string(),
                ConfigScope {
                    heartbeat_interval: Some("5s".into()),
                    ..scope()
                },
            );
        }
        let (eff, warns) = resolve(None, &groups, None, &["dept-eng".into(), "wave1".into()]);
        assert_eq!(eff.heartbeat_interval, "5s");
        assert_eq!(
            warns,
            vec![ResolutionWarning::MultiGroupConflict {
                field: "heartbeat_interval",
                groups: vec!["dept-eng".to_string(), "wave1".to_string()],
            }]
        );
    }

    #[test]
    fn conflicts_on_several_fields_warn_once_per_field() {
        let mut groups = BTreeMap::new();
        groups.insert(
            "dept-eng".into(),
            ConfigScope {
                target_version: Some("0.3.0".into()),
                heartbeat_interval: Some("60s".into()),
                ..scope()
            },
        );
        groups.insert(
            "wave1".into(),
            ConfigScope {
                target_version: Some("0.4.0".into()),
                heartbeat_interval: Some("5s".into()),
                ..scope()
            },
        );
        let (eff, warns) = resolve(None, &groups, None, &["wave1".into(), "dept-eng".into()]);
        // wave1 is alphabetically last, so it wins on both fields.
        assert_eq!(eff.target_version.as_deref(), Some("0.4.0"));
        assert_eq!(eff.heartbeat_interval, "5s");
        // One warning per conflicting field, in field-name order.
        let fields: Vec<&str> = warns
            .iter()
            .map(|w| match w {
                ResolutionWarning::MultiGroupConflict { field, .. } => *field,
            })
            .collect();
        assert_eq!(fields, ["heartbeat_interval", "target_version"]);
    }

    #[test]
    fn group_alphabetical_last_wins_no_conflict_when_only_one_sets() {
        let mut groups = BTreeMap::new();
        groups.insert(
            "wave1".into(),
            ConfigScope {
                heartbeat_interval: Some("5s".into()),
                ..scope()
            },
        );
        groups.insert(
            "dept-eng".into(),
            ConfigScope {
                // Different field — doesn't conflict.
                target_version_jitter: Some("15m".into()),
                ..scope()
            },
        );
        let (eff, warns) = resolve(None, &groups, None, &["wave1".into(), "dept-eng".into()]);
        assert_eq!(eff.heartbeat_interval, "5s");
        assert_eq!(eff.target_version_jitter, "15m");
        assert!(warns.is_empty());
    }

    #[test]
    fn unknown_group_is_silently_ignored() {
        // my_groups names a group that has no scope row yet. Common
        // on the first agent that joins a freshly-named group; the
        // resolver should treat it as a no-op, not an error.
        let mut groups = BTreeMap::new();
        groups.insert(
            "canary".into(),
            ConfigScope {
                heartbeat_interval: Some("5s".into()),
                ..scope()
            },
        );
        let (eff, warns) = resolve(
            None,
            &groups,
            None,
            &["canary".into(), "ghost-group".into()],
        );
        assert_eq!(eff.heartbeat_interval, "5s");
        assert!(warns.is_empty());
    }

    #[test]
    fn group_scope_not_applied_when_pc_not_in_group() {
        let mut groups = BTreeMap::new();
        groups.insert(
            "canary".into(),
            ConfigScope {
                target_version: Some("0.3.0".into()),
                ..scope()
            },
        );
        let (eff, _) = resolve(None, &groups, None, &["dept-eng".into()]);
        // PC is NOT in canary, so the rollout target shouldn't apply.
        assert!(eff.target_version.is_none());
    }

    #[test]
    fn duplicate_group_names_dedup_silently() {
        let mut groups = BTreeMap::new();
        groups.insert(
            "wave1".into(),
            ConfigScope {
                heartbeat_interval: Some("5s".into()),
                ..scope()
            },
        );
        // my_groups carries the same name twice — the dedup pass
        // keeps it from looking like a conflict-with-self.
        let (eff, warns) = resolve(None, &groups, None, &["wave1".into(), "wave1".into()]);
        assert_eq!(eff.heartbeat_interval, "5s");
        assert!(warns.is_empty());
    }

    #[test]
    fn config_scope_serde_round_trip() {
        let s = ConfigScope {
            target_version: Some("0.3.0".into()),
            heartbeat_interval: Some("15s".into()),
            ..scope()
        };
        let json = serde_json::to_string(&s).unwrap();
        // Only set fields appear in JSON.
        assert_eq!(
            json,
            r#"{"target_version":"0.3.0","heartbeat_interval":"15s"}"#
        );
        let back: ConfigScope = serde_json::from_str(&json).unwrap();
        assert_eq!(back, s);
    }

    #[test]
    fn empty_config_scope_round_trips_as_empty_json() {
        let s = ConfigScope::default();
        assert!(s.is_empty());
        let json = serde_json::to_string(&s).unwrap();
        assert_eq!(json, "{}");
        let back: ConfigScope = serde_json::from_str(&json).unwrap();
        assert_eq!(back, s);
    }

    #[test]
    fn json_null_reads_as_unset() {
        // An explicit `null` must behave exactly like an absent key.
        let json = r#"{"target_version":null,"heartbeat_interval":null}"#;
        let s: ConfigScope = serde_json::from_str(json).unwrap();
        assert!(s.is_empty());
    }

    #[test]
    fn deserialize_tolerates_unknown_fields_for_forward_compat() {
        // Older agent / backend builds should keep parsing in case
        // we add fields later. v0.20 also relies on this so pre-v0.20
        // rows that still have inventory_interval / inventory_jitter
        // / inventory_enabled in the bucket value parse OK as the
        // new (smaller) ConfigScope — the dropped fields just
        // dissolve into "unknown, ignored".
        let json =
            r#"{"target_version":"0.3.0","inventory_interval":"24h","future_knob":"future_value"}"#;
        let s: ConfigScope = serde_json::from_str(json).unwrap();
        assert_eq!(s.target_version.as_deref(), Some("0.3.0"));
    }

    #[test]
    fn pc_does_not_override_other_pcs() {
        // Sanity: pc_scope passed in is by definition the row for THIS
        // pc; the caller is responsible for picking the right one.
        // This test guards against a future refactor that accidentally
        // wires in the wrong scope by ensuring the apply happens last
        // (after global and groups), so the PC value is the visible
        // one — while fields the PC row leaves unset still come from
        // the less specific layers.
        let global = ConfigScope {
            heartbeat_interval: Some("60s".into()),
            target_version_jitter: Some("30m".into()),
            ..scope()
        };
        let mut groups = BTreeMap::new();
        groups.insert(
            "wave1".into(),
            ConfigScope {
                heartbeat_interval: Some("30s".into()),
                host_perf_interval: Some("120s".into()),
                ..scope()
            },
        );
        let pc = ConfigScope {
            heartbeat_interval: Some("5s".into()),
            ..scope()
        };
        let (eff, warns) = resolve(Some(&global), &groups, Some(&pc), &["wave1".into()]);
        assert_eq!(eff.heartbeat_interval, "5s"); // from pc
        assert_eq!(eff.host_perf_interval, "120s"); // from group
        assert_eq!(eff.target_version_jitter, "30m"); // from global
        assert!(eff.target_version.is_none()); // set nowhere
        assert!(warns.is_empty());
    }

    #[test]
    fn builtin_default_strings_parse_to_their_durations() {
        // Guards the string defaults against drifting away from the
        // `Duration` fallbacks used by the parse helpers.
        let eff = EffectiveConfig::builtin_defaults();
        assert_eq!(eff.heartbeat_duration(), Duration::from_secs(30));
        assert_eq!(eff.host_perf_duration(), Duration::from_secs(60));
        assert_eq!(eff.target_version_jitter_duration(), Duration::ZERO);
    }

    #[test]
    fn duration_helpers_parse_valid_and_fall_back_on_malformed() {
        let good = EffectiveConfig {
            target_version_jitter: "30m".into(),
            heartbeat_interval: "45s".into(),
            host_perf_interval: "2m".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(good.heartbeat_duration(), Duration::from_secs(45));
        assert_eq!(good.host_perf_duration(), Duration::from_secs(120));
        assert_eq!(
            good.target_version_jitter_duration(),
            Duration::from_secs(30 * 60)
        );

        let bad = EffectiveConfig {
            target_version_jitter: "later".into(),
            heartbeat_interval: "soon".into(),
            host_perf_interval: "often".into(),
            ..EffectiveConfig::builtin_defaults()
        };
        assert_eq!(bad.heartbeat_duration(), Duration::from_secs(30));
        assert_eq!(bad.host_perf_duration(), Duration::from_secs(60));
        assert_eq!(bad.target_version_jitter_duration(), Duration::ZERO);
    }
}