Skip to main content

bee_tui/
alerts.rs

1//! Webhook health-gate alerts. When a gate transitions to `Fail`
2//! (or back to `Pass` after being broken), bee-tui POSTs a small
3//! JSON object to the operator-configured webhook URL. Slack and
4//! Discord-compatible — both accept the same `{"text": "..."}` shape
5//! on their incoming-webhook URLs.
6//!
7//! ## Why
8//!
9//! Operators don't want to leave bee-tui open on a second monitor
10//! to catch a gate failure overnight. A single Slack ping when
11//! something flips red is the lowest-effort handoff to mobile / DMs.
12//!
13//! ## Design constraints
14//!
15//! * **Opt-in only.** `[alerts].webhook_url` defaults to absent.
16//!   No surprise outbound traffic from a fresh install.
17//! * **Debounced per-gate.** Each gate has its own 5-minute cool-down
18//!   so a flapping `Reachability` doesn't pin the operator's Slack.
19//! * **Read-only on Bee.** This module makes outbound HTTP only.
20//!   No chain interaction, no Bee-API write.
21
22use std::collections::HashMap;
23use std::time::{Duration, SystemTime};
24
25use crate::components::health::{Gate, GateStatus};
26
/// Default per-gate debounce window, in seconds (five minutes).
///
/// Once an alert has fired for a gate, no further alert for that same
/// gate is emitted until this many seconds have elapsed — even if the
/// gate changed status again in between.
pub const DEFAULT_DEBOUNCE_SECS: u64 = 5 * 60;
31
/// One transition the state-comparator detected. Each `Alert` becomes
/// one webhook POST.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Alert {
    /// Label of the gate whose status changed.
    pub gate: String,
    /// Status previously recorded for the gate (`Unknown` when the gate
    /// had not been seen before).
    pub from: GateStatus,
    /// Status the gate reported in the observation that produced this
    /// alert.
    pub to: GateStatus,
    /// Gate's current `value` line (the one rendered next to the
    /// status glyph in the cockpit) — included in the message body
    /// so operators don't need to open bee-tui to see what the
    /// gate said.
    pub value: String,
    /// `why` continuation if the gate had one — adds tribal-knowledge
    /// context (e.g. "wait for the next 30-min reserve worker tick").
    pub why: Option<String>,
}
48
49impl Alert {
50    /// Slack/Discord-compatible JSON body. Both accept `{"text": ...}`
51    /// on their incoming-webhook URLs.
52    pub fn json_body(&self) -> serde_json::Value {
53        serde_json::json!({
54            "text": self.message_line(),
55        })
56    }
57
58    /// One-line operator-facing message used in the webhook body.
59    pub fn message_line(&self) -> String {
60        let arrow = match (self.from, self.to) {
61            (_, GateStatus::Fail) => "๐Ÿ”ด FAILED",
62            (_, GateStatus::Warn) => "๐ŸŸก WARN",
63            (_, GateStatus::Pass) => "๐ŸŸข RECOVERED",
64            (_, GateStatus::Unknown) => "โšช UNKNOWN",
65        };
66        let mut s = format!(
67            "bee-tui: {} {} (was {:?}, now {:?}) โ€” {}",
68            arrow, self.gate, self.from, self.to, self.value,
69        );
70        if let Some(why) = &self.why {
71            s.push_str(" ยท ");
72            s.push_str(why);
73        }
74        s
75    }
76
77    /// True for transitions worth pinging on. Transitions to/from
78    /// `Unknown` are ignored โ€” that's "data not loaded yet" and
79    /// flapping during startup would spam.
80    pub fn is_worth_alerting(&self) -> bool {
81        if self.from == GateStatus::Unknown || self.to == GateStatus::Unknown {
82            return false;
83        }
84        self.from != self.to
85    }
86}
87
/// Mutable per-gate state the alerter keeps across ticks. Construct it
/// with [`AlertState::new`]; per the original notes, `App` calls
/// [`AlertState::diff_and_record`] once per Tick after computing the
/// latest gates.
///
/// The derived `Default` yields a zero-length debounce window (no
/// cool-down at all); use [`AlertState::new`] to set one.
#[derive(Debug, Default)]
pub struct AlertState {
    /// Last seen `GateStatus` per gate label.
    last_status: HashMap<String, GateStatus>,
    /// Last fired alert wall-clock time per gate label. Used for
    /// debouncing — we keep the entry across cool-downs.
    last_fired: HashMap<String, SystemTime>,
    /// Minimum time between two alerts for the same gate label.
    debounce: Duration,
}
100
101impl AlertState {
102    pub fn new(debounce_secs: u64) -> Self {
103        Self {
104            last_status: HashMap::new(),
105            last_fired: HashMap::new(),
106            debounce: Duration::from_secs(debounce_secs),
107        }
108    }
109
110    /// Compare `current` gates to the previously-recorded ones and
111    /// produce a list of [`Alert`]s for transitions that pass the
112    /// debounce filter. Mutates `self` to record the new state.
113    pub fn diff_and_record(&mut self, current: &[Gate]) -> Vec<Alert> {
114        self.diff_and_record_at(current, SystemTime::now())
115    }
116
117    /// Test seam โ€” same as [`Self::diff_and_record`] but uses the
118    /// supplied `now` instead of wall-clock time. Pure for the
119    /// debounce-window assertions to be deterministic.
120    pub fn diff_and_record_at(&mut self, current: &[Gate], now: SystemTime) -> Vec<Alert> {
121        let mut out = Vec::new();
122        for gate in current {
123            let prev = self
124                .last_status
125                .get(gate.label)
126                .copied()
127                .unwrap_or(GateStatus::Unknown);
128            self.last_status.insert(gate.label.to_string(), gate.status);
129            let alert = Alert {
130                gate: gate.label.to_string(),
131                from: prev,
132                to: gate.status,
133                value: gate.value.clone(),
134                why: gate.why.clone(),
135            };
136            if !alert.is_worth_alerting() {
137                continue;
138            }
139            // Debounce check.
140            if let Some(last) = self.last_fired.get(gate.label) {
141                if now.duration_since(*last).unwrap_or_default() < self.debounce {
142                    continue;
143                }
144            }
145            self.last_fired.insert(gate.label.to_string(), now);
146            out.push(alert);
147        }
148        out
149    }
150}
151
152/// Fire a single alert at `webhook_url`. Async; the caller spawns
153/// this on tokio. Returns `Ok(())` on 2xx; `Err(reason)` on every
154/// other outcome โ€” operator-facing log lines, not panics.
155pub async fn fire(webhook_url: &str, alert: &Alert) -> Result<(), String> {
156    let client = reqwest::Client::builder()
157        .timeout(Duration::from_secs(10))
158        .user_agent(concat!("bee-tui/", env!("CARGO_PKG_VERSION")))
159        .build()
160        .map_err(|e| format!("client build: {e}"))?;
161    let resp = client
162        .post(webhook_url)
163        .json(&alert.json_body())
164        .send()
165        .await
166        .map_err(|e| format!("POST {webhook_url}: {e}"))?;
167    if !resp.status().is_success() {
168        return Err(format!("webhook returned HTTP {}", resp.status()));
169    }
170    Ok(())
171}
172
/// Unit tests for transition detection, debouncing and message
/// formatting. No test performs network I/O.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Gate` with the given label, status and value line and
    /// no `why` text.
    fn gate(label: &'static str, status: GateStatus, value: &str) -> Gate {
        Gate {
            label,
            status,
            value: value.to_string(),
            why: None,
        }
    }

    #[test]
    fn first_observation_is_unknown_baseline_and_silent() {
        let mut s = AlertState::new(60);
        // Initial Pass on fresh state — prev = Unknown so we don't
        // alert (not a meaningful transition).
        let out = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        assert!(out.is_empty(), "fresh start should be silent: {out:?}");
    }

    #[test]
    fn pass_to_fail_fires_alert() {
        let mut s = AlertState::new(60);
        let _ = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        let now = SystemTime::now();
        let out = s.diff_and_record_at(&[gate("Health", GateStatus::Fail, "broken")], now);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].from, GateStatus::Pass);
        assert_eq!(out[0].to, GateStatus::Fail);
        assert!(out[0].message_line().contains("FAILED"));
    }

    #[test]
    fn fail_to_pass_fires_recovery() {
        let mut s = AlertState::new(60);
        let _ = s.diff_and_record(&[gate("Health", GateStatus::Fail, "broken")]);
        let out = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        assert_eq!(out.len(), 1);
        assert!(out[0].message_line().contains("RECOVERED"));
    }

    #[test]
    fn unchanged_status_is_silent() {
        let mut s = AlertState::new(60);
        let _ = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        let out = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        assert!(out.is_empty());
    }

    #[test]
    fn unknown_transitions_are_ignored() {
        let mut s = AlertState::new(60);
        // Pass → Unknown shouldn't fire (we lost data, not a real
        // failure).
        let _ = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        let out = s.diff_and_record(&[gate("Health", GateStatus::Unknown, "")]);
        assert!(out.is_empty());
        // Unknown → Pass shouldn't fire either (initial load).
        let mut s2 = AlertState::new(60);
        let _ = s2.diff_and_record(&[gate("Health", GateStatus::Unknown, "")]);
        let out = s2.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        assert!(out.is_empty());
    }

    #[test]
    fn debounce_suppresses_repeat_within_window() {
        let mut s = AlertState::new(60);
        let _ = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        let t0 = SystemTime::now();
        // First flap: fires (Pass → Fail with no earlier alert for this
        // gate) and starts the 60s cool-down at t0.
        let out = s.diff_and_record_at(&[gate("Health", GateStatus::Fail, "broken")], t0);
        assert_eq!(out.len(), 1);
        // Within window: re-flap is suppressed.
        let t1 = t0 + Duration::from_secs(30);
        let _ = s.diff_and_record_at(&[gate("Health", GateStatus::Pass, "ok")], t1);
        let t2 = t0 + Duration::from_secs(45);
        let out = s.diff_and_record_at(&[gate("Health", GateStatus::Fail, "broken again")], t2);
        assert!(
            out.is_empty(),
            "second fail within 60s should be debounced: {out:?}"
        );
    }

    #[test]
    fn debounce_releases_after_window() {
        let mut s = AlertState::new(60);
        let _ = s.diff_and_record(&[gate("Health", GateStatus::Pass, "ok")]);
        let t0 = SystemTime::now();
        let _ = s.diff_and_record_at(&[gate("Health", GateStatus::Fail, "x")], t0);
        // After 61s, a new transition fires again.
        let _ = s.diff_and_record_at(
            &[gate("Health", GateStatus::Pass, "ok")],
            t0 + Duration::from_secs(61),
        );
        let out = s.diff_and_record_at(
            &[gate("Health", GateStatus::Fail, "y")],
            t0 + Duration::from_secs(122),
        );
        assert_eq!(out.len(), 1);
    }

    #[test]
    fn json_body_uses_text_field() {
        let alert = Alert {
            gate: "Health".into(),
            from: GateStatus::Pass,
            to: GateStatus::Fail,
            value: "broken".into(),
            why: None,
        };
        let body = alert.json_body();
        assert!(body["text"].is_string(), "json: {body}");
        assert!(body["text"].as_str().unwrap().contains("FAILED"));
    }

    #[test]
    fn message_line_includes_why_when_present() {
        let alert = Alert {
            gate: "StorageRadius".into(),
            from: GateStatus::Pass,
            to: GateStatus::Warn,
            value: "below committed".into(),
            why: Some("decreases ONLY on the 30-min reserve worker tick".into()),
        };
        let s = alert.message_line();
        assert!(s.contains("30-min reserve worker tick"));
    }
}