Skip to main content

heldar_kernel/services/
health.rs

1//! Health monitor: downgrades cameras that claim to be recording but have stopped producing
2//! segments (a stalled-but-connected stream), emitting an event on the transition.
3
4use std::process::Stdio;
5use std::sync::atomic::{AtomicBool, Ordering};
6use std::sync::Arc;
7use std::time::{Duration, Instant};
8
9use chrono::{DateTime, Utc};
10use serde_json::json;
11use sqlx::SqlitePool;
12use tokio::process::Command;
13
14use crate::config::Config;
15use crate::repo;
16
/// Name of the SMART CLI executable (`smartctl`, from smartmontools).
///
/// It is handed to `Command::new` as a bare name, so it is looked up on PATH. When it cannot be
/// run, `check_disk_health` logs a single warning and skips the SMART checks.
const SMARTCTL_BIN: &str = "smartctl";
19
20pub async fn run(pool: SqlitePool, cfg: Arc<Config>) {
21    let mut tick = tokio::time::interval(Duration::from_secs(cfg.health_interval_s.max(5)));
22    // Disk-health (SMART/RAID) runs on its own slower cadence inside this same loop, so a busy
23    // appliance is not probed for drive health every few seconds.
24    let smart_interval = Duration::from_secs(cfg.smart_check_interval_s.max(30));
25    let mut last_disk_check: Option<Instant> = None;
26    loop {
27        tick.tick().await;
28        if let Err(e) = check_once(&pool).await {
29            tracing::error!(error = %e, "health: check failed");
30        }
31        let due = last_disk_check
32            .map(|t| t.elapsed() >= smart_interval)
33            .unwrap_or(true);
34        if due {
35            last_disk_check = Some(Instant::now());
36            check_disk_health(&pool, &cfg).await;
37        }
38    }
39}
40
/// Row shape produced by the query in `check_once`, in column order:
/// `(camera_id, last_segment_at, last_started_at, segment_seconds)`.
///
/// Both timestamps are `Option`s; `check_once` treats a missing timestamp as "not recent".
type StaleRow = (String, Option<DateTime<Utc>>, Option<DateTime<Utc>>, i64);
43
44async fn check_once(pool: &SqlitePool) -> anyhow::Result<()> {
45    let rows: Vec<StaleRow> = sqlx::query_as(
46        "SELECT cs.camera_id, cs.last_segment_at, cs.last_started_at, c.segment_seconds
47         FROM camera_status cs
48         JOIN cameras c ON c.id = cs.camera_id
49         WHERE cs.state = 'recording'",
50    )
51    .fetch_all(pool)
52    .await?;
53
54    let now = Utc::now();
55    for (camera_id, last_seg, last_start, seg_s) in rows {
56        let threshold = (seg_s.max(10) * 3).max(30);
57        let seg_age = last_seg.map(|t| (now - t).num_seconds());
58        let start_age = last_start.map(|t| (now - t).num_seconds());
59
60        let recent_segment = seg_age.map(|a| a <= threshold).unwrap_or(false);
61        let recently_started = start_age.map(|a| a <= threshold).unwrap_or(false);
62        if recent_segment || recently_started {
63            continue;
64        }
65
66        let msg = format!("no segments for >{threshold}s while recording");
67        let _ = repo::set_state(pool, &camera_id, "error", Some(&msg)).await;
68        let _ = repo::log_event(
69            pool,
70            Some(&camera_id),
71            "recorder_error",
72            "warning",
73            json!({ "reason": "stale", "threshold_seconds": threshold, "last_segment_age_s": seg_age }),
74        )
75        .await;
76        tracing::warn!(%camera_id, threshold, "health: camera stale, marked error");
77    }
78    Ok(())
79}
80
81/// SMART/RAID disk-health pass: opt-in drive self-assessment (`smartctl -H`) and Linux md/RAID
82/// array-state monitoring. Both degrade gracefully when their inputs are absent (missing smartctl is
83/// logged once then skipped; no `/proc/mdstat` is a no-op), so the build and tests never require them.
84async fn check_disk_health(pool: &SqlitePool, cfg: &Config) {
85    if cfg.smart_check_enabled {
86        if smartctl_available().await {
87            for dev in &cfg.smart_devices {
88                check_smart_device(pool, dev).await;
89            }
90        } else if !SMARTCTL_MISSING_WARNED.swap(true, Ordering::Relaxed) {
91            tracing::warn!(
92                "health: HELDAR_SMART_CHECK_ENABLED set but `smartctl` is not on PATH; skipping \
93                 SMART checks (install smartmontools)"
94            );
95        }
96    }
97    #[cfg(target_os = "linux")]
98    if cfg.mdstat_check_enabled {
99        check_mdstat(pool).await;
100    }
101}
102
/// One-shot latch for the "smartctl missing" warning. It starts `false`; `check_disk_health`
/// flips it with an atomic `swap(true)` and logs only when the previous value was `false`, so the
/// warning is emitted once per process instead of on every disk-health interval.
static SMARTCTL_MISSING_WARNED: AtomicBool = AtomicBool::new(false);
105
106/// Whether `smartctl` is runnable on PATH (so a missing binary degrades to a skip, not a panic).
107async fn smartctl_available() -> bool {
108    Command::new(SMARTCTL_BIN)
109        .arg("--version")
110        .stdin(Stdio::null())
111        .stdout(Stdio::null())
112        .stderr(Stdio::null())
113        .kill_on_drop(true)
114        .status()
115        .await
116        .map(|s| s.success())
117        .unwrap_or(false)
118}
119
120/// Run `smartctl -H <dev>` and emit a `disk_smart_warning` event unless the drive reports healthy
121/// (PASSED / OK). A FAILED result or an unreadable/missing device both warn.
122async fn check_smart_device(pool: &SqlitePool, dev: &str) {
123    let out = Command::new(SMARTCTL_BIN)
124        .arg("-H")
125        .arg(dev)
126        .stdin(Stdio::null())
127        .stdout(Stdio::piped())
128        .stderr(Stdio::piped())
129        .kill_on_drop(true)
130        .output()
131        .await;
132    match out {
133        Ok(o) => {
134            let stdout = String::from_utf8_lossy(&o.stdout);
135            if smart_is_healthy(&stdout) {
136                return;
137            }
138            let detail = stdout
139                .lines()
140                .find(|l| l.contains("health") || l.contains("Health") || l.contains("SMART"))
141                .unwrap_or("")
142                .trim()
143                .to_string();
144            let _ = repo::log_event(
145                pool,
146                None,
147                "disk_smart_warning",
148                "warning",
149                json!({ "device": dev, "detail": detail, "exit_ok": o.status.success() }),
150            )
151            .await;
152            tracing::warn!(device = %dev, "health: SMART self-assessment did not report PASSED");
153        }
154        Err(e) => {
155            let _ = repo::log_event(
156                pool,
157                None,
158                "disk_smart_warning",
159                "warning",
160                json!({ "device": dev, "detail": format!("smartctl could not run: {e}") }),
161            )
162            .await;
163            tracing::warn!(device = %dev, error = %e, "health: smartctl invocation failed");
164        }
165    }
166}
167
/// Decide whether `smartctl -H` output reports a healthy drive.
///
/// Healthy means the output never mentions `FAILED` and at least one *health line* carries a
/// positive verdict. A health line is one whose label (the text before its last `:`) mentions
/// "health"/"Health"; the verdict is the first word after that colon and must be `PASSED` or
/// `OK`, e.g. `SMART overall-health self-assessment test result: PASSED` or
/// `SMART Health Status: OK`.
///
/// Matching the verdict position rather than searching the whole output for `OK` keeps an
/// incidental "OK" (such as a device path echoed in an open-failure message) from being read as
/// a pass.
fn smart_is_healthy(stdout: &str) -> bool {
    if stdout.contains("FAILED") {
        return false;
    }
    stdout.lines().any(|line| {
        line.rsplit_once(':').map_or(false, |(label, verdict)| {
            let is_health_line = label.contains("health") || label.contains("Health");
            is_health_line && matches!(verdict.split_whitespace().next(), Some("PASSED" | "OK"))
        })
    })
}
172
173/// Read `/proc/mdstat` and emit a `raid_degraded` event for each array showing a down member.
174#[cfg(target_os = "linux")]
175async fn check_mdstat(pool: &SqlitePool) {
176    let contents = match tokio::fs::read_to_string("/proc/mdstat").await {
177        Ok(c) => c,
178        // No md subsystem on this host — nothing to monitor.
179        Err(_) => return,
180    };
181    for name in mdstat_degraded(&contents) {
182        let _ = repo::log_event(
183            pool,
184            None,
185            "raid_degraded",
186            "critical",
187            json!({ "array": name, "source": "/proc/mdstat" }),
188        )
189        .await;
190        tracing::warn!(array = %name, "health: RAID array degraded");
191    }
192}
193
194/// Parse `/proc/mdstat` and return the names of arrays with a down member. An array's per-disk state
195/// map is a bracketed token of only `U`/`_` on the status line (e.g. `[U_]`); any `_` means degraded.
196fn mdstat_degraded(contents: &str) -> Vec<String> {
197    let mut degraded = Vec::new();
198    let mut current: Option<String> = None;
199    for line in contents.lines() {
200        // Array header lines start at column 0 with the device name, e.g. "md0 : active raid1 ...".
201        if line.starts_with("md") {
202            current = line.split([' ', ':']).next().map(|s| s.to_string());
203            continue;
204        }
205        if let Some(name) = &current {
206            if line_has_down_member(line) {
207                degraded.push(name.clone());
208                current = None; // one verdict per array
209            }
210        }
211    }
212    degraded
213}
214
/// Report whether `line` contains a bracketed member-state map with a down member.
///
/// Bracket groups are examined left to right, each running from a `[` to the next `]`. A group
/// qualifies when its contents consist solely of `U` and `_` and include at least one `_`
/// (`[U_]` yes; `[UU]`, `[2/1]`, `[0]`, `[]` no). An unterminated `[` ends the scan.
fn line_has_down_member(line: &str) -> bool {
    let mut tail = line;
    while let Some((_, after_open)) = tail.split_once('[') {
        let (inner, after_close) = match after_open.split_once(']') {
            Some(parts) => parts,
            None => return false,
        };
        // Containing `_` already implies non-empty; `U`/`_` are ASCII, so a byte scan suffices.
        let down = inner.contains('_') && inner.bytes().all(|b| matches!(b, b'U' | b'_'));
        if down {
            return true;
        }
        tail = after_close;
    }
    false
}
235
// Unit tests for the pure parsing helpers (`smart_is_healthy`, `mdstat_degraded`,
// `line_has_down_member`). None of them spawn `smartctl` or read `/proc/mdstat`.
#[cfg(test)]
mod tests {
    use super::*;

    /// `smart_is_healthy` accepts the PASSED and OK verdict lines and rejects a FAILED verdict
    /// as well as output that has no verdict at all.
    #[test]
    fn smart_health_parsing() {
        assert!(smart_is_healthy(
            "SMART overall-health self-assessment test result: PASSED"
        ));
        assert!(smart_is_healthy("SMART Health Status: OK"));
        assert!(!smart_is_healthy(
            "SMART overall-health self-assessment test result: FAILED!"
        ));
        // Missing/unreadable device output has no positive health line.
        assert!(!smart_is_healthy("Smartctl open device: /dev/sdz failed"));
    }

    /// `mdstat_degraded` returns nothing for an all-`U` array and names only the array whose
    /// state map contains `_` when a healthy and a degraded array appear together.
    #[test]
    fn mdstat_flags_degraded_arrays_only() {
        // Single two-disk mirror with both members up.
        let healthy = "\
Personalities : [raid1]
md0 : active raid1 sdb1[1] sda1[0]
      976630336 blocks super 1.2 [2/2] [UU]

unused devices: <none>
";
        assert!(mdstat_degraded(healthy).is_empty());

        // md0 has lost a member ([U_]); md1 is fully up and must not be reported.
        let degraded = "\
Personalities : [raid1] [raid6]
md0 : active raid1 sdb1[1] sda1[0]
      976630336 blocks super 1.2 [2/1] [U_]
md1 : active raid6 sdc1[0] sdd1[1] sde1[2] sdf1[3]
      3906248704 blocks super 1.2 level 6, 512k chunk, algorithm 2 [4/4] [UUUU]

unused devices: <none>
";
        assert_eq!(mdstat_degraded(degraded), vec!["md0".to_string()]);
    }

    /// `line_has_down_member` only reacts to a `U`/`_` state map, not to other bracket groups.
    #[test]
    fn down_member_detection_ignores_disk_index_brackets() {
        // The header line's [0]/[1] disk-index brackets must not be read as a state map.
        assert!(!line_has_down_member("md0 : active raid1 sdb1[1] sda1[0]"));
        assert!(line_has_down_member(
            "      976630336 blocks super 1.2 [2/1] [U_]"
        ));
        assert!(!line_has_down_member(
            "      976630336 blocks super 1.2 [2/2] [UU]"
        ));
    }
}