heldar_kernel/services/
health.rs1use std::process::Stdio;
5use std::sync::atomic::{AtomicBool, Ordering};
6use std::sync::Arc;
7use std::time::{Duration, Instant};
8
9use chrono::{DateTime, Utc};
10use serde_json::json;
11use sqlx::SqlitePool;
12use tokio::process::Command;
13
14use crate::config::Config;
15use crate::repo;
16
/// Program name passed to `Command::new` for every `smartctl` invocation in this module.
const SMARTCTL_BIN: &str = "smartctl";
19
20pub async fn run(pool: SqlitePool, cfg: Arc<Config>) {
21 let mut tick = tokio::time::interval(Duration::from_secs(cfg.health_interval_s.max(5)));
22 let smart_interval = Duration::from_secs(cfg.smart_check_interval_s.max(30));
25 let mut last_disk_check: Option<Instant> = None;
26 loop {
27 tick.tick().await;
28 if let Err(e) = check_once(&pool).await {
29 tracing::error!(error = %e, "health: check failed");
30 }
31 let due = last_disk_check
32 .map(|t| t.elapsed() >= smart_interval)
33 .unwrap_or(true);
34 if due {
35 last_disk_check = Some(Instant::now());
36 check_disk_health(&pool, &cfg).await;
37 }
38 }
39}
40
41type StaleRow = (String, Option<DateTime<Utc>>, Option<DateTime<Utc>>, i64);
43
44async fn check_once(pool: &SqlitePool) -> anyhow::Result<()> {
45 let rows: Vec<StaleRow> = sqlx::query_as(
46 "SELECT cs.camera_id, cs.last_segment_at, cs.last_started_at, c.segment_seconds
47 FROM camera_status cs
48 JOIN cameras c ON c.id = cs.camera_id
49 WHERE cs.state = 'recording'",
50 )
51 .fetch_all(pool)
52 .await?;
53
54 let now = Utc::now();
55 for (camera_id, last_seg, last_start, seg_s) in rows {
56 let threshold = (seg_s.max(10) * 3).max(30);
57 let seg_age = last_seg.map(|t| (now - t).num_seconds());
58 let start_age = last_start.map(|t| (now - t).num_seconds());
59
60 let recent_segment = seg_age.map(|a| a <= threshold).unwrap_or(false);
61 let recently_started = start_age.map(|a| a <= threshold).unwrap_or(false);
62 if recent_segment || recently_started {
63 continue;
64 }
65
66 let msg = format!("no segments for >{threshold}s while recording");
67 let _ = repo::set_state(pool, &camera_id, "error", Some(&msg)).await;
68 let _ = repo::log_event(
69 pool,
70 Some(&camera_id),
71 "recorder_error",
72 "warning",
73 json!({ "reason": "stale", "threshold_seconds": threshold, "last_segment_age_s": seg_age }),
74 )
75 .await;
76 tracing::warn!(%camera_id, threshold, "health: camera stale, marked error");
77 }
78 Ok(())
79}
80
81async fn check_disk_health(pool: &SqlitePool, cfg: &Config) {
85 if cfg.smart_check_enabled {
86 if smartctl_available().await {
87 for dev in &cfg.smart_devices {
88 check_smart_device(pool, dev).await;
89 }
90 } else if !SMARTCTL_MISSING_WARNED.swap(true, Ordering::Relaxed) {
91 tracing::warn!(
92 "health: HELDAR_SMART_CHECK_ENABLED set but `smartctl` is not on PATH; skipping \
93 SMART checks (install smartmontools)"
94 );
95 }
96 }
97 #[cfg(target_os = "linux")]
98 if cfg.mdstat_check_enabled {
99 check_mdstat(pool).await;
100 }
101}
102
103static SMARTCTL_MISSING_WARNED: AtomicBool = AtomicBool::new(false);
105
106async fn smartctl_available() -> bool {
108 Command::new(SMARTCTL_BIN)
109 .arg("--version")
110 .stdin(Stdio::null())
111 .stdout(Stdio::null())
112 .stderr(Stdio::null())
113 .kill_on_drop(true)
114 .status()
115 .await
116 .map(|s| s.success())
117 .unwrap_or(false)
118}
119
120async fn check_smart_device(pool: &SqlitePool, dev: &str) {
123 let out = Command::new(SMARTCTL_BIN)
124 .arg("-H")
125 .arg(dev)
126 .stdin(Stdio::null())
127 .stdout(Stdio::piped())
128 .stderr(Stdio::piped())
129 .kill_on_drop(true)
130 .output()
131 .await;
132 match out {
133 Ok(o) => {
134 let stdout = String::from_utf8_lossy(&o.stdout);
135 if smart_is_healthy(&stdout) {
136 return;
137 }
138 let detail = stdout
139 .lines()
140 .find(|l| l.contains("health") || l.contains("Health") || l.contains("SMART"))
141 .unwrap_or("")
142 .trim()
143 .to_string();
144 let _ = repo::log_event(
145 pool,
146 None,
147 "disk_smart_warning",
148 "warning",
149 json!({ "device": dev, "detail": detail, "exit_ok": o.status.success() }),
150 )
151 .await;
152 tracing::warn!(device = %dev, "health: SMART self-assessment did not report PASSED");
153 }
154 Err(e) => {
155 let _ = repo::log_event(
156 pool,
157 None,
158 "disk_smart_warning",
159 "warning",
160 json!({ "device": dev, "detail": format!("smartctl could not run: {e}") }),
161 )
162 .await;
163 tracing::warn!(device = %dev, error = %e, "health: smartctl invocation failed");
164 }
165 }
166}
167
/// Decides from `smartctl -H` output whether the drive reported a healthy self-assessment.
///
/// Returns `true` only when `FAILED` appears nowhere in `stdout` and some line of the form
/// `<label containing "health">: <value>` has `PASSED` or `OK` as the first word of its
/// value (e.g. `SMART overall-health self-assessment test result: PASSED` or
/// `SMART Health Status: OK`).
///
/// The verdict is read from the value of a health line rather than from the whole output,
/// so an unrelated `OK`/`PASSED` substring — for instance inside a device path echoed by an
/// "open device ... failed" message — is not mistaken for a healthy result.
fn smart_is_healthy(stdout: &str) -> bool {
    if stdout.contains("FAILED") {
        return false;
    }
    stdout.lines().any(|line| {
        line.split_once(':').map_or(false, |(label, value)| {
            // Case-insensitive so both "overall-health" and "Health Status" qualify.
            label.to_ascii_lowercase().contains("health")
                && matches!(value.split_whitespace().next(), Some("PASSED" | "OK"))
        })
    })
}
172
173#[cfg(target_os = "linux")]
175async fn check_mdstat(pool: &SqlitePool) {
176 let contents = match tokio::fs::read_to_string("/proc/mdstat").await {
177 Ok(c) => c,
178 Err(_) => return,
180 };
181 for name in mdstat_degraded(&contents) {
182 let _ = repo::log_event(
183 pool,
184 None,
185 "raid_degraded",
186 "critical",
187 json!({ "array": name, "source": "/proc/mdstat" }),
188 )
189 .await;
190 tracing::warn!(array = %name, "health: RAID array degraded");
191 }
192}
193
194fn mdstat_degraded(contents: &str) -> Vec<String> {
197 let mut degraded = Vec::new();
198 let mut current: Option<String> = None;
199 for line in contents.lines() {
200 if line.starts_with("md") {
202 current = line.split([' ', ':']).next().map(|s| s.to_string());
203 continue;
204 }
205 if let Some(name) = ¤t {
206 if line_has_down_member(line) {
207 degraded.push(name.clone());
208 current = None; }
210 }
211 }
212 degraded
213}
214
/// Returns `true` when `line` contains a bracketed group made up solely of `U` and `_`
/// characters with at least one `_` (e.g. `[U_]`).
///
/// Groups are scanned left to right, each running from a `[` to the next `]`; scanning
/// stops at the first `[` that has no closing `]`.
fn line_has_down_member(line: &str) -> bool {
    let mut tail = line;
    while let Some((_, after_open)) = tail.split_once('[') {
        match after_open.split_once(']') {
            Some((group, after_close)) => {
                // A `_` guarantees the group is non-empty; every byte must be `U` or `_`.
                let all_status_marks = group.bytes().all(|b| matches!(b, b'U' | b'_'));
                if group.contains('_') && all_status_marks {
                    return true;
                }
                tail = after_close;
            }
            None => break,
        }
    }
    false
}
235
#[cfg(test)]
mod tests {
    use super::*;

    /// mdstat sample with a single array whose members are all up.
    const HEALTHY_MDSTAT: &str = "\
Personalities : [raid1]
md0 : active raid1 sdb1[1] sda1[0]
      976630336 blocks super 1.2 [2/2] [UU]

unused devices: <none>
";

    /// mdstat sample where `md0` has a member down and `md1` is fully up.
    const DEGRADED_MDSTAT: &str = "\
Personalities : [raid1] [raid6]
md0 : active raid1 sdb1[1] sda1[0]
      976630336 blocks super 1.2 [2/1] [U_]
md1 : active raid6 sdc1[0] sdd1[1] sde1[2] sdf1[3]
      3906248704 blocks super 1.2 level 6, 512k chunk, algorithm 2 [4/4] [UUUU]

unused devices: <none>
";

    #[test]
    fn smart_health_parsing() {
        // (smartctl output, expected verdict)
        let cases = [
            ("SMART overall-health self-assessment test result: PASSED", true),
            ("SMART Health Status: OK", true),
            ("SMART overall-health self-assessment test result: FAILED!", false),
            ("Smartctl open device: /dev/sdz failed", false),
        ];
        for (output, healthy) in cases {
            assert_eq!(smart_is_healthy(output), healthy, "output: {output:?}");
        }
    }

    #[test]
    fn mdstat_flags_degraded_arrays_only() {
        assert!(mdstat_degraded(HEALTHY_MDSTAT).is_empty());
        assert_eq!(mdstat_degraded(DEGRADED_MDSTAT), vec!["md0".to_string()]);
    }

    #[test]
    fn down_member_detection_ignores_disk_index_brackets() {
        // (mdstat line, expected result)
        let cases = [
            ("md0 : active raid1 sdb1[1] sda1[0]", false),
            ("      976630336 blocks super 1.2 [2/1] [U_]", true),
            ("      976630336 blocks super 1.2 [2/2] [UU]", false),
        ];
        for (line, down) in cases {
            assert_eq!(line_has_down_member(line), down, "line: {line:?}");
        }
    }
}