Skip to main content

heldar_kernel/routes/
system.rs

1use axum::extract::State;
2use axum::http::StatusCode;
3use axum::response::{IntoResponse, Response};
4use axum::routing::get;
5use axum::{Json, Router};
6use chrono::{DateTime, Utc};
7use serde::Serialize;
8use serde_json::{json, Value};
9
10use crate::error::AppResult;
11use crate::services::remote_access::{self, OverlayStatus};
12use crate::services::storage::{self, StorageReport};
13use crate::state::AppState;
14
15pub fn router() -> Router<AppState> {
16    Router::new()
17        .route("/healthz", get(healthz))
18        .route("/readyz", get(readyz))
19        .route("/api/v1/system", get(system_info))
20}
21
22/// Liveness: the process is up.
23async fn healthz() -> Json<Value> {
24    Json(json!({ "status": "ok" }))
25}
26
27/// Readiness: the database is reachable (returns 503 otherwise). When
28/// `HELDAR_READYZ_MIN_RECORDING_PERCENT > 0` this also acts as an HA recorder-quorum probe (see
29/// docs/HA.md): a node whose recording coverage drops below the threshold reports 503 so a
30/// keepalived `health_script` can fail it over to a hot spare. Default 0 keeps DB-only behaviour.
31async fn readyz(State(st): State<AppState>) -> Response {
32    if let Err(e) = sqlx::query_scalar::<_, i64>("SELECT 1")
33        .fetch_one(&st.pool)
34        .await
35    {
36        tracing::error!(error = %e, "readyz: database not reachable");
37        return (
38            StatusCode::SERVICE_UNAVAILABLE,
39            Json(json!({ "ready": false, "reason": "database" })),
40        )
41            .into_response();
42    }
43
44    let required = st.cfg.readyz_min_recording_percent;
45    if required > 0.0 {
46        let counts = async {
47            let enabled: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM cameras WHERE enabled = 1")
48                .fetch_one(&st.pool)
49                .await?;
50            let recording: i64 =
51                sqlx::query_scalar("SELECT COUNT(*) FROM camera_status WHERE state = 'recording'")
52                    .fetch_one(&st.pool)
53                    .await?;
54            Ok::<_, sqlx::Error>((enabled, recording))
55        }
56        .await;
57        let (enabled, recording) = match counts {
58            Ok(v) => v,
59            Err(e) => {
60                tracing::error!(error = %e, "readyz: recorder-quorum query failed");
61                return (
62                    StatusCode::SERVICE_UNAVAILABLE,
63                    Json(json!({ "ready": false, "reason": "database" })),
64                )
65                    .into_response();
66            }
67        };
68        // No enabled cameras => nothing to record => the node is ready by definition.
69        let pct = if enabled > 0 {
70            (recording as f64) * 100.0 / (enabled as f64)
71        } else {
72            100.0
73        };
74        let pct = (pct * 10.0).round() / 10.0;
75        if pct < required {
76            return (
77                StatusCode::SERVICE_UNAVAILABLE,
78                Json(json!({
79                    "ready": false,
80                    "reason": "insufficient_recorders",
81                    "recording_pct": pct,
82                    "required_pct": required,
83                })),
84            )
85                .into_response();
86        }
87    }
88
89    (StatusCode::OK, Json(json!({ "ready": true }))).into_response()
90}
91
92#[derive(Debug, Serialize)]
93struct SystemInfo {
94    name: &'static str,
95    version: &'static str,
96    started_at: DateTime<Utc>,
97    uptime_seconds: i64,
98    recorder_enabled: bool,
99    cameras_total: i64,
100    cameras_recording: i64,
101    active_recorders: usize,
102    segments_total: i64,
103    recordings_bytes: i64,
104    recordings_gb: f64,
105    max_recordings_gb: f64,
106    storage: StorageReport,
107    remote_access: OverlayStatus,
108    /// No recent disk_smart_warning/raid_degraded events (see services::health disk-health pass).
109    disk_health_ok: bool,
110    /// Timestamp of the most recent disk-health alert (any time), or null if none ever fired.
111    last_disk_alert_at: Option<DateTime<Utc>>,
112    /// Active live-preview transcode engine (software | vaapi | nvenc).
113    live_transcode_engine: String,
114}
115
116async fn system_info(State(st): State<AppState>) -> AppResult<Json<SystemInfo>> {
117    let cameras_total: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM cameras")
118        .fetch_one(&st.pool)
119        .await?;
120    let cameras_recording: i64 =
121        sqlx::query_scalar("SELECT COUNT(*) FROM camera_status WHERE state = 'recording'")
122            .fetch_one(&st.pool)
123            .await?;
124    let segments_total: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM segments")
125        .fetch_one(&st.pool)
126        .await?;
127    let recordings_bytes: i64 =
128        sqlx::query_scalar("SELECT COALESCE(SUM(size_bytes), 0) FROM segments")
129            .fetch_one(&st.pool)
130            .await?;
131    let active_recorders = st.recorder.active_ids().await.len();
132    let storage = storage::storage_report(&st.pool, &st.cfg).await?;
133
134    // Disk health: the latest disk-health alert (any time) and whether one fired recently (within a
135    // few SMART-check cycles). With checks disabled no such events exist, so health reads as OK.
136    let last_disk_alert_raw: Option<String> = sqlx::query_scalar(
137        "SELECT MAX(timestamp) FROM events WHERE event_type IN ('disk_smart_warning', 'raid_degraded')",
138    )
139    .fetch_one(&st.pool)
140    .await?;
141    let last_disk_alert_at = last_disk_alert_raw
142        .as_deref()
143        .and_then(crate::util::parse_rfc3339);
144    let recent_window_s = (st.cfg.smart_check_interval_s.saturating_mul(3)).max(900) as i64;
145    let cutoff = Utc::now() - chrono::Duration::seconds(recent_window_s);
146    let recent_disk_alerts: i64 = sqlx::query_scalar(
147        "SELECT COUNT(*) FROM events
148          WHERE event_type IN ('disk_smart_warning', 'raid_degraded') AND timestamp >= ?",
149    )
150    .bind(cutoff)
151    .fetch_one(&st.pool)
152    .await?;
153
154    Ok(Json(SystemInfo {
155        name: "Heldar Core",
156        version: env!("CARGO_PKG_VERSION"),
157        started_at: st.started_at,
158        uptime_seconds: (Utc::now() - st.started_at).num_seconds(),
159        recorder_enabled: st.cfg.recorder_enabled,
160        cameras_total,
161        cameras_recording,
162        active_recorders,
163        segments_total,
164        recordings_bytes,
165        recordings_gb: recordings_bytes as f64 / 1024.0 / 1024.0 / 1024.0,
166        max_recordings_gb: st.cfg.max_recordings_bytes as f64 / 1024.0 / 1024.0 / 1024.0,
167        storage,
168        remote_access: remote_access::status(&st.cfg),
169        disk_health_ok: recent_disk_alerts == 0,
170        last_disk_alert_at,
171        live_transcode_engine: st.cfg.live_transcode_engine.clone(),
172    }))
173}