Skip to main content

heldar_kernel/routes/
system.rs

1use axum::extract::State;
2use axum::http::StatusCode;
3use axum::response::{IntoResponse, Response};
4use axum::routing::get;
5use axum::{Json, Router};
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use serde_json::{json, Value};
9
10use crate::auth::Principal;
11use crate::error::{AppError, AppResult};
12use crate::services::remote_access::{self, OverlayStatus};
13use crate::services::settings;
14use crate::services::storage::{self, StorageReport};
15use crate::state::AppState;
16
17pub fn router() -> Router<AppState> {
18    Router::new()
19        .route("/healthz", get(healthz))
20        .route("/readyz", get(readyz))
21        .route("/api/v1/system", get(system_info))
22        .route(
23            "/api/v1/system/retention",
24            get(get_retention).put(put_retention),
25        )
26}
27
/// Bytes in one "GB" as used by this module: 2^30 (1024 * 1024 * 1024), i.e. a binary gigabyte.
const BYTES_PER_GB: f64 = (1u64 << 30) as f64;
29
/// The recording disk-limit policy enforced by the retention sweeper. Each value is the operator
/// override (settings table) when set, otherwise the env default — `overridden` flags which is which.
///
/// Every limit is reported twice: as raw bytes and as GB, where one GB is `BYTES_PER_GB`
/// (1024^3) bytes.
#[derive(Debug, Serialize)]
struct RetentionLimits {
    /// Effective recordings cap in GB (`max_recordings_bytes / BYTES_PER_GB`).
    max_recordings_gb: f64,
    /// Effective recordings cap in bytes.
    max_recordings_bytes: i64,
    /// True when the cap comes from a stored setting (a value > 0) rather than the config default.
    max_overridden: bool,
    /// Effective free-disk floor in GB (`min_free_disk_bytes / BYTES_PER_GB`).
    min_free_disk_gb: f64,
    /// Effective free-disk floor in bytes.
    min_free_disk_bytes: i64,
    /// True when the floor comes from a stored setting (a value >= 0) rather than the config default.
    min_free_overridden: bool,
}
41
42async fn effective_limits(st: &AppState) -> RetentionLimits {
43    let max_override = settings::get_i64(&st.pool, settings::RECORDING_MAX_BYTES)
44        .await
45        .filter(|&v| v > 0);
46    let floor_override = settings::get_i64(&st.pool, settings::RECORDING_MIN_FREE_BYTES)
47        .await
48        .filter(|&v| v >= 0);
49    let max = max_override.unwrap_or(st.cfg.max_recordings_bytes as i64);
50    let floor = floor_override.unwrap_or(st.cfg.min_free_disk_bytes as i64);
51    RetentionLimits {
52        max_recordings_gb: max as f64 / BYTES_PER_GB,
53        max_recordings_bytes: max,
54        max_overridden: max_override.is_some(),
55        min_free_disk_gb: floor as f64 / BYTES_PER_GB,
56        min_free_disk_bytes: floor,
57        min_free_overridden: floor_override.is_some(),
58    }
59}
60
61/// Current recording disk limits (effective values). Any authenticated caller may read.
62async fn get_retention(State(st): State<AppState>) -> AppResult<Json<RetentionLimits>> {
63    Ok(Json(effective_limits(&st).await))
64}
65
/// JSON request body accepted by `put_retention`. Both fields are optional; a missing field
/// leaves the corresponding limit untouched. Sizes are in GB of `BYTES_PER_GB` (1024^3) bytes.
#[derive(Debug, Deserialize)]
struct RetentionUpdate {
    /// New global recordings cap in GB (> 0). Omit to leave unchanged.
    max_recordings_gb: Option<f64>,
    /// New free-disk floor in GB (>= 0; 0 disables the floor). Omit to leave unchanged.
    min_free_disk_gb: Option<f64>,
}
73
74/// Set the recording disk limits at runtime (admin only) — the retention sweeper picks them up on its
75/// next pass, no restart. Stored in the settings table; clearing them reverts to the env defaults.
76async fn put_retention(
77    State(st): State<AppState>,
78    principal: Principal,
79    Json(body): Json<RetentionUpdate>,
80) -> AppResult<Json<RetentionLimits>> {
81    principal.require(principal.can_admin(), "change recording limits")?;
82    if let Some(gb) = body.max_recordings_gb {
83        if !gb.is_finite() || gb <= 0.0 {
84            return Err(AppError::BadRequest(
85                "`max_recordings_gb` must be greater than 0".into(),
86            ));
87        }
88        settings::set_i64(
89            &st.pool,
90            settings::RECORDING_MAX_BYTES,
91            (gb * BYTES_PER_GB) as i64,
92        )
93        .await?;
94    }
95    if let Some(gb) = body.min_free_disk_gb {
96        if !gb.is_finite() || gb < 0.0 {
97            return Err(AppError::BadRequest(
98                "`min_free_disk_gb` must be 0 or greater".into(),
99            ));
100        }
101        settings::set_i64(
102            &st.pool,
103            settings::RECORDING_MIN_FREE_BYTES,
104            (gb * BYTES_PER_GB) as i64,
105        )
106        .await?;
107    }
108    crate::auth::audit(
109        &st.pool,
110        &principal,
111        "update_retention_limits",
112        "settings",
113        "recording",
114        json!({ "max_recordings_gb": body.max_recordings_gb, "min_free_disk_gb": body.min_free_disk_gb }),
115    )
116    .await;
117    Ok(Json(effective_limits(&st).await))
118}
119
120/// Liveness: the process is up.
121async fn healthz() -> Json<Value> {
122    Json(json!({ "status": "ok" }))
123}
124
125/// Readiness: the database is reachable (returns 503 otherwise). When
126/// `HELDAR_READYZ_MIN_RECORDING_PERCENT > 0` this also acts as an HA recorder-quorum probe (see
127/// docs/HA.md): a node whose recording coverage drops below the threshold reports 503 so a
128/// keepalived `health_script` can fail it over to a hot spare. Default 0 keeps DB-only behaviour.
129async fn readyz(State(st): State<AppState>) -> Response {
130    if let Err(e) = sqlx::query_scalar::<_, i64>("SELECT 1")
131        .fetch_one(&st.pool)
132        .await
133    {
134        tracing::error!(error = %e, "readyz: database not reachable");
135        return (
136            StatusCode::SERVICE_UNAVAILABLE,
137            Json(json!({ "ready": false, "reason": "database" })),
138        )
139            .into_response();
140    }
141
142    let required = st.cfg.readyz_min_recording_percent;
143    if required > 0.0 {
144        let counts = async {
145            let enabled: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM cameras WHERE enabled = 1")
146                .fetch_one(&st.pool)
147                .await?;
148            let recording: i64 =
149                sqlx::query_scalar("SELECT COUNT(*) FROM camera_status cs JOIN cameras c ON c.id = cs.camera_id WHERE cs.state = 'recording' AND c.enabled = 1")
150                    .fetch_one(&st.pool)
151                    .await?;
152            Ok::<_, sqlx::Error>((enabled, recording))
153        }
154        .await;
155        let (enabled, recording) = match counts {
156            Ok(v) => v,
157            Err(e) => {
158                tracing::error!(error = %e, "readyz: recorder-quorum query failed");
159                return (
160                    StatusCode::SERVICE_UNAVAILABLE,
161                    Json(json!({ "ready": false, "reason": "database" })),
162                )
163                    .into_response();
164            }
165        };
166        // No enabled cameras => nothing to record => the node is ready by definition.
167        let pct = if enabled > 0 {
168            (recording as f64) * 100.0 / (enabled as f64)
169        } else {
170            100.0
171        };
172        let pct = (pct * 10.0).round() / 10.0;
173        if pct < required {
174            return (
175                StatusCode::SERVICE_UNAVAILABLE,
176                Json(json!({
177                    "ready": false,
178                    "reason": "insufficient_recorders",
179                    "recording_pct": pct,
180                    "required_pct": required,
181                })),
182            )
183                .into_response();
184        }
185    }
186
187    (StatusCode::OK, Json(json!({ "ready": true }))).into_response()
188}
189
/// Response body of `GET /api/v1/system`: a snapshot of the node's identity, recorder activity,
/// recording storage usage and disk health, assembled by `system_info`.
#[derive(Debug, Serialize)]
struct SystemInfo {
    /// Product name (a fixed string set by `system_info`).
    name: &'static str,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    version: &'static str,
    /// Start timestamp copied from the application state.
    started_at: DateTime<Utc>,
    /// Whole seconds elapsed between `started_at` and the moment the response is built.
    uptime_seconds: i64,
    /// Mirrors the `recorder_enabled` config flag.
    recorder_enabled: bool,
    /// Number of rows in `cameras`, enabled or not.
    cameras_total: i64,
    /// Number of enabled cameras whose `camera_status.state` is `'recording'`.
    cameras_recording: i64,
    /// Number of ids reported by the recorder's `active_ids()`.
    active_recorders: usize,
    /// Number of rows in `segments`.
    segments_total: i64,
    /// Sum of `size_bytes` over all segments (0 when there are none).
    recordings_bytes: i64,
    /// `recordings_bytes` expressed in GB of 1024^3 bytes.
    recordings_gb: f64,
    /// Effective recordings cap in GB (stored override if set, otherwise the config default).
    max_recordings_gb: f64,
    /// Report produced by `storage::storage_report`.
    storage: StorageReport,
    /// Status produced by `remote_access::status`.
    remote_access: OverlayStatus,
    /// No recent disk_smart_warning/raid_degraded events (see services::health disk-health pass).
    disk_health_ok: bool,
    /// Timestamp of the most recent disk-health alert (any time), or null if none ever fired.
    last_disk_alert_at: Option<DateTime<Utc>>,
    /// Active live-preview transcode engine (software | vaapi | nvenc).
    live_transcode_engine: String,
}
213
214async fn system_info(State(st): State<AppState>) -> AppResult<Json<SystemInfo>> {
215    let cameras_total: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM cameras")
216        .fetch_one(&st.pool)
217        .await?;
218    let cameras_recording: i64 =
219        sqlx::query_scalar("SELECT COUNT(*) FROM camera_status cs JOIN cameras c ON c.id = cs.camera_id WHERE cs.state = 'recording' AND c.enabled = 1")
220            .fetch_one(&st.pool)
221            .await?;
222    let segments_total: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM segments")
223        .fetch_one(&st.pool)
224        .await?;
225    let recordings_bytes: i64 =
226        sqlx::query_scalar("SELECT COALESCE(SUM(size_bytes), 0) FROM segments")
227            .fetch_one(&st.pool)
228            .await?;
229    let active_recorders = st.recorder.active_ids().await.len();
230    let storage = storage::storage_report(&st.pool, &st.cfg).await?;
231    let limits = effective_limits(&st).await;
232
233    // Disk health: the latest disk-health alert (any time) and whether one fired recently (within a
234    // few SMART-check cycles). With checks disabled no such events exist, so health reads as OK.
235    let last_disk_alert_raw: Option<String> = sqlx::query_scalar(
236        "SELECT MAX(timestamp) FROM events WHERE event_type IN ('disk_smart_warning', 'raid_degraded')",
237    )
238    .fetch_one(&st.pool)
239    .await?;
240    let last_disk_alert_at = last_disk_alert_raw
241        .as_deref()
242        .and_then(crate::util::parse_rfc3339);
243    let recent_window_s = (st.cfg.smart_check_interval_s.saturating_mul(3)).max(900) as i64;
244    let cutoff = Utc::now() - chrono::Duration::seconds(recent_window_s);
245    let recent_disk_alerts: i64 = sqlx::query_scalar(
246        "SELECT COUNT(*) FROM events
247          WHERE event_type IN ('disk_smart_warning', 'raid_degraded') AND timestamp >= ?",
248    )
249    .bind(cutoff)
250    .fetch_one(&st.pool)
251    .await?;
252
253    Ok(Json(SystemInfo {
254        name: "Heldar Core",
255        version: env!("CARGO_PKG_VERSION"),
256        started_at: st.started_at,
257        uptime_seconds: (Utc::now() - st.started_at).num_seconds(),
258        recorder_enabled: st.cfg.recorder_enabled,
259        cameras_total,
260        cameras_recording,
261        active_recorders,
262        segments_total,
263        recordings_bytes,
264        recordings_gb: recordings_bytes as f64 / 1024.0 / 1024.0 / 1024.0,
265        max_recordings_gb: limits.max_recordings_gb,
266        storage,
267        remote_access: remote_access::status(&st.cfg),
268        disk_health_ok: recent_disk_alerts == 0,
269        last_disk_alert_at,
270        live_transcode_engine: st.cfg.live_transcode_engine.clone(),
271    }))
272}