// nv_runtime/diagnostics.rs
//! Consolidated diagnostics snapshots for feeds and the runtime.
//!
//! The library exposes several independent observability surfaces:
//! `FeedMetrics`, `QueueTelemetry`, `DecodeStatus`, health events,
//! `BatchMetrics`, and per-frame `Provenance`. Each serves a specific
//! purpose, but downstream systems that want a comprehensive view of
//! runtime state must stitch them together manually.
//!
//! This module provides **composite snapshot types** that unify the most
//! commonly needed diagnostics into single, coherent reads:
//!
//! - [`FeedDiagnostics`] — everything about one feed in one call.
//! - [`RuntimeDiagnostics`] — every feed, batch coordinator, and output
//!   channel status in one call.
//!
//! # Recommended consumption pattern
//!
//! ```text
//! ┌──────────────────┐
//! │ Dashboard / OPS  │
//! └──────┬───────────┘
//!        │ poll every 1–5 s
//!        ▼
//! runtime.diagnostics()
//!        │
//!        ├─▶ RuntimeDiagnostics
//!        │     .uptime
//!        │     .feed_count / .max_feeds
//!        │     .output_lag (in_lag, pending_lost)
//!        │     .batches: Vec<BatchDiagnostics>
//!        │         .processor_id, .metrics
//!        │     .feeds: Vec<FeedDiagnostics> (sorted by FeedId)
//!        │         .alive / .paused / .uptime
//!        │         .metrics (frames counters, restarts)
//!        │         .queues (source/sink depth + capacity)
//!        │         .decode (hw/sw codec status)
//!        │         .view (stability score, context status)
//!        │         .batch_processor_id (links to .batches)
//!        │
//!        │ (complement with event-driven streams:)
//!        │
//! runtime.health_subscribe()  ← state transitions, errors, degradation
//! runtime.output_subscribe() ← per-frame provenance, admission, detections
//! ```
//!
//! The snapshot approach is intentionally poll-oriented. Snapshots are
//! cheap (atomic loads plus small allocations for the `Vec` and decode
//! detail `String`), idempotent, and do not perturb the pipeline.
//! Event-driven details (individual health events, per-frame provenance)
//! remain on their respective broadcast channels.

use std::time::Duration;

use nv_core::id::{FeedId, StageId};
use nv_core::metrics::FeedMetrics;

use crate::batch::BatchMetrics;
use crate::feed_handle::{DecodeStatus, QueueTelemetry};

// ---------------------------------------------------------------------------
// Per-feed diagnostics
// ---------------------------------------------------------------------------

64/// Consolidated per-feed diagnostics snapshot.
65///
66/// Combines lifecycle state, throughput metrics, queue depths, decode
67/// status, and view-system health into a single best-effort coherent
68/// snapshot. Each field is read from independent atomics/mutexes, so
69/// the snapshot is approximately — not transactionally — consistent.
70///
71/// Obtained via [`FeedHandle::diagnostics()`](crate::FeedHandle::diagnostics).
72#[derive(Debug, Clone)]
73pub struct FeedDiagnostics {
74 /// The feed's unique identifier.
75 pub feed_id: FeedId,
76 /// Whether the worker thread is still alive.
77 pub alive: bool,
78 /// Whether the feed is currently paused.
79 pub paused: bool,
80 /// Time since the current processing session started.
81 ///
82 /// Resets on each restart. A feed that restarts frequently will
83 /// show low uptime values.
84 pub uptime: Duration,
85 /// Frame counters, track count, view epoch, and restart count.
86 pub metrics: FeedMetrics,
87 /// Source and sink queue depths and capacities.
88 pub queues: QueueTelemetry,
89 /// Decode method selected by the media backend, if known.
90 ///
91 /// `None` until the stream starts and the backend confirms decoder
92 /// negotiation.
93 pub decode: Option<DecodeStatus>,
94 /// Current camera view-system status.
95 pub view: ViewDiagnostics,
96 /// The batch coordinator this feed submits to, if any.
97 ///
98 /// Use this to correlate with [`RuntimeDiagnostics::batches`] for
99 /// the coordinator's metrics.
100 pub batch_processor_id: Option<StageId>,
101}
102
103/// Summary of the camera view-system's current health.
104///
105/// Fixed cameras report `status: ViewStatus::Stable` and `stability_score: 1.0`.
106/// Observed (PTZ/moving) cameras reflect the live epoch policy output.
107#[derive(Debug, Clone, Copy, PartialEq)]
108pub struct ViewDiagnostics {
109 /// Current view epoch — incremented on significant view discontinuities.
110 pub epoch: u64,
111 /// Stability score in `[0.0, 1.0]`. `1.0` = fully stable.
112 pub stability_score: f32,
113 /// High-level view health status.
114 pub status: ViewStatus,
115}
116
/// High-level camera view health.
///
/// This is a diagnostic summary of the underlying [`ContextValidity`]
/// (from `nv-view`) — intentionally simpler to avoid forcing downstream
/// consumers to depend on the view crate.
///
/// [`ContextValidity`]: nv_view::ContextValidity
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ViewStatus {
    /// View is stable; temporal context is valid.
    Stable,
    /// View is changing; temporal context is degraded.
    Degraded,
    /// View has changed significantly; prior context is invalid.
    Invalid,
}
// ---------------------------------------------------------------------------
// Batch diagnostics
// ---------------------------------------------------------------------------

142/// Diagnostics snapshot for a batch coordinator.
143///
144/// One entry per coordinator created via `Runtime::create_batch()`.
145/// Included in [`RuntimeDiagnostics::batches`].
146#[derive(Debug, Clone)]
147pub struct BatchDiagnostics {
148 /// The processor's unique stage ID.
149 pub processor_id: StageId,
150 /// Live metrics snapshot (counters, timing, error state).
151 pub metrics: BatchMetrics,
152}
153
// ---------------------------------------------------------------------------
// Output lag diagnostics
// ---------------------------------------------------------------------------

/// Snapshot of the output broadcast channel's saturation state.
///
/// Obtained from the runtime's internal sentinel-based lag detector.
/// A non-zero `pending_lost` during `in_lag == true` indicates the
/// channel is saturated and subscribers may be losing messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputLagStatus {
    /// Whether the output channel is currently saturated.
    pub in_lag: bool,
    /// Messages lost (sentinel-observed) since the last emitted
    /// `HealthEvent::OutputLagged` event. Non-zero only during
    /// active saturation.
    pub pending_lost: u64,
}
// ---------------------------------------------------------------------------
// Runtime-wide diagnostics
// ---------------------------------------------------------------------------

177/// Consolidated runtime-wide diagnostics snapshot.
178///
179/// Provides a one-call overview of every feed, batch coordinator, and
180/// output channel health. Fields are read from independent sources, so
181/// the snapshot is best-effort coherent — not transactionally consistent.
182///
183/// Obtained via [`Runtime::diagnostics()`](crate::Runtime::diagnostics)
184/// or [`RuntimeHandle::diagnostics()`](crate::RuntimeHandle::diagnostics).
185#[derive(Debug, Clone)]
186pub struct RuntimeDiagnostics {
187 /// Elapsed time since the runtime was created.
188 pub uptime: Duration,
189 /// Number of currently active feeds.
190 pub feed_count: usize,
191 /// Maximum allowed concurrent feeds.
192 pub max_feeds: usize,
193 /// Per-feed diagnostics, sorted by [`FeedId`] for stable iteration.
194 pub feeds: Vec<FeedDiagnostics>,
195 /// Diagnostics for each batch coordinator owned by this runtime.
196 pub batches: Vec<BatchDiagnostics>,
197 /// Current output broadcast channel saturation status.
198 pub output_lag: OutputLagStatus,
199 /// Number of threads that were detached due to join timeouts and
200 /// have not yet been reaped. Non-zero values indicate threads that
201 /// are still blocked (e.g. in `OutputSink::emit()` or a batch
202 /// processor). Monitored and reaped on each diagnostics poll.
203 pub detached_thread_count: usize,
204}
205
#[cfg(test)]
mod tests {
    use super::*;

    // `ViewStatus` derives PartialEq/Eq; verify variant comparisons behave
    // as expected for the three-state health enum.
    #[test]
    fn view_status_equality() {
        assert_eq!(ViewStatus::Stable, ViewStatus::Stable);
        assert_ne!(ViewStatus::Stable, ViewStatus::Degraded);
        assert_ne!(ViewStatus::Degraded, ViewStatus::Invalid);
    }

    // The Debug representation should surface both the status variant and
    // the stability score, since dashboards log these snapshots verbatim.
    #[test]
    fn view_diagnostics_debug() {
        let v = ViewDiagnostics {
            epoch: 3,
            stability_score: 0.75,
            status: ViewStatus::Degraded,
        };
        let dbg = format!("{v:?}");
        assert!(dbg.contains("Degraded"));
        assert!(dbg.contains("0.75"));
    }
}