Skip to main content

nv_runtime/
diagnostics.rs

//! Consolidated diagnostics snapshots for feeds and the runtime.
//!
//! The library exposes several independent observability surfaces:
//! `FeedMetrics`, `QueueTelemetry`, `DecodeStatus`, health events,
//! `BatchMetrics`, and per-frame `Provenance`. Each serves a specific
//! purpose, but downstream systems that want a comprehensive view of
//! runtime state must stitch them together manually.
//!
//! This module provides **composite snapshot types** that unify the most
//! commonly needed diagnostics into single, coherent reads:
//!
//! - [`FeedDiagnostics`] — everything about one feed in one call.
//! - [`RuntimeDiagnostics`] — every feed, batch coordinator, and output
//!   channel status in one call.
//!
//! # Recommended consumption pattern
//!
//! ```text
//! ┌──────────────────┐
//! │ Dashboard / OPS  │
//! └──────┬───────────┘
//!        │ poll every 1–5 s
//!        ▼
//!   runtime.diagnostics()
//!        │
//!        ├─▶ RuntimeDiagnostics
//!        │     .uptime
//!        │     .feed_count / .max_feeds
//!        │     .output_lag  (in_lag, pending_lost)
//!        │     .batches: Vec<BatchDiagnostics>
//!        │         .processor_id, .metrics
//!        │     .feeds: Vec<FeedDiagnostics>  (sorted by FeedId)
//!        │         .alive / .paused / .uptime
//!        │         .metrics  (frame counters, restarts)
//!        │         .queues   (source/sink depth + capacity)
//!        │         .decode   (hw/sw codec status)
//!        │         .view     (stability score, context status)
//!        │         .batch_processor_id  (links to .batches)
//!        │
//!        │  (complement with event-driven streams:)
//!        │
//!   runtime.health_subscribe()   ← state transitions, errors, degradation
//!   runtime.output_subscribe()   ← per-frame provenance, admission, detections
//! ```
//!
//! The snapshot approach is intentionally poll-oriented. Snapshots are
//! cheap (atomic loads plus small allocations for the `Vec` and decode
//! detail `String`), idempotent, and do not perturb the pipeline.
//! Event-driven details (individual health events, per-frame provenance)
//! remain on their respective broadcast channels.
51
52use std::time::Duration;
53
54use nv_core::id::{FeedId, StageId};
55use nv_core::metrics::FeedMetrics;
56
57use crate::batch::BatchMetrics;
58use crate::feed_handle::{DecodeStatus, QueueTelemetry};
59
60// ---------------------------------------------------------------------------
61// Per-feed diagnostics
62// ---------------------------------------------------------------------------
63
64/// Consolidated per-feed diagnostics snapshot.
65///
66/// Combines lifecycle state, throughput metrics, queue depths, decode
67/// status, and view-system health into a single best-effort coherent
68/// snapshot. Each field is read from independent atomics/mutexes, so
69/// the snapshot is approximately — not transactionally — consistent.
70///
71/// Obtained via [`FeedHandle::diagnostics()`](crate::FeedHandle::diagnostics).
72#[derive(Debug, Clone)]
73pub struct FeedDiagnostics {
74    /// The feed's unique identifier.
75    pub feed_id: FeedId,
76    /// Whether the worker thread is still alive.
77    pub alive: bool,
78    /// Whether the feed is currently paused.
79    pub paused: bool,
80    /// Time since the current processing session started.
81    ///
82    /// Resets on each restart. A feed that restarts frequently will
83    /// show low uptime values.
84    pub uptime: Duration,
85    /// Frame counters, track count, view epoch, and restart count.
86    pub metrics: FeedMetrics,
87    /// Source and sink queue depths and capacities.
88    pub queues: QueueTelemetry,
89    /// Decode method selected by the media backend, if known.
90    ///
91    /// `None` until the stream starts and the backend confirms decoder
92    /// negotiation.
93    pub decode: Option<DecodeStatus>,
94    /// Current camera view-system status.
95    pub view: ViewDiagnostics,
96    /// The batch coordinator this feed submits to, if any.
97    ///
98    /// Use this to correlate with [`RuntimeDiagnostics::batches`] for
99    /// the coordinator's metrics.
100    pub batch_processor_id: Option<StageId>,
101}
102
103/// Summary of the camera view-system's current health.
104///
105/// Fixed cameras report `status: ViewStatus::Stable` and `stability_score: 1.0`.
106/// Observed (PTZ/moving) cameras reflect the live epoch policy output.
107#[derive(Debug, Clone, Copy, PartialEq)]
108pub struct ViewDiagnostics {
109    /// Current view epoch — incremented on significant view discontinuities.
110    pub epoch: u64,
111    /// Stability score in `[0.0, 1.0]`. `1.0` = fully stable.
112    pub stability_score: f32,
113    /// High-level view health status.
114    pub status: ViewStatus,
115}
116
/// High-level camera view health.
///
/// This is a diagnostic summary of the underlying [`ContextValidity`]
/// (from `nv-view`) — intentionally simpler to avoid forcing downstream
/// consumers to depend on the view crate.
///
/// [`ContextValidity`]: nv_view::ContextValidity
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ViewStatus {
    /// View is stable; temporal context is valid.
    Stable,
    /// View is changing; temporal context is degraded.
    Degraded,
    /// View has changed significantly; prior context is invalid.
    Invalid,
}
133
// ---------------------------------------------------------------------------
// Batch diagnostics
// ---------------------------------------------------------------------------
141
142/// Diagnostics snapshot for a batch coordinator.
143///
144/// One entry per coordinator created via `Runtime::create_batch()`.
145/// Included in [`RuntimeDiagnostics::batches`].
146#[derive(Debug, Clone)]
147pub struct BatchDiagnostics {
148    /// The processor's unique stage ID.
149    pub processor_id: StageId,
150    /// Live metrics snapshot (counters, timing, error state).
151    pub metrics: BatchMetrics,
152}
153
154// ---------------------------------------------------------------------------
155// Output lag diagnostics
156// ---------------------------------------------------------------------------
157
/// Snapshot of the output broadcast channel's saturation state.
///
/// Obtained from the runtime's internal sentinel-based lag detector.
/// A non-zero `pending_lost` during `in_lag == true` indicates the
/// channel is saturated and subscribers may be losing messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputLagStatus {
    /// Whether the output channel is currently saturated.
    pub in_lag: bool,
    /// Messages lost (sentinel-observed) since the last emitted
    /// `HealthEvent::OutputLagged` event. Non-zero only during
    /// active saturation.
    pub pending_lost: u64,
}
172
173// ---------------------------------------------------------------------------
174// Runtime-wide diagnostics
175// ---------------------------------------------------------------------------
176
177/// Consolidated runtime-wide diagnostics snapshot.
178///
179/// Provides a one-call overview of every feed, batch coordinator, and
180/// output channel health. Fields are read from independent sources, so
181/// the snapshot is best-effort coherent — not transactionally consistent.
182///
183/// Obtained via [`Runtime::diagnostics()`](crate::Runtime::diagnostics)
184/// or [`RuntimeHandle::diagnostics()`](crate::RuntimeHandle::diagnostics).
185#[derive(Debug, Clone)]
186pub struct RuntimeDiagnostics {
187    /// Elapsed time since the runtime was created.
188    pub uptime: Duration,
189    /// Number of currently active feeds.
190    pub feed_count: usize,
191    /// Maximum allowed concurrent feeds.
192    pub max_feeds: usize,
193    /// Per-feed diagnostics, sorted by [`FeedId`] for stable iteration.
194    pub feeds: Vec<FeedDiagnostics>,
195    /// Diagnostics for each batch coordinator owned by this runtime.
196    pub batches: Vec<BatchDiagnostics>,
197    /// Current output broadcast channel saturation status.
198    pub output_lag: OutputLagStatus,
199    /// Number of threads that were detached due to join timeouts and
200    /// have not yet been reaped. Non-zero values indicate threads that
201    /// are still blocked (e.g. in `OutputSink::emit()` or a batch
202    /// processor). Monitored and reaped on each diagnostics poll.
203    pub detached_thread_count: usize,
204}
205
#[cfg(test)]
mod tests {
    use super::*;

    /// `ViewStatus` variants compare equal only to themselves.
    #[test]
    fn view_status_equality() {
        assert_eq!(ViewStatus::Stable, ViewStatus::Stable);
        assert_ne!(ViewStatus::Stable, ViewStatus::Degraded);
        assert_ne!(ViewStatus::Degraded, ViewStatus::Invalid);
    }

    /// The derived `Debug` output exposes both the status variant name
    /// and the stability score.
    #[test]
    fn view_diagnostics_debug() {
        let v = ViewDiagnostics {
            epoch: 3,
            stability_score: 0.75,
            status: ViewStatus::Degraded,
        };
        let dbg = format!("{v:?}");
        assert!(dbg.contains("Degraded"));
        assert!(dbg.contains("0.75"));
    }
}