// Source: sc_observability_types/health.rs

use std::path::PathBuf;
use std::sync::Arc;

use serde::{Deserialize, Serialize};

use crate::{DiagnosticSummary, SinkName, telemetry_health_provider_sealed};
7
8/// Top-level health state for the lightweight logging layer.
9#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
10pub enum LoggingHealthState {
11    /// Logging is operating normally.
12    Healthy,
13    /// Logging is operating but dropping some events or flushes.
14    DegradedDropping,
15    /// Logging is unavailable.
16    Unavailable,
17}
18
19/// Health state for an individual log sink.
20#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
21pub enum SinkHealthState {
22    /// The sink is operating normally.
23    Healthy,
24    /// The sink is operating but dropping writes.
25    DegradedDropping,
26    /// The sink is unavailable.
27    Unavailable,
28}
29
30/// Health summary for one concrete logging sink.
31#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
32pub struct SinkHealth {
33    /// Stable sink name.
34    pub name: SinkName,
35    /// Current sink health state.
36    pub state: SinkHealthState,
37    /// Optional last sink error summary.
38    pub last_error: Option<DiagnosticSummary>,
39}
40
41/// Aggregate logging health report.
42#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
43pub struct LoggingHealthReport {
44    /// Aggregate logging health state.
45    pub state: LoggingHealthState,
46    /// Total dropped log events.
47    pub dropped_events_total: u64,
48    /// Total flush failures.
49    pub flush_errors_total: u64,
50    /// Active JSONL log path used by the logger.
51    pub active_log_path: PathBuf,
52    /// Per-sink health snapshots.
53    pub sink_statuses: Vec<SinkHealth>,
54    /// Optional query/follow health snapshot.
55    pub query: Option<QueryHealthReport>,
56    /// Optional last logging error summary.
57    pub last_error: Option<DiagnosticSummary>,
58}
59
60/// Top-level health state for historical query and follow availability.
61#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
62pub enum QueryHealthState {
63    /// Query and follow are operating normally.
64    Healthy,
65    /// Query and follow are operating with degraded behavior.
66    Degraded,
67    /// Query and follow are unavailable.
68    Unavailable,
69}
70
71/// Aggregate health report for the shared query/follow surface.
72#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
73pub struct QueryHealthReport {
74    /// Aggregate query/follow health state.
75    pub state: QueryHealthState,
76    /// Optional last query/follow error summary.
77    pub last_error: Option<DiagnosticSummary>,
78}
79
80/// Top-level health state for the observation routing runtime.
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
82pub enum ObservationHealthState {
83    /// Routing is operating normally.
84    Healthy,
85    /// Routing is operating with degraded behavior.
86    Degraded,
87    /// Routing is unavailable.
88    Unavailable,
89}
90
91/// Top-level health state for telemetry export.
92#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
93pub enum TelemetryHealthState {
94    /// Telemetry is disabled by configuration.
95    Disabled,
96    /// Telemetry is operating normally.
97    Healthy,
98    /// Telemetry is operating with degraded exporters or dropped data.
99    Degraded,
100    /// Telemetry is unavailable.
101    Unavailable,
102}
103
104/// Health state for an individual telemetry exporter.
105#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
106pub enum ExporterHealthState {
107    /// The exporter is operating normally.
108    Healthy,
109    /// The exporter is operating with degraded behavior.
110    Degraded,
111    /// The exporter is unavailable.
112    Unavailable,
113}
114
115/// Health summary for one configured telemetry exporter.
116#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
117pub struct ExporterHealth {
118    /// Stable exporter name.
119    pub name: SinkName,
120    /// Current exporter health state.
121    pub state: ExporterHealthState,
122    /// Optional last exporter error summary.
123    pub last_error: Option<DiagnosticSummary>,
124}
125
126/// Aggregate telemetry/export health report.
127#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
128pub struct TelemetryHealthReport {
129    /// Aggregate telemetry health state.
130    pub state: TelemetryHealthState,
131    /// Total dropped exports.
132    pub dropped_exports_total: u64,
133    /// Total malformed or incomplete spans observed by telemetry.
134    pub malformed_spans_total: u64,
135    /// Per-exporter health snapshots.
136    pub exporter_statuses: Vec<ExporterHealth>,
137    /// Optional last telemetry error summary.
138    pub last_error: Option<DiagnosticSummary>,
139}
140
141/// Shared contract for exposing telemetry health without an OTLP crate dependency.
142pub trait ObservabilityHealthProvider:
143    telemetry_health_provider_sealed::Sealed + Send + Sync
144{
145    /// Returns the current telemetry health snapshot.
146    fn telemetry_health(&self) -> TelemetryHealthReport;
147}
148
149impl<T> telemetry_health_provider_sealed::Sealed for Arc<T>
150where
151    T: ObservabilityHealthProvider + ?Sized,
152{
153    fn token(&self) -> telemetry_health_provider_sealed::Token {
154        (**self).token()
155    }
156}
157
158impl<T> ObservabilityHealthProvider for Arc<T>
159where
160    T: ObservabilityHealthProvider + ?Sized,
161{
162    fn telemetry_health(&self) -> TelemetryHealthReport {
163        (**self).telemetry_health()
164    }
165}
166
167/// Aggregate routing/runtime health report.
168#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
169pub struct ObservabilityHealthReport {
170    /// Aggregate routing health state.
171    pub state: ObservationHealthState,
172    /// Total observations dropped because no route handled them.
173    pub dropped_observations_total: u64,
174    /// Total subscriber failures recorded by the runtime.
175    pub subscriber_failures_total: u64,
176    /// Total projector failures recorded by the runtime.
177    pub projection_failures_total: u64,
178    /// Optional attached logging health.
179    pub logging: Option<LoggingHealthReport>,
180    /// Optional attached telemetry health.
181    pub telemetry: Option<TelemetryHealthReport>,
182    /// Optional last routing error summary.
183    pub last_error: Option<DiagnosticSummary>,
184}
185
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::{Map, json};

    use crate::{
        Diagnostic, ErrorCode, ExporterHealth, ExporterHealthState, LoggingHealthState,
        QueryHealthState, Remediation, Timestamp,
    };

    /// Builds a diagnostic with every field populated, used as the source for
    /// the error summaries embedded in the health reports below.
    fn diagnostic() -> Diagnostic {
        Diagnostic {
            timestamp: Timestamp::UNIX_EPOCH,
            code: ErrorCode::new_static("SC_TEST_DIAGNOSTIC"),
            message: "diagnostic invalid".to_string(),
            cause: Some("invalid example".to_string()),
            remediation: Remediation::recoverable(
                "fix the input",
                ["rerun the command", "review the docs"],
            ),
            docs: Some("https://example.test/docs".to_string()),
            details: Map::from_iter([("key".to_string(), json!("value"))]),
        }
    }

    /// Summarizes a fresh fixture diagnostic; shared by every `last_error` slot.
    fn summary() -> DiagnosticSummary {
        DiagnosticSummary::from(&diagnostic())
    }

    /// A fully nested report (logging + query + telemetry, with error
    /// summaries) must survive a JSON encode/decode cycle unchanged.
    #[test]
    fn health_reports_round_trip_through_serde() {
        let sink = SinkHealth {
            name: SinkName::new("jsonl").expect("valid sink name"),
            state: SinkHealthState::Healthy,
            last_error: Some(summary()),
        };
        let logging = LoggingHealthReport {
            state: LoggingHealthState::Healthy,
            dropped_events_total: 0,
            flush_errors_total: 0,
            active_log_path: PathBuf::from("logs/service.log.jsonl"),
            sink_statuses: vec![sink],
            query: Some(QueryHealthReport {
                state: QueryHealthState::Healthy,
                last_error: None,
            }),
            last_error: None,
        };
        let telemetry = TelemetryHealthReport {
            state: TelemetryHealthState::Healthy,
            dropped_exports_total: 1,
            malformed_spans_total: 0,
            exporter_statuses: vec![ExporterHealth {
                name: SinkName::new("otlp").expect("valid sink name"),
                state: ExporterHealthState::Degraded,
                last_error: Some(summary()),
            }],
            last_error: Some(summary()),
        };
        let report = ObservabilityHealthReport {
            state: ObservationHealthState::Degraded,
            dropped_observations_total: 2,
            subscriber_failures_total: 3,
            projection_failures_total: 4,
            logging: Some(logging),
            telemetry: Some(telemetry),
            last_error: Some(summary()),
        };

        let encoded = serde_json::to_string(&report).expect("serialize observability health");
        let decoded: ObservabilityHealthReport =
            serde_json::from_str(&encoded).expect("deserialize observability health");
        assert_eq!(decoded, report);
    }
}