post_cortex_daemon/daemon/observability.rs
1// Copyright (c) 2025, 2026 Julius ML
2// Licensed under the MIT License. See LICENSE at the workspace root.
3
4//! Observability — `tracing` layer stack + (feature-gated) OpenTelemetry
5//! OTLP exporter wiring.
6//!
7//! Two entry points:
8//!
9//! - [`init`] — install the global subscriber. Call once at daemon
10//! startup before any other tracing event fires. Respects the
11//! following env vars:
12//!
13//! | Var | Default | Effect |
14//! |-----|---------|--------|
15//! | `RUST_LOG` | `info` | `EnvFilter` directives |
16//! | `OTEL_LOG_FORMAT` | `compact` | `compact` / `pretty` / `json` |
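//! | `OTEL_SERVICE_NAME` | `post-cortex` | OTel `service.name` attribute (for the OTLP exporter; not read until that pipeline is wired) |
//! | `OTEL_SERVICE_VERSION` | crate version | OTel `service.version` attribute (for the OTLP exporter; not read until that pipeline is wired) |
//! | `OTEL_EXPORTER_OTLP_ENDPOINT` | _unset_ | When set (non-empty) + `otel` feature on, spans + metrics are meant to export to this gRPC endpoint; today only the startup log line reflects it |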
20//!
21//! - [`shutdown`] — call before process exit so the OTLP exporter
22//! flushes its queue.
23//!
//! When the `otel` feature is OFF, [`init`] still wires the `fmt`
//! layer + `EnvFilter` — only the OTLP layer disappears. Library
//! users that disable OTel pay nothing.
27
28use std::env;
29use tracing::Level;
30use tracing_subscriber::layer::{Layer, SubscriberExt};
31use tracing_subscriber::util::SubscriberInitExt;
32use tracing_subscriber::EnvFilter;
33
34/// Initialise the global tracing subscriber.
35///
36/// Idempotent: subsequent calls are no-ops (logs a warning).
37pub fn init() -> Result<(), TracingInitError> {
38 let filter = EnvFilter::try_from_default_env()
39 .unwrap_or_else(|_| EnvFilter::new(format!("{}", Level::INFO)));
40
41 let format = env::var("OTEL_LOG_FORMAT").unwrap_or_else(|_| "compact".to_string());
42
43 let fmt_layer = match format.as_str() {
44 "json" => tracing_subscriber::fmt::layer()
45 .json()
46 .with_target(true)
47 .boxed(),
48 "pretty" => tracing_subscriber::fmt::layer()
49 .pretty()
50 .with_target(false)
51 .boxed(),
52 _ => tracing_subscriber::fmt::layer()
53 .compact()
54 .with_target(false)
55 .boxed(),
56 };
57
58 tracing_subscriber::registry()
59 .with(filter)
60 .with(fmt_layer)
61 .try_init()?;
62
63 #[cfg(feature = "otel")]
64 {
65 match otel::try_install_global()? {
66 Some(()) => tracing::info!("observability: OTLP exporter active"),
67 None => tracing::info!(
68 "observability: fmt-only (OTEL_EXPORTER_OTLP_ENDPOINT not set)"
69 ),
70 }
71 }
72
73 #[cfg(not(feature = "otel"))]
74 tracing::info!("observability: fmt-only (otel feature disabled)");
75 Ok(())
76}
77
/// Tear down the OpenTelemetry side of the observability stack.
///
/// Meant to run right before process exit so pending spans + metrics can be
/// flushed and the OTel SDK shut down. With the `otel` feature enabled this
/// delegates to the `otel` module's shutdown hook (currently a placeholder);
/// without the feature it does nothing.
///
/// Safe to call even if [`init`] was never called.
pub fn shutdown() {
    #[cfg(feature = "otel")]
    {
        otel::shutdown();
    }
}
85
86/// Errors raised by [`init`].
87#[derive(Debug, thiserror::Error)]
88pub enum TracingInitError {
89 /// Global subscriber was already set.
90 #[error("tracing subscriber already set: {0}")]
91 SubscriberSet(#[from] tracing::dispatcher::SetGlobalDefaultError),
92
93 /// `try_init` failure (subsumes `subscriber set` for stacked
94 /// `with_subscriber` layers).
95 #[error("tracing init failed: {0}")]
96 Init(String),
97
98 /// OTel exporter setup failed.
99 #[error("otlp exporter setup failed: {0}")]
100 Exporter(String),
101}
102
103impl From<tracing_subscriber::util::TryInitError> for TracingInitError {
104 fn from(err: tracing_subscriber::util::TryInitError) -> Self {
105 Self::Init(err.to_string())
106 }
107}
108
#[cfg(feature = "otel")]
mod otel {
    //! OpenTelemetry OTLP hooks, compiled only when the `otel` feature is on.
    //!
    //! Both functions are placeholders: they give the parent module stable
    //! call sites while the real exporter pipeline is still pending.

    use std::env;

    use super::TracingInitError;

    /// Report whether OTLP export was requested through the environment.
    ///
    /// Yields `Ok(Some(()))` when `OTEL_EXPORTER_OTLP_ENDPOINT` holds a
    /// non-empty value and `Ok(None)` in every other case (unset, empty, or
    /// unreadable). It never returns `Err` today.
    ///
    /// Phase 10 ships only the feature flag + this env-var detection. The
    /// actual `opentelemetry-otlp` pipeline + global tracer-provider install
    /// is a follow-up commit gated by Phase 11 bench results, so the otel
    /// layer can be shown not to regress p95 on the hot path. Until then the
    /// return value only steers which state the caller logs.
    pub(super) fn try_install_global() -> Result<Option<()>, TracingInitError> {
        let endpoint = env::var("OTEL_EXPORTER_OTLP_ENDPOINT")
            .ok()
            .filter(|value| !value.is_empty());
        Ok(endpoint.map(|_| ()))
    }

    /// Shutdown hook for the OTel SDK; does nothing yet.
    pub(super) fn shutdown() {
        // Intentionally empty. The real implementation is expected to call
        // opentelemetry_sdk::global::shutdown_tracer_provider() etc.
    }
}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// The global subscriber is process-wide, so another test may already
    /// have installed one. Success and both "already installed" style errors
    /// are therefore all acceptable; only an exporter failure is a bug here.
    #[test]
    fn init_is_idempotent_per_process() {
        match init() {
            // First caller wins; later callers see `SubscriberSet` or `Init`.
            Ok(()) | Err(TracingInitError::SubscriberSet(_) | TracingInitError::Init(_)) => {}
            Err(other @ TracingInitError::Exporter(_)) => panic!("unexpected error: {other}"),
        }
    }
}
155}