elara_runtime/observability/mod.rs
1//! Observability module for ELARA Protocol
2//!
3//! This module provides structured logging, metrics collection, and distributed tracing
4//! capabilities for production deployments.
5//!
6//! # Features
7//!
8//! - **Structured Logging**: JSON/Pretty/Compact formats with per-module log levels
9//! - **Metrics Collection**: Counters, gauges, and histograms with thread-safe registry
10//! - **Distributed Tracing**: OpenTelemetry integration with Jaeger/Zipkin/OTLP support
11//! - **Unified Initialization**: Single entry point for all observability components
12//!
13//! # Unified Initialization Example
14//!
15//! ```no_run
16//! use elara_runtime::observability::{
17//! ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput,
18//! TracingConfig, TracingExporter, MetricsServerConfig, init_observability
19//! };
20//!
21//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
22//! let config = ObservabilityConfig {
23//! logging: Some(LoggingConfig {
24//! level: LogLevel::Info,
25//! format: LogFormat::Json,
26//! output: LogOutput::Stdout,
27//! }),
28//! tracing: Some(TracingConfig {
29//! service_name: "elara-node".to_string(),
30//! exporter: TracingExporter::Otlp {
31//! endpoint: "http://localhost:4317".to_string(),
32//! },
33//! sampling_rate: 0.1,
34//! resource_attributes: vec![],
35//! }),
36//! metrics_server: Some(MetricsServerConfig {
37//! bind_address: "0.0.0.0".to_string(),
38//! port: 9090,
39//! }),
40//! };
41//!
42//! let handle = init_observability(config).await?;
43//!
44//! // Use observability throughout your application
45//! tracing::info!("Application started");
46//!
47//! // Graceful shutdown
48//! handle.shutdown().await?;
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! # Individual Component Example
54//!
55//! ```no_run
56//! use elara_runtime::observability::logging::{LoggingConfig, LogLevel, LogFormat, LogOutput, init_logging};
57//!
58//! let config = LoggingConfig {
59//! level: LogLevel::Info,
60//! format: LogFormat::Json,
61//! output: LogOutput::Stdout,
62//! };
63//!
64//! init_logging(config).expect("Failed to initialize logging");
65//! ```
66
67pub mod logging;
68pub mod metrics;
69pub mod metrics_server;
70pub mod tracing;
71
72use std::sync::atomic::{AtomicBool, Ordering};
73use thiserror::Error;
74
75pub use logging::{init_logging, LogFormat, LogLevel, LogOutput, LoggingConfig, LoggingError};
76pub use metrics::{Counter, Gauge, Histogram, MetricsError, MetricsRegistry, NodeMetrics};
77pub use metrics_server::{MetricsServer, MetricsServerConfig, MetricsServerError};
78pub use tracing::{init_tracing, TracingConfig, TracingError, TracingExporter, TracingHandle};
79
80/// Global flag to track if observability has been initialized.
81/// This ensures idempotency - init_observability can only be called once.
82static OBSERVABILITY_INITIALIZED: AtomicBool = AtomicBool::new(false);
83
84/// Reset the observability initialization flag.
85///
86/// **WARNING**: This function is only for testing purposes and should never be used
87/// in production code. It allows tests to re-initialize the observability system
88/// by resetting the global initialization flag.
89///
90/// # Safety
91///
92/// This function is only available when running tests. Using this in production
93/// could lead to undefined behavior as it allows multiple initializations of global state.
94#[doc(hidden)]
95pub fn reset_observability_for_testing() {
96 OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
97 // Also reset logging flag since it's initialized as part of observability
98 logging::reset_logging_for_testing();
99}
100
101/// Unified configuration for all observability components.
102///
103/// This struct combines configuration for logging, tracing, and metrics server.
104/// All components are optional - set to `None` to disable a component.
105///
106/// # Example
107///
108/// ```no_run
109/// use elara_runtime::observability::{
110/// ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput,
111/// TracingConfig, TracingExporter, MetricsServerConfig
112/// };
113///
114/// // Enable all components
115/// let config = ObservabilityConfig {
116/// logging: Some(LoggingConfig {
117/// level: LogLevel::Info,
118/// format: LogFormat::Json,
119/// output: LogOutput::Stdout,
120/// }),
121/// tracing: Some(TracingConfig {
122/// service_name: "elara-node".to_string(),
123/// exporter: TracingExporter::Otlp {
124/// endpoint: "http://localhost:4317".to_string(),
125/// },
126/// sampling_rate: 0.1,
127/// resource_attributes: vec![],
128/// }),
129/// metrics_server: Some(MetricsServerConfig {
130/// bind_address: "0.0.0.0".to_string(),
131/// port: 9090,
132/// }),
133/// };
134/// ```
135#[derive(Debug, Clone)]
136pub struct ObservabilityConfig {
137 /// Optional logging configuration. If `None`, logging is not initialized.
138 pub logging: Option<LoggingConfig>,
139
140 /// Optional tracing configuration. If `None`, tracing is not initialized.
141 pub tracing: Option<TracingConfig>,
142
143 /// Optional metrics server configuration. If `None`, metrics server is not started.
144 pub metrics_server: Option<MetricsServerConfig>,
145}
146
147impl Default for ObservabilityConfig {
148 fn default() -> Self {
149 Self {
150 logging: None,
151 tracing: None,
152 metrics_server: None,
153 }
154 }
155}
156
157/// Handle for managing the observability system lifecycle.
158///
159/// This handle provides graceful shutdown for all initialized observability components.
160/// It holds references to the tracing handle and metrics server, allowing coordinated
161/// shutdown of all components.
162///
163/// # Example
164///
165/// ```no_run
166/// # use elara_runtime::observability::{ObservabilityConfig, init_observability};
167/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
168/// let config = ObservabilityConfig::default();
169/// let handle = init_observability(config).await?;
170///
171/// // ... application runs ...
172///
173/// // Graceful shutdown
174/// handle.shutdown().await?;
175/// # Ok(())
176/// # }
177/// ```
178pub struct ObservabilityHandle {
179 /// Handle for the tracing system (if initialized)
180 tracing_handle: Option<TracingHandle>,
181
182 /// Handle for the metrics server (if started)
183 metrics_server: Option<MetricsServer>,
184
185 /// Metrics registry (always created, even if server is not started)
186 metrics_registry: MetricsRegistry,
187}
188
189impl ObservabilityHandle {
190 /// Shutdown all observability components gracefully.
191 ///
192 /// This method:
193 /// 1. Shuts down the tracing system (flushes pending spans)
194 /// 2. Shuts down the metrics server (stops HTTP server)
195 ///
196 /// After calling this method, no more telemetry will be exported.
197 ///
198 /// # Errors
199 ///
200 /// Returns an error if any component fails to shut down gracefully.
201 ///
202 /// # Example
203 ///
204 /// ```no_run
205 /// # use elara_runtime::observability::{ObservabilityConfig, init_observability};
206 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
207 /// # let config = ObservabilityConfig::default();
208 /// let handle = init_observability(config).await?;
209 ///
210 /// // Graceful shutdown
211 /// handle.shutdown().await?;
212 /// # Ok(())
213 /// # }
214 /// ```
215 pub async fn shutdown(mut self) -> Result<(), ObservabilityError> {
216 // Shutdown tracing first (flush pending spans)
217 if let Some(tracing_handle) = self.tracing_handle.take() {
218 tracing_handle
219 .shutdown()
220 .await
221 .map_err(ObservabilityError::TracingShutdown)?;
222 }
223
224 // Shutdown metrics server
225 if let Some(mut metrics_server) = self.metrics_server.take() {
226 metrics_server.shutdown().await;
227 }
228
229 Ok(())
230 }
231
232 /// Returns a reference to the metrics registry.
233 ///
234 /// The metrics registry is always available, even if the metrics server
235 /// was not started. This allows applications to collect metrics without
236 /// exposing them via HTTP.
237 ///
238 /// # Example
239 ///
240 /// ```no_run
241 /// # use elara_runtime::observability::{ObservabilityConfig, init_observability};
242 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
243 /// # let config = ObservabilityConfig::default();
244 /// let handle = init_observability(config).await?;
245 ///
246 /// // Access metrics registry
247 /// let counter = handle.metrics_registry().register_counter("my_counter", vec![]);
248 /// counter.inc();
249 /// # Ok(())
250 /// # }
251 /// ```
252 pub fn metrics_registry(&self) -> &MetricsRegistry {
253 &self.metrics_registry
254 }
255
256 /// Returns true if the metrics server is running.
257 pub fn is_metrics_server_running(&self) -> bool {
258 self.metrics_server
259 .as_ref()
260 .map_or(false, |s| s.is_running())
261 }
262}
263
264/// Errors that can occur during observability initialization or shutdown.
265#[derive(Debug, Error)]
266pub enum ObservabilityError {
267 /// Observability system has already been initialized
268 #[error("Observability system already initialized")]
269 AlreadyInitialized,
270
271 /// Failed to initialize logging
272 #[error("Failed to initialize logging: {0}")]
273 LoggingInit(#[from] LoggingError),
274
275 /// Failed to initialize tracing
276 #[error("Failed to initialize tracing: {0}")]
277 TracingInit(#[from] TracingError),
278
279 /// Failed to start metrics server
280 #[error("Failed to start metrics server: {0}")]
281 MetricsServerStart(#[from] MetricsServerError),
282
283 /// Failed to shutdown tracing
284 #[error("Failed to shutdown tracing: {0}")]
285 TracingShutdown(TracingError),
286}
287
288/// Initialize the unified observability system.
289///
290/// This function provides a single entry point for initializing all observability
291/// components (logging, tracing, metrics server). It is idempotent - calling it
292/// multiple times will return an error after the first call.
293///
294/// # Components
295///
296/// - **Logging**: Structured logging with configurable format and output
297/// - **Tracing**: Distributed tracing with OpenTelemetry support
298/// - **Metrics Server**: HTTP server exposing Prometheus metrics
299///
300/// All components are optional. Set a component to `None` in the config to disable it.
301///
302/// # Arguments
303///
304/// * `config` - Unified observability configuration
305///
306/// # Returns
307///
308/// * `Ok(ObservabilityHandle)` - Handle for graceful shutdown and metrics access
309/// * `Err(ObservabilityError)` - If initialization fails or already initialized
310///
311/// # Idempotency
312///
313/// This function can only be called once per process. Subsequent calls will return
314/// `Err(ObservabilityError::AlreadyInitialized)`. This ensures that global state
315/// (logging subscriber, tracer provider) is only set once.
316///
317/// # Example
318///
319/// ```no_run
320/// use elara_runtime::observability::{
321/// ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput,
322/// TracingConfig, TracingExporter, MetricsServerConfig, init_observability
323/// };
324///
325/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
326/// let config = ObservabilityConfig {
327/// logging: Some(LoggingConfig {
328/// level: LogLevel::Info,
329/// format: LogFormat::Json,
330/// output: LogOutput::Stdout,
331/// }),
332/// tracing: Some(TracingConfig {
333/// service_name: "elara-node".to_string(),
334/// exporter: TracingExporter::Otlp {
335/// endpoint: "http://localhost:4317".to_string(),
336/// },
337/// sampling_rate: 0.1,
338/// resource_attributes: vec![
339/// ("environment".to_string(), "production".to_string()),
340/// ],
341/// }),
342/// metrics_server: Some(MetricsServerConfig {
343/// bind_address: "0.0.0.0".to_string(),
344/// port: 9090,
345/// }),
346/// };
347///
348/// let handle = init_observability(config).await?;
349///
350/// // Use observability throughout your application
351/// tracing::info!("Application started");
352///
353/// // Graceful shutdown
354/// handle.shutdown().await?;
355/// # Ok(())
356/// # }
357/// ```
358///
359/// # Minimal Example (Logging Only)
360///
361/// ```no_run
362/// use elara_runtime::observability::{
363/// ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput, init_observability
364/// };
365///
366/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
367/// let config = ObservabilityConfig {
368/// logging: Some(LoggingConfig {
369/// level: LogLevel::Info,
370/// format: LogFormat::Pretty,
371/// output: LogOutput::Stdout,
372/// }),
373/// tracing: None,
374/// metrics_server: None,
375/// };
376///
377/// let handle = init_observability(config).await?;
378/// # Ok(())
379/// # }
380/// ```
381pub async fn init_observability(
382 config: ObservabilityConfig,
383) -> Result<ObservabilityHandle, ObservabilityError> {
384 // Check if already initialized (idempotency)
385 if OBSERVABILITY_INITIALIZED.swap(true, Ordering::SeqCst) {
386 return Err(ObservabilityError::AlreadyInitialized);
387 }
388
389 // Initialize logging if configured
390 if let Some(logging_config) = config.logging {
391 match init_logging(logging_config) {
392 Ok(()) => {},
393 Err(LoggingError::AlreadyInitialized) => {
394 // In tests, logging might already be initialized by a previous test
395 // This is acceptable since tests run serially with #[serial]
396 },
397 Err(e) => {
398 // Reset flag on error
399 OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
400 return Err(ObservabilityError::LoggingInit(e));
401 }
402 }
403 }
404
405 // Initialize tracing if configured
406 let tracing_handle = if let Some(tracing_config) = config.tracing {
407 Some(init_tracing(tracing_config).await.map_err(|e| {
408 // Reset flag on error
409 OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
410 ObservabilityError::TracingInit(e)
411 })?)
412 } else {
413 None
414 };
415
416 // Create metrics registry (always created, even if server is not started)
417 let metrics_registry = MetricsRegistry::new();
418
419 // Start metrics server if configured
420 let metrics_server = if let Some(metrics_server_config) = config.metrics_server {
421 let mut server = MetricsServer::new(metrics_server_config, metrics_registry.clone());
422 server.start().await.map_err(|e| {
423 // Reset flag on error
424 OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
425 ObservabilityError::MetricsServerStart(e)
426 })?;
427 Some(server)
428 } else {
429 None
430 };
431
432 Ok(ObservabilityHandle {
433 tracing_handle,
434 metrics_server,
435 metrics_registry,
436 })
437}