Skip to main content

elara_runtime/observability/
mod.rs

1//! Observability module for ELARA Protocol
2//!
3//! This module provides structured logging, metrics collection, and distributed tracing
4//! capabilities for production deployments.
5//!
6//! # Features
7//!
8//! - **Structured Logging**: JSON/Pretty/Compact formats with per-module log levels
9//! - **Metrics Collection**: Counters, gauges, and histograms with thread-safe registry
10//! - **Distributed Tracing**: OpenTelemetry integration with Jaeger/Zipkin/OTLP support
11//! - **Unified Initialization**: Single entry point for all observability components
12//!
13//! # Unified Initialization Example
14//!
15//! ```no_run
16//! use elara_runtime::observability::{
17//!     ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput,
18//!     TracingConfig, TracingExporter, MetricsServerConfig, init_observability
19//! };
20//!
21//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
22//! let config = ObservabilityConfig {
23//!     logging: Some(LoggingConfig {
24//!         level: LogLevel::Info,
25//!         format: LogFormat::Json,
26//!         output: LogOutput::Stdout,
27//!     }),
28//!     tracing: Some(TracingConfig {
29//!         service_name: "elara-node".to_string(),
30//!         exporter: TracingExporter::Otlp {
31//!             endpoint: "http://localhost:4317".to_string(),
32//!         },
33//!         sampling_rate: 0.1,
34//!         resource_attributes: vec![],
35//!     }),
36//!     metrics_server: Some(MetricsServerConfig {
37//!         bind_address: "0.0.0.0".to_string(),
38//!         port: 9090,
39//!     }),
40//! };
41//!
42//! let handle = init_observability(config).await?;
43//!
44//! // Use observability throughout your application
45//! tracing::info!("Application started");
46//!
47//! // Graceful shutdown
48//! handle.shutdown().await?;
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! # Individual Component Example
54//!
55//! ```no_run
56//! use elara_runtime::observability::logging::{LoggingConfig, LogLevel, LogFormat, LogOutput, init_logging};
57//!
58//! let config = LoggingConfig {
59//!     level: LogLevel::Info,
60//!     format: LogFormat::Json,
61//!     output: LogOutput::Stdout,
62//! };
63//!
64//! init_logging(config).expect("Failed to initialize logging");
65//! ```
66
67pub mod logging;
68pub mod metrics;
69pub mod metrics_server;
70pub mod tracing;
71
72use std::sync::atomic::{AtomicBool, Ordering};
73use thiserror::Error;
74
75pub use logging::{init_logging, LogFormat, LogLevel, LogOutput, LoggingConfig, LoggingError};
76pub use metrics::{Counter, Gauge, Histogram, MetricsError, MetricsRegistry, NodeMetrics};
77pub use metrics_server::{MetricsServer, MetricsServerConfig, MetricsServerError};
78pub use tracing::{init_tracing, TracingConfig, TracingError, TracingExporter, TracingHandle};
79
/// Process-wide flag recording whether [`init_observability`] has already run.
///
/// `init_observability` atomically swaps this to `true` on entry and rejects the
/// call with [`ObservabilityError::AlreadyInitialized`] when it was already set,
/// which makes initialization one-shot. The flag is cleared again when
/// initialization fails and by [`reset_observability_for_testing`].
static OBSERVABILITY_INITIALIZED: AtomicBool = AtomicBool::new(false);
83
/// Reset the observability initialization flag.
///
/// **WARNING**: This function exists for tests only and should never be called
/// from production code. It clears the global initialization flag and delegates
/// to `logging::reset_logging_for_testing`, so that a test can call
/// [`init_observability`] again.
///
/// # Caveats
///
/// The function is hidden from the generated documentation, but it is *not*
/// gated behind `#[cfg(test)]`: it is compiled into every build and callable by
/// any code that can reach this module. This function itself does not shut down
/// tracing or the metrics server; it only allows initialization to be attempted
/// again, possibly on top of global state left by a previous initialization.
#[doc(hidden)]
pub fn reset_observability_for_testing() {
    OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
    // Also reset logging flag since it's initialized as part of observability
    logging::reset_logging_for_testing();
}
100
101/// Unified configuration for all observability components.
102///
103/// This struct combines configuration for logging, tracing, and metrics server.
104/// All components are optional - set to `None` to disable a component.
105///
106/// # Example
107///
108/// ```no_run
109/// use elara_runtime::observability::{
110///     ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput,
111///     TracingConfig, TracingExporter, MetricsServerConfig
112/// };
113///
114/// // Enable all components
115/// let config = ObservabilityConfig {
116///     logging: Some(LoggingConfig {
117///         level: LogLevel::Info,
118///         format: LogFormat::Json,
119///         output: LogOutput::Stdout,
120///     }),
121///     tracing: Some(TracingConfig {
122///         service_name: "elara-node".to_string(),
123///         exporter: TracingExporter::Otlp {
124///             endpoint: "http://localhost:4317".to_string(),
125///         },
126///         sampling_rate: 0.1,
127///         resource_attributes: vec![],
128///     }),
129///     metrics_server: Some(MetricsServerConfig {
130///         bind_address: "0.0.0.0".to_string(),
131///         port: 9090,
132///     }),
133/// };
134/// ```
135#[derive(Debug, Clone)]
136pub struct ObservabilityConfig {
137    /// Optional logging configuration. If `None`, logging is not initialized.
138    pub logging: Option<LoggingConfig>,
139
140    /// Optional tracing configuration. If `None`, tracing is not initialized.
141    pub tracing: Option<TracingConfig>,
142
143    /// Optional metrics server configuration. If `None`, metrics server is not started.
144    pub metrics_server: Option<MetricsServerConfig>,
145}
146
147impl Default for ObservabilityConfig {
148    fn default() -> Self {
149        Self {
150            logging: None,
151            tracing: None,
152            metrics_server: None,
153        }
154    }
155}
156
/// Handle for managing the observability system lifecycle.
///
/// Returned by [`init_observability`]. The handle owns the tracing handle and the
/// metrics server (when those components were enabled) together with the metrics
/// registry, and offers a single `shutdown` entry point that tears the
/// components down in a coordinated way.
///
/// # Example
///
/// ```no_run
/// # use elara_runtime::observability::{ObservabilityConfig, init_observability};
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let config = ObservabilityConfig::default();
/// let handle = init_observability(config).await?;
///
/// // ... application runs ...
///
/// // Graceful shutdown
/// handle.shutdown().await?;
/// # Ok(())
/// # }
/// ```
pub struct ObservabilityHandle {
    /// Handle for the tracing system; `None` when tracing was not configured.
    tracing_handle: Option<TracingHandle>,

    /// The metrics HTTP server; `None` when no metrics server was configured.
    metrics_server: Option<MetricsServer>,

    /// Metrics registry (always created, even if server is not started)
    metrics_registry: MetricsRegistry,
}
188
189impl ObservabilityHandle {
190    /// Shutdown all observability components gracefully.
191    ///
192    /// This method:
193    /// 1. Shuts down the tracing system (flushes pending spans)
194    /// 2. Shuts down the metrics server (stops HTTP server)
195    ///
196    /// After calling this method, no more telemetry will be exported.
197    ///
198    /// # Errors
199    ///
200    /// Returns an error if any component fails to shut down gracefully.
201    ///
202    /// # Example
203    ///
204    /// ```no_run
205    /// # use elara_runtime::observability::{ObservabilityConfig, init_observability};
206    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
207    /// # let config = ObservabilityConfig::default();
208    /// let handle = init_observability(config).await?;
209    ///
210    /// // Graceful shutdown
211    /// handle.shutdown().await?;
212    /// # Ok(())
213    /// # }
214    /// ```
215    pub async fn shutdown(mut self) -> Result<(), ObservabilityError> {
216        // Shutdown tracing first (flush pending spans)
217        if let Some(tracing_handle) = self.tracing_handle.take() {
218            tracing_handle
219                .shutdown()
220                .await
221                .map_err(ObservabilityError::TracingShutdown)?;
222        }
223
224        // Shutdown metrics server
225        if let Some(mut metrics_server) = self.metrics_server.take() {
226            metrics_server.shutdown().await;
227        }
228
229        Ok(())
230    }
231
232    /// Returns a reference to the metrics registry.
233    ///
234    /// The metrics registry is always available, even if the metrics server
235    /// was not started. This allows applications to collect metrics without
236    /// exposing them via HTTP.
237    ///
238    /// # Example
239    ///
240    /// ```no_run
241    /// # use elara_runtime::observability::{ObservabilityConfig, init_observability};
242    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
243    /// # let config = ObservabilityConfig::default();
244    /// let handle = init_observability(config).await?;
245    ///
246    /// // Access metrics registry
247    /// let counter = handle.metrics_registry().register_counter("my_counter", vec![]);
248    /// counter.inc();
249    /// # Ok(())
250    /// # }
251    /// ```
252    pub fn metrics_registry(&self) -> &MetricsRegistry {
253        &self.metrics_registry
254    }
255
256    /// Returns true if the metrics server is running.
257    pub fn is_metrics_server_running(&self) -> bool {
258        self.metrics_server
259            .as_ref()
260            .map_or(false, |s| s.is_running())
261    }
262}
263
/// Errors that can occur during observability initialization or shutdown.
///
/// The `*Init` / `*Start` variants wrap the component-specific error and can be
/// produced from it via `From` (and therefore via `?`).
#[derive(Debug, Error)]
pub enum ObservabilityError {
    /// Observability system has already been initialized
    #[error("Observability system already initialized")]
    AlreadyInitialized,

    /// Failed to initialize logging
    #[error("Failed to initialize logging: {0}")]
    LoggingInit(#[from] LoggingError),

    /// Failed to initialize tracing
    #[error("Failed to initialize tracing: {0}")]
    TracingInit(#[from] TracingError),

    /// Failed to start metrics server
    #[error("Failed to start metrics server: {0}")]
    MetricsServerStart(#[from] MetricsServerError),

    /// Failed to shutdown tracing
    // No `#[from]` here: `TracingError` already converts into `TracingInit`,
    // so this variant has to be constructed explicitly.
    #[error("Failed to shutdown tracing: {0}")]
    TracingShutdown(TracingError),
}
287
288/// Initialize the unified observability system.
289///
290/// This function provides a single entry point for initializing all observability
291/// components (logging, tracing, metrics server). It is idempotent - calling it
292/// multiple times will return an error after the first call.
293///
294/// # Components
295///
296/// - **Logging**: Structured logging with configurable format and output
297/// - **Tracing**: Distributed tracing with OpenTelemetry support
298/// - **Metrics Server**: HTTP server exposing Prometheus metrics
299///
300/// All components are optional. Set a component to `None` in the config to disable it.
301///
302/// # Arguments
303///
304/// * `config` - Unified observability configuration
305///
306/// # Returns
307///
308/// * `Ok(ObservabilityHandle)` - Handle for graceful shutdown and metrics access
309/// * `Err(ObservabilityError)` - If initialization fails or already initialized
310///
311/// # Idempotency
312///
313/// This function can only be called once per process. Subsequent calls will return
314/// `Err(ObservabilityError::AlreadyInitialized)`. This ensures that global state
315/// (logging subscriber, tracer provider) is only set once.
316///
317/// # Example
318///
319/// ```no_run
320/// use elara_runtime::observability::{
321///     ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput,
322///     TracingConfig, TracingExporter, MetricsServerConfig, init_observability
323/// };
324///
325/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
326/// let config = ObservabilityConfig {
327///     logging: Some(LoggingConfig {
328///         level: LogLevel::Info,
329///         format: LogFormat::Json,
330///         output: LogOutput::Stdout,
331///     }),
332///     tracing: Some(TracingConfig {
333///         service_name: "elara-node".to_string(),
334///         exporter: TracingExporter::Otlp {
335///             endpoint: "http://localhost:4317".to_string(),
336///         },
337///         sampling_rate: 0.1,
338///         resource_attributes: vec![
339///             ("environment".to_string(), "production".to_string()),
340///         ],
341///     }),
342///     metrics_server: Some(MetricsServerConfig {
343///         bind_address: "0.0.0.0".to_string(),
344///         port: 9090,
345///     }),
346/// };
347///
348/// let handle = init_observability(config).await?;
349///
350/// // Use observability throughout your application
351/// tracing::info!("Application started");
352///
353/// // Graceful shutdown
354/// handle.shutdown().await?;
355/// # Ok(())
356/// # }
357/// ```
358///
359/// # Minimal Example (Logging Only)
360///
361/// ```no_run
362/// use elara_runtime::observability::{
363///     ObservabilityConfig, LoggingConfig, LogLevel, LogFormat, LogOutput, init_observability
364/// };
365///
366/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
367/// let config = ObservabilityConfig {
368///     logging: Some(LoggingConfig {
369///         level: LogLevel::Info,
370///         format: LogFormat::Pretty,
371///         output: LogOutput::Stdout,
372///     }),
373///     tracing: None,
374///     metrics_server: None,
375/// };
376///
377/// let handle = init_observability(config).await?;
378/// # Ok(())
379/// # }
380/// ```
381pub async fn init_observability(
382    config: ObservabilityConfig,
383) -> Result<ObservabilityHandle, ObservabilityError> {
384    // Check if already initialized (idempotency)
385    if OBSERVABILITY_INITIALIZED.swap(true, Ordering::SeqCst) {
386        return Err(ObservabilityError::AlreadyInitialized);
387    }
388
389    // Initialize logging if configured
390    if let Some(logging_config) = config.logging {
391        match init_logging(logging_config) {
392            Ok(()) => {},
393            Err(LoggingError::AlreadyInitialized) => {
394                // In tests, logging might already be initialized by a previous test
395                // This is acceptable since tests run serially with #[serial]
396            },
397            Err(e) => {
398                // Reset flag on error
399                OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
400                return Err(ObservabilityError::LoggingInit(e));
401            }
402        }
403    }
404
405    // Initialize tracing if configured
406    let tracing_handle = if let Some(tracing_config) = config.tracing {
407        Some(init_tracing(tracing_config).await.map_err(|e| {
408            // Reset flag on error
409            OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
410            ObservabilityError::TracingInit(e)
411        })?)
412    } else {
413        None
414    };
415
416    // Create metrics registry (always created, even if server is not started)
417    let metrics_registry = MetricsRegistry::new();
418
419    // Start metrics server if configured
420    let metrics_server = if let Some(metrics_server_config) = config.metrics_server {
421        let mut server = MetricsServer::new(metrics_server_config, metrics_registry.clone());
422        server.start().await.map_err(|e| {
423            // Reset flag on error
424            OBSERVABILITY_INITIALIZED.store(false, Ordering::SeqCst);
425            ObservabilityError::MetricsServerStart(e)
426        })?;
427        Some(server)
428    } else {
429        None
430    };
431
432    Ok(ObservabilityHandle {
433        tracing_handle,
434        metrics_server,
435        metrics_registry,
436    })
437}