Skip to main content

taceo_nodes_observability/
lib.rs

1#![deny(missing_docs)]
2//! Telemetry setup for the MPC-nodes.
3//!
4//! This module centralizes configuration and initialization of observability:
5//!
6//! * Reading service name, tracing endpoint and metrics exporter settings
7//!   from environment variables into [`TracingConfig`], [`MetricsConfig`] and
8//!   related structs.
9//! * Setting up logging/tracing (Datadog or a default `tracing-subscriber`).
10//! * Installing metrics exporters (Datadog, StatsD or Prometheus) based on
11//!   the chosen [`MetricsConfig`].
12//!
13//! Call [`initialize_tracing`] once at startup to configure tracing and metrics.
14
15use eyre::Context;
16use metrics_exporter_dogstatsd::DogStatsDBuilder;
17use secrecy::{ExposeSecret, SecretString};
18use std::net::SocketAddr;
19use std::str::FromStr;
20use std::time::Duration;
21use std::{backtrace::Backtrace, panic};
22use telemetry_batteries::tracing::{TracingShutdownHandle, datadog::DatadogBattery};
23use tracing_subscriber::{
24    EnvFilter,
25    fmt::{self},
26};
27use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
28
29/// Configuration for telemetry (tracing + metrics) of the service.
30///
31/// Typically constructed from environment variables via [`TracingConfig::try_from_env`]
32/// and passed to [`initialize_tracing`] during startup.
33#[derive(Debug, Clone)]
34pub struct TracingConfig {
35    /// Service name - used for logging, Datadog metrics and tracing
36    pub service_name: Option<String>,
37    /// Traces
38    pub traces_endpoint: Option<String>,
39    /// Metrics
40    pub metrics: Option<MetricsConfig>,
41}
42
43impl TracingConfig {
44    /// Build a [`TracingConfig`] from environment variables.
45    ///
46    /// Looks for:
47    /// * `TRACING_SERVICE_NAME`
48    /// * `TRACING_ENDPOINT`
49    ///
50    /// plus metrics-related variables for [`MetricsConfig`].
51    pub fn try_from_env() -> eyre::Result<Self> {
52        let service_name = match std::env::var("TRACING_SERVICE_NAME") {
53            Ok(name) => Some(name),
54            Err(std::env::VarError::NotPresent) => None,
55            Err(e) => {
56                eyre::bail!("Failed to read SERVICE_NAME from environment: {}", e);
57            }
58        };
59        let traces_endpoint = match std::env::var("TRACING_ENDPOINT") {
60            Ok(endpoint) => Some(endpoint),
61            Err(std::env::VarError::NotPresent) => None,
62            Err(e) => {
63                eyre::bail!("Failed to read TRACING_ENDPOINT from environment: {}", e);
64            }
65        };
66
67        let metrics_config = MetricsConfig::try_from_env()?;
68
69        Ok(Self {
70            service_name,
71            traces_endpoint,
72            metrics: metrics_config,
73        })
74    }
75}
76
77/// Metrics exporter configuration.
78///
79/// Decides which backend (Datadog, StatsD or Prometheus) to use.
80#[derive(Debug, Clone)]
81pub enum MetricsConfig {
82    /// Datadog config
83    Datadog(DatadogMetricsConfig),
84    /// StatsD config
85    StatsD(StatsDMetricsConfig),
86    /// Prometheus config
87    Prometheus(PrometheusMetricsConfig),
88}
89
90impl MetricsConfig {
91    /// Build a [`MetricsConfig`] from environment variables.
92    ///
93    /// Reads `METRICS_EXPORTER` to decide the backend and delegates to the
94    /// corresponding `try_from_env` for the chosen type.
95    pub fn try_from_env() -> eyre::Result<Option<Self>> {
96        match std::env::var("METRICS_EXPORTER") {
97            Ok(choice) => match choice.trim().to_lowercase().as_str() {
98                "datadog" => Ok(Some(Self::Datadog(
99                    DatadogMetricsConfig::try_from_env()
100                        .context("during constructing Datadog metrics exporter from environment")?,
101                ))),
102                "statsd" => Ok(Some(Self::StatsD(
103                    StatsDMetricsConfig::try_from_env()
104                        .context("during constructing StatsD metrics exporter from environment")?,
105                ))),
106                "prometheus" => Ok(Some(Self::Prometheus(
107                    PrometheusMetricsConfig::try_from_env().context(
108                        "during constructing Prometheus metrics exporter from environment",
109                    )?,
110                ))),
111                _ => eyre::bail!(
112                    "environment: METRICS_EXPORTER must be \"datadog\", \"statsd\", or \"prometheus\", not \"{}\"",
113                    choice
114                ),
115            },
116            Err(std::env::VarError::NotPresent) => Ok(None),
117            Err(e) => {
118                eyre::bail!("Failed to read METRICS_EXPORTER from environment: {}", e);
119            }
120        }
121    }
122}
123
124/// Datadog metrics exporter configuration (DogStatsD).
125#[derive(Debug, Clone)]
126pub struct DatadogMetricsConfig {
127    pub(crate) host: String,
128    pub(crate) port: u16,
129    pub(crate) prefix: Option<String>,
130}
131
132impl DatadogMetricsConfig {
133    /// Build a [`DatadogMetricsConfig`] from environment variables:
134    /// * `METRICS_DATADOG_HOST`
135    /// * `METRICS_DATADOG_PORT` (optional, defaults to 8125)
136    /// * `METRICS_DATADOG_PREFIX` (optional)
137    pub fn try_from_env() -> eyre::Result<Self> {
138        let host = match std::env::var("METRICS_DATADOG_HOST") {
139            Ok(host) => host,
140            Err(e) => {
141                eyre::bail!(
142                    "Failed to read METRICS_DATADOG_HOST from environment: {}",
143                    e
144                );
145            }
146        };
147        let port = match std::env::var("METRICS_DATADOG_PORT") {
148            Ok(port) => match port.parse() {
149                Ok(port) => port,
150                Err(e) => {
151                    eyre::bail!("Failed to parse port from METRICS_DATADOG_PORT: {}", e);
152                }
153            },
154            Err(std::env::VarError::NotPresent) => 8125u16,
155            Err(e) => {
156                eyre::bail!(
157                    "Failed to read METRICS_DATADOG_PORT from environment: {}",
158                    e
159                );
160            }
161        };
162        let prefix = match std::env::var("METRICS_DATADOG_PREFIX") {
163            Ok(prefix) => Some(prefix),
164            Err(std::env::VarError::NotPresent) => None,
165            Err(e) => {
166                eyre::bail!(
167                    "Failed to read METRICS_DATADOG_PREFIX from environment: {}",
168                    e
169                );
170            }
171        };
172        Ok(Self { host, port, prefix })
173    }
174}
175
176/// StatsD metrics exporter configuration.
177#[derive(Debug, Clone)]
178pub struct StatsDMetricsConfig {
179    pub(crate) host: String,
180    pub(crate) port: u16,
181    pub(crate) prefix: Option<String>,
182    pub(crate) queue_size: Option<usize>,
183    pub(crate) buffer_size: Option<usize>,
184}
185
186impl StatsDMetricsConfig {
187    /// Build a [`StatsDMetricsConfig`] from environment variables:
188    /// * `METRICS_STATSD_HOST` / `PORT` (port defaults to 8125)
189    /// * Optional `PREFIX`, `QUEUE_SIZE`, `BUFFER_SIZE`
190    pub fn try_from_env() -> eyre::Result<Self> {
191        let host = match std::env::var("METRICS_STATSD_HOST") {
192            Ok(host) => host,
193            Err(e) => {
194                eyre::bail!("Failed to read METRICS_STATSD_HOST from environment: {}", e);
195            }
196        };
197        let port = match std::env::var("METRICS_STATSD_PORT") {
198            Ok(port) => match port.parse() {
199                Ok(port) => port,
200                Err(e) => {
201                    eyre::bail!("Failed to parse port from METRICS_STATSD_PORT: {}", e);
202                }
203            },
204            Err(std::env::VarError::NotPresent) => 8125u16,
205            Err(e) => {
206                eyre::bail!("Failed to read METRICS_STATSD_PORT from environment: {}", e);
207            }
208        };
209        let prefix = match std::env::var("METRICS_STATSD_PREFIX") {
210            Ok(prefix) => Some(prefix),
211            Err(std::env::VarError::NotPresent) => None,
212            Err(e) => {
213                eyre::bail!(
214                    "Failed to read METRICS_STATSD_PREFIX from environment: {}",
215                    e
216                );
217            }
218        };
219        let queue_size = match std::env::var("METRICS_STATSD_QUEUE_SIZE") {
220            Ok(queue_size) => Some(
221                queue_size
222                    .parse()
223                    .context("during reading METRICS_STATSD_QUEUE_SIZE from environment")?,
224            ),
225            Err(std::env::VarError::NotPresent) => None,
226            Err(e) => {
227                eyre::bail!(
228                    "Failed to read METRICS_STATSD_QUEUE_SIZE from environment: {}",
229                    e
230                );
231            }
232        };
233        let buffer_size = match std::env::var("METRICS_STATSD_BUFFER_SIZE") {
234            Ok(buffer_size) => Some(
235                buffer_size
236                    .parse()
237                    .context("during reading METRICS_STATSD_BUFFER_SIZE from environment")?,
238            ),
239            Err(std::env::VarError::NotPresent) => None,
240            Err(e) => {
241                eyre::bail!(
242                    "Failed to read METRICS_STATSD_BUFFER_SIZE from environment: {}",
243                    e
244                );
245            }
246        };
247        Ok(Self {
248            host,
249            port,
250            prefix,
251            queue_size,
252            buffer_size,
253        })
254    }
255}
256
257/// Prometheus metrics exporter configuration.
258#[derive(Debug, Clone)]
259pub enum PrometheusMetricsConfig {
260    /// Prometheus scrape endpoint (the service exposes metrics over HTTP).
261    Scrape(ScrapePrometheusMetricsConfig),
262    /// Push mode (service pushes metrics to a gateway).
263    Push(PushPrometheusMetricsConfig),
264}
265
266impl PrometheusMetricsConfig {
267    /// Build a [`PrometheusMetricsConfig`] from environment variables:
268    /// * `METRICS_PROMETHEUS_MODE` (must be `scrape` or `push`)
269    ///
270    /// plus mode-specific variables.
271    pub fn try_from_env() -> eyre::Result<Self> {
272        match std::env::var("METRICS_PROMETHEUS_MODE") {
273            Ok(choice) => match choice.trim().to_lowercase().as_str() {
274                "scrape" => Ok(Self::Scrape(ScrapePrometheusMetricsConfig::try_from_env()?)),
275                "push" => Ok(Self::Push(PushPrometheusMetricsConfig::try_from_env()?)),
276                _ => eyre::bail!(
277                    "environment: METRICS_PROMETHEUS_MODE must be \"scrape\" or \"push\", not \"{}\"",
278                    choice
279                ),
280            },
281            Err(e) => {
282                eyre::bail!(
283                    "Failed to read METRICS_PROMETHEUS_MODE from environment: {}",
284                    e
285                );
286            }
287        }
288    }
289}
290
291/// Scrape mode Prometheus metrics configuration.
292#[derive(Debug, Clone)]
293pub struct ScrapePrometheusMetricsConfig {
294    pub(crate) bind_addr: Option<SocketAddr>,
295}
296
297impl ScrapePrometheusMetricsConfig {
298    /// Build a [`ScrapePrometheusMetricsConfig`] from environment variable
299    /// `METRICS_PROMETHEUS_BIND_ADDR` (optional).
300    pub fn try_from_env() -> eyre::Result<Self> {
301        match std::env::var("METRICS_PROMETHEUS_BIND_ADDR") {
302            Ok(bind_addr) => Ok(ScrapePrometheusMetricsConfig {
303                bind_addr: Some(
304                    bind_addr
305                        .parse()
306                        .context("during reading METRICS_PROMETHEUS_BIND_ADDR from environment")?,
307                ),
308            }),
309            Err(std::env::VarError::NotPresent) => {
310                Ok(ScrapePrometheusMetricsConfig { bind_addr: None })
311            }
312            Err(e) => {
313                eyre::bail!(
314                    "Failed to read METRICS_PROMETHEUS_BIND_ADDR from environment: {}",
315                    e
316                );
317            }
318        }
319    }
320}
321
322/// Push mode Prometheus metrics configuration.
323#[derive(Debug, Clone)]
324pub struct PushPrometheusMetricsConfig {
325    pub(crate) endpoint: String,
326    pub(crate) interval: Duration,
327    pub(crate) username: Option<SecretString>,
328    pub(crate) password: Option<SecretString>,
329    pub(crate) use_http_post_method: bool,
330}
331impl PushPrometheusMetricsConfig {
332    /// Build a [`PushPrometheusMetricsConfig`] from environment variables:
333    /// `METRICS_PROMETHEUS_ENDPOINT`, `INTERVAL`, `USERNAME`, `PASSWORD`,
334    /// `USE_HTTP_POST_METHOD`.
335    pub fn try_from_env() -> eyre::Result<Self> {
336        let endpoint = match std::env::var("METRICS_PROMETHEUS_ENDPOINT") {
337            Ok(endpoint) => endpoint,
338            Err(e) => {
339                eyre::bail!(
340                    "Failed to read METRICS_PROMETHEUS_ENDPOINT from environment: {}",
341                    e
342                );
343            }
344        };
345        let interval = match std::env::var("METRICS_PROMETHEUS_INTERVAL") {
346            Ok(interval) => {
347                std::time::Duration::from(humantime::Duration::from_str(&interval).context(
348                    "During parsing METRICS_PROMETHEUS_INTERVAL from env: \
349                              Expecting a duration string such as \"1h 24min\", \"29s\", ..",
350                )?)
351            }
352            Err(e) => {
353                eyre::bail!(
354                    "Failed to read METRICS_PROMETHEUS_INTERVAL from environment: {}",
355                    e
356                );
357            }
358        };
359        let username = match std::env::var("METRICS_PROMETHEUS_USERNAME") {
360            Ok(username) => Some(SecretString::from(username)),
361            Err(std::env::VarError::NotPresent) => None,
362            Err(e) => {
363                eyre::bail!(
364                    "Failed to read METRICS_PROMETHEUS_USERNAME from environment: {}",
365                    e
366                );
367            }
368        };
369        let password = match std::env::var("METRICS_PROMETHEUS_PASSWORD") {
370            Ok(password) => Some(SecretString::from(password)),
371            Err(std::env::VarError::NotPresent) => None,
372            Err(e) => {
373                eyre::bail!(
374                    "Failed to read METRICS_PROMETHEUS_PASSWORD from environment: {}",
375                    e
376                );
377            }
378        };
379        let use_http_post_method = match std::env::var("METRICS_PROMETHEUS_USE_HTTP_POST_METHOD") {
380            Ok(use_http_post_method) => use_http_post_method.parse().context(
381                "during reading METRICS_PROMETHEUS_USE_HTTP_POST_METHOD from environment (expecting bool)",
382            )?,
383            Err(std::env::VarError::NotPresent) => false,
384            Err(e) => {
385                eyre::bail!(
386                    "Failed to read METRICS_PROMETHEUS_USE_HTTP_POST_METHOD from environment: {}",
387                    e
388                );
389            }
390        };
391        Ok(PushPrometheusMetricsConfig {
392            endpoint,
393            interval,
394            username,
395            password,
396            use_http_post_method,
397        })
398    }
399}
400
401/// Initialize metrics exporter according to [`MetricsConfig`].
402///
403/// Called internally by [`initialize_tracing`] once configuration is loaded.
404pub fn initialize_metrics(config: &MetricsConfig) -> eyre::Result<()> {
405    match config {
406        MetricsConfig::Datadog(datadog_conf) => {
407            tracing::debug!("Setting up Datadog metrics exporter ..");
408            let mut builder = DogStatsDBuilder::default()
409                .with_remote_address(format!("{}:{}", &datadog_conf.host, datadog_conf.port))?
410                .send_histograms_as_distributions(true);
411            if let Some(prefix) = &datadog_conf.prefix {
412                builder = builder.set_global_prefix(prefix);
413            };
414            builder.install()?;
415        }
416        MetricsConfig::StatsD(statsd_conf) => {
417            tracing::debug!("Setting up StatsD metrics exporter ..");
418            let builder = metrics_exporter_statsd::StatsdBuilder::from(
419                statsd_conf.host.to_owned(),
420                statsd_conf.port,
421            );
422            let builder = {
423                if let Some(buffer_size) = statsd_conf.buffer_size {
424                    builder.with_buffer_size(buffer_size)
425                } else {
426                    builder
427                }
428            };
429            let builder = {
430                if let Some(queue_size) = statsd_conf.queue_size {
431                    builder.with_queue_size(queue_size)
432                } else {
433                    builder
434                }
435            };
436            let recorder = builder
437                .build(statsd_conf.prefix.as_deref())
438                .context("during building StatsD metrics exporter")?;
439            metrics::set_global_recorder(recorder)
440                .context("during setting StatsD metrics exporter as global recorder")?;
441        }
442        MetricsConfig::Prometheus(prometheus_conf) => match prometheus_conf {
443            PrometheusMetricsConfig::Scrape(scrape_conf) => {
444                tracing::debug!("Setting up Prometheus scrape metrics exporter ..");
445                let builder = if let Some(bind_addr) = scrape_conf.bind_addr {
446                    metrics_exporter_prometheus::PrometheusBuilder::new()
447                        .with_http_listener(bind_addr)
448                } else {
449                    metrics_exporter_prometheus::PrometheusBuilder::new()
450                };
451                builder.install().context(
452                    "during installing Prometheus scrape metrics exporter as global recorder",
453                )?;
454            }
455            PrometheusMetricsConfig::Push(push_conf) => {
456                tracing::debug!("Setting up Prometheus push metrics exporter ..");
457                metrics_exporter_prometheus::PrometheusBuilder::new()
458                    .with_push_gateway(
459                        &push_conf.endpoint,
460                        push_conf.interval,
461                        push_conf
462                            .username
463                            .to_owned()
464                            .map(|x| x.expose_secret().to_owned()),
465                        push_conf
466                            .password
467                            .to_owned()
468                            .map(|x| x.expose_secret().to_owned()),
469                        push_conf.use_http_post_method,
470                    )
471                    .context("during building Prometheus push metrics exporter")?
472                    .install()
473                    .context(
474                        "during installing Prometheus push metrics exporter as global recorder",
475                    )?;
476            }
477        },
478    };
479    Ok(())
480}
481
482/// Initializes structured logging/tracing for the service.
483///
484/// Depending on the [`TracingConfig`]:
485///
486/// * If a `service_name` is set, Datadog tracing is initialized and a custom
487///   panic hook is installed. The hook logs panic messages and their backtraces
488///   as a single line to make them easier to ingest by log aggregators.
489/// * Otherwise, a default `tracing-subscriber` registry with human-readable
490///   formatting and an environment-based filter is installed.
491///
492/// If the configuration also contains metrics settings, [`initialize_metrics`]
493/// is called automatically.
494///
495/// # Returns
496/// - `Ok(Some(handle))` if Datadog tracing was started. The [`TracingShutdownHandle`]
497///   can be used to flush/stop traces on shutdown.
498/// - `Ok(None)` if only the default `tracing` subscriber was set up.
499/// - An error if tracing or metrics initialization failed.
500///
501/// This is intended as a one-time setup call during service startup.
502pub fn initialize_tracing(config: &TracingConfig) -> eyre::Result<Option<TracingShutdownHandle>> {
503    let handle = {
504        if let Some(service_name) = config.service_name.as_deref() {
505            let tracing_shutdown_handle =
506                DatadogBattery::init(config.traces_endpoint.as_deref(), service_name, None, true);
507            // Set a custom panic hook to print backtraces on one line
508            panic::set_hook(Box::new(|panic_info| {
509                let message = match panic_info.payload().downcast_ref::<&str>() {
510                    Some(s) => *s,
511                    None => match panic_info.payload().downcast_ref::<String>() {
512                        Some(s) => s.as_str(),
513                        None => "Unknown panic message",
514                    },
515                };
516                let location = if let Some(location) = panic_info.location() {
517                    format!(
518                        "{}:{}:{}",
519                        location.file(),
520                        location.line(),
521                        location.column()
522                    )
523                } else {
524                    "Unknown location".to_string()
525                };
526
527                let backtrace = Backtrace::capture();
528                let backtrace_string = format!("{backtrace:?}");
529
530                let backtrace_single_line = backtrace_string.replace('\n', " | ");
531
532                tracing::error!(
533                    { backtrace = %backtrace_single_line, location = %location},
534                    "Panic occurred with message: {}",
535                    message
536                );
537            }));
538            Ok(Some(tracing_shutdown_handle))
539        } else {
540            install_tracing("info");
541            Ok(None)
542        }
543    };
544
545    if let Some(metrics_conf) = &config.metrics {
546        initialize_metrics(metrics_conf)?;
547    }
548
549    handle
550}
551
552/// Installs a local `tracing` subscriber with a formatted output layer and an environment filter.
553///
554/// This helper configures global tracing for the application by:
555/// - Building a `tracing_subscriber::fmt::Layer` with targets and line numbers disabled,
556///   producing concise logs.
557/// - Building an `EnvFilter` from either the `RUST_LOG` environment variable or the provided
558///   `env_filter` string as a fallback.
559/// - Registering the combined layers into the global `tracing_subscriber::registry`.
560///
561/// Call this once at startup to initialize tracing before any logs are emitted.
562///
563/// # Parameters
564/// - `env_filter`: Fallback filter string to use if no `RUST_LOG` environment variable is set.
565///
566/// # Panics
567/// Panics if initializing the subscriber fails (this usually happens only if another global
568/// subscriber is already set).
569pub fn install_tracing(env_filter: &str) {
570    let fmt_layer = fmt::layer().with_target(false).with_line_number(false);
571    let filter_layer = EnvFilter::try_from_default_env()
572        .or_else(|_| EnvFilter::try_new(env_filter))
573        .unwrap();
574
575    tracing_subscriber::registry()
576        .with(filter_layer)
577        .with(fmt_layer)
578        .init();
579}