Skip to main content

miden_node_utils/
logging.rs

1use std::str::FromStr;
2use std::sync::OnceLock;
3
4use opentelemetry::trace::TracerProvider as _;
5use opentelemetry::{KeyValue, Value};
6use opentelemetry_sdk::Resource;
7use opentelemetry_sdk::propagation::TraceContextPropagator;
8use opentelemetry_sdk::resource::{EnvResourceDetector, TelemetryResourceDetector};
9use opentelemetry_sdk::trace::SdkTracerProvider;
10use tracing::subscriber::Subscriber;
11use tracing_opentelemetry::OpenTelemetryLayer;
12use tracing_subscriber::layer::{Filter, SubscriberExt};
13use tracing_subscriber::{Layer, Registry};
14
15use crate::tracing::OpenTelemetrySpanExt;
16
17/// Global tracer provider for flushing traces on panic.
18///
19/// This is necessary because the panic hook needs access to the tracer provider to flush
20/// pending spans before the program terminates.
21static TRACER_PROVIDER: OnceLock<SdkTracerProvider> = OnceLock::new();
22
23/// Default OpenTelemetry resource attributes for this process.
24#[derive(Clone, Default)]
25pub struct ResourceConfig {
26    service_name: Option<&'static str>,
27    attributes: Vec<(&'static str, &'static str)>,
28}
29
30impl ResourceConfig {
31    #[must_use]
32    pub fn with_name(mut self, service_name: &'static str) -> Self {
33        self.service_name = Some(service_name);
34        self
35    }
36
37    #[must_use]
38    pub fn with_attribute(mut self, key: &'static str, value: &'static str) -> Self {
39        self.attributes.push((key, value));
40        self
41    }
42}
43
44/// Configures [`setup_tracing`] to enable or disable the open-telemetry exporter.
45#[derive(Clone)]
46pub enum OpenTelemetry {
47    Enabled(ResourceConfig),
48    Disabled,
49}
50
51impl OpenTelemetry {
52    pub fn enabled() -> Self {
53        OpenTelemetry::Enabled(ResourceConfig::default())
54    }
55
56    pub fn from_env() -> Self {
57        if otlp_endpoint_configured() {
58            OpenTelemetry::enabled()
59        } else {
60            OpenTelemetry::Disabled
61        }
62    }
63
64    #[must_use]
65    pub fn with_name(self, service_name: &'static str) -> Self {
66        match self {
67            OpenTelemetry::Enabled(config) => {
68                OpenTelemetry::Enabled(config.with_name(service_name))
69            },
70            OpenTelemetry::Disabled => OpenTelemetry::Disabled,
71        }
72    }
73
74    #[must_use]
75    pub fn with_attribute(self, key: &'static str, value: &'static str) -> Self {
76        match self {
77            OpenTelemetry::Enabled(config) => {
78                OpenTelemetry::Enabled(config.with_attribute(key, value))
79            },
80            OpenTelemetry::Disabled => OpenTelemetry::Disabled,
81        }
82    }
83
84    fn is_enabled(&self) -> bool {
85        matches!(self, OpenTelemetry::Enabled(_))
86    }
87
88    fn resource_config(self) -> Option<ResourceConfig> {
89        match self {
90            OpenTelemetry::Enabled(config) => Some(config),
91            OpenTelemetry::Disabled => None,
92        }
93    }
94}
95
96/// A guard that shuts down the tracer provider when dropped. This ensures that the logs are flushed
97/// to the exporter before the program exits.
98pub struct OtelGuard {
99    tracer_provider: SdkTracerProvider,
100}
101
102impl Drop for OtelGuard {
103    fn drop(&mut self) {
104        if let Err(err) = self.tracer_provider.shutdown() {
105            eprintln!("{err:?}");
106        }
107    }
108}
109
110/// Initializes tracing to stdout and optionally an open-telemetry exporter.
111///
112/// Trace filtering defaults to `INFO` and can be configured using the conventional `RUST_LOG`
113/// environment variable.
114///
115/// The open-telemetry configuration is controlled via environment variables as defined in the
116/// [specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#opentelemetry-protocol-exporter)
117///
118/// Registers a panic hook so that panic errors are reported to the open-telemetry exporter.
119///
120/// Returns an [`OtelGuard`] if open-telemetry is enabled, otherwise `None`. When this guard is
121/// dropped, the tracer provider is shutdown.
122pub fn setup_tracing(otel: OpenTelemetry) -> anyhow::Result<Option<OtelGuard>> {
123    if otel.is_enabled() {
124        opentelemetry::global::set_text_map_propagator(TraceContextPropagator::new());
125    }
126
127    // Note: open-telemetry requires a tokio-runtime, so this _must_ be lazily evaluated (aka not
128    // `then_some`) to avoid crashing sync callers (with OpenTelemetry::Disabled set). Examples of
129    // such callers are tests with logging enabled.
130    let tracer_provider = if otel.is_enabled() {
131        let provider = init_tracer_provider(
132            otel.resource_config()
133                .expect("resource config is set when OpenTelemetry is enabled"),
134        )?;
135
136        // Store the provider globally so the panic hook can flush it. SdkTracerProvider is
137        // internally reference-counted, so cloning is cheap.
138        TRACER_PROVIDER
139            .set(provider.clone())
140            .expect("setup_tracing should only be called once");
141
142        Some(provider)
143    } else {
144        None
145    };
146    let otel_layer = tracer_provider.as_ref().map(|provider| {
147        OpenTelemetryLayer::new(provider.tracer("tracing-otel-subscriber")).boxed()
148    });
149
150    let subscriber = Registry::default()
151        .with(stdout_layer().with_filter(env_or_default_filter()))
152        .with(otel_layer.with_filter(env_or_default_filter()));
153    tracing::subscriber::set_global_default(subscriber).map_err(Into::<anyhow::Error>::into)?;
154
155    // Register panic hook now that tracing is initialized. This chains with the default panic hook
156    // to preserve backtrace printing.
157    let default_hook = std::panic::take_hook();
158    std::panic::set_hook(Box::new(move |info| {
159        tracing::error!(panic = true, info = %info, "panic");
160
161        // Mark the current span as failed for OpenTelemetry.
162        let info_str = info.to_string();
163        let wrapped = anyhow::Error::msg(info_str);
164        tracing::Span::current().set_error(wrapped.as_ref());
165
166        // Flush traces before the program terminates. This ensures the panic trace is exported even
167        // though the OtelGuard won't be dropped.
168        if let Some(provider) = TRACER_PROVIDER.get() {
169            if let Err(err) = provider.force_flush() {
170                eprintln!("Failed to flush traces on panic: {err:?}");
171            }
172        }
173
174        // Call the default hook to print the backtrace.
175        default_hook(info);
176    }));
177
178    Ok(tracer_provider.map(|tracer_provider| OtelGuard { tracer_provider }))
179}
180
181fn init_tracer_provider(resource_config: ResourceConfig) -> anyhow::Result<SdkTracerProvider> {
182    let builder = opentelemetry_otlp::SpanExporter::builder().with_tonic();
183
184    let exporter = builder.build()?;
185    let resource = resource(resource_config);
186
187    Ok(opentelemetry_sdk::trace::SdkTracerProvider::builder()
188        .with_resource(resource)
189        .with_batch_exporter(exporter)
190        .build())
191}
192
193fn resource(config: ResourceConfig) -> Resource {
194    let detected_resource = Resource::builder_empty()
195        .with_detector(Box::new(TelemetryResourceDetector))
196        .with_detector(Box::new(EnvResourceDetector::new()))
197        .build();
198
199    resource_from_detected(config, &detected_resource, otel_service_name_override())
200}
201
202fn resource_from_detected(
203    config: ResourceConfig,
204    detected_resource: &Resource,
205    service_name_override: Option<Value>,
206) -> Resource {
207    const SERVICE_NAME: &str = "service.name";
208    const SERVICE_NAMESPACE: &str = "service.namespace";
209
210    let mut attributes =
211        std::collections::BTreeMap::from([(SERVICE_NAMESPACE.to_string(), Value::from("miden"))]);
212
213    if let Some(service_name) = config.service_name {
214        attributes.insert(SERVICE_NAME.to_string(), Value::from(service_name));
215    }
216
217    for (key, value) in config.attributes {
218        attributes.insert(key.to_string(), Value::from(value));
219    }
220
221    // Environment resource attributes override defaults above, and OTEL_SERVICE_NAME overrides
222    // both.
223    for (key, value) in detected_resource {
224        attributes.insert(key.as_str().to_string(), value.clone());
225    }
226
227    if let Some(service_name) = service_name_override {
228        attributes.insert(SERVICE_NAME.to_string(), service_name);
229    }
230
231    Resource::builder_empty()
232        .with_attributes(attributes.into_iter().map(|(key, value)| KeyValue::new(key, value)))
233        .build()
234}
235
236fn otel_service_name_override() -> Option<Value> {
237    std::env::var("OTEL_SERVICE_NAME")
238        .ok()
239        .filter(|value| !value.is_empty())
240        .map(Value::from)
241}
242
243fn otlp_endpoint_configured() -> bool {
244    ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "OTEL_EXPORTER_OTLP_ENDPOINT"]
245        .into_iter()
246        .any(|key| std::env::var(key).is_ok_and(|value| !value.trim().is_empty()))
247}
248
249/// Initializes tracing to a test exporter.
250///
251/// Allows trace content to be inspected via the returned receiver.
252///
253/// All tests that use this function must be annotated with `#[serial(open_telemetry_tracing)]`.
254/// This forces serialization of all such tests. Otherwise, the tested spans could
255/// be interleaved during runtime. Also, the global exporter could be re-initialized in
256/// the middle of a concurrently running test.
257#[cfg(feature = "testing")]
258pub fn setup_test_tracing() -> anyhow::Result<(
259    tokio::sync::mpsc::UnboundedReceiver<opentelemetry_sdk::trace::SpanData>,
260    tokio::sync::mpsc::UnboundedReceiver<()>,
261)> {
262    let (exporter, rx_export, rx_shutdown) =
263        opentelemetry_sdk::testing::trace::new_tokio_test_exporter();
264
265    let tracer_provider = opentelemetry_sdk::trace::SdkTracerProvider::builder()
266        .with_batch_exporter(exporter)
267        .build();
268    let otel_layer =
269        OpenTelemetryLayer::new(tracer_provider.tracer("tracing-otel-subscriber")).boxed();
270    let subscriber = Registry::default()
271        .with(stdout_layer().with_filter(env_or_default_filter()))
272        .with(otel_layer.with_filter(env_or_default_filter()));
273    tracing::subscriber::set_global_default(subscriber)?;
274    Ok((rx_export, rx_shutdown))
275}
276
277#[cfg(not(feature = "tracing-forest"))]
278fn stdout_layer<S>() -> Box<dyn tracing_subscriber::Layer<S> + Send + Sync + 'static>
279where
280    S: Subscriber,
281    for<'a> S: tracing_subscriber::registry::LookupSpan<'a>,
282{
283    use tracing_subscriber::fmt::format::FmtSpan;
284
285    tracing_subscriber::fmt::layer()
286        .pretty()
287        .compact()
288        .with_level(true)
289        .with_file(true)
290        .with_line_number(true)
291        .with_target(true)
292        .with_span_events(FmtSpan::CLOSE)
293        .boxed()
294}
295
296#[cfg(feature = "tracing-forest")]
297fn stdout_layer<S>() -> Box<dyn tracing_subscriber::Layer<S> + Send + Sync + 'static>
298where
299    S: Subscriber,
300    for<'a> S: tracing_subscriber::registry::LookupSpan<'a>,
301{
302    tracing_forest::ForestLayer::default().boxed()
303}
304
305/// Creates a filter from the `RUST_LOG` env var with a default of `INFO` if unset.
306///
307/// # Panics
308///
309/// Panics if `RUST_LOG` fails to parse.
310fn env_or_default_filter<S>() -> Box<dyn Filter<S> + Send + Sync + 'static> {
311    use tracing::level_filters::LevelFilter;
312    use tracing_subscriber::EnvFilter;
313    use tracing_subscriber::filter::{FilterExt, Targets};
314
315    // `tracing` does not allow differentiating between invalid and missing env var so we manually
316    // do this instead. The alternative is to silently ignore parsing errors which I think is worse.
317    match std::env::var(EnvFilter::DEFAULT_ENV) {
318        Ok(rust_log) => FilterExt::boxed(
319            EnvFilter::from_str(&rust_log)
320                .expect("RUST_LOG should contain a valid filter configuration"),
321        ),
322        Err(std::env::VarError::NotUnicode(_)) => panic!("RUST_LOG contained non-unicode"),
323        Err(std::env::VarError::NotPresent) => {
324            // Default level is INFO, and additionally enable logs from axum extractor rejections.
325            FilterExt::boxed(
326                Targets::new()
327                    .with_default(LevelFilter::INFO)
328                    .with_target("axum::rejection", LevelFilter::TRACE),
329            )
330        },
331    }
332}
333
334#[cfg(test)]
335mod tests {
336    use opentelemetry::Key;
337
338    use super::*;
339
340    #[test]
341    fn resource_uses_configured_defaults() {
342        let detected_resource = Resource::builder_empty()
343            .with_attributes([KeyValue::new("telemetry.sdk.language", "rust")])
344            .build();
345
346        let resource = resource_from_detected(
347            ResourceConfig::default()
348                .with_name("node")
349                .with_attribute("miden.node.role", "sequencer"),
350            &detected_resource,
351            None,
352        );
353
354        assert_eq!(resource_value(&resource, "service.name"), Some(Value::from("node")),);
355        assert_eq!(resource_value(&resource, "service.namespace"), Some(Value::from("miden")),);
356        assert_eq!(resource_value(&resource, "miden.node.role"), Some(Value::from("sequencer")),);
357        assert_eq!(resource_value(&resource, "telemetry.sdk.language"), Some(Value::from("rust")),);
358    }
359
360    #[test]
361    fn resource_prefers_detected_attributes_over_configured_defaults() {
362        let detected_resource = Resource::builder_empty()
363            .with_attributes([
364                KeyValue::new("service.name", "custom-node"),
365                KeyValue::new("service.namespace", "custom-namespace"),
366                KeyValue::new("miden.node.role", "custom-role"),
367            ])
368            .build();
369
370        let resource = resource_from_detected(
371            ResourceConfig::default()
372                .with_name("node")
373                .with_attribute("miden.node.role", "sequencer"),
374            &detected_resource,
375            None,
376        );
377
378        assert_eq!(resource_value(&resource, "service.name"), Some(Value::from("custom-node")),);
379        assert_eq!(
380            resource_value(&resource, "service.namespace"),
381            Some(Value::from("custom-namespace")),
382        );
383        assert_eq!(resource_value(&resource, "miden.node.role"), Some(Value::from("custom-role")),);
384    }
385
386    #[test]
387    fn resource_prefers_explicit_service_name_override() {
388        let detected_resource = Resource::builder_empty()
389            .with_attributes([KeyValue::new("service.name", "resource-attribute-node")])
390            .build();
391
392        let resource = resource_from_detected(
393            ResourceConfig::default().with_name("node"),
394            &detected_resource,
395            Some(Value::from("service-env-node")),
396        );
397
398        assert_eq!(
399            resource_value(&resource, "service.name"),
400            Some(Value::from("service-env-node")),
401        );
402    }
403
404    fn resource_value(resource: &Resource, key: &'static str) -> Option<Value> {
405        resource.get(&Key::from_static_str(key))
406    }
407}