Skip to main content

polyc_runtime/
observability.rs

1//! Logging + traces + panic discipline (PRD §11).
2//!
3//! All logs go to **stderr** (stdout is reserved for program data). Format
4//! is selected by `RUST_LOG_FORMAT` (`json` in prod, human-readable text otherwise);
5//! verbosity by `RUST_LOG` via `EnvFilter`.
6//!
7//! When `OTEL_EXPORTER_OTLP_ENDPOINT` is set, the tracing subscriber also
8//! gains an OpenTelemetry layer that batches spans and ships them via OTLP
9//! gRPC to the configured collector. `OTEL_SERVICE_NAME` overrides the
10//! service name (defaults to the caller's argument). Dev runs without
11//! `OTEL_EXPORTER_OTLP_ENDPOINT` pay zero overhead — the layer is not constructed.
12//!
13//! Spans propagate through the in-process `tokio` runtime via
14//! `tracing-opentelemetry`; cross-process propagation across the
15//! `connectrpc` boundary uses the W3C `traceparent` header — wire it through
16//! the dialer when running in production.
17
18use std::io;
19
20use opentelemetry::KeyValue;
21use opentelemetry::global;
22use opentelemetry::trace::TracerProvider as _;
23use opentelemetry_otlp::SpanExporter;
24use opentelemetry_sdk::Resource;
25use opentelemetry_sdk::propagation::TraceContextPropagator;
26use opentelemetry_sdk::trace::SdkTracerProvider;
27use tracing_subscriber::{EnvFilter, fmt, prelude::*};
28
29/// Handle returned from [`init`] — keep it alive for the process lifetime;
30/// `drop` flushes traces before exit. `SdkTracerProvider::shutdown` blocks
31/// until exporters drain.
32pub struct ShutdownGuard {
33    provider: Option<SdkTracerProvider>,
34}
35
36impl Drop for ShutdownGuard {
37    fn drop(&mut self) {
38        if let Some(provider) = self.provider.take() {
39            // Best-effort: log to stderr and continue if flush fails.
40            if let Err(err) = provider.shutdown() {
41                eprintln!("opentelemetry shutdown failed: {err}");
42            }
43        }
44    }
45}
46
47/// Initialise the global tracing subscriber + (optional) `OTel` layer.
48///
49/// Also installs the panic hook. Call once, early in `main`. The returned
50/// guard flushes traces on drop — keep it alive for the process lifetime.
51#[must_use]
52pub fn init(service_name: &str) -> ShutdownGuard {
53    let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
54    let json = std::env::var("RUST_LOG_FORMAT").is_ok_and(|v| v.eq_ignore_ascii_case("json"));
55
56    let (tracer, provider) = build_otel_tracer(service_name);
57    let otel_layer = tracer.map(|t| tracing_opentelemetry::layer().with_tracer(t));
58
59    // Install the W3C `traceparent` propagator unconditionally. The format
60    // half is cheap (a unit struct), and installing it always means the
61    // dialer's `inject_*` calls and the server's `extract_*` calls behave
62    // consistently whether or not OTLP export is enabled — without it, the
63    // global getter returns a no-op propagator and `traceparent` headers
64    // silently disappear, which is the worst-of-both-worlds debug state.
65    // See `crate::propagation` for the matching inject/extract helpers.
66    global::set_text_map_propagator(TraceContextPropagator::new());
67
68    let registry = tracing_subscriber::registry().with(filter).with(otel_layer);
69    if json {
70        let fmt_layer = fmt::layer()
71            .json()
72            .flatten_event(true)
73            .with_writer(io::stderr);
74        registry.with(fmt_layer).init();
75    } else {
76        let fmt_layer = fmt::layer().with_writer(io::stderr);
77        registry.with(fmt_layer).init();
78    }
79
80    install_panic_hook();
81    ShutdownGuard { provider }
82}
83
84/// Build the OpenTelemetry tracer (the thing the tracing layer wraps), or
85/// `(None, None)` if no OTLP endpoint is configured. Also installs the
86/// global tracer provider so libraries that talk to
87/// `opentelemetry::global` see it.
88///
89/// Caveat: "the global" means *this* otel version's static. The lock
90/// currently carries a second otel stack (0.31, pinned by
91/// commonware-runtime — see the lockstep comment in the workspace
92/// Cargo.toml) whose own `global` stays the no-op default; only libraries
93/// emitting through the shared `tracing` facade are version-proof.
94fn build_otel_tracer(
95    service_name: &str,
96) -> (
97    Option<opentelemetry_sdk::trace::Tracer>,
98    Option<SdkTracerProvider>,
99) {
100    if std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").is_err() {
101        return (None, None);
102    }
103    let resolved_name = std::env::var("OTEL_SERVICE_NAME")
104        .ok()
105        .unwrap_or_else(|| service_name.to_owned());
106    let exporter = match SpanExporter::builder().with_tonic().build() {
107        Ok(exp) => exp,
108        Err(err) => {
109            eprintln!("opentelemetry OTLP exporter build failed: {err}; continuing without traces");
110            return (None, None);
111        }
112    };
113    let resource = Resource::builder()
114        .with_attribute(KeyValue::new("service.name", resolved_name.clone()))
115        .build();
116    let provider = SdkTracerProvider::builder()
117        .with_batch_exporter(exporter)
118        .with_resource(resource)
119        .build();
120    let tracer = provider.tracer(resolved_name);
121    global::set_tracer_provider(provider.clone());
122    (Some(tracer), Some(provider))
123}
124
125/// Emit a single structured `error` event on panic, then defer to the previous
126/// hook (which, under `panic = "abort"`, terminates the process).
127fn install_panic_hook() {
128    let previous = std::panic::take_hook();
129    std::panic::set_hook(Box::new(move |info| {
130        let location = info
131            .location()
132            .map_or_else(|| "unknown".to_owned(), ToString::to_string);
133        tracing::error!(panic = %info, location = %location, "process panicked");
134        previous(info);
135    }));
136}