polyc_runtime/observability.rs
1//! Logging + traces + panic discipline (PRD §11).
2//!
3//! All logs go to **stderr** (stdout is reserved for program data). Format
4//! is selected by `RUST_LOG_FORMAT` (`json` in prod, pretty otherwise);
5//! verbosity by `RUST_LOG` via `EnvFilter`.
6//!
//! When `OTEL_EXPORTER_OTLP_ENDPOINT` is set, the tracing subscriber also
//! gains an OpenTelemetry layer that batches spans and ships them via OTLP
//! gRPC to the configured collector. `OTEL_SERVICE_NAME` overrides the
//! service name (defaults to the `service_name` argument of [`init`]). Dev
//! runs without `OTEL_EXPORTER_OTLP_ENDPOINT` pay zero overhead — the layer
//! is not constructed.
12//!
13//! Spans propagate through the in-process `tokio` runtime via
14//! `tracing-opentelemetry`; cross-process propagation across the
15//! `connectrpc` boundary uses the W3C `traceparent` header — wire it through
16//! the dialer when running in production.
17
18use std::io;
19
20use opentelemetry::KeyValue;
21use opentelemetry::global;
22use opentelemetry::trace::TracerProvider as _;
23use opentelemetry_otlp::SpanExporter;
24use opentelemetry_sdk::Resource;
25use opentelemetry_sdk::propagation::TraceContextPropagator;
26use opentelemetry_sdk::trace::SdkTracerProvider;
27use tracing_subscriber::{EnvFilter, fmt, prelude::*};
28
29/// Handle returned from [`init`] — keep it alive for the process lifetime;
30/// `drop` flushes traces before exit. `SdkTracerProvider::shutdown` blocks
31/// until exporters drain.
32pub struct ShutdownGuard {
33 provider: Option<SdkTracerProvider>,
34}
35
36impl Drop for ShutdownGuard {
37 fn drop(&mut self) {
38 if let Some(provider) = self.provider.take() {
39 // Best-effort: log to stderr and continue if flush fails.
40 if let Err(err) = provider.shutdown() {
41 eprintln!("opentelemetry shutdown failed: {err}");
42 }
43 }
44 }
45}
46
47/// Initialise the global tracing subscriber + (optional) `OTel` layer.
48///
49/// Also installs the panic hook. Call once, early in `main`. The returned
50/// guard flushes traces on drop — keep it alive for the process lifetime.
51#[must_use]
52pub fn init(service_name: &str) -> ShutdownGuard {
53 let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
54 let json = std::env::var("RUST_LOG_FORMAT").is_ok_and(|v| v.eq_ignore_ascii_case("json"));
55
56 let (tracer, provider) = build_otel_tracer(service_name);
57 let otel_layer = tracer.map(|t| tracing_opentelemetry::layer().with_tracer(t));
58
59 // Install the W3C `traceparent` propagator unconditionally. The format
60 // half is cheap (a unit struct), and installing it always means the
61 // dialer's `inject_*` calls and the server's `extract_*` calls behave
62 // consistently whether or not OTLP export is enabled — without it, the
63 // global getter returns a no-op propagator and `traceparent` headers
64 // silently disappear, which is the worst-of-both-worlds debug state.
65 // See `crate::propagation` for the matching inject/extract helpers.
66 global::set_text_map_propagator(TraceContextPropagator::new());
67
68 let registry = tracing_subscriber::registry().with(filter).with(otel_layer);
69 if json {
70 let fmt_layer = fmt::layer()
71 .json()
72 .flatten_event(true)
73 .with_writer(io::stderr);
74 registry.with(fmt_layer).init();
75 } else {
76 let fmt_layer = fmt::layer().with_writer(io::stderr);
77 registry.with(fmt_layer).init();
78 }
79
80 install_panic_hook();
81 ShutdownGuard { provider }
82}
83
84/// Build the OpenTelemetry tracer (the thing the tracing layer wraps), or
85/// `(None, None)` if no OTLP endpoint is configured. Also installs the
86/// global tracer provider so libraries that talk to
87/// `opentelemetry::global` see it.
88///
89/// Caveat: "the global" means *this* otel version's static. The lock
90/// currently carries a second otel stack (0.31, pinned by
91/// commonware-runtime — see the lockstep comment in the workspace
92/// Cargo.toml) whose own `global` stays the no-op default; only libraries
93/// emitting through the shared `tracing` facade are version-proof.
94fn build_otel_tracer(
95 service_name: &str,
96) -> (
97 Option<opentelemetry_sdk::trace::Tracer>,
98 Option<SdkTracerProvider>,
99) {
100 if std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").is_err() {
101 return (None, None);
102 }
103 let resolved_name = std::env::var("OTEL_SERVICE_NAME")
104 .ok()
105 .unwrap_or_else(|| service_name.to_owned());
106 let exporter = match SpanExporter::builder().with_tonic().build() {
107 Ok(exp) => exp,
108 Err(err) => {
109 eprintln!("opentelemetry OTLP exporter build failed: {err}; continuing without traces");
110 return (None, None);
111 }
112 };
113 let resource = Resource::builder()
114 .with_attribute(KeyValue::new("service.name", resolved_name.clone()))
115 .build();
116 let provider = SdkTracerProvider::builder()
117 .with_batch_exporter(exporter)
118 .with_resource(resource)
119 .build();
120 let tracer = provider.tracer(resolved_name);
121 global::set_tracer_provider(provider.clone());
122 (Some(tracer), Some(provider))
123}
124
125/// Emit a single structured `error` event on panic, then defer to the previous
126/// hook (which, under `panic = "abort"`, terminates the process).
127fn install_panic_hook() {
128 let previous = std::panic::take_hook();
129 std::panic::set_hook(Box::new(move |info| {
130 let location = info
131 .location()
132 .map_or_else(|| "unknown".to_owned(), ToString::to_string);
133 tracing::error!(panic = %info, location = %location, "process panicked");
134 previous(info);
135 }));
136}