polyc-runtime 0.1.3

Shared Unix-coherence runtime for polychrome binaries: logging, health/metrics side-server, signals.
Documentation
//! Logging + traces + panic discipline (PRD §11).
//!
//! All logs go to **stderr** (stdout is reserved for program data). Format
//! is selected by `RUST_LOG_FORMAT` (`json` in prod, pretty otherwise);
//! verbosity by `RUST_LOG` via `EnvFilter`.
//!
//! When `OTEL_EXPORTER_OTLP_ENDPOINT` is set, the tracing subscriber also
//! gains an OpenTelemetry layer that batches spans and ships them via OTLP
//! gRPC to the configured collector. `OTEL_SERVICE_NAME` overrides the
//! service name (defaults to the caller's argument). Dev runs without
//! `OTEL_EXPORTER_OTLP_ENDPOINT` pay zero export overhead — the OTel layer
//! is not constructed.
//!
//! Spans propagate through the in-process `tokio` runtime via
//! `tracing-opentelemetry`; cross-process propagation across the
//! `connectrpc` boundary uses the W3C `traceparent` header — wire it through
//! the dialer when running in production.

use std::io;

use opentelemetry::KeyValue;
use opentelemetry::global;
use opentelemetry::trace::TracerProvider as _;
use opentelemetry_otlp::SpanExporter;
use opentelemetry_sdk::Resource;
use opentelemetry_sdk::propagation::TraceContextPropagator;
use opentelemetry_sdk::trace::SdkTracerProvider;
use tracing_subscriber::{EnvFilter, fmt, prelude::*};

/// Handle returned from [`init`] — keep it alive for the process lifetime.
///
/// Dropping the guard shuts down the held tracer provider (if any), which
/// flushes traces before exit. `SdkTracerProvider::shutdown` blocks until
/// exporters drain.
///
/// Implements `Debug` so callers can embed the guard in their own
/// `#[derive(Debug)]` state structs or log it while diagnosing start-up.
#[derive(Debug)]
pub struct ShutdownGuard {
    /// The provider to shut down on drop; `None` when no OTLP exporter was
    /// set up, in which case dropping the guard does nothing.
    provider: Option<SdkTracerProvider>,
}

impl Drop for ShutdownGuard {
    /// Shuts down the tracer provider, if one is held.
    ///
    /// The provider is moved out of the guard with `take`, so the slot is
    /// left as `None` afterwards.
    fn drop(&mut self) {
        // Nothing to flush when no provider was ever stored.
        let Some(tracer_provider) = self.provider.take() else {
            return;
        };

        // Best-effort: a failed shutdown is reported on stderr and otherwise
        // ignored.
        if let Err(err) = tracer_provider.shutdown() {
            eprintln!("opentelemetry shutdown failed: {err}");
        }
    }
}

/// Set up the global tracing subscriber, the optional `OTel` layer, the
/// W3C trace-context propagator and the panic hook.
///
/// Call once, early in `main`. `service_name` is forwarded to the tracer
/// builder as the service name to report. The returned guard flushes traces
/// on drop — keep it alive for the process lifetime.
///
/// Log output always goes to stderr. `RUST_LOG` selects verbosity (falling
/// back to `info` when it cannot be used), and `RUST_LOG_FORMAT=json`
/// (case-insensitive) switches from the default formatter to flattened JSON.
#[must_use]
pub fn init(service_name: &str) -> ShutdownGuard {
    // `tracer` is `None` when export is disabled; `Option<Layer>` is itself
    // a layer, so the registry below accepts it either way.
    let (tracer, provider) = build_otel_tracer(service_name);
    let telemetry = tracer.map(|otel_tracer| tracing_opentelemetry::layer().with_tracer(otel_tracer));

    // The W3C `traceparent` propagator is installed unconditionally. It is
    // cheap (a unit struct), and having it always present keeps the dialer's
    // `inject_*` and the server's `extract_*` calls consistent regardless of
    // whether OTLP export is enabled. Without it the global getter hands out
    // a no-op propagator and `traceparent` headers silently vanish.
    // See `crate::propagation` for the matching inject/extract helpers.
    global::set_text_map_propagator(TraceContextPropagator::new());

    let env_filter = match EnvFilter::try_from_default_env() {
        Ok(from_env) => from_env,
        Err(_) => EnvFilter::new("info"),
    };
    let want_json = match std::env::var("RUST_LOG_FORMAT") {
        Ok(format) => format.eq_ignore_ascii_case("json"),
        Err(_) => false,
    };

    // The two formatter layers have different concrete types, so each branch
    // finishes and installs the subscriber itself.
    let subscriber = tracing_subscriber::registry()
        .with(env_filter)
        .with(telemetry);
    if want_json {
        subscriber
            .with(
                fmt::layer()
                    .json()
                    .flatten_event(true)
                    .with_writer(io::stderr),
            )
            .init();
    } else {
        subscriber
            .with(fmt::layer().with_writer(io::stderr))
            .init();
    }

    install_panic_hook();
    ShutdownGuard { provider }
}

/// Build the OpenTelemetry tracer (the thing the tracing layer wraps), or
/// `(None, None)` if no OTLP endpoint is configured. Also installs the
/// global tracer provider so libraries that talk to
/// `opentelemetry::global` see it.
///
/// `service_name` is used for the `service.name` resource attribute and the
/// tracer name unless `OTEL_SERVICE_NAME` supplies a non-empty override.
///
/// Per the OpenTelemetry environment-variable specification, an *empty*
/// value is handled exactly like an unset variable. An empty
/// `OTEL_EXPORTER_OTLP_ENDPOINT` disables export, and an empty
/// `OTEL_SERVICE_NAME` falls back to `service_name`.
///
/// If the exporter cannot be built, the failure is reported on stderr and
/// `(None, None)` is returned so the process continues without traces.
///
/// Caveat: "the global" means *this* otel version's static. The lock
/// currently carries a second otel stack (0.31, pinned by
/// commonware-runtime — see the lockstep comment in the workspace
/// Cargo.toml) whose own `global` stays the no-op default; only libraries
/// emitting through the shared `tracing` facade are version-proof.
fn build_otel_tracer(
    service_name: &str,
) -> (
    Option<opentelemetry_sdk::trace::Tracer>,
    Option<SdkTracerProvider>,
) {
    // Unset, non-Unicode and empty values all mean "no collector configured".
    let endpoint_configured =
        std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").is_ok_and(|endpoint| !endpoint.is_empty());
    if !endpoint_configured {
        return (None, None);
    }

    // An empty override would otherwise yield an empty `service.name`.
    let resolved_name = std::env::var("OTEL_SERVICE_NAME")
        .ok()
        .filter(|name| !name.is_empty())
        .unwrap_or_else(|| service_name.to_owned());

    let exporter = match SpanExporter::builder().with_tonic().build() {
        Ok(exp) => exp,
        Err(err) => {
            eprintln!("opentelemetry OTLP exporter build failed: {err}; continuing without traces");
            return (None, None);
        }
    };

    // The name is needed twice (resource attribute + tracer name), hence
    // the single clone.
    let resource = Resource::builder()
        .with_attribute(KeyValue::new("service.name", resolved_name.clone()))
        .build();
    let provider = SdkTracerProvider::builder()
        .with_batch_exporter(exporter)
        .with_resource(resource)
        .build();
    let tracer = provider.tracer(resolved_name);

    // Register a clone globally; the returned original is what the caller
    // keeps in order to shut the provider down.
    global::set_tracer_provider(provider.clone());
    (Some(tracer), Some(provider))
}

/// Emit a single structured `error` event on panic, then defer to the previous
/// hook (which, under `panic = "abort"`, terminates the process).
fn install_panic_hook() {
    let previous = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |info| {
        let location = info
            .location()
            .map_or_else(|| "unknown".to_owned(), ToString::to_string);
        tracing::error!(panic = %info, location = %location, "process panicked");
        previous(info);
    }));
}