Skip to main content

quiver_server/
otlp.rs

1// SPDX-License-Identifier: AGPL-3.0-only
2//! OpenTelemetry traces export (ADR-0059).
3//!
4//! ADR-0054 added Prometheus `/metrics` and `#[tracing::instrument]` spans but
5//! **deliberately left the OTLP exporter unbundled** — the `opentelemetry` crate
6//! tree is heavy. This module adds it back as a strictly opt-in capability:
7//!
8//! - **Compiled only behind the `otlp` cargo feature** (off by default). With the
9//!   feature off, none of the OpenTelemetry crates are linked.
10//! - **Activated only when an endpoint is configured** at runtime
11//!   ([`OtlpConfig::endpoint`] / `QUIVER_OTLP_ENDPOINT`). Compiling the feature in
12//!   but leaving the endpoint empty exports nothing.
13//!
14//! The configuration ([`OtlpConfig`]) is always present and unit-tested, so a
15//! `quiver.toml` is validated identically regardless of build features. The code
16//! that builds the live exporter and talks to a collector is feature-gated and is
17//! **not exercised in CI** (it needs a running OTLP collector) — it is a thin
18//! shell over the `opentelemetry-otlp` builder, stated rather than faked.
19
20use serde::{Deserialize, Serialize};
21
/// The default `service.name` resource attribute reported to the collector.
/// `OtlpConfig::default()` uses it for `service_name`.
const DEFAULT_SERVICE_NAME: &str = "quiver";
/// Default per-export timeout, in seconds. `OtlpConfig::default()` uses it for
/// `timeout_secs`.
const DEFAULT_TIMEOUT_SECS: u64 = 10;
26
27/// OpenTelemetry traces export configuration (`[otlp]` in `quiver.toml`, or the
28/// `QUIVER_OTLP_*` environment variables). Disabled unless an `endpoint` is set.
29#[derive(Debug, Clone, Serialize, Deserialize)]
30#[serde(default)]
31pub struct OtlpConfig {
32    /// The OTLP/gRPC collector endpoint, e.g. `http://localhost:4317`. Empty (the
33    /// default) disables export entirely, even when the `otlp` feature is built.
34    pub endpoint: String,
35    /// The `service.name` resource attribute reported to the collector.
36    pub service_name: String,
37    /// Per-export timeout, in seconds.
38    pub timeout_secs: u64,
39}
40
41impl Default for OtlpConfig {
42    fn default() -> Self {
43        Self {
44            endpoint: String::new(),
45            service_name: DEFAULT_SERVICE_NAME.to_owned(),
46            timeout_secs: DEFAULT_TIMEOUT_SECS,
47        }
48    }
49}
50
51impl OtlpConfig {
52    /// Whether traces should be exported (an endpoint is configured). The exporter
53    /// also requires the `otlp` feature to be compiled in to have any effect.
54    #[must_use]
55    pub fn is_enabled(&self) -> bool {
56        !self.endpoint.trim().is_empty()
57    }
58
59    /// Apply the flat `QUIVER_OTLP_*` environment overrides (figment nests env
60    /// keys under tables, so the flat keys are applied explicitly, as for the
61    /// other config sections).
62    ///
63    /// # Errors
64    /// Returns an error if `QUIVER_OTLP_TIMEOUT_SECS` is set to a non-integer.
65    pub fn apply_env_overrides(&mut self) -> Result<(), String> {
66        if let Ok(v) = std::env::var("QUIVER_OTLP_ENDPOINT") {
67            self.endpoint = v;
68        }
69        if let Ok(v) = std::env::var("QUIVER_OTLP_SERVICE_NAME") {
70            self.service_name = v;
71        }
72        if let Ok(v) = std::env::var("QUIVER_OTLP_TIMEOUT_SECS") {
73            self.timeout_secs = v
74                .parse()
75                .map_err(|_| format!("QUIVER_OTLP_TIMEOUT_SECS must be an integer, got {v:?}"))?;
76        }
77        Ok(())
78    }
79}
80
81// ---------------------------------------------------------------------------
82// Live exporter — feature-gated, not exercised in CI (needs a collector).
83// ---------------------------------------------------------------------------
84
#[cfg(feature = "otlp")]
mod live {
    use std::sync::OnceLock;
    use std::time::Duration;

    use super::OtlpConfig;

    /// Holds the tracer provider so [`shutdown`] can flush batched spans on exit.
    /// Being a `OnceLock`, it can be filled at most once per process.
    static PROVIDER: OnceLock<opentelemetry_sdk::trace::SdkTracerProvider> = OnceLock::new();

    /// Build a batched OTLP/gRPC tracer provider for `cfg`. Returns an error
    /// string (never panics) so a telemetry misconfiguration degrades to "no
    /// export" instead of taking the server down.
    ///
    /// The endpoint is trimmed before it is handed to the exporter, matching
    /// how [`OtlpConfig::is_enabled`] evaluates it. `cfg.service_name` becomes
    /// the resource's service name and `cfg.timeout_secs` the export timeout.
    ///
    /// This function does not itself check [`OtlpConfig::is_enabled`].
    ///
    /// # Errors
    /// Returns the exporter builder's error, prefixed with
    /// `"building OTLP span exporter: "`.
    pub fn build_provider(
        cfg: &OtlpConfig,
    ) -> Result<opentelemetry_sdk::trace::SdkTracerProvider, String> {
        use opentelemetry_otlp::WithExportConfig;
        // `is_enabled` judges the endpoint after trimming; pass the same
        // trimmed value on so a padded endpoint (e.g. a trailing space in an
        // env file) that counts as "enabled" is not rejected as a bad URI.
        let exporter = opentelemetry_otlp::SpanExporter::builder()
            .with_tonic()
            .with_endpoint(cfg.endpoint.trim())
            .with_timeout(Duration::from_secs(cfg.timeout_secs))
            .build()
            .map_err(|e| format!("building OTLP span exporter: {e}"))?;
        let resource = opentelemetry_sdk::Resource::builder()
            .with_service_name(cfg.service_name.clone())
            .build();
        let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder()
            .with_batch_exporter(exporter)
            .with_resource(resource)
            .build();
        Ok(provider)
    }

    /// Remember the provider for shutdown-time flushing.
    ///
    /// Only the first call has an effect: if a provider is already stored, the
    /// `OnceLock` rejects the new one and it is deliberately discarded.
    pub fn store_provider(provider: opentelemetry_sdk::trace::SdkTracerProvider) {
        let _ = PROVIDER.set(provider);
    }

    /// Flush and shut down the provider, if one was installed.
    ///
    /// Best-effort: the result of the provider's `shutdown` is ignored, and
    /// the call is a no-op when [`store_provider`] was never called.
    pub fn shutdown() {
        if let Some(provider) = PROVIDER.get() {
            let _ = provider.shutdown();
        }
    }
}
130
131#[cfg(feature = "otlp")]
132pub use live::{build_provider, shutdown, store_provider};
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Every environment variable `apply_env_overrides` reads.
    const ENV_KEYS: [&str; 3] = [
        "QUIVER_OTLP_ENDPOINT",
        "QUIVER_OTLP_SERVICE_NAME",
        "QUIVER_OTLP_TIMEOUT_SECS",
    ];

    /// Removes the `QUIVER_OTLP_*` variables when dropped. Because `Drop` also
    /// runs while a failed assertion unwinds, a failing test cannot leak the
    /// variables into the rest of the test process.
    struct EnvCleanup;

    impl Drop for EnvCleanup {
        fn drop(&mut self) {
            for key in ENV_KEYS {
                // SAFETY: test-only; these QUIVER_OTLP_* vars are read by no other test.
                unsafe { std::env::remove_var(key) }
            }
        }
    }

    /// The default configuration has export switched off and stock values.
    #[test]
    fn disabled_by_default() {
        let c = OtlpConfig::default();
        assert!(!c.is_enabled());
        assert_eq!(c.service_name, "quiver");
        assert_eq!(c.timeout_secs, 10);
    }

    #[test]
    fn enabled_when_endpoint_set_and_defaults_apply() {
        // A partial config fills the rest from defaults (the `#[serde(default)]`).
        let c: OtlpConfig =
            serde_json::from_value(serde_json::json!({"endpoint":"http://localhost:4317"}))
                .unwrap();
        assert!(c.is_enabled());
        assert_eq!(c.service_name, "quiver");
        assert_eq!(c.timeout_secs, 10);
    }

    /// An endpoint made only of whitespace is treated like an empty one.
    #[test]
    fn whitespace_endpoint_is_not_enabled() {
        let c: OtlpConfig = serde_json::from_value(serde_json::json!({"endpoint":"   "})).unwrap();
        assert!(!c.is_enabled());
    }

    /// All three fields are read from the input when present.
    #[test]
    fn fields_deserialize() {
        let c: OtlpConfig = serde_json::from_value(serde_json::json!({
            "endpoint":"http://collector:4317","service_name":"q-prod","timeout_secs":3
        }))
        .unwrap();
        assert_eq!(c.service_name, "q-prod");
        assert_eq!(c.timeout_secs, 3);
        assert!(c.is_enabled());
    }

    #[test]
    fn env_overrides_apply() {
        // Installed before the first `set_var` so cleanup happens on every exit
        // path, including a panicking assertion.
        let _cleanup = EnvCleanup;

        // SAFETY: test-only; these QUIVER_OTLP_* vars are read by no other test.
        unsafe {
            std::env::set_var("QUIVER_OTLP_ENDPOINT", "http://envhost:4317");
            std::env::set_var("QUIVER_OTLP_SERVICE_NAME", "from-env");
            std::env::set_var("QUIVER_OTLP_TIMEOUT_SECS", "7");
        }
        let mut c = OtlpConfig::default();
        c.apply_env_overrides().unwrap();
        assert_eq!(c.endpoint, "http://envhost:4317");
        assert_eq!(c.service_name, "from-env");
        assert_eq!(c.timeout_secs, 7);

        // A non-integer timeout is a clear error.
        // SAFETY: as above.
        unsafe { std::env::set_var("QUIVER_OTLP_TIMEOUT_SECS", "soon") }
        assert!(OtlpConfig::default().apply_env_overrides().is_err());
    }
}