Skip to main content

grapsus_proxy/
otel.rs

1//! OpenTelemetry integration for distributed tracing
2//!
3//! This module provides OpenTelemetry support with OTLP export for distributed tracing.
4//! It implements W3C Trace Context propagation (traceparent/tracestate headers).
5//!
6//! # Features
7//!
8//! - W3C Trace Context header propagation
9//! - OTLP export to Jaeger, Tempo, or any OTLP-compatible backend
10//! - Configurable sampling rates
11//! - Request lifecycle spans with semantic conventions
12//!
13//! # Configuration
14//!
15//! ```kdl
16//! observability {
17//!     tracing {
18//!         backend "otlp" {
19//!             endpoint "http://localhost:4317"
20//!         }
21//!         sampling-rate 0.1  // 10% of requests
22//!         service-name "grapsus"
23//!     }
24//! }
25//! ```
26
27use std::sync::OnceLock;
28use tracing::warn;
29
30use grapsus_config::TracingConfig;
31
/// Name of the W3C Trace Context `traceparent` header.
pub const TRACEPARENT_HEADER: &str = "traceparent";
/// Name of the W3C Trace Context `tracestate` header.
pub const TRACESTATE_HEADER: &str = "tracestate";
35
/// Parsed W3C Trace Context.
///
/// Holds the fields carried by a `traceparent` header plus the optional raw
/// `tracestate` value. Contexts can be compared for equality, which makes
/// them convenient to assert on in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TraceContext {
    /// Trace ID (32 hex chars)
    pub trace_id: String,
    /// Parent span ID (16 hex chars)
    pub parent_id: String,
    /// Whether this trace is sampled (bit 0 of the `traceparent` flags)
    pub sampled: bool,
    /// Optional `tracestate` header value, stored verbatim
    pub tracestate: Option<String>,
}
48
49impl TraceContext {
50    /// Parse W3C traceparent header
51    ///
52    /// Format: version-trace_id-parent_id-flags
53    /// Example: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
54    pub fn parse_traceparent(header: &str) -> Option<Self> {
55        let parts: Vec<&str> = header.split('-').collect();
56        if parts.len() != 4 {
57            return None;
58        }
59
60        let version = parts[0];
61        if version != "00" {
62            // Only support version 00 for now
63            return None;
64        }
65
66        let trace_id = parts[1];
67        let parent_id = parts[2];
68        let flags = parts[3];
69
70        // Validate lengths
71        if trace_id.len() != 32 || parent_id.len() != 16 || flags.len() != 2 {
72            return None;
73        }
74
75        // Parse flags
76        let sampled = u8::from_str_radix(flags, 16).ok()? & 0x01 == 1;
77
78        Some(Self {
79            trace_id: trace_id.to_string(),
80            parent_id: parent_id.to_string(),
81            sampled,
82            tracestate: None,
83        })
84    }
85
86    /// Create traceparent header value
87    pub fn to_traceparent(&self, span_id: &str) -> String {
88        let flags = if self.sampled { "01" } else { "00" };
89        format!("00-{}-{}-{}", self.trace_id, span_id, flags)
90    }
91
92    /// Create a new trace context with generated IDs
93    pub fn new_root(sampled: bool) -> Self {
94        Self {
95            trace_id: generate_trace_id(),
96            parent_id: generate_span_id(),
97            sampled,
98            tracestate: None,
99        }
100    }
101}
102
103/// Generate a new trace ID (32 hex chars)
104pub fn generate_trace_id() -> String {
105    let bytes: [u8; 16] = rand::random();
106    hex::encode(bytes)
107}
108
109/// Generate a new span ID (16 hex chars)
110pub fn generate_span_id() -> String {
111    let bytes: [u8; 8] = rand::random();
112    hex::encode(bytes)
113}
114
/// Build a version-`00` `traceparent` header value from its parts.
///
/// The IDs are inserted exactly as given; `sampled` selects the flags
/// field (`01` when sampled, `00` otherwise).
pub fn create_traceparent(trace_id: &str, span_id: &str, sampled: bool) -> String {
    let trace_flags = match sampled {
        true => "01",
        false => "00",
    };
    format!("00-{}-{}-{}", trace_id, span_id, trace_flags)
}
120
121// ============================================================================
122// OpenTelemetry Tracer (when feature enabled)
123// ============================================================================
124
#[cfg(feature = "opentelemetry")]
mod otel_impl {
    use super::*;
    use opentelemetry::trace::{
        Span, SpanContext, SpanId, SpanKind, Status, TraceContextExt, TraceFlags, TraceId,
        TraceState, Tracer, TracerProvider as _,
    };
    use opentelemetry::{global, Context, KeyValue};
    use opentelemetry_otlp::WithExportConfig;
    use opentelemetry_sdk::trace::{Sampler, SdkTracerProvider};
    use opentelemetry_sdk::Resource;
    use tracing::{error, info};

    /// OpenTelemetry tracer wrapper.
    ///
    /// Owns the SDK tracer provider so it can be shut down explicitly, and
    /// remembers the configuration values it was built from.
    pub struct OtelTracer {
        /// Provider installed as the global tracer provider by [`OtelTracer::init`].
        provider: SdkTracerProvider,
        /// Sampling rate copied from the configuration.
        sampling_rate: f64,
        /// Service name copied from the configuration; attached to every span.
        service_name: String,
    }

    impl OtelTracer {
        /// Initialize OpenTelemetry with an OTLP (tonic/gRPC) exporter.
        ///
        /// Builds the exporter, sampler and resource from `config`, installs
        /// the resulting provider as the global tracer provider, and returns
        /// a wrapper that keeps the provider alive.
        ///
        /// # Errors
        ///
        /// Returns [`OtelError::ExporterInit`] when the OTLP exporter cannot
        /// be built.
        pub fn init(config: &TracingConfig) -> Result<Self, OtelError> {
            // NOTE(review): every backend's endpoint is handed to the OTLP
            // exporter unchanged — confirm Jaeger/Zipkin deployments accept
            // OTLP at the configured address.
            let endpoint = match &config.backend {
                grapsus_config::TracingBackend::Otlp { endpoint }
                | grapsus_config::TracingBackend::Jaeger { endpoint }
                | grapsus_config::TracingBackend::Zipkin { endpoint } => endpoint.clone(),
            };

            info!(
                endpoint = %endpoint,
                sampling_rate = config.sampling_rate,
                service_name = %config.service_name,
                "Initializing OpenTelemetry tracer"
            );

            // Create OTLP exporter
            let exporter = opentelemetry_otlp::SpanExporter::builder()
                .with_tonic()
                .with_endpoint(&endpoint)
                .build()
                .map_err(|e| OtelError::ExporterInit(e.to_string()))?;

            // Rates at or beyond the [0, 1] bounds collapse to the constant samplers.
            let sampler = if config.sampling_rate >= 1.0 {
                Sampler::AlwaysOn
            } else if config.sampling_rate <= 0.0 {
                Sampler::AlwaysOff
            } else {
                Sampler::TraceIdRatioBased(config.sampling_rate)
            };

            // Create resource with service info
            let resource = Resource::builder()
                .with_service_name(config.service_name.clone())
                .build();

            // Build tracer provider
            let provider = SdkTracerProvider::builder()
                .with_batch_exporter(exporter)
                .with_sampler(sampler)
                .with_resource(resource)
                .build();

            // Set global provider; `start_span` obtains its tracer from it.
            global::set_tracer_provider(provider.clone());

            info!("OpenTelemetry tracer initialized successfully");

            Ok(Self {
                provider,
                sampling_rate: config.sampling_rate,
                service_name: config.service_name.clone(),
            })
        }

        /// Create a server span for one request.
        ///
        /// The span is named `"{method} {path}"`. When `trace_ctx` carries a
        /// usable trace/parent ID pair, the span is created as a child of
        /// that remote context, so it joins the caller's trace; otherwise the
        /// current context is used as the parent.
        ///
        /// The returned [`RequestSpan`] exposes the trace ID and span ID of
        /// the span that was actually created, so headers built from them
        /// match the exported data. If the created span has no valid span
        /// context, the IDs fall back to the incoming trace ID (or a newly
        /// generated one) and a newly generated span ID.
        pub fn start_span(
            &self,
            method: &str,
            path: &str,
            trace_ctx: Option<&TraceContext>,
        ) -> RequestSpan {
            let tracer = global::tracer("grapsus-proxy");

            let builder = tracer
                .span_builder(format!("{} {}", method, path))
                .with_kind(SpanKind::Server)
                .with_attributes([
                    KeyValue::new("http.method", method.to_string()),
                    KeyValue::new("http.target", path.to_string()),
                    KeyValue::new("service.name", self.service_name.clone()),
                ]);

            // Link the new span to the propagated context, when there is one.
            let parent_cx = match trace_ctx.and_then(remote_span_context) {
                Some(remote) => Context::current().with_remote_span_context(remote),
                None => Context::current(),
            };
            let span = builder.start_with_context(&tracer, &parent_cx);

            let (trace_id, span_id) = {
                let created = span.span_context();
                if created.is_valid() {
                    (created.trace_id().to_string(), created.span_id().to_string())
                } else {
                    (
                        trace_ctx
                            .map(|c| c.trace_id.clone())
                            .unwrap_or_else(generate_trace_id),
                        generate_span_id(),
                    )
                }
            };

            RequestSpan {
                span,
                trace_id,
                span_id,
            }
        }

        /// Shutdown the tracer provider, logging (not returning) any failure.
        pub fn shutdown(&self) {
            info!("Shutting down OpenTelemetry tracer");
            if let Err(e) = self.provider.shutdown() {
                error!(error = %e, "Failed to shutdown OpenTelemetry tracer provider");
            }
        }
    }

    /// Convert a propagated [`TraceContext`] into a remote OpenTelemetry span context.
    ///
    /// Returns `None` when the IDs are not valid hex or are all zeros, in
    /// which case the caller starts the span without a remote parent. An
    /// unparsable `tracestate` is dropped rather than rejecting the context.
    fn remote_span_context(ctx: &TraceContext) -> Option<SpanContext> {
        let trace_id = TraceId::from_hex(&ctx.trace_id).ok()?;
        let span_id = SpanId::from_hex(&ctx.parent_id).ok()?;
        let flags = if ctx.sampled {
            TraceFlags::SAMPLED
        } else {
            TraceFlags::default()
        };
        let trace_state = ctx
            .tracestate
            .as_deref()
            .and_then(|raw| raw.parse::<TraceState>().ok())
            .unwrap_or_default();

        let remote = SpanContext::new(trace_id, span_id, flags, true, trace_state);
        remote.is_valid().then_some(remote)
    }

    /// Request span wrapper.
    ///
    /// Pairs the live OpenTelemetry span with the IDs callers need for
    /// building outgoing headers.
    pub struct RequestSpan {
        span: opentelemetry::global::BoxedSpan,
        /// Trace ID associated with this request.
        pub trace_id: String,
        /// Span ID associated with this request.
        pub span_id: String,
    }

    impl RequestSpan {
        /// Record the HTTP response status; 5xx responses mark the span as an error.
        pub fn set_status(&mut self, status_code: u16) {
            self.span
                .set_attribute(KeyValue::new("http.status_code", i64::from(status_code)));
            if status_code >= 500 {
                self.span
                    .set_status(Status::error(format!("HTTP {}", status_code)));
            }
        }

        /// Attach an `exception` event carrying `error` and mark the span as failed.
        pub fn record_error(&mut self, error: &str) {
            self.span.add_event(
                "exception",
                vec![KeyValue::new("exception.message", error.to_string())],
            );
            self.span.set_status(Status::error(error.to_string()));
        }

        /// Record which upstream (`peer.service`) and address (`net.peer.name`) served the request.
        pub fn set_upstream(&mut self, upstream: &str, address: &str) {
            self.span
                .set_attribute(KeyValue::new("peer.service", upstream.to_string()));
            self.span
                .set_attribute(KeyValue::new("net.peer.name", address.to_string()));
        }

        /// Finish the span, consuming the wrapper.
        pub fn end(mut self) {
            self.span.end();
        }
    }
}
272
273// ============================================================================
274// Stub implementations when feature is disabled
275// ============================================================================
276
277#[cfg(not(feature = "opentelemetry"))]
278mod otel_impl {
279    use super::*;
280
281    pub struct OtelTracer;
282
283    impl OtelTracer {
284        pub fn init(_config: &TracingConfig) -> Result<Self, OtelError> {
285            warn!("OpenTelemetry feature not enabled, tracing disabled");
286            Err(OtelError::TracerInit(
287                "OpenTelemetry feature not enabled".to_string(),
288            ))
289        }
290
291        pub fn start_span(
292            &self,
293            _method: &str,
294            _path: &str,
295            trace_ctx: Option<&TraceContext>,
296        ) -> RequestSpan {
297            RequestSpan {
298                trace_id: trace_ctx
299                    .map(|c| c.trace_id.clone())
300                    .unwrap_or_else(generate_trace_id),
301                span_id: generate_span_id(),
302            }
303        }
304
305        pub fn shutdown(&self) {}
306    }
307
308    pub struct RequestSpan {
309        pub trace_id: String,
310        pub span_id: String,
311    }
312
313    impl RequestSpan {
314        pub fn set_status(&mut self, _status_code: u16) {}
315        pub fn record_error(&mut self, _error: &str) {}
316        pub fn set_upstream(&mut self, _upstream: &str, _address: &str) {}
317        pub fn end(self) {}
318    }
319}
320
321// Re-export from the appropriate module
322pub use otel_impl::{OtelTracer, RequestSpan};
323
/// Errors reported while setting up OpenTelemetry tracing.
#[derive(Debug)]
pub enum OtelError {
    /// The OTLP exporter could not be created; carries the underlying message.
    ExporterInit(String),
    /// The tracer could not be initialized; carries the reason.
    TracerInit(String),
}

impl std::fmt::Display for OtelError {
    /// Render a human-readable message that includes the carried detail.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ExporterInit(detail) => {
                write!(f, "Failed to initialize OTLP exporter: {}", detail)
            }
            Self::TracerInit(detail) => {
                write!(f, "Failed to initialize tracer: {}", detail)
            }
        }
    }
}

impl std::error::Error for OtelError {}
341
342// ============================================================================
343// Global tracer instance
344// ============================================================================
345
/// Process-wide tracer slot; can be set at most once (see `init_tracer`).
static GLOBAL_TRACER: OnceLock<Option<OtelTracer>> = OnceLock::new();
347
348/// Initialize the global tracer
349pub fn init_tracer(config: &TracingConfig) -> Result<(), OtelError> {
350    let tracer = OtelTracer::init(config)?;
351    GLOBAL_TRACER
352        .set(Some(tracer))
353        .map_err(|_| OtelError::TracerInit("Global tracer already initialized".to_string()))?;
354    Ok(())
355}
356
357/// Get the global tracer
358pub fn get_tracer() -> Option<&'static OtelTracer> {
359    GLOBAL_TRACER.get().and_then(|t| t.as_ref())
360}
361
362/// Shutdown the global tracer
363pub fn shutdown_tracer() {
364    if let Some(Some(tracer)) = GLOBAL_TRACER.get() {
365        tracer.shutdown();
366    }
367}
368
369// ============================================================================
370// Tests
371// ============================================================================
372
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample trace ID shared by the tests below.
    const SAMPLE_TRACE_ID: &str = "0af7651916cd43dd8448eb211c80319c";
    /// Sample span ID shared by the tests below.
    const SAMPLE_SPAN_ID: &str = "b7ad6b7169203331";

    /// A sampled header yields both IDs and `sampled == true`.
    #[test]
    fn test_parse_valid_traceparent() {
        let parsed = TraceContext::parse_traceparent(
            "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
        )
        .expect("well-formed traceparent should parse");

        assert_eq!(parsed.trace_id, SAMPLE_TRACE_ID);
        assert_eq!(parsed.parent_id, SAMPLE_SPAN_ID);
        assert!(parsed.sampled);
    }

    /// Flags `00` parse as not sampled.
    #[test]
    fn test_parse_unsampled_traceparent() {
        let parsed = TraceContext::parse_traceparent(
            "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-00",
        )
        .expect("well-formed traceparent should parse");

        assert!(!parsed.sampled);
    }

    /// Malformed headers are rejected.
    #[test]
    fn test_parse_invalid_traceparent() {
        let rejected = [
            ("01-abc-def-00", "invalid version"),
            ("00-abc-def", "wrong number of parts"),
            ("00-abc-b7ad6b7169203331-01", "wrong trace_id length"),
        ];

        for (header, reason) in rejected {
            assert!(
                TraceContext::parse_traceparent(header).is_none(),
                "{header:?} should be rejected ({reason})"
            );
        }
    }

    /// `to_traceparent` keeps the trace ID and flags but uses the supplied span ID.
    #[test]
    fn test_trace_context_to_traceparent() {
        let ctx = TraceContext {
            trace_id: SAMPLE_TRACE_ID.to_string(),
            parent_id: SAMPLE_SPAN_ID.to_string(),
            sampled: true,
            tracestate: None,
        };

        let rendered = ctx.to_traceparent("1234567890abcdef");

        assert_eq!(
            rendered,
            "00-0af7651916cd43dd8448eb211c80319c-1234567890abcdef-01"
        );
    }

    /// Generated trace IDs are 32 hex characters.
    #[test]
    fn test_generate_trace_id() {
        let generated = generate_trace_id();
        assert_eq!(generated.len(), 32);
        assert!(generated.chars().all(|ch| ch.is_ascii_hexdigit()));
    }

    /// Generated span IDs are 16 hex characters.
    #[test]
    fn test_generate_span_id() {
        let generated = generate_span_id();
        assert_eq!(generated.len(), 16);
        assert!(generated.chars().all(|ch| ch.is_ascii_hexdigit()));
    }

    /// The free function renders the same layout as the method.
    #[test]
    fn test_create_traceparent() {
        let rendered = create_traceparent(SAMPLE_TRACE_ID, SAMPLE_SPAN_ID, true);
        assert_eq!(
            rendered,
            "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"
        );
    }

    /// A root context gets IDs of the right lengths and keeps the sampled flag.
    #[test]
    fn test_new_root_trace_context() {
        let root = TraceContext::new_root(true);
        assert_eq!(root.trace_id.len(), 32);
        assert_eq!(root.parent_id.len(), 16);
        assert!(root.sampled);
    }
}