Skip to main content

talos_api_rs/runtime/
tracing.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2
3//! OpenTelemetry tracing integration for the Talos API client.
4//!
5//! This module provides OpenTelemetry-compatible distributed tracing for
6//! monitoring gRPC requests to Talos nodes.
7//!
8//! # Overview
9//!
10//! The tracing integration supports:
11//! - Span creation for each gRPC request
12//! - W3C Trace Context propagation
13//! - Request/response attributes
14//! - Error tracking
15//!
16//! # Usage with `tracing` Crate
17//!
18//! This module integrates with the standard Rust `tracing` ecosystem.
19//! To use OpenTelemetry, configure an OpenTelemetry subscriber:
20//!
21//! ```rust,ignore
22//! use tracing_subscriber::prelude::*;
23//! use opentelemetry::sdk::trace::TracerProvider;
24//! use opentelemetry_otlp::WithExportConfig;
25//!
26//! // Configure OTLP exporter
27//! let exporter = opentelemetry_otlp::new_exporter()
28//!     .tonic()
29//!     .with_endpoint("http://localhost:4317");
30//!
31//! let provider = TracerProvider::builder()
32//!     .with_batch_exporter(exporter.build_span_exporter()?)
33//!     .build();
34//!
35//! opentelemetry::global::set_tracer_provider(provider);
36//!
37//! // Configure tracing subscriber with OpenTelemetry layer
38//! let otel_layer = tracing_opentelemetry::layer()
39//!     .with_tracer(opentelemetry::global::tracer("talos-client"));
40//!
41//! tracing_subscriber::registry()
42//!     .with(otel_layer)
43//!     .with(tracing_subscriber::fmt::layer())
44//!     .init();
45//! ```
46//!
47//! # Example: Manual Span Creation
48//!
49//! ```rust
50//! use talos_api_rs::runtime::tracing::TalosSpan;
51//! use std::time::Duration;
52//!
53//! // Create a span for a Talos API call
54//! let span = TalosSpan::new("Version", "10.0.0.1:50000");
55//!
56//! // Record success
57//! span.record_success(Duration::from_millis(42));
58//!
59//! // Or record error
60//! // span.record_error("Connection refused");
61//! ```
62//!
63//! # Semantic Conventions
64//!
65//! Spans follow OpenTelemetry semantic conventions:
66//!
67//! | Attribute | Description |
68//! |-----------|-------------|
69//! | `rpc.system` | Always "grpc" |
70//! | `rpc.service` | Talos service name (e.g., "machine.MachineService") |
71//! | `rpc.method` | Method name (e.g., "Version") |
72//! | `server.address` | Target endpoint |
73//! | `rpc.grpc.status_code` | gRPC status code |
74//!
75//! # Note on Dependencies
76//!
77//! This module provides helpers for creating properly-attributed spans.
78//! To actually export traces to an OpenTelemetry backend, you need to:
79//!
80//! 1. Add `opentelemetry` and `tracing-opentelemetry` to your dependencies
81//! 2. Configure an OTLP exporter (e.g., `opentelemetry-otlp`)
82//! 3. Set up a tracing subscriber with the OpenTelemetry layer
83//!
84//! The library itself only depends on `tracing`, keeping the dependency
85//! footprint minimal for users who don't need distributed tracing.
86
87use std::time::{Duration, Instant};
88use tracing::{field, info_span, Span};
89
/// Configuration for OpenTelemetry tracing.
///
/// A plain data holder: construct it directly, through [`Default`], or via
/// `TracingConfig::builder()`. Values can be compared with `==`, which makes
/// it straightforward to assert on a built configuration.
///
/// NOTE(review): within this module the payload-related fields are only
/// stored and returned; no code here reads them to decide what gets recorded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TracingConfig {
    /// Service name for traces.
    pub service_name: String,
    /// Whether to record request payloads (may contain sensitive data).
    pub record_payloads: bool,
    /// Whether to record response payloads.
    pub record_responses: bool,
    /// Maximum payload size to record (in bytes).
    pub max_payload_size: usize,
}
102
103impl Default for TracingConfig {
104    fn default() -> Self {
105        Self {
106            service_name: "talos-client".to_string(),
107            record_payloads: false,
108            record_responses: false,
109            max_payload_size: 4096,
110        }
111    }
112}
113
114impl TracingConfig {
115    /// Create a new builder for `TracingConfig`.
116    pub fn builder() -> TracingConfigBuilder {
117        TracingConfigBuilder::default()
118    }
119}
120
/// Builder for `TracingConfig`.
///
/// Every field starts as `None` (via the derived `Default`) and becomes
/// `Some(..)` once the matching setter is called. `build` substitutes the
/// value from `TracingConfig::default()` for anything still unset.
#[derive(Debug, Default)]
pub struct TracingConfigBuilder {
    /// Override for `TracingConfig::service_name`, if one was supplied.
    service_name: Option<String>,
    /// Override for `TracingConfig::record_payloads`, if one was supplied.
    record_payloads: Option<bool>,
    /// Override for `TracingConfig::record_responses`, if one was supplied.
    record_responses: Option<bool>,
    /// Override for `TracingConfig::max_payload_size`, if one was supplied.
    max_payload_size: Option<usize>,
}
129
130impl TracingConfigBuilder {
131    /// Set the service name for traces.
132    pub fn service_name(mut self, name: impl Into<String>) -> Self {
133        self.service_name = Some(name.into());
134        self
135    }
136
137    /// Enable recording of request payloads.
138    pub fn record_payloads(mut self, enabled: bool) -> Self {
139        self.record_payloads = Some(enabled);
140        self
141    }
142
143    /// Enable recording of response payloads.
144    pub fn record_responses(mut self, enabled: bool) -> Self {
145        self.record_responses = Some(enabled);
146        self
147    }
148
149    /// Set maximum payload size to record.
150    pub fn max_payload_size(mut self, size: usize) -> Self {
151        self.max_payload_size = Some(size);
152        self
153    }
154
155    /// Build the `TracingConfig`.
156    pub fn build(self) -> TracingConfig {
157        let default = TracingConfig::default();
158        TracingConfig {
159            service_name: self.service_name.unwrap_or(default.service_name),
160            record_payloads: self.record_payloads.unwrap_or(default.record_payloads),
161            record_responses: self.record_responses.unwrap_or(default.record_responses),
162            max_payload_size: self.max_payload_size.unwrap_or(default.max_payload_size),
163        }
164    }
165}
166
/// A span for a Talos API call with OpenTelemetry attributes.
///
/// Bundles the underlying `tracing::Span` with the instant it was created and
/// the method/endpoint strings it was created for.
#[derive(Debug)]
pub struct TalosSpan {
    /// Underlying `tracing` span on which the outcome fields are recorded.
    span: Span,
    /// Instant captured at construction; the basis for `elapsed`.
    start: Instant,
    /// Method name supplied at construction.
    method: String,
    /// Endpoint supplied at construction.
    endpoint: String,
}
175
176impl TalosSpan {
177    /// Create a new span for a Talos API call.
178    ///
179    /// # Example
180    ///
181    /// ```rust
182    /// use talos_api_rs::runtime::tracing::TalosSpan;
183    ///
184    /// let span = TalosSpan::new("Version", "10.0.0.1:50000");
185    /// ```
186    pub fn new(method: &str, endpoint: &str) -> Self {
187        let span = info_span!(
188            "talos.grpc",
189            rpc.system = "grpc",
190            rpc.service = "talos.machine.MachineService",
191            rpc.method = %method,
192            server.address = %endpoint,
193            rpc.grpc.status_code = field::Empty,
194            otel.status_code = field::Empty,
195            error.message = field::Empty,
196            duration_ms = field::Empty,
197        );
198
199        Self {
200            span,
201            start: Instant::now(),
202            method: method.to_string(),
203            endpoint: endpoint.to_string(),
204        }
205    }
206
207    /// Create a span with a specific service name.
208    pub fn with_service(method: &str, service: &str, endpoint: &str) -> Self {
209        let span = info_span!(
210            "talos.grpc",
211            rpc.system = "grpc",
212            rpc.service = %service,
213            rpc.method = %method,
214            server.address = %endpoint,
215            rpc.grpc.status_code = field::Empty,
216            otel.status_code = field::Empty,
217            error.message = field::Empty,
218            duration_ms = field::Empty,
219        );
220
221        Self {
222            span,
223            start: Instant::now(),
224            method: method.to_string(),
225            endpoint: endpoint.to_string(),
226        }
227    }
228
229    /// Get the underlying `tracing::Span`.
230    pub fn span(&self) -> &Span {
231        &self.span
232    }
233
234    /// Get the method name.
235    pub fn method(&self) -> &str {
236        &self.method
237    }
238
239    /// Get the endpoint.
240    pub fn endpoint(&self) -> &str {
241        &self.endpoint
242    }
243
244    /// Get elapsed time since span creation.
245    pub fn elapsed(&self) -> Duration {
246        self.start.elapsed()
247    }
248
249    /// Record a successful response.
250    pub fn record_success(&self, duration: Duration) {
251        self.span.record("rpc.grpc.status_code", 0i64); // OK
252        self.span.record("otel.status_code", "OK");
253        self.span.record("duration_ms", duration.as_millis() as i64);
254    }
255
256    /// Record an error response.
257    pub fn record_error(&self, error: &str) {
258        let duration = self.start.elapsed();
259        self.span.record("rpc.grpc.status_code", 2i64); // UNKNOWN
260        self.span.record("otel.status_code", "ERROR");
261        self.span.record("error.message", error);
262        self.span.record("duration_ms", duration.as_millis() as i64);
263    }
264
265    /// Record a gRPC status code.
266    pub fn record_grpc_status(&self, code: i32) {
267        self.span.record("rpc.grpc.status_code", code as i64);
268        let status = if code == 0 { "OK" } else { "ERROR" };
269        self.span.record("otel.status_code", status);
270        self.span
271            .record("duration_ms", self.start.elapsed().as_millis() as i64);
272    }
273
274    /// Enter the span context for async work.
275    pub fn enter(&self) -> tracing::span::Entered<'_> {
276        self.span.enter()
277    }
278}
279
/// Run a fallible expression inside a `TalosSpan` and record its outcome.
///
/// The expansion creates a `TalosSpan` for `$method` / `$endpoint`, enters it,
/// evaluates `$body`, and then calls `record_success` with the measured
/// duration on `Ok(_)` or `record_error` with the error's `Display` text on
/// `Err(_)`. The value of `$body` is returned unchanged.
///
/// `$body` must evaluate to a `Result` whose error type implements
/// `std::fmt::Display`.
///
/// # Async caveat
///
/// The guard returned by `TalosSpan::enter` stays alive for the whole
/// evaluation of `$body`. If `$body` contains an `.await`, the guard is held
/// across that suspension point, which the `tracing` documentation for
/// `Span::enter` warns against. For async calls, prefer creating a
/// `TalosSpan`, instrumenting the future with `tracing::Instrument` using
/// `span.span().clone()`, and calling `record_success` / `record_error`
/// after the future completes.
///
/// # Example
///
/// ```rust,ignore
/// use talos_api_rs::instrument_talos;
///
/// fn load_version(endpoint: &str) -> Result<Version, Error> {
///     instrument_talos!("Version", endpoint, {
///         // Any synchronous expression producing a `Result`.
///         fetch_version_blocking(endpoint)
///     })
/// }
/// ```
#[macro_export]
macro_rules! instrument_talos {
    ($method:expr, $endpoint:expr, $body:expr) => {{
        let span = $crate::runtime::tracing::TalosSpan::new($method, $endpoint);
        let _guard = span.enter();
        // Leading `::` keeps the path pointing at the real `std` even if the
        // call site has a local item named `std`.
        let start = ::std::time::Instant::now();
        let result = $body;
        let duration = start.elapsed();
        match &result {
            Ok(_) => span.record_success(duration),
            Err(e) => span.record_error(&e.to_string()),
        }
        result
    }};
}
308
/// Span factory for creating consistent spans across the client.
///
/// Holds a `TracingConfig` and hands out `TalosSpan` values with preset
/// service names.
#[derive(Debug, Clone)]
pub struct SpanFactory {
    /// Configuration supplied at construction; exposed through `config()`.
    config: TracingConfig,
}
314
315impl SpanFactory {
316    /// Create a new span factory with the given configuration.
317    pub fn new(config: TracingConfig) -> Self {
318        Self { config }
319    }
320
321    /// Create a span for a Talos API call.
322    pub fn create_span(&self, method: &str, endpoint: &str) -> TalosSpan {
323        TalosSpan::with_service(method, "talos.machine.MachineService", endpoint)
324    }
325
326    /// Create a span for an etcd API call.
327    pub fn create_etcd_span(&self, method: &str, endpoint: &str) -> TalosSpan {
328        TalosSpan::with_service(method, "talos.machine.MachineService/Etcd", endpoint)
329    }
330
331    /// Get the configuration.
332    pub fn config(&self) -> &TracingConfig {
333        &self.config
334    }
335}
336
337impl Default for SpanFactory {
338    fn default() -> Self {
339        Self::new(TracingConfig::default())
340    }
341}
342
#[cfg(test)]
mod tests {
    use super::*;

    /// Endpoint string shared by the span tests.
    const NODE: &str = "10.0.0.1:50000";

    // `TracingConfig::default()` exposes the documented baseline values.
    #[test]
    fn test_tracing_config_default() {
        let TracingConfig {
            service_name,
            record_payloads,
            record_responses,
            max_payload_size,
        } = TracingConfig::default();

        assert_eq!(service_name, "talos-client");
        assert!(!record_payloads);
        assert!(!record_responses);
        assert_eq!(max_payload_size, 4096);
    }

    // Every builder setter is reflected in the built configuration.
    #[test]
    fn test_tracing_config_builder() {
        let built = TracingConfig::builder()
            .service_name("my-service")
            .record_payloads(true)
            .record_responses(true)
            .max_payload_size(8192)
            .build();

        assert_eq!(built.service_name, "my-service");
        assert!(built.record_payloads);
        assert!(built.record_responses);
        assert_eq!(built.max_payload_size, 8192);
    }

    // `new` stores the method and endpoint it was given.
    #[test]
    fn test_talos_span_new() {
        let talos_span = TalosSpan::new("Version", NODE);
        assert_eq!(talos_span.method(), "Version");
        assert_eq!(talos_span.endpoint(), "10.0.0.1:50000");
    }

    // `with_service` stores the method it was given.
    #[test]
    fn test_talos_span_with_service() {
        let service = "talos.machine.MachineService/Etcd";
        let talos_span = TalosSpan::with_service("EtcdMemberList", service, NODE);
        assert_eq!(talos_span.method(), "EtcdMemberList");
    }

    // Smoke test: recording success must not panic.
    #[test]
    fn test_talos_span_record_success() {
        let took = Duration::from_millis(42);
        TalosSpan::new("Version", NODE).record_success(took);
    }

    // Smoke test: recording an error must not panic.
    #[test]
    fn test_talos_span_record_error() {
        TalosSpan::new("Version", NODE).record_error("Connection refused");
    }

    // Smoke test: recording OK (0) and then UNAVAILABLE (14) must not panic.
    #[test]
    fn test_talos_span_record_grpc_status() {
        let talos_span = TalosSpan::new("Version", NODE);
        for code in [0, 14] {
            talos_span.record_grpc_status(code);
        }
    }

    // The factory hands back the configuration it was constructed with.
    #[test]
    fn test_span_factory_new() {
        let cfg = TracingConfig::builder()
            .service_name("test-service")
            .build();
        let span_factory = SpanFactory::new(cfg);
        assert_eq!(span_factory.config().service_name, "test-service");
    }

    // Factory-created spans keep the requested method name.
    #[test]
    fn test_span_factory_create_span() {
        let talos_span = SpanFactory::default().create_span("Version", NODE);
        assert_eq!(talos_span.method(), "Version");
    }

    // Factory-created etcd spans keep the requested method name.
    #[test]
    fn test_span_factory_create_etcd_span() {
        let talos_span = SpanFactory::default().create_etcd_span("EtcdMemberList", NODE);
        assert_eq!(talos_span.method(), "EtcdMemberList");
    }

    // `elapsed` reports at least as much time as was slept after creation.
    #[test]
    fn test_talos_span_elapsed() {
        let pause = Duration::from_millis(10);
        let talos_span = TalosSpan::new("Version", NODE);
        std::thread::sleep(pause);
        assert!(talos_span.elapsed() >= pause);
    }
}