Skip to main content

zeph_config/
telemetry.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashMap;
5use std::path::PathBuf;
6
7use serde::{Deserialize, Serialize};
8
/// Serde default for `TelemetryConfig::trace_dir`: the relative path `.local/traces`.
fn default_trace_dir() -> PathBuf {
    ".local/traces".into()
}
12
/// Serde default for `TelemetryConfig::include_args`: argument capture stays off
/// unless the configuration turns it on explicitly.
fn default_include_args() -> bool {
    false
}
16
/// Serde default for `TelemetryConfig::service_name`: the literal `zeph-agent`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
20
/// Serde default for `TelemetryConfig::sample_rate`: `1.0`, i.e. record every trace.
fn default_sample_rate() -> f64 {
    1.0_f64
}
24
/// Serde default for `TelemetryConfig::system_metrics_interval_secs`: `5` seconds.
fn default_system_metrics_interval_secs() -> u64 {
    5_u64
}
28
29/// Selects the tracing backend used when `[telemetry] enabled = true`.
30///
31/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
32/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
33///   feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
34/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
35///   feature).
36#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
37#[serde(rename_all = "lowercase")]
38#[non_exhaustive]
39pub enum TelemetryBackend {
40    /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
41    #[default]
42    Local,
43    /// Export spans via OTLP gRPC to an OpenTelemetry collector.
44    Otlp,
45    /// Push continuous CPU/memory profiles to a Pyroscope server.
46    Pyroscope,
47}
48
49/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
50///
51/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
52/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
53/// according to the selected [`TelemetryBackend`].
54///
55/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
56/// instrumentation points are compiled out via `cfg_attr`.
57///
58/// # Example (TOML)
59///
60/// ```toml
61/// [telemetry]
62/// enabled = true
63/// backend = "local"
64/// trace_dir = ".local/traces"
65/// include_args = false
66/// service_name = "my-zeph"
67/// sample_rate = 0.1
68/// ```
69#[derive(Debug, Clone, Deserialize, Serialize)]
70pub struct TelemetryConfig {
71    /// Enable tracing instrumentation. Default: `false`.
72    #[serde(default)]
73    pub enabled: bool,
74    /// Backend to use for trace export. Default: `local`.
75    #[serde(default)]
76    pub backend: TelemetryBackend,
77    /// Directory for Chrome JSON trace files (used when `backend = "local"`).
78    /// Default: `".local/traces"`.
79    #[serde(default = "default_trace_dir")]
80    pub trace_dir: PathBuf,
81    /// Include function arguments as span attributes in Chrome JSON traces.
82    ///
83    /// **Default: false.** Keep disabled in production — span field values are visible
84    /// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
85    /// messages may appear as span attributes if enabled.
86    ///
87    /// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
88    #[serde(default = "default_include_args")]
89    pub include_args: bool,
90    /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
91    /// Default: `"http://localhost:4317"` when unset.
92    #[serde(default, skip_serializing_if = "Option::is_none")]
93    pub otlp_endpoint: Option<String>,
94    /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
95    /// When set, the value is resolved from the age vault at startup and passed as
96    /// `Authorization` or custom headers to the collector.
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    pub otlp_headers_vault_key: Option<String>,
99    /// Pyroscope server URL (used when `backend = "pyroscope"`).
100    #[serde(default, skip_serializing_if = "Option::is_none")]
101    pub pyroscope_endpoint: Option<String>,
102    /// Service name reported in trace metadata. Default: `"zeph-agent"`.
103    #[serde(default = "default_service_name")]
104    pub service_name: String,
105    /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
106    /// Applies only to the `otlp` backend; the `local` backend always records all spans.
107    /// Default: `1.0`.
108    ///
109    /// # Warning
110    ///
111    /// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
112    /// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
113    /// storage but provides **no protection** against CPU or RAM spikes caused by high span
114    /// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
115    /// applied before spans are created) to prevent the OTLP feedback loop.
116    #[serde(default = "default_sample_rate")]
117    pub sample_rate: f64,
118    /// Optional base filter directive for the OTLP tracing layer.
119    ///
120    /// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
121    /// When unset, defaults to `"info"`.
122    ///
123    /// # Hardcoded transport exclusions
124    ///
125    /// The following exclusions are **always appended** after the user-supplied value, regardless
126    /// of what is set here:
127    ///
128    /// ```text
129    /// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
130    /// ```
131    ///
132    /// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
133    /// precedence over any conflicting directive in this field. For example, setting
134    /// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
135    /// the hardcoded exclusion appears later in the filter string. This is intentional:
136    /// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
137    /// capture its own network activity, creating a feedback loop.
138    ///
139    /// # Note on `sample_rate`
140    ///
141    /// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
142    /// runs **after** span creation. Setting `sample_rate < 1.0` reduces Jaeger storage but
143    /// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
144    /// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
145    /// the feedback loop.
146    #[serde(default, skip_serializing_if = "Option::is_none")]
147    pub otel_filter: Option<String>,
148    /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
149    #[serde(default = "default_system_metrics_interval_secs")]
150    pub system_metrics_interval_secs: u64,
151    /// User-defined key/value pairs attached as OpenTelemetry resource attributes.
152    ///
153    /// These appear on every span exported via OTLP and in Chrome JSON trace
154    /// `resourceSpans[].resource.attributes`. Useful for tagging traces with deployment
155    /// environment, team, git revision, etc.
156    ///
157    /// Keys follow the [OpenTelemetry attribute naming convention](https://opentelemetry.io/docs/specs/semconv/general/attribute-naming/)
158    /// (dot-separated, lowercase). The reserved key `service.name` is silently ignored —
159    /// the `service_name` config field takes precedence.
160    ///
161    /// Values appear in plaintext in exported traces. The `RedactingSpanProcessor` does
162    /// **not** scrub resource attributes (they are set once at init, not per-span). Do not
163    /// store secrets here.
164    ///
165    /// # Example (TOML)
166    ///
167    /// ```toml
168    /// [telemetry.trace_metadata]
169    /// "deployment.environment" = "staging"
170    /// "team.name" = "platform"
171    /// "vcs.revision" = "abc1234"
172    /// ```
173    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
174    pub trace_metadata: HashMap<String, String>,
175}
176
177impl Default for TelemetryConfig {
178    fn default() -> Self {
179        Self {
180            enabled: false,
181            backend: TelemetryBackend::default(),
182            trace_dir: default_trace_dir(),
183            include_args: default_include_args(),
184            otlp_endpoint: None,
185            otlp_headers_vault_key: None,
186            pyroscope_endpoint: None,
187            service_name: default_service_name(),
188            sample_rate: default_sample_rate(),
189            otel_filter: None,
190            system_metrics_interval_secs: default_system_metrics_interval_secs(),
191            trace_metadata: HashMap::new(),
192        }
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;

    /// `TelemetryConfig::default()` must yield the documented default for every field.
    #[test]
    fn telemetry_config_defaults() {
        let cfg = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Explicit values must survive parsing and a serialize/deserialize round trip.
    #[test]
    fn telemetry_config_serde_roundtrip() {
        let toml = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        // The input sets a non-default sample rate; make sure it is actually parsed.
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);

        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.enabled, cfg.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, cfg.trace_dir);
        assert_eq!(cfg2.include_args, cfg.include_args);
        assert_eq!(cfg2.otlp_endpoint, cfg.otlp_endpoint);
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - 0.5).abs() < f64::EPSILON);
    }

    /// An empty document must deserialize, with every `default = "..."` helper applied.
    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        // Fields wired through `#[serde(default = "fn")]` must pick up the helper values.
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Quoted, dotted keys under `[trace_metadata]` parse into the map and round-trip.
    #[test]
    fn trace_metadata_parses_and_roundtrips() {
        let toml = r#"
            enabled = true
            backend = "otlp"
            service_name = "my-agent"

            [trace_metadata]
            "deployment.environment" = "staging"
            "team.name" = "platform"
        "#;
        let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
        assert_eq!(
            cfg.trace_metadata
                .get("deployment.environment")
                .map(String::as_str),
            Some("staging")
        );
        assert_eq!(
            cfg.trace_metadata.get("team.name").map(String::as_str),
            Some("platform")
        );

        // Roundtrip: serialize then deserialize preserves values.
        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.trace_metadata, cfg.trace_metadata);
    }

    /// The default configuration carries no trace metadata.
    #[test]
    fn trace_metadata_empty_by_default() {
        let cfg = TelemetryConfig::default();
        assert!(cfg.trace_metadata.is_empty());
    }

    /// `skip_serializing_if = "HashMap::is_empty"` must keep the key out of the output.
    #[test]
    fn trace_metadata_omitted_when_empty_on_serialize() {
        let cfg = TelemetryConfig::default();
        let serialized = toml::to_string(&cfg).unwrap();
        assert!(
            !serialized.contains("trace_metadata"),
            "empty trace_metadata must be omitted from serialized TOML"
        );
    }
}