Skip to main content

zeph_config/
telemetry.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashMap;
5use std::path::PathBuf;
6
7use serde::{Deserialize, Serialize};
8
/// Serde default for `TelemetryConfig::trace_dir`: the relative path `.local/traces`.
fn default_trace_dir() -> PathBuf {
    ".local/traces".into()
}
12
/// Serde default for `TelemetryConfig::include_args`: argument capture is off unless the
/// user opts in.
fn default_include_args() -> bool {
    const INCLUDE_ARGS_BY_DEFAULT: bool = false;
    INCLUDE_ARGS_BY_DEFAULT
}
16
/// Serde default for `TelemetryConfig::service_name`: `"zeph-agent"`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
20
/// Serde default for `TelemetryConfig::sample_rate`: `1.0`, i.e. record every trace.
fn default_sample_rate() -> f64 {
    const RECORD_ALL: f64 = 1.0;
    RECORD_ALL
}
24
/// Serde default for `TelemetryConfig::system_metrics_interval_secs`: `5` seconds.
fn default_system_metrics_interval_secs() -> u64 {
    const DEFAULT_INTERVAL_SECS: u64 = 5;
    DEFAULT_INTERVAL_SECS
}
28
29/// Selects the tracing backend used when `[telemetry] enabled = true`.
30///
31/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
32/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
33///   feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
34/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
35///   feature).
36#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
37#[serde(rename_all = "lowercase")]
38pub enum TelemetryBackend {
39    /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
40    #[default]
41    Local,
42    /// Export spans via OTLP gRPC to an OpenTelemetry collector.
43    Otlp,
44    /// Push continuous CPU/memory profiles to a Pyroscope server.
45    Pyroscope,
46}
47
48/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
49///
50/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
51/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
52/// according to the selected [`TelemetryBackend`].
53///
54/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
55/// instrumentation points are compiled out via `cfg_attr`.
56///
57/// # Example (TOML)
58///
59/// ```toml
60/// [telemetry]
61/// enabled = true
62/// backend = "local"
63/// trace_dir = ".local/traces"
64/// include_args = false
65/// service_name = "my-zeph"
66/// sample_rate = 0.1
67/// ```
68#[derive(Debug, Clone, Deserialize, Serialize)]
69pub struct TelemetryConfig {
70    /// Enable tracing instrumentation. Default: `false`.
71    #[serde(default)]
72    pub enabled: bool,
73    /// Backend to use for trace export. Default: `local`.
74    #[serde(default)]
75    pub backend: TelemetryBackend,
76    /// Directory for Chrome JSON trace files (used when `backend = "local"`).
77    /// Default: `".local/traces"`.
78    #[serde(default = "default_trace_dir")]
79    pub trace_dir: PathBuf,
80    /// Include function arguments as span attributes in Chrome JSON traces.
81    ///
82    /// **Default: false.** Keep disabled in production — span field values are visible
83    /// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
84    /// messages may appear as span attributes if enabled.
85    ///
86    /// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
87    #[serde(default = "default_include_args")]
88    pub include_args: bool,
89    /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
90    /// Default: `"http://localhost:4317"` when unset.
91    #[serde(default, skip_serializing_if = "Option::is_none")]
92    pub otlp_endpoint: Option<String>,
93    /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
94    /// When set, the value is resolved from the age vault at startup and passed as
95    /// `Authorization` or custom headers to the collector.
96    #[serde(default, skip_serializing_if = "Option::is_none")]
97    pub otlp_headers_vault_key: Option<String>,
98    /// Pyroscope server URL (used when `backend = "pyroscope"`).
99    #[serde(default, skip_serializing_if = "Option::is_none")]
100    pub pyroscope_endpoint: Option<String>,
101    /// Service name reported in trace metadata. Default: `"zeph-agent"`.
102    #[serde(default = "default_service_name")]
103    pub service_name: String,
104    /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
105    /// Applies only to the `otlp` backend; the `local` backend always records all spans.
106    /// Default: `1.0`.
107    ///
108    /// # Warning
109    ///
110    /// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
111    /// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
112    /// storage but provides **no protection** against CPU or RAM spikes caused by high span
113    /// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
114    /// applied before spans are created) to prevent the OTLP feedback loop.
115    #[serde(default = "default_sample_rate")]
116    pub sample_rate: f64,
117    /// Optional base filter directive for the OTLP tracing layer.
118    ///
119    /// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
120    /// When unset, defaults to `"info"`.
121    ///
122    /// # Hardcoded transport exclusions
123    ///
124    /// The following exclusions are **always appended** after the user-supplied value, regardless
125    /// of what is set here:
126    ///
127    /// ```text
128    /// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
129    /// ```
130    ///
131    /// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
132    /// precedence over any conflicting directive in this field. For example, setting
133    /// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
134    /// the hardcoded exclusion appears later in the filter string. This is intentional:
135    /// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
136    /// capture its own network activity, creating a feedback loop.
137    ///
138    /// # Note on `sample_rate`
139    ///
140    /// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
141    /// runs **after** span creation. Setting `sample_rate < 1.0` reduces Jaeger storage but
142    /// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
143    /// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
144    /// the feedback loop.
145    #[serde(default, skip_serializing_if = "Option::is_none")]
146    pub otel_filter: Option<String>,
147    /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
148    #[serde(default = "default_system_metrics_interval_secs")]
149    pub system_metrics_interval_secs: u64,
150    /// User-defined key/value pairs attached as OpenTelemetry resource attributes.
151    ///
152    /// These appear on every span exported via OTLP and in Chrome JSON trace
153    /// `resourceSpans[].resource.attributes`. Useful for tagging traces with deployment
154    /// environment, team, git revision, etc.
155    ///
156    /// Keys follow the [OpenTelemetry attribute naming convention](https://opentelemetry.io/docs/specs/semconv/general/attribute-naming/)
157    /// (dot-separated, lowercase). The reserved key `service.name` is silently ignored —
158    /// the `service_name` config field takes precedence.
159    ///
160    /// Values appear in plaintext in exported traces. The `RedactingSpanProcessor` does
161    /// **not** scrub resource attributes (they are set once at init, not per-span). Do not
162    /// store secrets here.
163    ///
164    /// # Example (TOML)
165    ///
166    /// ```toml
167    /// [telemetry.trace_metadata]
168    /// "deployment.environment" = "staging"
169    /// "team.name" = "platform"
170    /// "vcs.revision" = "abc1234"
171    /// ```
172    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
173    pub trace_metadata: HashMap<String, String>,
174}
175
176impl Default for TelemetryConfig {
177    fn default() -> Self {
178        Self {
179            enabled: false,
180            backend: TelemetryBackend::default(),
181            trace_dir: default_trace_dir(),
182            include_args: default_include_args(),
183            otlp_endpoint: None,
184            otlp_headers_vault_key: None,
185            pyroscope_endpoint: None,
186            service_name: default_service_name(),
187            sample_rate: default_sample_rate(),
188            otel_filter: None,
189            system_metrics_interval_secs: default_system_metrics_interval_secs(),
190            trace_metadata: HashMap::new(),
191        }
192    }
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// `TelemetryConfig::default()` must produce the documented default for every field.
    #[test]
    fn telemetry_config_defaults() {
        let cfg = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Explicit values survive parsing and a serialize → deserialize round trip.
    #[test]
    fn telemetry_config_serde_roundtrip() {
        let input = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(input).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        // The input sets `sample_rate`, so verify it was actually picked up.
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);

        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert!(cfg2.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg2.include_args);
        assert_eq!(cfg2.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - 0.5).abs() < f64::EPSILON);
    }

    /// An empty table exercises every `#[serde(default ...)]` attribute at once.
    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
        assert!(cfg.trace_metadata.is_empty());
    }

    /// Quoted dotted keys under `[trace_metadata]` parse into the map and round-trip.
    #[test]
    fn trace_metadata_parses_and_roundtrips() {
        let input = r#"
            enabled = true
            backend = "otlp"
            service_name = "my-agent"

            [trace_metadata]
            "deployment.environment" = "staging"
            "team.name" = "platform"
        "#;
        let cfg: TelemetryConfig = toml::from_str(input).unwrap();
        assert_eq!(cfg.trace_metadata.len(), 2);
        assert_eq!(
            cfg.trace_metadata
                .get("deployment.environment")
                .map(String::as_str),
            Some("staging")
        );
        assert_eq!(
            cfg.trace_metadata.get("team.name").map(String::as_str),
            Some("platform")
        );

        // Roundtrip: serialize then deserialize preserves values.
        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.trace_metadata, cfg.trace_metadata);
    }

    /// The default configuration carries no resource attributes.
    #[test]
    fn trace_metadata_empty_by_default() {
        let cfg = TelemetryConfig::default();
        assert!(cfg.trace_metadata.is_empty());
    }

    /// `skip_serializing_if = "HashMap::is_empty"` keeps an empty map out of the output.
    #[test]
    fn trace_metadata_omitted_when_empty_on_serialize() {
        let cfg = TelemetryConfig::default();
        let serialized = toml::to_string(&cfg).unwrap();
        assert!(
            !serialized.contains("trace_metadata"),
            "empty trace_metadata must be omitted from serialized TOML"
        );
    }
}