zeph_config/telemetry.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashMap;
5use std::path::PathBuf;
6
7use serde::{Deserialize, Serialize};
8
/// Serde default for `TelemetryConfig::trace_dir`: the relative path `.local/traces`.
fn default_trace_dir() -> PathBuf {
    // `PathBuf` implements `From<&str>`, so the return type drives the conversion.
    ".local/traces".into()
}
12
/// Serde default for `TelemetryConfig::include_args`: argument capture stays off unless
/// the configuration turns it on explicitly.
fn default_include_args() -> bool {
    const CAPTURE_ARGS_BY_DEFAULT: bool = false;
    CAPTURE_ARGS_BY_DEFAULT
}
16
/// Serde default for `TelemetryConfig::service_name`: the literal `"zeph-agent"`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
20
/// Serde default for `TelemetryConfig::sample_rate`: `1.0`, i.e. every trace is recorded.
fn default_sample_rate() -> f64 {
    const RECORD_ALL: f64 = 1.0;
    RECORD_ALL
}
24
/// Serde default for `TelemetryConfig::system_metrics_interval_secs`: five seconds.
fn default_system_metrics_interval_secs() -> u64 {
    const INTERVAL_SECS: u64 = 5;
    INTERVAL_SECS
}
28
/// Selects the tracing backend used when `[telemetry] enabled = true`.
///
/// Variants are (de)serialized by their lowercase names: `"local"`, `"otlp"` and
/// `"pyroscope"`. The enum is `#[non_exhaustive]`, so downstream `match` expressions
/// need a wildcard arm.
///
/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk. This is the default.
/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
///   feature). Sends to `otlp_endpoint`, falling back to `"http://localhost:4317"` when
///   that field is unset.
/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
///   feature).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum TelemetryBackend {
    /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
    #[default]
    Local,
    /// Export spans via OTLP gRPC to an OpenTelemetry collector.
    Otlp,
    /// Push continuous CPU/memory profiles to a Pyroscope server.
    Pyroscope,
}
48
/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
///
/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
/// according to the selected [`TelemetryBackend`].
///
/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
/// instrumentation points are compiled out via `cfg_attr`.
///
/// Every field has a serde default, so an empty table deserializes successfully. Optional
/// fields that are unset and an empty `trace_metadata` map are omitted when serializing.
///
/// # Example (TOML)
///
/// ```toml
/// [telemetry]
/// enabled = true
/// backend = "local"
/// trace_dir = ".local/traces"
/// include_args = false
/// service_name = "my-zeph"
/// sample_rate = 0.1  # only honored by the "otlp" backend
/// ```
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TelemetryConfig {
    /// Enable tracing instrumentation. Default: `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Backend to use for trace export. Default: `local`.
    #[serde(default)]
    pub backend: TelemetryBackend,
    /// Directory for Chrome JSON trace files (used when `backend = "local"`).
    /// Default: `".local/traces"`.
    #[serde(default = "default_trace_dir")]
    pub trace_dir: PathBuf,
    /// Include function arguments as span attributes in Chrome JSON traces.
    ///
    /// **Default: false.** Keep disabled in production — span field values are visible
    /// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
    /// messages may appear as span attributes if enabled.
    ///
    /// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
    // NOTE(review): the two paragraphs above appear to disagree on whether the OTLP layer
    // is affected by this flag — confirm against the subscriber setup and reconcile.
    #[serde(default = "default_include_args")]
    pub include_args: bool,
    /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
    /// Default: `"http://localhost:4317"` when unset.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub otlp_endpoint: Option<String>,
    /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
    /// When set, the value is resolved from the age vault at startup and passed as
    /// `Authorization` or custom headers to the collector.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub otlp_headers_vault_key: Option<String>,
    /// Pyroscope server URL (used when `backend = "pyroscope"`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pyroscope_endpoint: Option<String>,
    /// Service name reported in trace metadata. Default: `"zeph-agent"`.
    #[serde(default = "default_service_name")]
    pub service_name: String,
    /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
    /// Applies only to the `otlp` backend; the `local` backend always records all spans.
    /// Default: `1.0`.
    ///
    /// The value is stored exactly as configured: this type's definition performs no range
    /// check, so consumers must be prepared for values outside `0.0..=1.0`.
    ///
    /// # Warning
    ///
    /// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
    /// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
    /// storage but provides **no protection** against CPU or RAM spikes caused by high span
    /// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
    /// applied before spans are created) to prevent the OTLP feedback loop.
    #[serde(default = "default_sample_rate")]
    pub sample_rate: f64,
    /// Optional base filter directive for the OTLP tracing layer.
    ///
    /// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
    /// When unset, defaults to `"info"`.
    ///
    /// # Hardcoded transport exclusions
    ///
    /// The following exclusions are **always appended** after the user-supplied value, regardless
    /// of what is set here:
    ///
    /// ```text
    /// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
    /// ```
    ///
    /// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
    /// precedence over any conflicting directive in this field. For example, setting
    /// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
    /// the hardcoded exclusion appears later in the filter string. This is intentional:
    /// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
    /// capture its own network activity, creating a feedback loop.
    ///
    /// # Note on `sample_rate`
    ///
    /// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
    /// runs **after** span creation. Setting `sample_rate < 1.0` reduces collector storage but
    /// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
    /// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
    /// the feedback loop.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub otel_filter: Option<String>,
    /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
    #[serde(default = "default_system_metrics_interval_secs")]
    pub system_metrics_interval_secs: u64,
    /// User-defined key/value pairs attached as OpenTelemetry resource attributes.
    ///
    /// These appear on every span exported via OTLP and in Chrome JSON trace
    /// `resourceSpans[].resource.attributes`. Useful for tagging traces with deployment
    /// environment, team, git revision, etc.
    ///
    /// Keys follow the [OpenTelemetry attribute naming convention](https://opentelemetry.io/docs/specs/semconv/general/attribute-naming/)
    /// (dot-separated, lowercase). The reserved key `service.name` is silently ignored —
    /// the `service_name` config field takes precedence.
    ///
    /// Values appear in plaintext in exported traces. The `RedactingSpanProcessor` does
    /// **not** scrub resource attributes (they are set once at init, not per-span). Do not
    /// store secrets here.
    ///
    /// Default: empty. An empty map is skipped during serialization.
    ///
    /// # Example (TOML)
    ///
    /// ```toml
    /// [telemetry.trace_metadata]
    /// "deployment.environment" = "staging"
    /// "team.name" = "platform"
    /// "vcs.revision" = "abc1234"
    /// ```
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub trace_metadata: HashMap<String, String>,
}
176
177impl Default for TelemetryConfig {
178 fn default() -> Self {
179 Self {
180 enabled: false,
181 backend: TelemetryBackend::default(),
182 trace_dir: default_trace_dir(),
183 include_args: default_include_args(),
184 otlp_endpoint: None,
185 otlp_headers_vault_key: None,
186 pyroscope_endpoint: None,
187 service_name: default_service_name(),
188 sample_rate: default_sample_rate(),
189 otel_filter: None,
190 system_metrics_interval_secs: default_system_metrics_interval_secs(),
191 trace_metadata: HashMap::new(),
192 }
193 }
194}
195
// Unit tests for `TelemetryConfig` / `TelemetryBackend`: default values, TOML parsing,
// serialize/deserialize round-trips, and omission of empty optional data.
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` must yield a disabled, local-backend config with every optional field unset.
    #[test]
    fn telemetry_config_defaults() {
        let cfg = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Explicit values survive parsing and a serialize → deserialize round-trip.
    #[test]
    fn telemetry_config_serde_roundtrip() {
        let toml = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        // The input sets `sample_rate`, so it must be checked too.
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);

        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert!(cfg2.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, cfg.trace_dir);
        assert_eq!(cfg2.otlp_endpoint, cfg.otlp_endpoint);
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        let expected = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        // Fields with helper-based serde defaults must agree with `Default`.
        assert_eq!(cfg.trace_dir, expected.trace_dir);
        assert_eq!(cfg.include_args, expected.include_args);
        assert_eq!(cfg.service_name, expected.service_name);
        assert!((cfg.sample_rate - expected.sample_rate).abs() < f64::EPSILON);
        assert_eq!(
            cfg.system_metrics_interval_secs,
            expected.system_metrics_interval_secs
        );
        assert!(cfg.trace_metadata.is_empty());
    }

    /// Each backend is selected by its lowercase name.
    #[test]
    fn telemetry_backend_parses_lowercase_names() {
        let cases = [
            ("local", TelemetryBackend::Local),
            ("otlp", TelemetryBackend::Otlp),
            ("pyroscope", TelemetryBackend::Pyroscope),
        ];
        for (name, expected) in cases {
            let cfg: TelemetryConfig = toml::from_str(&format!("backend = \"{name}\"")).unwrap();
            assert_eq!(cfg.backend, expected);
        }
    }

    /// A backend name that matches no variant is a hard deserialization error.
    #[test]
    fn telemetry_backend_rejects_unknown_name() {
        assert!(toml::from_str::<TelemetryConfig>("backend = \"jaeger\"").is_err());
    }

    #[test]
    fn trace_metadata_parses_and_roundtrips() {
        let toml = r#"
            enabled = true
            backend = "otlp"
            service_name = "my-agent"

            [trace_metadata]
            "deployment.environment" = "staging"
            "team.name" = "platform"
        "#;
        let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
        assert_eq!(cfg.trace_metadata.len(), 2);
        assert_eq!(
            cfg.trace_metadata
                .get("deployment.environment")
                .map(String::as_str),
            Some("staging")
        );
        assert_eq!(
            cfg.trace_metadata.get("team.name").map(String::as_str),
            Some("platform")
        );

        // Roundtrip: serialize then deserialize preserves values.
        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.trace_metadata, cfg.trace_metadata);
    }

    #[test]
    fn trace_metadata_empty_by_default() {
        let cfg = TelemetryConfig::default();
        assert!(cfg.trace_metadata.is_empty());
    }

    #[test]
    fn trace_metadata_omitted_when_empty_on_serialize() {
        let cfg = TelemetryConfig::default();
        let serialized = toml::to_string(&cfg).unwrap();
        assert!(
            !serialized.contains("trace_metadata"),
            "empty trace_metadata must be omitted from serialized TOML"
        );
    }
}