zeph_config/telemetry.rs
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashMap;
5use std::path::PathBuf;
6
7use serde::{Deserialize, Serialize};
8
/// Fallback for `TelemetryConfig::trace_dir`: the relative path `.local/traces`.
fn default_trace_dir() -> PathBuf {
    ".local/traces".into()
}
12
/// Fallback for `TelemetryConfig::include_args`: argument capture stays off.
fn default_include_args() -> bool {
    const CAPTURE_ARGS_BY_DEFAULT: bool = false;
    CAPTURE_ARGS_BY_DEFAULT
}
16
/// Fallback for `TelemetryConfig::service_name`: the string `"zeph-agent"`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
20
/// Fallback for `TelemetryConfig::sample_rate`: `1.0`, i.e. every trace is recorded.
fn default_sample_rate() -> f64 {
    1.0_f64
}
24
/// Fallback for `TelemetryConfig::system_metrics_interval_secs`: five seconds.
fn default_system_metrics_interval_secs() -> u64 {
    5_u64
}
28
29/// Selects the tracing backend used when `[telemetry] enabled = true`.
30///
31/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
32/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
33/// feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
34/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
35/// feature).
36#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
37#[serde(rename_all = "lowercase")]
38pub enum TelemetryBackend {
39 /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
40 #[default]
41 Local,
42 /// Export spans via OTLP gRPC to an OpenTelemetry collector.
43 Otlp,
44 /// Push continuous CPU/memory profiles to a Pyroscope server.
45 Pyroscope,
46}
47
48/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
49///
50/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
51/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
52/// according to the selected [`TelemetryBackend`].
53///
54/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
55/// instrumentation points are compiled out via `cfg_attr`.
56///
57/// # Example (TOML)
58///
59/// ```toml
60/// [telemetry]
61/// enabled = true
62/// backend = "local"
63/// trace_dir = ".local/traces"
64/// include_args = false
65/// service_name = "my-zeph"
66/// sample_rate = 0.1
67/// ```
68#[derive(Debug, Clone, Deserialize, Serialize)]
69pub struct TelemetryConfig {
70 /// Enable tracing instrumentation. Default: `false`.
71 #[serde(default)]
72 pub enabled: bool,
73 /// Backend to use for trace export. Default: `local`.
74 #[serde(default)]
75 pub backend: TelemetryBackend,
76 /// Directory for Chrome JSON trace files (used when `backend = "local"`).
77 /// Default: `".local/traces"`.
78 #[serde(default = "default_trace_dir")]
79 pub trace_dir: PathBuf,
80 /// Include function arguments as span attributes in Chrome JSON traces.
81 ///
82 /// **Default: false.** Keep disabled in production — span field values are visible
83 /// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
84 /// messages may appear as span attributes if enabled.
85 ///
86 /// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
87 #[serde(default = "default_include_args")]
88 pub include_args: bool,
89 /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
90 /// Default: `"http://localhost:4317"` when unset.
91 #[serde(default, skip_serializing_if = "Option::is_none")]
92 pub otlp_endpoint: Option<String>,
93 /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
94 /// When set, the value is resolved from the age vault at startup and passed as
95 /// `Authorization` or custom headers to the collector.
96 #[serde(default, skip_serializing_if = "Option::is_none")]
97 pub otlp_headers_vault_key: Option<String>,
98 /// Pyroscope server URL (used when `backend = "pyroscope"`).
99 #[serde(default, skip_serializing_if = "Option::is_none")]
100 pub pyroscope_endpoint: Option<String>,
101 /// Service name reported in trace metadata. Default: `"zeph-agent"`.
102 #[serde(default = "default_service_name")]
103 pub service_name: String,
104 /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
105 /// Applies only to the `otlp` backend; the `local` backend always records all spans.
106 /// Default: `1.0`.
107 ///
108 /// # Warning
109 ///
110 /// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
111 /// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
112 /// storage but provides **no protection** against CPU or RAM spikes caused by high span
113 /// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
114 /// applied before spans are created) to prevent the OTLP feedback loop.
115 #[serde(default = "default_sample_rate")]
116 pub sample_rate: f64,
117 /// Optional base filter directive for the OTLP tracing layer.
118 ///
119 /// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
120 /// When unset, defaults to `"info"`.
121 ///
122 /// # Hardcoded transport exclusions
123 ///
124 /// The following exclusions are **always appended** after the user-supplied value, regardless
125 /// of what is set here:
126 ///
127 /// ```text
128 /// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
129 /// ```
130 ///
131 /// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
132 /// precedence over any conflicting directive in this field. For example, setting
133 /// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
134 /// the hardcoded exclusion appears later in the filter string. This is intentional:
135 /// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
136 /// capture its own network activity, creating a feedback loop.
137 ///
138 /// # Note on `sample_rate`
139 ///
140 /// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
141 /// runs **after** span creation. Setting `sample_rate < 1.0` reduces Jaeger storage but
142 /// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
143 /// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
144 /// the feedback loop.
145 #[serde(default, skip_serializing_if = "Option::is_none")]
146 pub otel_filter: Option<String>,
147 /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
148 #[serde(default = "default_system_metrics_interval_secs")]
149 pub system_metrics_interval_secs: u64,
150 /// User-defined key/value pairs attached as OpenTelemetry resource attributes.
151 ///
152 /// These appear on every span exported via OTLP and in Chrome JSON trace
153 /// `resourceSpans[].resource.attributes`. Useful for tagging traces with deployment
154 /// environment, team, git revision, etc.
155 ///
156 /// Keys follow the [OpenTelemetry attribute naming convention](https://opentelemetry.io/docs/specs/semconv/general/attribute-naming/)
157 /// (dot-separated, lowercase). The reserved key `service.name` is silently ignored —
158 /// the `service_name` config field takes precedence.
159 ///
160 /// Values appear in plaintext in exported traces. The `RedactingSpanProcessor` does
161 /// **not** scrub resource attributes (they are set once at init, not per-span). Do not
162 /// store secrets here.
163 ///
164 /// # Example (TOML)
165 ///
166 /// ```toml
167 /// [telemetry.trace_metadata]
168 /// "deployment.environment" = "staging"
169 /// "team.name" = "platform"
170 /// "vcs.revision" = "abc1234"
171 /// ```
172 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
173 pub trace_metadata: HashMap<String, String>,
174}
175
176impl Default for TelemetryConfig {
177 fn default() -> Self {
178 Self {
179 enabled: false,
180 backend: TelemetryBackend::default(),
181 trace_dir: default_trace_dir(),
182 include_args: default_include_args(),
183 otlp_endpoint: None,
184 otlp_headers_vault_key: None,
185 pyroscope_endpoint: None,
186 service_name: default_service_name(),
187 sample_rate: default_sample_rate(),
188 otel_filter: None,
189 system_metrics_interval_secs: default_system_metrics_interval_secs(),
190 trace_metadata: HashMap::new(),
191 }
192 }
193}
194
/// Unit tests for [`TelemetryConfig`] defaults and TOML (de)serialization.
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default::default()` must yield the documented default for every field.
    #[test]
    fn telemetry_config_defaults() {
        let cfg = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
        assert!(cfg.trace_metadata.is_empty());
    }

    /// Explicit values survive parsing and a serialize -> deserialize round trip.
    #[test]
    fn telemetry_config_serde_roundtrip() {
        let toml = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        // The input sets `sample_rate`, so make sure it is actually honoured.
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);

        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert!(cfg2.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, PathBuf::from("/tmp/traces"));
        assert_eq!(cfg2.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
    }

    #[test]
    fn trace_metadata_parses_and_roundtrips() {
        let toml = r#"
            enabled = true
            backend = "otlp"
            service_name = "my-agent"

            [trace_metadata]
            "deployment.environment" = "staging"
            "team.name" = "platform"
        "#;
        let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
        assert_eq!(
            cfg.trace_metadata
                .get("deployment.environment")
                .map(String::as_str),
            Some("staging")
        );
        assert_eq!(
            cfg.trace_metadata.get("team.name").map(String::as_str),
            Some("platform")
        );

        // Roundtrip: serialize then deserialize preserves values.
        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.trace_metadata, cfg.trace_metadata);
    }

    #[test]
    fn trace_metadata_empty_by_default() {
        let cfg = TelemetryConfig::default();
        assert!(cfg.trace_metadata.is_empty());
    }

    #[test]
    fn trace_metadata_omitted_when_empty_on_serialize() {
        let cfg = TelemetryConfig::default();
        let serialized = toml::to_string(&cfg).unwrap();
        assert!(
            !serialized.contains("trace_metadata"),
            "empty trace_metadata must be omitted from serialized TOML"
        );
    }
}