zeph_config/telemetry.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::path::PathBuf;
5
6use serde::{Deserialize, Serialize};
7
/// Serde default for [`TelemetryConfig::trace_dir`]: the relative path `.local/traces`.
fn default_trace_dir() -> PathBuf {
    ".local/traces".into()
}
11
/// Serde default for [`TelemetryConfig::include_args`].
///
/// Argument capture is opt-in, so a missing key deserializes to `false`.
fn default_include_args() -> bool {
    false
}
15
/// Serde default for [`TelemetryConfig::service_name`]: `"zeph-agent"`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
19
/// Serde default for [`TelemetryConfig::sample_rate`]: `1.0`, i.e. record every trace.
fn default_sample_rate() -> f64 {
    1.0_f64
}
23
/// Serde default for [`TelemetryConfig::system_metrics_interval_secs`]: `5` seconds.
fn default_system_metrics_interval_secs() -> u64 {
    5_u64
}
27
28/// Selects the tracing backend used when `[telemetry] enabled = true`.
29///
30/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
31/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
32/// feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
33/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
34/// feature).
35#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
36#[serde(rename_all = "lowercase")]
37pub enum TelemetryBackend {
38 /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
39 #[default]
40 Local,
41 /// Export spans via OTLP gRPC to an OpenTelemetry collector.
42 Otlp,
43 /// Push continuous CPU/memory profiles to a Pyroscope server.
44 Pyroscope,
45}
46
47/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
48///
49/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
50/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
51/// according to the selected [`TelemetryBackend`].
52///
53/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
54/// instrumentation points are compiled out via `cfg_attr`.
55///
56/// # Example (TOML)
57///
58/// ```toml
59/// [telemetry]
60/// enabled = true
61/// backend = "local"
62/// trace_dir = ".local/traces"
63/// include_args = false
64/// service_name = "my-zeph"
65/// sample_rate = 0.1
66/// ```
67#[derive(Debug, Clone, Deserialize, Serialize)]
68pub struct TelemetryConfig {
69 /// Enable tracing instrumentation. Default: `false`.
70 #[serde(default)]
71 pub enabled: bool,
72 /// Backend to use for trace export. Default: `local`.
73 #[serde(default)]
74 pub backend: TelemetryBackend,
75 /// Directory for Chrome JSON trace files (used when `backend = "local"`).
76 /// Default: `".local/traces"`.
77 #[serde(default = "default_trace_dir")]
78 pub trace_dir: PathBuf,
79 /// Include function arguments as span attributes in Chrome JSON traces.
80 ///
81 /// **Default: false.** Keep disabled in production — span field values are visible
82 /// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
83 /// messages may appear as span attributes if enabled.
84 ///
85 /// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
86 #[serde(default = "default_include_args")]
87 pub include_args: bool,
88 /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
89 /// Default: `"http://localhost:4317"` when unset.
90 #[serde(default, skip_serializing_if = "Option::is_none")]
91 pub otlp_endpoint: Option<String>,
92 /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
93 /// When set, the value is resolved from the age vault at startup and passed as
94 /// `Authorization` or custom headers to the collector.
95 #[serde(default, skip_serializing_if = "Option::is_none")]
96 pub otlp_headers_vault_key: Option<String>,
97 /// Pyroscope server URL (used when `backend = "pyroscope"`).
98 #[serde(default, skip_serializing_if = "Option::is_none")]
99 pub pyroscope_endpoint: Option<String>,
100 /// Service name reported in trace metadata. Default: `"zeph-agent"`.
101 #[serde(default = "default_service_name")]
102 pub service_name: String,
103 /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
104 /// Applies only to the `otlp` backend; the `local` backend always records all spans.
105 /// Default: `1.0`.
106 ///
107 /// # Warning
108 ///
109 /// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
110 /// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
111 /// storage but provides **no protection** against CPU or RAM spikes caused by high span
112 /// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
113 /// applied before spans are created) to prevent the OTLP feedback loop.
114 #[serde(default = "default_sample_rate")]
115 pub sample_rate: f64,
116 /// Optional base filter directive for the OTLP tracing layer.
117 ///
118 /// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
119 /// When unset, defaults to `"info"`.
120 ///
121 /// # Hardcoded transport exclusions
122 ///
123 /// The following exclusions are **always appended** after the user-supplied value, regardless
124 /// of what is set here:
125 ///
126 /// ```text
127 /// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
128 /// ```
129 ///
130 /// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
131 /// precedence over any conflicting directive in this field. For example, setting
132 /// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
133 /// the hardcoded exclusion appears later in the filter string. This is intentional:
134 /// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
135 /// capture its own network activity, creating a feedback loop.
136 ///
137 /// # Note on `sample_rate`
138 ///
139 /// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
140 /// runs **after** span creation. Setting `sample_rate < 1.0` reduces Jaeger storage but
141 /// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
142 /// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
143 /// the feedback loop.
144 #[serde(default, skip_serializing_if = "Option::is_none")]
145 pub otel_filter: Option<String>,
146 /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
147 #[serde(default = "default_system_metrics_interval_secs")]
148 pub system_metrics_interval_secs: u64,
149}
150
151impl Default for TelemetryConfig {
152 fn default() -> Self {
153 Self {
154 enabled: false,
155 backend: TelemetryBackend::default(),
156 trace_dir: default_trace_dir(),
157 include_args: default_include_args(),
158 otlp_endpoint: None,
159 otlp_headers_vault_key: None,
160 pyroscope_endpoint: None,
161 service_name: default_service_name(),
162 sample_rate: default_sample_rate(),
163 otel_filter: None,
164 system_metrics_interval_secs: default_system_metrics_interval_secs(),
165 }
166 }
167}
168
#[cfg(test)]
mod tests {
    use super::*;

    /// `TelemetryConfig::default()` must match the documented default of every field.
    #[test]
    fn telemetry_config_defaults() {
        let cfg = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Explicit values must survive parse -> serialize -> parse unchanged.
    #[test]
    fn telemetry_config_serde_roundtrip() {
        let input = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(input).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        // The fixture sets a non-default rate; make sure it is actually honoured.
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);
        // Keys absent from the input fall back to their defaults.
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);

        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert!(cfg2.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg2.include_args);
        assert_eq!(cfg2.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - 0.5).abs() < f64::EPSILON);
        assert!(cfg2.otel_filter.is_none());
        assert_eq!(cfg2.system_metrics_interval_secs, 5);
    }

    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Backends are spelled in lowercase in TOML; unknown names are rejected.
    #[test]
    fn telemetry_backend_uses_lowercase_names() {
        let cases = [
            ("local", TelemetryBackend::Local),
            ("otlp", TelemetryBackend::Otlp),
            ("pyroscope", TelemetryBackend::Pyroscope),
        ];
        for (name, expected) in cases {
            let cfg: TelemetryConfig = toml::from_str(&format!("backend = \"{name}\"")).unwrap();
            assert_eq!(cfg.backend, expected);
        }
        assert!(toml::from_str::<TelemetryConfig>("backend = \"jaeger\"").is_err());
    }

    /// Unset optional fields must not be written out when serializing.
    #[test]
    fn telemetry_config_omits_unset_optional_fields() {
        let serialized = toml::to_string(&TelemetryConfig::default()).unwrap();
        assert!(!serialized.contains("otlp_endpoint"));
        assert!(!serialized.contains("otlp_headers_vault_key"));
        assert!(!serialized.contains("pyroscope_endpoint"));
        assert!(!serialized.contains("otel_filter"));
    }
}