Skip to main content

zeph_config/
telemetry.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::path::PathBuf;
5
6use serde::{Deserialize, Serialize};
7
/// Serde fallback for [`TelemetryConfig::trace_dir`]: the relative path `.local/traces`.
fn default_trace_dir() -> PathBuf {
    ".local/traces".into()
}
11
/// Serde fallback for [`TelemetryConfig::include_args`]: argument capture stays off unless
/// the config explicitly turns it on.
fn default_include_args() -> bool {
    false
}
15
/// Serde fallback for [`TelemetryConfig::service_name`]: the literal `zeph-agent`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
19
/// Serde fallback for [`TelemetryConfig::sample_rate`]: `1.0`, i.e. every trace is recorded.
fn default_sample_rate() -> f64 {
    1.0_f64
}
23
/// Serde fallback for [`TelemetryConfig::system_metrics_interval_secs`]: five seconds.
fn default_system_metrics_interval_secs() -> u64 {
    5_u64
}
27
28/// Selects the tracing backend used when `[telemetry] enabled = true`.
29///
30/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
31/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
32///   feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
33/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
34///   feature).
35#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
36#[serde(rename_all = "lowercase")]
37pub enum TelemetryBackend {
38    /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
39    #[default]
40    Local,
41    /// Export spans via OTLP gRPC to an OpenTelemetry collector.
42    Otlp,
43    /// Push continuous CPU/memory profiles to a Pyroscope server.
44    Pyroscope,
45}
46
47/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
48///
49/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
50/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
51/// according to the selected [`TelemetryBackend`].
52///
53/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
54/// instrumentation points are compiled out via `cfg_attr`.
55///
56/// # Example (TOML)
57///
58/// ```toml
59/// [telemetry]
60/// enabled = true
61/// backend = "local"
62/// trace_dir = ".local/traces"
63/// include_args = false
64/// service_name = "my-zeph"
65/// sample_rate = 0.1
66/// ```
67#[derive(Debug, Clone, Deserialize, Serialize)]
68pub struct TelemetryConfig {
69    /// Enable tracing instrumentation. Default: `false`.
70    #[serde(default)]
71    pub enabled: bool,
72    /// Backend to use for trace export. Default: `local`.
73    #[serde(default)]
74    pub backend: TelemetryBackend,
75    /// Directory for Chrome JSON trace files (used when `backend = "local"`).
76    /// Default: `".local/traces"`.
77    #[serde(default = "default_trace_dir")]
78    pub trace_dir: PathBuf,
79    /// Include function arguments as span attributes in Chrome JSON traces.
80    ///
81    /// **Default: false.** Keep disabled in production — span field values are visible
82    /// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
83    /// messages may appear as span attributes if enabled.
84    ///
85    /// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
86    #[serde(default = "default_include_args")]
87    pub include_args: bool,
88    /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
89    /// Default: `"http://localhost:4317"` when unset.
90    #[serde(default, skip_serializing_if = "Option::is_none")]
91    pub otlp_endpoint: Option<String>,
92    /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
93    /// When set, the value is resolved from the age vault at startup and passed as
94    /// `Authorization` or custom headers to the collector.
95    #[serde(default, skip_serializing_if = "Option::is_none")]
96    pub otlp_headers_vault_key: Option<String>,
97    /// Pyroscope server URL (used when `backend = "pyroscope"`).
98    #[serde(default, skip_serializing_if = "Option::is_none")]
99    pub pyroscope_endpoint: Option<String>,
100    /// Service name reported in trace metadata. Default: `"zeph-agent"`.
101    #[serde(default = "default_service_name")]
102    pub service_name: String,
103    /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
104    /// Applies only to the `otlp` backend; the `local` backend always records all spans.
105    /// Default: `1.0`.
106    ///
107    /// # Warning
108    ///
109    /// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
110    /// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
111    /// storage but provides **no protection** against CPU or RAM spikes caused by high span
112    /// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
113    /// applied before spans are created) to prevent the OTLP feedback loop.
114    #[serde(default = "default_sample_rate")]
115    pub sample_rate: f64,
116    /// Optional base filter directive for the OTLP tracing layer.
117    ///
118    /// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
119    /// When unset, defaults to `"info"`.
120    ///
121    /// # Hardcoded transport exclusions
122    ///
123    /// The following exclusions are **always appended** after the user-supplied value, regardless
124    /// of what is set here:
125    ///
126    /// ```text
127    /// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
128    /// ```
129    ///
130    /// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
131    /// precedence over any conflicting directive in this field. For example, setting
132    /// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
133    /// the hardcoded exclusion appears later in the filter string. This is intentional:
134    /// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
135    /// capture its own network activity, creating a feedback loop.
136    ///
137    /// # Note on `sample_rate`
138    ///
139    /// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
140    /// runs **after** span creation. Setting `sample_rate < 1.0` reduces Jaeger storage but
141    /// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
142    /// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
143    /// the feedback loop.
144    #[serde(default, skip_serializing_if = "Option::is_none")]
145    pub otel_filter: Option<String>,
146    /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
147    #[serde(default = "default_system_metrics_interval_secs")]
148    pub system_metrics_interval_secs: u64,
149}
150
151impl Default for TelemetryConfig {
152    fn default() -> Self {
153        Self {
154            enabled: false,
155            backend: TelemetryBackend::default(),
156            trace_dir: default_trace_dir(),
157            include_args: default_include_args(),
158            otlp_endpoint: None,
159            otlp_headers_vault_key: None,
160            pyroscope_endpoint: None,
161            service_name: default_service_name(),
162            sample_rate: default_sample_rate(),
163            otel_filter: None,
164            system_metrics_interval_secs: default_system_metrics_interval_secs(),
165        }
166    }
167}
168
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `cfg` carries the documented default for every field.
    fn assert_all_defaults(cfg: &TelemetryConfig) {
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    #[test]
    fn telemetry_config_defaults() {
        assert_all_defaults(&TelemetryConfig::default());
    }

    #[test]
    fn telemetry_config_serde_roundtrip() {
        // Named `input` rather than `toml` so the local does not shadow the crate name.
        let input = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(input).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);
        // Keys absent from the input fall back to their defaults.
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert!(cfg.otel_filter.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);

        // Serializing and re-parsing must preserve every field, not just a sample of them.
        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.enabled, cfg.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, cfg.trace_dir);
        assert_eq!(cfg2.include_args, cfg.include_args);
        assert_eq!(cfg2.otlp_endpoint, cfg.otlp_endpoint);
        assert_eq!(cfg2.otlp_headers_vault_key, cfg.otlp_headers_vault_key);
        assert_eq!(cfg2.pyroscope_endpoint, cfg.pyroscope_endpoint);
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - cfg.sample_rate).abs() < f64::EPSILON);
        assert_eq!(cfg2.otel_filter, cfg.otel_filter);
        assert_eq!(
            cfg2.system_metrics_interval_secs,
            cfg.system_metrics_interval_secs
        );
    }

    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        assert_all_defaults(&cfg);
    }

    #[test]
    fn telemetry_backend_uses_lowercase_names() {
        let cases = [
            ("local", TelemetryBackend::Local),
            ("otlp", TelemetryBackend::Otlp),
            ("pyroscope", TelemetryBackend::Pyroscope),
        ];
        for (name, expected) in cases {
            let cfg: TelemetryConfig = toml::from_str(&format!("backend = \"{name}\"")).unwrap();
            assert_eq!(cfg.backend, expected);

            // The serialized form must parse back to the same variant.
            let serialized = toml::to_string(&cfg).unwrap();
            let reparsed: TelemetryConfig = toml::from_str(&serialized).unwrap();
            assert_eq!(reparsed.backend, expected);
        }
    }

    #[test]
    fn telemetry_backend_rejects_unknown_name() {
        assert!(toml::from_str::<TelemetryConfig>("backend = \"jaeger\"").is_err());
    }

    #[test]
    fn telemetry_config_omits_unset_optional_fields_when_serialized() {
        let serialized = toml::to_string(&TelemetryConfig::default()).unwrap();
        for key in [
            "otlp_endpoint",
            "otlp_headers_vault_key",
            "pyroscope_endpoint",
            "otel_filter",
        ] {
            assert!(
                !serialized.contains(key),
                "unset `{key}` must be skipped, got:\n{serialized}"
            );
        }
    }
}