Skip to main content

zeph_config/
telemetry.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::path::PathBuf;
5
6use serde::{Deserialize, Serialize};
7
/// Serde default for [`TelemetryConfig::trace_dir`]: the relative directory `.local/traces`.
fn default_trace_dir() -> PathBuf {
    PathBuf::from(".local/traces")
}
11
/// Serde default for [`TelemetryConfig::include_args`]: span arguments are left out unless the
/// user opts in.
fn default_include_args() -> bool {
    false
}
15
/// Serde default for [`TelemetryConfig::service_name`]: `"zeph-agent"`.
fn default_service_name() -> String {
    String::from("zeph-agent")
}
19
/// Serde default for [`TelemetryConfig::sample_rate`]: `1.0`, i.e. record every trace.
fn default_sample_rate() -> f64 {
    1.0
}
23
/// Serde default for [`TelemetryConfig::system_metrics_interval_secs`]: `5` seconds between
/// system-metrics snapshots.
fn default_system_metrics_interval_secs() -> u64 {
    5
}
27
28/// Selects the tracing backend used when `[telemetry] enabled = true`.
29///
30/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
31/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
32///   feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
33/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
34///   feature).
35#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
36#[serde(rename_all = "lowercase")]
37pub enum TelemetryBackend {
38    /// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
39    #[default]
40    Local,
41    /// Export spans via OTLP gRPC to an OpenTelemetry collector.
42    Otlp,
43    /// Push continuous CPU/memory profiles to a Pyroscope server.
44    Pyroscope,
45}
46
47/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
48///
49/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
50/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
51/// according to the selected [`TelemetryBackend`].
52///
53/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
54/// instrumentation points are compiled out via `cfg_attr`.
55///
56/// # Example (TOML)
57///
58/// ```toml
59/// [telemetry]
60/// enabled = true
61/// backend = "local"
62/// trace_dir = ".local/traces"
63/// include_args = false
64/// service_name = "my-zeph"
65/// sample_rate = 0.1
66/// ```
67#[derive(Debug, Clone, Deserialize, Serialize)]
68pub struct TelemetryConfig {
69    /// Enable tracing instrumentation. Default: `false`.
70    #[serde(default)]
71    pub enabled: bool,
72    /// Backend to use for trace export. Default: `local`.
73    #[serde(default)]
74    pub backend: TelemetryBackend,
75    /// Directory for Chrome JSON trace files (used when `backend = "local"`).
76    /// Default: `".local/traces"`.
77    #[serde(default = "default_trace_dir")]
78    pub trace_dir: PathBuf,
79    /// Include function arguments in span attributes. Set to `true` for local debugging.
80    /// Keep `false` (the default) in production to avoid logging potentially sensitive data
81    /// such as user messages, LLM responses, or tool outputs with PII.
82    #[serde(default = "default_include_args")]
83    pub include_args: bool,
84    /// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
85    /// Default: `"http://localhost:4317"` when unset.
86    #[serde(default, skip_serializing_if = "Option::is_none")]
87    pub otlp_endpoint: Option<String>,
88    /// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
89    /// When set, the value is resolved from the age vault at startup and passed as
90    /// `Authorization` or custom headers to the collector.
91    #[serde(default, skip_serializing_if = "Option::is_none")]
92    pub otlp_headers_vault_key: Option<String>,
93    /// Pyroscope server URL (used when `backend = "pyroscope"`).
94    #[serde(default, skip_serializing_if = "Option::is_none")]
95    pub pyroscope_endpoint: Option<String>,
96    /// Service name reported in trace metadata. Default: `"zeph-agent"`.
97    #[serde(default = "default_service_name")]
98    pub service_name: String,
99    /// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
100    /// Applies only to the `otlp` backend; the `local` backend always records all spans.
101    /// Default: `1.0`.
102    #[serde(default = "default_sample_rate")]
103    pub sample_rate: f64,
104    /// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
105    #[serde(default = "default_system_metrics_interval_secs")]
106    pub system_metrics_interval_secs: u64,
107}
108
109impl Default for TelemetryConfig {
110    fn default() -> Self {
111        Self {
112            enabled: false,
113            backend: TelemetryBackend::default(),
114            trace_dir: default_trace_dir(),
115            include_args: default_include_args(),
116            otlp_endpoint: None,
117            otlp_headers_vault_key: None,
118            pyroscope_endpoint: None,
119            service_name: default_service_name(),
120            sample_rate: default_sample_rate(),
121            system_metrics_interval_secs: default_system_metrics_interval_secs(),
122        }
123    }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` must agree with the documented default of every field.
    #[test]
    fn telemetry_config_defaults() {
        let cfg = TelemetryConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }

    /// Parsing, serializing and re-parsing must preserve every explicitly set field.
    #[test]
    fn telemetry_config_serde_roundtrip() {
        // Bound as `input` so the variable is not confused with the `toml` crate path below.
        let input = r#"
            enabled = true
            backend = "otlp"
            trace_dir = "/tmp/traces"
            include_args = false
            otlp_endpoint = "http://otel:4317"
            service_name = "my-agent"
            sample_rate = 0.5
        "#;
        let cfg: TelemetryConfig = toml::from_str(input).unwrap();
        assert!(cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
        assert!(!cfg.include_args);
        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
        assert_eq!(cfg.service_name, "my-agent");
        assert!((cfg.sample_rate - 0.5).abs() < f64::EPSILON);
        // Fields absent from the input fall back to their defaults.
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.system_metrics_interval_secs, 5);

        let serialized = toml::to_string(&cfg).unwrap();
        let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
        assert_eq!(cfg2.enabled, cfg.enabled);
        assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
        assert_eq!(cfg2.trace_dir, cfg.trace_dir);
        assert_eq!(cfg2.include_args, cfg.include_args);
        assert_eq!(cfg2.otlp_endpoint, cfg.otlp_endpoint);
        assert_eq!(cfg2.otlp_headers_vault_key, cfg.otlp_headers_vault_key);
        assert_eq!(cfg2.pyroscope_endpoint, cfg.pyroscope_endpoint);
        assert_eq!(cfg2.service_name, "my-agent");
        assert!((cfg2.sample_rate - 0.5).abs() < f64::EPSILON);
        assert_eq!(
            cfg2.system_metrics_interval_secs,
            cfg.system_metrics_interval_secs
        );
    }

    /// Backend variants are addressed by their lowercase names in TOML.
    #[test]
    fn telemetry_backend_uses_lowercase_names() {
        let cfg: TelemetryConfig = toml::from_str(r#"backend = "pyroscope""#).unwrap();
        assert_eq!(cfg.backend, TelemetryBackend::Pyroscope);
        let cfg: TelemetryConfig = toml::from_str(r#"backend = "local""#).unwrap();
        assert_eq!(cfg.backend, TelemetryBackend::Local);
    }

    #[test]
    fn telemetry_config_old_toml_without_section_uses_defaults() {
        // Existing configs without [telemetry] must deserialize with defaults.
        let cfg: TelemetryConfig = toml::from_str("").unwrap();
        assert!(!cfg.enabled);
        assert_eq!(cfg.backend, TelemetryBackend::Local);
        assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
        assert!(!cfg.include_args);
        assert!(cfg.otlp_endpoint.is_none());
        assert!(cfg.otlp_headers_vault_key.is_none());
        assert!(cfg.pyroscope_endpoint.is_none());
        assert_eq!(cfg.service_name, "zeph-agent");
        assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
        assert_eq!(cfg.system_metrics_interval_secs, 5);
    }
}