1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
fn default_trace_dir() -> PathBuf {
PathBuf::from(".local/traces")
}
fn default_include_args() -> bool {
false
}
fn default_service_name() -> String {
"zeph-agent".into()
}
fn default_sample_rate() -> f64 {
1.0
}
fn default_system_metrics_interval_secs() -> u64 {
5
}
/// Selects the tracing backend used when `[telemetry] enabled = true`.
///
/// - `Local`: writes Chrome JSON traces to `trace_dir` on disk.
/// - `Otlp`: exports spans to an OpenTelemetry collector via OTLP gRPC (requires the `otel`
/// feature). Uses `otlp_endpoint` (default: `"http://localhost:4317"`) when set.
/// - `Pyroscope`: continuous profiling via Pyroscope (requires the `profiling-pyroscope`
/// feature).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum TelemetryBackend {
/// Write `{trace_dir}/{session_id}_{timestamp}.json` Chrome traces.
#[default]
Local,
/// Export spans via OTLP gRPC to an OpenTelemetry collector.
Otlp,
/// Push continuous CPU/memory profiles to a Pyroscope server.
Pyroscope,
}
/// Profiling and distributed tracing configuration, nested under `[telemetry]` in TOML.
///
/// When `enabled = true` and the binary is compiled with `--features profiling`, agent turn
/// phases and LLM provider calls are instrumented with [`tracing`] spans. Traces are exported
/// according to the selected [`TelemetryBackend`].
///
/// Enabling telemetry has zero overhead when the `profiling` feature is absent — all
/// instrumentation points are compiled out via `cfg_attr`.
///
/// # Example (TOML)
///
/// ```toml
/// [telemetry]
/// enabled = true
/// backend = "local"
/// trace_dir = ".local/traces"
/// include_args = false
/// service_name = "my-zeph"
/// sample_rate = 0.1
/// ```
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TelemetryConfig {
/// Enable tracing instrumentation. Default: `false`.
#[serde(default)]
pub enabled: bool,
/// Backend to use for trace export. Default: `local`.
#[serde(default)]
pub backend: TelemetryBackend,
/// Directory for Chrome JSON trace files (used when `backend = "local"`).
/// Default: `".local/traces"`.
#[serde(default = "default_trace_dir")]
pub trace_dir: PathBuf,
/// Include function arguments as span attributes in Chrome JSON traces.
///
/// **Default: false.** Keep disabled in production — span field values are visible
/// to all subscriber layers including OTLP. LLM prompts, tool outputs, and user
/// messages may appear as span attributes if enabled.
///
/// Note: this flag controls the Chrome JSON trace layer only, not OTLP span attributes.
#[serde(default = "default_include_args")]
pub include_args: bool,
/// OTLP gRPC endpoint URL (used when `backend = "otlp"`).
/// Default: `"http://localhost:4317"` when unset.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub otlp_endpoint: Option<String>,
/// Vault key for OTLP authentication headers (e.g. `ZEPH_OTLP_HEADERS`).
/// When set, the value is resolved from the age vault at startup and passed as
/// `Authorization` or custom headers to the collector.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub otlp_headers_vault_key: Option<String>,
/// Pyroscope server URL (used when `backend = "pyroscope"`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pyroscope_endpoint: Option<String>,
/// Service name reported in trace metadata. Default: `"zeph-agent"`.
#[serde(default = "default_service_name")]
pub service_name: String,
/// Fraction of traces to sample. `1.0` = record all, `0.1` = record 10%.
/// Applies only to the `otlp` backend; the `local` backend always records all spans.
/// Default: `1.0`.
///
/// # Warning
///
/// `sample_rate` controls the fraction of completed traces sent to the OTLP collector,
/// but the sampler runs **after** span creation. A low `sample_rate` reduces collector
/// storage but provides **no protection** against CPU or RAM spikes caused by high span
/// creation rates. Use [`otel_filter`][TelemetryConfig::otel_filter] (an `EnvFilter`
/// applied before spans are created) to prevent the OTLP feedback loop.
#[serde(default = "default_sample_rate")]
pub sample_rate: f64,
/// Optional base filter directive for the OTLP tracing layer.
///
/// Accepts the same syntax as `RUST_LOG` / `EnvFilter` (e.g. `"info"`, `"debug,myapp=trace"`).
/// When unset, defaults to `"info"`.
///
/// # Hardcoded transport exclusions
///
/// The following exclusions are **always appended** after the user-supplied value, regardless
/// of what is set here:
///
/// ```text
/// tonic=warn,tower=warn,hyper=warn,h2=warn,opentelemetry=warn,rmcp=warn,sqlx=warn,want=warn
/// ```
///
/// `EnvFilter` uses last-directive-wins semantics, so these appended directives take
/// precedence over any conflicting directive in this field. For example, setting
/// `otel_filter = "tonic=debug"` will be silently overridden to `tonic=warn` because
/// the hardcoded exclusion appears later in the filter string. This is intentional:
/// allowing transport crates to emit `debug` spans would cause the OTLP exporter to
/// capture its own network activity, creating a feedback loop.
///
/// # Note on `sample_rate`
///
/// `sample_rate` controls the fraction of traces sent to the OTLP collector, but the sampler
/// runs **after** span creation. Setting `sample_rate < 1.0` reduces Jaeger storage but
/// provides **no protection** against CPU or RAM spikes caused by high span creation rate.
/// Only this `otel_filter` (an `EnvFilter` applied upstream of span creation) prevents
/// the feedback loop.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub otel_filter: Option<String>,
/// Interval in seconds between system-metrics snapshots (Phase 3). Default: `5`.
#[serde(default = "default_system_metrics_interval_secs")]
pub system_metrics_interval_secs: u64,
}
impl Default for TelemetryConfig {
fn default() -> Self {
Self {
enabled: false,
backend: TelemetryBackend::default(),
trace_dir: default_trace_dir(),
include_args: default_include_args(),
otlp_endpoint: None,
otlp_headers_vault_key: None,
pyroscope_endpoint: None,
service_name: default_service_name(),
sample_rate: default_sample_rate(),
otel_filter: None,
system_metrics_interval_secs: default_system_metrics_interval_secs(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn telemetry_config_defaults() {
let cfg = TelemetryConfig::default();
assert!(!cfg.enabled);
assert_eq!(cfg.backend, TelemetryBackend::Local);
assert_eq!(cfg.trace_dir, PathBuf::from(".local/traces"));
assert!(!cfg.include_args);
assert!(cfg.otlp_endpoint.is_none());
assert_eq!(cfg.service_name, "zeph-agent");
assert!((cfg.sample_rate - 1.0).abs() < f64::EPSILON);
}
#[test]
fn telemetry_config_serde_roundtrip() {
let toml = r#"
enabled = true
backend = "otlp"
trace_dir = "/tmp/traces"
include_args = false
otlp_endpoint = "http://otel:4317"
service_name = "my-agent"
sample_rate = 0.5
"#;
let cfg: TelemetryConfig = toml::from_str(toml).unwrap();
assert!(cfg.enabled);
assert_eq!(cfg.backend, TelemetryBackend::Otlp);
assert_eq!(cfg.trace_dir, PathBuf::from("/tmp/traces"));
assert!(!cfg.include_args);
assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://otel:4317"));
assert_eq!(cfg.service_name, "my-agent");
let serialized = toml::to_string(&cfg).unwrap();
let cfg2: TelemetryConfig = toml::from_str(&serialized).unwrap();
assert_eq!(cfg2.backend, TelemetryBackend::Otlp);
assert_eq!(cfg2.service_name, "my-agent");
}
#[test]
fn telemetry_config_old_toml_without_section_uses_defaults() {
// Existing configs without [telemetry] must deserialize with defaults.
let cfg: TelemetryConfig = toml::from_str("").unwrap();
assert!(!cfg.enabled);
assert_eq!(cfg.backend, TelemetryBackend::Local);
}
}