Skip to main content

swink_agent_eval/
telemetry.rs

1//! OpenTelemetry integration for eval runs (spec 043 US7, FR-035).
2//!
3//! When the `telemetry` feature is enabled, [`EvalsTelemetry`] can be wired
4//! into [`EvalRunner`](crate::EvalRunner) to emit a three-level span tree
5//! during `run_set`:
6//!
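//! * Root: `swink.eval.run_set` — attributes:
//!   `swink.eval.set_id`, `swink.eval.set_name`, `swink.eval.case_count`,
//!   plus `swink.eval.passed` and `swink.eval.failed` once the run ends.
//! * Per-case child: `swink.eval.case` — attributes:
//!   `swink.eval.set_id`, `swink.eval.case_id`, `swink.eval.case_name`,
//!   `swink.eval.verdict`, `swink.eval.duration_ms`.
//! * Per-evaluator grandchild: `swink.eval.evaluator` — attributes:
//!   `swink.eval.set_id`, `swink.eval.case_id`, `swink.eval.evaluator_name`,
//!   `swink.eval.verdict`, `swink.eval.score`, `swink.eval.score_threshold`,
//!   and `swink.eval.details` when the metric carries details. Evaluators
//!   that do not apply to a case get verdict `inapplicable` and no score.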
15//!
16//! Failed cases (overall verdict `Fail`) record OTel `Status::error` and an
17//! `exception` event summarising the failure cause (research §R-005, §FR-035).
18//!
19//! If the caller has an active OTel context — e.g. an outer `agent.run` span
20//! — it is inherited as the parent of `swink.eval.run_set`, enabling cross-
21//! service trace correlation. Callers without an active context get a fresh
22//! root trace.
23//!
24//! ## Example
25//!
26//! ```rust,ignore
27//! use std::sync::Arc;
28//! use opentelemetry::global;
29//! use swink_agent_eval::{EvalRunner, EvaluatorRegistry};
30//! use swink_agent_eval::telemetry::EvalsTelemetry;
31//!
32//! let telemetry = EvalsTelemetry::builder()
33//!     .with_tracer(global::tracer("swink.eval"))
34//!     .build();
35//! let runner = EvalRunner::with_defaults()
36//!     .with_telemetry(Arc::new(telemetry));
37//! ```
38
39use std::borrow::Cow;
40use std::time::Duration;
41
42use opentelemetry::global::BoxedTracer;
43// `Span` is used for its `end()` / `set_attribute` / etc. trait methods on
44// `BoxedSpan`; clippy can't see the indirect usage.
45#[allow(unused_imports)]
46use opentelemetry::trace::Span;
47use opentelemetry::trace::{
48    SpanBuilder, SpanKind, Status, TraceContextExt, Tracer, TracerProvider,
49};
50use opentelemetry::{Context, KeyValue, global};
51
52use crate::score::Verdict;
53use crate::types::{EvalCase, EvalCaseResult, EvalMetricResult, EvalSet};
54
55// ─── Attribute keys ─────────────────────────────────────────────────────────
56
/// Root span name for an entire `run_set` invocation.
pub const SPAN_RUN_SET: &str = "swink.eval.run_set";
/// Per-case child span name.
pub const SPAN_CASE: &str = "swink.eval.case";
/// Per-evaluator grandchild span name.
pub const SPAN_EVALUATOR: &str = "swink.eval.evaluator";

/// Eval set identifier. Present on every span in the tree.
pub const ATTR_SET_ID: &str = "swink.eval.set_id";
/// Human-readable eval set name (root span only).
pub const ATTR_SET_NAME: &str = "swink.eval.set_name";
/// Number of cases in the eval set (root span only).
pub const ATTR_CASE_COUNT: &str = "swink.eval.case_count";
/// Eval case identifier. Present on case and evaluator spans.
pub const ATTR_CASE_ID: &str = "swink.eval.case_id";
/// Human-readable case name.
pub const ATTR_CASE_NAME: &str = "swink.eval.case_name";
/// Evaluator name (e.g. `trajectory`, `response`, `budget`).
pub const ATTR_EVALUATOR_NAME: &str = "swink.eval.evaluator_name";
/// Verdict — `pass` or `fail`. Evaluator spans closed without a metric
/// carry `inapplicable` instead.
pub const ATTR_VERDICT: &str = "swink.eval.verdict";
/// Raw numeric score on the evaluator span.
pub const ATTR_SCORE: &str = "swink.eval.score";
/// Pass/fail threshold used to derive the verdict.
pub const ATTR_SCORE_THRESHOLD: &str = "swink.eval.score_threshold";
/// Wall-clock case duration in milliseconds.
pub const ATTR_DURATION_MS: &str = "swink.eval.duration_ms";
/// Number of passed cases, recorded on the root span when the run ends.
pub const ATTR_PASSED: &str = "swink.eval.passed";
/// Number of failed cases, recorded on the root span when the run ends.
pub const ATTR_FAILED: &str = "swink.eval.failed";
88
89// ─── EvalsTelemetry ─────────────────────────────────────────────────────────
90
91/// Emits OTel spans for an entire `run_set` invocation.
92///
93/// Holds a [`BoxedTracer`] obtained either from a caller-supplied
94/// [`TracerProvider`] or from the global provider. Cloning is cheap — the
95/// tracer itself is reference-counted by the underlying SDK.
96///
97/// Construct via [`EvalsTelemetry::builder`].
98pub struct EvalsTelemetry {
99    tracer: BoxedTracer,
100}
101
102impl std::fmt::Debug for EvalsTelemetry {
103    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
104        f.debug_struct("EvalsTelemetry").finish_non_exhaustive()
105    }
106}
107
108impl EvalsTelemetry {
109    /// Start a new builder.
110    #[must_use]
111    pub fn builder() -> EvalsTelemetryBuilder {
112        EvalsTelemetryBuilder::default()
113    }
114
115    /// Borrow the underlying tracer. Exposed so downstream crates can mint
116    /// auxiliary spans under the same instrumentation scope.
117    #[must_use]
118    pub fn tracer(&self) -> &BoxedTracer {
119        &self.tracer
120    }
121
122    /// Start the root `swink.eval.run_set` span.
123    ///
124    /// The parent is the active OTel [`Context`] (`Context::current()`), so a
125    /// caller-owned span — e.g. an outer `agent.run` or a scheduler tick — is
126    /// inherited automatically. When no context is active the span becomes a
127    /// new root trace.
128    pub(crate) fn start_run_set_span(&self, eval_set: &EvalSet) -> RunSetSpan {
129        let parent = Context::current();
130        let builder = SpanBuilder::from_name(Cow::Borrowed(SPAN_RUN_SET))
131            .with_kind(SpanKind::Internal)
132            .with_attributes(vec![
133                KeyValue::new(ATTR_SET_ID, eval_set.id.clone()),
134                KeyValue::new(ATTR_SET_NAME, eval_set.name.clone()),
135                KeyValue::new(
136                    ATTR_CASE_COUNT,
137                    i64::try_from(eval_set.cases.len()).unwrap_or(i64::MAX),
138                ),
139            ]);
140        let span = self.tracer.build_with_context(builder, &parent);
141        let cx = parent.with_span(span);
142        RunSetSpan {
143            context: cx,
144            set_id: eval_set.id.clone(),
145        }
146    }
147
148    /// Start a per-case span as a child of the supplied run-set context.
149    ///
150    /// Accepts a cloneable [`RunSetSpanRef`] so per-case futures in
151    /// `join_all` can each carry their own copy of the parent context
152    /// without borrowing across await points.
153    pub(crate) fn start_case_span_raw(&self, parent: &RunSetSpanRef, case: &EvalCase) -> CaseSpan {
154        let builder = SpanBuilder::from_name(Cow::Borrowed(SPAN_CASE))
155            .with_kind(SpanKind::Internal)
156            .with_attributes(vec![
157                KeyValue::new(ATTR_SET_ID, parent.set_id.clone()),
158                KeyValue::new(ATTR_CASE_ID, case.id.clone()),
159                KeyValue::new(ATTR_CASE_NAME, case.name.clone()),
160            ]);
161        let span = self.tracer.build_with_context(builder, &parent.context);
162        let cx = parent.context.with_span(span);
163        CaseSpan {
164            context: cx,
165            set_id: parent.set_id.clone(),
166            case_id: case.id.clone(),
167        }
168    }
169
170    /// Start a per-evaluator span as a child of the supplied case context.
171    pub(crate) fn start_evaluator_span(
172        &self,
173        parent: &CaseSpan,
174        evaluator_name: &str,
175    ) -> EvaluatorSpan {
176        let builder = SpanBuilder::from_name(Cow::Borrowed(SPAN_EVALUATOR))
177            .with_kind(SpanKind::Internal)
178            .with_attributes(vec![
179                KeyValue::new(ATTR_SET_ID, parent.set_id.clone()),
180                KeyValue::new(ATTR_CASE_ID, parent.case_id.clone()),
181                KeyValue::new(ATTR_EVALUATOR_NAME, evaluator_name.to_string()),
182            ]);
183        let span = self.tracer.build_with_context(builder, &parent.context);
184        let cx = parent.context.with_span(span);
185        EvaluatorSpan { context: cx }
186    }
187}
188
189// ─── Span handles ───────────────────────────────────────────────────────────
190
191/// RAII-style handle for the root `swink.eval.run_set` span.
192pub(crate) struct RunSetSpan {
193    context: Context,
194    #[allow(dead_code)] // stashed for future per-span correlation logging
195    set_id: String,
196}
197
198/// Cloneable reference to the run-set context used by per-case futures.
199///
200/// The owned [`RunSetSpan`] lives in the outer `run_set` frame; each future
201/// in `join_all` gets its own copy of this ref so it can mint child spans
202/// without borrowing across await points.
203#[derive(Clone)]
204pub(crate) struct RunSetSpanRef {
205    pub(crate) context: Context,
206    pub(crate) set_id: String,
207}
208
209impl RunSetSpan {
210    pub(crate) fn context(&self) -> &Context {
211        &self.context
212    }
213
214    /// Record aggregate counters and end the span.
215    pub(crate) fn end(self, passed: usize, failed: usize) {
216        let span = self.context.span();
217        span.set_attribute(KeyValue::new(
218            ATTR_PASSED,
219            i64::try_from(passed).unwrap_or(i64::MAX),
220        ));
221        span.set_attribute(KeyValue::new(
222            ATTR_FAILED,
223            i64::try_from(failed).unwrap_or(i64::MAX),
224        ));
225        if failed > 0 {
226            span.set_status(Status::error(format!("{failed} case(s) failed")));
227        } else {
228            span.set_status(Status::Ok);
229        }
230        span.end();
231    }
232}
233
234/// RAII-style handle for a `swink.eval.case` span.
235pub(crate) struct CaseSpan {
236    context: Context,
237    set_id: String,
238    case_id: String,
239}
240
241impl CaseSpan {
242    /// Borrow the underlying OTel [`Context`]. Exposed so the runner can
243    /// parent evaluator spans off the case span.
244    #[allow(dead_code)]
245    pub(crate) fn context(&self) -> &Context {
246        &self.context
247    }
248
249    /// Record the final verdict + duration and end the span. On failure the
250    /// span receives `Status::error` plus an `exception` event whose message
251    /// summarises every failing metric (FR-035).
252    pub(crate) fn end(self, result: &EvalCaseResult, duration: Duration) {
253        let span = self.context.span();
254        span.set_attribute(KeyValue::new(ATTR_VERDICT, verdict_str(result.verdict)));
255        #[allow(clippy::cast_possible_truncation)]
256        span.set_attribute(KeyValue::new(
257            ATTR_DURATION_MS,
258            duration.as_millis().min(i64::MAX as u128) as i64,
259        ));
260
261        if result.verdict.is_pass() {
262            span.set_status(Status::Ok);
263        } else {
264            let failing: Vec<String> = result
265                .metric_results
266                .iter()
267                .filter(|m| !m.score.verdict().is_pass())
268                .map(|m| {
269                    let detail = m.details.clone().unwrap_or_default();
270                    if detail.is_empty() {
271                        m.evaluator_name.clone()
272                    } else {
273                        format!("{}: {}", m.evaluator_name, detail)
274                    }
275                })
276                .collect();
277            let message = if failing.is_empty() {
278                format!("case `{}` failed", result.case_id)
279            } else {
280                format!("case `{}` failed: {}", result.case_id, failing.join(" | "))
281            };
282            span.add_event(
283                Cow::Borrowed("exception"),
284                vec![
285                    KeyValue::new("exception.type", "EvalCaseFailure"),
286                    KeyValue::new("exception.message", message.clone()),
287                ],
288            );
289            span.set_status(Status::error(message));
290        }
291        span.end();
292    }
293}
294
295/// RAII-style handle for a `swink.eval.evaluator` span.
296pub(crate) struct EvaluatorSpan {
297    context: Context,
298}
299
300impl EvaluatorSpan {
301    /// Record the metric result and end the span. Failing metrics receive
302    /// `Status::error` so observability backends show the evaluator as the
303    /// responsible child of a failing case.
304    pub(crate) fn end(self, metric: &EvalMetricResult) {
305        let span = self.context.span();
306        let verdict = metric.score.verdict();
307        span.set_attribute(KeyValue::new(ATTR_VERDICT, verdict_str(verdict)));
308        span.set_attribute(KeyValue::new(ATTR_SCORE, metric.score.value));
309        span.set_attribute(KeyValue::new(ATTR_SCORE_THRESHOLD, metric.score.threshold));
310        if let Some(detail) = &metric.details {
311            span.set_attribute(KeyValue::new("swink.eval.details", detail.clone()));
312        }
313        if verdict.is_pass() {
314            span.set_status(Status::Ok);
315        } else {
316            let message = metric
317                .details
318                .clone()
319                .unwrap_or_else(|| format!("evaluator `{}` failed", metric.evaluator_name));
320            span.add_event(
321                Cow::Borrowed("exception"),
322                vec![
323                    KeyValue::new("exception.type", "EvaluatorFailure"),
324                    KeyValue::new("exception.message", message.clone()),
325                ],
326            );
327            span.set_status(Status::error(message));
328        }
329        span.end();
330    }
331
332    /// End the span without a metric (evaluator returned `None` — inapplicable
333    /// to the case). The span is closed with `Status::Ok` to signal a no-op.
334    pub(crate) fn end_inapplicable(self, evaluator_name: &str) {
335        let span = self.context.span();
336        span.set_attribute(KeyValue::new(
337            ATTR_EVALUATOR_NAME,
338            evaluator_name.to_string(),
339        ));
340        span.set_attribute(KeyValue::new(ATTR_VERDICT, "inapplicable"));
341        span.set_status(Status::Ok);
342        span.end();
343    }
344}
345
346fn verdict_str(verdict: Verdict) -> &'static str {
347    if verdict.is_pass() { "pass" } else { "fail" }
348}
349
350// ─── Builder ────────────────────────────────────────────────────────────────
351
352/// Builder for [`EvalsTelemetry`].
353///
354/// Defaults to the globally-installed OTel [`TracerProvider`], which a
355/// caller-supplied tracer can override. Use this in production to pick up
356/// whatever provider is already wired (OTLP, stdout, …); in tests,
357/// [`Self::with_tracer`] lets you inject a tracer backed by an
358/// `InMemorySpanExporter`.
359#[derive(Default)]
360pub struct EvalsTelemetryBuilder {
361    tracer: Option<BoxedTracer>,
362}
363
364impl EvalsTelemetryBuilder {
365    /// Use a caller-supplied tracer. Most direct path for tests; wire a
366    /// `SdkTracerProvider` with an `InMemorySpanExporter` and pass
367    /// `provider.tracer("swink.eval")` through here.
368    #[must_use]
369    pub fn with_tracer(mut self, tracer: BoxedTracer) -> Self {
370        self.tracer = Some(tracer);
371        self
372    }
373
374    /// Derive a tracer from an arbitrary [`TracerProvider`]. The tracer is
375    /// named `swink.eval`, matching the span-name prefix.
376    #[must_use]
377    pub fn with_tracer_provider<S, T, P>(mut self, provider: &P) -> Self
378    where
379        S: opentelemetry::trace::Span + Send + Sync + 'static,
380        T: Tracer<Span = S> + Send + Sync + 'static,
381        P: TracerProvider<Tracer = T>,
382    {
383        // Any `T: Tracer<Span = S>` with the Send+Sync+'static bounds implements
384        // `ObjectSafeTracer` via the blanket impl in `opentelemetry::global`.
385        let tracer = provider.tracer("swink.eval");
386        self.tracer = Some(BoxedTracer::new(Box::new(tracer)));
387        self
388    }
389
390    /// Build the [`EvalsTelemetry`]. If no tracer has been supplied, derive
391    /// one from the globally-installed provider.
392    #[must_use]
393    pub fn build(self) -> EvalsTelemetry {
394        let tracer = self.tracer.unwrap_or_else(|| global::tracer("swink.eval"));
395        EvalsTelemetry { tracer }
396    }
397}
398
399// ─── Unit tests ─────────────────────────────────────────────────────────────
400
#[cfg(test)]
mod tests {
    use super::*;
    use opentelemetry_sdk::trace::{InMemorySpanExporter, SdkTracerProvider};

    /// Build an SDK provider wired to an in-memory exporter and return both,
    /// so a test can inspect the spans the provider exported.
    fn fresh_provider() -> (SdkTracerProvider, InMemorySpanExporter) {
        let sink = InMemorySpanExporter::default();
        let provider_builder = SdkTracerProvider::builder().with_simple_exporter(sink.clone());
        (provider_builder.build(), sink)
    }

    /// A span started through the telemetry's tracer must reach the exporter
    /// of the provider the builder was given.
    #[test]
    fn builder_uses_injected_tracer() {
        let (provider, exporter) = fresh_provider();
        let builder = EvalsTelemetry::builder().with_tracer_provider(&provider);
        let telemetry = builder.build();

        // Start and immediately end one span on the configured tracer.
        let mut probe = telemetry.tracer().start("selftest");
        probe.end();

        provider.force_flush().expect("flush ok");
        let finished = exporter.get_finished_spans().expect("get spans");
        let exported = finished.iter().any(|s| s.name == "selftest");
        assert!(exported);
    }

    /// `verdict_str` maps the two verdicts to their attribute strings.
    #[test]
    fn verdict_str_rendering() {
        let rendered = [verdict_str(Verdict::Pass), verdict_str(Verdict::Fail)];
        assert_eq!(rendered[0], "pass");
        assert_eq!(rendered[1], "fail");
    }
}
433}