Skip to main content

agent_sdk/observability/
spans.rs

1//! Span construction and lifecycle helpers.
2
3use std::borrow::Cow;
4
5use opentelemetry::global::{self, BoxedSpan, BoxedTracer};
6use opentelemetry::trace::{
7    Span, SpanContext, SpanId, SpanKind, Status, TraceFlags, TraceId, TraceState, Tracer,
8};
9use opentelemetry::{InstrumentationScope, KeyValue};
10
11use super::types::CaptureDecision;
12
13const TRACER_NAME: &str = env!("CARGO_PKG_NAME");
14const TRACER_VERSION: &str = env!("CARGO_PKG_VERSION");
15
16/// Get the SDK tracer from the global provider.
17///
18/// Fetched fresh each time to avoid binding to a no-op if the application
19/// installs its provider after the SDK initialises.
20fn tracer() -> BoxedTracer {
21    let scope = InstrumentationScope::builder(TRACER_NAME)
22        .with_version(TRACER_VERSION)
23        .build();
24    global::tracer_with_scope(scope)
25}
26
27/// Start an `INTERNAL` span with the given name and attributes.
28#[must_use]
29pub fn start_internal_span(name: impl Into<Cow<'static, str>>, attrs: Vec<KeyValue>) -> BoxedSpan {
30    let t = tracer();
31    t.span_builder(name)
32        .with_kind(SpanKind::Internal)
33        .with_attributes(attrs)
34        .start(&t)
35}
36
37/// Start a `CLIENT` span with the given name and attributes.
38#[must_use]
39pub fn start_client_span(name: impl Into<Cow<'static, str>>, attrs: Vec<KeyValue>) -> BoxedSpan {
40    let t = tracer();
41    t.span_builder(name)
42        .with_kind(SpanKind::Client)
43        .with_attributes(attrs)
44        .start(&t)
45}
46
47/// Set span status to error with a message and `error.type` attribute.
48pub fn set_span_error(span: &mut BoxedSpan, error_type: &str, message: &str) {
49    span.set_attribute(KeyValue::new(
50        super::attrs::ERROR_TYPE,
51        error_type.to_string(),
52    ));
53    span.set_status(Status::error(message.to_string()));
54}
55
56/// Add a structured span event with attributes.
57///
58/// Skips silently when the span is not recording (sampling drop or
59/// no-op tracer). Mirrors the guards used by every other helper in
60/// this module so callers don't have to check `is_recording` first.
61pub fn add_event(span: &mut BoxedSpan, name: impl Into<Cow<'static, str>>, attrs: Vec<KeyValue>) {
62    if !span.is_recording() {
63        return;
64    }
65    span.add_event(name, attrs);
66}
67
68/// Add an `OTel` span link from `span` to `target`, carrying `attrs`.
69///
70/// Skips silently when the span is not recording, when `target` is
71/// not a valid `SpanContext` (zero trace/span ids), or when no tracer
72/// provider is installed.  This mirrors [`add_event`] so call sites
73/// don't have to gate every link emission with `is_recording` checks.
74pub fn add_link(span: &mut BoxedSpan, target: SpanContext, attrs: Vec<KeyValue>) {
75    if !span.is_recording() {
76        return;
77    }
78    if !target.is_valid() {
79        return;
80    }
81    span.add_link(target, attrs);
82}
83
84/// Add a `replay-of` link pointing at the original attempt's span.
85///
86/// `original_trace_id` and `original_span_id` are hex-encoded as
87/// stored on `agent_sdk_turn_attempts.{otel_trace_id,otel_span_id}`.
88/// Malformed hex values are treated as "no link" so a corrupt journal
89/// row never poisons the live span.
90///
91/// `attempt_index` is 1-based and matches
92/// `TurnAttempt::attempt_number` so cross-trace queries can join on it.
93pub fn link_to_replay_origin(
94    span: &mut BoxedSpan,
95    original_trace_id: &str,
96    original_span_id: &str,
97    attempt_index: u32,
98) {
99    let Some(target) =
100        span_context_from_hex(original_trace_id, original_span_id, TraceFlags::SAMPLED)
101    else {
102        return;
103    };
104    add_link(
105        span,
106        target,
107        vec![
108            KeyValue::new(
109                super::attrs::AGENT_REPLAY_ORIGINAL_TRACE_ID,
110                original_trace_id.to_string(),
111            ),
112            KeyValue::new(
113                super::attrs::AGENT_REPLAY_ORIGINAL_SPAN_ID,
114                original_span_id.to_string(),
115            ),
116            super::attrs::kv_i64(
117                super::attrs::AGENT_REPLAY_ATTEMPT_INDEX,
118                i64::from(attempt_index),
119            ),
120        ],
121    );
122}
123
124/// Add a `subagent-of` link pointing at the parent turn's span.
125///
126/// Even though parent and subagent share an `OTel` context today, the
127/// explicit link makes the relationship queryable when one of the
128/// spans is dropped by tail sampling.  Malformed ids are silently
129/// dropped (see [`link_to_replay_origin`]).
130pub fn link_to_parent_turn(span: &mut BoxedSpan, parent_trace_id: &str, parent_span_id: &str) {
131    let Some(target) = span_context_from_hex(parent_trace_id, parent_span_id, TraceFlags::SAMPLED)
132    else {
133        return;
134    };
135    add_link(span, target, vec![]);
136}
137
138/// Build a remote `SpanContext` from hex-encoded trace + span ids, for
139/// re-parenting spans under a span that is no longer live in the
140/// current task.
141///
142/// The daemon-hosted worker drives a turn across multiple tasks
143/// (execute → suspend at the tool boundary → resume), so its root
144/// `invoke_agent` span cannot stay live for the whole turn. The worker
145/// persists the root span's `(trace_id, span_id)` and rebuilds a remote
146/// parent context from them via this helper so resumed `chat` calls and
147/// child-task `execute_tool` calls nest under the turn root. Returns
148/// `None` for malformed / zero ids (treated as "no parent").
149///
150/// **Legacy entry point — assumes the remote parent was sampled.** It
151/// always marks the reconstructed context SAMPLED, which forces a
152/// `ParentBased` sampler to record every re-parented child even when the
153/// root span was sampled out. Prefer [`remote_span_context_with_sampling`]
154/// and pass the root span's real sampled bit so ratio sampling configured
155/// for the root is honoured by its children.
156#[must_use]
157pub fn remote_span_context(trace_hex: &str, span_hex: &str) -> Option<SpanContext> {
158    span_context_from_hex(trace_hex, span_hex, TraceFlags::SAMPLED)
159}
160
161/// Build a remote `SpanContext` that carries the root span's **real**
162/// sampled bit.
163///
164/// A `ParentBased` sampler decides whether to record a child span from its
165/// remote parent's sampled flag. Forcing SAMPLED (see
166/// [`remote_span_context`]) therefore defeats ratio sampling: a child of a
167/// sampled-out root would still be recorded and exported, orphaned from a
168/// parent that was never exported. Passing the parent's actual `sampled`
169/// state keeps the child's sampling decision aligned with the root's.
170///
171/// Returns `None` for malformed / zero ids (treated as "no parent").
172#[must_use]
173pub fn remote_span_context_with_sampling(
174    trace_hex: &str,
175    span_hex: &str,
176    sampled: bool,
177) -> Option<SpanContext> {
178    span_context_from_hex(trace_hex, span_hex, sampled_flags(sampled))
179}
180
181const fn sampled_flags(sampled: bool) -> TraceFlags {
182    if sampled {
183        TraceFlags::SAMPLED
184    } else {
185        TraceFlags::NOT_SAMPLED
186    }
187}
188
189/// Build a `SpanContext` from hex-encoded trace + span ids with explicit
190/// trace flags.
191///
192/// Returns `None` when either id is malformed or zero (`TraceId::INVALID`
193/// / `SpanId::INVALID`). The constructed context is marked
194/// `is_remote = true`.
195fn span_context_from_hex(
196    trace_hex: &str,
197    span_hex: &str,
198    flags: TraceFlags,
199) -> Option<SpanContext> {
200    let trace_id = TraceId::from_hex(trace_hex).ok()?;
201    let span_id = SpanId::from_hex(span_hex).ok()?;
202    let ctx = SpanContext::new(trace_id, span_id, flags, true, TraceState::default());
203    if !ctx.is_valid() {
204        return None;
205    }
206    Some(ctx)
207}
208
209/// Record payload content on an LLM span based on store decisions.
210pub fn record_payload_on_span(
211    span: &mut BoxedSpan,
212    result: &super::types::CaptureResult,
213    system_json: Option<&serde_json::Value>,
214    input_json: &serde_json::Value,
215    output_json: &serde_json::Value,
216) {
217    use super::attrs;
218
219    if !span.is_recording() {
220        return;
221    }
222
223    apply_capture_decision(
224        span,
225        &result.system_instructions,
226        system_json,
227        attrs::GEN_AI_SYSTEM_INSTRUCTIONS,
228        attrs::SDK_OTEL_SYSTEM_INSTRUCTIONS_REF,
229    );
230    apply_capture_decision(
231        span,
232        &result.input_messages,
233        Some(input_json),
234        attrs::GEN_AI_INPUT_MESSAGES,
235        attrs::SDK_OTEL_INPUT_MESSAGES_REF,
236    );
237    apply_capture_decision(
238        span,
239        &result.output_messages,
240        Some(output_json),
241        attrs::GEN_AI_OUTPUT_MESSAGES,
242        attrs::SDK_OTEL_OUTPUT_MESSAGES_REF,
243    );
244}
245
246fn apply_capture_decision(
247    span: &mut BoxedSpan,
248    decision: &CaptureDecision,
249    json_value: Option<&serde_json::Value>,
250    inline_attr: &'static str,
251    ref_attr: &'static str,
252) {
253    match decision {
254        CaptureDecision::Inline => {
255            if let Some(val) = json_value {
256                span.set_attribute(KeyValue::new(inline_attr, val.to_string()));
257            }
258        }
259        CaptureDecision::Reference(r) => {
260            span.set_attribute(KeyValue::new(ref_attr, r.clone()));
261        }
262        CaptureDecision::Omit => {}
263    }
264}
265
#[cfg(test)]
mod tests {
    use super::{remote_span_context, remote_span_context_with_sampling};
    use anyhow::Context as _;

    // Non-zero trace and span ids taken from the W3C trace-context examples.
    const TRACE_HEX: &str = "4bf92f3577b34da6a3ce929d0e0e4736";
    const SPAN_HEX: &str = "00f067aa0ba902b7";

    /// The `sampled` argument must be reflected in the context's sampled
    /// flag, and the context must always be remote.
    #[test]
    fn remote_span_context_honours_real_sampled_bit() -> anyhow::Result<()> {
        let sampled_ctx =
            remote_span_context_with_sampling(TRACE_HEX, SPAN_HEX, true).context("sampled ctx")?;
        assert!(
            sampled_ctx.is_sampled(),
            "sampled=true must produce a sampled ctx"
        );
        assert!(sampled_ctx.is_remote());

        let unsampled_ctx = remote_span_context_with_sampling(TRACE_HEX, SPAN_HEX, false)
            .context("unsampled ctx")?;
        assert!(
            !unsampled_ctx.is_sampled(),
            "sampled=false must NOT force the SAMPLED flag (ratio sampling respected)"
        );
        assert!(unsampled_ctx.is_remote());
        Ok(())
    }

    /// The legacy entry point keeps forcing the SAMPLED flag.
    #[test]
    fn legacy_remote_span_context_stays_sampled() -> anyhow::Result<()> {
        let legacy_ctx = remote_span_context(TRACE_HEX, SPAN_HEX).context("ctx")?;
        assert!(legacy_ctx.is_sampled());
        Ok(())
    }

    /// All-zero ids and non-hex input both yield `None`.
    #[test]
    fn remote_span_context_rejects_zero_ids() {
        let zero_trace = "0".repeat(32);
        let zero_span = "0".repeat(16);
        assert!(remote_span_context_with_sampling(&zero_trace, &zero_span, true).is_none());
        assert!(remote_span_context_with_sampling("not-hex", SPAN_HEX, false).is_none());
    }
}