Skip to main content

harn_vm/
tracing.rs

1//! Pipeline Observability: structured tracing spans with parent/child relationships.
2//!
3//! When tracing is enabled (`vm.enable_tracing()`), the VM automatically emits
4//! spans for pipeline execution, function calls, LLM calls, tool invocations,
5//! imports, and async operations. Spans form a tree via parent_span_id.
6//!
7//! Access via builtins: `trace_spans()` returns all completed spans,
8//! `trace_summary()` returns a formatted summary.
9
10use std::cell::RefCell;
11use std::collections::BTreeMap;
12use std::time::{Instant, SystemTime, UNIX_EPOCH};
13
14use crate::value::VmValue;
15
16/// The kind of operation a span represents.
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
18pub enum SpanKind {
19    Pipeline,
20    FnCall,
21    LlmCall,
22    ToolCall,
23    Import,
24    Parallel,
25    Spawn,
26    /// A `@step`-annotated function while its frame is on the call stack.
27    Step,
28    /// Host-side VM setup before user bytecode starts executing.
29    VmSetup,
30    /// Cooperative worker suspension while the durable checkpoint is written.
31    Suspension,
32    /// Worker resumption after a cooperative suspension.
33    Resume,
34    /// Pipeline drain / settlement phase.
35    Drain,
36    /// One drain settlement decision.
37    DrainDecision,
38    /// `pool.submit()` boundary — accepted, rejected, or queued (PL-06).
39    PoolSubmit,
40    /// Pool worker picks the task out of the queue (PL-06). Links back to
41    /// the originating `PoolSubmit` span across the async boundary so
42    /// queue dwell time can be reconstructed from a single trace.
43    PoolDequeue,
44    /// `emit_channel(...)` boundary — opened at `emit_channel`, closed
45    /// after the durable append + trigger fan-out finishes (CH-06 / #1877).
46    ChannelEmit,
47    /// Channel-source trigger match boundary — opened at trigger fan-out
48    /// just before the handler is invoked, closed once dispatch finishes.
49    /// Links back to the originating `ChannelEmit` span (multi-link for
50    /// batched / aggregated triggers).
51    ChannelMatch,
52    /// Script-opened user timing span via `std/timing`. Modeled as an
53    /// OTel INTERNAL span — distinct from `FnCall` so OTel exporters and
54    /// `harn run --profile-json` do not confuse them with LLM/tool work.
55    UserTiming,
56    /// A model routing / escalation decision — the agent switched the
57    /// serving model mid-run. Metadata carries `from_model`, `to_model`,
58    /// and `reason` (see [`meta`]). Emitted as a zero-duration marker at
59    /// the decision point so viewers can annotate the flame graph with the
60    /// switch instead of inferring it from adjacent `llm_call` models.
61    ModelRoute,
62    /// A batch of tools promoted into the active surface (MCP bootstrap,
63    /// skill activation, or a search-driven mount). Metadata carries
64    /// `tool_names`, `tool_count`, `source`, and an optional `detail`.
65    ToolMount,
66    /// A single deferred tool schema promoted via `tool_search`. Metadata
67    /// carries `tool_name`, `query`, and the match `score`.
68    DeferredToolLoad,
69}
70
71impl SpanKind {
72    pub fn as_str(self) -> &'static str {
73        match self {
74            Self::Pipeline => "pipeline",
75            Self::FnCall => "fn_call",
76            Self::LlmCall => "llm_call",
77            Self::ToolCall => "tool_call",
78            Self::Import => "import",
79            Self::Parallel => "parallel",
80            Self::Spawn => "spawn",
81            Self::Step => "step",
82            Self::VmSetup => "vm_setup",
83            Self::Suspension => "suspension",
84            Self::Resume => "resume",
85            Self::Drain => "drain",
86            Self::DrainDecision => "drain_decision",
87            Self::PoolSubmit => "pool_submit",
88            Self::PoolDequeue => "pool_dequeue",
89            Self::ChannelEmit => "channel_emit",
90            Self::ChannelMatch => "channel_match",
91            Self::UserTiming => "user_timing",
92            Self::ModelRoute => "model_route",
93            Self::ToolMount => "tool_mount",
94            Self::DeferredToolLoad => "deferred_tool_load",
95        }
96    }
97}
98
99/// Canonical metadata keys for VM trace spans. Downstream viewers (Burin
100/// portal, harn-cloud dashboard) key off these exact strings to render
101/// token flame graphs and tool-selection events, so they are defined once
102/// here rather than retyped at each emission site.
103pub mod meta {
104    // llm_call token + cost attribution.
105    pub const MODEL: &str = "model";
106    pub const PROVIDER: &str = "provider";
107    pub const INPUT_TOKENS: &str = "input_tokens";
108    pub const OUTPUT_TOKENS: &str = "output_tokens";
109    pub const CACHE_READ_TOKENS: &str = "cache_read_tokens";
110    pub const CACHE_WRITE_TOKENS: &str = "cache_write_tokens";
111    pub const COST_USD: &str = "cost_usd";
112
113    // model_route.
114    pub const FROM_MODEL: &str = "from_model";
115    pub const TO_MODEL: &str = "to_model";
116    pub const REASON: &str = "reason";
117
118    // tool_mount.
119    pub const TOOL_NAMES: &str = "tool_names";
120    pub const TOOL_COUNT: &str = "tool_count";
121    pub const SOURCE: &str = "source";
122    pub const DETAIL: &str = "detail";
123
124    // deferred_tool_load.
125    pub const TOOL_NAME: &str = "tool_name";
126    pub const QUERY: &str = "query";
127    pub const SCORE: &str = "score";
128}
129
130/// Structured per-LLM-call token and cost attribution for an `llm_call`
131/// span. Built once at the call site and lowered to metadata pairs via
132/// [`LlmCallUsage::metadata_pairs`] so every emission uses the canonical
133/// [`meta`] keys instead of ad-hoc strings. `cost_usd` is `None` when the
134/// (provider, model) pair has no catalog pricing.
135#[derive(Debug, Clone, Default, PartialEq)]
136pub struct LlmCallUsage {
137    pub model: String,
138    pub provider: String,
139    pub input_tokens: i64,
140    pub output_tokens: i64,
141    pub cache_read_tokens: i64,
142    pub cache_write_tokens: i64,
143    pub cost_usd: Option<f64>,
144}
145
146impl LlmCallUsage {
147    /// Lower to `(key, value)` pairs keyed by the canonical [`meta`]
148    /// constants, suitable for `annotate_current_span` / `span_set_metadata`.
149    pub fn metadata_pairs(&self) -> Vec<(&'static str, serde_json::Value)> {
150        let mut pairs = vec![
151            (meta::MODEL, serde_json::json!(self.model)),
152            (meta::PROVIDER, serde_json::json!(self.provider)),
153            (meta::INPUT_TOKENS, serde_json::json!(self.input_tokens)),
154            (meta::OUTPUT_TOKENS, serde_json::json!(self.output_tokens)),
155            (
156                meta::CACHE_READ_TOKENS,
157                serde_json::json!(self.cache_read_tokens),
158            ),
159            (
160                meta::CACHE_WRITE_TOKENS,
161                serde_json::json!(self.cache_write_tokens),
162            ),
163        ];
164        if let Some(cost) = self.cost_usd {
165            pairs.push((meta::COST_USD, serde_json::json!(cost)));
166        }
167        pairs
168    }
169}
170
171/// Emit a zero-duration marker span of `kind` carrying `metadata`. Marker
172/// spans model point-in-time telemetry events (model routing, tool mounts,
173/// deferred-tool promotions) that have no meaningful duration but need to
174/// appear in the trace tree at their causal position under the current
175/// active span. No-op when tracing is disabled.
176pub fn emit_marker_span(
177    kind: SpanKind,
178    name: impl Into<String>,
179    metadata: Vec<(&str, serde_json::Value)>,
180) {
181    let span_id = span_start(kind, name.into());
182    if span_id == 0 {
183        return;
184    }
185    for (key, value) in metadata {
186        span_set_metadata(span_id, key, value);
187    }
188    span_end(span_id);
189}
190
191/// Emit a [`SpanKind::ModelRoute`] marker for a model switch / escalation.
192pub fn emit_model_route(from_model: &str, to_model: &str, reason: &str) {
193    emit_marker_span(
194        SpanKind::ModelRoute,
195        "model_route",
196        vec![
197            (meta::FROM_MODEL, serde_json::json!(from_model)),
198            (meta::TO_MODEL, serde_json::json!(to_model)),
199            (meta::REASON, serde_json::json!(reason)),
200        ],
201    );
202}
203
204/// Emit a [`SpanKind::ToolMount`] marker for a batch of tools promoted into
205/// the active surface. `source` is the promotion origin (`"mcp"`,
206/// `"skill"`, `"search"`); `detail` optionally names the concrete source
207/// (e.g. an MCP server name).
208pub fn emit_tool_mount(tool_names: &[String], source: &str, detail: Option<&str>) {
209    if tool_names.is_empty() {
210        return;
211    }
212    let mut metadata = vec![
213        (meta::TOOL_NAMES, serde_json::json!(tool_names)),
214        (meta::TOOL_COUNT, serde_json::json!(tool_names.len())),
215        (meta::SOURCE, serde_json::json!(source)),
216    ];
217    if let Some(detail) = detail {
218        metadata.push((meta::DETAIL, serde_json::json!(detail)));
219    }
220    emit_marker_span(SpanKind::ToolMount, "tool_mount", metadata);
221}
222
223/// Emit a [`SpanKind::DeferredToolLoad`] marker for a single deferred tool
224/// schema promoted via `tool_search`.
225pub fn emit_deferred_tool_load(tool_name: &str, query: &str, score: Option<f64>) {
226    let mut metadata = vec![
227        (meta::TOOL_NAME, serde_json::json!(tool_name)),
228        (meta::QUERY, serde_json::json!(query)),
229    ];
230    if let Some(score) = score {
231        metadata.push((meta::SCORE, serde_json::json!(score)));
232    }
233    emit_marker_span(SpanKind::DeferredToolLoad, "deferred_tool_load", metadata);
234}
235
236/// Link to a span that is causally related but not the parent.
237#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
238#[serde(default)]
239pub struct SpanLink {
240    pub trace_id: String,
241    pub span_id: String,
242    #[serde(skip_serializing_if = "BTreeMap::is_empty")]
243    pub attributes: BTreeMap<String, String>,
244}
245
246impl SpanLink {
247    pub fn new(trace_id: impl Into<String>, span_id: impl Into<String>) -> Self {
248        Self {
249            trace_id: trace_id.into(),
250            span_id: span_id.into(),
251            attributes: BTreeMap::new(),
252        }
253    }
254
255    pub fn with_attributes(mut self, attributes: BTreeMap<String, String>) -> Self {
256        self.attributes = attributes;
257        self
258    }
259}
260
261/// One sub-phase annotation attached to a span. Modeled after OTel span
262/// events: a named checkpoint with optional structured attributes that
263/// piggy-backs on the enclosing span rather than allocating a new one.
264#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
265#[serde(default)]
266pub struct SpanEvent {
267    pub name: String,
268    /// Wall-clock time of the event in milliseconds since the UNIX epoch.
269    pub time_unix_ms: u64,
270    /// Monotonic offset from the parent span's start, in milliseconds.
271    pub offset_ms: u64,
272    #[serde(skip_serializing_if = "BTreeMap::is_empty")]
273    pub attributes: BTreeMap<String, serde_json::Value>,
274}
275
276/// A completed tracing span.
277#[derive(Debug, Clone)]
278pub struct Span {
279    pub trace_id: String,
280    pub span_id: u64,
281    pub parent_id: Option<u64>,
282    pub kind: SpanKind,
283    pub name: String,
284    /// Monotonic offset from the collector's epoch, in milliseconds.
285    pub start_ms: u64,
286    /// Wall-clock start in milliseconds since the UNIX epoch. Recorded
287    /// once at `start` for external correlation; duration is always
288    /// derived from the monotonic clock, not from wall-clock end - start.
289    pub start_unix_ms: u64,
290    pub duration_ms: u64,
291    pub metadata: BTreeMap<String, serde_json::Value>,
292    pub links: Vec<SpanLink>,
293    pub events: Vec<SpanEvent>,
294}
295
296/// An in-flight span (not yet completed).
297struct OpenSpan {
298    trace_id: String,
299    span_id: u64,
300    parent_id: Option<u64>,
301    kind: SpanKind,
302    name: String,
303    started_at: Instant,
304    /// Mock-monotonic snapshot at start, captured only when a
305    /// `clock_mock` override was active. Pairs with the closing snapshot
306    /// to compute deterministic durations under `mock_time(...)`.
307    started_at_mock_mono_ms: Option<u64>,
308    start_unix_ms: u64,
309    metadata: BTreeMap<String, serde_json::Value>,
310    links: Vec<SpanLink>,
311    events: Vec<SpanEvent>,
312}
313
314/// Thread-local span collector. Accumulates completed spans and tracks the
315/// active span stack for automatic parent assignment.
316pub struct SpanCollector {
317    trace_id: String,
318    next_id: u64,
319    /// Stack of open span IDs — the top is the current active span.
320    active_stack: Vec<u64>,
321    /// Open (in-flight) spans keyed by ID.
322    open: BTreeMap<u64, OpenSpan>,
323    /// Completed spans in chronological order.
324    completed: Vec<Span>,
325    /// Epoch for relative timing.
326    epoch: Instant,
327}
328
329impl Default for SpanCollector {
330    fn default() -> Self {
331        Self::new()
332    }
333}
334
335impl SpanCollector {
336    pub fn new() -> Self {
337        Self {
338            next_id: 1,
339            trace_id: format!("trace_{}", uuid::Uuid::now_v7()),
340            active_stack: Vec::new(),
341            open: BTreeMap::new(),
342            completed: Vec::new(),
343            epoch: Instant::now(),
344        }
345    }
346
347    /// Start a new span. Returns the span ID.
348    pub fn start(&mut self, kind: SpanKind, name: String) -> u64 {
349        let parent_id = self.active_stack.last().copied();
350        self.start_with_parent(kind, name, Vec::new(), parent_id)
351    }
352
353    /// Start a new span with non-parent causal links. Returns the span ID.
354    pub fn start_with_links(&mut self, kind: SpanKind, name: String, links: Vec<SpanLink>) -> u64 {
355        let parent_id = self.active_stack.last().copied();
356        self.start_with_parent(kind, name, links, parent_id)
357    }
358
359    /// Start a root span with non-parent causal links. Returns the span ID.
360    pub fn start_detached_with_links(
361        &mut self,
362        kind: SpanKind,
363        name: String,
364        links: Vec<SpanLink>,
365    ) -> u64 {
366        self.start_with_parent(kind, name, links, None)
367    }
368
369    fn start_with_parent(
370        &mut self,
371        kind: SpanKind,
372        name: String,
373        links: Vec<SpanLink>,
374        parent_id: Option<u64>,
375    ) -> u64 {
376        let id = self.next_id;
377        self.next_id += 1;
378        let now = Instant::now();
379        let started_at_mock_mono_ms = mock_monotonic_ms();
380        let start_unix_ms = wall_clock_ms();
381
382        let mut event_metadata = BTreeMap::new();
383        if !links.is_empty() {
384            event_metadata.insert("links".to_string(), serde_json::json!(links));
385        }
386        crate::events::emit_span_start(id, parent_id, &name, kind.as_str(), event_metadata);
387
388        self.open.insert(
389            id,
390            OpenSpan {
391                trace_id: self.trace_id.clone(),
392                span_id: id,
393                parent_id,
394                kind,
395                name,
396                started_at: now,
397                started_at_mock_mono_ms,
398                start_unix_ms,
399                metadata: BTreeMap::new(),
400                links,
401                events: Vec::new(),
402            },
403        );
404        self.active_stack.push(id);
405        id
406    }
407
408    /// Attach metadata to an open span.
409    pub fn set_metadata(&mut self, span_id: u64, key: &str, value: serde_json::Value) {
410        if let Some(span) = self.open.get_mut(&span_id) {
411            span.metadata.insert(key.to_string(), value);
412        }
413    }
414
415    /// Attach metadata to an open or completed span unless the key exists.
416    pub fn attach_metadata_if_absent(&mut self, span_id: u64, key: &str, value: serde_json::Value) {
417        if let Some(span) = self.open.get_mut(&span_id) {
418            span.metadata.entry(key.to_string()).or_insert(value);
419            return;
420        }
421        if let Some(span) = self
422            .completed
423            .iter_mut()
424            .rev()
425            .find(|span| span.span_id == span_id)
426        {
427            span.metadata.entry(key.to_string()).or_insert(value);
428        }
429    }
430
431    /// Append a sub-phase annotation to an open span. Returns `true` if
432    /// the event was attached; `false` if `span_id` does not match any
433    /// open span (already closed or never opened).
434    pub fn record_event(
435        &mut self,
436        span_id: u64,
437        name: String,
438        attributes: BTreeMap<String, serde_json::Value>,
439    ) -> bool {
440        let Some(span) = self.open.get_mut(&span_id) else {
441            return false;
442        };
443        let offset_ms = match (span.started_at_mock_mono_ms, mock_monotonic_ms()) {
444            (Some(start), Some(now)) => now.saturating_sub(start),
445            _ => span.started_at.elapsed().as_millis() as u64,
446        };
447        span.events.push(SpanEvent {
448            name,
449            time_unix_ms: wall_clock_ms(),
450            offset_ms,
451            attributes,
452        });
453        true
454    }
455
456    /// Read the wall-clock start of an open span.
457    pub fn open_start_unix_ms(&self, span_id: u64) -> Option<u64> {
458        self.open.get(&span_id).map(|span| span.start_unix_ms)
459    }
460
461    /// End a span. Moves it from open to completed and returns the
462    /// finalized span so callers (e.g. `std/timing`) can read its
463    /// `duration_ms` directly without re-scanning `take_spans()`.
464    pub fn end(&mut self, span_id: u64) -> Option<Span> {
465        let span = self.open.remove(&span_id)?;
466        let start_ms = span.started_at.duration_since(self.epoch).as_millis() as u64;
467        let duration_ms = match (span.started_at_mock_mono_ms, mock_monotonic_ms()) {
468            (Some(start), Some(end)) => end.saturating_sub(start),
469            _ => span.started_at.elapsed().as_millis() as u64,
470        };
471
472        let mut end_meta = span.metadata.clone();
473        end_meta.insert(
474            "duration_ms".to_string(),
475            serde_json::Value::Number(serde_json::Number::from(duration_ms)),
476        );
477        crate::events::emit_span_end(span_id, end_meta);
478
479        let completed = Span {
480            trace_id: span.trace_id,
481            span_id: span.span_id,
482            parent_id: span.parent_id,
483            kind: span.kind,
484            name: span.name,
485            start_ms,
486            start_unix_ms: span.start_unix_ms,
487            duration_ms,
488            metadata: span.metadata,
489            links: span.links,
490            events: span.events,
491        };
492        self.completed.push(completed.clone());
493
494        if let Some(pos) = self.active_stack.iter().rposition(|&id| id == span_id) {
495            self.active_stack.remove(pos);
496        }
497        Some(completed)
498    }
499
500    /// Get the current active span ID (if any).
501    pub fn current_span_id(&self) -> Option<u64> {
502        self.active_stack.last().copied()
503    }
504
505    /// Build a serializable link for an open span.
506    pub fn span_link(&self, span_id: u64) -> Option<SpanLink> {
507        self.open
508            .get(&span_id)
509            .map(|span| SpanLink::new(span.trace_id.clone(), span.span_id.to_string()))
510    }
511
512    /// Build a serializable link for the current active span.
513    pub fn current_span_link(&self) -> Option<SpanLink> {
514        self.current_span_id()
515            .and_then(|span_id| self.span_link(span_id))
516    }
517
518    /// Take all completed spans (drains the collector).
519    pub fn take_spans(&mut self) -> Vec<Span> {
520        std::mem::take(&mut self.completed)
521    }
522
523    /// Peek at all completed spans (non-destructive).
524    pub fn spans(&self) -> &[Span] {
525        &self.completed
526    }
527
528    /// Reset the collector.
529    pub fn reset(&mut self) {
530        self.active_stack.clear();
531        self.open.clear();
532        self.completed.clear();
533        self.next_id = 1;
534        self.trace_id = format!("trace_{}", uuid::Uuid::now_v7());
535        self.epoch = Instant::now();
536    }
537}
538
539thread_local! {
540    static COLLECTOR: RefCell<SpanCollector> = RefCell::new(SpanCollector::new());
541    static TRACING_ENABLED: RefCell<bool> = const { RefCell::new(false) };
542}
543
544/// Best-effort wall-clock millis since the UNIX epoch. Honors an active
545/// `clock_mock` override so spans recorded inside `mock_time(...)` blocks
546/// align with the rest of the runtime's clock reads; returns 0 only if
547/// the host clock is behind the epoch (e.g. unusual sandbox shims).
548fn wall_clock_ms() -> u64 {
549    if let Some(mock) = crate::clock_mock::active_mock_clock() {
550        return mock.now_wall_ms() as u64;
551    }
552    SystemTime::now()
553        .duration_since(UNIX_EPOCH)
554        .map(|d| d.as_millis() as u64)
555        .unwrap_or(0)
556}
557
558/// Mock-aware monotonic snapshot. Returns `Some(ms)` when a
559/// `clock_mock` override is active, `None` otherwise. Span lifecycle
560/// pairs the start/end snapshots so durations recorded under
561/// `mock_time(...)` reflect `advance_time(...)` instead of real
562/// wall-clock progress; spans without an active mock at start fall
563/// through to the standard `Instant::elapsed` path on close.
564fn mock_monotonic_ms() -> Option<u64> {
565    crate::clock_mock::active_mock_clock().map(|mock| mock.now_monotonic_ms() as u64)
566}
567
568/// Enable or disable VM tracing for the current thread.
569pub fn set_tracing_enabled(enabled: bool) {
570    TRACING_ENABLED.with(|e| *e.borrow_mut() = enabled);
571    if enabled {
572        COLLECTOR.with(|c| c.borrow_mut().reset());
573    }
574}
575
576/// Check if tracing is enabled.
577pub fn is_tracing_enabled() -> bool {
578    TRACING_ENABLED.with(|e| *e.borrow())
579}
580
581/// Start a span (no-op if tracing disabled). Returns span ID or 0.
582pub fn span_start(kind: SpanKind, name: String) -> u64 {
583    if !is_tracing_enabled() {
584        return 0;
585    }
586    COLLECTOR.with(|c| c.borrow_mut().start(kind, name))
587}
588
589/// Start a span with non-parent causal links (no-op if tracing disabled).
590pub fn span_start_with_links(kind: SpanKind, name: String, links: Vec<SpanLink>) -> u64 {
591    if !is_tracing_enabled() {
592        return 0;
593    }
594    COLLECTOR.with(|c| c.borrow_mut().start_with_links(kind, name, links))
595}
596
597/// Start a root span with non-parent causal links (no-op if tracing disabled).
598pub fn span_start_detached_with_links(kind: SpanKind, name: String, links: Vec<SpanLink>) -> u64 {
599    if !is_tracing_enabled() {
600        return 0;
601    }
602    COLLECTOR.with(|c| c.borrow_mut().start_detached_with_links(kind, name, links))
603}
604
605/// Attach metadata to an open span (no-op if span_id is 0).
606pub fn span_set_metadata(span_id: u64, key: &str, value: serde_json::Value) {
607    if span_id == 0 {
608        return;
609    }
610    COLLECTOR.with(|c| c.borrow_mut().set_metadata(span_id, key, value));
611}
612
613/// End a span (no-op if span_id is 0). Returns the finalized span when
614/// the id was a live open span.
615pub fn span_end(span_id: u64) -> Option<Span> {
616    if span_id == 0 {
617        return None;
618    }
619    COLLECTOR.with(|c| c.borrow_mut().end(span_id))
620}
621
622/// Start a user-timing span. Unlike [`span_start`], this always records
623/// regardless of [`is_tracing_enabled`] — `std/timing` callers depend on
624/// the returned `duration_ms` to function as a primitive replacement for
625/// hand-rolled `now_ms()` subtraction.
626pub fn span_start_user_timing(
627    name: String,
628    attrs: BTreeMap<String, serde_json::Value>,
629) -> (u64, String, Option<u64>, u64) {
630    COLLECTOR.with(|c| {
631        let mut c = c.borrow_mut();
632        let id = c.start(SpanKind::UserTiming, name);
633        for (key, value) in attrs {
634            c.set_metadata(id, &key, value);
635        }
636        let parent = c.open.get(&id).and_then(|span| span.parent_id);
637        let trace_id = c
638            .open
639            .get(&id)
640            .map(|span| span.trace_id.clone())
641            .unwrap_or_default();
642        let start_unix_ms = c.open_start_unix_ms(id).unwrap_or(0);
643        (id, trace_id, parent, start_unix_ms)
644    })
645}
646
647/// Record a sub-phase event on an open span. No-op when `span_id` is 0
648/// or already closed; returns whether the event was attached so callers
649/// can surface no-op feedback.
650pub fn span_record_event(
651    span_id: u64,
652    name: String,
653    attributes: BTreeMap<String, serde_json::Value>,
654) -> bool {
655    if span_id == 0 {
656        return false;
657    }
658    COLLECTOR.with(|c| c.borrow_mut().record_event(span_id, name, attributes))
659}
660
661/// Attach metadata to an open span. No-op when `span_id` is 0 or already
662/// closed.
663pub fn span_attach_metadata(span_id: u64, key: &str, value: serde_json::Value) {
664    if span_id == 0 {
665        return;
666    }
667    COLLECTOR.with(|c| c.borrow_mut().set_metadata(span_id, key, value));
668}
669
670/// Attach metadata to an open or completed span unless the key already exists.
671pub fn span_attach_metadata_if_absent(span_id: u64, key: &str, value: serde_json::Value) {
672    if span_id == 0 {
673        return;
674    }
675    COLLECTOR.with(|c| {
676        c.borrow_mut()
677            .attach_metadata_if_absent(span_id, key, value);
678    });
679}
680
681/// Get the currently active span id, if tracing is enabled and a span is open.
682pub fn current_span_id() -> Option<u64> {
683    if !is_tracing_enabled() {
684        return None;
685    }
686    COLLECTOR.with(|c| c.borrow().current_span_id())
687}
688
689/// Return a link reference for an open span.
690pub fn span_link(span_id: u64) -> Option<SpanLink> {
691    if span_id == 0 || !is_tracing_enabled() {
692        return None;
693    }
694    COLLECTOR.with(|c| c.borrow().span_link(span_id))
695}
696
697/// Return a link reference for the current active span.
698pub fn current_span_link() -> Option<SpanLink> {
699    if !is_tracing_enabled() {
700        return None;
701    }
702    COLLECTOR.with(|c| c.borrow().current_span_link())
703}
704
705/// Take all completed spans.
706pub fn take_spans() -> Vec<Span> {
707    COLLECTOR.with(|c| c.borrow_mut().take_spans())
708}
709
710/// Peek at completed spans (cloned).
711pub fn peek_spans() -> Vec<Span> {
712    COLLECTOR.with(|c| c.borrow().spans().to_vec())
713}
714
715/// Reset the tracing collector.
716pub fn reset_tracing() {
717    COLLECTOR.with(|c| c.borrow_mut().reset());
718}
719
720/// Convert a span to a VmValue dict for user access.
721pub fn span_to_vm_value(span: &Span) -> VmValue {
722    let mut d: BTreeMap<String, VmValue> = BTreeMap::new();
723    d.insert(
724        "trace_id".into(),
725        VmValue::String(arcstr::ArcStr::from(span.trace_id.as_str())),
726    );
727    d.insert("span_id".into(), VmValue::Int(span.span_id as i64));
728    d.insert(
729        "parent_id".into(),
730        span.parent_id
731            .map(|id| VmValue::Int(id as i64))
732            .unwrap_or(VmValue::Nil),
733    );
734    d.insert(
735        "kind".into(),
736        VmValue::String(arcstr::ArcStr::from(span.kind.as_str())),
737    );
738    d.insert(
739        "name".into(),
740        VmValue::String(arcstr::ArcStr::from(span.name.as_str())),
741    );
742    d.insert("start_ms".into(), VmValue::Int(span.start_ms as i64));
743    d.insert(
744        "start_unix_ms".into(),
745        VmValue::Int(span.start_unix_ms as i64),
746    );
747    d.insert("duration_ms".into(), VmValue::Int(span.duration_ms as i64));
748
749    if !span.metadata.is_empty() {
750        let meta: crate::value::DictMap = span
751            .metadata
752            .iter()
753            .map(|(k, v)| {
754                (
755                    crate::value::intern_key(k),
756                    crate::stdlib::json_to_vm_value(v),
757                )
758            })
759            .collect();
760        d.insert("metadata".into(), VmValue::dict(meta));
761    }
762    if !span.links.is_empty() {
763        d.insert(
764            "links".into(),
765            crate::stdlib::json_to_vm_value(&serde_json::json!(span.links)),
766        );
767    }
768    if !span.events.is_empty() {
769        d.insert(
770            "events".into(),
771            crate::stdlib::json_to_vm_value(&serde_json::json!(span.events)),
772        );
773    }
774
775    VmValue::dict(d)
776}
777
778/// Generate a formatted summary of all spans.
779pub fn format_summary() -> String {
780    let spans = peek_spans();
781    if spans.is_empty() {
782        return "No spans recorded.".into();
783    }
784
785    let mut lines = Vec::new();
786    let total_ms: u64 = spans
787        .iter()
788        .filter(|s| s.parent_id.is_none())
789        .map(|s| s.duration_ms)
790        .sum();
791
792    lines.push(format!("Trace: {} spans, {total_ms}ms total", spans.len()));
793    lines.push(String::new());
794
795    fn print_tree(spans: &[Span], parent_id: Option<u64>, depth: usize, lines: &mut Vec<String>) {
796        let children: Vec<&Span> = spans.iter().filter(|s| s.parent_id == parent_id).collect();
797        for span in children {
798            let indent = "  ".repeat(depth);
799            let meta_str = if span.metadata.is_empty() {
800                String::new()
801            } else {
802                let parts: Vec<String> = span
803                    .metadata
804                    .iter()
805                    .map(|(k, v)| format!("{k}={v}"))
806                    .collect();
807                format!(" ({})", parts.join(", "))
808            };
809            lines.push(format!(
810                "{indent}{} {} {}ms{meta_str}",
811                span.kind.as_str(),
812                span.name,
813                span.duration_ms,
814            ));
815            print_tree(spans, Some(span.span_id), depth + 1, lines);
816        }
817    }
818
819    print_tree(&spans, None, 0, &mut lines);
820    lines.join("\n")
821}
822
823#[cfg(test)]
824mod tests {
825    use super::*;
826
827    #[test]
828    fn test_span_collector_basic() {
829        let mut c = SpanCollector::new();
830        let id = c.start(SpanKind::Pipeline, "main".into());
831        assert_eq!(id, 1);
832        assert_eq!(c.current_span_id(), Some(1));
833        assert!(c.span_link(id).is_some());
834        c.end(id);
835        assert_eq!(c.current_span_id(), None);
836        assert_eq!(c.spans().len(), 1);
837        assert_eq!(c.spans()[0].name, "main");
838        assert_eq!(c.spans()[0].parent_id, None);
839    }
840
841    #[test]
842    fn test_span_parent_child() {
843        let mut c = SpanCollector::new();
844        let parent = c.start(SpanKind::Pipeline, "main".into());
845        let child = c.start(SpanKind::FnCall, "helper".into());
846        c.end(child);
847        c.end(parent);
848        assert_eq!(c.spans().len(), 2);
849        assert_eq!(c.spans()[0].parent_id, Some(parent));
850        assert_eq!(c.spans()[1].parent_id, None);
851    }
852
853    #[test]
854    fn test_span_metadata() {
855        let mut c = SpanCollector::new();
856        let id = c.start(SpanKind::LlmCall, "gpt-4".into());
857        c.set_metadata(id, "tokens", serde_json::json!(100));
858        c.end(id);
859        assert_eq!(c.spans()[0].metadata["tokens"], serde_json::json!(100));
860    }
861
862    #[test]
863    fn test_completed_span_metadata_can_be_attached_late() {
864        let mut c = SpanCollector::new();
865        let id = c.start(SpanKind::LlmCall, "gpt-4".into());
866        c.end(id);
867
868        c.attach_metadata_if_absent(id, "first_token_ms", serde_json::json!(125));
869        c.attach_metadata_if_absent(id, "first_token_ms", serde_json::json!(250));
870
871        assert_eq!(
872            c.spans()[0].metadata["first_token_ms"],
873            serde_json::json!(125)
874        );
875    }
876
877    #[test]
878    fn test_span_links_are_preserved() {
879        let mut c = SpanCollector::new();
880        let parent = c.start(SpanKind::Suspension, "suspend worker".into());
881        let link = c.span_link(parent).expect("link for open span");
882        c.end(parent);
883
884        let child = c.start_with_links(SpanKind::Resume, "resume worker".into(), vec![link]);
885        c.end(child);
886
887        assert_eq!(c.spans().len(), 2);
888        assert_eq!(c.spans()[1].parent_id, None);
889        assert_eq!(c.spans()[1].links.len(), 1);
890        assert_eq!(c.spans()[1].links[0].span_id, parent.to_string());
891    }
892
893    #[test]
894    fn test_detached_span_links_do_not_inherit_active_parent() {
895        let mut c = SpanCollector::new();
896        let pipeline = c.start(SpanKind::Pipeline, "pipeline".into());
897        let link = c.span_link(pipeline).expect("pipeline link");
898        let drain = c.start_detached_with_links(SpanKind::Drain, "drain".into(), vec![link]);
899        c.end(drain);
900        c.end(pipeline);
901
902        let drain = c
903            .spans()
904            .iter()
905            .find(|span| span.kind == SpanKind::Drain)
906            .expect("drain span");
907        assert_eq!(drain.parent_id, None);
908        assert_eq!(drain.links.len(), 1);
909        assert_eq!(drain.links[0].span_id, pipeline.to_string());
910    }
911
912    #[test]
913    fn test_noop_when_disabled() {
914        set_tracing_enabled(false);
915        let id = span_start(SpanKind::Pipeline, "test".into());
916        assert_eq!(id, 0);
917        assert!(span_end(id).is_none());
918    }
919
920    #[test]
921    fn test_user_timing_records_when_tracing_disabled() {
922        // UserTiming is the substrate behind `std/timing`. Script
923        // callers depend on a real `duration_ms` even when global VM
924        // tracing is off, so the collector must always record this
925        // kind.
926        set_tracing_enabled(false);
927        reset_tracing();
928        let mut attrs = BTreeMap::new();
929        attrs.insert("phase".into(), serde_json::json!("warmup"));
930        let (id, trace_id, parent, start_unix_ms) =
931            span_start_user_timing("script.work".into(), attrs);
932        assert!(id != 0);
933        assert!(!trace_id.is_empty());
934        assert_eq!(parent, None);
935        assert!(start_unix_ms > 0);
936
937        assert!(span_record_event(id, "checkpoint".into(), BTreeMap::new()));
938
939        let closed = span_end(id).expect("user timing always records");
940        assert_eq!(closed.kind, SpanKind::UserTiming);
941        assert_eq!(closed.events.len(), 1);
942        assert_eq!(closed.events[0].name, "checkpoint");
943        assert_eq!(closed.metadata["phase"], serde_json::json!("warmup"));
944
945        // The recorded user_timing span survives in the collector
946        // snapshot so `trace_spans()` / `harn run --profile-json`
947        // surface it alongside the other VM-emitted spans.
948        let snapshot = peek_spans();
949        assert!(snapshot
950            .iter()
951            .any(|span| span.kind == SpanKind::UserTiming && span.name == "script.work"));
952    }
953
954    #[test]
955    fn test_new_span_kinds_stringify() {
956        assert_eq!(SpanKind::ModelRoute.as_str(), "model_route");
957        assert_eq!(SpanKind::ToolMount.as_str(), "tool_mount");
958        assert_eq!(SpanKind::DeferredToolLoad.as_str(), "deferred_tool_load");
959    }
960
961    #[test]
962    fn test_llm_call_usage_metadata_pairs_carry_cache_tokens() {
963        let usage = LlmCallUsage {
964            model: "claude-sonnet-4".into(),
965            provider: "anthropic".into(),
966            input_tokens: 100,
967            output_tokens: 20,
968            cache_read_tokens: 40,
969            cache_write_tokens: 8,
970            cost_usd: Some(0.0123),
971        };
972        let pairs: BTreeMap<&str, serde_json::Value> = usage.metadata_pairs().into_iter().collect();
973        assert_eq!(pairs[meta::MODEL], serde_json::json!("claude-sonnet-4"));
974        assert_eq!(pairs[meta::PROVIDER], serde_json::json!("anthropic"));
975        assert_eq!(pairs[meta::INPUT_TOKENS], serde_json::json!(100));
976        assert_eq!(pairs[meta::OUTPUT_TOKENS], serde_json::json!(20));
977        assert_eq!(pairs[meta::CACHE_READ_TOKENS], serde_json::json!(40));
978        assert_eq!(pairs[meta::CACHE_WRITE_TOKENS], serde_json::json!(8));
979        assert_eq!(pairs[meta::COST_USD], serde_json::json!(0.0123));
980    }
981
982    #[test]
983    fn test_llm_call_usage_omits_cost_when_unpriced() {
984        let usage = LlmCallUsage {
985            model: "local-model".into(),
986            provider: "local".into(),
987            input_tokens: 5,
988            output_tokens: 1,
989            cost_usd: None,
990            ..LlmCallUsage::default()
991        };
992        let pairs: BTreeMap<&str, serde_json::Value> = usage.metadata_pairs().into_iter().collect();
993        assert!(!pairs.contains_key(meta::COST_USD));
994        // Token attribution is still present even when unpriced.
995        assert_eq!(pairs[meta::INPUT_TOKENS], serde_json::json!(5));
996    }
997
998    #[test]
999    fn test_marker_spans_nest_under_active_span_and_carry_metadata() {
1000        set_tracing_enabled(true);
1001        reset_tracing();
1002        let parent = span_start(SpanKind::Pipeline, "agent_loop".into());
1003        emit_model_route("cheap-model", "smart-model", "no_progress");
1004        emit_tool_mount(
1005            &["read".to_string(), "write".to_string()],
1006            "mcp",
1007            Some("filesystem"),
1008        );
1009        emit_deferred_tool_load("grep", "search files", Some(4.5));
1010        span_end(parent);
1011
1012        let spans = peek_spans();
1013        let route = spans
1014            .iter()
1015            .find(|s| s.kind == SpanKind::ModelRoute)
1016            .expect("model_route span");
1017        assert_eq!(route.parent_id, Some(parent));
1018        assert_eq!(
1019            route.metadata[meta::TO_MODEL],
1020            serde_json::json!("smart-model")
1021        );
1022        assert_eq!(
1023            route.metadata[meta::FROM_MODEL],
1024            serde_json::json!("cheap-model")
1025        );
1026
1027        let mount = spans
1028            .iter()
1029            .find(|s| s.kind == SpanKind::ToolMount)
1030            .expect("tool_mount span");
1031        assert_eq!(mount.metadata[meta::TOOL_COUNT], serde_json::json!(2));
1032        assert_eq!(mount.metadata[meta::SOURCE], serde_json::json!("mcp"));
1033        assert_eq!(
1034            mount.metadata[meta::DETAIL],
1035            serde_json::json!("filesystem")
1036        );
1037
1038        let deferred = spans
1039            .iter()
1040            .find(|s| s.kind == SpanKind::DeferredToolLoad)
1041            .expect("deferred_tool_load span");
1042        assert_eq!(
1043            deferred.metadata[meta::TOOL_NAME],
1044            serde_json::json!("grep")
1045        );
1046        assert_eq!(deferred.metadata[meta::SCORE], serde_json::json!(4.5));
1047        set_tracing_enabled(false);
1048    }
1049
1050    #[test]
1051    fn test_empty_tool_mount_is_a_noop() {
1052        set_tracing_enabled(true);
1053        reset_tracing();
1054        let parent = span_start(SpanKind::Pipeline, "loop".into());
1055        emit_tool_mount(&[], "mcp", Some("empty-server"));
1056        span_end(parent);
1057        let spans = peek_spans();
1058        assert!(!spans.iter().any(|s| s.kind == SpanKind::ToolMount));
1059        set_tracing_enabled(false);
1060    }
1061
1062    #[test]
1063    fn test_span_event_offset_is_monotonic() {
1064        // Use a mock clock so the test is deterministic under any load and
1065        // requires zero wall-clock time. The mock is advanced by 10 ms
1066        // between the two events, guaranteeing a strictly-increasing offset
1067        // rather than relying on the OS scheduler to deliver >= 1 ms of
1068        // real elapsed time between the two `record_event` calls.
1069        let clock = crate::clock_mock::MockClock::at_wall_ms(1_000_000_000_000);
1070        let _guard = crate::clock_mock::install_override(clock.clone());
1071        let mut c = SpanCollector::new();
1072        let id = c.start(SpanKind::UserTiming, "outer".into());
1073        assert!(c.record_event(id, "before".into(), BTreeMap::new()));
1074        clock.advance_std_sync(std::time::Duration::from_millis(10));
1075        assert!(c.record_event(id, "after".into(), BTreeMap::new()));
1076        let closed = c.end(id).expect("open span");
1077        assert_eq!(closed.events.len(), 2);
1078        assert!(
1079            closed.events[1].offset_ms > closed.events[0].offset_ms,
1080            "second event should have a strictly greater offset after a 10ms advance; \
1081             before={} after={}",
1082            closed.events[0].offset_ms,
1083            closed.events[1].offset_ms
1084        );
1085    }
1086}