Skip to main content

lash_trace/
lib.rs

1//! Durable diagnostics for the lash runtime: the [`TraceSink`] channel and its
2//! record vocabulary.
3//!
4//! A [`TraceSink`] receives one [`TraceRecord`] per runtime event — session and
5//! turn lifecycle, prompt builds, LLM calls, per-tool start/completion, token
6//! usage, protocol steps, and Lashlang execution-graph updates. Each record
7//! carries a [`TraceContext`] (session / turn / graph-node identity) plus a
8//! tagged [`TraceEvent`] payload; [`TraceEvent::kind`] is the single source of
9//! truth for the `type` tag string that consumers match on.
10//!
11//! [`JsonlTraceSink`] writes one JSON line per record at schema
12//! [`TRACE_SCHEMA_VERSION`]; [`TeeTraceSink`] fans out to several sinks; and the
13//! optional `otel` feature adds an `OtelTraceSink` that converts each record to
14//! an OpenTelemetry span. This is the *durable diagnostics* reporting channel —
15//! distinct from the app-facing `TurnActivity` stream and the low-level
16//! `SessionEvent` stream that the runtime crates expose.
17//!
18//! For the full map of reporting channels, guidance on when to consume which,
19//! and the schema-evolution policy that governs [`TRACE_SCHEMA_VERSION`], see
20//! `docs/reporting.html`; for the attach-a-sink how-to, see `docs/tracing.html`.
21
22use std::collections::BTreeMap;
23use std::fs::OpenOptions;
24use std::io::{self, Write};
25use std::path::{Path, PathBuf};
26use std::sync::{Arc, Mutex};
27
28use serde::{Deserialize, Serialize};
29use serde_json::Value;
30use sha2::{Digest, Sha256};
31
32mod lashlang_graph;
33#[cfg(feature = "otel")]
34pub mod otel;
35
36pub use lashlang_graph::{
37    TraceLashlangEdgeSelection, TraceLashlangGraph, TraceLashlangGraphChildLink,
38    TraceLashlangGraphEdge, TraceLashlangGraphNode, TraceLashlangGraphStore,
39    TraceLashlangNodeStatus,
40};
41
42/// Version of the durable trace JSONL schema, written to
43/// [`TraceRecord::schema_version`] on every record.
44///
45/// Bump rules (the normative reporting-schema policy lives in
46/// `docs/reporting.html`):
47///
48/// - Adding a new [`TraceEvent`] variant, or adding an optional
49///   (`skip_serializing_if`) field to an existing payload, is **additive**:
50///   older readers skip the unknown variant or field, so this version does
51///   **not** change.
52/// - Renaming a field, removing a field, or changing the meaning of an existing
53///   field is a breaking change and **does** bump this version.
54/// - The free-form [`TraceEvent::Custom`] and [`TraceEvent::ProtocolStep`]
55///   payloads are opaque `serde_json::Value`; adding to or reshaping the data
56///   inside them never forces a bump. (This is why the `exec_code_completed`
57///   diagnostic's `tool_calls` payload was purely additive.)
///
/// [`TraceRecord::new_with_timestamp`] is the constructor that stamps this
/// value onto each record it builds.
58pub const TRACE_SCHEMA_VERSION: u32 = 2;
59
/// Trace verbosity selector, serialized in `snake_case` (`"standard"` /
/// `"extended"`). `Standard` is the `Default`.
60#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
61#[serde(rename_all = "snake_case")]
62pub enum TraceLevel {
63    #[default]
64    Standard,
65    Extended,
66}
67
68impl TraceLevel {
69    pub fn is_extended(self) -> bool {
70        matches!(self, Self::Extended)
71    }
72}
73
/// Identity and correlation fields carried by every [`TraceRecord`].
///
/// Every field is optional: `None` values (and an empty `metadata` map) are
/// omitted when serializing and fall back to their defaults when the key is
/// missing on deserialization.
74#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
75pub struct TraceContext {
76    #[serde(default, skip_serializing_if = "Option::is_none")]
77    pub run_id: Option<String>,
78    #[serde(default, skip_serializing_if = "Option::is_none")]
79    pub experiment_id: Option<String>,
80    #[serde(default, skip_serializing_if = "Option::is_none")]
81    pub candidate_id: Option<String>,
82    #[serde(default, skip_serializing_if = "Option::is_none")]
83    pub candidate_parent_id: Option<String>,
84    #[serde(default, skip_serializing_if = "Option::is_none")]
85    pub example_id: Option<String>,
86    #[serde(default, skip_serializing_if = "Option::is_none")]
87    pub split: Option<String>,
88    #[serde(default, skip_serializing_if = "Option::is_none")]
89    pub session_id: Option<String>,
90    #[serde(default, skip_serializing_if = "Option::is_none")]
91    pub turn_id: Option<String>,
92    /// Stable id of the span this record represents (e.g. `turn:<session>:<turn>`,
93    /// `llm:<call_id>`, `tool:<call_id>`). Populated by the runtime for turn /
94    /// llm / tool / session records so a consumer can build a nested span tree
95    /// from `(graph_node_id, parent_graph_node_id)` with a single `id -> span`
96    /// map. Lashlang execution sets its own graph node id here.
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    pub graph_node_id: Option<String>,
99    /// Id of the enclosing span — the value of some other record's
100    /// `graph_node_id`. A turn's parent is its causal origin (the spawning tool
101    /// call / effect, via `caused_by`) when known, otherwise the session root.
102    #[serde(default, skip_serializing_if = "Option::is_none")]
103    pub parent_graph_node_id: Option<String>,
104    #[serde(default, skip_serializing_if = "Option::is_none")]
105    pub turn_index: Option<usize>,
106    #[serde(default, skip_serializing_if = "Option::is_none")]
107    pub protocol_iteration: Option<usize>,
108    #[serde(default, skip_serializing_if = "Option::is_none")]
109    pub effect_id: Option<String>,
110    #[serde(default, skip_serializing_if = "Option::is_none")]
111    pub llm_call_id: Option<String>,
    /// Free-form key/value annotations; the `BTreeMap` keeps keys sorted, so
    /// the serialized key order is deterministic.
112    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
113    pub metadata: BTreeMap<String, Value>,
114}
115
116impl TraceContext {
117    pub fn for_session(mut self, session_id: impl Into<String>) -> Self {
118        self.session_id = Some(session_id.into());
119        self
120    }
121
122    pub fn for_turn_index(mut self, turn_index: usize) -> Self {
123        self.turn_index = Some(turn_index);
124        self
125    }
126
127    pub fn for_turn(mut self, turn_id: impl Into<String>) -> Self {
128        self.turn_id = Some(turn_id.into());
129        self
130    }
131
132    pub fn for_protocol_iteration(mut self, protocol_iteration: usize) -> Self {
133        self.protocol_iteration = Some(protocol_iteration);
134        self
135    }
136
137    pub fn for_llm_call(mut self, llm_call_id: impl Into<String>) -> Self {
138        self.llm_call_id = Some(llm_call_id.into());
139        self
140    }
141}
142
/// One durable trace entry: schema version, record id, timestamp, context,
/// and the event payload.
///
/// `event` is flattened, so the event's `type` tag and its payload fields sit
/// at the top level of the serialized object, next to `context`.
143#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
144pub struct TraceRecord {
145    pub schema_version: u32,
146    pub id: String,
147    pub timestamp: String,
148    pub context: TraceContext,
149    #[serde(flatten)]
150    pub event: TraceEvent,
151}
152
153impl TraceRecord {
154    pub fn new(context: TraceContext, event: TraceEvent) -> Self {
155        Self::new_with_timestamp(context, event, chrono::Utc::now())
156    }
157
158    pub fn new_with_timestamp(
159        context: TraceContext,
160        event: TraceEvent,
161        timestamp: chrono::DateTime<chrono::Utc>,
162    ) -> Self {
163        Self {
164            schema_version: TRACE_SCHEMA_VERSION,
165            id: uuid::Uuid::new_v4().to_string(),
166            timestamp: timestamp.to_rfc3339(),
167            context,
168            event,
169        }
170    }
171}
172
/// Payload of a [`TraceRecord`], internally tagged with a `snake_case` `type`
/// field (the same strings [`TraceEvent::kind`] returns).
///
/// Optional and collection fields marked `skip_serializing_if` are left out of
/// the serialized object when `None` / empty.
173#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
174#[serde(tag = "type", rename_all = "snake_case")]
175#[allow(
176    clippy::large_enum_variant,
177    reason = "TraceEvent is a public DTO; keeping event payloads inline preserves ergonomic pattern matching"
178)]
179pub enum TraceEvent {
180    SessionStarted {
181        #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
182        metadata: BTreeMap<String, Value>,
183    },
184    TurnStarted {
185        #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
186        metadata: BTreeMap<String, Value>,
187    },
188    PromptBuilt {
189        prompt_hash: String,
190        prompt_chars: usize,
191        #[serde(default, skip_serializing_if = "Vec::is_empty")]
192        components: Vec<TracePromptComponent>,
193    },
194    LlmCallStarted {
195        request: TraceLlmRequest,
196    },
197    LlmCallCompleted {
198        response: TraceLlmResponse,
199        #[serde(default, skip_serializing_if = "Option::is_none")]
200        usage: Option<TraceTokenUsage>,
201        #[serde(default, skip_serializing_if = "Option::is_none")]
202        provider_usage: Option<Value>,
203        #[serde(default, skip_serializing_if = "Option::is_none")]
204        stream_summary: Option<Value>,
205    },
206    LlmCallFailed {
207        error: TraceError,
208        #[serde(default, skip_serializing_if = "Option::is_none")]
209        stream_summary: Option<Value>,
210    },
211    ProviderStreamEvent {
212        event: TraceProviderStreamEvent,
213    },
214    RuntimeStreamEvent {
215        event: TraceRuntimeStreamEvent,
216    },
    // NOTE: `call_id` has no `skip_serializing_if`, so a `None` is written as
    // an explicit `null` in both tool-call variants.
217    ToolCallStarted {
218        call_id: Option<String>,
219        name: String,
220        args: Value,
221    },
222    ToolCallCompleted {
223        call_id: Option<String>,
224        name: String,
225        args: Value,
226        output: TraceToolCallOutput,
227        duration_ms: u64,
228    },
229    ProtocolStep {
230        plugin_id: String,
231        payload: Value,
232    },
233    TokenUsage {
234        usage: TraceTokenUsage,
235        #[serde(default, skip_serializing_if = "Option::is_none")]
236        cumulative: Option<TraceTokenUsage>,
237    },
238    LashlangExecution {
239        event: TraceLashlangExecutionEvent,
240    },
241    TurnCompleted {
242        status: String,
243        done_reason: String,
244        #[serde(default, skip_serializing_if = "Option::is_none")]
245        agent_frame_switch: Option<TraceAgentFrameSwitch>,
246    },
247    Custom {
248        name: String,
249        payload: Value,
250    },
251}
252
253impl TraceEvent {
254    /// The `type` tag serde writes for this variant. This is the single source
255    /// of truth for event-kind strings — consumers (e.g. the trace viewer)
256    /// match on the enum and read the kind from here rather than re-deriving
257    /// tag strings by hand. The match is exhaustive on purpose: a new variant
258    /// fails to compile here until it is given a kind.
259    pub fn kind(&self) -> &'static str {
260        match self {
261            Self::SessionStarted { .. } => "session_started",
262            Self::TurnStarted { .. } => "turn_started",
263            Self::PromptBuilt { .. } => "prompt_built",
264            Self::LlmCallStarted { .. } => "llm_call_started",
265            Self::LlmCallCompleted { .. } => "llm_call_completed",
266            Self::LlmCallFailed { .. } => "llm_call_failed",
267            Self::ProviderStreamEvent { .. } => "provider_stream_event",
268            Self::RuntimeStreamEvent { .. } => "runtime_stream_event",
269            Self::ToolCallStarted { .. } => "tool_call_started",
270            Self::ToolCallCompleted { .. } => "tool_call_completed",
271            Self::ProtocolStep { .. } => "protocol_step",
272            Self::TokenUsage { .. } => "token_usage",
273            Self::LashlangExecution { .. } => "lashlang_execution",
274            Self::TurnCompleted { .. } => "turn_completed",
275            Self::Custom { .. } => "custom",
276        }
277    }
278}
279
/// Result of a completed tool call: the tagged outcome plus an optional
/// `control` value that is omitted from the serialized form when `None`.
280#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
281pub struct TraceToolCallOutput {
282    pub outcome: TraceToolCallOutcome,
283    #[serde(default, skip_serializing_if = "Option::is_none")]
284    pub control: Option<Value>,
285}
286
287impl TraceToolCallOutput {
288    pub fn status(&self) -> TraceToolCallStatus {
289        match self.outcome {
290            TraceToolCallOutcome::Success(_) => TraceToolCallStatus::Success,
291            TraceToolCallOutcome::Failure(_) => TraceToolCallStatus::Failure,
292            TraceToolCallOutcome::Cancelled(_) => TraceToolCallStatus::Cancelled,
293        }
294    }
295
296    pub fn is_success(&self) -> bool {
297        self.status() == TraceToolCallStatus::Success
298    }
299
300    pub fn value_for_projection(&self) -> Value {
301        match &self.outcome {
302            TraceToolCallOutcome::Success(value)
303            | TraceToolCallOutcome::Failure(value)
304            | TraceToolCallOutcome::Cancelled(value) => value.clone(),
305        }
306    }
307}
308
/// Outcome of a tool call together with its payload.
///
/// Adjacently tagged: serialized as `{"status": "<variant>", "payload": ...}`
/// with the variant name in `snake_case`.
309#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
310#[serde(tag = "status", content = "payload", rename_all = "snake_case")]
311pub enum TraceToolCallOutcome {
312    Success(Value),
313    Failure(Value),
314    Cancelled(Value),
315}
316
/// Payload-free counterpart of [`TraceToolCallOutcome`], as returned by
/// [`TraceToolCallOutput::status`]. Serialized in `snake_case`.
317#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
318#[serde(rename_all = "snake_case")]
319pub enum TraceToolCallStatus {
320    Success,
321    Failure,
322    Cancelled,
323}
324
/// One entry of the `components` list on [`TraceEvent::PromptBuilt`].
/// `chars` is omitted from the serialized form when `None`.
325#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
326pub struct TracePromptComponent {
327    pub id: String,
328    pub kind: String,
329    pub hash: String,
330    #[serde(default, skip_serializing_if = "Option::is_none")]
331    pub chars: Option<usize>,
332}
333
/// Request payload recorded by [`TraceEvent::LlmCallStarted`].
///
/// `model_variant` and `output_spec` are omitted when `None`; `attachments`
/// and `tools` are omitted when empty. `messages` is always written.
334#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
335pub struct TraceLlmRequest {
336    pub model: String,
337    #[serde(default, skip_serializing_if = "Option::is_none")]
338    pub model_variant: Option<String>,
339    pub messages: Vec<TraceLlmMessage>,
340    #[serde(default, skip_serializing_if = "Vec::is_empty")]
341    pub attachments: Vec<TraceAttachment>,
342    #[serde(default, skip_serializing_if = "Vec::is_empty")]
343    pub tools: Vec<TraceToolSpec>,
344    pub tool_choice: String,
345    #[serde(default, skip_serializing_if = "Option::is_none")]
346    pub output_spec: Option<Value>,
347    pub stream: bool,
348}
349
/// One message of [`TraceLlmRequest::messages`]: a role string plus its
/// ordered content blocks.
350#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
351pub struct TraceLlmMessage {
352    pub role: String,
353    pub blocks: Vec<TraceContentBlock>,
354}
355
/// Content block inside a [`TraceLlmMessage`], internally tagged with a
/// `snake_case` `kind` field.
356#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
357#[serde(tag = "kind", rename_all = "snake_case")]
358pub enum TraceContentBlock {
359    Text {
360        text: String,
        // Only written when `true`; a missing key deserializes as `false`.
361        #[serde(default, skip_serializing_if = "is_false")]
362        cache_breakpoint: bool,
363    },
364    Image {
        // NOTE(review): presumably an index into `TraceLlmRequest::attachments` — confirm.
365        attachment_idx: usize,
366    },
367    ToolCall {
368        call_id: Option<String>,
369        tool_name: String,
370        input_json: Value,
371        item_id: Option<String>,
372        has_signature: bool,
373    },
374    ToolResult {
375        call_id: Option<String>,
376        tool_name: Option<String>,
377        content: String,
378    },
379    Reasoning {
380        text: String,
381        item_id: Option<String>,
382        summary: Vec<String>,
383        has_encrypted: bool,
384        redacted: bool,
385    },
386}
387
/// Serde `skip_serializing_if` predicate: `true` when the flag is `false`,
/// i.e. when the field should be left out of the output.
fn is_false(value: &bool) -> bool {
    matches!(*value, false)
}
391
/// One entry of [`TraceLlmRequest::attachments`]. Only `mime` is required;
/// `filename`, `bytes_sha256`, and `bytes_len` are omitted when `None`.
392#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
393pub struct TraceAttachment {
394    pub mime: String,
395    #[serde(default, skip_serializing_if = "Option::is_none")]
396    pub filename: Option<String>,
397    #[serde(default, skip_serializing_if = "Option::is_none")]
398    pub bytes_sha256: Option<String>,
399    #[serde(default, skip_serializing_if = "Option::is_none")]
400    pub bytes_len: Option<usize>,
401}
402
/// One entry of [`TraceLlmRequest::tools`]: a tool's name, description, and
/// its input / output schemas as opaque JSON values.
403#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
404pub struct TraceToolSpec {
405    pub name: String,
406    pub description: String,
407    pub input_schema: Value,
408    pub output_schema: Value,
409}
410
/// Response payload recorded by [`TraceEvent::LlmCallCompleted`].
/// `terminal_reason` and `parts` are omitted when `None`.
411#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
412pub struct TraceLlmResponse {
413    pub text: String,
414    pub duration_ms: u64,
415    #[serde(default, skip_serializing_if = "Option::is_none")]
416    pub terminal_reason: Option<String>,
417    #[serde(default, skip_serializing_if = "Option::is_none")]
418    pub parts: Option<Value>,
419}
420
/// Payload of [`TraceEvent::ProviderStreamEvent`].
///
/// `item_id`, `output_index`, and `raw_json` are omitted when `None`;
/// `raw_len` and `raw_sha256` are always written.
421#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
422pub struct TraceProviderStreamEvent {
423    pub provider: String,
424    pub sequence: u64,
425    pub elapsed_ms: u64,
426    pub event_name: String,
427    #[serde(default, skip_serializing_if = "Option::is_none")]
428    pub item_id: Option<String>,
429    #[serde(default, skip_serializing_if = "Option::is_none")]
430    pub output_index: Option<i64>,
431    pub raw_len: usize,
432    pub raw_sha256: String,
433    #[serde(default, skip_serializing_if = "Option::is_none")]
434    pub raw_json: Option<Value>,
435}
436
/// Payload of [`TraceEvent::RuntimeStreamEvent`].
///
/// `sequence`, `elapsed_ms`, and `event_name` are always written; every other
/// field is optional and omitted when `None`.
437#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
438pub struct TraceRuntimeStreamEvent {
439    pub sequence: u64,
440    pub elapsed_ms: u64,
441    pub event_name: String,
442    #[serde(default, skip_serializing_if = "Option::is_none")]
443    pub raw_text: Option<String>,
444    #[serde(default, skip_serializing_if = "Option::is_none")]
445    pub visible_text: Option<String>,
446    #[serde(default, skip_serializing_if = "Option::is_none")]
447    pub item_id: Option<String>,
448    #[serde(default, skip_serializing_if = "Option::is_none")]
449    pub output_index: Option<i64>,
450    #[serde(default, skip_serializing_if = "Option::is_none")]
451    pub call_id: Option<String>,
452    #[serde(default, skip_serializing_if = "Option::is_none")]
453    pub tool_name: Option<String>,
454    #[serde(default, skip_serializing_if = "Option::is_none")]
455    pub input_json: Option<Value>,
456    #[serde(default, skip_serializing_if = "Option::is_none")]
457    pub usage: Option<TraceTokenUsage>,
458}
459
/// Token counters attached to LLM-call, stream, and token-usage events.
/// All five counters are always serialized; `Default` yields all zeros.
460#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
461pub struct TraceTokenUsage {
462    pub input_tokens: i64,
463    pub output_tokens: i64,
464    pub cache_read_input_tokens: i64,
465    pub cache_write_input_tokens: i64,
466    pub reasoning_output_tokens: i64,
467}
468
/// Optional `agent_frame_switch` payload of [`TraceEvent::TurnCompleted`],
/// carrying a single frame id.
469#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
470pub struct TraceAgentFrameSwitch {
471    pub frame_id: String,
472}
473
/// Session / turn scope used when deriving Lashlang graph keys.
///
/// `session_id` is required; `turn_id`, `turn_index`, and
/// `protocol_iteration` are omitted from the serialized form when `None`.
474#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
475pub struct TraceRuntimeScope {
476    pub session_id: String,
477    #[serde(default, skip_serializing_if = "Option::is_none")]
478    pub turn_id: Option<String>,
479    #[serde(default, skip_serializing_if = "Option::is_none")]
480    pub turn_index: Option<usize>,
481    #[serde(default, skip_serializing_if = "Option::is_none")]
482    pub protocol_iteration: Option<usize>,
483}
484
485impl TraceRuntimeScope {
486    pub fn new(session_id: impl Into<String>) -> Self {
487        Self {
488            session_id: session_id.into(),
489            turn_id: None,
490            turn_index: None,
491            protocol_iteration: None,
492        }
493    }
494}
495
/// What a Lashlang execution is running as: an effect or a process.
/// Internally tagged with a `snake_case` `type` field.
496#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
497#[serde(tag = "type", rename_all = "snake_case")]
498pub enum TraceRuntimeSubject {
499    Effect { effect_id: String, kind: String },
500    Process { process_id: String },
501}
502
503impl TraceRuntimeSubject {
504    pub fn graph_key(&self, scope: &TraceRuntimeScope) -> String {
505        match self {
506            Self::Effect { effect_id, .. } => match scope.turn_id.as_deref() {
507                Some(turn_id) if !turn_id.is_empty() => {
508                    format!("effect:{}:{turn_id}:{effect_id}", scope.session_id)
509                }
510                _ => format!("effect:{}:{effect_id}", scope.session_id),
511            },
512            Self::Process { process_id } => format!("process:{process_id}"),
513        }
514    }
515}
516
/// Identifies one Lashlang execution: its scope and subject plus the module
/// and entry it runs. `entry_ref` is omitted from the serialized form when
/// `None`.
517#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
518pub struct TraceLashlangExecutionIdentity {
519    pub scope: TraceRuntimeScope,
520    pub subject: TraceRuntimeSubject,
521    pub module_ref: String,
522    pub entry_kind: String,
523    #[serde(default, skip_serializing_if = "Option::is_none")]
524    pub entry_ref: Option<String>,
525    pub entry_name: String,
526}
527
528impl TraceLashlangExecutionIdentity {
529    pub fn graph_key(&self) -> String {
530        self.subject.graph_key(&self.scope)
531    }
532}
533
/// Payload of [`TraceEvent::LashlangExecution`], internally tagged with a
/// `snake_case` `kind` field (e.g. `"node_started"`).
///
/// Every variant carries an `event_key` and the [`TraceLashlangExecutionIdentity`]
/// of the execution it belongs to.
534#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
535#[serde(tag = "kind", rename_all = "snake_case")]
536pub enum TraceLashlangExecutionEvent {
537    ExecutionStarted {
538        event_key: String,
539        identity: TraceLashlangExecutionIdentity,
540        execution_map: TraceLashlangMap,
541    },
542    ExecutionFinished {
543        event_key: String,
544        identity: TraceLashlangExecutionIdentity,
545        status: TraceLashlangStatus,
        // Omitted from the serialized form when `None`.
546        #[serde(default, skip_serializing_if = "Option::is_none")]
547        error: Option<String>,
548    },
549    NodeStarted {
550        event_key: String,
551        identity: TraceLashlangExecutionIdentity,
552        node_id: String,
553        node_kind: String,
554        label: String,
555        occurrence: u64,
556    },
557    NodeCompleted {
558        event_key: String,
559        identity: TraceLashlangExecutionIdentity,
560        node_id: String,
561        node_kind: String,
562        label: String,
563        occurrence: u64,
564    },
565    NodeFailed {
566        event_key: String,
567        identity: TraceLashlangExecutionIdentity,
568        node_id: String,
569        node_kind: String,
570        label: String,
571        occurrence: u64,
572        error: String,
573    },
574    BranchSelected {
575        event_key: String,
576        identity: TraceLashlangExecutionIdentity,
577        node_id: String,
578        occurrence: u64,
579        edge_id: String,
580        selected: TraceBranchSelection,
581    },
582    ChildStarted {
583        event_key: String,
584        identity: TraceLashlangExecutionIdentity,
585        parent_node_id: String,
586        occurrence: u64,
587        child: TraceLashlangChildExecution,
588    },
589}
590
/// The `child` payload of [`TraceLashlangExecutionEvent::ChildStarted`].
///
/// `scope` and `subject` are required; `module_ref`, `entry_ref`, and
/// `entry_name` are omitted from the serialized form when `None`.
591#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
592pub struct TraceLashlangChildExecution {
593    pub scope: TraceRuntimeScope,
594    pub subject: TraceRuntimeSubject,
595    #[serde(default, skip_serializing_if = "Option::is_none")]
596    pub module_ref: Option<String>,
597    #[serde(default, skip_serializing_if = "Option::is_none")]
598    pub entry_ref: Option<String>,
599    #[serde(default, skip_serializing_if = "Option::is_none")]
600    pub entry_name: Option<String>,
601}
602
603impl TraceLashlangChildExecution {
604    pub fn graph_key(&self) -> String {
605        self.subject.graph_key(&self.scope)
606    }
607}
608
/// Execution status carried by
/// [`TraceLashlangExecutionEvent::ExecutionFinished`]; serialized in
/// `snake_case`.
609#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
610#[serde(rename_all = "snake_case")]
611pub enum TraceLashlangStatus {
612    Running,
613    Completed,
614    Failed,
615    Cancelled,
616}
617
/// Which arm was taken, as recorded in
/// [`TraceLashlangExecutionEvent::BranchSelected`]; serialized as `"then"` /
/// `"else"`.
618#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
619#[serde(rename_all = "snake_case")]
620pub enum TraceBranchSelection {
621    Then,
622    Else,
623}
624
/// The `execution_map` payload of
/// [`TraceLashlangExecutionEvent::ExecutionStarted`]: entry metadata plus the
/// node and edge lists.
///
/// `nodes` and `edges` are always serialized (even when empty) and default to
/// empty when the key is missing on deserialization; `entry_ref` is omitted
/// when `None`.
625#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
626pub struct TraceLashlangMap {
627    pub module_ref: String,
628    pub entry_kind: String,
629    #[serde(default, skip_serializing_if = "Option::is_none")]
630    pub entry_ref: Option<String>,
631    pub entry_name: String,
632    #[serde(default)]
633    pub nodes: Vec<TraceLashlangMapNode>,
634    #[serde(default)]
635    pub edges: Vec<TraceLashlangMapEdge>,
636}
637
/// One node of a [`TraceLashlangMap`]. `label_metadata` is omitted from the
/// serialized form when `None`.
638#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
639pub struct TraceLashlangMapNode {
640    pub id: String,
641    pub kind: String,
642    pub label: String,
643    #[serde(default, skip_serializing_if = "Option::is_none")]
644    pub label_metadata: Option<TraceLabelMetadata>,
645}
646
/// Optional label details for a [`TraceLashlangMapNode`]: a title plus an
/// optional description (omitted when `None`).
647#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
648pub struct TraceLabelMetadata {
649    pub title: String,
650    #[serde(default, skip_serializing_if = "Option::is_none")]
651    pub description: Option<String>,
652}
653
/// One edge of a [`TraceLashlangMap`].
// NOTE(review): `from` / `to` presumably hold `TraceLashlangMapNode::id` values — confirm.
654#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
655pub struct TraceLashlangMapEdge {
656    pub id: String,
657    pub from: String,
658    pub to: String,
659    pub label: String,
660}
661
/// Error payload recorded by [`TraceEvent::LlmCallFailed`].
///
/// `message` and `retryable` are always written; `terminal_reason`, `code`,
/// and `raw` are omitted when `None`.
662#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
663pub struct TraceError {
664    pub message: String,
665    pub retryable: bool,
666    #[serde(default, skip_serializing_if = "Option::is_none")]
667    pub terminal_reason: Option<String>,
668    #[serde(default, skip_serializing_if = "Option::is_none")]
669    pub code: Option<String>,
670    #[serde(default, skip_serializing_if = "Option::is_none")]
671    pub raw: Option<String>,
672}
673
/// Errors a [`TraceSink`] can return from `append` or `flush`.
///
/// The filesystem variants carry the offending path alongside the underlying
/// `io::Error`; `Serialize` converts from `serde_json::Error` via `?`.
674#[derive(Debug, thiserror::Error)]
675pub enum TraceSinkError {
676    #[error("failed to serialize trace record: {0}")]
677    Serialize(#[from] serde_json::Error),
678    #[error("trace sink lock poisoned")]
679    LockPoisoned,
680    #[error("failed to create trace directory {path}: {source}")]
681    CreateDir { path: PathBuf, source: io::Error },
682    #[error("failed to open trace file {path}: {source}")]
683    Open { path: PathBuf, source: io::Error },
684    #[error("failed to write trace file {path}: {source}")]
685    Write { path: PathBuf, source: io::Error },
686}
687
/// Destination for [`TraceRecord`]s.
///
/// Implementations must be `Send + Sync` and both methods take `&self`, so a
/// sink can be shared (e.g. behind an `Arc`) and is responsible for its own
/// interior synchronization.
688pub trait TraceSink: Send + Sync {
    /// Hands one record to the sink, returning an error if the sink could not
    /// accept it.
689    fn append(&self, record: &TraceRecord) -> Result<(), TraceSinkError>;
690
691    /// Force any buffered trace data this sink owns to durable storage.
692    ///
693    /// Hosts call this before process exit so records that a sink has not yet
694    /// committed are not lost. The default is a no-op: sinks that write each
695    /// record through on [`append`](Self::append) (or that delegate durability
696    /// to a host-owned exporter) have nothing of their own to flush. Sinks that
697    /// buffer — or that can force an `fsync` — override this.
698    fn flush(&self) -> Result<(), TraceSinkError> {
699        Ok(())
700    }
701}
702
/// [`TraceSink`] that appends one JSON line per record to the file at `path`.
703pub struct JsonlTraceSink {
704    path: PathBuf,
    // Guards no data; it is held for the duration of each append / flush so
    // calls on this instance do not overlap.
705    lock: Mutex<()>,
706}
707
708impl JsonlTraceSink {
709    pub fn new(path: impl Into<PathBuf>) -> Self {
710        Self {
711            path: path.into(),
712            lock: Mutex::new(()),
713        }
714    }
715
716    pub fn path(&self) -> &Path {
717        &self.path
718    }
719}
720
721impl TraceSink for JsonlTraceSink {
722    fn append(&self, record: &TraceRecord) -> Result<(), TraceSinkError> {
723        let line = serde_json::to_string(record)?;
724        let _guard = self.lock.lock().map_err(|_| TraceSinkError::LockPoisoned)?;
725        if let Some(parent) = self.path.parent()
726            && !parent.as_os_str().is_empty()
727        {
728            std::fs::create_dir_all(parent).map_err(|source| TraceSinkError::CreateDir {
729                path: parent.to_path_buf(),
730                source,
731            })?;
732        }
733        let mut file = OpenOptions::new()
734            .create(true)
735            .append(true)
736            .open(&self.path)
737            .map_err(|source| TraceSinkError::Open {
738                path: self.path.clone(),
739                source,
740            })?;
741        writeln!(file, "{line}").map_err(|source| TraceSinkError::Write {
742            path: self.path.clone(),
743            source,
744        })
745    }
746
747    /// `fsync` the trace file to durable storage.
748    ///
749    /// Each [`append`](Self::append) opens, appends, and closes the file, so a
750    /// record's bytes already reach the OS as it is written — this sink holds no
751    /// in-process buffer. `flush` goes one step further and forces an `fsync` so
752    /// the OS page cache is pushed to disk, which is the honest guarantee a host
753    /// wants before exit. If no record has been written yet the file may not
754    /// exist; that is a no-op rather than an error (nothing to sync), and we do
755    /// not create an empty file just to sync it.
756    fn flush(&self) -> Result<(), TraceSinkError> {
757        let _guard = self.lock.lock().map_err(|_| TraceSinkError::LockPoisoned)?;
758        match OpenOptions::new().append(true).open(&self.path) {
759            Ok(file) => file.sync_all().map_err(|source| TraceSinkError::Write {
760                path: self.path.clone(),
761                source,
762            }),
763            Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()),
764            Err(source) => Err(TraceSinkError::Open {
765                path: self.path.clone(),
766                source,
767            }),
768        }
769    }
770}
771
772/// Writes each trace record as one JSON line to stderr — handy for `cargo run`
773/// debugging without a trace file.
///
/// Construct it with `StderrTraceSink::default()`; the only field is private.
774#[derive(Default)]
775pub struct StderrTraceSink {
    // Guards no data; held while a record is emitted so appends on this
    // instance do not overlap.
776    lock: Mutex<()>,
777}
778
779impl TraceSink for StderrTraceSink {
780    fn append(&self, record: &TraceRecord) -> Result<(), TraceSinkError> {
781        let line = serde_json::to_string(record)?;
782        let _guard = self.lock.lock().map_err(|_| TraceSinkError::LockPoisoned)?;
783        eprintln!("{line}");
784        Ok(())
785    }
786}
787
788/// Fans each trace record out to several sinks in order (e.g. stderr + a JSONL
789/// file). Stops at the first sink that errors.
790pub struct TeeTraceSink {
    // Wrapped sinks, visited in the order they were supplied to `new`.
791    sinks: Vec<Arc<dyn TraceSink>>,
792}
793
794impl TeeTraceSink {
795    pub fn new(sinks: impl IntoIterator<Item = Arc<dyn TraceSink>>) -> Self {
796        Self {
797            sinks: sinks.into_iter().collect(),
798        }
799    }
800}
801
802impl TraceSink for TeeTraceSink {
803    fn append(&self, record: &TraceRecord) -> Result<(), TraceSinkError> {
804        for sink in &self.sinks {
805            sink.append(record)?;
806        }
807        Ok(())
808    }
809
810    /// Flush every wrapped sink, stopping at the first that errors.
811    fn flush(&self) -> Result<(), TraceSinkError> {
812        for sink in &self.sinks {
813            sink.flush()?;
814        }
815        Ok(())
816    }
817}
818
819pub fn sha256_hex(input: impl AsRef<[u8]>) -> String {
820    let mut hasher = Sha256::new();
821    hasher.update(input.as_ref());
822    format!("{:x}", hasher.finalize())
823}
824
825pub fn json_hash(value: &Value) -> String {
826    sha256_hex(serde_json::to_vec(value).unwrap_or_default())
827}
828
#[cfg(test)]
mod tests {
    use super::*;

    /// Uniquely named scratch location under the system temp directory.
    ///
    /// Constructing the guard only picks the path; it does not create the
    /// directory. When the guard is dropped the directory tree is removed on a
    /// best-effort basis, so tests clean up after themselves even when an
    /// assertion or `unwrap` panics part-way through.
    struct ScratchDir(std::path::PathBuf);

    impl ScratchDir {
        /// Picks a fresh `lash-trace-<uuid>` path inside `std::env::temp_dir()`.
        fn new() -> Self {
            Self(std::env::temp_dir().join(format!("lash-trace-{}", uuid::Uuid::new_v4())))
        }

        /// Root path of the scratch location.
        fn path(&self) -> &std::path::Path {
            &self.0
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best effort: the directory may never have been created, and a
            // cleanup failure must not mask the real test outcome.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    /// Appending one record to a `JsonlTraceSink` produces file content that
    /// contains the event's `type` tag and the context's `session_id`.
    #[test]
    fn jsonl_sink_writes_record() {
        let scratch = ScratchDir::new();
        std::fs::create_dir_all(scratch.path()).unwrap();
        let path = scratch.path().join("trace.jsonl");
        let sink = JsonlTraceSink::new(&path);
        sink.append(&TraceRecord::new(
            TraceContext::default().for_session("root"),
            TraceEvent::Custom {
                name: "test.event".to_string(),
                payload: serde_json::json!({"ok": true}),
            },
        ))
        .unwrap();
        let text = std::fs::read_to_string(&path).unwrap();
        assert!(text.contains("\"type\":\"custom\""));
        assert!(text.contains("\"session_id\":\"root\""));
    }

    /// `ToolCallStarted` and `TurnCompleted` records serialize with their
    /// `type` tag and payload fields (`call_id`, `agent_frame_switch`) at the
    /// top level of the JSON object.
    #[test]
    fn tool_start_and_frame_switch_records_are_jsonl_shaped() {
        let started = TraceRecord::new(
            TraceContext::default().for_session("root"),
            TraceEvent::ToolCallStarted {
                call_id: Some("call-1".to_string()),
                name: "read_file".to_string(),
                args: serde_json::json!({"path": "README.md"}),
            },
        );
        let completed = TraceRecord::new(
            TraceContext::default().for_session("root"),
            TraceEvent::TurnCompleted {
                status: "completed".to_string(),
                done_reason: "modelstop".to_string(),
                agent_frame_switch: Some(TraceAgentFrameSwitch {
                    frame_id: "frame-1".to_string(),
                }),
            },
        );

        let started_json = serde_json::to_value(started).unwrap();
        assert_eq!(started_json["type"], "tool_call_started");
        assert_eq!(started_json["call_id"], "call-1");

        let completed_json = serde_json::to_value(completed).unwrap();
        assert_eq!(completed_json["type"], "turn_completed");
        assert_eq!(completed_json["agent_frame_switch"]["frame_id"], "frame-1");
    }

    /// A `LashlangExecution` record nests its payload under `event` (tagged by
    /// `kind`) and deserializes back into the same event variant.
    #[test]
    fn lashlang_execution_records_are_jsonl_shaped() {
        let identity = TraceLashlangExecutionIdentity {
            scope: TraceRuntimeScope::new("s1"),
            subject: TraceRuntimeSubject::Process {
                process_id: "p1".to_string(),
            },
            module_ref: "module".to_string(),
            entry_kind: "process".to_string(),
            entry_ref: Some("component:0".to_string()),
            entry_name: "main".to_string(),
        };
        let event = TraceLashlangExecutionEvent::NodeStarted {
            event_key: "process:p1:node:n1:1:started".to_string(),
            identity,
            node_id: "n1".to_string(),
            node_kind: "resource_operation".to_string(),
            label: "read_file".to_string(),
            occurrence: 1,
        };
        let record = TraceRecord::new(
            TraceContext::default().for_session("s1"),
            TraceEvent::LashlangExecution { event },
        );

        let json = serde_json::to_value(&record).expect("serialize lashlang execution");
        assert_eq!(json["type"], "lashlang_execution");
        assert_eq!(json["event"]["kind"], "node_started");
        assert_eq!(json["event"]["event_key"], "process:p1:node:n1:1:started");

        let round_trip =
            serde_json::from_value::<TraceRecord>(json).expect("deserialize lashlang execution");
        assert!(matches!(
            round_trip.event,
            TraceEvent::LashlangExecution {
                event: TraceLashlangExecutionEvent::NodeStarted { .. }
            }
        ));
    }

    /// A failed tool call serializes its outcome as `status = "failure"` with
    /// the failure JSON preserved verbatim under `payload`.
    #[test]
    fn tool_completion_serializes_typed_failure_output() {
        let record = TraceRecord::new(
            TraceContext::default().for_session("root"),
            TraceEvent::ToolCallCompleted {
                call_id: Some("call-1".to_string()),
                name: "read_file".to_string(),
                args: serde_json::json!({"path": "missing"}),
                output: TraceToolCallOutput {
                    outcome: TraceToolCallOutcome::Failure(serde_json::json!({
                        "class": "invalid_request",
                        "code": "invalid_tool_args",
                        "message": "bad args",
                        "source": "runtime",
                        "retry": { "type": "never" },
                        "raw": { "path": "missing" }
                    })),
                    control: None,
                },
                duration_ms: 3,
            },
        );

        let json = serde_json::to_value(record).unwrap();
        assert_eq!(json["type"], "tool_call_completed");
        assert_eq!(json["output"]["outcome"]["status"], "failure");
        assert_eq!(
            json["output"]["outcome"]["payload"]["code"],
            "invalid_tool_args"
        );
        assert_eq!(
            json["output"]["outcome"]["payload"]["raw"]["path"],
            "missing"
        );
    }

    /// For a sample of variants, `TraceEvent::kind` returns exactly the string
    /// serde writes as the `type` tag.
    #[test]
    fn event_kind_matches_serialized_type_tag() {
        let events = [
            TraceEvent::SessionStarted {
                metadata: Default::default(),
            },
            TraceEvent::TurnStarted {
                metadata: Default::default(),
            },
            TraceEvent::ToolCallStarted {
                call_id: None,
                name: "read_file".to_string(),
                args: Value::Null,
            },
            TraceEvent::Custom {
                name: "x".to_string(),
                payload: Value::Null,
            },
        ];
        for event in events {
            let kind = event.kind();
            let json = serde_json::to_value(&event).expect("serialize event");
            assert_eq!(json["type"], kind, "kind() disagrees with serde tag");
        }
    }

    /// Appending through a `JsonlTraceSink` whose target sits in directories
    /// that do not exist yet succeeds and leaves the trace file on disk.
    #[test]
    fn jsonl_sink_creates_parent_directories() {
        // The scratch root is deliberately NOT created here: the sink itself
        // has to create `<root>/nested/` for the append to succeed.
        let scratch = ScratchDir::new();
        let path = scratch.path().join("nested").join("trace.jsonl");
        let sink = JsonlTraceSink::new(&path);
        sink.append(&TraceRecord::new(
            TraceContext::default().for_session("root"),
            TraceEvent::RuntimeStreamEvent {
                event: TraceRuntimeStreamEvent {
                    sequence: 1,
                    elapsed_ms: 0,
                    event_name: "delta".to_string(),
                    raw_text: Some("hello".to_string()),
                    visible_text: Some("hello".to_string()),
                    item_id: None,
                    output_index: None,
                    call_id: None,
                    tool_name: None,
                    input_json: None,
                    usage: None,
                },
            },
        ))
        .unwrap();
        assert!(path.exists());
    }
}