entelix_session/event.rs
1//! `GraphEvent` — the audit-trail unit appended to a `SessionGraph`.
2//!
3//! Every event is timestamped and serializable, so a persisted log can be
4//! replayed verbatim by a fresh process (Anthropic-style `wake(thread_id)`).
5//! Events are **strictly additive** — once written, never mutated.
6
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9
10use entelix_core::ir::{ContentPart, ModelWarning, ProviderEchoSnapshot, ToolResultContent, Usage};
11use entelix_core::rate_limit::RateLimitSnapshot;
12
/// One audit-log entry.
///
/// Aggregating these (oldest-to-newest) reconstructs the full conversation
/// trace for a thread. Branches and checkpoints are recorded inline so a
/// single linear scan is enough for replay.
///
/// # Wire format
///
/// The enum is internally tagged: each serialized event carries a `kind`
/// field holding the variant name in `snake_case` (`"user_message"`,
/// `"tool_call"`, ...), with the variant's own fields alongside it.
/// Because `kind` is taken by the discriminator, the
/// `Interrupt::interruption_kind` and `ToolErrorTerminal::error_kind`
/// fields deliberately avoid the bare name `kind`.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate that
/// matches on it needs a wildcard arm.
///
/// Every variant carries a `timestamp` field; [`GraphEvent::timestamp`]
/// reads it without the caller having to match on the variant.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum GraphEvent {
    /// User-authored input.
    UserMessage {
        /// Multi-part content (text, image, `tool_result`).
        content: Vec<ContentPart>,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// Assistant reply (after stream aggregation).
    AssistantMessage {
        /// Multi-part content (text, `tool_use`).
        content: Vec<ContentPart>,
        /// Token accounting if reported by the provider; `None`
        /// otherwise.
        usage: Option<Usage>,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// An auto-compaction adapter trimmed the working message slice.
    /// `dropped_chars` is the character cost the compactor removed
    /// (or summarised away); `retained_chars` is the cost the
    /// post-compaction slice carries forward. The pair lets dashboards
    /// detect drift between the threshold the operator wired and the
    /// actual trim each invocation produces.
    ContextCompacted {
        /// Character cost the compactor dropped.
        dropped_chars: usize,
        /// Character cost the post-compaction slice retained.
        retained_chars: usize,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A tool was dispatched by the assistant.
    ToolCall {
        /// Stable tool-use id matching a future `ToolResult`.
        id: String,
        /// Registered tool name.
        name: String,
        /// Tool input as JSON.
        input: serde_json::Value,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// The dispatched tool returned.
    ToolResult {
        /// `ToolCall::id` this result resolves.
        tool_use_id: String,
        /// `ToolCall::name` this result resolves — required by
        /// codecs whose wire format keys correlation by name
        /// (Gemini's `functionResponse`) rather than id.
        name: String,
        /// Result payload.
        content: ToolResultContent,
        /// True if the tool reported an error.
        is_error: bool,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A branch was forked off this session at the indicated event index.
    /// The new branch's thread id is recorded alongside.
    BranchCreated {
        /// Identifier of the forked sub-session.
        branch_id: String,
        /// Index in `events` (0-based) the branch diverged at.
        parent_event: usize,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// Marker tying this position in the audit log to a `Checkpointer`
    /// snapshot. Cross-tier reference for crash recovery flows that pair
    /// `SessionGraph` (Tier 2) with `StateGraph` checkpoints (Tier 1).
    CheckpointMarker {
        /// Stringified `entelix_graph::CheckpointId`.
        checkpoint_id: String,
        /// Thread the checkpoint was written under (typically same as
        /// the session's thread).
        thread_id: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// Codec / runtime advisory captured into the audit trail.
    Warning {
        /// Underlying advisory.
        warning: ModelWarning,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// Streaming thinking-content fragment captured into the audit
    /// trail. Aggregators fold consecutive deltas into a single
    /// `ContentPart::Thinking` when reconstructing a finalised
    /// message. Recording deltas individually keeps the audit log
    /// faithful to the wire — a replay that needs only the final
    /// block can fold the deltas, while a replay that needs per-token
    /// timing has the data.
    ThinkingDelta {
        /// Token text appended to the in-progress thinking block.
        text: String,
        /// Vendor opaque round-trip tokens carried on this delta
        /// (Anthropic `signature_delta`, Gemini `thought_signature`
        /// on streamed parts, `OpenAI` Responses reasoning-item
        /// `encrypted_content`). Codecs pre-wrap into
        /// `ProviderEchoSnapshot` on decode; the audit log preserves
        /// the same opaque bytes for replay.
        ///
        /// Omitted from the serialized form when empty, and
        /// defaulted to an empty `Vec` when the field is absent on
        /// deserialization.
        #[serde(default, skip_serializing_if = "Vec::is_empty")]
        provider_echoes: Vec<ProviderEchoSnapshot>,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// Provider rate-limit snapshot at this position in the
    /// conversation. Operators reading the audit log can correlate a
    /// later throttling failure with the snapshot that warned them.
    /// Recorded inline rather than on a separate metric channel so
    /// the audit trail is self-contained for compliance review.
    RateLimit {
        /// Snapshot the codec extracted from response headers.
        snapshot: RateLimitSnapshot,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// HITL pause point — the runtime asked the host application for
    /// input. The matching resume signal lands in
    /// `entelix_graph::Command` outside the audit log; this event
    /// records that the pause happened, the typed reason, and what
    /// was visible to the human at the time.
    Interrupt {
        /// Typed pause reason — `Custom` for operator-defined
        /// pauses, `ApprovalPending { tool_use_id }` for tool-
        /// approval pauses raised by `ApprovalLayer`,
        /// `ScheduledPause { phase, node }` for the graph's
        /// `interrupt_before` / `interrupt_after` schedules. Audit
        /// consumers split compliance-reportable HITL approvals
        /// from scheduled pauses without parsing the free-form
        /// payload.
        ///
        /// The field is named `interruption_kind` (not `kind`)
        /// because the surrounding `#[serde(tag = "kind")]` already
        /// consumes `kind` as the variant discriminator — same
        /// reason the parallel `GraphEvent::ToolErrorTerminal::
        /// error_kind` field is renamed.
        interruption_kind: entelix_core::InterruptionKind,
        /// Operator-supplied payload describing the pause point.
        /// Free-form JSON so the agent recipe owns the schema; the
        /// audit log just persists it. `Value::Null` for typed
        /// `interruption_kind`s that carry their own structured
        /// detail.
        payload: serde_json::Value,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// The run was cancelled — either via cancellation token or via
    /// a deadline elapsing. Recording the reason inline lets a
    /// replay reconstruct partial-run audit traces faithfully.
    Cancelled {
        /// Lean reason string. Human-readable; not parsed downstream.
        reason: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A sub-agent was dispatched from the parent's run. The parent
    /// `run_id` (recorded on the surrounding `AgentEvent::Started`)
    /// scopes the audit trail; this event ties the parent's
    /// position to the child's `sub_thread_id` so a replay can walk
    /// from parent to child without keying on heuristic timing.
    /// Managed-agent shape — every `Subagent::execute`
    /// call surfaces here as the canonical "brain passes hand"
    /// audit boundary.
    SubAgentInvoked {
        /// Stable identifier the parent uses to refer to the
        /// sub-agent (typically the `Subagent`'s configured name).
        agent_id: String,
        /// Thread the sub-agent ran under. Same as the parent's
        /// thread when the sub-agent shares state; a fresh value
        /// when the sub-agent runs in its own scope.
        sub_thread_id: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A supervisor recipe handed control between named agents.
    /// Distinct from `SubAgentInvoked` — supervisor handoffs route
    /// inside one logical conversation, while sub-agent invocations
    /// open a child run.
    AgentHandoff {
        /// Agent name that finished this turn (`None` on the first
        /// supervisor turn where no agent has spoken yet).
        from: Option<String>,
        /// Agent name the supervisor routed to next.
        to: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A run resumed from a prior checkpoint — either via
    /// `wake(thread_id)` after a crash or via `Command::Resume` from
    /// a HITL pause. Pairs with the `CheckpointMarker` whose id is
    /// referenced so a single linear replay stays coherent across
    /// the suspend / resume seam.
    Resumed {
        /// `CheckpointMarker::checkpoint_id` the resume hydrated
        /// from. Empty string when the resume happened from a fresh
        /// state (operator built the resume payload by hand).
        from_checkpoint: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A long-term memory tier returned hits to the agent. Records
    /// which tier was queried (`semantic` / `entity` / `graph` /
    /// caller-defined), the namespace key (operator identifier for
    /// the slice queried), and the number of hits returned. The
    /// hits themselves stay outside the audit log — the model-facing
    /// content already lands in `AssistantMessage` / `ToolResult`,
    /// and storing the full retrieved corpus inline would balloon
    /// the audit trail.
    MemoryRecall {
        /// Memory tier identifier (typically `"semantic"`,
        /// `"entity"`, `"graph"`, or an operator-supplied label).
        tier: String,
        /// Rendered namespace key the query targeted.
        namespace_key: String,
        /// Number of records returned to the agent.
        hits: usize,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// An [`entelix_core::RunBudget`] axis hit its cap and
    /// short-circuited the run with
    /// `entelix_core::Error::UsageLimitExceeded`. Compliance and
    /// billing audits replay this to attribute breaches per-tenant
    /// per-run; the operator-facing `Error` continues to flow
    /// through the typed dispatch return as well, so the audit
    /// channel's role here is the durable record, not the only
    /// breach signal.
    UsageLimitExceeded {
        /// Typed axis-and-magnitude pair carried straight through
        /// from the matching `Error::UsageLimitExceeded(breach)`.
        /// The axis variant carries its own magnitude shape
        /// (`u64` for counts, `Decimal` for cost).
        breach: entelix_core::UsageLimitBreach,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A classified tool-dispatch failure was escalated to terminal
    /// by [`entelix_core::tools::ToolErrorPolicyLayer`]. The reasoning
    /// loop returns `Error::ToolErrorTerminal` to the caller without
    /// consulting the model — distinct from the matching `ToolResult
    /// { is_error: true }` event (which records the tool's *response*)
    /// because the loop-termination decision is a separate
    /// observability fact dashboards need to split from generic
    /// retry-loop exhaustion.
    ToolErrorTerminal {
        /// Classified `ToolErrorKind` (`"auth"`, `"quota"`,
        /// `"permanent"`, ...) that matched the active
        /// `ToolErrorPolicy::terminate_on` set.
        ///
        /// The field is named `error_kind` (not `kind`) because the
        /// surrounding `#[serde(tag = "kind")]` already consumes
        /// `kind` as the variant discriminator.
        error_kind: entelix_core::ToolErrorKind,
        /// Dispatched tool's name.
        tool_name: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
    /// A failure surfaced from the model / tool / graph runtime.
    /// Errors that the agent recovers from internally are still
    /// recorded so post-mortems see the full picture.
    Error {
        /// Coarse classification matching `entelix_core::Error`
        /// variants (`"provider"`, `"invalid_request"`, `"config"`,
        /// `"auth"`, `"interrupted"`, `"cancelled"`, `"serde"`,
        /// `"transport"`). Stable wire strings — dashboards key off
        /// these without the SDK leaking internal error layout.
        class: String,
        /// Human-readable summary (`Display` form).
        message: String,
        /// Wall-clock time the event was appended.
        timestamp: DateTime<Utc>,
    },
}
297
298impl GraphEvent {
299 /// Borrow the timestamp of any event variant.
300 pub const fn timestamp(&self) -> &DateTime<Utc> {
301 match self {
302 Self::UserMessage { timestamp, .. }
303 | Self::AssistantMessage { timestamp, .. }
304 | Self::ToolCall { timestamp, .. }
305 | Self::ToolResult { timestamp, .. }
306 | Self::BranchCreated { timestamp, .. }
307 | Self::CheckpointMarker { timestamp, .. }
308 | Self::Warning { timestamp, .. }
309 | Self::ThinkingDelta { timestamp, .. }
310 | Self::RateLimit { timestamp, .. }
311 | Self::Interrupt { timestamp, .. }
312 | Self::Cancelled { timestamp, .. }
313 | Self::SubAgentInvoked { timestamp, .. }
314 | Self::AgentHandoff { timestamp, .. }
315 | Self::Resumed { timestamp, .. }
316 | Self::MemoryRecall { timestamp, .. }
317 | Self::UsageLimitExceeded { timestamp, .. }
318 | Self::ContextCompacted { timestamp, .. }
319 | Self::ToolErrorTerminal { timestamp, .. }
320 | Self::Error { timestamp, .. } => timestamp,
321 }
322 }
323}