ai_memory/recover/parsers/mod.rs
1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! Per-host transcript parsers. Each module implements
5//! [`TranscriptParser`] for one host's transcript format; the
6//! [`crate::recover::recover_from_transcript`] handler dispatches
7//! to the right parser by `HostKind`.
8//!
//! The parser surface is intentionally minimal: a parser takes a
//! path + an optional `since` filter and returns the matching
//! [`ParsedTurn`] values as a `Vec`, in transcript order. The
//! downstream recovery logic owns dedup (keyed as described on
//! [`ParsedTurn`]), memory-writes, and progress reporting; the parser
//! owns only the transcript-format-specific concerns (JSONL framing,
//! field mapping, timestamp parsing, role classification).
15
16pub mod claude_code_jsonl;
17
18use std::path::Path;
19
20use serde::{Deserialize, Serialize};
21
22/// One transcript turn parsed out of a host's transcript file.
23///
24/// The `role` field classifies the turn for downstream memory-kind
25/// assignment: a `user`-role turn becomes an `observation` memory
26/// tagged `operator-directive`; an `assistant`-role turn becomes
27/// an `observation` memory tagged `agent-response`. The v0.8
28/// decision-detector (#1393) will run an LLM classifier over these
29/// raw observations to refine them into `plan`/`decision`/`commitment`
30/// memories; the v0.7.0 recovery surface stops at the raw layer.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct ParsedTurn {
33 /// RFC3339 timestamp of the turn (when the host wrote it).
34 /// Used both for the `since` filter and for the recovered
35 /// memory's `created_at` (so the recovered memory's timeline
36 /// matches the original conversation rather than the recovery-
37 /// run wall-clock).
38 pub timestamp_iso: String,
39 /// Role classification — `user`, `assistant`, `tool_use`,
40 /// `tool_result`, or `other`. Drives the tag set the recovered
41 /// memory inherits.
42 pub role: TurnRole,
43 /// Verbatim content of the turn. For multi-content `assistant`
44 /// turns (text + tool_use + text), the parser concatenates the
45 /// text parts; tool-use bodies surface under [`Self::tool_calls`].
46 pub content_text: String,
47 /// Tool-call summaries from this turn. Each entry is one
48 /// `{tool, brief}` pair; the full args are not preserved at
49 /// this layer (the recovered memory's content is the user-
50 /// visible decision text, not the agent's tool-call trace).
51 pub tool_calls: Vec<ToolCallSummary>,
52 /// Stable sha256 of the source line content. Retained for audit
53 /// (memory title + metadata) and as the legacy back-compat dedup
54 /// probe; post-#1573 the dedup KEY is
55 /// `(host_session_id, host_turn_index)` when the host format
56 /// provides both, else [`Self::normalized_sha256_hex`].
57 pub line_sha256_hex: String,
58 /// Host-assigned session identifier for the turn (e.g. the
59 /// Claude Code JSONL `sessionId` field). `None` when the host
60 /// format does not carry one. Half of the #1573 canonical dedup
61 /// key, mirroring the L4 `memory_capture_turn` envelope.
62 pub host_session_id: Option<String>,
63 /// Host-assigned monotonic per-session turn counter. `None` when
64 /// the host format does not carry one (the Claude Code JSONL
65 /// format has no numeric turn counter — a line ordinal is NOT a
66 /// substitute because it need not agree with the counter the L4
67 /// `memory_capture_turn` envelope supplies, and a coincidental
68 /// match would falsely dedup a distinct turn).
69 pub host_turn_index: Option<i64>,
70}
71
72impl ParsedTurn {
73 /// #1573 — sha256 over the turn's NORMALIZED semantic content
74 /// (session id, timestamp, role, text, tool-call summaries) with
75 /// `0x00` field separators and `0x1f`/`0x1e` intra-list
76 /// separators. Unlike the raw-line hash, this is invariant under
77 /// host re-serialization (whitespace, JSON key order), so the
78 /// same turn rewritten by a host upgrade still dedups; distinct
79 /// turns that merely share text still differ via the
80 /// session-id + timestamp components.
81 #[must_use]
82 pub fn normalized_sha256_hex(&self) -> String {
83 use sha2::{Digest, Sha256};
84 let mut h = Sha256::new();
85 h.update(self.host_session_id.as_deref().unwrap_or("").as_bytes());
86 h.update([0x00]);
87 h.update(self.timestamp_iso.as_bytes());
88 h.update([0x00]);
89 h.update(self.role.as_str().as_bytes());
90 h.update([0x00]);
91 h.update(self.content_text.as_bytes());
92 h.update([0x00]);
93 for tc in &self.tool_calls {
94 h.update(tc.tool.as_bytes());
95 h.update([0x1f]);
96 h.update(tc.brief.as_bytes());
97 h.update([0x1e]);
98 }
99 format!("{:x}", h.finalize())
100 }
101}
102
103/// Role classification for one parsed transcript turn.
104#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
105#[serde(rename_all = "snake_case")]
106pub enum TurnRole {
107 /// Operator-typed prompt or directive.
108 User,
109 /// LLM-generated response.
110 Assistant,
111 /// LLM-initiated tool invocation.
112 ToolUse,
113 /// Tool-call result returned to the LLM.
114 ToolResult,
115 /// Any other line shape (system messages, attachments,
116 /// permission-mode toggles, etc.) — preserved as low-priority
117 /// observations rather than dropped.
118 Other,
119}
120
121impl TurnRole {
122 /// Stable wire string for the role. Single source for the
123 /// recovered-memory tags/metadata AND the #1573 normalized
124 /// dedup hash, so the two can never drift.
125 #[must_use]
126 pub fn as_str(self) -> &'static str {
127 match self {
128 Self::User => "user",
129 Self::Assistant => "assistant",
130 Self::ToolUse => "tool_use",
131 Self::ToolResult => "tool_result",
132 Self::Other => "other",
133 }
134 }
135}
136
137/// One tool-call mention extracted from an assistant turn.
138#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct ToolCallSummary {
140 /// Tool name (e.g., `Bash`, `Read`, `mcp__memory__memory_store`).
141 pub tool: String,
142 /// One-line target / brief — for `Bash`, the command's
143 /// `description` arg; for `Read`, the file path; for an MCP
144 /// tool, the first 1-2 fields of the request struct.
145 pub brief: String,
146}
147
148/// Trait every per-host parser implements. The blanket
149/// [`crate::recover::recover_from_transcript`] entry-point dispatches
150/// to the right impl by `HostKind`.
151pub trait TranscriptParser {
152 /// Stream-parse a transcript file from disk, filtering to
153 /// turns whose timestamp is at or after `since_iso` when set.
154 /// Returns parsed turns in transcript order.
155 ///
156 /// # Errors
157 ///
158 /// Returns an error when the file cannot be opened. Per-line
159 /// parse errors are NOT propagated — the parser swallows them
160 /// and surfaces a partial result; SessionStart-hook integration
161 /// can't tolerate a single bad line wedging recovery.
162 fn parse(&self, path: &Path, since_iso: Option<&str>) -> Result<Vec<ParsedTurn>, ParseError>;
163}
164
/// Failures a parser reports to its caller. Most parse problems are
/// non-fatal (see the parser-trait docstring), so this enum only
/// carries errors that stop the parse from starting at all.
#[derive(Debug)]
pub enum ParseError {
    /// The file could not be opened or read; the payload is the
    /// human-readable reason.
    Read(String),
}

impl std::fmt::Display for ParseError {
    /// Renders the error as `parser: read failed: <reason>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Read(reason) => f.write_fmt(format_args!("parser: read failed: {reason}")),
        }
    }
}

// No `source()` override: the underlying cause is carried as text only.
impl std::error::Error for ParseError {}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Every role paired with the wire string it must map to. Shared
    /// by the `as_str` and serde tests so both are checked against
    /// the same expectations.
    const ROLE_WIRE: [(TurnRole, &str); 5] = [
        (TurnRole::User, "user"),
        (TurnRole::Assistant, "assistant"),
        (TurnRole::ToolUse, "tool_use"),
        (TurnRole::ToolResult, "tool_result"),
        (TurnRole::Other, "other"),
    ];

    #[test]
    fn turn_role_as_str_covers_every_variant() {
        for (role, wire) in ROLE_WIRE {
            assert_eq!(role.as_str(), wire);
        }
    }

    #[test]
    fn turn_role_serde_round_trip_snake_case() {
        for (role, wire) in ROLE_WIRE {
            let json = format!("\"{wire}\"");
            assert_eq!(serde_json::to_string(&role).unwrap(), json);
            let back: TurnRole = serde_json::from_str(&json).unwrap();
            assert_eq!(back, role);
        }
    }

    #[test]
    fn turn_role_serde_rejects_unknown_role() {
        // There is no catch-all variant on the wire: an unknown role
        // string is a deserialization error, not `Other`.
        assert!(serde_json::from_str::<TurnRole>("\"system\"").is_err());
    }

    /// An assistant turn with two tool calls and every optional
    /// field populated.
    fn sample_turn() -> ParsedTurn {
        ParsedTurn {
            timestamp_iso: "2026-05-28T12:00:00Z".to_string(),
            role: TurnRole::Assistant,
            content_text: "ran a tool".to_string(),
            tool_calls: vec![
                ToolCallSummary {
                    tool: "Bash".to_string(),
                    brief: "list files".to_string(),
                },
                ToolCallSummary {
                    tool: "Read".to_string(),
                    brief: "/a/b.rs".to_string(),
                },
            ],
            line_sha256_hex: "ab".repeat(32),
            host_session_id: Some("sess-1".to_string()),
            host_turn_index: Some(7),
        }
    }

    #[test]
    fn normalized_sha256_is_64_hex_chars_and_stable() {
        let t = sample_turn();
        let a = t.normalized_sha256_hex();
        let b = t.normalized_sha256_hex();
        assert_eq!(a, b, "normalized hash must be deterministic");
        assert_eq!(a.len(), 64);
        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));
        // `{:x}` formatting: the digest is lowercase hex.
        assert!(!a.chars().any(|c| c.is_ascii_uppercase()));
    }

    #[test]
    fn normalized_sha256_differs_with_content_and_tool_calls() {
        let base = sample_turn();
        let base_hash = base.normalized_sha256_hex();

        // Different content text → different hash.
        let mut changed_content = base.clone();
        changed_content.content_text = "different".to_string();
        assert_ne!(changed_content.normalized_sha256_hex(), base_hash);

        // Different tool-call brief → different hash (covers the
        // tool_calls loop with the 0x1f/0x1e separators).
        let mut changed_tool = base.clone();
        changed_tool.tool_calls[0].brief = "rm -rf /".to_string();
        assert_ne!(changed_tool.normalized_sha256_hex(), base_hash);

        // Absent session id (None branch of unwrap_or) still hashes.
        let mut no_session = base.clone();
        no_session.host_session_id = None;
        assert_eq!(no_session.normalized_sha256_hex().len(), 64);
        assert_ne!(no_session.normalized_sha256_hex(), base_hash);
    }

    #[test]
    fn normalized_sha256_differs_with_timestamp_role_and_tool_order() {
        let base = sample_turn();
        let base_hash = base.normalized_sha256_hex();

        // Same text at a different time is a distinct turn.
        let mut later = base.clone();
        later.timestamp_iso = "2026-05-28T12:00:01Z".to_string();
        assert_ne!(later.normalized_sha256_hex(), base_hash);

        // Same text under a different role is a distinct turn.
        let mut as_user = base.clone();
        as_user.role = TurnRole::User;
        assert_ne!(as_user.normalized_sha256_hex(), base_hash);

        // Tool calls are hashed in sequence, so order matters.
        let mut reordered = base.clone();
        reordered.tool_calls.swap(0, 1);
        assert_ne!(reordered.normalized_sha256_hex(), base_hash);

        // Dropping the tool calls entirely changes the hash too.
        let mut no_tools = base.clone();
        no_tools.tool_calls.clear();
        assert_eq!(no_tools.normalized_sha256_hex().len(), 64);
        assert_ne!(no_tools.normalized_sha256_hex(), base_hash);
    }

    #[test]
    fn normalized_sha256_ignores_raw_line_hash_and_turn_index() {
        let base = sample_turn();
        let base_hash = base.normalized_sha256_hex();

        // The raw-line hash changes when a host re-serializes a
        // turn; the normalized hash must not follow it.
        let mut reserialized = base.clone();
        reserialized.line_sha256_hex = "cd".repeat(32);
        assert_eq!(reserialized.normalized_sha256_hex(), base_hash);

        // The turn index is not one of the hashed components.
        let mut other_index = base.clone();
        other_index.host_turn_index = None;
        assert_eq!(other_index.normalized_sha256_hex(), base_hash);
    }

    #[test]
    fn parse_error_display_and_error_trait() {
        let e = ParseError::Read("boom".to_string());
        assert_eq!(e.to_string(), "parser: read failed: boom");
        let _: &dyn std::error::Error = &e;
        assert!(format!("{e:?}").contains("Read"));
    }

    #[test]
    fn parsed_turn_serde_round_trips() {
        let t = sample_turn();
        let json = serde_json::to_string(&t).unwrap();
        let back: ParsedTurn = serde_json::from_str(&json).unwrap();

        assert_eq!(back.timestamp_iso, t.timestamp_iso);
        assert_eq!(back.role, t.role);
        assert_eq!(back.content_text, t.content_text);
        assert_eq!(back.tool_calls.len(), 2);
        for (got, want) in back.tool_calls.iter().zip(&t.tool_calls) {
            assert_eq!(got.tool, want.tool);
            assert_eq!(got.brief, want.brief);
        }
        assert_eq!(back.line_sha256_hex, t.line_sha256_hex);
        assert_eq!(back.host_session_id, t.host_session_id);
        assert_eq!(back.host_turn_index, Some(7));

        // A round-tripped turn must still dedup against the original.
        assert_eq!(back.normalized_sha256_hex(), t.normalized_sha256_hex());
    }
}
285}