Skip to main content

ai_memory/recover/parsers/
mod.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! Per-host transcript parsers. Each module implements
5//! [`TranscriptParser`] for one host's transcript format; the
6//! [`crate::recover::recover_from_transcript`] handler dispatches
7//! to the right parser by `HostKind`.
8//!
//! The parser surface is intentionally minimal: a parser takes a
//! path + a `since` filter and returns the matching [`ParsedTurn`]
//! values as a `Vec`, in transcript order. The downstream recovery
//! logic owns dedup (the key is described on
//! [`ParsedTurn::line_sha256_hex`]), memory-writes, and progress
//! reporting; the parser owns only the transcript-format-specific
//! concerns (JSONL framing, field mapping, timestamp parsing, role
//! classification).
15
16pub mod claude_code_jsonl;
17
18use std::path::Path;
19
20use serde::{Deserialize, Serialize};
21
22/// One transcript turn parsed out of a host's transcript file.
23///
24/// The `role` field classifies the turn for downstream memory-kind
25/// assignment: a `user`-role turn becomes an `observation` memory
26/// tagged `operator-directive`; an `assistant`-role turn becomes
27/// an `observation` memory tagged `agent-response`. The v0.8
28/// decision-detector (#1393) will run an LLM classifier over these
29/// raw observations to refine them into `plan`/`decision`/`commitment`
30/// memories; the v0.7.0 recovery surface stops at the raw layer.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct ParsedTurn {
33    /// RFC3339 timestamp of the turn (when the host wrote it).
34    /// Used both for the `since` filter and for the recovered
35    /// memory's `created_at` (so the recovered memory's timeline
36    /// matches the original conversation rather than the recovery-
37    /// run wall-clock).
38    pub timestamp_iso: String,
39    /// Role classification — `user`, `assistant`, `tool_use`,
40    /// `tool_result`, or `other`. Drives the tag set the recovered
41    /// memory inherits.
42    pub role: TurnRole,
43    /// Verbatim content of the turn. For multi-content `assistant`
44    /// turns (text + tool_use + text), the parser concatenates the
45    /// text parts; tool-use bodies surface under [`Self::tool_calls`].
46    pub content_text: String,
47    /// Tool-call summaries from this turn. Each entry is one
48    /// `{tool, brief}` pair; the full args are not preserved at
49    /// this layer (the recovered memory's content is the user-
50    /// visible decision text, not the agent's tool-call trace).
51    pub tool_calls: Vec<ToolCallSummary>,
52    /// Stable sha256 of the source line content. Retained for audit
53    /// (memory title + metadata) and as the legacy back-compat dedup
54    /// probe; post-#1573 the dedup KEY is
55    /// `(host_session_id, host_turn_index)` when the host format
56    /// provides both, else [`Self::normalized_sha256_hex`].
57    pub line_sha256_hex: String,
58    /// Host-assigned session identifier for the turn (e.g. the
59    /// Claude Code JSONL `sessionId` field). `None` when the host
60    /// format does not carry one. Half of the #1573 canonical dedup
61    /// key, mirroring the L4 `memory_capture_turn` envelope.
62    pub host_session_id: Option<String>,
63    /// Host-assigned monotonic per-session turn counter. `None` when
64    /// the host format does not carry one (the Claude Code JSONL
65    /// format has no numeric turn counter — a line ordinal is NOT a
66    /// substitute because it need not agree with the counter the L4
67    /// `memory_capture_turn` envelope supplies, and a coincidental
68    /// match would falsely dedup a distinct turn).
69    pub host_turn_index: Option<i64>,
70}
71
72impl ParsedTurn {
73    /// #1573 — sha256 over the turn's NORMALIZED semantic content
74    /// (session id, timestamp, role, text, tool-call summaries) with
75    /// `0x00` field separators and `0x1f`/`0x1e` intra-list
76    /// separators. Unlike the raw-line hash, this is invariant under
77    /// host re-serialization (whitespace, JSON key order), so the
78    /// same turn rewritten by a host upgrade still dedups; distinct
79    /// turns that merely share text still differ via the
80    /// session-id + timestamp components.
81    #[must_use]
82    pub fn normalized_sha256_hex(&self) -> String {
83        use sha2::{Digest, Sha256};
84        let mut h = Sha256::new();
85        h.update(self.host_session_id.as_deref().unwrap_or("").as_bytes());
86        h.update([0x00]);
87        h.update(self.timestamp_iso.as_bytes());
88        h.update([0x00]);
89        h.update(self.role.as_str().as_bytes());
90        h.update([0x00]);
91        h.update(self.content_text.as_bytes());
92        h.update([0x00]);
93        for tc in &self.tool_calls {
94            h.update(tc.tool.as_bytes());
95            h.update([0x1f]);
96            h.update(tc.brief.as_bytes());
97            h.update([0x1e]);
98        }
99        format!("{:x}", h.finalize())
100    }
101}
102
103/// Role classification for one parsed transcript turn.
104#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
105#[serde(rename_all = "snake_case")]
106pub enum TurnRole {
107    /// Operator-typed prompt or directive.
108    User,
109    /// LLM-generated response.
110    Assistant,
111    /// LLM-initiated tool invocation.
112    ToolUse,
113    /// Tool-call result returned to the LLM.
114    ToolResult,
115    /// Any other line shape (system messages, attachments,
116    /// permission-mode toggles, etc.) — preserved as low-priority
117    /// observations rather than dropped.
118    Other,
119}
120
121impl TurnRole {
122    /// Stable wire string for the role. Single source for the
123    /// recovered-memory tags/metadata AND the #1573 normalized
124    /// dedup hash, so the two can never drift.
125    #[must_use]
126    pub fn as_str(self) -> &'static str {
127        match self {
128            Self::User => "user",
129            Self::Assistant => "assistant",
130            Self::ToolUse => "tool_use",
131            Self::ToolResult => "tool_result",
132            Self::Other => "other",
133        }
134    }
135}
136
137/// One tool-call mention extracted from an assistant turn.
138#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct ToolCallSummary {
140    /// Tool name (e.g., `Bash`, `Read`, `mcp__memory__memory_store`).
141    pub tool: String,
142    /// One-line target / brief — for `Bash`, the command's
143    /// `description` arg; for `Read`, the file path; for an MCP
144    /// tool, the first 1-2 fields of the request struct.
145    pub brief: String,
146}
147
148/// Trait every per-host parser implements. The blanket
149/// [`crate::recover::recover_from_transcript`] entry-point dispatches
150/// to the right impl by `HostKind`.
151pub trait TranscriptParser {
152    /// Stream-parse a transcript file from disk, filtering to
153    /// turns whose timestamp is at or after `since_iso` when set.
154    /// Returns parsed turns in transcript order.
155    ///
156    /// # Errors
157    ///
158    /// Returns an error when the file cannot be opened. Per-line
159    /// parse errors are NOT propagated — the parser swallows them
160    /// and surfaces a partial result; SessionStart-hook integration
161    /// can't tolerate a single bad line wedging recovery.
162    fn parse(&self, path: &Path, since_iso: Option<&str>) -> Result<Vec<ParsedTurn>, ParseError>;
163}
164
/// Failures a parser reports to its caller. Most parse failures are
/// non-fatal (see the parser-trait docstring), so this enum only
/// carries errors that stop the parse from starting at all.
#[derive(Debug)]
pub enum ParseError {
    /// The file could not be opened or read.
    Read(String),
}

impl std::error::Error for ParseError {}

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The enum has exactly one variant, so this destructuring is
        // irrefutable; adding a variant turns it into a compile
        // error, just like a non-exhaustive `match` would.
        let Self::Read(detail) = self;
        write!(f, "parser: read failed: {detail}")
    }
}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Every role paired with its bare (unquoted) wire string.
    const ROLE_WIRE: [(TurnRole, &str); 5] = [
        (TurnRole::User, "user"),
        (TurnRole::Assistant, "assistant"),
        (TurnRole::ToolUse, "tool_use"),
        (TurnRole::ToolResult, "tool_result"),
        (TurnRole::Other, "other"),
    ];

    #[test]
    fn turn_role_as_str_covers_every_variant() {
        for (role, expected) in ROLE_WIRE {
            assert_eq!(role.as_str(), expected);
        }
    }

    #[test]
    fn turn_role_serde_round_trip_snake_case() {
        for (role, bare) in ROLE_WIRE {
            // The JSON form of a unit variant is the quoted wire string.
            let quoted = format!("\"{bare}\"");
            let encoded = serde_json::to_string(&role).unwrap();
            assert_eq!(encoded, quoted);
            let decoded: TurnRole = serde_json::from_str(&quoted).unwrap();
            assert_eq!(decoded, role);
        }
    }

    /// Assistant turn with two tool calls and both host ids present.
    fn sample_turn() -> ParsedTurn {
        let call = |tool: &str, brief: &str| ToolCallSummary {
            tool: tool.to_string(),
            brief: brief.to_string(),
        };
        ParsedTurn {
            timestamp_iso: String::from("2026-05-28T12:00:00Z"),
            role: TurnRole::Assistant,
            content_text: String::from("ran a tool"),
            tool_calls: vec![call("Bash", "list files"), call("Read", "/a/b.rs")],
            line_sha256_hex: "ab".repeat(32),
            host_session_id: Some(String::from("sess-1")),
            host_turn_index: Some(7),
        }
    }

    #[test]
    fn normalized_sha256_is_64_hex_chars_and_stable() {
        let turn = sample_turn();
        let first = turn.normalized_sha256_hex();
        let second = turn.normalized_sha256_hex();
        assert_eq!(first, second, "normalized hash must be deterministic");
        assert_eq!(first.len(), 64);
        assert!(first.chars().all(|ch| ch.is_ascii_hexdigit()));
    }

    #[test]
    fn normalized_sha256_differs_with_content_and_tool_calls() {
        let base = sample_turn();
        let base_hash = base.normalized_sha256_hex();

        // Changing only the content text must change the hash.
        let changed_content = ParsedTurn {
            content_text: "different".to_string(),
            ..base.clone()
        };
        assert_ne!(changed_content.normalized_sha256_hex(), base_hash);

        // Changing only a tool-call brief must change the hash; this
        // exercises the tool_calls loop and its 0x1f/0x1e separators.
        let mut changed_tool = base.clone();
        changed_tool.tool_calls[0].brief = "rm -rf /".to_string();
        assert_ne!(changed_tool.normalized_sha256_hex(), base_hash);

        // A missing session id (the `None` branch) still produces a
        // full-length hash, and one that differs from the base.
        let no_session = ParsedTurn {
            host_session_id: None,
            ..base
        };
        let no_session_hash = no_session.normalized_sha256_hex();
        assert_eq!(no_session_hash.len(), 64);
        assert_ne!(no_session_hash, base_hash);
    }

    #[test]
    fn parse_error_display_and_error_trait() {
        let err = ParseError::Read("boom".to_string());
        assert_eq!(err.to_string(), "parser: read failed: boom");
        // Compile-time proof that `ParseError` implements `Error`.
        let _as_error: &dyn std::error::Error = &err;
        let debug_text = format!("{err:?}");
        assert!(debug_text.contains("Read"));
    }

    #[test]
    fn parsed_turn_serde_round_trips() {
        let original = sample_turn();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: ParsedTurn = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.timestamp_iso, original.timestamp_iso);
        assert_eq!(decoded.role, original.role);
        assert_eq!(decoded.tool_calls.len(), 2);
        assert_eq!(decoded.host_turn_index, Some(7));
    }
}