Skip to main content

spool/distill/
transcript.rs

1//! Claude Code session transcript parser.
2//!
3//! Claude Code persists every session as line-delimited JSON under
4//! `~/.claude/projects/<sanitized-cwd>/<session-id>.jsonl`. The exact
5//! schema isn't formally documented and evolves between Claude Code
6//! releases; we therefore use a permissive parser that:
7//!
8//! 1. Reads each line as a generic `serde_json::Value`.
9//! 2. Maps known shapes onto [`TranscriptEntry`] variants.
//! 3. Skips malformed (non-JSON) lines with a stderr warning (mirroring
//!    `LifecycleStore::read_all` policy); valid JSON of an unrecognized
//!    shape is kept as `TranscriptEntry::Other` rather than skipped.
12//!
13//! ## Recognized shapes (Claude Code 2026-04+)
14//! - `{"type":"user","message":{"role":"user","content":<str|arr>}}`
15//! - `{"type":"assistant","message":{"role":"assistant","content":<str|arr>}}`
16//! - `{"type":"tool_use","name":<str>,"input":<obj>}` (and the
17//!   nested-in-assistant-content variant)
18//! - `{"type":"tool_result","content":<str|arr>}`
19//! - everything else → `TranscriptEntry::Other` (preserved verbatim
20//!   so distill heuristics can still see the raw shape if needed)
21//!
22//! Each entry exposes a normalized `text()` view (concatenated string
23//! content) so heuristics don't have to re-walk the message tree.
24//!
25//! ## What we deliberately do NOT do
26//! - We don't try to reconstruct turn boundaries (the assistant may
27//!   stream multiple `assistant` rows for one turn; heuristics handle
28//!   that).
29//! - We don't merge tool_use / tool_result pairs — the distill layer
30//!   does, after redaction.
//! - We don't force callers to hold the *whole* file in memory for huge
//!   sessions — `read_tail` keeps only the last N raw lines while scanning.
33
34use anyhow::{Context, Result};
35use serde::{Deserialize, Serialize};
36use serde_json::Value;
37use std::fs::File;
38use std::io::{BufRead, BufReader};
39use std::path::{Path, PathBuf};
40
/// Compute the directory under `home` where transcripts for `cwd` live.
///
/// The project folder name is `cwd` rendered as text (lossily, if it is
/// not valid UTF-8) with every `/`, `\` and `:` replaced by `-`. A leading
/// slash is therefore turned into a leading `-`, not removed, e.g.
/// `/Users/long/Work/spool` → `~/.claude/projects/-Users-long-Work-spool/`.
///
/// Both separator styles are handled regardless of the host platform so
/// the function behaves the same for synthetic paths used in tests.
pub fn project_dir_for(cwd: &Path, home: &Path) -> PathBuf {
    let folder_name: String = cwd
        .to_string_lossy()
        .chars()
        .map(|c| if matches!(c, '/' | '\\' | ':') { '-' } else { c })
        .collect();

    let mut dir = home.join(".claude");
    dir.push("projects");
    dir.push(folder_name);
    dir
}
61
62/// Find the most recently modified `.jsonl` transcript under
63/// `project_dir_for(cwd, home)`. Returns `None` when:
64/// - the project directory doesn't exist (no Claude Code session yet
65///   for this cwd), OR
66/// - the directory has no `.jsonl` files.
67///
68/// Used by Stop hook as a fallback when Claude Code's stdin payload
69/// doesn't include `transcript_path` (older versions or non-standard
70/// invocation).
71pub fn find_latest_for_cwd(cwd: &Path, home: &Path) -> Option<PathBuf> {
72    let dir = project_dir_for(cwd, home);
73    if !dir.exists() {
74        return None;
75    }
76    let mut latest: Option<(std::time::SystemTime, PathBuf)> = None;
77    let entries = std::fs::read_dir(&dir).ok()?;
78    for entry in entries.flatten() {
79        let path = entry.path();
80        if path.extension().and_then(|s| s.to_str()) != Some("jsonl") {
81            continue;
82        }
83        let modified = match entry.metadata().and_then(|m| m.modified()) {
84            Ok(m) => m,
85            Err(_) => continue,
86        };
87        match &latest {
88            Some((existing, _)) if *existing >= modified => {}
89            _ => latest = Some((modified, path)),
90        }
91    }
92    latest.map(|(_, p)| p)
93}
94
/// One parsed line from a transcript file.
///
/// Serialized by serde as an internally tagged enum: the variant is
/// written to a `"kind"` field using snake_case names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum TranscriptEntry {
    /// User turn. `text` is the full content (including tool_result
    /// blocks). `authored` is only the user-typed text blocks —
    /// tool_result content is excluded. Use `authored` for self-tag
    /// heuristics; use `text` for sampling excerpts.
    User {
        text: String,
        authored: String,
    },
    /// Assistant row. `text` is the flattened message content; inline
    /// tool_use / tool_result blocks appear as `<tool_use:NAME>` /
    /// `<tool_result>` markers.
    Assistant {
        text: String,
    },
    /// Top-level tool invocation. `name` is the row's `"name"` field
    /// (empty when absent); `text` is the `"input"` value serialized
    /// as JSON (empty when absent).
    ToolUse {
        name: String,
        text: String,
    },
    /// Top-level tool result. `text` is the flattened `"content"`.
    ToolResult {
        text: String,
    },
    /// Any valid JSON line whose `"type"` is not one of the recognized
    /// kinds; the parsed value is kept verbatim.
    Other {
        raw: Value,
    },
}
121
122impl TranscriptEntry {
123    /// Flat text view used by heuristics. For `Other` we serialize
124    /// the raw value so heuristics can still grep across unknown
125    /// shapes if they want (cheap; it's already a JSON value).
126    pub fn text(&self) -> String {
127        match self {
128            TranscriptEntry::User { text, .. }
129            | TranscriptEntry::Assistant { text }
130            | TranscriptEntry::ToolResult { text } => text.clone(),
131            TranscriptEntry::ToolUse { name, text } => {
132                if text.is_empty() {
133                    name.clone()
134                } else {
135                    format!("{name}: {text}")
136                }
137            }
138            TranscriptEntry::Other { raw } => raw.to_string(),
139        }
140    }
141
142    /// Only the user-authored text in a User turn — tool_result
143    /// content blocks are excluded. Returns the same as `text()` for
144    /// non-User variants. Used by self-tag heuristics to avoid
145    /// scanning large tool output for memory markers.
146    pub fn authored_text(&self) -> &str {
147        match self {
148            TranscriptEntry::User { authored, .. } => authored.as_str(),
149            TranscriptEntry::Assistant { text } | TranscriptEntry::ToolResult { text } => {
150                text.as_str()
151            }
152            TranscriptEntry::ToolUse { text, .. } => text.as_str(),
153            TranscriptEntry::Other { .. } => "",
154        }
155    }
156
157    pub fn role_tag(&self) -> &'static str {
158        match self {
159            TranscriptEntry::User { .. } => "user",
160            TranscriptEntry::Assistant { .. } => "assistant",
161            TranscriptEntry::ToolUse { .. } => "tool_use",
162            TranscriptEntry::ToolResult { .. } => "tool_result",
163            TranscriptEntry::Other { .. } => "other",
164        }
165    }
166}
167
168/// Parse the entire transcript at `path` into memory. Returns Ok with
169/// the parsed prefix even when corrupt lines are encountered (those
170/// are skipped and reported to stderr).
171pub fn read_all(path: &Path) -> Result<Vec<TranscriptEntry>> {
172    read_tail(path, usize::MAX)
173}
174
175/// Parse at most `max_lines` raw lines from the **end** of the
176/// transcript. Useful for distill heuristics that only care about
177/// recent turns — avoids loading multi-MB transcripts in full.
178///
179/// Implementation: reads the whole file line-by-line but only keeps
180/// the last `max_lines` raw strings before parsing, so memory usage
181/// is bounded by `max_lines` even for huge files.
182pub fn read_tail(path: &Path, max_lines: usize) -> Result<Vec<TranscriptEntry>> {
183    if !path.exists() {
184        return Ok(Vec::new());
185    }
186    let file = File::open(path).with_context(|| format!("opening {}", path.display()))?;
187    let reader = BufReader::new(file);
188
189    // Collect raw non-empty lines into a ring buffer of size max_lines.
190    let mut ring: std::collections::VecDeque<String> = std::collections::VecDeque::new();
191    for (idx, line) in reader.lines().enumerate() {
192        let line_no = idx + 1;
193        let raw = match line {
194            Ok(raw) => raw,
195            Err(err) => {
196                eprintln!(
197                    "[spool transcript] read error at {}:{line_no}: {err}",
198                    path.display()
199                );
200                continue;
201            }
202        };
203        if raw.trim().is_empty() {
204            continue;
205        }
206        if max_lines < usize::MAX && ring.len() >= max_lines {
207            ring.pop_front();
208        }
209        ring.push_back(raw);
210    }
211
212    let mut entries = Vec::with_capacity(ring.len());
213    for (i, raw) in ring.into_iter().enumerate() {
214        match parse_line(&raw) {
215            Some(entry) => entries.push(entry),
216            None => {
217                eprintln!(
218                    "[spool transcript] malformed line at {}:~{i}",
219                    path.display()
220                );
221            }
222        }
223    }
224    Ok(entries)
225}
226
227/// Parse a single JSONL line. Returns `None` for malformed JSON;
228/// returns `Some(Other)` for parseable JSON we don't recognize so the
229/// caller can still inspect it.
230pub fn parse_line(raw: &str) -> Option<TranscriptEntry> {
231    let value: Value = serde_json::from_str(raw).ok()?;
232    Some(value_to_entry(value))
233}
234
235fn value_to_entry(value: Value) -> TranscriptEntry {
236    let kind = value.get("type").and_then(|v| v.as_str()).unwrap_or("");
237    match kind {
238        "user" => {
239            let text = extract_message_text(&value);
240            let authored = extract_user_authored_text(&value);
241            TranscriptEntry::User { text, authored }
242        }
243        "assistant" => {
244            let text = extract_message_text(&value);
245            TranscriptEntry::Assistant { text }
246        }
247        "tool_use" => {
248            let name = value
249                .get("name")
250                .and_then(|v| v.as_str())
251                .unwrap_or("")
252                .to_string();
253            let text = extract_tool_use_text(&value);
254            TranscriptEntry::ToolUse { name, text }
255        }
256        "tool_result" => {
257            let text = extract_tool_result_text(&value);
258            TranscriptEntry::ToolResult { text }
259        }
260        _ => TranscriptEntry::Other { raw: value },
261    }
262}
263
264/// Pull the textual payload out of a user/assistant message envelope.
265/// The Claude Code shape is one of:
266/// - `{"type":"user","message":{"content":"hello"}}`
267/// - `{"type":"user","message":{"content":[{"type":"text","text":"hello"}]}}`
268/// - `{"type":"assistant","message":{"content":[{"type":"tool_use",...},{"type":"text","text":"…"}]}}`
269fn extract_message_text(value: &Value) -> String {
270    let content = match value.get("message").and_then(|m| m.get("content")) {
271        Some(c) => c,
272        None => match value.get("content") {
273            Some(c) => c,
274            None => return String::new(),
275        },
276    };
277    extract_content_text(content)
278}
279
280fn extract_content_text(content: &Value) -> String {
281    match content {
282        Value::String(s) => s.clone(),
283        Value::Array(items) => {
284            let mut buf = String::new();
285            for item in items {
286                let item_type = item.get("type").and_then(|v| v.as_str()).unwrap_or("");
287                match item_type {
288                    "text" => {
289                        if let Some(t) = item.get("text").and_then(|v| v.as_str()) {
290                            if !buf.is_empty() {
291                                buf.push('\n');
292                            }
293                            buf.push_str(t);
294                        }
295                    }
296                    // Inline tool_use / tool_result inside an
297                    // assistant content array: we surface them as
298                    // synthetic markers so heuristics can still grep
299                    // for tool names without losing context.
300                    "tool_use" => {
301                        let name = item.get("name").and_then(|v| v.as_str()).unwrap_or("");
302                        if !buf.is_empty() {
303                            buf.push('\n');
304                        }
305                        buf.push_str(&format!("<tool_use:{name}>"));
306                    }
307                    "tool_result" => {
308                        let inner = item.get("content").map(extract_content_text);
309                        if !buf.is_empty() {
310                            buf.push('\n');
311                        }
312                        buf.push_str("<tool_result>");
313                        if let Some(t) = inner {
314                            buf.push('\n');
315                            buf.push_str(&t);
316                        }
317                    }
318                    _ => {}
319                }
320            }
321            buf
322        }
323        _ => String::new(),
324    }
325}
326
327/// Like [`extract_message_text`] but only keeps `text`-typed content
328/// blocks. Skips `tool_result` and `tool_use` blocks so self-tag
329/// heuristics don't scan large tool output for memory markers.
330fn extract_user_authored_text(value: &Value) -> String {
331    let content = match value.get("message").and_then(|m| m.get("content")) {
332        Some(c) => c,
333        None => match value.get("content") {
334            Some(c) => c,
335            None => return String::new(),
336        },
337    };
338    match content {
339        Value::String(s) => s.clone(),
340        Value::Array(items) => {
341            let mut buf = String::new();
342            for item in items {
343                if item.get("type").and_then(|v| v.as_str()) == Some("text")
344                    && let Some(t) = item.get("text").and_then(|v| v.as_str())
345                {
346                    if !buf.is_empty() {
347                        buf.push('\n');
348                    }
349                    buf.push_str(t);
350                }
351            }
352            buf
353        }
354        _ => String::new(),
355    }
356}
357
358fn extract_tool_use_text(value: &Value) -> String {
359    if let Some(input) = value.get("input") {
360        return input.to_string();
361    }
362    String::new()
363}
364
365fn extract_tool_result_text(value: &Value) -> String {
366    if let Some(content) = value.get("content") {
367        return extract_content_text(content);
368    }
369    String::new()
370}
371
/// Unit tests for path resolution, line parsing and file reading.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use std::fs;
    use tempfile::tempdir;

    #[test]
    fn project_dir_substitutes_separators() {
        let cwd = Path::new("/Users/long/Work/spool");
        let home = Path::new("/Users/long");
        let dir = project_dir_for(cwd, home);
        assert_eq!(
            dir,
            Path::new("/Users/long/.claude/projects/-Users-long-Work-spool")
        );
    }

    #[test]
    fn parse_line_recognizes_string_user_message() {
        let raw = json!({"type":"user","message":{"role":"user","content":"hello"}}).to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::User { text, .. } => assert_eq!(text, "hello"),
            _ => panic!("expected User entry"),
        }
    }

    #[test]
    fn parse_line_recognizes_array_user_message() {
        let raw = json!({
            "type": "user",
            "message": {
                "role": "user",
                "content": [
                    {"type": "text", "text": "first"},
                    {"type": "text", "text": "second"}
                ]
            }
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        assert_eq!(entry.role_tag(), "user");
        assert!(entry.text().contains("first"));
        assert!(entry.text().contains("second"));
    }

    // `authored_text` must ignore tool_result blocks while `text` keeps
    // them behind a `<tool_result>` marker.
    #[test]
    fn user_authored_text_excludes_tool_result_blocks() {
        let raw = json!({
            "type": "user",
            "message": {
                "role": "user",
                "content": [
                    {"type": "text", "text": "remember this"},
                    {"type": "tool_result", "content": "big output"}
                ]
            }
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        assert_eq!(entry.authored_text(), "remember this");
        assert_eq!(entry.text(), "remember this\n<tool_result>\nbig output");
    }

    #[test]
    fn parse_line_recognizes_assistant_with_tool_use() {
        let raw = json!({
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": "running command"},
                    {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}}
                ]
            }
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        let text = entry.text();
        assert!(text.contains("running command"));
        assert!(text.contains("<tool_use:Bash>"));
    }

    #[test]
    fn parse_line_recognizes_tool_use_top_level() {
        let raw = json!({
            "type": "tool_use",
            "name": "Edit",
            "input": {"path": "/tmp/x", "content": "data"}
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::ToolUse { name, text } => {
                assert_eq!(name, "Edit");
                assert!(text.contains("/tmp/x"));
            }
            _ => panic!("expected ToolUse"),
        }
    }

    #[test]
    fn parse_line_recognizes_tool_result_with_string() {
        let raw = json!({"type":"tool_result","content":"ok"}).to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::ToolResult { text } => assert_eq!(text, "ok"),
            _ => panic!("expected ToolResult"),
        }
    }

    #[test]
    fn parse_line_returns_other_for_unknown_kind() {
        let raw = json!({"type":"compact","summary":"…"}).to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::Other { raw } => {
                assert_eq!(raw["type"], "compact");
            }
            _ => panic!("expected Other"),
        }
    }

    #[test]
    fn parse_line_returns_none_for_malformed_json() {
        assert!(parse_line("{ broken").is_none());
        assert!(parse_line("not json at all").is_none());
    }

    #[test]
    fn read_all_returns_empty_for_missing_file() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("absent.jsonl");
        let entries = read_all(&path).unwrap();
        assert!(entries.is_empty());
    }

    #[test]
    fn read_all_skips_malformed_lines_and_keeps_valid() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("session.jsonl");
        let user = json!({"type":"user","message":{"content":"first"}}).to_string();
        let assistant = json!({"type":"assistant","message":{"content":"second"}}).to_string();
        fs::write(
            &path,
            format!("{user}\n{{ broken\n\nthis isn't json\n{assistant}\n"),
        )
        .unwrap();

        let entries = read_all(&path).unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].role_tag(), "user");
        assert_eq!(entries[1].role_tag(), "assistant");
    }

    // Only the trailing `max_lines` non-blank lines are parsed.
    #[test]
    fn read_tail_keeps_only_last_lines() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("session.jsonl");
        let lines: Vec<String> = ["one", "two", "three"]
            .iter()
            .map(|body| json!({"type":"user","message":{"content": body}}).to_string())
            .collect();
        fs::write(&path, format!("{}\n", lines.join("\n"))).unwrap();

        let entries = read_tail(&path, 2).unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].text(), "two");
        assert_eq!(entries[1].text(), "three");
    }

    #[test]
    fn role_tag_matches_variant() {
        assert_eq!(
            TranscriptEntry::User {
                text: "x".into(),
                authored: "x".into()
            }
            .role_tag(),
            "user"
        );
        assert_eq!(
            TranscriptEntry::Assistant { text: "x".into() }.role_tag(),
            "assistant"
        );
        assert_eq!(
            TranscriptEntry::ToolUse {
                name: "n".into(),
                text: "t".into()
            }
            .role_tag(),
            "tool_use"
        );
        assert_eq!(
            TranscriptEntry::ToolResult { text: "x".into() }.role_tag(),
            "tool_result"
        );
        assert_eq!(
            TranscriptEntry::Other { raw: json!({}) }.role_tag(),
            "other"
        );
    }

    #[test]
    fn find_latest_for_cwd_returns_none_when_dir_missing() {
        let temp = tempdir().unwrap();
        let cwd = temp.path().join("repo");
        let home = temp.path().join("home");
        let result = find_latest_for_cwd(&cwd, &home);
        assert!(result.is_none());
    }

    #[test]
    fn find_latest_for_cwd_returns_none_when_dir_has_no_jsonl() {
        let temp = tempdir().unwrap();
        let cwd = temp.path().join("repo");
        let home = temp.path().join("home");
        let proj_dir = project_dir_for(&cwd, &home);
        std::fs::create_dir_all(&proj_dir).unwrap();
        std::fs::write(proj_dir.join("readme.txt"), "not jsonl").unwrap();
        let result = find_latest_for_cwd(&cwd, &home);
        assert!(result.is_none());
    }

    #[test]
    fn find_latest_for_cwd_picks_most_recently_modified() {
        /// Force `path`'s modification time to `when`.
        fn set_mtime(path: &Path, when: std::time::SystemTime) {
            fs::File::options()
                .write(true)
                .open(path)
                .unwrap()
                .set_modified(when)
                .unwrap();
        }

        let temp = tempdir().unwrap();
        let cwd = temp.path().join("repo");
        let home = temp.path().join("home");
        let proj_dir = project_dir_for(&cwd, &home);
        std::fs::create_dir_all(&proj_dir).unwrap();

        let older = proj_dir.join("session-1.jsonl");
        let newer = proj_dir.join("session-2.jsonl");
        std::fs::write(&older, "{}\n").unwrap();
        std::fs::write(&newer, "{}\n").unwrap();

        // Pin explicit, well-separated mtimes instead of sleeping between
        // writes: the ordering no longer depends on filesystem timestamp
        // granularity or scheduler timing.
        let now = std::time::SystemTime::now();
        set_mtime(&older, now - std::time::Duration::from_secs(120));
        set_mtime(&newer, now - std::time::Duration::from_secs(60));

        let result = find_latest_for_cwd(&cwd, &home).unwrap();
        assert_eq!(result, newer);
    }
}