spool-memory 0.2.3

//! Claude Code session transcript parser.
//!
//! Claude Code persists every session as line-delimited JSON under
//! `~/.claude/projects/<sanitized-cwd>/<session-id>.jsonl`. The exact
//! schema isn't formally documented and evolves between Claude Code
//! releases; we therefore use a permissive parser that:
//!
//! 1. Reads each line as a generic `serde_json::Value`.
//! 2. Maps known shapes onto [`TranscriptEntry`] variants.
//! 3. Skips malformed (non-JSON) lines with a stderr warning (mirroring
//!    `LifecycleStore::read_all` policy); well-formed but unrecognized
//!    shapes are kept as `TranscriptEntry::Other`.
//!
//! ## Recognized shapes (Claude Code 2026-04+)
//! - `{"type":"user","message":{"role":"user","content":<str|arr>}}`
//! - `{"type":"assistant","message":{"role":"assistant","content":<str|arr>}}`
//! - `{"type":"tool_use","name":<str>,"input":<obj>}` (and the
//!   nested-in-assistant-content variant)
//! - `{"type":"tool_result","content":<str|arr>}`
//! - everything else → `TranscriptEntry::Other` (preserved verbatim
//!   so distill heuristics can still see the raw shape if needed)
//!
//! Each entry exposes a normalized `text()` view (concatenated string
//! content) so heuristics don't have to re-walk the message tree.
//!
//! ## What we deliberately do NOT do
//! - We don't try to reconstruct turn boundaries (the assistant may
//!   stream multiple `assistant` rows for one turn; heuristics handle
//!   that).
//! - We don't merge tool_use / tool_result pairs — the distill layer
//!   does, after redaction.
//! - We don't force callers to keep the *whole* file's lines in memory
//!   for huge sessions — `read_tail` buffers only the last N raw lines.

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};

/// Compute the directory in which Claude Code keeps transcripts for `cwd`.
///
/// The directory name is `cwd` with every `/`, `\` and `:` replaced by
/// `-`. Nothing is stripped, so a leading separator becomes a leading
/// `-`. e.g. `/Users/long/Work/spool` →
/// `<home>/.claude/projects/-Users-long-Work-spool/`.
pub fn project_dir_for(cwd: &Path, home: &Path) -> PathBuf {
    // Both separator styles (and the Windows drive colon) are mapped
    // explicitly rather than relying on the host's separator, so
    // synthetic paths used in tests sanitize the same way everywhere.
    let dir_name: String = cwd
        .to_string_lossy()
        .chars()
        .map(|ch| if matches!(ch, '/' | '\\' | ':') { '-' } else { ch })
        .collect();

    let mut dir = home.to_path_buf();
    dir.push(".claude");
    dir.push("projects");
    dir.push(dir_name);
    dir
}

/// Locate the `.jsonl` transcript with the newest modification time in
/// `project_dir_for(cwd, home)`.
///
/// Yields `None` when:
/// - the project directory is absent (no Claude Code session has been
///   recorded for this cwd yet),
/// - the directory cannot be listed, OR
/// - it holds no `.jsonl` file whose modification time can be read.
///
/// The Stop hook falls back to this when Claude Code's stdin payload
/// carries no `transcript_path` (older versions or non-standard
/// invocation).
pub fn find_latest_for_cwd(cwd: &Path, home: &Path) -> Option<PathBuf> {
    let project_dir = project_dir_for(cwd, home);
    if !project_dir.exists() {
        return None;
    }

    std::fs::read_dir(&project_dir)
        .ok()?
        // Directory entries that fail to read are silently dropped.
        .flatten()
        .filter_map(|entry| {
            let candidate = entry.path();
            if candidate.extension().and_then(|ext| ext.to_str()) != Some("jsonl") {
                return None;
            }
            // Files whose metadata / mtime is unavailable are ignored.
            let mtime = entry.metadata().and_then(|meta| meta.modified()).ok()?;
            Some((mtime, candidate))
        })
        // On equal timestamps the entry seen first is kept.
        .reduce(|best, next| if best.0 >= next.0 { best } else { next })
        .map(|(_, newest)| newest)
}

/// One parsed line from a transcript file.
///
/// Serialized with an internal `kind` tag whose value is the
/// snake_case variant name (see the `serde` attribute below).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum TranscriptEntry {
    /// User turn. `text` is the full content (including tool_result
    /// blocks). `authored` is only the user-typed text blocks —
    /// tool_result content is excluded. Use `authored` for self-tag
    /// heuristics; use `text` for sampling excerpts.
    User {
        text: String,
        authored: String,
    },
    /// Assistant row. `text` is the flattened message content; inline
    /// tool_use / tool_result blocks appear as synthetic markers.
    Assistant {
        text: String,
    },
    /// Top-level tool invocation. `name` is the tool name (empty when
    /// the row has none); `text` is the JSON-serialized `input`
    /// (empty when the row has no `input`).
    ToolUse {
        name: String,
        text: String,
    },
    /// Top-level tool result. `text` is the flattened `content`.
    ToolResult {
        text: String,
    },
    /// Any well-formed JSON row whose `type` is not recognized; the
    /// parsed value is kept untouched.
    Other {
        raw: Value,
    },
}

impl TranscriptEntry {
    /// Flat text view used by heuristics. For `Other` we serialize
    /// the raw value so heuristics can still grep across unknown
    /// shapes if they want (cheap; it's already a JSON value).
    pub fn text(&self) -> String {
        match self {
            TranscriptEntry::User { text, .. }
            | TranscriptEntry::Assistant { text }
            | TranscriptEntry::ToolResult { text } => text.clone(),
            TranscriptEntry::ToolUse { name, text } => {
                if text.is_empty() {
                    name.clone()
                } else {
                    format!("{name}: {text}")
                }
            }
            TranscriptEntry::Other { raw } => raw.to_string(),
        }
    }

    /// Only the user-authored text in a User turn — tool_result
    /// content blocks are excluded. Returns the same as `text()` for
    /// non-User variants. Used by self-tag heuristics to avoid
    /// scanning large tool output for memory markers.
    pub fn authored_text(&self) -> &str {
        match self {
            TranscriptEntry::User { authored, .. } => authored.as_str(),
            TranscriptEntry::Assistant { text } | TranscriptEntry::ToolResult { text } => {
                text.as_str()
            }
            TranscriptEntry::ToolUse { text, .. } => text.as_str(),
            TranscriptEntry::Other { .. } => "",
        }
    }

    pub fn role_tag(&self) -> &'static str {
        match self {
            TranscriptEntry::User { .. } => "user",
            TranscriptEntry::Assistant { .. } => "assistant",
            TranscriptEntry::ToolUse { .. } => "tool_use",
            TranscriptEntry::ToolResult { .. } => "tool_result",
            TranscriptEntry::Other { .. } => "other",
        }
    }
}

/// Parse the entire transcript at `path` into memory.
///
/// Delegates to [`read_tail`] with `usize::MAX` as the line limit, so
/// it shares that function's policy: a missing file yields an empty
/// vector, and lines that fail to parse are skipped (and reported to
/// stderr) while every line that does parse is returned in file order.
pub fn read_all(path: &Path) -> Result<Vec<TranscriptEntry>> {
    // `usize::MAX` means "no limit" for `read_tail`.
    read_tail(path, usize::MAX)
}

/// Parse at most `max_lines` non-blank raw lines from the **end** of
/// the transcript. Useful for distill heuristics that only care about
/// recent turns — avoids keeping multi-MB transcripts in memory.
///
/// Implementation: reads the whole file line-by-line but only keeps
/// the last `max_lines` raw strings before parsing, so memory usage
/// is bounded by `max_lines` even for huge files.
///
/// Behavior details:
/// - A missing file yields `Ok(vec![])`.
/// - `max_lines == 0` yields `Ok(vec![])` (the file is still opened so
///   open failures are reported).
/// - Blank lines are ignored and do not count toward `max_lines`.
/// - A line that is not valid UTF-8 is skipped with a stderr warning.
///   Any other read error stops reading (also with a warning) and the
///   lines collected so far are parsed; retrying would spin forever on
///   a persistent error such as `path` being a directory.
/// - Lines that are not valid JSON are skipped with a stderr warning
///   that names their 1-based line number.
///
/// # Errors
/// Returns an error only when the file exists but cannot be opened.
pub fn read_tail(path: &Path, max_lines: usize) -> Result<Vec<TranscriptEntry>> {
    if !path.exists() {
        return Ok(Vec::new());
    }
    let file = File::open(path).with_context(|| format!("opening {}", path.display()))?;
    if max_lines == 0 {
        // Nothing requested; without this guard the ring below would
        // still retain one line (pop on empty, then push).
        return Ok(Vec::new());
    }
    let reader = BufReader::new(file);

    // Ring buffer of the last `max_lines` non-empty lines, each paired
    // with its 1-based line number for accurate diagnostics.
    let mut ring: std::collections::VecDeque<(usize, String)> =
        std::collections::VecDeque::new();
    for (idx, line) in reader.lines().enumerate() {
        let line_no = idx + 1;
        let raw = match line {
            Ok(raw) => raw,
            Err(err) => {
                eprintln!(
                    "[spool transcript] read error at {}:{line_no}: {err}",
                    path.display()
                );
                if err.kind() == std::io::ErrorKind::InvalidData {
                    // Non-UTF-8 line: its bytes were consumed, so the
                    // next iteration makes progress.
                    continue;
                }
                // Persistent I/O failure: the iterator would keep
                // yielding the same error, so stop reading here.
                break;
            }
        };
        if raw.trim().is_empty() {
            continue;
        }
        // With `usize::MAX` the length never reaches the limit, so
        // nothing is ever evicted.
        if ring.len() == max_lines {
            ring.pop_front();
        }
        ring.push_back((line_no, raw));
    }

    let mut entries = Vec::with_capacity(ring.len());
    for (line_no, raw) in ring {
        match parse_line(&raw) {
            Some(entry) => entries.push(entry),
            None => {
                eprintln!(
                    "[spool transcript] malformed line at {}:{line_no}",
                    path.display()
                );
            }
        }
    }
    Ok(entries)
}

/// Parse a single JSONL line. Returns `None` for malformed JSON;
/// returns `Some(Other)` for parseable JSON we don't recognize so the
/// caller can still inspect it.
pub fn parse_line(raw: &str) -> Option<TranscriptEntry> {
    let value: Value = serde_json::from_str(raw).ok()?;
    Some(value_to_entry(value))
}

fn value_to_entry(value: Value) -> TranscriptEntry {
    let kind = value.get("type").and_then(|v| v.as_str()).unwrap_or("");
    match kind {
        "user" => {
            let text = extract_message_text(&value);
            let authored = extract_user_authored_text(&value);
            TranscriptEntry::User { text, authored }
        }
        "assistant" => {
            let text = extract_message_text(&value);
            TranscriptEntry::Assistant { text }
        }
        "tool_use" => {
            let name = value
                .get("name")
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let text = extract_tool_use_text(&value);
            TranscriptEntry::ToolUse { name, text }
        }
        "tool_result" => {
            let text = extract_tool_result_text(&value);
            TranscriptEntry::ToolResult { text }
        }
        _ => TranscriptEntry::Other { raw: value },
    }
}

/// Pull the textual payload out of a user/assistant message envelope.
/// The Claude Code shape is one of:
/// - `{"type":"user","message":{"content":"hello"}}`
/// - `{"type":"user","message":{"content":[{"type":"text","text":"hello"}]}}`
/// - `{"type":"assistant","message":{"content":[{"type":"tool_use",...},{"type":"text","text":"…"}]}}`
fn extract_message_text(value: &Value) -> String {
    let content = match value.get("message").and_then(|m| m.get("content")) {
        Some(c) => c,
        None => match value.get("content") {
            Some(c) => c,
            None => return String::new(),
        },
    };
    extract_content_text(content)
}

fn extract_content_text(content: &Value) -> String {
    match content {
        Value::String(s) => s.clone(),
        Value::Array(items) => {
            let mut buf = String::new();
            for item in items {
                let item_type = item.get("type").and_then(|v| v.as_str()).unwrap_or("");
                match item_type {
                    "text" => {
                        if let Some(t) = item.get("text").and_then(|v| v.as_str()) {
                            if !buf.is_empty() {
                                buf.push('\n');
                            }
                            buf.push_str(t);
                        }
                    }
                    // Inline tool_use / tool_result inside an
                    // assistant content array: we surface them as
                    // synthetic markers so heuristics can still grep
                    // for tool names without losing context.
                    "tool_use" => {
                        let name = item.get("name").and_then(|v| v.as_str()).unwrap_or("");
                        if !buf.is_empty() {
                            buf.push('\n');
                        }
                        buf.push_str(&format!("<tool_use:{name}>"));
                    }
                    "tool_result" => {
                        let inner = item.get("content").map(extract_content_text);
                        if !buf.is_empty() {
                            buf.push('\n');
                        }
                        buf.push_str("<tool_result>");
                        if let Some(t) = inner {
                            buf.push('\n');
                            buf.push_str(&t);
                        }
                    }
                    _ => {}
                }
            }
            buf
        }
        _ => String::new(),
    }
}

/// Variant of [`extract_message_text`] that keeps only `text`-typed
/// content blocks. `tool_result`, `tool_use` and any other block types
/// are dropped so self-tag heuristics don't scan large tool output for
/// memory markers. A plain string content is returned unchanged.
fn extract_user_authored_text(value: &Value) -> String {
    let Some(content) = value
        .get("message")
        .and_then(|envelope| envelope.get("content"))
        .or_else(|| value.get("content"))
    else {
        return String::new();
    };

    let blocks = match content {
        Value::String(plain) => return plain.clone(),
        Value::Array(blocks) => blocks,
        _ => return String::new(),
    };

    blocks
        .iter()
        .filter(|block| block.get("type").and_then(Value::as_str) == Some("text"))
        .filter_map(|block| block.get("text").and_then(Value::as_str))
        .fold(String::new(), |mut joined, piece| {
            // Separator only once something has been written already.
            if !joined.is_empty() {
                joined.push('\n');
            }
            joined.push_str(piece);
            joined
        })
}

/// JSON-serialize the `input` of a top-level `tool_use` row; empty
/// when the row has no `input` key.
fn extract_tool_use_text(value: &Value) -> String {
    value
        .get("input")
        .map(|input| input.to_string())
        .unwrap_or_default()
}

fn extract_tool_result_text(value: &Value) -> String {
    if let Some(content) = value.get("content") {
        return extract_content_text(content);
    }
    String::new()
}

// Unit tests for path sanitizing, single-line parsing, whole-file
// reading and latest-transcript discovery.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use std::fs;
    use tempfile::tempdir;

    // `/` in the cwd becomes `-`, including the leading one.
    #[test]
    fn project_dir_substitutes_separators() {
        let cwd = Path::new("/Users/long/Work/spool");
        let home = Path::new("/Users/long");
        let dir = project_dir_for(cwd, home);
        assert_eq!(
            dir,
            Path::new("/Users/long/.claude/projects/-Users-long-Work-spool")
        );
    }

    // A user row whose content is a plain string maps to `User`.
    #[test]
    fn parse_line_recognizes_string_user_message() {
        let raw = json!({"type":"user","message":{"role":"user","content":"hello"}}).to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::User { text, .. } => assert_eq!(text, "hello"),
            _ => panic!("expected User entry"),
        }
    }

    // A user row whose content is an array of text blocks keeps every
    // block's text.
    #[test]
    fn parse_line_recognizes_array_user_message() {
        let raw = json!({
            "type": "user",
            "message": {
                "role": "user",
                "content": [
                    {"type": "text", "text": "first"},
                    {"type": "text", "text": "second"}
                ]
            }
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        assert_eq!(entry.role_tag(), "user");
        assert!(entry.text().contains("first"));
        assert!(entry.text().contains("second"));
    }

    // An inline tool_use block inside assistant content shows up as a
    // `<tool_use:NAME>` marker next to the text.
    #[test]
    fn parse_line_recognizes_assistant_with_tool_use() {
        let raw = json!({
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": "running command"},
                    {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}}
                ]
            }
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        let text = entry.text();
        assert!(text.contains("running command"));
        assert!(text.contains("<tool_use:Bash>"));
    }

    // A top-level tool_use row keeps the tool name and serializes its
    // input into `text`.
    #[test]
    fn parse_line_recognizes_tool_use_top_level() {
        let raw = json!({
            "type": "tool_use",
            "name": "Edit",
            "input": {"path": "/tmp/x", "content": "data"}
        })
        .to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::ToolUse { name, text } => {
                assert_eq!(name, "Edit");
                assert!(text.contains("/tmp/x"));
            }
            _ => panic!("expected ToolUse"),
        }
    }

    // A top-level tool_result row with string content maps to
    // `ToolResult` carrying that string.
    #[test]
    fn parse_line_recognizes_tool_result_with_string() {
        let raw = json!({"type":"tool_result","content":"ok"}).to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::ToolResult { text } => assert_eq!(text, "ok"),
            _ => panic!("expected ToolResult"),
        }
    }

    // Valid JSON with an unrecognized `type` is preserved as `Other`.
    #[test]
    fn parse_line_returns_other_for_unknown_kind() {
        let raw = json!({"type":"compact","summary":"…"}).to_string();
        let entry = parse_line(&raw).unwrap();
        match entry {
            TranscriptEntry::Other { raw } => {
                assert_eq!(raw["type"], "compact");
            }
            _ => panic!("expected Other"),
        }
    }

    // Input that is not valid JSON yields `None`.
    #[test]
    fn parse_line_returns_none_for_malformed_json() {
        assert!(parse_line("{ broken").is_none());
        assert!(parse_line("not json at all").is_none());
    }

    // A path that does not exist is treated as an empty transcript,
    // not as an error.
    #[test]
    fn read_all_returns_empty_for_missing_file() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("absent.jsonl");
        let entries = read_all(&path).unwrap();
        assert!(entries.is_empty());
    }

    // Malformed and blank lines between valid rows are dropped while
    // the valid rows keep their order.
    #[test]
    fn read_all_skips_malformed_lines_and_keeps_valid() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("session.jsonl");
        let user = json!({"type":"user","message":{"content":"first"}}).to_string();
        let assistant = json!({"type":"assistant","message":{"content":"second"}}).to_string();
        fs::write(
            &path,
            format!("{user}\n{{ broken\n\nthis isn't json\n{assistant}\n"),
        )
        .unwrap();

        let entries = read_all(&path).unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].role_tag(), "user");
        assert_eq!(entries[1].role_tag(), "assistant");
    }

    // Every variant reports its own static role tag.
    #[test]
    fn role_tag_matches_variant() {
        assert_eq!(
            TranscriptEntry::User {
                text: "x".into(),
                authored: "x".into()
            }
            .role_tag(),
            "user"
        );
        assert_eq!(
            TranscriptEntry::Assistant { text: "x".into() }.role_tag(),
            "assistant"
        );
        assert_eq!(
            TranscriptEntry::ToolUse {
                name: "n".into(),
                text: "t".into()
            }
            .role_tag(),
            "tool_use"
        );
        assert_eq!(
            TranscriptEntry::ToolResult { text: "x".into() }.role_tag(),
            "tool_result"
        );
        assert_eq!(
            TranscriptEntry::Other { raw: json!({}) }.role_tag(),
            "other"
        );
    }

    // No project directory on disk → no transcript.
    #[test]
    fn find_latest_for_cwd_returns_none_when_dir_missing() {
        let temp = tempdir().unwrap();
        let cwd = temp.path().join("repo");
        let home = temp.path().join("home");
        let result = find_latest_for_cwd(&cwd, &home);
        assert!(result.is_none());
    }

    // A project directory holding only non-`.jsonl` files → no
    // transcript.
    #[test]
    fn find_latest_for_cwd_returns_none_when_dir_has_no_jsonl() {
        let temp = tempdir().unwrap();
        let cwd = temp.path().join("repo");
        let home = temp.path().join("home");
        let proj_dir = project_dir_for(&cwd, &home);
        std::fs::create_dir_all(&proj_dir).unwrap();
        std::fs::write(proj_dir.join("readme.txt"), "not jsonl").unwrap();
        let result = find_latest_for_cwd(&cwd, &home);
        assert!(result.is_none());
    }

    // With two `.jsonl` files, the one written later is returned.
    #[test]
    fn find_latest_for_cwd_picks_most_recently_modified() {
        let temp = tempdir().unwrap();
        let cwd = temp.path().join("repo");
        let home = temp.path().join("home");
        let proj_dir = project_dir_for(&cwd, &home);
        std::fs::create_dir_all(&proj_dir).unwrap();

        let older = proj_dir.join("session-1.jsonl");
        let newer = proj_dir.join("session-2.jsonl");
        std::fs::write(&older, "{}\n").unwrap();
        // Ensure a measurable mtime delta — sleep is brittle on
        // ultrafast filesystems but 50ms is conservative.
        std::thread::sleep(std::time::Duration::from_millis(50));
        std::fs::write(&newer, "{}\n").unwrap();

        let result = find_latest_for_cwd(&cwd, &home).unwrap();
        assert_eq!(result, newer);
    }
}