mi6_core/input/transcript/
parser.rs

1//! Incremental transcript file parser.
2//!
3//! Parses Claude Code JSONL transcript files efficiently by tracking
4//! file positions and only reading new content.
5
6use std::collections::HashSet;
7use std::fs::File;
8use std::io::{BufRead, BufReader, Seek, SeekFrom};
9use std::path::Path;
10
11use crate::model::Event;
12use crate::model::error::TranscriptError;
13use crate::util::truncate_string;
14
15use super::{ToolInfoMap, TranscriptEntry};
16
/// Read cursor into a transcript file, carried from one incremental parse
/// to the next so that only newly appended content is read.
#[derive(Debug, Clone, Default)]
pub struct FilePosition {
    /// Byte offset in the file at which the next read should begin.
    pub offset: u64,
    /// Number of the last line processed (1-based; kept for debugging).
    pub line_number: u64,
    /// UUID of the last entry processed, if any (kept for validation).
    pub last_uuid: Option<String>,
}

impl FilePosition {
    /// Build a cursor pointing at the start of a file: offset 0, no lines
    /// processed and no UUID recorded.
    pub fn new() -> Self {
        Self::at_offset(0)
    }

    /// Build a cursor pointing at `offset`, with the line counter at zero
    /// and no UUID recorded.
    pub fn at_offset(offset: u64) -> Self {
        Self {
            offset,
            ..Self::default()
        }
    }
}
43
/// Result of parsing a transcript file.
///
/// Returned by `TranscriptParser::parse_incremental` and
/// `TranscriptParser::parse_full`. The three counters classify non-blank
/// lines only; blank lines are consumed without being counted anywhere.
#[derive(Debug)]
pub struct ParseResult {
    /// Events extracted from the transcript, in the order the entries were read
    pub events: Vec<Event>,
    /// New file position after parsing; pass it to the next incremental
    /// parse to continue where this one stopped
    pub position: FilePosition,
    /// UUIDs seen during this parse (for deduplication). Every entry that
    /// deserialized successfully contributes its UUID, including entries
    /// that were then skipped
    pub uuids: HashSet<String>,
    /// Number of lines parsed into events (entries that were not skipped)
    pub lines_parsed: u64,
    /// Number of lines skipped (non-message entries like file-history-snapshot)
    pub lines_skipped: u64,
    /// Number of non-blank lines rejected because they did not deserialize
    /// as a transcript entry
    pub parse_errors: u64,
}
60
61/// Parser for Claude Code transcript JSONL files.
62pub struct TranscriptParser {
63    machine_id: String,
64}
65
66impl TranscriptParser {
67    /// Create a new parser with the given machine ID.
68    pub fn new(machine_id: impl Into<String>) -> Self {
69        Self {
70            machine_id: machine_id.into(),
71        }
72    }
73
74    /// Parse new entries from a transcript file starting at the given position.
75    ///
76    /// Returns events, updated position, and UUIDs seen.
77    pub fn parse_incremental(
78        &self,
79        path: &Path,
80        start_position: &FilePosition,
81    ) -> Result<ParseResult, TranscriptError> {
82        let file = File::open(path).map_err(|e| {
83            if e.kind() == std::io::ErrorKind::NotFound {
84                TranscriptError::NotFound(path.display().to_string())
85            } else {
86                TranscriptError::Io(e)
87            }
88        })?;
89
90        let file_len = file.metadata()?.len();
91
92        // If position is past file end, file may have been truncated - reset
93        let start_offset = if start_position.offset > file_len {
94            0
95        } else {
96            start_position.offset
97        };
98
99        let mut reader = BufReader::new(file);
100        reader.seek(SeekFrom::Start(start_offset))?;
101
102        let mut events = Vec::new();
103        let mut uuids = HashSet::new();
104        let mut current_offset = start_offset;
105        let mut line_number = start_position.line_number;
106        let mut last_uuid = start_position.last_uuid.clone();
107        let mut lines_parsed = 0u64;
108        let mut lines_skipped = 0u64;
109        let mut parse_errors = 0u64;
110
111        // Track tool info across entries to enrich PostToolUse events
112        let mut tool_info_map: ToolInfoMap = Default::default();
113
114        let mut line = String::new();
115        loop {
116            line.clear();
117            let bytes_read = reader.read_line(&mut line)?;
118            if bytes_read == 0 {
119                break; // EOF
120            }
121
122            let line_bytes = bytes_read as u64;
123            line_number += 1;
124
125            // Skip empty lines
126            let trimmed = line.trim();
127            if trimmed.is_empty() {
128                current_offset += line_bytes;
129                continue;
130            }
131
132            // Try to parse as TranscriptEntry
133            match serde_json::from_str::<TranscriptEntry>(trimmed) {
134                Ok(entry) => {
135                    // Track UUID for deduplication
136                    uuids.insert(entry.uuid.clone());
137                    last_uuid = Some(entry.uuid.clone());
138
139                    // Skip non-message entries
140                    if entry.should_skip() {
141                        lines_skipped += 1;
142                    } else {
143                        // Convert to events, passing tool_info_map for cross-entry tracking
144                        let entry_events = entry.into_events(&self.machine_id, &mut tool_info_map);
145                        events.extend(entry_events);
146                        lines_parsed += 1;
147                    }
148                }
149                Err(_) => {
150                    // Track parse errors separately from intentionally skipped entries
151                    // This helps identify format changes or corruption vs expected non-message entries
152                    parse_errors += 1;
153                }
154            }
155
156            current_offset += line_bytes;
157        }
158
159        let new_position = FilePosition {
160            offset: current_offset,
161            line_number,
162            last_uuid,
163        };
164
165        Ok(ParseResult {
166            events,
167            position: new_position,
168            uuids,
169            lines_parsed,
170            lines_skipped,
171            parse_errors,
172        })
173    }
174
175    /// Parse entire transcript file from the beginning.
176    pub fn parse_full(&self, path: &Path) -> Result<ParseResult, TranscriptError> {
177        self.parse_incremental(path, &FilePosition::new())
178    }
179}
180
181/// Extract the first user prompt from a transcript file.
182///
183/// This is a lightweight operation that reads only until it finds the first
184/// user message with text content. Used as a fallback when the UserPromptSubmit
185/// hook doesn't fire (e.g., when Claude is started with an inline prompt like
186/// `claude 'initial prompt'`).
187///
188/// Returns `None` if:
189/// - The file doesn't exist or can't be read
190/// - No user message with text content is found
191/// - The first user message is a tool result (no prompt text)
192pub fn extract_first_prompt(path: &Path) -> Option<String> {
193    let file = File::open(path).ok()?;
194    let reader = BufReader::new(file);
195
196    for line in reader.lines() {
197        let line = line.ok()?;
198        let trimmed = line.trim();
199        if trimmed.is_empty() {
200            continue;
201        }
202
203        // Try to parse as TranscriptEntry
204        if let Ok(entry) = serde_json::from_str::<TranscriptEntry>(trimmed) {
205            // Only look at non-sidechain user entries
206            if entry.entry_type == "user" && !entry.is_sidechain {
207                // Extract prompt text if available
208                if let Some(ref message) = entry.message
209                    && let Some(ref content) = message.content
210                    && let Some(text) = content.text()
211                    && !text.is_empty()
212                {
213                    return Some(truncate_string(text, 1000));
214                }
215            }
216        }
217    }
218
219    None
220}
221
// Unit tests for `TranscriptParser` and `extract_first_prompt`. Each test
// writes its fixture to a temporary file and parses it back by path.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `content` to a new named temporary file, flush it, and return
    /// the handle so the test can read the file back through `file.path()`.
    fn create_test_file(content: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(content.as_bytes()).unwrap();
        file.flush().unwrap();
        file
    }

    // An empty file yields no events and no parsed lines.
    #[test]
    fn test_parse_empty_file() {
        let file = create_test_file("");
        let parser = TranscriptParser::new("machine-1");
        let result = parser.parse_full(file.path()).unwrap();

        assert!(result.events.is_empty());
        assert_eq!(result.lines_parsed, 0);
    }

    // A single assistant entry (no trailing newline) yields one event and
    // records its UUID.
    #[test]
    fn test_parse_single_entry() {
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}"#;

        let file = create_test_file(content);
        let parser = TranscriptParser::new("machine-1");
        let result = parser.parse_full(file.path()).unwrap();

        assert_eq!(result.events.len(), 1);
        assert_eq!(result.lines_parsed, 1);
        assert!(result.uuids.contains("1"));
    }

    // Resuming from a saved position returns only content appended after it.
    #[test]
    fn test_incremental_parsing() {
        let line1 = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}"#;
        let line2 = r#"{"sessionId":"test","uuid":"2","isSidechain":false,"timestamp":"2025-01-01T00:00:01Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":200,"output_tokens":100}}}"#;

        let content = format!("{}\n{}\n", line1, line2);
        let file = create_test_file(&content);
        let parser = TranscriptParser::new("machine-1");

        // First parse - reads both lines already in the file
        let result1 = parser.parse_full(file.path()).unwrap();
        assert_eq!(result1.events.len(), 2);

        // Append new content
        let mut f = std::fs::OpenOptions::new()
            .append(true)
            .open(file.path())
            .unwrap();
        let line3 = r#"{"sessionId":"test","uuid":"3","isSidechain":false,"timestamp":"2025-01-01T00:00:02Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":300,"output_tokens":150}}}"#;
        writeln!(f, "{}", line3).unwrap();

        // Incremental parse from saved position - only the appended line is new
        let result2 = parser
            .parse_incremental(file.path(), &result1.position)
            .unwrap();
        assert_eq!(result2.events.len(), 1);
        assert!(result2.uuids.contains("3"));
    }

    // A file-history-snapshot entry is counted as skipped, not as parsed.
    #[test]
    fn test_skip_non_message_entries() {
        let content = r#"{"type":"file-history-snapshot","sessionId":"test","uuid":"snap1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z"}
{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}"#;

        let file = create_test_file(content);
        let parser = TranscriptParser::new("machine-1");
        let result = parser.parse_full(file.path()).unwrap();

        assert_eq!(result.events.len(), 1);
        assert_eq!(result.lines_parsed, 1);
        assert_eq!(result.lines_skipped, 1);
    }

    // A saved offset beyond the end of the file must not make the parse
    // fail or return nothing; the file is read again from the start.
    #[test]
    fn test_handle_truncated_file() {
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}"#;

        let file = create_test_file(content);
        let parser = TranscriptParser::new("machine-1");

        // Create a position past the file end
        let old_position = FilePosition {
            offset: 10000,
            line_number: 100,
            last_uuid: Some("old".to_string()),
        };

        // Should reset to beginning (only the event count is checked here)
        let result = parser
            .parse_incremental(file.path(), &old_position)
            .unwrap();
        assert_eq!(result.events.len(), 1);
    }

    // A missing file maps to the dedicated NotFound error variant.
    #[test]
    fn test_file_not_found() {
        let parser = TranscriptParser::new("machine-1");
        let result = parser.parse_full(Path::new("/nonexistent/path.jsonl"));

        assert!(matches!(result, Err(TranscriptError::NotFound(_))));
    }

    // Plain string content is returned as the prompt.
    #[test]
    fn test_extract_first_prompt_basic() {
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":"hello world"}}"#;
        let file = create_test_file(content);

        let result = extract_first_prompt(file.path());
        assert_eq!(result, Some("hello world".to_string()));
    }

    // Content given as an array of blocks: the text block supplies the prompt.
    #[test]
    fn test_extract_first_prompt_with_array_content() {
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":[{"type":"text","text":"hello from array"}]}}"#;
        let file = create_test_file(content);

        let result = extract_first_prompt(file.path());
        assert_eq!(result, Some("hello from array".to_string()));
    }

    #[test]
    fn test_extract_first_prompt_skips_sidechain() {
        // First entry is sidechain, second is main chain
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":true,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":"sidechain prompt"}}
{"sessionId":"test","uuid":"2","isSidechain":false,"timestamp":"2025-01-01T00:00:01Z","type":"user","message":{"role":"user","content":"main chain prompt"}}"#;
        let file = create_test_file(content);

        let result = extract_first_prompt(file.path());
        assert_eq!(result, Some("main chain prompt".to_string()));
    }

    #[test]
    fn test_extract_first_prompt_skips_tool_results() {
        // First entry is tool result (no text), second is user prompt
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_123","content":"result"}]}}
{"sessionId":"test","uuid":"2","isSidechain":false,"timestamp":"2025-01-01T00:00:01Z","type":"user","message":{"role":"user","content":"actual prompt"}}"#;
        let file = create_test_file(content);

        let result = extract_first_prompt(file.path());
        assert_eq!(result, Some("actual prompt".to_string()));
    }

    // No lines at all means no prompt.
    #[test]
    fn test_extract_first_prompt_empty_file() {
        let file = create_test_file("");
        let result = extract_first_prompt(file.path());
        assert_eq!(result, None);
    }

    // Assistant entries are never treated as the prompt.
    #[test]
    fn test_extract_first_prompt_no_user_messages() {
        let content = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}"#;
        let file = create_test_file(content);

        let result = extract_first_prompt(file.path());
        assert_eq!(result, None);
    }

    // A 1500-character prompt comes back shortened and ending in "...".
    #[test]
    fn test_extract_first_prompt_truncates_long_prompt() {
        let long_text = "a".repeat(1500);
        let content = format!(
            r#"{{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{{"role":"user","content":"{}"}}}}"#,
            long_text
        );
        let file = create_test_file(&content);

        let result = extract_first_prompt(file.path());
        assert!(result.is_some());
        let prompt = result.unwrap();
        assert!(prompt.ends_with("..."));
        assert!(prompt.len() <= 1003); // 1000 + "..."
    }

    // A missing file yields None rather than an error or panic.
    #[test]
    fn test_extract_first_prompt_file_not_found() {
        let result = extract_first_prompt(Path::new("/nonexistent/file.jsonl"));
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_errors_tracked_separately() {
        // Mix of valid entries, skipped entries, and unparseable lines.
        // Every line, including the last, ends with a newline.
        let content = r#"{"type":"file-history-snapshot","sessionId":"test","uuid":"snap1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z"}
not valid json at all
{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}
{incomplete json
"#;

        let file = create_test_file(content);
        let parser = TranscriptParser::new("machine-1");
        let result = parser.parse_full(file.path()).unwrap();

        assert_eq!(result.events.len(), 1); // Only the valid assistant entry
        assert_eq!(result.lines_parsed, 1); // One valid message entry
        assert_eq!(result.lines_skipped, 1); // file-history-snapshot
        assert_eq!(result.parse_errors, 2); // Two unparseable lines
    }
}