1use std::collections::HashSet;
7use std::fs::File;
8use std::io::{BufRead, BufReader, Seek, SeekFrom};
9use std::path::Path;
10
11use crate::model::Event;
12use crate::model::error::TranscriptError;
13use crate::util::truncate_string;
14
15use super::{ToolInfoMap, TranscriptEntry};
16
/// Resume point inside a transcript file.
///
/// [`TranscriptParser::parse_incremental`] takes one of these as its starting
/// point and returns an updated one describing where it stopped.
#[derive(Debug, Clone, Default)]
pub struct FilePosition {
    /// Byte offset from the start of the file at which reading resumes.
    pub offset: u64,
    /// Running count of lines read up to this position.
    pub line_number: u64,
    /// `uuid` of the most recent entry that deserialized successfully, if any.
    pub last_uuid: Option<String>,
}

impl FilePosition {
    /// Returns the position at the very beginning of a file: offset 0,
    /// line 0, and no previously seen uuid.
    pub fn new() -> Self {
        Self {
            offset: 0,
            line_number: 0,
            last_uuid: None,
        }
    }

    /// Returns a position at byte `offset`, with the line counter at 0 and
    /// no previously seen uuid.
    pub fn at_offset(offset: u64) -> Self {
        Self {
            offset,
            ..Self::default()
        }
    }
}
43
44#[derive(Debug)]
46pub struct ParseResult {
47 pub events: Vec<Event>,
49 pub position: FilePosition,
51 pub uuids: HashSet<String>,
53 pub lines_parsed: u64,
55 pub lines_skipped: u64,
57 pub parse_errors: u64,
59}
60
61pub struct TranscriptParser {
63 machine_id: String,
64}
65
66impl TranscriptParser {
67 pub fn new(machine_id: impl Into<String>) -> Self {
69 Self {
70 machine_id: machine_id.into(),
71 }
72 }
73
74 pub fn parse_incremental(
78 &self,
79 path: &Path,
80 start_position: &FilePosition,
81 ) -> Result<ParseResult, TranscriptError> {
82 let file = File::open(path).map_err(|e| {
83 if e.kind() == std::io::ErrorKind::NotFound {
84 TranscriptError::NotFound(path.display().to_string())
85 } else {
86 TranscriptError::Io(e)
87 }
88 })?;
89
90 let file_len = file.metadata()?.len();
91
92 let start_offset = if start_position.offset > file_len {
94 0
95 } else {
96 start_position.offset
97 };
98
99 let mut reader = BufReader::new(file);
100 reader.seek(SeekFrom::Start(start_offset))?;
101
102 let mut events = Vec::new();
103 let mut uuids = HashSet::new();
104 let mut current_offset = start_offset;
105 let mut line_number = start_position.line_number;
106 let mut last_uuid = start_position.last_uuid.clone();
107 let mut lines_parsed = 0u64;
108 let mut lines_skipped = 0u64;
109 let mut parse_errors = 0u64;
110
111 let mut tool_info_map: ToolInfoMap = Default::default();
113
114 let mut line = String::new();
115 loop {
116 line.clear();
117 let bytes_read = reader.read_line(&mut line)?;
118 if bytes_read == 0 {
119 break; }
121
122 let line_bytes = bytes_read as u64;
123 line_number += 1;
124
125 let trimmed = line.trim();
127 if trimmed.is_empty() {
128 current_offset += line_bytes;
129 continue;
130 }
131
132 match serde_json::from_str::<TranscriptEntry>(trimmed) {
134 Ok(entry) => {
135 uuids.insert(entry.uuid.clone());
137 last_uuid = Some(entry.uuid.clone());
138
139 if entry.should_skip() {
141 lines_skipped += 1;
142 } else {
143 let entry_events = entry.into_events(&self.machine_id, &mut tool_info_map);
145 events.extend(entry_events);
146 lines_parsed += 1;
147 }
148 }
149 Err(_) => {
150 parse_errors += 1;
153 }
154 }
155
156 current_offset += line_bytes;
157 }
158
159 let new_position = FilePosition {
160 offset: current_offset,
161 line_number,
162 last_uuid,
163 };
164
165 Ok(ParseResult {
166 events,
167 position: new_position,
168 uuids,
169 lines_parsed,
170 lines_skipped,
171 parse_errors,
172 })
173 }
174
175 pub fn parse_full(&self, path: &Path) -> Result<ParseResult, TranscriptError> {
177 self.parse_incremental(path, &FilePosition::new())
178 }
179}
180
181pub fn extract_first_prompt(path: &Path) -> Option<String> {
193 let file = File::open(path).ok()?;
194 let reader = BufReader::new(file);
195
196 for line in reader.lines() {
197 let line = line.ok()?;
198 let trimmed = line.trim();
199 if trimmed.is_empty() {
200 continue;
201 }
202
203 if let Ok(entry) = serde_json::from_str::<TranscriptEntry>(trimmed) {
205 if entry.entry_type == "user" && !entry.is_sidechain {
207 if let Some(ref message) = entry.message
209 && let Some(ref content) = message.content
210 && let Some(text) = content.text()
211 && !text.is_empty()
212 {
213 return Some(truncate_string(text, 1000));
214 }
215 }
216 }
217 }
218
219 None
220}
221
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Assistant entry (uuid "1", 100 input / 50 output tokens) shared by
    /// several fixtures.
    const ASSISTANT_ENTRY: &str = r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":100,"output_tokens":50}}}"#;

    /// `file-history-snapshot` entry; the tests expect it to be skipped.
    const SNAPSHOT_ENTRY: &str = r#"{"type":"file-history-snapshot","sessionId":"test","uuid":"snap1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z"}"#;

    /// Writes `content` to a fresh temporary file and returns its handle.
    fn create_test_file(content: &str) -> NamedTempFile {
        let mut fixture = NamedTempFile::new().unwrap();
        fixture.write_all(content.as_bytes()).unwrap();
        fixture.flush().unwrap();
        fixture
    }

    /// Writes `content` to a temporary file and fully parses it as "machine-1".
    fn parse_all(content: &str) -> ParseResult {
        let fixture = create_test_file(content);
        TranscriptParser::new("machine-1")
            .parse_full(fixture.path())
            .unwrap()
    }

    /// Writes `content` to a temporary file and extracts its first prompt.
    fn first_prompt_of(content: &str) -> Option<String> {
        let fixture = create_test_file(content);
        extract_first_prompt(fixture.path())
    }

    #[test]
    fn test_parse_empty_file() {
        let outcome = parse_all("");

        assert!(outcome.events.is_empty());
        assert_eq!(outcome.lines_parsed, 0);
    }

    #[test]
    fn test_parse_single_entry() {
        let outcome = parse_all(ASSISTANT_ENTRY);

        assert_eq!(outcome.events.len(), 1);
        assert_eq!(outcome.lines_parsed, 1);
        assert!(outcome.uuids.contains("1"));
    }

    #[test]
    fn test_incremental_parsing() {
        let second = r#"{"sessionId":"test","uuid":"2","isSidechain":false,"timestamp":"2025-01-01T00:00:01Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":200,"output_tokens":100}}}"#;
        let third = r#"{"sessionId":"test","uuid":"3","isSidechain":false,"timestamp":"2025-01-01T00:00:02Z","type":"assistant","message":{"role":"assistant","usage":{"input_tokens":300,"output_tokens":150}}}"#;

        let fixture = create_test_file(&format!("{}\n{}\n", ASSISTANT_ENTRY, second));
        let parser = TranscriptParser::new("machine-1");

        // First pass sees both initial entries.
        let first_pass = parser.parse_full(fixture.path()).unwrap();
        assert_eq!(first_pass.events.len(), 2);

        // Append a third entry behind the parser's back.
        let mut appender = std::fs::OpenOptions::new()
            .append(true)
            .open(fixture.path())
            .unwrap();
        writeln!(appender, "{}", third).unwrap();

        // Resuming from the saved position yields only the new entry.
        let second_pass = parser
            .parse_incremental(fixture.path(), &first_pass.position)
            .unwrap();
        assert_eq!(second_pass.events.len(), 1);
        assert!(second_pass.uuids.contains("3"));
    }

    #[test]
    fn test_skip_non_message_entries() {
        let outcome = parse_all(&format!("{}\n{}", SNAPSHOT_ENTRY, ASSISTANT_ENTRY));

        assert_eq!(outcome.events.len(), 1);
        assert_eq!(outcome.lines_parsed, 1);
        assert_eq!(outcome.lines_skipped, 1);
    }

    #[test]
    fn test_handle_truncated_file() {
        let fixture = create_test_file(ASSISTANT_ENTRY);

        // A saved position far beyond the end of the (short) file.
        let stale = FilePosition {
            offset: 10000,
            line_number: 100,
            last_uuid: Some("old".to_string()),
        };

        let outcome = TranscriptParser::new("machine-1")
            .parse_incremental(fixture.path(), &stale)
            .unwrap();
        assert_eq!(outcome.events.len(), 1);
    }

    #[test]
    fn test_file_not_found() {
        let outcome =
            TranscriptParser::new("machine-1").parse_full(Path::new("/nonexistent/path.jsonl"));

        assert!(matches!(outcome, Err(TranscriptError::NotFound(_))));
    }

    #[test]
    fn test_extract_first_prompt_basic() {
        let extracted = first_prompt_of(
            r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":"hello world"}}"#,
        );

        assert_eq!(extracted, Some("hello world".to_string()));
    }

    #[test]
    fn test_extract_first_prompt_with_array_content() {
        let extracted = first_prompt_of(
            r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":[{"type":"text","text":"hello from array"}]}}"#,
        );

        assert_eq!(extracted, Some("hello from array".to_string()));
    }

    #[test]
    fn test_extract_first_prompt_skips_sidechain() {
        let extracted = first_prompt_of(
            r#"{"sessionId":"test","uuid":"1","isSidechain":true,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":"sidechain prompt"}}
{"sessionId":"test","uuid":"2","isSidechain":false,"timestamp":"2025-01-01T00:00:01Z","type":"user","message":{"role":"user","content":"main chain prompt"}}"#,
        );

        assert_eq!(extracted, Some("main chain prompt".to_string()));
    }

    #[test]
    fn test_extract_first_prompt_skips_tool_results() {
        let extracted = first_prompt_of(
            r#"{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_123","content":"result"}]}}
{"sessionId":"test","uuid":"2","isSidechain":false,"timestamp":"2025-01-01T00:00:01Z","type":"user","message":{"role":"user","content":"actual prompt"}}"#,
        );

        assert_eq!(extracted, Some("actual prompt".to_string()));
    }

    #[test]
    fn test_extract_first_prompt_empty_file() {
        assert_eq!(first_prompt_of(""), None);
    }

    #[test]
    fn test_extract_first_prompt_no_user_messages() {
        assert_eq!(first_prompt_of(ASSISTANT_ENTRY), None);
    }

    #[test]
    fn test_extract_first_prompt_truncates_long_prompt() {
        let content = format!(
            r#"{{"sessionId":"test","uuid":"1","isSidechain":false,"timestamp":"2025-01-01T00:00:00Z","type":"user","message":{{"role":"user","content":"{}"}}}}"#,
            "a".repeat(1500)
        );

        let extracted = first_prompt_of(&content);
        assert!(extracted.is_some());

        let prompt = extracted.unwrap();
        assert!(prompt.ends_with("..."));
        // 1000 characters of text plus the three-character "..." suffix.
        assert!(prompt.len() <= 1003);
    }

    #[test]
    fn test_extract_first_prompt_file_not_found() {
        let extracted = extract_first_prompt(Path::new("/nonexistent/file.jsonl"));
        assert_eq!(extracted, None);
    }

    #[test]
    fn test_parse_errors_tracked_separately() {
        // Snapshot (skipped), garbage, a valid entry, and broken JSON; the
        // trailing empty element gives the last line its newline.
        let content = [
            SNAPSHOT_ENTRY,
            "not valid json at all",
            ASSISTANT_ENTRY,
            "{incomplete json",
            "",
        ]
        .join("\n");

        let outcome = parse_all(&content);

        assert_eq!(outcome.events.len(), 1);
        assert_eq!(outcome.lines_parsed, 1);
        assert_eq!(outcome.lines_skipped, 1);
        assert_eq!(outcome.parse_errors, 2);
    }
}