mi6_core/input/codex_session/
scanner.rs

1//! Codex session file scanner with storage integration.
2//!
3//! Provides high-level scanning functionality that orchestrates parsing,
4//! deduplication, and storage operations.
5
6use std::path::{Path, PathBuf};
7
8use crate::model::Storage;
9use crate::model::error::{ScanError, TranscriptError};
10
11use super::{CodexSessionParser, FilePosition, extract_session_id_from_filename};
12
/// Outcome of scanning a single Codex session file.
///
/// Besides `Debug`, `Clone` and `Default`, the type implements `PartialEq`/`Eq`
/// so callers and tests can compare whole results with `assert_eq!`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScanResult {
    /// Number of events inserted
    pub inserted: usize,
    /// Number of lines that failed to parse
    pub parse_errors: u64,
    /// Session ID from the file (`None` in the default value)
    pub session_id: Option<String>,
}
23
/// Orchestrates Codex session file scanning with storage integration.
///
/// `CodexSessionScanner` provides a high-level API for scanning Codex CLI
/// session files and inserting events into storage. It handles:
/// - Incremental parsing (only reads new content since last scan)
/// - Deduplication by timestamp+type key
/// - Position tracking for future scans
///
/// The `Storage` requirement on `S` is stated on the `impl` block that needs
/// it rather than on the struct itself, so the type definition stays free of
/// trait bounds.
pub struct CodexSessionScanner<'a, S> {
    /// Borrowed storage backend that receives events and scan positions.
    storage: &'a S,
    /// Machine identifier handed to the parser and to session updates.
    machine_id: String,
}
35
36impl<'a, S: Storage> CodexSessionScanner<'a, S> {
37    /// Create a new scanner with the given storage and machine ID.
38    pub fn new(storage: &'a S, machine_id: impl Into<String>) -> Self {
39        Self {
40            storage,
41            machine_id: machine_id.into(),
42        }
43    }
44
45    /// Scan a Codex session file and insert events into storage.
46    ///
47    /// This method:
48    /// 1. Extracts session ID from filename
49    /// 2. Loads the last scanned position for the file
50    /// 3. Parses new entries since that position
51    /// 4. Inserts new events
52    /// 5. Updates the position for future incremental scans
53    pub fn scan_file(&self, path: &Path) -> Result<ScanResult, ScanError> {
54        // Extract session ID from filename
55        let filename = path.file_name().and_then(|n| n.to_str()).ok_or_else(|| {
56            ScanError::Parse(TranscriptError::Io(std::io::Error::new(
57                std::io::ErrorKind::InvalidInput,
58                "Invalid filename",
59            )))
60        })?;
61
62        let session_id = extract_session_id_from_filename(filename).ok_or_else(|| {
63            ScanError::Parse(TranscriptError::Io(std::io::Error::new(
64                std::io::ErrorKind::InvalidInput,
65                "Could not extract session ID from filename",
66            )))
67        })?;
68
69        // Get last scanned position (reuse transcript position tracking)
70        let start_position = self
71            .storage
72            .get_transcript_position(path)?
73            .map(|pos| FilePosition {
74                offset: pos.offset,
75                line_number: pos.line_number,
76                last_key: pos.last_uuid,
77            })
78            .unwrap_or_default();
79
80        // Parse the session file
81        let parser = CodexSessionParser::new(&self.machine_id);
82        let result = parser.parse_incremental(path, &session_id, &start_position)?;
83
84        let parse_errors = result.parse_errors;
85
86        // Insert new events
87        let mut inserted = 0;
88        for event in result.events {
89            self.storage.insert(&event)?;
90            inserted += 1;
91        }
92
93        // Update the session's transcript_path with the file path.
94        // This is needed because Codex hooks don't provide transcript_path,
95        // but we know it from scanning the session file.
96        if let Some(path_str) = path.to_str() {
97            // Ignore errors - the session may not exist yet if no events were inserted
98            let _ = self.storage.update_session_transcript_path(
99                &self.machine_id,
100                &session_id,
101                path_str,
102            );
103        }
104
105        // Update position for next scan (reuse transcript position)
106        let transcript_position = crate::input::transcript::FilePosition {
107            offset: result.position.offset,
108            line_number: result.position.line_number,
109            last_uuid: result.position.last_key,
110        };
111        self.storage
112            .set_transcript_position(path, &transcript_position)?;
113
114        Ok(ScanResult {
115            inserted,
116            parse_errors,
117            session_id: Some(session_id),
118        })
119    }
120
121    /// Scan a session file by session ID.
122    ///
123    /// Looks up the session file path in the standard Codex sessions directory.
124    /// After scanning, also updates the transcript_path for the provided session_id,
125    /// which may differ from the UUID extracted from the filename. This ensures
126    /// sessions created by hooks (using thread-id) get their transcript_path set.
127    pub fn scan_session(&self, session_id: &str) -> Result<ScanResult, ScanError> {
128        let path = find_session_file(session_id)?;
129        let result = self.scan_file(&path)?;
130
131        // Also update transcript_path for the original session_id.
132        // This is important because the hook may use a different session_id
133        // (thread-id) than the UUID in the filename. We need to update both
134        // so the session created by hooks gets the transcript path.
135        if let Some(path_str) = path.to_str() {
136            let _ =
137                self.storage
138                    .update_session_transcript_path(&self.machine_id, session_id, path_str);
139        }
140
141        Ok(result)
142    }
143}
144
145/// Find a Codex session file by session ID.
146///
147/// Searches in `~/.codex/sessions/` for a file matching the session UUID.
148pub fn find_session_file(session_id: &str) -> Result<PathBuf, ScanError> {
149    let sessions_dir = dirs::home_dir()
150        .ok_or_else(|| {
151            ScanError::Parse(TranscriptError::Io(std::io::Error::new(
152                std::io::ErrorKind::NotFound,
153                "Could not determine home directory",
154            )))
155        })?
156        .join(".codex")
157        .join("sessions");
158
159    if !sessions_dir.exists() {
160        return Err(ScanError::Parse(TranscriptError::Io(std::io::Error::new(
161            std::io::ErrorKind::NotFound,
162            "Codex sessions directory not found",
163        ))));
164    }
165
166    // Search through date directories
167    // Format: ~/.codex/sessions/YYYY/MM/DD/rollout-...-<UUID>.jsonl
168    let year_entries =
169        std::fs::read_dir(&sessions_dir).map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?;
170
171    for year_entry in year_entries {
172        let year_path = year_entry
173            .map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?
174            .path();
175        if !year_path.is_dir() {
176            continue;
177        }
178
179        let month_entries =
180            std::fs::read_dir(&year_path).map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?;
181
182        for month_entry in month_entries {
183            let month_path = month_entry
184                .map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?
185                .path();
186            if !month_path.is_dir() {
187                continue;
188            }
189
190            let day_entries = std::fs::read_dir(&month_path)
191                .map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?;
192
193            for day_entry in day_entries {
194                let day_path = day_entry
195                    .map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?
196                    .path();
197                if !day_path.is_dir() {
198                    continue;
199                }
200
201                let file_entries = std::fs::read_dir(&day_path)
202                    .map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?;
203
204                for file_entry in file_entries {
205                    let file_path = file_entry
206                        .map_err(|e| ScanError::Parse(TranscriptError::Io(e)))?
207                        .path();
208                    if let Some(filename) = file_path.file_name().and_then(|n| n.to_str())
209                        && filename.ends_with(".jsonl")
210                        && filename.contains(session_id)
211                    {
212                        return Ok(file_path);
213                    }
214                }
215            }
216        }
217    }
218
219    Err(ScanError::Parse(TranscriptError::Io(std::io::Error::new(
220        std::io::ErrorKind::NotFound,
221        format!("Session file not found for ID: {}", session_id),
222    ))))
223}
224
225/// Get the Codex sessions directory path.
226pub fn codex_sessions_dir() -> Option<PathBuf> {
227    dirs::home_dir().map(|h| h.join(".codex").join("sessions"))
228}
229
230/// List all session files in a date directory.
231///
232/// Returns (session_id, path) tuples for all session files found.
233pub fn list_session_files_in_dir(dir: &Path) -> Vec<(String, PathBuf)> {
234    let mut results = Vec::new();
235
236    if let Ok(entries) = std::fs::read_dir(dir) {
237        for entry in entries.flatten() {
238            let path = entry.path();
239            if path.is_file()
240                && let Some(filename) = path.file_name().and_then(|n| n.to_str())
241                && filename.ends_with(".jsonl")
242                && let Some(session_id) = extract_session_id_from_filename(filename)
243            {
244                results.push((session_id, path));
245            }
246        }
247    }
248
249    results
250}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty directory contains no session files.
    #[test]
    fn test_list_session_files_in_dir_empty() {
        let scratch = tempfile::tempdir().unwrap();
        assert!(list_session_files_in_dir(scratch.path()).is_empty());
    }

    /// Only `.jsonl` rollout files are listed, each with the ID from its name.
    #[test]
    fn test_list_session_files_in_dir_with_files() {
        let scratch = tempfile::tempdir().unwrap();
        let root = scratch.path();

        // Two session files plus one unrelated file that must be ignored.
        for name in [
            "rollout-2026-01-08T17-00-06-019ba044-90fa-7b30-be9c-6b7b601599cd.jsonl",
            "rollout-2026-01-08T16-00-00-abcd1234-5678-90ab-cdef-1234567890ab.jsonl",
        ] {
            std::fs::write(root.join(name), "{}").unwrap();
        }
        std::fs::write(root.join("other.txt"), "not a session").unwrap();

        let found = list_session_files_in_dir(root);
        assert_eq!(found.len(), 2);

        for expected in [
            "019ba044-90fa-7b30-be9c-6b7b601599cd",
            "abcd1234-5678-90ab-cdef-1234567890ab",
        ] {
            assert!(found.iter().any(|(id, _)| id.as_str() == expected));
        }
    }

    /// The sessions directory resolves and ends in `.codex/sessions`.
    #[test]
    fn test_codex_sessions_dir() {
        let resolved = codex_sessions_dir();
        assert!(resolved.is_some());
        assert!(resolved.is_some_and(|dir| dir.ends_with(".codex/sessions")));
    }
}