mi6_core/input/codex_session/
entry.rs

1//! Codex session entry types and parsing.
2//!
3//! Codex CLI stores session data in JSONL files at:
4//! `~/.codex/sessions/YYYY/MM/DD/rollout-YYYY-MM-DDTHH-MM-SS-<UUID>.jsonl`
5//!
6//! The format differs from Claude Code transcripts:
7//! - Top-level `type` field indicates entry type
8//! - `payload` contains type-specific data
9//! - Token usage is in nested `event_msg.token_count` events
10
11use std::collections::HashMap;
12
13use chrono::{DateTime, Utc};
14use serde::Deserialize;
15
16use crate::model::{Event, EventBuilder, EventType};
17use crate::util::truncate_string;
18
19/// Information about a function call, tracked for enriching output events.
20#[derive(Debug, Clone)]
21pub struct FunctionCallInfo {
22    /// The function name (e.g., "shell_command")
23    pub name: String,
24    /// The function arguments as JSON string
25    pub arguments: String,
26    /// Whether this call requires user approval (sandbox_permissions: "require_escalated")
27    pub requires_approval: bool,
28    /// Timestamp of the function call
29    pub timestamp: Option<DateTime<Utc>>,
30}
31
32/// Map from call_id to function call info.
33pub type FunctionCallMap = HashMap<String, FunctionCallInfo>;
34
35/// A single entry in a Codex session JSONL file.
36///
37/// Codex uses a unified entry format with `type` discriminator.
38#[derive(Debug, Clone, Deserialize)]
39pub struct CodexSessionEntry {
40    /// Entry timestamp (ISO 8601)
41    pub timestamp: String,
42
43    /// Entry type: "session_meta", "response_item", "event_msg", "turn_context"
44    #[serde(rename = "type")]
45    pub entry_type: String,
46
47    /// Type-specific payload
48    pub payload: serde_json::Value,
49}
50
51/// Session metadata from the first entry.
52#[derive(Debug, Clone, Deserialize)]
53pub struct SessionMeta {
54    /// Session UUID
55    pub id: String,
56    /// Session start timestamp
57    pub timestamp: String,
58    /// Working directory
59    pub cwd: Option<String>,
60    /// CLI version
61    pub cli_version: Option<String>,
62    /// Model provider (e.g., "openai")
63    pub model_provider: Option<String>,
64    /// Git context
65    pub git: Option<GitInfo>,
66}
67
68/// Git repository information.
69#[derive(Debug, Clone, Deserialize)]
70pub struct GitInfo {
71    pub commit_hash: Option<String>,
72    pub branch: Option<String>,
73    pub repository_url: Option<String>,
74}
75
76/// Token usage information from token_count events.
77#[derive(Debug, Clone, Deserialize)]
78pub struct TokenUsageInfo {
79    pub total_token_usage: Option<TokenUsage>,
80    pub last_token_usage: Option<TokenUsage>,
81    pub model_context_window: Option<i64>,
82}
83
84/// Token counts.
85#[derive(Debug, Clone, Deserialize)]
86pub struct TokenUsage {
87    pub input_tokens: Option<i64>,
88    pub cached_input_tokens: Option<i64>,
89    pub output_tokens: Option<i64>,
90    pub reasoning_output_tokens: Option<i64>,
91    pub total_tokens: Option<i64>,
92}
93
94/// Turn context information.
95#[derive(Debug, Clone, Deserialize)]
96pub struct TurnContext {
97    pub cwd: Option<String>,
98    pub model: Option<String>,
99    pub approval_policy: Option<String>,
100}
101
102impl CodexSessionEntry {
103    /// Parse timestamp from the entry.
104    fn parse_timestamp(&self) -> Option<DateTime<Utc>> {
105        chrono::DateTime::parse_from_rfc3339(&self.timestamp)
106            .ok()
107            .map(|dt| dt.with_timezone(&Utc))
108    }
109
110    /// Convert this entry to Event(s).
111    ///
112    /// A single entry may produce multiple events depending on type.
113    /// The `function_call_map` tracks call_id -> function info for enriching outputs.
114    /// The `session_meta` provides session-level context like cwd, git branch.
115    /// The `turn_model` tracks the model from the most recent turn_context.
116    pub fn into_events(
117        self,
118        machine_id: &str,
119        session_id: &str,
120        function_call_map: &mut FunctionCallMap,
121        session_meta: &Option<SessionMeta>,
122        turn_model: &mut Option<String>,
123    ) -> Vec<Event> {
124        let timestamp = self.parse_timestamp();
125
126        match self.entry_type.as_str() {
127            "session_meta" => {
128                // session_meta is handled separately to set session context
129                // No events generated here
130                vec![]
131            }
132
133            "turn_context" => {
134                // Update turn model for subsequent events
135                if let Ok(ctx) = serde_json::from_value::<TurnContext>(self.payload) {
136                    *turn_model = ctx.model;
137                }
138                vec![]
139            }
140
141            "event_msg" => {
142                self.parse_event_msg(machine_id, session_id, timestamp, session_meta, turn_model)
143            }
144
145            "response_item" => self.parse_response_item(
146                machine_id,
147                session_id,
148                timestamp,
149                function_call_map,
150                session_meta,
151                turn_model,
152            ),
153
154            _ => vec![],
155        }
156    }
157
158    /// Parse event_msg entries (token_count, user_message, agent_message, etc.)
159    fn parse_event_msg(
160        self,
161        machine_id: &str,
162        session_id: &str,
163        timestamp: Option<DateTime<Utc>>,
164        session_meta: &Option<SessionMeta>,
165        turn_model: &Option<String>,
166    ) -> Vec<Event> {
167        let msg_type = self
168            .payload
169            .get("type")
170            .and_then(|v| v.as_str())
171            .unwrap_or("");
172
173        match msg_type {
174            "token_count" => {
175                // Parse token usage and create ApiRequest event
176                let info = self.payload.get("info");
177                if info.is_none() || info.is_some_and(|v| v.is_null()) {
178                    return vec![];
179                }
180
181                let info: TokenUsageInfo =
182                    match serde_json::from_value(info.cloned().unwrap_or_default()) {
183                        Ok(i) => i,
184                        Err(_) => return vec![],
185                    };
186
187                // Use last_token_usage for per-turn incremental counts
188                let usage = match info.last_token_usage {
189                    Some(u) => u,
190                    None => return vec![],
191                };
192
193                // Skip if no meaningful token data
194                if usage.input_tokens.unwrap_or(0) == 0 && usage.output_tokens.unwrap_or(0) == 0 {
195                    return vec![];
196                }
197
198                let mut builder = EventBuilder::new(machine_id, EventType::ApiRequest, session_id)
199                    .source("codex_session")
200                    .framework("codex")
201                    .timestamp_opt(timestamp);
202
203                // Set tokens
204                let input = usage.input_tokens.unwrap_or(0);
205                let output = usage.output_tokens.unwrap_or(0);
206                builder = builder.tokens(input, output);
207
208                // Set cache tokens (Codex uses cached_input_tokens)
209                if let Some(cache_read) = usage.cached_input_tokens {
210                    builder = builder.tokens_cache_read_opt(Some(cache_read));
211                }
212
213                // Set model from turn context
214                builder = builder.model_opt(turn_model.clone());
215
216                // Set cwd and git from session meta
217                if let Some(meta) = session_meta {
218                    builder = builder.cwd_opt(meta.cwd.clone());
219                    if let Some(ref git) = meta.git {
220                        builder = builder.git_branch_opt(git.branch.clone());
221                    }
222                }
223
224                // Store extra info (reasoning_output_tokens, model_context_window) in metadata
225                let mut extra = serde_json::Map::new();
226                if let Some(reasoning) = usage.reasoning_output_tokens {
227                    extra.insert(
228                        "reasoning_output_tokens".into(),
229                        serde_json::json!(reasoning),
230                    );
231                }
232                if let Some(ctx_window) = info.model_context_window {
233                    extra.insert("model_context_window".into(), serde_json::json!(ctx_window));
234                }
235                if !extra.is_empty() {
236                    builder = builder.metadata(serde_json::Value::Object(extra).to_string());
237                }
238
239                vec![builder.build()]
240            }
241
242            "user_message" => {
243                // Create UserPromptSubmit event
244                let message = self
245                    .payload
246                    .get("message")
247                    .and_then(|v| v.as_str())
248                    .unwrap_or("");
249
250                if message.is_empty() {
251                    return vec![];
252                }
253
254                let mut builder =
255                    EventBuilder::new(machine_id, EventType::UserPromptSubmit, session_id)
256                        .source("codex_session")
257                        .framework("codex")
258                        .timestamp_opt(timestamp);
259
260                // Set cwd from session meta
261                if let Some(meta) = session_meta {
262                    builder = builder.cwd_opt(meta.cwd.clone());
263                    if let Some(ref git) = meta.git {
264                        builder = builder.git_branch_opt(git.branch.clone());
265                    }
266                }
267
268                // Store truncated prompt in payload
269                let truncated = truncate_string(message, 1000);
270                builder = builder.payload(serde_json::json!({"prompt": truncated}).to_string());
271
272                vec![builder.build()]
273            }
274
275            // agent_message, agent_reasoning, turn_aborted are informational
276            // We don't create separate events for these
277            _ => vec![],
278        }
279    }
280
281    /// Parse response_item entries (function_call, function_call_output, message)
282    fn parse_response_item(
283        self,
284        machine_id: &str,
285        session_id: &str,
286        timestamp: Option<DateTime<Utc>>,
287        function_call_map: &mut FunctionCallMap,
288        session_meta: &Option<SessionMeta>,
289        turn_model: &Option<String>,
290    ) -> Vec<Event> {
291        let item_type = self
292            .payload
293            .get("type")
294            .and_then(|v| v.as_str())
295            .unwrap_or("");
296
297        match item_type {
298            "function_call" => {
299                let name = self
300                    .payload
301                    .get("name")
302                    .and_then(|v| v.as_str())
303                    .unwrap_or("")
304                    .to_string();
305                let call_id = self
306                    .payload
307                    .get("call_id")
308                    .and_then(|v| v.as_str())
309                    .unwrap_or("")
310                    .to_string();
311                let arguments = self
312                    .payload
313                    .get("arguments")
314                    .and_then(|v| v.as_str())
315                    .unwrap_or("")
316                    .to_string();
317
318                if call_id.is_empty() {
319                    return vec![];
320                }
321
322                // Check if this call requires user approval
323                // Codex sets sandbox_permissions: "require_escalated" when approval is needed
324                let requires_approval = serde_json::from_str::<serde_json::Value>(&arguments)
325                    .ok()
326                    .and_then(|args| {
327                        args.get("sandbox_permissions")
328                            .and_then(|v| v.as_str())
329                            .map(|s| s == "require_escalated")
330                    })
331                    .unwrap_or(false);
332
333                // Track function call for enriching output
334                function_call_map.insert(
335                    call_id.clone(),
336                    FunctionCallInfo {
337                        name: name.clone(),
338                        arguments: arguments.clone(),
339                        requires_approval,
340                        timestamp,
341                    },
342                );
343
344                // Create PreToolUse event
345                let mut builder = EventBuilder::new(machine_id, EventType::PreToolUse, session_id)
346                    .source("codex_session")
347                    .framework("codex")
348                    .timestamp_opt(timestamp)
349                    .tool(call_id, name)
350                    .model_opt(turn_model.clone());
351
352                if let Some(meta) = session_meta {
353                    builder = builder.cwd_opt(meta.cwd.clone());
354                    if let Some(ref git) = meta.git {
355                        builder = builder.git_branch_opt(git.branch.clone());
356                    }
357                }
358
359                vec![builder.build()]
360            }
361
362            "function_call_output" => {
363                let call_id = self
364                    .payload
365                    .get("call_id")
366                    .and_then(|v| v.as_str())
367                    .unwrap_or("")
368                    .to_string();
369                let output = self
370                    .payload
371                    .get("output")
372                    .and_then(|v| v.as_str())
373                    .unwrap_or("");
374
375                if call_id.is_empty() {
376                    return vec![];
377                }
378
379                // Create PostToolUse event
380                let mut builder = EventBuilder::new(machine_id, EventType::PostToolUse, session_id)
381                    .source("codex_session")
382                    .framework("codex")
383                    .timestamp_opt(timestamp)
384                    .tool_use_id(call_id.clone())
385                    .model_opt(turn_model.clone());
386
387                // Enrich with function call info
388                if let Some(call_info) = function_call_map.get(&call_id) {
389                    builder = builder.tool_name(call_info.name.clone());
390
391                    // For shell_command, store command in payload (for GitHub context)
392                    if call_info.name == "shell_command" {
393                        // Parse arguments JSON to extract command
394                        if let Ok(args) =
395                            serde_json::from_str::<serde_json::Value>(&call_info.arguments)
396                            && let Some(cmd) = args.get("command")
397                        {
398                            builder =
399                                builder.payload(serde_json::json!({ "command": cmd }).to_string());
400                        }
401
402                        // Store truncated output in metadata (for debugging)
403                        let truncated_output = truncate_string(output, 500);
404                        builder = builder.metadata(
405                            serde_json::json!({ "output": truncated_output }).to_string(),
406                        );
407                    }
408                }
409
410                if let Some(meta) = session_meta {
411                    builder = builder.cwd_opt(meta.cwd.clone());
412                    if let Some(ref git) = meta.git {
413                        builder = builder.git_branch_opt(git.branch.clone());
414                    }
415                }
416
417                vec![builder.build()]
418            }
419
420            // message, reasoning, ghost_snapshot - no events for these
421            _ => vec![],
422        }
423    }
424
425    /// Get a unique key for deduplication.
426    ///
427    /// Combines timestamp and entry type since Codex doesn't have per-entry UUIDs.
428    pub fn dedup_key(&self) -> String {
429        format!("{}:{}", self.timestamp, self.entry_type)
430    }
431}
432
/// Extract session ID from a Codex session filename.
///
/// Filename format: `rollout-YYYY-MM-DDTHH-MM-SS-<UUID>.jsonl`
/// (e.g. `rollout-2026-01-08T17-00-06-019ba044-90fa-7b30-be9c-6b7b601599cd.jsonl`).
///
/// Returns the trailing UUID, or `None` when:
/// - the name does not end in `.jsonl`,
/// - the stem is shorter than a UUID,
/// - the trailing 36 characters are not a whole `-`-separated component, or
/// - they are not five hexadecimal groups of 8-4-4-4-12 digits.
pub fn extract_session_id_from_filename(filename: &str) -> Option<String> {
    // Canonical hyphenated UUID: 32 hex digits plus 4 hyphens.
    const UUID_LEN: usize = 36;
    // Number of hex digits in each hyphen-separated group.
    const GROUP_LENS: [usize; 5] = [8, 4, 4, 4, 12];

    let stem = filename.strip_suffix(".jsonl")?;
    let start = stem.len().checked_sub(UUID_LEN)?;
    // `get` yields `None` if `start` falls inside a multi-byte character,
    // so non-ASCII names are rejected instead of panicking.
    let candidate = stem.get(start..)?;

    // The UUID must be a complete trailing component: either the whole stem
    // or separated from the preceding text by a hyphen.
    if start > 0 && !stem[..start].ends_with('-') {
        return None;
    }

    let shape_ok = candidate.split('-').map(str::len).eq(GROUP_LENS);
    let digits_ok = candidate
        .bytes()
        .all(|byte| byte == b'-' || byte.is_ascii_hexdigit());

    (shape_ok && digits_ok).then(|| candidate.to_string())
}
457
458/// Parse session metadata from a session_meta entry.
459pub fn parse_session_meta(entry: &CodexSessionEntry) -> Option<SessionMeta> {
460    if entry.entry_type != "session_meta" {
461        return None;
462    }
463    serde_json::from_value(entry.payload.clone()).ok()
464}
465
#[cfg(test)]
mod tests {
    use super::*;

    /// Machine id used by every conversion in these tests.
    const MACHINE: &str = "machine-1";
    /// Session id used by every conversion in these tests.
    const SESSION: &str = "session-1";

    /// Assemble an entry from its three raw parts.
    fn entry(timestamp: &str, entry_type: &str, payload: serde_json::Value) -> CodexSessionEntry {
        CodexSessionEntry {
            timestamp: timestamp.to_string(),
            entry_type: entry_type.to_string(),
            payload,
        }
    }

    /// Run `into_events` with the fixed machine and session ids.
    fn convert(
        entry: CodexSessionEntry,
        call_map: &mut FunctionCallMap,
        session_meta: &Option<SessionMeta>,
        turn_model: &mut Option<String>,
    ) -> Vec<Event> {
        entry.into_events(MACHINE, SESSION, call_map, session_meta, turn_model)
    }

    #[test]
    fn test_extract_session_id_from_filename() {
        let cases = [
            (
                "rollout-2026-01-08T17-00-06-019ba044-90fa-7b30-be9c-6b7b601599cd.jsonl",
                Some("019ba044-90fa-7b30-be9c-6b7b601599cd"),
            ),
            (
                "rollout-2025-11-26T17-55-56-019ac306-3951-72d1-8b5d-447c33ce17f5.jsonl",
                Some("019ac306-3951-72d1-8b5d-447c33ce17f5"),
            ),
            ("invalid.jsonl", None),
            ("rollout.jsonl", None),
        ];

        for (filename, expected) in cases {
            assert_eq!(
                extract_session_id_from_filename(filename),
                expected.map(String::from)
            );
        }
    }

    #[test]
    fn test_parse_session_meta() {
        let meta_entry = entry(
            "2025-11-27T01:55:56.451Z",
            "session_meta",
            serde_json::json!({
                "id": "019ac306-3951-72d1-8b5d-447c33ce17f5",
                "timestamp": "2025-11-27T01:55:56.369Z",
                "cwd": "/Users/storm/repos/test",
                "cli_version": "0.63.0",
                "model_provider": "openai",
                "git": {
                    "commit_hash": "abc123",
                    "branch": "main",
                    "repository_url": "git@github.com:test/repo.git"
                }
            }),
        );

        let meta = parse_session_meta(&meta_entry).unwrap();
        assert_eq!(meta.id, "019ac306-3951-72d1-8b5d-447c33ce17f5");
        assert_eq!(meta.cwd, Some("/Users/storm/repos/test".to_string()));
        assert_eq!(meta.cli_version, Some("0.63.0".to_string()));
        assert_eq!(meta.git.as_ref().unwrap().branch, Some("main".to_string()));
    }

    #[test]
    fn test_parse_token_count_event() {
        let token_entry = entry(
            "2025-11-27T01:56:10.186Z",
            "event_msg",
            serde_json::json!({
                "type": "token_count",
                "info": {
                    "total_token_usage": {
                        "input_tokens": 3597,
                        "cached_input_tokens": 3072,
                        "output_tokens": 33,
                        "reasoning_output_tokens": 0,
                        "total_tokens": 3630
                    },
                    "last_token_usage": {
                        "input_tokens": 500,
                        "cached_input_tokens": 400,
                        "output_tokens": 33,
                        "reasoning_output_tokens": 10,
                        "total_tokens": 533
                    },
                    "model_context_window": 258400
                }
            }),
        );

        let session_meta = Some(SessionMeta {
            id: "test".to_string(),
            timestamp: "2025-11-27T01:55:56.369Z".to_string(),
            cwd: Some("/test/cwd".to_string()),
            cli_version: None,
            model_provider: None,
            git: Some(GitInfo {
                branch: Some("main".to_string()),
                commit_hash: None,
                repository_url: None,
            }),
        });

        let mut call_map = FunctionCallMap::default();
        let mut turn_model = Some("gpt-5.1-codex".to_string());

        let events = convert(token_entry, &mut call_map, &session_meta, &mut turn_model);

        assert_eq!(events.len(), 1);
        let event = &events[0];
        assert_eq!(event.event_type, EventType::ApiRequest);
        // The per-turn `last_token_usage` numbers are used, not the totals.
        assert_eq!(event.tokens_input, Some(500));
        assert_eq!(event.tokens_output, Some(33));
        assert_eq!(event.tokens_cache_read, Some(400));
        assert_eq!(event.model, Some("gpt-5.1-codex".to_string()));
        assert_eq!(event.cwd, Some("/test/cwd".to_string()));
        assert_eq!(event.git_branch, Some("main".to_string()));

        // Reasoning tokens and context window travel in the metadata JSON.
        let metadata: serde_json::Value =
            serde_json::from_str(event.metadata.as_ref().unwrap()).unwrap();
        assert_eq!(metadata["reasoning_output_tokens"], 10);
        assert_eq!(metadata["model_context_window"], 258400);
    }

    #[test]
    fn test_parse_user_message_event() {
        let message_entry = entry(
            "2025-11-27T01:56:07.612Z",
            "event_msg",
            serde_json::json!({
                "type": "user_message",
                "message": "Hello, world!",
                "images": []
            }),
        );

        let mut call_map = FunctionCallMap::default();
        let mut turn_model = None;

        let events = convert(message_entry, &mut call_map, &None, &mut turn_model);

        assert_eq!(events.len(), 1);
        let event = &events[0];
        assert_eq!(event.event_type, EventType::UserPromptSubmit);

        let payload: serde_json::Value =
            serde_json::from_str(event.payload.as_ref().unwrap()).unwrap();
        assert_eq!(payload["prompt"], "Hello, world!");
    }

    #[test]
    fn test_parse_function_call_and_output() {
        let call_entry = entry(
            "2025-11-27T01:56:10.186Z",
            "response_item",
            serde_json::json!({
                "type": "function_call",
                "name": "shell_command",
                "arguments": "{\"command\":\"ls\",\"workdir\":\"/test\"}",
                "call_id": "call_123"
            }),
        );

        let output_entry = entry(
            "2025-11-27T01:56:10.300Z",
            "response_item",
            serde_json::json!({
                "type": "function_call_output",
                "call_id": "call_123",
                "output": "file1.txt\nfile2.txt"
            }),
        );

        let mut call_map = FunctionCallMap::default();
        let mut turn_model = None;

        // The call itself yields a PreToolUse event.
        let call_events = convert(call_entry, &mut call_map, &None, &mut turn_model);
        assert_eq!(call_events.len(), 1);
        assert_eq!(call_events[0].event_type, EventType::PreToolUse);
        assert_eq!(call_events[0].tool_use_id, Some("call_123".to_string()));
        assert_eq!(call_events[0].tool_name, Some("shell_command".to_string()));

        // The output yields a PostToolUse event enriched from the call map.
        let output_events = convert(output_entry, &mut call_map, &None, &mut turn_model);
        assert_eq!(output_events.len(), 1);
        assert_eq!(output_events[0].event_type, EventType::PostToolUse);
        assert_eq!(output_events[0].tool_use_id, Some("call_123".to_string()));
        assert_eq!(
            output_events[0].tool_name,
            Some("shell_command".to_string())
        );

        // The payload carries the command.
        let payload: serde_json::Value =
            serde_json::from_str(output_events[0].payload.as_ref().unwrap()).unwrap();
        assert_eq!(payload["command"], "ls");

        // The metadata carries the (truncated) output.
        let metadata: serde_json::Value =
            serde_json::from_str(output_events[0].metadata.as_ref().unwrap()).unwrap();
        assert_eq!(metadata["output"], "file1.txt\nfile2.txt");
    }

    #[test]
    fn test_turn_context_updates_model() {
        let ctx_entry = entry(
            "2025-11-27T01:56:07.612Z",
            "turn_context",
            serde_json::json!({
                "cwd": "/test",
                "model": "gpt-5.2-codex",
                "approval_policy": "on-request"
            }),
        );

        let mut call_map = FunctionCallMap::default();
        let mut turn_model = None;

        let events = convert(ctx_entry, &mut call_map, &None, &mut turn_model);

        // turn_context emits nothing; it only updates the tracked model.
        assert!(events.is_empty());
        assert_eq!(turn_model, Some("gpt-5.2-codex".to_string()));
    }

    #[test]
    fn test_skip_null_token_info() {
        let null_info_entry = entry(
            "2025-11-27T01:56:08.167Z",
            "event_msg",
            serde_json::json!({
                "type": "token_count",
                "info": null,
                "rate_limits": {}
            }),
        );

        let mut call_map = FunctionCallMap::default();
        let mut turn_model = None;

        let events = convert(null_info_entry, &mut call_map, &None, &mut turn_model);

        assert!(events.is_empty());
    }

    #[test]
    fn test_dedup_key() {
        let keyed_entry = entry(
            "2025-11-27T01:56:10.186Z",
            "event_msg",
            serde_json::json!({}),
        );

        assert_eq!(
            keyed_entry.dedup_key(),
            "2025-11-27T01:56:10.186Z:event_msg".to_string()
        );
    }
}