1use crate::errors::AppError;
17use crate::extract::codex_compat::codex_supports_ask_for_approval;
18use crate::extraction::{ExtractedUrl, ExtractionResult};
19use crate::storage::entities::{NewEntity, NewRelationship};
20use serde::{Deserialize, Serialize};
21use std::path::{Path, PathBuf};
22use std::process::{Command, Stdio};
23
24#[derive(Debug, Clone, Default, Deserialize, Serialize)]
26pub struct CodexUsage {
27 #[serde(default)]
28 pub input_tokens: u64,
29 #[serde(default)]
30 pub cached_input_tokens: u64,
31 #[serde(default)]
32 pub output_tokens: u64,
33 #[serde(default)]
34 pub reasoning_output_tokens: u64,
35}
36
37#[derive(Debug)]
39pub struct CodexResult {
40 pub extraction: ExtractionResult,
41 pub last_agent_text: String,
46 pub usage: Option<CodexUsage>,
47 pub rate_limited: bool,
48 pub schema_error: bool,
49 pub turn_failed: bool,
50 pub failed_message: String,
51}
52
/// Borrowed inputs needed to assemble one `codex exec` invocation.
///
/// All string data is borrowed from the caller for the lifetime `'a`; only the
/// schema path is owned.
// NOTE(review): the `allow` below is kept from the original definition; it is
// presumably there for doc links that do not resolve inside this crate.
// Confirm whether it is still needed.
#[allow(rustdoc::broken_intra_doc_links)]
#[derive(Debug, Clone)]
pub struct CodexSpawnArgs<'a> {
    /// Path of the codex executable to launch.
    pub binary: &'a Path,
    /// Instruction text for the model.
    pub prompt: &'a str,
    /// JSON schema document written to `schema_path` before spawning.
    pub json_schema: &'a str,
    /// Text the model should process.
    pub input_text: &'a str,
    /// Optional model name forwarded with `-m`.
    pub model: Option<&'a str>,
    /// Timeout in seconds. NOTE(review): not consumed when the command is
    /// built; presumably enforced by the caller that waits on the child.
    pub timeout_secs: u64,
    /// Location of the schema file handed to `--output-schema`.
    pub schema_path: PathBuf,
}
67
68pub fn trusted_schema_path() -> Result<PathBuf, AppError> {
71 let cache = crate::paths::AppPaths::resolve(None)
72 .map(|p| p.models.parent().map(|m| m.to_path_buf()))
73 .ok()
74 .flatten()
75 .unwrap_or_else(std::env::temp_dir);
76 std::fs::create_dir_all(&cache).map_err(AppError::Io)?;
77 Ok(cache.join(format!("enrich-schema-{}.json", std::process::id())))
78}
79
/// Model names accepted by `--codex-model` under ChatGPT Pro OAuth.
///
/// Entries are listed in ascending lexicographic order.
pub const CODEX_PRO_OAUTH_MODELS: &[&str] = &[
    "codex-auto-review",
    "gpt-5.3-codex-spark",
    "gpt-5.4",
    "gpt-5.4-mini",
    "gpt-5.5",
];
93
94pub fn validate_codex_model(model: Option<&str>) -> Result<(), AppError> {
100 let Some(m) = model else {
101 return Ok(()); };
103 if CODEX_PRO_OAUTH_MODELS.contains(&m) {
104 Ok(())
105 } else {
106 Err(AppError::Validation(format!(
107 "--codex-model {m:?} is not supported with ChatGPT Pro OAuth. \
108 Accepted: {}",
109 CODEX_PRO_OAUTH_MODELS.join(", ")
110 )))
111 }
112}
113
114pub fn list_codex_models() -> Vec<String> {
122 use std::collections::BTreeSet;
123 let mut out: BTreeSet<String> = CODEX_PRO_OAUTH_MODELS
124 .iter()
125 .map(|s| s.to_string())
126 .collect();
127
128 if let Some(home) = std::env::var_os("HOME") {
129 let path = std::path::Path::new(&home)
130 .join(".codex")
131 .join("models_cache.json");
132 if let Ok(content) = std::fs::read_to_string(&path) {
133 if let Ok(value) = serde_json::from_str::<serde_json::Value>(&content) {
137 if let Some(obj) = value.as_object() {
138 for key in obj.keys() {
139 out.insert(key.clone());
140 }
141 } else if let Some(arr) = value.as_array() {
142 for v in arr {
143 if let Some(s) = v.as_str() {
144 out.insert(s.to_string());
145 }
146 }
147 }
148 }
149 }
150 }
151 out.into_iter().collect()
152}
153
154pub fn suggest_codex_model(query: &str) -> Option<String> {
160 let query_lc = query.to_ascii_lowercase();
161 let models = list_codex_model_lc();
162
163 for m in &models {
165 if m.contains(&query_lc) {
166 return Some(m.clone());
167 }
168 }
169
170 let max_distance = (query.len() / 3).max(2);
172 let mut best: Option<(usize, String)> = None;
173 for m in &models {
174 let d = levenshtein(query_lc.as_str(), m.as_str());
175 if d <= max_distance && best.as_ref().is_none_or(|(bd, _)| d < *bd) {
176 best = Some((d, m.clone()));
177 }
178 }
179 best.map(|(_, m)| m)
180}
181
182fn list_codex_model_lc() -> Vec<String> {
183 list_codex_models()
184 .into_iter()
185 .map(|s| s.to_ascii_lowercase())
186 .collect()
187}
188
/// Computes the Levenshtein edit distance between `a` and `b`, counted in
/// `char`s (insertions, deletions and substitutions each cost 1).
///
/// Only two rows of the dynamic-programming table are kept at a time.
fn levenshtein(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    // Distance to or from an empty string is the other string's length.
    match (source.len(), target.len()) {
        (0, n) => return n,
        (m, 0) => return m,
        _ => {}
    }

    let width = target.len() + 1;
    let mut above: Vec<usize> = (0..width).collect();
    let mut row: Vec<usize> = vec![0; width];

    for (i, s) in source.iter().enumerate() {
        row[0] = i + 1;
        for (j, t) in target.iter().enumerate() {
            let insert = row[j] + 1;
            let delete = above[j + 1] + 1;
            let substitute = above[j] + usize::from(s != t);
            row[j + 1] = insert.min(delete).min(substitute);
        }
        // The row just filled becomes the "previous" row of the next pass.
        std::mem::swap(&mut above, &mut row);
    }

    above[width - 1]
}
210
211pub fn build_codex_command(args: &CodexSpawnArgs<'_>) -> Command {
238 let full_prompt = format!("{}\n\n{}", args.prompt, args.input_text);
239
240 if let Ok(_key) = std::env::var("OPENAI_API_KEY") {
244 let mut cmd = Command::new("false");
245 cmd.env_clear();
246 cmd.env("PATH", "/nonexistent");
247 cmd.arg("--oauth-only-violation-openai-api-key-set");
248 return cmd;
249 }
250
251 std::fs::write(&args.schema_path, args.json_schema).ok();
254
255 let mut cmd = Command::new(args.binary);
256 cmd.env_clear();
257 for var in &[
262 "PATH",
263 "HOME",
264 "USER",
265 "SHELL",
266 "TERM",
267 "LANG",
268 "XDG_CONFIG_HOME",
269 "XDG_DATA_HOME",
270 "XDG_RUNTIME_DIR",
271 "XDG_CACHE_HOME",
272 "CODEX_ACCESS_TOKEN",
273 "TMPDIR",
274 "TMP",
275 "TEMP",
276 "DYLD_FALLBACK_LIBRARY_PATH",
277 ] {
278 if let Ok(val) = std::env::var(var) {
279 cmd.env(var, val);
280 }
281 }
282 if let Some(isolated) = prepare_isolated_codex_home_spawn() {
287 cmd.env("CODEX_HOME", isolated);
288 }
289
290 #[cfg(windows)]
291 for var in &[
292 "LOCALAPPDATA",
293 "APPDATA",
294 "USERPROFILE",
295 "SystemRoot",
296 "COMSPEC",
297 "PATHEXT",
298 ] {
299 if let Ok(val) = std::env::var(var) {
300 cmd.env(var, val);
301 }
302 }
303
304 cmd.arg("exec")
308 .arg("-c")
309 .arg("sandbox_mode='read-only'")
310 .arg("-c")
311 .arg("approval_policy='never'")
312 .arg("--json")
313 .arg("--output-schema")
314 .arg(&args.schema_path)
315 .arg("--ephemeral")
316 .arg("--skip-git-repo-check")
317 .arg("--sandbox")
318 .arg("read-only")
319 .arg("--ignore-user-config")
320 .arg("--ignore-rules");
321
322 if codex_supports_ask_for_approval() {
331 cmd.arg("--ask-for-approval").arg("never");
332 }
333
334 if let Some(m) = args.model {
335 cmd.arg("-m").arg(m);
336 }
337
338 cmd.arg("-");
340
341 cmd.stdin(Stdio::piped())
342 .stdout(Stdio::piped())
343 .stderr(Stdio::piped());
344 let _ = full_prompt; cmd
348}
349
350pub fn parse_codex_jsonl(stdout: &str) -> Result<CodexResult, AppError> {
363 let mut last_agent_text: Option<String> = None;
364 let mut usage: Option<CodexUsage> = None;
365 let mut rate_limited = false;
366 let mut schema_error = false;
367 let mut turn_failed = false;
368 let mut failed_message = String::new();
369
370 for line in stdout.lines() {
371 let line = line.trim();
372 if line.is_empty() {
373 continue;
374 }
375
376 let event: serde_json::Value = match serde_json::from_str(line) {
377 Ok(v) => v,
378 Err(_) => {
379 tracing::warn!(target: "codex_spawn", line, "skipping malformed JSONL line");
380 continue;
381 }
382 };
383
384 let event_type = match event.get("type").and_then(|t| t.as_str()) {
385 Some(t) => t,
386 None => continue,
387 };
388
389 match event_type {
390 "item.completed" => {
391 if let Some(item) = event.get("item") {
392 if item.get("type").and_then(|t| t.as_str()) == Some("agent_message") {
393 if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
394 last_agent_text = Some(text.to_string());
395 }
396 }
397 }
398 }
399 "turn.completed" => {
400 if let Some(u) = event.get("usage") {
401 let is_populated = u
406 .get("input_tokens")
407 .and_then(|v| v.as_u64())
408 .map(|n| n > 0)
409 .unwrap_or(false)
410 || u.get("output_tokens")
411 .and_then(|v| v.as_u64())
412 .map(|n| n > 0)
413 .unwrap_or(false);
414 if is_populated {
415 if let Ok(parsed) = serde_json::from_value::<CodexUsage>(u.clone()) {
416 usage = Some(parsed);
417 }
418 }
419 }
420 }
421 "turn.failed" => {
422 turn_failed = true;
423 if let Some(err) = event.get("error") {
424 let msg = err
425 .get("message")
426 .and_then(|m| m.as_str())
427 .unwrap_or("unknown error");
428 failed_message = msg.to_string();
429 if msg.contains("rate_limit")
430 || msg.contains("429")
431 || msg.contains("Too Many Requests")
432 {
433 rate_limited = true;
434 }
435 }
436 }
437 "error" => {
438 if let Some(msg) = event.get("message").and_then(|m| m.as_str()) {
439 if msg.contains("invalid_json_schema") || msg.contains("schema") {
440 schema_error = true;
441 }
442 }
443 }
444 _ => {}
445 }
446 }
447
448 let text = last_agent_text.ok_or_else(|| {
449 AppError::Validation(format!(
450 "no agent_message in codex JSONL output (rate_limited={rate_limited}, schema_error={schema_error}, turn_failed={turn_failed})"
451 ))
452 })?;
453
454 if turn_failed {
455 return Err(AppError::Validation(format!(
456 "codex turn failed: {failed_message}"
457 )));
458 }
459 if schema_error {
460 return Err(AppError::Validation(
461 "codex reported invalid_json_schema; check the --output-schema file".to_string(),
462 ));
463 }
464 if rate_limited {
465 return Err(AppError::Validation(format!(
466 "codex rate-limited: {failed_message}"
467 )));
468 }
469
470 let extraction = parse_extraction_text(&text)?;
471 Ok(CodexResult {
472 extraction,
473 last_agent_text: text,
474 usage,
475 rate_limited,
476 schema_error,
477 turn_failed,
478 failed_message,
479 })
480}
481
482pub fn parse_extraction_text(text: &str) -> Result<ExtractionResult, AppError> {
487 let value: serde_json::Value = serde_json::from_str(text).map_err(|e| {
488 AppError::Validation(format!("failed to parse codex agent_message as JSON: {e}"))
489 })?;
490 let obj = value.as_object().ok_or_else(|| {
491 AppError::Validation("codex agent_message is not a JSON object".to_string())
492 })?;
493
494 let mut entities: Vec<NewEntity> = Vec::new();
495 if let Some(arr) = obj.get("entities").and_then(|v| v.as_array()) {
496 for e in arr {
497 if let Some(name) = e.get("name").and_then(|v| v.as_str()) {
498 let entity_type_str = e
501 .get("type")
502 .or_else(|| e.get("entity_type"))
503 .and_then(|v| v.as_str())
504 .unwrap_or("concept");
505 let entity_type = serde_json::from_value::<crate::entity_type::EntityType>(
506 serde_json::Value::String(entity_type_str.to_string()),
507 )
508 .unwrap_or(crate::entity_type::EntityType::Concept);
509 entities.push(NewEntity {
510 name: name.to_string(),
511 entity_type,
512 description: None,
513 });
514 }
515 }
516 }
517
518 let mut relationships: Vec<NewRelationship> = Vec::new();
519 if let Some(arr) = obj.get("relationships").and_then(|v| v.as_array()) {
520 for r in arr {
521 let from = r.get("source").or_else(|| r.get("from"));
522 let to = r.get("target").or_else(|| r.get("to"));
523 let rel = r.get("relation").and_then(|v| v.as_str());
524 if let (Some(from_v), Some(to_v), Some(rel_v)) = (
525 from.and_then(|v| v.as_str()),
526 to.and_then(|v| v.as_str()),
527 rel,
528 ) {
529 relationships.push(NewRelationship {
530 source: from_v.to_string(),
531 target: to_v.to_string(),
532 relation: rel_v.to_string(),
533 strength: r.get("strength").and_then(|v| v.as_f64()).unwrap_or(0.5),
534 description: None,
535 });
536 }
537 }
538 }
539
540 let urls: Vec<ExtractedUrl> = obj
541 .get("urls")
542 .and_then(|v| v.as_array())
543 .map(|arr| {
544 arr.iter()
545 .filter_map(|u| {
546 let url = u.get("url")?.as_str()?.to_string();
547 let start = u.get("start").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
548 let end = u
549 .get("end")
550 .and_then(|v| v.as_u64())
551 .unwrap_or(start as u64) as usize;
552 Some(ExtractedUrl { url, start, end })
553 })
554 .collect()
555 })
556 .unwrap_or_default();
557
558 let entities_ext: Vec<crate::extraction::ExtractedEntity> = entities
568 .into_iter()
569 .map(|e| crate::extraction::ExtractedEntity {
570 name: e.name,
571 entity_type: e.entity_type.as_str().to_string(),
572 start: 0,
573 end: 0,
574 })
575 .collect();
576
577 Ok(ExtractionResult {
578 entities: entities_ext,
579 urls,
580 elapsed_ms: 0,
581 })
582}
583
/// Prepares a per-process directory that can serve as `CODEX_HOME`.
///
/// Returns `None` when `HOME` is unset (or not valid Unicode) or when
/// `$HOME/.codex/auth.json` does not exist. Otherwise a directory named
/// `sqlite-graphrag-codex-home-<pid>` is created under the system temp
/// directory, `auth.json` is copied into it unless a copy is already there,
/// and the directory path is returned.
///
/// NOTE(review): directory-creation and copy failures are ignored, so the
/// returned directory may lack `auth.json`. An existing copy is never
/// refreshed. Confirm both are intended.
fn prepare_isolated_codex_home_spawn() -> Option<std::path::PathBuf> {
    let source_auth = std::env::var("HOME")
        .ok()
        .map(|home| std::path::Path::new(&home).join(".codex/auth.json"))
        .filter(|auth| auth.exists())?;

    let dir_name = format!("sqlite-graphrag-codex-home-{}", std::process::id());
    let isolated_home = std::env::temp_dir().join(dir_name);
    let _ = std::fs::create_dir_all(&isolated_home);

    let copied_auth = isolated_home.join("auth.json");
    if !copied_auth.exists() {
        let _ = std::fs::copy(&source_auth, &copied_auth);
    }
    Some(isolated_home)
}
599
/// Unit tests for JSONL parsing, model validation/suggestion and command
/// construction.
///
/// Every test that builds a codex command depends on `OPENAI_API_KEY`, so all
/// of them are serialized under the `env` key and set the variable to the
/// state they need before building.
#[cfg(test)]
mod tests {
    use super::*;

    /// A successful run: one agent message, real usage, then an empty usage
    /// object that must not overwrite the real one.
    const SAMPLE_JSONL: &str = r#"{"type":"thread.started","thread_id":"abc"}
{"type":"turn.started"}
{"type":"item.completed","item":{"type":"reasoning","text":"thinking"}}
{"type":"item.completed","item":{"type":"agent_message","text":"{\"entities\":[{\"name\":\"alpha\",\"type\":\"concept\"}],\"relationships\":[{\"source\":\"alpha\",\"target\":\"beta\",\"relation\":\"uses\",\"strength\":0.7}],\"extraction_method\":\"codex\",\"urls\":[]}"}}
{"type":"turn.completed","usage":{"input_tokens":120,"output_tokens":45}}
{"type":"turn.completed","usage":{}}
"#;

    #[test]
    fn parse_codex_jsonl_extracts_last_agent_message() {
        let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
        assert_eq!(result.extraction.entities.len(), 1);
        assert_eq!(result.extraction.entities[0].name, "alpha");
    }

    #[test]
    fn parse_codex_jsonl_collects_usage() {
        let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
        let usage = result.usage.expect("usage must be populated");
        assert_eq!(usage.input_tokens, 120);
        assert_eq!(usage.output_tokens, 45);
    }

    #[test]
    fn parse_codex_jsonl_detects_rate_limit() {
        let r = parse_codex_jsonl(
            "{\"type\":\"turn.failed\",\"error\":{\"message\":\"rate_limit: 429 too many\"}}\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{}\"}}",
        );
        assert!(matches!(r, Err(AppError::Validation(_))));
    }

    #[test]
    fn parse_codex_jsonl_handles_no_agent_message() {
        let r = parse_codex_jsonl("{\"type\":\"thread.started\"}");
        assert!(matches!(r, Err(AppError::Validation(_))));
    }

    #[test]
    fn parse_codex_jsonl_skips_malformed_lines() {
        let r = parse_codex_jsonl(
            "{not valid json\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{\\\"entities\\\":[],\\\"relationships\\\":[],\\\"extraction_method\\\":\\\"codex\\\"}\"}}",
        );
        assert!(r.is_ok(), "malformed lines must be skipped, got {r:?}");
    }

    #[test]
    fn validate_codex_model_accepts_known() {
        assert!(validate_codex_model(Some("gpt-5.5")).is_ok());
        assert!(validate_codex_model(Some("gpt-5.4")).is_ok());
        assert!(validate_codex_model(None).is_ok());
    }

    #[test]
    fn validate_codex_model_rejects_unknown() {
        let err = validate_codex_model(Some("gpt-4")).unwrap_err();
        let msg = format!("{err}");
        assert!(msg.contains("not supported"));
        assert!(msg.contains("gpt-5.5"));
    }

    #[test]
    fn list_codex_models_includes_all_static_whitelist() {
        let models = list_codex_models();
        for m in CODEX_PRO_OAUTH_MODELS {
            assert!(models.contains(&m.to_string()), "missing {m} in {models:?}");
        }
    }

    #[test]
    fn suggest_codex_model_substring_match() {
        let s = suggest_codex_model("gpt-5");
        assert!(s.is_some(), "must suggest a gpt-5.x model");
    }

    #[test]
    fn suggest_codex_model_fuzzy_match() {
        let s = suggest_codex_model("gpt5.5");
        assert!(s.is_some(), "fuzzy must suggest gpt-5.5 for 'gpt5.5'");
        assert_eq!(s.unwrap(), "gpt-5.5");
    }

    #[test]
    fn suggest_codex_model_unrelated_returns_none() {
        let s = suggest_codex_model("totally-unrelated-zzz");
        assert!(s.is_none());
    }

    // Serialized and cleared like the other command tests: the builder
    // returns a placeholder command whenever `OPENAI_API_KEY` is set, so this
    // test would otherwise race with the abort test below and fail on any
    // machine that has the key exported.
    #[test]
    #[serial_test::serial(env)]
    fn build_codex_command_includes_hardening_flags() {
        // SAFETY: every test that mutates the environment is serialized under
        // the `env` key, so no other env-mutating test runs concurrently.
        unsafe {
            std::env::remove_var("OPENAI_API_KEY");
        }
        let args = CodexSpawnArgs {
            binary: Path::new("/bin/true"),
            prompt: "p",
            json_schema: "{}",
            input_text: "i",
            model: Some("gpt-5.5"),
            timeout_secs: 60,
            schema_path: std::env::temp_dir().join("test-schema.json"),
        };
        let cmd = build_codex_command(&args);
        let collected: Vec<String> = cmd
            .get_args()
            .filter_map(|a| a.to_str().map(|s| s.to_string()))
            .collect();
        for required in &[
            "exec",
            "-c",
            "sandbox_mode='read-only'",
            "approval_policy='never'",
            "--json",
            "--output-schema",
            "--ephemeral",
            "--skip-git-repo-check",
            "--sandbox",
            "read-only",
            "--ignore-user-config",
            "--ignore-rules",
            "-m",
            "gpt-5.5",
            "-",
        ] {
            assert!(
                collected.iter().any(|a| a == required),
                "missing flag {required} in {collected:?}"
            );
        }
    }

    #[test]
    fn list_codex_models_dedupes_with_cache_file() {
        let models = list_codex_models();
        let unique: std::collections::HashSet<_> = models.iter().collect();
        assert_eq!(unique.len(), models.len(), "list_codex_models must dedupe");
    }

    #[test]
    #[serial_test::serial(env)]
    fn build_command_oauth_only_mandatory_flags() {
        // SAFETY: every test that mutates the environment is serialized under
        // the `env` key, so no other env-mutating test runs concurrently.
        unsafe {
            std::env::remove_var("OPENAI_API_KEY");
        }
        let schema = std::env::temp_dir().join("codex-test-schema.json");
        let _ = std::fs::remove_file(&schema);
        let args = CodexSpawnArgs {
            binary: std::path::Path::new("/usr/bin/false"),
            prompt: "p",
            json_schema: "{}",
            input_text: "i",
            model: Some("gpt-5.4-mini"),
            timeout_secs: 60,
            schema_path: schema.clone(),
        };
        let cmd = build_codex_command(&args);
        let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
        assert!(
            argv.contains(&"--ignore-user-config"),
            "must have --ignore-user-config (gaps.md:266)"
        );
        let ask_for_approval_present = argv.contains(&"--ask-for-approval");
        if !crate::extract::codex_compat::codex_supports_ask_for_approval() {
            assert!(
                !ask_for_approval_present,
                "codex 0.134+ must NOT include --ask-for-approval"
            );
        }
        assert!(
            argv.contains(&"--sandbox"),
            "must have --sandbox read-only (G31)"
        );
        assert!(argv.contains(&"--ephemeral"), "must have --ephemeral (G31)");
        assert!(
            argv.contains(&"--skip-git-repo-check"),
            "must have --skip-git-repo-check (G31)"
        );
        assert!(
            argv.contains(&"--ignore-rules"),
            "must have --ignore-rules (G31)"
        );
        assert!(
            argv.contains(&"-c") && argv.contains(&"sandbox_mode='read-only'"),
            "must have -c sandbox_mode='read-only' (v1.0.77, codex#18113)"
        );
        assert!(
            argv.contains(&"approval_policy='never'"),
            "must have -c approval_policy='never' (v1.0.77)"
        );
    }

    #[test]
    #[serial_test::serial(env)]
    fn build_command_aborts_when_openai_api_key_set() {
        // SAFETY: every test that mutates the environment is serialized under
        // the `env` key, so no other env-mutating test runs concurrently.
        unsafe {
            std::env::set_var("OPENAI_API_KEY", "sk-violation-test");
        }
        let schema = std::env::temp_dir().join("codex-test-schema-abort.json");
        let _ = std::fs::remove_file(&schema);
        let args = CodexSpawnArgs {
            binary: std::path::Path::new("/usr/bin/codex"),
            prompt: "p",
            json_schema: "{}",
            input_text: "i",
            model: Some("gpt-5.4-mini"),
            timeout_secs: 60,
            schema_path: schema.clone(),
        };
        let cmd = build_codex_command(&args);
        // Clear the variable before asserting so that a failing assertion
        // cannot leak it into tests that run afterwards.
        // SAFETY: same serialization guarantee as above.
        unsafe {
            std::env::remove_var("OPENAI_API_KEY");
        }
        let program = cmd.get_program().to_string_lossy().to_string();
        let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
        assert_eq!(
            program, "false",
            "when OPENAI_API_KEY is set, build_codex_command must abort"
        );
        assert!(
            argv.contains(&"--oauth-only-violation-openai-api-key-set"),
            "aborted command must carry violation marker"
        );
    }
}