1use crate::errors::AppError;
17use crate::extraction::{ExtractedUrl, ExtractionResult};
18use crate::storage::entities::{NewEntity, NewRelationship};
19use serde::{Deserialize, Serialize};
20use std::path::{Path, PathBuf};
21use std::process::{Command, Stdio};
22
23#[derive(Debug, Clone, Default, Deserialize, Serialize)]
25pub struct CodexUsage {
26 #[serde(default)]
27 pub input_tokens: u64,
28 #[serde(default)]
29 pub cached_input_tokens: u64,
30 #[serde(default)]
31 pub output_tokens: u64,
32 #[serde(default)]
33 pub reasoning_output_tokens: u64,
34}
35
/// Outcome of parsing one codex JSONL stream (see `parse_codex_jsonl`).
///
/// NOTE(review): `parse_codex_jsonl` returns `Err` whenever `turn_failed`,
/// `schema_error` or `rate_limited` is set, so in values it returns via `Ok`
/// those flags are `false` and `failed_message` is empty.
#[derive(Debug)]
pub struct CodexResult {
    /// Entities, relationships and URLs decoded from the last `agent_message`.
    pub extraction: ExtractionResult,
    /// Token usage from a `turn.completed` event that reported a positive
    /// input or output token count; `None` if no such event was seen.
    pub usage: Option<CodexUsage>,
    /// Set when a `turn.failed` message looked like a rate-limit error.
    pub rate_limited: bool,
    /// Set when an `error` event mentioned a schema problem.
    pub schema_error: bool,
    /// Set when any `turn.failed` event was seen.
    pub turn_failed: bool,
    /// Message taken from the most recent `turn.failed` error object.
    pub failed_message: String,
}
46
/// Inputs for `build_codex_command`.
#[allow(rustdoc::broken_intra_doc_links)]
pub struct CodexSpawnArgs<'a> {
    /// Program to execute (the codex CLI binary).
    pub binary: &'a Path,
    /// Instruction text. It is not placed on the command line by
    /// `build_codex_command`; presumably the caller writes it to the child's
    /// stdin — confirm against the caller.
    pub prompt: &'a str,
    /// JSON schema text; written to `schema_path` before the command is built.
    pub json_schema: &'a str,
    /// Text to analyse. Like `prompt`, it is not placed on the command line.
    pub input_text: &'a str,
    /// Optional model name, forwarded as `-m <model>` when present.
    pub model: Option<&'a str>,
    /// Timeout in seconds. Not read by `build_codex_command`; presumably
    /// enforced by whoever spawns the command — confirm against the caller.
    pub timeout_secs: u64,
    /// File the schema is written to and that is passed via `--output-schema`.
    pub schema_path: PathBuf,
}
61
62pub fn trusted_schema_path() -> Result<PathBuf, AppError> {
65 let cache = crate::paths::AppPaths::resolve(None)
66 .map(|p| p.models.parent().map(|m| m.to_path_buf()))
67 .ok()
68 .flatten()
69 .unwrap_or_else(std::env::temp_dir);
70 std::fs::create_dir_all(&cache).map_err(AppError::Io)?;
71 Ok(cache.join(format!("enrich-schema-{}.json", std::process::id())))
72}
73
/// Static allow-list of codex model names.
///
/// `validate_codex_model` rejects any name that is not in this list (its
/// error text refers to "ChatGPT Pro OAuth"), and `list_codex_models` seeds
/// its result from it.
pub const CODEX_PRO_OAUTH_MODELS: &[&str] = &[
    "codex-auto-review",
    "gpt-5.3-codex-spark",
    "gpt-5.4",
    "gpt-5.4-mini",
    "gpt-5.5",
];
87
88pub fn validate_codex_model(model: Option<&str>) -> Result<(), AppError> {
94 let Some(m) = model else {
95 return Ok(()); };
97 if CODEX_PRO_OAUTH_MODELS.contains(&m) {
98 Ok(())
99 } else {
100 Err(AppError::Validation(format!(
101 "--codex-model {m:?} is not supported with ChatGPT Pro OAuth. \
102 Accepted: {}",
103 CODEX_PRO_OAUTH_MODELS.join(", ")
104 )))
105 }
106}
107
108pub fn list_codex_models() -> Vec<String> {
116 use std::collections::BTreeSet;
117 let mut out: BTreeSet<String> = CODEX_PRO_OAUTH_MODELS
118 .iter()
119 .map(|s| s.to_string())
120 .collect();
121
122 if let Some(home) = std::env::var_os("HOME") {
123 let path = std::path::Path::new(&home)
124 .join(".codex")
125 .join("models_cache.json");
126 if let Ok(content) = std::fs::read_to_string(&path) {
127 if let Ok(value) = serde_json::from_str::<serde_json::Value>(&content) {
131 if let Some(obj) = value.as_object() {
132 for key in obj.keys() {
133 out.insert(key.clone());
134 }
135 } else if let Some(arr) = value.as_array() {
136 for v in arr {
137 if let Some(s) = v.as_str() {
138 out.insert(s.to_string());
139 }
140 }
141 }
142 }
143 }
144 }
145 out.into_iter().collect()
146}
147
148pub fn suggest_codex_model(query: &str) -> Option<String> {
154 let query_lc = query.to_ascii_lowercase();
155 let models = list_codex_model_lc();
156
157 for m in &models {
159 if m.contains(&query_lc) {
160 return Some(m.clone());
161 }
162 }
163
164 let max_distance = (query.len() / 3).max(2);
166 let mut best: Option<(usize, String)> = None;
167 for m in &models {
168 let d = levenshtein(query_lc.as_str(), m.as_str());
169 if d <= max_distance && best.as_ref().is_none_or(|(bd, _)| d < *bd) {
170 best = Some((d, m.clone()));
171 }
172 }
173 best.map(|(_, m)| m)
174}
175
176fn list_codex_model_lc() -> Vec<String> {
177 list_codex_models()
178 .into_iter()
179 .map(|s| s.to_ascii_lowercase())
180 .collect()
181}
182
/// Computes the Levenshtein edit distance between `a` and `b`, counted in
/// `char`s (insertions, deletions and substitutions each cost 1).
fn levenshtein(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    // Distance to or from an empty string is the other string's length.
    match (source.len(), target.len()) {
        (0, n) => return n,
        (m, 0) => return m,
        _ => {}
    }

    // One DP row, updated in place; `diagonal` carries the value from the
    // previous row that the in-place update would otherwise overwrite.
    let mut row: Vec<usize> = (0..=target.len()).collect();
    for (i, sc) in source.iter().enumerate() {
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, tc) in target.iter().enumerate() {
            let above = row[j + 1];
            let substitution = diagonal + usize::from(sc != tc);
            row[j + 1] = (row[j] + 1).min(above + 1).min(substitution);
            diagonal = above;
        }
    }
    row[target.len()]
}
204
205pub fn build_codex_command(args: &CodexSpawnArgs<'_>) -> Command {
232 let full_prompt = format!("{}\n\n{}", args.prompt, args.input_text);
233
234 if let Ok(_key) = std::env::var("OPENAI_API_KEY") {
238 let mut cmd = Command::new("false");
239 cmd.env_clear();
240 cmd.env("PATH", "/nonexistent");
241 cmd.arg("--oauth-only-violation-openai-api-key-set");
242 return cmd;
243 }
244
245 std::fs::write(&args.schema_path, args.json_schema).ok();
248
249 let mut cmd = Command::new(args.binary);
250 cmd.env_clear();
251 for var in &[
254 "PATH",
255 "HOME",
256 "USER",
257 "SHELL",
258 "TERM",
259 "LANG",
260 "XDG_CONFIG_HOME",
261 "XDG_DATA_HOME",
262 "XDG_RUNTIME_DIR",
263 "XDG_CACHE_HOME",
264 "CODEX_ACCESS_TOKEN",
265 "CODEX_HOME",
266 "TMPDIR",
267 "TMP",
268 "TEMP",
269 "DYLD_FALLBACK_LIBRARY_PATH",
270 ] {
271 if let Ok(val) = std::env::var(var) {
272 cmd.env(var, val);
273 }
274 }
275
276 #[cfg(windows)]
277 for var in &[
278 "LOCALAPPDATA",
279 "APPDATA",
280 "USERPROFILE",
281 "SystemRoot",
282 "COMSPEC",
283 "PATHEXT",
284 ] {
285 if let Ok(val) = std::env::var(var) {
286 cmd.env(var, val);
287 }
288 }
289
290 cmd.arg("exec")
291 .arg("-c")
292 .arg("mcp_servers='{}'")
293 .arg("--json")
294 .arg("--output-schema")
295 .arg(&args.schema_path)
296 .arg("--ephemeral")
297 .arg("--skip-git-repo-check")
298 .arg("--sandbox")
299 .arg("read-only")
300 .arg("--ignore-user-config")
301 .arg("--ignore-rules")
302 .arg("--ask-for-approval")
303 .arg("never");
304
305 if let Some(m) = args.model {
306 cmd.arg("-m").arg(m);
307 }
308
309 cmd.arg("-");
311
312 cmd.stdin(Stdio::piped())
313 .stdout(Stdio::piped())
314 .stderr(Stdio::piped());
315 let _ = full_prompt; cmd
319}
320
321pub fn parse_codex_jsonl(stdout: &str) -> Result<CodexResult, AppError> {
334 let mut last_agent_text: Option<String> = None;
335 let mut usage: Option<CodexUsage> = None;
336 let mut rate_limited = false;
337 let mut schema_error = false;
338 let mut turn_failed = false;
339 let mut failed_message = String::new();
340
341 for line in stdout.lines() {
342 let line = line.trim();
343 if line.is_empty() {
344 continue;
345 }
346
347 let event: serde_json::Value = match serde_json::from_str(line) {
348 Ok(v) => v,
349 Err(_) => {
350 tracing::warn!(target: "codex_spawn", line, "skipping malformed JSONL line");
351 continue;
352 }
353 };
354
355 let event_type = match event.get("type").and_then(|t| t.as_str()) {
356 Some(t) => t,
357 None => continue,
358 };
359
360 match event_type {
361 "item.completed" => {
362 if let Some(item) = event.get("item") {
363 if item.get("type").and_then(|t| t.as_str()) == Some("agent_message") {
364 if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
365 last_agent_text = Some(text.to_string());
366 }
367 }
368 }
369 }
370 "turn.completed" => {
371 if let Some(u) = event.get("usage") {
372 let is_populated = u
377 .get("input_tokens")
378 .and_then(|v| v.as_u64())
379 .map(|n| n > 0)
380 .unwrap_or(false)
381 || u.get("output_tokens")
382 .and_then(|v| v.as_u64())
383 .map(|n| n > 0)
384 .unwrap_or(false);
385 if is_populated {
386 if let Ok(parsed) = serde_json::from_value::<CodexUsage>(u.clone()) {
387 usage = Some(parsed);
388 }
389 }
390 }
391 }
392 "turn.failed" => {
393 turn_failed = true;
394 if let Some(err) = event.get("error") {
395 let msg = err
396 .get("message")
397 .and_then(|m| m.as_str())
398 .unwrap_or("unknown error");
399 failed_message = msg.to_string();
400 if msg.contains("rate_limit")
401 || msg.contains("429")
402 || msg.contains("Too Many Requests")
403 {
404 rate_limited = true;
405 }
406 }
407 }
408 "error" => {
409 if let Some(msg) = event.get("message").and_then(|m| m.as_str()) {
410 if msg.contains("invalid_json_schema") || msg.contains("schema") {
411 schema_error = true;
412 }
413 }
414 }
415 _ => {}
416 }
417 }
418
419 let text = last_agent_text.ok_or_else(|| {
420 AppError::Validation(format!(
421 "no agent_message in codex JSONL output (rate_limited={rate_limited}, schema_error={schema_error}, turn_failed={turn_failed})"
422 ))
423 })?;
424
425 if turn_failed {
426 return Err(AppError::Validation(format!(
427 "codex turn failed: {failed_message}"
428 )));
429 }
430 if schema_error {
431 return Err(AppError::Validation(
432 "codex reported invalid_json_schema; check the --output-schema file".to_string(),
433 ));
434 }
435 if rate_limited {
436 return Err(AppError::Validation(format!(
437 "codex rate-limited: {failed_message}"
438 )));
439 }
440
441 let extraction = parse_extraction_text(&text)?;
442 Ok(CodexResult {
443 extraction,
444 usage,
445 rate_limited,
446 schema_error,
447 turn_failed,
448 failed_message,
449 })
450}
451
452pub fn parse_extraction_text(text: &str) -> Result<ExtractionResult, AppError> {
457 let value: serde_json::Value = serde_json::from_str(text).map_err(|e| {
458 AppError::Validation(format!("failed to parse codex agent_message as JSON: {e}"))
459 })?;
460 let obj = value.as_object().ok_or_else(|| {
461 AppError::Validation("codex agent_message is not a JSON object".to_string())
462 })?;
463
464 let mut entities: Vec<NewEntity> = Vec::new();
465 if let Some(arr) = obj.get("entities").and_then(|v| v.as_array()) {
466 for e in arr {
467 if let Some(name) = e.get("name").and_then(|v| v.as_str()) {
468 let entity_type_str = e
471 .get("type")
472 .or_else(|| e.get("entity_type"))
473 .and_then(|v| v.as_str())
474 .unwrap_or("concept");
475 let entity_type = serde_json::from_value::<crate::entity_type::EntityType>(
476 serde_json::Value::String(entity_type_str.to_string()),
477 )
478 .unwrap_or(crate::entity_type::EntityType::Concept);
479 entities.push(NewEntity {
480 name: name.to_string(),
481 entity_type,
482 description: None,
483 });
484 }
485 }
486 }
487
488 let mut relationships: Vec<NewRelationship> = Vec::new();
489 if let Some(arr) = obj.get("relationships").and_then(|v| v.as_array()) {
490 for r in arr {
491 let from = r.get("source").or_else(|| r.get("from"));
492 let to = r.get("target").or_else(|| r.get("to"));
493 let rel = r.get("relation").and_then(|v| v.as_str());
494 if let (Some(from_v), Some(to_v), Some(rel_v)) = (
495 from.and_then(|v| v.as_str()),
496 to.and_then(|v| v.as_str()),
497 rel,
498 ) {
499 relationships.push(NewRelationship {
500 source: from_v.to_string(),
501 target: to_v.to_string(),
502 relation: rel_v.to_string(),
503 strength: r.get("strength").and_then(|v| v.as_f64()).unwrap_or(0.5),
504 description: None,
505 });
506 }
507 }
508 }
509
510 let urls: Vec<ExtractedUrl> = obj
511 .get("urls")
512 .and_then(|v| v.as_array())
513 .map(|arr| {
514 arr.iter()
515 .filter_map(|u| {
516 Some(ExtractedUrl {
517 url: u.get("url")?.as_str()?.to_string(),
518 offset: u.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize,
519 })
520 })
521 .collect()
522 })
523 .unwrap_or_default();
524
525 Ok(ExtractionResult {
526 entities,
527 relationships,
528 relationships_truncated: obj
529 .get("relationships_truncated")
530 .and_then(|v| v.as_bool())
531 .unwrap_or(false),
532 extraction_method: obj
533 .get("extraction_method")
534 .and_then(|v| v.as_str())
535 .unwrap_or("codex")
536 .to_string(),
537 urls,
538 })
539}
540
/// Unit tests for command construction, model validation/suggestion and JSONL
/// parsing.
///
/// Every test that calls `build_codex_command` depends on `OPENAI_API_KEY`,
/// so all of them are serialized under the `env` key and put the variable
/// into a known state before building the command.
#[cfg(test)]
mod tests {
    use super::*;

    const SAMPLE_JSONL: &str = r#"{"type":"thread.started","thread_id":"abc"}
{"type":"turn.started"}
{"type":"item.completed","item":{"type":"reasoning","text":"thinking"}}
{"type":"item.completed","item":{"type":"agent_message","text":"{\"entities\":[{\"name\":\"alpha\",\"type\":\"concept\"}],\"relationships\":[{\"source\":\"alpha\",\"target\":\"beta\",\"relation\":\"uses\",\"strength\":0.7}],\"extraction_method\":\"codex\",\"urls\":[]}"}}
{"type":"turn.completed","usage":{"input_tokens":120,"output_tokens":45}}
{"type":"turn.completed","usage":{}}
"#;

    #[test]
    fn parse_codex_jsonl_extracts_last_agent_message() {
        let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
        assert_eq!(result.extraction.entities.len(), 1);
        assert_eq!(result.extraction.entities[0].name, "alpha");
        assert_eq!(result.extraction.relationships.len(), 1);
        assert_eq!(result.extraction.relationships[0].relation, "uses");
        assert!((result.extraction.relationships[0].strength - 0.7).abs() < 1e-6);
    }

    #[test]
    fn parse_codex_jsonl_collects_usage() {
        // The trailing empty `usage` object must not overwrite the real one.
        let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
        let usage = result.usage.expect("usage must be populated");
        assert_eq!(usage.input_tokens, 120);
        assert_eq!(usage.output_tokens, 45);
    }

    #[test]
    fn parse_codex_jsonl_detects_rate_limit() {
        let r = parse_codex_jsonl(
            "{\"type\":\"turn.failed\",\"error\":{\"message\":\"rate_limit: 429 too many\"}}\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{}\"}}",
        );
        assert!(matches!(r, Err(AppError::Validation(_))));
    }

    #[test]
    fn parse_codex_jsonl_handles_no_agent_message() {
        let r = parse_codex_jsonl("{\"type\":\"thread.started\"}");
        assert!(matches!(r, Err(AppError::Validation(_))));
    }

    #[test]
    fn parse_codex_jsonl_skips_malformed_lines() {
        let r = parse_codex_jsonl(
            "{not valid json\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{\\\"entities\\\":[],\\\"relationships\\\":[],\\\"extraction_method\\\":\\\"codex\\\"}\"}}",
        );
        assert!(r.is_ok(), "malformed lines must be skipped, got {r:?}");
    }

    #[test]
    fn validate_codex_model_accepts_known() {
        assert!(validate_codex_model(Some("gpt-5.5")).is_ok());
        assert!(validate_codex_model(Some("gpt-5.4")).is_ok());
        assert!(validate_codex_model(None).is_ok());
    }

    #[test]
    fn validate_codex_model_rejects_unknown() {
        let err = validate_codex_model(Some("gpt-4")).unwrap_err();
        let msg = format!("{err}");
        assert!(msg.contains("not supported"));
        assert!(msg.contains("gpt-5.5"));
    }

    #[test]
    fn list_codex_models_includes_all_static_whitelist() {
        let models = list_codex_models();
        for m in CODEX_PRO_OAUTH_MODELS {
            assert!(models.contains(&m.to_string()), "missing {m} in {models:?}");
        }
    }

    #[test]
    fn suggest_codex_model_substring_match() {
        let s = suggest_codex_model("gpt-5");
        assert!(s.is_some(), "must suggest a gpt-5.x model");
    }

    #[test]
    fn suggest_codex_model_fuzzy_match() {
        let s = suggest_codex_model("gpt5.5");
        assert!(s.is_some(), "fuzzy must suggest gpt-5.5 for 'gpt5.5'");
        assert_eq!(s.unwrap(), "gpt-5.5");
    }

    #[test]
    fn suggest_codex_model_unrelated_returns_none() {
        let s = suggest_codex_model("totally-unrelated-zzz");
        assert!(s.is_none());
    }

    #[test]
    #[serial_test::serial(env)]
    fn build_codex_command_includes_hardening_flags() {
        // SAFETY: every test that reads or writes OPENAI_API_KEY is
        // serialized under the `env` key, so none of them runs concurrently
        // with this mutation.
        unsafe {
            std::env::remove_var("OPENAI_API_KEY");
        }
        let args = CodexSpawnArgs {
            binary: Path::new("/bin/true"),
            prompt: "p",
            json_schema: "{}",
            input_text: "i",
            model: Some("gpt-5.5"),
            timeout_secs: 60,
            schema_path: std::env::temp_dir().join("test-schema.json"),
        };
        let cmd = build_codex_command(&args);
        let collected: Vec<String> = cmd
            .get_args()
            .filter_map(|a| a.to_str().map(|s| s.to_string()))
            .collect();
        for required in &[
            "exec",
            "--json",
            "--output-schema",
            "--ephemeral",
            "--skip-git-repo-check",
            "--sandbox",
            "read-only",
            "--ignore-user-config",
            "--ignore-rules",
            "-m",
            "gpt-5.5",
            "-",
        ] {
            assert!(
                collected.iter().any(|a| a == required),
                "missing flag {required} in {collected:?}"
            );
        }
    }

    #[test]
    fn list_codex_models_dedupes_with_cache_file() {
        let models = list_codex_models();
        let unique: std::collections::HashSet<_> = models.iter().collect();
        assert_eq!(unique.len(), models.len(), "list_codex_models must dedupe");
    }

    #[test]
    #[serial_test::serial(env)]
    fn build_command_oauth_only_mandatory_flags() {
        // SAFETY: serialized with the other OPENAI_API_KEY tests via the
        // `env` key.
        unsafe {
            std::env::remove_var("OPENAI_API_KEY");
        }
        let schema = std::env::temp_dir().join("codex-test-schema.json");
        let _ = std::fs::remove_file(&schema);
        let args = CodexSpawnArgs {
            binary: std::path::Path::new("/usr/bin/false"),
            prompt: "p",
            json_schema: "{}",
            input_text: "i",
            model: Some("gpt-5.4-mini"),
            timeout_secs: 60,
            schema_path: schema.clone(),
        };
        let cmd = build_codex_command(&args);
        let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
        assert!(argv.contains(&"-c"), "must have -c (gaps.md:234)");
        assert!(
            argv.contains(&"mcp_servers='{}'"),
            "must have mcp_servers override (gaps.md:234)"
        );
        assert!(
            argv.contains(&"--ignore-user-config"),
            "must have --ignore-user-config (gaps.md:266)"
        );
        assert!(
            argv.contains(&"--ask-for-approval"),
            "must have --ask-for-approval never (gaps.md:237)"
        );
        assert!(
            argv.contains(&"--sandbox"),
            "must have --sandbox read-only (G31)"
        );
        assert!(argv.contains(&"--ephemeral"), "must have --ephemeral (G31)");
        assert!(
            argv.contains(&"--skip-git-repo-check"),
            "must have --skip-git-repo-check (G31)"
        );
        assert!(
            argv.contains(&"--ignore-rules"),
            "must have --ignore-rules (G31)"
        );
    }

    #[test]
    #[serial_test::serial(env)]
    fn build_command_aborts_when_openai_api_key_set() {
        // SAFETY: serialized with the other OPENAI_API_KEY tests via the
        // `env` key.
        unsafe {
            std::env::set_var("OPENAI_API_KEY", "sk-violation-test");
        }
        let schema = std::env::temp_dir().join("codex-test-schema-abort.json");
        let _ = std::fs::remove_file(&schema);
        let args = CodexSpawnArgs {
            binary: std::path::Path::new("/usr/bin/codex"),
            prompt: "p",
            json_schema: "{}",
            input_text: "i",
            model: Some("gpt-5.4-mini"),
            timeout_secs: 60,
            schema_path: schema.clone(),
        };
        let cmd = build_codex_command(&args);
        let program = cmd.get_program().to_string_lossy().to_string();
        let argv: Vec<String> = cmd
            .get_args()
            .filter_map(|a| a.to_str().map(|s| s.to_string()))
            .collect();
        // Restore the environment before asserting, so a failing assertion
        // cannot leak the variable into later tests.
        // SAFETY: same serialization argument as above.
        unsafe {
            std::env::remove_var("OPENAI_API_KEY");
        }
        assert_eq!(
            program, "false",
            "when OPENAI_API_KEY is set, build_codex_command must abort"
        );
        assert!(
            argv.iter()
                .any(|a| a == "--oauth-only-violation-openai-api-key-set"),
            "aborted command must carry violation marker"
        );
    }
}