Skip to main content

agent_orchestrator/
json_extract.rs

1use anyhow::{Context, Result};
2use serde::Deserialize;
3use serde_json::Value;
4
/// Repair unquoted JSON by adding quotes around bare keys and string values.
///
/// Handles LLM output like `{id: docs/qa/foo.md, count: 42, ok: true}` and
/// converts it to valid JSON.
///
/// Behavior:
/// - Quoted string literals are copied through verbatim (including any
///   non-ASCII text), so already-quoted keys and values are never altered.
/// - A bare key runs up to the next `:`, structural character, quote or ASCII
///   whitespace and is emitted as a JSON string.
/// - A bare value runs up to the next `,`, `}` or `]`, is trimmed, and is kept
///   bare only if it is `true`, `false`, `null` or a number in JSON grammar;
///   anything else (including `inf`, `NaN`, `.5`, `007`) is emitted as a string.
/// - Bare tokens are treated as literal text: `"`, `\` and control characters
///   inside them are escaped so the output stays well-formed.
/// - Once the outermost `{`/`[` is closed, the remainder of the input is
///   appended untouched so trailing prose is not rewritten.
///
/// The scan always advances by at least one byte per iteration, so it
/// terminates on arbitrary input.
pub fn repair_unquoted_json(input: &str) -> String {
    #[derive(Clone, Copy, PartialEq)]
    enum Context {
        Object,
        Array,
    }

    #[derive(Clone, Copy, PartialEq)]
    enum Expecting {
        Key,
        Value,
        ArrayElement,
    }

    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut out = String::with_capacity(len + 64);
    let mut context_stack: Vec<Context> = Vec::new();
    let mut expecting = Expecting::Value; // top-level
    let mut ever_opened = false; // whether we ever entered a structure
    let mut i = 0;

    // Invariant: `i` is always on a UTF-8 char boundary at the top of the loop,
    // because every branch stops either on an ASCII byte or at end of input.
    while i < len {
        let b = bytes[i];
        match b {
            b'"' => {
                // Copy the whole literal as a slice so multi-byte characters
                // inside it are preserved exactly.
                let end = quoted_string_end(bytes, i);
                out.push_str(&input[i..end]);
                i = end;
            }
            b'{' => {
                out.push('{');
                context_stack.push(Context::Object);
                expecting = Expecting::Key;
                ever_opened = true;
                i += 1;
            }
            b'[' => {
                out.push('[');
                context_stack.push(Context::Array);
                expecting = Expecting::ArrayElement;
                ever_opened = true;
                i += 1;
            }
            b'}' | b']' => {
                out.push(b as char);
                context_stack.pop();
                if ever_opened && context_stack.is_empty() {
                    // Top-level structure closed; stop so trailing text is not corrupted.
                    out.push_str(&input[i + 1..]);
                    return out;
                }
                i += 1;
            }
            b':' => {
                out.push(':');
                expecting = Expecting::Value;
                i += 1;
            }
            b',' => {
                out.push(',');
                expecting = match context_stack.last() {
                    Some(Context::Object) => Expecting::Key,
                    Some(Context::Array) => Expecting::ArrayElement,
                    None => Expecting::Value,
                };
                i += 1;
            }
            b if b.is_ascii_whitespace() => {
                out.push(b as char);
                i += 1;
            }
            _ => {
                // Bare token. The first byte is never a terminator (each
                // terminator has its own arm above), so both scans below
                // consume at least one byte.
                let start = i;
                if expecting == Expecting::Key {
                    while i < len && !is_key_terminator(bytes[i]) {
                        i += 1;
                    }
                    push_json_string(&mut out, &input[start..i]);
                } else {
                    // Value or ArrayElement: accumulate until , } ] or end.
                    while i < len && !matches!(bytes[i], b',' | b'}' | b']') {
                        i += 1;
                    }
                    let token = input[start..i].trim();
                    let is_literal = token == "true" || token == "false" || token == "null";
                    if is_literal || is_json_number(token) {
                        out.push_str(token);
                    } else {
                        push_json_string(&mut out, token);
                    }
                }
            }
        }
    }

    out
}

/// Returns the index one past the closing quote of the string literal that
/// opens at `bytes[open]`, or `bytes.len()` if the literal is unterminated.
fn quoted_string_end(bytes: &[u8], open: usize) -> usize {
    let mut j = open + 1;
    while j < bytes.len() {
        match bytes[j] {
            // Skip the escaped byte so `\"` does not end the literal.
            b'\\' if j + 1 < bytes.len() => j += 2,
            b'"' => return j + 1,
            _ => j += 1,
        }
    }
    bytes.len()
}

/// Whether `b` ends a bare object key.
fn is_key_terminator(b: u8) -> bool {
    matches!(b, b':' | b',' | b'{' | b'}' | b'[' | b']' | b'"') || b.is_ascii_whitespace()
}

/// Appends `raw` to `out` as a JSON string literal, escaping quotes,
/// backslashes and control characters.
fn push_json_string(out: &mut String, raw: &str) {
    out.push('"');
    for ch in raw.chars() {
        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
            c => out.push(c),
        }
    }
    out.push('"');
}

/// Whether `token` is a number in JSON grammar
/// (`-? (0 | [1-9][0-9]*) (. [0-9]+)? ([eE] [+-]? [0-9]+)?`) that also fits a
/// finite `f64`. Rust's `f64` parser alone is too lenient: it accepts `inf`,
/// `NaN`, `.5`, `+1` and leading zeros, none of which are valid JSON.
fn is_json_number(token: &str) -> bool {
    let bytes = token.as_bytes();
    let skip_digits = |mut j: usize| {
        while bytes.get(j).map_or(false, u8::is_ascii_digit) {
            j += 1;
        }
        j
    };

    let mut i = 0;
    if bytes.first().copied() == Some(b'-') {
        i += 1;
    }
    match bytes.get(i).copied() {
        Some(b'0') => i += 1,
        Some(b'1'..=b'9') => i = skip_digits(i),
        _ => return false,
    }
    if bytes.get(i).copied() == Some(b'.') {
        let frac_end = skip_digits(i + 1);
        if frac_end == i + 1 {
            return false;
        }
        i = frac_end;
    }
    if matches!(bytes.get(i).copied(), Some(b'e') | Some(b'E')) {
        i += 1;
        if matches!(bytes.get(i).copied(), Some(b'+') | Some(b'-')) {
            i += 1;
        }
        let exp_end = skip_digits(i);
        if exp_end == i {
            return false;
        }
        i = exp_end;
    }

    i == bytes.len() && token.parse::<f64>().map_or(false, f64::is_finite)
}
146
147/// Extract a JSON array from a JSON string using a simple path expression.
148///
149/// Supports paths like `$.field_name` or `$.field.nested`.
150/// Returns the array found at the given path.
151///
152/// Resilient to mixed-text input (e.g. LLM agent output with natural language
153/// before/after JSON). Tries in order:
154/// 1. Parse the entire string as JSON
155/// 2. Extract from a fenced code block (```json ... ```)
156/// 3. Scan for the first `{` or `[` and try parsing from there
157pub fn extract_json_array(json_str: &str, path: &str) -> Result<Vec<Value>> {
158    // 1. Try parsing the whole string as JSON
159    if let Ok(root) = serde_json::from_str::<Value>(json_str) {
160        let target = resolve_path(&root, path)?;
161        return match target {
162            Value::Array(arr) => Ok(arr.clone()),
163            _ => anyhow::bail!("path '{}' does not point to an array", path),
164        };
165    }
166
167    // 2. Try extracting from a fenced code block (```json ... ``` or ``` ... ```)
168    if let Some(json_block) = extract_fenced_json(json_str) {
169        if let Ok(root) = serde_json::from_str::<Value>(&json_block) {
170            if let Ok(target) = resolve_path(&root, path) {
171                return match target {
172                    Value::Array(arr) => Ok(arr.clone()),
173                    _ => anyhow::bail!("path '{}' does not point to an array", path),
174                };
175            }
176        }
177    }
178
179    // 2.5 Try repairing unquoted JSON
180    let repaired = repair_unquoted_json(json_str);
181    if repaired != json_str {
182        if let Ok(root) = serde_json::from_str::<Value>(&repaired) {
183            if let Ok(target) = resolve_path(&root, path) {
184                return match target {
185                    Value::Array(arr) => Ok(arr.clone()),
186                    _ => anyhow::bail!("path '{}' does not point to an array", path),
187                };
188            }
189        }
190    }
191
192    // 3. Scan for JSON objects/arrays starting at each `{` or `[`
193    if let Some(arr) = scan_for_json_with_path(json_str, path) {
194        return Ok(arr);
195    }
196
197    anyhow::bail!("no valid JSON containing path '{}' found in text", path)
198}
199
/// Return the trimmed body of a markdown fenced code block found in `text`.
///
/// Opening fences are tried in priority order: a `json`-tagged fence (LF, then
/// CRLF line ending) before an untagged one (LF, then CRLF). For each kind
/// only its first occurrence is considered; if that occurrence has no closing
/// fence, the next kind is tried. Returns `None` when no complete block exists.
fn extract_fenced_json(text: &str) -> Option<String> {
    const OPENING_FENCES: [&str; 4] = ["```json\n", "```json\r\n", "```\n", "```\r\n"];

    OPENING_FENCES.iter().find_map(|opening| {
        let (_, after_opening) = text.split_once(*opening)?;
        let (body, _) = after_opening.split_once("```")?;
        Some(body.trim().to_string())
    })
}
214
215/// Scan text for JSON objects starting at each `{` or `[`, try to parse and resolve path.
216/// Uses `serde_json::Deserializer::from_str` to parse a single value from a prefix,
217/// allowing trailing text after the JSON.
218fn scan_for_json_with_path(text: &str, path: &str) -> Option<Vec<Value>> {
219    for (i, ch) in text.char_indices() {
220        if ch != '{' && ch != '[' {
221            continue;
222        }
223        let slice = &text[i..];
224        let mut de = serde_json::Deserializer::from_str(slice);
225        if let Ok(root) = <Value as Deserialize>::deserialize(&mut de) {
226            if let Ok(Value::Array(arr)) = resolve_path(&root, path) {
227                return Some(arr.clone());
228            }
229        }
230        // Fallback: try repairing unquoted JSON in this slice
231        let repaired = repair_unquoted_json(slice);
232        if repaired != slice {
233            let mut de = serde_json::Deserializer::from_str(&repaired);
234            if let Ok(root) = <Value as Deserialize>::deserialize(&mut de) {
235                if let Ok(Value::Array(arr)) = resolve_path(&root, path) {
236                    return Some(arr.clone());
237                }
238            }
239        }
240    }
241    None
242}
243
244/// Extract a single field value from a JSON Value using a simple dot-path.
245///
246/// Supports paths like `$.field_name` or `$.field.nested`.
247/// Returns the string representation of the value, or None if not found.
248pub fn extract_field(value: &Value, path: &str) -> Option<String> {
249    let resolved = resolve_path(value, path).ok()?;
250    match resolved {
251        Value::String(s) => Some(s.clone()),
252        Value::Null => None,
253        other => Some(other.to_string()),
254    }
255}
256
257/// Extract the `result` field from the last `{"type":"result",...}` line in stream-json JSONL.
258pub fn extract_stream_json_result(content: &str) -> Option<String> {
259    for line in content.lines().rev() {
260        let trimmed = line.trim();
261        if trimmed.is_empty() {
262            continue;
263        }
264        if trimmed.contains("\"type\":\"result\"") || trimmed.contains("\"type\": \"result\"") {
265            if let Ok(parsed) = serde_json::from_str::<Value>(trimmed) {
266                if let Some(result) = parsed.get("result").and_then(|v| v.as_str()) {
267                    return Some(result.to_string());
268                }
269            }
270            if let Some(extracted) = extract_result_field_raw(trimmed) {
271                return Some(extracted);
272            }
273        }
274    }
275    None
276}
277
278fn resolve_path<'a>(root: &'a Value, path: &str) -> Result<&'a Value> {
279    if path == "$" {
280        return Ok(root);
281    }
282    let path = path.strip_prefix("$.").unwrap_or(path);
283    let mut current = root;
284    for segment in path.split('.') {
285        if segment.is_empty() {
286            continue;
287        }
288        current = current
289            .get(segment)
290            .with_context(|| format!("field '{}' not found", segment))?;
291    }
292    Ok(current)
293}
294
/// Best-effort extraction of the `"result"` string value from a line that may
/// not be strictly valid JSON (for example because another field was redacted).
///
/// Locates the first `"result"` key that is followed by `:` and an opening
/// quote (whitespace around the colon is allowed), then decodes the string
/// literal up to its closing quote:
/// - the standard JSON escapes `\" \\ \/ \b \f \n \r \t` are decoded;
/// - `\uXXXX` is decoded, including UTF-16 surrogate pairs; an unpaired
///   surrogate becomes U+FFFD;
/// - a malformed `\u` escape or an unknown escape is kept literally;
/// - all other characters, including non-ASCII text, are copied unchanged.
///
/// Returns `None` if no such key exists or the string literal is unterminated.
fn extract_result_field_raw(line: &str) -> Option<String> {
    let value_start = find_result_value_start(line)?;
    // Iterate by `char`, not by byte, so multi-byte UTF-8 sequences survive intact.
    let mut chars = line[value_start..].chars();
    let mut result = String::new();

    while let Some(ch) = chars.next() {
        match ch {
            '"' => return Some(result),
            '\\' => match chars.next() {
                Some('"') => result.push('"'),
                Some('\\') => result.push('\\'),
                Some('/') => result.push('/'),
                Some('n') => result.push('\n'),
                Some('r') => result.push('\r'),
                Some('t') => result.push('\t'),
                Some('b') => result.push('\u{0008}'),
                Some('f') => result.push('\u{000C}'),
                Some('u') => match decode_unicode_escape(chars.as_str()) {
                    Some((decoded, consumed)) => {
                        result.push(decoded);
                        // `consumed` covers ASCII bytes only, so this stays on a char boundary.
                        chars = chars.as_str()[consumed..].chars();
                    }
                    None => result.push_str("\\u"),
                },
                Some(other) => {
                    result.push('\\');
                    result.push(other);
                }
                // A trailing backslash means the literal never closes.
                None => return None,
            },
            other => result.push(other),
        }
    }
    None
}

/// Byte offset just past the opening quote of the `"result"` string value, if any.
fn find_result_value_start(line: &str) -> Option<usize> {
    const KEY: &str = "\"result\"";

    let mut search_from = 0;
    while let Some(found) = line[search_from..].find(KEY) {
        let after_key = search_from + found + KEY.len();
        // `"result"` also appears as the *value* of `"type"`; only accept an
        // occurrence that is used as a key with a string value.
        if let Some(after_colon) = line[after_key..].trim_start().strip_prefix(':') {
            let value = after_colon.trim_start();
            if value.starts_with('"') {
                // `value` is a suffix of `line`, so its offset is the length difference.
                return Some(line.len() - value.len() + 1);
            }
        }
        search_from = after_key;
    }
    None
}

/// Decodes the payload of a `\u` escape at the start of `rest` (the text right
/// after `\u`). Returns the decoded character and the number of bytes of
/// `rest` consumed: 4, or 10 when a following `\uXXXX` low surrogate completes
/// a pair. Returns `None` if `rest` does not start with four hex digits.
fn decode_unicode_escape(rest: &str) -> Option<(char, usize)> {
    let first = parse_hex4(rest)?;

    if (0xD800..=0xDBFF).contains(&first) {
        let low = rest
            .get(4..)
            .and_then(|tail| tail.strip_prefix("\\u"))
            .and_then(parse_hex4)
            .filter(|unit| (0xDC00..=0xDFFF).contains(unit));
        if let Some(low) = low {
            let code = 0x10000 + ((first - 0xD800) << 10) + (low - 0xDC00);
            return Some((char::from_u32(code).unwrap_or('\u{FFFD}'), 10));
        }
    }

    // `from_u32` rejects lone surrogates; substitute the replacement character.
    Some((char::from_u32(first).unwrap_or('\u{FFFD}'), 4))
}

/// Parses exactly four leading ASCII hex digits of `text`.
fn parse_hex4(text: &str) -> Option<u32> {
    // `get` returns `None` (instead of panicking) when the text is too short
    // or byte 4 is not a char boundary.
    let digits = text.get(..4)?;
    if digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        u32::from_str_radix(digits, 16).ok()
    } else {
        None
    }
}
339
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Runs the repair pass on `raw` and parses the outcome, failing the test
    /// if the repaired text is not valid JSON.
    fn repair_and_parse(raw: &str) -> Value {
        serde_json::from_str(&repair_unquoted_json(raw)).expect("should be valid JSON")
    }

    // --- extract_json_array on pure JSON ---

    #[test]
    fn extract_array_simple() {
        let goals =
            extract_json_array(r#"{"goals": ["a", "b", "c"]}"#, "$.goals").expect("extract goals");
        assert_eq!(goals.len(), 3);
        assert_eq!(goals[0], json!("a"));
    }

    #[test]
    fn extract_array_nested() {
        let items = extract_json_array(r#"{"result": {"items": [1, 2]}}"#, "$.result.items")
            .expect("extract nested");
        assert_eq!(items.len(), 2);
    }

    #[test]
    fn extract_array_not_array_fails() {
        assert!(extract_json_array(r#"{"goals": "not_array"}"#, "$.goals").is_err());
    }

    #[test]
    fn extract_array_missing_field_fails() {
        assert!(extract_json_array(r#"{"goals": []}"#, "$.missing").is_err());
    }

    // --- extract_field ---

    #[test]
    fn extract_field_string() {
        let doc = json!({"name": "test", "score": 42});
        assert_eq!(extract_field(&doc, "$.name").as_deref(), Some("test"));
    }

    #[test]
    fn extract_field_number() {
        let doc = json!({"score": 42});
        assert_eq!(extract_field(&doc, "$.score").as_deref(), Some("42"));
    }

    #[test]
    fn extract_field_nested() {
        let doc = json!({"meta": {"id": "abc"}});
        assert_eq!(extract_field(&doc, "$.meta.id").as_deref(), Some("abc"));
    }

    #[test]
    fn extract_field_missing_returns_none() {
        let doc = json!({"name": "test"});
        assert!(extract_field(&doc, "$.missing").is_none());
    }

    #[test]
    fn extract_field_null_returns_none() {
        let doc = json!({"name": null});
        assert!(extract_field(&doc, "$.name").is_none());
    }

    #[test]
    fn extract_field_boolean() {
        let doc = json!({"active": true});
        assert_eq!(extract_field(&doc, "$.active").as_deref(), Some("true"));
    }

    // --- extract_json_array on mixed text ---

    #[test]
    fn extract_array_from_mixed_text_with_preamble() {
        let text = concat!(
            "Based on my analysis, I identified these targets:\n",
            "\n",
            r#"{"regression_targets": [{"id": "target-a", "name": "A"}, {"id": "target-b", "name": "B"}]}"#,
        );
        let targets = extract_json_array(text, "$.regression_targets")
            .expect("should extract from mixed text");
        assert_eq!(targets.len(), 2);
        let ids: Vec<&Value> = targets.iter().map(|target| &target["id"]).collect();
        assert_eq!(ids, [&json!("target-a"), &json!("target-b")]);
    }

    #[test]
    fn extract_array_from_fenced_code_block() {
        let text = concat!(
            "Here are the results:\n",
            "\n",
            "```json\n",
            r#"{"items": [{"id": "a"}, {"id": "b"}, {"id": "c"}]}"#,
            "\n",
            "```\n",
            "\n",
            "Done.",
        );
        let items = extract_json_array(text, "$.items").expect("should extract from fenced block");
        assert_eq!(items.len(), 3);
    }

    #[test]
    fn extract_array_from_unfenced_code_block() {
        let text = concat!(
            "Results:\n",
            "\n",
            "```\n",
            r#"{"goals": ["x", "y"]}"#,
            "\n",
            "```\n",
        );
        let goals = extract_json_array(text, "$.goals").expect("should extract from unfenced block");
        assert_eq!(goals.len(), 2);
    }

    #[test]
    fn extract_array_multiple_json_objects_finds_correct_one() {
        let text = concat!(
            r#"Summary: {"status": "ok", "count": 3}"#,
            "\n\n",
            "Details:\n",
            r#"{"regression_targets": [{"id": "rt-1"}, {"id": "rt-2"}]}"#,
            "\n\n",
            r#"Footer: {"ts": "2026-01-01"}"#,
        );
        let targets = extract_json_array(text, "$.regression_targets")
            .expect("should find the correct JSON object");
        assert_eq!(targets.len(), 2);
        assert_eq!(targets[0]["id"], json!("rt-1"));
    }

    #[test]
    fn extract_array_unquoted_json_succeeds() {
        let targets = extract_json_array("I found: {targets: [a, b]}", "$.targets")
            .expect("should repair and extract");
        assert_eq!(targets.len(), 2);
        assert_eq!(targets[0], json!("a"));
        assert_eq!(targets[1], json!("b"));
    }

    #[test]
    fn extract_array_truly_unparsable_fails() {
        assert!(extract_json_array("I found: <<<not json at all>>>", "$.targets").is_err());
    }

    #[test]
    fn extract_array_no_matching_path_in_mixed_text_fails() {
        let text = r#"Some text {"other_field": [1, 2]}"#;
        assert!(extract_json_array(text, "$.regression_targets").is_err());
    }

    #[test]
    fn extract_array_pure_json_still_works() {
        // Regression guard: pure JSON must keep working.
        let items = extract_json_array(r#"{"items": [{"id": "clean"}]}"#, "$.items")
            .expect("pure JSON must work");
        assert_eq!(items.len(), 1);
        assert_eq!(items[0]["id"], json!("clean"));
    }

    // --- extract_stream_json_result ---

    #[test]
    fn extract_stream_json_result_prefers_last_result_line() {
        let first = r#"{"type":"result","result":"{\"score\":1}"}"#;
        let second = r#"{"type":"result","result":"{\"score\":2}"}"#;
        let content = format!("{}\n{}\n", first, second);

        assert_eq!(
            extract_stream_json_result(&content).as_deref(),
            Some(r#"{"score":2}"#)
        );
    }

    #[test]
    fn extract_stream_json_result_handles_redacted_lines() {
        let content = r#"{"type":"result","cost_usd":[REDACTED],"result":"{\"score\":42}"}"#;

        assert_eq!(
            extract_stream_json_result(content).as_deref(),
            Some(r#"{"score":42}"#)
        );
    }

    // --- repair_unquoted_json tests ---

    #[test]
    fn repair_unquoted_json_keys_and_values() {
        let doc = repair_and_parse(r#"{id: docs/qa/foo.md, name: test}"#);
        assert_eq!(doc["id"], json!("docs/qa/foo.md"));
        assert_eq!(doc["name"], json!("test"));
    }

    #[test]
    fn repair_unquoted_json_nested_array() {
        let doc = repair_and_parse(r#"{items: [{id: a}, {id: b}]}"#);
        assert_eq!(doc["items"].as_array().unwrap().len(), 2);
        assert_eq!(doc["items"][0]["id"], json!("a"));
        assert_eq!(doc["items"][1]["id"], json!("b"));
    }

    #[test]
    fn repair_unquoted_json_preserves_valid() {
        let valid = r#"{"id":"a"}"#;
        assert_eq!(repair_unquoted_json(valid), valid);
    }

    #[test]
    fn repair_unquoted_json_mixed_quoted() {
        let doc = repair_and_parse(r#"{"id": "a", name: b}"#);
        assert_eq!(doc["id"], json!("a"));
        assert_eq!(doc["name"], json!("b"));
    }

    #[test]
    fn repair_unquoted_json_numbers_bools_null() {
        let doc = repair_and_parse(r#"{count: 42, ok: true, x: null}"#);
        assert_eq!(doc["count"], json!(42));
        assert_eq!(doc["ok"], json!(true));
        assert_eq!(doc["x"], json!(null));
    }

    #[test]
    fn repair_unquoted_json_file_paths() {
        let doc = repair_and_parse(r#"{id: docs/qa/orchestrator/02-cli-task-lifecycle.md}"#);
        assert_eq!(
            doc["id"],
            json!("docs/qa/orchestrator/02-cli-task-lifecycle.md")
        );
    }

    // --- end-to-end extraction from unquoted input ---

    #[test]
    fn extract_array_unquoted_regression_targets() {
        let text = r#"{regression_targets: [{id: docs/qa/foo.md, scope: unit}, {id: docs/qa/bar.md, scope: e2e}, {id: docs/qa/baz.md, scope: unit}, {id: docs/qa/qux.md, scope: integration}, {id: docs/qa/quux.md, scope: unit}]}"#;
        let targets = extract_json_array(text, "$.regression_targets")
            .expect("should extract unquoted regression targets");
        assert_eq!(targets.len(), 5);
        assert_eq!(targets[0]["id"], json!("docs/qa/foo.md"));
        assert_eq!(targets[2]["scope"], json!("unit"));
    }

    #[test]
    fn extract_array_mixed_text_unquoted() {
        let text = concat!(
            "Based on my analysis, here are the targets:\n",
            "\n",
            "{regression_targets: [{id: target-a, name: A}, {id: target-b, name: B}]}\n",
            "\n",
            "That's all.",
        );
        let targets = extract_json_array(text, "$.regression_targets")
            .expect("should extract from mixed text with unquoted JSON");
        assert_eq!(targets.len(), 2);
        assert_eq!(targets[0]["id"], json!("target-a"));
        assert_eq!(targets[1]["name"], json!("B"));
    }
}