Skip to main content

agx_core/
debug_unknowns.rs

//! Format-drift diagnostics: scan a session file and report any entry types
//! or content-item types the typed parsers don't recognize.
//!
//! Used when the `--debug-unknowns` CLI flag is set. The cost is one extra
//! `serde_json::Value` parse per line (or per file, for the single-document
//! JSON formats), and it is only paid when the flag is set.
//!
//! Output is intentionally terse and machine-greppable: one header line per
//! scanned file, then one line per unknown tag, grouped by category and
//! sorted alphabetically by tag, with the first three line numbers where each
//! unknown was seen.
10
11use crate::format::Format;
12use anyhow::{Context, Result};
13use std::collections::BTreeMap;
14use std::io::{self, Write};
15use std::path::{Path, PathBuf};
16
/// Maximum number of recorded line numbers printed per unknown tag; the full
/// count is still reported separately.
const SAMPLE_LIMIT: usize = 3;
18
/// Result of one drift scan over a single session file.
///
/// Each map goes from an unrecognized tag to every position where it was
/// seen, in scan order. `BTreeMap` keeps the tags sorted for printing.
#[derive(Debug, Default)]
pub struct UnknownReport {
    /// Format the file was scanned as; printed as `(unknown)` when `None`.
    pub format: Option<Format>,
    /// Path of the scanned file, echoed in the report header.
    pub path: PathBuf,
    /// Number of entries examined: non-blank lines for the line-oriented
    /// formats, or messages / runs / steps / spans for single-document JSON.
    pub total_lines: usize,
    /// Unrecognized top-level tags, plus the scanner's own `<...>` markers
    /// (for example malformed JSON or a missing `type` field).
    pub unknown_top_level: BTreeMap<String, Vec<usize>>,
    /// Unrecognized `payload.type` values (populated by the Codex scan).
    pub unknown_payload_types: BTreeMap<String, Vec<usize>>,
    /// Unrecognized content-item `type` values (populated by the Claude Code scan).
    pub unknown_content_item_types: BTreeMap<String, Vec<usize>>,
}
28
29impl UnknownReport {
30    pub fn is_clean(&self) -> bool {
31        self.unknown_top_level.is_empty()
32            && self.unknown_payload_types.is_empty()
33            && self.unknown_content_item_types.is_empty()
34    }
35
36    pub fn print<W: Write>(&self, w: &mut W) -> io::Result<()> {
37        let fmt_label = self
38            .format
39            .map(|f| f.to_string())
40            .unwrap_or_else(|| "(unknown)".into());
41        writeln!(
42            w,
43            "[debug-unknowns] format={} path={} lines={}",
44            fmt_label,
45            self.path.display(),
46            self.total_lines
47        )?;
48        if self.is_clean() {
49            writeln!(w, "  no unknown entry types or fields detected")?;
50            return Ok(());
51        }
52        print_section(w, "unknown top-level type", &self.unknown_top_level)?;
53        print_section(w, "unknown payload type", &self.unknown_payload_types)?;
54        print_section(
55            w,
56            "unknown content-item type",
57            &self.unknown_content_item_types,
58        )?;
59        Ok(())
60    }
61}
62
63fn print_section<W: Write>(
64    w: &mut W,
65    label: &str,
66    map: &BTreeMap<String, Vec<usize>>,
67) -> io::Result<()> {
68    if map.is_empty() {
69        return Ok(());
70    }
71    for (tag, lines) in map {
72        writeln!(
73            w,
74            "  {label}={tag:?} count={} first_lines={:?}",
75            lines.len(),
76            &lines[..lines.len().min(SAMPLE_LIMIT)]
77        )?;
78    }
79    Ok(())
80}
81
/// Appends `line` to the list of positions recorded for `tag`, creating the
/// list on first sight of the tag.
fn record(map: &mut BTreeMap<String, Vec<usize>>, tag: &str, line: usize) {
    let seen_at: &mut Vec<usize> = map.entry(String::from(tag)).or_default();
    seen_at.push(line);
}
85
86pub fn scan(format: Format, path: &Path) -> Result<UnknownReport> {
87    let content = std::fs::read_to_string(path)
88        .with_context(|| format!("reading session file: {}", path.display()))?;
89    let mut report = UnknownReport {
90        format: Some(format),
91        path: path.to_path_buf(),
92        ..UnknownReport::default()
93    };
94    match format {
95        Format::ClaudeCode => scan_claude_code(&content, &mut report),
96        Format::Codex => scan_codex(&content, &mut report),
97        Format::Gemini => scan_gemini(&content, &mut report)?,
98        Format::Generic => scan_generic(&content, &mut report)?,
99        Format::Langchain => scan_langchain(&content, &mut report)?,
100        Format::OtelJson => scan_otel_json(&content, &mut report)?,
101        Format::VercelAi => scan_vercel_ai(&content, &mut report)?,
102        // Binary OTLP drift scanning would require pulling prost into the
103        // scanner too. Skipping for v0.3 — users can still get a parse
104        // error from the dispatch (or a clean summary if the feature is
105        // on). Phase 2.2 extension if demand shows up.
106        Format::OtelProto => {
107            record(
108                &mut report.unknown_top_level,
109                "<binary-otlp-not-scanned>",
110                0,
111            );
112        }
113    }
114    Ok(report)
115}
116
/// Top-level `type` values of a Claude Code entry that are not reported.
const CLAUDE_KNOWN_TOP: &[&str] = &["user", "assistant"];
/// Content-item `type` values accepted inside a `user` entry.
const CLAUDE_KNOWN_USER_ITEMS: &[&str] = &["text", "tool_result"];
/// Content-item `type` values accepted inside an `assistant` entry.
const CLAUDE_KNOWN_ASSISTANT_ITEMS: &[&str] = &["text", "tool_use"];
120
121fn scan_claude_code(content: &str, report: &mut UnknownReport) {
122    for (i, line) in content.lines().enumerate() {
123        if line.trim().is_empty() {
124            continue;
125        }
126        report.total_lines += 1;
127        let line_num = i + 1;
128        let Ok(v) = serde_json::from_str::<serde_json::Value>(line) else {
129            record(&mut report.unknown_top_level, "<malformed-json>", line_num);
130            continue;
131        };
132        let Some(ty) = v.get("type").and_then(|t| t.as_str()) else {
133            record(&mut report.unknown_top_level, "<no-type-field>", line_num);
134            continue;
135        };
136        if !CLAUDE_KNOWN_TOP.contains(&ty) {
137            record(&mut report.unknown_top_level, ty, line_num);
138            continue;
139        }
140        let known_items = if ty == "user" {
141            CLAUDE_KNOWN_USER_ITEMS
142        } else {
143            CLAUDE_KNOWN_ASSISTANT_ITEMS
144        };
145        if let Some(items) = v
146            .get("message")
147            .and_then(|m| m.get("content"))
148            .and_then(|c| c.as_array())
149        {
150            for item in items {
151                if let Some(item_ty) = item.get("type").and_then(|t| t.as_str())
152                    && !known_items.contains(&item_ty)
153                {
154                    record(&mut report.unknown_content_item_types, item_ty, line_num);
155                }
156            }
157        }
158    }
159}
160
/// Top-level `type` values of a Codex entry that are not reported.
const CODEX_KNOWN_TOP: &[&str] = &["session_meta", "event_msg", "response_item", "turn_context"];
/// `payload.type` values accepted inside a `response_item` entry.
const CODEX_KNOWN_PAYLOAD: &[&str] = &["message", "function_call", "function_call_output"];
163
164fn scan_codex(content: &str, report: &mut UnknownReport) {
165    for (i, line) in content.lines().enumerate() {
166        if line.trim().is_empty() {
167            continue;
168        }
169        report.total_lines += 1;
170        let line_num = i + 1;
171        let Ok(v) = serde_json::from_str::<serde_json::Value>(line) else {
172            record(&mut report.unknown_top_level, "<malformed-json>", line_num);
173            continue;
174        };
175        let Some(ty) = v.get("type").and_then(|t| t.as_str()) else {
176            record(&mut report.unknown_top_level, "<no-type-field>", line_num);
177            continue;
178        };
179        if !CODEX_KNOWN_TOP.contains(&ty) {
180            record(&mut report.unknown_top_level, ty, line_num);
181            continue;
182        }
183        // For response_item entries, also track unrecognized payload.type values.
184        // Other top-level kinds (session_meta, event_msg, turn_context) are
185        // intentionally skipped — agx doesn't render them, so payload variation
186        // is not interesting.
187        if ty == "response_item"
188            && let Some(payload_ty) = v
189                .get("payload")
190                .and_then(|p| p.get("type"))
191                .and_then(|t| t.as_str())
192            && !CODEX_KNOWN_PAYLOAD.contains(&payload_ty)
193        {
194            record(&mut report.unknown_payload_types, payload_ty, line_num);
195        }
196    }
197}
198
/// `type` values of a Gemini `messages[]` element that are not reported.
const GEMINI_KNOWN_MSG_TYPES: &[&str] = &["user", "gemini"];
200
201fn scan_gemini(content: &str, report: &mut UnknownReport) -> Result<()> {
202    let v: serde_json::Value = serde_json::from_str(content)
203        .with_context(|| "parsing Gemini session as JSON for drift scan")?;
204    let Some(messages) = v.get("messages").and_then(|m| m.as_array()) else {
205        return Ok(());
206    };
207    for (i, msg) in messages.iter().enumerate() {
208        report.total_lines += 1;
209        // Use 1-indexed message position as a stand-in for line number
210        let msg_idx = i + 1;
211        if let Some(ty) = msg.get("type").and_then(|t| t.as_str())
212            && !GEMINI_KNOWN_MSG_TYPES.contains(&ty)
213        {
214            record(&mut report.unknown_top_level, ty, msg_idx);
215        }
216    }
217    Ok(())
218}
219
// `role` values of a generic-format message that are not reported.
const GENERIC_KNOWN_ROLES: &[&str] = &["user", "assistant", "tool", "system"];

// Run types agx renders. Everything else gets reported as a drift signal —
// retriever / parser / prompt etc. are intentionally ignored today but are
// still worth surfacing so contributors can see what's in their fixtures.
const LANGCHAIN_KNOWN_RUN_TYPES: &[&str] = &["chain", "llm", "chat_model", "tool"];

// Step types the Vercel AI SDK currently emits. New values landing in a
// future SDK release will surface here before users notice silent
// mis-parsing in their timeline.
const VERCEL_KNOWN_STEP_TYPES: &[&str] = &["initial", "continue", "tool-result"];

// Operations agx currently renders end-to-end. Everything else falls into
// unknown_top_level with the operation name as the tag — useful signal
// when new GenAI semconv operations ship.
const OTEL_KNOWN_OPERATIONS: &[&str] = &[
    "chat",
    "text_completion",
    "generate_content",
    "execute_tool",
];
241
242fn scan_langchain(content: &str, report: &mut UnknownReport) -> Result<()> {
243    let v: serde_json::Value =
244        serde_json::from_str(content).with_context(|| "parsing LangChain export for drift scan")?;
245    let mut idx = 0usize;
246    fn walk(run: &serde_json::Value, idx: &mut usize, report: &mut UnknownReport) {
247        *idx += 1;
248        report.total_lines += 1;
249        if let Some(ty) = run.get("run_type").and_then(|v| v.as_str())
250            && !LANGCHAIN_KNOWN_RUN_TYPES.contains(&ty)
251        {
252            record(&mut report.unknown_top_level, ty, *idx);
253        }
254        if let Some(children) = run.get("child_runs").and_then(|v| v.as_array()) {
255            for child in children {
256                walk(child, idx, report);
257            }
258        }
259    }
260    walk(&v, &mut idx, report);
261    Ok(())
262}
263
264fn scan_vercel_ai(content: &str, report: &mut UnknownReport) -> Result<()> {
265    let v: serde_json::Value = serde_json::from_str(content)
266        .with_context(|| "parsing Vercel AI SDK session for drift scan")?;
267    // Track steps (if `steps[]` is present) or treat the root as a single
268    // pseudo-step.
269    if let Some(steps) = v.get("steps").and_then(|s| s.as_array()) {
270        for (i, step) in steps.iter().enumerate() {
271            report.total_lines += 1;
272            if let Some(ty) = step.get("stepType").and_then(|v| v.as_str())
273                && !VERCEL_KNOWN_STEP_TYPES.contains(&ty)
274            {
275                record(&mut report.unknown_top_level, ty, i + 1);
276            }
277        }
278    } else {
279        report.total_lines += 1;
280    }
281    Ok(())
282}
283
284fn scan_otel_json(content: &str, report: &mut UnknownReport) -> Result<()> {
285    let v: serde_json::Value = serde_json::from_str(content)
286        .with_context(|| "parsing OTel JSON session for drift scan")?;
287    let Some(resource_spans) = v.get("resourceSpans").and_then(|x| x.as_array()) else {
288        return Ok(());
289    };
290    let mut span_idx = 0usize;
291    for rs in resource_spans {
292        let Some(scope_spans) = rs.get("scopeSpans").and_then(|x| x.as_array()) else {
293            continue;
294        };
295        for ss in scope_spans {
296            let Some(spans) = ss.get("spans").and_then(|x| x.as_array()) else {
297                continue;
298            };
299            for span in spans {
300                span_idx += 1;
301                report.total_lines += 1;
302                let Some(attrs) = span.get("attributes").and_then(|x| x.as_array()) else {
303                    continue;
304                };
305                let mut op: Option<String> = None;
306                for kv in attrs {
307                    if kv.get("key").and_then(|k| k.as_str()) == Some("gen_ai.operation.name")
308                        && let Some(s) = kv
309                            .get("value")
310                            .and_then(|v| v.get("stringValue"))
311                            .and_then(|v| v.as_str())
312                    {
313                        op = Some(s.to_string());
314                    }
315                }
316                if let Some(op) = op
317                    && !OTEL_KNOWN_OPERATIONS.contains(&op.as_str())
318                {
319                    record(&mut report.unknown_top_level, &op, span_idx);
320                }
321            }
322        }
323    }
324    Ok(())
325}
326
327fn scan_generic(content: &str, report: &mut UnknownReport) -> Result<()> {
328    let v: serde_json::Value = serde_json::from_str(content)
329        .with_context(|| "parsing generic session as JSON for drift scan")?;
330    let Some(messages) = v.get("messages").and_then(|m| m.as_array()) else {
331        return Ok(());
332    };
333    for (i, msg) in messages.iter().enumerate() {
334        report.total_lines += 1;
335        let msg_idx = i + 1;
336        if let Some(role) = msg.get("role").and_then(|r| r.as_str())
337            && !GENERIC_KNOWN_ROLES.contains(&role)
338        {
339            record(&mut report.unknown_top_level, role, msg_idx);
340        }
341    }
342    Ok(())
343}
344
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `content` to a fresh temp file; the file lives as long as the
    /// returned handle.
    fn write_file(content: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(content.as_bytes()).unwrap();
        file
    }

    #[test]
    fn claude_clean_session_reports_no_unknowns() {
        let session = r#"{"type":"user","uuid":"u1","message":{"role":"user","content":"hi"}}
{"type":"assistant","uuid":"a1","message":{"role":"assistant","content":[{"type":"text","text":"hello"}]}}
"#;
        let file = write_file(session);
        let found = scan(Format::ClaudeCode, file.path()).unwrap();
        assert_eq!(found.total_lines, 2);
        assert!(found.is_clean());
    }

    #[test]
    fn claude_unknown_top_level_type_recorded() {
        let session = r#"{"type":"user","uuid":"u1","message":{"role":"user","content":"hi"}}
{"type":"summary","summary":"…"}
{"type":"summary","summary":"another"}
"#;
        let file = write_file(session);
        let found = scan(Format::ClaudeCode, file.path()).unwrap();
        assert_eq!(found.unknown_top_level["summary"], [2, 3]);
    }

    #[test]
    fn claude_unknown_content_item_recorded() {
        let session = r#"{"type":"assistant","uuid":"a1","message":{"role":"assistant","content":[{"type":"thinking","content":"…"}]}}
"#;
        let file = write_file(session);
        let found = scan(Format::ClaudeCode, file.path()).unwrap();
        assert_eq!(found.unknown_content_item_types["thinking"], [1]);
    }

    #[test]
    fn codex_unknown_payload_type_recorded() {
        let session = r#"{"type":"response_item","payload":{"type":"reasoning"}}
{"type":"response_item","payload":{"type":"message","role":"user","content":[]}}
"#;
        let file = write_file(session);
        let found = scan(Format::Codex, file.path()).unwrap();
        assert_eq!(found.unknown_payload_types["reasoning"], [1]);
    }

    #[test]
    fn codex_known_top_levels_not_reported() {
        let session = r#"{"type":"session_meta","payload":{}}
{"type":"event_msg","payload":{}}
{"type":"turn_context","payload":{}}
"#;
        let file = write_file(session);
        let found = scan(Format::Codex, file.path()).unwrap();
        assert!(found.is_clean());
    }

    #[test]
    fn gemini_unknown_message_type_recorded() {
        let document = r#"{"sessionId":"s1","messages":[
            {"type":"user","content":"hi"},
            {"type":"info","content":"…"},
            {"type":"system","content":"…"}
        ]}"#;
        let file = write_file(document);
        let found = scan(Format::Gemini, file.path()).unwrap();
        // Positions are 1-based indices into `messages`, not file lines.
        assert_eq!(found.unknown_top_level["info"], [2]);
        assert_eq!(found.unknown_top_level["system"], [3]);
    }

    #[test]
    fn generic_unknown_role_recorded() {
        let document = r#"{"messages":[
            {"role":"user","content":"hi"},
            {"role":"developer","content":"…"}
        ]}"#;
        let file = write_file(document);
        let found = scan(Format::Generic, file.path()).unwrap();
        assert_eq!(found.unknown_top_level["developer"], [2]);
    }

    #[test]
    fn report_print_clean_session() {
        let clean = UnknownReport {
            format: Some(Format::ClaudeCode),
            path: PathBuf::from("/tmp/x"),
            total_lines: 5,
            ..UnknownReport::default()
        };
        let mut buf = Vec::new();
        clean.print(&mut buf).unwrap();
        let text = String::from_utf8(buf).unwrap();
        assert!(text.contains("no unknown"));
        assert!(text.contains("lines=5"));
    }

    #[test]
    fn report_print_with_unknowns_shows_first_lines() {
        let mut dirty = UnknownReport {
            format: Some(Format::Codex),
            path: PathBuf::from("/tmp/x"),
            total_lines: 10,
            ..UnknownReport::default()
        };
        for line in [3, 7] {
            record(&mut dirty.unknown_payload_types, "reasoning", line);
        }
        let mut buf = Vec::new();
        dirty.print(&mut buf).unwrap();
        let text = String::from_utf8(buf).unwrap();
        assert!(text.contains("reasoning"));
        assert!(text.contains("count=2"));
    }
}