1use crate::format::Format;
12use anyhow::{Context, Result};
13use std::collections::BTreeMap;
14use std::io::{self, Write};
15use std::path::{Path, PathBuf};
16
/// Maximum number of recorded positions shown per tag in the printed report.
const SAMPLE_LIMIT: usize = 3;
18
19#[derive(Debug, Default)]
20pub struct UnknownReport {
21 pub format: Option<Format>,
22 pub path: PathBuf,
23 pub total_lines: usize,
24 pub unknown_top_level: BTreeMap<String, Vec<usize>>,
25 pub unknown_payload_types: BTreeMap<String, Vec<usize>>,
26 pub unknown_content_item_types: BTreeMap<String, Vec<usize>>,
27}
28
29impl UnknownReport {
30 pub fn is_clean(&self) -> bool {
31 self.unknown_top_level.is_empty()
32 && self.unknown_payload_types.is_empty()
33 && self.unknown_content_item_types.is_empty()
34 }
35
36 pub fn print<W: Write>(&self, w: &mut W) -> io::Result<()> {
37 let fmt_label = self
38 .format
39 .map(|f| f.to_string())
40 .unwrap_or_else(|| "(unknown)".into());
41 writeln!(
42 w,
43 "[debug-unknowns] format={} path={} lines={}",
44 fmt_label,
45 self.path.display(),
46 self.total_lines
47 )?;
48 if self.is_clean() {
49 writeln!(w, " no unknown entry types or fields detected")?;
50 return Ok(());
51 }
52 print_section(w, "unknown top-level type", &self.unknown_top_level)?;
53 print_section(w, "unknown payload type", &self.unknown_payload_types)?;
54 print_section(
55 w,
56 "unknown content-item type",
57 &self.unknown_content_item_types,
58 )?;
59 Ok(())
60 }
61}
62
63fn print_section<W: Write>(
64 w: &mut W,
65 label: &str,
66 map: &BTreeMap<String, Vec<usize>>,
67) -> io::Result<()> {
68 if map.is_empty() {
69 return Ok(());
70 }
71 for (tag, lines) in map {
72 writeln!(
73 w,
74 " {label}={tag:?} count={} first_lines={:?}",
75 lines.len(),
76 &lines[..lines.len().min(SAMPLE_LIMIT)]
77 )?;
78 }
79 Ok(())
80}
81
/// Appends `line` to the list of positions stored under `tag`, creating the
/// list when the tag has not been seen before.
fn record(map: &mut BTreeMap<String, Vec<usize>>, tag: &str, line: usize) {
    match map.get_mut(tag) {
        Some(positions) => positions.push(line),
        None => {
            map.insert(tag.to_owned(), vec![line]);
        }
    }
}
85
86pub fn scan(format: Format, path: &Path) -> Result<UnknownReport> {
87 let content = std::fs::read_to_string(path)
88 .with_context(|| format!("reading session file: {}", path.display()))?;
89 let mut report = UnknownReport {
90 format: Some(format),
91 path: path.to_path_buf(),
92 ..UnknownReport::default()
93 };
94 match format {
95 Format::ClaudeCode => scan_claude_code(&content, &mut report),
96 Format::Codex => scan_codex(&content, &mut report),
97 Format::Gemini => scan_gemini(&content, &mut report)?,
98 Format::Generic => scan_generic(&content, &mut report)?,
99 Format::Langchain => scan_langchain(&content, &mut report)?,
100 Format::OtelJson => scan_otel_json(&content, &mut report)?,
101 Format::VercelAi => scan_vercel_ai(&content, &mut report)?,
102 Format::OtelProto => {
107 record(
108 &mut report.unknown_top_level,
109 "<binary-otlp-not-scanned>",
110 0,
111 );
112 }
113 }
114 Ok(report)
115}
116
/// Top-level `type` values that `scan_claude_code` treats as recognized.
const CLAUDE_KNOWN_TOP: &[&str] = &[
    "user",
    "assistant",
];

/// Content-item `type` values recognized inside `user` entries.
const CLAUDE_KNOWN_USER_ITEMS: &[&str] = &[
    "text",
    "tool_result",
];

/// Content-item `type` values recognized inside every other known entry
/// type (currently only `assistant`).
const CLAUDE_KNOWN_ASSISTANT_ITEMS: &[&str] = &[
    "text",
    "tool_use",
];
120
121fn scan_claude_code(content: &str, report: &mut UnknownReport) {
122 for (i, line) in content.lines().enumerate() {
123 if line.trim().is_empty() {
124 continue;
125 }
126 report.total_lines += 1;
127 let line_num = i + 1;
128 let Ok(v) = serde_json::from_str::<serde_json::Value>(line) else {
129 record(&mut report.unknown_top_level, "<malformed-json>", line_num);
130 continue;
131 };
132 let Some(ty) = v.get("type").and_then(|t| t.as_str()) else {
133 record(&mut report.unknown_top_level, "<no-type-field>", line_num);
134 continue;
135 };
136 if !CLAUDE_KNOWN_TOP.contains(&ty) {
137 record(&mut report.unknown_top_level, ty, line_num);
138 continue;
139 }
140 let known_items = if ty == "user" {
141 CLAUDE_KNOWN_USER_ITEMS
142 } else {
143 CLAUDE_KNOWN_ASSISTANT_ITEMS
144 };
145 if let Some(items) = v
146 .get("message")
147 .and_then(|m| m.get("content"))
148 .and_then(|c| c.as_array())
149 {
150 for item in items {
151 if let Some(item_ty) = item.get("type").and_then(|t| t.as_str())
152 && !known_items.contains(&item_ty)
153 {
154 record(&mut report.unknown_content_item_types, item_ty, line_num);
155 }
156 }
157 }
158 }
159}
160
/// Top-level `type` values that `scan_codex` treats as recognized.
const CODEX_KNOWN_TOP: &[&str] = &[
    "session_meta",
    "event_msg",
    "response_item",
    "turn_context",
];

/// `payload.type` values recognized on `response_item` entries.
const CODEX_KNOWN_PAYLOAD: &[&str] = &[
    "message",
    "function_call",
    "function_call_output",
];
163
164fn scan_codex(content: &str, report: &mut UnknownReport) {
165 for (i, line) in content.lines().enumerate() {
166 if line.trim().is_empty() {
167 continue;
168 }
169 report.total_lines += 1;
170 let line_num = i + 1;
171 let Ok(v) = serde_json::from_str::<serde_json::Value>(line) else {
172 record(&mut report.unknown_top_level, "<malformed-json>", line_num);
173 continue;
174 };
175 let Some(ty) = v.get("type").and_then(|t| t.as_str()) else {
176 record(&mut report.unknown_top_level, "<no-type-field>", line_num);
177 continue;
178 };
179 if !CODEX_KNOWN_TOP.contains(&ty) {
180 record(&mut report.unknown_top_level, ty, line_num);
181 continue;
182 }
183 if ty == "response_item"
188 && let Some(payload_ty) = v
189 .get("payload")
190 .and_then(|p| p.get("type"))
191 .and_then(|t| t.as_str())
192 && !CODEX_KNOWN_PAYLOAD.contains(&payload_ty)
193 {
194 record(&mut report.unknown_payload_types, payload_ty, line_num);
195 }
196 }
197}
198
/// Message `type` values that `scan_gemini` treats as recognized.
const GEMINI_KNOWN_MSG_TYPES: &[&str] = &[
    "user",
    "gemini",
];
200
201fn scan_gemini(content: &str, report: &mut UnknownReport) -> Result<()> {
202 let v: serde_json::Value = serde_json::from_str(content)
203 .with_context(|| "parsing Gemini session as JSON for drift scan")?;
204 let Some(messages) = v.get("messages").and_then(|m| m.as_array()) else {
205 return Ok(());
206 };
207 for (i, msg) in messages.iter().enumerate() {
208 report.total_lines += 1;
209 let msg_idx = i + 1;
211 if let Some(ty) = msg.get("type").and_then(|t| t.as_str())
212 && !GEMINI_KNOWN_MSG_TYPES.contains(&ty)
213 {
214 record(&mut report.unknown_top_level, ty, msg_idx);
215 }
216 }
217 Ok(())
218}
219
/// Message `role` values that `scan_generic` treats as recognized.
const GENERIC_KNOWN_ROLES: &[&str] = &[
    "user",
    "assistant",
    "tool",
    "system",
];

/// `run_type` values that `scan_langchain` treats as recognized.
const LANGCHAIN_KNOWN_RUN_TYPES: &[&str] = &[
    "chain",
    "llm",
    "chat_model",
    "tool",
];

/// `stepType` values that `scan_vercel_ai` treats as recognized.
const VERCEL_KNOWN_STEP_TYPES: &[&str] = &[
    "initial",
    "continue",
    "tool-result",
];

/// `gen_ai.operation.name` attribute values that `scan_otel_json` treats as
/// recognized.
const OTEL_KNOWN_OPERATIONS: &[&str] = &[
    "chat",
    "text_completion",
    "generate_content",
    "execute_tool",
];
241
242fn scan_langchain(content: &str, report: &mut UnknownReport) -> Result<()> {
243 let v: serde_json::Value =
244 serde_json::from_str(content).with_context(|| "parsing LangChain export for drift scan")?;
245 let mut idx = 0usize;
246 fn walk(run: &serde_json::Value, idx: &mut usize, report: &mut UnknownReport) {
247 *idx += 1;
248 report.total_lines += 1;
249 if let Some(ty) = run.get("run_type").and_then(|v| v.as_str())
250 && !LANGCHAIN_KNOWN_RUN_TYPES.contains(&ty)
251 {
252 record(&mut report.unknown_top_level, ty, *idx);
253 }
254 if let Some(children) = run.get("child_runs").and_then(|v| v.as_array()) {
255 for child in children {
256 walk(child, idx, report);
257 }
258 }
259 }
260 walk(&v, &mut idx, report);
261 Ok(())
262}
263
264fn scan_vercel_ai(content: &str, report: &mut UnknownReport) -> Result<()> {
265 let v: serde_json::Value = serde_json::from_str(content)
266 .with_context(|| "parsing Vercel AI SDK session for drift scan")?;
267 if let Some(steps) = v.get("steps").and_then(|s| s.as_array()) {
270 for (i, step) in steps.iter().enumerate() {
271 report.total_lines += 1;
272 if let Some(ty) = step.get("stepType").and_then(|v| v.as_str())
273 && !VERCEL_KNOWN_STEP_TYPES.contains(&ty)
274 {
275 record(&mut report.unknown_top_level, ty, i + 1);
276 }
277 }
278 } else {
279 report.total_lines += 1;
280 }
281 Ok(())
282}
283
284fn scan_otel_json(content: &str, report: &mut UnknownReport) -> Result<()> {
285 let v: serde_json::Value = serde_json::from_str(content)
286 .with_context(|| "parsing OTel JSON session for drift scan")?;
287 let Some(resource_spans) = v.get("resourceSpans").and_then(|x| x.as_array()) else {
288 return Ok(());
289 };
290 let mut span_idx = 0usize;
291 for rs in resource_spans {
292 let Some(scope_spans) = rs.get("scopeSpans").and_then(|x| x.as_array()) else {
293 continue;
294 };
295 for ss in scope_spans {
296 let Some(spans) = ss.get("spans").and_then(|x| x.as_array()) else {
297 continue;
298 };
299 for span in spans {
300 span_idx += 1;
301 report.total_lines += 1;
302 let Some(attrs) = span.get("attributes").and_then(|x| x.as_array()) else {
303 continue;
304 };
305 let mut op: Option<String> = None;
306 for kv in attrs {
307 if kv.get("key").and_then(|k| k.as_str()) == Some("gen_ai.operation.name")
308 && let Some(s) = kv
309 .get("value")
310 .and_then(|v| v.get("stringValue"))
311 .and_then(|v| v.as_str())
312 {
313 op = Some(s.to_string());
314 }
315 }
316 if let Some(op) = op
317 && !OTEL_KNOWN_OPERATIONS.contains(&op.as_str())
318 {
319 record(&mut report.unknown_top_level, &op, span_idx);
320 }
321 }
322 }
323 }
324 Ok(())
325}
326
327fn scan_generic(content: &str, report: &mut UnknownReport) -> Result<()> {
328 let v: serde_json::Value = serde_json::from_str(content)
329 .with_context(|| "parsing generic session as JSON for drift scan")?;
330 let Some(messages) = v.get("messages").and_then(|m| m.as_array()) else {
331 return Ok(());
332 };
333 for (i, msg) in messages.iter().enumerate() {
334 report.total_lines += 1;
335 let msg_idx = i + 1;
336 if let Some(role) = msg.get("role").and_then(|r| r.as_str())
337 && !GENERIC_KNOWN_ROLES.contains(&role)
338 {
339 record(&mut report.unknown_top_level, role, msg_idx);
340 }
341 }
342 Ok(())
343}
344
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file containing exactly `content`.
    fn write_file(content: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(content.as_bytes()).unwrap();
        file
    }

    /// Writes `content` to a temporary file and scans it as `format`.
    fn scan_text(format: Format, content: &str) -> UnknownReport {
        let file = write_file(content);
        scan(format, file.path()).unwrap()
    }

    /// Renders `report` through `print` into a `String`.
    fn render(report: &UnknownReport) -> String {
        let mut buffer = Vec::new();
        report.print(&mut buffer).unwrap();
        String::from_utf8(buffer).unwrap()
    }

    /// Builds a report with the given format and line count, nothing else.
    fn empty_report(format: Format, total_lines: usize) -> UnknownReport {
        UnknownReport {
            format: Some(format),
            path: PathBuf::from("/tmp/x"),
            total_lines,
            ..UnknownReport::default()
        }
    }

    #[test]
    fn claude_clean_session_reports_no_unknowns() {
        let jsonl = r#"{"type":"user","uuid":"u1","message":{"role":"user","content":"hi"}}
{"type":"assistant","uuid":"a1","message":{"role":"assistant","content":[{"type":"text","text":"hello"}]}}
"#;
        let report = scan_text(Format::ClaudeCode, jsonl);
        assert_eq!(report.total_lines, 2);
        assert!(report.is_clean());
    }

    #[test]
    fn claude_unknown_top_level_type_recorded() {
        let jsonl = r#"{"type":"user","uuid":"u1","message":{"role":"user","content":"hi"}}
{"type":"summary","summary":"…"}
{"type":"summary","summary":"another"}
"#;
        let report = scan_text(Format::ClaudeCode, jsonl);
        assert_eq!(report.unknown_top_level["summary"], [2, 3]);
    }

    #[test]
    fn claude_unknown_content_item_recorded() {
        let jsonl = r#"{"type":"assistant","uuid":"a1","message":{"role":"assistant","content":[{"type":"thinking","content":"…"}]}}
"#;
        let report = scan_text(Format::ClaudeCode, jsonl);
        assert_eq!(report.unknown_content_item_types["thinking"], [1]);
    }

    #[test]
    fn codex_unknown_payload_type_recorded() {
        let jsonl = r#"{"type":"response_item","payload":{"type":"reasoning"}}
{"type":"response_item","payload":{"type":"message","role":"user","content":[]}}
"#;
        let report = scan_text(Format::Codex, jsonl);
        assert_eq!(report.unknown_payload_types["reasoning"], [1]);
    }

    #[test]
    fn codex_known_top_levels_not_reported() {
        let jsonl = r#"{"type":"session_meta","payload":{}}
{"type":"event_msg","payload":{}}
{"type":"turn_context","payload":{}}
"#;
        let report = scan_text(Format::Codex, jsonl);
        assert!(report.is_clean());
    }

    #[test]
    fn gemini_unknown_message_type_recorded() {
        let json = r#"{"sessionId":"s1","messages":[
 {"type":"user","content":"hi"},
 {"type":"info","content":"…"},
 {"type":"system","content":"…"}
 ]}"#;
        let report = scan_text(Format::Gemini, json);
        assert_eq!(report.unknown_top_level["info"], [2]);
        assert_eq!(report.unknown_top_level["system"], [3]);
    }

    #[test]
    fn generic_unknown_role_recorded() {
        let json = r#"{"messages":[
 {"role":"user","content":"hi"},
 {"role":"developer","content":"…"}
 ]}"#;
        let report = scan_text(Format::Generic, json);
        assert_eq!(report.unknown_top_level["developer"], [2]);
    }

    #[test]
    fn report_print_clean_session() {
        let report = empty_report(Format::ClaudeCode, 5);
        let text = render(&report);
        assert!(text.contains("no unknown"));
        assert!(text.contains("lines=5"));
    }

    #[test]
    fn report_print_with_unknowns_shows_first_lines() {
        let mut report = empty_report(Format::Codex, 10);
        for position in [3, 7] {
            record(&mut report.unknown_payload_types, "reasoning", position);
        }
        let text = render(&report);
        assert!(text.contains("reasoning"));
        assert!(text.contains("count=2"));
    }
}