1use anyhow::{Context, Result};
2use std::collections::{HashMap, HashSet};
3use std::fmt;
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::Path;
7
8use crate::data::models::{GlobalDataQuality, SessionData, SessionFile};
9use crate::data::scanner::{resolve_agent_parents, scan_claude_home};
10use crate::pricing::calculator::PricingCalculator;
11
12#[derive(Debug)]
15pub struct ValidationReport {
16 pub session_results: Vec<SessionValidation>,
17 pub structure_checks: Vec<Check>,
18 pub summary: ValidationSummary,
19}
20
21#[derive(Debug)]
22pub struct SessionValidation {
23 pub session_id: String,
24 pub project: String,
25 pub token_checks: Vec<Check>,
26 pub agent_checks: Vec<Check>,
27}
28
/// A single named validation check.
///
/// Both sides of the comparison are stored as already-rendered strings so the
/// report can show them regardless of the original value types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Check {
    /// Human-readable name of the check.
    pub name: String,
    /// Rendered expected value.
    pub expected: String,
    /// Rendered actual value.
    pub actual: String,
    /// Whether the check succeeded.
    pub passed: bool,
}

impl Check {
    /// Builds an informational check that always passes.
    ///
    /// `value` is rendered once and used as both the expected and the actual
    /// value.
    fn pass(name: impl Into<String>, value: impl fmt::Display) -> Self {
        let rendered = value.to_string();
        Self {
            name: name.into(),
            expected: rendered.clone(),
            actual: rendered,
            passed: true,
        }
    }

    /// Builds a check that passes when the `Display` renderings of `expected`
    /// and `actual` are identical strings.
    fn compare(
        name: impl Into<String>,
        expected: impl fmt::Display,
        actual: impl fmt::Display,
    ) -> Self {
        let expected = expected.to_string();
        let actual = actual.to_string();
        let passed = expected == actual;
        Self {
            name: name.into(),
            expected,
            actual,
            passed,
        }
    }

    /// Builds a check that passes when `expected` and `actual` differ by at
    /// most `tolerance` (inclusive).
    ///
    /// The bound is inclusive so that identical values pass even with a
    /// tolerance of `0.0`. A NaN on either side never passes. Both values are
    /// rendered with two decimal places.
    // Not called anywhere in this file; the lint is silenced so the helper
    // stays available for floating-point checks.
    #[allow(dead_code)]
    fn compare_f64(name: impl Into<String>, expected: f64, actual: f64, tolerance: f64) -> Self {
        let passed = (expected - actual).abs() <= tolerance;
        Self {
            name: name.into(),
            expected: format!("{:.2}", expected),
            actual: format!("{:.2}", actual),
            passed,
        }
    }
}
75
/// Aggregated counters over every check in a report.
///
/// `Default` yields all-zero counters.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct ValidationSummary {
    /// Number of checks counted (structure checks plus per-session checks).
    pub total_checks: usize,
    /// Number of counted checks that passed.
    pub passed: usize,
    /// Number of counted checks that failed.
    pub failed: usize,
    /// Number of sessions that were validated.
    pub sessions_validated: usize,
    /// Number of validated sessions without a single failed check.
    pub sessions_passed: usize,
}
84
/// Token totals counted directly from a raw transcript file.
///
/// `Default` yields all-zero totals.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
struct RawTokenCount {
    /// Sum of `input_tokens` over the counted records.
    input_tokens: u64,
    /// Sum of `output_tokens` over the counted records.
    output_tokens: u64,
    /// Sum of `cache_creation_input_tokens` over the counted records.
    cache_creation_tokens: u64,
    /// Sum of `cache_read_input_tokens` over the counted records.
    cache_read_tokens: u64,
    /// Number of counted records (turns).
    turn_count: usize,
}
97
98fn is_valid_assistant(
105 val: &serde_json::Value,
106 skip_sidechain: bool,
107 now: &chrono::DateTime<chrono::Utc>,
108) -> bool {
109 if val.get("type").and_then(|t| t.as_str()) != Some("assistant") {
110 return false;
111 }
112 if skip_sidechain && val.get("isSidechain").and_then(|v| v.as_bool()) == Some(true) {
113 return false;
114 }
115 let model = val.pointer("/message/model").and_then(|m| m.as_str());
116 if model == Some("<synthetic>") || model.is_none() {
117 return false;
118 }
119 if val.pointer("/message/usage").is_none() {
121 return false;
122 }
123 let input = val
124 .pointer("/message/usage/input_tokens")
125 .and_then(|v| v.as_u64())
126 .unwrap_or(0);
127 let output = val
128 .pointer("/message/usage/output_tokens")
129 .and_then(|v| v.as_u64())
130 .unwrap_or(0);
131 let cache_creation = val
132 .pointer("/message/usage/cache_creation_input_tokens")
133 .and_then(|v| v.as_u64())
134 .unwrap_or(0);
135 let cache_read = val
136 .pointer("/message/usage/cache_read_input_tokens")
137 .and_then(|v| v.as_u64())
138 .unwrap_or(0);
139 if input + output + cache_creation + cache_read == 0 {
140 return false;
141 }
142 if let Some(ts_str) = val.get("timestamp").and_then(|t| t.as_str()) {
144 if let Ok(ts) = ts_str.parse::<chrono::DateTime<chrono::Utc>>() {
145 if ts > *now {
146 return false;
147 }
148 } else {
149 return false;
150 }
151 } else {
152 return false;
153 }
154 true
155}
156
157fn count_raw_tokens(path: &Path, skip_sidechain: bool) -> Result<RawTokenCount> {
160 let file = File::open(path)
161 .with_context(|| format!("raw counter: failed to open {}", path.display()))?;
162 let reader = BufReader::new(file);
163 let now = chrono::Utc::now();
164
165 let mut by_request: HashMap<String, (u64, u64, u64, u64)> = HashMap::new();
167 let mut no_request_id_count = RawTokenCount::default();
168
169 for line in reader.lines() {
170 let line = line?;
171 let val: serde_json::Value = match serde_json::from_str(&line) {
172 Ok(v) => v,
173 Err(_) => continue,
174 };
175
176 if !is_valid_assistant(&val, skip_sidechain, &now) {
177 continue;
178 }
179
180 let input = val
181 .pointer("/message/usage/input_tokens")
182 .and_then(|v| v.as_u64())
183 .unwrap_or(0);
184 let output = val
185 .pointer("/message/usage/output_tokens")
186 .and_then(|v| v.as_u64())
187 .unwrap_or(0);
188 let cache_creation = val
189 .pointer("/message/usage/cache_creation_input_tokens")
190 .and_then(|v| v.as_u64())
191 .unwrap_or(0);
192 let cache_read = val
193 .pointer("/message/usage/cache_read_input_tokens")
194 .and_then(|v| v.as_u64())
195 .unwrap_or(0);
196
197 let request_id = val.get("requestId").and_then(|r| r.as_str());
198
199 match request_id {
200 Some(rid) if !rid.is_empty() => {
201 by_request.insert(rid.to_string(), (input, output, cache_creation, cache_read));
202 }
203 _ => {
204 no_request_id_count.input_tokens += input;
205 no_request_id_count.output_tokens += output;
206 no_request_id_count.cache_creation_tokens += cache_creation;
207 no_request_id_count.cache_read_tokens += cache_read;
208 no_request_id_count.turn_count += 1;
209 }
210 }
211 }
212
213 let mut result = no_request_id_count;
214 for (input, output, cc, cr) in by_request.values() {
215 result.input_tokens += input;
216 result.output_tokens += output;
217 result.cache_creation_tokens += cc;
218 result.cache_read_tokens += cr;
219 result.turn_count += 1;
220 }
221
222 Ok(result)
223}
224
225fn count_tokens_by_request_id(
229 path: &Path,
230 skip_sidechain: bool,
231) -> Result<(HashMap<String, u64>, u64)> {
232 let file = File::open(path)?;
233 let reader = BufReader::new(file);
234 let now = chrono::Utc::now();
235 let mut by_rid: HashMap<String, u64> = HashMap::new();
236 let mut no_rid_output: u64 = 0;
237
238 for line in reader.lines() {
239 let line = line?;
240 let val: serde_json::Value = match serde_json::from_str(&line) {
241 Ok(v) => v,
242 Err(_) => continue,
243 };
244 if !is_valid_assistant(&val, skip_sidechain, &now) {
245 continue;
246 }
247 let output = val
248 .pointer("/message/usage/output_tokens")
249 .and_then(|v| v.as_u64())
250 .unwrap_or(0);
251 match val.get("requestId").and_then(|r| r.as_str()) {
252 Some(rid) if !rid.is_empty() => {
253 by_rid.insert(rid.to_string(), output);
254 }
255 _ => {
256 no_rid_output += output;
257 }
258 }
259 }
260 Ok((by_rid, no_rid_output))
261}
262
263fn collect_valid_request_ids(path: &Path, skip_sidechain: bool) -> Result<HashSet<String>> {
266 let file = File::open(path)?;
267 let reader = BufReader::new(file);
268 let now = chrono::Utc::now();
269 let mut ids = HashSet::new();
270
271 for line in reader.lines() {
272 let line = line?;
273 let val: serde_json::Value = match serde_json::from_str(&line) {
274 Ok(v) => v,
275 Err(_) => continue,
276 };
277 if !is_valid_assistant(&val, skip_sidechain, &now) {
278 continue;
279 }
280 if let Some(rid) = val.get("requestId").and_then(|r| r.as_str()) {
281 if !rid.is_empty() {
282 ids.insert(rid.to_string());
283 }
284 }
285 }
286 Ok(ids)
287}
288
289pub fn validate_all(
293 sessions: &[&SessionData],
294 quality: &GlobalDataQuality,
295 claude_home: &Path,
296 calc: &PricingCalculator,
297) -> Result<ValidationReport> {
298 let mut files = scan_claude_home(claude_home)?;
300 resolve_agent_parents(&mut files)?;
301
302 let (main_files, agent_files): (Vec<&SessionFile>, Vec<&SessionFile>) =
303 files.iter().partition(|f| !f.is_agent);
304
305 let mut structure_checks = Vec::new();
306 let mut session_results = Vec::new();
307
308 structure_checks.push(Check::compare(
312 "session_count == main_file_count",
313 main_files.len(),
314 quality.total_session_files,
315 ));
316
317 structure_checks.push(Check::compare(
319 "agent_file_count",
320 agent_files.len(),
321 quality.total_agent_files,
322 ));
323
324 let main_session_ids: HashSet<&str> =
326 main_files.iter().map(|f| f.session_id.as_str()).collect();
327 let orphan_count = agent_files
328 .iter()
329 .filter(|f| {
330 let parent = f.parent_session_id.as_deref().unwrap_or(&f.session_id);
331 !main_session_ids.contains(parent)
332 })
333 .count();
334 structure_checks.push(Check::pass(
335 format!("orphan_agents (no main session file): {}", orphan_count),
336 orphan_count,
337 ));
338
339 let unique_main_ids: HashSet<&str> = main_files.iter().map(|f| f.session_id.as_str()).collect();
341 let dup_count = main_files.len() - unique_main_ids.len();
342 structure_checks.push(Check::pass(
343 format!(
344 "main_session_files: {} files, {} unique IDs ({} duplicates)",
345 main_files.len(),
346 unique_main_ids.len(),
347 dup_count
348 ),
349 main_files.len(),
350 ));
351
352 let mut cross_file_overlap = 0usize;
354 for agent in &agent_files {
355 let parent_id = agent
356 .parent_session_id
357 .as_deref()
358 .unwrap_or(&agent.session_id);
359 let parent_file = main_files.iter().find(|f| f.session_id == parent_id);
360 if let Some(pf) = parent_file {
361 let parent_rids = collect_valid_request_ids(&pf.path, true).unwrap_or_default();
362 let agent_rids = collect_valid_request_ids(&agent.path, false).unwrap_or_default();
363 cross_file_overlap += parent_rids.intersection(&agent_rids).count();
364 }
365 }
366 structure_checks.push(Check::pass(
367 format!(
368 "cross_file_overlapping_request_ids (deduped: {})",
369 cross_file_overlap
370 ),
371 cross_file_overlap,
372 ));
373
374 let mut agents_by_parent: HashMap<&str, Vec<&SessionFile>> = HashMap::new();
378 for af in &agent_files {
379 let parent_id = af.parent_session_id.as_deref().unwrap_or(&af.session_id);
380 agents_by_parent.entry(parent_id).or_default().push(af);
381 }
382
383 let main_file_map: HashMap<&str, &SessionFile> = main_files
385 .iter()
386 .map(|f| (f.session_id.as_str(), *f))
387 .collect();
388
389 for session in sessions {
390 let mut token_checks = Vec::new();
391 let mut agent_checks = Vec::new();
392
393 if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
395 let raw_main = count_raw_tokens(&mf.path, true).unwrap_or_default();
396
397 let pipeline_main_input: u64 = session
399 .turns
400 .iter()
401 .map(|t| t.usage.input_tokens.unwrap_or(0))
402 .sum();
403 let pipeline_main_output: u64 = session
404 .turns
405 .iter()
406 .map(|t| t.usage.output_tokens.unwrap_or(0))
407 .sum();
408 let pipeline_main_cache_creation: u64 = session
409 .turns
410 .iter()
411 .map(|t| t.usage.cache_creation_input_tokens.unwrap_or(0))
412 .sum();
413 let pipeline_main_cache_read: u64 = session
414 .turns
415 .iter()
416 .map(|t| t.usage.cache_read_input_tokens.unwrap_or(0))
417 .sum();
418 let pipeline_main_turns = session.turns.len();
419
420 token_checks.push(Check::compare(
421 "main_turn_count",
422 raw_main.turn_count,
423 pipeline_main_turns,
424 ));
425 token_checks.push(Check::compare(
426 "main_input_tokens",
427 raw_main.input_tokens,
428 pipeline_main_input,
429 ));
430 token_checks.push(Check::compare(
431 "main_output_tokens",
432 raw_main.output_tokens,
433 pipeline_main_output,
434 ));
435 token_checks.push(Check::compare(
436 "main_cache_creation_tokens",
437 raw_main.cache_creation_tokens,
438 pipeline_main_cache_creation,
439 ));
440 token_checks.push(Check::compare(
441 "main_cache_read_tokens",
442 raw_main.cache_read_tokens,
443 pipeline_main_cache_read,
444 ));
445 }
446
447 let agent_session_files = agents_by_parent.get(session.session_id.as_str());
449 let expected_agent_files = agent_session_files.map_or(0, |v| v.len());
450 let actual_agent_file_count = if expected_agent_files > 0 {
451 expected_agent_files
452 } else {
453 0
454 };
455
456 agent_checks.push(Check::compare(
457 "agent_file_count (from scanner)",
458 actual_agent_file_count,
459 expected_agent_files,
460 ));
461
462 if expected_agent_files > 0 {
464 if let Some(afs) = agent_session_files {
465 let main_file = main_file_map.get(session.session_id.as_str());
467 let main_rids = main_file
468 .map(|mf| collect_valid_request_ids(&mf.path, true).unwrap_or_default())
469 .unwrap_or_default();
470
471 let mut expected_unique_agent_turns = 0usize;
473 let mut raw_agent_output: u64 = 0;
474
475 for af in afs {
476 let raw = count_raw_tokens(&af.path, false).unwrap_or_default();
477 let file_rids = collect_valid_request_ids(&af.path, false).unwrap_or_default();
478 let file_overlap = file_rids.intersection(&main_rids).count();
479 let unique_turns = raw.turn_count.saturating_sub(file_overlap);
480 expected_unique_agent_turns += unique_turns;
481
482 let (per_rid, no_rid_output) =
484 count_tokens_by_request_id(&af.path, false).unwrap_or_default();
485 for (rid, output) in &per_rid {
486 if !main_rids.contains(rid) {
487 raw_agent_output += output;
488 }
489 }
490 raw_agent_output += no_rid_output;
491 }
492
493 let pipeline_subagent_turn_count = session.agent_turn_count();
495 agent_checks.push(Check::compare(
496 "agent_turn_count (after cross-file dedup)",
497 expected_unique_agent_turns,
498 pipeline_subagent_turn_count,
499 ));
500
501 if expected_unique_agent_turns > 0 {
503 agent_checks.push(Check::compare(
504 "has_agent_turns (non-overlapping exist)",
505 "true",
506 (pipeline_subagent_turn_count > 0).to_string(),
507 ));
508 }
509
510 let pipeline_agent_output: u64 = session
512 .subagents
513 .iter()
514 .flat_map(|s| s.turns.iter())
515 .map(|t| t.usage.output_tokens.unwrap_or(0))
516 .sum();
517
518 let agent_output_match = {
519 if raw_agent_output == 0 && pipeline_agent_output == 0 {
520 true
521 } else {
522 let max_val = raw_agent_output.max(pipeline_agent_output) as f64;
523 if max_val == 0.0 {
524 true
525 } else {
526 (raw_agent_output as f64 - pipeline_agent_output as f64).abs() / max_val
527 < 0.05
528 }
529 }
530 };
531
532 agent_checks.push(Check {
533 name: "agent_output_tokens (±5%)".into(),
534 expected: raw_agent_output.to_string(),
535 actual: pipeline_agent_output.to_string(),
536 passed: agent_output_match,
537 });
538
539 let all_marked_agent = session
541 .subagents
542 .iter()
543 .flat_map(|s| s.turns.iter())
544 .all(|t| t.is_agent);
545 agent_checks.push(Check::compare(
546 "all agent_turns have is_agent=true",
547 "true",
548 all_marked_agent.to_string(),
549 ));
550 }
551 }
552
553 let pipeline_total_output: u64 = session
555 .turns
556 .iter()
557 .chain(session.subagents.iter().flat_map(|s| s.turns.iter()))
558 .map(|t| t.usage.output_tokens.unwrap_or(0))
559 .sum();
560 let pipeline_total_turns = session.total_turn_count();
561
562 token_checks.push(Check::compare(
564 "total_turn_count == turns + agent_turns",
565 pipeline_total_turns,
566 session.all_responses().len(),
567 ));
568
569 if pipeline_total_turns > 0 {
571 token_checks.push(Check::compare(
572 "total_output_tokens > 0",
573 "true",
574 (pipeline_total_output > 0).to_string(),
575 ));
576 }
577
578 let pipeline_cost: f64 = session
580 .turns
581 .iter()
582 .chain(session.subagents.iter().flat_map(|s| s.turns.iter()))
583 .map(|t| calc.calculate_turn_cost(&t.model, &t.usage).total)
584 .sum();
585
586 let has_tokens = session
588 .turns
589 .iter()
590 .chain(session.subagents.iter().flat_map(|s| s.turns.iter()))
591 .any(|t| {
592 t.usage.input_tokens.unwrap_or(0) > 0 || t.usage.output_tokens.unwrap_or(0) > 0
593 });
594 if has_tokens {
595 token_checks.push(Check::compare(
596 "cost > 0 when tokens exist",
597 "true",
598 (pipeline_cost > 0.0).to_string(),
599 ));
600 }
601
602 if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
604 token_checks.push(Check::compare(
605 "project_association",
606 mf.project.as_deref().unwrap_or("(none)"),
607 session.project.as_deref().unwrap_or("(none)"),
608 ));
609 }
610
611 let project_name = session
612 .project
613 .as_deref()
614 .unwrap_or("(unknown)")
615 .to_string();
616
617 session_results.push(SessionValidation {
618 session_id: session.session_id.clone(),
619 project: project_name,
620 token_checks,
621 agent_checks,
622 });
623 }
624
625 let mut summary = ValidationSummary::default();
628
629 for check in &structure_checks {
630 summary.total_checks += 1;
631 if check.passed {
632 summary.passed += 1;
633 } else {
634 summary.failed += 1;
635 }
636 }
637
638 for sv in &session_results {
639 summary.sessions_validated += 1;
640 let mut session_pass = true;
641 for check in sv.token_checks.iter().chain(sv.agent_checks.iter()) {
642 summary.total_checks += 1;
643 if check.passed {
644 summary.passed += 1;
645 } else {
646 summary.failed += 1;
647 session_pass = false;
648 }
649 }
650 if session_pass {
651 summary.sessions_passed += 1;
652 }
653 }
654
655 Ok(ValidationReport {
656 session_results,
657 structure_checks,
658 summary,
659 })
660}
661
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Builds one non-sidechain assistant record (a single JSONL line) with
    /// the given request id and input/output token counts; both cache
    /// counters are zero. The timestamp is fixed, so the record is only
    /// accepted while that instant is not in the future.
    fn make_assistant_line(request_id: &str, input: u64, output: u64) -> String {
        format!(
            r#"{{"type":"assistant","uuid":"u-{}","timestamp":"2026-03-16T10:00:00Z","message":{{"model":"claude-opus-4-6","role":"assistant","stop_reason":"end_turn","usage":{{"input_tokens":{},"output_tokens":{},"cache_creation_input_tokens":0,"cache_read_input_tokens":0}},"content":[{{"type":"text","text":"hi"}}]}},"sessionId":"s1","cwd":"/tmp","gitBranch":"","userType":"external","isSidechain":false,"parentUuid":null,"requestId":"{}"}}"#,
            request_id, input, output, request_id
        )
    }

    /// Runs the raw counter over `file`, panicking if counting fails.
    fn count(file: &NamedTempFile, skip_sidechain: bool) -> RawTokenCount {
        count_raw_tokens(file.path(), skip_sidechain).unwrap()
    }

    /// Two records with distinct request ids are both counted and summed.
    #[test]
    fn raw_counter_basic() {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", make_assistant_line("r1", 100, 50)).unwrap();
        writeln!(file, "{}", make_assistant_line("r2", 200, 75)).unwrap();
        file.flush().unwrap();

        let counts = count(&file, true);
        assert_eq!(counts.turn_count, 2);
        assert_eq!(counts.input_tokens, 300);
        assert_eq!(counts.output_tokens, 125);
    }

    /// Records sharing a request id count as one turn; the last record wins.
    #[test]
    fn raw_counter_deduplicates_streaming() {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", make_assistant_line("r1", 100, 50)).unwrap();
        writeln!(file, "{}", make_assistant_line("r1", 200, 75)).unwrap();
        file.flush().unwrap();

        let counts = count(&file, true);
        assert_eq!(counts.turn_count, 1);
        assert_eq!(counts.input_tokens, 200);
        assert_eq!(counts.output_tokens, 75);
    }

    /// Records whose model is `<synthetic>` are ignored.
    #[test]
    fn raw_counter_skips_synthetic() {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, r#"{{"type":"assistant","uuid":"u1","timestamp":"2026-03-16T10:00:00Z","message":{{"model":"<synthetic>","role":"assistant","stop_reason":"end_turn","usage":{{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":0,"cache_read_input_tokens":0}},"content":[]}},"sessionId":"s1","cwd":"/tmp","gitBranch":"","userType":"external","isSidechain":false,"parentUuid":null,"requestId":"r1"}}"#).unwrap();
        writeln!(file, "{}", make_assistant_line("r2", 200, 75)).unwrap();
        file.flush().unwrap();

        let counts = count(&file, true);
        assert_eq!(counts.turn_count, 1);
        assert_eq!(counts.input_tokens, 200);
    }

    /// Sidechain records are dropped only when `skip_sidechain` is set.
    #[test]
    fn raw_counter_respects_sidechain_flag() {
        let sidechain_line = r#"{"type":"assistant","uuid":"u1","timestamp":"2026-03-16T10:00:00Z","message":{"model":"claude-opus-4-6","role":"assistant","stop_reason":"end_turn","usage":{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":0,"cache_read_input_tokens":0},"content":[]},"sessionId":"s1","cwd":"/tmp","gitBranch":"","userType":"external","isSidechain":true,"parentUuid":null,"requestId":"r1"}"#;
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", sidechain_line).unwrap();
        file.flush().unwrap();

        let skipped = count(&file, true);
        assert_eq!(skipped.turn_count, 0);

        let included = count(&file, false);
        assert_eq!(included.turn_count, 1);
        assert_eq!(included.input_tokens, 100);
    }

    /// Only `assistant` records are counted; other record types are ignored.
    #[test]
    fn raw_counter_skips_non_assistant() {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, r#"{{"type":"user","uuid":"u1","message":{{"role":"user","content":"hi"}},"timestamp":"2026-03-16T10:00:00Z","sessionId":"s1"}}"#).unwrap();
        writeln!(file, r#"{{"type":"progress","data":{{"type":"hook"}},"uuid":"u2","timestamp":"2026-03-16T10:00:00Z","sessionId":"s1"}}"#).unwrap();
        writeln!(file, "{}", make_assistant_line("r1", 100, 50)).unwrap();
        file.flush().unwrap();

        let counts = count(&file, true);
        assert_eq!(counts.turn_count, 1);
    }
}