1use anyhow::{Context, Result};
2use std::collections::{HashMap, HashSet};
3use std::fmt;
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::Path;
7
8use crate::data::models::{GlobalDataQuality, SessionData, SessionFile};
9use crate::data::scanner::{resolve_agent_parents, scan_claude_home};
10use crate::pricing::calculator::PricingCalculator;
11
12#[derive(Debug)]
15pub struct ValidationReport {
16 pub session_results: Vec<SessionValidation>,
17 pub structure_checks: Vec<Check>,
18 pub summary: ValidationSummary,
19}
20
21#[derive(Debug)]
22pub struct SessionValidation {
23 pub session_id: String,
24 pub project: String,
25 pub token_checks: Vec<Check>,
26 pub agent_checks: Vec<Check>,
27}
28
/// Result of a single validation assertion, with both sides rendered as text.
#[derive(Debug)]
pub struct Check {
    /// Human-readable label describing what was checked.
    pub name: String,
    /// Rendered reference value.
    pub expected: String,
    /// Rendered observed value.
    pub actual: String,
    /// `true` when the check succeeded.
    pub passed: bool,
}

impl Check {
    /// Builds an informational check that always passes; `value` is recorded as both the
    /// expected and the actual side.
    fn pass(name: impl Into<String>, value: impl fmt::Display) -> Self {
        let rendered = value.to_string();
        Self {
            name: name.into(),
            actual: rendered.clone(),
            expected: rendered,
            passed: true,
        }
    }

    /// Builds a check that passes exactly when both values have the same `Display` output.
    ///
    /// Comparing the rendered text lets callers mix types (e.g. `usize` against `u64`).
    fn compare(
        name: impl Into<String>,
        expected: impl fmt::Display,
        actual: impl fmt::Display,
    ) -> Self {
        let (expected, actual) = (expected.to_string(), actual.to_string());
        Self {
            // Evaluated before the two strings are moved into their fields below.
            passed: expected == actual,
            name: name.into(),
            expected,
            actual,
        }
    }

    /// Builds a check that passes when `expected` and `actual` differ by strictly less than
    /// `tolerance`. Both values are rendered with two decimal places.
    // Not called anywhere in this file at the moment; kept available for float comparisons.
    #[allow(dead_code)]
    fn compare_f64(name: impl Into<String>, expected: f64, actual: f64, tolerance: f64) -> Self {
        let two_decimals = |x: f64| format!("{:.2}", x);
        let delta = (expected - actual).abs();
        Self {
            name: name.into(),
            expected: two_decimals(expected),
            actual: two_decimals(actual),
            passed: delta < tolerance,
        }
    }
}
75
/// Aggregate counters over every check in a report. All counters start at zero.
#[derive(Debug, Default)]
pub struct ValidationSummary {
    /// Number of checks evaluated (structure checks plus all per-session checks).
    pub total_checks: usize,
    /// Number of checks whose `passed` flag is `true`.
    pub passed: usize,
    /// Number of checks whose `passed` flag is `false`.
    pub failed: usize,
    /// Number of sessions that were validated.
    pub sessions_validated: usize,
    /// Number of sessions in which every token and agent check passed.
    pub sessions_passed: usize,
}
84
/// Token totals obtained by re-reading a raw session file. All fields start at zero.
#[derive(Debug, Default)]
struct RawTokenCount {
    /// Sum of `input_tokens` over the counted records.
    input_tokens: u64,
    /// Sum of `output_tokens` over the counted records.
    output_tokens: u64,
    /// Sum of `cache_creation_input_tokens` over the counted records.
    cache_creation_tokens: u64,
    /// Sum of `cache_read_input_tokens` over the counted records.
    cache_read_tokens: u64,
    /// Number of records (turns) that contributed to the sums.
    turn_count: usize,
}
97
98fn is_valid_assistant(
105 val: &serde_json::Value,
106 skip_sidechain: bool,
107 now: &chrono::DateTime<chrono::Utc>,
108) -> bool {
109 if val.get("type").and_then(|t| t.as_str()) != Some("assistant") {
110 return false;
111 }
112 if skip_sidechain && val.get("isSidechain").and_then(|v| v.as_bool()) == Some(true) {
113 return false;
114 }
115 let model = val.pointer("/message/model").and_then(|m| m.as_str());
116 if model == Some("<synthetic>") || model.is_none() {
117 return false;
118 }
119 if val.pointer("/message/usage").is_none() {
121 return false;
122 }
123 let input = val
124 .pointer("/message/usage/input_tokens")
125 .and_then(|v| v.as_u64())
126 .unwrap_or(0);
127 let output = val
128 .pointer("/message/usage/output_tokens")
129 .and_then(|v| v.as_u64())
130 .unwrap_or(0);
131 let cache_creation = val
132 .pointer("/message/usage/cache_creation_input_tokens")
133 .and_then(|v| v.as_u64())
134 .unwrap_or(0);
135 let cache_read = val
136 .pointer("/message/usage/cache_read_input_tokens")
137 .and_then(|v| v.as_u64())
138 .unwrap_or(0);
139 if input + output + cache_creation + cache_read == 0 {
140 return false;
141 }
142 if let Some(ts_str) = val.get("timestamp").and_then(|t| t.as_str()) {
144 if let Ok(ts) = ts_str.parse::<chrono::DateTime<chrono::Utc>>() {
145 if ts > *now {
146 return false;
147 }
148 } else {
149 return false;
150 }
151 } else {
152 return false;
153 }
154 true
155}
156
157fn count_raw_tokens(path: &Path, skip_sidechain: bool) -> Result<RawTokenCount> {
160 let file = File::open(path)
161 .with_context(|| format!("raw counter: failed to open {}", path.display()))?;
162 let reader = BufReader::new(file);
163 let now = chrono::Utc::now();
164
165 let mut by_request: HashMap<String, (u64, u64, u64, u64)> = HashMap::new();
167 let mut no_request_id_count = RawTokenCount::default();
168
169 for line in reader.lines() {
170 let line = line?;
171 let val: serde_json::Value = match serde_json::from_str(&line) {
172 Ok(v) => v,
173 Err(_) => continue,
174 };
175
176 if !is_valid_assistant(&val, skip_sidechain, &now) {
177 continue;
178 }
179
180 let input = val
181 .pointer("/message/usage/input_tokens")
182 .and_then(|v| v.as_u64())
183 .unwrap_or(0);
184 let output = val
185 .pointer("/message/usage/output_tokens")
186 .and_then(|v| v.as_u64())
187 .unwrap_or(0);
188 let cache_creation = val
189 .pointer("/message/usage/cache_creation_input_tokens")
190 .and_then(|v| v.as_u64())
191 .unwrap_or(0);
192 let cache_read = val
193 .pointer("/message/usage/cache_read_input_tokens")
194 .and_then(|v| v.as_u64())
195 .unwrap_or(0);
196
197 let request_id = val.get("requestId").and_then(|r| r.as_str());
198
199 match request_id {
200 Some(rid) if !rid.is_empty() => {
201 by_request.insert(rid.to_string(), (input, output, cache_creation, cache_read));
202 }
203 _ => {
204 no_request_id_count.input_tokens += input;
205 no_request_id_count.output_tokens += output;
206 no_request_id_count.cache_creation_tokens += cache_creation;
207 no_request_id_count.cache_read_tokens += cache_read;
208 no_request_id_count.turn_count += 1;
209 }
210 }
211 }
212
213 let mut result = no_request_id_count;
214 for (input, output, cc, cr) in by_request.values() {
215 result.input_tokens += input;
216 result.output_tokens += output;
217 result.cache_creation_tokens += cc;
218 result.cache_read_tokens += cr;
219 result.turn_count += 1;
220 }
221
222 Ok(result)
223}
224
225fn count_tokens_by_request_id(
229 path: &Path,
230 skip_sidechain: bool,
231) -> Result<(HashMap<String, u64>, u64)> {
232 let file = File::open(path)?;
233 let reader = BufReader::new(file);
234 let now = chrono::Utc::now();
235 let mut by_rid: HashMap<String, u64> = HashMap::new();
236 let mut no_rid_output: u64 = 0;
237
238 for line in reader.lines() {
239 let line = line?;
240 let val: serde_json::Value = match serde_json::from_str(&line) {
241 Ok(v) => v,
242 Err(_) => continue,
243 };
244 if !is_valid_assistant(&val, skip_sidechain, &now) {
245 continue;
246 }
247 let output = val
248 .pointer("/message/usage/output_tokens")
249 .and_then(|v| v.as_u64())
250 .unwrap_or(0);
251 match val.get("requestId").and_then(|r| r.as_str()) {
252 Some(rid) if !rid.is_empty() => {
253 by_rid.insert(rid.to_string(), output);
254 }
255 _ => {
256 no_rid_output += output;
257 }
258 }
259 }
260 Ok((by_rid, no_rid_output))
261}
262
263fn collect_valid_request_ids(path: &Path, skip_sidechain: bool) -> Result<HashSet<String>> {
266 let file = File::open(path)?;
267 let reader = BufReader::new(file);
268 let now = chrono::Utc::now();
269 let mut ids = HashSet::new();
270
271 for line in reader.lines() {
272 let line = line?;
273 let val: serde_json::Value = match serde_json::from_str(&line) {
274 Ok(v) => v,
275 Err(_) => continue,
276 };
277 if !is_valid_assistant(&val, skip_sidechain, &now) {
278 continue;
279 }
280 if let Some(rid) = val.get("requestId").and_then(|r| r.as_str()) {
281 if !rid.is_empty() {
282 ids.insert(rid.to_string());
283 }
284 }
285 }
286 Ok(ids)
287}
288
289pub fn validate_all(
293 sessions: &[&SessionData],
294 quality: &GlobalDataQuality,
295 claude_home: &Path,
296 calc: &PricingCalculator,
297) -> Result<ValidationReport> {
298 let mut files = scan_claude_home(claude_home)?;
300 resolve_agent_parents(&mut files)?;
301
302 let (main_files, agent_files): (Vec<&SessionFile>, Vec<&SessionFile>) =
303 files.iter().partition(|f| !f.is_agent);
304
305 let mut structure_checks = Vec::new();
306 let mut session_results = Vec::new();
307
308 structure_checks.push(Check::compare(
312 "session_count == main_file_count",
313 main_files.len(),
314 quality.total_session_files,
315 ));
316
317 structure_checks.push(Check::compare(
319 "agent_file_count",
320 agent_files.len(),
321 quality.total_agent_files,
322 ));
323
324 let main_session_ids: HashSet<&str> =
326 main_files.iter().map(|f| f.session_id.as_str()).collect();
327 let orphan_count = agent_files
328 .iter()
329 .filter(|f| {
330 let parent = f.parent_session_id.as_deref().unwrap_or(&f.session_id);
331 !main_session_ids.contains(parent)
332 })
333 .count();
334 structure_checks.push(Check::pass(
335 format!("orphan_agents (no main session file): {}", orphan_count),
336 orphan_count,
337 ));
338
339 let unique_main_ids: HashSet<&str> = main_files.iter().map(|f| f.session_id.as_str()).collect();
341 let dup_count = main_files.len() - unique_main_ids.len();
342 structure_checks.push(Check::pass(
343 format!(
344 "main_session_files: {} files, {} unique IDs ({} duplicates)",
345 main_files.len(),
346 unique_main_ids.len(),
347 dup_count
348 ),
349 main_files.len(),
350 ));
351
352 let mut cross_file_overlap = 0usize;
354 for agent in &agent_files {
355 let parent_id = agent
356 .parent_session_id
357 .as_deref()
358 .unwrap_or(&agent.session_id);
359 let parent_file = main_files.iter().find(|f| f.session_id == parent_id);
360 if let Some(pf) = parent_file {
361 let parent_rids = collect_valid_request_ids(&pf.path, true).unwrap_or_default();
362 let agent_rids = collect_valid_request_ids(&agent.path, false).unwrap_or_default();
363 cross_file_overlap += parent_rids.intersection(&agent_rids).count();
364 }
365 }
366 structure_checks.push(Check::pass(
367 format!(
368 "cross_file_overlapping_request_ids (deduped: {})",
369 cross_file_overlap
370 ),
371 cross_file_overlap,
372 ));
373
374 let mut agents_by_parent: HashMap<&str, Vec<&SessionFile>> = HashMap::new();
378 for af in &agent_files {
379 let parent_id = af.parent_session_id.as_deref().unwrap_or(&af.session_id);
380 agents_by_parent.entry(parent_id).or_default().push(af);
381 }
382
383 let main_file_map: HashMap<&str, &SessionFile> = main_files
385 .iter()
386 .map(|f| (f.session_id.as_str(), *f))
387 .collect();
388
389 for session in sessions {
390 let mut token_checks = Vec::new();
391 let mut agent_checks = Vec::new();
392
393 if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
395 let raw_main = count_raw_tokens(&mf.path, true).unwrap_or_default();
396
397 let pipeline_main_input: u64 = session
399 .turns
400 .iter()
401 .map(|t| t.usage.input_tokens.unwrap_or(0))
402 .sum();
403 let pipeline_main_output: u64 = session
404 .turns
405 .iter()
406 .map(|t| t.usage.output_tokens.unwrap_or(0))
407 .sum();
408 let pipeline_main_cache_creation: u64 = session
409 .turns
410 .iter()
411 .map(|t| t.usage.cache_creation_input_tokens.unwrap_or(0))
412 .sum();
413 let pipeline_main_cache_read: u64 = session
414 .turns
415 .iter()
416 .map(|t| t.usage.cache_read_input_tokens.unwrap_or(0))
417 .sum();
418 let pipeline_main_turns = session.turns.len();
419
420 token_checks.push(Check::compare(
421 "main_turn_count",
422 raw_main.turn_count,
423 pipeline_main_turns,
424 ));
425 token_checks.push(Check::compare(
426 "main_input_tokens",
427 raw_main.input_tokens,
428 pipeline_main_input,
429 ));
430 token_checks.push(Check::compare(
431 "main_output_tokens",
432 raw_main.output_tokens,
433 pipeline_main_output,
434 ));
435 token_checks.push(Check::compare(
436 "main_cache_creation_tokens",
437 raw_main.cache_creation_tokens,
438 pipeline_main_cache_creation,
439 ));
440 token_checks.push(Check::compare(
441 "main_cache_read_tokens",
442 raw_main.cache_read_tokens,
443 pipeline_main_cache_read,
444 ));
445 }
446
447 let agent_session_files = agents_by_parent.get(session.session_id.as_str());
449 let expected_agent_files = agent_session_files.map_or(0, |v| v.len());
450 let actual_agent_file_count = if expected_agent_files > 0 {
451 expected_agent_files
452 } else {
453 0
454 };
455
456 agent_checks.push(Check::compare(
457 "agent_file_count (from scanner)",
458 actual_agent_file_count,
459 expected_agent_files,
460 ));
461
462 if expected_agent_files > 0 {
464 if let Some(afs) = agent_session_files {
465 let main_file = main_file_map.get(session.session_id.as_str());
467 let main_rids = main_file
468 .map(|mf| collect_valid_request_ids(&mf.path, true).unwrap_or_default())
469 .unwrap_or_default();
470
471 let mut expected_unique_agent_turns = 0usize;
473 let mut raw_agent_output: u64 = 0;
474
475 for af in afs {
476 let raw = count_raw_tokens(&af.path, false).unwrap_or_default();
477 let file_rids = collect_valid_request_ids(&af.path, false).unwrap_or_default();
478 let file_overlap = file_rids.intersection(&main_rids).count();
479 let unique_turns = raw.turn_count.saturating_sub(file_overlap);
480 expected_unique_agent_turns += unique_turns;
481
482 let (per_rid, no_rid_output) =
484 count_tokens_by_request_id(&af.path, false).unwrap_or_default();
485 for (rid, output) in &per_rid {
486 if !main_rids.contains(rid) {
487 raw_agent_output += output;
488 }
489 }
490 raw_agent_output += no_rid_output;
491 }
492
493 agent_checks.push(Check::compare(
495 "agent_turn_count (after cross-file dedup)",
496 expected_unique_agent_turns,
497 session.agent_turns.len(),
498 ));
499
500 if expected_unique_agent_turns > 0 {
502 agent_checks.push(Check::compare(
503 "has_agent_turns (non-overlapping exist)",
504 "true",
505 (!session.agent_turns.is_empty()).to_string(),
506 ));
507 }
508
509 let pipeline_agent_output: u64 = session
511 .agent_turns
512 .iter()
513 .map(|t| t.usage.output_tokens.unwrap_or(0))
514 .sum();
515
516 let agent_output_match = {
517 if raw_agent_output == 0 && pipeline_agent_output == 0 {
518 true
519 } else {
520 let max_val = raw_agent_output.max(pipeline_agent_output) as f64;
521 if max_val == 0.0 {
522 true
523 } else {
524 (raw_agent_output as f64 - pipeline_agent_output as f64).abs() / max_val
525 < 0.05
526 }
527 }
528 };
529
530 agent_checks.push(Check {
531 name: "agent_output_tokens (±5%)".into(),
532 expected: raw_agent_output.to_string(),
533 actual: pipeline_agent_output.to_string(),
534 passed: agent_output_match,
535 });
536
537 let all_marked_agent = session.agent_turns.iter().all(|t| t.is_agent);
539 agent_checks.push(Check::compare(
540 "all agent_turns have is_agent=true",
541 "true",
542 all_marked_agent.to_string(),
543 ));
544 }
545 }
546
547 let pipeline_total_output: u64 = session
549 .turns
550 .iter()
551 .chain(session.agent_turns.iter())
552 .map(|t| t.usage.output_tokens.unwrap_or(0))
553 .sum();
554 let pipeline_total_turns = session.turns.len() + session.agent_turns.len();
555
556 token_checks.push(Check::compare(
558 "total_turn_count == turns + agent_turns",
559 pipeline_total_turns,
560 session.all_responses().len(),
561 ));
562
563 if pipeline_total_turns > 0 {
565 token_checks.push(Check::compare(
566 "total_output_tokens > 0",
567 "true",
568 (pipeline_total_output > 0).to_string(),
569 ));
570 }
571
572 let pipeline_cost: f64 = session
574 .turns
575 .iter()
576 .chain(session.agent_turns.iter())
577 .map(|t| calc.calculate_turn_cost(&t.model, &t.usage).total)
578 .sum();
579
580 let has_tokens = session
582 .turns
583 .iter()
584 .chain(session.agent_turns.iter())
585 .any(|t| {
586 t.usage.input_tokens.unwrap_or(0) > 0 || t.usage.output_tokens.unwrap_or(0) > 0
587 });
588 if has_tokens {
589 token_checks.push(Check::compare(
590 "cost > 0 when tokens exist",
591 "true",
592 (pipeline_cost > 0.0).to_string(),
593 ));
594 }
595
596 if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
598 token_checks.push(Check::compare(
599 "project_association",
600 mf.project.as_deref().unwrap_or("(none)"),
601 session.project.as_deref().unwrap_or("(none)"),
602 ));
603 }
604
605 let project_name = session
606 .project
607 .as_deref()
608 .unwrap_or("(unknown)")
609 .to_string();
610
611 session_results.push(SessionValidation {
612 session_id: session.session_id.clone(),
613 project: project_name,
614 token_checks,
615 agent_checks,
616 });
617 }
618
619 let mut summary = ValidationSummary::default();
622
623 for check in &structure_checks {
624 summary.total_checks += 1;
625 if check.passed {
626 summary.passed += 1;
627 } else {
628 summary.failed += 1;
629 }
630 }
631
632 for sv in &session_results {
633 summary.sessions_validated += 1;
634 let mut session_pass = true;
635 for check in sv.token_checks.iter().chain(sv.agent_checks.iter()) {
636 summary.total_checks += 1;
637 if check.passed {
638 summary.passed += 1;
639 } else {
640 summary.failed += 1;
641 session_pass = false;
642 }
643 }
644 if session_pass {
645 summary.sessions_passed += 1;
646 }
647 }
648
649 Ok(ValidationReport {
650 session_results,
651 structure_checks,
652 summary,
653 })
654}
655
#[cfg(test)]
mod tests {
    //! Unit tests for the raw token counter.
    //!
    //! Every fixture record is stamped `2026-03-16T10:00:00Z`. Records with a timestamp
    //! later than the current time are rejected by `is_valid_assistant`, so these tests
    //! rely on the system clock being past that instant.

    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Renders a valid, non-sidechain assistant record with the given request id and
    /// input/output token counts (both cache counters are 0).
    fn make_assistant_line(request_id: &str, input: u64, output: u64) -> String {
        format!(
            r#"{{"type":"assistant","uuid":"u-{}","timestamp":"2026-03-16T10:00:00Z","message":{{"model":"claude-opus-4-6","role":"assistant","stop_reason":"end_turn","usage":{{"input_tokens":{},"output_tokens":{},"cache_creation_input_tokens":0,"cache_read_input_tokens":0}},"content":[{{"type":"text","text":"hi"}}]}},"sessionId":"s1","cwd":"/tmp","gitBranch":"","userType":"external","isSidechain":false,"parentUuid":null,"requestId":"{}"}}"#,
            request_id, input, output, request_id
        )
    }

    /// Writes each entry of `lines` as one line of a fresh temporary file and flushes it.
    fn jsonl_file<S: AsRef<str>>(lines: &[S]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        for line in lines {
            writeln!(file, "{}", line.as_ref()).unwrap();
        }
        file.flush().unwrap();
        file
    }

    #[test]
    fn raw_counter_basic() {
        let file = jsonl_file(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r2", 200, 75),
        ]);

        let counts = count_raw_tokens(file.path(), true).unwrap();
        assert_eq!(counts.turn_count, 2);
        assert_eq!(counts.input_tokens, 300);
        assert_eq!(counts.output_tokens, 125);
    }

    #[test]
    fn raw_counter_deduplicates_streaming() {
        // Two records share request id "r1"; only the later one must be counted.
        let file = jsonl_file(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r1", 200, 75),
        ]);

        let counts = count_raw_tokens(file.path(), true).unwrap();
        assert_eq!(counts.turn_count, 1);
        assert_eq!(counts.input_tokens, 200);
        assert_eq!(counts.output_tokens, 75);
    }

    #[test]
    fn raw_counter_skips_synthetic() {
        let synthetic = format!(
            r#"{{"type":"assistant","uuid":"u1","timestamp":"2026-03-16T10:00:00Z","message":{{"model":"<synthetic>","role":"assistant","stop_reason":"end_turn","usage":{{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":0,"cache_read_input_tokens":0}},"content":[]}},"sessionId":"s1","cwd":"/tmp","gitBranch":"","userType":"external","isSidechain":false,"parentUuid":null,"requestId":"r1"}}"#
        );
        let file = jsonl_file(&[synthetic, make_assistant_line("r2", 200, 75)]);

        let counts = count_raw_tokens(file.path(), true).unwrap();
        assert_eq!(counts.turn_count, 1);
        assert_eq!(counts.input_tokens, 200);
    }

    #[test]
    fn raw_counter_respects_sidechain_flag() {
        let sidechain_line = r#"{"type":"assistant","uuid":"u1","timestamp":"2026-03-16T10:00:00Z","message":{"model":"claude-opus-4-6","role":"assistant","stop_reason":"end_turn","usage":{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":0,"cache_read_input_tokens":0},"content":[]},"sessionId":"s1","cwd":"/tmp","gitBranch":"","userType":"external","isSidechain":true,"parentUuid":null,"requestId":"r1"}"#;
        let file = jsonl_file(&[sidechain_line]);

        // With sidechain skipping enabled the only record is ignored.
        let skipped = count_raw_tokens(file.path(), true).unwrap();
        assert_eq!(skipped.turn_count, 0);

        // With skipping disabled the same record is counted.
        let counted = count_raw_tokens(file.path(), false).unwrap();
        assert_eq!(counted.turn_count, 1);
        assert_eq!(counted.input_tokens, 100);
    }

    #[test]
    fn raw_counter_skips_non_assistant() {
        let user_line = format!(
            r#"{{"type":"user","uuid":"u1","message":{{"role":"user","content":"hi"}},"timestamp":"2026-03-16T10:00:00Z","sessionId":"s1"}}"#
        );
        let progress_line = format!(
            r#"{{"type":"progress","data":{{"type":"hook"}},"uuid":"u2","timestamp":"2026-03-16T10:00:00Z","sessionId":"s1"}}"#
        );
        let file = jsonl_file(&[
            user_line,
            progress_line,
            make_assistant_line("r1", 100, 50),
        ]);

        let counts = count_raw_tokens(file.path(), true).unwrap();
        assert_eq!(counts.turn_count, 1);
    }
}