1use anyhow::{Context, Result};
2use std::collections::{HashMap, HashSet};
3use std::fmt;
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::Path;
7
8use crate::data::models::{GlobalDataQuality, SessionData, SessionFile};
9use crate::data::scanner;
10use crate::pricing::calculator::PricingCalculator;
11
12#[derive(Debug)]
15pub struct ValidationReport {
16 pub session_results: Vec<SessionValidation>,
17 pub structure_checks: Vec<Check>,
18 pub summary: ValidationSummary,
19}
20
21#[derive(Debug)]
22pub struct SessionValidation {
23 pub session_id: String,
24 pub project: String,
25 pub token_checks: Vec<Check>,
26 pub agent_checks: Vec<Check>,
27}
28
/// A single named comparison recorded in a validation report.
///
/// `expected` and `actual` hold the compared values rendered as text, and
/// `passed` records whether the comparison succeeded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Check {
    /// Human-readable label of what was checked.
    pub name: String,
    /// Textual form of the reference value.
    pub expected: String,
    /// Textual form of the value under test.
    pub actual: String,
    /// `true` when the check succeeded.
    pub passed: bool,
}

impl Check {
    /// Builds an informational check that always passes.
    ///
    /// `value` is rendered once and stored as both `expected` and `actual`.
    fn pass(name: impl Into<String>, value: impl fmt::Display) -> Self {
        let rendered = value.to_string();
        Self {
            name: name.into(),
            expected: rendered.clone(),
            actual: rendered,
            passed: true,
        }
    }

    /// Builds a check that passes when both values render to the same text.
    ///
    /// The comparison is done on the `Display` output, so values of different
    /// types (for example `usize` and `u64`) can be compared directly.
    fn compare(
        name: impl Into<String>,
        expected: impl fmt::Display,
        actual: impl fmt::Display,
    ) -> Self {
        let expected = expected.to_string();
        let actual = actual.to_string();
        let passed = expected == actual;
        Self { name: name.into(), expected, actual, passed }
    }

    /// Builds a check that passes when `actual` is within `tolerance` of `expected`.
    ///
    /// The bound is inclusive, so identical values pass even with a tolerance
    /// of `0.0`. A NaN on either side never passes. Both values are stored
    /// with two decimal places.
    // No caller in this file yet; kept for tolerance-based float comparisons.
    #[allow(dead_code)]
    fn compare_f64(name: impl Into<String>, expected: f64, actual: f64, tolerance: f64) -> Self {
        // The equality test covers equal infinities, whose difference is NaN
        // and would otherwise fail the tolerance comparison.
        let passed = expected == actual || (expected - actual).abs() <= tolerance;
        Self {
            name: name.into(),
            expected: format!("{:.2}", expected),
            actual: format!("{:.2}", actual),
            passed,
        }
    }
}
61
/// Aggregate pass/fail counters for a validation report.
#[derive(Debug, Default)]
pub struct ValidationSummary {
    /// Number of checks counted, structure checks and per-session checks together.
    pub total_checks: usize,
    /// Number of counted checks that passed.
    pub passed: usize,
    /// Number of counted checks that failed.
    pub failed: usize,
    /// Number of sessions that were validated.
    pub sessions_validated: usize,
    /// Number of validated sessions in which every token and agent check passed.
    pub sessions_passed: usize,
}
70
/// Token totals obtained by re-reading a raw session file, independent of the pipeline.
#[derive(Debug, Default)]
struct RawTokenCount {
    /// Sum of the `input_tokens` usage counters.
    input_tokens: u64,
    /// Sum of the `output_tokens` usage counters.
    output_tokens: u64,
    /// Sum of the `cache_creation_input_tokens` usage counters.
    cache_creation_tokens: u64,
    /// Sum of the `cache_read_input_tokens` usage counters.
    cache_read_tokens: u64,
    /// Number of turns that contributed to the sums above.
    turn_count: usize,
}
83
84fn is_valid_assistant(val: &serde_json::Value, skip_sidechain: bool, now: &chrono::DateTime<chrono::Utc>) -> bool {
91 if val.get("type").and_then(|t| t.as_str()) != Some("assistant") {
92 return false;
93 }
94 if skip_sidechain && val.get("isSidechain").and_then(|v| v.as_bool()) == Some(true) {
95 return false;
96 }
97 let model = val.pointer("/message/model").and_then(|m| m.as_str());
98 if model == Some("<synthetic>") || model.is_none() {
99 return false;
100 }
101 if val.pointer("/message/usage").is_none() {
103 return false;
104 }
105 let input = val.pointer("/message/usage/input_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
106 let output = val.pointer("/message/usage/output_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
107 let cache_creation = val.pointer("/message/usage/cache_creation_input_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
108 let cache_read = val.pointer("/message/usage/cache_read_input_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
109 if input + output + cache_creation + cache_read == 0 {
110 return false;
111 }
112 if let Some(ts_str) = val.get("timestamp").and_then(|t| t.as_str()) {
114 if let Ok(ts) = ts_str.parse::<chrono::DateTime<chrono::Utc>>() {
115 if ts > *now {
116 return false;
117 }
118 } else {
119 return false;
120 }
121 } else {
122 return false;
123 }
124 true
125}
126
127fn count_raw_tokens(path: &Path, skip_sidechain: bool) -> Result<RawTokenCount> {
130 let file = File::open(path)
131 .with_context(|| format!("raw counter: failed to open {}", path.display()))?;
132 let reader = BufReader::new(file);
133 let now = chrono::Utc::now();
134
135 let mut by_request: HashMap<String, (u64, u64, u64, u64)> = HashMap::new();
137 let mut no_request_id_count = RawTokenCount::default();
138
139 for line in reader.lines() {
140 let line = line?;
141 let val: serde_json::Value = match serde_json::from_str(&line) {
142 Ok(v) => v,
143 Err(_) => continue,
144 };
145
146 if !is_valid_assistant(&val, skip_sidechain, &now) {
147 continue;
148 }
149
150 let input = val.pointer("/message/usage/input_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
151 let output = val.pointer("/message/usage/output_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
152 let cache_creation = val.pointer("/message/usage/cache_creation_input_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
153 let cache_read = val.pointer("/message/usage/cache_read_input_tokens").and_then(|v| v.as_u64()).unwrap_or(0);
154
155 let request_id = val.get("requestId").and_then(|r| r.as_str());
156
157 match request_id {
158 Some(rid) if !rid.is_empty() => {
159 by_request.insert(rid.to_string(), (input, output, cache_creation, cache_read));
160 }
161 _ => {
162 no_request_id_count.input_tokens += input;
163 no_request_id_count.output_tokens += output;
164 no_request_id_count.cache_creation_tokens += cache_creation;
165 no_request_id_count.cache_read_tokens += cache_read;
166 no_request_id_count.turn_count += 1;
167 }
168 }
169 }
170
171 let mut result = no_request_id_count;
172 for (input, output, cc, cr) in by_request.values() {
173 result.input_tokens += input;
174 result.output_tokens += output;
175 result.cache_creation_tokens += cc;
176 result.cache_read_tokens += cr;
177 result.turn_count += 1;
178 }
179
180 Ok(result)
181}
182
183fn collect_valid_request_ids(path: &Path, skip_sidechain: bool) -> Result<HashSet<String>> {
186 let file = File::open(path)?;
187 let reader = BufReader::new(file);
188 let now = chrono::Utc::now();
189 let mut ids = HashSet::new();
190
191 for line in reader.lines() {
192 let line = line?;
193 let val: serde_json::Value = match serde_json::from_str(&line) {
194 Ok(v) => v,
195 Err(_) => continue,
196 };
197 if !is_valid_assistant(&val, skip_sidechain, &now) {
198 continue;
199 }
200 if let Some(rid) = val.get("requestId").and_then(|r| r.as_str()) {
201 if !rid.is_empty() {
202 ids.insert(rid.to_string());
203 }
204 }
205 }
206 Ok(ids)
207}
208
209pub fn validate_all(
213 sessions: &[&SessionData],
214 quality: &GlobalDataQuality,
215 claude_home: &Path,
216 calc: &PricingCalculator,
217) -> Result<ValidationReport> {
218 let mut files = scanner::scan_claude_home(claude_home)?;
220 scanner::resolve_agent_parents(&mut files)?;
221
222 let (main_files, agent_files): (Vec<&SessionFile>, Vec<&SessionFile>) =
223 files.iter().partition(|f| !f.is_agent);
224
225 let mut structure_checks = Vec::new();
226 let mut session_results = Vec::new();
227
228 structure_checks.push(Check::compare(
232 "session_count == main_file_count",
233 main_files.len(),
234 quality.total_session_files,
235 ));
236
237 structure_checks.push(Check::compare(
239 "agent_file_count",
240 agent_files.len(),
241 quality.total_agent_files,
242 ));
243
244 let main_session_ids: HashSet<&str> = main_files.iter()
246 .map(|f| f.session_id.as_str())
247 .collect();
248 let orphan_count = agent_files.iter()
249 .filter(|f| {
250 let parent = f.parent_session_id.as_deref()
251 .unwrap_or(&f.session_id);
252 !main_session_ids.contains(parent)
253 })
254 .count();
255 structure_checks.push(Check::pass(
256 format!("orphan_agents (no main session file): {}", orphan_count),
257 orphan_count,
258 ));
259
260 let unique_main_ids: HashSet<&str> = main_files.iter()
262 .map(|f| f.session_id.as_str())
263 .collect();
264 let dup_count = main_files.len() - unique_main_ids.len();
265 structure_checks.push(Check::pass(
266 format!("main_session_files: {} files, {} unique IDs ({} duplicates)", main_files.len(), unique_main_ids.len(), dup_count),
267 main_files.len(),
268 ));
269
270 let mut cross_file_overlap = 0usize;
272 for agent in &agent_files {
273 let parent_id = agent.parent_session_id.as_deref()
274 .unwrap_or(&agent.session_id);
275 let parent_file = main_files.iter()
276 .find(|f| f.session_id == parent_id);
277 if let Some(pf) = parent_file {
278 let parent_rids = collect_valid_request_ids(&pf.file_path, true).unwrap_or_default();
279 let agent_rids = collect_valid_request_ids(&agent.file_path, false).unwrap_or_default();
280 cross_file_overlap += parent_rids.intersection(&agent_rids).count();
281 }
282 }
283 structure_checks.push(Check::pass(
284 format!("cross_file_overlapping_request_ids (deduped: {})", cross_file_overlap),
285 cross_file_overlap,
286 ));
287
288 let mut agents_by_parent: HashMap<&str, Vec<&SessionFile>> = HashMap::new();
292 for af in &agent_files {
293 let parent_id = af.parent_session_id.as_deref()
294 .unwrap_or(&af.session_id);
295 agents_by_parent.entry(parent_id).or_default().push(af);
296 }
297
298 let main_file_map: HashMap<&str, &SessionFile> = main_files.iter()
300 .map(|f| (f.session_id.as_str(), *f))
301 .collect();
302
303 for session in sessions {
304 let mut token_checks = Vec::new();
305 let mut agent_checks = Vec::new();
306
307 if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
309 let raw_main = count_raw_tokens(&mf.file_path, true)
310 .unwrap_or_default();
311
312 let pipeline_main_input: u64 = session.turns.iter()
314 .map(|t| t.usage.input_tokens.unwrap_or(0)).sum();
315 let pipeline_main_output: u64 = session.turns.iter()
316 .map(|t| t.usage.output_tokens.unwrap_or(0)).sum();
317 let pipeline_main_cache_creation: u64 = session.turns.iter()
318 .map(|t| t.usage.cache_creation_input_tokens.unwrap_or(0)).sum();
319 let pipeline_main_cache_read: u64 = session.turns.iter()
320 .map(|t| t.usage.cache_read_input_tokens.unwrap_or(0)).sum();
321 let pipeline_main_turns = session.turns.len();
322
323 token_checks.push(Check::compare(
324 "main_turn_count",
325 raw_main.turn_count,
326 pipeline_main_turns,
327 ));
328 token_checks.push(Check::compare(
329 "main_input_tokens",
330 raw_main.input_tokens,
331 pipeline_main_input,
332 ));
333 token_checks.push(Check::compare(
334 "main_output_tokens",
335 raw_main.output_tokens,
336 pipeline_main_output,
337 ));
338 token_checks.push(Check::compare(
339 "main_cache_creation_tokens",
340 raw_main.cache_creation_tokens,
341 pipeline_main_cache_creation,
342 ));
343 token_checks.push(Check::compare(
344 "main_cache_read_tokens",
345 raw_main.cache_read_tokens,
346 pipeline_main_cache_read,
347 ));
348 }
349
350 let agent_session_files = agents_by_parent.get(session.session_id.as_str());
352 let expected_agent_files = agent_session_files.map_or(0, |v| v.len());
353
354 agent_checks.push(Check::compare(
355 "agent_file_count",
356 expected_agent_files,
357 expected_agent_files, ));
359
360 if expected_agent_files > 0 {
362 if let Some(afs) = agent_session_files {
363 let main_file = main_file_map.get(session.session_id.as_str());
365 let main_rids = main_file
366 .map(|mf| collect_valid_request_ids(&mf.file_path, true).unwrap_or_default())
367 .unwrap_or_default();
368
369 let mut expected_unique_agent_turns = 0usize;
371 for af in afs {
372 let raw = count_raw_tokens(&af.file_path, false)
373 .unwrap_or_default();
374 let file_rids = collect_valid_request_ids(&af.file_path, false)
375 .unwrap_or_default();
376 let file_overlap = file_rids.intersection(&main_rids).count();
377 expected_unique_agent_turns += raw.turn_count.saturating_sub(file_overlap);
378 }
379
380 agent_checks.push(Check::compare(
381 "agent_turn_count (after cross-file dedup)",
382 expected_unique_agent_turns,
383 session.agent_turns.len(),
384 ));
385
386 if expected_unique_agent_turns > 0 {
388 agent_checks.push(Check::compare(
389 "has_agent_turns (non-overlapping exist)",
390 "true",
391 (!session.agent_turns.is_empty()).to_string(),
392 ));
393 }
394 }
395 }
396
397 let pipeline_cost: f64 = session.turns.iter()
399 .chain(session.agent_turns.iter())
400 .map(|t| calc.calculate_turn_cost(&t.model, &t.usage).total)
401 .sum();
402
403 let has_tokens = session.turns.iter().chain(session.agent_turns.iter())
405 .any(|t| {
406 t.usage.input_tokens.unwrap_or(0) > 0
407 || t.usage.output_tokens.unwrap_or(0) > 0
408 });
409 if has_tokens {
410 token_checks.push(Check::compare(
411 "cost > 0 when tokens exist",
412 "true",
413 (pipeline_cost > 0.0).to_string(),
414 ));
415 }
416
417 if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
419 token_checks.push(Check::compare(
420 "project_association",
421 mf.project.as_deref().unwrap_or("(none)"),
422 session.project.as_deref().unwrap_or("(none)"),
423 ));
424 }
425
426 let project_name = session.project.as_deref().unwrap_or("(unknown)").to_string();
427
428 session_results.push(SessionValidation {
429 session_id: session.session_id.clone(),
430 project: project_name,
431 token_checks,
432 agent_checks,
433 });
434 }
435
436 let mut summary = ValidationSummary::default();
439
440 for check in &structure_checks {
441 summary.total_checks += 1;
442 if check.passed { summary.passed += 1; } else { summary.failed += 1; }
443 }
444
445 for sv in &session_results {
446 summary.sessions_validated += 1;
447 let mut session_pass = true;
448 for check in sv.token_checks.iter().chain(sv.agent_checks.iter()) {
449 summary.total_checks += 1;
450 if check.passed {
451 summary.passed += 1;
452 } else {
453 summary.failed += 1;
454 session_pass = false;
455 }
456 }
457 if session_pass {
458 summary.sessions_passed += 1;
459 }
460 }
461
462 Ok(ValidationReport {
463 session_results,
464 structure_checks,
465 summary,
466 })
467}
468
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Fixture timestamp. It must not be later than the test machine's clock,
    /// because records dated in the future are rejected.
    const PAST_TIMESTAMP: &str = "2025-01-01T00:00:00Z";

    /// Builds a countable assistant record; tests tweak individual fields as needed.
    fn assistant_record(request_id: &str, input: u64, output: u64) -> serde_json::Value {
        json!({
            "type": "assistant",
            "uuid": format!("u-{}", request_id),
            "timestamp": PAST_TIMESTAMP,
            "message": {
                "model": "claude-opus-4-6",
                "role": "assistant",
                "stop_reason": "end_turn",
                "usage": {
                    "input_tokens": input,
                    "output_tokens": output,
                    "cache_creation_input_tokens": 0,
                    "cache_read_input_tokens": 0
                },
                "content": [{"type": "text", "text": "hi"}]
            },
            "sessionId": "s1",
            "cwd": "/tmp",
            "gitBranch": "",
            "userType": "external",
            "isSidechain": false,
            "parentUuid": null,
            "requestId": request_id
        })
    }

    /// Serializes [`assistant_record`] to a single JSONL line.
    fn make_assistant_line(request_id: &str, input: u64, output: u64) -> String {
        assistant_record(request_id, input, output).to_string()
    }

    /// Writes the given lines to a fresh temporary file and returns its handle.
    fn write_lines(lines: &[String]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        file.flush().unwrap();
        file
    }

    #[test]
    fn raw_counter_basic() {
        let f = write_lines(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r2", 200, 75),
        ]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 2);
        assert_eq!(result.input_tokens, 300);
        assert_eq!(result.output_tokens, 125);
    }

    #[test]
    fn raw_counter_deduplicates_streaming() {
        // Two records with the same request id: only the last one is counted.
        let f = write_lines(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r1", 200, 75),
        ]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 200);
        assert_eq!(result.output_tokens, 75);
    }

    #[test]
    fn raw_counter_skips_synthetic() {
        let mut synthetic = assistant_record("r1", 100, 50);
        synthetic["message"]["model"] = json!("<synthetic>");
        let f = write_lines(&[synthetic.to_string(), make_assistant_line("r2", 200, 75)]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 200);
    }

    #[test]
    fn raw_counter_respects_sidechain_flag() {
        let mut sidechain = assistant_record("r1", 100, 50);
        sidechain["isSidechain"] = json!(true);
        let f = write_lines(&[sidechain.to_string()]);

        let skipped = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(skipped.turn_count, 0);

        let counted = count_raw_tokens(f.path(), false).unwrap();
        assert_eq!(counted.turn_count, 1);
        assert_eq!(counted.input_tokens, 100);
    }

    #[test]
    fn raw_counter_skips_non_assistant() {
        let user = json!({
            "type": "user",
            "uuid": "u1",
            "message": {"role": "user", "content": "hi"},
            "timestamp": PAST_TIMESTAMP,
            "sessionId": "s1"
        });
        let progress = json!({
            "type": "progress",
            "data": {"type": "hook"},
            "uuid": "u2",
            "timestamp": PAST_TIMESTAMP,
            "sessionId": "s1"
        });
        let f = write_lines(&[
            user.to_string(),
            progress.to_string(),
            make_assistant_line("r1", 100, 50),
        ]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
    }

    #[test]
    fn raw_counter_counts_records_without_request_id() {
        // Records lacking a request id cannot be deduplicated; each is its own turn.
        let mut first = assistant_record("x", 100, 50);
        first.as_object_mut().unwrap().remove("requestId");
        let mut second = assistant_record("y", 200, 75);
        second.as_object_mut().unwrap().remove("requestId");
        let f = write_lines(&[first.to_string(), second.to_string()]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 2);
        assert_eq!(result.input_tokens, 300);
        assert_eq!(result.output_tokens, 125);
    }

    #[test]
    fn raw_counter_skips_future_timestamp() {
        let mut future = assistant_record("r1", 100, 50);
        future["timestamp"] = json!("2999-01-01T00:00:00Z");
        let f = write_lines(&[future.to_string(), make_assistant_line("r2", 200, 75)]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 200);
    }

    #[test]
    fn raw_counter_skips_zero_token_records() {
        let f = write_lines(&[
            make_assistant_line("r1", 0, 0),
            make_assistant_line("r2", 10, 5),
        ]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 10);
        assert_eq!(result.output_tokens, 5);
    }

    #[test]
    fn raw_counter_ignores_malformed_lines() {
        let f = write_lines(&[
            "not json at all".to_string(),
            String::new(),
            make_assistant_line("r1", 100, 50),
        ]);

        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 100);
    }

    #[test]
    fn raw_counter_reports_missing_file() {
        let missing = std::path::Path::new("/definitely/not/here/session.jsonl");
        let err = count_raw_tokens(missing, true).unwrap_err();
        assert!(err.to_string().contains("failed to open"));
    }

    #[test]
    fn request_ids_are_distinct_and_non_empty() {
        let mut no_id = assistant_record("z", 10, 5);
        no_id.as_object_mut().unwrap().remove("requestId");
        let f = write_lines(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r1", 200, 75),
            make_assistant_line("r2", 10, 5),
            make_assistant_line("", 10, 5),
            no_id.to_string(),
        ]);

        let ids = collect_valid_request_ids(f.path(), true).unwrap();
        let expected: HashSet<String> = ["r1", "r2"].iter().map(|s| s.to_string()).collect();
        assert_eq!(ids, expected);
    }

    #[test]
    fn check_compare_flags_mismatch() {
        assert!(Check::compare("same", 2usize, 2u64).passed);
        assert!(!Check::compare("diff", 1, 2).passed);
        assert!(Check::pass("info", 7).passed);
    }
}