use anyhow::{Context, Result};
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use crate::data::models::{GlobalDataQuality, SessionData, SessionFile};
use crate::data::scanner;
use crate::pricing::calculator::PricingCalculator;
/// Complete result of a validation run, as returned by `validate_all`.
#[derive(Debug)]
pub struct ValidationReport {
/// Per-session results; `validate_all` pushes one entry for every session it is given.
pub session_results: Vec<SessionValidation>,
/// Checks about the scanned file set as a whole (file counts, orphan agents,
/// duplicate session IDs, request IDs shared between parent and agent files).
pub structure_checks: Vec<Check>,
/// Pass/fail tallies over every check contained in this report.
pub summary: ValidationSummary,
}
/// Validation outcome for a single session.
#[derive(Debug)]
pub struct SessionValidation {
/// Identifier of the validated session (copied from the pipeline's session data).
pub session_id: String,
/// Project the session belongs to; `validate_all` stores `"(unknown)"` when the
/// session has no project.
pub project: String,
/// Checks on turn counts, token totals, cost and project association.
pub token_checks: Vec<Check>,
/// Checks on the agent files and agent turns attached to this session.
pub agent_checks: Vec<Check>,
}
/// A single named validation check with its expected and actual values rendered as text.
#[derive(Debug)]
pub struct Check {
    /// Human-readable label of the check.
    pub name: String,
    /// Expected value, rendered as a string.
    pub expected: String,
    /// Observed value, rendered as a string.
    pub actual: String,
    /// Whether the check succeeded.
    pub passed: bool,
}

impl Check {
    /// Builds an informational check that always passes; `value` is recorded as both the
    /// expected and the actual side.
    fn pass(name: impl Into<String>, value: impl fmt::Display) -> Self {
        let rendered = value.to_string();
        Self {
            name: name.into(),
            actual: rendered.clone(),
            expected: rendered,
            passed: true,
        }
    }

    /// Builds a check that passes when both values have the same `Display` rendering.
    fn compare(
        name: impl Into<String>,
        expected: impl fmt::Display,
        actual: impl fmt::Display,
    ) -> Self {
        let (expected, actual) = (expected.to_string(), actual.to_string());
        Self {
            name: name.into(),
            // Evaluated before the two strings are moved into their fields below.
            passed: expected == actual,
            expected,
            actual,
        }
    }

    /// Builds a check that passes when the two floats differ by strictly less than
    /// `tolerance`; both sides are stored with two decimal places.
    #[allow(dead_code)]
    fn compare_f64(name: impl Into<String>, expected: f64, actual: f64, tolerance: f64) -> Self {
        let delta = (expected - actual).abs();
        Self {
            name: name.into(),
            expected: format!("{:.2}", expected),
            actual: format!("{:.2}", actual),
            passed: delta < tolerance,
        }
    }
}
/// Aggregate pass/fail counters for a validation report.
#[derive(Debug, Default)]
pub struct ValidationSummary {
/// Number of checks counted (structure checks plus every session's token and agent checks).
pub total_checks: usize,
/// Number of counted checks whose `passed` flag is true.
pub passed: usize,
/// Number of counted checks whose `passed` flag is false.
pub failed: usize,
/// Number of sessions that were validated.
pub sessions_validated: usize,
/// Number of validated sessions in which no check failed.
pub sessions_passed: usize,
}
/// Token totals accumulated by `count_raw_tokens` from one raw session file.
#[derive(Debug, Default)]
struct RawTokenCount {
/// Sum of `message.usage.input_tokens` over the counted entries.
input_tokens: u64,
/// Sum of `message.usage.output_tokens` over the counted entries.
output_tokens: u64,
/// Sum of `message.usage.cache_creation_input_tokens` over the counted entries.
cache_creation_tokens: u64,
/// Sum of `message.usage.cache_read_input_tokens` over the counted entries.
cache_read_tokens: u64,
/// Number of counted entries: one per distinct non-empty `requestId`, plus one for
/// every valid entry that has no `requestId`.
turn_count: usize,
}
/// Decides whether a parsed JSONL entry counts as a billable assistant response.
///
/// An entry is accepted only when all of the following hold:
/// - `type` is `"assistant"`;
/// - it is not flagged `isSidechain: true` (only enforced when `skip_sidechain` is set);
/// - `message.model` is a string other than `"<synthetic>"`;
/// - `message.usage` exists and its four token counters do not all read as zero
///   (a missing or non-integer counter reads as zero);
/// - `timestamp` is a string that parses as a UTC date-time not later than `now`.
fn is_valid_assistant(
    val: &serde_json::Value,
    skip_sidechain: bool,
    now: &chrono::DateTime<chrono::Utc>,
) -> bool {
    let is_assistant = val.get("type").and_then(|t| t.as_str()) == Some("assistant");
    if !is_assistant {
        return false;
    }

    let on_sidechain = val
        .get("isSidechain")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);
    if skip_sidechain && on_sidechain {
        return false;
    }

    match val.pointer("/message/model").and_then(|m| m.as_str()) {
        None | Some("<synthetic>") => return false,
        Some(_) => {}
    }

    let usage = match val.pointer("/message/usage") {
        Some(usage) => usage,
        None => return false,
    };
    let counter = |key: &str| usage.get(key).and_then(|v| v.as_u64()).unwrap_or(0);
    let total_tokens: u64 = [
        counter("input_tokens"),
        counter("output_tokens"),
        counter("cache_creation_input_tokens"),
        counter("cache_read_input_tokens"),
    ]
    .iter()
    .sum();
    if total_tokens == 0 {
        return false;
    }

    // A missing, non-string, unparsable or future timestamp all disqualify the entry.
    val.get("timestamp")
        .and_then(|t| t.as_str())
        .and_then(|raw| raw.parse::<chrono::DateTime<chrono::Utc>>().ok())
        .map_or(false, |ts| ts <= *now)
}
/// Recounts token usage straight from a raw JSONL session file.
///
/// Lines that are not valid JSON, or that `is_valid_assistant` rejects, are ignored.
/// Entries sharing a non-empty `requestId` are collapsed to the last one seen in the
/// file; entries without a `requestId` are each counted individually.
///
/// # Errors
/// Fails when the file cannot be opened or a line cannot be read.
fn count_raw_tokens(path: &Path, skip_sidechain: bool) -> Result<RawTokenCount> {
    let reader = BufReader::new(
        File::open(path)
            .with_context(|| format!("raw counter: failed to open {}", path.display()))?,
    );
    let now = chrono::Utc::now();

    // Last-seen usage per requestId: [input, output, cache_creation, cache_read].
    let mut latest_by_request: HashMap<String, [u64; 4]> = HashMap::new();
    // Starts out holding only the entries that carry no requestId.
    let mut totals = RawTokenCount::default();

    for line in reader.lines() {
        let text = line?;
        let entry = match serde_json::from_str::<serde_json::Value>(&text) {
            Ok(parsed) if is_valid_assistant(&parsed, skip_sidechain, &now) => parsed,
            _ => continue,
        };

        let usage_field =
            |pointer: &str| entry.pointer(pointer).and_then(|v| v.as_u64()).unwrap_or(0);
        let usage = [
            usage_field("/message/usage/input_tokens"),
            usage_field("/message/usage/output_tokens"),
            usage_field("/message/usage/cache_creation_input_tokens"),
            usage_field("/message/usage/cache_read_input_tokens"),
        ];

        let request_id = entry
            .get("requestId")
            .and_then(|r| r.as_str())
            .filter(|rid| !rid.is_empty());
        if let Some(rid) = request_id {
            // A later entry for the same request replaces the earlier one.
            latest_by_request.insert(rid.to_owned(), usage);
        } else {
            totals.input_tokens += usage[0];
            totals.output_tokens += usage[1];
            totals.cache_creation_tokens += usage[2];
            totals.cache_read_tokens += usage[3];
            totals.turn_count += 1;
        }
    }

    for [input, output, cache_creation, cache_read] in latest_by_request.values() {
        totals.input_tokens += input;
        totals.output_tokens += output;
        totals.cache_creation_tokens += cache_creation;
        totals.cache_read_tokens += cache_read;
        totals.turn_count += 1;
    }
    Ok(totals)
}
fn count_tokens_by_request_id(path: &Path, skip_sidechain: bool) -> Result<(HashMap<String, u64>, u64)> {
let file = File::open(path)?;
let reader = BufReader::new(file);
let now = chrono::Utc::now();
let mut by_rid: HashMap<String, u64> = HashMap::new();
let mut no_rid_output: u64 = 0;
for line in reader.lines() {
let line = line?;
let val: serde_json::Value = match serde_json::from_str(&line) {
Ok(v) => v,
Err(_) => continue,
};
if !is_valid_assistant(&val, skip_sidechain, &now) {
continue;
}
let output = val.pointer("/message/usage/output_tokens")
.and_then(|v| v.as_u64()).unwrap_or(0);
match val.get("requestId").and_then(|r| r.as_str()) {
Some(rid) if !rid.is_empty() => {
by_rid.insert(rid.to_string(), output);
}
_ => {
no_rid_output += output;
}
}
}
Ok((by_rid, no_rid_output))
}
fn collect_valid_request_ids(path: &Path, skip_sidechain: bool) -> Result<HashSet<String>> {
let file = File::open(path)?;
let reader = BufReader::new(file);
let now = chrono::Utc::now();
let mut ids = HashSet::new();
for line in reader.lines() {
let line = line?;
let val: serde_json::Value = match serde_json::from_str(&line) {
Ok(v) => v,
Err(_) => continue,
};
if !is_valid_assistant(&val, skip_sidechain, &now) {
continue;
}
if let Some(rid) = val.get("requestId").and_then(|r| r.as_str()) {
if !rid.is_empty() {
ids.insert(rid.to_string());
}
}
}
Ok(ids)
}
pub fn validate_all(
sessions: &[&SessionData],
quality: &GlobalDataQuality,
claude_home: &Path,
calc: &PricingCalculator,
) -> Result<ValidationReport> {
let mut files = scanner::scan_claude_home(claude_home)?;
scanner::resolve_agent_parents(&mut files)?;
let (main_files, agent_files): (Vec<&SessionFile>, Vec<&SessionFile>) =
files.iter().partition(|f| !f.is_agent);
let mut structure_checks = Vec::new();
let mut session_results = Vec::new();
structure_checks.push(Check::compare(
"session_count == main_file_count",
main_files.len(),
quality.total_session_files,
));
structure_checks.push(Check::compare(
"agent_file_count",
agent_files.len(),
quality.total_agent_files,
));
let main_session_ids: HashSet<&str> = main_files.iter()
.map(|f| f.session_id.as_str())
.collect();
let orphan_count = agent_files.iter()
.filter(|f| {
let parent = f.parent_session_id.as_deref()
.unwrap_or(&f.session_id);
!main_session_ids.contains(parent)
})
.count();
structure_checks.push(Check::pass(
format!("orphan_agents (no main session file): {}", orphan_count),
orphan_count,
));
let unique_main_ids: HashSet<&str> = main_files.iter()
.map(|f| f.session_id.as_str())
.collect();
let dup_count = main_files.len() - unique_main_ids.len();
structure_checks.push(Check::pass(
format!("main_session_files: {} files, {} unique IDs ({} duplicates)", main_files.len(), unique_main_ids.len(), dup_count),
main_files.len(),
));
let mut cross_file_overlap = 0usize;
for agent in &agent_files {
let parent_id = agent.parent_session_id.as_deref()
.unwrap_or(&agent.session_id);
let parent_file = main_files.iter()
.find(|f| f.session_id == parent_id);
if let Some(pf) = parent_file {
let parent_rids = collect_valid_request_ids(&pf.file_path, true).unwrap_or_default();
let agent_rids = collect_valid_request_ids(&agent.file_path, false).unwrap_or_default();
cross_file_overlap += parent_rids.intersection(&agent_rids).count();
}
}
structure_checks.push(Check::pass(
format!("cross_file_overlapping_request_ids (deduped: {})", cross_file_overlap),
cross_file_overlap,
));
let mut agents_by_parent: HashMap<&str, Vec<&SessionFile>> = HashMap::new();
for af in &agent_files {
let parent_id = af.parent_session_id.as_deref()
.unwrap_or(&af.session_id);
agents_by_parent.entry(parent_id).or_default().push(af);
}
let main_file_map: HashMap<&str, &SessionFile> = main_files.iter()
.map(|f| (f.session_id.as_str(), *f))
.collect();
for session in sessions {
let mut token_checks = Vec::new();
let mut agent_checks = Vec::new();
if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
let raw_main = count_raw_tokens(&mf.file_path, true)
.unwrap_or_default();
let pipeline_main_input: u64 = session.turns.iter()
.map(|t| t.usage.input_tokens.unwrap_or(0)).sum();
let pipeline_main_output: u64 = session.turns.iter()
.map(|t| t.usage.output_tokens.unwrap_or(0)).sum();
let pipeline_main_cache_creation: u64 = session.turns.iter()
.map(|t| t.usage.cache_creation_input_tokens.unwrap_or(0)).sum();
let pipeline_main_cache_read: u64 = session.turns.iter()
.map(|t| t.usage.cache_read_input_tokens.unwrap_or(0)).sum();
let pipeline_main_turns = session.turns.len();
token_checks.push(Check::compare(
"main_turn_count",
raw_main.turn_count,
pipeline_main_turns,
));
token_checks.push(Check::compare(
"main_input_tokens",
raw_main.input_tokens,
pipeline_main_input,
));
token_checks.push(Check::compare(
"main_output_tokens",
raw_main.output_tokens,
pipeline_main_output,
));
token_checks.push(Check::compare(
"main_cache_creation_tokens",
raw_main.cache_creation_tokens,
pipeline_main_cache_creation,
));
token_checks.push(Check::compare(
"main_cache_read_tokens",
raw_main.cache_read_tokens,
pipeline_main_cache_read,
));
}
let agent_session_files = agents_by_parent.get(session.session_id.as_str());
let expected_agent_files = agent_session_files.map_or(0, |v| v.len());
let actual_agent_file_count = if expected_agent_files > 0 { expected_agent_files } else { 0 };
agent_checks.push(Check::compare(
"agent_file_count (from scanner)",
actual_agent_file_count,
expected_agent_files,
));
if expected_agent_files > 0 {
if let Some(afs) = agent_session_files {
let main_file = main_file_map.get(session.session_id.as_str());
let main_rids = main_file
.map(|mf| collect_valid_request_ids(&mf.file_path, true).unwrap_or_default())
.unwrap_or_default();
let mut expected_unique_agent_turns = 0usize;
let mut raw_agent_output: u64 = 0;
for af in afs {
let raw = count_raw_tokens(&af.file_path, false)
.unwrap_or_default();
let file_rids = collect_valid_request_ids(&af.file_path, false)
.unwrap_or_default();
let file_overlap = file_rids.intersection(&main_rids).count();
let unique_turns = raw.turn_count.saturating_sub(file_overlap);
expected_unique_agent_turns += unique_turns;
let (per_rid, no_rid_output) = count_tokens_by_request_id(&af.file_path, false)
.unwrap_or_default();
for (rid, output) in &per_rid {
if !main_rids.contains(rid) {
raw_agent_output += output;
}
}
raw_agent_output += no_rid_output;
}
agent_checks.push(Check::compare(
"agent_turn_count (after cross-file dedup)",
expected_unique_agent_turns,
session.agent_turns.len(),
));
if expected_unique_agent_turns > 0 {
agent_checks.push(Check::compare(
"has_agent_turns (non-overlapping exist)",
"true",
(!session.agent_turns.is_empty()).to_string(),
));
}
let pipeline_agent_output: u64 = session.agent_turns.iter()
.map(|t| t.usage.output_tokens.unwrap_or(0)).sum();
let agent_output_match = {
if raw_agent_output == 0 && pipeline_agent_output == 0 { true }
else {
let max_val = raw_agent_output.max(pipeline_agent_output) as f64;
if max_val == 0.0 { true }
else { (raw_agent_output as f64 - pipeline_agent_output as f64).abs() / max_val < 0.05 }
}
};
agent_checks.push(Check {
name: "agent_output_tokens (±5%)".into(),
expected: raw_agent_output.to_string(),
actual: pipeline_agent_output.to_string(),
passed: agent_output_match,
});
let all_marked_agent = session.agent_turns.iter().all(|t| t.is_agent);
agent_checks.push(Check::compare(
"all agent_turns have is_agent=true",
"true",
all_marked_agent.to_string(),
));
}
}
let pipeline_total_output: u64 = session.turns.iter()
.chain(session.agent_turns.iter())
.map(|t| t.usage.output_tokens.unwrap_or(0)).sum();
let pipeline_total_turns = session.turns.len() + session.agent_turns.len();
token_checks.push(Check::compare(
"total_turn_count == turns + agent_turns",
pipeline_total_turns,
session.all_responses().len(),
));
if pipeline_total_turns > 0 {
token_checks.push(Check::compare(
"total_output_tokens > 0",
"true",
(pipeline_total_output > 0).to_string(),
));
}
let pipeline_cost: f64 = session.turns.iter()
.chain(session.agent_turns.iter())
.map(|t| calc.calculate_turn_cost(&t.model, &t.usage).total)
.sum();
let has_tokens = session.turns.iter().chain(session.agent_turns.iter())
.any(|t| {
t.usage.input_tokens.unwrap_or(0) > 0
|| t.usage.output_tokens.unwrap_or(0) > 0
});
if has_tokens {
token_checks.push(Check::compare(
"cost > 0 when tokens exist",
"true",
(pipeline_cost > 0.0).to_string(),
));
}
if let Some(mf) = main_file_map.get(session.session_id.as_str()) {
token_checks.push(Check::compare(
"project_association",
mf.project.as_deref().unwrap_or("(none)"),
session.project.as_deref().unwrap_or("(none)"),
));
}
let project_name = session.project.as_deref().unwrap_or("(unknown)").to_string();
session_results.push(SessionValidation {
session_id: session.session_id.clone(),
project: project_name,
token_checks,
agent_checks,
});
}
let mut summary = ValidationSummary::default();
for check in &structure_checks {
summary.total_checks += 1;
if check.passed { summary.passed += 1; } else { summary.failed += 1; }
}
for sv in &session_results {
summary.sessions_validated += 1;
let mut session_pass = true;
for check in sv.token_checks.iter().chain(sv.agent_checks.iter()) {
summary.total_checks += 1;
if check.passed {
summary.passed += 1;
} else {
summary.failed += 1;
session_pass = false;
}
}
if session_pass {
summary.sessions_passed += 1;
}
}
Ok(ValidationReport {
session_results,
structure_checks,
summary,
})
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Fixed timestamp safely in the past. Entries dated after "now" are rejected by
    /// `is_valid_assistant`, so fixtures must not depend on the wall clock having
    /// passed some recent date.
    const PAST_TIMESTAMP: &str = "2020-01-01T00:00:00Z";
    /// Timestamp far enough ahead to be in the future whenever the tests run.
    const FUTURE_TIMESTAMP: &str = "2999-01-01T00:00:00Z";
    /// Model name used for ordinary (non-synthetic) fixture entries.
    const MODEL: &str = "claude-opus-4-6";

    /// Renders one assistant JSONL entry with the given identifying fields and usage.
    fn assistant_entry(
        request_id: &str,
        model: &str,
        is_sidechain: bool,
        timestamp: &str,
        input: u64,
        output: u64,
    ) -> String {
        serde_json::json!({
            "type": "assistant",
            "uuid": format!("u-{}", request_id),
            "timestamp": timestamp,
            "message": {
                "model": model,
                "role": "assistant",
                "stop_reason": "end_turn",
                "usage": {
                    "input_tokens": input,
                    "output_tokens": output,
                    "cache_creation_input_tokens": 0,
                    "cache_read_input_tokens": 0
                },
                "content": [{"type": "text", "text": "hi"}]
            },
            "sessionId": "s1",
            "cwd": "/tmp",
            "gitBranch": "",
            "userType": "external",
            "isSidechain": is_sidechain,
            "parentUuid": null,
            "requestId": request_id
        })
        .to_string()
    }

    /// Ordinary main-chain assistant entry dated in the past.
    fn make_assistant_line(request_id: &str, input: u64, output: u64) -> String {
        assistant_entry(request_id, MODEL, false, PAST_TIMESTAMP, input, output)
    }

    /// Writes the given lines to a fresh temporary file and returns its handle.
    fn write_session(lines: &[String]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        file.flush().unwrap();
        file
    }

    #[test]
    fn raw_counter_basic() {
        let f = write_session(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r2", 200, 75),
        ]);
        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 2);
        assert_eq!(result.input_tokens, 300);
        assert_eq!(result.output_tokens, 125);
    }

    #[test]
    fn raw_counter_deduplicates_streaming() {
        let f = write_session(&[
            make_assistant_line("r1", 100, 50),
            make_assistant_line("r1", 200, 75),
        ]);
        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 200);
        assert_eq!(result.output_tokens, 75);
    }

    #[test]
    fn raw_counter_skips_synthetic() {
        let f = write_session(&[
            assistant_entry("r1", "<synthetic>", false, PAST_TIMESTAMP, 100, 50),
            make_assistant_line("r2", 200, 75),
        ]);
        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 200);
    }

    #[test]
    fn raw_counter_respects_sidechain_flag() {
        let f = write_session(&[assistant_entry("r1", MODEL, true, PAST_TIMESTAMP, 100, 50)]);
        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 0);
        let result = count_raw_tokens(f.path(), false).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 100);
    }

    #[test]
    fn raw_counter_skips_non_assistant() {
        let f = write_session(&[
            r#"{"type":"user","uuid":"u1","message":{"role":"user","content":"hi"},"timestamp":"2020-01-01T00:00:00Z","sessionId":"s1"}"#.to_string(),
            r#"{"type":"progress","data":{"type":"hook"},"uuid":"u2","timestamp":"2020-01-01T00:00:00Z","sessionId":"s1"}"#.to_string(),
            make_assistant_line("r1", 100, 50),
        ]);
        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
    }

    #[test]
    fn raw_counter_skips_future_timestamps() {
        let f = write_session(&[
            assistant_entry("r1", MODEL, false, FUTURE_TIMESTAMP, 100, 50),
            make_assistant_line("r2", 200, 75),
        ]);
        let result = count_raw_tokens(f.path(), true).unwrap();
        assert_eq!(result.turn_count, 1);
        assert_eq!(result.input_tokens, 200);
    }
}