fn run_bench(args: BenchArgs) -> anyhow::Result<BenchReport> {
let path = args.path;
let quality_cases = parse_quality_cases(&args.quality_cases)?;
let start = Instant::now();
let snapshot = index_repo(&path)?;
let index_duration = start.elapsed();
let index = TantivySearchIndex::open_or_create(default_index_dir(&path))?;
let bm25_median = median_duration(time_searches(10, || index.search("fn", 10).map(|_| ()))?);
let store = open_store(&path)?;
let files = store.list_files(usize::MAX, 0)?;
let chunks = store.all_chunks()?;
let symbols = store.list_symbols(None, usize::MAX, 0)?;
let regex_median = median_duration(time_searches(10, || {
search_chunks(&chunks, &files, &symbols, "fn", 10).map(|_| ())
})?);
let quality = if quality_cases.is_empty() {
None
} else {
Some(evaluate_quality_cases(
&index,
&quality_cases,
args.quality_limit,
)?)
};
let manifest = snapshot.manifest;
let elapsed_seconds = index_duration.as_secs_f64();
Ok(BenchReport {
repo: path,
index: IndexBenchReport {
file_count: manifest.file_count,
symbol_count: manifest.symbol_count,
chunk_count: manifest.chunk_count,
elapsed_ms: duration_ms(index_duration),
files_per_second: if elapsed_seconds > 0.0 {
manifest.file_count as f64 / elapsed_seconds
} else {
0.0
},
},
search: SearchBenchReport {
bm25_median_ms: duration_ms(bm25_median),
regex_median_ms: duration_ms(regex_median),
},
quality,
})
}
fn parse_quality_cases(values: &[String]) -> anyhow::Result<Vec<QualityCase>> {
values
.iter()
.map(|value| {
let (query, expected_path) = value.split_once('=').ok_or_else(|| {
anyhow::anyhow!("quality case must use QUERY=EXPECTED_PATH_SUBSTRING: {value}")
})?;
let query = query.trim();
let expected_path = expected_path.trim();
if query.is_empty() || expected_path.is_empty() {
anyhow::bail!("quality case query and expected path must be non-empty: {value}");
}
Ok(QualityCase {
query: query.to_string(),
expected_path: expected_path.to_string(),
})
})
.collect()
}
/// Runs every quality case against `index` and aggregates ranking metrics.
///
/// For each case the index is searched with the case query (at most `limit`
/// results, with `limit` raised to at least 1). The case's rank is the
/// 1-based position of the first result whose normalized path contains the
/// normalized expected fragment.
///
/// The report contains precision@1, hit-rate@k and mean reciprocal rank over
/// all cases, plus one per-case entry. When `cases` is empty the three
/// aggregates are reported as `0.0` instead of the NaN that `0.0 / 0.0` would
/// yield.
///
/// # Errors
///
/// Propagates the first search failure.
fn evaluate_quality_cases(
    index: &TantivySearchIndex,
    cases: &[QualityCase],
    limit: usize,
) -> anyhow::Result<QualityBenchReport> {
    let limit = limit.max(1);
    let mut reports = Vec::with_capacity(cases.len());
    let mut top_hits = 0usize;
    let mut any_hits = 0usize;
    let mut reciprocal_rank = 0.0;
    for case in cases {
        let results = index.search(&case.query, limit)?;
        let expected = normalize_path_fragment(&case.expected_path);
        // 1-based rank of the first result whose path contains the fragment.
        let rank = results
            .iter()
            .position(|result| {
                normalize_path_fragment(&result.path.to_string_lossy()).contains(&expected)
            })
            .map(|position| position + 1);
        if rank == Some(1) {
            top_hits += 1;
        }
        if let Some(rank) = rank {
            any_hits += 1;
            reciprocal_rank += 1.0 / rank as f64;
        }
        reports.push(QualityCaseReport {
            query: case.query.clone(),
            expected_path: case.expected_path.clone(),
            rank,
            top_path: results.first().map(|result| result.path.clone()),
            matched_path: rank
                .and_then(|rank| results.get(rank - 1).map(|result| result.path.clone())),
            result_count: results.len(),
        });
    }
    // Guard the averages: an empty case list must not produce NaN metrics.
    let case_count = cases.len();
    let average = |sum: f64| {
        if case_count == 0 {
            0.0
        } else {
            sum / case_count as f64
        }
    };
    Ok(QualityBenchReport {
        case_count,
        precision_at_1: average(top_hits as f64),
        hit_rate_at_k: average(any_hits as f64),
        mean_reciprocal_rank: average(reciprocal_rank),
        limit,
        cases: reports,
    })
}
/// Prints a human-readable summary of `report` to stdout.
///
/// Index and search timings are always printed. The quality summary and one
/// line per quality case follow only when `report.quality` is present; a case
/// is labelled `pass` at rank 1, `hit` at any other rank, and `miss` when the
/// expected path was not found.
fn print_bench_report(report: &BenchReport) {
    let index = &report.index;
    println!(
        "Indexed {} files, {} symbols, and {} chunks in {:.2}ms",
        index.file_count, index.symbol_count, index.chunk_count, index.elapsed_ms
    );
    println!("{:.2} files/sec", index.files_per_second);

    let search = &report.search;
    println!("BM25 search: {:.2}ms median", search.bm25_median_ms);
    println!("Regex search: {:.2}ms median", search.regex_median_ms);

    let Some(quality) = &report.quality else {
        return;
    };
    println!(
        "Quality: precision@1 {:.3}, hit-rate@{} {:.3}, MRR {:.3}",
        quality.precision_at_1,
        quality.limit,
        quality.hit_rate_at_k,
        quality.mean_reciprocal_rank
    );
    for case in &quality.cases {
        let (status, rank) = match case.rank {
            Some(1) => ("pass", "1".to_string()),
            Some(rank) => ("hit", rank.to_string()),
            None => ("miss", "-".to_string()),
        };
        let top_path = match &case.top_path {
            Some(path) => path.display().to_string(),
            None => "-".to_string(),
        };
        println!(
            " {status}: query {:?}, expected {:?}, rank {}, top {}",
            case.query, case.expected_path, rank, top_path
        );
    }
}
fn run_architecture_policy_bench(
args: ArchitectureBenchArgs,
) -> anyhow::Result<ArchitecturePolicyBenchReport> {
let repo = absolutize(&args.path)?;
let cases_file = absolutize(&args.cases_file)?;
let cases = load_architecture_policy_bench_cases(&cases_file)?;
if cases.is_empty() {
anyhow::bail!(
"architecture policy benchmark cases file is empty: {}",
cases_file.display()
);
}
if !args.no_index {
index_repo(&repo)?;
}
let Some(policy) = load_architecture_policy(&repo)? else {
anyhow::bail!(
"architecture policy benchmark requires a configured policy in {}",
repo.display()
);
};
let store = open_store(&repo)?;
let resolver = PolicyResolver::new(&policy)?;
let iterations = args.iterations.max(1);
let mut durations = Vec::with_capacity(iterations);
let mut report = None;
for _ in 0..iterations {
let started = Instant::now();
let check = evaluate_policy(&store, &resolver, &policy)?;
durations.push(started.elapsed());
report = Some(check);
}
let report = report.expect("at least one architecture policy benchmark iteration");
let actual_findings = architecture_policy_actual_findings(&policy, &report);
let (summary, families, case_reports) =
score_architecture_policy_cases(&policy, &cases, &actual_findings);
Ok(ArchitecturePolicyBenchReport {
repo,
cases_file,
case_count: cases.len(),
iterations,
p95_policy_check_ms: percentile_duration_ms(&durations, 0.95),
summary,
rule_families: families,
cases: case_reports,
})
}
fn load_architecture_policy_bench_cases(
path: &Path,
) -> anyhow::Result<Vec<ArchitecturePolicyBenchCase>> {
let raw = fs::read_to_string(path).with_context(|| {
format!(
"failed to read architecture policy cases {}",
path.display()
)
})?;
let cases: Vec<ArchitecturePolicyBenchCase> =
serde_json::from_str(&raw).with_context(|| {
format!(
"failed to parse architecture policy cases {}",
path.display()
)
})?;
let mut seen = BTreeMap::new();
for case in &cases {
if case.id.trim().is_empty() {
anyhow::bail!("architecture policy benchmark case id must be non-empty");
}
if let Some(previous) = seen.insert(case.id.clone(), true) {
if previous {
anyhow::bail!(
"duplicate architecture policy benchmark case id `{}`",
case.id
);
}
}
if matches!(
case.expected,
ArchitecturePolicyBenchOutcome::Violation | ArchitecturePolicyBenchOutcome::Exempted
) && case.rule_id.as_deref().unwrap_or_default().is_empty()
{
anyhow::bail!(
"architecture policy benchmark case `{}` requires rule_id for {:?}",
case.id,
case.expected
);
}
}
Ok(cases)
}
/// Flattens a policy check report into a sorted, de-duplicated finding list.
///
/// Violations and exemptions keep their rule id and get the rule family looked
/// up in `policy`; unknown edges get the `Unknown` family and no rule id. The
/// result is sorted by family, rule id, source path, target path, edge type and
/// outcome (compared by its `Debug` text). Adjacent entries that agree on
/// all of those are then collapsed, with paths compared via
/// `same_architecture_bench_path`.
fn architecture_policy_actual_findings(
    policy: &ArchitecturePolicy,
    report: &open_kioku_core::PolicyCheckReport,
) -> Vec<ArchitecturePolicyActualFinding> {
    let violations = report
        .violations
        .iter()
        .map(|violation| ArchitecturePolicyActualFinding {
            rule_family: architecture_policy_rule_family(policy, &violation.rule_id),
            outcome: ArchitecturePolicyBenchOutcome::Violation,
            rule_id: Some(violation.rule_id.clone()),
            source_path: violation.source_path.clone(),
            target_path: violation.target_path.clone(),
            edge_type: violation.edge_type,
        });
    let exemptions = report
        .exemptions
        .iter()
        .map(|exemption| ArchitecturePolicyActualFinding {
            rule_family: architecture_policy_rule_family(policy, &exemption.rule_id),
            outcome: ArchitecturePolicyBenchOutcome::Exempted,
            rule_id: Some(exemption.rule_id.clone()),
            source_path: exemption.source_path.clone(),
            target_path: exemption.target_path.clone(),
            edge_type: exemption.evidence.edge_type,
        });
    let unknown_edges = report
        .unknown_edges
        .iter()
        .map(|unknown| ArchitecturePolicyActualFinding {
            rule_family: ArchitecturePolicyRuleFamily::Unknown,
            outcome: ArchitecturePolicyBenchOutcome::Unknown,
            rule_id: None,
            source_path: unknown.evidence.source_path.clone(),
            target_path: unknown.evidence.target_path.clone(),
            edge_type: unknown.evidence.edge_type,
        });

    let mut findings = violations
        .chain(exemptions)
        .chain(unknown_edges)
        .collect::<Vec<_>>();

    findings.sort_by(|a, b| {
        a.rule_family
            .cmp(&b.rule_family)
            .then_with(|| a.rule_id.cmp(&b.rule_id))
            .then_with(|| a.source_path.cmp(&b.source_path))
            .then_with(|| a.target_path.cmp(&b.target_path))
            .then_with(|| a.edge_type.cmp(&b.edge_type))
            // Outcomes are ordered by their Debug rendering.
            .then_with(|| format!("{:?}", a.outcome).cmp(&format!("{:?}", b.outcome)))
    });
    // Only adjacent duplicates are removed, hence the sort above.
    findings.dedup_by(|a, b| {
        a.rule_family == b.rule_family
            && a.outcome == b.outcome
            && a.rule_id == b.rule_id
            && same_architecture_bench_path(&a.source_path, &b.source_path)
            && same_architecture_bench_path(&a.target_path, &b.target_path)
            && a.edge_type == b.edge_type
    });
    findings
}
/// Scores benchmark `cases` against the findings a policy check produced.
///
/// Returns the overall summary, one report per rule family, and one report per
/// case (in input order).
///
/// * A case expecting `Allowed` passes when no finding matches its selector;
///   any other case passes when at least one finding matches it exactly.
/// * Every case not expecting `Allowed` counts as an expected positive.
/// * Each finding is attributed to the first case whose selector matches it.
///   Findings that match no case are ignored. An attributed finding is a true
///   positive when it matches that case exactly and a false positive otherwise.
/// * A case not expecting `Allowed` with no exact match is a false negative.
///
/// `_policy` is accepted for interface compatibility and is not read.
fn score_architecture_policy_cases(
    _policy: &ArchitecturePolicy,
    cases: &[ArchitecturePolicyBenchCase],
    actual_findings: &[ArchitecturePolicyActualFinding],
) -> (
    ArchitecturePolicyBenchSummary,
    Vec<ArchitecturePolicyBenchFamilyReport>,
    Vec<ArchitecturePolicyBenchCaseReport>,
) {
    let mut totals = ArchitecturePolicyBenchCounts::default();
    let mut per_family: BTreeMap<ArchitecturePolicyRuleFamily, ArchitecturePolicyBenchCounts> =
        BTreeMap::new();
    let mut positive_case_matched = vec![false; cases.len()];
    let mut case_reports = Vec::with_capacity(cases.len());

    // Pass 1: per-case verdicts and expected-positive tallies.
    for (case_index, case) in cases.iter().enumerate() {
        let expects_finding = case.expected != ArchitecturePolicyBenchOutcome::Allowed;
        let selected = actual_findings
            .iter()
            .filter(|finding| architecture_policy_case_selector_matches(case, finding))
            .collect::<Vec<_>>();
        let actual = selected
            .iter()
            .map(|finding| finding.outcome)
            .collect::<Vec<_>>();
        let has_exact_match = selected
            .iter()
            .any(|finding| architecture_policy_case_exact_match(case, finding));
        let passed = if expects_finding {
            has_exact_match
        } else {
            selected.is_empty()
        };

        if expects_finding {
            totals.expected_positive_count += 1;
            per_family
                .entry(case.rule_family)
                .or_default()
                .expected_positive_count += 1;
            if has_exact_match {
                positive_case_matched[case_index] = true;
            }
        }

        let mut notes = Vec::new();
        if !passed {
            let note = if !expects_finding {
                "expected no policy finding, but at least one finding matched"
            } else if selected.is_empty() {
                "expected policy finding was not reported"
            } else {
                "reported policy finding did not match expected outcome, family, or rule"
            };
            notes.push(note.into());
        }

        case_reports.push(ArchitecturePolicyBenchCaseReport {
            id: case.id.clone(),
            rule_family: case.rule_family,
            expected: case.expected,
            actual,
            rule_id: case.rule_id.clone(),
            source_path: case.source_path.clone(),
            target_path: case.target_path.clone(),
            edge_type: case.edge_type,
            passed,
            notes,
        });
    }

    // Pass 2: attribute each finding to the first case selecting it.
    for finding in actual_findings {
        let Some(case_index) = cases
            .iter()
            .position(|case| architecture_policy_case_selector_matches(case, finding))
        else {
            continue;
        };
        let is_exact = architecture_policy_case_exact_match(&cases[case_index], finding);
        let family_counts = per_family.entry(finding.rule_family).or_default();
        for counts in [&mut totals, family_counts] {
            counts.actual_positive_count += 1;
            if is_exact {
                counts.true_positives += 1;
            } else {
                counts.false_positives += 1;
            }
        }
        if is_exact {
            positive_case_matched[case_index] = true;
        }
    }

    // Pass 3: positive cases that were never matched are false negatives.
    for (case, matched) in cases.iter().zip(&positive_case_matched) {
        if case.expected != ArchitecturePolicyBenchOutcome::Allowed && !*matched {
            totals.false_negatives += 1;
            per_family
                .entry(case.rule_family)
                .or_default()
                .false_negatives += 1;
        }
    }

    let summary = architecture_policy_counts_summary(totals);
    let family_reports = per_family
        .into_iter()
        .map(|(rule_family, counts)| architecture_policy_family_report(rule_family, counts))
        .collect::<Vec<_>>();
    (summary, family_reports, case_reports)
}
/// Reports whether `finding` is about the same edge that `case` selects.
///
/// Source path, target path and edge type must all agree. The rule id is only
/// compared when the case specifies one.
fn architecture_policy_case_selector_matches(
    case: &ArchitecturePolicyBenchCase,
    finding: &ArchitecturePolicyActualFinding,
) -> bool {
    if !same_architecture_bench_path(&case.source_path, &finding.source_path) {
        return false;
    }
    if !same_architecture_bench_path(&case.target_path, &finding.target_path) {
        return false;
    }
    if case.edge_type != finding.edge_type {
        return false;
    }
    match &case.rule_id {
        Some(rule_id) => finding.rule_id.as_ref() == Some(rule_id),
        // A case without a rule id accepts findings from any rule.
        None => true,
    }
}
/// Reports whether `finding` satisfies `case` completely: the selector matches
/// and both the outcome and the rule family equal what the case expects.
fn architecture_policy_case_exact_match(
    case: &ArchitecturePolicyBenchCase,
    finding: &ArchitecturePolicyActualFinding,
) -> bool {
    if !architecture_policy_case_selector_matches(case, finding) {
        return false;
    }
    let same_outcome = case.expected == finding.outcome;
    let same_family = case.rule_family == finding.rule_family;
    same_outcome && same_family
}
/// Compares two paths after passing their displayed text through
/// `normalize_path_fragment`.
fn same_architecture_bench_path(left: &Path, right: &Path) -> bool {
    let normalized_left = normalize_path_fragment(&left.display().to_string());
    let normalized_right = normalize_path_fragment(&right.display().to_string());
    normalized_left == normalized_right
}
/// Finds which rule list of `policy` declares `rule_id`.
///
/// Dependency rules are checked first, then public API rules, then
/// internal-only rules. The family is `Unknown` when no list contains the id.
fn architecture_policy_rule_family(
    policy: &ArchitecturePolicy,
    rule_id: &str,
) -> ArchitecturePolicyRuleFamily {
    if policy
        .dependency_rules
        .iter()
        .any(|rule| rule.id == rule_id)
    {
        return ArchitecturePolicyRuleFamily::DependencyRule;
    }
    if policy
        .public_api_rules
        .iter()
        .any(|rule| rule.id == rule_id)
    {
        return ArchitecturePolicyRuleFamily::PublicApiRule;
    }
    if policy
        .internal_only_rules
        .iter()
        .any(|rule| rule.id == rule_id)
    {
        return ArchitecturePolicyRuleFamily::InternalOnlyRule;
    }
    ArchitecturePolicyRuleFamily::Unknown
}
fn architecture_policy_counts_summary(
counts: ArchitecturePolicyBenchCounts,
) -> ArchitecturePolicyBenchSummary {
ArchitecturePolicyBenchSummary {
precision: ratio(counts.true_positives, counts.actual_positive_count),
recall: ratio(counts.true_positives, counts.expected_positive_count),
true_positives: counts.true_positives,
false_positives: counts.false_positives,
false_negatives: counts.false_negatives,
expected_positive_count: counts.expected_positive_count,
actual_positive_count: counts.actual_positive_count,
}
}
fn architecture_policy_family_report(
rule_family: ArchitecturePolicyRuleFamily,
counts: ArchitecturePolicyBenchCounts,
) -> ArchitecturePolicyBenchFamilyReport {
ArchitecturePolicyBenchFamilyReport {
rule_family,
precision: ratio(counts.true_positives, counts.actual_positive_count),
recall: ratio(counts.true_positives, counts.expected_positive_count),
true_positives: counts.true_positives,
false_positives: counts.false_positives,
false_negatives: counts.false_negatives,
expected_positive_count: counts.expected_positive_count,
actual_positive_count: counts.actual_positive_count,
}
}
fn percentile_duration_ms(durations: &[Duration], percentile: f64) -> f64 {
if durations.is_empty() {
return 0.0;
}
let mut values = durations
.iter()
.map(|duration| duration_ms(*duration))
.collect::<Vec<_>>();
values.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
let rank = ((values.len() as f64 * percentile).ceil() as usize)
.saturating_sub(1)
.min(values.len() - 1);
values[rank]
}
/// Prints the architecture policy benchmark report to stdout.
///
/// Output order: a header with case count and p95 latency, the overall
/// precision/recall line, one line per rule family, then one line per case
/// (`pass` or `fail`) followed by that case's notes.
fn print_architecture_policy_bench_report(report: &ArchitecturePolicyBenchReport) {
    println!(
        "Architecture policy benchmark: {} case(s), p95 {:.2}ms",
        report.case_count, report.p95_policy_check_ms
    );

    let overall = &report.summary;
    println!(
        "Overall: precision {:.3}, recall {:.3}, TP {}, FP {}, FN {}",
        overall.precision,
        overall.recall,
        overall.true_positives,
        overall.false_positives,
        overall.false_negatives
    );

    report.rule_families.iter().for_each(|family| {
        println!(
            " {:?}: precision {:.3}, recall {:.3}, TP {}, FP {}, FN {}",
            family.rule_family,
            family.precision,
            family.recall,
            family.true_positives,
            family.false_positives,
            family.false_negatives
        );
    });

    for case in &report.cases {
        let status = match case.passed {
            true => "pass",
            false => "fail",
        };
        println!(
            " {status}: {} {:?} {:?} {} -> {} via {:?}",
            case.id,
            case.rule_family,
            case.expected,
            case.source_path.display(),
            case.target_path.display(),
            case.edge_type
        );
        case.notes.iter().for_each(|note| println!(" note: {note}"));
    }
}
/// Runs the history API benchmark described by `args.cases_file`.
///
/// A relative cases file is resolved against `repo`. Every case in the corpus
/// is scored; failure messages are collected and p95 latencies are computed
/// per family. The thresholds from `args` are copied into the report next to
/// the measured values.
///
/// # Errors
///
/// Fails when the corpus cannot be loaded, contains no cases, or when scoring
/// any case fails.
fn run_history_bench(repo: &Path, args: HistoryBenchArgs) -> anyhow::Result<HistoryBenchReport> {
    // `Path::join` returns the argument unchanged when it is absolute, so this
    // covers both the absolute and the repo-relative form.
    let cases_file = repo.join(&args.cases_file);
    let corpus = load_history_bench_corpus(&cases_file)?;
    if corpus.cases.is_empty() {
        anyhow::bail!(
            "history benchmark cases file is empty: {}",
            cases_file.display()
        );
    }

    let mut scores = HistoryBenchScoring::default();
    let mut case_reports = Vec::with_capacity(corpus.cases.len());
    let mut failures = Vec::new();
    for case in &corpus.cases {
        let case_report = score_history_bench_case(case, &mut scores)?;
        failures.extend(history_bench_failures(&case_report));
        case_reports.push(case_report);
    }

    let mut family_p95_ms = BTreeMap::new();
    for (family, samples) in &scores.family_latencies_ms {
        family_p95_ms.insert(family.clone(), p95_ms(samples));
    }
    let reviewer_accuracy = ratio(scores.reviewer_passed, scores.reviewer_total);
    let similar_recall_at_5 = ratio(scores.similar_matched_total, scores.similar_expected_total);
    let similar_p95_ms = p95_ms(&scores.similar_latencies_ms);
    let ownership_churn_p95_ms = p95_ms(&scores.ownership_churn_latencies_ms);

    Ok(HistoryBenchReport {
        cases_file,
        schema_version: corpus.schema_version,
        case_count: case_reports.len(),
        family_counts: scores.family_counts,
        min_reviewer_accuracy: args.min_reviewer_accuracy,
        reviewer_accuracy,
        min_similar_recall_at_5: args.min_similar_recall_at_5,
        similar_recall_at_5,
        max_similar_p95_ms: args.max_similar_p95_ms,
        similar_p95_ms,
        max_lookup_p95_ms: args.max_lookup_p95_ms,
        ownership_churn_p95_ms,
        family_p95_ms,
        failures,
        cases: case_reports,
    })
}
/// Running totals accumulated while the history benchmark scores its cases.
#[derive(Default)]
struct HistoryBenchScoring {
/// Number of child cases scored so far, per family.
family_counts: HistoryBenchFamilyCounts,
/// Sum of distinct expected commit ids over all similar cases.
similar_expected_total: usize,
/// Sum of expected commit ids that appeared in the returned hits.
similar_matched_total: usize,
/// Number of reviewer cases scored.
reviewer_total: usize,
/// Number of reviewer cases that passed.
reviewer_passed: usize,
/// Latency samples recorded for the "similar" family.
similar_latencies_ms: Vec<f64>,
/// Latency samples recorded for the "ownership" and "churn" families.
ownership_churn_latencies_ms: Vec<f64>,
/// All latency samples, keyed by family name.
family_latencies_ms: BTreeMap<String, Vec<f64>>,
}
fn load_history_bench_corpus(path: &Path) -> anyhow::Result<HistoryBenchCorpus> {
let raw = fs::read_to_string(path)?;
let corpus: HistoryBenchCorpus = serde_json::from_str(&raw)?;
if corpus.schema_version != 1 {
anyhow::bail!(
"unsupported history benchmark schema_version {}; expected 1",
corpus.schema_version
);
}
let mut top_ids = BTreeSet::new();
let mut child_ids = BTreeSet::new();
for case in &corpus.cases {
if case.id.trim().is_empty() {
anyhow::bail!("history benchmark cases require non-empty id");
}
if !top_ids.insert(case.id.clone()) {
anyhow::bail!("duplicate history benchmark case id `{}`", case.id);
}
if case.similar.is_empty()
&& case.ownership.is_empty()
&& case.reviewers.is_empty()
&& case.churn.is_empty()
&& case.provenance.is_empty()
{
anyhow::bail!(
"history benchmark case `{}` must include at least one public API family",
case.id
);
}
for child in &case.similar {
validate_history_bench_child_id(&case.id, "similar", &child.id, &mut child_ids)?;
if child.expected_top_5.is_empty() {
anyhow::bail!(
"history benchmark similar case `{}::{}` requires expected_top_5",
case.id,
child.id
);
}
}
for child in &case.ownership {
validate_history_bench_child_id(&case.id, "ownership", &child.id, &mut child_ids)?;
if child.path.as_os_str().is_empty() || child.expected_owner.trim().is_empty() {
anyhow::bail!(
"history benchmark ownership case `{}::{}` requires path and expected_owner",
case.id,
child.id
);
}
}
for child in &case.reviewers {
validate_history_bench_child_id(&case.id, "reviewers", &child.id, &mut child_ids)?;
if child.path.as_os_str().is_empty() || child.expected_top_reviewer.trim().is_empty() {
anyhow::bail!(
"history benchmark reviewer case `{}::{}` requires path and expected_top_reviewer",
case.id,
child.id
);
}
}
for child in &case.churn {
validate_history_bench_child_id(&case.id, "churn", &child.id, &mut child_ids)?;
let provided = usize::from(child.path.is_some())
+ usize::from(child.module.is_some())
+ usize::from(child.symbol_id.is_some());
if provided != 1 {
anyhow::bail!(
"history benchmark churn case `{}::{}` must provide exactly one of path, module, or symbol_id",
case.id,
child.id
);
}
}
for child in &case.provenance {
validate_history_bench_child_id(&case.id, "provenance", &child.id, &mut child_ids)?;
if child.path.as_os_str().is_empty()
|| child.expected_first_seen.trim().is_empty()
|| child.expected_last_touched.trim().is_empty()
{
anyhow::bail!(
"history benchmark provenance case `{}::{}` requires path, expected_first_seen, and expected_last_touched",
case.id,
child.id
);
}
}
}
Ok(corpus)
}
/// Checks that a child case id is non-blank and not yet recorded in `seen`.
///
/// Ids are recorded under the key `{case_id}::{family}::{child_id}`, so the
/// same child id may be reused in a different case or family.
///
/// # Errors
///
/// Fails when `child_id` is blank or its key is already in `seen`.
fn validate_history_bench_child_id(
    case_id: &str,
    family: &str,
    child_id: &str,
    seen: &mut BTreeSet<String>,
) -> anyhow::Result<()> {
    if child_id.trim().is_empty() {
        anyhow::bail!("history benchmark {family} cases require non-empty id");
    }
    let key = format!("{case_id}::{family}::{child_id}");
    if seen.contains(&key) {
        anyhow::bail!("duplicate history benchmark child case id `{key}`");
    }
    seen.insert(key);
    Ok(())
}
/// Scores every child of one history benchmark case.
///
/// The case snapshot is loaded into a fresh in-memory store and a temporary
/// fixture repository is prepared from the case's CODEOWNERS lines. Families
/// are scored in the order similar, ownership, reviewers, churn, provenance.
/// The case passes only when every child in every family passes.
///
/// # Errors
///
/// Propagates failures from opening the store, loading the snapshot,
/// preparing the fixture, or scoring any child; scoring stops at the first
/// failure.
fn score_history_bench_case(
    case: &HistoryBenchCase,
    scores: &mut HistoryBenchScoring,
) -> anyhow::Result<HistoryBenchCaseReport> {
    let store = SqliteStore::open(":memory:")?;
    store.put_history_snapshot(&case.snapshot)?;
    // The fixture directory is removed when this guard is dropped.
    let fixture = prepare_history_bench_repo(&case.id, &case.codeowners)?;
    let fixture_path = &fixture.path;

    let similar: Vec<_> = case
        .similar
        .iter()
        .map(|child| score_history_bench_similar(&store, child, scores))
        .collect::<anyhow::Result<_>>()?;
    let ownership: Vec<_> = case
        .ownership
        .iter()
        .map(|child| score_history_bench_ownership(fixture_path, &store, child, scores))
        .collect::<anyhow::Result<_>>()?;
    let reviewers: Vec<_> = case
        .reviewers
        .iter()
        .map(|child| score_history_bench_reviewer(fixture_path, &store, child, scores))
        .collect::<anyhow::Result<_>>()?;
    let churn: Vec<_> = case
        .churn
        .iter()
        .map(|child| score_history_bench_churn(&store, child, scores))
        .collect::<anyhow::Result<_>>()?;
    let provenance: Vec<_> = case
        .provenance
        .iter()
        .map(|child| score_history_bench_provenance(&store, child, scores))
        .collect::<anyhow::Result<_>>()?;

    let family_verdicts = [
        similar.iter().all(|child| child.passed),
        ownership.iter().all(|child| child.passed),
        reviewers.iter().all(|child| child.passed),
        churn.iter().all(|child| child.passed),
        provenance.iter().all(|child| child.passed),
    ];
    let passed = family_verdicts.iter().all(|verdict| *verdict);

    Ok(HistoryBenchCaseReport {
        id: case.id.clone(),
        similar,
        ownership,
        reviewers,
        churn,
        provenance,
        passed,
    })
}
/// Owns a temporary fixture directory and deletes it when dropped.
struct HistoryBenchTempRepo {
    /// Root of the temporary directory tree.
    path: PathBuf,
}

impl Drop for HistoryBenchTempRepo {
    /// Removes the directory tree on a best-effort basis; a removal failure
    /// is deliberately ignored.
    fn drop(&mut self) {
        drop(fs::remove_dir_all(&self.path));
    }
}
/// Creates a temporary fixture repository for one history benchmark case.
///
/// The directory lives under the system temp dir; its name combines the
/// process id, a timestamp and the sanitized `case_id`. When `codeowners` is
/// non-empty its lines are joined with `\n` and written to
/// `.github/CODEOWNERS`.
///
/// The returned guard deletes the directory when dropped. The guard is built
/// before any filesystem call, so a failure part-way through also cleans up
/// whatever had been created instead of leaking it in the temp dir.
///
/// # Errors
///
/// Propagates I/O errors from creating the directories or writing the file.
fn prepare_history_bench_repo(
    case_id: &str,
    codeowners: &[String],
) -> anyhow::Result<HistoryBenchTempRepo> {
    // Fall back to microseconds when the nanosecond timestamp is unavailable.
    let stamp = chrono::Utc::now()
        .timestamp_nanos_opt()
        .unwrap_or_else(|| chrono::Utc::now().timestamp_micros());
    let path = std::env::temp_dir().join(format!(
        "open-kioku-history-bench-{}-{}-{}",
        std::process::id(),
        stamp,
        sanitize_temp_path_fragment(case_id)
    ));
    // From here on, any early return via `?` drops the guard and removes the
    // partially built fixture.
    let fixture = HistoryBenchTempRepo { path };
    fs::create_dir_all(&fixture.path)?;
    if !codeowners.is_empty() {
        let codeowners_dir = fixture.path.join(".github");
        fs::create_dir_all(&codeowners_dir)?;
        fs::write(codeowners_dir.join("CODEOWNERS"), codeowners.join("\n"))?;
    }
    Ok(fixture)
}
/// Scores one similar-changes case.
///
/// Queries the store for the top 5 changes similar to `case.query`, records
/// the latency under the "similar" family, and measures how many of the
/// distinct expected commit ids appear among the hits. The case passes only
/// when recall reaches 1.0.
///
/// # Errors
///
/// Propagates a failure of the similar-changes query.
fn score_history_bench_similar(
    store: &SqliteStore,
    case: &HistoryBenchSimilarCase,
    scores: &mut HistoryBenchScoring,
) -> anyhow::Result<HistoryBenchSimilarCaseReport> {
    let timer = Instant::now();
    let report = store.similar_changes(&case.query, 5)?;
    let latency_ms = elapsed_ms(timer);
    record_history_bench_latency(scores, "similar", latency_ms);
    scores.family_counts.similar += 1;

    let actual_top_5: Vec<_> = report
        .hits
        .iter()
        .map(|hit| hit.change.commit.id.0.clone())
        .collect();

    // Sets remove duplicates, so recall is computed over distinct ids.
    let expected_ids: BTreeSet<_> = case.expected_top_5.iter().cloned().collect();
    let actual_ids: BTreeSet<_> = actual_top_5.iter().cloned().collect();
    let matched: Vec<_> = expected_ids
        .iter()
        .filter(|id| actual_ids.contains(*id))
        .cloned()
        .collect();

    scores.similar_expected_total += expected_ids.len();
    scores.similar_matched_total += matched.len();
    let recall_at_5 = ratio(matched.len(), expected_ids.len());
    let passed = recall_at_5 >= 1.0;

    Ok(HistoryBenchSimilarCaseReport {
        id: case.id.clone(),
        expected_top_5: case.expected_top_5.clone(),
        actual_top_5,
        matched,
        recall_at_5,
        latency_ms,
        passed,
    })
}
/// Scores one ownership case.
///
/// Asks `open_kioku_git::ownership_for_path` for owners of `case.path`, with
/// no memory facts and no components, and records the latency under the
/// "ownership" family. The case passes when the expected owner is ranked
/// first and every expected source type is present on the top suggestion.
///
/// # Errors
///
/// Propagates a failure of the ownership lookup.
fn score_history_bench_ownership(
    repo: &Path,
    store: &SqliteStore,
    case: &HistoryBenchOwnershipCase,
    scores: &mut HistoryBenchScoring,
) -> anyhow::Result<HistoryBenchOwnershipCaseReport> {
    let timer = Instant::now();
    let report = open_kioku_git::ownership_for_path(open_kioku_git::OwnershipInput {
        repo,
        path: &case.path,
        history: store,
        memory_facts: &[],
        components: Vec::new(),
    })?;
    let latency_ms = elapsed_ms(timer);
    record_history_bench_latency(scores, "ownership", latency_ms);
    scores.family_counts.ownership += 1;

    // 1-based position of the expected owner among the suggestions.
    let rank = report
        .owners
        .iter()
        .position(|suggestion| owner_matches_expected(&suggestion.owner, &case.expected_owner))
        .map(|position| position + 1);

    let top_suggestion = report.owners.first();
    let actual_owner = top_suggestion.map(|suggestion| owner_display(&suggestion.owner));
    let actual_source_types = match top_suggestion {
        Some(suggestion) => suggestion.source_types.clone(),
        None => Default::default(),
    };
    let missing_source_type = case
        .expected_source_types
        .iter()
        .any(|expected| !actual_source_types.contains(expected));
    let passed = rank == Some(1) && !missing_source_type;

    Ok(HistoryBenchOwnershipCaseReport {
        id: case.id.clone(),
        path: case.path.clone(),
        expected_owner: case.expected_owner.clone(),
        actual_owner,
        rank,
        expected_source_types: case.expected_source_types.clone(),
        actual_source_types,
        latency_ms,
        passed,
    })
}
/// Scores one reviewer-suggestion case.
///
/// Computes ownership for `case.path`, feeds it into
/// `open_kioku_git::suggest_reviewers`, and records the combined latency under
/// the "reviewers" family. The case passes when the expected reviewer is
/// ranked first, the reported availability equals the expected one, and the
/// optional evidence/inference expectations hold. An optional expectation
/// counts as satisfied when it is unset or when there is no top suggestion to
/// compare against.
///
/// # Errors
///
/// Propagates a failure of the ownership lookup or the reviewer suggestion.
fn score_history_bench_reviewer(
    repo: &Path,
    store: &SqliteStore,
    case: &HistoryBenchReviewerCase,
    scores: &mut HistoryBenchScoring,
) -> anyhow::Result<HistoryBenchReviewerCaseReport> {
    let timer = Instant::now();
    let ownership = open_kioku_git::ownership_for_path(open_kioku_git::OwnershipInput {
        repo,
        path: &case.path,
        history: store,
        memory_facts: &[],
        components: Vec::new(),
    })?;
    let report = open_kioku_git::suggest_reviewers(open_kioku_git::ReviewerSuggestionInput {
        path: &case.path,
        history: store,
        ownership: Some(&ownership),
    })?;
    let latency_ms = elapsed_ms(timer);
    record_history_bench_latency(scores, "reviewers", latency_ms);
    scores.family_counts.reviewers += 1;
    scores.reviewer_total += 1;

    // 1-based position of the expected reviewer among the suggestions.
    let rank = report
        .suggestions
        .iter()
        .position(|suggestion| {
            owner_matches_expected(&suggestion.reviewer, &case.expected_top_reviewer)
        })
        .map(|position| position + 1);

    let top_suggestion = report.suggestions.first();
    let actual_top_reviewer = top_suggestion.map(|suggestion| owner_display(&suggestion.reviewer));
    let actual_review_evidence = top_suggestion.map(|suggestion| suggestion.actual_review_evidence);
    let inferred_from_authors = top_suggestion.map(|suggestion| suggestion.inferred_from_authors);

    let availability_correct = report.availability == case.expected_availability;
    let actual_review_evidence_correct =
        match (case.expected_actual_review_evidence, actual_review_evidence) {
            (Some(expected), Some(actual)) => expected == actual,
            _ => true,
        };
    let inferred_from_authors_correct =
        match (case.expected_inferred_from_authors, inferred_from_authors) {
            (Some(expected), Some(actual)) => expected == actual,
            _ => true,
        };

    let passed = rank == Some(1)
        && availability_correct
        && actual_review_evidence_correct
        && inferred_from_authors_correct;
    if passed {
        scores.reviewer_passed += 1;
    }

    Ok(HistoryBenchReviewerCaseReport {
        id: case.id.clone(),
        path: case.path.clone(),
        expected_top_reviewer: case.expected_top_reviewer.clone(),
        actual_top_reviewer,
        rank,
        expected_availability: case.expected_availability,
        availability: report.availability,
        availability_correct,
        expected_actual_review_evidence: case.expected_actual_review_evidence,
        actual_review_evidence,
        actual_review_evidence_correct,
        expected_inferred_from_authors: case.expected_inferred_from_authors,
        inferred_from_authors,
        inferred_from_authors_correct,
        latency_ms,
        passed,
    })
}
/// Scores one churn case.
///
/// Looks up churn for the case's file, module or symbol (checked in that
/// order) and records the latency under the "churn" family. The case passes
/// when both the touch count and the hotspot score reach the case's minimums.
///
/// # Errors
///
/// Propagates a failure of the churn lookup.
///
/// # Panics
///
/// Panics when the case specifies no target at all.
fn score_history_bench_churn(
    store: &SqliteStore,
    case: &HistoryBenchChurnCase,
    scores: &mut HistoryBenchScoring,
) -> anyhow::Result<HistoryBenchChurnCaseReport> {
    // Build the label before starting the timer so it is not measured.
    let target = history_bench_churn_target(case);

    let timer = Instant::now();
    let summary = match (&case.path, &case.module, &case.symbol_id) {
        (Some(path), _, _) => store.churn_for_file(path)?,
        (None, Some(module), _) => store.churn_for_module(module)?,
        (None, None, Some(symbol_id)) => store.churn_for_symbol(symbol_id)?,
        (None, None, None) => {
            unreachable!("history benchmark churn targets are validated before scoring")
        }
    };
    let latency_ms = elapsed_ms(timer);
    record_history_bench_latency(scores, "churn", latency_ms);
    scores.family_counts.churn += 1;

    let enough_touches = summary.stats.touch_count >= case.min_touch_count;
    let hot_enough = summary.stats.hotspot_score >= case.min_hotspot_score;

    Ok(HistoryBenchChurnCaseReport {
        id: case.id.clone(),
        target,
        touch_count: summary.stats.touch_count,
        hotspot_score: summary.stats.hotspot_score,
        min_touch_count: case.min_touch_count,
        min_hotspot_score: case.min_hotspot_score,
        confidence: summary.confidence,
        latency_ms,
        passed: enough_touches && hot_enough,
    })
}
/// Scores one provenance case.
///
/// Fetches provenance for `case.path` (limit defaults to 20 when the case sets
/// none) and records the latency under the "provenance" family. The case
/// passes when the first-seen and last-touched commit ids equal the expected
/// ones and at least `min_recent_touches` recent touches were returned.
///
/// # Errors
///
/// Propagates a failure of the provenance lookup.
fn score_history_bench_provenance(
    store: &SqliteStore,
    case: &HistoryBenchProvenanceCase,
    scores: &mut HistoryBenchScoring,
) -> anyhow::Result<HistoryBenchProvenanceCaseReport> {
    let timer = Instant::now();
    let report = store.provenance_for_path(&case.path, case.limit.unwrap_or(20))?;
    let latency_ms = elapsed_ms(timer);
    record_history_bench_latency(scores, "provenance", latency_ms);
    scores.family_counts.provenance += 1;

    let actual_first_seen = report
        .first_seen
        .as_ref()
        .map(|touch| touch.commit.id.0.clone());
    let actual_last_touched = report
        .last_touched
        .as_ref()
        .map(|touch| touch.commit.id.0.clone());
    let recent_touch_count = report.recent_touches.len();

    let first_seen_matches =
        actual_first_seen.as_deref() == Some(case.expected_first_seen.as_str());
    let last_touched_matches =
        actual_last_touched.as_deref() == Some(case.expected_last_touched.as_str());
    let enough_recent_touches = recent_touch_count >= case.min_recent_touches;
    let passed = first_seen_matches && last_touched_matches && enough_recent_touches;

    Ok(HistoryBenchProvenanceCaseReport {
        id: case.id.clone(),
        path: case.path.clone(),
        expected_first_seen: case.expected_first_seen.clone(),
        actual_first_seen,
        expected_last_touched: case.expected_last_touched.clone(),
        actual_last_touched,
        min_recent_touches: case.min_recent_touches,
        recent_touch_count,
        confidence: report.confidence,
        latency_ms,
        passed,
    })
}
/// Builds a display label for the target of a churn case.
///
/// The first target that is set wins, in the order file, module, symbol; the
/// label is `"unknown"` when none is set.
fn history_bench_churn_target(case: &HistoryBenchChurnCase) -> String {
    match (&case.path, &case.module, &case.symbol_id) {
        (Some(path), _, _) => format!("file:{}", path.display()),
        (None, Some(module), _) => format!("module:{}", module.display()),
        (None, None, Some(symbol_id)) => format!("symbol:{}", symbol_id.0),
        (None, None, None) => "unknown".into(),
    }
}
/// Builds one human-readable message per failed child of `report`.
///
/// Messages are ordered by family (similar, ownership, reviewers, churn,
/// provenance) and, within a family, in the order the children appear.
fn history_bench_failures(report: &HistoryBenchCaseReport) -> Vec<String> {
    let similar = report
        .similar
        .iter()
        .filter(|case| !case.passed)
        .map(|case| {
            format!(
                "{}::similar::{} expected Top-5 {:?}, got {:?}",
                report.id, case.id, case.expected_top_5, case.actual_top_5
            )
        });
    let ownership = report
        .ownership
        .iter()
        .filter(|case| !case.passed)
        .map(|case| {
            format!(
                "{}::ownership::{} expected owner `{}` at rank 1, got {:?} at rank {:?}",
                report.id, case.id, case.expected_owner, case.actual_owner, case.rank
            )
        });
    let reviewers = report
        .reviewers
        .iter()
        .filter(|case| !case.passed)
        .map(|case| {
            format!(
                "{}::reviewers::{} expected reviewer `{}` at rank 1 with {:?}, got {:?} at rank {:?} with {:?}",
                report.id,
                case.id,
                case.expected_top_reviewer,
                case.expected_availability,
                case.actual_top_reviewer,
                case.rank,
                case.availability
            )
        });
    let churn = report
        .churn
        .iter()
        .filter(|case| !case.passed)
        .map(|case| {
            format!(
                "{}::churn::{} expected touch_count >= {} and hotspot_score >= {:.3}, got {} and {:.3}",
                report.id,
                case.id,
                case.min_touch_count,
                case.min_hotspot_score,
                case.touch_count,
                case.hotspot_score
            )
        });
    let provenance = report
        .provenance
        .iter()
        .filter(|case| !case.passed)
        .map(|case| {
            format!(
                "{}::provenance::{} expected first/last {}/{}, got {:?}/{:?}",
                report.id,
                case.id,
                case.expected_first_seen,
                case.expected_last_touched,
                case.actual_first_seen,
                case.actual_last_touched
            )
        });

    similar
        .chain(ownership)
        .chain(reviewers)
        .chain(churn)
        .chain(provenance)
        .collect()
}
/// Records one latency sample for `family`.
///
/// Every sample is stored in the per-family map. Samples for "similar" also
/// go to the similar bucket, and samples for "ownership" or "churn" also go
/// to the shared ownership/churn bucket.
fn record_history_bench_latency(scores: &mut HistoryBenchScoring, family: &str, latency_ms: f64) {
    match family {
        "similar" => scores.similar_latencies_ms.push(latency_ms),
        "ownership" | "churn" => scores.ownership_churn_latencies_ms.push(latency_ms),
        _ => {}
    }
    let family_samples = scores
        .family_latencies_ms
        .entry(family.to_owned())
        .or_default();
    family_samples.push(latency_ms);
}
/// Returns the time elapsed since `started_at`, in fractional milliseconds.
fn elapsed_ms(started_at: Instant) -> f64 {
    const MILLIS_PER_SECOND: f64 = 1000.0;
    let elapsed = started_at.elapsed();
    elapsed.as_secs_f64() * MILLIS_PER_SECOND
}
/// Returns the 95th percentile of `values` using the nearest-rank method.
///
/// The input is copied and sorted with `f64::total_cmp`; the chosen element is at
/// rank `ceil(0.95 * len)` (1-based). An empty slice yields `0.0`.
fn p95_ms(values: &[f64]) -> f64 {
    let Some(last) = values.len().checked_sub(1) else {
        return 0.0;
    };
    let mut ordered = values.to_vec();
    ordered.sort_by(f64::total_cmp);
    let rank = (ordered.len() as f64 * 0.95).ceil() as usize;
    let position = rank.saturating_sub(1).min(last);
    ordered[position]
}
/// Returns the default minimum number of recent touches (`1`).
// NOTE(review): presumably referenced as a deserialization default — confirm at the use site.
fn default_history_bench_min_recent_touches() -> usize {
    const DEFAULT_MIN_RECENT_TOUCHES: usize = 1;
    DEFAULT_MIN_RECENT_TOUCHES
}
/// Prints a plain-text summary of a history API benchmark run to stdout.
///
/// Output order: headline accuracy/recall, p95 latencies, per-family case counts,
/// per-family p95 latencies, one pass/fail line per case set, and finally the
/// failure list when it is non-empty.
fn print_history_bench_report(report: &HistoryBenchReport) {
    println!(
        "History API benchmark: {} case set(s); reviewer accuracy {:.3} (min {:.3}); similar Top-5 recall {:.3} (min {:.3})",
        report.case_count,
        report.reviewer_accuracy,
        report.min_reviewer_accuracy,
        report.similar_recall_at_5,
        report.min_similar_recall_at_5
    );
    println!(
        "Latency p95: similar {:.2}ms (max {:.2}ms); ownership/churn {:.2}ms (max {:.2}ms)",
        report.similar_p95_ms,
        report.max_similar_p95_ms,
        report.ownership_churn_p95_ms,
        report.max_lookup_p95_ms
    );

    let counts = &report.family_counts;
    println!(
        "Families: similar {}, ownership {}, reviewers {}, churn {}, provenance {}",
        counts.similar, counts.ownership, counts.reviewers, counts.churn, counts.provenance
    );

    for (family, latency_ms) in &report.family_p95_ms {
        println!("  {family}: p95 {latency_ms:.2}ms");
    }

    for case in &report.cases {
        let status = match case.passed {
            true => "pass",
            false => "fail",
        };
        println!("  {status}: {}", case.id);
    }

    if report.failures.is_empty() {
        return;
    }
    println!("Failures:");
    for failure in &report.failures {
        println!("- {failure}");
    }
}
/// Runs the reviewer benchmark described by `args` and aggregates the results.
///
/// A relative `args.cases_file` is resolved against `repo`. Every case is scored
/// independently; a case that does not pass contributes one line to `failures`.
///
/// # Errors
///
/// Fails when the cases file cannot be loaded, contains no cases, or when scoring
/// any case fails.
fn run_reviewer_bench(repo: &Path, args: ReviewerBenchArgs) -> anyhow::Result<ReviewerBenchReport> {
    let requested = &args.cases_file;
    let cases_file = if requested.is_relative() {
        repo.join(requested)
    } else {
        requested.clone()
    };

    let cases = load_reviewer_bench_cases(&cases_file)?;
    if cases.is_empty() {
        anyhow::bail!(
            "reviewer benchmark cases file is empty: {}",
            cases_file.display()
        );
    }

    let mut reports = Vec::with_capacity(cases.len());
    let mut failures = Vec::new();
    let mut passed = 0_usize;
    for case in &cases {
        let report = score_reviewer_bench_case(case)?;
        if report.passed {
            passed += 1;
        } else {
            failures.push(format!(
                "{} expected top reviewer `{}` at rank 1, got {:?} at rank {:?}",
                case.id, case.expected_top_reviewer, report.actual_top_reviewer, report.rank
            ));
        }
        reports.push(report);
    }

    let case_count = reports.len();
    Ok(ReviewerBenchReport {
        cases_file,
        case_count,
        min_accuracy: args.min_accuracy,
        accuracy: ratio(passed, case_count),
        failures,
        cases: reports,
    })
}
fn run_similar_history_bench(
repo: &Path,
args: SimilarHistoryBenchArgs,
) -> anyhow::Result<SimilarHistoryBenchReport> {
let cases_file = if args.cases_file.is_absolute() {
args.cases_file.clone()
} else {
repo.join(&args.cases_file)
};
let cases = load_similar_history_bench_cases(&cases_file)?;
if cases.is_empty() {
anyhow::bail!(
"similar history benchmark cases file is empty: {}",
cases_file.display()
);
}
let mut reports = Vec::with_capacity(cases.len());
let mut failures = Vec::new();
let mut expected_total = 0_usize;
let mut matched_total = 0_usize;
for case in &cases {
let report = score_similar_history_bench_case(case)?;
expected_total += report.expected_top_5.len();
matched_total += report.matched.len();
if !report.passed {
failures.push(format!(
"{} expected Top-5 commit(s) {:?}, got {:?}",
case.id, report.expected_top_5, report.actual_top_5
));
}
reports.push(report);
}
Ok(SimilarHistoryBenchReport {
cases_file,
case_count: reports.len(),
min_recall_at_5: args.min_recall_at_5,
recall_at_5: ratio(matched_total, expected_total),
failures,
cases: reports,
})
}
/// Reads and validates the reviewer benchmark cases stored as a JSON array at `path`.
///
/// # Errors
///
/// Fails when the file cannot be read or parsed, when a case has a blank `id` or
/// `expected_top_reviewer`, when a case has an empty `path`, or when two cases
/// share the same `id`.
fn load_reviewer_bench_cases(path: &Path) -> anyhow::Result<Vec<ReviewerBenchCase>> {
    let contents = fs::read_to_string(path)?;
    let cases: Vec<ReviewerBenchCase> = serde_json::from_str(&contents)?;

    let mut seen_ids = BTreeSet::new();
    for case in &cases {
        let blank_id = case.id.trim().is_empty();
        let blank_reviewer = case.expected_top_reviewer.trim().is_empty();
        if blank_id || blank_reviewer {
            anyhow::bail!(
                "reviewer benchmark cases require non-empty id and expected_top_reviewer"
            );
        }
        if case.path.as_os_str().is_empty() {
            anyhow::bail!("reviewer benchmark case `{}` requires a path", case.id);
        }
        let first_occurrence = seen_ids.insert(case.id.clone());
        if !first_occurrence {
            anyhow::bail!("duplicate reviewer benchmark case id `{}`", case.id);
        }
    }
    Ok(cases)
}
/// Reads and validates the similar-history benchmark cases stored as a JSON array at `path`.
///
/// # Errors
///
/// Fails when the file cannot be read or parsed, when a case has a blank `id` or
/// an empty `expected_top_5`, or when two cases share the same `id`.
fn load_similar_history_bench_cases(path: &Path) -> anyhow::Result<Vec<SimilarHistoryBenchCase>> {
    let contents = fs::read_to_string(path)?;
    let cases: Vec<SimilarHistoryBenchCase> = serde_json::from_str(&contents)?;

    let mut seen_ids = BTreeSet::new();
    for case in &cases {
        let blank_id = case.id.trim().is_empty();
        if blank_id || case.expected_top_5.is_empty() {
            anyhow::bail!(
                "similar history benchmark cases require non-empty id and expected_top_5"
            );
        }
        let first_occurrence = seen_ids.insert(case.id.clone());
        if !first_occurrence {
            anyhow::bail!("duplicate similar history benchmark case id `{}`", case.id);
        }
    }
    Ok(cases)
}
/// Scores one similar-history case against a fresh in-memory store.
///
/// The case's snapshot is loaded into a `:memory:` SQLite store, the top five
/// similar changes for the case query are fetched, and recall is computed as the
/// share of distinct expected commit ids present among the returned ids. A case
/// passes only when recall reaches `1.0`.
fn score_similar_history_bench_case(
    case: &SimilarHistoryBenchCase,
) -> anyhow::Result<SimilarHistoryBenchCaseReport> {
    let store = SqliteStore::open(":memory:")?;
    store.put_history_snapshot(&case.snapshot)?;
    let report = store.similar_changes(&case.query, 5)?;

    let mut actual_top_5 = Vec::with_capacity(report.hits.len());
    for hit in &report.hits {
        actual_top_5.push(hit.change.commit.id.0.clone());
    }

    // Set semantics: ordering and duplicates on either side do not affect recall.
    let expected: BTreeSet<_> = case.expected_top_5.iter().cloned().collect();
    let actual: BTreeSet<_> = actual_top_5.iter().cloned().collect();
    let matched: Vec<_> = expected.intersection(&actual).cloned().collect();

    let recall_at_5 = ratio(matched.len(), expected.len());
    let passed = recall_at_5 >= 1.0;
    Ok(SimilarHistoryBenchCaseReport {
        id: case.id.clone(),
        expected_top_5: case.expected_top_5.clone(),
        actual_top_5,
        matched,
        recall_at_5,
        passed,
    })
}
/// Scores one reviewer benchmark case.
///
/// Synthetic history and ownership are built from the case and passed to
/// `suggest_reviewers`. The case passes when the expected reviewer is ranked first,
/// the reported availability matches, and each optional expectation
/// (`expected_actual_review_evidence`, `expected_inferred_from_authors`) agrees
/// with the top suggestion. An optional expectation counts as satisfied when
/// either the expectation or the top suggestion is absent.
fn score_reviewer_bench_case(case: &ReviewerBenchCase) -> anyhow::Result<ReviewerBenchCaseReport> {
    let history = ReviewerBenchHistoryStore::from_case(case);
    let ownership = reviewer_bench_ownership_report(case);
    let input = open_kioku_git::ReviewerSuggestionInput {
        path: &case.path,
        history: &history,
        ownership: Some(&ownership),
    };
    let report = open_kioku_git::suggest_reviewers(input)?;

    // 1-based position of the first suggestion matching the expected reviewer.
    let rank = report
        .suggestions
        .iter()
        .enumerate()
        .find(|(_, suggestion)| {
            owner_matches_expected(&suggestion.reviewer, &case.expected_top_reviewer)
        })
        .map(|(index, _)| index + 1);

    let top = report.suggestions.first();
    let actual_top_reviewer = top.map(|suggestion| owner_display(&suggestion.reviewer));
    let actual_review_evidence = top.map(|suggestion| suggestion.actual_review_evidence);
    let inferred_from_authors = top.map(|suggestion| suggestion.inferred_from_authors);
    let top_score = top.map(|suggestion| suggestion.score);

    let availability_correct = report.availability == case.expected_availability;
    let actual_review_evidence_correct =
        match (case.expected_actual_review_evidence, actual_review_evidence) {
            (Some(expected), Some(actual)) => expected == actual,
            _ => true,
        };
    let inferred_from_authors_correct =
        match (case.expected_inferred_from_authors, inferred_from_authors) {
            (Some(expected), Some(actual)) => expected == actual,
            _ => true,
        };

    let ranked_first = rank == Some(1);
    let passed = ranked_first
        && availability_correct
        && actual_review_evidence_correct
        && inferred_from_authors_correct;

    Ok(ReviewerBenchCaseReport {
        id: case.id.clone(),
        path: case.path.clone(),
        expected_top_reviewer: case.expected_top_reviewer.clone(),
        actual_top_reviewer,
        rank,
        expected_availability: case.expected_availability,
        availability: report.availability,
        availability_correct,
        expected_actual_review_evidence: case.expected_actual_review_evidence,
        actual_review_evidence,
        actual_review_evidence_correct,
        expected_inferred_from_authors: case.expected_inferred_from_authors,
        inferred_from_authors,
        inferred_from_authors_correct,
        top_score,
        passed,
    })
}
/// In-memory `HistoryStore` backed by data synthesized from a single reviewer
/// benchmark case; it answers queries only for that case's path.
#[derive(Clone)]
struct ReviewerBenchHistoryStore {
/// History summary returned by `history_for_file` for the case path.
history: HistorySummary,
/// File provenance returned by `provenance_for_path` for the case path.
provenance: FileProvenance,
}
impl ReviewerBenchHistoryStore {
    /// Synthesizes the history and provenance for `case`.
    ///
    /// Each `review_evidence` entry becomes a `ReviewerEvidence` record with a
    /// generated id and commit id. Each author touch is expanded into
    /// `max(count, 1)` provenance touches, numbered consecutively across all
    /// authors. Touches are ordered newest first, with ties broken by commit id.
    fn from_case(case: &ReviewerBenchCase) -> Self {
        let reviewer_evidence: Vec<_> = case
            .review_evidence
            .iter()
            .enumerate()
            .map(|(index, evidence)| {
                let source = match &evidence.source {
                    Some(source) => source.clone(),
                    None => format!("reviewer-bench:{}", case.id),
                };
                ReviewerEvidence {
                    id: HistoryRecordId::new(format!("reviewer-bench:{}:{index}", case.id)),
                    commit_id: Some(GitCommitId::new(format!(
                        "reviewer-bench-{}-{index}",
                        case.id
                    ))),
                    path: Some(case.path.clone()),
                    reviewer: owner_from_token(&evidence.reviewer),
                    role: evidence.role,
                    observed_at: reviewer_bench_time(evidence.days_ago),
                    source,
                    confidence: evidence.confidence,
                }
            })
            .collect();

        // `enumerate` runs after the expansion so every generated touch gets its
        // own running index.
        let mut touches: Vec<_> = case
            .author_touches
            .iter()
            .flat_map(|touch| (0..touch.count.max(1)).map(move |offset| (touch, offset)))
            .enumerate()
            .map(|(touch_index, (touch, offset))| {
                reviewer_bench_touch(case, touch, touch_index, offset)
            })
            .collect();
        touches.sort_by(|left, right| {
            let newest_first = right.commit.committed_at.cmp(&left.commit.committed_at);
            newest_first.then_with(|| left.commit.id.0.cmp(&right.commit.id.0))
        });

        // After the sort the oldest touch is last and the newest is first.
        let first_seen = touches.last().cloned();
        let last_touched = touches.first().cloned();

        let history = HistorySummary {
            path: case.path.clone(),
            recent_commits: Vec::new(),
            file_touches: Vec::new(),
            symbol_touches: Vec::new(),
            cochange_neighbors: Vec::new(),
            reviewer_evidence,
            truncated: false,
            uncertainty: Vec::new(),
        };
        let provenance = FileProvenance {
            path: case.path.clone(),
            first_seen,
            last_touched,
            recent_touches: touches,
            confidence: Confidence::High,
            truncated: false,
            uncertainty: Vec::new(),
        };
        Self {
            history,
            provenance,
        }
    }
}
impl HistoryStore for ReviewerBenchHistoryStore {
    /// Accepts and discards the snapshot; this store is not modified after construction.
    fn put_history_snapshot(&self, _snapshot: &HistorySnapshot) -> open_kioku_errors::Result<()> {
        Ok(())
    }

    /// Returns the synthesized history for the case path and an empty summary for any other path.
    fn history_for_file(
        &self,
        path: &Path,
        _limit: usize,
    ) -> open_kioku_errors::Result<HistorySummary> {
        let summary = if path == self.history.path {
            self.history.clone()
        } else {
            HistorySummary::empty(path)
        };
        Ok(summary)
    }

    /// Returns the synthesized provenance for the case path; other paths get an
    /// empty, low-confidence record with an explanatory uncertainty note.
    fn provenance_for_path(
        &self,
        path: &Path,
        _limit: usize,
    ) -> open_kioku_errors::Result<FileProvenance> {
        if path == self.provenance.path {
            return Ok(self.provenance.clone());
        }
        let unavailable = FileProvenance {
            path: path.to_path_buf(),
            first_seen: None,
            last_touched: None,
            recent_touches: Vec::new(),
            confidence: Confidence::Low,
            truncated: false,
            uncertainty: vec!["reviewer benchmark provenance unavailable for this path".into()],
        };
        Ok(unavailable)
    }

    /// Always returns a low-confidence placeholder: the benchmark has no symbol-level data.
    fn provenance_for_symbol(
        &self,
        symbol_id: &SymbolId,
        _limit: usize,
    ) -> open_kioku_errors::Result<SymbolProvenance> {
        let placeholder = SymbolProvenance {
            symbol_id: symbol_id.clone(),
            qualified_name: "reviewer_bench::unknown".into(),
            file_path: self.provenance.path.clone(),
            range: None,
            first_seen: None,
            last_touched: None,
            recent_touches: Vec::new(),
            confidence: Confidence::Low,
            truncated: false,
            uncertainty: vec!["reviewer benchmark symbol provenance unavailable".into()],
        };
        Ok(placeholder)
    }

    /// Always returns no co-change neighbours.
    fn cochange_neighbors(
        &self,
        _path: &Path,
        _limit: usize,
    ) -> open_kioku_errors::Result<Vec<GitCochangeEdge>> {
        Ok(Vec::new())
    }

    /// Always returns no recent commits.
    fn recent_commits(&self, _limit: usize) -> open_kioku_errors::Result<Vec<GitCommitRecord>> {
        Ok(Vec::new())
    }
}
fn reviewer_bench_ownership_report(case: &ReviewerBenchCase) -> OwnershipReport {
let generated_at = chrono::Utc::now();
let owners = case
.ownership
.iter()
.enumerate()
.map(|(index, evidence)| {
let owner = owner_from_token(&evidence.owner);
let source_types = if evidence.source_types.is_empty() {
default_reviewer_bench_source_types()
} else {
evidence.source_types.clone()
};
let observed_at = reviewer_bench_time(evidence.days_ago);
let stale = reviewer_bench_is_stale(evidence.days_ago);
let confidence = Confidence::from_score(evidence.score);
let source = evidence
.source
.clone()
.unwrap_or_else(|| format!("reviewer-bench:{}:{index}", case.id));
let ownership_evidence = source_types
.iter()
.map(|source_type| OwnershipEvidence {
source_type: *source_type,
owner: owner.clone(),
source: source.clone(),
message: "reviewer benchmark ownership signal".into(),
confidence,
observed_at: Some(observed_at),
stale,
})
.collect::<Vec<_>>();
OwnerSuggestion {
owner,
rationale: "reviewer benchmark ownership signal".into(),
confidence,
score: evidence.score,
source_types: source_types.clone(),
stale,
evidence: ownership_evidence,
confidence_breakdown: open_kioku_core::OwnershipConfidenceBreakdown {
codeowners: if source_types.contains(&OwnershipSourceType::Codeowners) {
evidence.score
} else {
0.0
},
git_history: if source_types.contains(&OwnershipSourceType::GitHistory) {
evidence.score
} else {
0.0
},
memory: if source_types.contains(&OwnershipSourceType::RepoMemory) {
evidence.score
} else {
0.0
},
freshness: if stale { 0.0 } else { 0.05 },
ambiguity_penalty: 0.0,
final_score: evidence.score,
},
}
})
.collect();
OwnershipReport {
path: case.path.clone(),
components: Vec::new(),
generated_at,
owners,
uncertainty: Vec::new(),
}
}
/// Builds one synthetic provenance touch for `case`.
///
/// `index` makes the generated commit id unique within the case. `offset` moves
/// the timestamp `offset` extra days into the past, so repeated touches by the
/// same author land on different days.
fn reviewer_bench_touch(
    case: &ReviewerBenchCase,
    touch: &ReviewerBenchAuthorTouch,
    index: usize,
    offset: usize,
) -> ProvenanceTouch {
    let author = owner_from_token(&touch.author);
    let observed_at = reviewer_bench_time(touch.days_ago + offset as i64);
    let id = GitCommitId::new(format!("reviewer-bench-{}-touch-{index}", case.id));
    // Summary and message carry the same text.
    let description = format!("reviewer benchmark touch by {}", owner_display(&author));

    let commit = GitCommitRecord {
        id,
        parent_ids: Vec::new(),
        author,
        committer: None,
        authored_at: observed_at,
        committed_at: observed_at,
        summary: description.clone(),
        message: description,
        file_count: 1,
    };
    ProvenanceTouch {
        commit,
        path: case.path.clone(),
        previous_path: None,
        symbol_id: None,
        qualified_name: None,
        change_kind: GitChangeKind::Modified,
        line_ranges: Vec::new(),
        confidence: Confidence::High,
        uncertainty: Vec::new(),
    }
}
/// Returns the default confidence for reviewer benchmark evidence (`Confidence::High`).
// NOTE(review): presumably referenced as a deserialization default — confirm at the use site.
fn default_reviewer_bench_confidence() -> Confidence {
    let default = Confidence::High;
    default
}
/// Returns the source types assumed when an ownership entry lists none:
/// a single `OwnershipSourceType::Codeowners`.
fn default_reviewer_bench_source_types() -> Vec<OwnershipSourceType> {
    let mut source_types = Vec::with_capacity(1);
    source_types.push(OwnershipSourceType::Codeowners);
    source_types
}
/// Returns the default ownership score (`0.90`).
// NOTE(review): presumably referenced as a deserialization default — confirm at the use site.
fn default_reviewer_bench_owner_score() -> f32 {
    const DEFAULT_OWNER_SCORE: f32 = 0.90;
    DEFAULT_OWNER_SCORE
}
/// Returns the default number of touches per author entry (`1`).
// NOTE(review): presumably referenced as a deserialization default — confirm at the use site.
fn default_reviewer_bench_touch_count() -> usize {
    const DEFAULT_TOUCH_COUNT: usize = 1;
    DEFAULT_TOUCH_COUNT
}
/// Returns the UTC timestamp `days_ago` days before now.
///
/// Negative values are clamped to zero, so the result is never in the future.
fn reviewer_bench_time(days_ago: i64) -> chrono::DateTime<chrono::Utc> {
    let now = chrono::Utc::now();
    let age = chrono::Duration::days(days_ago.max(0));
    now - age
}
/// Reports whether evidence observed `days_ago` days ago is stale, i.e. strictly
/// more than 365 days old.
fn reviewer_bench_is_stale(days_ago: i64) -> bool {
    const STALE_AFTER_DAYS: i64 = 365;
    days_ago > STALE_AFTER_DAYS
}
/// Parses an owner token into an `Owner`.
///
/// Accepted shapes, checked in this order:
/// - `Name <email>`: split at the last `<` and the last `>`, provided `<` comes
///   first. An empty name falls back to the email's local part, and an empty
///   email becomes `None`.
/// - anything containing `@`: treated as a bare email, with the name taken from
///   its local part.
/// - anything else: treated as a bare name with no email.
fn owner_from_token(value: &str) -> Owner {
    let token = value.trim();

    let brackets = match (token.rfind('<'), token.rfind('>')) {
        (Some(open), Some(close)) if open < close => Some((open, close)),
        _ => None,
    };
    if let Some((open, close)) = brackets {
        let display_name = token[..open].trim();
        let address = token[open + 1..close].trim();
        let name = if display_name.is_empty() {
            owner_name_from_email(address)
        } else {
            display_name.to_string()
        };
        let email = if address.is_empty() {
            None
        } else {
            Some(address.to_string())
        };
        return Owner { name, email };
    }

    match token.contains('@') {
        true => Owner {
            name: owner_name_from_email(token),
            email: Some(token.to_string()),
        },
        false => Owner {
            name: token.to_string(),
            email: None,
        },
    }
}
/// Returns the part of `email` before the first `@`, or the whole string when it
/// contains no `@`.
fn owner_name_from_email(email: &str) -> String {
    let local_part = match email.split_once('@') {
        Some((local, _domain)) => local,
        None => email,
    };
    local_part.to_string()
}
/// Reports whether `owner` matches the `expected` token.
///
/// `expected` is trimmed and then compared, ignoring ASCII case, against the
/// owner's email (when present) and the owner's name.
fn owner_matches_expected(owner: &Owner, expected: &str) -> bool {
    // The comparison already ignores ASCII case, so no lowercased copy is needed.
    let wanted = expected.trim();
    let email_matches = matches!(
        owner.email.as_deref(),
        Some(email) if email.eq_ignore_ascii_case(wanted)
    );
    email_matches || owner.name.eq_ignore_ascii_case(wanted)
}
/// Returns the owner's email when present, otherwise the owner's name.
fn owner_display(owner: &Owner) -> String {
    match &owner.email {
        Some(email) => email.clone(),
        None => owner.name.clone(),
    }
}
/// Prints a plain-text summary of a reviewer benchmark run to stdout: a headline,
/// one line per case, and the failure list when it is non-empty.
fn print_reviewer_bench_report(report: &ReviewerBenchReport) {
    println!(
        "Reviewer benchmark: {} case(s), accuracy {:.3}, min {:.3}",
        report.case_count, report.accuracy, report.min_accuracy
    );

    report.cases.iter().for_each(|case| {
        println!(
            "  {}: rank={:?} top={:?} availability={:?} score={:?} passed={}",
            case.id,
            case.rank,
            case.actual_top_reviewer,
            case.availability,
            case.top_score,
            case.passed
        );
    });

    if report.failures.is_empty() {
        return;
    }
    println!("Failures:");
    for failure in &report.failures {
        println!("- {failure}");
    }
}
/// Prints a plain-text summary of a similar-history benchmark run to stdout: a
/// headline, one line per case, and the failure list when it is non-empty.
fn print_similar_history_bench_report(report: &SimilarHistoryBenchReport) {
    println!(
        "Similar history benchmark: {} case(s), Top-5 recall {:.3}, min {:.3}",
        report.case_count, report.recall_at_5, report.min_recall_at_5
    );

    report.cases.iter().for_each(|case| {
        println!(
            "  {}: recall_at_5={:.3} expected={:?} actual={:?} passed={}",
            case.id, case.recall_at_5, case.expected_top_5, case.actual_top_5, case.passed
        );
    });

    if report.failures.is_empty() {
        return;
    }
    println!("Failures:");
    for failure in &report.failures {
        println!("- {failure}");
    }
}
/// Runs the workflow benchmark and compares the planner against the search baseline.
///
/// Steps: resolve the repository and cases file, load the cases, index the
/// repository unless `args.no_index` is set, open the store and the search index
/// if one exists, then for each case compute the baseline context paths and the
/// workflow plan and score them. The limit is clamped to `1..=100`.
///
/// Every delta is `workflow - baseline`, except the calibration error, which is
/// `baseline - workflow`.
///
/// # Errors
///
/// Fails when the cases file is empty or unreadable, or when indexing, opening the
/// store, planning, or scoring fails.
fn run_workflow_bench(args: WorkflowBenchArgs) -> anyhow::Result<WorkflowBenchReport> {
    let repo = absolutize(&args.path)?;
    let requested = &args.cases_file;
    let cases_file = if requested.is_relative() {
        repo.join(requested)
    } else {
        requested.clone()
    };

    let cases = load_workflow_bench_cases(&cases_file)?;
    if cases.is_empty() {
        anyhow::bail!(
            "workflow benchmark cases file is empty: {}",
            cases_file.display()
        );
    }

    if !args.no_index {
        index_repo(&repo)?;
    }

    let store = open_store(&repo)?;
    let index_dir = default_index_dir(&repo);
    // The search index is optional: it is only opened when it already exists on disk.
    let search_index = TantivySearchIndex::exists(&index_dir)
        .then(|| TantivySearchIndex::open_or_create(&index_dir))
        .transpose()?;

    let planner = PlanEngine::new(&store as &dyn OkStore)
        .with_search_index(search_index.as_ref().map(|idx| idx as &dyn SearchIndex))
        .with_history_store(Some(&store));
    let verifier = ChangeVerifier::new(&store as &dyn OkStore)
        .with_search_index(search_index.as_ref().map(|idx| idx as &dyn SearchIndex));

    let limit = args.limit.clamp(1, 100);
    let mut reports = Vec::with_capacity(cases.len());
    for case in cases {
        let baseline_paths = baseline_context_paths(&repo, &store, &case.task, limit, &cases_file)?;
        let plan = workflow_plan(&repo, &store, &planner, &case.task, limit, &cases_file)?;
        let scored = score_workflow_case(&repo, &verifier, &case, &plan, &baseline_paths, limit)?;
        reports.push(scored);
    }

    let workflow = summarize_workflow_cases(&reports, false);
    let baseline = summarize_workflow_cases(&reports, true);
    let deltas = WorkflowBenchDeltas {
        context_recall_at_k: workflow.context_recall_at_k - baseline.context_recall_at_k,
        impact_recall_at_k: workflow.impact_recall_at_k - baseline.impact_recall_at_k,
        test_recall_at_k: workflow.test_recall_at_k - baseline.test_recall_at_k,
        boundary_precision: workflow.boundary_precision - baseline.boundary_precision,
        boundary_recall: workflow.boundary_recall - baseline.boundary_recall,
        // Operands are swapped here: baseline minus workflow.
        confidence_calibration_error: baseline.confidence_calibration_error
            - workflow.confidence_calibration_error,
        verification_verdict_accuracy: workflow.verification_verdict_accuracy
            - baseline.verification_verdict_accuracy,
    };

    let case_count = reports.len();
    Ok(WorkflowBenchReport {
        repo,
        cases_file,
        limit,
        case_count,
        baseline,
        workflow,
        deltas,
        cases: reports,
    })
}
/// Reads the workflow benchmark cases stored as a JSON array at `path`.
///
/// # Errors
///
/// Fails when the file cannot be read or parsed, or when any case has a blank
/// `id` or `task`.
fn load_workflow_bench_cases(path: &Path) -> anyhow::Result<Vec<WorkflowBenchCase>> {
    let contents = fs::read_to_string(path)?;
    let cases: Vec<WorkflowBenchCase> = serde_json::from_str(&contents)?;
    let has_blank_field = cases
        .iter()
        .any(|case| case.id.trim().is_empty() || case.task.trim().is_empty());
    if has_blank_field {
        anyhow::bail!("workflow benchmark cases require non-empty id and task");
    }
    Ok(cases)
}
/// Computes the baseline context paths for `task`.
///
/// Raw search results are fetched, benchmark artifacts are removed, and the rest
/// are reranked with the baseline ranking and reduced to at most `limit` unique
/// paths.
fn baseline_context_paths(
    repo: &Path,
    store: &dyn MetadataStore,
    task: &str,
    limit: usize,
    cases_file: &Path,
) -> anyhow::Result<Vec<PathBuf>> {
    let candidate_limit = ranking_candidate_limit(limit);
    let mut candidates = search_raw(repo, store, task, candidate_limit)?;
    filter_workflow_benchmark_artifacts(&mut candidates, repo, cases_file);

    let ranked = top_unique_paths(rerank_baseline(candidates), limit);
    let paths = ranked.into_iter().map(|result| result.path).collect();
    Ok(paths)
}
/// Builds the plan for `task` from a context pack with benchmark artifacts removed.
///
/// Artifacts are dropped from both the primary and the supporting files before the
/// planner sees the context.
fn workflow_plan(
    repo: &Path,
    store: &SqliteStore,
    planner: &PlanEngine,
    task: &str,
    limit: usize,
    cases_file: &Path,
) -> anyhow::Result<PlanReport> {
    let mut context = build_context_pack(repo, store, task, limit)?;
    for files in [&mut context.primary_files, &mut context.supporting_files] {
        files.retain(|result| !is_workflow_benchmark_artifact(&result.path, repo, cases_file));
    }
    let plan = planner.plan_from_context(task, limit, context)?;
    Ok(plan)
}
/// Removes, in place, every search result whose path is a workflow benchmark artifact.
fn filter_workflow_benchmark_artifacts(
    results: &mut Vec<open_kioku_core::SearchResult>,
    repo: &Path,
    cases_file: &Path,
) {
    results.retain(|result| {
        let is_artifact = is_workflow_benchmark_artifact(&result.path, repo, cases_file);
        !is_artifact
    });
}
/// Reports whether `path` belongs to the benchmark itself and must not count as a result.
///
/// After normalization, a path is an artifact when it starts with `benchmarks/`
/// or equals the cases file. The cases file is taken relative to `repo` when it
/// lies inside it, and used as given otherwise.
fn is_workflow_benchmark_artifact(path: &Path, repo: &Path, cases_file: &Path) -> bool {
    let candidate = normalize_path_fragment(&path.to_string_lossy());
    if candidate.starts_with("benchmarks/") {
        return true;
    }
    let relative_cases = cases_file.strip_prefix(repo).unwrap_or(cases_file);
    candidate == normalize_path_fragment(&relative_cases.to_string_lossy())
}
/// Scores a single workflow benchmark case against its plan and the baseline paths.
///
/// Collects the plan's context, impact, test, and boundary outputs, matches them
/// against the case's expectations, computes the confidence calibration error, and
/// optionally runs the change verifier to compare verdicts.
fn score_workflow_case(
repo: &Path,
verifier: &ChangeVerifier,
case: &WorkflowBenchCase,
plan: &PlanReport,
baseline_paths: &[PathBuf],
limit: usize,
) -> anyhow::Result<WorkflowBenchCaseReport> {
// Top-`limit` primary context paths proposed by the plan.
let context_paths = plan
.primary_context
.iter()
.take(limit)
.map(|result| result.path.clone())
.collect::<Vec<_>>();
// Direct impacts come first, then indirect ones; the combined list is cut at `limit`.
let impact_paths = plan
.impact
.direct_impacts
.iter()
.chain(plan.impact.indirect_impacts.iter())
.take(limit)
.map(|result| result.path.clone())
.collect::<Vec<_>>();
let test_names = plan
.validation
.iter()
.take(limit)
.map(|test| test.name.clone())
.collect::<Vec<_>>();
// The boundary is the union of allowed and caution files; it is not truncated to `limit`.
let boundary_paths = plan
.recommended_change_boundary
.allowed_files
.iter()
.chain(plan.recommended_change_boundary.caution_files.iter())
.cloned()
.collect::<Vec<_>>();
let context_hits = matching_expected_values(&case.expected_primary_context, &context_paths);
let impact_hits = matching_expected_values(&case.expected_impact, &impact_paths);
let test_hits = matching_expected_strings(&case.expected_tests, &test_names);
let boundary_hits = matching_expected_values(&case.expected_boundary, &boundary_paths);
let forbidden_boundary_hits = matching_expected_values(&case.forbidden_paths, &boundary_paths);
let baseline_context_hits =
matching_expected_values(&case.expected_primary_context, baseline_paths);
// Without an explicit `expected_confidence`, success is expected unless the expected verdict is Fail.
let expected_success = case.expected_confidence.unwrap_or_else(|| {
!case
.expected_verdict
.is_some_and(|verdict| verdict == VerificationVerdict::Fail)
});
let confidence_probability = plan_success_probability(plan);
// Absolute distance between the predicted probability and the 0/1 expected outcome.
let confidence_calibration_error =
Some((confidence_probability - if expected_success { 1.0 } else { 0.0 }).abs());
// The verifier runs only when the case declares an expected verdict AND supplies a change
// (changed files or a unified diff).
let verification = if case.expected_verdict.is_some()
&& (!case.changed_files.is_empty() || case.unified_diff.is_some())
{
Some(verifier.verify(
repo,
plan,
VerifyChangeInput {
changed_files: case.changed_files.clone(),
unified_diff: case.unified_diff.clone(),
evidence_refs: Vec::new(),
run_commands: false,
write_attestation: false,
validation_attestations: Vec::new(),
traceability_strict: false,
check_api_surface: false,
check_dependency_delta: false,
architecture_policy: None,
suppress_plan_validation_pending: false,
},
)?)
} else {
None
};
let actual_verdict = verification.as_ref().map(|report| report.verdict);
// `None` when either side is missing, so unverified cases are excluded from accuracy.
let verification_correct = case
.expected_verdict
.zip(actual_verdict)
.map(|(expected, actual)| expected == actual);
Ok(WorkflowBenchCaseReport {
id: case.id.clone(),
task: case.task.clone(),
context_recall: ratio(context_hits.len(), case.expected_primary_context.len()),
impact_recall: ratio(impact_hits.len(), case.expected_impact.len()),
test_recall: ratio(test_hits.len(), case.expected_tests.len()),
boundary_precision: boundary_precision(&boundary_paths, &case.forbidden_paths),
boundary_recall: ratio(boundary_hits.len(), case.expected_boundary.len()),
confidence_expected_success: Some(expected_success),
confidence_probability,
confidence_calibration_error,
expected_verdict: case.expected_verdict,
actual_verdict,
verification_correct,
baseline_context_recall: ratio(
baseline_context_hits.len(),
case.expected_primary_context.len(),
),
// The baseline produces no impact or test predictions, so these recalls are fixed at zero.
baseline_impact_recall: 0.0,
baseline_test_recall: 0.0,
context_hits,
impact_hits,
test_hits,
boundary_hits,
forbidden_boundary_hits,
top_context_paths: context_paths,
top_impact_paths: impact_paths,
top_tests: test_names,
})
}
/// Averages per-case metrics into a summary for either the workflow or the baseline.
///
/// With `baseline == true` the three recall metrics are averaged from the
/// `baseline_*` fields and the remaining metrics are fixed: boundary precision
/// and recall and verification accuracy are `0.0`, calibration error is `1.0`.
/// Otherwise every metric is averaged from the workflow fields. Calibration error
/// and verification accuracy are averaged only over cases that report them.
fn summarize_workflow_cases(
    reports: &[WorkflowBenchCaseReport],
    baseline: bool,
) -> WorkflowBenchSummary {
    /// Mean of `metric` over all reports, using `mean`'s empty-input convention.
    fn average(
        reports: &[WorkflowBenchCaseReport],
        metric: impl Fn(&WorkflowBenchCaseReport) -> f64,
    ) -> f64 {
        mean(
            reports.iter().map(metric).sum::<f64>(),
            reports.len() as f64,
        )
    }

    if baseline {
        return WorkflowBenchSummary {
            context_recall_at_k: average(reports, |case| case.baseline_context_recall),
            impact_recall_at_k: average(reports, |case| case.baseline_impact_recall),
            test_recall_at_k: average(reports, |case| case.baseline_test_recall),
            boundary_precision: 0.0,
            boundary_recall: 0.0,
            confidence_calibration_error: 1.0,
            verification_verdict_accuracy: 0.0,
        };
    }

    let verdicts: Vec<bool> = reports
        .iter()
        .filter_map(|case| case.verification_correct)
        .collect();
    let correct_verdicts = verdicts.iter().filter(|correct| **correct).count();
    let calibration_errors: Vec<f64> = reports
        .iter()
        .filter_map(|case| case.confidence_calibration_error)
        .collect();

    WorkflowBenchSummary {
        context_recall_at_k: average(reports, |case| case.context_recall),
        impact_recall_at_k: average(reports, |case| case.impact_recall),
        test_recall_at_k: average(reports, |case| case.test_recall),
        boundary_precision: average(reports, |case| case.boundary_precision),
        boundary_recall: average(reports, |case| case.boundary_recall),
        confidence_calibration_error: mean(
            calibration_errors.iter().sum::<f64>(),
            calibration_errors.len() as f64,
        ),
        verification_verdict_accuracy: mean(correct_verdicts as f64, verdicts.len() as f64),
    }
}
/// Returns the share of `selected` boundary paths that do not match a forbidden path.
///
/// An empty selection has precision `1.0`.
fn boundary_precision(selected: &[PathBuf], forbidden: &[String]) -> f64 {
    let total = selected.len();
    if total == 0 {
        return 1.0;
    }
    let forbidden_hits = matching_expected_values(forbidden, selected).len();
    let allowed = total.saturating_sub(forbidden_hits);
    allowed as f64 / total as f64
}
/// Maps the plan's risk level to an assumed probability of success.
///
/// `low` → 0.85, `medium` → 0.6, `high` → 0.3, `critical` → 0.1; any other level
/// maps to 0.5.
fn plan_success_probability(plan: &PlanReport) -> f64 {
    let level = plan.risk.level.as_str();
    match level {
        "critical" => 0.1,
        "high" => 0.3,
        "medium" => 0.6,
        "low" => 0.85,
        _ => 0.5,
    }
}
/// Returns `sum / count`, or `1.0` when `count` is zero.
fn mean(sum: f64, count: f64) -> f64 {
    if count != 0.0 {
        return sum / count;
    }
    1.0
}
/// Prints a plain-text summary of a workflow benchmark run to stdout: a headline,
/// the workflow metrics, the deltas against the baseline, and one line per case.
/// A case that was not verified shows `-` as its verdict.
fn print_workflow_bench_report(report: &WorkflowBenchReport) {
    println!(
        "Workflow benchmark: {} case(s), limit {}",
        report.case_count, report.limit
    );

    let workflow = &report.workflow;
    println!(
        "Workflow: context recall {:.3}, impact recall {:.3}, test recall {:.3}, boundary precision {:.3}, boundary recall {:.3}, calibration error {:.3}, verification accuracy {:.3}",
        workflow.context_recall_at_k,
        workflow.impact_recall_at_k,
        workflow.test_recall_at_k,
        workflow.boundary_precision,
        workflow.boundary_recall,
        workflow.confidence_calibration_error,
        workflow.verification_verdict_accuracy
    );

    let deltas = &report.deltas;
    println!(
        "Deltas vs baseline: context {:+.3}, impact {:+.3}, test {:+.3}, boundary precision {:+.3}, boundary recall {:+.3}, calibration {:+.3}, verification {:+.3}",
        deltas.context_recall_at_k,
        deltas.impact_recall_at_k,
        deltas.test_recall_at_k,
        deltas.boundary_precision,
        deltas.boundary_recall,
        deltas.confidence_calibration_error,
        deltas.verification_verdict_accuracy
    );

    for case in &report.cases {
        let verdict = match case.actual_verdict {
            Some(verdict) => format!("{verdict:?}"),
            None => String::from("-"),
        };
        println!(
            "  {}: context {:.3}, impact {:.3}, tests {:.3}, boundary {:.3}/{:.3}, verdict {}",
            case.id,
            case.context_recall,
            case.impact_recall,
            case.test_recall,
            case.boundary_precision,
            case.boundary_recall,
            verdict
        );
    }
}
/// Rule families that a contract benchmark cases file must cover at least once;
/// `validate_contract_bench_coverage` rejects a case set that misses any of them.
const REQUIRED_CONTRACT_BENCH_RULE_FAMILIES: [ContractBenchRuleFamily; 7] = [
ContractBenchRuleFamily::AllowedEdit,
ContractBenchRuleFamily::ForbiddenEdit,
ContractBenchRuleFamily::MissingTests,
ContractBenchRuleFamily::ArchitectureViolation,
ContractBenchRuleFamily::DependencyDelta,
ContractBenchRuleFamily::ApiSurfaceDelta,
ContractBenchRuleFamily::ExplanationQuality,
];
/// Runs the contract benchmark: for every case, generates a change contract in a
/// throwaway copy of the repository, applies the case's edits, verifies them against
/// the contract, and scores the outcome.
///
/// Fails when the cases cannot be loaded, required rule-family coverage is missing,
/// or any per-case step (copy, index, plan, contract build, edit, verify, score) fails.
fn run_contract_bench(args: ContractBenchArgs) -> anyhow::Result<ContractBenchReport> {
let repo = absolutize(&args.path)?;
let cases_file = resolve_contract_bench_cases_file(&repo, &args.cases_file)?;
let cases = load_contract_bench_cases(&cases_file)?;
validate_contract_bench_coverage(&cases)?;
let limit = args.limit.clamp(1, 100);
let mut reports = Vec::with_capacity(cases.len());
for case in cases {
// Each case works on its own temporary copy; the copy is removed when `temp_repo` is dropped
// at the end of the iteration.
let temp_repo = prepare_contract_bench_repo(&repo, &case.id)?;
if !args.no_index {
index_repo(&temp_repo.path)?;
}
let store = open_store(&temp_repo.path)?;
let index_dir = default_index_dir(&temp_repo.path);
// The search index is optional: it is only opened when it already exists on disk.
let search_index = if TantivySearchIndex::exists(&index_dir) {
Some(TantivySearchIndex::open_or_create(&index_dir)?)
} else {
None
};
let planner = PlanEngine::new(&store as &dyn OkStore)
.with_search_index(search_index.as_ref().map(|idx| idx as &dyn SearchIndex))
.with_history_store(Some(&store));
// Generation time covers planning, contract construction, and the overlay merge.
let generation_started = Instant::now();
let plan = workflow_plan(
&temp_repo.path,
&store,
&planner,
&case.task,
limit,
&cases_file,
)?;
let mut contract = ContractBuilder::from_plan(&plan)?;
apply_contract_bench_overlay(&mut contract, &case.contract_overlay)?;
let generation_ms = duration_ms(generation_started.elapsed());
// Edits are applied after the contract is built and outside both timed sections.
apply_contract_bench_edits(&temp_repo.path, &case)?;
let verifier = ContractVerifier::new(&store as &dyn OkStore)
.with_search_index(search_index.as_ref().map(|idx| idx as &dyn SearchIndex));
// Verification time covers the `verify` call, including evaluation of its input
// (which loads the architecture policy).
let verification_started = Instant::now();
let verification = verifier.verify(
&temp_repo.path,
&contract,
VerifyChangeInput {
changed_files: contract_bench_changed_files(&case),
unified_diff: case.unified_diff.clone(),
evidence_refs: Vec::new(),
run_commands: false,
write_attestation: false,
validation_attestations: Vec::new(),
traceability_strict: case.traceability_strict,
check_api_surface: case.check_api_surface,
check_dependency_delta: case.check_dependency_delta,
architecture_policy: load_architecture_policy(&temp_repo.path)?,
suppress_plan_validation_pending: false,
},
)?;
let verification_ms = duration_ms(verification_started.elapsed());
reports.push(score_contract_bench_case(
&case,
&contract,
&verification,
generation_ms,
verification_ms,
)?);
}
let summary = summarize_contract_bench_cases(&reports);
let rule_families = summarize_contract_bench_families(&reports);
// Failures are reported by case id only.
let failures = reports
.iter()
.filter(|case| !case.passed)
.map(|case| case.id.clone())
.collect::<Vec<_>>();
Ok(ContractBenchReport {
repo,
cases_file,
limit,
case_count: reports.len(),
summary,
rule_families,
failures,
cases: reports,
})
}
/// Resolves the contract benchmark cases file.
///
/// An absolute path is returned unchanged. A relative path is resolved against
/// `repo` when the resulting file exists, and otherwise passed to `absolutize`.
fn resolve_contract_bench_cases_file(repo: &Path, cases_file: &Path) -> anyhow::Result<PathBuf> {
    if cases_file.is_absolute() {
        return Ok(cases_file.to_path_buf());
    }
    let repo_relative = repo.join(cases_file);
    match repo_relative.exists() {
        true => Ok(repo_relative),
        false => absolutize(cases_file),
    }
}
/// Reads and validates the contract benchmark cases stored as a JSON array at `path`.
///
/// # Errors
///
/// Fails when the file cannot be read or parsed, when a case has a blank `id` or
/// `task`, when two cases share the same `id`, or when a case provides none of
/// `changed_files`, `unified_diff`, and `edits`.
fn load_contract_bench_cases(path: &Path) -> anyhow::Result<Vec<ContractBenchCase>> {
    let contents = fs::read_to_string(path)?;
    let cases: Vec<ContractBenchCase> = serde_json::from_str(&contents)?;

    let mut seen_ids = BTreeSet::new();
    for case in &cases {
        let blank_id = case.id.trim().is_empty();
        let blank_task = case.task.trim().is_empty();
        if blank_id || blank_task {
            anyhow::bail!("contract benchmark cases require non-empty id and task");
        }
        let first_occurrence = seen_ids.insert(case.id.clone());
        if !first_occurrence {
            anyhow::bail!("duplicate contract benchmark case id `{}`", case.id);
        }
        let describes_change = !case.changed_files.is_empty()
            || case.unified_diff.is_some()
            || !case.edits.is_empty();
        if !describes_change {
            anyhow::bail!(
                "contract benchmark case `{}` requires changed_files, unified_diff, or edits",
                case.id
            );
        }
    }
    Ok(cases)
}
/// Checks that `cases` cover every rule family in
/// `REQUIRED_CONTRACT_BENCH_RULE_FAMILIES`.
///
/// # Errors
///
/// Fails with a comma-separated list of the missing families, in the order they
/// appear in the required list.
fn validate_contract_bench_coverage(cases: &[ContractBenchCase]) -> anyhow::Result<()> {
    let covered: BTreeSet<_> = cases.iter().map(|case| case.rule_family).collect();

    let mut missing = Vec::new();
    for family in REQUIRED_CONTRACT_BENCH_RULE_FAMILIES.iter().copied() {
        if !covered.contains(&family) {
            missing.push(format!("{family:?}"));
        }
    }

    if missing.is_empty() {
        return Ok(());
    }
    anyhow::bail!(
        "contract benchmark cases missing required rule family coverage: {}",
        missing.join(", ")
    );
}
/// Handle to a temporary copy of a repository used by one contract benchmark case;
/// its `Drop` impl removes the directory.
struct ContractBenchTempRepo {
/// Root of the temporary copy.
path: PathBuf,
}
impl Drop for ContractBenchTempRepo {
    /// Removes the temporary directory tree. Cleanup is best-effort: a removal
    /// failure is ignored.
    fn drop(&mut self) {
        fs::remove_dir_all(&self.path).ok();
    }
}
/// Copies `repo` into a fresh directory under the system temp dir for one case.
///
/// The directory name combines the process id, a timestamp, and a sanitized form
/// of `case_id`. The returned guard removes the directory when dropped.
///
/// # Errors
///
/// Fails when the copy fails. The cleanup guard is created before copying, so a
/// partially populated directory is removed on failure instead of being leaked.
fn prepare_contract_bench_repo(
    repo: &Path,
    case_id: &str,
) -> anyhow::Result<ContractBenchTempRepo> {
    // Nanosecond resolution when representable, otherwise microseconds.
    let stamp = chrono::Utc::now()
        .timestamp_nanos_opt()
        .unwrap_or_else(|| chrono::Utc::now().timestamp_micros());
    let path = std::env::temp_dir().join(format!(
        "open-kioku-contract-bench-{}-{}-{}",
        std::process::id(),
        stamp,
        sanitize_temp_path_fragment(case_id)
    ));
    // Build the guard first: if the copy returns early via `?`, dropping the guard
    // removes whatever was already written.
    let temp_repo = ContractBenchTempRepo { path };
    copy_contract_bench_repo(repo, &temp_repo.path)?;
    Ok(temp_repo)
}
/// Makes `value` safe to embed in a temp directory name.
///
/// ASCII letters, digits, `-`, and `_` are kept; every other character (including
/// each non-ASCII character) is replaced by a single `-`.
fn sanitize_temp_path_fragment(value: &str) -> String {
    let mut sanitized = String::with_capacity(value.len());
    for ch in value.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_');
        sanitized.push(if keep { ch } else { '-' });
    }
    sanitized
}
/// Recursively copies `source` into `destination`, leaving out excluded directories.
///
/// An existing `destination` is removed first. Any entry whose path relative to
/// `source` is rejected by `should_skip_contract_bench_copy` is excluded, and
/// excluded directories are pruned from the walk rather than traversed and
/// discarded entry by entry. Only directories and regular files are copied; other
/// entry types are ignored.
///
/// # Errors
///
/// Fails on any I/O error while clearing or creating `destination`, walking the
/// copied part of `source`, or copying a file.
fn copy_contract_bench_repo(source: &Path, destination: &Path) -> anyhow::Result<()> {
    if destination.exists() {
        fs::remove_dir_all(destination)?;
    }
    fs::create_dir_all(destination)?;

    // `filter_entry` stops the walker from descending into a rejected directory.
    // This avoids walking large trees such as `target` or `.git`, and unreadable
    // entries inside them can no longer abort the copy. An entry whose prefix
    // cannot be stripped is let through so the loop below reports the error.
    let walker = walkdir::WalkDir::new(source)
        .into_iter()
        .filter_entry(|entry| {
            entry
                .path()
                .strip_prefix(source)
                .map_or(true, |relative| !should_skip_contract_bench_copy(relative))
        });

    for entry in walker {
        let entry = entry?;
        let relative = entry.path().strip_prefix(source)?;
        // The walk yields the root itself first; `destination` already exists.
        if relative.as_os_str().is_empty() {
            continue;
        }
        let target = destination.join(relative);
        let file_type = entry.file_type();
        if file_type.is_dir() {
            fs::create_dir_all(&target)?;
        } else if file_type.is_file() {
            if let Some(parent) = target.parent() {
                fs::create_dir_all(parent)?;
            }
            fs::copy(entry.path(), target)?;
        }
    }
    Ok(())
}
/// Returns `true` when any component of `path` is named `.git`, `.ok` or
/// `target`.
fn should_skip_contract_bench_copy(path: &Path) -> bool {
    for component in path.components() {
        // A component that is not valid UTF-8 cannot equal one of the names.
        let name = component.as_os_str().to_str();
        if matches!(name, Some(".git") | Some(".ok") | Some("target")) {
            return true;
        }
    }
    false
}
/// Layers a benchmark overlay on top of a generated contract.
///
/// Overlay files are merged into the matching primary, secondary and
/// forbidden lists. Anything the overlay forbids is then taken out of the
/// primary and secondary lists. API-surface and dependency-delta constraints
/// from the overlay are appended, and the result is validated.
///
/// # Errors
///
/// Fails when the combined contract does not pass `validate`.
fn apply_contract_bench_overlay(
    contract: &mut ChangeContractV1,
    overlay: &ContractBenchContractOverlay,
) -> anyhow::Result<()> {
    // Allowed lists: add the overlay's entries, then drop what it forbids.
    merge_contract_files(&mut contract.primary_files, &overlay.primary_files);
    remove_contract_files(&mut contract.primary_files, &overlay.forbidden_files);

    merge_contract_files(&mut contract.secondary_files, &overlay.secondary_files);
    remove_contract_files(&mut contract.secondary_files, &overlay.forbidden_files);

    merge_contract_files(&mut contract.forbidden_files, &overlay.forbidden_files);

    let extra_api_constraints = overlay.api_surface_constraints.clone();
    contract.api_surface_constraints.extend(extra_api_constraints);
    let extra_dependency_constraints = overlay.dependency_delta_constraints.clone();
    contract
        .dependency_delta_constraints
        .extend(extra_dependency_constraints);

    match contract.validate() {
        Ok(()) => Ok(()),
        Err(err) => Err(anyhow::anyhow!(
            "contract benchmark overlay produced invalid contract: {err}"
        )),
    }
}
/// Appends each entry of `additions` to `target` unless an entry with the
/// same normalized path is already present.
fn merge_contract_files(target: &mut Vec<ContractFile>, additions: &[ContractFile]) {
    for candidate in additions {
        let already_listed = target
            .iter()
            .any(|existing| same_normalized_contract_file(existing, candidate));
        if already_listed {
            continue;
        }
        target.push(candidate.clone());
    }
}
/// Drops from `target` every entry whose normalized path equals that of some
/// entry in `removals`.
fn remove_contract_files(target: &mut Vec<ContractFile>, removals: &[ContractFile]) {
    target.retain(|candidate| {
        // Keep the candidate only when no removal refers to the same path.
        removals
            .iter()
            .all(|removal| !same_normalized_contract_file(candidate, removal))
    });
}
/// Compares two contract files by the output of `normalize_path_fragment`
/// applied to their string form.
fn same_normalized_contract_file(left: &ContractFile, right: &ContractFile) -> bool {
    let left_key = normalize_path_fragment(left.as_str());
    let right_key = normalize_path_fragment(right.as_str());
    left_key == right_key
}
/// Writes every edit of `case` into `repo`, creating parent directories as
/// needed and replacing the whole file content.
///
/// NOTE(review): `edit.path` is joined onto `repo` without any check here, so
/// an absolute path or one containing `..` would be written outside `repo` —
/// confirm that case files are validated or trusted upstream.
///
/// # Errors
///
/// Stops at and returns the first I/O error.
fn apply_contract_bench_edits(repo: &Path, case: &ContractBenchCase) -> anyhow::Result<()> {
    case.edits.iter().try_for_each(|edit| -> anyhow::Result<()> {
        let destination = repo.join(&edit.path);
        if let Some(parent) = destination.parent() {
            fs::create_dir_all(parent)?;
        }
        fs::write(&destination, &edit.content)?;
        Ok(())
    })
}
/// Returns the files a case claims to change.
///
/// The explicit `changed_files` list wins when it is non-empty; otherwise the
/// paths of the case's edits are used.
fn contract_bench_changed_files(case: &ContractBenchCase) -> Vec<PathBuf> {
    if case.changed_files.is_empty() {
        let mut edited = Vec::with_capacity(case.edits.len());
        for edit in &case.edits {
            edited.push(edit.path.clone());
        }
        edited
    } else {
        case.changed_files.clone()
    }
}
/// Scores one benchmark case against the contract generated for it and the
/// verification report produced for its change.
///
/// The report records which expected paths, findings and explanation terms
/// were matched or missed, whether minimum counts were met, how the TOON
/// rendering compares in size to pretty JSON, boundary precision/recall, and
/// whether the verdict matched. `generation_ms` and `verification_ms` are
/// passed through unchanged.
///
/// A case passes only when the verdict is correct, nothing expected is
/// missing, and no expected-forbidden path shows up in the allowed boundary.
///
/// # Errors
///
/// Fails only when the contract cannot be serialized to pretty JSON.
fn score_contract_bench_case(
    case: &ContractBenchCase,
    contract: &ChangeContractV1,
    verification: &ContractVerificationReport,
    generation_ms: f64,
    verification_ms: f64,
) -> anyhow::Result<ContractBenchCaseReport> {
    let expected = &case.expected_contract;

    // Path sets declared by the generated contract.
    let primary_paths = contract_pathbufs(&contract.primary_files);
    let allowed_paths = contract_allowed_boundary_paths(contract);
    let forbidden_paths = contract_pathbufs(&contract.forbidden_files);

    // Expected values that the contract actually contains.
    let primary_file_hits = matching_expected_values(&expected.primary_files, &primary_paths);
    let boundary_hits = matching_expected_values(&expected.allowed_boundary, &allowed_paths);
    // Expected-forbidden paths that leaked into the allowed boundary.
    let forbidden_boundary_hits =
        matching_expected_values(&expected.forbidden_paths, &allowed_paths);
    let forbidden_contract_hits =
        matching_expected_values(&expected.forbidden_paths, &forbidden_paths);

    let mut missing_contract_fields = Vec::new();
    push_missing_expected_values(
        &mut missing_contract_fields,
        "primary_files",
        &expected.primary_files,
        &primary_file_hits,
    );
    push_missing_expected_values(
        &mut missing_contract_fields,
        "allowed_boundary",
        &expected.allowed_boundary,
        &boundary_hits,
    );
    push_missing_expected_values(
        &mut missing_contract_fields,
        "forbidden_files",
        &expected.forbidden_paths,
        &forbidden_contract_hits,
    );

    // Minimum-count expectations, reported in a fixed order.
    let required_tests = contract.required_tests.len();
    if expected.min_required_tests > required_tests {
        missing_contract_fields.push(format!(
            "required_tests: expected at least {}, got {}",
            expected.min_required_tests, required_tests
        ));
    }
    let traceability = contract.traceability.len();
    if expected.min_traceability > traceability {
        missing_contract_fields.push(format!(
            "traceability: expected at least {}, got {}",
            expected.min_traceability, traceability
        ));
    }
    let architecture_constraints = contract.architecture_constraints.len();
    if expected.min_architecture_constraints > architecture_constraints {
        missing_contract_fields.push(format!(
            "architecture_constraints: expected at least {}, got {}",
            expected.min_architecture_constraints, architecture_constraints
        ));
    }
    let evidence_refs = contract.evidence_refs.len();
    if expected.min_evidence_refs > evidence_refs {
        missing_contract_fields.push(format!(
            "evidence_refs: expected at least {}, got {}",
            expected.min_evidence_refs, evidence_refs
        ));
    }

    // Findings reported by verification versus those the case expects.
    let reported_findings = contract_bench_finding_keys(verification);
    let finding_hits = matching_expected_strings(&case.expected_findings, &reported_findings);
    let mut missing_findings = Vec::new();
    push_missing_expected_values(
        &mut missing_findings,
        "findings",
        &case.expected_findings,
        &finding_hits,
    );

    // Explanation terms are matched ASCII-case-insensitively against the
    // rendered markdown explanation.
    let explanation =
        render_contract_explain_markdown(&explain_contract(contract)).to_ascii_lowercase();
    let mut explanation_hits = Vec::new();
    for term in &case.explanation_terms {
        if explanation.contains(&term.to_ascii_lowercase()) {
            explanation_hits.push(term.clone());
        }
    }
    let mut missing_explanation_terms = Vec::new();
    push_missing_expected_values(
        &mut missing_explanation_terms,
        "explanation_terms",
        &case.explanation_terms,
        &explanation_hits,
    );

    // Size of the TOON rendering relative to pretty-printed JSON.
    let pretty_json_bytes = serde_json::to_string_pretty(contract)?.len();
    let toon_bytes = render_contract_toon(contract).len();
    let toon_reduction = if pretty_json_bytes > 0 {
        1.0 - (toon_bytes as f64 / pretty_json_bytes as f64)
    } else {
        0.0
    };

    let actual_verdict = verification.change_report.verdict;
    let verdict_correct = actual_verdict == case.expected_verdict;
    let precision = boundary_precision(&allowed_paths, &expected.forbidden_paths);
    let recall = ratio(boundary_hits.len(), expected.allowed_boundary.len());

    let nothing_missing = missing_contract_fields.is_empty()
        && missing_findings.is_empty()
        && missing_explanation_terms.is_empty();
    let passed = verdict_correct && nothing_missing && forbidden_boundary_hits.is_empty();

    Ok(ContractBenchCaseReport {
        id: case.id.clone(),
        rule_family: case.rule_family,
        task: case.task.clone(),
        contract_id: contract.id.0.clone(),
        expected_verdict: case.expected_verdict,
        actual_verdict,
        verdict_correct,
        boundary_precision: precision,
        boundary_recall: recall,
        primary_file_hits,
        boundary_hits,
        forbidden_boundary_hits,
        missing_contract_fields,
        finding_hits,
        missing_findings,
        explanation_hits,
        missing_explanation_terms,
        pretty_json_bytes,
        toon_bytes,
        toon_reduction,
        generation_ms,
        verification_ms,
        passed,
    })
}
/// Appends one `"<field>: missing `<value>`"` message to `target` for every
/// entry of `expected` that does not appear verbatim in `hits`.
///
/// Messages follow the order of `expected`.
fn push_missing_expected_values(
    target: &mut Vec<String>,
    field: &str,
    expected: &[String],
    hits: &[String],
) {
    let unmatched = expected.iter().filter(|value| !hits.contains(*value));
    target.extend(unmatched.map(|value| format!("{field}: missing `{value}`")));
}
/// Converts contract files into owned paths built from their string form,
/// preserving order.
fn contract_pathbufs(files: &[ContractFile]) -> Vec<PathBuf> {
    let mut paths = Vec::with_capacity(files.len());
    for file in files {
        paths.push(PathBuf::from(file.as_str()));
    }
    paths
}
/// Returns the contract's allowed boundary: its primary files followed by its
/// secondary files, as paths.
fn contract_allowed_boundary_paths(contract: &ChangeContractV1) -> Vec<PathBuf> {
    let primary = contract.primary_files.iter();
    let secondary = contract.secondary_files.iter();
    let mut boundary = Vec::new();
    for file in primary.chain(secondary) {
        boundary.push(PathBuf::from(file.as_str()));
    }
    boundary
}
fn contract_bench_finding_keys(report: &ContractVerificationReport) -> Vec<String> {
let mut keys = Vec::new();
keys.extend(
report
.change_report
.boundary_violations
.iter()
.map(|finding| finding.kind.clone()),
);
keys.extend(
report
.change_report
.warnings
.iter()
.map(|finding| finding.kind.clone()),
);
keys.extend(
report
.change_report
.missing_tests
.iter()
.map(|finding| finding.kind.clone()),
);
keys.extend(
report
.change_report
.changed_impact
.iter()
.map(|finding| finding.kind.clone()),
);
keys.extend(
report
.change_report
.api_surface_deltas
.iter()
.map(|finding| finding.kind.clone()),
);
keys.extend(
report
.change_report
.dependency_deltas
.iter()
.map(|finding| {
format!("dependency_delta:{:?}", finding.classification).to_ascii_lowercase()
}),
);
keys.sort();
keys.dedup();
keys
}
/// Aggregates per-case reports into overall benchmark metrics.
///
/// A case counts as "positive" when its verdict is anything other than
/// `VerificationVerdict::Pass`; true/false positives and false negatives are
/// tallied by comparing expected and actual verdicts on that basis. Averages
/// are delegated to `mean`, and the minimum TOON reduction starts from `1.0`.
fn summarize_contract_bench_cases(reports: &[ContractBenchCaseReport]) -> ContractBenchSummary {
    /// Sums one floating-point metric over all reports, in order.
    fn sum_metric(
        reports: &[ContractBenchCaseReport],
        metric: impl Fn(&ContractBenchCaseReport) -> f64,
    ) -> f64 {
        reports.iter().map(metric).sum::<f64>()
    }

    let count = reports.len() as f64;
    let correct_verdicts = reports.iter().filter(|case| case.verdict_correct).count() as f64;

    let mut true_positives = 0;
    let mut false_positives = 0;
    let mut false_negatives = 0;
    let mut min_toon_reduction = 1.0;
    for case in reports {
        let should_flag = case.expected_verdict != VerificationVerdict::Pass;
        let flagged = case.actual_verdict != VerificationVerdict::Pass;
        if should_flag && flagged {
            true_positives += 1;
        } else if flagged {
            false_positives += 1;
        } else if should_flag {
            false_negatives += 1;
        }
        min_toon_reduction = f64::min(min_toon_reduction, case.toon_reduction);
    }

    let flagged_total = (true_positives + false_positives) as f64;
    ContractBenchSummary {
        verdict_accuracy: mean(correct_verdicts, count),
        verification_precision: mean(true_positives as f64, flagged_total),
        boundary_precision: mean(sum_metric(reports, |case| case.boundary_precision), count),
        boundary_recall: mean(sum_metric(reports, |case| case.boundary_recall), count),
        min_toon_reduction,
        mean_toon_reduction: mean(sum_metric(reports, |case| case.toon_reduction), count),
        mean_generation_ms: mean(sum_metric(reports, |case| case.generation_ms), count),
        mean_verification_ms: mean(sum_metric(reports, |case| case.verification_ms), count),
        true_positives,
        false_positives,
        false_negatives,
    }
}
/// Groups case reports by rule family and computes, per family, the case
/// count plus mean verdict accuracy, boundary precision and boundary recall.
///
/// Families are returned in the ordering of the `BTreeMap` key.
fn summarize_contract_bench_families(
    reports: &[ContractBenchCaseReport],
) -> Vec<ContractBenchFamilyReport> {
    let grouped = reports.iter().fold(
        BTreeMap::<ContractBenchRuleFamily, Vec<&ContractBenchCaseReport>>::new(),
        |mut groups, report| {
            groups.entry(report.rule_family).or_default().push(report);
            groups
        },
    );

    let mut families = Vec::with_capacity(grouped.len());
    for (rule_family, members) in grouped {
        let case_count = members.len();
        let count = case_count as f64;
        let correct = members.iter().filter(|case| case.verdict_correct).count() as f64;
        let precision_total = members
            .iter()
            .map(|case| case.boundary_precision)
            .sum::<f64>();
        let recall_total = members.iter().map(|case| case.boundary_recall).sum::<f64>();
        families.push(ContractBenchFamilyReport {
            rule_family,
            case_count,
            verdict_accuracy: mean(correct, count),
            boundary_precision: mean(precision_total, count),
            boundary_recall: mean(recall_total, count),
        });
    }
    families
}
/// Prints a human-readable contract benchmark report to stdout: a header, the
/// overall summary, one line per rule family and one line per case.
fn print_contract_bench_report(report: &ContractBenchReport) {
    let summary = &report.summary;
    println!(
        "Contract benchmark: {} case(s), limit {}",
        report.case_count, report.limit
    );
    println!(
        "Summary: verdict accuracy {:.3}, verification precision {:.3}, boundary precision {:.3}, boundary recall {:.3}, min TOON reduction {:.3}, mean TOON reduction {:.3}",
        summary.verdict_accuracy,
        summary.verification_precision,
        summary.boundary_precision,
        summary.boundary_recall,
        summary.min_toon_reduction,
        summary.mean_toon_reduction
    );

    report.rule_families.iter().for_each(|family| {
        println!(
            " {:?}: {} case(s), verdict {:.3}, boundary {:.3}/{:.3}",
            family.rule_family,
            family.case_count,
            family.verdict_accuracy,
            family.boundary_precision,
            family.boundary_recall
        );
    });

    report.cases.iter().for_each(|case| {
        let outcome = if case.passed { "pass" } else { "fail" };
        // The reduction is stored as a fraction and shown as a percentage.
        let toon_percent = case.toon_reduction * 100.0;
        println!(
            " {}: {:?}, verdict {:?}/{:?}, boundary {:.3}/{:.3}, TOON {:+.1}%, {}",
            case.id,
            case.rule_family,
            case.actual_verdict,
            case.expected_verdict,
            case.boundary_precision,
            case.boundary_recall,
            toon_percent,
            outcome
        );
    });
}
/// Runs the retrieval evaluation for a repository and returns the aggregated report.
///
/// For every eval case the function ranks search candidates in several ways
/// (baseline, fused ranking, optional semantic-only, and one ablation per
/// ranking signal), builds a context pack, and measures recall / MRR / nDCG of
/// the expected paths as well as recall of the expected tests. Per-case sums
/// are averaged over the number of cases at the end.
///
/// # Errors
///
/// Fails when no eval cases are supplied, or when any of the fallible steps
/// (path resolution, case loading, indexing, store/config access, search,
/// context-pack construction) returns an error.
fn run_eval(args: EvalArgs) -> anyhow::Result<EvalReport> {
let repo = absolutize(&args.path)?;
// The result limit is forced into the 1..=100 range.
let limit = args.limit.clamp(1, 100);
let cases = load_eval_cases(&args.cases, args.cases_file.as_ref())?;
if cases.is_empty() {
anyhow::bail!("no eval cases provided; pass --case TASK=EXPECTED_PATH or --cases-file");
}
// Re-index first unless the caller opted out.
if !args.no_index {
index_repo(&repo)?;
}
let store = open_store(&repo)?;
let ranking_options = ranking_options_for_repo(&repo)?;
// Semantic search is switched on in the loaded config for the evaluation;
// semantic metrics are still only gathered when the manager reports ready.
let mut semantic_config = OkConfig::load_from_repo(&repo)?.semantic;
semantic_config.enabled = true;
let semantic_manager = SemanticIndexManager::new(&repo, &store, &semantic_config);
let semantic_ready = semantic_manager.status().ready;
let mut case_reports = Vec::with_capacity(cases.len());
// Running sums of the per-case metrics; divided by the case count below.
let mut recall_sum = 0.0;
let mut mrr_sum = 0.0;
let mut ndcg_sum = 0.0;
let mut semantic_recall_sum = 0.0;
let mut semantic_mrr_sum = 0.0;
let mut semantic_ndcg_sum = 0.0;
let mut baseline_recall_sum = 0.0;
let mut baseline_mrr_sum = 0.0;
let mut baseline_ndcg_sum = 0.0;
let signals = ranking_ablation_signals();
// One (signal, recall sum, MRR sum, nDCG sum) accumulator per ablated signal.
let mut ablation_sums = signals
.iter()
.map(|signal| (*signal, 0.0, 0.0, 0.0))
.collect::<Vec<_>>();
let mut context_recall_sum = 0.0;
let mut test_recall_sum = 0.0;
// Number of cases classified as "weak" or "abstain".
let mut abstention_required = 0usize;
for case in cases {
let mut raw_candidates =
search_raw(&repo, &store, &case.task, ranking_candidate_limit(limit))?;
// The baseline is ranked from a copy taken before git-history annotation
// and before semantic results are appended.
let baseline_results = top_unique_paths(rerank_baseline(raw_candidates.clone()), limit);
annotate_candidates_with_git_history(&store, &mut raw_candidates)?;
let semantic_results = if semantic_ready {
semantic_manager.search(&case.task, ranking_candidate_limit(limit))?
} else {
Vec::new()
};
if semantic_ready {
raw_candidates.extend(semantic_results.clone());
}
let mut case_ranking_options = ranking_options.clone();
case_ranking_options.query = Some(case.task.clone());
// Fused ranking over the annotated (and possibly semantic-extended) candidates.
// NOTE(review): fusion uses `top_unique_paths_merging` while the baseline and
// ablations use `top_unique_paths` — confirm the deltas are meant to compare these.
let search_results = top_unique_paths_merging(
rerank_with_options(raw_candidates.clone(), &case_ranking_options),
limit,
);
let context = build_context_pack(&repo, &store, &case.task, limit)?;
let search_paths = search_results
.iter()
.map(|result| result.path.clone())
.collect::<Vec<_>>();
let baseline_paths = baseline_results
.iter()
.map(|result| result.path.clone())
.collect::<Vec<_>>();
// Context paths are the primary files followed by the supporting files.
let context_paths = context
.primary_files
.iter()
.chain(context.supporting_files.iter())
.map(|result| result.path.clone())
.collect::<Vec<_>>();
let selected_tests = context
.test_candidates
.iter()
.map(|test| test.name.clone())
.collect::<Vec<_>>();
let baseline_ranks = expected_path_ranks(&case.expected_paths, &baseline_paths);
baseline_recall_sum += ratio(
baseline_ranks.iter().filter(|rank| rank.is_some()).count(),
case.expected_paths.len(),
);
baseline_mrr_sum += reciprocal_rank(&baseline_ranks);
baseline_ndcg_sum += ndcg(&baseline_ranks, limit);
if semantic_ready {
// Semantic-only metrics use the semantic results in the order returned.
let semantic_paths = semantic_results
.iter()
.map(|result| result.path.clone())
.collect::<Vec<_>>();
let semantic_ranks = expected_path_ranks(&case.expected_paths, &semantic_paths);
semantic_recall_sum += ratio(
semantic_ranks.iter().filter(|rank| rank.is_some()).count(),
case.expected_paths.len(),
);
semantic_mrr_sum += reciprocal_rank(&semantic_ranks);
semantic_ndcg_sum += ndcg(&semantic_ranks, limit);
}
let search_ranks = expected_path_ranks(&case.expected_paths, &search_paths);
let search_hits = search_ranks.iter().filter(|rank| rank.is_some()).count();
let search_recall = ratio(search_hits, case.expected_paths.len());
recall_sum += search_recall;
mrr_sum += reciprocal_rank(&search_ranks);
ndcg_sum += ndcg(&search_ranks, limit);
// Re-rank once per ablated signal and accumulate its metrics.
for (signal, recall, mrr, ndcg_value) in &mut ablation_sums {
let mut ablation_options = case_ranking_options.clone();
ablation_options.mode = RankingMode::WithoutSignal(*signal);
// The git co-change ablation additionally strips git-history data from the candidates.
let candidates = if *signal == RankingSignal::GitCochange {
without_git_history_candidates(raw_candidates.clone())
} else {
raw_candidates.clone()
};
let ablated =
top_unique_paths(rerank_with_options(candidates, &ablation_options), limit);
let ablated_paths = ablated
.iter()
.map(|result| result.path.clone())
.collect::<Vec<_>>();
let ablated_ranks = expected_path_ranks(&case.expected_paths, &ablated_paths);
*recall += ratio(
ablated_ranks.iter().filter(|rank| rank.is_some()).count(),
case.expected_paths.len(),
);
*mrr += reciprocal_rank(&ablated_ranks);
*ndcg_value += ndcg(&ablated_ranks, limit);
}
let context_hits = matching_expected_values(&case.expected_paths, &context_paths);
let context_recall = ratio(context_hits.len(), case.expected_paths.len());
context_recall_sum += context_recall;
let test_hits = matching_expected_strings(&case.expected_tests, &selected_tests);
let test_recall = ratio(test_hits.len(), case.expected_tests.len());
test_recall_sum += test_recall;
let mut notes = Vec::new();
if search_recall == 0.0 {
notes.push("expected files were not found in top search results".into());
}
if context_recall == 0.0 {
notes.push("expected files were not grounded in context pack".into());
}
if !case.expected_tests.is_empty() && test_recall == 0.0 {
notes.push("expected tests were not selected".into());
}
// "grounded": both search and context found at least one expected path.
// "abstain": search returned nothing or the context pack has no primary files.
// "weak": everything else. The latter two both count towards abstention_required.
let confidence = if search_recall > 0.0 && context_recall > 0.0 {
"grounded"
} else if search_results.is_empty() || context.primary_files.is_empty() {
abstention_required += 1;
"abstain"
} else {
abstention_required += 1;
"weak"
};
case_reports.push(EvalCaseReport {
task: case.task,
expected_paths: case.expected_paths,
expected_tests: case.expected_tests,
search_ranks,
context_hits,
test_hits,
top_search_paths: search_paths.into_iter().take(limit).collect(),
top_context_paths: context_paths.into_iter().take(limit).collect(),
// Score signals are reported for the top-ranked search result only.
top_search_signals: search_results
.first()
.map(|result| top_score_signals(result, 3))
.unwrap_or_default(),
confidence,
notes,
});
}
// Non-zero: the empty-cases check above bails before the loop, and every case pushes one report.
let count = case_reports.len() as f64;
let fusion = RankingEvalSummary {
mode: "fusion".into(),
search_recall_at_k: recall_sum / count,
search_mrr: mrr_sum / count,
search_ndcg_at_k: ndcg_sum / count,
};
let baseline = RankingEvalSummary {
mode: "baseline".into(),
search_recall_at_k: baseline_recall_sum / count,
search_mrr: baseline_mrr_sum / count,
search_ndcg_at_k: baseline_ndcg_sum / count,
};
// The semantic summary is present only when the semantic index was ready.
let semantic = semantic_ready.then(|| RankingEvalSummary {
mode: "semantic".into(),
search_recall_at_k: semantic_recall_sum / count,
search_mrr: semantic_mrr_sum / count,
search_ndcg_at_k: semantic_ndcg_sum / count,
});
let ablations = ablation_sums
.into_iter()
.map(|(signal, recall, mrr, ndcg_value)| {
let recall_at_k = recall / count;
let search_mrr = mrr / count;
let ndcg_at_k = ndcg_value / count;
RankingAblationReport {
signal: ranking_signal_name(signal).into(),
search_recall_at_k: recall_at_k,
search_mrr,
search_ndcg_at_k: ndcg_at_k,
// Deltas are fusion minus ablation: positive means removing the signal hurt.
recall_delta_vs_fusion: fusion.search_recall_at_k - recall_at_k,
mrr_delta_vs_fusion: fusion.search_mrr - search_mrr,
ndcg_delta_vs_fusion: fusion.search_ndcg_at_k - ndcg_at_k,
}
})
.collect::<Vec<_>>();
Ok(EvalReport {
repo,
limit,
case_count: case_reports.len(),
// The headline search metrics in the summary are the fusion metrics.
summary: EvalSummary {
search_recall_at_k: fusion.search_recall_at_k,
search_mrr: fusion.search_mrr,
search_ndcg_at_k: fusion.search_ndcg_at_k,
context_recall_at_k: context_recall_sum / count,
test_recall_at_k: test_recall_sum / count,
abstention_required,
},
baseline,
semantic,
fusion,
ablations,
cases: case_reports,
})
}
fn load_eval_cases(
values: &[String],
cases_file: Option<&PathBuf>,
) -> anyhow::Result<Vec<EvalCase>> {
let mut cases = values
.iter()
.map(|value| {
let (task, expected) = value.split_once('=').ok_or_else(|| {
anyhow::anyhow!("eval case must use TASK=EXPECTED_PATH[,EXPECTED_PATH]: {value}")
})?;
let expected_paths = expected
.split(',')
.map(str::trim)
.filter(|path| !path.is_empty())
.map(ToString::to_string)
.collect::<Vec<_>>();
if task.trim().is_empty() || expected_paths.is_empty() {
anyhow::bail!("eval task and expected paths must be non-empty: {value}");
}
Ok(EvalCase {
task: task.trim().to_string(),
expected_paths,
expected_tests: Vec::new(),
})
})
.collect::<anyhow::Result<Vec<_>>>()?;
if let Some(path) = cases_file {
let raw = fs::read_to_string(path)?;
let mut from_file: Vec<EvalCase> = serde_json::from_str(&raw)?;
cases.append(&mut from_file);
}
Ok(cases)
}
/// For each expected path fragment, finds the 1-based position of the first
/// actual path whose normalized form contains the normalized fragment.
///
/// The result has one entry per expected path, in order; `None` marks a
/// fragment that matched nothing.
fn expected_path_ranks(expected_paths: &[String], actual_paths: &[PathBuf]) -> Vec<Option<usize>> {
    let mut ranks = Vec::with_capacity(expected_paths.len());
    for expected in expected_paths {
        let needle = normalize_path_fragment(expected);
        let rank = actual_paths
            .iter()
            .enumerate()
            .find(|(_, path)| {
                normalize_path_fragment(&path.to_string_lossy()).contains(&needle)
            })
            .map(|(index, _)| index + 1);
        ranks.push(rank);
    }
    ranks
}
/// Returns, in order, the expected path fragments whose normalized form is
/// contained in the normalized form of at least one actual path.
fn matching_expected_values(expected: &[String], actual: &[PathBuf]) -> Vec<String> {
    let mut matched = Vec::new();
    for wanted in expected {
        let needle = normalize_path_fragment(wanted);
        let found = actual
            .iter()
            .any(|path| normalize_path_fragment(&path.to_string_lossy()).contains(&needle));
        if found {
            matched.push(wanted.clone());
        }
    }
    matched
}
/// Returns, in order, the expected strings that occur as an
/// ASCII-case-insensitive substring of at least one actual string.
fn matching_expected_strings(expected: &[String], actual: &[String]) -> Vec<String> {
    // Lower-case the haystacks once instead of once per expected string.
    let haystacks: Vec<String> = actual
        .iter()
        .map(|value| value.to_ascii_lowercase())
        .collect();

    let mut matched = Vec::new();
    for wanted in expected {
        let needle = wanted.to_ascii_lowercase();
        if haystacks.iter().any(|haystack| haystack.contains(&needle)) {
            matched.push(wanted.clone());
        }
    }
    matched
}
/// Returns `1 / r`, where `r` is the smallest rank present in `ranks`, or
/// `0.0` when no entry holds a rank.
fn reciprocal_rank(ranks: &[Option<usize>]) -> f64 {
    let best = ranks.iter().filter_map(|rank| *rank).min();
    match best {
        Some(rank) => 1.0 / rank as f64,
        None => 0.0,
    }
}
/// Computes a binary-relevance nDCG@`limit` from the 1-based ranks at which
/// the expected items were found.
///
/// Each found item with a rank in `1..=limit` contributes
/// `1 / log2(rank + 1)`. The ideal DCG assumes the expected items occupy
/// ranks `1..=min(ranks.len(), limit)`.
///
/// Returns `1.0` for an empty `ranks` slice (nothing was expected) and `0.0`
/// when the ideal DCG is zero (`limit == 0`). The result never exceeds `1.0`.
fn ndcg(ranks: &[Option<usize>], limit: usize) -> f64 {
    /// Positional discount for a 1-based rank.
    fn discount(rank: usize) -> f64 {
        1.0 / ((rank as f64) + 1.0).log2()
    }

    if ranks.is_empty() {
        return 1.0;
    }
    let dcg = ranks
        .iter()
        .flatten()
        .copied()
        // Rank 0 is not a valid 1-based rank and would divide by log2(1) = 0.
        .filter(|rank| (1..=limit).contains(rank))
        .map(discount)
        .sum::<f64>();
    let ideal = (1..=ranks.len().min(limit))
        .map(discount)
        .sum::<f64>();
    if ideal == 0.0 {
        0.0
    } else {
        // Several expected items can resolve to the same rank (fragments are
        // matched by substring), which would push DCG above the ideal; clamp
        // so the metric stays within its defined [0, 1] range.
        (dcg / ideal).min(1.0)
    }
}
/// Divides `numerator` by `denominator` as floating point, returning `1.0`
/// when the denominator is zero.
fn ratio(numerator: usize, denominator: usize) -> f64 {
    match denominator {
        0 => 1.0,
        total => numerator as f64 / total as f64,
    }
}
/// Prints a human-readable eval report to stdout.
///
/// Output order: headline search metrics, the baseline and fusion rankings,
/// the semantic-only ranking when present, per-signal ablations when any
/// exist, context/test recall with the weak/abstain count, and finally one
/// section per case.
fn print_eval_report(report: &EvalReport) {
    let k = report.limit;
    let summary = &report.summary;
    let baseline = &report.baseline;
    let fusion = &report.fusion;

    println!("Open Kioku eval for {}", report.repo.display());
    println!(
        "Search recall@{} {:.3}, MRR {:.3}, nDCG@{} {:.3}",
        k, summary.search_recall_at_k, summary.search_mrr, k, summary.search_ndcg_at_k
    );
    println!(
        "Ranking baseline: recall@{} {:.3}, MRR {:.3}, nDCG@{} {:.3}",
        k, baseline.search_recall_at_k, baseline.search_mrr, k, baseline.search_ndcg_at_k
    );
    println!(
        "Ranking fusion: recall@{} {:.3}, MRR {:.3}, nDCG@{} {:.3}",
        k, fusion.search_recall_at_k, fusion.search_mrr, k, fusion.search_ndcg_at_k
    );
    if let Some(semantic) = report.semantic.as_ref() {
        println!(
            "Ranking semantic-only: recall@{} {:.3}, MRR {:.3}, nDCG@{} {:.3}",
            k, semantic.search_recall_at_k, semantic.search_mrr, k, semantic.search_ndcg_at_k
        );
    }

    // The ablation header is printed only when there is something to list.
    if !report.ablations.is_empty() {
        println!("Ranking ablations:");
    }
    for ablation in &report.ablations {
        println!(
            " - without {}: recall@{} {:.3} (delta {:+.3}), MRR {:.3} (delta {:+.3}), nDCG {:.3} (delta {:+.3})",
            ablation.signal,
            k,
            ablation.search_recall_at_k,
            ablation.recall_delta_vs_fusion,
            ablation.search_mrr,
            ablation.mrr_delta_vs_fusion,
            ablation.search_ndcg_at_k,
            ablation.ndcg_delta_vs_fusion
        );
    }

    println!(
        "Context recall@{} {:.3}, test recall@{} {:.3}, weak/abstain {}",
        k,
        summary.context_recall_at_k,
        k,
        summary.test_recall_at_k,
        summary.abstention_required
    );

    for case in &report.cases {
        println!("\n- {} [{}]", case.task, case.confidence);
        println!(" expected paths: {}", case.expected_paths.join(", "));
        println!(" ranks: {:?}", case.search_ranks);
        // Optional lines are omitted when there is nothing to show.
        if !case.top_search_signals.is_empty() {
            let signals = case.top_search_signals.join(", ");
            println!(" top ranking signals: {}", signals);
        }
        if !case.test_hits.is_empty() {
            let hits = case.test_hits.join(", ");
            println!(" test hits: {}", hits);
        }
        case.notes.iter().for_each(|note| println!(" note: {note}"));
    }
}