Skip to main content

vela_protocol/
agent_bench.rs

1//! # VelaBench v0.26 — agent state-update scoring
2//!
3//! Compares a *candidate* frontier (typically agent-generated)
4//! against a *gold* frontier (curator-validated) and produces a
5//! reproducible score.
6//!
7//! Unlike the legacy `benchmark` module — which scores literature
8//! extraction quality — this scorer reads two frontiers as data
9//! artifacts and judges how well one approximates the other.
10//! Determinism is the doctrine: sort by `vf_id`, no wall-clock,
11//! no RNG. Same inputs → same numbers.
12//!
13//! Substrate stays dumb: this is pure data comparison. No LLM
14//! call, no network, no agent invocation. The scorer never spawns
15//! `claude` or anything else; it operates on already-emitted
16//! `FindingBundle`s and `StateProposal`s.
17
18use std::collections::{BTreeSet, HashMap, HashSet};
19use std::path::{Path, PathBuf};
20
21use serde::{Deserialize, Serialize};
22
23use crate::bundle::FindingBundle;
24use crate::project::Project;
25use crate::repo;
26
// Composite score weights. The six values sum to 1.0 and live side by
// side so the whole formula can be audited at a glance. Adjust
// deliberately.

/// Weight applied to the claim-match F1 score.
pub const W_CLAIM_MATCH: f64 = 0.25;
/// Weight applied to the scope-accuracy score.
pub const W_SCOPE: f64 = 0.20;
/// Weight applied to evidence fidelity (only when that metric is scored).
pub const W_EVIDENCE_FIDELITY: f64 = 0.20;
/// Weight applied to contradiction recall.
pub const W_CONTRADICTION_RECALL: f64 = 0.15;
/// Weight applied to the downstream-link rate.
pub const W_DOWNSTREAM_LINK: f64 = 0.10;
/// Weight applied to the inverse duplicate rate (`1 - duplicate_rate`).
pub const W_DUPLICATE_INV: f64 = 0.10;
35
/// Everything a single VelaBench run needs to know.
#[derive(Debug, Clone)]
pub struct BenchInput {
    /// Location of the gold frontier.
    pub gold_path: PathBuf,
    /// Location of the candidate frontier being scored.
    pub candidate_path: PathBuf,
    /// Optional directory of source files. When set, the
    /// `evidence_fidelity` metric checks each candidate finding's
    /// evidence spans against the contents of those files. When
    /// absent, the metric is reported as `None` and left out of the
    /// composite (the remaining weights are rebalanced).
    pub sources: Option<PathBuf>,
    /// Minimum composite score for the run to count as passing; the
    /// binary exits non-zero when the composite falls below it.
    pub threshold: f64,
}
50
51/// One metric's worth of result. `pass` is purely informational
52/// (target met) — the binary's exit code is driven by the
53/// composite, not by individual metrics.
54///
55/// `vacuous` (v0.29.2): true when the metric had no data to
56/// measure (e.g. no gold contradictions to recall, no novel
57/// candidate findings to ground). Such metrics still report a
58/// formal score of 1.0 ("vacuously satisfied"), but they are
59/// excluded from the composite weighting. Friction #13 from sim-
60/// user pass #2: vacuous 1.0s were inflating the composite to
61/// ~0.31 even when claim_match_rate was 0, which made the score
62/// look like a passing grade when it really meant "no overlap
63/// detected".
64#[derive(Debug, Clone, Default, Serialize, Deserialize)]
65pub struct MetricResult {
66    pub score: f64,
67    pub target: f64,
68    pub pass: bool,
69    pub note: String,
70    #[serde(default, skip_serializing_if = "is_false")]
71    pub vacuous: bool,
72}
73
/// Serde `skip_serializing_if` predicate: true exactly when the flag
/// is `false`, so unset flags are omitted from the serialized output.
fn is_false(b: &bool) -> bool {
    matches!(*b, false)
}
77
78/// Full bench report. Serializable to JSON for `--json` mode and
79/// for checking in as `expected.json` regression bands.
80#[derive(Debug, Clone, Default, Serialize, Deserialize)]
81pub struct BenchReport {
82    pub gold_path: String,
83    pub candidate_path: String,
84    pub gold_findings: usize,
85    pub candidate_findings: usize,
86    pub matched_pairs: usize,
87    pub claim_match_rate: MetricResult,
88    pub scope_accuracy: MetricResult,
89    pub evidence_fidelity: Option<MetricResult>,
90    pub duplicate_rate: f64,
91    pub novelty_rate: f64,
92    pub contradiction_recall: MetricResult,
93    pub downstream_link_rate: MetricResult,
94    pub composite: f64,
95    pub threshold: f64,
96    pub pass: bool,
97}
98
99/// Run a complete bench. Loads both frontiers, computes every
100/// metric, and returns the report. Caller decides what to do with
101/// the exit code.
102pub fn run(input: BenchInput) -> Result<BenchReport, String> {
103    let gold: Project = repo::load_from_path(&input.gold_path)
104        .map_err(|e| format!("load gold {}: {e}", input.gold_path.display()))?;
105    let candidate: Project = repo::load_from_path(&input.candidate_path)
106        .map_err(|e| format!("load candidate {}: {e}", input.candidate_path.display()))?;
107
108    let gold_findings = sorted_findings(&gold);
109    // Candidates are most often unsigned: an agent's `vela scout`
110    // run leaves its output as `finding.add` proposals, not as
111    // committed findings. Pull both surfaces into the candidate
112    // set so the bench can score pre-review agent quality (where
113    // the dogfood lives) as well as post-review accepted state.
114    let candidate_findings = sorted_findings_with_proposals(&candidate);
115
116    let matches = match_findings(&gold_findings, &candidate_findings);
117
118    let claim_match_rate = score_claim_match(&gold_findings, &candidate_findings, &matches);
119    let scope_accuracy = score_scope(&gold_findings, &candidate_findings, &matches);
120    let evidence_fidelity = input
121        .sources
122        .as_ref()
123        .map(|src| score_evidence_fidelity(&candidate_findings, src));
124    let (duplicate_inv, duplicate_rate) = score_duplicates(&candidate_findings);
125    let novelty_rate = score_novelty(&candidate_findings, &matches);
126    let contradiction_recall = score_contradiction_recall(&gold_findings, &candidate_findings);
127    let downstream_link_rate = score_downstream_link(&gold_findings, &candidate_findings, &matches);
128
129    let composite = compute_composite(
130        &claim_match_rate,
131        &scope_accuracy,
132        evidence_fidelity.as_ref(),
133        duplicate_inv,
134        &contradiction_recall,
135        &downstream_link_rate,
136    );
137
138    Ok(BenchReport {
139        gold_path: input.gold_path.display().to_string(),
140        candidate_path: input.candidate_path.display().to_string(),
141        gold_findings: gold_findings.len(),
142        candidate_findings: candidate_findings.len(),
143        matched_pairs: matches.len(),
144        claim_match_rate,
145        scope_accuracy,
146        evidence_fidelity,
147        duplicate_rate,
148        novelty_rate,
149        contradiction_recall,
150        downstream_link_rate,
151        composite,
152        threshold: input.threshold,
153        pass: composite >= input.threshold,
154    })
155}
156
157fn sorted_findings(p: &Project) -> Vec<FindingBundle> {
158    let mut out = p.findings.clone();
159    out.sort_by(|a, b| a.id.cmp(&b.id));
160    out
161}
162
163/// Like `sorted_findings`, but also pulls `finding.add` proposal
164/// payloads into the set so unsigned agent output can be scored.
165/// Skips proposals whose target id collides with an already-
166/// committed finding (the committed copy wins).
167fn sorted_findings_with_proposals(p: &Project) -> Vec<FindingBundle> {
168    let mut out = p.findings.clone();
169    let already: HashSet<String> = out.iter().map(|f| f.id.clone()).collect();
170    let mut seen = already.clone();
171    for proposal in &p.proposals {
172        if proposal.kind != "finding.add" {
173            continue;
174        }
175        let Some(payload_finding) = proposal.payload.get("finding") else {
176            continue;
177        };
178        let Ok(bundle) = serde_json::from_value::<FindingBundle>(payload_finding.clone()) else {
179            continue;
180        };
181        if seen.contains(&bundle.id) {
182            continue;
183        }
184        seen.insert(bundle.id.clone());
185        out.push(bundle);
186    }
187    out.sort_by(|a, b| a.id.cmp(&b.id));
188    out
189}
190
191// ---------- Matching ----------
192
193/// Returns matched pairs (gold_index, candidate_index). Greedy
194/// (not full Hungarian — F1 is symmetric in our use, and this
195/// stays deterministic with sorted-by-id input). Match rule:
196/// either content-address equal OR claim-text Jaccard ≥ 0.4.
197fn match_findings(gold: &[FindingBundle], candidate: &[FindingBundle]) -> Vec<(usize, usize)> {
198    let mut used_g: HashSet<usize> = HashSet::new();
199    let mut used_c: HashSet<usize> = HashSet::new();
200    let mut pairs: Vec<(usize, usize, f64)> = Vec::new();
201
202    // First pass: exact id matches (cheap, certain).
203    let g_by_id: HashMap<&str, usize> = gold
204        .iter()
205        .enumerate()
206        .map(|(i, f)| (f.id.as_str(), i))
207        .collect();
208    for (ci, cand) in candidate.iter().enumerate() {
209        if let Some(&gi) = g_by_id.get(cand.id.as_str()) {
210            pairs.push((gi, ci, 1.0));
211            used_g.insert(gi);
212            used_c.insert(ci);
213        }
214    }
215
216    // Second pass: jaccard ≥ 0.4 on remaining.
217    let g_tokens: Vec<BTreeSet<String>> = gold
218        .iter()
219        .map(|f| tokenize_claim(&f.assertion.text))
220        .collect();
221    let c_tokens: Vec<BTreeSet<String>> = candidate
222        .iter()
223        .map(|f| tokenize_claim(&f.assertion.text))
224        .collect();
225    for (ci, c_set) in c_tokens.iter().enumerate() {
226        if used_c.contains(&ci) {
227            continue;
228        }
229        let mut best: Option<(usize, f64)> = None;
230        for (gi, g_set) in g_tokens.iter().enumerate() {
231            if used_g.contains(&gi) {
232                continue;
233            }
234            let j = jaccard(g_set, c_set);
235            if j >= 0.4 && best.map(|(_, b)| j > b).unwrap_or(true) {
236                best = Some((gi, j));
237            }
238        }
239        if let Some((gi, score)) = best {
240            pairs.push((gi, ci, score));
241            used_g.insert(gi);
242            used_c.insert(ci);
243        }
244    }
245
246    pairs.sort_by(|a, b| a.0.cmp(&b.0));
247    pairs.into_iter().map(|(g, c, _)| (g, c)).collect()
248}
249
/// Break a claim into its set of lowercase tokens: the text is split
/// on every non-alphanumeric character and pieces of two bytes or
/// fewer are discarded.
fn tokenize_claim(s: &str) -> BTreeSet<String> {
    let lowered = s.to_lowercase();
    let mut tokens = BTreeSet::new();
    for piece in lowered.split(|ch: char| !ch.is_alphanumeric()) {
        if piece.len() > 2 {
            tokens.insert(piece.to_owned());
        }
    }
    tokens
}
257
/// Jaccard similarity `|a ∩ b| / |a ∪ b|` of two token sets.
///
/// Two empty sets are defined to be identical and score 1.0; one
/// empty set against a non-empty one scores 0.0.
fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    let inter = a.intersection(b).count();
    // Inclusion–exclusion gives the union size without a second
    // traversal. At least one set is non-empty here, so it is > 0.
    let union = a.len() + b.len() - inter;
    inter as f64 / union as f64
}
266
267// ---------- Metrics ----------
268
269fn score_claim_match(
270    gold: &[FindingBundle],
271    candidate: &[FindingBundle],
272    matches: &[(usize, usize)],
273) -> MetricResult {
274    let g = gold.len();
275    let c = candidate.len();
276    let m = matches.len();
277    let denom = g + c;
278    let score = if denom == 0 {
279        0.0
280    } else {
281        (2.0 * m as f64) / denom as f64
282    };
283    let target = 0.70;
284    MetricResult {
285        score,
286        target,
287        pass: score >= target,
288        note: format!("F1 over claim-text match: 2·|M|/(|G|+|C|) = 2·{m}/({g}+{c})"),
289        vacuous: false,
290    }
291}
292
293fn score_scope(
294    gold: &[FindingBundle],
295    candidate: &[FindingBundle],
296    matches: &[(usize, usize)],
297) -> MetricResult {
298    if matches.is_empty() {
299        return MetricResult {
300            score: 0.0,
301            target: 0.80,
302            pass: false,
303            note: "no matched pairs to evaluate scope on".to_string(),
304            vacuous: false,
305        };
306    }
307    let mut sum = 0.0_f64;
308    for &(gi, ci) in matches {
309        let g = &gold[gi];
310        let c = &candidate[ci];
311        let organism_eq = entity_eq_for_type(g, c, "organism");
312        let intervention_overlap = entity_overlap_for_type(g, c, "intervention");
313        sum += 0.5 * organism_eq + 0.5 * intervention_overlap;
314    }
315    let score = sum / matches.len() as f64;
316    MetricResult {
317        score,
318        target: 0.80,
319        pass: score >= 0.80,
320        note: "mean of (0.5·organism_eq + 0.5·intervention_overlap) over matched pairs".to_string(),
321        vacuous: false,
322    }
323}
324
325fn entity_eq_for_type(g: &FindingBundle, c: &FindingBundle, ent_type: &str) -> f64 {
326    let g_set: BTreeSet<String> = g
327        .assertion
328        .entities
329        .iter()
330        .filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
331        .map(|e| e.name.to_lowercase())
332        .collect();
333    let c_set: BTreeSet<String> = c
334        .assertion
335        .entities
336        .iter()
337        .filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
338        .map(|e| e.name.to_lowercase())
339        .collect();
340    if g_set.is_empty() && c_set.is_empty() {
341        // Neither side specified — neutral, count as match.
342        return 1.0;
343    }
344    if g_set == c_set { 1.0 } else { 0.0 }
345}
346
347fn entity_overlap_for_type(g: &FindingBundle, c: &FindingBundle, ent_type: &str) -> f64 {
348    let g_set: BTreeSet<String> = g
349        .assertion
350        .entities
351        .iter()
352        .filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
353        .map(|e| e.name.to_lowercase())
354        .collect();
355    let c_set: BTreeSet<String> = c
356        .assertion
357        .entities
358        .iter()
359        .filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
360        .map(|e| e.name.to_lowercase())
361        .collect();
362    jaccard(&g_set, &c_set)
363}
364
365fn score_evidence_fidelity(candidate: &[FindingBundle], sources: &Path) -> MetricResult {
366    // Walk all files under `sources` once, build a lowercase
367    // whitespace-normalized buffer per file; for each candidate
368    // finding's evidence_spans, check substring presence.
369    let source_blobs = collect_source_blobs(sources);
370    if source_blobs.is_empty() {
371        return MetricResult {
372            score: 0.0,
373            target: 0.95,
374            pass: false,
375            note: format!(
376                "no readable source files under {} — cannot score fidelity",
377                sources.display()
378            ),
379            vacuous: false,
380        };
381    }
382
383    let mut checked = 0;
384    let mut hit = 0;
385    for f in candidate {
386        for span in &f.evidence.evidence_spans {
387            let text = extract_span_text(span);
388            if text.is_empty() {
389                continue;
390            }
391            let needle = normalize_for_match(&text);
392            if needle.len() < 12 {
393                // Too short to be meaningful (single tokens trivially
394                // match the whole corpus). Skip.
395                continue;
396            }
397            checked += 1;
398            if source_blobs.iter().any(|b| b.contains(&needle)) {
399                hit += 1;
400            }
401        }
402    }
403
404    let score = if checked == 0 {
405        0.0
406    } else {
407        hit as f64 / checked as f64
408    };
409    MetricResult {
410        score,
411        target: 0.95,
412        pass: score >= 0.95,
413        note: format!("{hit}/{checked} candidate evidence spans substring-match a source file"),
414        vacuous: false,
415    }
416}
417
418fn collect_source_blobs(root: &Path) -> Vec<String> {
419    let mut out = Vec::new();
420    let mut stack: Vec<PathBuf> = vec![root.to_path_buf()];
421    while let Some(dir) = stack.pop() {
422        let Ok(entries) = std::fs::read_dir(&dir) else {
423            continue;
424        };
425        for entry in entries.flatten() {
426            let path = entry.path();
427            let basename = path
428                .file_name()
429                .and_then(|n| n.to_str())
430                .unwrap_or_default();
431            if basename.starts_with('.') {
432                continue;
433            }
434            let Ok(meta) = entry.metadata() else { continue };
435            if meta.is_dir() {
436                stack.push(path);
437                continue;
438            }
439            // Read as text — binary files (PDFs) get skipped via
440            // the utf8 check (the corresponding raw .txt sibling
441            // is what bench actually scores against).
442            if let Ok(s) = std::fs::read_to_string(&path) {
443                out.push(normalize_for_match(&s));
444            }
445        }
446    }
447    out
448}
449
450fn extract_span_text(span: &serde_json::Value) -> String {
451    if let Some(s) = span.as_str() {
452        return s.to_string();
453    }
454    if let Some(s) = span.get("text").and_then(|v| v.as_str()) {
455        return s.to_string();
456    }
457    if let Some(s) = span.get("snippet").and_then(|v| v.as_str()) {
458        return s.to_string();
459    }
460    String::new()
461}
462
/// Canonical form used for substring matching: lowercase, with every
/// run of whitespace collapsed to a single space and no leading or
/// trailing whitespace.
fn normalize_for_match(s: &str) -> String {
    let lowered = s.to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
469
470fn score_duplicates(candidate: &[FindingBundle]) -> (f64, f64) {
471    if candidate.is_empty() {
472        return (1.0, 0.0);
473    }
474    let unique: HashSet<&str> = candidate.iter().map(|f| f.id.as_str()).collect();
475    let dup_rate = 1.0 - (unique.len() as f64 / candidate.len() as f64);
476    (1.0 - dup_rate, dup_rate)
477}
478
479fn score_novelty(candidate: &[FindingBundle], matches: &[(usize, usize)]) -> f64 {
480    if candidate.is_empty() {
481        return 0.0;
482    }
483    let matched_c: HashSet<usize> = matches.iter().map(|&(_, ci)| ci).collect();
484    let novel = candidate.len() - matched_c.len();
485    novel as f64 / candidate.len() as f64
486}
487
488fn score_contradiction_recall(gold: &[FindingBundle], candidate: &[FindingBundle]) -> MetricResult {
489    let gold_contradictions = collect_contradiction_set(gold);
490    if gold_contradictions.is_empty() {
491        return MetricResult {
492            score: 1.0,
493            target: 0.60,
494            pass: true,
495            note: "no contradictions in gold — excluded from composite".to_string(),
496            vacuous: true,
497        };
498    }
499    let candidate_contradictions = collect_contradiction_set(candidate);
500    let detected = gold_contradictions
501        .iter()
502        .filter(|pair| candidate_contradictions.contains(*pair))
503        .count();
504    let score = detected as f64 / gold_contradictions.len() as f64;
505    MetricResult {
506        score,
507        target: 0.60,
508        pass: score >= 0.60,
509        note: format!(
510            "{detected}/{} gold contradictions detected by candidate",
511            gold_contradictions.len()
512        ),
513        vacuous: false,
514    }
515}
516
517fn collect_contradiction_set(p: &[FindingBundle]) -> BTreeSet<(String, String)> {
518    let mut out = BTreeSet::new();
519    for f in p {
520        for link in &f.links {
521            let lt = link.link_type.to_lowercase();
522            if lt == "contradicts" || lt == "tension" || lt == "contests" {
523                let mut pair = [f.id.clone(), link.target.clone()];
524                pair.sort();
525                out.insert((pair[0].clone(), pair[1].clone()));
526            }
527        }
528    }
529    out
530}
531
532fn score_downstream_link(
533    gold: &[FindingBundle],
534    candidate: &[FindingBundle],
535    matches: &[(usize, usize)],
536) -> MetricResult {
537    // Among candidate findings that aren't matches (i.e. novel),
538    // what fraction link to ≥1 existing gold vf_id? A sign that
539    // the agent is grounding new claims in the existing frontier.
540    let matched_c: HashSet<usize> = matches.iter().map(|&(_, ci)| ci).collect();
541    let novel: Vec<&FindingBundle> = candidate
542        .iter()
543        .enumerate()
544        .filter(|(i, _)| !matched_c.contains(i))
545        .map(|(_, f)| f)
546        .collect();
547    if novel.is_empty() {
548        return MetricResult {
549            score: 1.0,
550            target: 0.75,
551            pass: true,
552            note: "no novel candidate findings — excluded from composite".to_string(),
553            vacuous: true,
554        };
555    }
556    let gold_ids: HashSet<&str> = gold.iter().map(|f| f.id.as_str()).collect();
557    let linked = novel
558        .iter()
559        .filter(|f| f.links.iter().any(|l| gold_ids.contains(l.target.as_str())))
560        .count();
561    let score = linked as f64 / novel.len() as f64;
562    MetricResult {
563        score,
564        target: 0.75,
565        pass: score >= 0.75,
566        note: format!(
567            "{linked}/{} novel candidate findings link to a gold finding",
568            novel.len()
569        ),
570        vacuous: false,
571    }
572}
573
574fn compute_composite(
575    claim_match: &MetricResult,
576    scope: &MetricResult,
577    evidence_fidelity: Option<&MetricResult>,
578    duplicate_inv: f64,
579    contradiction_recall: &MetricResult,
580    downstream_link: &MetricResult,
581) -> f64 {
582    // v0.29.2: weighted average over only the metrics that have
583    // real data. A vacuous metric (e.g. contradiction_recall=1.0
584    // because gold has 0 contradictions to recall) is dropped from
585    // both the numerator AND the denominator so it can't inflate
586    // the composite. Friction #13: pre-fix, an unrelated candidate
587    // frontier could score 0.31 just from vacuous 1.0s, masking
588    // the fact that claim_match_rate was 0. Post-fix, it scores 0.
589    let mut num = W_CLAIM_MATCH * claim_match.score + W_SCOPE * scope.score;
590    let mut denom = W_CLAIM_MATCH + W_SCOPE;
591
592    if let Some(ef) = evidence_fidelity
593        && !ef.vacuous
594    {
595        num += W_EVIDENCE_FIDELITY * ef.score;
596        denom += W_EVIDENCE_FIDELITY;
597    }
598    if !contradiction_recall.vacuous {
599        num += W_CONTRADICTION_RECALL * contradiction_recall.score;
600        denom += W_CONTRADICTION_RECALL;
601    }
602    if !downstream_link.vacuous {
603        num += W_DOWNSTREAM_LINK * downstream_link.score;
604        denom += W_DOWNSTREAM_LINK;
605    }
606    // duplicate_inv is never vacuous: even with 0 candidate
607    // findings it has the trivial meaning "no duplicates among 0".
608    num += W_DUPLICATE_INV * duplicate_inv;
609    denom += W_DUPLICATE_INV;
610
611    if denom == 0.0 { 0.0 } else { num / denom }
612}
613
614/// Render a human-readable report. JSON callers serialize
615/// `BenchReport` directly.
616pub fn render_pretty(report: &BenchReport) -> String {
617    let mut out = String::new();
618    out.push_str(&format!(
619        "  gold:                {} ({} findings)\n",
620        report.gold_path, report.gold_findings
621    ));
622    out.push_str(&format!(
623        "  candidate:           {} ({} findings)\n",
624        report.candidate_path, report.candidate_findings
625    ));
626    out.push_str(&format!(
627        "  matched pairs:       {}\n",
628        report.matched_pairs
629    ));
630    out.push_str("  ----\n");
631    pretty_metric(&mut out, "claim_match_rate    ", &report.claim_match_rate);
632    pretty_metric(&mut out, "scope_accuracy      ", &report.scope_accuracy);
633    if let Some(ef) = &report.evidence_fidelity {
634        pretty_metric(&mut out, "evidence_fidelity   ", ef);
635    } else {
636        out.push_str("  evidence_fidelity     (skipped — no --sources provided)\n");
637    }
638    out.push_str(&format!(
639        "  duplicate_rate        {:.3} (lower is better)\n",
640        report.duplicate_rate
641    ));
642    out.push_str(&format!(
643        "  novelty_rate          {:.3} (informational)\n",
644        report.novelty_rate
645    ));
646    pretty_metric(
647        &mut out,
648        "contradiction_recall",
649        &report.contradiction_recall,
650    );
651    pretty_metric(
652        &mut out,
653        "downstream_link_rate",
654        &report.downstream_link_rate,
655    );
656    out.push_str("  ----\n");
657    // v0.29.2: surface a clear "no-overlap detected" banner when
658    // claim_match_rate is 0 against a non-empty gold + candidate.
659    // Without this, a candidate covering tangential subject matter
660    // can collapse the composite to whatever the duplicate_inv +
661    // duplicate_rate floor allows, and the user reads the score as
662    // "passing". Friction #13.
663    let no_overlap =
664        report.matched_pairs == 0 && report.gold_findings > 0 && report.candidate_findings > 0;
665    if no_overlap {
666        out.push_str(
667            "  ⚠ no overlap detected: 0 matched pairs against a non-empty gold;\n    composite reflects only the metrics with real data\n",
668        );
669    }
670    out.push_str(&format!(
671        "  COMPOSITE             {:.3}  (threshold {:.2}, {})\n",
672        report.composite,
673        report.threshold,
674        if report.pass { "PASS" } else { "FAIL" }
675    ));
676    out
677}
678
679fn pretty_metric(out: &mut String, label: &str, m: &MetricResult) {
680    let tag = if m.vacuous {
681        "n/a"
682    } else if m.pass {
683        "ok"
684    } else {
685        "low"
686    };
687    out.push_str(&format!(
688        "  {label}  {:.3}  (target {:.2}, {tag})\n",
689        m.score, m.target,
690    ));
691}
692
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bundle::{
        Assertion, Conditions, Confidence, Evidence, Extraction, FindingBundle, Flags, Provenance,
    };

    /// Minimal fixture: a finding with the given id and claim text and
    /// neutral placeholder values everywhere else.
    fn finding(id: &str, claim: &str) -> FindingBundle {
        let assertion = Assertion {
            text: claim.to_string(),
            assertion_type: "mechanism".to_string(),
            entities: Vec::new(),
            relation: None,
            direction: None,
            causal_claim: None,
            causal_evidence_grade: None,
        };
        let evidence = Evidence {
            evidence_type: "test".to_string(),
            model_system: String::new(),
            species: None,
            method: "test".to_string(),
            sample_size: None,
            effect_size: None,
            p_value: None,
            replicated: false,
            replication_count: None,
            evidence_spans: Vec::new(),
        };
        let conditions = Conditions {
            text: String::new(),
            species_verified: Vec::new(),
            species_unverified: Vec::new(),
            in_vitro: false,
            in_vivo: false,
            human_data: false,
            clinical_trial: false,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        };
        let provenance = Provenance {
            source_type: "test".to_string(),
            doi: None,
            pmid: None,
            pmc: None,
            openalex_id: None,
            url: None,
            title: "t".to_string(),
            authors: Vec::new(),
            year: None,
            journal: None,
            license: None,
            publisher: None,
            funders: Vec::new(),
            extraction: Extraction {
                method: "test".to_string(),
                model: None,
                model_version: None,
                extracted_at: String::new(),
                extractor_version: "test".to_string(),
            },
            review: None,
            citation_count: None,
        };
        let flags = Flags {
            gap: false,
            negative_space: false,
            contested: false,
            retracted: false,
            declining: false,
            gravity_well: false,
            review_state: None,
            superseded: false,
            signature_threshold: None,
            jointly_accepted: false,
        };

        FindingBundle {
            id: id.to_string(),
            version: 1,
            previous_version: None,
            assertion,
            evidence,
            conditions,
            confidence: Confidence::raw(0.5, "test", 0.7),
            provenance,
            flags,
            links: Vec::new(),
            annotations: Vec::new(),
            attachments: Vec::new(),
            created: String::new(),
            updated: None,

            access_tier: crate::access_tier::AccessTier::Public,
        }
    }

    /// Two of four distinct tokens are shared, so the similarity is 2/4.
    #[test]
    fn jaccard_basics() {
        let left: BTreeSet<String> = ["a", "b", "c"].iter().map(|s| s.to_string()).collect();
        let right: BTreeSet<String> = ["b", "c", "d"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard(&left, &right) - (2.0 / 4.0)).abs() < f64::EPSILON);
    }

    /// Equal ids pair up even when the claim text has nothing in common.
    #[test]
    fn match_findings_id_first() {
        let gold = vec![finding("vf_1", "alpha increases beta in mouse")];
        let cand = vec![finding("vf_1", "totally different text")];
        let pairs = match_findings(&gold, &cand);
        assert_eq!(pairs, vec![(0, 0)]);
    }

    /// Different ids but sufficiently similar claims still match.
    #[test]
    fn match_findings_jaccard_fallback() {
        let gold = vec![finding(
            "vf_g1",
            "Focused ultrasound increases BBB permeability in mouse models",
        )];
        let cand = vec![finding(
            "vf_c1",
            "Focused ultrasound transiently opens BBB permeability across mouse models",
        )];
        let pairs = match_findings(&gold, &cand);
        assert_eq!(pairs.len(), 1);
    }

    /// A candidate identical to gold scores a perfect F1.
    #[test]
    fn claim_match_rate_at_full_overlap() {
        let gold = vec![
            finding("vf_g1", "alpha increases beta in mouse"),
            finding("vf_g2", "gamma decreases delta in human"),
        ];
        let cand = gold.clone();
        let pairs = match_findings(&gold, &cand);
        let result = score_claim_match(&gold, &cand, &pairs);
        assert!((result.score - 1.0).abs() < f64::EPSILON);
    }

    /// Distinct ids mean no duplicates: rate 0, inverse 1.
    #[test]
    fn duplicate_rate_zero_for_unique() {
        let cand = vec![finding("vf_a", "x"), finding("vf_b", "y")];
        let (inv, dup) = score_duplicates(&cand);
        assert!((dup - 0.0).abs() < f64::EPSILON);
        assert!((inv - 1.0).abs() < f64::EPSILON);
    }

    /// With no candidate findings the claim-match score is 0.
    #[test]
    fn empty_candidate_zero_composite() {
        let gold = vec![finding("vf_g1", "a b c d e")];
        let cand: Vec<FindingBundle> = Vec::new();
        let pairs = match_findings(&gold, &cand);
        let claim = score_claim_match(&gold, &cand, &pairs);
        assert_eq!(claim.score, 0.0);
    }
}