use std::collections::{BTreeSet, HashMap, HashSet};
use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
use crate::bundle::FindingBundle;
use crate::project::Project;
use crate::repo;
pub const W_CLAIM_MATCH: f64 = 0.25;
pub const W_SCOPE: f64 = 0.20;
pub const W_EVIDENCE_FIDELITY: f64 = 0.20;
pub const W_CONTRADICTION_RECALL: f64 = 0.15;
pub const W_DOWNSTREAM_LINK: f64 = 0.10;
pub const W_DUPLICATE_INV: f64 = 0.10;
#[derive(Debug, Clone)]
pub struct BenchInput {
pub gold_path: PathBuf,
pub candidate_path: PathBuf,
pub sources: Option<PathBuf>,
pub threshold: f64,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MetricResult {
pub score: f64,
pub target: f64,
pub pass: bool,
pub note: String,
#[serde(default, skip_serializing_if = "is_false")]
pub vacuous: bool,
}
fn is_false(b: &bool) -> bool {
!*b
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct BenchReport {
pub gold_path: String,
pub candidate_path: String,
pub gold_findings: usize,
pub candidate_findings: usize,
pub matched_pairs: usize,
pub claim_match_rate: MetricResult,
pub scope_accuracy: MetricResult,
pub evidence_fidelity: Option<MetricResult>,
pub duplicate_rate: f64,
pub novelty_rate: f64,
pub contradiction_recall: MetricResult,
pub downstream_link_rate: MetricResult,
pub composite: f64,
pub threshold: f64,
pub pass: bool,
}
pub fn run(input: BenchInput) -> Result<BenchReport, String> {
let gold: Project = repo::load_from_path(&input.gold_path)
.map_err(|e| format!("load gold {}: {e}", input.gold_path.display()))?;
let candidate: Project = repo::load_from_path(&input.candidate_path)
.map_err(|e| format!("load candidate {}: {e}", input.candidate_path.display()))?;
let gold_findings = sorted_findings(&gold);
let candidate_findings = sorted_findings_with_proposals(&candidate);
let matches = match_findings(&gold_findings, &candidate_findings);
let claim_match_rate = score_claim_match(&gold_findings, &candidate_findings, &matches);
let scope_accuracy = score_scope(&gold_findings, &candidate_findings, &matches);
let evidence_fidelity = input
.sources
.as_ref()
.map(|src| score_evidence_fidelity(&candidate_findings, src));
let (duplicate_inv, duplicate_rate) = score_duplicates(&candidate_findings);
let novelty_rate = score_novelty(&candidate_findings, &matches);
let contradiction_recall = score_contradiction_recall(&gold_findings, &candidate_findings);
let downstream_link_rate = score_downstream_link(&gold_findings, &candidate_findings, &matches);
let composite = compute_composite(
&claim_match_rate,
&scope_accuracy,
evidence_fidelity.as_ref(),
duplicate_inv,
&contradiction_recall,
&downstream_link_rate,
);
Ok(BenchReport {
gold_path: input.gold_path.display().to_string(),
candidate_path: input.candidate_path.display().to_string(),
gold_findings: gold_findings.len(),
candidate_findings: candidate_findings.len(),
matched_pairs: matches.len(),
claim_match_rate,
scope_accuracy,
evidence_fidelity,
duplicate_rate,
novelty_rate,
contradiction_recall,
downstream_link_rate,
composite,
threshold: input.threshold,
pass: composite >= input.threshold,
})
}
fn sorted_findings(p: &Project) -> Vec<FindingBundle> {
let mut out = p.findings.clone();
out.sort_by(|a, b| a.id.cmp(&b.id));
out
}
fn sorted_findings_with_proposals(p: &Project) -> Vec<FindingBundle> {
let mut out = p.findings.clone();
let already: HashSet<String> = out.iter().map(|f| f.id.clone()).collect();
let mut seen = already.clone();
for proposal in &p.proposals {
if proposal.kind != "finding.add" {
continue;
}
let Some(payload_finding) = proposal.payload.get("finding") else {
continue;
};
let Ok(bundle) = serde_json::from_value::<FindingBundle>(payload_finding.clone()) else {
continue;
};
if seen.contains(&bundle.id) {
continue;
}
seen.insert(bundle.id.clone());
out.push(bundle);
}
out.sort_by(|a, b| a.id.cmp(&b.id));
out
}
fn match_findings(gold: &[FindingBundle], candidate: &[FindingBundle]) -> Vec<(usize, usize)> {
let mut used_g: HashSet<usize> = HashSet::new();
let mut used_c: HashSet<usize> = HashSet::new();
let mut pairs: Vec<(usize, usize, f64)> = Vec::new();
let g_by_id: HashMap<&str, usize> = gold
.iter()
.enumerate()
.map(|(i, f)| (f.id.as_str(), i))
.collect();
for (ci, cand) in candidate.iter().enumerate() {
if let Some(&gi) = g_by_id.get(cand.id.as_str()) {
pairs.push((gi, ci, 1.0));
used_g.insert(gi);
used_c.insert(ci);
}
}
let g_tokens: Vec<BTreeSet<String>> = gold
.iter()
.map(|f| tokenize_claim(&f.assertion.text))
.collect();
let c_tokens: Vec<BTreeSet<String>> = candidate
.iter()
.map(|f| tokenize_claim(&f.assertion.text))
.collect();
for (ci, c_set) in c_tokens.iter().enumerate() {
if used_c.contains(&ci) {
continue;
}
let mut best: Option<(usize, f64)> = None;
for (gi, g_set) in g_tokens.iter().enumerate() {
if used_g.contains(&gi) {
continue;
}
let j = jaccard(g_set, c_set);
if j >= 0.4 && best.map(|(_, b)| j > b).unwrap_or(true) {
best = Some((gi, j));
}
}
if let Some((gi, score)) = best {
pairs.push((gi, ci, score));
used_g.insert(gi);
used_c.insert(ci);
}
}
pairs.sort_by(|a, b| a.0.cmp(&b.0));
pairs.into_iter().map(|(g, c, _)| (g, c)).collect()
}
fn tokenize_claim(s: &str) -> BTreeSet<String> {
s.to_lowercase()
.split(|c: char| !c.is_alphanumeric())
.filter(|t| t.len() > 2)
.map(String::from)
.collect()
}
fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
let inter = a.intersection(b).count() as f64;
let union = a.union(b).count() as f64;
if union == 0.0 { 0.0 } else { inter / union }
}
fn score_claim_match(
gold: &[FindingBundle],
candidate: &[FindingBundle],
matches: &[(usize, usize)],
) -> MetricResult {
let g = gold.len();
let c = candidate.len();
let m = matches.len();
let denom = g + c;
let score = if denom == 0 {
0.0
} else {
(2.0 * m as f64) / denom as f64
};
let target = 0.70;
MetricResult {
score,
target,
pass: score >= target,
note: format!("F1 over claim-text match: 2·|M|/(|G|+|C|) = 2·{m}/({g}+{c})"),
vacuous: false,
}
}
fn score_scope(
gold: &[FindingBundle],
candidate: &[FindingBundle],
matches: &[(usize, usize)],
) -> MetricResult {
if matches.is_empty() {
return MetricResult {
score: 0.0,
target: 0.80,
pass: false,
note: "no matched pairs to evaluate scope on".to_string(),
vacuous: false,
};
}
let mut sum = 0.0_f64;
for &(gi, ci) in matches {
let g = &gold[gi];
let c = &candidate[ci];
let organism_eq = entity_eq_for_type(g, c, "organism");
let intervention_overlap = entity_overlap_for_type(g, c, "intervention");
sum += 0.5 * organism_eq + 0.5 * intervention_overlap;
}
let score = sum / matches.len() as f64;
MetricResult {
score,
target: 0.80,
pass: score >= 0.80,
note: "mean of (0.5·organism_eq + 0.5·intervention_overlap) over matched pairs".to_string(),
vacuous: false,
}
}
fn entity_eq_for_type(g: &FindingBundle, c: &FindingBundle, ent_type: &str) -> f64 {
let g_set: BTreeSet<String> = g
.assertion
.entities
.iter()
.filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
.map(|e| e.name.to_lowercase())
.collect();
let c_set: BTreeSet<String> = c
.assertion
.entities
.iter()
.filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
.map(|e| e.name.to_lowercase())
.collect();
if g_set.is_empty() && c_set.is_empty() {
return 1.0;
}
if g_set == c_set { 1.0 } else { 0.0 }
}
fn entity_overlap_for_type(g: &FindingBundle, c: &FindingBundle, ent_type: &str) -> f64 {
let g_set: BTreeSet<String> = g
.assertion
.entities
.iter()
.filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
.map(|e| e.name.to_lowercase())
.collect();
let c_set: BTreeSet<String> = c
.assertion
.entities
.iter()
.filter(|e| e.entity_type.eq_ignore_ascii_case(ent_type))
.map(|e| e.name.to_lowercase())
.collect();
jaccard(&g_set, &c_set)
}
fn score_evidence_fidelity(candidate: &[FindingBundle], sources: &Path) -> MetricResult {
let source_blobs = collect_source_blobs(sources);
if source_blobs.is_empty() {
return MetricResult {
score: 0.0,
target: 0.95,
pass: false,
note: format!(
"no readable source files under {} — cannot score fidelity",
sources.display()
),
vacuous: false,
};
}
let mut checked = 0;
let mut hit = 0;
for f in candidate {
for span in &f.evidence.evidence_spans {
let text = extract_span_text(span);
if text.is_empty() {
continue;
}
let needle = normalize_for_match(&text);
if needle.len() < 12 {
continue;
}
checked += 1;
if source_blobs.iter().any(|b| b.contains(&needle)) {
hit += 1;
}
}
}
let score = if checked == 0 {
0.0
} else {
hit as f64 / checked as f64
};
MetricResult {
score,
target: 0.95,
pass: score >= 0.95,
note: format!("{hit}/{checked} candidate evidence spans substring-match a source file"),
vacuous: false,
}
}
fn collect_source_blobs(root: &Path) -> Vec<String> {
let mut out = Vec::new();
let mut stack: Vec<PathBuf> = vec![root.to_path_buf()];
while let Some(dir) = stack.pop() {
let Ok(entries) = std::fs::read_dir(&dir) else {
continue;
};
for entry in entries.flatten() {
let path = entry.path();
let basename = path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default();
if basename.starts_with('.') {
continue;
}
let Ok(meta) = entry.metadata() else { continue };
if meta.is_dir() {
stack.push(path);
continue;
}
if let Ok(s) = std::fs::read_to_string(&path) {
out.push(normalize_for_match(&s));
}
}
}
out
}
fn extract_span_text(span: &serde_json::Value) -> String {
if let Some(s) = span.as_str() {
return s.to_string();
}
if let Some(s) = span.get("text").and_then(|v| v.as_str()) {
return s.to_string();
}
if let Some(s) = span.get("snippet").and_then(|v| v.as_str()) {
return s.to_string();
}
String::new()
}
fn normalize_for_match(s: &str) -> String {
s.to_lowercase()
.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
}
fn score_duplicates(candidate: &[FindingBundle]) -> (f64, f64) {
if candidate.is_empty() {
return (1.0, 0.0);
}
let unique: HashSet<&str> = candidate.iter().map(|f| f.id.as_str()).collect();
let dup_rate = 1.0 - (unique.len() as f64 / candidate.len() as f64);
(1.0 - dup_rate, dup_rate)
}
fn score_novelty(candidate: &[FindingBundle], matches: &[(usize, usize)]) -> f64 {
if candidate.is_empty() {
return 0.0;
}
let matched_c: HashSet<usize> = matches.iter().map(|&(_, ci)| ci).collect();
let novel = candidate.len() - matched_c.len();
novel as f64 / candidate.len() as f64
}
fn score_contradiction_recall(gold: &[FindingBundle], candidate: &[FindingBundle]) -> MetricResult {
let gold_contradictions = collect_contradiction_set(gold);
if gold_contradictions.is_empty() {
return MetricResult {
score: 1.0,
target: 0.60,
pass: true,
note: "no contradictions in gold — excluded from composite".to_string(),
vacuous: true,
};
}
let candidate_contradictions = collect_contradiction_set(candidate);
let detected = gold_contradictions
.iter()
.filter(|pair| candidate_contradictions.contains(*pair))
.count();
let score = detected as f64 / gold_contradictions.len() as f64;
MetricResult {
score,
target: 0.60,
pass: score >= 0.60,
note: format!(
"{detected}/{} gold contradictions detected by candidate",
gold_contradictions.len()
),
vacuous: false,
}
}
fn collect_contradiction_set(p: &[FindingBundle]) -> BTreeSet<(String, String)> {
let mut out = BTreeSet::new();
for f in p {
for link in &f.links {
let lt = link.link_type.to_lowercase();
if lt == "contradicts" || lt == "tension" || lt == "contests" {
let mut pair = [f.id.clone(), link.target.clone()];
pair.sort();
out.insert((pair[0].clone(), pair[1].clone()));
}
}
}
out
}
fn score_downstream_link(
gold: &[FindingBundle],
candidate: &[FindingBundle],
matches: &[(usize, usize)],
) -> MetricResult {
let matched_c: HashSet<usize> = matches.iter().map(|&(_, ci)| ci).collect();
let novel: Vec<&FindingBundle> = candidate
.iter()
.enumerate()
.filter(|(i, _)| !matched_c.contains(i))
.map(|(_, f)| f)
.collect();
if novel.is_empty() {
return MetricResult {
score: 1.0,
target: 0.75,
pass: true,
note: "no novel candidate findings — excluded from composite".to_string(),
vacuous: true,
};
}
let gold_ids: HashSet<&str> = gold.iter().map(|f| f.id.as_str()).collect();
let linked = novel
.iter()
.filter(|f| f.links.iter().any(|l| gold_ids.contains(l.target.as_str())))
.count();
let score = linked as f64 / novel.len() as f64;
MetricResult {
score,
target: 0.75,
pass: score >= 0.75,
note: format!(
"{linked}/{} novel candidate findings link to a gold finding",
novel.len()
),
vacuous: false,
}
}
fn compute_composite(
claim_match: &MetricResult,
scope: &MetricResult,
evidence_fidelity: Option<&MetricResult>,
duplicate_inv: f64,
contradiction_recall: &MetricResult,
downstream_link: &MetricResult,
) -> f64 {
let mut num = W_CLAIM_MATCH * claim_match.score + W_SCOPE * scope.score;
let mut denom = W_CLAIM_MATCH + W_SCOPE;
if let Some(ef) = evidence_fidelity
&& !ef.vacuous
{
num += W_EVIDENCE_FIDELITY * ef.score;
denom += W_EVIDENCE_FIDELITY;
}
if !contradiction_recall.vacuous {
num += W_CONTRADICTION_RECALL * contradiction_recall.score;
denom += W_CONTRADICTION_RECALL;
}
if !downstream_link.vacuous {
num += W_DOWNSTREAM_LINK * downstream_link.score;
denom += W_DOWNSTREAM_LINK;
}
num += W_DUPLICATE_INV * duplicate_inv;
denom += W_DUPLICATE_INV;
if denom == 0.0 { 0.0 } else { num / denom }
}
pub fn render_pretty(report: &BenchReport) -> String {
let mut out = String::new();
out.push_str(&format!(
" gold: {} ({} findings)\n",
report.gold_path, report.gold_findings
));
out.push_str(&format!(
" candidate: {} ({} findings)\n",
report.candidate_path, report.candidate_findings
));
out.push_str(&format!(
" matched pairs: {}\n",
report.matched_pairs
));
out.push_str(" ----\n");
pretty_metric(&mut out, "claim_match_rate ", &report.claim_match_rate);
pretty_metric(&mut out, "scope_accuracy ", &report.scope_accuracy);
if let Some(ef) = &report.evidence_fidelity {
pretty_metric(&mut out, "evidence_fidelity ", ef);
} else {
out.push_str(" evidence_fidelity (skipped — no --sources provided)\n");
}
out.push_str(&format!(
" duplicate_rate {:.3} (lower is better)\n",
report.duplicate_rate
));
out.push_str(&format!(
" novelty_rate {:.3} (informational)\n",
report.novelty_rate
));
pretty_metric(
&mut out,
"contradiction_recall",
&report.contradiction_recall,
);
pretty_metric(
&mut out,
"downstream_link_rate",
&report.downstream_link_rate,
);
out.push_str(" ----\n");
let no_overlap =
report.matched_pairs == 0 && report.gold_findings > 0 && report.candidate_findings > 0;
if no_overlap {
out.push_str(
" ⚠ no overlap detected: 0 matched pairs against a non-empty gold;\n composite reflects only the metrics with real data\n",
);
}
out.push_str(&format!(
" COMPOSITE {:.3} (threshold {:.2}, {})\n",
report.composite,
report.threshold,
if report.pass { "PASS" } else { "FAIL" }
));
out
}
fn pretty_metric(out: &mut String, label: &str, m: &MetricResult) {
let tag = if m.vacuous {
"n/a"
} else if m.pass {
"ok"
} else {
"low"
};
out.push_str(&format!(
" {label} {:.3} (target {:.2}, {tag})\n",
m.score, m.target,
));
}
#[cfg(test)]
mod tests {
use super::*;
use crate::bundle::{
Assertion, Conditions, Confidence, Evidence, Extraction, FindingBundle, Flags, Provenance,
};
fn finding(id: &str, claim: &str) -> FindingBundle {
FindingBundle {
id: id.to_string(),
version: 1,
previous_version: None,
assertion: Assertion {
text: claim.to_string(),
assertion_type: "mechanism".to_string(),
entities: Vec::new(),
relation: None,
direction: None,
causal_claim: None,
causal_evidence_grade: None,
},
evidence: Evidence {
evidence_type: "test".to_string(),
model_system: String::new(),
species: None,
method: "test".to_string(),
sample_size: None,
effect_size: None,
p_value: None,
replicated: false,
replication_count: None,
evidence_spans: Vec::new(),
},
conditions: Conditions {
text: String::new(),
species_verified: Vec::new(),
species_unverified: Vec::new(),
in_vitro: false,
in_vivo: false,
human_data: false,
clinical_trial: false,
concentration_range: None,
duration: None,
age_group: None,
cell_type: None,
},
confidence: Confidence::raw(0.5, "test", 0.7),
provenance: Provenance {
source_type: "test".to_string(),
doi: None,
pmid: None,
pmc: None,
openalex_id: None,
url: None,
title: "t".to_string(),
authors: Vec::new(),
year: None,
journal: None,
license: None,
publisher: None,
funders: Vec::new(),
extraction: Extraction {
method: "test".to_string(),
model: None,
model_version: None,
extracted_at: String::new(),
extractor_version: "test".to_string(),
},
review: None,
citation_count: None,
},
flags: Flags {
gap: false,
negative_space: false,
contested: false,
retracted: false,
declining: false,
gravity_well: false,
review_state: None,
superseded: false,
signature_threshold: None,
jointly_accepted: false,
},
links: Vec::new(),
annotations: Vec::new(),
attachments: Vec::new(),
created: String::new(),
updated: None,
access_tier: crate::access_tier::AccessTier::Public,
}
}
#[test]
fn jaccard_basics() {
let a: BTreeSet<String> = ["a", "b", "c"].iter().map(|s| s.to_string()).collect();
let b: BTreeSet<String> = ["b", "c", "d"].iter().map(|s| s.to_string()).collect();
assert!((jaccard(&a, &b) - (2.0 / 4.0)).abs() < f64::EPSILON);
}
#[test]
fn match_findings_id_first() {
let g = vec![finding("vf_1", "alpha increases beta in mouse")];
let c = vec![finding("vf_1", "totally different text")];
let m = match_findings(&g, &c);
assert_eq!(m, vec![(0, 0)]);
}
#[test]
fn match_findings_jaccard_fallback() {
let g = vec![finding(
"vf_g1",
"Focused ultrasound increases BBB permeability in mouse models",
)];
let c = vec![finding(
"vf_c1",
"Focused ultrasound transiently opens BBB permeability across mouse models",
)];
let m = match_findings(&g, &c);
assert_eq!(m.len(), 1);
}
#[test]
fn claim_match_rate_at_full_overlap() {
let g = vec![
finding("vf_g1", "alpha increases beta in mouse"),
finding("vf_g2", "gamma decreases delta in human"),
];
let c = g.clone();
let m = match_findings(&g, &c);
let r = score_claim_match(&g, &c, &m);
assert!((r.score - 1.0).abs() < f64::EPSILON);
}
#[test]
fn duplicate_rate_zero_for_unique() {
let c = vec![finding("vf_a", "x"), finding("vf_b", "y")];
let (inv, dup) = score_duplicates(&c);
assert!((dup - 0.0).abs() < f64::EPSILON);
assert!((inv - 1.0).abs() < f64::EPSILON);
}
#[test]
fn empty_candidate_zero_composite() {
let g = vec![finding("vf_g1", "a b c d e")];
let c: Vec<FindingBundle> = Vec::new();
let m = match_findings(&g, &c);
let cm = score_claim_match(&g, &c, &m);
assert_eq!(cm.score, 0.0);
}
}