vela_protocol/
benchmark.rs

1//! Benchmark extraction quality against a gold standard.
2
3use std::collections::{HashMap, HashSet};
4use std::path::{Path, PathBuf};
5
6use colored::Colorize;
7use serde::{Deserialize, Serialize};
8use serde_json::json;
9use sha2::{Digest, Sha256};
10
11use crate::bundle::{Entity, FindingBundle};
12use crate::cli_style as style;
13use crate::project;
14use crate::repo;
15
/// A single gold-standard finding.
///
/// Deserialized from the gold-standard JSON file and compared against
/// frontier findings by [`benchmark`].
// NOTE(review): `notes` is never read in this part of the file, which is
// presumably why `dead_code` is allowed here — confirm before removing.
#[allow(dead_code)]
#[derive(Debug, Clone, Deserialize)]
pub struct GoldFinding {
    /// Optional identifier. When present and equal to a frontier finding's
    /// id, the pair is treated as an exact-ID match.
    #[serde(default)]
    pub id: Option<String>,
    /// Assertion text, compared to the frontier assertion text by word-level
    /// Jaccard similarity.
    pub assertion_text: String,
    /// Expected assertion type, compared for equality with the frontier value.
    pub assertion_type: String,
    /// Expected entity names; normalized before being compared with the
    /// frontier finding's entity names.
    pub entities: Vec<String>,
    /// Inclusive range the frontier confidence score must fall into.
    pub confidence_range: ConfidenceRange,
    /// Free-form annotation; not used by the scoring code in this section.
    #[serde(default)]
    pub notes: Option<String>,
}
29
/// Acceptable confidence interval for a gold finding.
///
/// [`benchmark`] treats both bounds as inclusive.
#[derive(Debug, Clone, Deserialize)]
pub struct ConfidenceRange {
    /// Lower bound (inclusive).
    pub low: f64,
    /// Upper bound (inclusive).
    pub high: f64,
}
35
/// Full benchmark report.
///
/// Produced by [`benchmark`]; all ratio fields are rounded to three decimals.
#[derive(Debug, Serialize)]
pub struct BenchmarkReport {
    /// Number of frontier findings that were scored.
    pub total_frontier_findings: usize,
    /// Number of gold findings that were scored.
    pub total_gold_findings: usize,
    /// Number of gold findings paired with a frontier finding.
    pub matched: usize,
    /// Number of frontier findings paired with a gold finding.
    pub total_frontier_matched: usize,
    /// Gold findings left without a partner.
    pub unmatched_gold: usize,
    /// Frontier findings left without a partner.
    pub unmatched_frontier: usize,
    /// Matched pairs whose ids were equal.
    pub exact_id_matches: usize,
    /// Matched frontier findings divided by all frontier findings
    /// (0.0 when there are no frontier findings).
    pub precision: f64,
    /// Matched gold findings divided by all gold findings
    /// (0.0 when there are no gold findings).
    pub recall: f64,
    /// Harmonic mean of precision and recall (0.0 when both are zero).
    pub f1: f64,
    /// Mean entity overlap across matched pairs (0.0 when nothing matched).
    pub entity_accuracy: f64,
    /// Fraction of matched pairs whose assertion types are equal.
    pub assertion_type_accuracy: f64,
    /// Fraction of matched pairs whose frontier confidence lies inside the
    /// gold confidence range.
    pub confidence_calibration: f64,
    /// One entry per matched pair, in the order the pairs were accepted.
    pub match_details: Vec<MatchDetail>,
}
54
/// Details of one accepted gold/frontier pairing.
#[derive(Debug, Serialize)]
pub struct MatchDetail {
    /// Id of the gold finding, if it has one.
    pub gold_id: Option<String>,
    /// Id of the paired frontier finding.
    pub frontier_id: String,
    /// Gold assertion text, truncated for display.
    pub gold_text: String,
    /// Frontier assertion text, truncated for display.
    pub frontier_text: String,
    /// Word-level Jaccard similarity of the two texts, rounded to 3 decimals.
    pub similarity: f64,
    /// Fraction of gold entities also present in the frontier finding
    /// (1.0 when the gold finding lists no entities), rounded to 3 decimals.
    pub entity_overlap: f64,
    /// Whether both findings carry the same assertion type.
    pub assertion_type_match: bool,
    /// Whether the frontier confidence lies inside the gold range.
    pub confidence_in_range: bool,
    /// Whether the pair was linked by equal ids.
    pub exact_id_match: bool,
}
67
68pub fn run(frontier_path: &Path, gold_path: &Path, json_output: bool) {
69    let frontier = repo::load_from_path(frontier_path).expect("Failed to load frontier");
70
71    let gold_data = std::fs::read_to_string(gold_path).expect("Failed to read gold standard file");
72    let gold: Vec<GoldFinding> =
73        serde_json::from_str(&gold_data).expect("Failed to parse gold standard JSON");
74
75    let report = benchmark(&frontier.findings, &gold);
76
77    if json_output {
78        let json = serde_json::to_string_pretty(&report).unwrap();
79        println!("{json}");
80    } else {
81        print_report(&report);
82    }
83}
84
85pub fn benchmark(findings: &[FindingBundle], gold: &[GoldFinding]) -> BenchmarkReport {
86    let mut match_details = Vec::new();
87    let mut gold_matched = vec![false; gold.len()];
88    let mut frontier_matched = vec![false; findings.len()];
89    let mut candidates = Vec::new();
90
91    for (gi, g) in gold.iter().enumerate() {
92        for (fi, f) in findings.iter().enumerate() {
93            let sim = jaccard_similarity(&g.assertion_text, &f.assertion.text);
94            let exact_id = g.id.as_deref().is_some_and(|id| id == f.id);
95            if exact_id || sim >= 0.2 {
96                candidates.push(FindingCandidate {
97                    gold_idx: gi,
98                    frontier_idx: fi,
99                    similarity: sim,
100                    exact_id,
101                    assertion_type_match: g.assertion_type == f.assertion.assertion_type,
102                });
103            }
104        }
105    }
106
107    candidates.sort_by(|a, b| {
108        b.exact_id
109            .cmp(&a.exact_id)
110            .then_with(|| {
111                b.similarity
112                    .partial_cmp(&a.similarity)
113                    .unwrap_or(std::cmp::Ordering::Equal)
114            })
115            .then_with(|| b.assertion_type_match.cmp(&a.assertion_type_match))
116            .then_with(|| a.gold_idx.cmp(&b.gold_idx))
117            .then_with(|| a.frontier_idx.cmp(&b.frontier_idx))
118    });
119
120    for candidate in candidates {
121        let gi = candidate.gold_idx;
122        let fi = candidate.frontier_idx;
123        if gold_matched[gi] || frontier_matched[fi] {
124            continue;
125        }
126
127        gold_matched[gi] = true;
128        frontier_matched[fi] = true;
129
130        let g = &gold[gi];
131        let f = &findings[fi];
132
133        let gold_entities: HashSet<String> =
134            g.entities.iter().map(|e| normalize_token(e)).collect();
135        let frontier_entities: HashSet<String> = f
136            .assertion
137            .entities
138            .iter()
139            .map(|e| normalize_token(&e.name))
140            .collect();
141        let entity_overlap = if gold_entities.is_empty() {
142            1.0
143        } else {
144            let matches = gold_entities
145                .iter()
146                .filter(|e| frontier_entities.contains(*e))
147                .count();
148            matches as f64 / gold_entities.len() as f64
149        };
150
151        let in_range = f.confidence.score >= g.confidence_range.low
152            && f.confidence.score <= g.confidence_range.high;
153
154        match_details.push(MatchDetail {
155            gold_id: g.id.clone(),
156            frontier_id: f.id.clone(),
157            gold_text: truncate(&g.assertion_text, 80),
158            frontier_text: truncate(&f.assertion.text, 80),
159            similarity: round3(candidate.similarity),
160            entity_overlap: round3(entity_overlap),
161            assertion_type_match: candidate.assertion_type_match,
162            confidence_in_range: in_range,
163            exact_id_match: candidate.exact_id,
164        });
165    }
166
167    let matched = gold_matched.iter().filter(|&&m| m).count();
168    let frontier_matched_count = frontier_matched.iter().filter(|&&m| m).count();
169    let exact_id_matches = match_details.iter().filter(|d| d.exact_id_match).count();
170
171    let precision = if findings.is_empty() {
172        0.0
173    } else {
174        frontier_matched_count as f64 / findings.len() as f64
175    };
176    let recall = if gold.is_empty() {
177        0.0
178    } else {
179        matched as f64 / gold.len() as f64
180    };
181    let f1 = if precision + recall == 0.0 {
182        0.0
183    } else {
184        2.0 * precision * recall / (precision + recall)
185    };
186
187    let entity_accuracy = if match_details.is_empty() {
188        0.0
189    } else {
190        match_details.iter().map(|d| d.entity_overlap).sum::<f64>() / match_details.len() as f64
191    };
192
193    let confidence_calibration = if match_details.is_empty() {
194        0.0
195    } else {
196        match_details
197            .iter()
198            .filter(|d| d.confidence_in_range)
199            .count() as f64
200            / match_details.len() as f64
201    };
202    let assertion_type_accuracy = if match_details.is_empty() {
203        0.0
204    } else {
205        match_details
206            .iter()
207            .filter(|d| d.assertion_type_match)
208            .count() as f64
209            / match_details.len() as f64
210    };
211
212    BenchmarkReport {
213        total_frontier_findings: findings.len(),
214        total_gold_findings: gold.len(),
215        matched,
216        total_frontier_matched: frontier_matched_count,
217        unmatched_gold: gold.len().saturating_sub(matched),
218        unmatched_frontier: findings.len().saturating_sub(frontier_matched_count),
219        exact_id_matches,
220        precision: round3(precision),
221        recall: round3(recall),
222        f1: round3(f1),
223        entity_accuracy: round3(entity_accuracy),
224        assertion_type_accuracy: round3(assertion_type_accuracy),
225        confidence_calibration: round3(confidence_calibration),
226        match_details,
227    }
228}
229
/// A gold/frontier pair that is eligible to be matched by [`benchmark`].
struct FindingCandidate {
    /// Index of the gold finding in the `gold` slice.
    gold_idx: usize,
    /// Index of the frontier finding in the `findings` slice.
    frontier_idx: usize,
    /// Word-level Jaccard similarity of the two assertion texts.
    similarity: f64,
    /// Whether the gold id is present and equal to the frontier id.
    exact_id: bool,
    /// Whether both findings carry the same assertion type.
    assertion_type_match: bool,
}
237
/// Rounds `v` to three decimal places (half away from zero, as `f64::round`).
fn round3(v: f64) -> f64 {
    const SCALE: f64 = 1000.0;
    let scaled = v * SCALE;
    scaled.round() / SCALE
}
241
/// Canonicalizes an entity name for comparison.
///
/// Surrounding whitespace is removed, the text is lowercased, and the Greek
/// letters `β` and `α` are spelled out as `beta` and `alpha`.
fn normalize_token(value: &str) -> String {
    let lowered = value.trim().to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        match ch {
            'β' => normalized.push_str("beta"),
            'α' => normalized.push_str("alpha"),
            other => normalized.push(other),
        }
    }
    normalized
}
249
/// Shortens `s` to at most `max` bytes for display.
///
/// Strings that already fit are returned unchanged. Longer strings are cut at
/// the last character boundary at or before byte `max` and get `...` appended.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    // Index 0 is always a boundary, so the search cannot come back empty.
    let cut = (0..=max)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    let mut shortened = String::with_capacity(cut + 3);
    shortened.push_str(&s[..cut]);
    shortened.push_str("...");
    shortened
}
261
/// Word-level Jaccard similarity of two strings.
///
/// Each string is split on whitespace; leading and trailing non-alphanumeric
/// characters are stripped from every word and empty words are discarded. The
/// comparison is case-sensitive. Returns `|A ∩ B| / |A ∪ B|`, or 1.0 when
/// neither string contains any word.
fn jaccard_similarity(a: &str, b: &str) -> f64 {
    /// Distinct words of `text` with surrounding punctuation removed.
    fn word_set(text: &str) -> HashSet<&str> {
        text.split_whitespace()
            .map(|word| word.trim_matches(|c: char| !c.is_alphanumeric()))
            .filter(|word| !word.is_empty())
            .collect()
    }

    let left = word_set(a);
    let right = word_set(b);

    if left.is_empty() && right.is_empty() {
        return 1.0;
    }

    // At least one set is non-empty here, so the union is never zero.
    let shared = left.intersection(&right).count();
    let combined = left.len() + right.len() - shared;
    shared as f64 / combined as f64
}
288
289pub fn print_report(report: &BenchmarkReport) {
290    println!();
291    println!("  {}", "VELA · BENCHMARK REPORT".dimmed());
292    println!("  {}", style::tick_row(60));
293    println!("  project findings: {}", report.total_frontier_findings);
294    println!("  gold findings:    {}", report.total_gold_findings);
295    println!("  matched:          {}", report.matched);
296    println!();
297    println!("  precision:        {:.1}%", report.precision * 100.0);
298    println!("  recall:           {:.1}%", report.recall * 100.0);
299    println!("  f1:               {:.1}%", report.f1 * 100.0);
300    println!();
301    println!(
302        "  entity accuracy:       {:.1}%",
303        report.entity_accuracy * 100.0
304    );
305    println!(
306        "  confidence calibration: {:.1}%",
307        report.confidence_calibration * 100.0
308    );
309
310    if !report.match_details.is_empty() {
311        println!();
312        println!("  {}", "MATCH DETAILS".dimmed());
313        println!("  {}", style::tick_row(110));
314        for d in &report.match_details {
315            let cal = if d.confidence_in_range {
316                style::ok("ok")
317            } else {
318                style::lost("miss")
319            };
320            println!(
321                "  sim:{:.2} ent:{:.2} conf:{} · {} · {}",
322                d.similarity, d.entity_overlap, cal, d.gold_text, d.frontier_text
323            );
324        }
325    }
326
327    println!();
328    println!("  {}", style::tick_row(60));
329    println!();
330}
331
332// ---------------------------------------------------------------------------
333// Entity resolution benchmark
334// ---------------------------------------------------------------------------
335
/// A single gold-standard entity for resolution benchmarking.
///
/// Frontier entities are looked up by lowercased name, so the name match is
/// case-insensitive.
#[derive(Debug, Clone, Deserialize)]
pub struct GoldEntity {
    /// Entity name as it appears in findings.
    pub name: String,
    /// Entity type (gene, protein, compound, disease, pathway, other).
    /// Read from the JSON key `type`.
    #[serde(rename = "type")]
    pub entity_type: String,
    /// Expected database source (uniprot, mesh, pubchem, chebi, etc.).
    /// When this and `expected_id` are both empty, the ID check falls back to
    /// comparing entity types.
    pub expected_source: String,
    /// Expected canonical ID in that database.
    pub expected_id: String,
    /// Minimum acceptable resolution confidence.
    pub expected_confidence: f64,
    /// Alternative acceptable IDs.
    #[serde(default)]
    pub alternatives: Vec<AlternativeId>,
}
354
/// An additional source/id pair accepted as a correct resolution for a
/// [`GoldEntity`].
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct AlternativeId {
    /// Database source of the alternative identifier.
    pub source: String,
    /// Identifier within that source.
    pub id: String,
}
360
/// Per-entity match result.
#[derive(Debug, Serialize)]
pub struct EntityMatchDetail {
    /// Gold entity name.
    pub name: String,
    /// Gold entity type.
    pub entity_type: String,
    /// Gold expected database source.
    pub expected_source: String,
    /// Gold expected canonical ID.
    pub expected_id: String,
    /// Type of the frontier entity selected for this name; `None` when no
    /// frontier entity carries the name.
    pub found_type: Option<String>,
    /// Source of the selected entity's canonical ID, if it has one.
    pub resolved_source: Option<String>,
    /// Canonical ID of the selected entity, if it has one.
    pub resolved_id: Option<String>,
    /// Confidence of the canonical ID, or the entity's own resolution
    /// confidence when it has no canonical ID; 0.0 when nothing was found.
    /// Rounded to three decimals.
    pub resolved_confidence: f64,
    /// Whether the selected entity's type equals the gold type.
    pub type_match: bool,
    /// Whether the selected entity satisfies the gold ID check.
    pub id_match: bool,
    /// Whether the resolution confidence reaches the gold minimum.
    pub confidence_ok: bool,
}
376
/// Per-type breakdown.
///
/// Aggregates the entity results for one gold entity type.
#[derive(Debug, Serialize)]
pub struct TypeBreakdown {
    /// Gold entity type this row describes.
    pub entity_type: String,
    /// Gold entities of this type.
    pub total: usize,
    /// Of those, how many were found in the frontier by name.
    pub found: usize,
    /// Of those, how many passed the ID check.
    pub id_correct: usize,
    /// Of those, how many met the minimum confidence.
    pub confidence_ok: usize,
    /// `id_correct / found` (0.0 when nothing was found), rounded to 3 decimals.
    pub precision: f64,
    /// `id_correct / total`, rounded to 3 decimals.
    pub recall: f64,
    /// Harmonic mean of precision and recall, rounded to 3 decimals.
    pub f1: f64,
}
389
/// Full entity resolution benchmark report.
///
/// Produced by [`entity_benchmark`]; ratio fields are rounded to 3 decimals.
#[derive(Debug, Serialize)]
pub struct EntityBenchmarkReport {
    /// Number of gold entities evaluated.
    pub total_gold_entities: usize,
    /// Gold entities for which a frontier entity with the same name exists.
    pub found_in_frontier: usize,
    /// Gold entities whose selected frontier entity has the expected type.
    pub type_correct: usize,
    /// Gold entities whose selected frontier entity passed the ID check.
    pub id_correct: usize,
    /// Gold entities whose selected frontier entity met the minimum confidence.
    pub confidence_ok: usize,
    /// `id_correct / found_in_frontier` (0.0 when nothing was found).
    pub precision: f64,
    /// `id_correct / total_gold_entities` (0.0 when there are no gold entities).
    pub recall: f64,
    /// Harmonic mean of precision and recall (0.0 when both are zero).
    pub f1: f64,
    /// `type_correct / total_gold_entities`.
    pub type_accuracy: f64,
    /// Per-type rows, sorted by entity type.
    pub by_type: Vec<TypeBreakdown>,
    /// One entry per gold entity, in gold order.
    pub details: Vec<EntityMatchDetail>,
}
405
406pub fn run_entity_benchmark(frontier_path: &Path, gold_path: &Path, json_output: bool) {
407    let frontier = repo::load_from_path(frontier_path).expect("Failed to load frontier");
408
409    let gold_data =
410        std::fs::read_to_string(gold_path).expect("Failed to read entity gold standard file");
411    let gold: Vec<GoldEntity> =
412        serde_json::from_str(&gold_data).expect("Failed to parse entity gold standard JSON");
413
414    let report = entity_benchmark(&frontier.findings, &gold);
415
416    if json_output {
417        let json = serde_json::to_string_pretty(&report).unwrap();
418        println!("{json}");
419    } else {
420        print_entity_report(&report);
421    }
422}
423
424/// Collect all entities from findings into a lookup keyed by lowercase name.
425fn collect_entities(findings: &[FindingBundle]) -> HashMap<String, Vec<&Entity>> {
426    let mut map: HashMap<String, Vec<&Entity>> = HashMap::new();
427    for f in findings {
428        for ent in &f.assertion.entities {
429            map.entry(ent.name.to_lowercase()).or_default().push(ent);
430        }
431    }
432    map
433}
434
435/// Check whether a resolved entity matches the gold entry (ID match).
436fn id_matches(entity: &Entity, gold: &GoldEntity) -> bool {
437    if gold.expected_source.is_empty() && gold.expected_id.is_empty() {
438        return entity.entity_type == gold.entity_type;
439    }
440    if let Some(ref cid) = entity.canonical_id {
441        // Primary match.
442        if cid.source == gold.expected_source && cid.id == gold.expected_id {
443            return true;
444        }
445        // Check alternatives.
446        for alt in &gold.alternatives {
447            if cid.source == alt.source && cid.id == alt.id {
448                return true;
449            }
450        }
451    }
452    false
453}
454
455/// Check whether confidence meets the minimum threshold.
456fn confidence_ok(entity: &Entity, gold: &GoldEntity) -> bool {
457    if let Some(ref cid) = entity.canonical_id {
458        cid.confidence >= gold.expected_confidence
459    } else {
460        entity.resolution_confidence >= gold.expected_confidence
461    }
462}
463
464pub fn entity_benchmark(findings: &[FindingBundle], gold: &[GoldEntity]) -> EntityBenchmarkReport {
465    let entity_map = collect_entities(findings);
466    let mut details = Vec::new();
467
468    for g in gold {
469        let key = g.name.to_lowercase();
470        let entities = entity_map.get(&key);
471
472        let (found_type, resolved_source, resolved_id, resolved_conf, type_match, matched, conf_ok) =
473            if let Some(ents) = entities {
474                let best = ents.iter().max_by(|a, b| {
475                    entity_rank(a, g)
476                        .cmp(&entity_rank(b, g))
477                        .then_with(|| a.name.cmp(&b.name))
478                });
479
480                if let Some(ent) = best {
481                    (
482                        Some(ent.entity_type.clone()),
483                        ent.canonical_id.as_ref().map(|cid| cid.source.clone()),
484                        ent.canonical_id.as_ref().map(|cid| cid.id.clone()),
485                        entity_resolution_confidence(ent),
486                        ent.entity_type == g.entity_type,
487                        id_matches(ent, g),
488                        confidence_ok(ent, g),
489                    )
490                } else {
491                    (None, None, None, 0.0, false, false, false)
492                }
493            } else {
494                (None, None, None, 0.0, false, false, false)
495            };
496
497        details.push(EntityMatchDetail {
498            name: g.name.clone(),
499            entity_type: g.entity_type.clone(),
500            expected_source: g.expected_source.clone(),
501            expected_id: g.expected_id.clone(),
502            found_type,
503            resolved_source,
504            resolved_id,
505            resolved_confidence: round3(resolved_conf),
506            type_match,
507            id_match: matched,
508            confidence_ok: conf_ok,
509        });
510    }
511
512    let total = gold.len();
513    let found = details.iter().filter(|d| d.found_type.is_some()).count();
514    let type_correct = details.iter().filter(|d| d.type_match).count();
515    let id_correct = details.iter().filter(|d| d.id_match).count();
516    let conf_ok_count = details.iter().filter(|d| d.confidence_ok).count();
517
518    // Precision = correct / found, Recall = correct / total.
519    let precision = if found == 0 {
520        0.0
521    } else {
522        id_correct as f64 / found as f64
523    };
524    let recall = if total == 0 {
525        0.0
526    } else {
527        id_correct as f64 / total as f64
528    };
529    let f1 = if precision + recall == 0.0 {
530        0.0
531    } else {
532        2.0 * precision * recall / (precision + recall)
533    };
534    let type_accuracy = if total == 0 {
535        0.0
536    } else {
537        type_correct as f64 / total as f64
538    };
539
540    // Per-type breakdown.
541    let mut type_groups: HashMap<String, Vec<&EntityMatchDetail>> = HashMap::new();
542    for d in &details {
543        type_groups
544            .entry(d.entity_type.clone())
545            .or_default()
546            .push(d);
547    }
548
549    let mut by_type: Vec<TypeBreakdown> = type_groups
550        .into_iter()
551        .map(|(etype, ds)| {
552            let t = ds.len();
553            let f = ds.iter().filter(|d| d.found_type.is_some()).count();
554            let c = ds.iter().filter(|d| d.id_match).count();
555            let co = ds.iter().filter(|d| d.confidence_ok).count();
556            let p = if f == 0 { 0.0 } else { c as f64 / f as f64 };
557            let r = if t == 0 { 0.0 } else { c as f64 / t as f64 };
558            let f1t = if p + r == 0.0 {
559                0.0
560            } else {
561                2.0 * p * r / (p + r)
562            };
563            TypeBreakdown {
564                entity_type: etype,
565                total: t,
566                found: f,
567                id_correct: c,
568                confidence_ok: co,
569                precision: round3(p),
570                recall: round3(r),
571                f1: round3(f1t),
572            }
573        })
574        .collect();
575    by_type.sort_by(|a, b| a.entity_type.cmp(&b.entity_type));
576
577    EntityBenchmarkReport {
578        total_gold_entities: total,
579        found_in_frontier: found,
580        type_correct,
581        id_correct,
582        confidence_ok: conf_ok_count,
583        precision: round3(precision),
584        recall: round3(recall),
585        f1: round3(f1),
586        type_accuracy: round3(type_accuracy),
587        by_type,
588        details,
589    }
590}
591
592fn entity_rank(entity: &Entity, gold: &GoldEntity) -> (u8, u8, u32) {
593    (
594        u8::from(entity.entity_type == gold.entity_type),
595        u8::from(entity.canonical_id.is_some()),
596        (entity_resolution_confidence(entity) * 1000.0).round() as u32,
597    )
598}
599
600fn entity_resolution_confidence(entity: &Entity) -> f64 {
601    entity
602        .canonical_id
603        .as_ref()
604        .map(|cid| cid.confidence)
605        .unwrap_or(entity.resolution_confidence)
606}
607
608fn print_entity_report(report: &EntityBenchmarkReport) {
609    println!();
610    println!("  {}", "VELA · ENTITY RESOLUTION BENCHMARK".dimmed());
611    println!("  {}", style::tick_row(60));
612    println!("  gold entities:      {}", report.total_gold_entities);
613    println!("  found in frontier:  {}", report.found_in_frontier);
614    println!("  type correct:       {}", report.type_correct);
615    println!("  id correct:         {}", report.id_correct);
616    println!("  confidence ok:      {}", report.confidence_ok);
617    println!();
618    println!("  precision:  {:.1}%", report.precision * 100.0);
619    println!("  recall:     {:.1}%", report.recall * 100.0);
620    println!("  f1:         {:.1}%", report.f1 * 100.0);
621    println!("  type accuracy: {:.1}%", report.type_accuracy * 100.0);
622    println!();
623    println!("  {}", "BY TYPE".dimmed());
624    println!(
625        "  {}",
626        format!(
627            "{:<12} {:>5} {:>5} {:>7} {:>8} {:>6} {:>6}",
628            "type", "total", "found", "correct", "conf_ok", "prec", "f1"
629        )
630        .dimmed()
631    );
632    for t in &report.by_type {
633        println!(
634            "  {:<12} {:>5} {:>5} {:>7} {:>8} {:>5.1}% {:>5.1}%",
635            t.entity_type,
636            t.total,
637            t.found,
638            t.id_correct,
639            t.confidence_ok,
640            t.precision * 100.0,
641            t.f1 * 100.0,
642        );
643    }
644
645    // Show mismatches.
646    let mismatches: Vec<_> = report.details.iter().filter(|d| !d.id_match).collect();
647    if !mismatches.is_empty() {
648        println!();
649        println!(
650            "  {}",
651            format!("MISMATCHES ({})", mismatches.len()).dimmed()
652        );
653        println!("  {}", style::tick_row(60));
654        for d in &mismatches {
655            let resolved = match (&d.resolved_source, &d.resolved_id) {
656                (Some(s), Some(id)) => format!("{s}:{id}"),
657                _ => d
658                    .found_type
659                    .clone()
660                    .unwrap_or_else(|| "missing".to_string()),
661            };
662            println!(
663                "  {} ({}) expected {}:{} got {}",
664                d.name, d.entity_type, d.expected_source, d.expected_id, resolved
665            );
666        }
667    }
668
669    println!();
670    println!("  {}", style::tick_row(60));
671    println!();
672}
673
674// ---------------------------------------------------------------------------
675// Link benchmark
676// ---------------------------------------------------------------------------
677
/// A single gold-standard link for link benchmarking.
#[derive(Debug, Clone, Deserialize)]
pub struct GoldLink {
    /// Id of the finding that is expected to own the link.
    pub source_id: String,
    /// Expected link target, compared with the frontier link's `target`.
    pub target_id: String,
    /// Expected link type.
    pub link_type: String,
    /// Free-form annotation; defaults to empty and is not used by the
    /// scoring code in this section.
    #[serde(default)]
    pub note: String,
}
687
/// Per-link match result.
#[derive(Debug, Serialize)]
pub struct LinkMatchDetail {
    /// Gold source finding id.
    pub source_id: String,
    /// Gold target id.
    pub target_id: String,
    /// Link type the gold standard expects.
    pub expected_type: String,
    /// Whether the frontier has any link for this source/target pair.
    pub found: bool,
    /// A link type recorded in the frontier for this pair, if any.
    pub found_type: Option<String>,
    /// Whether the expected type is among the frontier's types for this pair.
    pub type_correct: bool,
}
698
/// Per-type breakdown for links.
///
/// Aggregates the link results for one expected link type.
#[derive(Debug, Serialize)]
pub struct LinkTypeBreakdown {
    /// Expected link type this row describes.
    pub link_type: String,
    /// Gold links of this type.
    pub total: usize,
    /// Of those, how many source/target pairs exist in the frontier.
    pub found: usize,
    /// Of those, how many also carry the expected type.
    pub type_correct: usize,
    /// `type_correct / found` (0.0 when nothing was found), rounded to 3 decimals.
    pub precision: f64,
    /// `type_correct / total`, rounded to 3 decimals.
    pub recall: f64,
    /// Harmonic mean of precision and recall, rounded to 3 decimals.
    pub f1: f64,
}
710
/// Full link benchmark report.
///
/// Produced by [`link_benchmark`]; ratio fields are rounded to 3 decimals.
#[derive(Debug, Serialize)]
pub struct LinkBenchmarkReport {
    /// Number of gold links evaluated.
    pub total_gold_links: usize,
    /// Total number of links across all frontier findings.
    pub total_frontier_links: usize,
    /// Gold links whose source/target pair exists in the frontier.
    pub found: usize,
    /// Gold links whose pair exists with the expected type.
    pub type_correct: usize,
    /// `type_correct / found` (0.0 when nothing was found).
    pub precision: f64,
    /// `type_correct / total_gold_links` (0.0 when there are no gold links).
    pub recall: f64,
    /// Harmonic mean of precision and recall (0.0 when both are zero).
    pub f1: f64,
    /// Per-type rows, sorted by link type.
    pub by_type: Vec<LinkTypeBreakdown>,
    /// One entry per gold link, in gold order.
    pub details: Vec<LinkMatchDetail>,
}
724
725pub fn run_link_benchmark(frontier_path: &Path, gold_path: &Path, json_output: bool) {
726    let frontier = repo::load_from_path(frontier_path).expect("Failed to load frontier");
727
728    let gold_data =
729        std::fs::read_to_string(gold_path).expect("Failed to read link gold standard file");
730    let gold: Vec<GoldLink> =
731        serde_json::from_str(&gold_data).expect("Failed to parse link gold standard JSON");
732
733    let report = link_benchmark(&frontier.findings, &gold);
734
735    if json_output {
736        let json = serde_json::to_string_pretty(&report).unwrap();
737        println!("{json}");
738    } else {
739        print_link_report(&report);
740    }
741}
742
743/// Build a lookup: (source_id, target_id) -> list of link types.
744fn collect_links(findings: &[FindingBundle]) -> HashMap<(String, String), Vec<String>> {
745    let mut map: HashMap<(String, String), Vec<String>> = HashMap::new();
746    for f in findings {
747        for link in &f.links {
748            map.entry((f.id.clone(), link.target.clone()))
749                .or_default()
750                .push(link.link_type.clone());
751        }
752    }
753    map
754}
755
756pub fn link_benchmark(findings: &[FindingBundle], gold: &[GoldLink]) -> LinkBenchmarkReport {
757    let link_map = collect_links(findings);
758    let total_frontier_links: usize = findings.iter().map(|f| f.links.len()).sum();
759    let mut details = Vec::new();
760
761    for g in gold {
762        let key = (g.source_id.clone(), g.target_id.clone());
763        let types = link_map.get(&key);
764
765        let (found, found_type, type_correct) = if let Some(ts) = types {
766            let correct = ts.contains(&g.link_type);
767            (true, Some(ts[0].clone()), correct)
768        } else {
769            (false, None, false)
770        };
771
772        details.push(LinkMatchDetail {
773            source_id: g.source_id.clone(),
774            target_id: g.target_id.clone(),
775            expected_type: g.link_type.clone(),
776            found,
777            found_type,
778            type_correct,
779        });
780    }
781
782    let total = gold.len();
783    let found_count = details.iter().filter(|d| d.found).count();
784    let type_correct_count = details.iter().filter(|d| d.type_correct).count();
785
786    // Precision = type_correct / found, Recall = type_correct / total gold.
787    let precision = if found_count == 0 {
788        0.0
789    } else {
790        type_correct_count as f64 / found_count as f64
791    };
792    let recall = if total == 0 {
793        0.0
794    } else {
795        type_correct_count as f64 / total as f64
796    };
797    let f1 = if precision + recall == 0.0 {
798        0.0
799    } else {
800        2.0 * precision * recall / (precision + recall)
801    };
802
803    // Per-type breakdown.
804    let mut type_groups: HashMap<String, Vec<&LinkMatchDetail>> = HashMap::new();
805    for d in &details {
806        type_groups
807            .entry(d.expected_type.clone())
808            .or_default()
809            .push(d);
810    }
811
812    let mut by_type: Vec<LinkTypeBreakdown> = type_groups
813        .into_iter()
814        .map(|(lt, ds)| {
815            let t = ds.len();
816            let f = ds.iter().filter(|d| d.found).count();
817            let c = ds.iter().filter(|d| d.type_correct).count();
818            let p = if f == 0 { 0.0 } else { c as f64 / f as f64 };
819            let r = if t == 0 { 0.0 } else { c as f64 / t as f64 };
820            let f1t = if p + r == 0.0 {
821                0.0
822            } else {
823                2.0 * p * r / (p + r)
824            };
825            LinkTypeBreakdown {
826                link_type: lt,
827                total: t,
828                found: f,
829                type_correct: c,
830                precision: round3(p),
831                recall: round3(r),
832                f1: round3(f1t),
833            }
834        })
835        .collect();
836    by_type.sort_by(|a, b| a.link_type.cmp(&b.link_type));
837
838    LinkBenchmarkReport {
839        total_gold_links: total,
840        total_frontier_links,
841        found: found_count,
842        type_correct: type_correct_count,
843        precision: round3(precision),
844        recall: round3(recall),
845        f1: round3(f1),
846        by_type,
847        details,
848    }
849}
850
851fn print_link_report(report: &LinkBenchmarkReport) {
852    println!();
853    println!("  {}", "VELA · LINK BENCHMARK".dimmed());
854    println!("  {}", style::tick_row(60));
855    println!("  gold links:        {}", report.total_gold_links);
856    println!("  project links:     {}", report.total_frontier_links);
857    println!("  found:             {}", report.found);
858    println!("  type correct:      {}", report.type_correct);
859    println!();
860    println!("  precision:  {:.1}%", report.precision * 100.0);
861    println!("  recall:     {:.1}%", report.recall * 100.0);
862    println!("  f1:         {:.1}%", report.f1 * 100.0);
863    println!();
864    println!("  {}", "BY TYPE".dimmed());
865    println!(
866        "  {}",
867        format!(
868            "{:<12} {:>5} {:>5} {:>7} {:>6} {:>6}",
869            "type", "total", "found", "correct", "prec", "f1"
870        )
871        .dimmed()
872    );
873    for t in &report.by_type {
874        println!(
875            "  {:<12} {:>5} {:>5} {:>7} {:>5.1}% {:>5.1}%",
876            t.link_type,
877            t.total,
878            t.found,
879            t.type_correct,
880            t.precision * 100.0,
881            t.f1 * 100.0,
882        );
883    }
884
885    // Show mismatches.
886    let mismatches: Vec<_> = report.details.iter().filter(|d| !d.type_correct).collect();
887    if !mismatches.is_empty() {
888        println!();
889        println!(
890            "  {}",
891            format!("MISMATCHES ({})", mismatches.len()).dimmed()
892        );
893        println!("  {}", style::tick_row(60));
894        for d in &mismatches {
895            let found_str = match &d.found_type {
896                Some(t) => t.as_str(),
897                None => "missing",
898            };
899            println!(
900                "  {} · {} expected:{} got:{}",
901                d.source_id, d.target_id, d.expected_type, found_str
902            );
903        }
904    }
905
906    println!();
907    println!("  {}", style::tick_row(60));
908    println!();
909}
910
911// ---------------------------------------------------------------------------
912// Suite-based benchmark gate
913// ---------------------------------------------------------------------------
914
/// Kind of benchmark a suite task runs.
///
/// Serialized and deserialized in `snake_case` (`finding`, `entity`, `link`,
/// `workflow`).
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BenchmarkMode {
    /// Finding benchmark.
    Finding,
    /// Entity resolution benchmark.
    Entity,
    /// Link benchmark.
    Link,
    /// Workflow benchmark. NOTE(review): its implementation is not visible in
    /// this section.
    Workflow,
}
923
924impl BenchmarkMode {
925    fn as_str(&self) -> &'static str {
926        match self {
927            BenchmarkMode::Finding => "finding",
928            BenchmarkMode::Entity => "entity",
929            BenchmarkMode::Link => "link",
930            BenchmarkMode::Workflow => "workflow",
931        }
932    }
933}
934
/// Optional per-task metric thresholds.
///
/// Every field defaults to `None` when absent from the input and is omitted
/// from serialized output when `None`.
// NOTE(review): the code that enforces these thresholds is outside this
// section; the "minimum acceptable value" reading below is inferred from the
// field names — confirm against the gate implementation.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct BenchmarkThresholds {
    /// Presumably the minimum acceptable F1.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_f1: Option<f64>,
    /// Presumably the minimum acceptable precision.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_precision: Option<f64>,
    /// Presumably the minimum acceptable recall.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_recall: Option<f64>,
    /// Presumably the minimum acceptable entity accuracy.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_entity_accuracy: Option<f64>,
    /// Presumably the minimum acceptable confidence calibration.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_confidence_calibration: Option<f64>,
    /// Presumably the minimum acceptable type accuracy.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_type_accuracy: Option<f64>,
    /// Presumably the minimum acceptable workflow score.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_workflow_score: Option<f64>,
}
952
/// Minimum counts a frontier must reach in a workflow benchmark.
///
/// Each field defaults to `0` when absent; `workflow_benchmark` only checks
/// expectations that are greater than zero, so `0` disables that check.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct WorkflowExpectations {
    /// Minimum number of findings.
    #[serde(default)]
    pub min_findings: usize,
    /// Minimum number of links summed over all findings.
    #[serde(default)]
    pub min_links: usize,
    /// Minimum number of assertion entities summed over all findings.
    #[serde(default)]
    pub min_entity_mentions: usize,
    /// Minimum number of evidence spans summed over all findings.
    #[serde(default)]
    pub min_evidence_spans: usize,
    /// Minimum number of findings whose provenance has a DOI, a PMID, or a
    /// non-blank title.
    #[serde(default)]
    pub min_provenance_complete: usize,
    /// Minimum number of distinct assertion types.
    #[serde(default)]
    pub min_assertion_types: usize,
    /// Minimum number of findings with the `gap` flag set.
    #[serde(default)]
    pub min_gap_flags: usize,
    /// Minimum number of findings with the `contested` flag set.
    #[serde(default)]
    pub min_contested_flags: usize,
}
972
/// One task inside a [`BenchmarkSuite`].
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BenchmarkTask {
    /// Task identifier; reported as `task_id` and used in failure messages.
    pub id: String,
    /// Which benchmark this task runs.
    pub mode: BenchmarkMode,
    /// Optional free-text description.
    #[serde(default)]
    pub description: Option<String>,
    /// Optional frontier path for this task; `run_suite` falls back to the
    /// suite-level frontier when it is absent.
    #[serde(default)]
    pub frontier: Option<String>,
    /// Path to the gold fixture. Finding, entity and link modes fail without
    /// it; when present, `run_suite` requires the file to be a JSON array.
    #[serde(default)]
    pub gold: Option<String>,
    /// Score thresholds for this task; all unset when absent.
    #[serde(default)]
    pub thresholds: BenchmarkThresholds,
    /// Minimum counts for workflow mode; defaults are used when absent.
    #[serde(default)]
    pub workflow: Option<WorkflowExpectations>,
}
988
/// A benchmark suite file: a default frontier plus the tasks to run on it.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BenchmarkSuite {
    /// Suite identifier; reported as `suite_id` in task envelopes.
    pub id: String,
    /// Suite name, echoed in the suite report.
    pub name: String,
    /// Optional free-text description.
    #[serde(default)]
    pub description: Option<String>,
    /// Frontier path used by every task that does not name its own.
    pub frontier: String,
    /// Tasks, run in order by `run_suite`.
    pub tasks: Vec<BenchmarkTask>,
}
998
/// Result of `workflow_benchmark`: frontier counts plus the outcome of each
/// requested minimum-count check.
#[derive(Debug, Serialize)]
pub struct WorkflowBenchmarkReport {
    /// Number of findings examined.
    pub total_findings: usize,
    /// Links summed over all findings.
    pub total_links: usize,
    /// Assertion entities summed over all findings.
    pub total_entity_mentions: usize,
    /// Evidence spans summed over all findings.
    pub total_evidence_spans: usize,
    /// Findings whose provenance has a DOI, a PMID, or a non-blank title.
    pub total_provenance_complete: usize,
    /// Fraction of findings with at least one evidence span (1.0 when there
    /// are no findings), rounded with `round3`.
    pub evidence_span_coverage: f64,
    /// `total_provenance_complete` as a fraction of findings (1.0 when there
    /// are no findings), rounded with `round3`.
    pub provenance_coverage: f64,
    /// Number of distinct assertion types.
    pub assertion_types: usize,
    /// Findings with the `gap` flag set.
    pub gap_flags: usize,
    /// Findings with the `contested` flag set.
    pub contested_flags: usize,
    /// Number of checks that were requested (expectation greater than zero).
    pub checks_total: usize,
    /// Number of requested checks that passed.
    pub checks_passed: usize,
    /// `checks_passed / checks_total` (1.0 when no checks were requested),
    /// rounded with `round3`.
    pub workflow_score: f64,
    /// One entry per requested check.
    pub details: Vec<WorkflowCheckDetail>,
}
1016
/// Outcome of a single minimum-count check in a workflow benchmark.
#[derive(Debug, Serialize)]
pub struct WorkflowCheckDetail {
    /// Name of the checked metric, e.g. `"findings"` or `"links"`.
    pub metric: String,
    /// Count measured on the frontier.
    pub actual: usize,
    /// Minimum required by the expectations.
    pub expected_min: usize,
    /// `true` when `actual >= expected_min`.
    pub passed: bool,
}
1024
1025pub fn load_suite(path: &Path) -> Result<BenchmarkSuite, String> {
1026    let data = std::fs::read_to_string(path)
1027        .map_err(|e| format!("Failed to read benchmark suite '{}': {e}", path.display()))?;
1028    serde_json::from_str(&data)
1029        .map_err(|e| format!("Failed to parse benchmark suite '{}': {e}", path.display()))
1030}
1031
1032pub fn suite_ready_report(suite_path: &Path) -> Result<serde_json::Value, String> {
1033    let envelope = run_suite(suite_path)?;
1034    let suite_ready = envelope
1035        .get("ok")
1036        .and_then(|value| value.as_bool())
1037        .unwrap_or(false);
1038    Ok(json!({
1039        "ok": suite_ready,
1040        "command": "bench",
1041        "suite_ready": suite_ready,
1042        "suite": envelope.get("suite").cloned().unwrap_or(serde_json::Value::Null),
1043        "tasks": envelope.get("tasks").cloned().unwrap_or_else(|| json!([])),
1044        "failures": envelope.get("failures").cloned().unwrap_or_else(|| json!([])),
1045    }))
1046}
1047
1048pub fn run_suite(suite_path: &Path) -> Result<serde_json::Value, String> {
1049    let suite = load_suite(suite_path)?;
1050    let base_dir = suite_path.parent().unwrap_or_else(|| Path::new("."));
1051    let frontier_path = resolve_suite_path(base_dir, &suite.frontier);
1052    let loaded = repo::load_from_path(&frontier_path)?;
1053    let frontier_hash = hash_path(&frontier_path)?;
1054
1055    let mut task_outputs = Vec::new();
1056    let mut failures = Vec::new();
1057    let mut standard_candles = Vec::new();
1058
1059    for task in &suite.tasks {
1060        if let Some(gold) = &task.gold {
1061            let gold_path = resolve_suite_path(base_dir, gold);
1062            standard_candles.push(json!({
1063                "task_id": task.id,
1064                "mode": task.mode.as_str(),
1065                "path": gold_path.display().to_string(),
1066                "items": count_json_array_items(&gold_path)?,
1067                "role": "reviewed calibration anchor"
1068            }));
1069        }
1070        let task_frontier = task
1071            .frontier
1072            .as_deref()
1073            .map(|p| resolve_suite_path(base_dir, p))
1074            .unwrap_or_else(|| frontier_path.clone());
1075        let output = task_envelope(
1076            &task_frontier,
1077            Some((&suite.id, &task.id)),
1078            task.mode.clone(),
1079            task.gold
1080                .as_deref()
1081                .map(|p| resolve_suite_path(base_dir, p))
1082                .as_deref(),
1083            &task.thresholds,
1084            task.workflow.as_ref(),
1085        )?;
1086        if !output
1087            .get("ok")
1088            .and_then(|value| value.as_bool())
1089            .unwrap_or(false)
1090        {
1091            failures.push(format!("task {} failed", task.id));
1092        }
1093        task_outputs.push(output);
1094    }
1095
1096    let passed = task_outputs
1097        .iter()
1098        .filter(|task| task.get("ok").and_then(|value| value.as_bool()) == Some(true))
1099        .count();
1100    let ok = failures.is_empty();
1101
1102    Ok(json!({
1103        "ok": ok,
1104        "command": "bench",
1105        "benchmark_type": "suite",
1106        "schema_version": project::VELA_SCHEMA_VERSION,
1107        "suite": {
1108            "id": suite.id,
1109            "name": suite.name,
1110            "path": suite_path.display().to_string(),
1111            "tasks": suite.tasks.len(),
1112        },
1113        "frontier": {
1114            "name": loaded.project.name,
1115            "source": frontier_path.display().to_string(),
1116            "hash": format!("sha256:{frontier_hash}"),
1117        },
1118        "metrics": {
1119            "tasks_total": task_outputs.len(),
1120            "tasks_passed": passed,
1121            "tasks_failed": task_outputs.len().saturating_sub(passed),
1122            "standard_candles": standard_candles
1123                .iter()
1124                .filter_map(|item| item.get("items").and_then(|value| value.as_u64()))
1125                .sum::<u64>(),
1126        },
1127        "standard_candles": {
1128            "definition": "Reviewed gold fixtures used as calibration anchors for release drift, not proof of scientific superiority.",
1129            "items": standard_candles,
1130        },
1131        "failures": failures,
1132        "tasks": task_outputs,
1133    }))
1134}
1135
1136fn count_json_array_items(path: &Path) -> Result<usize, String> {
1137    let data = std::fs::read_to_string(path)
1138        .map_err(|e| format!("Failed to read gold fixture '{}': {e}", path.display()))?;
1139    let value: serde_json::Value = serde_json::from_str(&data)
1140        .map_err(|e| format!("Failed to parse gold fixture '{}': {e}", path.display()))?;
1141    value
1142        .as_array()
1143        .map(Vec::len)
1144        .ok_or_else(|| format!("Gold fixture '{}' must be a JSON array", path.display()))
1145}
1146
/// Runs one benchmark against a frontier and wraps the outcome in a JSON envelope.
///
/// The frontier at `frontier_path` is loaded and hashed, then `mode` selects
/// the benchmark to run. Finding, entity and link modes read a gold fixture
/// from `gold_path`; workflow mode ignores `gold_path` and checks `workflow`
/// (or default expectations when it is `None`). The envelope's `ok` field is
/// `true` exactly when no threshold failure was recorded.
///
/// `suite_task` is an optional `(suite_id, task_id)` pair echoed into the
/// envelope; both fields are `null` when it is `None`.
///
/// # Errors
/// Returns an error when the frontier cannot be loaded or hashed, when a mode
/// that needs gold is called without `gold_path`, or when the gold file
/// cannot be read, parsed or hashed.
pub fn task_envelope(
    frontier_path: &Path,
    suite_task: Option<(&str, &str)>,
    mode: BenchmarkMode,
    gold_path: Option<&Path>,
    thresholds: &BenchmarkThresholds,
    workflow: Option<&WorkflowExpectations>,
) -> Result<serde_json::Value, String> {
    let loaded = repo::load_from_path(frontier_path)?;
    let frontier_hash = hash_path(frontier_path)?;
    // Split the optional (suite, task) pair into two owned optional ids.
    let (suite_id, task_id) = suite_task
        .map(|(suite, task)| (Some(suite.to_string()), Some(task.to_string())))
        .unwrap_or((None, None));

    match mode {
        // Findings vs. gold findings.
        BenchmarkMode::Finding => {
            let gold_path =
                gold_path.ok_or_else(|| "finding benchmark requires gold".to_string())?;
            let gold_data = std::fs::read_to_string(gold_path).map_err(|e| {
                format!("Failed to read finding gold '{}': {e}", gold_path.display())
            })?;
            let gold: Vec<GoldFinding> = serde_json::from_str(&gold_data).map_err(|e| {
                format!(
                    "Failed to parse finding gold '{}': {e}",
                    gold_path.display()
                )
            })?;
            let report = benchmark(&loaded.findings, &gold);
            let failures = finding_threshold_failures(&report, thresholds);
            let gold_hash = hash_path(gold_path)?;
            Ok(json!({
                "ok": failures.is_empty(),
                "command": "bench",
                "benchmark_type": BenchmarkMode::Finding.as_str(),
                "mode": BenchmarkMode::Finding.as_str(),
                "suite_id": suite_id,
                "task_id": task_id,
                "schema_version": project::VELA_SCHEMA_VERSION,
                "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
                "gold": gold_metadata(gold_path, &gold_hash, gold.len()),
                "metrics": {
                    "total_frontier_findings": report.total_frontier_findings,
                    "total_gold_findings": report.total_gold_findings,
                    "matched": report.matched,
                    "total_frontier_matched": report.total_frontier_matched,
                    "unmatched_gold": report.unmatched_gold,
                    "unmatched_frontier": report.unmatched_frontier,
                    "exact_id_matches": report.exact_id_matches,
                    "precision": report.precision,
                    "recall": report.recall,
                    "f1": report.f1,
                    "entity_accuracy": report.entity_accuracy,
                    "assertion_type_accuracy": report.assertion_type_accuracy,
                    "confidence_calibration": report.confidence_calibration,
                },
                "thresholds": thresholds,
                "failures": failures,
                "match_details": report.match_details,
            }))
        }
        // Entities vs. gold entities.
        BenchmarkMode::Entity => {
            let gold_path =
                gold_path.ok_or_else(|| "entity benchmark requires gold".to_string())?;
            let gold_data = std::fs::read_to_string(gold_path).map_err(|e| {
                format!("Failed to read entity gold '{}': {e}", gold_path.display())
            })?;
            let gold: Vec<GoldEntity> = serde_json::from_str(&gold_data).map_err(|e| {
                format!("Failed to parse entity gold '{}': {e}", gold_path.display())
            })?;
            let report = entity_benchmark(&loaded.findings, &gold);
            let failures = entity_threshold_failures(&report, thresholds);
            let gold_hash = hash_path(gold_path)?;
            Ok(json!({
                "ok": failures.is_empty(),
                "command": "bench",
                "benchmark_type": BenchmarkMode::Entity.as_str(),
                "mode": BenchmarkMode::Entity.as_str(),
                "suite_id": suite_id,
                "task_id": task_id,
                "schema_version": project::VELA_SCHEMA_VERSION,
                "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
                "gold": gold_metadata(gold_path, &gold_hash, gold.len()),
                "metrics": {
                    "total_gold_entities": report.total_gold_entities,
                    "found_in_frontier": report.found_in_frontier,
                    "type_correct": report.type_correct,
                    "id_correct": report.id_correct,
                    "confidence_ok": report.confidence_ok,
                    "precision": report.precision,
                    "recall": report.recall,
                    "f1": report.f1,
                    "type_accuracy": report.type_accuracy,
                },
                "thresholds": thresholds,
                "failures": failures,
                "by_type": report.by_type,
                "details": report.details,
            }))
        }
        // Links vs. gold links.
        BenchmarkMode::Link => {
            let gold_path = gold_path.ok_or_else(|| "link benchmark requires gold".to_string())?;
            let gold_data = std::fs::read_to_string(gold_path)
                .map_err(|e| format!("Failed to read link gold '{}': {e}", gold_path.display()))?;
            let gold: Vec<GoldLink> = serde_json::from_str(&gold_data)
                .map_err(|e| format!("Failed to parse link gold '{}': {e}", gold_path.display()))?;
            let report = link_benchmark(&loaded.findings, &gold);
            let failures = link_threshold_failures(&report, thresholds);
            let gold_hash = hash_path(gold_path)?;
            Ok(json!({
                "ok": failures.is_empty(),
                "command": "bench",
                "benchmark_type": BenchmarkMode::Link.as_str(),
                "mode": BenchmarkMode::Link.as_str(),
                "suite_id": suite_id,
                "task_id": task_id,
                "schema_version": project::VELA_SCHEMA_VERSION,
                "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
                "gold": gold_metadata(gold_path, &gold_hash, gold.len()),
                "metrics": {
                    "total_gold_links": report.total_gold_links,
                    "total_frontier_links": report.total_frontier_links,
                    "found": report.found,
                    "type_correct": report.type_correct,
                    "precision": report.precision,
                    "recall": report.recall,
                    "f1": report.f1,
                },
                "thresholds": thresholds,
                "failures": failures,
                "by_type": report.by_type,
                "details": report.details,
            }))
        }
        // Minimum-count checks on the frontier alone; `gold_path` is not used
        // and the envelope's `gold` field is `null`.
        BenchmarkMode::Workflow => {
            let expectations = workflow.cloned().unwrap_or_default();
            let report = workflow_benchmark(&loaded.findings, &expectations);
            let failures = workflow_threshold_failures(&report, thresholds);
            Ok(json!({
                "ok": failures.is_empty(),
                "command": "bench",
                "benchmark_type": BenchmarkMode::Workflow.as_str(),
                "mode": BenchmarkMode::Workflow.as_str(),
                "suite_id": suite_id,
                "task_id": task_id,
                "schema_version": project::VELA_SCHEMA_VERSION,
                "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
                "gold": null,
                "metrics": {
                    "total_findings": report.total_findings,
                    "total_links": report.total_links,
                    "total_entity_mentions": report.total_entity_mentions,
                    "total_evidence_spans": report.total_evidence_spans,
                    "total_provenance_complete": report.total_provenance_complete,
                    "evidence_span_coverage": report.evidence_span_coverage,
                    "provenance_coverage": report.provenance_coverage,
                    "assertion_types": report.assertion_types,
                    "gap_flags": report.gap_flags,
                    "contested_flags": report.contested_flags,
                    "checks_total": report.checks_total,
                    "checks_passed": report.checks_passed,
                    "workflow_score": report.workflow_score,
                },
                "thresholds": thresholds,
                "failures": failures,
                "details": report.details,
            }))
        }
    }
}
1316
1317pub fn workflow_benchmark(
1318    findings: &[FindingBundle],
1319    expectations: &WorkflowExpectations,
1320) -> WorkflowBenchmarkReport {
1321    let total_links = findings.iter().map(|f| f.links.len()).sum();
1322    let total_entity_mentions = findings.iter().map(|f| f.assertion.entities.len()).sum();
1323    let total_evidence_spans = findings
1324        .iter()
1325        .map(|f| f.evidence.evidence_spans.len())
1326        .sum();
1327    let findings_with_spans = findings
1328        .iter()
1329        .filter(|f| !f.evidence.evidence_spans.is_empty())
1330        .count();
1331    let total_provenance_complete = findings
1332        .iter()
1333        .filter(|f| {
1334            f.provenance.doi.is_some()
1335                || f.provenance.pmid.is_some()
1336                || !f.provenance.title.trim().is_empty()
1337        })
1338        .count();
1339    let evidence_span_coverage = if findings.is_empty() {
1340        1.0
1341    } else {
1342        findings_with_spans as f64 / findings.len() as f64
1343    };
1344    let provenance_coverage = if findings.is_empty() {
1345        1.0
1346    } else {
1347        total_provenance_complete as f64 / findings.len() as f64
1348    };
1349    let assertion_types = findings
1350        .iter()
1351        .map(|f| f.assertion.assertion_type.as_str())
1352        .collect::<HashSet<_>>()
1353        .len();
1354    let gap_flags = findings.iter().filter(|f| f.flags.gap).count();
1355    let contested_flags = findings.iter().filter(|f| f.flags.contested).count();
1356
1357    let checks = vec![
1358        ("findings", findings.len(), expectations.min_findings),
1359        ("links", total_links, expectations.min_links),
1360        (
1361            "entity_mentions",
1362            total_entity_mentions,
1363            expectations.min_entity_mentions,
1364        ),
1365        (
1366            "evidence_spans",
1367            total_evidence_spans,
1368            expectations.min_evidence_spans,
1369        ),
1370        (
1371            "provenance_complete",
1372            total_provenance_complete,
1373            expectations.min_provenance_complete,
1374        ),
1375        (
1376            "assertion_types",
1377            assertion_types,
1378            expectations.min_assertion_types,
1379        ),
1380        ("gap_flags", gap_flags, expectations.min_gap_flags),
1381        (
1382            "contested_flags",
1383            contested_flags,
1384            expectations.min_contested_flags,
1385        ),
1386    ];
1387    let details: Vec<WorkflowCheckDetail> = checks
1388        .into_iter()
1389        .filter(|(_, _, expected)| *expected > 0)
1390        .map(|(metric, actual, expected_min)| WorkflowCheckDetail {
1391            metric: metric.to_string(),
1392            actual,
1393            expected_min,
1394            passed: actual >= expected_min,
1395        })
1396        .collect();
1397    let checks_total = details.len();
1398    let checks_passed = details.iter().filter(|detail| detail.passed).count();
1399    let workflow_score = if checks_total == 0 {
1400        1.0
1401    } else {
1402        checks_passed as f64 / checks_total as f64
1403    };
1404
1405    WorkflowBenchmarkReport {
1406        total_findings: findings.len(),
1407        total_links,
1408        total_entity_mentions,
1409        total_evidence_spans,
1410        total_provenance_complete,
1411        evidence_span_coverage: round3(evidence_span_coverage),
1412        provenance_coverage: round3(provenance_coverage),
1413        assertion_types,
1414        gap_flags,
1415        contested_flags,
1416        checks_total,
1417        checks_passed,
1418        workflow_score: round3(workflow_score),
1419        details,
1420    }
1421}
1422
/// Resolves a path written in a suite file.
///
/// Absolute values are returned unchanged. A relative value is joined onto
/// `base_dir` when that joined path exists; otherwise the value is returned
/// as written.
fn resolve_suite_path(base_dir: &Path, value: &str) -> PathBuf {
    let raw = PathBuf::from(value);
    if raw.is_absolute() {
        return raw;
    }
    let beside_suite = base_dir.join(&raw);
    if beside_suite.exists() {
        return beside_suite;
    }
    raw
}
1436
1437fn frontier_metadata(
1438    loaded: &project::Project,
1439    frontier_path: &Path,
1440    frontier_hash: &str,
1441) -> serde_json::Value {
1442    json!({
1443        "name": loaded.project.name,
1444        "source": frontier_path.display().to_string(),
1445        "hash": format!("sha256:{frontier_hash}"),
1446    })
1447}
1448
1449fn gold_metadata(gold_path: &Path, gold_hash: &str, items: usize) -> serde_json::Value {
1450    json!({
1451        "path": gold_path.display().to_string(),
1452        "hash": format!("sha256:{gold_hash}"),
1453        "items": items,
1454    })
1455}
1456
1457fn finding_threshold_failures(
1458    report: &BenchmarkReport,
1459    thresholds: &BenchmarkThresholds,
1460) -> Vec<String> {
1461    let mut failures =
1462        generic_threshold_failures(report.precision, report.recall, report.f1, thresholds);
1463    if let Some(threshold) = thresholds.min_entity_accuracy
1464        && report.entity_accuracy < threshold
1465    {
1466        failures.push(format!(
1467            "entity_accuracy {} is below threshold {}",
1468            report.entity_accuracy, threshold
1469        ));
1470    }
1471    if let Some(threshold) = thresholds.min_confidence_calibration
1472        && report.confidence_calibration < threshold
1473    {
1474        failures.push(format!(
1475            "confidence_calibration {} is below threshold {}",
1476            report.confidence_calibration, threshold
1477        ));
1478    }
1479    if let Some(threshold) = thresholds.min_type_accuracy
1480        && report.assertion_type_accuracy < threshold
1481    {
1482        failures.push(format!(
1483            "assertion_type_accuracy {} is below threshold {}",
1484            report.assertion_type_accuracy, threshold
1485        ));
1486    }
1487    failures
1488}
1489
1490fn entity_threshold_failures(
1491    report: &EntityBenchmarkReport,
1492    thresholds: &BenchmarkThresholds,
1493) -> Vec<String> {
1494    let mut failures =
1495        generic_threshold_failures(report.precision, report.recall, report.f1, thresholds);
1496    if let Some(threshold) = thresholds.min_type_accuracy
1497        && report.type_accuracy < threshold
1498    {
1499        failures.push(format!(
1500            "type_accuracy {} is below threshold {}",
1501            report.type_accuracy, threshold
1502        ));
1503    }
1504    failures
1505}
1506
1507fn link_threshold_failures(
1508    report: &LinkBenchmarkReport,
1509    thresholds: &BenchmarkThresholds,
1510) -> Vec<String> {
1511    generic_threshold_failures(report.precision, report.recall, report.f1, thresholds)
1512}
1513
1514fn workflow_threshold_failures(
1515    report: &WorkflowBenchmarkReport,
1516    thresholds: &BenchmarkThresholds,
1517) -> Vec<String> {
1518    let mut failures = Vec::new();
1519    for detail in &report.details {
1520        if !detail.passed {
1521            failures.push(format!(
1522                "{} {} is below minimum {}",
1523                detail.metric, detail.actual, detail.expected_min
1524            ));
1525        }
1526    }
1527    if let Some(threshold) = thresholds.min_workflow_score
1528        && report.workflow_score < threshold
1529    {
1530        failures.push(format!(
1531            "workflow_score {} is below threshold {}",
1532            report.workflow_score, threshold
1533        ));
1534    }
1535    failures
1536}
1537
1538fn generic_threshold_failures(
1539    precision: f64,
1540    recall: f64,
1541    f1: f64,
1542    thresholds: &BenchmarkThresholds,
1543) -> Vec<String> {
1544    let mut failures = Vec::new();
1545    if let Some(threshold) = thresholds.min_f1
1546        && f1 < threshold
1547    {
1548        failures.push(format!("f1 {} is below threshold {}", f1, threshold));
1549    }
1550    if let Some(threshold) = thresholds.min_precision
1551        && precision < threshold
1552    {
1553        failures.push(format!(
1554            "precision {} is below threshold {}",
1555            precision, threshold
1556        ));
1557    }
1558    if let Some(threshold) = thresholds.min_recall
1559        && recall < threshold
1560    {
1561        failures.push(format!(
1562            "recall {} is below threshold {}",
1563            recall, threshold
1564        ));
1565    }
1566    failures
1567}
1568
1569fn hash_path(path: &Path) -> Result<String, String> {
1570    let mut hasher = Sha256::new();
1571    if path.is_file() {
1572        let bytes = std::fs::read(path)
1573            .map_err(|e| format!("Failed to read {} for hashing: {e}", path.display()))?;
1574        hasher.update(&bytes);
1575    } else if path.is_dir() {
1576        let mut files = Vec::new();
1577        collect_hash_files(path, path, &mut files)?;
1578        files.sort();
1579        for rel in files {
1580            hasher.update(rel.to_string_lossy().as_bytes());
1581            let bytes = std::fs::read(path.join(&rel))
1582                .map_err(|e| format!("Failed to read {} for hashing: {e}", rel.display()))?;
1583            hasher.update(&bytes);
1584        }
1585    } else {
1586        return Err(format!("Cannot hash missing path {}", path.display()));
1587    }
1588    Ok(format!("{:x}", hasher.finalize()))
1589}
1590
/// Recursively appends to `files` the path, relative to `root`, of every
/// regular file found under `dir`.
///
/// # Errors
/// Returns a message when a directory cannot be listed, an entry cannot be
/// read, or a file's path does not start with `root`.
fn collect_hash_files(root: &Path, dir: &Path, files: &mut Vec<PathBuf>) -> Result<(), String> {
    let listing =
        std::fs::read_dir(dir).map_err(|e| format!("Failed to read {}: {e}", dir.display()))?;
    for item in listing {
        let child = item
            .map_err(|e| format!("Failed to read directory entry: {e}"))?
            .path();
        if child.is_dir() {
            collect_hash_files(root, &child, files)?;
            continue;
        }
        if child.is_file() {
            let relative = child.strip_prefix(root).map_err(|e| e.to_string())?;
            files.push(relative.to_path_buf());
        }
    }
    Ok(())
}
1606
1607#[cfg(test)]
1608mod tests {
1609    use super::*;
1610
1611    #[test]
1612    fn jaccard_identical() {
1613        let sim = jaccard_similarity("NLRP3 activates caspase-1", "NLRP3 activates caspase-1");
1614        assert!((sim - 1.0).abs() < 0.001);
1615    }
1616
1617    #[test]
1618    fn jaccard_disjoint() {
1619        let sim = jaccard_similarity("NLRP3 activates caspase-1", "tau propagation in cortex");
1620        assert!(sim < 0.1);
1621    }
1622
1623    #[test]
1624    fn jaccard_partial() {
1625        let sim = jaccard_similarity(
1626            "NLRP3 inflammasome activates caspase-1 in microglia",
1627            "NLRP3 activates caspase-1",
1628        );
1629        assert!(sim > 0.3);
1630        assert!(sim < 1.0);
1631    }
1632
1633    #[test]
1634    fn jaccard_empty() {
1635        assert!((jaccard_similarity("", "") - 1.0).abs() < 0.001);
1636        assert!((jaccard_similarity("word", "")).abs() < 0.001);
1637    }
1638
1639    #[test]
1640    fn benchmark_empty() {
1641        let report = benchmark(&[], &[]);
1642        assert_eq!(report.matched, 0);
1643        assert_eq!(report.f1, 0.0);
1644    }
1645
    // A single frontier finding whose text, assertion type and entity names
    // equal the single gold finding must yield one match with recall,
    // precision, entity accuracy and confidence calibration all at 1.0.
    #[test]
    fn benchmark_perfect_match() {
        use crate::bundle::*;

        // Frontier side: one fully spelled-out finding with two entities.
        let finding = FindingBundle {
            id: "vf_test".into(),
            version: 1,
            previous_version: None,
            assertion: Assertion {
                text: "NLRP3 activates caspase-1 in microglia".into(),
                assertion_type: "mechanism".into(),
                entities: vec![
                    Entity {
                        name: "NLRP3".into(),
                        entity_type: "protein".into(),
                        identifiers: serde_json::Map::new(),
                        canonical_id: None,
                        candidates: vec![],
                        aliases: vec![],
                        resolution_provenance: None,
                        resolution_confidence: 1.0,
                        resolution_method: None,
                        species_context: None,
                        needs_review: false,
                    },
                    Entity {
                        name: "caspase-1".into(),
                        entity_type: "protein".into(),
                        identifiers: serde_json::Map::new(),
                        canonical_id: None,
                        candidates: vec![],
                        aliases: vec![],
                        resolution_provenance: None,
                        resolution_confidence: 1.0,
                        resolution_method: None,
                        species_context: None,
                        needs_review: false,
                    },
                ],
                relation: None,
                direction: None,
                causal_claim: None,
                causal_evidence_grade: None,
            },
            evidence: Evidence {
                evidence_type: "experimental".into(),
                model_system: String::new(),
                species: None,
                method: String::new(),
                sample_size: None,
                effect_size: None,
                p_value: None,
                replicated: false,
                replication_count: None,
                evidence_spans: vec![],
            },
            conditions: Conditions {
                text: String::new(),
                species_verified: vec![],
                species_unverified: vec![],
                in_vitro: false,
                in_vivo: false,
                human_data: false,
                clinical_trial: false,
                concentration_range: None,
                duration: None,
                age_group: None,
                cell_type: None,
            },
            // NOTE(review): both numeric arguments lie inside the gold range
            // 0.7..=0.95 used below; which one is the score is not visible here.
            confidence: Confidence::raw(0.85, "test", 0.9),
            provenance: Provenance {
                source_type: "published_paper".into(),
                doi: None,
                pmid: None,
                pmc: None,
                openalex_id: None,
                url: None,
                title: "Test".into(),
                authors: vec![],
                year: Some(2024),
                journal: None,
                license: None,
                publisher: None,
                funders: vec![],
                extraction: Extraction::default(),
                review: None,
                citation_count: None,
            },
            flags: Flags {
                gap: false,
                negative_space: false,
                contested: false,
                retracted: false,
                declining: false,
                gravity_well: false,
                review_state: None,
                superseded: false,
                signature_threshold: None,
                jointly_accepted: false,
            },
            links: vec![],
            annotations: vec![],
            attachments: vec![],
            created: String::new(),
            updated: None,

            access_tier: crate::access_tier::AccessTier::Public,
        };

        // Gold side: same assertion text, type and entity names, no id.
        let gold = vec![GoldFinding {
            id: None,
            assertion_text: "NLRP3 activates caspase-1 in microglia".into(),
            assertion_type: "mechanism".into(),
            entities: vec!["NLRP3".into(), "caspase-1".into()],
            confidence_range: ConfidenceRange {
                low: 0.7,
                high: 0.95,
            },
            notes: None,
        }];

        let report = benchmark(&[finding], &gold);
        assert_eq!(report.matched, 1);
        assert!((report.recall - 1.0).abs() < 0.001);
        assert!((report.precision - 1.0).abs() < 0.001);
        assert!((report.entity_accuracy - 1.0).abs() < 0.001);
        assert!((report.confidence_calibration - 1.0).abs() < 0.001);
    }
1774
1775    // Helper to create a minimal FindingBundle with given entities.
1776    fn make_finding_with_entities(entities: Vec<Entity>) -> FindingBundle {
1777        use crate::bundle::*;
1778        FindingBundle {
1779            id: "vf_ent_test".into(),
1780            version: 1,
1781            previous_version: None,
1782            assertion: Assertion {
1783                text: "test assertion".into(),
1784                assertion_type: "mechanism".into(),
1785                entities,
1786                relation: None,
1787                direction: None,
1788                causal_claim: None,
1789                causal_evidence_grade: None,
1790            },
1791            evidence: Evidence {
1792                evidence_type: "experimental".into(),
1793                model_system: String::new(),
1794                species: None,
1795                method: String::new(),
1796                sample_size: None,
1797                effect_size: None,
1798                p_value: None,
1799                replicated: false,
1800                replication_count: None,
1801                evidence_spans: vec![],
1802            },
1803            conditions: Conditions {
1804                text: String::new(),
1805                species_verified: vec![],
1806                species_unverified: vec![],
1807                in_vitro: false,
1808                in_vivo: false,
1809                human_data: false,
1810                clinical_trial: false,
1811                concentration_range: None,
1812                duration: None,
1813                age_group: None,
1814                cell_type: None,
1815            },
1816            confidence: Confidence::raw(0.9, "test", 0.9),
1817            provenance: Provenance {
1818                source_type: "published_paper".into(),
1819                doi: None,
1820                pmid: None,
1821                pmc: None,
1822                openalex_id: None,
1823                url: None,
1824                title: "Test".into(),
1825                authors: vec![],
1826                year: Some(2024),
1827                journal: None,
1828                license: None,
1829                publisher: None,
1830                funders: vec![],
1831                extraction: Extraction::default(),
1832                review: None,
1833                citation_count: None,
1834            },
1835            flags: Flags {
1836                gap: false,
1837                negative_space: false,
1838                contested: false,
1839                retracted: false,
1840                declining: false,
1841                gravity_well: false,
1842                review_state: None,
1843                superseded: false,
1844                signature_threshold: None,
1845                jointly_accepted: false,
1846            },
1847            links: vec![],
1848            annotations: vec![],
1849            attachments: vec![],
1850            created: String::new(),
1851            updated: None,
1852
1853            access_tier: crate::access_tier::AccessTier::Public,
1854        }
1855    }
1856
1857    #[test]
1858    fn entity_benchmark_empty() {
1859        let report = entity_benchmark(&[], &[]);
1860        assert_eq!(report.total_gold_entities, 0);
1861        assert_eq!(report.found_in_frontier, 0);
1862        assert_eq!(report.f1, 0.0);
1863    }
1864
1865    #[test]
1866    fn entity_benchmark_perfect_match() {
1867        use crate::bundle::*;
1868
1869        let entity = Entity {
1870            name: "NLRP3".into(),
1871            entity_type: "protein".into(),
1872            identifiers: serde_json::Map::new(),
1873            canonical_id: Some(ResolvedId {
1874                source: "uniprot".into(),
1875                id: "Q96P20".into(),
1876                confidence: 0.95,
1877                matched_name: Some("NLRP3".into()),
1878            }),
1879            candidates: vec![],
1880            aliases: vec![],
1881            resolution_provenance: Some("vela_resolve/uniprot".into()),
1882            resolution_confidence: 0.95,
1883            resolution_method: None,
1884            species_context: None,
1885            needs_review: false,
1886        };
1887
1888        let finding = make_finding_with_entities(vec![entity]);
1889
1890        let gold = vec![GoldEntity {
1891            name: "NLRP3".into(),
1892            entity_type: "protein".into(),
1893            expected_source: "uniprot".into(),
1894            expected_id: "Q96P20".into(),
1895            expected_confidence: 0.8,
1896            alternatives: vec![],
1897        }];
1898
1899        let report = entity_benchmark(&[finding], &gold);
1900        assert_eq!(report.total_gold_entities, 1);
1901        assert_eq!(report.found_in_frontier, 1);
1902        assert_eq!(report.id_correct, 1);
1903        assert_eq!(report.confidence_ok, 1);
1904        assert!((report.precision - 1.0).abs() < 0.001);
1905        assert!((report.recall - 1.0).abs() < 0.001);
1906        assert!((report.f1 - 1.0).abs() < 0.001);
1907    }
1908
1909    #[test]
1910    fn entity_benchmark_alternative_id() {
1911        use crate::bundle::*;
1912
1913        let entity = Entity {
1914            name: "aspirin".into(),
1915            entity_type: "compound".into(),
1916            identifiers: serde_json::Map::new(),
1917            canonical_id: Some(ResolvedId {
1918                source: "pubchem".into(),
1919                id: "2244".into(),
1920                confidence: 0.9,
1921                matched_name: Some("Aspirin".into()),
1922            }),
1923            candidates: vec![],
1924            aliases: vec![],
1925            resolution_provenance: None,
1926            resolution_confidence: 0.9,
1927            resolution_method: None,
1928            species_context: None,
1929            needs_review: false,
1930        };
1931
1932        let finding = make_finding_with_entities(vec![entity]);
1933
1934        // Gold expects a different primary ID but lists 2244 as an alternative.
1935        let gold = vec![GoldEntity {
1936            name: "aspirin".into(),
1937            entity_type: "compound".into(),
1938            expected_source: "chebi".into(),
1939            expected_id: "CHEBI:15365".into(),
1940            expected_confidence: 0.7,
1941            alternatives: vec![AlternativeId {
1942                source: "pubchem".into(),
1943                id: "2244".into(),
1944            }],
1945        }];
1946
1947        let report = entity_benchmark(&[finding], &gold);
1948        assert_eq!(
1949            report.id_correct, 1,
1950            "Alternative ID should count as correct"
1951        );
1952        assert!((report.precision - 1.0).abs() < 0.001);
1953    }
1954
1955    #[test]
1956    fn entity_benchmark_mismatch_and_missing() {
1957        use crate::bundle::*;
1958
1959        // Entity with wrong ID.
1960        let entity = Entity {
1961            name: "BRCA1".into(),
1962            entity_type: "gene".into(),
1963            identifiers: serde_json::Map::new(),
1964            canonical_id: Some(ResolvedId {
1965                source: "uniprot".into(),
1966                id: "WRONG_ID".into(),
1967                confidence: 0.8,
1968                matched_name: Some("BRCA1".into()),
1969            }),
1970            candidates: vec![],
1971            aliases: vec![],
1972            resolution_provenance: None,
1973            resolution_confidence: 0.8,
1974            resolution_method: None,
1975            species_context: None,
1976            needs_review: false,
1977        };
1978
1979        let finding = make_finding_with_entities(vec![entity]);
1980
1981        let gold = vec![
1982            GoldEntity {
1983                name: "BRCA1".into(),
1984                entity_type: "gene".into(),
1985                expected_source: "uniprot".into(),
1986                expected_id: "P38398".into(),
1987                expected_confidence: 0.7,
1988                alternatives: vec![],
1989            },
1990            // Entity not present in frontier at all.
1991            GoldEntity {
1992                name: "TP53".into(),
1993                entity_type: "gene".into(),
1994                expected_source: "uniprot".into(),
1995                expected_id: "P04637".into(),
1996                expected_confidence: 0.7,
1997                alternatives: vec![],
1998            },
1999        ];
2000
2001        let report = entity_benchmark(&[finding], &gold);
2002        assert_eq!(report.total_gold_entities, 2);
2003        assert_eq!(report.found_in_frontier, 1); // BRCA1 found but wrong ID
2004        assert_eq!(report.id_correct, 0);
2005        assert!((report.precision).abs() < 0.001); // 0/1
2006        assert!((report.recall).abs() < 0.001); // 0/2
2007        assert_eq!(report.by_type.len(), 1);
2008        assert_eq!(report.by_type[0].entity_type, "gene");
2009        assert_eq!(report.by_type[0].total, 2);
2010    }
2011}
vela_protocol/benchmark.rs

vela_protocol/
benchmark.rs