1use std::collections::{HashMap, HashSet};
4use std::path::{Path, PathBuf};
5
6use colored::Colorize;
7use serde::{Deserialize, Serialize};
8use serde_json::json;
9use sha2::{Digest, Sha256};
10
11use crate::bundle::{Entity, FindingBundle};
12use crate::cli_style as style;
13use crate::project;
14use crate::repo;
15
/// One reference ("gold") finding, deserialized from a gold-standard JSON file.
// NOTE(review): `notes` is not read by the code in this part of the file, which is
// presumably why dead-code warnings are silenced here — confirm before removing.
#[allow(dead_code)]
#[derive(Debug, Clone, Deserialize)]
pub struct GoldFinding {
    /// Optional identifier; when present it is compared with frontier finding ids
    /// to detect exact-id matches.
    #[serde(default)]
    pub id: Option<String>,
    /// Assertion text, compared with frontier assertion text by word-level Jaccard similarity.
    pub assertion_text: String,
    /// Expected assertion type, compared for equality with the frontier finding's type.
    pub assertion_type: String,
    /// Expected entity names; they are normalized before being compared.
    pub entities: Vec<String>,
    /// Inclusive range the matched finding's confidence score is expected to fall in.
    pub confidence_range: ConfidenceRange,
    /// Free-form notes; defaults to `None` when absent from the JSON.
    #[serde(default)]
    pub notes: Option<String>,
}
29
/// Expected confidence interval for a gold finding; both bounds are treated as inclusive
/// by the finding benchmark.
#[derive(Debug, Clone, Deserialize)]
pub struct ConfidenceRange {
    /// Lower bound (inclusive).
    pub low: f64,
    /// Upper bound (inclusive).
    pub high: f64,
}
35
/// Aggregate result of comparing frontier findings against gold findings.
///
/// All ratio fields are rounded to three decimals and are `0.0` when their
/// denominator is zero.
#[derive(Debug, Serialize)]
pub struct BenchmarkReport {
    /// Number of frontier findings that were evaluated.
    pub total_frontier_findings: usize,
    /// Number of gold findings that were evaluated.
    pub total_gold_findings: usize,
    /// Number of gold findings paired with a frontier finding.
    pub matched: usize,
    /// Number of frontier findings paired with a gold finding.
    pub total_frontier_matched: usize,
    /// Gold findings left without a partner.
    pub unmatched_gold: usize,
    /// Frontier findings left without a partner.
    pub unmatched_frontier: usize,
    /// Pairs whose gold id equals the frontier finding id.
    pub exact_id_matches: usize,
    /// Matched frontier findings divided by all frontier findings.
    pub precision: f64,
    /// Matched gold findings divided by all gold findings.
    pub recall: f64,
    /// Harmonic mean of precision and recall.
    pub f1: f64,
    /// Mean entity overlap across matched pairs.
    pub entity_accuracy: f64,
    /// Fraction of matched pairs whose assertion types are equal.
    pub assertion_type_accuracy: f64,
    /// Fraction of matched pairs whose confidence score lies inside the gold range.
    pub confidence_calibration: f64,
    /// Per-pair breakdown, in the order the pairs were accepted.
    pub match_details: Vec<MatchDetail>,
}
54
/// Details for one accepted (gold, frontier) finding pair.
#[derive(Debug, Serialize)]
pub struct MatchDetail {
    /// Id of the gold finding, if it has one.
    pub gold_id: Option<String>,
    /// Id of the paired frontier finding.
    pub frontier_id: String,
    /// Gold assertion text, shortened for display.
    pub gold_text: String,
    /// Frontier assertion text, shortened for display.
    pub frontier_text: String,
    /// Word-level Jaccard similarity of the two texts (rounded to three decimals).
    pub similarity: f64,
    /// Share of gold entities also present on the frontier finding (rounded to three decimals).
    pub entity_overlap: f64,
    /// Whether the assertion types are equal.
    pub assertion_type_match: bool,
    /// Whether the frontier confidence score lies within the gold confidence range.
    pub confidence_in_range: bool,
    /// Whether the pair shares the same id.
    pub exact_id_match: bool,
}
67
68pub fn run(frontier_path: &Path, gold_path: &Path, json_output: bool) {
69 let frontier = repo::load_from_path(frontier_path).expect("Failed to load frontier");
70
71 let gold_data = std::fs::read_to_string(gold_path).expect("Failed to read gold standard file");
72 let gold: Vec<GoldFinding> =
73 serde_json::from_str(&gold_data).expect("Failed to parse gold standard JSON");
74
75 let report = benchmark(&frontier.findings, &gold);
76
77 if json_output {
78 let json = serde_json::to_string_pretty(&report).unwrap();
79 println!("{json}");
80 } else {
81 print_report(&report);
82 }
83}
84
85pub fn benchmark(findings: &[FindingBundle], gold: &[GoldFinding]) -> BenchmarkReport {
86 let mut match_details = Vec::new();
87 let mut gold_matched = vec![false; gold.len()];
88 let mut frontier_matched = vec![false; findings.len()];
89 let mut candidates = Vec::new();
90
91 for (gi, g) in gold.iter().enumerate() {
92 for (fi, f) in findings.iter().enumerate() {
93 let sim = jaccard_similarity(&g.assertion_text, &f.assertion.text);
94 let exact_id = g.id.as_deref().is_some_and(|id| id == f.id);
95 if exact_id || sim >= 0.2 {
96 candidates.push(FindingCandidate {
97 gold_idx: gi,
98 frontier_idx: fi,
99 similarity: sim,
100 exact_id,
101 assertion_type_match: g.assertion_type == f.assertion.assertion_type,
102 });
103 }
104 }
105 }
106
107 candidates.sort_by(|a, b| {
108 b.exact_id
109 .cmp(&a.exact_id)
110 .then_with(|| {
111 b.similarity
112 .partial_cmp(&a.similarity)
113 .unwrap_or(std::cmp::Ordering::Equal)
114 })
115 .then_with(|| b.assertion_type_match.cmp(&a.assertion_type_match))
116 .then_with(|| a.gold_idx.cmp(&b.gold_idx))
117 .then_with(|| a.frontier_idx.cmp(&b.frontier_idx))
118 });
119
120 for candidate in candidates {
121 let gi = candidate.gold_idx;
122 let fi = candidate.frontier_idx;
123 if gold_matched[gi] || frontier_matched[fi] {
124 continue;
125 }
126
127 gold_matched[gi] = true;
128 frontier_matched[fi] = true;
129
130 let g = &gold[gi];
131 let f = &findings[fi];
132
133 let gold_entities: HashSet<String> =
134 g.entities.iter().map(|e| normalize_token(e)).collect();
135 let frontier_entities: HashSet<String> = f
136 .assertion
137 .entities
138 .iter()
139 .map(|e| normalize_token(&e.name))
140 .collect();
141 let entity_overlap = if gold_entities.is_empty() {
142 1.0
143 } else {
144 let matches = gold_entities
145 .iter()
146 .filter(|e| frontier_entities.contains(*e))
147 .count();
148 matches as f64 / gold_entities.len() as f64
149 };
150
151 let in_range = f.confidence.score >= g.confidence_range.low
152 && f.confidence.score <= g.confidence_range.high;
153
154 match_details.push(MatchDetail {
155 gold_id: g.id.clone(),
156 frontier_id: f.id.clone(),
157 gold_text: truncate(&g.assertion_text, 80),
158 frontier_text: truncate(&f.assertion.text, 80),
159 similarity: round3(candidate.similarity),
160 entity_overlap: round3(entity_overlap),
161 assertion_type_match: candidate.assertion_type_match,
162 confidence_in_range: in_range,
163 exact_id_match: candidate.exact_id,
164 });
165 }
166
167 let matched = gold_matched.iter().filter(|&&m| m).count();
168 let frontier_matched_count = frontier_matched.iter().filter(|&&m| m).count();
169 let exact_id_matches = match_details.iter().filter(|d| d.exact_id_match).count();
170
171 let precision = if findings.is_empty() {
172 0.0
173 } else {
174 frontier_matched_count as f64 / findings.len() as f64
175 };
176 let recall = if gold.is_empty() {
177 0.0
178 } else {
179 matched as f64 / gold.len() as f64
180 };
181 let f1 = if precision + recall == 0.0 {
182 0.0
183 } else {
184 2.0 * precision * recall / (precision + recall)
185 };
186
187 let entity_accuracy = if match_details.is_empty() {
188 0.0
189 } else {
190 match_details.iter().map(|d| d.entity_overlap).sum::<f64>() / match_details.len() as f64
191 };
192
193 let confidence_calibration = if match_details.is_empty() {
194 0.0
195 } else {
196 match_details
197 .iter()
198 .filter(|d| d.confidence_in_range)
199 .count() as f64
200 / match_details.len() as f64
201 };
202 let assertion_type_accuracy = if match_details.is_empty() {
203 0.0
204 } else {
205 match_details
206 .iter()
207 .filter(|d| d.assertion_type_match)
208 .count() as f64
209 / match_details.len() as f64
210 };
211
212 BenchmarkReport {
213 total_frontier_findings: findings.len(),
214 total_gold_findings: gold.len(),
215 matched,
216 total_frontier_matched: frontier_matched_count,
217 unmatched_gold: gold.len().saturating_sub(matched),
218 unmatched_frontier: findings.len().saturating_sub(frontier_matched_count),
219 exact_id_matches,
220 precision: round3(precision),
221 recall: round3(recall),
222 f1: round3(f1),
223 entity_accuracy: round3(entity_accuracy),
224 assertion_type_accuracy: round3(assertion_type_accuracy),
225 confidence_calibration: round3(confidence_calibration),
226 match_details,
227 }
228}
229
/// A possible pairing of one gold finding with one frontier finding, produced
/// before the greedy one-to-one assignment in `benchmark`.
struct FindingCandidate {
    /// Index into the gold slice.
    gold_idx: usize,
    /// Index into the frontier findings slice.
    frontier_idx: usize,
    /// Unrounded Jaccard similarity of the two assertion texts.
    similarity: f64,
    /// Whether the gold id equals the frontier finding id.
    exact_id: bool,
    /// Whether both findings have the same assertion type.
    assertion_type_match: bool,
}
237
/// Rounds `v` to three decimal places (half away from zero, as `f64::round` does).
fn round3(v: f64) -> f64 {
    const SCALE: f64 = 1000.0;
    let scaled = v * SCALE;
    scaled.round() / SCALE
}
241
/// Canonicalizes an entity name for comparison: surrounding whitespace is
/// trimmed, the text is lowercased, and the Greek letters `β` / `α` are spelled
/// out as `beta` / `alpha`.
fn normalize_token(value: &str) -> String {
    let lowered = value.trim().to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        match ch {
            'β' => normalized.push_str("beta"),
            'α' => normalized.push_str("alpha"),
            other => normalized.push(other),
        }
    }
    normalized
}
249
/// Shortens `s` to at most `max` bytes for display.
///
/// Strings that already fit are returned unchanged. Otherwise the text is cut
/// at the last character boundary not exceeding `max` and `...` is appended.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_owned();
    }

    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);

    let mut shortened = String::with_capacity(cut + 3);
    shortened.push_str(&s[..cut]);
    shortened.push_str("...");
    shortened
}
261
/// Word-level Jaccard similarity of two texts.
///
/// Each text is split on whitespace, non-alphanumeric characters are stripped
/// from both ends of every word, and empty words are dropped. The comparison
/// is case-sensitive. Two texts without any words are considered identical
/// (`1.0`).
fn jaccard_similarity(a: &str, b: &str) -> f64 {
    fn word_set(text: &str) -> HashSet<&str> {
        text.split_whitespace()
            .map(|word| word.trim_matches(|c: char| !c.is_alphanumeric()))
            .filter(|word| !word.is_empty())
            .collect()
    }

    let left = word_set(a);
    let right = word_set(b);

    let shared = left.intersection(&right).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|; it is zero only when both sets are empty.
    let combined = left.len() + right.len() - shared;
    if combined == 0 {
        return 1.0;
    }

    shared as f64 / combined as f64
}
288
289pub fn print_report(report: &BenchmarkReport) {
290 println!();
291 println!(" {}", "VELA · BENCHMARK REPORT".dimmed());
292 println!(" {}", style::tick_row(60));
293 println!(" project findings: {}", report.total_frontier_findings);
294 println!(" gold findings: {}", report.total_gold_findings);
295 println!(" matched: {}", report.matched);
296 println!();
297 println!(" precision: {:.1}%", report.precision * 100.0);
298 println!(" recall: {:.1}%", report.recall * 100.0);
299 println!(" f1: {:.1}%", report.f1 * 100.0);
300 println!();
301 println!(
302 " entity accuracy: {:.1}%",
303 report.entity_accuracy * 100.0
304 );
305 println!(
306 " confidence calibration: {:.1}%",
307 report.confidence_calibration * 100.0
308 );
309
310 if !report.match_details.is_empty() {
311 println!();
312 println!(" {}", "MATCH DETAILS".dimmed());
313 println!(" {}", style::tick_row(110));
314 for d in &report.match_details {
315 let cal = if d.confidence_in_range {
316 style::ok("ok")
317 } else {
318 style::lost("miss")
319 };
320 println!(
321 " sim:{:.2} ent:{:.2} conf:{} · {} · {}",
322 d.similarity, d.entity_overlap, cal, d.gold_text, d.frontier_text
323 );
324 }
325 }
326
327 println!();
328 println!(" {}", style::tick_row(60));
329 println!();
330}
331
/// One reference entity from an entity-resolution gold file.
#[derive(Debug, Clone, Deserialize)]
pub struct GoldEntity {
    /// Entity name; looked up case-insensitively among frontier entities.
    pub name: String,
    /// Expected entity type (the JSON key is `type`).
    #[serde(rename = "type")]
    pub entity_type: String,
    /// Expected canonical-id source. When both this and `expected_id` are empty,
    /// only the entity type is compared.
    pub expected_source: String,
    /// Expected canonical id within `expected_source`.
    pub expected_id: String,
    /// Minimum acceptable resolution confidence.
    pub expected_confidence: f64,
    /// Other (source, id) pairs that are also accepted as correct.
    #[serde(default)]
    pub alternatives: Vec<AlternativeId>,
}
354
/// An additional canonical identifier accepted for a gold entity.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct AlternativeId {
    /// Source of the identifier.
    pub source: String,
    /// Identifier within that source.
    pub id: String,
}
360
/// Outcome for a single gold entity in the entity benchmark.
#[derive(Debug, Serialize)]
pub struct EntityMatchDetail {
    /// Gold entity name.
    pub name: String,
    /// Gold (expected) entity type.
    pub entity_type: String,
    /// Expected canonical-id source.
    pub expected_source: String,
    /// Expected canonical id.
    pub expected_id: String,
    /// Type of the best frontier entity with this name; `None` when no entity was found.
    pub found_type: Option<String>,
    /// Canonical-id source of the best frontier entity, if it has a canonical id.
    pub resolved_source: Option<String>,
    /// Canonical id of the best frontier entity, if it has one.
    pub resolved_id: Option<String>,
    /// Resolution confidence of the best frontier entity (rounded); `0.0` when none was found.
    pub resolved_confidence: f64,
    /// Whether the found type equals the gold type.
    pub type_match: bool,
    /// Whether the resolved identifier satisfies the gold expectation.
    pub id_match: bool,
    /// Whether the resolution confidence reaches the gold minimum.
    pub confidence_ok: bool,
}
376
/// Entity benchmark metrics for a single gold entity type.
#[derive(Debug, Serialize)]
pub struct TypeBreakdown {
    /// Gold entity type this row describes.
    pub entity_type: String,
    /// Number of gold entities of this type.
    pub total: usize,
    /// How many of them were found in the frontier.
    pub found: usize,
    /// How many had a correct identifier.
    pub id_correct: usize,
    /// How many met the confidence expectation.
    pub confidence_ok: usize,
    /// `id_correct / found`, rounded to three decimals (`0.0` when nothing was found).
    pub precision: f64,
    /// `id_correct / total`, rounded to three decimals.
    pub recall: f64,
    /// Harmonic mean of precision and recall, rounded to three decimals.
    pub f1: f64,
}
389
/// Aggregate result of the entity-resolution benchmark.
#[derive(Debug, Serialize)]
pub struct EntityBenchmarkReport {
    /// Number of gold entities evaluated.
    pub total_gold_entities: usize,
    /// Gold entities for which a frontier entity with the same name exists.
    pub found_in_frontier: usize,
    /// Gold entities whose best frontier entity has the expected type.
    pub type_correct: usize,
    /// Gold entities whose best frontier entity has an accepted identifier.
    pub id_correct: usize,
    /// Gold entities whose best frontier entity meets the confidence expectation.
    pub confidence_ok: usize,
    /// `id_correct / found_in_frontier`, rounded to three decimals.
    pub precision: f64,
    /// `id_correct / total_gold_entities`, rounded to three decimals.
    pub recall: f64,
    /// Harmonic mean of precision and recall, rounded to three decimals.
    pub f1: f64,
    /// `type_correct / total_gold_entities`, rounded to three decimals.
    pub type_accuracy: f64,
    /// Per-type metrics, sorted by entity type.
    pub by_type: Vec<TypeBreakdown>,
    /// One entry per gold entity, in gold order.
    pub details: Vec<EntityMatchDetail>,
}
405
406pub fn run_entity_benchmark(frontier_path: &Path, gold_path: &Path, json_output: bool) {
407 let frontier = repo::load_from_path(frontier_path).expect("Failed to load frontier");
408
409 let gold_data =
410 std::fs::read_to_string(gold_path).expect("Failed to read entity gold standard file");
411 let gold: Vec<GoldEntity> =
412 serde_json::from_str(&gold_data).expect("Failed to parse entity gold standard JSON");
413
414 let report = entity_benchmark(&frontier.findings, &gold);
415
416 if json_output {
417 let json = serde_json::to_string_pretty(&report).unwrap();
418 println!("{json}");
419 } else {
420 print_entity_report(&report);
421 }
422}
423
424fn collect_entities(findings: &[FindingBundle]) -> HashMap<String, Vec<&Entity>> {
426 let mut map: HashMap<String, Vec<&Entity>> = HashMap::new();
427 for f in findings {
428 for ent in &f.assertion.entities {
429 map.entry(ent.name.to_lowercase()).or_default().push(ent);
430 }
431 }
432 map
433}
434
435fn id_matches(entity: &Entity, gold: &GoldEntity) -> bool {
437 if gold.expected_source.is_empty() && gold.expected_id.is_empty() {
438 return entity.entity_type == gold.entity_type;
439 }
440 if let Some(ref cid) = entity.canonical_id {
441 if cid.source == gold.expected_source && cid.id == gold.expected_id {
443 return true;
444 }
445 for alt in &gold.alternatives {
447 if cid.source == alt.source && cid.id == alt.id {
448 return true;
449 }
450 }
451 }
452 false
453}
454
455fn confidence_ok(entity: &Entity, gold: &GoldEntity) -> bool {
457 if let Some(ref cid) = entity.canonical_id {
458 cid.confidence >= gold.expected_confidence
459 } else {
460 entity.resolution_confidence >= gold.expected_confidence
461 }
462}
463
464pub fn entity_benchmark(findings: &[FindingBundle], gold: &[GoldEntity]) -> EntityBenchmarkReport {
465 let entity_map = collect_entities(findings);
466 let mut details = Vec::new();
467
468 for g in gold {
469 let key = g.name.to_lowercase();
470 let entities = entity_map.get(&key);
471
472 let (found_type, resolved_source, resolved_id, resolved_conf, type_match, matched, conf_ok) =
473 if let Some(ents) = entities {
474 let best = ents.iter().max_by(|a, b| {
475 entity_rank(a, g)
476 .cmp(&entity_rank(b, g))
477 .then_with(|| a.name.cmp(&b.name))
478 });
479
480 if let Some(ent) = best {
481 (
482 Some(ent.entity_type.clone()),
483 ent.canonical_id.as_ref().map(|cid| cid.source.clone()),
484 ent.canonical_id.as_ref().map(|cid| cid.id.clone()),
485 entity_resolution_confidence(ent),
486 ent.entity_type == g.entity_type,
487 id_matches(ent, g),
488 confidence_ok(ent, g),
489 )
490 } else {
491 (None, None, None, 0.0, false, false, false)
492 }
493 } else {
494 (None, None, None, 0.0, false, false, false)
495 };
496
497 details.push(EntityMatchDetail {
498 name: g.name.clone(),
499 entity_type: g.entity_type.clone(),
500 expected_source: g.expected_source.clone(),
501 expected_id: g.expected_id.clone(),
502 found_type,
503 resolved_source,
504 resolved_id,
505 resolved_confidence: round3(resolved_conf),
506 type_match,
507 id_match: matched,
508 confidence_ok: conf_ok,
509 });
510 }
511
512 let total = gold.len();
513 let found = details.iter().filter(|d| d.found_type.is_some()).count();
514 let type_correct = details.iter().filter(|d| d.type_match).count();
515 let id_correct = details.iter().filter(|d| d.id_match).count();
516 let conf_ok_count = details.iter().filter(|d| d.confidence_ok).count();
517
518 let precision = if found == 0 {
520 0.0
521 } else {
522 id_correct as f64 / found as f64
523 };
524 let recall = if total == 0 {
525 0.0
526 } else {
527 id_correct as f64 / total as f64
528 };
529 let f1 = if precision + recall == 0.0 {
530 0.0
531 } else {
532 2.0 * precision * recall / (precision + recall)
533 };
534 let type_accuracy = if total == 0 {
535 0.0
536 } else {
537 type_correct as f64 / total as f64
538 };
539
540 let mut type_groups: HashMap<String, Vec<&EntityMatchDetail>> = HashMap::new();
542 for d in &details {
543 type_groups
544 .entry(d.entity_type.clone())
545 .or_default()
546 .push(d);
547 }
548
549 let mut by_type: Vec<TypeBreakdown> = type_groups
550 .into_iter()
551 .map(|(etype, ds)| {
552 let t = ds.len();
553 let f = ds.iter().filter(|d| d.found_type.is_some()).count();
554 let c = ds.iter().filter(|d| d.id_match).count();
555 let co = ds.iter().filter(|d| d.confidence_ok).count();
556 let p = if f == 0 { 0.0 } else { c as f64 / f as f64 };
557 let r = if t == 0 { 0.0 } else { c as f64 / t as f64 };
558 let f1t = if p + r == 0.0 {
559 0.0
560 } else {
561 2.0 * p * r / (p + r)
562 };
563 TypeBreakdown {
564 entity_type: etype,
565 total: t,
566 found: f,
567 id_correct: c,
568 confidence_ok: co,
569 precision: round3(p),
570 recall: round3(r),
571 f1: round3(f1t),
572 }
573 })
574 .collect();
575 by_type.sort_by(|a, b| a.entity_type.cmp(&b.entity_type));
576
577 EntityBenchmarkReport {
578 total_gold_entities: total,
579 found_in_frontier: found,
580 type_correct,
581 id_correct,
582 confidence_ok: conf_ok_count,
583 precision: round3(precision),
584 recall: round3(recall),
585 f1: round3(f1),
586 type_accuracy: round3(type_accuracy),
587 by_type,
588 details,
589 }
590}
591
592fn entity_rank(entity: &Entity, gold: &GoldEntity) -> (u8, u8, u32) {
593 (
594 u8::from(entity.entity_type == gold.entity_type),
595 u8::from(entity.canonical_id.is_some()),
596 (entity_resolution_confidence(entity) * 1000.0).round() as u32,
597 )
598}
599
600fn entity_resolution_confidence(entity: &Entity) -> f64 {
601 entity
602 .canonical_id
603 .as_ref()
604 .map(|cid| cid.confidence)
605 .unwrap_or(entity.resolution_confidence)
606}
607
608fn print_entity_report(report: &EntityBenchmarkReport) {
609 println!();
610 println!(" {}", "VELA · ENTITY RESOLUTION BENCHMARK".dimmed());
611 println!(" {}", style::tick_row(60));
612 println!(" gold entities: {}", report.total_gold_entities);
613 println!(" found in frontier: {}", report.found_in_frontier);
614 println!(" type correct: {}", report.type_correct);
615 println!(" id correct: {}", report.id_correct);
616 println!(" confidence ok: {}", report.confidence_ok);
617 println!();
618 println!(" precision: {:.1}%", report.precision * 100.0);
619 println!(" recall: {:.1}%", report.recall * 100.0);
620 println!(" f1: {:.1}%", report.f1 * 100.0);
621 println!(" type accuracy: {:.1}%", report.type_accuracy * 100.0);
622 println!();
623 println!(" {}", "BY TYPE".dimmed());
624 println!(
625 " {}",
626 format!(
627 "{:<12} {:>5} {:>5} {:>7} {:>8} {:>6} {:>6}",
628 "type", "total", "found", "correct", "conf_ok", "prec", "f1"
629 )
630 .dimmed()
631 );
632 for t in &report.by_type {
633 println!(
634 " {:<12} {:>5} {:>5} {:>7} {:>8} {:>5.1}% {:>5.1}%",
635 t.entity_type,
636 t.total,
637 t.found,
638 t.id_correct,
639 t.confidence_ok,
640 t.precision * 100.0,
641 t.f1 * 100.0,
642 );
643 }
644
645 let mismatches: Vec<_> = report.details.iter().filter(|d| !d.id_match).collect();
647 if !mismatches.is_empty() {
648 println!();
649 println!(
650 " {}",
651 format!("MISMATCHES ({})", mismatches.len()).dimmed()
652 );
653 println!(" {}", style::tick_row(60));
654 for d in &mismatches {
655 let resolved = match (&d.resolved_source, &d.resolved_id) {
656 (Some(s), Some(id)) => format!("{s}:{id}"),
657 _ => d
658 .found_type
659 .clone()
660 .unwrap_or_else(|| "missing".to_string()),
661 };
662 println!(
663 " {} ({}) expected {}:{} got {}",
664 d.name, d.entity_type, d.expected_source, d.expected_id, resolved
665 );
666 }
667 }
668
669 println!();
670 println!(" {}", style::tick_row(60));
671 println!();
672}
673
/// One reference link from a link gold file.
#[derive(Debug, Clone, Deserialize)]
pub struct GoldLink {
    /// Id of the finding that is expected to own the link.
    pub source_id: String,
    /// Expected link target.
    pub target_id: String,
    /// Expected link type.
    pub link_type: String,
    /// Free-form note; defaults to an empty string when absent from the JSON.
    #[serde(default)]
    pub note: String,
}
687
/// Outcome for a single gold link in the link benchmark.
#[derive(Debug, Serialize)]
pub struct LinkMatchDetail {
    /// Gold source finding id.
    pub source_id: String,
    /// Gold target id.
    pub target_id: String,
    /// Link type the gold file expects.
    pub expected_type: String,
    /// Whether the frontier has any link for this (source, target) pair.
    pub found: bool,
    /// A link type recorded in the frontier for this pair; `None` when no link was found.
    pub found_type: Option<String>,
    /// Whether the frontier has a link of the expected type for this pair.
    pub type_correct: bool,
}
698
/// Link benchmark metrics for a single expected link type.
#[derive(Debug, Serialize)]
pub struct LinkTypeBreakdown {
    /// Expected link type this row describes.
    pub link_type: String,
    /// Number of gold links of this type.
    pub total: usize,
    /// How many of them have any frontier link for their (source, target) pair.
    pub found: usize,
    /// How many have a frontier link of the expected type.
    pub type_correct: usize,
    /// `type_correct / found`, rounded to three decimals (`0.0` when nothing was found).
    pub precision: f64,
    /// `type_correct / total`, rounded to three decimals.
    pub recall: f64,
    /// Harmonic mean of precision and recall, rounded to three decimals.
    pub f1: f64,
}
710
/// Aggregate result of the link benchmark.
#[derive(Debug, Serialize)]
pub struct LinkBenchmarkReport {
    /// Number of gold links evaluated.
    pub total_gold_links: usize,
    /// Total number of links across all frontier findings.
    pub total_frontier_links: usize,
    /// Gold links whose (source, target) pair exists in the frontier.
    pub found: usize,
    /// Gold links for which the frontier has a link of the expected type.
    pub type_correct: usize,
    /// `type_correct / found`, rounded to three decimals.
    pub precision: f64,
    /// `type_correct / total_gold_links`, rounded to three decimals.
    pub recall: f64,
    /// Harmonic mean of precision and recall, rounded to three decimals.
    pub f1: f64,
    /// Per-type metrics, sorted by link type.
    pub by_type: Vec<LinkTypeBreakdown>,
    /// One entry per gold link, in gold order.
    pub details: Vec<LinkMatchDetail>,
}
724
725pub fn run_link_benchmark(frontier_path: &Path, gold_path: &Path, json_output: bool) {
726 let frontier = repo::load_from_path(frontier_path).expect("Failed to load frontier");
727
728 let gold_data =
729 std::fs::read_to_string(gold_path).expect("Failed to read link gold standard file");
730 let gold: Vec<GoldLink> =
731 serde_json::from_str(&gold_data).expect("Failed to parse link gold standard JSON");
732
733 let report = link_benchmark(&frontier.findings, &gold);
734
735 if json_output {
736 let json = serde_json::to_string_pretty(&report).unwrap();
737 println!("{json}");
738 } else {
739 print_link_report(&report);
740 }
741}
742
743fn collect_links(findings: &[FindingBundle]) -> HashMap<(String, String), Vec<String>> {
745 let mut map: HashMap<(String, String), Vec<String>> = HashMap::new();
746 for f in findings {
747 for link in &f.links {
748 map.entry((f.id.clone(), link.target.clone()))
749 .or_default()
750 .push(link.link_type.clone());
751 }
752 }
753 map
754}
755
756pub fn link_benchmark(findings: &[FindingBundle], gold: &[GoldLink]) -> LinkBenchmarkReport {
757 let link_map = collect_links(findings);
758 let total_frontier_links: usize = findings.iter().map(|f| f.links.len()).sum();
759 let mut details = Vec::new();
760
761 for g in gold {
762 let key = (g.source_id.clone(), g.target_id.clone());
763 let types = link_map.get(&key);
764
765 let (found, found_type, type_correct) = if let Some(ts) = types {
766 let correct = ts.contains(&g.link_type);
767 (true, Some(ts[0].clone()), correct)
768 } else {
769 (false, None, false)
770 };
771
772 details.push(LinkMatchDetail {
773 source_id: g.source_id.clone(),
774 target_id: g.target_id.clone(),
775 expected_type: g.link_type.clone(),
776 found,
777 found_type,
778 type_correct,
779 });
780 }
781
782 let total = gold.len();
783 let found_count = details.iter().filter(|d| d.found).count();
784 let type_correct_count = details.iter().filter(|d| d.type_correct).count();
785
786 let precision = if found_count == 0 {
788 0.0
789 } else {
790 type_correct_count as f64 / found_count as f64
791 };
792 let recall = if total == 0 {
793 0.0
794 } else {
795 type_correct_count as f64 / total as f64
796 };
797 let f1 = if precision + recall == 0.0 {
798 0.0
799 } else {
800 2.0 * precision * recall / (precision + recall)
801 };
802
803 let mut type_groups: HashMap<String, Vec<&LinkMatchDetail>> = HashMap::new();
805 for d in &details {
806 type_groups
807 .entry(d.expected_type.clone())
808 .or_default()
809 .push(d);
810 }
811
812 let mut by_type: Vec<LinkTypeBreakdown> = type_groups
813 .into_iter()
814 .map(|(lt, ds)| {
815 let t = ds.len();
816 let f = ds.iter().filter(|d| d.found).count();
817 let c = ds.iter().filter(|d| d.type_correct).count();
818 let p = if f == 0 { 0.0 } else { c as f64 / f as f64 };
819 let r = if t == 0 { 0.0 } else { c as f64 / t as f64 };
820 let f1t = if p + r == 0.0 {
821 0.0
822 } else {
823 2.0 * p * r / (p + r)
824 };
825 LinkTypeBreakdown {
826 link_type: lt,
827 total: t,
828 found: f,
829 type_correct: c,
830 precision: round3(p),
831 recall: round3(r),
832 f1: round3(f1t),
833 }
834 })
835 .collect();
836 by_type.sort_by(|a, b| a.link_type.cmp(&b.link_type));
837
838 LinkBenchmarkReport {
839 total_gold_links: total,
840 total_frontier_links,
841 found: found_count,
842 type_correct: type_correct_count,
843 precision: round3(precision),
844 recall: round3(recall),
845 f1: round3(f1),
846 by_type,
847 details,
848 }
849}
850
851fn print_link_report(report: &LinkBenchmarkReport) {
852 println!();
853 println!(" {}", "VELA · LINK BENCHMARK".dimmed());
854 println!(" {}", style::tick_row(60));
855 println!(" gold links: {}", report.total_gold_links);
856 println!(" project links: {}", report.total_frontier_links);
857 println!(" found: {}", report.found);
858 println!(" type correct: {}", report.type_correct);
859 println!();
860 println!(" precision: {:.1}%", report.precision * 100.0);
861 println!(" recall: {:.1}%", report.recall * 100.0);
862 println!(" f1: {:.1}%", report.f1 * 100.0);
863 println!();
864 println!(" {}", "BY TYPE".dimmed());
865 println!(
866 " {}",
867 format!(
868 "{:<12} {:>5} {:>5} {:>7} {:>6} {:>6}",
869 "type", "total", "found", "correct", "prec", "f1"
870 )
871 .dimmed()
872 );
873 for t in &report.by_type {
874 println!(
875 " {:<12} {:>5} {:>5} {:>7} {:>5.1}% {:>5.1}%",
876 t.link_type,
877 t.total,
878 t.found,
879 t.type_correct,
880 t.precision * 100.0,
881 t.f1 * 100.0,
882 );
883 }
884
885 let mismatches: Vec<_> = report.details.iter().filter(|d| !d.type_correct).collect();
887 if !mismatches.is_empty() {
888 println!();
889 println!(
890 " {}",
891 format!("MISMATCHES ({})", mismatches.len()).dimmed()
892 );
893 println!(" {}", style::tick_row(60));
894 for d in &mismatches {
895 let found_str = match &d.found_type {
896 Some(t) => t.as_str(),
897 None => "missing",
898 };
899 println!(
900 " {} · {} expected:{} got:{}",
901 d.source_id, d.target_id, d.expected_type, found_str
902 );
903 }
904 }
905
906 println!();
907 println!(" {}", style::tick_row(60));
908 println!();
909}
910
/// Kind of benchmark a suite task runs. Serialized in `snake_case`
/// (for example `"finding"`).
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BenchmarkMode {
    /// Compare frontier findings with gold findings.
    Finding,
    /// Compare entity resolution with gold entities.
    Entity,
    /// Compare frontier links with gold links.
    Link,
    /// Workflow-level checks (handled outside the portion of the file shown here).
    Workflow,
}
923
924impl BenchmarkMode {
925 fn as_str(&self) -> &'static str {
926 match self {
927 BenchmarkMode::Finding => "finding",
928 BenchmarkMode::Entity => "entity",
929 BenchmarkMode::Link => "link",
930 BenchmarkMode::Workflow => "workflow",
931 }
932 }
933}
934
/// Optional minimum scores a benchmark task must reach.
///
/// Every field defaults to `None` and is omitted from serialized output when
/// unset. NOTE(review): the code that enforces these thresholds is not part of
/// the section shown here; presumably an unset threshold is simply not checked.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct BenchmarkThresholds {
    /// Minimum F1 score.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_f1: Option<f64>,
    /// Minimum precision.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_precision: Option<f64>,
    /// Minimum recall.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_recall: Option<f64>,
    /// Minimum entity accuracy.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_entity_accuracy: Option<f64>,
    /// Minimum confidence calibration.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_confidence_calibration: Option<f64>,
    /// Minimum type accuracy.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_type_accuracy: Option<f64>,
    /// Minimum workflow score.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub min_workflow_score: Option<f64>,
}
952
/// Minimum counts expected by a workflow benchmark task.
///
/// Every field defaults to `0` when absent from the suite JSON.
/// NOTE(review): the workflow evaluation itself is outside the section shown
/// here, so the exact meaning of each count should be confirmed there.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct WorkflowExpectations {
    /// Minimum number of findings.
    #[serde(default)]
    pub min_findings: usize,
    /// Minimum number of links.
    #[serde(default)]
    pub min_links: usize,
    /// Minimum number of entity mentions.
    #[serde(default)]
    pub min_entity_mentions: usize,
    /// Minimum number of evidence spans.
    #[serde(default)]
    pub min_evidence_spans: usize,
    /// Minimum number of findings with complete provenance.
    #[serde(default)]
    pub min_provenance_complete: usize,
    /// Minimum number of assertion types.
    #[serde(default)]
    pub min_assertion_types: usize,
    /// Minimum number of gap flags.
    #[serde(default)]
    pub min_gap_flags: usize,
    /// Minimum number of contested flags.
    #[serde(default)]
    pub min_contested_flags: usize,
}
972
/// One task inside a benchmark suite.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BenchmarkTask {
    /// Task identifier; reported in task output and failure messages.
    pub id: String,
    /// Which benchmark to run.
    pub mode: BenchmarkMode,
    /// Optional human-readable description.
    #[serde(default)]
    pub description: Option<String>,
    /// Optional frontier path for this task; the suite-level frontier is used when absent.
    #[serde(default)]
    pub frontier: Option<String>,
    /// Optional gold fixture path, resolved against the suite file's directory.
    #[serde(default)]
    pub gold: Option<String>,
    /// Minimum scores for the task; defaults to no thresholds.
    #[serde(default)]
    pub thresholds: BenchmarkThresholds,
    /// Expectations for workflow-mode tasks.
    #[serde(default)]
    pub workflow: Option<WorkflowExpectations>,
}
988
/// A benchmark suite definition, loaded from a JSON file.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BenchmarkSuite {
    /// Suite identifier.
    pub id: String,
    /// Display name of the suite.
    pub name: String,
    /// Optional human-readable description.
    #[serde(default)]
    pub description: Option<String>,
    /// Default frontier path, resolved against the suite file's directory.
    pub frontier: String,
    /// Tasks to run, in order.
    pub tasks: Vec<BenchmarkTask>,
}
998
/// Result of a workflow benchmark.
///
/// NOTE(review): this report is built outside the section of the file shown
/// here; the field descriptions below follow the field names and should be
/// confirmed against the code that fills them in.
#[derive(Debug, Serialize)]
pub struct WorkflowBenchmarkReport {
    /// Total number of findings.
    pub total_findings: usize,
    /// Total number of links.
    pub total_links: usize,
    /// Total number of entity mentions.
    pub total_entity_mentions: usize,
    /// Total number of evidence spans.
    pub total_evidence_spans: usize,
    /// Number of findings with complete provenance.
    pub total_provenance_complete: usize,
    /// Evidence-span coverage ratio.
    pub evidence_span_coverage: f64,
    /// Provenance coverage ratio.
    pub provenance_coverage: f64,
    /// Number of assertion types.
    pub assertion_types: usize,
    /// Number of gap flags.
    pub gap_flags: usize,
    /// Number of contested flags.
    pub contested_flags: usize,
    /// Number of checks evaluated.
    pub checks_total: usize,
    /// Number of checks that passed.
    pub checks_passed: usize,
    /// Overall workflow score.
    pub workflow_score: f64,
    /// Per-check results.
    pub details: Vec<WorkflowCheckDetail>,
}
1016
/// Result of one workflow check: an observed count compared with its expected minimum.
#[derive(Debug, Serialize)]
pub struct WorkflowCheckDetail {
    /// Name of the metric that was checked.
    pub metric: String,
    /// Observed value.
    pub actual: usize,
    /// Minimum the task expected.
    pub expected_min: usize,
    /// Whether the check passed (presumably `actual >= expected_min` — confirm where it is set).
    pub passed: bool,
}
1024
1025pub fn load_suite(path: &Path) -> Result<BenchmarkSuite, String> {
1026 let data = std::fs::read_to_string(path)
1027 .map_err(|e| format!("Failed to read benchmark suite '{}': {e}", path.display()))?;
1028 serde_json::from_str(&data)
1029 .map_err(|e| format!("Failed to parse benchmark suite '{}': {e}", path.display()))
1030}
1031
1032pub fn suite_ready_report(suite_path: &Path) -> Result<serde_json::Value, String> {
1033 let envelope = run_suite(suite_path)?;
1034 let suite_ready = envelope
1035 .get("ok")
1036 .and_then(|value| value.as_bool())
1037 .unwrap_or(false);
1038 Ok(json!({
1039 "ok": suite_ready,
1040 "command": "bench",
1041 "suite_ready": suite_ready,
1042 "suite": envelope.get("suite").cloned().unwrap_or(serde_json::Value::Null),
1043 "tasks": envelope.get("tasks").cloned().unwrap_or_else(|| json!([])),
1044 "failures": envelope.get("failures").cloned().unwrap_or_else(|| json!([])),
1045 }))
1046}
1047
1048pub fn run_suite(suite_path: &Path) -> Result<serde_json::Value, String> {
1049 let suite = load_suite(suite_path)?;
1050 let base_dir = suite_path.parent().unwrap_or_else(|| Path::new("."));
1051 let frontier_path = resolve_suite_path(base_dir, &suite.frontier);
1052 let loaded = repo::load_from_path(&frontier_path)?;
1053 let frontier_hash = hash_path(&frontier_path)?;
1054
1055 let mut task_outputs = Vec::new();
1056 let mut failures = Vec::new();
1057 let mut standard_candles = Vec::new();
1058
1059 for task in &suite.tasks {
1060 if let Some(gold) = &task.gold {
1061 let gold_path = resolve_suite_path(base_dir, gold);
1062 standard_candles.push(json!({
1063 "task_id": task.id,
1064 "mode": task.mode.as_str(),
1065 "path": gold_path.display().to_string(),
1066 "items": count_json_array_items(&gold_path)?,
1067 "role": "reviewed calibration anchor"
1068 }));
1069 }
1070 let task_frontier = task
1071 .frontier
1072 .as_deref()
1073 .map(|p| resolve_suite_path(base_dir, p))
1074 .unwrap_or_else(|| frontier_path.clone());
1075 let output = task_envelope(
1076 &task_frontier,
1077 Some((&suite.id, &task.id)),
1078 task.mode.clone(),
1079 task.gold
1080 .as_deref()
1081 .map(|p| resolve_suite_path(base_dir, p))
1082 .as_deref(),
1083 &task.thresholds,
1084 task.workflow.as_ref(),
1085 )?;
1086 if !output
1087 .get("ok")
1088 .and_then(|value| value.as_bool())
1089 .unwrap_or(false)
1090 {
1091 failures.push(format!("task {} failed", task.id));
1092 }
1093 task_outputs.push(output);
1094 }
1095
1096 let passed = task_outputs
1097 .iter()
1098 .filter(|task| task.get("ok").and_then(|value| value.as_bool()) == Some(true))
1099 .count();
1100 let ok = failures.is_empty();
1101
1102 Ok(json!({
1103 "ok": ok,
1104 "command": "bench",
1105 "benchmark_type": "suite",
1106 "schema_version": project::VELA_SCHEMA_VERSION,
1107 "suite": {
1108 "id": suite.id,
1109 "name": suite.name,
1110 "path": suite_path.display().to_string(),
1111 "tasks": suite.tasks.len(),
1112 },
1113 "frontier": {
1114 "name": loaded.project.name,
1115 "source": frontier_path.display().to_string(),
1116 "hash": format!("sha256:{frontier_hash}"),
1117 },
1118 "metrics": {
1119 "tasks_total": task_outputs.len(),
1120 "tasks_passed": passed,
1121 "tasks_failed": task_outputs.len().saturating_sub(passed),
1122 "standard_candles": standard_candles
1123 .iter()
1124 .filter_map(|item| item.get("items").and_then(|value| value.as_u64()))
1125 .sum::<u64>(),
1126 },
1127 "standard_candles": {
1128 "definition": "Reviewed gold fixtures used as calibration anchors for release drift, not proof of scientific superiority.",
1129 "items": standard_candles,
1130 },
1131 "failures": failures,
1132 "tasks": task_outputs,
1133 }))
1134}
1135
1136fn count_json_array_items(path: &Path) -> Result<usize, String> {
1137 let data = std::fs::read_to_string(path)
1138 .map_err(|e| format!("Failed to read gold fixture '{}': {e}", path.display()))?;
1139 let value: serde_json::Value = serde_json::from_str(&data)
1140 .map_err(|e| format!("Failed to parse gold fixture '{}': {e}", path.display()))?;
1141 value
1142 .as_array()
1143 .map(Vec::len)
1144 .ok_or_else(|| format!("Gold fixture '{}' must be a JSON array", path.display()))
1145}
1146
1147pub fn task_envelope(
1148 frontier_path: &Path,
1149 suite_task: Option<(&str, &str)>,
1150 mode: BenchmarkMode,
1151 gold_path: Option<&Path>,
1152 thresholds: &BenchmarkThresholds,
1153 workflow: Option<&WorkflowExpectations>,
1154) -> Result<serde_json::Value, String> {
1155 let loaded = repo::load_from_path(frontier_path)?;
1156 let frontier_hash = hash_path(frontier_path)?;
1157 let (suite_id, task_id) = suite_task
1158 .map(|(suite, task)| (Some(suite.to_string()), Some(task.to_string())))
1159 .unwrap_or((None, None));
1160
1161 match mode {
1162 BenchmarkMode::Finding => {
1163 let gold_path =
1164 gold_path.ok_or_else(|| "finding benchmark requires gold".to_string())?;
1165 let gold_data = std::fs::read_to_string(gold_path).map_err(|e| {
1166 format!("Failed to read finding gold '{}': {e}", gold_path.display())
1167 })?;
1168 let gold: Vec<GoldFinding> = serde_json::from_str(&gold_data).map_err(|e| {
1169 format!(
1170 "Failed to parse finding gold '{}': {e}",
1171 gold_path.display()
1172 )
1173 })?;
1174 let report = benchmark(&loaded.findings, &gold);
1175 let failures = finding_threshold_failures(&report, thresholds);
1176 let gold_hash = hash_path(gold_path)?;
1177 Ok(json!({
1178 "ok": failures.is_empty(),
1179 "command": "bench",
1180 "benchmark_type": BenchmarkMode::Finding.as_str(),
1181 "mode": BenchmarkMode::Finding.as_str(),
1182 "suite_id": suite_id,
1183 "task_id": task_id,
1184 "schema_version": project::VELA_SCHEMA_VERSION,
1185 "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
1186 "gold": gold_metadata(gold_path, &gold_hash, gold.len()),
1187 "metrics": {
1188 "total_frontier_findings": report.total_frontier_findings,
1189 "total_gold_findings": report.total_gold_findings,
1190 "matched": report.matched,
1191 "total_frontier_matched": report.total_frontier_matched,
1192 "unmatched_gold": report.unmatched_gold,
1193 "unmatched_frontier": report.unmatched_frontier,
1194 "exact_id_matches": report.exact_id_matches,
1195 "precision": report.precision,
1196 "recall": report.recall,
1197 "f1": report.f1,
1198 "entity_accuracy": report.entity_accuracy,
1199 "assertion_type_accuracy": report.assertion_type_accuracy,
1200 "confidence_calibration": report.confidence_calibration,
1201 },
1202 "thresholds": thresholds,
1203 "failures": failures,
1204 "match_details": report.match_details,
1205 }))
1206 }
1207 BenchmarkMode::Entity => {
1208 let gold_path =
1209 gold_path.ok_or_else(|| "entity benchmark requires gold".to_string())?;
1210 let gold_data = std::fs::read_to_string(gold_path).map_err(|e| {
1211 format!("Failed to read entity gold '{}': {e}", gold_path.display())
1212 })?;
1213 let gold: Vec<GoldEntity> = serde_json::from_str(&gold_data).map_err(|e| {
1214 format!("Failed to parse entity gold '{}': {e}", gold_path.display())
1215 })?;
1216 let report = entity_benchmark(&loaded.findings, &gold);
1217 let failures = entity_threshold_failures(&report, thresholds);
1218 let gold_hash = hash_path(gold_path)?;
1219 Ok(json!({
1220 "ok": failures.is_empty(),
1221 "command": "bench",
1222 "benchmark_type": BenchmarkMode::Entity.as_str(),
1223 "mode": BenchmarkMode::Entity.as_str(),
1224 "suite_id": suite_id,
1225 "task_id": task_id,
1226 "schema_version": project::VELA_SCHEMA_VERSION,
1227 "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
1228 "gold": gold_metadata(gold_path, &gold_hash, gold.len()),
1229 "metrics": {
1230 "total_gold_entities": report.total_gold_entities,
1231 "found_in_frontier": report.found_in_frontier,
1232 "type_correct": report.type_correct,
1233 "id_correct": report.id_correct,
1234 "confidence_ok": report.confidence_ok,
1235 "precision": report.precision,
1236 "recall": report.recall,
1237 "f1": report.f1,
1238 "type_accuracy": report.type_accuracy,
1239 },
1240 "thresholds": thresholds,
1241 "failures": failures,
1242 "by_type": report.by_type,
1243 "details": report.details,
1244 }))
1245 }
1246 BenchmarkMode::Link => {
1247 let gold_path = gold_path.ok_or_else(|| "link benchmark requires gold".to_string())?;
1248 let gold_data = std::fs::read_to_string(gold_path)
1249 .map_err(|e| format!("Failed to read link gold '{}': {e}", gold_path.display()))?;
1250 let gold: Vec<GoldLink> = serde_json::from_str(&gold_data)
1251 .map_err(|e| format!("Failed to parse link gold '{}': {e}", gold_path.display()))?;
1252 let report = link_benchmark(&loaded.findings, &gold);
1253 let failures = link_threshold_failures(&report, thresholds);
1254 let gold_hash = hash_path(gold_path)?;
1255 Ok(json!({
1256 "ok": failures.is_empty(),
1257 "command": "bench",
1258 "benchmark_type": BenchmarkMode::Link.as_str(),
1259 "mode": BenchmarkMode::Link.as_str(),
1260 "suite_id": suite_id,
1261 "task_id": task_id,
1262 "schema_version": project::VELA_SCHEMA_VERSION,
1263 "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
1264 "gold": gold_metadata(gold_path, &gold_hash, gold.len()),
1265 "metrics": {
1266 "total_gold_links": report.total_gold_links,
1267 "total_frontier_links": report.total_frontier_links,
1268 "found": report.found,
1269 "type_correct": report.type_correct,
1270 "precision": report.precision,
1271 "recall": report.recall,
1272 "f1": report.f1,
1273 },
1274 "thresholds": thresholds,
1275 "failures": failures,
1276 "by_type": report.by_type,
1277 "details": report.details,
1278 }))
1279 }
1280 BenchmarkMode::Workflow => {
1281 let expectations = workflow.cloned().unwrap_or_default();
1282 let report = workflow_benchmark(&loaded.findings, &expectations);
1283 let failures = workflow_threshold_failures(&report, thresholds);
1284 Ok(json!({
1285 "ok": failures.is_empty(),
1286 "command": "bench",
1287 "benchmark_type": BenchmarkMode::Workflow.as_str(),
1288 "mode": BenchmarkMode::Workflow.as_str(),
1289 "suite_id": suite_id,
1290 "task_id": task_id,
1291 "schema_version": project::VELA_SCHEMA_VERSION,
1292 "frontier": frontier_metadata(&loaded, frontier_path, &frontier_hash),
1293 "gold": null,
1294 "metrics": {
1295 "total_findings": report.total_findings,
1296 "total_links": report.total_links,
1297 "total_entity_mentions": report.total_entity_mentions,
1298 "total_evidence_spans": report.total_evidence_spans,
1299 "total_provenance_complete": report.total_provenance_complete,
1300 "evidence_span_coverage": report.evidence_span_coverage,
1301 "provenance_coverage": report.provenance_coverage,
1302 "assertion_types": report.assertion_types,
1303 "gap_flags": report.gap_flags,
1304 "contested_flags": report.contested_flags,
1305 "checks_total": report.checks_total,
1306 "checks_passed": report.checks_passed,
1307 "workflow_score": report.workflow_score,
1308 },
1309 "thresholds": thresholds,
1310 "failures": failures,
1311 "details": report.details,
1312 }))
1313 }
1314 }
1315}
1316
1317pub fn workflow_benchmark(
1318 findings: &[FindingBundle],
1319 expectations: &WorkflowExpectations,
1320) -> WorkflowBenchmarkReport {
1321 let total_links = findings.iter().map(|f| f.links.len()).sum();
1322 let total_entity_mentions = findings.iter().map(|f| f.assertion.entities.len()).sum();
1323 let total_evidence_spans = findings
1324 .iter()
1325 .map(|f| f.evidence.evidence_spans.len())
1326 .sum();
1327 let findings_with_spans = findings
1328 .iter()
1329 .filter(|f| !f.evidence.evidence_spans.is_empty())
1330 .count();
1331 let total_provenance_complete = findings
1332 .iter()
1333 .filter(|f| {
1334 f.provenance.doi.is_some()
1335 || f.provenance.pmid.is_some()
1336 || !f.provenance.title.trim().is_empty()
1337 })
1338 .count();
1339 let evidence_span_coverage = if findings.is_empty() {
1340 1.0
1341 } else {
1342 findings_with_spans as f64 / findings.len() as f64
1343 };
1344 let provenance_coverage = if findings.is_empty() {
1345 1.0
1346 } else {
1347 total_provenance_complete as f64 / findings.len() as f64
1348 };
1349 let assertion_types = findings
1350 .iter()
1351 .map(|f| f.assertion.assertion_type.as_str())
1352 .collect::<HashSet<_>>()
1353 .len();
1354 let gap_flags = findings.iter().filter(|f| f.flags.gap).count();
1355 let contested_flags = findings.iter().filter(|f| f.flags.contested).count();
1356
1357 let checks = vec![
1358 ("findings", findings.len(), expectations.min_findings),
1359 ("links", total_links, expectations.min_links),
1360 (
1361 "entity_mentions",
1362 total_entity_mentions,
1363 expectations.min_entity_mentions,
1364 ),
1365 (
1366 "evidence_spans",
1367 total_evidence_spans,
1368 expectations.min_evidence_spans,
1369 ),
1370 (
1371 "provenance_complete",
1372 total_provenance_complete,
1373 expectations.min_provenance_complete,
1374 ),
1375 (
1376 "assertion_types",
1377 assertion_types,
1378 expectations.min_assertion_types,
1379 ),
1380 ("gap_flags", gap_flags, expectations.min_gap_flags),
1381 (
1382 "contested_flags",
1383 contested_flags,
1384 expectations.min_contested_flags,
1385 ),
1386 ];
1387 let details: Vec<WorkflowCheckDetail> = checks
1388 .into_iter()
1389 .filter(|(_, _, expected)| *expected > 0)
1390 .map(|(metric, actual, expected_min)| WorkflowCheckDetail {
1391 metric: metric.to_string(),
1392 actual,
1393 expected_min,
1394 passed: actual >= expected_min,
1395 })
1396 .collect();
1397 let checks_total = details.len();
1398 let checks_passed = details.iter().filter(|detail| detail.passed).count();
1399 let workflow_score = if checks_total == 0 {
1400 1.0
1401 } else {
1402 checks_passed as f64 / checks_total as f64
1403 };
1404
1405 WorkflowBenchmarkReport {
1406 total_findings: findings.len(),
1407 total_links,
1408 total_entity_mentions,
1409 total_evidence_spans,
1410 total_provenance_complete,
1411 evidence_span_coverage: round3(evidence_span_coverage),
1412 provenance_coverage: round3(provenance_coverage),
1413 assertion_types,
1414 gap_flags,
1415 contested_flags,
1416 checks_total,
1417 checks_passed,
1418 workflow_score: round3(workflow_score),
1419 details,
1420 }
1421}
1422
/// Resolves a path written in a suite file.
///
/// Absolute paths are returned unchanged. A relative path is taken relative to
/// `base_dir` when that location exists on disk; otherwise it is returned as
/// written.
fn resolve_suite_path(base_dir: &Path, value: &str) -> PathBuf {
    let as_written = Path::new(value);
    if as_written.is_absolute() {
        return as_written.to_path_buf();
    }

    let beside_suite = base_dir.join(as_written);
    match beside_suite.exists() {
        true => beside_suite,
        false => as_written.to_path_buf(),
    }
}
1436
1437fn frontier_metadata(
1438 loaded: &project::Project,
1439 frontier_path: &Path,
1440 frontier_hash: &str,
1441) -> serde_json::Value {
1442 json!({
1443 "name": loaded.project.name,
1444 "source": frontier_path.display().to_string(),
1445 "hash": format!("sha256:{frontier_hash}"),
1446 })
1447}
1448
1449fn gold_metadata(gold_path: &Path, gold_hash: &str, items: usize) -> serde_json::Value {
1450 json!({
1451 "path": gold_path.display().to_string(),
1452 "hash": format!("sha256:{gold_hash}"),
1453 "items": items,
1454 })
1455}
1456
1457fn finding_threshold_failures(
1458 report: &BenchmarkReport,
1459 thresholds: &BenchmarkThresholds,
1460) -> Vec<String> {
1461 let mut failures =
1462 generic_threshold_failures(report.precision, report.recall, report.f1, thresholds);
1463 if let Some(threshold) = thresholds.min_entity_accuracy
1464 && report.entity_accuracy < threshold
1465 {
1466 failures.push(format!(
1467 "entity_accuracy {} is below threshold {}",
1468 report.entity_accuracy, threshold
1469 ));
1470 }
1471 if let Some(threshold) = thresholds.min_confidence_calibration
1472 && report.confidence_calibration < threshold
1473 {
1474 failures.push(format!(
1475 "confidence_calibration {} is below threshold {}",
1476 report.confidence_calibration, threshold
1477 ));
1478 }
1479 if let Some(threshold) = thresholds.min_type_accuracy
1480 && report.assertion_type_accuracy < threshold
1481 {
1482 failures.push(format!(
1483 "assertion_type_accuracy {} is below threshold {}",
1484 report.assertion_type_accuracy, threshold
1485 ));
1486 }
1487 failures
1488}
1489
1490fn entity_threshold_failures(
1491 report: &EntityBenchmarkReport,
1492 thresholds: &BenchmarkThresholds,
1493) -> Vec<String> {
1494 let mut failures =
1495 generic_threshold_failures(report.precision, report.recall, report.f1, thresholds);
1496 if let Some(threshold) = thresholds.min_type_accuracy
1497 && report.type_accuracy < threshold
1498 {
1499 failures.push(format!(
1500 "type_accuracy {} is below threshold {}",
1501 report.type_accuracy, threshold
1502 ));
1503 }
1504 failures
1505}
1506
1507fn link_threshold_failures(
1508 report: &LinkBenchmarkReport,
1509 thresholds: &BenchmarkThresholds,
1510) -> Vec<String> {
1511 generic_threshold_failures(report.precision, report.recall, report.f1, thresholds)
1512}
1513
1514fn workflow_threshold_failures(
1515 report: &WorkflowBenchmarkReport,
1516 thresholds: &BenchmarkThresholds,
1517) -> Vec<String> {
1518 let mut failures = Vec::new();
1519 for detail in &report.details {
1520 if !detail.passed {
1521 failures.push(format!(
1522 "{} {} is below minimum {}",
1523 detail.metric, detail.actual, detail.expected_min
1524 ));
1525 }
1526 }
1527 if let Some(threshold) = thresholds.min_workflow_score
1528 && report.workflow_score < threshold
1529 {
1530 failures.push(format!(
1531 "workflow_score {} is below threshold {}",
1532 report.workflow_score, threshold
1533 ));
1534 }
1535 failures
1536}
1537
1538fn generic_threshold_failures(
1539 precision: f64,
1540 recall: f64,
1541 f1: f64,
1542 thresholds: &BenchmarkThresholds,
1543) -> Vec<String> {
1544 let mut failures = Vec::new();
1545 if let Some(threshold) = thresholds.min_f1
1546 && f1 < threshold
1547 {
1548 failures.push(format!("f1 {} is below threshold {}", f1, threshold));
1549 }
1550 if let Some(threshold) = thresholds.min_precision
1551 && precision < threshold
1552 {
1553 failures.push(format!(
1554 "precision {} is below threshold {}",
1555 precision, threshold
1556 ));
1557 }
1558 if let Some(threshold) = thresholds.min_recall
1559 && recall < threshold
1560 {
1561 failures.push(format!(
1562 "recall {} is below threshold {}",
1563 recall, threshold
1564 ));
1565 }
1566 failures
1567}
1568
1569fn hash_path(path: &Path) -> Result<String, String> {
1570 let mut hasher = Sha256::new();
1571 if path.is_file() {
1572 let bytes = std::fs::read(path)
1573 .map_err(|e| format!("Failed to read {} for hashing: {e}", path.display()))?;
1574 hasher.update(&bytes);
1575 } else if path.is_dir() {
1576 let mut files = Vec::new();
1577 collect_hash_files(path, path, &mut files)?;
1578 files.sort();
1579 for rel in files {
1580 hasher.update(rel.to_string_lossy().as_bytes());
1581 let bytes = std::fs::read(path.join(&rel))
1582 .map_err(|e| format!("Failed to read {} for hashing: {e}", rel.display()))?;
1583 hasher.update(&bytes);
1584 }
1585 } else {
1586 return Err(format!("Cannot hash missing path {}", path.display()));
1587 }
1588 Ok(format!("{:x}", hasher.finalize()))
1589}
1590
/// Recursively appends to `files` the path, relative to `root`, of every file
/// under `dir`.
///
/// Entries are appended in directory listing order; entries that are neither
/// a directory nor a file are ignored.
///
/// # Errors
///
/// Returns a message when a directory or one of its entries cannot be read, or
/// when an entry's path does not start with `root`.
fn collect_hash_files(root: &Path, dir: &Path, files: &mut Vec<PathBuf>) -> Result<(), String> {
    let listing =
        std::fs::read_dir(dir).map_err(|e| format!("Failed to read {}: {e}", dir.display()))?;

    for item in listing {
        let child = item
            .map_err(|e| format!("Failed to read directory entry: {e}"))?
            .path();

        if child.is_dir() {
            collect_hash_files(root, &child, files)?;
            continue;
        }
        if child.is_file() {
            let relative = child.strip_prefix(root).map_err(|e| e.to_string())?;
            files.push(relative.to_path_buf());
        }
    }
    Ok(())
}
1606
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an entity fixture with no canonical ID and a resolution
    /// confidence of 1.0. Tests that need a resolved entity override the
    /// relevant fields with struct-update syntax, so the field list lives in
    /// one place.
    fn unresolved_entity(name: &str, entity_type: &str) -> Entity {
        Entity {
            name: name.into(),
            entity_type: entity_type.into(),
            identifiers: serde_json::Map::new(),
            canonical_id: None,
            candidates: vec![],
            aliases: vec![],
            resolution_provenance: None,
            resolution_confidence: 1.0,
            resolution_method: None,
            species_context: None,
            needs_review: false,
        }
    }

    /// Builds a minimal finding fixture (id `vf_ent_test`, a "mechanism"
    /// assertion, no evidence spans, links or flags) carrying `entities`.
    fn make_finding_with_entities(entities: Vec<Entity>) -> FindingBundle {
        use crate::bundle::*;
        FindingBundle {
            id: "vf_ent_test".into(),
            version: 1,
            previous_version: None,
            assertion: Assertion {
                text: "test assertion".into(),
                assertion_type: "mechanism".into(),
                entities,
                relation: None,
                direction: None,
                causal_claim: None,
                causal_evidence_grade: None,
            },
            evidence: Evidence {
                evidence_type: "experimental".into(),
                model_system: String::new(),
                species: None,
                method: String::new(),
                sample_size: None,
                effect_size: None,
                p_value: None,
                replicated: false,
                replication_count: None,
                evidence_spans: vec![],
            },
            conditions: Conditions {
                text: String::new(),
                species_verified: vec![],
                species_unverified: vec![],
                in_vitro: false,
                in_vivo: false,
                human_data: false,
                clinical_trial: false,
                concentration_range: None,
                duration: None,
                age_group: None,
                cell_type: None,
            },
            confidence: Confidence::raw(0.9, "test", 0.9),
            provenance: Provenance {
                source_type: "published_paper".into(),
                doi: None,
                pmid: None,
                pmc: None,
                openalex_id: None,
                url: None,
                title: "Test".into(),
                authors: vec![],
                year: Some(2024),
                journal: None,
                license: None,
                publisher: None,
                funders: vec![],
                extraction: Extraction::default(),
                review: None,
                citation_count: None,
            },
            flags: Flags {
                gap: false,
                negative_space: false,
                contested: false,
                retracted: false,
                declining: false,
                gravity_well: false,
                review_state: None,
                superseded: false,
                signature_threshold: None,
                jointly_accepted: false,
            },
            links: vec![],
            annotations: vec![],
            attachments: vec![],
            created: String::new(),
            updated: None,

            access_tier: crate::access_tier::AccessTier::Public,
        }
    }

    #[test]
    fn jaccard_identical() {
        let sim = jaccard_similarity("NLRP3 activates caspase-1", "NLRP3 activates caspase-1");
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn jaccard_disjoint() {
        let sim = jaccard_similarity("NLRP3 activates caspase-1", "tau propagation in cortex");
        assert!(sim < 0.1);
    }

    #[test]
    fn jaccard_partial() {
        let sim = jaccard_similarity(
            "NLRP3 inflammasome activates caspase-1 in microglia",
            "NLRP3 activates caspase-1",
        );
        assert!(sim > 0.3);
        assert!(sim < 1.0);
    }

    #[test]
    fn jaccard_empty() {
        // Two empty strings are treated as identical; empty vs non-empty as disjoint.
        assert!((jaccard_similarity("", "") - 1.0).abs() < 0.001);
        assert!((jaccard_similarity("word", "")).abs() < 0.001);
    }

    #[test]
    fn benchmark_empty() {
        let report = benchmark(&[], &[]);
        assert_eq!(report.matched, 0);
        assert_eq!(report.f1, 0.0);
    }

    #[test]
    fn benchmark_perfect_match() {
        use crate::bundle::*;

        // Start from the shared fixture and override only what this test pins:
        // the id, the assertion text and a confidence inside the gold range.
        let mut finding = make_finding_with_entities(vec![
            unresolved_entity("NLRP3", "protein"),
            unresolved_entity("caspase-1", "protein"),
        ]);
        finding.id = "vf_test".into();
        finding.assertion.text = "NLRP3 activates caspase-1 in microglia".into();
        finding.confidence = Confidence::raw(0.85, "test", 0.9);

        let gold = vec![GoldFinding {
            id: None,
            assertion_text: "NLRP3 activates caspase-1 in microglia".into(),
            assertion_type: "mechanism".into(),
            entities: vec!["NLRP3".into(), "caspase-1".into()],
            confidence_range: ConfidenceRange {
                low: 0.7,
                high: 0.95,
            },
            notes: None,
        }];

        let report = benchmark(&[finding], &gold);
        assert_eq!(report.matched, 1);
        assert!((report.recall - 1.0).abs() < 0.001);
        assert!((report.precision - 1.0).abs() < 0.001);
        assert!((report.entity_accuracy - 1.0).abs() < 0.001);
        assert!((report.confidence_calibration - 1.0).abs() < 0.001);
    }

    #[test]
    fn entity_benchmark_empty() {
        let report = entity_benchmark(&[], &[]);
        assert_eq!(report.total_gold_entities, 0);
        assert_eq!(report.found_in_frontier, 0);
        assert_eq!(report.f1, 0.0);
    }

    #[test]
    fn entity_benchmark_perfect_match() {
        use crate::bundle::*;

        let entity = Entity {
            canonical_id: Some(ResolvedId {
                source: "uniprot".into(),
                id: "Q96P20".into(),
                confidence: 0.95,
                matched_name: Some("NLRP3".into()),
            }),
            resolution_provenance: Some("vela_resolve/uniprot".into()),
            resolution_confidence: 0.95,
            ..unresolved_entity("NLRP3", "protein")
        };

        let finding = make_finding_with_entities(vec![entity]);

        let gold = vec![GoldEntity {
            name: "NLRP3".into(),
            entity_type: "protein".into(),
            expected_source: "uniprot".into(),
            expected_id: "Q96P20".into(),
            expected_confidence: 0.8,
            alternatives: vec![],
        }];

        let report = entity_benchmark(&[finding], &gold);
        assert_eq!(report.total_gold_entities, 1);
        assert_eq!(report.found_in_frontier, 1);
        assert_eq!(report.id_correct, 1);
        assert_eq!(report.confidence_ok, 1);
        assert!((report.precision - 1.0).abs() < 0.001);
        assert!((report.recall - 1.0).abs() < 0.001);
        assert!((report.f1 - 1.0).abs() < 0.001);
    }

    #[test]
    fn entity_benchmark_alternative_id() {
        use crate::bundle::*;

        // The frontier resolved aspirin to PubChem rather than the gold's
        // primary ChEBI identifier.
        let entity = Entity {
            canonical_id: Some(ResolvedId {
                source: "pubchem".into(),
                id: "2244".into(),
                confidence: 0.9,
                matched_name: Some("Aspirin".into()),
            }),
            resolution_confidence: 0.9,
            ..unresolved_entity("aspirin", "compound")
        };

        let finding = make_finding_with_entities(vec![entity]);

        // The gold entry lists that PubChem ID as an accepted alternative.
        let gold = vec![GoldEntity {
            name: "aspirin".into(),
            entity_type: "compound".into(),
            expected_source: "chebi".into(),
            expected_id: "CHEBI:15365".into(),
            expected_confidence: 0.7,
            alternatives: vec![AlternativeId {
                source: "pubchem".into(),
                id: "2244".into(),
            }],
        }];

        let report = entity_benchmark(&[finding], &gold);
        assert_eq!(
            report.id_correct, 1,
            "Alternative ID should count as correct"
        );
        assert!((report.precision - 1.0).abs() < 0.001);
    }

    #[test]
    fn entity_benchmark_mismatch_and_missing() {
        use crate::bundle::*;

        // BRCA1 is present in the frontier but resolved to an ID the gold
        // entry does not accept.
        let entity = Entity {
            canonical_id: Some(ResolvedId {
                source: "uniprot".into(),
                id: "WRONG_ID".into(),
                confidence: 0.8,
                matched_name: Some("BRCA1".into()),
            }),
            resolution_confidence: 0.8,
            ..unresolved_entity("BRCA1", "gene")
        };

        let finding = make_finding_with_entities(vec![entity]);

        let gold = vec![
            GoldEntity {
                name: "BRCA1".into(),
                entity_type: "gene".into(),
                expected_source: "uniprot".into(),
                expected_id: "P38398".into(),
                expected_confidence: 0.7,
                alternatives: vec![],
            },
            // TP53 does not appear in the frontier at all.
            GoldEntity {
                name: "TP53".into(),
                entity_type: "gene".into(),
                expected_source: "uniprot".into(),
                expected_id: "P04637".into(),
                expected_confidence: 0.7,
                alternatives: vec![],
            },
        ];

        let report = entity_benchmark(&[finding], &gold);
        assert_eq!(report.total_gold_entities, 2);
        // Only BRCA1 is found, and its ID is wrong.
        assert_eq!(report.found_in_frontier, 1);
        assert_eq!(report.id_correct, 0);
        // With no correct IDs both precision and recall are zero.
        assert!((report.precision).abs() < 0.001);
        assert!((report.recall).abs() < 0.001);
        // Both gold entities are genes, so they share one by-type bucket.
        assert_eq!(report.by_type.len(), 1);
        assert_eq!(report.by_type[0].entity_type, "gene");
        assert_eq!(report.by_type[0].total, 2);
    }
}