Skip to main content

vela_protocol/
link.rs

1//! Stage 4: LINK — infer typed relationships between findings.
2//!
3//! Two passes:
4//! 1. **Deterministic** (`deterministic_links`): O(n^2) entity-overlap scan, no API calls.
5//! 2. **LLM** (`infer_links`): existing LLM-based inference on top, with dedup/merge.
6//!
7//! ## Integration
8//!
9//! Add to the compile pipeline in main.rs, BEFORE `infer_links`:
10//! ```ignore
11//! let det_count = link::deterministic_links(&mut all_bundles);
12//! println!("  -> {det_count} deterministic links (entity overlap)");
13//! let llm_count = link::infer_links(&client, &config, &mut all_bundles).await.unwrap_or(0);
14//! println!("  -> {llm_count} LLM links inferred");
15//! ```
16use crate::bundle::FindingBundle;
17use std::collections::HashSet;
18// ── Deterministic entity-overlap linking ─────────────────────────────
19
20/// Run a fast, deterministic linking pass based on shared entities between
21/// findings. Returns the number of links created.
22///
23/// Rules:
24/// - Shared entity, different papers -> "extends"
25/// - Shared entity, opposite direction (positive/negative) -> "contradicts"
26/// - Shared entity, same direction, newer + higher confidence -> "supersedes"
27/// - 2+ shared entities -> noted as strong overlap in the link note
28pub fn deterministic_links(bundles: &mut [FindingBundle]) -> usize {
29    let n = bundles.len();
30    if n < 2 {
31        return 0;
32    }
33
34    // Pre-compute normalized entity sets for each bundle.
35    // Include aliases so that "NLRP3" in one paper matches "cryopyrin" in another.
36    let entity_sets: Vec<HashSet<String>> = bundles
37        .iter()
38        .map(|b| {
39            let mut names = HashSet::new();
40            for e in &b.assertion.entities {
41                names.insert(e.name.to_lowercase());
42                for alias in &e.aliases {
43                    names.insert(alias.to_lowercase());
44                }
45            }
46            names
47        })
48        .collect();
49
50    // Pre-compute DOIs for same-paper detection.
51    let dois: Vec<Option<String>> = bundles
52        .iter()
53        .map(|b| b.provenance.doi.as_ref().map(|d| d.to_lowercase()))
54        .collect();
55
56    // Collect all links first to avoid borrow issues.
57    struct PendingLink {
58        from_idx: usize,
59        to_id: String,
60        link_type: String,
61        note: String,
62    }
63
64    let mut pending: Vec<PendingLink> = Vec::new();
65
66    for i in 0..n {
67        for j in (i + 1)..n {
68            let shared: HashSet<&String> = entity_sets[i].intersection(&entity_sets[j]).collect();
69            if shared.is_empty() {
70                continue;
71            }
72
73            let same_paper = match (&dois[i], &dois[j]) {
74                (Some(a), Some(b)) => a == b,
75                _ => false,
76            };
77
78            // Skip intra-paper links (findings from the same paper already cohere).
79            if same_paper {
80                continue;
81            }
82
83            let shared_names: Vec<String> = shared.iter().map(|s| s.to_string()).collect();
84            let overlap_count = shared_names.len();
85            let overlap_label = shared_names.join(", ");
86            let strong = overlap_count >= 2;
87
88            // Determine link type.
89            let dir_i = bundles[i].assertion.direction.as_deref();
90            let dir_j = bundles[j].assertion.direction.as_deref();
91
92            let (link_type, note) = if is_opposite(dir_i, dir_j) {
93                (
94                    "contradicts",
95                    format!(
96                        "Opposite directions on shared entit{}: {}{}",
97                        if overlap_count == 1 { "y" } else { "ies" },
98                        overlap_label,
99                        if strong { " (strong overlap)" } else { "" }
100                    ),
101                )
102            } else if is_same_direction(dir_i, dir_j) && could_supersede(bundles, i, j) {
103                let (newer, _older) = if supersede_order(bundles, i, j) {
104                    (i, j)
105                } else {
106                    (j, i)
107                };
108                let _is_i_newer = newer == i;
109                (
110                    "supersedes",
111                    format!(
112                        "Newer/higher-confidence finding on shared entit{}: {}{}",
113                        if overlap_count == 1 { "y" } else { "ies" },
114                        overlap_label,
115                        if strong { " (strong overlap)" } else { "" }
116                    ),
117                )
118            } else {
119                (
120                    "extends",
121                    format!(
122                        "Cross-paper shared entit{}: {}{}",
123                        if overlap_count == 1 { "y" } else { "ies" },
124                        overlap_label,
125                        if strong { " (strong overlap)" } else { "" }
126                    ),
127                )
128            };
129
130            // For supersedes, only the newer finding gets the outgoing link.
131            if link_type == "supersedes" {
132                let (from_idx, to_idx) = if supersede_order(bundles, i, j) {
133                    (i, j)
134                } else {
135                    (j, i)
136                };
137                pending.push(PendingLink {
138                    from_idx,
139                    to_id: bundles[to_idx].id.clone(),
140                    link_type: link_type.to_string(),
141                    note,
142                });
143            } else {
144                // Bidirectional awareness: add from i -> j.
145                pending.push(PendingLink {
146                    from_idx: i,
147                    to_id: bundles[j].id.clone(),
148                    link_type: link_type.to_string(),
149                    note,
150                });
151            }
152        }
153    }
154
155    let count = pending.len();
156    for pl in pending {
157        bundles[pl.from_idx].add_link_with_source(&pl.to_id, &pl.link_type, &pl.note, "compiler");
158    }
159
160    count
161}
162
/// Report whether two directions oppose each other.
///
/// Only the pair `"positive"` / `"negative"` (in either order) counts as
/// opposite; a missing direction or any other label yields `false`.
fn is_opposite(a: Option<&str>, b: Option<&str>) -> bool {
    match (a, b) {
        (Some(lhs), Some(rhs)) => {
            (lhs == "positive" && rhs == "negative") || (lhs == "negative" && rhs == "positive")
        }
        _ => false,
    }
}
170
/// Report whether both directions are present and carry the same label.
///
/// The literal label `"null"` is treated as "no direction", so two `"null"`
/// values do not count as the same direction.
fn is_same_direction(a: Option<&str>, b: Option<&str>) -> bool {
    matches!((a, b), (Some(lhs), Some(rhs)) if lhs == rhs && lhs != "null")
}
178
179/// True if one finding plausibly supersedes the other (same direction, one is
180/// newer with higher confidence).
181fn could_supersede(bundles: &[FindingBundle], i: usize, j: usize) -> bool {
182    let yi = bundles[i].provenance.year.unwrap_or(0);
183    let yj = bundles[j].provenance.year.unwrap_or(0);
184    let ci = bundles[i].confidence.score;
185    let cj = bundles[j].confidence.score;
186
187    // One must be strictly newer AND have higher confidence.
188    (yi > yj && ci > cj) || (yj > yi && cj > ci)
189}
190
191/// Returns true if bundle[i] supersedes bundle[j] (i is newer+stronger).
192fn supersede_order(bundles: &[FindingBundle], i: usize, j: usize) -> bool {
193    let yi = bundles[i].provenance.year.unwrap_or(0);
194    let yj = bundles[j].provenance.year.unwrap_or(0);
195    let ci = bundles[i].confidence.score;
196    let cj = bundles[j].confidence.score;
197    yi > yj && ci > cj
198}
199
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bundle::*;

    /// Build an `Entity` with the given name and type, no aliases, and fixed
    /// filler values for every other field.
    fn make_entity(name: &str, etype: &str) -> Entity {
        Entity {
            name: name.into(),
            entity_type: etype.into(),
            identifiers: serde_json::Map::new(),
            canonical_id: None,
            candidates: vec![],
            aliases: vec![],
            resolution_provenance: None,
            resolution_confidence: 1.0,
            resolution_method: None,
            species_context: None,
            needs_review: false,
        }
    }

    /// Build a `FindingBundle` fixture. Only the id, entities, direction, DOI,
    /// year and confidence score vary; every other field gets a fixed filler value.
    fn make_finding(
        id: &str,
        entities: Vec<(&str, &str)>,
        direction: Option<&str>,
        doi: Option<&str>,
        year: i32,
        score: f64,
    ) -> FindingBundle {
        let assertion = Assertion {
            text: format!("Finding {id}"),
            assertion_type: "mechanism".into(),
            entities: entities
                .into_iter()
                .map(|(name, etype)| make_entity(name, etype))
                .collect(),
            relation: None,
            direction: direction.map(str::to_owned),
            causal_claim: None,
            causal_evidence_grade: None,
        };

        let evidence = Evidence {
            evidence_type: "experimental".into(),
            model_system: String::new(),
            species: None,
            method: String::new(),
            sample_size: None,
            effect_size: None,
            p_value: None,
            replicated: false,
            replication_count: None,
            evidence_spans: vec![],
        };

        let conditions = Conditions {
            text: String::new(),
            species_verified: vec![],
            species_unverified: vec![],
            in_vitro: false,
            in_vivo: false,
            human_data: false,
            clinical_trial: false,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        };

        let provenance = Provenance {
            source_type: "published_paper".into(),
            doi: doi.map(str::to_owned),
            pmid: None,
            pmc: None,
            openalex_id: None,
            url: None,
            title: "Test".into(),
            authors: vec![],
            year: Some(year),
            journal: None,
            license: None,
            publisher: None,
            funders: vec![],
            extraction: Extraction::default(),
            review: None,
            citation_count: None,
        };

        let flags = Flags {
            gap: false,
            negative_space: false,
            contested: false,
            retracted: false,
            declining: false,
            gravity_well: false,
            review_state: None,
            superseded: false,
            signature_threshold: None,
            jointly_accepted: false,
        };

        FindingBundle {
            id: id.into(),
            version: 1,
            previous_version: None,
            assertion,
            evidence,
            conditions,
            confidence: Confidence::raw(score, "seeded prior", 0.85),
            provenance,
            flags,
            links: vec![],
            annotations: vec![],
            attachments: vec![],
            created: String::new(),
            updated: None,

            access_tier: crate::access_tier::AccessTier::Public,
        }
    }

    /// Shorthand for a finding whose only entity is the protein "NLRP3".
    fn nlrp3_finding(
        id: &str,
        direction: Option<&str>,
        doi: &str,
        year: i32,
        score: f64,
    ) -> FindingBundle {
        make_finding(
            id,
            vec![("NLRP3", "protein")],
            direction,
            Some(doi),
            year,
            score,
        )
    }

    /// Two direction-less NLRP3 findings from different papers with equal confidence.
    fn undirected_cross_paper_pair() -> Vec<FindingBundle> {
        vec![
            nlrp3_finding("f1", None, "10.1/a", 2020, 0.7),
            nlrp3_finding("f2", None, "10.1/b", 2021, 0.7),
        ]
    }

    #[test]
    fn shared_entity_creates_extends_link() {
        let mut bundles = undirected_cross_paper_pair();
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 1);
        assert_eq!(bundles[0].links.len(), 1);
        assert_eq!(bundles[0].links[0].link_type, "extends");
        assert_eq!(bundles[0].links[0].target, "f2");
    }

    #[test]
    fn opposite_directions_creates_contradicts_link() {
        let mut bundles = vec![
            nlrp3_finding("f1", Some("positive"), "10.1/a", 2020, 0.7),
            nlrp3_finding("f2", Some("negative"), "10.1/b", 2021, 0.7),
        ];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 1);
        assert_eq!(bundles[0].links[0].link_type, "contradicts");
    }

    #[test]
    fn newer_higher_confidence_creates_supersedes() {
        let mut bundles = vec![
            nlrp3_finding("f1", Some("positive"), "10.1/a", 2018, 0.6),
            nlrp3_finding("f2", Some("positive"), "10.1/b", 2024, 0.9),
        ];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 1);
        // f2 is newer+stronger, so it gets the supersedes link pointing to f1
        assert_eq!(bundles[1].links.len(), 1);
        assert_eq!(bundles[1].links[0].link_type, "supersedes");
        assert_eq!(bundles[1].links[0].target, "f1");
    }

    #[test]
    fn no_shared_entities_no_link() {
        let mut bundles = vec![
            nlrp3_finding("f1", None, "10.1/a", 2020, 0.7),
            make_finding(
                "f2",
                vec![("APOE4", "gene")],
                None,
                Some("10.1/b"),
                2021,
                0.7,
            ),
        ];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 0);
        assert!(bundles[0].links.is_empty());
        assert!(bundles[1].links.is_empty());
    }

    #[test]
    fn same_paper_skipped() {
        let mut bundles = vec![
            nlrp3_finding("f1", None, "10.1/same", 2020, 0.7),
            nlrp3_finding("f2", None, "10.1/same", 2020, 0.7),
        ];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 0);
    }

    #[test]
    fn single_bundle_no_links() {
        let mut bundles = vec![nlrp3_finding("f1", None, "10.1/a", 2020, 0.7)];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 0);
    }

    #[test]
    fn empty_bundles_no_links() {
        let mut bundles: Vec<FindingBundle> = vec![];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 0);
    }

    #[test]
    fn strong_overlap_noted() {
        let two_proteins = || vec![("NLRP3", "protein"), ("IL-1β", "protein")];
        let mut bundles = vec![
            make_finding("f1", two_proteins(), None, Some("10.1/a"), 2020, 0.7),
            make_finding("f2", two_proteins(), None, Some("10.1/b"), 2021, 0.7),
        ];
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 1);
        assert!(bundles[0].links[0].note.contains("strong overlap"));
    }

    #[test]
    fn alias_matching_works() {
        let mut bundles = vec![
            make_finding("f1", vec![], None, Some("10.1/a"), 2020, 0.7),
            make_finding("f2", vec![], None, Some("10.1/b"), 2021, 0.7),
        ];
        // f1 knows the entity as "NLRP3" with the alias "cryopyrin".
        bundles[0].assertion.entities.push(Entity {
            aliases: vec!["cryopyrin".into()],
            ..make_entity("NLRP3", "protein")
        });
        // f2 names the entity by that alias directly.
        bundles[1]
            .assertion
            .entities
            .push(make_entity("cryopyrin", "protein"));
        let count = deterministic_links(&mut bundles);
        assert_eq!(count, 1);
    }

    #[test]
    fn link_inferred_by_is_compiler() {
        let mut bundles = undirected_cross_paper_pair();
        deterministic_links(&mut bundles);
        assert_eq!(bundles[0].links[0].inferred_by, "compiler");
    }

    #[test]
    fn is_opposite_helper() {
        assert!(is_opposite(Some("positive"), Some("negative")));
        assert!(is_opposite(Some("negative"), Some("positive")));
        assert!(!is_opposite(Some("positive"), Some("positive")));
        assert!(!is_opposite(None, Some("negative")));
        assert!(!is_opposite(None, None));
    }

    #[test]
    fn is_same_direction_helper() {
        assert!(is_same_direction(Some("positive"), Some("positive")));
        assert!(!is_same_direction(Some("positive"), Some("negative")));
        assert!(!is_same_direction(None, None));
        assert!(!is_same_direction(Some("null"), Some("null")));
    }

    #[test]
    fn valid_link_types_list() {
        for known in ["supports", "contradicts", "extends", "supersedes"] {
            assert!(VALID_LINK_TYPES.contains(&known));
        }
        assert!(!VALID_LINK_TYPES.contains(&"invalidtype"));
    }
}