Skip to main content

vela_protocol/
entity_resolve.rs

//! v0.19: bundled entity resolution.
//!
//! Hardcoded lookup table for common entities a curator hand-adds via
//! `vela finding add --entities`. Maps `(normalized_name, entity_type)` to
//! a canonical `ResolvedId` (UniProt for proteins, NCBI Gene for genes,
//! MeSH for diseases/anatomy, DrugBank or ChEBI for compounds, Cell
//! Ontology for cell types, NCBI Taxonomy for organisms, PDG/ROR for the
//! physics entries). When a match lands, the entity's `canonical_id` is
//! populated, `resolution_method = Manual`, `resolution_confidence = 0.95`,
//! and `needs_review = false`.
//!
//! This is deliberately not exhaustive. The table (a few dozen entries)
//! covers the natural vocabulary of the Alzheimer's drug-target landscape
//! plus the cross-domain v0.10 entity types (particle, instrument). Real
//! research will need either a much larger bundled table or live ontology
//! API integration — but a manual-curation user gets meaningful resolution
//! today, and `needs_human_review` strict-blockers drop for matched
//! entities.
16
17use crate::bundle::{Entity, ResolutionMethod, ResolvedId};
18use crate::project::Project;
19
/// One bundled lookup entry. `match_names` is the normalized list of
/// names + aliases that should resolve to this entry.
struct OntologyEntry {
    /// Display name recorded as `matched_name` on the resolved id.
    canonical_name: &'static str,
    /// Entity type the row applies to; lookup requires an exact match on it.
    entity_type: &'static str,
    /// Accepted spellings, already lowercased and whitespace-collapsed so
    /// they can be compared directly against a normalized entity name.
    match_names: &'static [&'static str],
    /// Name of the ontology / registry that issued `id` (e.g. "UniProt").
    source: &'static str,
    /// Identifier within `source`.
    id: &'static str,
}
29
/// Bundled common-entity table. v0.19: Alzheimer's-flavored bias
/// (matches Will's frontier vocabulary) + a small cross-domain set
/// for v0.10 physics entities. Add carefully; every entry becomes a
/// public claim that this name resolves to this id.
///
/// Lookup scans rows in declaration order and returns the first row whose
/// `entity_type` and one of whose `match_names` agree with the query, so
/// every `match_names` string must already be lowercase with single spaces.
const TABLE: &[OntologyEntry] = &[
    // Proteins (UniProt human canonical)
    OntologyEntry {
        canonical_name: "amyloid-beta",
        entity_type: "protein",
        match_names: &["amyloid-beta", "amyloid beta", "abeta", "aβ", "ab"],
        source: "UniProt",
        id: "P05067", // Amyloid-beta precursor protein (APP)
    },
    OntologyEntry {
        canonical_name: "APP",
        entity_type: "protein",
        match_names: &["app", "amyloid precursor protein"],
        source: "UniProt",
        id: "P05067",
    },
    OntologyEntry {
        canonical_name: "BACE1",
        entity_type: "protein",
        match_names: &["bace1", "β-secretase 1", "beta-secretase 1"],
        source: "UniProt",
        id: "P56817",
    },
    OntologyEntry {
        canonical_name: "tau",
        entity_type: "protein",
        match_names: &["tau", "mapt", "microtubule-associated protein tau"],
        source: "UniProt",
        id: "P10636",
    },
    OntologyEntry {
        canonical_name: "TREM2",
        entity_type: "protein",
        match_names: &["trem2", "triggering receptor expressed on myeloid cells 2"],
        source: "UniProt",
        id: "Q9NZC2",
    },
    OntologyEntry {
        canonical_name: "ApoE",
        entity_type: "protein",
        match_names: &["apoe", "apolipoprotein e"],
        source: "UniProt",
        id: "P02649",
    },
    OntologyEntry {
        canonical_name: "PSEN1",
        entity_type: "protein",
        match_names: &["psen1", "presenilin-1", "presenilin 1"],
        source: "UniProt",
        id: "P49768",
    },
    OntologyEntry {
        canonical_name: "PSEN2",
        entity_type: "protein",
        match_names: &["psen2", "presenilin-2", "presenilin 2"],
        source: "UniProt",
        id: "P49810",
    },
    // Gene-typed rows for symbols that also appear above as proteins;
    // these resolve to NCBI Gene ids rather than UniProt accessions.
    OntologyEntry {
        canonical_name: "PSEN1",
        entity_type: "gene",
        match_names: &["psen1"],
        source: "NCBI Gene",
        id: "5663",
    },
    OntologyEntry {
        canonical_name: "APOE",
        entity_type: "gene",
        match_names: &["apoe"],
        source: "NCBI Gene",
        id: "348",
    },
    // Diseases (MeSH)
    OntologyEntry {
        canonical_name: "Alzheimer's disease",
        entity_type: "disease",
        match_names: &[
            "alzheimer's disease",
            "alzheimer disease",
            "alzheimers disease",
            "ad",
        ],
        source: "MeSH",
        id: "D000544",
    },
    OntologyEntry {
        canonical_name: "mild cognitive impairment",
        entity_type: "disease",
        match_names: &["mild cognitive impairment", "mci"],
        source: "MeSH",
        id: "D060825",
    },
    // Compounds / drugs (DrugBank, with ChEBI for xenon)
    OntologyEntry {
        canonical_name: "Lecanemab",
        entity_type: "compound",
        match_names: &["lecanemab", "leqembi"],
        source: "DrugBank",
        id: "DB16703",
    },
    OntologyEntry {
        canonical_name: "Aducanumab",
        entity_type: "compound",
        match_names: &["aducanumab", "aduhelm"],
        source: "DrugBank",
        id: "DB12274",
    },
    OntologyEntry {
        canonical_name: "Donanemab",
        entity_type: "compound",
        match_names: &["donanemab", "kisunla"],
        source: "DrugBank",
        id: "DB17791",
    },
    OntologyEntry {
        canonical_name: "Verubecestat",
        entity_type: "compound",
        match_names: &["verubecestat", "mk-8931"],
        source: "DrugBank",
        id: "DB12089",
    },
    OntologyEntry {
        canonical_name: "Liraglutide",
        entity_type: "compound",
        match_names: &["liraglutide", "victoza", "saxenda"],
        source: "DrugBank",
        id: "DB06655",
    },
    OntologyEntry {
        canonical_name: "Semaglutide",
        entity_type: "compound",
        match_names: &["semaglutide", "ozempic", "wegovy"],
        source: "DrugBank",
        id: "DB13928",
    },
    OntologyEntry {
        canonical_name: "Exendin-4",
        entity_type: "compound",
        match_names: &["exendin-4", "exenatide", "byetta"],
        source: "DrugBank",
        id: "DB01276",
    },
    OntologyEntry {
        canonical_name: "Xenon",
        entity_type: "compound",
        match_names: &["xenon", "xe"],
        source: "ChEBI",
        id: "CHEBI:49957",
    },
    // Cell types
    OntologyEntry {
        canonical_name: "microglia",
        entity_type: "cell_type",
        match_names: &["microglia", "microglial cell"],
        source: "Cell Ontology",
        id: "CL:0000129",
    },
    // Anatomical structures
    OntologyEntry {
        canonical_name: "blood-brain barrier",
        entity_type: "anatomical_structure",
        match_names: &["blood-brain barrier", "bbb"],
        source: "MeSH",
        id: "D001812",
    },
    // Organisms (NCBI Taxonomy)
    OntologyEntry {
        canonical_name: "Homo sapiens",
        entity_type: "organism",
        match_names: &["homo sapiens", "human"],
        source: "NCBI Taxonomy",
        id: "9606",
    },
    OntologyEntry {
        canonical_name: "Mus musculus",
        entity_type: "organism",
        match_names: &["mus musculus", "mouse", "house mouse"],
        source: "NCBI Taxonomy",
        id: "10090",
    },
    // Physics-side (v0.10 entity types)
    OntologyEntry {
        canonical_name: "WIMP",
        entity_type: "particle",
        match_names: &["wimp", "weakly interacting massive particle"],
        source: "PDG",
        id: "WIMP",
    },
    OntologyEntry {
        canonical_name: "XENONnT",
        entity_type: "instrument",
        match_names: &["xenonnt", "xenon nt"],
        source: "ROR",
        id: "https://ror.org/03wkt5x30",
    },
    OntologyEntry {
        canonical_name: "LZ",
        entity_type: "instrument",
        match_names: &["lz", "lux-zeplin", "lux zeplin"],
        source: "ROR",
        id: "https://ror.org/04xeg9z08",
    },
];
238
/// Canonicalize a free-text name so it can be compared against `match_names`.
///
/// The result is lowercased (Unicode-aware), has leading/trailing whitespace
/// removed, and has every internal whitespace run collapsed to one ASCII
/// space. In addition, punctuation that commonly arrives from pasted paper
/// text is folded to the ASCII form the table is written in:
///
/// - typographic apostrophes (U+2018, U+2019, U+02BC) become `'`
/// - Unicode hyphens/dashes (U+2010..=U+2013) and the minus sign (U+2212)
///   become `-`
///
/// Without that folding, `Alzheimer’s disease` or `amyloid‑beta` would never
/// match their ASCII-spelled aliases. An empty or all-whitespace input
/// yields an empty string.
fn normalize(s: &str) -> String {
    let mut folded = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // `split_whitespace` never yields empty words, so a non-empty buffer
        // means at least one word has already been written.
        if !folded.is_empty() {
            folded.push(' ');
        }
        folded.extend(word.chars().map(|c| match c {
            '\u{2018}' | '\u{2019}' | '\u{02BC}' => '\'',
            '\u{2010}'..='\u{2013}' | '\u{2212}' => '-',
            other => other,
        }));
    }
    folded.to_lowercase()
}
246
247/// Find a bundled match for an unresolved entity. Match must agree on
248/// `entity_type` (we don't auto-resolve `LZ:compound` to the LZ instrument).
249fn lookup(entity: &Entity) -> Option<&'static OntologyEntry> {
250    let n = normalize(&entity.name);
251    TABLE.iter().find(|row| {
252        row.entity_type == entity.entity_type && row.match_names.iter().any(|m| *m == n)
253    })
254}
255
/// Per-finding outcome.
#[derive(Debug, Clone, serde::Serialize)]
pub struct FindingResolutionReport {
    /// Id of the finding this report describes.
    pub finding_id: String,
    /// Number of the finding's entities that received a canonical id from
    /// the bundled table during this pass.
    pub resolved: usize,
    /// `name:entity_type` labels of entities with no bundled match.
    pub unresolved: Vec<String>,
    /// Number of entities skipped because they already had a canonical id.
    pub already_resolved: usize,
}
264
/// Whole-frontier outcome.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ResolveReport {
    /// Name of the project (frontier) that was walked.
    pub frontier: String,
    /// Total number of entities visited across all findings.
    pub total_entities: usize,
    /// Entities that received a canonical id from the bundled table.
    pub resolved: usize,
    /// Entities skipped because they already had a canonical id.
    pub already_resolved: usize,
    /// Entities for which the bundled table had no match.
    pub unresolved_count: usize,
    /// Number of findings in which at least one entity was newly resolved.
    pub findings_touched: usize,
    /// One report per finding that has at least one entity.
    pub per_finding: Vec<FindingResolutionReport>,
}
276
277/// Walk every entity on every finding and apply the bundled lookup.
278/// Already-resolved entities (canonical_id is Some, regardless of source)
279/// are skipped — caller can pass `force` to re-resolve.
280pub fn resolve_frontier(project: &mut Project, force: bool) -> ResolveReport {
281    let frontier_name = project.project.name.clone();
282    let mut total = 0usize;
283    let mut resolved = 0usize;
284    let mut already = 0usize;
285    let mut unresolved_count = 0usize;
286    let mut findings_touched = 0usize;
287    let mut per_finding: Vec<FindingResolutionReport> = Vec::new();
288
289    for finding in project.findings.iter_mut() {
290        let mut f_resolved = 0usize;
291        let mut f_unresolved: Vec<String> = Vec::new();
292        let mut f_already = 0usize;
293        for entity in finding.assertion.entities.iter_mut() {
294            total += 1;
295            if entity.canonical_id.is_some() && !force {
296                already += 1;
297                f_already += 1;
298                continue;
299            }
300            match lookup(entity) {
301                Some(row) => {
302                    entity.canonical_id = Some(ResolvedId {
303                        source: row.source.to_string(),
304                        id: row.id.to_string(),
305                        confidence: 0.95,
306                        matched_name: Some(row.canonical_name.to_string()),
307                    });
308                    entity.resolution_method = Some(ResolutionMethod::Manual);
309                    entity.resolution_confidence = 0.95;
310                    entity.resolution_provenance =
311                        Some("vela_entity_resolve_v0_19_bundled_table".to_string());
312                    entity.needs_review = false;
313                    resolved += 1;
314                    f_resolved += 1;
315                }
316                None => {
317                    unresolved_count += 1;
318                    f_unresolved.push(format!("{}:{}", entity.name, entity.entity_type));
319                }
320            }
321        }
322        if f_resolved > 0 || !f_unresolved.is_empty() || f_already > 0 {
323            if f_resolved > 0 {
324                findings_touched += 1;
325            }
326            per_finding.push(FindingResolutionReport {
327                finding_id: finding.id.clone(),
328                resolved: f_resolved,
329                unresolved: f_unresolved,
330                already_resolved: f_already,
331            });
332        }
333    }
334
335    // Recompute stats since needs_review counts may have shifted.
336    crate::project::recompute_stats(project);
337
338    ResolveReport {
339        frontier: frontier_name,
340        total_entities: total,
341        resolved,
342        already_resolved: already,
343        unresolved_count,
344        findings_touched,
345        per_finding,
346    }
347}
348
349/// Convenience: how many entries are bundled. Used by tests + the
350/// CLI's `--list` flag.
351pub fn bundled_entry_count() -> usize {
352    TABLE.len()
353}
354
355/// Iterate the bundle without leaking the internal struct shape.
356pub fn iter_bundled()
357-> impl Iterator<Item = (&'static str, &'static str, &'static str, &'static str)> {
358    TABLE
359        .iter()
360        .map(|r| (r.canonical_name, r.entity_type, r.source, r.id))
361}
362
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an unresolved, review-flagged entity with the given name and type.
    fn make_entity(name: &str, et: &str) -> Entity {
        Entity {
            name: name.to_owned(),
            entity_type: et.to_owned(),
            identifiers: serde_json::Map::new(),
            canonical_id: None,
            candidates: Vec::new(),
            aliases: Vec::new(),
            resolution_provenance: Some(String::from("manual_state_transition")),
            resolution_confidence: 0.6,
            resolution_method: None,
            species_context: None,
            needs_review: true,
        }
    }

    #[test]
    fn lookup_amyloid_beta_protein() {
        let entity = make_entity("amyloid-beta", "protein");
        let hit = lookup(&entity).expect("amyloid-beta should resolve");
        assert_eq!(hit.source, "UniProt");
        assert_eq!(hit.id, "P05067");
    }

    #[test]
    fn lookup_alzheimers_disease_with_apostrophe_variants() {
        let spellings = [
            "Alzheimer's disease",
            "alzheimers disease",
            "ALZHEIMER'S DISEASE",
        ];
        for n in spellings {
            let found = lookup(&make_entity(n, "disease"));
            assert!(found.is_some(), "should resolve '{n}'");
        }
    }

    #[test]
    fn lookup_respects_entity_type() {
        // The same name resolves as an instrument but not as a compound.
        let as_instrument = make_entity("LZ", "instrument");
        let as_compound = make_entity("LZ", "compound");
        assert!(lookup(&as_instrument).is_some());
        assert!(lookup(&as_compound).is_none());
    }

    #[test]
    fn unmatched_name_returns_none() {
        let unknown = make_entity("totally made-up entity name", "protein");
        assert!(lookup(&unknown).is_none());
    }
}