Skip to main content

vela_protocol/
sources.rs

1//! Source registry and evidence atom projections.
2//!
3//! Sources identify imported artifacts. Evidence atoms identify the exact
4//! source-grounded unit that bears on a finding. Both are safe to derive from
5//! legacy finding bundles when older frontiers do not persist them yet.
6
7use std::collections::{BTreeMap, BTreeSet};
8
9use chrono::Utc;
10use serde::{Deserialize, Serialize};
11use serde_json::{Value, json};
12use sha2::{Digest, Sha256};
13
14use crate::bundle::{FindingBundle, Provenance};
15use crate::project::Project;
16
/// Registry entry describing one imported source artifact.
///
/// The optional fields (`content_hash`, `year`, `doi`, `pmid`) are left out of
/// serialized output when they are `None`. Every field marked
/// `#[serde(default)]` may be absent on input and then takes its type's
/// default value.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SourceRecord {
    /// Identifier of the source; this module computes it with `source_id(...)`.
    pub id: String,
    /// Source type label; this module stores the output of
    /// `normalize_source_type` here.
    pub source_type: String,
    /// Locator string for the source artifact.
    pub locator: String,
    /// Content hash of the artifact, when one is known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub content_hash: Option<String>,
    #[serde(default)]
    pub title: String,
    /// Author names only (no ORCID or other author metadata).
    #[serde(default)]
    pub authors: Vec<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub year: Option<i32>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub doi: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pmid: Option<String>,
    /// When derived from a finding, this is the finding's extraction timestamp.
    #[serde(default)]
    pub imported_at: String,
    /// When derived from a finding, this is the finding's extraction method.
    #[serde(default)]
    pub extraction_mode: String,
    /// Quality label. This module emits "declared", "synthetic" or
    /// "needs_review"; `source_summary` additionally counts "low" and "rough"
    /// as low quality.
    #[serde(default)]
    pub source_quality: String,
    /// Free-text warnings attached to the source.
    #[serde(default)]
    pub caveats: Vec<String>,
    /// Ids of the findings linked to this source.
    #[serde(default)]
    pub finding_ids: Vec<String>,
}
45
/// The exact source-grounded unit that bears on a finding.
///
/// Each atom links one source (`source_id`) to one finding (`finding_id`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EvidenceAtom {
    /// Identifier of the atom; this module computes it with
    /// `evidence_atom_id(...)`.
    pub id: String,
    /// Id of the `SourceRecord` this atom is grounded in.
    pub source_id: String,
    /// Id of the finding this atom bears on.
    pub finding_id: String,
    /// Position of the evidence inside the source; omitted from serialized
    /// output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub locator: Option<String>,
    pub evidence_type: String,
    /// The claim or measurement text carried by this atom.
    pub measurement_or_claim: String,
    /// Stance label; atoms derived in this module use "supports" or "unknown".
    pub supports_or_challenges: String,
    /// Ids of the condition records that scope this atom.
    pub condition_refs: Vec<String>,
    pub extraction_method: String,
    /// Whether a human review has been recorded for this atom.
    pub human_verified: bool,
    /// Free-text warnings, e.g. "missing evidence locator".
    pub caveats: Vec<String>,
}
61
/// Condition boundary attached to a finding.
///
/// `condition_record_for_finding` builds one of these from a finding's
/// conditions and evidence fields.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ConditionRecord {
    /// Identifier of the record; this module computes it with
    /// `condition_record_id(...)`.
    pub id: String,
    /// Id of the finding this record belongs to.
    pub finding_id: String,
    /// Condition text; empty when the finding declares no condition boundary.
    pub text: String,
    /// Species, when known; omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub species: Option<String>,
    pub model_system: String,
    pub method: String,
    pub in_vitro: bool,
    pub in_vivo: bool,
    pub human_data: bool,
    pub clinical_trial: bool,
    /// One of "exposure", "efficacy", "both" or "unknown" when derived here.
    pub exposure_or_efficacy: String,
    /// "declared" or "missing_or_unclear" when derived here.
    pub comparator_status: String,
    /// One of "human", "animal_model", "in_vitro", "computational" or
    /// "unspecified" when derived here.
    pub translation_scope: String,
    /// Free-text warnings attached to the record.
    pub caveats: Vec<String>,
}
80
/// Aggregate counts over the source registry, as produced by `source_summary`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct SourceRegistrySummary {
    /// Total number of sources counted.
    pub count: usize,
    /// Number of sources per `source_type` label.
    pub source_types: BTreeMap<String, usize>,
    /// Sources whose quality label is "low", "rough", "needs_review" or
    /// "synthetic".
    pub low_quality_count: usize,
    /// Sources without a `content_hash`.
    pub missing_hash_count: usize,
}
88
/// Aggregate counts over evidence atoms, as produced by `evidence_summary`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct EvidenceAtomSummary {
    /// Total number of atoms counted.
    pub count: usize,
    /// Atoms whose locator is absent or an empty string.
    pub missing_locator_count: usize,
    /// Atoms with `human_verified == false`.
    pub unverified_count: usize,
    /// Atoms whose source is classified as synthetic.
    pub synthetic_source_count: usize,
}
96
/// Aggregate counts over condition records, as produced by `condition_summary`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct ConditionSummary {
    /// Total number of condition records counted.
    pub count: usize,
    /// Records whose text is empty after trimming.
    pub missing_text_count: usize,
    /// Records whose comparator status is "missing_or_unclear".
    pub missing_comparator_count: usize,
    /// Records whose `exposure_or_efficacy` is "both".
    pub exposure_efficacy_risk_count: usize,
    /// Number of records per `translation_scope` label.
    pub translation_scopes: BTreeMap<String, usize>,
}
105
/// The three derived collections returned together by `derive_projection`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct SourceEvidenceProjection {
    /// Source registry entries.
    pub sources: Vec<SourceRecord>,
    /// Evidence atoms linking sources to findings.
    pub evidence_atoms: Vec<EvidenceAtom>,
    /// Condition records, one per finding plus any persisted extras.
    pub condition_records: Vec<ConditionRecord>,
}
112
113/// Phase N: rewrite each finding's `provenance` (title, year, authors,
114/// journal, license, publisher, funders) from the canonical
115/// SourceRecord that matches by DOI / PMID / title. Returns the count
116/// of findings whose provenance changed.
117///
118/// Doctrine: `Project.sources` is canonical; `FindingBundle.provenance`
119/// is the denormalized cache. When they disagree, the source wins.
120pub fn resync_provenance_from_sources(project: &mut Project) -> usize {
121    use crate::bundle::Author;
122    let mut by_doi: BTreeMap<String, &SourceRecord> = BTreeMap::new();
123    let mut by_pmid: BTreeMap<String, &SourceRecord> = BTreeMap::new();
124    let mut by_title: BTreeMap<String, &SourceRecord> = BTreeMap::new();
125    for source in &project.sources {
126        if let Some(doi) = source.doi.as_deref() {
127            by_doi.insert(doi.to_lowercase(), source);
128        }
129        if let Some(pmid) = source.pmid.as_deref() {
130            by_pmid.insert(pmid.to_string(), source);
131        }
132        if !source.title.trim().is_empty() {
133            by_title.insert(normalize_title_key(&source.title), source);
134        }
135    }
136
137    let mut updated = 0usize;
138    for finding in &mut project.findings {
139        let source: Option<&SourceRecord> = finding
140            .provenance
141            .doi
142            .as_deref()
143            .map(str::to_lowercase)
144            .and_then(|key| by_doi.get(&key).copied())
145            .or_else(|| {
146                finding
147                    .provenance
148                    .pmid
149                    .as_deref()
150                    .and_then(|key| by_pmid.get(key).copied())
151            })
152            .or_else(|| {
153                if finding.provenance.title.trim().is_empty() {
154                    None
155                } else {
156                    by_title
157                        .get(&normalize_title_key(&finding.provenance.title))
158                        .copied()
159                }
160            });
161
162        let Some(source) = source else { continue };
163        let mut changed = false;
164
165        if !source.title.is_empty() && source.title != finding.provenance.title {
166            finding.provenance.title = source.title.clone();
167            changed = true;
168        }
169        if source.year.is_some() && source.year != finding.provenance.year {
170            finding.provenance.year = source.year;
171            changed = true;
172        }
173        if !source.authors.is_empty() {
174            let derived: Vec<Author> = source
175                .authors
176                .iter()
177                .map(|name| Author {
178                    name: name.clone(),
179                    orcid: None,
180                })
181                .collect();
182            let differs = derived.len() != finding.provenance.authors.len()
183                || derived
184                    .iter()
185                    .zip(finding.provenance.authors.iter())
186                    .any(|(a, b)| a.name != b.name);
187            if differs {
188                finding.provenance.authors = derived;
189                changed = true;
190            }
191        }
192        if changed {
193            updated += 1;
194        }
195    }
196    updated
197}
198
/// Builds the lookup key for a title: whitespace runs collapse to a single
/// space, leading/trailing whitespace is dropped, and the result is lowercased.
fn normalize_title_key(title: &str) -> String {
    let mut collapsed = String::with_capacity(title.len());
    for word in title.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    // Lowercase the joined string as a whole, exactly as a join-then-lowercase would.
    collapsed.to_lowercase()
}
206
207pub fn materialize_project(project: &mut Project) {
208    let projection = derive_projection(project);
209    project.sources = projection.sources;
210    project.evidence_atoms = projection.evidence_atoms;
211    project.condition_records = projection.condition_records;
212    crate::project::recompute_stats(project);
213}
214
215pub fn derive_projection(project: &Project) -> SourceEvidenceProjection {
216    let sources = derive_source_records(project);
217    let condition_records = derive_condition_records(project);
218    let evidence_atoms = derive_evidence_atoms(project, &sources, &condition_records);
219    SourceEvidenceProjection {
220        sources,
221        evidence_atoms,
222        condition_records,
223    }
224}
225
226pub fn source_summary(project: &Project) -> SourceRegistrySummary {
227    let sources = if project.sources.is_empty() {
228        derive_source_records(project)
229    } else {
230        project.sources.clone()
231    };
232    let mut source_types = BTreeMap::new();
233    let mut low_quality_count = 0usize;
234    let mut missing_hash_count = 0usize;
235    for source in &sources {
236        *source_types.entry(source.source_type.clone()).or_default() += 1;
237        if matches!(
238            source.source_quality.as_str(),
239            "low" | "rough" | "needs_review" | "synthetic"
240        ) {
241            low_quality_count += 1;
242        }
243        if source.content_hash.is_none() {
244            missing_hash_count += 1;
245        }
246    }
247    SourceRegistrySummary {
248        count: sources.len(),
249        source_types,
250        low_quality_count,
251        missing_hash_count,
252    }
253}
254
255pub fn evidence_summary(project: &Project) -> EvidenceAtomSummary {
256    let projection;
257    let (atoms, source_records): (&[EvidenceAtom], &[SourceRecord]) =
258        if project.evidence_atoms.is_empty() || project.sources.is_empty() {
259            projection = derive_projection(project);
260            (&projection.evidence_atoms, &projection.sources)
261        } else {
262            (&project.evidence_atoms, &project.sources)
263        };
264    let source_map = source_records
265        .iter()
266        .map(|source| (source.id.as_str(), source))
267        .collect::<BTreeMap<_, _>>();
268    let mut missing_locator_count = 0usize;
269    let mut unverified_count = 0usize;
270    let mut synthetic_source_count = 0usize;
271    for atom in atoms {
272        if atom.locator.as_deref().is_none_or(str::is_empty) {
273            missing_locator_count += 1;
274        }
275        if !atom.human_verified {
276            unverified_count += 1;
277        }
278        if source_map
279            .get(atom.source_id.as_str())
280            .is_some_and(|source| is_synthetic_source(source))
281        {
282            synthetic_source_count += 1;
283        }
284    }
285    EvidenceAtomSummary {
286        count: atoms.len(),
287        missing_locator_count,
288        unverified_count,
289        synthetic_source_count,
290    }
291}
292
293pub fn condition_summary(project: &Project) -> ConditionSummary {
294    let records = if project.condition_records.is_empty() {
295        derive_condition_records(project)
296    } else {
297        project.condition_records.clone()
298    };
299    let mut translation_scopes = BTreeMap::new();
300    let mut missing_text_count = 0usize;
301    let mut missing_comparator_count = 0usize;
302    let mut exposure_efficacy_risk_count = 0usize;
303    for record in &records {
304        *translation_scopes
305            .entry(record.translation_scope.clone())
306            .or_default() += 1;
307        if record.text.trim().is_empty() {
308            missing_text_count += 1;
309        }
310        if record.comparator_status == "missing_or_unclear" {
311            missing_comparator_count += 1;
312        }
313        if record.exposure_or_efficacy == "both" {
314            exposure_efficacy_risk_count += 1;
315        }
316    }
317    ConditionSummary {
318        count: records.len(),
319        missing_text_count,
320        missing_comparator_count,
321        exposure_efficacy_risk_count,
322        translation_scopes,
323    }
324}
325
326pub fn source_map(project: &Project) -> BTreeMap<&str, &SourceRecord> {
327    let mut map = BTreeMap::new();
328    for source in &project.sources {
329        map.insert(source.id.as_str(), source);
330    }
331    map
332}
333
334pub fn condition_records_for_finding<'a>(
335    project: &'a Project,
336    finding_id: &str,
337) -> Vec<&'a ConditionRecord> {
338    project
339        .condition_records
340        .iter()
341        .filter(|record| record.finding_id == finding_id)
342        .collect()
343}
344
345pub fn evidence_atoms_for_finding<'a>(
346    project: &'a Project,
347    finding_id: &str,
348) -> Vec<&'a EvidenceAtom> {
349    project
350        .evidence_atoms
351        .iter()
352        .filter(|atom| atom.finding_id == finding_id)
353        .collect()
354}
355
356pub fn sources_for_finding<'a>(project: &'a Project, finding_id: &str) -> Vec<&'a SourceRecord> {
357    let atoms = evidence_atoms_for_finding(project, finding_id);
358    let ids = atoms
359        .iter()
360        .map(|atom| atom.source_id.as_str())
361        .collect::<BTreeSet<_>>();
362    project
363        .sources
364        .iter()
365        .filter(|source| {
366            source.finding_ids.iter().any(|id| id == finding_id) || ids.contains(source.id.as_str())
367        })
368        .collect()
369}
370
371pub fn source_evidence_map(project: &Project) -> Value {
372    source_evidence_map_from_atoms(&project.evidence_atoms)
373}
374
375pub fn source_evidence_map_from_atoms(evidence_atoms: &[EvidenceAtom]) -> Value {
376    let mut by_source = BTreeMap::<String, Vec<Value>>::new();
377    for atom in evidence_atoms {
378        by_source
379            .entry(atom.source_id.clone())
380            .or_default()
381            .push(json!({
382                "evidence_atom_id": atom.id,
383                "finding_id": atom.finding_id,
384                "locator": atom.locator,
385                "supports_or_challenges": atom.supports_or_challenges,
386                "human_verified": atom.human_verified,
387                "caveats": atom.caveats,
388            }));
389    }
390    json!({
391        "schema": "vela.source-evidence-map.v0",
392        "sources": by_source,
393    })
394}
395
396pub fn condition_matrix(records: &[ConditionRecord]) -> Value {
397    let rows = records
398        .iter()
399        .map(|record| {
400            json!({
401                "condition_id": record.id,
402                "finding_id": record.finding_id,
403                "text": record.text,
404                "species": record.species,
405                "model_system": record.model_system,
406                "method": record.method,
407                "human_data": record.human_data,
408                "clinical_trial": record.clinical_trial,
409                "exposure_or_efficacy": record.exposure_or_efficacy,
410                "comparator_status": record.comparator_status,
411                "translation_scope": record.translation_scope,
412                "caveats": record.caveats,
413            })
414        })
415        .collect::<Vec<_>>();
416    json!({
417        "schema": "vela.condition-matrix.v0",
418        "conditions": rows,
419    })
420}
421
422pub fn attach_local_source_details(
423    project: &mut Project,
424    finding_hashes: &BTreeMap<String, String>,
425    finding_source_types: &BTreeMap<String, String>,
426) {
427    if finding_hashes.is_empty() && finding_source_types.is_empty() {
428        return;
429    }
430    let mut remap = BTreeMap::<String, String>::new();
431    for source in &mut project.sources {
432        let hashes = source
433            .finding_ids
434            .iter()
435            .filter_map(|finding_id| finding_hashes.get(finding_id))
436            .collect::<BTreeSet<_>>();
437        if hashes.len() == 1
438            && let Some(hash) = hashes.into_iter().next().cloned()
439        {
440            source.content_hash = Some(hash);
441        }
442        let source_types = source
443            .finding_ids
444            .iter()
445            .filter_map(|finding_id| finding_source_types.get(finding_id))
446            .collect::<BTreeSet<_>>();
447        if source_types.len() == 1
448            && let Some(source_type) = source_types.into_iter().next()
449        {
450            source.source_type = normalize_source_type(source_type);
451        }
452        let old_id = source.id.clone();
453        source.id = source_id(
454            &source.source_type,
455            &source.locator,
456            source.content_hash.as_deref(),
457            source.doi.as_deref(),
458            source.pmid.as_deref(),
459            &source.title,
460        );
461        if source.id != old_id {
462            remap.insert(old_id, source.id.clone());
463        }
464    }
465    if remap.is_empty() {
466        crate::project::recompute_stats(project);
467        return;
468    }
469    for atom in &mut project.evidence_atoms {
470        if let Some(new_source_id) = remap.get(&atom.source_id) {
471            atom.source_id = new_source_id.clone();
472            atom.id = evidence_atom_id(
473                &atom.source_id,
474                &atom.finding_id,
475                atom.locator.as_deref(),
476                &atom.measurement_or_claim,
477                &atom.evidence_type,
478            );
479        }
480    }
481    crate::project::recompute_stats(project);
482}
483
484pub fn source_record_for_finding(finding: &FindingBundle) -> SourceRecord {
485    let source_type = normalize_source_type(&finding.provenance.source_type);
486    let locator = source_locator(&finding.provenance, &finding.id);
487    let content_hash = None;
488    let id = source_id(
489        &source_type,
490        &locator,
491        content_hash.as_deref(),
492        finding.provenance.doi.as_deref(),
493        finding.provenance.pmid.as_deref(),
494        &finding.provenance.title,
495    );
496    let mut caveats = Vec::new();
497    if source_type == "synthetic_report" || source_type == "agent_trace" {
498        caveats.push("source requires human review before being treated as evidence".to_string());
499    }
500    if finding.provenance.title.trim().is_empty()
501        && finding.provenance.doi.is_none()
502        && finding.provenance.pmid.is_none()
503    {
504        caveats.push("weak source metadata; locator derived from finding id".to_string());
505    }
506    let source_quality = if caveats.is_empty()
507        && !finding.provenance.extraction.method.contains("fallback")
508        && !finding.provenance.extraction.method.contains("rough")
509    {
510        "declared".to_string()
511    } else if source_type == "synthetic_report" || source_type == "agent_trace" {
512        "synthetic".to_string()
513    } else {
514        "needs_review".to_string()
515    };
516    SourceRecord {
517        id,
518        source_type,
519        locator,
520        content_hash,
521        title: finding.provenance.title.clone(),
522        authors: finding
523            .provenance
524            .authors
525            .iter()
526            .map(|author| author.name.clone())
527            .collect(),
528        year: finding.provenance.year,
529        doi: finding.provenance.doi.clone(),
530        pmid: finding.provenance.pmid.clone(),
531        imported_at: finding.provenance.extraction.extracted_at.clone(),
532        extraction_mode: finding.provenance.extraction.method.clone(),
533        source_quality,
534        caveats,
535        finding_ids: vec![finding.id.clone()],
536    }
537}
538
539fn derive_source_records(project: &Project) -> Vec<SourceRecord> {
540    let mut by_id = BTreeMap::<String, SourceRecord>::new();
541
542    for finding in &project.findings {
543        let mut record = source_record_for_finding(finding);
544        if let Some(existing) = matching_existing_source(project, &record) {
545            record.source_type = existing.source_type.clone();
546            if existing.content_hash.is_some() {
547                record.content_hash = existing.content_hash.clone();
548            }
549            record.id = source_id(
550                &record.source_type,
551                &record.locator,
552                record.content_hash.as_deref(),
553                record.doi.as_deref(),
554                record.pmid.as_deref(),
555                &record.title,
556            );
557            for caveat in &existing.caveats {
558                push_unique(&mut record.caveats, caveat);
559            }
560        }
561        by_id
562            .entry(record.id.clone())
563            .and_modify(|existing| push_unique(&mut existing.finding_ids, &finding.id))
564            .or_insert(record);
565    }
566
567    for existing in &project.sources {
568        by_id
569            .entry(existing.id.clone())
570            .or_insert_with(|| existing.clone());
571    }
572
573    by_id.into_values().collect()
574}
575
576fn matching_existing_source<'a>(
577    project: &'a Project,
578    record: &SourceRecord,
579) -> Option<&'a SourceRecord> {
580    project.sources.iter().find(|existing| {
581        existing
582            .finding_ids
583            .iter()
584            .any(|id| record.finding_ids.iter().any(|record_id| record_id == id))
585            || (existing.locator == record.locator
586                && existing.title == record.title
587                && existing.doi == record.doi
588                && existing.pmid == record.pmid)
589    })
590}
591
592fn derive_evidence_atoms(
593    project: &Project,
594    sources: &[SourceRecord],
595    condition_records: &[ConditionRecord],
596) -> Vec<EvidenceAtom> {
597    let source_by_finding = sources
598        .iter()
599        .flat_map(|source| {
600            source
601                .finding_ids
602                .iter()
603                .map(move |finding_id| (finding_id.as_str(), source))
604        })
605        .collect::<BTreeMap<_, _>>();
606    let mut atoms = BTreeMap::<String, EvidenceAtom>::new();
607    for finding in &project.findings {
608        let source = source_by_finding
609            .get(finding.id.as_str())
610            .copied()
611            .cloned()
612            .unwrap_or_else(|| source_record_for_finding(finding));
613        let source_id = source.id.clone();
614        if finding.evidence.evidence_spans.is_empty() {
615            let atom = weak_atom(finding, &source_id, condition_records);
616            atoms.insert(atom.id.clone(), atom);
617            continue;
618        }
619        for (span_index, span) in finding.evidence.evidence_spans.iter().enumerate() {
620            let (locator, claim) = span_locator_and_claim(span, span_index);
621            let mut caveats = Vec::new();
622            if locator.is_none() {
623                caveats.push("missing evidence locator".to_string());
624            }
625            if finding.conditions.text.trim().is_empty() {
626                caveats.push("condition boundary missing on parent finding".to_string());
627            }
628            let atom = EvidenceAtom {
629                id: evidence_atom_id(
630                    &source_id,
631                    &finding.id,
632                    locator.as_deref(),
633                    &claim,
634                    &finding.evidence.evidence_type,
635                ),
636                source_id: source_id.clone(),
637                finding_id: finding.id.clone(),
638                locator,
639                evidence_type: finding.evidence.evidence_type.clone(),
640                measurement_or_claim: claim,
641                supports_or_challenges: "supports".to_string(),
642                condition_refs: condition_refs(finding, condition_records),
643                extraction_method: finding.provenance.extraction.method.clone(),
644                human_verified: finding
645                    .provenance
646                    .review
647                    .as_ref()
648                    .is_some_and(|review| review.reviewed),
649                caveats,
650            };
651            atoms.insert(atom.id.clone(), atom);
652        }
653    }
654    // v0.56: When a persisted atom shares an id with a derived one,
655    // merge field-by-field so reducer-applied repairs survive
656    // re-derivation. The derive pass produces weak atoms (locator
657    // None, "missing evidence locator" caveat) for findings whose
658    // evidence_spans are empty. After an `evidence_atom.locator_repaired`
659    // event lands, the persisted atom carries `locator: Some(...)`
660    // and no longer carries the missing-locator caveat. Without this
661    // merge step, re-derivation would silently overwrite the repair
662    // because the existing entry-or-insert pattern preferred the
663    // derived weak atom on id collision.
664    for existing in &project.evidence_atoms {
665        let id = existing.id.clone();
666        match atoms.get_mut(&id) {
667            None => {
668                atoms.insert(id, existing.clone());
669            }
670            Some(derived) => {
671                if existing.locator.is_some() && derived.locator.is_none() {
672                    derived.locator = existing.locator.clone();
673                    derived.caveats.retain(|c| c != "missing evidence locator");
674                }
675                if existing.human_verified && !derived.human_verified {
676                    derived.human_verified = true;
677                }
678            }
679        }
680    }
681    atoms.into_values().collect()
682}
683
684fn derive_condition_records(project: &Project) -> Vec<ConditionRecord> {
685    let mut records = BTreeMap::<String, ConditionRecord>::new();
686    for finding in &project.findings {
687        let record = condition_record_for_finding(finding);
688        records.insert(record.id.clone(), record);
689    }
690    for existing in &project.condition_records {
691        records
692            .entry(existing.id.clone())
693            .or_insert_with(|| existing.clone());
694    }
695    records.into_values().collect()
696}
697
698pub fn condition_record_for_finding(finding: &FindingBundle) -> ConditionRecord {
699    let text = finding.conditions.text.trim().to_string();
700    let species = finding
701        .conditions
702        .species_verified
703        .first()
704        .cloned()
705        .or_else(|| finding.evidence.species.clone());
706    let combined = format!(
707        "{} {} {} {} {}",
708        finding.assertion.text,
709        finding.evidence.evidence_type,
710        finding.evidence.model_system,
711        finding.evidence.method,
712        text
713    );
714    let exposure_or_efficacy = exposure_or_efficacy(&combined);
715    let comparator_status = comparator_status(&combined, finding);
716    let translation_scope = translation_scope(finding, &combined);
717    let mut caveats = Vec::new();
718    if text.is_empty() {
719        caveats.push("condition boundary missing".to_string());
720    }
721    if comparator_status == "missing_or_unclear" {
722        caveats.push("comparator or baseline missing or unclear".to_string());
723    }
724    if exposure_or_efficacy == "both" {
725        caveats.push(
726            "exposure and efficacy language both present; review for overgeneralization"
727                .to_string(),
728        );
729    }
730    if translation_scope == "animal_model" && mentions_human_translation(&combined) {
731        caveats.push(
732            "animal-model evidence is being discussed near human translation language".to_string(),
733        );
734    }
735    ConditionRecord {
736        id: condition_record_id(finding),
737        finding_id: finding.id.clone(),
738        text,
739        species,
740        model_system: finding.evidence.model_system.clone(),
741        method: finding.evidence.method.clone(),
742        in_vitro: finding.conditions.in_vitro,
743        in_vivo: finding.conditions.in_vivo,
744        human_data: finding.conditions.human_data,
745        clinical_trial: finding.conditions.clinical_trial,
746        exposure_or_efficacy,
747        comparator_status,
748        translation_scope,
749        caveats,
750    }
751}
752
753fn weak_atom(
754    finding: &FindingBundle,
755    source_id: &str,
756    condition_records: &[ConditionRecord],
757) -> EvidenceAtom {
758    let claim = finding.assertion.text.clone();
759    EvidenceAtom {
760        id: evidence_atom_id(
761            source_id,
762            &finding.id,
763            None,
764            &claim,
765            &finding.evidence.evidence_type,
766        ),
767        source_id: source_id.to_string(),
768        finding_id: finding.id.clone(),
769        locator: None,
770        evidence_type: finding.evidence.evidence_type.clone(),
771        measurement_or_claim: claim,
772        supports_or_challenges: "unknown".to_string(),
773        condition_refs: condition_refs(finding, condition_records),
774        extraction_method: finding.provenance.extraction.method.clone(),
775        human_verified: false,
776        caveats: vec!["missing evidence locator".to_string()],
777    }
778}
779
780fn span_locator_and_claim(span: &Value, span_index: usize) -> (Option<String>, String) {
781    if let Some(text) = span.as_str() {
782        let trimmed = text.trim().to_string();
783        let locator = if trimmed.is_empty() {
784            None
785        } else {
786            Some(format!("span:{span_index}"))
787        };
788        return (locator, trimmed);
789    }
790    if let Some(object) = span.as_object() {
791        let claim = object
792            .get("text")
793            .or_else(|| object.get("quote"))
794            .or_else(|| object.get("claim"))
795            .and_then(Value::as_str)
796            .unwrap_or("")
797            .trim()
798            .to_string();
799        let mut parts = Vec::new();
800        for key in [
801            "source", "section", "page", "row", "table", "figure", "start", "end",
802        ] {
803            if let Some(value) = object.get(key) {
804                let rendered = value
805                    .as_str()
806                    .map(str::to_string)
807                    .unwrap_or_else(|| value.to_string());
808                if !rendered.trim().is_empty() {
809                    parts.push(format!("{key}:{rendered}"));
810                }
811            }
812        }
813        let locator = if parts.is_empty() {
814            Some(format!("span:{span_index}"))
815        } else {
816            Some(parts.join("|"))
817        };
818        let claim = if claim.is_empty() {
819            span.to_string()
820        } else {
821            claim
822        };
823        return (locator, claim);
824    }
825    (Some(format!("span:{span_index}")), span.to_string())
826}
827
828fn condition_refs(finding: &FindingBundle, condition_records: &[ConditionRecord]) -> Vec<String> {
829    if let Some(record) = condition_records
830        .iter()
831        .find(|record| record.finding_id == finding.id)
832    {
833        return vec![record.id.clone()];
834    }
835    let text = finding.conditions.text.trim();
836    if text.is_empty() {
837        vec![format!("finding:{}", finding.id)]
838    } else {
839        vec![condition_record_id(finding)]
840    }
841}
842
843pub fn condition_record_id(finding: &FindingBundle) -> String {
844    let input = format!(
845        "{}|{}|{}|{}|{}",
846        finding.id,
847        finding.conditions.text.trim(),
848        finding.evidence.model_system,
849        finding.evidence.method,
850        finding.evidence.species.clone().unwrap_or_default()
851    );
852    format!("vcnd_{}", short_hash(input.as_bytes()))
853}
854
/// Classifies text by which keyword families it mentions.
///
/// Matching is ASCII-case-insensitive substring search. Returns "both",
/// "exposure", "efficacy" or "unknown".
fn exposure_or_efficacy(text: &str) -> String {
    const EXPOSURE_TERMS: [&str; 8] = [
        "exposure",
        "uptake",
        "transport",
        "delivery",
        "penetration",
        "brain level",
        "biodistribution",
        "concentration",
    ];
    const EFFICACY_TERMS: [&str; 8] = [
        "efficacy",
        "therapeutic",
        "functional",
        "cognition",
        "survival",
        "clinical",
        "symptom",
        "outcome",
    ];

    /// True when `haystack` contains at least one of `terms`.
    fn mentions_any(haystack: &str, terms: &[&str]) -> bool {
        terms.iter().any(|term| haystack.contains(*term))
    }

    let haystack = text.to_ascii_lowercase();
    let has_exposure = mentions_any(&haystack, &EXPOSURE_TERMS);
    let has_efficacy = mentions_any(&haystack, &EFFICACY_TERMS);

    let label = if has_exposure && has_efficacy {
        "both"
    } else if has_exposure {
        "exposure"
    } else if has_efficacy {
        "efficacy"
    } else {
        "unknown"
    };
    label.to_string()
}
889
890fn comparator_status(text: &str, finding: &FindingBundle) -> String {
891    let lower = text.to_ascii_lowercase();
892    if [
893        "control",
894        "comparator",
895        "compared",
896        "versus",
897        "relative to",
898        "baseline",
899        "vs ",
900    ]
901    .iter()
902    .any(|needle| lower.contains(needle))
903        || finding.evidence.effect_size.is_some()
904        || finding.evidence.p_value.is_some()
905    {
906        "declared"
907    } else {
908        "missing_or_unclear"
909    }
910    .to_string()
911}
912
913fn translation_scope(finding: &FindingBundle, text: &str) -> String {
914    let lower = text.to_ascii_lowercase();
915    if finding.conditions.clinical_trial || finding.conditions.human_data {
916        return "human".to_string();
917    }
918    if finding.conditions.in_vivo
919        || finding
920            .evidence
921            .species
922            .as_deref()
923            .is_some_and(|species| !species.to_ascii_lowercase().contains("human"))
924    {
925        return "animal_model".to_string();
926    }
927    if finding.conditions.in_vitro
928        || lower.contains("cell")
929        || lower.contains("in vitro")
930        || lower.contains("organoid")
931    {
932        return "in_vitro".to_string();
933    }
934    if lower.contains("benchmark")
935        || lower.contains("dataset")
936        || lower.contains("simulation")
937        || lower.contains("computational")
938    {
939        return "computational".to_string();
940    }
941    "unspecified".to_string()
942}
943
/// Returns `true` when `text` contains any human-translation keyword.
///
/// The search is a case-insensitive (ASCII) substring match.
fn mentions_human_translation(text: &str) -> bool {
    const HUMAN_TERMS: &[&str] = &["human", "clinical", "patient", "therapeutic efficacy"];

    let haystack = text.to_ascii_lowercase();
    for term in HUMAN_TERMS {
        if haystack.contains(*term) {
            return true;
        }
    }
    false
}
950
/// Maps a raw source type label onto its canonical name.
///
/// Matching is exact and case-sensitive. Any label that is not listed in the
/// alias table is reported as `"paper"`.
fn normalize_source_type(source_type: &str) -> String {
    // Each row pairs the accepted spellings with the canonical label.
    const ALIASES: &[(&[&str], &str)] = &[
        (&["published_paper", "paper"], "paper"),
        (&["database_record", "curated_csv", "csv"], "csv"),
        (&["pdf"], "pdf"),
        (&["jats", "jats_xml"], "jats"),
        (&["text", "markdown"], "text"),
        (&["note"], "note"),
        (&["doi", "doi_list"], "doi"),
        (&["agent_trace"], "agent_trace"),
        (&["benchmark_output"], "benchmark_output"),
        (&["notebook_entry"], "notebook_entry"),
        (&["experiment_log"], "experiment_log"),
        (
            &["model_output", "summary", "synthesis", "synthetic_report"],
            "synthetic_report",
        ),
    ];

    let canonical = ALIASES
        .iter()
        .find(|(spellings, _)| spellings.iter().any(|spelling| *spelling == source_type))
        .map_or("paper", |(_, canonical)| *canonical);
    canonical.to_string()
}
969
970fn source_locator(provenance: &Provenance, finding_id: &str) -> String {
971    provenance
972        .doi
973        .as_ref()
974        .map(|doi| format!("doi:{doi}"))
975        .or_else(|| provenance.pmid.as_ref().map(|pmid| format!("pmid:{pmid}")))
976        .or_else(|| provenance.pmc.as_ref().map(|pmc| format!("pmc:{pmc}")))
977        .or_else(|| {
978            (!provenance.title.trim().is_empty()).then(|| format!("title:{}", provenance.title))
979        })
980        .unwrap_or_else(|| format!("unknown-source:{finding_id}"))
981}
982
983pub fn source_id(
984    source_type: &str,
985    locator: &str,
986    content_hash: Option<&str>,
987    doi: Option<&str>,
988    pmid: Option<&str>,
989    title: &str,
990) -> String {
991    let mut input = String::new();
992    input.push_str(source_type);
993    input.push('|');
994    input.push_str(locator);
995    input.push('|');
996    input.push_str(content_hash.unwrap_or(""));
997    input.push('|');
998    input.push_str(doi.unwrap_or(""));
999    input.push('|');
1000    input.push_str(pmid.unwrap_or(""));
1001    input.push('|');
1002    input.push_str(title);
1003    format!("vs_{}", short_hash(input.as_bytes()))
1004}
1005
1006pub fn evidence_atom_id(
1007    source_id: &str,
1008    finding_id: &str,
1009    locator: Option<&str>,
1010    measurement_or_claim: &str,
1011    evidence_type: &str,
1012) -> String {
1013    let input = format!(
1014        "{source_id}|{finding_id}|{}|{measurement_or_claim}|{evidence_type}",
1015        locator.unwrap_or("")
1016    );
1017    format!("vea_{}", short_hash(input.as_bytes()))
1018}
1019
1020pub fn is_synthetic_source(source: &SourceRecord) -> bool {
1021    matches!(
1022        source.source_type.as_str(),
1023        "synthetic_report" | "agent_trace"
1024    )
1025}
1026
1027pub fn now_imported_at_fallback(value: &str) -> String {
1028    if value.trim().is_empty() {
1029        Utc::now().to_rfc3339()
1030    } else {
1031        value.to_string()
1032    }
1033}
1034
/// Appends `value` to `values` unless an equal entry already exists.
///
/// The vector is re-sorted after every insertion; it is left untouched when
/// the value is already present.
fn push_unique(values: &mut Vec<String>, value: &str) {
    let already_present = values.iter().any(|existing| existing.as_str() == value);
    if already_present {
        return;
    }
    values.push(value.to_owned());
    values.sort();
}
1041
1042fn short_hash(bytes: &[u8]) -> String {
1043    let mut hasher = Sha256::new();
1044    hasher.update(bytes);
1045    let digest = hasher.finalize();
1046    hex::encode(&digest[..8])
1047}
1048
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bundle::*;

    /// Builds a mouse in-vivo finding fixture whose only evidence span is
    /// `span`.
    fn finding_with_span(span: Value) -> FindingBundle {
        let assertion = Assertion {
            text: String::from("TfR targeting increases apparent brain exposure in mice."),
            assertion_type: String::from("mechanism"),
            entities: Vec::new(),
            relation: None,
            direction: None,
            causal_claim: None,
            causal_evidence_grade: None,
        };
        let evidence = Evidence {
            evidence_type: String::from("experimental"),
            model_system: String::from("mouse"),
            species: Some(String::from("Mus musculus")),
            method: String::from("in vivo exposure assay"),
            sample_size: None,
            effect_size: None,
            p_value: None,
            replicated: false,
            replication_count: None,
            evidence_spans: vec![span],
        };
        let conditions = Conditions {
            text: String::from("Mouse exposure assay; not human therapeutic efficacy."),
            species_verified: vec![String::from("Mus musculus")],
            species_unverified: Vec::new(),
            in_vitro: false,
            in_vivo: true,
            human_data: false,
            clinical_trial: false,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        };
        let provenance = Provenance {
            source_type: String::from("published_paper"),
            doi: Some(String::from("10.0000/test")),
            pmid: None,
            pmc: None,
            openalex_id: None,
            url: None,
            title: String::from("Test paper"),
            authors: Vec::new(),
            year: Some(2026),
            journal: None,
            license: None,
            publisher: None,
            funders: Vec::new(),
            extraction: Extraction::default(),
            review: None,
            citation_count: None,
        };
        let flags = Flags {
            gap: false,
            negative_space: false,
            contested: false,
            retracted: false,
            declining: false,
            gravity_well: false,
            review_state: None,
            superseded: false,
            signature_threshold: None,
            jointly_accepted: false,
        };

        FindingBundle {
            id: String::from("vf_test"),
            version: 1,
            previous_version: None,
            assertion,
            evidence,
            conditions,
            confidence: Confidence::raw(0.6, "test", 0.8),
            provenance,
            flags,
            links: Vec::new(),
            annotations: Vec::new(),
            attachments: Vec::new(),
            created: String::from("2026-01-01T00:00:00Z"),
            updated: None,

            access_tier: crate::access_tier::AccessTier::Public,
        }
    }

    /// One finding with a located span projects to exactly one source, one
    /// evidence atom and one condition record that reference each other.
    #[test]
    fn projection_distinguishes_sources_from_evidence_atoms() {
        let span = json!({
            "text": "Brain exposure increased in mice.",
            "section": "results",
            "page": 4
        });
        let project =
            crate::project::assemble("test", vec![finding_with_span(span)], 1, 0, "test");
        let projection = derive_projection(&project);

        assert_eq!(projection.sources.len(), 1);
        assert_eq!(projection.evidence_atoms.len(), 1);
        assert_eq!(projection.condition_records.len(), 1);

        let source = &projection.sources[0];
        let atom = &projection.evidence_atoms[0];
        let condition = &projection.condition_records[0];

        assert!(source.id.starts_with("vs_"));
        assert!(atom.id.starts_with("vea_"));
        assert!(condition.id.starts_with("vcnd_"));
        assert_eq!(atom.source_id, source.id);
        assert_eq!(atom.condition_refs, vec![condition.id.clone()]);
        assert_eq!(atom.locator.as_deref(), Some("section:results|page:4"));
    }

    /// A finding without evidence spans still yields one atom, but with no
    /// locator, an unknown stance and a caveat about the missing locator.
    #[test]
    fn missing_span_creates_weak_atom_with_caveat() {
        let mut finding = finding_with_span(json!({"text": "unused"}));
        finding.evidence.evidence_spans.clear();
        let project = crate::project::assemble("test", vec![finding], 1, 0, "test");
        let projection = derive_projection(&project);

        assert_eq!(projection.evidence_atoms.len(), 1);
        let atom = &projection.evidence_atoms[0];
        assert!(atom.locator.is_none());
        assert_eq!(atom.supports_or_challenges, "unknown");
        let has_locator_caveat = atom
            .caveats
            .iter()
            .any(|c| c == "missing evidence locator");
        assert!(has_locator_caveat);
    }

    /// The fixture's condition record is labelled `both` and `animal_model`
    /// and carries an overgeneralization caveat.
    #[test]
    fn condition_record_flags_exposure_efficacy_boundary() {
        let span = json!({
            "text": "Brain exposure and therapeutic efficacy increased in mice.",
            "section": "results"
        });
        let finding = finding_with_span(span);
        let record = condition_record_for_finding(&finding);

        assert_eq!(record.exposure_or_efficacy, "both");
        assert_eq!(record.translation_scope, "animal_model");
        let warns_overgeneralization = record
            .caveats
            .iter()
            .any(|caveat| caveat.contains("overgeneralization"));
        assert!(warns_overgeneralization);
    }
}