1use std::collections::{BTreeMap, BTreeSet};
8
9use chrono::Utc;
10use serde::{Deserialize, Serialize};
11use serde_json::{Value, json};
12use sha2::{Digest, Sha256};
13
14use crate::bundle::{FindingBundle, Provenance};
15use crate::project::Project;
16
17#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
18pub struct SourceRecord {
19 pub id: String,
20 pub source_type: String,
21 pub locator: String,
22 #[serde(default, skip_serializing_if = "Option::is_none")]
23 pub content_hash: Option<String>,
24 #[serde(default)]
25 pub title: String,
26 #[serde(default)]
27 pub authors: Vec<String>,
28 #[serde(default, skip_serializing_if = "Option::is_none")]
29 pub year: Option<i32>,
30 #[serde(default, skip_serializing_if = "Option::is_none")]
31 pub doi: Option<String>,
32 #[serde(default, skip_serializing_if = "Option::is_none")]
33 pub pmid: Option<String>,
34 #[serde(default)]
35 pub imported_at: String,
36 #[serde(default)]
37 pub extraction_mode: String,
38 #[serde(default)]
39 pub source_quality: String,
40 #[serde(default)]
41 pub caveats: Vec<String>,
42 #[serde(default)]
43 pub finding_ids: Vec<String>,
44}
45
46#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
47pub struct EvidenceAtom {
48 pub id: String,
49 pub source_id: String,
50 pub finding_id: String,
51 #[serde(default, skip_serializing_if = "Option::is_none")]
52 pub locator: Option<String>,
53 pub evidence_type: String,
54 pub measurement_or_claim: String,
55 pub supports_or_challenges: String,
56 pub condition_refs: Vec<String>,
57 pub extraction_method: String,
58 pub human_verified: bool,
59 pub caveats: Vec<String>,
60}
61
62#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
63pub struct ConditionRecord {
64 pub id: String,
65 pub finding_id: String,
66 pub text: String,
67 #[serde(default, skip_serializing_if = "Option::is_none")]
68 pub species: Option<String>,
69 pub model_system: String,
70 pub method: String,
71 pub in_vitro: bool,
72 pub in_vivo: bool,
73 pub human_data: bool,
74 pub clinical_trial: bool,
75 pub exposure_or_efficacy: String,
76 pub comparator_status: String,
77 pub translation_scope: String,
78 pub caveats: Vec<String>,
79}
80
81#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
82pub struct SourceRegistrySummary {
83 pub count: usize,
84 pub source_types: BTreeMap<String, usize>,
85 pub low_quality_count: usize,
86 pub missing_hash_count: usize,
87}
88
89#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
90pub struct EvidenceAtomSummary {
91 pub count: usize,
92 pub missing_locator_count: usize,
93 pub unverified_count: usize,
94 pub synthetic_source_count: usize,
95}
96
97#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
98pub struct ConditionSummary {
99 pub count: usize,
100 pub missing_text_count: usize,
101 pub missing_comparator_count: usize,
102 pub exposure_efficacy_risk_count: usize,
103 pub translation_scopes: BTreeMap<String, usize>,
104}
105
106#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
107pub struct SourceEvidenceProjection {
108 pub sources: Vec<SourceRecord>,
109 pub evidence_atoms: Vec<EvidenceAtom>,
110 pub condition_records: Vec<ConditionRecord>,
111}
112
113pub fn resync_provenance_from_sources(project: &mut Project) -> usize {
121 use crate::bundle::Author;
122 let mut by_doi: BTreeMap<String, &SourceRecord> = BTreeMap::new();
123 let mut by_pmid: BTreeMap<String, &SourceRecord> = BTreeMap::new();
124 let mut by_title: BTreeMap<String, &SourceRecord> = BTreeMap::new();
125 for source in &project.sources {
126 if let Some(doi) = source.doi.as_deref() {
127 by_doi.insert(doi.to_lowercase(), source);
128 }
129 if let Some(pmid) = source.pmid.as_deref() {
130 by_pmid.insert(pmid.to_string(), source);
131 }
132 if !source.title.trim().is_empty() {
133 by_title.insert(normalize_title_key(&source.title), source);
134 }
135 }
136
137 let mut updated = 0usize;
138 for finding in &mut project.findings {
139 let source: Option<&SourceRecord> = finding
140 .provenance
141 .doi
142 .as_deref()
143 .map(str::to_lowercase)
144 .and_then(|key| by_doi.get(&key).copied())
145 .or_else(|| {
146 finding
147 .provenance
148 .pmid
149 .as_deref()
150 .and_then(|key| by_pmid.get(key).copied())
151 })
152 .or_else(|| {
153 if finding.provenance.title.trim().is_empty() {
154 None
155 } else {
156 by_title
157 .get(&normalize_title_key(&finding.provenance.title))
158 .copied()
159 }
160 });
161
162 let Some(source) = source else { continue };
163 let mut changed = false;
164
165 if !source.title.is_empty() && source.title != finding.provenance.title {
166 finding.provenance.title = source.title.clone();
167 changed = true;
168 }
169 if source.year.is_some() && source.year != finding.provenance.year {
170 finding.provenance.year = source.year;
171 changed = true;
172 }
173 if !source.authors.is_empty() {
174 let derived: Vec<Author> = source
175 .authors
176 .iter()
177 .map(|name| Author {
178 name: name.clone(),
179 orcid: None,
180 })
181 .collect();
182 let differs = derived.len() != finding.provenance.authors.len()
183 || derived
184 .iter()
185 .zip(finding.provenance.authors.iter())
186 .any(|(a, b)| a.name != b.name);
187 if differs {
188 finding.provenance.authors = derived;
189 changed = true;
190 }
191 }
192 if changed {
193 updated += 1;
194 }
195 }
196 updated
197}
198
/// Builds a lookup key from a title: whitespace runs collapse to a single
/// space, leading/trailing whitespace is dropped, and the result is lower-cased.
fn normalize_title_key(title: &str) -> String {
    let mut collapsed = String::with_capacity(title.len());
    for word in title.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
206
207pub fn materialize_project(project: &mut Project) {
208 let projection = derive_projection(project);
209 project.sources = projection.sources;
210 project.evidence_atoms = projection.evidence_atoms;
211 project.condition_records = projection.condition_records;
212 crate::project::recompute_stats(project);
213}
214
215pub fn derive_projection(project: &Project) -> SourceEvidenceProjection {
216 let sources = derive_source_records(project);
217 let condition_records = derive_condition_records(project);
218 let evidence_atoms = derive_evidence_atoms(project, &sources, &condition_records);
219 SourceEvidenceProjection {
220 sources,
221 evidence_atoms,
222 condition_records,
223 }
224}
225
226pub fn source_summary(project: &Project) -> SourceRegistrySummary {
227 let sources = if project.sources.is_empty() {
228 derive_source_records(project)
229 } else {
230 project.sources.clone()
231 };
232 let mut source_types = BTreeMap::new();
233 let mut low_quality_count = 0usize;
234 let mut missing_hash_count = 0usize;
235 for source in &sources {
236 *source_types.entry(source.source_type.clone()).or_default() += 1;
237 if matches!(
238 source.source_quality.as_str(),
239 "low" | "rough" | "needs_review" | "synthetic"
240 ) {
241 low_quality_count += 1;
242 }
243 if source.content_hash.is_none() {
244 missing_hash_count += 1;
245 }
246 }
247 SourceRegistrySummary {
248 count: sources.len(),
249 source_types,
250 low_quality_count,
251 missing_hash_count,
252 }
253}
254
255pub fn evidence_summary(project: &Project) -> EvidenceAtomSummary {
256 let projection;
257 let (atoms, source_records): (&[EvidenceAtom], &[SourceRecord]) =
258 if project.evidence_atoms.is_empty() || project.sources.is_empty() {
259 projection = derive_projection(project);
260 (&projection.evidence_atoms, &projection.sources)
261 } else {
262 (&project.evidence_atoms, &project.sources)
263 };
264 let source_map = source_records
265 .iter()
266 .map(|source| (source.id.as_str(), source))
267 .collect::<BTreeMap<_, _>>();
268 let mut missing_locator_count = 0usize;
269 let mut unverified_count = 0usize;
270 let mut synthetic_source_count = 0usize;
271 for atom in atoms {
272 if atom.locator.as_deref().is_none_or(str::is_empty) {
273 missing_locator_count += 1;
274 }
275 if !atom.human_verified {
276 unverified_count += 1;
277 }
278 if source_map
279 .get(atom.source_id.as_str())
280 .is_some_and(|source| is_synthetic_source(source))
281 {
282 synthetic_source_count += 1;
283 }
284 }
285 EvidenceAtomSummary {
286 count: atoms.len(),
287 missing_locator_count,
288 unverified_count,
289 synthetic_source_count,
290 }
291}
292
293pub fn condition_summary(project: &Project) -> ConditionSummary {
294 let records = if project.condition_records.is_empty() {
295 derive_condition_records(project)
296 } else {
297 project.condition_records.clone()
298 };
299 let mut translation_scopes = BTreeMap::new();
300 let mut missing_text_count = 0usize;
301 let mut missing_comparator_count = 0usize;
302 let mut exposure_efficacy_risk_count = 0usize;
303 for record in &records {
304 *translation_scopes
305 .entry(record.translation_scope.clone())
306 .or_default() += 1;
307 if record.text.trim().is_empty() {
308 missing_text_count += 1;
309 }
310 if record.comparator_status == "missing_or_unclear" {
311 missing_comparator_count += 1;
312 }
313 if record.exposure_or_efficacy == "both" {
314 exposure_efficacy_risk_count += 1;
315 }
316 }
317 ConditionSummary {
318 count: records.len(),
319 missing_text_count,
320 missing_comparator_count,
321 exposure_efficacy_risk_count,
322 translation_scopes,
323 }
324}
325
326pub fn source_map(project: &Project) -> BTreeMap<&str, &SourceRecord> {
327 let mut map = BTreeMap::new();
328 for source in &project.sources {
329 map.insert(source.id.as_str(), source);
330 }
331 map
332}
333
334pub fn condition_records_for_finding<'a>(
335 project: &'a Project,
336 finding_id: &str,
337) -> Vec<&'a ConditionRecord> {
338 project
339 .condition_records
340 .iter()
341 .filter(|record| record.finding_id == finding_id)
342 .collect()
343}
344
345pub fn evidence_atoms_for_finding<'a>(
346 project: &'a Project,
347 finding_id: &str,
348) -> Vec<&'a EvidenceAtom> {
349 project
350 .evidence_atoms
351 .iter()
352 .filter(|atom| atom.finding_id == finding_id)
353 .collect()
354}
355
356pub fn sources_for_finding<'a>(project: &'a Project, finding_id: &str) -> Vec<&'a SourceRecord> {
357 let atoms = evidence_atoms_for_finding(project, finding_id);
358 let ids = atoms
359 .iter()
360 .map(|atom| atom.source_id.as_str())
361 .collect::<BTreeSet<_>>();
362 project
363 .sources
364 .iter()
365 .filter(|source| {
366 source.finding_ids.iter().any(|id| id == finding_id) || ids.contains(source.id.as_str())
367 })
368 .collect()
369}
370
371pub fn source_evidence_map(project: &Project) -> Value {
372 source_evidence_map_from_atoms(&project.evidence_atoms)
373}
374
375pub fn source_evidence_map_from_atoms(evidence_atoms: &[EvidenceAtom]) -> Value {
376 let mut by_source = BTreeMap::<String, Vec<Value>>::new();
377 for atom in evidence_atoms {
378 by_source
379 .entry(atom.source_id.clone())
380 .or_default()
381 .push(json!({
382 "evidence_atom_id": atom.id,
383 "finding_id": atom.finding_id,
384 "locator": atom.locator,
385 "supports_or_challenges": atom.supports_or_challenges,
386 "human_verified": atom.human_verified,
387 "caveats": atom.caveats,
388 }));
389 }
390 json!({
391 "schema": "vela.source-evidence-map.v0",
392 "sources": by_source,
393 })
394}
395
396pub fn condition_matrix(records: &[ConditionRecord]) -> Value {
397 let rows = records
398 .iter()
399 .map(|record| {
400 json!({
401 "condition_id": record.id,
402 "finding_id": record.finding_id,
403 "text": record.text,
404 "species": record.species,
405 "model_system": record.model_system,
406 "method": record.method,
407 "human_data": record.human_data,
408 "clinical_trial": record.clinical_trial,
409 "exposure_or_efficacy": record.exposure_or_efficacy,
410 "comparator_status": record.comparator_status,
411 "translation_scope": record.translation_scope,
412 "caveats": record.caveats,
413 })
414 })
415 .collect::<Vec<_>>();
416 json!({
417 "schema": "vela.condition-matrix.v0",
418 "conditions": rows,
419 })
420}
421
422pub fn attach_local_source_details(
423 project: &mut Project,
424 finding_hashes: &BTreeMap<String, String>,
425 finding_source_types: &BTreeMap<String, String>,
426) {
427 if finding_hashes.is_empty() && finding_source_types.is_empty() {
428 return;
429 }
430 let mut remap = BTreeMap::<String, String>::new();
431 for source in &mut project.sources {
432 let hashes = source
433 .finding_ids
434 .iter()
435 .filter_map(|finding_id| finding_hashes.get(finding_id))
436 .collect::<BTreeSet<_>>();
437 if hashes.len() == 1
438 && let Some(hash) = hashes.into_iter().next().cloned()
439 {
440 source.content_hash = Some(hash);
441 }
442 let source_types = source
443 .finding_ids
444 .iter()
445 .filter_map(|finding_id| finding_source_types.get(finding_id))
446 .collect::<BTreeSet<_>>();
447 if source_types.len() == 1
448 && let Some(source_type) = source_types.into_iter().next()
449 {
450 source.source_type = normalize_source_type(source_type);
451 }
452 let old_id = source.id.clone();
453 source.id = source_id(
454 &source.source_type,
455 &source.locator,
456 source.content_hash.as_deref(),
457 source.doi.as_deref(),
458 source.pmid.as_deref(),
459 &source.title,
460 );
461 if source.id != old_id {
462 remap.insert(old_id, source.id.clone());
463 }
464 }
465 if remap.is_empty() {
466 crate::project::recompute_stats(project);
467 return;
468 }
469 for atom in &mut project.evidence_atoms {
470 if let Some(new_source_id) = remap.get(&atom.source_id) {
471 atom.source_id = new_source_id.clone();
472 atom.id = evidence_atom_id(
473 &atom.source_id,
474 &atom.finding_id,
475 atom.locator.as_deref(),
476 &atom.measurement_or_claim,
477 &atom.evidence_type,
478 );
479 }
480 }
481 crate::project::recompute_stats(project);
482}
483
484pub fn source_record_for_finding(finding: &FindingBundle) -> SourceRecord {
485 let source_type = normalize_source_type(&finding.provenance.source_type);
486 let locator = source_locator(&finding.provenance, &finding.id);
487 let content_hash = None;
488 let id = source_id(
489 &source_type,
490 &locator,
491 content_hash.as_deref(),
492 finding.provenance.doi.as_deref(),
493 finding.provenance.pmid.as_deref(),
494 &finding.provenance.title,
495 );
496 let mut caveats = Vec::new();
497 if source_type == "synthetic_report" || source_type == "agent_trace" {
498 caveats.push("source requires human review before being treated as evidence".to_string());
499 }
500 if finding.provenance.title.trim().is_empty()
501 && finding.provenance.doi.is_none()
502 && finding.provenance.pmid.is_none()
503 {
504 caveats.push("weak source metadata; locator derived from finding id".to_string());
505 }
506 let source_quality = if caveats.is_empty()
507 && !finding.provenance.extraction.method.contains("fallback")
508 && !finding.provenance.extraction.method.contains("rough")
509 {
510 "declared".to_string()
511 } else if source_type == "synthetic_report" || source_type == "agent_trace" {
512 "synthetic".to_string()
513 } else {
514 "needs_review".to_string()
515 };
516 SourceRecord {
517 id,
518 source_type,
519 locator,
520 content_hash,
521 title: finding.provenance.title.clone(),
522 authors: finding
523 .provenance
524 .authors
525 .iter()
526 .map(|author| author.name.clone())
527 .collect(),
528 year: finding.provenance.year,
529 doi: finding.provenance.doi.clone(),
530 pmid: finding.provenance.pmid.clone(),
531 imported_at: finding.provenance.extraction.extracted_at.clone(),
532 extraction_mode: finding.provenance.extraction.method.clone(),
533 source_quality,
534 caveats,
535 finding_ids: vec![finding.id.clone()],
536 }
537}
538
539fn derive_source_records(project: &Project) -> Vec<SourceRecord> {
540 let mut by_id = BTreeMap::<String, SourceRecord>::new();
541
542 for finding in &project.findings {
543 let mut record = source_record_for_finding(finding);
544 if let Some(existing) = matching_existing_source(project, &record) {
545 record.source_type = existing.source_type.clone();
546 if existing.content_hash.is_some() {
547 record.content_hash = existing.content_hash.clone();
548 }
549 record.id = source_id(
550 &record.source_type,
551 &record.locator,
552 record.content_hash.as_deref(),
553 record.doi.as_deref(),
554 record.pmid.as_deref(),
555 &record.title,
556 );
557 for caveat in &existing.caveats {
558 push_unique(&mut record.caveats, caveat);
559 }
560 }
561 by_id
562 .entry(record.id.clone())
563 .and_modify(|existing| push_unique(&mut existing.finding_ids, &finding.id))
564 .or_insert(record);
565 }
566
567 for existing in &project.sources {
568 by_id
569 .entry(existing.id.clone())
570 .or_insert_with(|| existing.clone());
571 }
572
573 by_id.into_values().collect()
574}
575
576fn matching_existing_source<'a>(
577 project: &'a Project,
578 record: &SourceRecord,
579) -> Option<&'a SourceRecord> {
580 project.sources.iter().find(|existing| {
581 existing
582 .finding_ids
583 .iter()
584 .any(|id| record.finding_ids.iter().any(|record_id| record_id == id))
585 || (existing.locator == record.locator
586 && existing.title == record.title
587 && existing.doi == record.doi
588 && existing.pmid == record.pmid)
589 })
590}
591
592fn derive_evidence_atoms(
593 project: &Project,
594 sources: &[SourceRecord],
595 condition_records: &[ConditionRecord],
596) -> Vec<EvidenceAtom> {
597 let source_by_finding = sources
598 .iter()
599 .flat_map(|source| {
600 source
601 .finding_ids
602 .iter()
603 .map(move |finding_id| (finding_id.as_str(), source))
604 })
605 .collect::<BTreeMap<_, _>>();
606 let mut atoms = BTreeMap::<String, EvidenceAtom>::new();
607 for finding in &project.findings {
608 let source = source_by_finding
609 .get(finding.id.as_str())
610 .copied()
611 .cloned()
612 .unwrap_or_else(|| source_record_for_finding(finding));
613 let source_id = source.id.clone();
614 if finding.evidence.evidence_spans.is_empty() {
615 let atom = weak_atom(finding, &source_id, condition_records);
616 atoms.insert(atom.id.clone(), atom);
617 continue;
618 }
619 for (span_index, span) in finding.evidence.evidence_spans.iter().enumerate() {
620 let (locator, claim) = span_locator_and_claim(span, span_index);
621 let mut caveats = Vec::new();
622 if locator.is_none() {
623 caveats.push("missing evidence locator".to_string());
624 }
625 if finding.conditions.text.trim().is_empty() {
626 caveats.push("condition boundary missing on parent finding".to_string());
627 }
628 let atom = EvidenceAtom {
629 id: evidence_atom_id(
630 &source_id,
631 &finding.id,
632 locator.as_deref(),
633 &claim,
634 &finding.evidence.evidence_type,
635 ),
636 source_id: source_id.clone(),
637 finding_id: finding.id.clone(),
638 locator,
639 evidence_type: finding.evidence.evidence_type.clone(),
640 measurement_or_claim: claim,
641 supports_or_challenges: "supports".to_string(),
642 condition_refs: condition_refs(finding, condition_records),
643 extraction_method: finding.provenance.extraction.method.clone(),
644 human_verified: finding
645 .provenance
646 .review
647 .as_ref()
648 .is_some_and(|review| review.reviewed),
649 caveats,
650 };
651 atoms.insert(atom.id.clone(), atom);
652 }
653 }
654 for existing in &project.evidence_atoms {
665 let id = existing.id.clone();
666 match atoms.get_mut(&id) {
667 None => {
668 atoms.insert(id, existing.clone());
669 }
670 Some(derived) => {
671 if existing.locator.is_some() && derived.locator.is_none() {
672 derived.locator = existing.locator.clone();
673 derived.caveats.retain(|c| c != "missing evidence locator");
674 }
675 if existing.human_verified && !derived.human_verified {
676 derived.human_verified = true;
677 }
678 }
679 }
680 }
681 atoms.into_values().collect()
682}
683
684fn derive_condition_records(project: &Project) -> Vec<ConditionRecord> {
685 let mut records = BTreeMap::<String, ConditionRecord>::new();
686 for finding in &project.findings {
687 let record = condition_record_for_finding(finding);
688 records.insert(record.id.clone(), record);
689 }
690 for existing in &project.condition_records {
691 records
692 .entry(existing.id.clone())
693 .or_insert_with(|| existing.clone());
694 }
695 records.into_values().collect()
696}
697
698pub fn condition_record_for_finding(finding: &FindingBundle) -> ConditionRecord {
699 let text = finding.conditions.text.trim().to_string();
700 let species = finding
701 .conditions
702 .species_verified
703 .first()
704 .cloned()
705 .or_else(|| finding.evidence.species.clone());
706 let combined = format!(
707 "{} {} {} {} {}",
708 finding.assertion.text,
709 finding.evidence.evidence_type,
710 finding.evidence.model_system,
711 finding.evidence.method,
712 text
713 );
714 let exposure_or_efficacy = exposure_or_efficacy(&combined);
715 let comparator_status = comparator_status(&combined, finding);
716 let translation_scope = translation_scope(finding, &combined);
717 let mut caveats = Vec::new();
718 if text.is_empty() {
719 caveats.push("condition boundary missing".to_string());
720 }
721 if comparator_status == "missing_or_unclear" {
722 caveats.push("comparator or baseline missing or unclear".to_string());
723 }
724 if exposure_or_efficacy == "both" {
725 caveats.push(
726 "exposure and efficacy language both present; review for overgeneralization"
727 .to_string(),
728 );
729 }
730 if translation_scope == "animal_model" && mentions_human_translation(&combined) {
731 caveats.push(
732 "animal-model evidence is being discussed near human translation language".to_string(),
733 );
734 }
735 ConditionRecord {
736 id: condition_record_id(finding),
737 finding_id: finding.id.clone(),
738 text,
739 species,
740 model_system: finding.evidence.model_system.clone(),
741 method: finding.evidence.method.clone(),
742 in_vitro: finding.conditions.in_vitro,
743 in_vivo: finding.conditions.in_vivo,
744 human_data: finding.conditions.human_data,
745 clinical_trial: finding.conditions.clinical_trial,
746 exposure_or_efficacy,
747 comparator_status,
748 translation_scope,
749 caveats,
750 }
751}
752
753fn weak_atom(
754 finding: &FindingBundle,
755 source_id: &str,
756 condition_records: &[ConditionRecord],
757) -> EvidenceAtom {
758 let claim = finding.assertion.text.clone();
759 EvidenceAtom {
760 id: evidence_atom_id(
761 source_id,
762 &finding.id,
763 None,
764 &claim,
765 &finding.evidence.evidence_type,
766 ),
767 source_id: source_id.to_string(),
768 finding_id: finding.id.clone(),
769 locator: None,
770 evidence_type: finding.evidence.evidence_type.clone(),
771 measurement_or_claim: claim,
772 supports_or_challenges: "unknown".to_string(),
773 condition_refs: condition_refs(finding, condition_records),
774 extraction_method: finding.provenance.extraction.method.clone(),
775 human_verified: false,
776 caveats: vec!["missing evidence locator".to_string()],
777 }
778}
779
780fn span_locator_and_claim(span: &Value, span_index: usize) -> (Option<String>, String) {
781 if let Some(text) = span.as_str() {
782 let trimmed = text.trim().to_string();
783 let locator = if trimmed.is_empty() {
784 None
785 } else {
786 Some(format!("span:{span_index}"))
787 };
788 return (locator, trimmed);
789 }
790 if let Some(object) = span.as_object() {
791 let claim = object
792 .get("text")
793 .or_else(|| object.get("quote"))
794 .or_else(|| object.get("claim"))
795 .and_then(Value::as_str)
796 .unwrap_or("")
797 .trim()
798 .to_string();
799 let mut parts = Vec::new();
800 for key in [
801 "source", "section", "page", "row", "table", "figure", "start", "end",
802 ] {
803 if let Some(value) = object.get(key) {
804 let rendered = value
805 .as_str()
806 .map(str::to_string)
807 .unwrap_or_else(|| value.to_string());
808 if !rendered.trim().is_empty() {
809 parts.push(format!("{key}:{rendered}"));
810 }
811 }
812 }
813 let locator = if parts.is_empty() {
814 Some(format!("span:{span_index}"))
815 } else {
816 Some(parts.join("|"))
817 };
818 let claim = if claim.is_empty() {
819 span.to_string()
820 } else {
821 claim
822 };
823 return (locator, claim);
824 }
825 (Some(format!("span:{span_index}")), span.to_string())
826}
827
828fn condition_refs(finding: &FindingBundle, condition_records: &[ConditionRecord]) -> Vec<String> {
829 if let Some(record) = condition_records
830 .iter()
831 .find(|record| record.finding_id == finding.id)
832 {
833 return vec![record.id.clone()];
834 }
835 let text = finding.conditions.text.trim();
836 if text.is_empty() {
837 vec![format!("finding:{}", finding.id)]
838 } else {
839 vec![condition_record_id(finding)]
840 }
841}
842
843pub fn condition_record_id(finding: &FindingBundle) -> String {
844 let input = format!(
845 "{}|{}|{}|{}|{}",
846 finding.id,
847 finding.conditions.text.trim(),
848 finding.evidence.model_system,
849 finding.evidence.method,
850 finding.evidence.species.clone().unwrap_or_default()
851 );
852 format!("vcnd_{}", short_hash(input.as_bytes()))
853}
854
/// Classifies text by whether it uses exposure language, efficacy language,
/// both, or neither.
///
/// Matching is an ASCII-case-insensitive substring search. Returns
/// `"exposure"`, `"efficacy"`, `"both"`, or `"unknown"`.
fn exposure_or_efficacy(text: &str) -> String {
    const EXPOSURE_TERMS: [&str; 8] = [
        "exposure",
        "uptake",
        "transport",
        "delivery",
        "penetration",
        "brain level",
        "biodistribution",
        "concentration",
    ];
    const EFFICACY_TERMS: [&str; 8] = [
        "efficacy",
        "therapeutic",
        "functional",
        "cognition",
        "survival",
        "clinical",
        "symptom",
        "outcome",
    ];

    let haystack = text.to_ascii_lowercase();
    let mentions_any = |terms: &[&str]| terms.iter().any(|term| haystack.contains(*term));

    let has_exposure = mentions_any(&EXPOSURE_TERMS);
    let has_efficacy = mentions_any(&EFFICACY_TERMS);
    let label = if has_exposure && has_efficacy {
        "both"
    } else if has_exposure {
        "exposure"
    } else if has_efficacy {
        "efficacy"
    } else {
        "unknown"
    };
    label.to_string()
}
889
890fn comparator_status(text: &str, finding: &FindingBundle) -> String {
891 let lower = text.to_ascii_lowercase();
892 if [
893 "control",
894 "comparator",
895 "compared",
896 "versus",
897 "relative to",
898 "baseline",
899 "vs ",
900 ]
901 .iter()
902 .any(|needle| lower.contains(needle))
903 || finding.evidence.effect_size.is_some()
904 || finding.evidence.p_value.is_some()
905 {
906 "declared"
907 } else {
908 "missing_or_unclear"
909 }
910 .to_string()
911}
912
913fn translation_scope(finding: &FindingBundle, text: &str) -> String {
914 let lower = text.to_ascii_lowercase();
915 if finding.conditions.clinical_trial || finding.conditions.human_data {
916 return "human".to_string();
917 }
918 if finding.conditions.in_vivo
919 || finding
920 .evidence
921 .species
922 .as_deref()
923 .is_some_and(|species| !species.to_ascii_lowercase().contains("human"))
924 {
925 return "animal_model".to_string();
926 }
927 if finding.conditions.in_vitro
928 || lower.contains("cell")
929 || lower.contains("in vitro")
930 || lower.contains("organoid")
931 {
932 return "in_vitro".to_string();
933 }
934 if lower.contains("benchmark")
935 || lower.contains("dataset")
936 || lower.contains("simulation")
937 || lower.contains("computational")
938 {
939 return "computational".to_string();
940 }
941 "unspecified".to_string()
942}
943
/// Reports whether the text contains human-translation language: "human",
/// "clinical", "patient", or "therapeutic efficacy" (ASCII-case-insensitive
/// substring match).
fn mentions_human_translation(text: &str) -> bool {
    const HUMAN_TERMS: [&str; 4] = ["human", "clinical", "patient", "therapeutic efficacy"];

    let haystack = text.to_ascii_lowercase();
    for term in HUMAN_TERMS {
        if haystack.contains(term) {
            return true;
        }
    }
    false
}
950
/// Maps a raw source-type label onto its canonical name.
///
/// Matching is exact and case-sensitive. Any label not listed falls back to
/// `"paper"`.
fn normalize_source_type(source_type: &str) -> String {
    // Canonical name paired with every raw label that maps to it.
    const CANONICAL: [(&str, &[&str]); 12] = [
        ("paper", &["published_paper", "paper"]),
        ("csv", &["database_record", "curated_csv", "csv"]),
        ("pdf", &["pdf"]),
        ("jats", &["jats", "jats_xml"]),
        ("text", &["text", "markdown"]),
        ("note", &["note"]),
        ("doi", &["doi", "doi_list"]),
        ("agent_trace", &["agent_trace"]),
        ("benchmark_output", &["benchmark_output"]),
        ("notebook_entry", &["notebook_entry"]),
        ("experiment_log", &["experiment_log"]),
        (
            "synthetic_report",
            &["model_output", "summary", "synthesis", "synthetic_report"],
        ),
    ];

    for (canonical, aliases) in CANONICAL {
        if aliases.iter().any(|alias| *alias == source_type) {
            return canonical.to_string();
        }
    }
    "paper".to_string()
}
969
970fn source_locator(provenance: &Provenance, finding_id: &str) -> String {
971 provenance
972 .doi
973 .as_ref()
974 .map(|doi| format!("doi:{doi}"))
975 .or_else(|| provenance.pmid.as_ref().map(|pmid| format!("pmid:{pmid}")))
976 .or_else(|| provenance.pmc.as_ref().map(|pmc| format!("pmc:{pmc}")))
977 .or_else(|| {
978 (!provenance.title.trim().is_empty()).then(|| format!("title:{}", provenance.title))
979 })
980 .unwrap_or_else(|| format!("unknown-source:{finding_id}"))
981}
982
983pub fn source_id(
984 source_type: &str,
985 locator: &str,
986 content_hash: Option<&str>,
987 doi: Option<&str>,
988 pmid: Option<&str>,
989 title: &str,
990) -> String {
991 let mut input = String::new();
992 input.push_str(source_type);
993 input.push('|');
994 input.push_str(locator);
995 input.push('|');
996 input.push_str(content_hash.unwrap_or(""));
997 input.push('|');
998 input.push_str(doi.unwrap_or(""));
999 input.push('|');
1000 input.push_str(pmid.unwrap_or(""));
1001 input.push('|');
1002 input.push_str(title);
1003 format!("vs_{}", short_hash(input.as_bytes()))
1004}
1005
1006pub fn evidence_atom_id(
1007 source_id: &str,
1008 finding_id: &str,
1009 locator: Option<&str>,
1010 measurement_or_claim: &str,
1011 evidence_type: &str,
1012) -> String {
1013 let input = format!(
1014 "{source_id}|{finding_id}|{}|{measurement_or_claim}|{evidence_type}",
1015 locator.unwrap_or("")
1016 );
1017 format!("vea_{}", short_hash(input.as_bytes()))
1018}
1019
1020pub fn is_synthetic_source(source: &SourceRecord) -> bool {
1021 matches!(
1022 source.source_type.as_str(),
1023 "synthetic_report" | "agent_trace"
1024 )
1025}
1026
1027pub fn now_imported_at_fallback(value: &str) -> String {
1028 if value.trim().is_empty() {
1029 Utc::now().to_rfc3339()
1030 } else {
1031 value.to_string()
1032 }
1033}
1034
/// Adds `value` to `values` unless it is already present.
///
/// After an insertion the whole vector is sorted. When the value is already
/// present the vector is left exactly as it was.
fn push_unique(values: &mut Vec<String>, value: &str) {
    let already_present = values.iter().any(|item| item.as_str() == value);
    if already_present {
        return;
    }
    values.push(value.to_owned());
    values.sort();
}
1041
1042fn short_hash(bytes: &[u8]) -> String {
1043 let mut hasher = Sha256::new();
1044 hasher.update(bytes);
1045 let digest = hasher.finalize();
1046 hex::encode(&digest[..8])
1047}
1048
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bundle::*;

    /// Builds a mouse in-vivo finding from a published paper that carries
    /// exactly one evidence span.
    fn finding_with_span(span: Value) -> FindingBundle {
        let assertion = Assertion {
            text: "TfR targeting increases apparent brain exposure in mice.".to_string(),
            assertion_type: "mechanism".to_string(),
            entities: Vec::new(),
            relation: None,
            direction: None,
            causal_claim: None,
            causal_evidence_grade: None,
        };
        let evidence = Evidence {
            evidence_type: "experimental".to_string(),
            model_system: "mouse".to_string(),
            species: Some("Mus musculus".to_string()),
            method: "in vivo exposure assay".to_string(),
            sample_size: None,
            effect_size: None,
            p_value: None,
            replicated: false,
            replication_count: None,
            evidence_spans: vec![span],
        };
        let conditions = Conditions {
            text: "Mouse exposure assay; not human therapeutic efficacy.".to_string(),
            species_verified: vec!["Mus musculus".to_string()],
            species_unverified: Vec::new(),
            in_vitro: false,
            in_vivo: true,
            human_data: false,
            clinical_trial: false,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        };
        let provenance = Provenance {
            source_type: "published_paper".to_string(),
            doi: Some("10.0000/test".to_string()),
            pmid: None,
            pmc: None,
            openalex_id: None,
            url: None,
            title: "Test paper".to_string(),
            authors: vec![],
            year: Some(2026),
            journal: None,
            license: None,
            publisher: None,
            funders: vec![],
            extraction: Extraction::default(),
            review: None,
            citation_count: None,
        };
        let flags = Flags {
            gap: false,
            negative_space: false,
            contested: false,
            retracted: false,
            declining: false,
            gravity_well: false,
            review_state: None,
            superseded: false,
            signature_threshold: None,
            jointly_accepted: false,
        };

        FindingBundle {
            id: "vf_test".to_string(),
            version: 1,
            previous_version: None,
            assertion,
            evidence,
            conditions,
            confidence: Confidence::raw(0.6, "test", 0.8),
            provenance,
            flags,
            links: Vec::new(),
            annotations: vec![],
            attachments: vec![],
            created: "2026-01-01T00:00:00Z".to_string(),
            updated: None,
            access_tier: crate::access_tier::AccessTier::Public,
        }
    }

    /// Assembles a one-finding project and derives its projection.
    fn projection_of(finding: FindingBundle) -> SourceEvidenceProjection {
        let project = crate::project::assemble("test", vec![finding], 1, 0, "test");
        derive_projection(&project)
    }

    #[test]
    fn projection_distinguishes_sources_from_evidence_atoms() {
        let projection = projection_of(finding_with_span(json!({
            "text": "Brain exposure increased in mice.",
            "section": "results",
            "page": 4
        })));

        assert_eq!(projection.sources.len(), 1);
        assert_eq!(projection.evidence_atoms.len(), 1);
        assert_eq!(projection.condition_records.len(), 1);

        let source = &projection.sources[0];
        let atom = &projection.evidence_atoms[0];
        let condition = &projection.condition_records[0];

        // Each kind of record lives in its own id namespace.
        assert!(source.id.starts_with("vs_"));
        assert!(atom.id.starts_with("vea_"));
        assert!(condition.id.starts_with("vcnd_"));

        // The atom links back to both the source and the condition record.
        assert_eq!(atom.source_id, source.id);
        assert_eq!(atom.condition_refs, vec![condition.id.clone()]);
        assert_eq!(atom.locator.as_deref(), Some("section:results|page:4"));
    }

    #[test]
    fn missing_span_creates_weak_atom_with_caveat() {
        let mut finding = finding_with_span(json!({"text": "unused"}));
        finding.evidence.evidence_spans.clear();

        let projection = projection_of(finding);
        assert_eq!(projection.evidence_atoms.len(), 1);

        let atom = &projection.evidence_atoms[0];
        assert!(atom.locator.is_none());
        assert_eq!(atom.supports_or_challenges, "unknown");
        assert!(atom.caveats.iter().any(|c| c == "missing evidence locator"));
    }

    #[test]
    fn condition_record_flags_exposure_efficacy_boundary() {
        let finding = finding_with_span(json!({
            "text": "Brain exposure and therapeutic efficacy increased in mice.",
            "section": "results"
        }));

        let record = condition_record_for_finding(&finding);
        assert_eq!(record.exposure_or_efficacy, "both");
        assert_eq!(record.translation_scope, "animal_model");
        assert!(
            record
                .caveats
                .iter()
                .any(|caveat| caveat.contains("overgeneralization"))
        );
    }
}