Skip to main content

vela_protocol/
artifact_audit.rs

1//! Artifact proof-readiness checks for frontier-owned files and pointers.
2
3use std::collections::{BTreeMap, HashSet};
4use std::fs;
5use std::path::{Path, PathBuf};
6
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10use crate::bundle::Artifact;
11use crate::project::Project;
12
/// Summary report produced by [`audit_artifacts`]: per-kind / per-storage-mode
/// counts plus the full list of issues found across the project's artifacts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArtifactAudit {
    /// True when `issues` is empty.
    pub ok: bool,
    /// Name of the command that produced this report ("artifact-audit").
    pub command: String,
    /// Display form of the audited frontier path.
    pub frontier: String,
    /// Total number of artifacts examined.
    pub artifact_count: usize,
    /// How many local blobs were actually read from disk.
    pub checked_local_blobs: usize,
    /// Total bytes read across all checked local blobs.
    pub local_blob_bytes: u64,
    /// Artifact count keyed by `kind` (sorted for stable serialization).
    pub by_kind: BTreeMap<String, usize>,
    /// Artifact count keyed by `storage_mode` (sorted for stable serialization).
    pub by_storage_mode: BTreeMap<String, usize>,
    /// Redundant with `issues.len()`; kept for consumers of the serialized form.
    pub issue_count: usize,
    pub issues: Vec<ArtifactAuditIssue>,
}
26
/// One problem found on one artifact.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArtifactAuditIssue {
    /// Id of the offending artifact.
    pub id: String,
    /// Dotted field path the issue refers to (e.g. "metadata.nct_id").
    pub field: String,
    /// Human-readable description of the problem.
    pub message: String,
}
33
34pub fn audit_artifacts(source: &Path, project: &Project) -> ArtifactAudit {
35    let root = artifact_root(source);
36    let finding_ids = project
37        .findings
38        .iter()
39        .map(|finding| finding.id.as_str())
40        .collect::<HashSet<_>>();
41    let mut issues = Vec::new();
42    let mut by_kind = BTreeMap::new();
43    let mut by_storage_mode = BTreeMap::new();
44    let mut checked_local_blobs = 0usize;
45    let mut local_blob_bytes = 0u64;
46
47    for artifact in &project.artifacts {
48        *by_kind.entry(artifact.kind.clone()).or_insert(0) += 1;
49        *by_storage_mode
50            .entry(artifact.storage_mode.clone())
51            .or_insert(0) += 1;
52        audit_artifact_shape(artifact, &finding_ids, &mut issues);
53        if matches!(artifact.storage_mode.as_str(), "local_blob" | "local_file") {
54            if let Some(root) = root.as_deref() {
55                if let Some((checked, bytes)) = audit_local_blob(root, artifact, &mut issues) {
56                    checked_local_blobs += usize::from(checked);
57                    local_blob_bytes += bytes;
58                }
59            } else {
60                push_issue(
61                    &mut issues,
62                    &artifact.id,
63                    "locator",
64                    "local artifact cannot be checked without a frontier directory",
65                );
66            }
67        }
68    }
69
70    ArtifactAudit {
71        ok: issues.is_empty(),
72        command: "artifact-audit".to_string(),
73        frontier: source.display().to_string(),
74        artifact_count: project.artifacts.len(),
75        checked_local_blobs,
76        local_blob_bytes,
77        by_kind,
78        by_storage_mode,
79        issue_count: issues.len(),
80        issues,
81    }
82}
83
84fn audit_artifact_shape(
85    artifact: &Artifact,
86    finding_ids: &HashSet<&str>,
87    issues: &mut Vec<ArtifactAuditIssue>,
88) {
89    if !artifact.id.starts_with("va_") {
90        push_issue(
91            issues,
92            &artifact.id,
93            "id",
94            "artifact id must start with va_",
95        );
96    }
97    if !is_sha256(&artifact.content_hash) {
98        push_issue(
99            issues,
100            &artifact.id,
101            "content_hash",
102            "content_hash must be sha256:<64 lowercase hex>",
103        );
104    }
105    if artifact.license.as_deref().unwrap_or("").trim().is_empty()
106        && artifact
107            .provenance
108            .license
109            .as_deref()
110            .unwrap_or("")
111            .trim()
112            .is_empty()
113    {
114        push_issue(
115            issues,
116            &artifact.id,
117            "license",
118            "artifact must declare license or access terms",
119        );
120    }
121    if artifact.target_findings.is_empty() {
122        push_issue(
123            issues,
124            &artifact.id,
125            "target_findings",
126            "artifact must target at least one finding",
127        );
128    }
129    for finding_id in &artifact.target_findings {
130        if !finding_ids.contains(finding_id.as_str()) {
131            push_issue(
132                issues,
133                &artifact.id,
134                "target_findings",
135                format!("unknown finding id: {finding_id}"),
136            );
137        }
138    }
139    if matches!(artifact.storage_mode.as_str(), "remote" | "pointer")
140        && artifact.locator.is_none()
141        && artifact.source_url.is_none()
142    {
143        push_issue(
144            issues,
145            &artifact.id,
146            "locator",
147            "remote or pointer artifact must have locator or source_url",
148        );
149    }
150    for (field, value) in [
151        ("source_url", artifact.source_url.as_deref()),
152        ("provenance.url", artifact.provenance.url.as_deref()),
153    ] {
154        if let Some(url) = value
155            && !is_http_url(url)
156        {
157            push_issue(
158                issues,
159                &artifact.id,
160                field,
161                format!("{field} must be http(s): {url}"),
162            );
163        }
164    }
165    audit_profile_fields(artifact, issues);
166}
167
168fn audit_profile_fields(artifact: &Artifact, issues: &mut Vec<ArtifactAuditIssue>) {
169    match artifact.kind.as_str() {
170        "clinical_trial_record" => {
171            let has_nct = metadata_string(artifact, "nct_id")
172                .or_else(|| metadata_string(artifact, "nct"))
173                .is_some()
174                || metadata_array_contains_nct(artifact, "nct_ids")
175                || artifact
176                    .source_url
177                    .as_deref()
178                    .or(artifact.locator.as_deref())
179                    .is_some_and(contains_nct_id);
180            if !has_nct {
181                push_issue(
182                    issues,
183                    &artifact.id,
184                    "metadata.nct_id",
185                    "clinical trial artifacts must carry or point to an NCT id",
186                );
187            }
188        }
189        "dataset" => {
190            let has_dataset_id = ["accession", "dataset_id", "repository", "registry"]
191                .iter()
192                .any(|key| metadata_string(artifact, key).is_some());
193            if !has_dataset_id && artifact.source_url.is_none() && artifact.locator.is_none() {
194                push_issue(
195                    issues,
196                    &artifact.id,
197                    "metadata",
198                    "dataset artifacts must carry an accession, repository, locator, or source_url",
199                );
200            }
201        }
202        "code" => {
203            let has_commit = metadata_string(artifact, "commit").is_some();
204            let has_pinned_blob =
205                matches!(artifact.storage_mode.as_str(), "local_blob" | "local_file")
206                    && is_sha256(&artifact.content_hash);
207            if !has_commit && !has_pinned_blob {
208                push_issue(
209                    issues,
210                    &artifact.id,
211                    "metadata.commit",
212                    "remote code artifacts should pin a commit, release tag, or equivalent version",
213                );
214            }
215        }
216        "registry_record" => {
217            if artifact.source_url.is_none()
218                && artifact.locator.is_none()
219                && artifact.provenance.url.is_none()
220            {
221                push_issue(
222                    issues,
223                    &artifact.id,
224                    "source_url",
225                    "registry records must point to an upstream registry page",
226                );
227            }
228        }
229        _ => {}
230    }
231}
232
233fn audit_local_blob(
234    root: &Path,
235    artifact: &Artifact,
236    issues: &mut Vec<ArtifactAuditIssue>,
237) -> Option<(bool, u64)> {
238    let Some(locator) = artifact.locator.as_deref() else {
239        push_issue(
240            issues,
241            &artifact.id,
242            "locator",
243            "local artifact must have a locator",
244        );
245        return None;
246    };
247    let blob_path = resolve_locator(root, locator);
248    let Ok(bytes) = fs::read(&blob_path) else {
249        push_issue(
250            issues,
251            &artifact.id,
252            "locator",
253            format!("local blob not found: {locator}"),
254        );
255        return None;
256    };
257    if is_sha256(&artifact.content_hash) {
258        let actual = format!("sha256:{}", hex::encode(Sha256::digest(&bytes)));
259        if actual != artifact.content_hash {
260            push_issue(
261                issues,
262                &artifact.id,
263                "content_hash",
264                format!("local blob hash mismatch: {actual}"),
265            );
266        }
267    }
268    if let Some(expected_size) = artifact.size_bytes
269        && expected_size != bytes.len() as u64
270    {
271        push_issue(
272            issues,
273            &artifact.id,
274            "size_bytes",
275            format!("expected {expected_size}, found {}", bytes.len()),
276        );
277    }
278    Some((true, bytes.len() as u64))
279}
280
/// Resolves the directory that local artifact locators are relative to:
/// `source` itself when it is a directory, otherwise its parent directory.
/// Returns `None` only when `source` is a file path with no parent.
fn artifact_root(source: &Path) -> Option<PathBuf> {
    match source.is_dir() {
        true => Some(source.to_path_buf()),
        false => source.parent().map(Path::to_path_buf),
    }
}
287
/// Joins a relative locator onto `root`; absolute locators pass through
/// unchanged.
fn resolve_locator(root: &Path, locator: &str) -> PathBuf {
    let candidate = Path::new(locator);
    if candidate.is_absolute() {
        return candidate.to_path_buf();
    }
    root.join(candidate)
}
296
/// Returns true when `value` is `sha256:` followed by exactly 64 lowercase
/// hex digits. Uppercase hex is deliberately rejected.
fn is_sha256(value: &str) -> bool {
    match value.strip_prefix("sha256:") {
        Some(hex) => {
            hex.len() == 64 && hex.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
        }
        None => false,
    }
}
306
/// Accepts only `http://` and `https://` URLs.
fn is_http_url(value: &str) -> bool {
    ["https://", "http://"]
        .iter()
        .any(|scheme| value.starts_with(scheme))
}
310
/// Detects an embedded ClinicalTrials.gov registry id: the literal `NCT`
/// immediately followed by eight ASCII digits, anywhere in `value`.
fn contains_nct_id(value: &str) -> bool {
    value.as_bytes().windows(11).any(|window| {
        let (prefix, digits) = window.split_at(3);
        prefix == b"NCT" && digits.iter().all(u8::is_ascii_digit)
    })
}
317
318fn metadata_string<'a>(artifact: &'a Artifact, key: &str) -> Option<&'a str> {
319    artifact
320        .metadata
321        .get(key)
322        .and_then(serde_json::Value::as_str)
323}
324
325fn metadata_array_contains_nct(artifact: &Artifact, key: &str) -> bool {
326    artifact
327        .metadata
328        .get(key)
329        .and_then(serde_json::Value::as_array)
330        .is_some_and(|items| {
331            items
332                .iter()
333                .filter_map(serde_json::Value::as_str)
334                .any(contains_nct_id)
335        })
336}
337
338fn push_issue(
339    issues: &mut Vec<ArtifactAuditIssue>,
340    id: &str,
341    field: impl Into<String>,
342    message: impl Into<String>,
343) {
344    issues.push(ArtifactAuditIssue {
345        id: id.to_string(),
346        field: field.into(),
347        message: message.into(),
348    });
349}
350
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;

    use serde_json::json;

    use super::*;
    use crate::access_tier::AccessTier;
    use crate::bundle::{
        Assertion, Conditions, Confidence, Evidence, Extraction, Flags, Provenance,
    };
    use crate::project;

    // Happy path: a well-formed local_blob artifact whose on-disk bytes match
    // the declared hash and size yields a clean audit with one checked blob.
    #[test]
    fn local_blob_hash_and_size_are_checked() {
        let dir = tempfile::tempdir().expect("tempdir");
        // Blobs live under the conventional .vela/artifact-blobs/sha256 tree,
        // keyed by their hex digest.
        let blob_dir = dir.path().join(".vela/artifact-blobs/sha256");
        fs::create_dir_all(&blob_dir).expect("blob dir");
        let bytes = b"{\"ok\":true}\n";
        let digest = format!("sha256:{}", hex::encode(Sha256::digest(bytes)));
        let hex = digest.trim_start_matches("sha256:").to_string();
        fs::write(blob_dir.join(&hex), bytes).expect("write blob");

        let mut project = project_with_one_finding();
        let target = project.findings[0].id.clone();
        project.artifacts.push(
            Artifact::new(
                "clinical_trial_record",
                "CLARITY AD registry record",
                digest,
                Some(bytes.len() as u64),
                Some("application/json".to_string()),
                "local_blob",
                Some(format!(".vela/artifact-blobs/sha256/{hex}")),
                Some("https://clinicaltrials.gov/study/NCT03887455".to_string()),
                Some("ClinicalTrials.gov public record".to_string()),
                vec![target],
                Provenance {
                    source_type: "database_record".to_string(),
                    doi: None,
                    pmid: None,
                    pmc: None,
                    openalex_id: None,
                    title: "ClinicalTrials.gov NCT03887455".to_string(),
                    authors: vec![],
                    year: None,
                    journal: None,
                    url: Some("https://clinicaltrials.gov/study/NCT03887455".to_string()),
                    license: Some("ClinicalTrials.gov public record".to_string()),
                    publisher: None,
                    funders: vec![],
                    extraction: test_extraction(),
                    review: None,
                    citation_count: None,
                },
                BTreeMap::from([("nct_id".to_string(), json!("NCT03887455"))]),
                AccessTier::Public,
            )
            .expect("artifact"),
        );

        let audit = audit_artifacts(dir.path(), &project);
        assert!(audit.ok, "{:?}", audit.issues);
        assert_eq!(audit.checked_local_blobs, 1);
        assert_eq!(audit.local_blob_bytes, bytes.len() as u64);
    }

    // A pointer-mode "code" artifact with no `commit` metadata and no local
    // blob must be flagged on the metadata.commit field.
    #[test]
    fn missing_profile_fields_are_reported() {
        let mut project = project_with_one_finding();
        let target = project.findings[0].id.clone();
        project.artifacts.push(
            Artifact::new(
                "code",
                "unpinned analysis repository",
                "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                None,
                None,
                "pointer",
                Some("https://github.com/example/analysis".to_string()),
                Some("https://github.com/example/analysis".to_string()),
                Some("MIT".to_string()),
                vec![target],
                Provenance {
                    source_type: "database_record".to_string(),
                    doi: None,
                    pmid: None,
                    pmc: None,
                    openalex_id: None,
                    title: "analysis repository".to_string(),
                    authors: vec![],
                    year: None,
                    journal: None,
                    url: Some("https://github.com/example/analysis".to_string()),
                    license: Some("MIT".to_string()),
                    publisher: None,
                    funders: vec![],
                    extraction: test_extraction(),
                    review: None,
                    citation_count: None,
                },
                BTreeMap::new(),
                AccessTier::Public,
            )
            .expect("artifact"),
        );

        let audit = audit_artifacts(Path::new("."), &project);
        assert!(!audit.ok);
        assert!(
            audit
                .issues
                .iter()
                .any(|issue| issue.field == "metadata.commit")
        );
    }

    // Minimal project fixture: a single finding that artifacts can target.
    fn project_with_one_finding() -> Project {
        let finding = crate::bundle::FindingBundle::new(
            Assertion {
                text: "Lecanemab trial records belong in the frontier.".to_string(),
                assertion_type: "treatment_effect".to_string(),
                entities: vec![],
                relation: Some("has_registry_record".to_string()),
                direction: None,
                causal_claim: None,
                causal_evidence_grade: None,
            },
            Evidence {
                evidence_type: "observational".to_string(),
                model_system: "registry".to_string(),
                species: Some("Homo sapiens".to_string()),
                method: "manual test".to_string(),
                sample_size: None,
                effect_size: None,
                p_value: None,
                replicated: false,
                replication_count: None,
                evidence_spans: vec![],
            },
            test_conditions(),
            Confidence::raw(0.6, "test", 0.6),
            Provenance {
                source_type: "database_record".to_string(),
                doi: None,
                pmid: None,
                pmc: None,
                openalex_id: None,
                url: None,
                title: "test".to_string(),
                authors: vec![],
                year: None,
                journal: None,
                license: Some("test".to_string()),
                publisher: None,
                funders: vec![],
                extraction: test_extraction(),
                review: None,
                citation_count: None,
            },
            Flags::default(),
        );
        project::assemble("artifact audit test", vec![finding], 1, 0, "test")
    }

    // Placeholder experimental-conditions record for the finding fixture.
    fn test_conditions() -> Conditions {
        Conditions {
            text: "test condition".to_string(),
            species_verified: vec!["Homo sapiens".to_string()],
            species_unverified: vec![],
            in_vitro: false,
            in_vivo: false,
            human_data: true,
            clinical_trial: true,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        }
    }

    // Placeholder extraction-provenance record shared by both fixtures.
    fn test_extraction() -> Extraction {
        Extraction {
            method: "manual".to_string(),
            model: None,
            model_version: None,
            extracted_at: "2026-05-06T00:00:00Z".to_string(),
            extractor_version: "test".to_string(),
        }
    }
}