1use std::collections::{BTreeMap, HashSet};
4use std::fs;
5use std::path::{Path, PathBuf};
6
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10use crate::bundle::Artifact;
11use crate::project::Project;
12
/// Serializable summary of one artifact audit run, suitable for emitting
/// as JSON from the `artifact-audit` command.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArtifactAudit {
    /// True when no issues were recorded.
    pub ok: bool,
    /// Name of the producing command (always "artifact-audit").
    pub command: String,
    /// Display form of the frontier path that was audited.
    pub frontier: String,
    /// Total number of artifacts in the audited project.
    pub artifact_count: usize,
    /// How many local blobs were actually read from disk and verified.
    pub checked_local_blobs: usize,
    /// Combined size in bytes of the local blobs that were read.
    pub local_blob_bytes: u64,
    /// Artifact counts grouped by `kind` (BTreeMap for stable JSON ordering).
    pub by_kind: BTreeMap<String, usize>,
    /// Artifact counts grouped by `storage_mode`.
    pub by_storage_mode: BTreeMap<String, usize>,
    /// Convenience copy of `issues.len()`.
    pub issue_count: usize,
    /// Every problem found during the audit.
    pub issues: Vec<ArtifactAuditIssue>,
}
26
/// A single problem found while auditing one artifact.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArtifactAuditIssue {
    /// Id of the artifact the issue belongs to.
    pub id: String,
    /// The offending field, possibly dotted (e.g. "metadata.nct_id").
    pub field: String,
    /// Human-readable description of the problem.
    pub message: String,
}
33
34pub fn audit_artifacts(source: &Path, project: &Project) -> ArtifactAudit {
35 let root = artifact_root(source);
36 let finding_ids = project
37 .findings
38 .iter()
39 .map(|finding| finding.id.as_str())
40 .collect::<HashSet<_>>();
41 let mut issues = Vec::new();
42 let mut by_kind = BTreeMap::new();
43 let mut by_storage_mode = BTreeMap::new();
44 let mut checked_local_blobs = 0usize;
45 let mut local_blob_bytes = 0u64;
46
47 for artifact in &project.artifacts {
48 *by_kind.entry(artifact.kind.clone()).or_insert(0) += 1;
49 *by_storage_mode
50 .entry(artifact.storage_mode.clone())
51 .or_insert(0) += 1;
52 audit_artifact_shape(artifact, &finding_ids, &mut issues);
53 if matches!(artifact.storage_mode.as_str(), "local_blob" | "local_file") {
54 if let Some(root) = root.as_deref() {
55 if let Some((checked, bytes)) = audit_local_blob(root, artifact, &mut issues) {
56 checked_local_blobs += usize::from(checked);
57 local_blob_bytes += bytes;
58 }
59 } else {
60 push_issue(
61 &mut issues,
62 &artifact.id,
63 "locator",
64 "local artifact cannot be checked without a frontier directory",
65 );
66 }
67 }
68 }
69
70 ArtifactAudit {
71 ok: issues.is_empty(),
72 command: "artifact-audit".to_string(),
73 frontier: source.display().to_string(),
74 artifact_count: project.artifacts.len(),
75 checked_local_blobs,
76 local_blob_bytes,
77 by_kind,
78 by_storage_mode,
79 issue_count: issues.len(),
80 issues,
81 }
82}
83
84fn audit_artifact_shape(
85 artifact: &Artifact,
86 finding_ids: &HashSet<&str>,
87 issues: &mut Vec<ArtifactAuditIssue>,
88) {
89 if !artifact.id.starts_with("va_") {
90 push_issue(
91 issues,
92 &artifact.id,
93 "id",
94 "artifact id must start with va_",
95 );
96 }
97 if !is_sha256(&artifact.content_hash) {
98 push_issue(
99 issues,
100 &artifact.id,
101 "content_hash",
102 "content_hash must be sha256:<64 lowercase hex>",
103 );
104 }
105 if artifact.license.as_deref().unwrap_or("").trim().is_empty()
106 && artifact
107 .provenance
108 .license
109 .as_deref()
110 .unwrap_or("")
111 .trim()
112 .is_empty()
113 {
114 push_issue(
115 issues,
116 &artifact.id,
117 "license",
118 "artifact must declare license or access terms",
119 );
120 }
121 if artifact.target_findings.is_empty() {
122 push_issue(
123 issues,
124 &artifact.id,
125 "target_findings",
126 "artifact must target at least one finding",
127 );
128 }
129 for finding_id in &artifact.target_findings {
130 if !finding_ids.contains(finding_id.as_str()) {
131 push_issue(
132 issues,
133 &artifact.id,
134 "target_findings",
135 format!("unknown finding id: {finding_id}"),
136 );
137 }
138 }
139 if matches!(artifact.storage_mode.as_str(), "remote" | "pointer")
140 && artifact.locator.is_none()
141 && artifact.source_url.is_none()
142 {
143 push_issue(
144 issues,
145 &artifact.id,
146 "locator",
147 "remote or pointer artifact must have locator or source_url",
148 );
149 }
150 for (field, value) in [
151 ("source_url", artifact.source_url.as_deref()),
152 ("provenance.url", artifact.provenance.url.as_deref()),
153 ] {
154 if let Some(url) = value
155 && !is_http_url(url)
156 {
157 push_issue(
158 issues,
159 &artifact.id,
160 field,
161 format!("{field} must be http(s): {url}"),
162 );
163 }
164 }
165 audit_profile_fields(artifact, issues);
166}
167
168fn audit_profile_fields(artifact: &Artifact, issues: &mut Vec<ArtifactAuditIssue>) {
169 match artifact.kind.as_str() {
170 "clinical_trial_record" => {
171 let has_nct = metadata_string(artifact, "nct_id")
172 .or_else(|| metadata_string(artifact, "nct"))
173 .is_some()
174 || metadata_array_contains_nct(artifact, "nct_ids")
175 || artifact
176 .source_url
177 .as_deref()
178 .or(artifact.locator.as_deref())
179 .is_some_and(contains_nct_id);
180 if !has_nct {
181 push_issue(
182 issues,
183 &artifact.id,
184 "metadata.nct_id",
185 "clinical trial artifacts must carry or point to an NCT id",
186 );
187 }
188 }
189 "dataset" => {
190 let has_dataset_id = ["accession", "dataset_id", "repository", "registry"]
191 .iter()
192 .any(|key| metadata_string(artifact, key).is_some());
193 if !has_dataset_id && artifact.source_url.is_none() && artifact.locator.is_none() {
194 push_issue(
195 issues,
196 &artifact.id,
197 "metadata",
198 "dataset artifacts must carry an accession, repository, locator, or source_url",
199 );
200 }
201 }
202 "code" => {
203 let has_commit = metadata_string(artifact, "commit").is_some();
204 let has_pinned_blob =
205 matches!(artifact.storage_mode.as_str(), "local_blob" | "local_file")
206 && is_sha256(&artifact.content_hash);
207 if !has_commit && !has_pinned_blob {
208 push_issue(
209 issues,
210 &artifact.id,
211 "metadata.commit",
212 "remote code artifacts should pin a commit, release tag, or equivalent version",
213 );
214 }
215 }
216 "registry_record" => {
217 if artifact.source_url.is_none()
218 && artifact.locator.is_none()
219 && artifact.provenance.url.is_none()
220 {
221 push_issue(
222 issues,
223 &artifact.id,
224 "source_url",
225 "registry records must point to an upstream registry page",
226 );
227 }
228 }
229 _ => {}
230 }
231}
232
233fn audit_local_blob(
234 root: &Path,
235 artifact: &Artifact,
236 issues: &mut Vec<ArtifactAuditIssue>,
237) -> Option<(bool, u64)> {
238 let Some(locator) = artifact.locator.as_deref() else {
239 push_issue(
240 issues,
241 &artifact.id,
242 "locator",
243 "local artifact must have a locator",
244 );
245 return None;
246 };
247 let blob_path = resolve_locator(root, locator);
248 let Ok(bytes) = fs::read(&blob_path) else {
249 push_issue(
250 issues,
251 &artifact.id,
252 "locator",
253 format!("local blob not found: {locator}"),
254 );
255 return None;
256 };
257 if is_sha256(&artifact.content_hash) {
258 let actual = format!("sha256:{}", hex::encode(Sha256::digest(&bytes)));
259 if actual != artifact.content_hash {
260 push_issue(
261 issues,
262 &artifact.id,
263 "content_hash",
264 format!("local blob hash mismatch: {actual}"),
265 );
266 }
267 }
268 if let Some(expected_size) = artifact.size_bytes
269 && expected_size != bytes.len() as u64
270 {
271 push_issue(
272 issues,
273 &artifact.id,
274 "size_bytes",
275 format!("expected {expected_size}, found {}", bytes.len()),
276 );
277 }
278 Some((true, bytes.len() as u64))
279}
280
/// Resolve the directory that relative artifact locators are joined against:
/// a directory frontier is used as-is, a file frontier falls back to its
/// parent directory (`None` when there is no parent).
fn artifact_root(source: &Path) -> Option<PathBuf> {
    source
        .is_dir()
        .then(|| source.to_path_buf())
        .or_else(|| source.parent().map(Path::to_path_buf))
}
287
/// Turn an artifact locator into a concrete path: relative locators are
/// joined onto `root`, absolute locators are used verbatim.
fn resolve_locator(root: &Path, locator: &str) -> PathBuf {
    let candidate = Path::new(locator);
    if candidate.is_relative() {
        root.join(candidate)
    } else {
        candidate.to_path_buf()
    }
}
296
/// True when `value` is `sha256:` followed by exactly 64 lowercase hex
/// characters. Uppercase hex is deliberately rejected.
fn is_sha256(value: &str) -> bool {
    value.strip_prefix("sha256:").is_some_and(|digest| {
        digest.len() == 64
            && digest
                .bytes()
                .all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
    })
}
306
/// True when `value` begins with an `http://` or `https://` scheme.
fn is_http_url(value: &str) -> bool {
    ["https://", "http://"]
        .iter()
        .any(|scheme| value.starts_with(scheme))
}
310
/// True when `value` contains an NCT registry id anywhere: the literal
/// bytes `NCT` followed immediately by eight ASCII digits (11 bytes total).
fn contains_nct_id(value: &str) -> bool {
    let bytes = value.as_bytes();
    // Slide an 11-byte window across the input; shorter inputs yield no
    // candidate positions at all.
    (0..bytes.len().saturating_sub(10)).any(|start| {
        let window = &bytes[start..start + 11];
        window.starts_with(b"NCT") && window[3..].iter().all(|b| b.is_ascii_digit())
    })
}
317
318fn metadata_string<'a>(artifact: &'a Artifact, key: &str) -> Option<&'a str> {
319 artifact
320 .metadata
321 .get(key)
322 .and_then(serde_json::Value::as_str)
323}
324
325fn metadata_array_contains_nct(artifact: &Artifact, key: &str) -> bool {
326 artifact
327 .metadata
328 .get(key)
329 .and_then(serde_json::Value::as_array)
330 .is_some_and(|items| {
331 items
332 .iter()
333 .filter_map(serde_json::Value::as_str)
334 .any(contains_nct_id)
335 })
336}
337
338fn push_issue(
339 issues: &mut Vec<ArtifactAuditIssue>,
340 id: &str,
341 field: impl Into<String>,
342 message: impl Into<String>,
343) {
344 issues.push(ArtifactAuditIssue {
345 id: id.to_string(),
346 field: field.into(),
347 message: message.into(),
348 });
349}
350
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;

    use serde_json::json;

    use super::*;
    use crate::access_tier::AccessTier;
    use crate::bundle::{
        Assertion, Conditions, Confidence, Evidence, Extraction, Flags, Provenance,
    };
    use crate::project;

    /// Happy path: a local blob whose bytes match the artifact's declared
    /// hash and size audits cleanly and is counted in the blob totals.
    #[test]
    fn local_blob_hash_and_size_are_checked() {
        let dir = tempfile::tempdir().expect("tempdir");
        let blob_dir = dir.path().join(".vela/artifact-blobs/sha256");
        fs::create_dir_all(&blob_dir).expect("blob dir");
        let bytes = b"{\"ok\":true}\n";
        let digest = format!("sha256:{}", hex::encode(Sha256::digest(bytes)));
        let hex = digest.trim_start_matches("sha256:").to_string();
        // Blobs are stored content-addressed: the file name is the hex digest.
        fs::write(blob_dir.join(&hex), bytes).expect("write blob");

        let mut project = project_with_one_finding();
        let target = project.findings[0].id.clone();
        project.artifacts.push(
            Artifact::new(
                "clinical_trial_record",
                "CLARITY AD registry record",
                digest,
                Some(bytes.len() as u64),
                Some("application/json".to_string()),
                "local_blob",
                // Locator is relative to the frontier dir passed below.
                Some(format!(".vela/artifact-blobs/sha256/{hex}")),
                Some("https://clinicaltrials.gov/study/NCT03887455".to_string()),
                Some("ClinicalTrials.gov public record".to_string()),
                vec![target],
                Provenance {
                    source_type: "database_record".to_string(),
                    doi: None,
                    pmid: None,
                    pmc: None,
                    openalex_id: None,
                    title: "ClinicalTrials.gov NCT03887455".to_string(),
                    authors: vec![],
                    year: None,
                    journal: None,
                    url: Some("https://clinicaltrials.gov/study/NCT03887455".to_string()),
                    license: Some("ClinicalTrials.gov public record".to_string()),
                    publisher: None,
                    funders: vec![],
                    extraction: test_extraction(),
                    review: None,
                    citation_count: None,
                },
                // Satisfies the clinical_trial_record profile check.
                BTreeMap::from([("nct_id".to_string(), json!("NCT03887455"))]),
                AccessTier::Public,
            )
            .expect("artifact"),
        );

        let audit = audit_artifacts(dir.path(), &project);
        assert!(audit.ok, "{:?}", audit.issues);
        assert_eq!(audit.checked_local_blobs, 1);
        assert_eq!(audit.local_blob_bytes, bytes.len() as u64);
    }

    /// A pointer-mode "code" artifact with no pinned commit in its metadata
    /// must produce a `metadata.commit` issue and fail the audit.
    #[test]
    fn missing_profile_fields_are_reported() {
        let mut project = project_with_one_finding();
        let target = project.findings[0].id.clone();
        project.artifacts.push(
            Artifact::new(
                "code",
                "unpinned analysis repository",
                "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                None,
                None,
                "pointer",
                Some("https://github.com/example/analysis".to_string()),
                Some("https://github.com/example/analysis".to_string()),
                Some("MIT".to_string()),
                vec![target],
                Provenance {
                    source_type: "database_record".to_string(),
                    doi: None,
                    pmid: None,
                    pmc: None,
                    openalex_id: None,
                    title: "analysis repository".to_string(),
                    authors: vec![],
                    year: None,
                    journal: None,
                    url: Some("https://github.com/example/analysis".to_string()),
                    license: Some("MIT".to_string()),
                    publisher: None,
                    funders: vec![],
                    extraction: test_extraction(),
                    review: None,
                    citation_count: None,
                },
                // No "commit" key — this is the condition under test.
                BTreeMap::new(),
                AccessTier::Public,
            )
            .expect("artifact"),
        );

        let audit = audit_artifacts(Path::new("."), &project);
        assert!(!audit.ok);
        assert!(
            audit
                .issues
                .iter()
                .any(|issue| issue.field == "metadata.commit")
        );
    }

    /// Build a minimal project containing exactly one finding, so artifacts
    /// under test have a valid finding id to target.
    fn project_with_one_finding() -> Project {
        let finding = crate::bundle::FindingBundle::new(
            Assertion {
                text: "Lecanemab trial records belong in the frontier.".to_string(),
                assertion_type: "treatment_effect".to_string(),
                entities: vec![],
                relation: Some("has_registry_record".to_string()),
                direction: None,
                causal_claim: None,
                causal_evidence_grade: None,
            },
            Evidence {
                evidence_type: "observational".to_string(),
                model_system: "registry".to_string(),
                species: Some("Homo sapiens".to_string()),
                method: "manual test".to_string(),
                sample_size: None,
                effect_size: None,
                p_value: None,
                replicated: false,
                replication_count: None,
                evidence_spans: vec![],
            },
            test_conditions(),
            Confidence::raw(0.6, "test", 0.6),
            Provenance {
                source_type: "database_record".to_string(),
                doi: None,
                pmid: None,
                pmc: None,
                openalex_id: None,
                url: None,
                title: "test".to_string(),
                authors: vec![],
                year: None,
                journal: None,
                license: Some("test".to_string()),
                publisher: None,
                funders: vec![],
                extraction: test_extraction(),
                review: None,
                citation_count: None,
            },
            Flags::default(),
        );
        project::assemble("artifact audit test", vec![finding], 1, 0, "test")
    }

    /// Minimal study-conditions fixture shared by the test finding.
    fn test_conditions() -> Conditions {
        Conditions {
            text: "test condition".to_string(),
            species_verified: vec!["Homo sapiens".to_string()],
            species_unverified: vec![],
            in_vitro: false,
            in_vivo: false,
            human_data: true,
            clinical_trial: true,
            concentration_range: None,
            duration: None,
            age_group: None,
            cell_type: None,
        }
    }

    /// Minimal extraction-provenance fixture shared by all test records.
    fn test_extraction() -> Extraction {
        Extraction {
            method: "manual".to_string(),
            model: None,
            model_version: None,
            extracted_at: "2026-05-06T00:00:00Z".to_string(),
            extractor_version: "test".to_string(),
        }
    }
}