1use chrono::{DateTime, SecondsFormat, Utc};
18use ordvec::{
19 probe_index_metadata, IndexKind as CoreIndexKind, IndexMetadata as CoreIndexMetadata,
20 IndexParams as CoreIndexParams,
21};
22use serde::{Deserialize, Serialize};
23use sha2::{Digest, Sha256};
24use std::collections::{BTreeMap, HashSet};
25use std::fmt;
26use std::fs::{self, File};
27use std::io::{self, BufRead, BufReader, Read};
28use std::path::{Component, Path, PathBuf};
29use uuid::Uuid;
30
/// Schema identifier that `validate_manifest_shape` requires in `schema_version`.
pub const SCHEMA_VERSION: &str = "ordvec.index_manifest.v1";
/// Schema identifier string for calibration data (its consumer is outside this part of the file).
pub const CALIBRATION_SCHEMA_VERSION: &str = "ordvec.calibration.v1";
/// Schema identifier required in `encoder_distortion.schema_version`.
pub const ENCODER_DISTORTION_SCHEMA_VERSION: &str = "ordvec.encoder_distortion.v1";
/// Default cap on manifest file size: 1 MiB.
pub const DEFAULT_MAX_MANIFEST_BYTES: u64 = 1024 * 1024;
/// Default cap on a single row-identity JSONL line: 64 KiB.
pub const DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES: usize = 64 * 1024;
/// Default cap on the number of row-identity rows: ten million.
pub const DEFAULT_MAX_ROW_IDENTITY_ROWS: usize = 10_000_000;
/// Default cap of 64 MiB on tracked db-id bytes.
// NOTE(review): presumably bounds the memory used for duplicate db-id detection — confirm at the use site.
pub const DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES: usize = 64 * 1024 * 1024;
/// Default cap on the number of auxiliary artifacts listed in a manifest.
pub const DEFAULT_MAX_AUXILIARY_ARTIFACTS: usize = 1024;
/// Default cap on the size of one auxiliary artifact: 64 MiB.
pub const DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES: u64 = 64 * 1024 * 1024;
/// Default cap on the size of an encoder-distortion profile: 64 MiB.
pub const DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES: u64 = 64 * 1024 * 1024;
/// Default cap on the number of issues kept in a verification report.
pub const DEFAULT_MAX_REPORT_ISSUES: usize = 1024;
/// Default cap on the size of a cached report: 4 MiB.
pub const DEFAULT_MAX_CACHED_REPORT_BYTES: u64 = 4 * 1024 * 1024;
43
/// Errors returned while loading a manifest file or enforcing size limits on it.
#[derive(Debug)]
pub enum ManifestError {
    /// An underlying I/O operation failed (for example opening or reading a file).
    Io(io::Error),
    /// `serde_json` reported an error, such as a manifest that does not parse.
    Json(serde_json::Error),
    /// A free-form failure described only by its message.
    Invalid(String),
    /// A configured limit was exceeded. `code` is a stable identifier such as
    /// `manifest_file_too_large`; `message` carries the human-readable detail.
    LimitExceeded { code: String, message: String },
}
51
52impl ManifestError {
53 pub fn invalid(message: impl Into<String>) -> Self {
54 Self::Invalid(message.into())
55 }
56
57 pub fn limit_exceeded(code: impl Into<String>, message: impl Into<String>) -> Self {
58 Self::LimitExceeded {
59 code: code.into(),
60 message: message.into(),
61 }
62 }
63
64 pub fn code(&self) -> Option<&str> {
65 match self {
66 Self::LimitExceeded { code, .. } => Some(code.as_str()),
67 _ => None,
68 }
69 }
70}
71
72impl fmt::Display for ManifestError {
73 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
74 match self {
75 Self::Io(err) => write!(f, "{err}"),
76 Self::Json(err) => write!(f, "{err}"),
77 Self::Invalid(message) => f.write_str(message),
78 Self::LimitExceeded { code, message } => write!(f, "{code}: {message}"),
79 }
80 }
81}
82
// No `source()` override is provided here; the `Display` impl already prints the
// text of the wrapped I/O or JSON error.
impl std::error::Error for ManifestError {}
84
85impl From<io::Error> for ManifestError {
86 fn from(value: io::Error) -> Self {
87 Self::Io(value)
88 }
89}
90
91impl From<serde_json::Error> for ManifestError {
92 fn from(value: serde_json::Error) -> Self {
93 Self::Json(value)
94 }
95}
96
/// A parsed manifest bundled with the location information used to resolve its paths.
#[derive(Clone, Debug)]
pub struct ManifestDocument {
    /// The deserialized manifest contents.
    pub manifest: IndexManifest,
    /// File the manifest was read from; `None` when the document was assembled in
    /// memory (as `verify_manifest_with_base` does).
    pub source_path: Option<PathBuf>,
    /// Directory handed to path resolution for the artifact and row-identity files.
    pub base_dir: PathBuf,
}
103
104pub fn load_manifest_file(path: impl AsRef<Path>) -> Result<ManifestDocument, ManifestError> {
105 load_manifest_file_with_options(path, &VerifyOptions::default())
106}
107
108pub fn load_manifest_file_with_options(
109 path: impl AsRef<Path>,
110 options: &VerifyOptions,
111) -> Result<ManifestDocument, ManifestError> {
112 let path = path.as_ref();
113 let manifest_bytes = read_bounded_file(
114 path,
115 options.limits.max_manifest_bytes,
116 "manifest_file_too_large",
117 "manifest file",
118 )?;
119 let manifest: IndexManifest = serde_json::from_slice(&manifest_bytes)?;
120 let base_dir = path
121 .parent()
122 .filter(|p| !p.as_os_str().is_empty())
123 .unwrap_or_else(|| Path::new("."))
124 .to_path_buf();
125 Ok(ManifestDocument {
126 manifest,
127 source_path: Some(path.to_path_buf()),
128 base_dir,
129 })
130}
131
132fn read_bounded_file(
133 path: &Path,
134 max_bytes: u64,
135 code: &'static str,
136 context: &'static str,
137) -> Result<Vec<u8>, ManifestError> {
138 let mut file = File::open(path)?;
139 let max_len = usize::try_from(max_bytes).map_err(|_| {
140 ManifestError::limit_exceeded(
141 code,
142 format!(
143 "{context} byte limit {max_bytes} is too large to enforce while reading {}",
144 path.display()
145 ),
146 )
147 })?;
148 let read_limit = max_bytes.checked_add(1).ok_or_else(|| {
149 ManifestError::limit_exceeded(
150 code,
151 format!(
152 "{context} byte limit {max_bytes} is too large to enforce while reading {}",
153 path.display()
154 ),
155 )
156 })?;
157 let mut bytes = Vec::new();
158 let mut limited = file.by_ref().take(read_limit);
159 limited.read_to_end(&mut bytes)?;
160 if bytes.len() > max_len {
161 return Err(ManifestError::limit_exceeded(
162 code,
163 format!(
164 "{context} exceeds {max_bytes} bytes while reading {}",
165 path.display()
166 ),
167 ));
168 }
169 Ok(bytes)
170}
171
172pub fn verify_manifest_with_base(
173 manifest: IndexManifest,
174 base_dir: impl Into<PathBuf>,
175 options: VerifyOptions,
176) -> VerificationReport {
177 let document = ManifestDocument {
178 manifest,
179 source_path: None,
180 base_dir: base_dir.into(),
181 };
182 verify_manifest(&document, options)
183}
184
185pub fn verify_index_manifest(
186 index_path: impl Into<PathBuf>,
187 manifest_path: impl AsRef<Path>,
188 mut options: VerifyOptions,
189) -> Result<VerificationReport, ManifestError> {
190 let document = load_manifest_file_with_options(manifest_path, &options)?;
191 options.index_override = Some(index_path.into());
192 Ok(verify_manifest(&document, options))
193}
194
195pub fn verify_for_load(
205 manifest_path: impl AsRef<Path>,
206 options: VerifyOptions,
207) -> Result<VerifiedLoadPlan, VerifiedLoadPlanError> {
208 let document = load_manifest_file_with_options(manifest_path, &options)?;
209 verify_document_for_load(&document, options)
210}
211
212pub fn verify_document_for_load(
218 document: &ManifestDocument,
219 options: VerifyOptions,
220) -> Result<VerifiedLoadPlan, VerifiedLoadPlanError> {
221 let (report, paths) = verify_manifest_with_path_capture(document, options);
222 VerifiedLoadPlan::from_report(document, report, paths)
223}
224
225pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> VerificationReport {
226 verify_manifest_with_path_capture(document, options).0
227}
228
229fn verify_manifest_with_path_capture(
230 document: &ManifestDocument,
231 options: VerifyOptions,
232) -> (VerificationReport, VerificationPathCapture) {
233 let mut paths = VerificationPathCapture::default();
234 let mut report = VerificationReport::new(Some(document.manifest.manifest_id.clone()));
235 validate_manifest_shape(&document.manifest, &options.limits, &mut report);
236
237 let artifact_display_path = document.manifest.artifact.path.clone();
238 report.artifact.manifest_path = Some(artifact_display_path.clone());
239 let artifact_path = options
240 .index_override
241 .as_ref()
242 .cloned()
243 .unwrap_or_else(|| PathBuf::from(&document.manifest.artifact.path));
244 report.artifact.observed_path = Some(path_to_display(&artifact_path));
245
246 if let Some(resolved) = resolve_existing_path(
247 &artifact_path,
248 &document.base_dir,
249 &options,
250 "artifact",
251 &mut report.errors,
252 ) {
253 paths.artifact_path = Some(resolved.canonical_path.clone());
254 report.artifact.canonical_path = Some(path_to_display(&resolved.canonical_path));
255 match sha256_file(&resolved.resolved_path) {
256 Ok(hash) => {
257 report.artifact.sha256 = Some(hash.sha256.clone());
258 report.artifact.size_bytes = Some(hash.size_bytes);
259 if !hex_digest_eq(&hash.sha256, &document.manifest.artifact.sha256) {
260 report.error(
261 "artifact_sha256_mismatch",
262 format!(
263 "artifact SHA-256 was {}, manifest declares {}",
264 hash.sha256, document.manifest.artifact.sha256
265 ),
266 );
267 }
268 if hash.size_bytes != document.manifest.artifact.file_size_bytes {
269 report.error(
270 "artifact_file_size_mismatch",
271 format!(
272 "artifact size was {}, manifest declares {}",
273 hash.size_bytes, document.manifest.artifact.file_size_bytes
274 ),
275 );
276 }
277 }
278 Err(err) => report.error(
279 "artifact_hash_failed",
280 format!("failed to hash artifact: {err}"),
281 ),
282 }
283
284 match probe_index_metadata(&resolved.resolved_path) {
285 Ok(metadata) => {
286 let metadata_report = MetadataReport::from_core(&metadata);
287 compare_artifact_metadata(&document.manifest.artifact, &metadata, &mut report);
288 report.artifact.metadata = Some(metadata_report);
289 }
290 Err(err) => report.error(
291 "artifact_probe_failed",
292 format!("failed to probe artifact metadata: {err}"),
293 ),
294 }
295 }
296
297 verify_auxiliary_artifacts(document, &options, &mut report, &mut paths);
298 verify_row_identity(document, &options, &mut report, &mut paths);
299 verify_encoder_distortion(document, &options, &mut report);
300 verify_calibration(document, &options, &mut report);
301 verify_attestations(&document.manifest, &mut report);
302
303 enforce_report_issue_limit(&mut report.errors, &options.limits);
304 report.ok = report.errors.is_empty();
305 (report, paths)
306}
307
308fn validate_manifest_shape(
309 manifest: &IndexManifest,
310 limits: &ResourceLimits,
311 report: &mut VerificationReport,
312) {
313 if manifest.schema_version != SCHEMA_VERSION {
314 report.error(
315 "schema_version_unsupported",
316 format!(
317 "schema_version must be {SCHEMA_VERSION}, got {}",
318 manifest.schema_version
319 ),
320 );
321 }
322 if manifest.manifest_id.trim().is_empty() {
323 report.error("manifest_id_empty", "manifest_id must be non-empty");
324 }
325 if DateTime::parse_from_rfc3339(&manifest.created_at).is_err() {
326 report.error("created_at_invalid", "created_at must parse as RFC3339");
327 }
328 if manifest.embedding.model.trim().is_empty() {
329 report.error("embedding_model_empty", "embedding.model must be non-empty");
330 }
331 if manifest.embedding.dim == 0 {
332 report.error(
333 "embedding_dim_zero",
334 "embedding.dim must be greater than zero",
335 );
336 }
337 if manifest.artifact.path.trim().is_empty() {
338 report.error("artifact_path_empty", "artifact.path must be non-empty");
339 }
340 if !is_sha256_hex(&manifest.artifact.sha256) {
341 report.error(
342 "artifact_sha256_invalid",
343 "artifact.sha256 must be a lowercase 64-character hex SHA-256 digest",
344 );
345 }
346 if manifest.artifact.bytes_per_vec == 0 {
347 report.error(
348 "artifact_bytes_per_vec_zero",
349 "artifact.bytes_per_vec must be greater than zero",
350 );
351 }
352 if manifest.artifact.dim != manifest.embedding.dim {
353 report.error(
354 "artifact_embedding_dim_mismatch",
355 format!(
356 "artifact.dim {} does not match embedding.dim {}",
357 manifest.artifact.dim, manifest.embedding.dim
358 ),
359 );
360 }
361 if !artifact_kind_matches_params(manifest.artifact.kind, &manifest.artifact.params) {
362 report.error(
363 "artifact_params_kind_mismatch",
364 "artifact.params discriminator does not match artifact.kind",
365 );
366 }
367
368 let row_count = manifest.row_identity.row_count();
369 if manifest.artifact.vector_count != row_count {
370 report.error(
371 "artifact_row_count_mismatch",
372 format!(
373 "artifact.vector_count {} does not match row_identity.row_count {}",
374 manifest.artifact.vector_count, row_count
375 ),
376 );
377 }
378 if let RowIdentity::Jsonl {
379 path,
380 sha256,
381 id_kind,
382 db,
383 ..
384 } = &manifest.row_identity
385 {
386 if path.trim().is_empty() {
387 report.error(
388 "row_identity_path_empty",
389 "row_identity.path must be non-empty",
390 );
391 }
392 if !is_sha256_hex(sha256) {
393 report.error(
394 "row_identity_sha256_invalid",
395 "row_identity.sha256 must be a lowercase 64-character hex SHA-256 digest",
396 );
397 }
398 if id_kind != "uuid" {
399 report.error(
400 "row_identity_id_kind_unsupported",
401 "row_identity.id_kind must be uuid in v1",
402 );
403 }
404 if db.is_some() {
405 report.error(
406 "row_identity_db_unsupported",
407 "row_identity.db is reserved for a future schema and is not verified in v1",
408 );
409 }
410 }
411
412 validate_auxiliary_artifact_shape(manifest, limits, report);
413
414 validate_optional_non_empty(
415 "embedding_model_revision_empty",
416 "embedding.model_revision must be non-empty when present",
417 manifest.embedding.model_revision.as_deref(),
418 report,
419 );
420 validate_optional_non_empty(
421 "embedding_tokenizer_revision_empty",
422 "embedding.tokenizer_revision must be non-empty when present",
423 manifest.embedding.tokenizer_revision.as_deref(),
424 report,
425 );
426 validate_optional_non_empty(
427 "embedding_pooling_empty",
428 "embedding.pooling must be non-empty when present",
429 manifest.embedding.pooling.as_deref(),
430 report,
431 );
432 validate_optional_sha256(
433 "embedding_corpus_digest_invalid",
434 "embedding.corpus_digest must be a lowercase 64-character hex SHA-256 digest",
435 manifest.embedding.corpus_digest.as_deref(),
436 report,
437 );
438 validate_optional_sha256(
439 "embedding_matrix_digest_invalid",
440 "embedding.embedding_matrix_digest must be a lowercase 64-character hex SHA-256 digest",
441 manifest.embedding.embedding_matrix_digest.as_deref(),
442 report,
443 );
444 validate_optional_non_empty(
445 "embedding_normalization_empty",
446 "embedding.normalization must be non-empty when present",
447 manifest.embedding.normalization.as_deref(),
448 report,
449 );
450
451 if let Some(build) = &manifest.build {
452 if build.invocation_id.trim().is_empty() {
453 report.error(
454 "build_invocation_id_empty",
455 "build.invocation_id must be non-empty",
456 );
457 }
458 if build
459 .builder_id
460 .as_ref()
461 .is_some_and(|builder_id| builder_id.trim().is_empty())
462 {
463 report.error(
464 "build_builder_id_empty",
465 "build.builder_id must be non-empty",
466 );
467 }
468 validate_optional_non_empty(
469 "build_source_repo_empty",
470 "build.source_repo must be non-empty when present",
471 build.source_repo.as_deref(),
472 report,
473 );
474 validate_optional_non_empty(
475 "build_source_commit_empty",
476 "build.source_commit must be non-empty when present",
477 build.source_commit.as_deref(),
478 report,
479 );
480 validate_optional_non_empty(
481 "build_ci_provider_empty",
482 "build.ci_provider must be non-empty when present",
483 build.ci_provider.as_deref(),
484 report,
485 );
486 validate_optional_non_empty(
487 "build_ci_run_id_empty",
488 "build.ci_run_id must be non-empty when present",
489 build.ci_run_id.as_deref(),
490 report,
491 );
492 }
493
494 for key in manifest.extensions.keys() {
495 if !extension_key_is_namespaced(key) {
496 report.error(
497 "extension_key_not_namespaced",
498 format!("extension key {key:?} must be namespaced"),
499 );
500 }
501 }
502}
503
504fn validate_auxiliary_artifact_shape(
505 manifest: &IndexManifest,
506 limits: &ResourceLimits,
507 report: &mut VerificationReport,
508) {
509 if !check_auxiliary_artifact_count(manifest, limits, report) {
510 return;
511 }
512 let mut names = HashSet::new();
513 for artifact in &manifest.auxiliary_artifacts {
514 let name = artifact.name.trim();
515 if name.is_empty() {
516 report.error(
517 "auxiliary_artifact_name_empty",
518 "auxiliary artifact name must be non-empty",
519 );
520 } else if artifact.name != name {
521 report.error(
522 "auxiliary_artifact_name_not_trimmed",
523 format!(
524 "auxiliary artifact name {name:?} must not have leading or trailing whitespace"
525 ),
526 );
527 } else if !names.insert(name.to_string()) {
528 report.error(
529 "auxiliary_artifact_name_duplicate",
530 format!("auxiliary artifact name {name:?} is duplicated"),
531 );
532 }
533
534 if artifact.path.trim().is_empty() {
535 report.error(
536 "auxiliary_artifact_path_empty",
537 format!("auxiliary artifact {name:?} path must be non-empty"),
538 );
539 }
540 if !is_sha256_hex(&artifact.sha256) {
541 report.error(
542 "auxiliary_artifact_sha256_invalid",
543 format!(
544 "auxiliary artifact {name:?} sha256 must be a lowercase 64-character hex SHA-256 digest"
545 ),
546 );
547 }
548 }
549}
550
551fn validate_optional_non_empty(
552 code: &str,
553 message: &str,
554 value: Option<&str>,
555 report: &mut VerificationReport,
556) {
557 if value.is_some_and(|value| value.trim().is_empty()) {
558 report.error(code, message);
559 }
560}
561
562fn validate_optional_sha256(
563 code: &str,
564 message: &str,
565 value: Option<&str>,
566 report: &mut VerificationReport,
567) {
568 if value.is_some_and(|value| !is_sha256_hex(value)) {
569 report.error(code, message);
570 }
571}
572
573fn validate_optional_sha256_uri(
574 code: &str,
575 message: &str,
576 value: Option<&str>,
577 report: &mut VerificationReport,
578) {
579 let Some(value) = value else {
580 return;
581 };
582 let Some(digest) = value.strip_prefix("sha256:") else {
583 report.error(code, message);
584 return;
585 };
586 if !is_sha256_hex(digest) {
587 report.error(code, message);
588 }
589}
590
591fn validate_optional_positive_f64(
592 code: &str,
593 message: &str,
594 value: Option<f64>,
595 report: &mut VerificationReport,
596) {
597 if value.is_some_and(|value| !value.is_finite() || value <= 0.0) {
598 report.error(code, message);
599 }
600}
601
602fn validate_optional_nonnegative_f64(
603 code: &str,
604 message: &str,
605 value: Option<f64>,
606 report: &mut VerificationReport,
607) {
608 if value.is_some_and(|value| !value.is_finite() || value < 0.0) {
609 report.error(code, message);
610 }
611}
612
613fn validate_optional_probability(
614 code: &str,
615 message: &str,
616 value: Option<f64>,
617 report: &mut VerificationReport,
618) {
619 if value.is_some_and(|value| !value.is_finite() || !(0.0..=1.0).contains(&value)) {
620 report.error(code, message);
621 }
622}
623
624fn artifact_kind_matches_params(kind: ManifestIndexKind, params: &ManifestIndexParams) -> bool {
625 matches!(
626 (kind, params),
627 (ManifestIndexKind::Rank, ManifestIndexParams::Rank)
628 | (
629 ManifestIndexKind::RankQuant,
630 ManifestIndexParams::RankQuant { .. }
631 )
632 | (
633 ManifestIndexKind::Bitmap,
634 ManifestIndexParams::Bitmap { .. }
635 )
636 | (
637 ManifestIndexKind::SignBitmap,
638 ManifestIndexParams::SignBitmap
639 )
640 )
641}
642
643fn compare_artifact_metadata(
644 artifact: &Artifact,
645 metadata: &CoreIndexMetadata,
646 report: &mut VerificationReport,
647) {
648 let observed_kind = ManifestIndexKind::from_core(metadata.kind);
649 if artifact.kind != observed_kind {
650 report.error(
651 "artifact_kind_mismatch",
652 format!(
653 "artifact kind was {:?}, manifest declares {:?}",
654 observed_kind, artifact.kind
655 ),
656 );
657 }
658 let observed_params = ManifestIndexParams::from_core(metadata.params);
659 if artifact.params != observed_params {
660 report.error(
661 "artifact_params_mismatch",
662 format!(
663 "artifact params were {:?}, manifest declares {:?}",
664 observed_params, artifact.params
665 ),
666 );
667 }
668 if artifact.format_version != metadata.format_version {
669 report.error(
670 "artifact_format_version_mismatch",
671 format!(
672 "artifact format_version was {}, manifest declares {}",
673 metadata.format_version, artifact.format_version
674 ),
675 );
676 }
677 if artifact.dim != metadata.dim {
678 report.error(
679 "artifact_dim_mismatch",
680 format!(
681 "artifact dim was {}, manifest declares {}",
682 metadata.dim, artifact.dim
683 ),
684 );
685 }
686 if artifact.vector_count != metadata.vector_count {
687 report.error(
688 "artifact_vector_count_mismatch",
689 format!(
690 "artifact vector_count was {}, manifest declares {}",
691 metadata.vector_count, artifact.vector_count
692 ),
693 );
694 }
695 if artifact.bytes_per_vec != metadata.bytes_per_vec {
696 report.error(
697 "artifact_bytes_per_vec_mismatch",
698 format!(
699 "artifact bytes_per_vec was {}, manifest declares {}",
700 metadata.bytes_per_vec, artifact.bytes_per_vec
701 ),
702 );
703 }
704 if artifact.file_size_bytes != metadata.file_size_bytes {
705 report.error(
706 "artifact_metadata_file_size_mismatch",
707 format!(
708 "artifact metadata file_size_bytes was {}, manifest declares {}",
709 metadata.file_size_bytes, artifact.file_size_bytes
710 ),
711 );
712 }
713}
714
715fn verify_row_identity(
716 document: &ManifestDocument,
717 options: &VerifyOptions,
718 report: &mut VerificationReport,
719 paths: &mut VerificationPathCapture,
720) {
721 match &document.manifest.row_identity {
722 RowIdentity::RowIdIdentity { row_count } => {
723 report.row_identity.kind = Some("row_id_identity".to_string());
724 report.row_identity.row_count = Some(*row_count);
725 }
726 RowIdentity::Jsonl {
727 path,
728 sha256,
729 row_count,
730 ..
731 } => {
732 report.row_identity.kind = Some("jsonl".to_string());
733 report.row_identity.manifest_path = Some(path.clone());
734 report.row_identity.row_count = Some(*row_count);
735 if *row_count > options.limits.max_row_identity_rows {
736 report.error(
737 "row_identity_row_count_limit_exceeded",
738 format!(
739 "row_identity.row_count {row_count} exceeds max_row_identity_rows={}",
740 options.limits.max_row_identity_rows
741 ),
742 );
743 return;
744 }
745 let row_path = PathBuf::from(path);
746 if let Some(resolved) = resolve_existing_path(
747 &row_path,
748 &document.base_dir,
749 options,
750 "row_identity",
751 &mut report.errors,
752 ) {
753 paths.row_identity_path = Some(resolved.canonical_path.clone());
754 report.row_identity.canonical_path =
755 Some(path_to_display(&resolved.canonical_path));
756 match validate_jsonl_rows(
757 &resolved.resolved_path,
758 options.allow_duplicate_db_ids,
759 &options.limits,
760 Some(*row_count),
761 &mut report.errors,
762 ) {
763 Ok(stats) => {
764 report.row_identity.validated_rows = Some(stats.validated_rows);
765 if let Some(hash) = &stats.sha256 {
766 report.row_identity.sha256 = Some(hash.clone());
767 if !hex_digest_eq(hash, sha256) {
768 report.error(
769 "row_identity_sha256_mismatch",
770 format!(
771 "row_identity SHA-256 was {hash}, manifest declares {sha256}"
772 ),
773 );
774 }
775 }
776 if stats.row_count != *row_count
777 && !report
778 .errors
779 .iter()
780 .any(|issue| issue.code == "row_identity_row_count_mismatch")
781 {
782 let observed_rows = if stats.sha256.is_some() {
783 stats.row_count.to_string()
784 } else {
785 format!("at least {}", stats.row_count)
786 };
787 report.error(
788 "row_identity_row_count_mismatch",
789 format!(
790 "row identity file has {observed_rows} rows, manifest declares {row_count}"
791 ),
792 );
793 }
794 }
795 Err(err) => report.error(
796 "row_identity_read_failed",
797 format!("failed to read row identity file: {err}"),
798 ),
799 }
800 }
801 }
802 }
803}
804
805fn verify_encoder_distortion(
806 document: &ManifestDocument,
807 options: &VerifyOptions,
808 report: &mut VerificationReport,
809) {
810 let Some(profile) = &document.manifest.encoder_distortion else {
811 return;
812 };
813
814 report.encoder_distortion.present = true;
815 report.encoder_distortion.schema_version = Some(profile.schema_version.clone());
816 report.encoder_distortion.profile_id = Some(profile.profile_id.clone());
817 report.encoder_distortion.evidence_kind = Some(profile.evidence.kind.label().to_string());
818 report.encoder_distortion.source_metric = Some(profile.source_metric.name.clone());
819 report.encoder_distortion.embedding_metric = Some(profile.embedding_metric.name.clone());
820
821 validate_encoder_distortion_shape(profile, report);
822 validate_encoder_distortion_encoder(profile, &document.manifest.embedding, report);
823 validate_encoder_distortion_metrics(profile, report);
824 validate_encoder_distortion_bounds(&profile.bounds, report);
825 validate_encoder_distortion_scope(&profile.scope, report);
826 validate_encoder_distortion_evidence(profile, &document.base_dir, options, report);
827 validate_encoder_distortion_calibration(
828 profile,
829 document.manifest.calibration.as_ref(),
830 report,
831 );
832}
833
834fn validate_encoder_distortion_shape(
835 profile: &EncoderDistortionProfileRef,
836 report: &mut VerificationReport,
837) {
838 if profile.schema_version != ENCODER_DISTORTION_SCHEMA_VERSION {
839 report.error(
840 "encoder_distortion_schema_version_unsupported",
841 format!(
842 "encoder_distortion.schema_version must be {ENCODER_DISTORTION_SCHEMA_VERSION}, got {}",
843 profile.schema_version
844 ),
845 );
846 }
847 if profile.profile_id.trim().is_empty() {
848 report.error(
849 "encoder_distortion_profile_id_empty",
850 "encoder_distortion.profile_id must be non-empty",
851 );
852 }
853 if profile
854 .created_at
855 .as_ref()
856 .is_some_and(|created_at| DateTime::parse_from_rfc3339(created_at).is_err())
857 {
858 report.error(
859 "encoder_distortion_created_at_invalid",
860 "encoder_distortion.created_at must parse as RFC3339 when present",
861 );
862 }
863 if profile.encoder.model.trim().is_empty() {
864 report.error(
865 "encoder_distortion_encoder_model_empty",
866 "encoder_distortion.encoder.model must be non-empty",
867 );
868 }
869 if profile.encoder.dim == 0 {
870 report.error(
871 "encoder_distortion_encoder_dim_zero",
872 "encoder_distortion.encoder.dim must be greater than zero",
873 );
874 }
875 validate_optional_non_empty(
876 "encoder_distortion_encoder_model_revision_empty",
877 "encoder_distortion.encoder.model_revision must be non-empty when present",
878 profile.encoder.model_revision.as_deref(),
879 report,
880 );
881 validate_optional_non_empty(
882 "encoder_distortion_encoder_normalization_empty",
883 "encoder_distortion.encoder.normalization must be non-empty when present",
884 profile.encoder.normalization.as_deref(),
885 report,
886 );
887 validate_optional_non_empty(
888 "encoder_distortion_tokenizer_revision_empty",
889 "encoder_distortion.tokenizer_revision must be non-empty when present",
890 profile.tokenizer_revision.as_deref(),
891 report,
892 );
893 validate_optional_non_empty(
894 "encoder_distortion_pooling_empty",
895 "encoder_distortion.pooling must be non-empty when present",
896 profile.pooling.as_deref(),
897 report,
898 );
899}
900
901fn validate_encoder_distortion_encoder(
902 profile: &EncoderDistortionProfileRef,
903 embedding: &Embedding,
904 report: &mut VerificationReport,
905) {
906 if profile.encoder.model != embedding.model {
907 report.error(
908 "encoder_distortion_encoder_model_mismatch",
909 format!(
910 "encoder_distortion model {:?} does not match embedding.model {:?}",
911 profile.encoder.model, embedding.model
912 ),
913 );
914 }
915 if profile.encoder.dim != embedding.dim {
916 report.error(
917 "encoder_distortion_encoder_dim_mismatch",
918 format!(
919 "encoder_distortion dim {} does not match embedding.dim {}",
920 profile.encoder.dim, embedding.dim
921 ),
922 );
923 }
924 compare_optional_encoder_identity(
925 "encoder_distortion_encoder_model_revision_mismatch",
926 "encoder_distortion encoder",
927 "model_revision",
928 embedding.model_revision.as_deref(),
929 profile.encoder.model_revision.as_deref(),
930 report,
931 );
932 compare_optional_encoder_identity(
933 "encoder_distortion_encoder_normalization_mismatch",
934 "encoder_distortion encoder",
935 "normalization",
936 embedding.normalization.as_deref(),
937 profile.encoder.normalization.as_deref(),
938 report,
939 );
940 compare_optional_encoder_identity(
941 "encoder_distortion_tokenizer_revision_mismatch",
942 "encoder_distortion",
943 "tokenizer_revision",
944 embedding.tokenizer_revision.as_deref(),
945 profile.tokenizer_revision.as_deref(),
946 report,
947 );
948 compare_optional_encoder_identity(
949 "encoder_distortion_pooling_mismatch",
950 "encoder_distortion",
951 "pooling",
952 embedding.pooling.as_deref(),
953 profile.pooling.as_deref(),
954 report,
955 );
956}
957
958fn validate_encoder_distortion_metrics(
959 profile: &EncoderDistortionProfileRef,
960 report: &mut VerificationReport,
961) {
962 validate_metric_spec(
963 "encoder_distortion_source_metric",
964 &profile.source_metric,
965 report,
966 );
967 validate_metric_spec(
968 "encoder_distortion_embedding_metric",
969 &profile.embedding_metric,
970 report,
971 );
972}
973
974fn validate_metric_spec(prefix: &str, metric: &MetricSpec, report: &mut VerificationReport) {
975 if metric.name.trim().is_empty() {
976 report.error(
977 format!("{prefix}_name_empty"),
978 format!("{prefix}.name must be non-empty"),
979 );
980 }
981 validate_optional_non_empty(
982 &format!("{prefix}_version_empty"),
983 &format!("{prefix}.version must be non-empty when present"),
984 metric.version.as_deref(),
985 report,
986 );
987 validate_optional_sha256_uri(
988 &format!("{prefix}_digest_invalid"),
989 &format!("{prefix}.digest must be sha256:<lowercase-hex> when present"),
990 metric.digest.as_deref(),
991 report,
992 );
993}
994
995fn validate_encoder_distortion_bounds(bounds: &DistortionBounds, report: &mut VerificationReport) {
996 if bounds.declared_lower_bound.is_none()
997 && bounds.declared_upper_bound.is_none()
998 && bounds.estimated_distortion.is_none()
999 && bounds.violation_rate.is_none()
1000 && bounds.max_observed_violation.is_none()
1001 && bounds.quantile_observed_violation.is_none()
1002 {
1003 report.error(
1004 "encoder_distortion_bounds_empty",
1005 "encoder_distortion.bounds must declare at least one bound or observed violation statistic",
1006 );
1007 }
1008
1009 validate_optional_positive_f64(
1010 "encoder_distortion_lower_bound_invalid",
1011 "encoder_distortion.bounds.declared_lower_bound must be finite and greater than zero",
1012 bounds.declared_lower_bound,
1013 report,
1014 );
1015 validate_optional_positive_f64(
1016 "encoder_distortion_upper_bound_invalid",
1017 "encoder_distortion.bounds.declared_upper_bound must be finite and greater than zero",
1018 bounds.declared_upper_bound,
1019 report,
1020 );
1021 validate_optional_positive_f64(
1022 "encoder_distortion_estimated_distortion_invalid",
1023 "encoder_distortion.bounds.estimated_distortion must be finite and greater than zero",
1024 bounds.estimated_distortion,
1025 report,
1026 );
1027 validate_optional_probability(
1028 "encoder_distortion_violation_rate_invalid",
1029 "encoder_distortion.bounds.violation_rate must be finite and within [0, 1]",
1030 bounds.violation_rate,
1031 report,
1032 );
1033 validate_optional_nonnegative_f64(
1034 "encoder_distortion_max_observed_violation_invalid",
1035 "encoder_distortion.bounds.max_observed_violation must be finite and non-negative",
1036 bounds.max_observed_violation,
1037 report,
1038 );
1039 validate_optional_nonnegative_f64(
1040 "encoder_distortion_quantile_observed_violation_invalid",
1041 "encoder_distortion.bounds.quantile_observed_violation must be finite and non-negative",
1042 bounds.quantile_observed_violation,
1043 report,
1044 );
1045
1046 if let (Some(lower), Some(upper)) = (bounds.declared_lower_bound, bounds.declared_upper_bound) {
1047 if lower.is_finite() && upper.is_finite() && lower > upper {
1048 report.error(
1049 "encoder_distortion_bounds_order_invalid",
1050 "encoder_distortion.bounds.declared_lower_bound must be less than or equal to declared_upper_bound",
1051 );
1052 }
1053 if lower.is_finite() && upper.is_finite() && lower > 0.0 && upper > 0.0 {
1054 if let Some(estimated) = bounds.estimated_distortion {
1055 let expected = upper / lower;
1056 if !expected.is_finite() {
1057 report.error(
1058 "encoder_distortion_distortion_mismatch",
1059 "encoder_distortion.bounds.declared_upper_bound / declared_lower_bound must be finite",
1060 );
1061 } else {
1062 let tolerance = 1e-9_f64.max(expected.abs() * 1e-9);
1063 if estimated.is_finite() && (estimated - expected).abs() > tolerance {
1064 report.error(
1065 "encoder_distortion_distortion_mismatch",
1066 format!(
1067 "encoder_distortion.bounds.estimated_distortion {} does not match declared_upper_bound / declared_lower_bound {}",
1068 estimated, expected
1069 ),
1070 );
1071 }
1072 }
1073 }
1074 }
1075 }
1076}
1077
1078fn validate_encoder_distortion_scope(scope: &DistortionScope, report: &mut VerificationReport) {
1079 validate_optional_sha256_uri(
1080 "encoder_distortion_scope_corpus_digest_invalid",
1081 "encoder_distortion.scope.corpus_digest must be sha256:<lowercase-hex> when present",
1082 scope.corpus_digest.as_deref(),
1083 report,
1084 );
1085 validate_optional_sha256_uri(
1086 "encoder_distortion_scope_query_set_digest_invalid",
1087 "encoder_distortion.scope.query_set_digest must be sha256:<lowercase-hex> when present",
1088 scope.query_set_digest.as_deref(),
1089 report,
1090 );
1091 validate_optional_sha256_uri(
1092 "encoder_distortion_scope_pair_sample_digest_invalid",
1093 "encoder_distortion.scope.pair_sample_digest must be sha256:<lowercase-hex> when present",
1094 scope.pair_sample_digest.as_deref(),
1095 report,
1096 );
1097 validate_optional_non_empty(
1098 "encoder_distortion_scope_domain_empty",
1099 "encoder_distortion.scope.domain must be non-empty when present",
1100 scope.domain.as_deref(),
1101 report,
1102 );
1103 validate_optional_non_empty(
1104 "encoder_distortion_scope_estimator_version_empty",
1105 "encoder_distortion.scope.estimator_version must be non-empty when present",
1106 scope.estimator_version.as_deref(),
1107 report,
1108 );
1109 if scope
1110 .sample_size
1111 .is_some_and(|sample_size| sample_size == 0)
1112 {
1113 report.error(
1114 "encoder_distortion_scope_sample_size_zero",
1115 "encoder_distortion.scope.sample_size must be greater than zero when present",
1116 );
1117 }
1118 validate_optional_probability(
1119 "encoder_distortion_scope_confidence_invalid",
1120 "encoder_distortion.scope.confidence must be finite and within [0, 1]",
1121 scope.confidence,
1122 report,
1123 );
1124 validate_optional_probability(
1125 "encoder_distortion_scope_coverage_invalid",
1126 "encoder_distortion.scope.coverage must be finite and within [0, 1]",
1127 scope.coverage,
1128 report,
1129 );
1130}
1131
1132fn validate_encoder_distortion_evidence(
1133 profile: &EncoderDistortionProfileRef,
1134 base_dir: &Path,
1135 options: &VerifyOptions,
1136 report: &mut VerificationReport,
1137) {
1138 validate_optional_non_empty(
1139 "encoder_distortion_evidence_estimator_id_empty",
1140 "encoder_distortion.evidence.estimator_id must be non-empty when present",
1141 profile.evidence.estimator_id.as_deref(),
1142 report,
1143 );
1144 validate_optional_sha256_uri(
1145 "encoder_distortion_evidence_estimator_hash_invalid",
1146 "encoder_distortion.evidence.estimator_hash must be sha256:<lowercase-hex> when present",
1147 profile.evidence.estimator_hash.as_deref(),
1148 report,
1149 );
1150
1151 if profile.profile.is_none() && profile.evidence.kind != DistortionEvidenceKind::CallerAsserted
1152 {
1153 report.error(
1154 "encoder_distortion_profile_required",
1155 "non-caller-asserted encoder distortion evidence requires a profile artifact",
1156 );
1157 return;
1158 }
1159
1160 if let Some(artifact) = &profile.profile {
1161 validate_encoder_distortion_profile_artifact(artifact, base_dir, options, report);
1162 }
1163}
1164
1165fn validate_encoder_distortion_profile_artifact(
1166 profile: &DistortionProfileArtifactRef,
1167 base_dir: &Path,
1168 options: &VerifyOptions,
1169 report: &mut VerificationReport,
1170) {
1171 report.encoder_distortion.profile_manifest_path = Some(profile.path.clone());
1172 if profile.path.trim().is_empty() {
1173 report.error(
1174 "encoder_distortion_profile_path_empty",
1175 "encoder_distortion.profile.path must be non-empty",
1176 );
1177 }
1178 if !is_sha256_hex(&profile.sha256) {
1179 report.error(
1180 "encoder_distortion_profile_sha256_invalid",
1181 "encoder_distortion.profile.sha256 must be a lowercase 64-character hex SHA-256 digest",
1182 );
1183 }
1184 if profile.file_size_bytes == 0 {
1185 report.error(
1186 "encoder_distortion_profile_file_size_zero",
1187 "encoder_distortion.profile.file_size_bytes must be greater than zero",
1188 );
1189 }
1190 if profile.format.trim().is_empty() {
1191 report.error(
1192 "encoder_distortion_profile_format_empty",
1193 "encoder_distortion.profile.format must be non-empty",
1194 );
1195 }
1196 validate_optional_sha256_uri(
1197 "encoder_distortion_profile_source_digest_invalid",
1198 "encoder_distortion.profile.source_digest must be sha256:<lowercase-hex> when present",
1199 profile.source_digest.as_deref(),
1200 report,
1201 );
1202
1203 if !profile.path.trim().is_empty() {
1204 let path = PathBuf::from(&profile.path);
1205 if let Some(resolved) = resolve_existing_path(
1206 &path,
1207 base_dir,
1208 options,
1209 "encoder_distortion_profile",
1210 &mut report.errors,
1211 ) {
1212 report.encoder_distortion.profile_canonical_path =
1213 Some(path_to_display(&resolved.canonical_path));
1214 match sha256_file_bounded(
1215 &resolved.resolved_path,
1216 options.limits.max_encoder_distortion_profile_bytes,
1217 "encoder_distortion_profile_too_large",
1218 "encoder distortion profile",
1219 ) {
1220 Ok(hash) => {
1221 report.encoder_distortion.profile_sha256 = Some(hash.sha256.clone());
1222 report.encoder_distortion.profile_size_bytes = Some(hash.size_bytes);
1223 if !hex_digest_eq(&hash.sha256, &profile.sha256) {
1224 report.error(
1225 "encoder_distortion_profile_sha256_mismatch",
1226 format!(
1227 "encoder distortion profile SHA-256 was {}, manifest declares {}",
1228 hash.sha256, profile.sha256
1229 ),
1230 );
1231 }
1232 if hash.size_bytes != profile.file_size_bytes {
1233 report.error(
1234 "encoder_distortion_profile_file_size_mismatch",
1235 format!(
1236 "encoder distortion profile size was {}, manifest declares {}",
1237 hash.size_bytes, profile.file_size_bytes
1238 ),
1239 );
1240 }
1241 }
1242 Err(ManifestError::LimitExceeded { code, message }) => report.error(code, message),
1243 Err(err) => report.error(
1244 "encoder_distortion_profile_hash_failed",
1245 format!("failed to hash encoder distortion profile: {err}"),
1246 ),
1247 }
1248 }
1249 }
1250}
1251
1252fn validate_encoder_distortion_calibration(
1253 profile: &EncoderDistortionProfileRef,
1254 calibration: Option<&CalibrationProfileRef>,
1255 report: &mut VerificationReport,
1256) {
1257 let Some(calibration_profile_id) = &profile.calibration_profile_id else {
1258 return;
1259 };
1260 if calibration_profile_id.trim().is_empty() {
1261 report.error(
1262 "encoder_distortion_calibration_profile_id_empty",
1263 "encoder_distortion.calibration_profile_id must be non-empty when present",
1264 );
1265 return;
1266 }
1267 if calibration_profile_id.trim() != calibration_profile_id {
1268 report.error(
1269 "encoder_distortion_calibration_profile_id_whitespace",
1270 "encoder_distortion.calibration_profile_id must not contain leading or trailing whitespace",
1271 );
1272 return;
1273 }
1274 let Some(calibration) = calibration else {
1275 report.error(
1276 "encoder_distortion_calibration_missing",
1277 "encoder_distortion.calibration_profile_id requires a calibration block",
1278 );
1279 return;
1280 };
1281 if calibration.profile_id != *calibration_profile_id {
1283 report.error(
1284 "encoder_distortion_calibration_profile_mismatch",
1285 format!(
1286 "encoder_distortion.calibration_profile_id {:?} does not match calibration.profile_id {:?}",
1287 calibration_profile_id, calibration.profile_id
1288 ),
1289 );
1290 }
1291}
1292
1293fn verify_calibration(
1294 document: &ManifestDocument,
1295 options: &VerifyOptions,
1296 report: &mut VerificationReport,
1297) {
1298 let Some(calibration) = &document.manifest.calibration else {
1299 return;
1300 };
1301
1302 report.calibration.present = true;
1303 report.calibration.schema_version = Some(calibration.schema_version.clone());
1304 report.calibration.profile_id = Some(calibration.profile_id.clone());
1305 report.calibration.calibrated_for_model = Some(calibration.calibrated_for.model.clone());
1306 report.calibration.ordinalization = Some(calibration.ordinalization.label().to_string());
1307 report.calibration.null_model = Some(calibration.null_model.label().to_string());
1308
1309 validate_calibration_shape(calibration, report);
1310 validate_calibration_encoder(calibration, &document.manifest.embedding, report);
1311 validate_calibration_ordinalization(calibration, &document.manifest.artifact, report);
1312 validate_calibration_null_model_ordinalization(calibration, report);
1313 validate_calibration_profile(
1314 calibration,
1315 &document.manifest.artifact,
1316 &document.base_dir,
1317 options,
1318 report,
1319 );
1320}
1321
1322fn validate_calibration_shape(
1323 calibration: &CalibrationProfileRef,
1324 report: &mut VerificationReport,
1325) {
1326 if calibration.schema_version != CALIBRATION_SCHEMA_VERSION {
1327 report.error(
1328 "calibration_schema_version_unsupported",
1329 format!(
1330 "calibration.schema_version must be {CALIBRATION_SCHEMA_VERSION}, got {}",
1331 calibration.schema_version
1332 ),
1333 );
1334 }
1335 if calibration.profile_id.trim().is_empty() {
1336 report.error(
1337 "calibration_profile_id_empty",
1338 "calibration.profile_id must be non-empty",
1339 );
1340 }
1341 if calibration
1342 .created_at
1343 .as_ref()
1344 .is_some_and(|created_at| DateTime::parse_from_rfc3339(created_at).is_err())
1345 {
1346 report.error(
1347 "calibration_created_at_invalid",
1348 "calibration.created_at must parse as RFC3339 when present",
1349 );
1350 }
1351 if calibration.calibrated_for.model.trim().is_empty() {
1352 report.error(
1353 "calibration_encoder_model_empty",
1354 "calibration.calibrated_for.model must be non-empty",
1355 );
1356 }
1357 if calibration.calibrated_for.dim == 0 {
1358 report.error(
1359 "calibration_encoder_dim_zero",
1360 "calibration.calibrated_for.dim must be greater than zero",
1361 );
1362 }
1363 validate_optional_non_empty(
1364 "calibration_encoder_model_revision_empty",
1365 "calibration.calibrated_for.model_revision must be non-empty when present",
1366 calibration.calibrated_for.model_revision.as_deref(),
1367 report,
1368 );
1369 validate_optional_non_empty(
1370 "calibration_encoder_normalization_empty",
1371 "calibration.calibrated_for.normalization must be non-empty when present",
1372 calibration.calibrated_for.normalization.as_deref(),
1373 report,
1374 );
1375 if calibration.ordinalization.dim() == 0 {
1376 report.error(
1377 "calibration_ordinalization_dim_zero",
1378 "calibration.ordinalization.dim must be greater than zero",
1379 );
1380 }
1381 match &calibration.ordinalization {
1382 CalibrationOrdinalization::TopK { k, .. } if *k == 0 => {
1383 report.error(
1384 "calibration_ordinalization_artifact_mismatch",
1385 "calibration top_k.k must be greater than zero",
1386 );
1387 }
1388 CalibrationOrdinalization::Bucket { bits, .. } if !matches!(*bits, 1 | 2 | 4) => {
1389 report.error(
1390 "calibration_ordinalization_artifact_mismatch",
1391 "calibration bucket.bits must be 1, 2, or 4",
1392 );
1393 }
1394 CalibrationOrdinalization::CallerDefined { name, .. } if name.trim().is_empty() => {
1395 report.error(
1396 "calibration_ordinalization_artifact_mismatch",
1397 "calibration caller_defined.name must be non-empty",
1398 );
1399 }
1400 _ => {}
1401 }
1402 match &calibration.null_model {
1403 NullModelSpec::EmpiricalTailTable { statistic } if statistic.trim().is_empty() => {
1404 report.error(
1405 "calibration_null_statistic_empty",
1406 "calibration.null_model.statistic must be non-empty",
1407 );
1408 }
1409 NullModelSpec::CallerDefined {
1410 name,
1411 parameterization,
1412 } => {
1413 if name.trim().is_empty() {
1414 report.error(
1415 "calibration_null_name_empty",
1416 "calibration.null_model.name must be non-empty",
1417 );
1418 }
1419 validate_optional_non_empty(
1420 "calibration_null_parameterization_empty",
1421 "calibration.null_model.parameterization must be non-empty when present",
1422 parameterization.as_deref(),
1423 report,
1424 );
1425 }
1426 _ => {}
1427 }
1428}
1429
1430fn validate_calibration_encoder(
1431 calibration: &CalibrationProfileRef,
1432 embedding: &Embedding,
1433 report: &mut VerificationReport,
1434) {
1435 if calibration.calibrated_for.model != embedding.model {
1436 report.error(
1437 "calibration_encoder_model_mismatch",
1438 format!(
1439 "calibration model {:?} does not match embedding.model {:?}",
1440 calibration.calibrated_for.model, embedding.model
1441 ),
1442 );
1443 }
1444 if calibration.calibrated_for.dim != embedding.dim {
1445 report.error(
1446 "calibration_encoder_dim_mismatch",
1447 format!(
1448 "calibration dim {} does not match embedding.dim {}",
1449 calibration.calibrated_for.dim, embedding.dim
1450 ),
1451 );
1452 }
1453 compare_optional_identity(
1454 "calibration_encoder_model_revision_mismatch",
1455 "calibration encoder",
1456 "model_revision",
1457 embedding.model_revision.as_deref(),
1458 calibration.calibrated_for.model_revision.as_deref(),
1459 report,
1460 );
1461 compare_optional_identity(
1462 "calibration_encoder_normalization_mismatch",
1463 "calibration encoder",
1464 "normalization",
1465 embedding.normalization.as_deref(),
1466 calibration.calibrated_for.normalization.as_deref(),
1467 report,
1468 );
1469}
1470
1471fn compare_optional_identity(
1472 code: &str,
1473 subject: &str,
1474 field: &str,
1475 embedding_value: Option<&str>,
1476 calibration_value: Option<&str>,
1477 report: &mut VerificationReport,
1478) {
1479 compare_optional_encoder_identity(
1480 code,
1481 subject,
1482 field,
1483 embedding_value,
1484 calibration_value,
1485 report,
1486 );
1487}
1488
1489fn compare_optional_encoder_identity(
1490 code: &str,
1491 subject: &str,
1492 field: &str,
1493 embedding_value: Option<&str>,
1494 observed_value: Option<&str>,
1495 report: &mut VerificationReport,
1496) {
1497 match (embedding_value, observed_value) {
1498 (Some(expected), Some(observed)) if expected == observed => {}
1499 (None, None) => {}
1500 _ => report.error(
1501 code,
1502 format!("{subject} {field} does not match embedding.{field}"),
1503 ),
1504 }
1505}
1506
1507fn validate_calibration_ordinalization(
1508 calibration: &CalibrationProfileRef,
1509 artifact: &Artifact,
1510 report: &mut VerificationReport,
1511) {
1512 if calibration.ordinalization.dim() != artifact.dim {
1513 report.error(
1514 "calibration_ordinalization_dim_mismatch",
1515 format!(
1516 "calibration ordinalization dim {} does not match artifact.dim {}",
1517 calibration.ordinalization.dim(),
1518 artifact.dim
1519 ),
1520 );
1521 }
1522
1523 let compatible = match (artifact.kind, &artifact.params, &calibration.ordinalization) {
1524 (
1525 ManifestIndexKind::Bitmap,
1526 ManifestIndexParams::Bitmap { n_top },
1527 CalibrationOrdinalization::TopK { k, .. },
1528 ) => k == n_top,
1529 (
1530 ManifestIndexKind::RankQuant,
1531 ManifestIndexParams::RankQuant { bits },
1532 CalibrationOrdinalization::Bucket {
1533 bits: calibrated_bits,
1534 ..
1535 },
1536 ) => calibrated_bits == bits,
1537 (
1538 ManifestIndexKind::SignBitmap,
1539 ManifestIndexParams::SignBitmap,
1540 CalibrationOrdinalization::Sign { .. },
1541 ) => true,
1542 (
1543 ManifestIndexKind::Rank,
1544 ManifestIndexParams::Rank,
1545 CalibrationOrdinalization::RankPosition { .. }
1546 | CalibrationOrdinalization::CallerDefined { .. },
1547 ) => true,
1548 _ => false,
1549 };
1550
1551 if !compatible {
1552 report.error(
1553 "calibration_ordinalization_artifact_mismatch",
1554 "calibration.ordinalization is incompatible with artifact.kind/artifact.params",
1555 );
1556 }
1557}
1558
1559fn validate_calibration_null_model_ordinalization(
1560 calibration: &CalibrationProfileRef,
1561 report: &mut VerificationReport,
1562) {
1563 if matches!(
1564 (&calibration.null_model, &calibration.ordinalization),
1565 (
1566 NullModelSpec::UniformHypergeometric,
1567 CalibrationOrdinalization::TopK { .. }
1568 )
1569 ) {
1570 return;
1571 }
1572 if matches!(
1573 &calibration.null_model,
1574 NullModelSpec::UniformHypergeometric
1575 ) {
1576 report.error(
1577 "calibration_null_model_ordinalization_mismatch",
1578 "uniform_hypergeometric calibration requires top_k ordinalization",
1579 );
1580 }
1581}
1582
1583fn validate_calibration_profile(
1584 calibration: &CalibrationProfileRef,
1585 artifact: &Artifact,
1586 base_dir: &Path,
1587 options: &VerifyOptions,
1588 report: &mut VerificationReport,
1589) {
1590 if matches!(
1591 &calibration.null_model,
1592 NullModelSpec::UniformHypergeometric
1593 ) {
1594 if calibration.profile.is_some() {
1595 report.error(
1596 "calibration_profile_unexpected",
1597 "uniform_hypergeometric calibration must not include a profile artifact",
1598 );
1599 }
1600 return;
1601 }
1602
1603 let Some(profile) = &calibration.profile else {
1604 report.error(
1605 "calibration_profile_required",
1606 "non-uniform calibration requires a profile artifact",
1607 );
1608 return;
1609 };
1610
1611 report.calibration.profile_manifest_path = Some(profile.path.clone());
1612 if profile.path.trim().is_empty() {
1613 report.error(
1614 "calibration_profile_path_empty",
1615 "calibration.profile.path must be non-empty",
1616 );
1617 }
1618 if !is_sha256_hex(&profile.sha256) {
1619 report.error(
1620 "calibration_profile_sha256_invalid",
1621 "calibration.profile.sha256 must be a lowercase 64-character hex SHA-256 digest",
1622 );
1623 }
1624 if profile.file_size_bytes == 0 {
1625 report.error(
1626 "calibration_profile_file_size_zero",
1627 "calibration.profile.file_size_bytes must be greater than zero",
1628 );
1629 }
1630 if profile.dim != artifact.dim {
1631 report.error(
1632 "calibration_profile_dim_mismatch",
1633 format!(
1634 "calibration profile dim {} does not match artifact.dim {}",
1635 profile.dim, artifact.dim
1636 ),
1637 );
1638 }
1639 if profile.sample_count == 0 {
1640 report.error(
1641 "calibration_profile_sample_count_zero",
1642 "calibration.profile.sample_count must be greater than zero",
1643 );
1644 }
1645 validate_optional_source_digest(profile.source_digest.as_deref(), report);
1646 validate_calibration_parameterization(calibration, profile, report);
1647 validate_calibration_profile_shape(profile, &calibration.ordinalization, report);
1648
1649 if !profile.path.trim().is_empty() {
1650 let path = PathBuf::from(&profile.path);
1651 if let Some(resolved) = resolve_existing_path(
1652 &path,
1653 base_dir,
1654 options,
1655 "calibration_profile",
1656 &mut report.errors,
1657 ) {
1658 report.calibration.profile_canonical_path =
1659 Some(path_to_display(&resolved.canonical_path));
1660 match sha256_file(&resolved.resolved_path) {
1661 Ok(hash) => {
1662 report.calibration.profile_sha256 = Some(hash.sha256.clone());
1663 report.calibration.profile_size_bytes = Some(hash.size_bytes);
1664 if !hex_digest_eq(&hash.sha256, &profile.sha256) {
1665 report.error(
1666 "calibration_profile_sha256_mismatch",
1667 format!(
1668 "calibration profile SHA-256 was {}, manifest declares {}",
1669 hash.sha256, profile.sha256
1670 ),
1671 );
1672 }
1673 if hash.size_bytes != profile.file_size_bytes {
1674 report.error(
1675 "calibration_profile_file_size_mismatch",
1676 format!(
1677 "calibration profile size was {}, manifest declares {}",
1678 hash.size_bytes, profile.file_size_bytes
1679 ),
1680 );
1681 }
1682 }
1683 Err(err) => report.error(
1684 "calibration_profile_hash_failed",
1685 format!("failed to hash calibration profile: {err}"),
1686 ),
1687 }
1688 }
1689 }
1690}
1691
1692fn validate_optional_source_digest(value: Option<&str>, report: &mut VerificationReport) {
1693 let Some(value) = value else {
1694 return;
1695 };
1696 let Some(digest) = value.strip_prefix("sha256:") else {
1697 report.error(
1698 "calibration_profile_source_digest_invalid",
1699 "calibration.profile.source_digest must be sha256:<lowercase-hex>",
1700 );
1701 return;
1702 };
1703 if !is_sha256_hex(digest) {
1704 report.error(
1705 "calibration_profile_source_digest_invalid",
1706 "calibration.profile.source_digest must be sha256:<lowercase-hex>",
1707 );
1708 }
1709}
1710
1711fn validate_calibration_parameterization(
1712 calibration: &CalibrationProfileRef,
1713 profile: &ProfileArtifactRef,
1714 report: &mut VerificationReport,
1715) {
1716 match &calibration.null_model {
1717 NullModelSpec::WeightedMarginalProfile { parameterization }
1718 if *parameterization != profile.parameterization =>
1719 {
1720 report.error(
1721 "calibration_null_parameterization_mismatch",
1722 format!(
1723 "null_model parameterization {:?} does not match profile parameterization {:?}",
1724 parameterization, profile.parameterization
1725 ),
1726 );
1727 }
1728 NullModelSpec::EmpiricalTailTable { .. }
1729 if profile.parameterization != ProfileParameterization::EmpiricalTailTable =>
1730 {
1731 report.error(
1732 "calibration_null_parameterization_mismatch",
1733 "empirical_tail_table null_model requires empirical_tail_table profile parameterization",
1734 );
1735 }
1736 _ => {}
1737 }
1738 if !profile_parameterization_matches_ordinalization(
1739 profile.parameterization,
1740 &calibration.ordinalization,
1741 ) {
1742 report.error(
1743 "calibration_profile_parameterization_ordinalization_mismatch",
1744 "calibration profile parameterization is incompatible with calibration ordinalization",
1745 );
1746 }
1747}
1748
1749fn profile_parameterization_matches_ordinalization(
1750 parameterization: ProfileParameterization,
1751 ordinalization: &CalibrationOrdinalization,
1752) -> bool {
1753 match ordinalization {
1754 CalibrationOrdinalization::TopK { .. } => matches!(
1755 parameterization,
1756 ProfileParameterization::MarginalTopKFrequency
1757 | ProfileParameterization::EmpiricalTailTable
1758 ),
1759 CalibrationOrdinalization::Bucket { .. } => matches!(
1760 parameterization,
1761 ProfileParameterization::BucketFrequency | ProfileParameterization::EmpiricalTailTable
1762 ),
1763 CalibrationOrdinalization::Sign { .. } => matches!(
1764 parameterization,
1765 ProfileParameterization::SignFrequency | ProfileParameterization::EmpiricalTailTable
1766 ),
1767 CalibrationOrdinalization::RankPosition { .. } => matches!(
1768 parameterization,
1769 ProfileParameterization::RankPositionFrequency
1770 | ProfileParameterization::EmpiricalTailTable
1771 ),
1772 CalibrationOrdinalization::CallerDefined { .. } => true,
1773 }
1774}
1775
1776fn validate_calibration_profile_shape(
1777 profile: &ProfileArtifactRef,
1778 ordinalization: &CalibrationOrdinalization,
1779 report: &mut VerificationReport,
1780) {
1781 if profile.format.trim().is_empty() {
1782 report.error(
1783 "calibration_profile_format_empty",
1784 "calibration.profile.format must be non-empty",
1785 );
1786 }
1787
1788 if profile.shape.is_empty() {
1789 return;
1790 }
1791
1792 if let Some(expected) = expected_profile_shape(profile.parameterization, ordinalization) {
1793 if profile.shape != expected {
1794 report.error(
1795 "calibration_profile_shape_mismatch",
1796 format!(
1797 "calibration profile shape {:?} does not match expected {:?}",
1798 profile.shape, expected
1799 ),
1800 );
1801 }
1802 }
1803
1804 let bytes_per_value = match profile.format.as_str() {
1805 "raw_f64_le" => Some(8u64),
1806 "raw_f32_le" => Some(4u64),
1807 _ => None,
1808 };
1809 let Some(bytes_per_value) = bytes_per_value else {
1810 return;
1811 };
1812 let Some(values) = profile
1813 .shape
1814 .iter()
1815 .try_fold(1u64, |acc, value| acc.checked_mul(*value as u64))
1816 else {
1817 report.error(
1818 "calibration_profile_shape_mismatch",
1819 "calibration.profile.shape product overflows u64",
1820 );
1821 return;
1822 };
1823 let Some(expected_bytes) = values.checked_mul(bytes_per_value) else {
1824 report.error(
1825 "calibration_profile_shape_mismatch",
1826 "calibration.profile.shape byte size overflows u64",
1827 );
1828 return;
1829 };
1830 if profile.file_size_bytes != expected_bytes {
1831 report.error(
1832 "calibration_profile_file_size_mismatch",
1833 format!(
1834 "calibration profile size {} does not match shape/format size {}",
1835 profile.file_size_bytes, expected_bytes
1836 ),
1837 );
1838 }
1839}
1840
1841fn expected_profile_shape(
1842 parameterization: ProfileParameterization,
1843 ordinalization: &CalibrationOrdinalization,
1844) -> Option<Vec<usize>> {
1845 match parameterization {
1846 ProfileParameterization::MarginalTopKFrequency => Some(vec![ordinalization.dim()]),
1847 ProfileParameterization::SignFrequency => Some(vec![ordinalization.dim()]),
1848 ProfileParameterization::BucketFrequency => match ordinalization {
1849 CalibrationOrdinalization::Bucket { dim, bits } if matches!(*bits, 1 | 2 | 4) => {
1850 Some(vec![*dim, 1usize << *bits])
1851 }
1852 _ => None,
1853 },
1854 ProfileParameterization::RankPositionFrequency => {
1855 Some(vec![ordinalization.dim(), ordinalization.dim()])
1856 }
1857 ProfileParameterization::EmpiricalTailTable => None,
1858 }
1859}
1860
1861fn verify_auxiliary_artifacts(
1862 document: &ManifestDocument,
1863 options: &VerifyOptions,
1864 report: &mut VerificationReport,
1865 paths: &mut VerificationPathCapture,
1866) {
1867 if !check_auxiliary_artifact_count(&document.manifest, &options.limits, report) {
1868 return;
1869 }
1870 let artifacts = auxiliary_artifacts_in_report_order(&document.manifest);
1871 let base_canonical = if options.allow_path_escape {
1872 None
1873 } else {
1874 match fs::canonicalize(&document.base_dir) {
1875 Ok(path) => Some(path),
1876 Err(err) => {
1877 for artifact in artifacts {
1878 let mut entry = auxiliary_artifact_report_entry(artifact, &document.base_dir);
1879 if artifact.path.trim().is_empty() {
1880 mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty");
1881 } else {
1882 report.error(
1883 "auxiliary_artifact_base_dir_unavailable",
1884 format!(
1885 "failed to canonicalize base_dir {} for auxiliary artifact {:?}: {err}",
1886 document.base_dir.display(),
1887 artifact.name
1888 ),
1889 );
1890 mark_auxiliary_artifact_failed(
1891 &mut entry,
1892 "auxiliary_artifact_base_dir_unavailable",
1893 );
1894 }
1895 report.auxiliary_artifacts.push(entry);
1896 }
1897 return;
1898 }
1899 }
1900 };
1901
1902 for artifact in artifacts {
1903 let mut entry = auxiliary_artifact_report_entry(artifact, &document.base_dir);
1904 let mut captured_path = None;
1905
1906 if artifact.path.trim().is_empty() {
1907 mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty");
1908 report.auxiliary_artifacts.push(entry);
1909 paths.auxiliary_artifact_paths.push(None);
1910 continue;
1911 }
1912
1913 match resolve_auxiliary_artifact_path(
1914 artifact,
1915 &document.base_dir,
1916 base_canonical.as_deref(),
1917 options,
1918 report,
1919 ) {
1920 AuxiliaryPathResolution::Resolved(resolved) => {
1921 captured_path = Some(resolved.canonical_path.clone());
1922 entry.canonical_path = Some(path_to_display(&resolved.canonical_path));
1923 match sha256_file_bounded(
1924 &resolved.resolved_path,
1925 options.limits.max_auxiliary_artifact_bytes,
1926 "auxiliary_artifact_file_too_large",
1927 "auxiliary artifact",
1928 ) {
1929 Ok(hash) => {
1930 entry.sha256 = Some(hash.sha256.clone());
1931 entry.size_bytes = Some(hash.size_bytes);
1932 if !hex_digest_eq(&hash.sha256, &artifact.sha256) {
1933 mark_auxiliary_artifact_failed(
1934 &mut entry,
1935 "auxiliary_artifact_sha256_mismatch",
1936 );
1937 report.error(
1938 "auxiliary_artifact_sha256_mismatch",
1939 format!(
1940 "auxiliary artifact {:?} SHA-256 was {}, manifest declares {}",
1941 artifact.name, hash.sha256, artifact.sha256
1942 ),
1943 );
1944 }
1945 if hash.size_bytes != artifact.file_size_bytes {
1946 mark_auxiliary_artifact_failed(
1947 &mut entry,
1948 "auxiliary_artifact_file_size_mismatch",
1949 );
1950 report.error(
1951 "auxiliary_artifact_file_size_mismatch",
1952 format!(
1953 "auxiliary artifact {:?} size was {}, manifest declares {}",
1954 artifact.name, hash.size_bytes, artifact.file_size_bytes
1955 ),
1956 );
1957 }
1958 if entry.reason_code.is_none() {
1959 entry.state = AuxiliaryArtifactState::Verified;
1960 }
1961 }
1962 Err(err) => {
1963 let code = err.code().unwrap_or("auxiliary_artifact_hash_failed");
1964 mark_auxiliary_artifact_failed(&mut entry, code);
1965 let message = if err.code().is_some() {
1966 err.to_string()
1967 } else {
1968 format!(
1969 "failed to hash auxiliary artifact {:?}: {err}",
1970 artifact.name
1971 )
1972 };
1973 report.error(code, message);
1974 }
1975 }
1976 }
1977 AuxiliaryPathResolution::OptionalAbsent => {
1978 entry.state = AuxiliaryArtifactState::OptionalAbsent;
1979 entry.reason_code = Some("auxiliary_artifact_optional_absent".to_string());
1980 }
1981 AuxiliaryPathResolution::MissingRequired => {
1982 entry.state = AuxiliaryArtifactState::MissingRequired;
1983 entry.reason_code = Some("auxiliary_artifact_missing_required".to_string());
1984 }
1985 AuxiliaryPathResolution::Failed(code) => {
1986 entry.state = AuxiliaryArtifactState::Failed;
1987 entry.reason_code = Some(code);
1988 }
1989 }
1990
1991 report.auxiliary_artifacts.push(entry);
1992 paths.auxiliary_artifact_paths.push(captured_path);
1993 }
1994}
1995
1996fn auxiliary_artifact_report_entry(
1997 artifact: &AuxiliaryArtifact,
1998 base_dir: &Path,
1999) -> AuxiliaryArtifactReport {
2000 let resolved_path = if artifact.path.trim().is_empty() {
2001 None
2002 } else {
2003 Some(path_to_display(&auxiliary_artifact_resolved_path(
2004 artifact, base_dir,
2005 )))
2006 };
2007 AuxiliaryArtifactReport {
2008 name: artifact.name.clone(),
2009 manifest_path: artifact.path.clone(),
2010 resolved_path,
2011 canonical_path: None,
2012 expected_sha256: Some(artifact.sha256.clone()),
2013 expected_size_bytes: Some(artifact.file_size_bytes),
2014 required: artifact.required,
2015 state: AuxiliaryArtifactState::Failed,
2016 reason_code: None,
2017 sha256: None,
2018 size_bytes: None,
2019 }
2020}
2021
2022fn check_auxiliary_artifact_count(
2023 manifest: &IndexManifest,
2024 limits: &ResourceLimits,
2025 report: &mut VerificationReport,
2026) -> bool {
2027 let count = manifest.auxiliary_artifacts.len();
2028 if count <= limits.max_auxiliary_artifacts {
2029 return true;
2030 }
2031 if !report
2032 .errors
2033 .iter()
2034 .any(|issue| issue.code == "auxiliary_artifact_count_limit_exceeded")
2035 {
2036 push_report_issue_bounded(
2037 &mut report.errors,
2038 limits,
2039 "auxiliary_artifact_count_limit_exceeded",
2040 format!(
2041 "auxiliary_artifacts has {count} entries, exceeding max_auxiliary_artifacts={}",
2042 limits.max_auxiliary_artifacts
2043 ),
2044 );
2045 }
2046 false
2047}
2048
2049fn auxiliary_artifacts_in_report_order(manifest: &IndexManifest) -> Vec<&AuxiliaryArtifact> {
2050 let mut artifacts: Vec<_> = manifest.auxiliary_artifacts.iter().collect();
2051 artifacts.sort_by(|left, right| {
2052 left.name
2053 .cmp(&right.name)
2054 .then_with(|| left.path.cmp(&right.path))
2055 .then_with(|| left.required.cmp(&right.required))
2056 });
2057 artifacts
2058}
2059
2060enum AuxiliaryPathResolution {
2061 Resolved(ResolvedPath),
2062 OptionalAbsent,
2063 MissingRequired,
2064 Failed(String),
2065}
2066
2067fn resolve_auxiliary_artifact_path(
2068 artifact: &AuxiliaryArtifact,
2069 base_dir: &Path,
2070 base_canonical: Option<&Path>,
2071 options: &VerifyOptions,
2072 report: &mut VerificationReport,
2073) -> AuxiliaryPathResolution {
2074 let path = Path::new(&artifact.path);
2075 if path.is_absolute() && !options.allow_absolute_paths {
2076 report.error(
2077 "auxiliary_artifact_absolute_path_rejected",
2078 format!(
2079 "absolute auxiliary artifact path {} for {:?} is rejected by default",
2080 path.display(),
2081 artifact.name
2082 ),
2083 );
2084 return AuxiliaryPathResolution::Failed(
2085 "auxiliary_artifact_absolute_path_rejected".to_string(),
2086 );
2087 }
2088
2089 if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) {
2090 report.error(
2091 "auxiliary_artifact_path_escape_rejected",
2092 format!(
2093 "relative auxiliary artifact path {} for {:?} escapes the manifest base",
2094 path.display(),
2095 artifact.name
2096 ),
2097 );
2098 return AuxiliaryPathResolution::Failed(
2099 "auxiliary_artifact_path_escape_rejected".to_string(),
2100 );
2101 }
2102
2103 let resolved_path = auxiliary_artifact_resolved_path(artifact, base_dir);
2104 let canonical_path = match fs::canonicalize(&resolved_path) {
2105 Ok(path) => path,
2106 Err(err) if err.kind() == io::ErrorKind::NotFound && !artifact.required => {
2107 return AuxiliaryPathResolution::OptionalAbsent;
2108 }
2109 Err(err) if err.kind() == io::ErrorKind::NotFound => {
2110 report.error(
2111 "auxiliary_artifact_missing_required",
2112 format!(
2113 "required auxiliary artifact {:?} is missing at {}",
2114 artifact.name,
2115 resolved_path.display()
2116 ),
2117 );
2118 return AuxiliaryPathResolution::MissingRequired;
2119 }
2120 Err(err) => {
2121 report.error(
2122 "auxiliary_artifact_path_unavailable",
2123 format!(
2124 "failed to canonicalize auxiliary artifact {:?} at {}: {err}",
2125 artifact.name,
2126 resolved_path.display()
2127 ),
2128 );
2129 return AuxiliaryPathResolution::Failed(
2130 "auxiliary_artifact_path_unavailable".to_string(),
2131 );
2132 }
2133 };
2134
2135 if let Some(base_canonical) = base_canonical {
2136 if !canonical_path.starts_with(base_canonical) {
2137 report.error(
2138 "auxiliary_artifact_path_escape_rejected",
2139 format!(
2140 "canonical auxiliary artifact path {} for {:?} is outside manifest base {}",
2141 canonical_path.display(),
2142 artifact.name,
2143 base_canonical.display()
2144 ),
2145 );
2146 return AuxiliaryPathResolution::Failed(
2147 "auxiliary_artifact_path_escape_rejected".to_string(),
2148 );
2149 }
2150 }
2151
2152 AuxiliaryPathResolution::Resolved(ResolvedPath {
2153 resolved_path,
2154 canonical_path,
2155 })
2156}
2157
2158fn auxiliary_artifact_resolved_path(artifact: &AuxiliaryArtifact, base_dir: &Path) -> PathBuf {
2159 let path = Path::new(&artifact.path);
2160 if path.is_absolute() {
2161 path.to_path_buf()
2162 } else {
2163 base_dir.join(path)
2164 }
2165}
2166
2167fn mark_auxiliary_artifact_failed(entry: &mut AuxiliaryArtifactReport, code: &str) {
2168 entry.state = AuxiliaryArtifactState::Failed;
2169 if entry.reason_code.is_none() {
2170 entry.reason_code = Some(code.to_string());
2171 }
2172}
2173
2174fn verify_attestations(manifest: &IndexManifest, report: &mut VerificationReport) {
2175 if manifest.attestations.is_empty() {
2176 report
2177 .skipped_checks
2178 .push("attestations_absent".to_string());
2179 return;
2180 }
2181
2182 let artifact_sha = report
2183 .artifact
2184 .sha256
2185 .clone()
2186 .unwrap_or_else(|| manifest.artifact.sha256.clone());
2187 let mut any_subject_match = false;
2188 for (idx, attestation) in manifest.attestations.iter().enumerate() {
2189 let predicate_type = attestation
2190 .get("predicateType")
2191 .or_else(|| attestation.get("predicate_type"))
2192 .and_then(serde_json::Value::as_str)
2193 .map(ToOwned::to_owned);
2194 if predicate_type.is_none() {
2195 report.error(
2196 "attestation_predicate_type_missing",
2197 format!("attestation {idx} has no predicateType"),
2198 );
2199 }
2200
2201 let builder_id = attestation
2202 .pointer("/predicate/builder/id")
2203 .or_else(|| attestation.pointer("/predicate/runDetails/builder/id"))
2204 .and_then(serde_json::Value::as_str)
2205 .map(ToOwned::to_owned);
2206
2207 let subject_sha256_matched = attestation
2208 .get("subject")
2209 .and_then(serde_json::Value::as_array)
2210 .is_some_and(|subjects| {
2211 subjects.iter().any(|subject| {
2212 subject
2213 .pointer("/digest/sha256")
2214 .and_then(serde_json::Value::as_str)
2215 .is_some_and(|digest| hex_digest_eq(digest, &artifact_sha))
2216 })
2217 });
2218 any_subject_match |= subject_sha256_matched;
2219 report.attestation_shape_checks.push(AttestationShapeCheck {
2220 predicate_type,
2221 builder_id,
2222 subject_sha256_matched,
2223 });
2224 }
2225
2226 if !any_subject_match {
2227 report.error(
2228 "attestation_subject_sha256_mismatch",
2229 "no supplied attestation subject digest matches the artifact SHA-256",
2230 );
2231 }
2232}
2233
/// Caller-selected switches and limits for manifest verification.
///
/// The derived `Default` leaves every `allow_*` flag `false` and `index_override` unset.
#[derive(Clone, Debug, Default)]
pub struct VerifyOptions {
    /// When `false`, the path resolvers in this file reject absolute paths.
    pub allow_absolute_paths: bool,
    /// When `false`, relative paths that escape the manifest base directory are rejected.
    pub allow_path_escape: bool,
    /// NOTE(review): presumably tolerates repeated database ids in row identity
    /// data; the code that reads this flag is outside this section.
    pub allow_duplicate_db_ids: bool,
    /// NOTE(review): presumably an alternative index location to verify instead of
    /// the manifest-declared one; confirm against the caller.
    pub index_override: Option<PathBuf>,
    /// Resource ceilings applied during verification.
    pub limits: ResourceLimits,
}
2242
/// Resource ceilings used during verification.
///
/// The `Default` implementation fills every field from the matching
/// `DEFAULT_MAX_*` constant declared at the top of this file.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResourceLimits {
    /// Defaults to `DEFAULT_MAX_MANIFEST_BYTES`.
    pub max_manifest_bytes: u64,
    /// Defaults to `DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES`.
    pub max_row_identity_jsonl_line_bytes: usize,
    /// Defaults to `DEFAULT_MAX_ROW_IDENTITY_ROWS`.
    pub max_row_identity_rows: usize,
    /// Defaults to `DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES`.
    pub max_row_identity_tracked_db_id_bytes: usize,
    /// Defaults to `DEFAULT_MAX_AUXILIARY_ARTIFACTS`.
    pub max_auxiliary_artifacts: usize,
    /// Defaults to `DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES`.
    pub max_auxiliary_artifact_bytes: u64,
    /// Defaults to `DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES`.
    pub max_encoder_distortion_profile_bytes: u64,
    /// Defaults to `DEFAULT_MAX_REPORT_ISSUES`.
    pub max_report_issues: usize,
    /// Defaults to `DEFAULT_MAX_CACHED_REPORT_BYTES`.
    pub max_cached_report_bytes: u64,
}
2255
impl Default for ResourceLimits {
    /// Returns limits populated one-to-one from the `DEFAULT_MAX_*` constants.
    fn default() -> Self {
        Self {
            max_manifest_bytes: DEFAULT_MAX_MANIFEST_BYTES,
            max_row_identity_jsonl_line_bytes: DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES,
            max_row_identity_rows: DEFAULT_MAX_ROW_IDENTITY_ROWS,
            max_row_identity_tracked_db_id_bytes: DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES,
            max_auxiliary_artifacts: DEFAULT_MAX_AUXILIARY_ARTIFACTS,
            max_auxiliary_artifact_bytes: DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES,
            max_encoder_distortion_profile_bytes: DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES,
            max_report_issues: DEFAULT_MAX_REPORT_ISSUES,
            max_cached_report_bytes: DEFAULT_MAX_CACHED_REPORT_BYTES,
        }
    }
}
2271
/// A path in both its joined and canonicalized forms.
#[derive(Clone, Debug)]
struct ResolvedPath {
    /// The declared path as given when absolute, otherwise joined onto the base directory.
    resolved_path: PathBuf,
    /// Result of `fs::canonicalize` applied to `resolved_path`.
    canonical_path: PathBuf,
}
2277
/// Filesystem paths captured during verification and consumed by
/// `VerifiedLoadPlan::from_report`.
#[derive(Clone, Debug, Default)]
struct VerificationPathCapture {
    /// Path of the index artifact; plan construction fails with `IncompletePlan` when absent.
    artifact_path: Option<PathBuf>,
    /// Path of the row identity file; required for the `jsonl` row identity kind.
    row_identity_path: Option<PathBuf>,
    /// One slot per entry of `VerificationReport::auxiliary_artifacts`, matched by index.
    auxiliary_artifact_paths: Vec<Option<PathBuf>>,
}
2284
2285fn resolve_existing_path(
2286 path: &Path,
2287 base_dir: &Path,
2288 options: &VerifyOptions,
2289 context: &str,
2290 errors: &mut Vec<ReportIssue>,
2291) -> Option<ResolvedPath> {
2292 if path.is_absolute() && !options.allow_absolute_paths {
2293 errors.push(ReportIssue::new(
2294 format!("{context}_absolute_path_rejected"),
2295 format!("absolute path {} is rejected by default", path.display()),
2296 ));
2297 return None;
2298 }
2299
2300 let base_canonical = match fs::canonicalize(base_dir) {
2301 Ok(path) => path,
2302 Err(err) => {
2303 errors.push(ReportIssue::new(
2304 format!("{context}_base_dir_unavailable"),
2305 format!(
2306 "failed to canonicalize base_dir {}: {err}",
2307 base_dir.display()
2308 ),
2309 ));
2310 return None;
2311 }
2312 };
2313
2314 if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) {
2315 errors.push(ReportIssue::new(
2316 format!("{context}_path_escape_rejected"),
2317 format!("relative path {} escapes the manifest base", path.display()),
2318 ));
2319 return None;
2320 }
2321
2322 let resolved_path = if path.is_absolute() {
2323 path.to_path_buf()
2324 } else {
2325 base_dir.join(path)
2326 };
2327 let canonical_path = match fs::canonicalize(&resolved_path) {
2328 Ok(path) => path,
2329 Err(err) => {
2330 errors.push(ReportIssue::new(
2331 format!("{context}_path_unavailable"),
2332 format!("failed to canonicalize {}: {err}", resolved_path.display()),
2333 ));
2334 return None;
2335 }
2336 };
2337
2338 if !options.allow_path_escape && !canonical_path.starts_with(&base_canonical) {
2339 errors.push(ReportIssue::new(
2340 format!("{context}_path_escape_rejected"),
2341 format!(
2342 "canonical path {} is outside manifest base {}",
2343 canonical_path.display(),
2344 base_canonical.display()
2345 ),
2346 ));
2347 return None;
2348 }
2349
2350 Some(ResolvedPath {
2351 resolved_path,
2352 canonical_path,
2353 })
2354}
2355
/// Reports whether `path`, read purely lexically, would leave the directory it is
/// resolved against.
///
/// A running depth is kept: normal components descend one level, `.` is ignored,
/// and `..` ascends one level. The path escapes as soon as a `..` is met at depth
/// zero, or when it contains a root or prefix component. The filesystem is never
/// consulted, so symlinks are not considered.
fn has_lexical_escape(path: &Path) -> bool {
    let mut levels_below_base: usize = 0;
    path.components().any(|part| match part {
        Component::Prefix(_) | Component::RootDir => true,
        Component::CurDir => false,
        Component::Normal(_) => {
            levels_below_base += 1;
            false
        }
        Component::ParentDir => match levels_below_base.checked_sub(1) {
            Some(remaining) => {
                levels_below_base = remaining;
                false
            }
            // Already at the base: one more `..` climbs out of it.
            None => true,
        },
    })
}
2373
/// Serde default for `AuxiliaryArtifact::required`: a missing field means `true`.
fn default_required() -> bool {
    true
}
2377
/// Serde `skip_serializing_if` predicate: returns the referenced boolean itself.
/// It takes `&bool` because serde passes the field by reference.
fn is_true(value: &bool) -> bool {
    *value
}
2381
/// Top-level manifest document.
///
/// Deserialization rejects unknown fields. Optional sections default to
/// empty/`None` when absent and are omitted from serialized output when
/// empty/`None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct IndexManifest {
    /// NOTE(review): presumably expected to equal `SCHEMA_VERSION`; the check is outside this section.
    pub schema_version: String,
    pub manifest_id: String,
    /// Creation timestamp kept as the raw manifest string.
    pub created_at: String,
    /// The primary index artifact.
    pub artifact: Artifact,
    /// Additional files declared alongside the primary artifact.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub auxiliary_artifacts: Vec<AuxiliaryArtifact>,
    pub embedding: Embedding,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub encoder_distortion: Option<EncoderDistortionProfileRef>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub calibration: Option<CalibrationProfileRef>,
    pub row_identity: RowIdentity,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub build: Option<BuildInfo>,
    /// Attestations kept as untyped JSON; `verify_attestations` inspects their shape.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub attestations: Vec<serde_json::Value>,
    /// Free-form extension values keyed by name, in sorted key order.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub extensions: BTreeMap<String, serde_json::Value>,
}
2404
/// Manifest declaration of the primary index artifact. Unknown fields are rejected.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Artifact {
    /// Declared location of the artifact file.
    pub path: String,
    /// Declared SHA-256 digest; attestation checks fall back to this value when no
    /// digest was observed in the report.
    pub sha256: String,
    pub kind: ManifestIndexKind,
    pub format_version: u8,
    pub dim: usize,
    pub vector_count: usize,
    pub bytes_per_vec: usize,
    pub params: ManifestIndexParams,
    pub file_size_bytes: u64,
}
2418
/// Manifest declaration of one auxiliary artifact. Unknown fields are rejected.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct AuxiliaryArtifact {
    pub name: String,
    /// Declared location; relative paths are joined onto the manifest base directory.
    pub path: String,
    pub sha256: String,
    pub file_size_bytes: u64,
    /// Defaults to `true` when absent and is omitted from serialized output when `true`.
    /// A missing file is an error only for required artifacts.
    #[serde(default = "default_required", skip_serializing_if = "is_true")]
    pub required: bool,
}
2429
/// Manifest description of the embedding the index was built from.
///
/// Unknown fields are rejected. Every `Option` field defaults to `None` when
/// absent and is omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Embedding {
    pub model: String,
    pub dim: usize,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_revision: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokenizer_revision: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pooling: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub corpus_digest: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub embedding_matrix_digest: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalization: Option<String>,
}
2448
/// Manifest reference to a calibration profile.
///
/// Unknown fields are rejected. `Option` fields default to `None` when absent and
/// are omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CalibrationProfileRef {
    /// NOTE(review): presumably expected to equal `CALIBRATION_SCHEMA_VERSION`; confirm in the validator.
    pub schema_version: String,
    pub profile_id: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub created_at: Option<String>,
    pub calibrated_for: EncoderSpec,
    pub ordinalization: CalibrationOrdinalization,
    /// Optional on-disk profile artifact backing this calibration.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub profile: Option<ProfileArtifactRef>,
    pub null_model: NullModelSpec,
}
2462
/// Identifies an encoder by model name and dimension, with optional revision and
/// normalization details.
///
/// Unknown fields are rejected. `Option` fields default to `None` when absent and
/// are omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct EncoderSpec {
    pub model: String,
    pub dim: usize,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_revision: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalization: Option<String>,
}
2473
/// Manifest reference to an encoder distortion profile.
///
/// Unknown fields are rejected. `Option` fields default to `None` when absent and
/// are omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct EncoderDistortionProfileRef {
    /// NOTE(review): presumably expected to equal `ENCODER_DISTORTION_SCHEMA_VERSION`; confirm in the validator.
    pub schema_version: String,
    pub profile_id: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub created_at: Option<String>,
    pub encoder: EncoderSpec,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokenizer_revision: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pooling: Option<String>,
    pub source_metric: MetricSpec,
    pub embedding_metric: MetricSpec,
    pub bounds: DistortionBounds,
    pub scope: DistortionScope,
    pub evidence: DistortionEvidence,
    /// Optional on-disk profile artifact backing this reference.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub profile: Option<DistortionProfileArtifactRef>,
    /// NOTE(review): presumably links to a `CalibrationProfileRef::profile_id`; confirm in the validator.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub calibration_profile_id: Option<String>,
}
2496
/// Names a metric, optionally pinned by version and digest.
///
/// Unknown fields are rejected. `Option` fields default to `None` when absent and
/// are omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MetricSpec {
    pub name: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub digest: Option<String>,
}
2506
/// Numeric distortion figures attached to an encoder distortion profile.
///
/// Every field is optional: it defaults to `None` when absent and is omitted from
/// serialized output when `None`. Unknown fields are rejected. Range or
/// consistency validation, if any, happens outside this section.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DistortionBounds {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub declared_lower_bound: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub declared_upper_bound: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub estimated_distortion: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub violation_rate: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_observed_violation: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub quantile_observed_violation: Option<f64>,
}
2523
/// Describes the data and sampling scope a distortion profile applies to.
///
/// Every field is optional: it defaults to `None` when absent and is omitted from
/// serialized output when `None`. Unknown fields are rejected.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DistortionScope {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub corpus_digest: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub query_set_digest: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pair_sample_digest: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub domain: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sample_size: Option<u64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<f64>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub estimator_version: Option<String>,
}
2544
/// States what kind of evidence backs a distortion profile and, optionally, which
/// estimator produced it.
///
/// Unknown fields are rejected. `Option` fields default to `None` when absent and
/// are omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DistortionEvidence {
    pub kind: DistortionEvidenceKind,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub estimator_id: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub estimator_hash: Option<String>,
}
2554
/// Category of evidence behind a distortion profile.
///
/// Variants serialize as snake_case strings (for example `empirical_sample`), the
/// same spellings `label` returns.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DistortionEvidenceKind {
    Certified,
    EmpiricalSample,
    BenchmarkEstimate,
    TeacherEstimate,
    CallerAsserted,
}
2564
2565impl DistortionEvidenceKind {
2566 pub fn label(&self) -> &'static str {
2567 match self {
2568 Self::Certified => "certified",
2569 Self::EmpiricalSample => "empirical_sample",
2570 Self::BenchmarkEstimate => "benchmark_estimate",
2571 Self::TeacherEstimate => "teacher_estimate",
2572 Self::CallerAsserted => "caller_asserted",
2573 }
2574 }
2575}
2576
/// Manifest declaration of the file backing an encoder distortion profile.
///
/// Unknown fields are rejected. `source_digest` defaults to `None` when absent and
/// is omitted from serialized output when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DistortionProfileArtifactRef {
    pub path: String,
    pub sha256: String,
    pub file_size_bytes: u64,
    pub format: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_digest: Option<String>,
}
2587
/// Ordinalization scheme a calibration profile was computed for.
///
/// Serialized as an internally tagged object: the `kind` field holds the
/// snake_case variant name (for example `top_k`) and the remaining fields sit
/// beside it. Unknown fields are rejected. Every variant carries `dim`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
pub enum CalibrationOrdinalization {
    TopK { dim: usize, k: usize },
    Bucket { dim: usize, bits: u8 },
    Sign { dim: usize },
    RankPosition { dim: usize },
    CallerDefined { dim: usize, name: String },
}
2597
2598impl CalibrationOrdinalization {
2599 pub fn dim(&self) -> usize {
2600 match self {
2601 Self::TopK { dim, .. }
2602 | Self::Bucket { dim, .. }
2603 | Self::Sign { dim }
2604 | Self::RankPosition { dim }
2605 | Self::CallerDefined { dim, .. } => *dim,
2606 }
2607 }
2608
2609 pub fn label(&self) -> &'static str {
2610 match self {
2611 Self::TopK { .. } => "top_k",
2612 Self::Bucket { .. } => "bucket",
2613 Self::Sign { .. } => "sign",
2614 Self::RankPosition { .. } => "rank_position",
2615 Self::CallerDefined { .. } => "caller_defined",
2616 }
2617 }
2618}
2619
/// Manifest declaration of the file backing a calibration profile.
///
/// Unknown fields are rejected. `shape` defaults to empty and `source_digest` to
/// `None` when absent; both are omitted from serialized output when empty/`None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ProfileArtifactRef {
    pub path: String,
    pub sha256: String,
    pub file_size_bytes: u64,
    pub dim: usize,
    pub sample_count: usize,
    pub parameterization: ProfileParameterization,
    pub format: String,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub shape: Vec<usize>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_digest: Option<String>,
}
2635
/// How a calibration profile's contents are parameterized.
///
/// Variants serialize as snake_case strings.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProfileParameterization {
    // Explicit rename: the wire name is pinned to `marginal_topk_frequency` rather
    // than whatever `rename_all` would derive from `TopK`.
    #[serde(rename = "marginal_topk_frequency")]
    MarginalTopKFrequency,
    BucketFrequency,
    SignFrequency,
    RankPositionFrequency,
    EmpiricalTailTable,
}
2646
/// Null model a calibration profile was fitted against.
///
/// Serialized as an internally tagged object: the `kind` field holds the
/// snake_case variant name and variant fields sit beside it. Unknown fields are
/// rejected.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
pub enum NullModelSpec {
    UniformHypergeometric,
    WeightedMarginalProfile {
        parameterization: ProfileParameterization,
    },
    EmpiricalTailTable {
        statistic: String,
    },
    CallerDefined {
        name: String,
        /// Defaults to `None` when absent and is omitted from serialized output when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        parameterization: Option<String>,
    },
}
2663
2664impl NullModelSpec {
2665 pub fn label(&self) -> &'static str {
2666 match self {
2667 Self::UniformHypergeometric => "uniform_hypergeometric",
2668 Self::WeightedMarginalProfile { .. } => "weighted_marginal_profile",
2669 Self::EmpiricalTailTable { .. } => "empirical_tail_table",
2670 Self::CallerDefined { .. } => "caller_defined",
2671 }
2672 }
2673}
2674
/// Optional build provenance recorded in the manifest.
///
/// Unknown fields are rejected. Only `invocation_id` is mandatory; every `Option`
/// field defaults to `None` when absent and is omitted from serialized output
/// when `None`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct BuildInfo {
    pub invocation_id: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub builder_id: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_repo: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_commit: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ci_provider: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ci_run_id: Option<String>,
}
2690
/// How index rows map to external identifiers.
///
/// Serialized as an internally tagged object whose `kind` field is
/// `row_id_identity` or `jsonl`. Unknown fields are rejected.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
pub enum RowIdentity {
    /// No side file; only a row count is declared.
    RowIdIdentity {
        row_count: usize,
    },
    /// Identity data lives in a separate file at `path`.
    Jsonl {
        path: String,
        sha256: String,
        row_count: usize,
        id_kind: String,
        /// Defaults to `None` when absent and is omitted from serialized output when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        db: Option<RowIdentityDb>,
    },
}
2706
2707impl RowIdentity {
2708 pub fn row_count(&self) -> usize {
2709 match self {
2710 Self::RowIdIdentity { row_count } | Self::Jsonl { row_count, .. } => *row_count,
2711 }
2712 }
2713}
2714
/// Optional pointer to a database source for row identifiers.
///
/// Every field is optional: it defaults to `None` when absent and is omitted from
/// serialized output when `None`. Unknown fields are rejected.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct RowIdentityDb {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub path: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub table: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub id_column: Option<String>,
}
2725
/// Manifest-side mirror of the core index kind enum.
///
/// Variants serialize as snake_case strings (for example `rank_quant`).
#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ManifestIndexKind {
    Rank,
    RankQuant,
    Bitmap,
    SignBitmap,
}
2734
impl ManifestIndexKind {
    /// Converts the core crate's index kind into its manifest counterpart,
    /// variant for variant.
    fn from_core(kind: CoreIndexKind) -> Self {
        match kind {
            CoreIndexKind::Rank => Self::Rank,
            CoreIndexKind::RankQuant => Self::RankQuant,
            CoreIndexKind::Bitmap => Self::Bitmap,
            CoreIndexKind::SignBitmap => Self::SignBitmap,
        }
    }
}
2745
/// Manifest-side mirror of the core index parameters.
///
/// Serialized as an internally tagged object: the `kind` field holds the
/// snake_case variant name and any parameters sit beside it. Unknown fields are
/// rejected.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
pub enum ManifestIndexParams {
    Rank,
    RankQuant { bits: u8 },
    Bitmap { n_top: usize },
    SignBitmap,
}
2754
impl ManifestIndexParams {
    /// Converts the core crate's index parameters into their manifest counterpart,
    /// carrying `bits` and `n_top` across unchanged.
    fn from_core(params: CoreIndexParams) -> Self {
        match params {
            CoreIndexParams::Rank => Self::Rank,
            CoreIndexParams::RankQuant { bits } => Self::RankQuant { bits },
            CoreIndexParams::Bitmap { n_top } => Self::Bitmap { n_top },
            CoreIndexParams::SignBitmap => Self::SignBitmap,
        }
    }
}
2765
/// Result of a successful verification: the paths and metadata a loader needs,
/// together with the report that justified them.
///
/// `from_report` is the only constructor in this file and refuses any report
/// whose `ok` flag is `false`.
#[derive(Clone, Debug)]
pub struct VerifiedLoadPlan {
    /// Source path of the manifest document, when it had one.
    manifest_path: Option<PathBuf>,
    /// Artifact path captured during verification.
    artifact_path: PathBuf,
    /// Artifact metadata copied from the report.
    metadata: MetadataReport,
    row_identity: VerifiedRowIdentityPlan,
    /// One plan entry per auxiliary artifact in the report, in report order.
    auxiliary_artifacts: Vec<VerifiedAuxiliaryArtifactPlan>,
    report: VerificationReport,
}
2784
2785impl VerifiedLoadPlan {
2786 fn from_report(
2787 document: &ManifestDocument,
2788 report: VerificationReport,
2789 paths: VerificationPathCapture,
2790 ) -> Result<Self, VerifiedLoadPlanError> {
2791 if !report.ok {
2792 return Err(VerifiedLoadPlanError::VerificationFailed(Box::new(report)));
2793 }
2794
2795 let artifact_path =
2796 paths
2797 .artifact_path
2798 .clone()
2799 .ok_or_else(|| VerifiedLoadPlanError::IncompletePlan {
2800 report: Box::new(report.clone()),
2801 message: "verified report is missing the captured artifact path".to_string(),
2802 })?;
2803 let metadata = report.artifact.metadata.clone().ok_or_else(|| {
2804 VerifiedLoadPlanError::IncompletePlan {
2805 report: Box::new(report.clone()),
2806 message: "verified report is missing probed artifact metadata".to_string(),
2807 }
2808 })?;
2809 let row_identity =
2810 VerifiedRowIdentityPlan::from_report(paths.row_identity_path.as_ref(), &report)?;
2811 let auxiliary_artifacts = report
2812 .auxiliary_artifacts
2813 .iter()
2814 .enumerate()
2815 .map(|(idx, entry)| {
2816 VerifiedAuxiliaryArtifactPlan::from_report(
2817 entry,
2818 paths
2819 .auxiliary_artifact_paths
2820 .get(idx)
2821 .and_then(|path| path.as_ref()),
2822 &report,
2823 )
2824 })
2825 .collect::<Result<Vec<_>, _>>()?;
2826
2827 Ok(Self {
2828 manifest_path: document.source_path.clone(),
2829 artifact_path,
2830 metadata,
2831 row_identity,
2832 auxiliary_artifacts,
2833 report,
2834 })
2835 }
2836
2837 pub fn manifest_path(&self) -> Option<&Path> {
2838 self.manifest_path.as_deref()
2839 }
2840
2841 pub fn artifact_path(&self) -> &Path {
2847 &self.artifact_path
2848 }
2849
2850 pub fn metadata(&self) -> &MetadataReport {
2851 &self.metadata
2852 }
2853
2854 pub fn row_identity(&self) -> &VerifiedRowIdentityPlan {
2855 &self.row_identity
2856 }
2857
2858 pub fn auxiliary_artifacts(&self) -> &[VerifiedAuxiliaryArtifactPlan] {
2859 &self.auxiliary_artifacts
2860 }
2861
2862 pub fn auxiliary_by_name(&self, name: &str) -> Option<&VerifiedAuxiliaryArtifactPlan> {
2863 let name = name.trim();
2864 self.auxiliary_artifacts
2865 .iter()
2866 .find(|artifact| artifact.name().trim() == name)
2867 }
2868
2869 pub fn require_auxiliary(&self, name: &str) -> Result<&Path, RequireAuxiliaryError> {
2870 let artifact = self.auxiliary_by_name(name).ok_or_else(|| {
2871 RequireAuxiliaryError::MissingDeclaration {
2872 name: name.to_string(),
2873 }
2874 })?;
2875 artifact
2876 .path()
2877 .ok_or_else(|| RequireAuxiliaryError::NotLoadable {
2878 name: name.to_string(),
2879 state: artifact.state(),
2880 reason_code: artifact.reason_code().map(ToOwned::to_owned),
2881 })
2882 }
2883
2884 pub fn report(&self) -> &VerificationReport {
2885 &self.report
2886 }
2887
2888 pub fn into_report(self) -> VerificationReport {
2889 self.report
2890 }
2891}
2892
/// Error returned by `VerifiedLoadPlan::require_auxiliary`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum RequireAuxiliaryError {
    /// No auxiliary artifact with the requested name exists in the plan.
    MissingDeclaration {
        name: String,
    },
    /// The artifact exists in the plan but has no path to load from.
    NotLoadable {
        name: String,
        state: AuxiliaryArtifactState,
        reason_code: Option<String>,
    },
}
2904
2905impl fmt::Display for RequireAuxiliaryError {
2906 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2907 match self {
2908 Self::MissingDeclaration { name } => {
2909 write!(f, "required auxiliary artifact {name:?} is not declared")
2910 }
2911 Self::NotLoadable {
2912 name,
2913 state,
2914 reason_code,
2915 } => {
2916 write!(
2917 f,
2918 "required auxiliary artifact {name:?} is not loadable: state={state:?}"
2919 )?;
2920 if let Some(reason_code) = reason_code {
2921 write!(f, ", reason_code={reason_code}")?;
2922 }
2923 Ok(())
2924 }
2925 }
2926 }
2927}
2928
// Marker impl: the error has no underlying source, so the trait defaults suffice.
impl std::error::Error for RequireAuxiliaryError {}
2930
/// Row identity portion of a `VerifiedLoadPlan`.
#[derive(Clone, Debug)]
pub struct VerifiedRowIdentityPlan {
    /// Either `"row_id_identity"` or `"jsonl"`; `from_report` rejects anything else.
    kind: String,
    /// Captured path of the row identity file; `None` for the `row_id_identity` kind.
    path: Option<PathBuf>,
    row_count: usize,
    /// Copied from the report's row identity section.
    validated_rows: Option<usize>,
    /// Copied from the report's row identity section.
    sha256: Option<String>,
}
2939
2940impl VerifiedRowIdentityPlan {
2941 fn from_report(
2942 captured_path: Option<&PathBuf>,
2943 report: &VerificationReport,
2944 ) -> Result<Self, VerifiedLoadPlanError> {
2945 let kind = report.row_identity.kind.clone().ok_or_else(|| {
2946 VerifiedLoadPlanError::IncompletePlan {
2947 report: Box::new(report.clone()),
2948 message: "verified report is missing row identity kind".to_string(),
2949 }
2950 })?;
2951 let row_count =
2952 report
2953 .row_identity
2954 .row_count
2955 .ok_or_else(|| VerifiedLoadPlanError::IncompletePlan {
2956 report: Box::new(report.clone()),
2957 message: "verified report is missing row identity row count".to_string(),
2958 })?;
2959 let path = match kind.as_str() {
2960 "row_id_identity" => None,
2961 "jsonl" => Some(captured_path.cloned().ok_or_else(|| {
2962 VerifiedLoadPlanError::IncompletePlan {
2963 report: Box::new(report.clone()),
2964 message: "verified report is missing the captured row identity path"
2965 .to_string(),
2966 }
2967 })?),
2968 _ => {
2969 return Err(VerifiedLoadPlanError::IncompletePlan {
2970 report: Box::new(report.clone()),
2971 message: format!("verified report has unsupported row identity kind {kind:?}"),
2972 });
2973 }
2974 };
2975
2976 Ok(Self {
2977 kind,
2978 path,
2979 row_count,
2980 validated_rows: report.row_identity.validated_rows,
2981 sha256: report.row_identity.sha256.clone(),
2982 })
2983 }
2984
2985 pub fn kind(&self) -> &str {
2986 &self.kind
2987 }
2988
2989 pub fn path(&self) -> Option<&Path> {
2990 self.path.as_deref()
2991 }
2992
2993 pub fn row_count(&self) -> usize {
2994 self.row_count
2995 }
2996
2997 pub fn validated_rows(&self) -> Option<usize> {
2998 self.validated_rows
2999 }
3000
3001 pub fn sha256(&self) -> Option<&str> {
3002 self.sha256.as_deref()
3003 }
3004}
3005
/// One auxiliary artifact entry inside a `VerifiedLoadPlan`.
#[derive(Clone, Debug)]
pub struct VerifiedAuxiliaryArtifactPlan {
    name: String,
    /// Captured path for `Verified` entries; `None` for `OptionalAbsent` entries.
    path: Option<PathBuf>,
    required: bool,
    /// `Verified` or `OptionalAbsent`; `from_report` rejects the other states.
    state: AuxiliaryArtifactState,
    reason_code: Option<String>,
    sha256: Option<String>,
    size_bytes: Option<u64>,
}
3016
3017impl VerifiedAuxiliaryArtifactPlan {
3018 fn from_report(
3019 entry: &AuxiliaryArtifactReport,
3020 captured_path: Option<&PathBuf>,
3021 report: &VerificationReport,
3022 ) -> Result<Self, VerifiedLoadPlanError> {
3023 let path = match entry.state {
3024 AuxiliaryArtifactState::Verified => Some(captured_path.cloned().ok_or_else(|| {
3025 VerifiedLoadPlanError::IncompletePlan {
3026 report: Box::new(report.clone()),
3027 message: format!(
3028 "verified auxiliary artifact {:?} is missing its captured path",
3029 entry.name
3030 ),
3031 }
3032 })?),
3033 AuxiliaryArtifactState::OptionalAbsent => None,
3034 AuxiliaryArtifactState::MissingRequired | AuxiliaryArtifactState::Failed => {
3035 return Err(VerifiedLoadPlanError::IncompletePlan {
3036 report: Box::new(report.clone()),
3037 message: format!(
3038 "verified report contains non-loadable auxiliary artifact {:?}",
3039 entry.name
3040 ),
3041 });
3042 }
3043 };
3044
3045 Ok(Self {
3046 name: entry.name.clone(),
3047 path,
3048 required: entry.required,
3049 state: entry.state,
3050 reason_code: entry.reason_code.clone(),
3051 sha256: entry.sha256.clone(),
3052 size_bytes: entry.size_bytes,
3053 })
3054 }
3055
3056 pub fn name(&self) -> &str {
3057 &self.name
3058 }
3059
3060 pub fn path(&self) -> Option<&Path> {
3061 self.path.as_deref()
3062 }
3063
3064 pub fn required(&self) -> bool {
3065 self.required
3066 }
3067
3068 pub fn state(&self) -> AuxiliaryArtifactState {
3069 self.state
3070 }
3071
3072 pub fn reason_code(&self) -> Option<&str> {
3073 self.reason_code.as_deref()
3074 }
3075
3076 pub fn sha256(&self) -> Option<&str> {
3077 self.sha256.as_deref()
3078 }
3079
3080 pub fn size_bytes(&self) -> Option<u64> {
3081 self.size_bytes
3082 }
3083}
3084
/// Error produced while building a `VerifiedLoadPlan`.
///
/// The report is boxed in both report-carrying variants.
#[derive(Debug)]
pub enum VerifiedLoadPlanError {
    /// A manifest error, converted through `From<ManifestError>`.
    Manifest(ManifestError),
    /// Verification ran and the report's `ok` flag was `false`.
    VerificationFailed(Box<VerificationReport>),
    /// The report was `ok` but lacked data needed to build a plan.
    IncompletePlan {
        report: Box<VerificationReport>,
        message: String,
    },
}
3094
3095impl fmt::Display for VerifiedLoadPlanError {
3096 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
3097 match self {
3098 Self::Manifest(err) => write!(f, "{err}"),
3099 Self::VerificationFailed(report) => {
3100 write!(
3101 f,
3102 "manifest verification failed{}",
3103 report_issue_summary(&report.errors)
3104 )
3105 }
3106 Self::IncompletePlan { message, .. } => f.write_str(message),
3107 }
3108 }
3109}
3110
3111impl std::error::Error for VerifiedLoadPlanError {
3112 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
3113 match self {
3114 Self::Manifest(err) => Some(err),
3115 Self::VerificationFailed(_) | Self::IncompletePlan { .. } => None,
3116 }
3117 }
3118}
3119
3120impl From<ManifestError> for VerifiedLoadPlanError {
3121 fn from(value: ManifestError) -> Self {
3122 Self::Manifest(value)
3123 }
3124}
3125
3126fn report_issue_summary(errors: &[ReportIssue]) -> String {
3127 if errors.is_empty() {
3128 return String::new();
3129 }
3130 let codes = errors
3131 .iter()
3132 .take(3)
3133 .map(|issue| issue.code.as_str())
3134 .collect::<Vec<_>>()
3135 .join(", ");
3136 if errors.len() > 3 {
3137 format!(": {codes}, ...")
3138 } else {
3139 format!(": {codes}")
3140 }
3141}
3142
/// Serializable outcome of verifying a manifest.
///
/// `auxiliary_artifacts` and `encoder_distortion` fall back to their `Default`
/// values when absent from deserialized input.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VerificationReport {
    /// Overall verdict; `new` initializes it to `false`.
    pub ok: bool,
    /// RFC 3339 UTC timestamp with nanosecond precision, set by `new`.
    pub checked_at: String,
    pub manifest_id: Option<String>,
    pub artifact: ArtifactReport,
    #[serde(default)]
    pub auxiliary_artifacts: Vec<AuxiliaryArtifactReport>,
    pub row_identity: RowIdentityReport,
    #[serde(default)]
    pub encoder_distortion: EncoderDistortionReport,
    pub calibration: CalibrationReport,
    /// One entry per attestation inspected by `verify_attestations`.
    pub attestation_shape_checks: Vec<AttestationShapeCheck>,
    pub errors: Vec<ReportIssue>,
    pub warnings: Vec<ReportIssue>,
    /// Names of checks that were skipped, such as `"attestations_absent"`.
    pub skipped_checks: Vec<String>,
}
3160
3161impl VerificationReport {
3162 fn new(manifest_id: Option<String>) -> Self {
3163 Self {
3164 ok: false,
3165 checked_at: Utc::now().to_rfc3339_opts(SecondsFormat::Nanos, true),
3166 manifest_id,
3167 artifact: ArtifactReport::default(),
3168 auxiliary_artifacts: Vec::new(),
3169 row_identity: RowIdentityReport::default(),
3170 encoder_distortion: EncoderDistortionReport::default(),
3171 calibration: CalibrationReport::default(),
3172 attestation_shape_checks: Vec::new(),
3173 errors: Vec::new(),
3174 warnings: Vec::new(),
3175 skipped_checks: Vec::new(),
3176 }
3177 }
3178
3179 fn error(&mut self, code: impl Into<String>, message: impl Into<String>) {
3180 self.errors.push(ReportIssue::new(code, message));
3181 }
3182}
3183
/// Report section for the primary artifact. All fields start as `None` through
/// the derived `Default`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ArtifactReport {
    pub manifest_path: Option<String>,
    pub observed_path: Option<String>,
    pub canonical_path: Option<String>,
    /// When present, attestation subject digests are compared against this value.
    pub sha256: Option<String>,
    pub size_bytes: Option<u64>,
    /// Artifact metadata; a `VerifiedLoadPlan` cannot be built without it.
    pub metadata: Option<MetadataReport>,
}
3193
3194#[derive(Clone, Debug, Serialize, Deserialize)]
3195pub struct AuxiliaryArtifactReport {
3196 pub name: String,
3197 pub manifest_path: String,
3198 #[serde(default)]
3199 pub resolved_path: Option<String>,
3200 #[serde(default)]
3201 pub canonical_path: Option<String>,
3202 #[serde(default)]
3203 pub expected_sha256: Option<String>,
3204 #[serde(default)]
3205 pub expected_size_bytes: Option<u64>,
3206 pub required: bool,
3207 pub state: AuxiliaryArtifactState,
3208 pub reason_code: Option<String>,
3209 pub sha256: Option<String>,
3210 pub size_bytes: Option<u64>,
3211}
3212
3213#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
3214#[serde(rename_all = "snake_case")]
3215pub enum AuxiliaryArtifactState {
3216 Verified,
3217 OptionalAbsent,
3218 MissingRequired,
3219 Failed,
3220}
3221
3222#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3223pub struct RowIdentityReport {
3224 pub kind: Option<String>,
3225 pub manifest_path: Option<String>,
3226 pub canonical_path: Option<String>,
3227 pub sha256: Option<String>,
3228 pub row_count: Option<usize>,
3229 pub validated_rows: Option<usize>,
3230}
3231
3232#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3233pub struct EncoderDistortionReport {
3234 pub present: bool,
3235 pub schema_version: Option<String>,
3236 pub profile_id: Option<String>,
3237 pub evidence_kind: Option<String>,
3238 pub source_metric: Option<String>,
3239 pub embedding_metric: Option<String>,
3240 pub profile_manifest_path: Option<String>,
3241 pub profile_canonical_path: Option<String>,
3242 pub profile_sha256: Option<String>,
3243 pub profile_size_bytes: Option<u64>,
3244}
3245
3246#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3247pub struct CalibrationReport {
3248 pub present: bool,
3249 pub schema_version: Option<String>,
3250 pub profile_id: Option<String>,
3251 pub calibrated_for_model: Option<String>,
3252 pub ordinalization: Option<String>,
3253 pub null_model: Option<String>,
3254 pub profile_manifest_path: Option<String>,
3255 pub profile_canonical_path: Option<String>,
3256 pub profile_sha256: Option<String>,
3257 pub profile_size_bytes: Option<u64>,
3258}
3259
3260#[derive(Clone, Debug, Serialize, Deserialize)]
3261pub struct MetadataReport {
3262 pub kind: ManifestIndexKind,
3263 pub format_version: u8,
3264 pub dim: usize,
3265 pub vector_count: usize,
3266 pub bytes_per_vec: usize,
3267 pub params: ManifestIndexParams,
3268 pub file_size_bytes: u64,
3269}
3270
3271impl MetadataReport {
3272 fn from_core(metadata: &CoreIndexMetadata) -> Self {
3273 Self {
3274 kind: ManifestIndexKind::from_core(metadata.kind),
3275 format_version: metadata.format_version,
3276 dim: metadata.dim,
3277 vector_count: metadata.vector_count,
3278 bytes_per_vec: metadata.bytes_per_vec,
3279 params: ManifestIndexParams::from_core(metadata.params),
3280 file_size_bytes: metadata.file_size_bytes,
3281 }
3282 }
3283}
3284
3285#[derive(Clone, Debug, Serialize, Deserialize)]
3286pub struct AttestationShapeCheck {
3287 pub predicate_type: Option<String>,
3288 pub builder_id: Option<String>,
3289 pub subject_sha256_matched: bool,
3290}
3291
3292#[derive(Clone, Debug, Serialize, Deserialize)]
3293pub struct ReportIssue {
3294 pub code: String,
3295 pub message: String,
3296}
3297
3298impl ReportIssue {
3299 pub fn new(code: impl Into<String>, message: impl Into<String>) -> Self {
3300 Self {
3301 code: code.into(),
3302 message: message.into(),
3303 }
3304 }
3305}
3306
3307fn push_report_issue_bounded(
3308 errors: &mut Vec<ReportIssue>,
3309 limits: &ResourceLimits,
3310 code: impl Into<String>,
3311 message: impl Into<String>,
3312) {
3313 let limit = limits.max_report_issues;
3314 if errors.len() < limit {
3315 errors.push(ReportIssue::new(code, message));
3316 return;
3317 }
3318 if errors
3319 .iter()
3320 .any(|issue| issue.code == "verification_report_issue_limit_exceeded")
3321 {
3322 return;
3323 }
3324 let detail_limit = limit.saturating_sub(1);
3325 errors.truncate(detail_limit);
3326 errors.push(ReportIssue::new(
3327 "verification_report_issue_limit_exceeded",
3328 format!("verification report issue count exceeded max_report_issues={limit}"),
3329 ));
3330}
3331
3332fn enforce_report_issue_limit(errors: &mut Vec<ReportIssue>, limits: &ResourceLimits) {
3333 let limit = limits.max_report_issues;
3334 if errors.len() <= limit {
3335 return;
3336 }
3337 errors.retain(|issue| issue.code != "verification_report_issue_limit_exceeded");
3338 let detail_limit = limit.saturating_sub(1);
3339 errors.truncate(detail_limit);
3340 errors.push(ReportIssue::new(
3341 "verification_report_issue_limit_exceeded",
3342 format!("verification report issue count exceeded max_report_issues={limit}"),
3343 ));
3344}
3345
3346#[derive(Clone, Debug, Serialize, Deserialize)]
3347pub struct FileHash {
3348 pub sha256: String,
3349 pub size_bytes: u64,
3350}
3351
3352pub fn sha256_file(path: impl AsRef<Path>) -> io::Result<FileHash> {
3353 let mut file = File::open(path)?;
3354 let mut hasher = Sha256::new();
3355 let mut size_bytes = 0u64;
3356 let mut buf = [0u8; 64 * 1024];
3357 loop {
3358 let n = file.read(&mut buf)?;
3359 if n == 0 {
3360 break;
3361 }
3362 size_bytes += n as u64;
3363 hasher.update(&buf[..n]);
3364 }
3365 Ok(FileHash {
3366 sha256: hex::encode(hasher.finalize()),
3367 size_bytes,
3368 })
3369}
3370
3371pub fn sha256_file_bounded(
3372 path: impl AsRef<Path>,
3373 max_bytes: u64,
3374 code: &'static str,
3375 context: &'static str,
3376) -> Result<FileHash, ManifestError> {
3377 let path = path.as_ref();
3378 let bytes = read_bounded_file(path, max_bytes, code, context)?;
3379 let mut hasher = Sha256::new();
3380 hasher.update(&bytes);
3381 Ok(FileHash {
3382 sha256: hex::encode(hasher.finalize()),
3383 size_bytes: bytes.len() as u64,
3384 })
3385}
3386
/// Row identity source to record when creating a manifest.
#[derive(Clone, Debug)]
pub enum CreateRowIdentity {
    /// No row map file; the manifest records only the index's vector count
    /// as the row count.
    RowIdIdentity,
    /// Path to a JSONL row map that is validated and hashed during creation.
    Jsonl(PathBuf),
}
3392
/// Description of an auxiliary artifact to include in a new manifest.
#[derive(Clone, Debug)]
pub struct CreateAuxiliaryArtifact {
    /// Artifact name; surrounding whitespace is trimmed during creation and
    /// the trimmed name must be non-empty and unique.
    pub name: String,
    /// Location of the artifact file on disk.
    pub path: PathBuf,
    /// Copied verbatim into the manifest entry's `required` flag.
    pub required: bool,
}
3399
3400#[derive(Clone, Debug, Default)]
3401pub struct CreateManifestOptions {
3402 pub allow_absolute_paths: bool,
3403 pub allow_path_escape: bool,
3404 pub limits: ResourceLimits,
3405 pub auxiliary_artifacts: Vec<CreateAuxiliaryArtifact>,
3406}
3407
3408pub fn create_manifest_for_index(
3409 index_path: impl AsRef<Path>,
3410 row_identity: CreateRowIdentity,
3411 embedding_model: impl Into<String>,
3412 out_path: impl AsRef<Path>,
3413) -> Result<IndexManifest, ManifestError> {
3414 create_manifest_for_index_with_options(
3415 index_path,
3416 row_identity,
3417 embedding_model,
3418 out_path,
3419 CreateManifestOptions::default(),
3420 )
3421}
3422
3423pub fn create_manifest_for_index_with_options(
3424 index_path: impl AsRef<Path>,
3425 row_identity: CreateRowIdentity,
3426 embedding_model: impl Into<String>,
3427 out_path: impl AsRef<Path>,
3428 options: CreateManifestOptions,
3429) -> Result<IndexManifest, ManifestError> {
3430 let index_path = index_path.as_ref();
3431 let out_path = out_path.as_ref();
3432 let out_base = out_path
3433 .parent()
3434 .filter(|p| !p.as_os_str().is_empty())
3435 .unwrap_or_else(|| Path::new("."));
3436 if !out_base.exists() {
3437 fs::create_dir_all(out_base)?;
3438 }
3439 let metadata = probe_index_metadata(index_path)?;
3440 let index_hash = sha256_file(index_path)?;
3441 let artifact = Artifact {
3442 path: manifest_path_for_create(index_path, out_base, &options, "artifact")?,
3443 sha256: index_hash.sha256,
3444 kind: ManifestIndexKind::from_core(metadata.kind),
3445 format_version: metadata.format_version,
3446 dim: metadata.dim,
3447 vector_count: metadata.vector_count,
3448 bytes_per_vec: metadata.bytes_per_vec,
3449 params: ManifestIndexParams::from_core(metadata.params),
3450 file_size_bytes: metadata.file_size_bytes,
3451 };
3452
3453 let row_identity = match row_identity {
3454 CreateRowIdentity::RowIdIdentity => RowIdentity::RowIdIdentity {
3455 row_count: metadata.vector_count,
3456 },
3457 CreateRowIdentity::Jsonl(path) => {
3458 let mut row_errors = Vec::new();
3459 let stats = validate_jsonl_rows(
3460 &path,
3461 false,
3462 &options.limits,
3463 Some(metadata.vector_count),
3464 &mut row_errors,
3465 )?;
3466 if !row_errors.is_empty() {
3467 if let Some(issue) = row_errors
3468 .iter()
3469 .find(|issue| is_limit_issue_code(&issue.code))
3470 {
3471 return Err(ManifestError::limit_exceeded(
3472 issue.code.clone(),
3473 issue.message.clone(),
3474 ));
3475 }
3476 let codes = row_errors
3477 .iter()
3478 .map(|issue| issue.code.as_str())
3479 .collect::<Vec<_>>()
3480 .join(", ");
3481 return Err(ManifestError::invalid(format!(
3482 "row map is invalid: {codes}"
3483 )));
3484 }
3485 if stats.row_count != metadata.vector_count {
3486 return Err(ManifestError::invalid(format!(
3487 "row map has {} rows but index has {} vectors",
3488 stats.row_count, metadata.vector_count
3489 )));
3490 }
3491 let row_sha256 = stats.sha256.ok_or_else(|| {
3492 ManifestError::invalid("row map hash unavailable after bounded validation")
3493 })?;
3494 RowIdentity::Jsonl {
3495 path: manifest_path_for_create(&path, out_base, &options, "row identity")?,
3496 sha256: row_sha256,
3497 row_count: stats.row_count,
3498 id_kind: "uuid".to_string(),
3499 db: None,
3500 }
3501 }
3502 };
3503
3504 let auxiliary_artifacts =
3505 create_auxiliary_artifacts(&options.auxiliary_artifacts, out_base, &options)?;
3506
3507 let invocation_id = format!("urn:uuid:{}", Uuid::new_v4());
3508 Ok(IndexManifest {
3509 schema_version: SCHEMA_VERSION.to_string(),
3510 manifest_id: format!("urn:uuid:{}", Uuid::new_v4()),
3511 created_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true),
3512 artifact,
3513 auxiliary_artifacts,
3514 embedding: Embedding {
3515 model: embedding_model.into(),
3516 dim: metadata.dim,
3517 model_revision: None,
3518 tokenizer_revision: None,
3519 pooling: None,
3520 corpus_digest: None,
3521 embedding_matrix_digest: None,
3522 normalization: None,
3523 },
3524 encoder_distortion: None,
3525 calibration: None,
3526 row_identity,
3527 build: Some(BuildInfo {
3528 invocation_id,
3529 builder_id: Some("ordvec-manifest".to_string()),
3530 source_repo: None,
3531 source_commit: None,
3532 ci_provider: None,
3533 ci_run_id: None,
3534 }),
3535 attestations: Vec::new(),
3536 extensions: BTreeMap::new(),
3537 })
3538}
3539
3540fn create_auxiliary_artifacts(
3541 artifacts: &[CreateAuxiliaryArtifact],
3542 out_base: &Path,
3543 options: &CreateManifestOptions,
3544) -> Result<Vec<AuxiliaryArtifact>, ManifestError> {
3545 let count = artifacts.len();
3546 if count > options.limits.max_auxiliary_artifacts {
3547 return Err(ManifestError::limit_exceeded(
3548 "auxiliary_artifact_count_limit_exceeded",
3549 format!(
3550 "auxiliary_artifacts has {count} entries, exceeding max_auxiliary_artifacts={}",
3551 options.limits.max_auxiliary_artifacts
3552 ),
3553 ));
3554 }
3555
3556 let mut names = HashSet::new();
3557 let mut manifest_artifacts = Vec::with_capacity(artifacts.len());
3558 for artifact in artifacts {
3559 let name = artifact.name.trim();
3560 if name.is_empty() {
3561 return Err(ManifestError::invalid(
3562 "auxiliary artifact name must be non-empty",
3563 ));
3564 }
3565 if !names.insert(name.to_string()) {
3566 return Err(ManifestError::invalid(format!(
3567 "auxiliary artifact name {name:?} is duplicated"
3568 )));
3569 }
3570 let hash = sha256_file_bounded(
3571 &artifact.path,
3572 options.limits.max_auxiliary_artifact_bytes,
3573 "auxiliary_artifact_file_too_large",
3574 "auxiliary artifact",
3575 )?;
3576 manifest_artifacts.push(AuxiliaryArtifact {
3577 name: name.to_string(),
3578 path: manifest_path_for_create(
3579 &artifact.path,
3580 out_base,
3581 options,
3582 "auxiliary artifact",
3583 )?,
3584 sha256: hash.sha256,
3585 file_size_bytes: hash.size_bytes,
3586 required: artifact.required,
3587 });
3588 }
3589 Ok(manifest_artifacts)
3590}
3591
3592pub fn write_manifest_file(
3593 manifest: &IndexManifest,
3594 path: impl AsRef<Path>,
3595) -> Result<(), ManifestError> {
3596 let file = File::create(path)?;
3597 serde_json::to_writer_pretty(file, manifest)?;
3598 Ok(())
3599}
3600
/// Counters and digest gathered while scanning a JSONL row map.
#[derive(Clone, Debug)]
struct JsonlStats {
    /// Number of lines read, including a line on which the scan stopped
    /// early.
    row_count: usize,
    /// Number of lines that parsed as a row object.
    validated_rows: usize,
    /// Hex-encoded SHA-256 of the file; `None` when the scan stopped before
    /// reaching end of file.
    sha256: Option<String>,
}
3607
3608#[derive(Debug, Deserialize)]
3609#[serde(deny_unknown_fields)]
3610struct JsonlRow {
3611 row_id: usize,
3612 db_id: String,
3613 #[serde(default)]
3614 parent_id: Option<String>,
3615}
3616
3617fn validate_jsonl_rows(
3618 path: &Path,
3619 allow_duplicate_db_ids: bool,
3620 limits: &ResourceLimits,
3621 expected_row_count: Option<usize>,
3622 errors: &mut Vec<ReportIssue>,
3623) -> io::Result<JsonlStats> {
3624 let file = File::open(path)?;
3625 let mut reader = BufReader::new(file);
3626 let mut hasher = Sha256::new();
3627 let mut seen = HashSet::new();
3628 let mut seen_db_id_bytes = 0usize;
3629 let mut row_count = 0usize;
3630 let mut validated_rows = 0usize;
3631 let mut line = Vec::new();
3632 let mut reached_eof = true;
3633
3634 while let Some(too_long) = read_bounded_line(
3635 &mut reader,
3636 limits.max_row_identity_jsonl_line_bytes,
3637 &mut line,
3638 &mut hasher,
3639 )? {
3640 let line_idx = row_count;
3641 row_count += 1;
3642 if row_count > limits.max_row_identity_rows {
3643 reached_eof = false;
3644 push_report_issue_bounded(
3645 errors,
3646 limits,
3647 "row_identity_row_count_limit_exceeded",
3648 format!(
3649 "row identity file has more than max_row_identity_rows={} rows",
3650 limits.max_row_identity_rows
3651 ),
3652 );
3653 break;
3654 }
3655 if let Some(expected_row_count) = expected_row_count {
3656 if row_count > expected_row_count {
3657 reached_eof = false;
3658 push_report_issue_bounded(
3659 errors,
3660 limits,
3661 "row_identity_row_count_mismatch",
3662 format!(
3663 "row identity file has more than declared row_count={expected_row_count}"
3664 ),
3665 );
3666 break;
3667 }
3668 }
3669 if too_long {
3670 reached_eof = false;
3671 push_report_issue_bounded(
3672 errors,
3673 limits,
3674 "row_identity_line_too_large",
3675 format!(
3676 "line {line_idx} exceeds max_row_identity_jsonl_line_bytes={}",
3677 limits.max_row_identity_jsonl_line_bytes
3678 ),
3679 );
3680 break;
3681 }
3682 trim_jsonl_terminator(&mut line);
3683 let row: JsonlRow = match serde_json::from_slice(&line) {
3684 Ok(row) => row,
3685 Err(err) => {
3686 push_report_issue_bounded(
3687 errors,
3688 limits,
3689 "row_identity_jsonl_invalid_json",
3690 format!("line {line_idx} is not a strict row object: {err}"),
3691 );
3692 continue;
3693 }
3694 };
3695 if row.row_id != line_idx {
3696 push_report_issue_bounded(
3697 errors,
3698 limits,
3699 "row_identity_row_id_mismatch",
3700 format!("line {line_idx} has row_id {}", row.row_id),
3701 );
3702 }
3703 validate_row_id_string("db_id", &row.db_id, line_idx, limits, errors);
3704 if let Some(parent_id) = &row.parent_id {
3705 validate_row_id_string("parent_id", parent_id, line_idx, limits, errors);
3706 }
3707 validated_rows += 1;
3708 if !allow_duplicate_db_ids {
3709 if seen.contains(&row.db_id) {
3710 push_report_issue_bounded(
3711 errors,
3712 limits,
3713 "row_identity_duplicate_db_id",
3714 format!("line {line_idx} repeats db_id"),
3715 );
3716 } else {
3717 let next_seen_db_id_bytes = seen_db_id_bytes.saturating_add(row.db_id.len());
3718 if next_seen_db_id_bytes > limits.max_row_identity_tracked_db_id_bytes {
3719 reached_eof = false;
3720 push_report_issue_bounded(
3721 errors,
3722 limits,
3723 "row_identity_duplicate_tracking_limit_exceeded",
3724 format!(
3725 "tracked db_id bytes exceed max_row_identity_tracked_db_id_bytes={}",
3726 limits.max_row_identity_tracked_db_id_bytes
3727 ),
3728 );
3729 break;
3730 }
3731 seen_db_id_bytes = next_seen_db_id_bytes;
3732 seen.insert(row.db_id);
3733 }
3734 }
3735 }
3736
3737 Ok(JsonlStats {
3738 row_count,
3739 validated_rows,
3740 sha256: reached_eof.then(|| hex::encode(hasher.finalize())),
3741 })
3742}
3743
3744fn read_bounded_line<R: BufRead>(
3745 reader: &mut R,
3746 max_bytes: usize,
3747 out: &mut Vec<u8>,
3748 hasher: &mut Sha256,
3749) -> io::Result<Option<bool>> {
3750 out.clear();
3751 let max_bytes = max_bytes.max(1);
3752
3753 loop {
3754 let available = reader.fill_buf()?;
3755 if available.is_empty() {
3756 return if out.is_empty() {
3757 Ok(None)
3758 } else {
3759 Ok(Some(false))
3760 };
3761 }
3762
3763 let newline = available.iter().position(|byte| *byte == b'\n');
3764 let take_len = newline.map_or(available.len(), |pos| pos + 1);
3765
3766 let remaining = max_bytes.saturating_sub(out.len());
3767 if take_len > remaining {
3768 let consume_len = remaining.saturating_add(1).min(take_len);
3769 if remaining > 0 {
3770 out.extend_from_slice(&available[..remaining]);
3771 }
3772 hasher.update(&available[..consume_len]);
3773 reader.consume(consume_len);
3774 return Ok(Some(true));
3775 }
3776
3777 out.extend_from_slice(&available[..take_len]);
3778 hasher.update(&available[..take_len]);
3779 reader.consume(take_len);
3780 if newline.is_some() {
3781 return Ok(Some(false));
3782 }
3783 }
3784}
3785
/// Strips a trailing line terminator from `line` in place.
///
/// At most one `\n` is removed, and then at most one `\r`. The `\r` is
/// removed even when no `\n` preceded it, so a final line ending in a bare
/// carriage return is trimmed too.
fn trim_jsonl_terminator(line: &mut Vec<u8>) {
    if line.ends_with(b"\n") {
        line.truncate(line.len() - 1);
    }
    if line.ends_with(b"\r") {
        line.truncate(line.len() - 1);
    }
}
3794
3795fn validate_row_id_string(
3796 field: &str,
3797 value: &str,
3798 line_idx: usize,
3799 limits: &ResourceLimits,
3800 errors: &mut Vec<ReportIssue>,
3801) {
3802 let mut structurally_invalid = false;
3803 if value.is_empty() {
3804 structurally_invalid = true;
3805 push_report_issue_bounded(
3806 errors,
3807 limits,
3808 format!("row_identity_{field}_empty"),
3809 format!("line {line_idx} has empty {field}"),
3810 );
3811 }
3812 if value.contains('\0') {
3813 structurally_invalid = true;
3814 push_report_issue_bounded(
3815 errors,
3816 limits,
3817 format!("row_identity_{field}_contains_nul"),
3818 format!("line {line_idx} {field} contains NUL"),
3819 );
3820 }
3821 if !structurally_invalid && Uuid::parse_str(value).is_err() {
3822 push_report_issue_bounded(
3823 errors,
3824 limits,
3825 format!("row_identity_{field}_invalid_uuid"),
3826 format!("line {line_idx} {field} must be a UUID in v1"),
3827 );
3828 }
3829}
3830
/// Returns `true` when `code` is one of the issue codes that signal a
/// resource limit being hit rather than a content problem.
fn is_limit_issue_code(code: &str) -> bool {
    const LIMIT_ISSUE_CODES: [&str; 4] = [
        "row_identity_line_too_large",
        "row_identity_row_count_limit_exceeded",
        "row_identity_duplicate_tracking_limit_exceeded",
        "verification_report_issue_limit_exceeded",
    ];
    LIMIT_ISSUE_CODES.iter().any(|known| *known == code)
}
3840
3841fn manifest_path_for_create(
3842 path: &Path,
3843 base_dir: &Path,
3844 options: &CreateManifestOptions,
3845 context: &str,
3846) -> Result<String, ManifestError> {
3847 let canonical_path = fs::canonicalize(path)?;
3848 let canonical_base = fs::canonicalize(base_dir)?;
3849 if let Ok(relative) = canonical_path.strip_prefix(&canonical_base) {
3850 if !relative.as_os_str().is_empty() {
3851 return Ok(path_to_manifest_string(relative));
3852 }
3853 return Ok(".".to_string());
3854 }
3855
3856 if !options.allow_path_escape {
3857 return Err(ManifestError::invalid(format!(
3858 "{context} path {} is outside manifest directory {}; use --allow-path-escape to create a manifest that requires non-default verification policy",
3859 canonical_path.display(),
3860 canonical_base.display()
3861 )));
3862 }
3863
3864 if let Some(relative) = relative_path_between(&canonical_base, &canonical_path) {
3865 return Ok(path_to_manifest_string(&relative));
3866 }
3867
3868 if options.allow_absolute_paths {
3869 return Ok(path_to_manifest_string(&canonical_path));
3870 }
3871
3872 Err(ManifestError::invalid(format!(
3873 "{context} path {} cannot be expressed relative to manifest directory {}; use --allow-absolute-paths with --allow-path-escape",
3874 canonical_path.display(),
3875 canonical_base.display()
3876 )))
3877}
3878
/// Expresses `target` relative to `base` by comparing path components.
///
/// Returns `None` when the two paths share no leading component, or when the
/// non-shared part of `target` contains a prefix or root component. The
/// result climbs one `..` for every normal component of `base` beyond the
/// shared prefix and then descends into the rest of `target`; it is empty
/// when both paths are identical.
///
/// The comparison is purely lexical: nothing is resolved against the file
/// system.
fn relative_path_between(base: &Path, target: &Path) -> Option<PathBuf> {
    let base_parts: Vec<Component<'_>> = base.components().collect();
    let target_parts: Vec<Component<'_>> = target.components().collect();

    let shared = base_parts
        .iter()
        .zip(target_parts.iter())
        .take_while(|(from, to)| from == to)
        .count();
    if shared == 0 {
        return None;
    }

    let mut relative = PathBuf::new();
    for part in &base_parts[shared..] {
        // Only real directory names require stepping up.
        if let Component::Normal(_) = part {
            relative.push("..");
        }
    }
    for part in &target_parts[shared..] {
        match part {
            Component::Normal(name) => relative.push(name),
            Component::ParentDir => relative.push(".."),
            Component::CurDir => {}
            Component::Prefix(_) | Component::RootDir => return None,
        }
    }
    Some(relative)
}
3910
/// Renders `path` as a manifest path string using `/` as the separator.
///
/// Absolute paths are rendered through `Path::display`, with the platform's
/// separator replaced by `/` only where that separator is not already `/`.
/// On Unix a backslash is an ordinary file-name character, so rewriting it
/// would record a path that names a different file; it is therefore kept,
/// which matches how the relative branch treats component names.
///
/// Relative paths are rebuilt from their components joined by `/`; an empty
/// relative path becomes `"."`. Components that are not valid Unicode are
/// rendered lossily.
fn path_to_manifest_string(path: &Path) -> String {
    if path.is_absolute() {
        let shown = path.display().to_string();
        return if std::path::MAIN_SEPARATOR == '/' {
            shown
        } else {
            shown.replace(std::path::MAIN_SEPARATOR, "/")
        };
    }
    let parts = path
        .components()
        .filter_map(|component| match component {
            Component::Normal(part) => Some(part.to_string_lossy().into_owned()),
            Component::CurDir => Some(".".to_string()),
            Component::ParentDir => Some("..".to_string()),
            Component::Prefix(_) | Component::RootDir => None,
        })
        .collect::<Vec<_>>();
    if parts.is_empty() {
        ".".to_string()
    } else {
        parts.join("/")
    }
}
3930
/// Renders `path` for human-readable output via `Path::display`.
fn path_to_display(path: &Path) -> String {
    let shown = path.display();
    shown.to_string()
}
3934
3935fn extension_key_is_namespaced(key: &str) -> bool {
3936 if key.contains("://") || key.starts_with("urn:") {
3937 return true;
3938 }
3939 let mut parts = key.split('.');
3940 let Some(first) = parts.next() else {
3941 return false;
3942 };
3943 if !valid_extension_part(first) {
3944 return false;
3945 }
3946 let mut saw_second = false;
3947 for part in parts {
3948 saw_second = true;
3949 if !valid_extension_part(part) {
3950 return false;
3951 }
3952 }
3953 saw_second
3954}
3955
/// Checks one dot-separated segment of an extension key.
///
/// A segment is valid when it is non-empty, consists solely of ASCII
/// letters, ASCII digits, `-` and `_`, and contains at least one ASCII
/// letter or digit.
fn valid_extension_part(part: &str) -> bool {
    let mut has_alphanumeric = false;
    for byte in part.bytes() {
        if byte.is_ascii_alphanumeric() {
            has_alphanumeric = true;
        } else if byte != b'-' && byte != b'_' {
            return false;
        }
    }
    // Also false for the empty segment, which never sets the flag.
    has_alphanumeric
}
3963
/// Returns `true` when `value` is exactly 64 lowercase hexadecimal digits.
///
/// Uppercase digits `A`-`F` are rejected.
fn is_sha256_hex(value: &str) -> bool {
    const SHA256_HEX_LEN: usize = 64;
    if value.len() != SHA256_HEX_LEN {
        return false;
    }
    value
        .bytes()
        .all(|byte| matches!(byte, b'0'..=b'9' | b'a'..=b'f'))
}
3970
/// Compares two hex digest strings for exact, case-sensitive equality.
///
/// This is a plain byte comparison; it makes no attempt at constant-time
/// behaviour.
fn hex_digest_eq(a: &str, b: &str) -> bool {
    a.as_bytes() == b.as_bytes()
}
3974
3975#[cfg(feature = "sqlite")]
3976pub mod sqlite;