Skip to main content

ordvec_manifest/
lib.rs

1//! Manifest verification for ordvec index artifacts.
2//!
3//! This crate verifies JSON manifests that bind an ordvec index file to
4//! SHA-256 digests, probed loader metadata, row identity, caller-owned
5//! auxiliary artifacts, optional encoder-distortion profiles, optional
6//! calibration profiles, and attestation-shape metadata. It is intentionally a
7//! verifier, not a trust oracle: it does not sign artifacts, manage keys, call
8//! networks, mutate index files, estimate model geometry, or decide deployment
9//! policy.
10//!
11//! Library callers can use [`load_manifest_file_with_options`] and
12//! [`verify_document_for_load`], or use [`verify_for_load`] when they need a
13//! verified snapshot of the canonical artifact path and related load metadata.
14//! The `ordvec-manifest` binary exposes the same bounded verification surfaces
15//! for command-line use.
16
17use chrono::{DateTime, SecondsFormat, Utc};
18use ordvec::{
19    probe_index_metadata, IndexKind as CoreIndexKind, IndexMetadata as CoreIndexMetadata,
20    IndexParams as CoreIndexParams,
21};
22use serde::{Deserialize, Serialize};
23use sha2::{Digest, Sha256};
24use std::collections::{BTreeMap, HashSet};
25use std::fmt;
26use std::fs::{self, File};
27use std::io::{self, BufRead, BufReader, Read};
28use std::path::{Component, Path, PathBuf};
29use uuid::Uuid;
30
31pub const SCHEMA_VERSION: &str = "ordvec.index_manifest.v1";
32pub const CALIBRATION_SCHEMA_VERSION: &str = "ordvec.calibration.v1";
33pub const ENCODER_DISTORTION_SCHEMA_VERSION: &str = "ordvec.encoder_distortion.v1";
34pub const DEFAULT_MAX_MANIFEST_BYTES: u64 = 1024 * 1024;
35pub const DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES: usize = 64 * 1024;
36pub const DEFAULT_MAX_ROW_IDENTITY_ROWS: usize = 10_000_000;
37pub const DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES: usize = 64 * 1024 * 1024;
38pub const DEFAULT_MAX_AUXILIARY_ARTIFACTS: usize = 1024;
39pub const DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES: u64 = 64 * 1024 * 1024;
40pub const DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES: u64 = 64 * 1024 * 1024;
41pub const DEFAULT_MAX_REPORT_ISSUES: usize = 1024;
42pub const DEFAULT_MAX_CACHED_REPORT_BYTES: u64 = 4 * 1024 * 1024;
43
44#[derive(Debug)]
45pub enum ManifestError {
46    Io(io::Error),
47    Json(serde_json::Error),
48    Invalid(String),
49    LimitExceeded { code: String, message: String },
50}
51
52impl ManifestError {
53    pub fn invalid(message: impl Into<String>) -> Self {
54        Self::Invalid(message.into())
55    }
56
57    pub fn limit_exceeded(code: impl Into<String>, message: impl Into<String>) -> Self {
58        Self::LimitExceeded {
59            code: code.into(),
60            message: message.into(),
61        }
62    }
63
64    pub fn code(&self) -> Option<&str> {
65        match self {
66            Self::LimitExceeded { code, .. } => Some(code.as_str()),
67            _ => None,
68        }
69    }
70}
71
72impl fmt::Display for ManifestError {
73    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
74        match self {
75            Self::Io(err) => write!(f, "{err}"),
76            Self::Json(err) => write!(f, "{err}"),
77            Self::Invalid(message) => f.write_str(message),
78            Self::LimitExceeded { code, message } => write!(f, "{code}: {message}"),
79        }
80    }
81}
82
83impl std::error::Error for ManifestError {}
84
85impl From<io::Error> for ManifestError {
86    fn from(value: io::Error) -> Self {
87        Self::Io(value)
88    }
89}
90
91impl From<serde_json::Error> for ManifestError {
92    fn from(value: serde_json::Error) -> Self {
93        Self::Json(value)
94    }
95}
96
97#[derive(Clone, Debug)]
98pub struct ManifestDocument {
99    pub manifest: IndexManifest,
100    pub source_path: Option<PathBuf>,
101    pub base_dir: PathBuf,
102}
103
104pub fn load_manifest_file(path: impl AsRef<Path>) -> Result<ManifestDocument, ManifestError> {
105    load_manifest_file_with_options(path, &VerifyOptions::default())
106}
107
108pub fn load_manifest_file_with_options(
109    path: impl AsRef<Path>,
110    options: &VerifyOptions,
111) -> Result<ManifestDocument, ManifestError> {
112    let path = path.as_ref();
113    let manifest_bytes = read_bounded_file(
114        path,
115        options.limits.max_manifest_bytes,
116        "manifest_file_too_large",
117        "manifest file",
118    )?;
119    let manifest: IndexManifest = serde_json::from_slice(&manifest_bytes)?;
120    let base_dir = path
121        .parent()
122        .filter(|p| !p.as_os_str().is_empty())
123        .unwrap_or_else(|| Path::new("."))
124        .to_path_buf();
125    Ok(ManifestDocument {
126        manifest,
127        source_path: Some(path.to_path_buf()),
128        base_dir,
129    })
130}
131
132fn read_bounded_file(
133    path: &Path,
134    max_bytes: u64,
135    code: &'static str,
136    context: &'static str,
137) -> Result<Vec<u8>, ManifestError> {
138    let mut file = File::open(path)?;
139    let max_len = usize::try_from(max_bytes).map_err(|_| {
140        ManifestError::limit_exceeded(
141            code,
142            format!(
143                "{context} byte limit {max_bytes} is too large to enforce while reading {}",
144                path.display()
145            ),
146        )
147    })?;
148    let read_limit = max_bytes.checked_add(1).ok_or_else(|| {
149        ManifestError::limit_exceeded(
150            code,
151            format!(
152                "{context} byte limit {max_bytes} is too large to enforce while reading {}",
153                path.display()
154            ),
155        )
156    })?;
157    let mut bytes = Vec::new();
158    let mut limited = file.by_ref().take(read_limit);
159    limited.read_to_end(&mut bytes)?;
160    if bytes.len() > max_len {
161        return Err(ManifestError::limit_exceeded(
162            code,
163            format!(
164                "{context} exceeds {max_bytes} bytes while reading {}",
165                path.display()
166            ),
167        ));
168    }
169    Ok(bytes)
170}
171
172pub fn verify_manifest_with_base(
173    manifest: IndexManifest,
174    base_dir: impl Into<PathBuf>,
175    options: VerifyOptions,
176) -> VerificationReport {
177    let document = ManifestDocument {
178        manifest,
179        source_path: None,
180        base_dir: base_dir.into(),
181    };
182    verify_manifest(&document, options)
183}
184
185pub fn verify_index_manifest(
186    index_path: impl Into<PathBuf>,
187    manifest_path: impl AsRef<Path>,
188    mut options: VerifyOptions,
189) -> Result<VerificationReport, ManifestError> {
190    let document = load_manifest_file_with_options(manifest_path, &options)?;
191    options.index_override = Some(index_path.into());
192    Ok(verify_manifest(&document, options))
193}
194
195/// Verifies a manifest file and returns a typed plan for caller-side loading.
196///
197/// The returned [`VerifiedLoadPlan`] is a verification snapshot: it contains
198/// canonical paths, probed metadata, row identity, auxiliary artifact states,
199/// and the full report for the bytes observed during this call. It is not a
200/// lease, file lock, mmap, open descriptor, or durable byte pin. If backing
201/// files can change between verification and load, re-verify immediately before
202/// loading, load from immutable storage, or use a caller-owned loading path that
203/// pins bytes.
204pub fn verify_for_load(
205    manifest_path: impl AsRef<Path>,
206    options: VerifyOptions,
207) -> Result<VerifiedLoadPlan, VerifiedLoadPlanError> {
208    let document = load_manifest_file_with_options(manifest_path, &options)?;
209    verify_document_for_load(&document, options)
210}
211
212/// Verifies an already-loaded manifest document and returns a typed load plan.
213///
214/// This has the same snapshot boundary as [`verify_for_load`]: it resolves and
215/// verifies paths at call time, but it does not pin the verified bytes against
216/// later mutation.
217pub fn verify_document_for_load(
218    document: &ManifestDocument,
219    options: VerifyOptions,
220) -> Result<VerifiedLoadPlan, VerifiedLoadPlanError> {
221    let (report, paths) = verify_manifest_with_path_capture(document, options);
222    VerifiedLoadPlan::from_report(document, report, paths)
223}
224
225pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> VerificationReport {
226    verify_manifest_with_path_capture(document, options).0
227}
228
229fn verify_manifest_with_path_capture(
230    document: &ManifestDocument,
231    options: VerifyOptions,
232) -> (VerificationReport, VerificationPathCapture) {
233    let mut paths = VerificationPathCapture::default();
234    let mut report = VerificationReport::new(Some(document.manifest.manifest_id.clone()));
235    validate_manifest_shape(&document.manifest, &options.limits, &mut report);
236
237    let artifact_display_path = document.manifest.artifact.path.clone();
238    report.artifact.manifest_path = Some(artifact_display_path.clone());
239    let artifact_path = options
240        .index_override
241        .as_ref()
242        .cloned()
243        .unwrap_or_else(|| PathBuf::from(&document.manifest.artifact.path));
244    report.artifact.observed_path = Some(path_to_display(&artifact_path));
245
246    if let Some(resolved) = resolve_existing_path(
247        &artifact_path,
248        &document.base_dir,
249        &options,
250        "artifact",
251        &mut report.errors,
252    ) {
253        paths.artifact_path = Some(resolved.canonical_path.clone());
254        report.artifact.canonical_path = Some(path_to_display(&resolved.canonical_path));
255        match sha256_file(&resolved.resolved_path) {
256            Ok(hash) => {
257                report.artifact.sha256 = Some(hash.sha256.clone());
258                report.artifact.size_bytes = Some(hash.size_bytes);
259                if !hex_digest_eq(&hash.sha256, &document.manifest.artifact.sha256) {
260                    report.error(
261                        "artifact_sha256_mismatch",
262                        format!(
263                            "artifact SHA-256 was {}, manifest declares {}",
264                            hash.sha256, document.manifest.artifact.sha256
265                        ),
266                    );
267                }
268                if hash.size_bytes != document.manifest.artifact.file_size_bytes {
269                    report.error(
270                        "artifact_file_size_mismatch",
271                        format!(
272                            "artifact size was {}, manifest declares {}",
273                            hash.size_bytes, document.manifest.artifact.file_size_bytes
274                        ),
275                    );
276                }
277            }
278            Err(err) => report.error(
279                "artifact_hash_failed",
280                format!("failed to hash artifact: {err}"),
281            ),
282        }
283
284        match probe_index_metadata(&resolved.resolved_path) {
285            Ok(metadata) => {
286                let metadata_report = MetadataReport::from_core(&metadata);
287                compare_artifact_metadata(&document.manifest.artifact, &metadata, &mut report);
288                report.artifact.metadata = Some(metadata_report);
289            }
290            Err(err) => report.error(
291                "artifact_probe_failed",
292                format!("failed to probe artifact metadata: {err}"),
293            ),
294        }
295    }
296
297    verify_auxiliary_artifacts(document, &options, &mut report, &mut paths);
298    verify_row_identity(document, &options, &mut report, &mut paths);
299    verify_encoder_distortion(document, &options, &mut report);
300    verify_calibration(document, &options, &mut report);
301    verify_attestations(&document.manifest, &mut report);
302
303    enforce_report_issue_limit(&mut report.errors, &options.limits);
304    report.ok = report.errors.is_empty();
305    (report, paths)
306}
307
308fn validate_manifest_shape(
309    manifest: &IndexManifest,
310    limits: &ResourceLimits,
311    report: &mut VerificationReport,
312) {
313    if manifest.schema_version != SCHEMA_VERSION {
314        report.error(
315            "schema_version_unsupported",
316            format!(
317                "schema_version must be {SCHEMA_VERSION}, got {}",
318                manifest.schema_version
319            ),
320        );
321    }
322    if manifest.manifest_id.trim().is_empty() {
323        report.error("manifest_id_empty", "manifest_id must be non-empty");
324    }
325    if DateTime::parse_from_rfc3339(&manifest.created_at).is_err() {
326        report.error("created_at_invalid", "created_at must parse as RFC3339");
327    }
328    if manifest.embedding.model.trim().is_empty() {
329        report.error("embedding_model_empty", "embedding.model must be non-empty");
330    }
331    if manifest.embedding.dim == 0 {
332        report.error(
333            "embedding_dim_zero",
334            "embedding.dim must be greater than zero",
335        );
336    }
337    if manifest.artifact.path.trim().is_empty() {
338        report.error("artifact_path_empty", "artifact.path must be non-empty");
339    }
340    if !is_sha256_hex(&manifest.artifact.sha256) {
341        report.error(
342            "artifact_sha256_invalid",
343            "artifact.sha256 must be a lowercase 64-character hex SHA-256 digest",
344        );
345    }
346    if manifest.artifact.bytes_per_vec == 0 {
347        report.error(
348            "artifact_bytes_per_vec_zero",
349            "artifact.bytes_per_vec must be greater than zero",
350        );
351    }
352    if manifest.artifact.dim != manifest.embedding.dim {
353        report.error(
354            "artifact_embedding_dim_mismatch",
355            format!(
356                "artifact.dim {} does not match embedding.dim {}",
357                manifest.artifact.dim, manifest.embedding.dim
358            ),
359        );
360    }
361    if !artifact_kind_matches_params(manifest.artifact.kind, &manifest.artifact.params) {
362        report.error(
363            "artifact_params_kind_mismatch",
364            "artifact.params discriminator does not match artifact.kind",
365        );
366    }
367
368    let row_count = manifest.row_identity.row_count();
369    if manifest.artifact.vector_count != row_count {
370        report.error(
371            "artifact_row_count_mismatch",
372            format!(
373                "artifact.vector_count {} does not match row_identity.row_count {}",
374                manifest.artifact.vector_count, row_count
375            ),
376        );
377    }
378    if let RowIdentity::Jsonl {
379        path,
380        sha256,
381        id_kind,
382        db,
383        ..
384    } = &manifest.row_identity
385    {
386        if path.trim().is_empty() {
387            report.error(
388                "row_identity_path_empty",
389                "row_identity.path must be non-empty",
390            );
391        }
392        if !is_sha256_hex(sha256) {
393            report.error(
394                "row_identity_sha256_invalid",
395                "row_identity.sha256 must be a lowercase 64-character hex SHA-256 digest",
396            );
397        }
398        if id_kind != "uuid" {
399            report.error(
400                "row_identity_id_kind_unsupported",
401                "row_identity.id_kind must be uuid in v1",
402            );
403        }
404        if db.is_some() {
405            report.error(
406                "row_identity_db_unsupported",
407                "row_identity.db is reserved for a future schema and is not verified in v1",
408            );
409        }
410    }
411
412    validate_auxiliary_artifact_shape(manifest, limits, report);
413
414    validate_optional_non_empty(
415        "embedding_model_revision_empty",
416        "embedding.model_revision must be non-empty when present",
417        manifest.embedding.model_revision.as_deref(),
418        report,
419    );
420    validate_optional_non_empty(
421        "embedding_tokenizer_revision_empty",
422        "embedding.tokenizer_revision must be non-empty when present",
423        manifest.embedding.tokenizer_revision.as_deref(),
424        report,
425    );
426    validate_optional_non_empty(
427        "embedding_pooling_empty",
428        "embedding.pooling must be non-empty when present",
429        manifest.embedding.pooling.as_deref(),
430        report,
431    );
432    validate_optional_sha256(
433        "embedding_corpus_digest_invalid",
434        "embedding.corpus_digest must be a lowercase 64-character hex SHA-256 digest",
435        manifest.embedding.corpus_digest.as_deref(),
436        report,
437    );
438    validate_optional_sha256(
439        "embedding_matrix_digest_invalid",
440        "embedding.embedding_matrix_digest must be a lowercase 64-character hex SHA-256 digest",
441        manifest.embedding.embedding_matrix_digest.as_deref(),
442        report,
443    );
444    validate_optional_non_empty(
445        "embedding_normalization_empty",
446        "embedding.normalization must be non-empty when present",
447        manifest.embedding.normalization.as_deref(),
448        report,
449    );
450
451    if let Some(build) = &manifest.build {
452        if build.invocation_id.trim().is_empty() {
453            report.error(
454                "build_invocation_id_empty",
455                "build.invocation_id must be non-empty",
456            );
457        }
458        if build
459            .builder_id
460            .as_ref()
461            .is_some_and(|builder_id| builder_id.trim().is_empty())
462        {
463            report.error(
464                "build_builder_id_empty",
465                "build.builder_id must be non-empty",
466            );
467        }
468        validate_optional_non_empty(
469            "build_source_repo_empty",
470            "build.source_repo must be non-empty when present",
471            build.source_repo.as_deref(),
472            report,
473        );
474        validate_optional_non_empty(
475            "build_source_commit_empty",
476            "build.source_commit must be non-empty when present",
477            build.source_commit.as_deref(),
478            report,
479        );
480        validate_optional_non_empty(
481            "build_ci_provider_empty",
482            "build.ci_provider must be non-empty when present",
483            build.ci_provider.as_deref(),
484            report,
485        );
486        validate_optional_non_empty(
487            "build_ci_run_id_empty",
488            "build.ci_run_id must be non-empty when present",
489            build.ci_run_id.as_deref(),
490            report,
491        );
492    }
493
494    for key in manifest.extensions.keys() {
495        if !extension_key_is_namespaced(key) {
496            report.error(
497                "extension_key_not_namespaced",
498                format!("extension key {key:?} must be namespaced"),
499            );
500        }
501    }
502}
503
504fn validate_auxiliary_artifact_shape(
505    manifest: &IndexManifest,
506    limits: &ResourceLimits,
507    report: &mut VerificationReport,
508) {
509    if !check_auxiliary_artifact_count(manifest, limits, report) {
510        return;
511    }
512    let mut names = HashSet::new();
513    for artifact in &manifest.auxiliary_artifacts {
514        let name = artifact.name.trim();
515        if name.is_empty() {
516            report.error(
517                "auxiliary_artifact_name_empty",
518                "auxiliary artifact name must be non-empty",
519            );
520        } else if artifact.name != name {
521            report.error(
522                "auxiliary_artifact_name_not_trimmed",
523                format!(
524                    "auxiliary artifact name {name:?} must not have leading or trailing whitespace"
525                ),
526            );
527        } else if !names.insert(name.to_string()) {
528            report.error(
529                "auxiliary_artifact_name_duplicate",
530                format!("auxiliary artifact name {name:?} is duplicated"),
531            );
532        }
533
534        if artifact.path.trim().is_empty() {
535            report.error(
536                "auxiliary_artifact_path_empty",
537                format!("auxiliary artifact {name:?} path must be non-empty"),
538            );
539        }
540        if !is_sha256_hex(&artifact.sha256) {
541            report.error(
542                "auxiliary_artifact_sha256_invalid",
543                format!(
544                    "auxiliary artifact {name:?} sha256 must be a lowercase 64-character hex SHA-256 digest"
545                ),
546            );
547        }
548    }
549}
550
551fn validate_optional_non_empty(
552    code: &str,
553    message: &str,
554    value: Option<&str>,
555    report: &mut VerificationReport,
556) {
557    if value.is_some_and(|value| value.trim().is_empty()) {
558        report.error(code, message);
559    }
560}
561
562fn validate_optional_sha256(
563    code: &str,
564    message: &str,
565    value: Option<&str>,
566    report: &mut VerificationReport,
567) {
568    if value.is_some_and(|value| !is_sha256_hex(value)) {
569        report.error(code, message);
570    }
571}
572
573fn validate_optional_sha256_uri(
574    code: &str,
575    message: &str,
576    value: Option<&str>,
577    report: &mut VerificationReport,
578) {
579    let Some(value) = value else {
580        return;
581    };
582    let Some(digest) = value.strip_prefix("sha256:") else {
583        report.error(code, message);
584        return;
585    };
586    if !is_sha256_hex(digest) {
587        report.error(code, message);
588    }
589}
590
591fn validate_optional_positive_f64(
592    code: &str,
593    message: &str,
594    value: Option<f64>,
595    report: &mut VerificationReport,
596) {
597    if value.is_some_and(|value| !value.is_finite() || value <= 0.0) {
598        report.error(code, message);
599    }
600}
601
602fn validate_optional_nonnegative_f64(
603    code: &str,
604    message: &str,
605    value: Option<f64>,
606    report: &mut VerificationReport,
607) {
608    if value.is_some_and(|value| !value.is_finite() || value < 0.0) {
609        report.error(code, message);
610    }
611}
612
613fn validate_optional_probability(
614    code: &str,
615    message: &str,
616    value: Option<f64>,
617    report: &mut VerificationReport,
618) {
619    if value.is_some_and(|value| !value.is_finite() || !(0.0..=1.0).contains(&value)) {
620        report.error(code, message);
621    }
622}
623
624fn artifact_kind_matches_params(kind: ManifestIndexKind, params: &ManifestIndexParams) -> bool {
625    matches!(
626        (kind, params),
627        (ManifestIndexKind::Rank, ManifestIndexParams::Rank)
628            | (
629                ManifestIndexKind::RankQuant,
630                ManifestIndexParams::RankQuant { .. }
631            )
632            | (
633                ManifestIndexKind::Bitmap,
634                ManifestIndexParams::Bitmap { .. }
635            )
636            | (
637                ManifestIndexKind::SignBitmap,
638                ManifestIndexParams::SignBitmap
639            )
640    )
641}
642
643fn compare_artifact_metadata(
644    artifact: &Artifact,
645    metadata: &CoreIndexMetadata,
646    report: &mut VerificationReport,
647) {
648    let observed_kind = ManifestIndexKind::from_core(metadata.kind);
649    if artifact.kind != observed_kind {
650        report.error(
651            "artifact_kind_mismatch",
652            format!(
653                "artifact kind was {:?}, manifest declares {:?}",
654                observed_kind, artifact.kind
655            ),
656        );
657    }
658    let observed_params = ManifestIndexParams::from_core(metadata.params);
659    if artifact.params != observed_params {
660        report.error(
661            "artifact_params_mismatch",
662            format!(
663                "artifact params were {:?}, manifest declares {:?}",
664                observed_params, artifact.params
665            ),
666        );
667    }
668    if artifact.format_version != metadata.format_version {
669        report.error(
670            "artifact_format_version_mismatch",
671            format!(
672                "artifact format_version was {}, manifest declares {}",
673                metadata.format_version, artifact.format_version
674            ),
675        );
676    }
677    if artifact.dim != metadata.dim {
678        report.error(
679            "artifact_dim_mismatch",
680            format!(
681                "artifact dim was {}, manifest declares {}",
682                metadata.dim, artifact.dim
683            ),
684        );
685    }
686    if artifact.vector_count != metadata.vector_count {
687        report.error(
688            "artifact_vector_count_mismatch",
689            format!(
690                "artifact vector_count was {}, manifest declares {}",
691                metadata.vector_count, artifact.vector_count
692            ),
693        );
694    }
695    if artifact.bytes_per_vec != metadata.bytes_per_vec {
696        report.error(
697            "artifact_bytes_per_vec_mismatch",
698            format!(
699                "artifact bytes_per_vec was {}, manifest declares {}",
700                metadata.bytes_per_vec, artifact.bytes_per_vec
701            ),
702        );
703    }
704    if artifact.file_size_bytes != metadata.file_size_bytes {
705        report.error(
706            "artifact_metadata_file_size_mismatch",
707            format!(
708                "artifact metadata file_size_bytes was {}, manifest declares {}",
709                metadata.file_size_bytes, artifact.file_size_bytes
710            ),
711        );
712    }
713}
714
715fn verify_row_identity(
716    document: &ManifestDocument,
717    options: &VerifyOptions,
718    report: &mut VerificationReport,
719    paths: &mut VerificationPathCapture,
720) {
721    match &document.manifest.row_identity {
722        RowIdentity::RowIdIdentity { row_count } => {
723            report.row_identity.kind = Some("row_id_identity".to_string());
724            report.row_identity.row_count = Some(*row_count);
725        }
726        RowIdentity::Jsonl {
727            path,
728            sha256,
729            row_count,
730            ..
731        } => {
732            report.row_identity.kind = Some("jsonl".to_string());
733            report.row_identity.manifest_path = Some(path.clone());
734            report.row_identity.row_count = Some(*row_count);
735            if *row_count > options.limits.max_row_identity_rows {
736                report.error(
737                    "row_identity_row_count_limit_exceeded",
738                    format!(
739                        "row_identity.row_count {row_count} exceeds max_row_identity_rows={}",
740                        options.limits.max_row_identity_rows
741                    ),
742                );
743                return;
744            }
745            let row_path = PathBuf::from(path);
746            if let Some(resolved) = resolve_existing_path(
747                &row_path,
748                &document.base_dir,
749                options,
750                "row_identity",
751                &mut report.errors,
752            ) {
753                paths.row_identity_path = Some(resolved.canonical_path.clone());
754                report.row_identity.canonical_path =
755                    Some(path_to_display(&resolved.canonical_path));
756                match validate_jsonl_rows(
757                    &resolved.resolved_path,
758                    options.allow_duplicate_db_ids,
759                    &options.limits,
760                    Some(*row_count),
761                    &mut report.errors,
762                ) {
763                    Ok(stats) => {
764                        report.row_identity.validated_rows = Some(stats.validated_rows);
765                        if let Some(hash) = &stats.sha256 {
766                            report.row_identity.sha256 = Some(hash.clone());
767                            if !hex_digest_eq(hash, sha256) {
768                                report.error(
769                                    "row_identity_sha256_mismatch",
770                                    format!(
771                                        "row_identity SHA-256 was {hash}, manifest declares {sha256}"
772                                    ),
773                                );
774                            }
775                        }
776                        if stats.row_count != *row_count
777                            && !report
778                                .errors
779                                .iter()
780                                .any(|issue| issue.code == "row_identity_row_count_mismatch")
781                        {
782                            let observed_rows = if stats.sha256.is_some() {
783                                stats.row_count.to_string()
784                            } else {
785                                format!("at least {}", stats.row_count)
786                            };
787                            report.error(
788                                "row_identity_row_count_mismatch",
789                                format!(
790                                    "row identity file has {observed_rows} rows, manifest declares {row_count}"
791                                ),
792                            );
793                        }
794                    }
795                    Err(err) => report.error(
796                        "row_identity_read_failed",
797                        format!("failed to read row identity file: {err}"),
798                    ),
799                }
800            }
801        }
802    }
803}
804
805fn verify_encoder_distortion(
806    document: &ManifestDocument,
807    options: &VerifyOptions,
808    report: &mut VerificationReport,
809) {
810    let Some(profile) = &document.manifest.encoder_distortion else {
811        return;
812    };
813
814    report.encoder_distortion.present = true;
815    report.encoder_distortion.schema_version = Some(profile.schema_version.clone());
816    report.encoder_distortion.profile_id = Some(profile.profile_id.clone());
817    report.encoder_distortion.evidence_kind = Some(profile.evidence.kind.label().to_string());
818    report.encoder_distortion.source_metric = Some(profile.source_metric.name.clone());
819    report.encoder_distortion.embedding_metric = Some(profile.embedding_metric.name.clone());
820
821    validate_encoder_distortion_shape(profile, report);
822    validate_encoder_distortion_encoder(profile, &document.manifest.embedding, report);
823    validate_encoder_distortion_metrics(profile, report);
824    validate_encoder_distortion_bounds(&profile.bounds, report);
825    validate_encoder_distortion_scope(&profile.scope, report);
826    validate_encoder_distortion_evidence(profile, &document.base_dir, options, report);
827    validate_encoder_distortion_calibration(
828        profile,
829        document.manifest.calibration.as_ref(),
830        report,
831    );
832}
833
834fn validate_encoder_distortion_shape(
835    profile: &EncoderDistortionProfileRef,
836    report: &mut VerificationReport,
837) {
838    if profile.schema_version != ENCODER_DISTORTION_SCHEMA_VERSION {
839        report.error(
840            "encoder_distortion_schema_version_unsupported",
841            format!(
842                "encoder_distortion.schema_version must be {ENCODER_DISTORTION_SCHEMA_VERSION}, got {}",
843                profile.schema_version
844            ),
845        );
846    }
847    if profile.profile_id.trim().is_empty() {
848        report.error(
849            "encoder_distortion_profile_id_empty",
850            "encoder_distortion.profile_id must be non-empty",
851        );
852    }
853    if profile
854        .created_at
855        .as_ref()
856        .is_some_and(|created_at| DateTime::parse_from_rfc3339(created_at).is_err())
857    {
858        report.error(
859            "encoder_distortion_created_at_invalid",
860            "encoder_distortion.created_at must parse as RFC3339 when present",
861        );
862    }
863    if profile.encoder.model.trim().is_empty() {
864        report.error(
865            "encoder_distortion_encoder_model_empty",
866            "encoder_distortion.encoder.model must be non-empty",
867        );
868    }
869    if profile.encoder.dim == 0 {
870        report.error(
871            "encoder_distortion_encoder_dim_zero",
872            "encoder_distortion.encoder.dim must be greater than zero",
873        );
874    }
875    validate_optional_non_empty(
876        "encoder_distortion_encoder_model_revision_empty",
877        "encoder_distortion.encoder.model_revision must be non-empty when present",
878        profile.encoder.model_revision.as_deref(),
879        report,
880    );
881    validate_optional_non_empty(
882        "encoder_distortion_encoder_normalization_empty",
883        "encoder_distortion.encoder.normalization must be non-empty when present",
884        profile.encoder.normalization.as_deref(),
885        report,
886    );
887    validate_optional_non_empty(
888        "encoder_distortion_tokenizer_revision_empty",
889        "encoder_distortion.tokenizer_revision must be non-empty when present",
890        profile.tokenizer_revision.as_deref(),
891        report,
892    );
893    validate_optional_non_empty(
894        "encoder_distortion_pooling_empty",
895        "encoder_distortion.pooling must be non-empty when present",
896        profile.pooling.as_deref(),
897        report,
898    );
899}
900
901fn validate_encoder_distortion_encoder(
902    profile: &EncoderDistortionProfileRef,
903    embedding: &Embedding,
904    report: &mut VerificationReport,
905) {
906    if profile.encoder.model != embedding.model {
907        report.error(
908            "encoder_distortion_encoder_model_mismatch",
909            format!(
910                "encoder_distortion model {:?} does not match embedding.model {:?}",
911                profile.encoder.model, embedding.model
912            ),
913        );
914    }
915    if profile.encoder.dim != embedding.dim {
916        report.error(
917            "encoder_distortion_encoder_dim_mismatch",
918            format!(
919                "encoder_distortion dim {} does not match embedding.dim {}",
920                profile.encoder.dim, embedding.dim
921            ),
922        );
923    }
924    compare_optional_encoder_identity(
925        "encoder_distortion_encoder_model_revision_mismatch",
926        "encoder_distortion encoder",
927        "model_revision",
928        embedding.model_revision.as_deref(),
929        profile.encoder.model_revision.as_deref(),
930        report,
931    );
932    compare_optional_encoder_identity(
933        "encoder_distortion_encoder_normalization_mismatch",
934        "encoder_distortion encoder",
935        "normalization",
936        embedding.normalization.as_deref(),
937        profile.encoder.normalization.as_deref(),
938        report,
939    );
940    compare_optional_encoder_identity(
941        "encoder_distortion_tokenizer_revision_mismatch",
942        "encoder_distortion",
943        "tokenizer_revision",
944        embedding.tokenizer_revision.as_deref(),
945        profile.tokenizer_revision.as_deref(),
946        report,
947    );
948    compare_optional_encoder_identity(
949        "encoder_distortion_pooling_mismatch",
950        "encoder_distortion",
951        "pooling",
952        embedding.pooling.as_deref(),
953        profile.pooling.as_deref(),
954        report,
955    );
956}
957
958fn validate_encoder_distortion_metrics(
959    profile: &EncoderDistortionProfileRef,
960    report: &mut VerificationReport,
961) {
962    validate_metric_spec(
963        "encoder_distortion_source_metric",
964        &profile.source_metric,
965        report,
966    );
967    validate_metric_spec(
968        "encoder_distortion_embedding_metric",
969        &profile.embedding_metric,
970        report,
971    );
972}
973
974fn validate_metric_spec(prefix: &str, metric: &MetricSpec, report: &mut VerificationReport) {
975    if metric.name.trim().is_empty() {
976        report.error(
977            format!("{prefix}_name_empty"),
978            format!("{prefix}.name must be non-empty"),
979        );
980    }
981    validate_optional_non_empty(
982        &format!("{prefix}_version_empty"),
983        &format!("{prefix}.version must be non-empty when present"),
984        metric.version.as_deref(),
985        report,
986    );
987    validate_optional_sha256_uri(
988        &format!("{prefix}_digest_invalid"),
989        &format!("{prefix}.digest must be sha256:<lowercase-hex> when present"),
990        metric.digest.as_deref(),
991        report,
992    );
993}
994
995fn validate_encoder_distortion_bounds(bounds: &DistortionBounds, report: &mut VerificationReport) {
996    if bounds.declared_lower_bound.is_none()
997        && bounds.declared_upper_bound.is_none()
998        && bounds.estimated_distortion.is_none()
999        && bounds.violation_rate.is_none()
1000        && bounds.max_observed_violation.is_none()
1001        && bounds.quantile_observed_violation.is_none()
1002    {
1003        report.error(
1004            "encoder_distortion_bounds_empty",
1005            "encoder_distortion.bounds must declare at least one bound or observed violation statistic",
1006        );
1007    }
1008
1009    validate_optional_positive_f64(
1010        "encoder_distortion_lower_bound_invalid",
1011        "encoder_distortion.bounds.declared_lower_bound must be finite and greater than zero",
1012        bounds.declared_lower_bound,
1013        report,
1014    );
1015    validate_optional_positive_f64(
1016        "encoder_distortion_upper_bound_invalid",
1017        "encoder_distortion.bounds.declared_upper_bound must be finite and greater than zero",
1018        bounds.declared_upper_bound,
1019        report,
1020    );
1021    validate_optional_positive_f64(
1022        "encoder_distortion_estimated_distortion_invalid",
1023        "encoder_distortion.bounds.estimated_distortion must be finite and greater than zero",
1024        bounds.estimated_distortion,
1025        report,
1026    );
1027    validate_optional_probability(
1028        "encoder_distortion_violation_rate_invalid",
1029        "encoder_distortion.bounds.violation_rate must be finite and within [0, 1]",
1030        bounds.violation_rate,
1031        report,
1032    );
1033    validate_optional_nonnegative_f64(
1034        "encoder_distortion_max_observed_violation_invalid",
1035        "encoder_distortion.bounds.max_observed_violation must be finite and non-negative",
1036        bounds.max_observed_violation,
1037        report,
1038    );
1039    validate_optional_nonnegative_f64(
1040        "encoder_distortion_quantile_observed_violation_invalid",
1041        "encoder_distortion.bounds.quantile_observed_violation must be finite and non-negative",
1042        bounds.quantile_observed_violation,
1043        report,
1044    );
1045
1046    if let (Some(lower), Some(upper)) = (bounds.declared_lower_bound, bounds.declared_upper_bound) {
1047        if lower.is_finite() && upper.is_finite() && lower > upper {
1048            report.error(
1049                "encoder_distortion_bounds_order_invalid",
1050                "encoder_distortion.bounds.declared_lower_bound must be less than or equal to declared_upper_bound",
1051            );
1052        }
1053        if lower.is_finite() && upper.is_finite() && lower > 0.0 && upper > 0.0 {
1054            if let Some(estimated) = bounds.estimated_distortion {
1055                let expected = upper / lower;
1056                if !expected.is_finite() {
1057                    report.error(
1058                        "encoder_distortion_distortion_mismatch",
1059                        "encoder_distortion.bounds.declared_upper_bound / declared_lower_bound must be finite",
1060                    );
1061                } else {
1062                    let tolerance = 1e-9_f64.max(expected.abs() * 1e-9);
1063                    if estimated.is_finite() && (estimated - expected).abs() > tolerance {
1064                        report.error(
1065                            "encoder_distortion_distortion_mismatch",
1066                            format!(
1067                                "encoder_distortion.bounds.estimated_distortion {} does not match declared_upper_bound / declared_lower_bound {}",
1068                                estimated, expected
1069                            ),
1070                        );
1071                    }
1072                }
1073            }
1074        }
1075    }
1076}
1077
1078fn validate_encoder_distortion_scope(scope: &DistortionScope, report: &mut VerificationReport) {
1079    validate_optional_sha256_uri(
1080        "encoder_distortion_scope_corpus_digest_invalid",
1081        "encoder_distortion.scope.corpus_digest must be sha256:<lowercase-hex> when present",
1082        scope.corpus_digest.as_deref(),
1083        report,
1084    );
1085    validate_optional_sha256_uri(
1086        "encoder_distortion_scope_query_set_digest_invalid",
1087        "encoder_distortion.scope.query_set_digest must be sha256:<lowercase-hex> when present",
1088        scope.query_set_digest.as_deref(),
1089        report,
1090    );
1091    validate_optional_sha256_uri(
1092        "encoder_distortion_scope_pair_sample_digest_invalid",
1093        "encoder_distortion.scope.pair_sample_digest must be sha256:<lowercase-hex> when present",
1094        scope.pair_sample_digest.as_deref(),
1095        report,
1096    );
1097    validate_optional_non_empty(
1098        "encoder_distortion_scope_domain_empty",
1099        "encoder_distortion.scope.domain must be non-empty when present",
1100        scope.domain.as_deref(),
1101        report,
1102    );
1103    validate_optional_non_empty(
1104        "encoder_distortion_scope_estimator_version_empty",
1105        "encoder_distortion.scope.estimator_version must be non-empty when present",
1106        scope.estimator_version.as_deref(),
1107        report,
1108    );
1109    if scope
1110        .sample_size
1111        .is_some_and(|sample_size| sample_size == 0)
1112    {
1113        report.error(
1114            "encoder_distortion_scope_sample_size_zero",
1115            "encoder_distortion.scope.sample_size must be greater than zero when present",
1116        );
1117    }
1118    validate_optional_probability(
1119        "encoder_distortion_scope_confidence_invalid",
1120        "encoder_distortion.scope.confidence must be finite and within [0, 1]",
1121        scope.confidence,
1122        report,
1123    );
1124    validate_optional_probability(
1125        "encoder_distortion_scope_coverage_invalid",
1126        "encoder_distortion.scope.coverage must be finite and within [0, 1]",
1127        scope.coverage,
1128        report,
1129    );
1130}
1131
1132fn validate_encoder_distortion_evidence(
1133    profile: &EncoderDistortionProfileRef,
1134    base_dir: &Path,
1135    options: &VerifyOptions,
1136    report: &mut VerificationReport,
1137) {
1138    validate_optional_non_empty(
1139        "encoder_distortion_evidence_estimator_id_empty",
1140        "encoder_distortion.evidence.estimator_id must be non-empty when present",
1141        profile.evidence.estimator_id.as_deref(),
1142        report,
1143    );
1144    validate_optional_sha256_uri(
1145        "encoder_distortion_evidence_estimator_hash_invalid",
1146        "encoder_distortion.evidence.estimator_hash must be sha256:<lowercase-hex> when present",
1147        profile.evidence.estimator_hash.as_deref(),
1148        report,
1149    );
1150
1151    if profile.profile.is_none() && profile.evidence.kind != DistortionEvidenceKind::CallerAsserted
1152    {
1153        report.error(
1154            "encoder_distortion_profile_required",
1155            "non-caller-asserted encoder distortion evidence requires a profile artifact",
1156        );
1157        return;
1158    }
1159
1160    if let Some(artifact) = &profile.profile {
1161        validate_encoder_distortion_profile_artifact(artifact, base_dir, options, report);
1162    }
1163}
1164
1165fn validate_encoder_distortion_profile_artifact(
1166    profile: &DistortionProfileArtifactRef,
1167    base_dir: &Path,
1168    options: &VerifyOptions,
1169    report: &mut VerificationReport,
1170) {
1171    report.encoder_distortion.profile_manifest_path = Some(profile.path.clone());
1172    if profile.path.trim().is_empty() {
1173        report.error(
1174            "encoder_distortion_profile_path_empty",
1175            "encoder_distortion.profile.path must be non-empty",
1176        );
1177    }
1178    if !is_sha256_hex(&profile.sha256) {
1179        report.error(
1180            "encoder_distortion_profile_sha256_invalid",
1181            "encoder_distortion.profile.sha256 must be a lowercase 64-character hex SHA-256 digest",
1182        );
1183    }
1184    if profile.file_size_bytes == 0 {
1185        report.error(
1186            "encoder_distortion_profile_file_size_zero",
1187            "encoder_distortion.profile.file_size_bytes must be greater than zero",
1188        );
1189    }
1190    if profile.format.trim().is_empty() {
1191        report.error(
1192            "encoder_distortion_profile_format_empty",
1193            "encoder_distortion.profile.format must be non-empty",
1194        );
1195    }
1196    validate_optional_sha256_uri(
1197        "encoder_distortion_profile_source_digest_invalid",
1198        "encoder_distortion.profile.source_digest must be sha256:<lowercase-hex> when present",
1199        profile.source_digest.as_deref(),
1200        report,
1201    );
1202
1203    if !profile.path.trim().is_empty() {
1204        let path = PathBuf::from(&profile.path);
1205        if let Some(resolved) = resolve_existing_path(
1206            &path,
1207            base_dir,
1208            options,
1209            "encoder_distortion_profile",
1210            &mut report.errors,
1211        ) {
1212            report.encoder_distortion.profile_canonical_path =
1213                Some(path_to_display(&resolved.canonical_path));
1214            match sha256_file_bounded(
1215                &resolved.resolved_path,
1216                options.limits.max_encoder_distortion_profile_bytes,
1217                "encoder_distortion_profile_too_large",
1218                "encoder distortion profile",
1219            ) {
1220                Ok(hash) => {
1221                    report.encoder_distortion.profile_sha256 = Some(hash.sha256.clone());
1222                    report.encoder_distortion.profile_size_bytes = Some(hash.size_bytes);
1223                    if !hex_digest_eq(&hash.sha256, &profile.sha256) {
1224                        report.error(
1225                            "encoder_distortion_profile_sha256_mismatch",
1226                            format!(
1227                                "encoder distortion profile SHA-256 was {}, manifest declares {}",
1228                                hash.sha256, profile.sha256
1229                            ),
1230                        );
1231                    }
1232                    if hash.size_bytes != profile.file_size_bytes {
1233                        report.error(
1234                            "encoder_distortion_profile_file_size_mismatch",
1235                            format!(
1236                                "encoder distortion profile size was {}, manifest declares {}",
1237                                hash.size_bytes, profile.file_size_bytes
1238                            ),
1239                        );
1240                    }
1241                }
1242                Err(ManifestError::LimitExceeded { code, message }) => report.error(code, message),
1243                Err(err) => report.error(
1244                    "encoder_distortion_profile_hash_failed",
1245                    format!("failed to hash encoder distortion profile: {err}"),
1246                ),
1247            }
1248        }
1249    }
1250}
1251
1252fn validate_encoder_distortion_calibration(
1253    profile: &EncoderDistortionProfileRef,
1254    calibration: Option<&CalibrationProfileRef>,
1255    report: &mut VerificationReport,
1256) {
1257    let Some(calibration_profile_id) = &profile.calibration_profile_id else {
1258        return;
1259    };
1260    if calibration_profile_id.trim().is_empty() {
1261        report.error(
1262            "encoder_distortion_calibration_profile_id_empty",
1263            "encoder_distortion.calibration_profile_id must be non-empty when present",
1264        );
1265        return;
1266    }
1267    if calibration_profile_id.trim() != calibration_profile_id {
1268        report.error(
1269            "encoder_distortion_calibration_profile_id_whitespace",
1270            "encoder_distortion.calibration_profile_id must not contain leading or trailing whitespace",
1271        );
1272        return;
1273    }
1274    let Some(calibration) = calibration else {
1275        report.error(
1276            "encoder_distortion_calibration_missing",
1277            "encoder_distortion.calibration_profile_id requires a calibration block",
1278        );
1279        return;
1280    };
1281    // Calibration profile ids are manifest identifiers; keep matching exact.
1282    if calibration.profile_id != *calibration_profile_id {
1283        report.error(
1284            "encoder_distortion_calibration_profile_mismatch",
1285            format!(
1286                "encoder_distortion.calibration_profile_id {:?} does not match calibration.profile_id {:?}",
1287                calibration_profile_id, calibration.profile_id
1288            ),
1289        );
1290    }
1291}
1292
1293fn verify_calibration(
1294    document: &ManifestDocument,
1295    options: &VerifyOptions,
1296    report: &mut VerificationReport,
1297) {
1298    let Some(calibration) = &document.manifest.calibration else {
1299        return;
1300    };
1301
1302    report.calibration.present = true;
1303    report.calibration.schema_version = Some(calibration.schema_version.clone());
1304    report.calibration.profile_id = Some(calibration.profile_id.clone());
1305    report.calibration.calibrated_for_model = Some(calibration.calibrated_for.model.clone());
1306    report.calibration.ordinalization = Some(calibration.ordinalization.label().to_string());
1307    report.calibration.null_model = Some(calibration.null_model.label().to_string());
1308
1309    validate_calibration_shape(calibration, report);
1310    validate_calibration_encoder(calibration, &document.manifest.embedding, report);
1311    validate_calibration_ordinalization(calibration, &document.manifest.artifact, report);
1312    validate_calibration_null_model_ordinalization(calibration, report);
1313    validate_calibration_profile(
1314        calibration,
1315        &document.manifest.artifact,
1316        &document.base_dir,
1317        options,
1318        report,
1319    );
1320}
1321
1322fn validate_calibration_shape(
1323    calibration: &CalibrationProfileRef,
1324    report: &mut VerificationReport,
1325) {
1326    if calibration.schema_version != CALIBRATION_SCHEMA_VERSION {
1327        report.error(
1328            "calibration_schema_version_unsupported",
1329            format!(
1330                "calibration.schema_version must be {CALIBRATION_SCHEMA_VERSION}, got {}",
1331                calibration.schema_version
1332            ),
1333        );
1334    }
1335    if calibration.profile_id.trim().is_empty() {
1336        report.error(
1337            "calibration_profile_id_empty",
1338            "calibration.profile_id must be non-empty",
1339        );
1340    }
1341    if calibration
1342        .created_at
1343        .as_ref()
1344        .is_some_and(|created_at| DateTime::parse_from_rfc3339(created_at).is_err())
1345    {
1346        report.error(
1347            "calibration_created_at_invalid",
1348            "calibration.created_at must parse as RFC3339 when present",
1349        );
1350    }
1351    if calibration.calibrated_for.model.trim().is_empty() {
1352        report.error(
1353            "calibration_encoder_model_empty",
1354            "calibration.calibrated_for.model must be non-empty",
1355        );
1356    }
1357    if calibration.calibrated_for.dim == 0 {
1358        report.error(
1359            "calibration_encoder_dim_zero",
1360            "calibration.calibrated_for.dim must be greater than zero",
1361        );
1362    }
1363    validate_optional_non_empty(
1364        "calibration_encoder_model_revision_empty",
1365        "calibration.calibrated_for.model_revision must be non-empty when present",
1366        calibration.calibrated_for.model_revision.as_deref(),
1367        report,
1368    );
1369    validate_optional_non_empty(
1370        "calibration_encoder_normalization_empty",
1371        "calibration.calibrated_for.normalization must be non-empty when present",
1372        calibration.calibrated_for.normalization.as_deref(),
1373        report,
1374    );
1375    if calibration.ordinalization.dim() == 0 {
1376        report.error(
1377            "calibration_ordinalization_dim_zero",
1378            "calibration.ordinalization.dim must be greater than zero",
1379        );
1380    }
1381    match &calibration.ordinalization {
1382        CalibrationOrdinalization::TopK { k, .. } if *k == 0 => {
1383            report.error(
1384                "calibration_ordinalization_artifact_mismatch",
1385                "calibration top_k.k must be greater than zero",
1386            );
1387        }
1388        CalibrationOrdinalization::Bucket { bits, .. } if !matches!(*bits, 1 | 2 | 4) => {
1389            report.error(
1390                "calibration_ordinalization_artifact_mismatch",
1391                "calibration bucket.bits must be 1, 2, or 4",
1392            );
1393        }
1394        CalibrationOrdinalization::CallerDefined { name, .. } if name.trim().is_empty() => {
1395            report.error(
1396                "calibration_ordinalization_artifact_mismatch",
1397                "calibration caller_defined.name must be non-empty",
1398            );
1399        }
1400        _ => {}
1401    }
1402    match &calibration.null_model {
1403        NullModelSpec::EmpiricalTailTable { statistic } if statistic.trim().is_empty() => {
1404            report.error(
1405                "calibration_null_statistic_empty",
1406                "calibration.null_model.statistic must be non-empty",
1407            );
1408        }
1409        NullModelSpec::CallerDefined {
1410            name,
1411            parameterization,
1412        } => {
1413            if name.trim().is_empty() {
1414                report.error(
1415                    "calibration_null_name_empty",
1416                    "calibration.null_model.name must be non-empty",
1417                );
1418            }
1419            validate_optional_non_empty(
1420                "calibration_null_parameterization_empty",
1421                "calibration.null_model.parameterization must be non-empty when present",
1422                parameterization.as_deref(),
1423                report,
1424            );
1425        }
1426        _ => {}
1427    }
1428}
1429
1430fn validate_calibration_encoder(
1431    calibration: &CalibrationProfileRef,
1432    embedding: &Embedding,
1433    report: &mut VerificationReport,
1434) {
1435    if calibration.calibrated_for.model != embedding.model {
1436        report.error(
1437            "calibration_encoder_model_mismatch",
1438            format!(
1439                "calibration model {:?} does not match embedding.model {:?}",
1440                calibration.calibrated_for.model, embedding.model
1441            ),
1442        );
1443    }
1444    if calibration.calibrated_for.dim != embedding.dim {
1445        report.error(
1446            "calibration_encoder_dim_mismatch",
1447            format!(
1448                "calibration dim {} does not match embedding.dim {}",
1449                calibration.calibrated_for.dim, embedding.dim
1450            ),
1451        );
1452    }
1453    compare_optional_identity(
1454        "calibration_encoder_model_revision_mismatch",
1455        "calibration encoder",
1456        "model_revision",
1457        embedding.model_revision.as_deref(),
1458        calibration.calibrated_for.model_revision.as_deref(),
1459        report,
1460    );
1461    compare_optional_identity(
1462        "calibration_encoder_normalization_mismatch",
1463        "calibration encoder",
1464        "normalization",
1465        embedding.normalization.as_deref(),
1466        calibration.calibrated_for.normalization.as_deref(),
1467        report,
1468    );
1469}
1470
1471fn compare_optional_identity(
1472    code: &str,
1473    subject: &str,
1474    field: &str,
1475    embedding_value: Option<&str>,
1476    calibration_value: Option<&str>,
1477    report: &mut VerificationReport,
1478) {
1479    compare_optional_encoder_identity(
1480        code,
1481        subject,
1482        field,
1483        embedding_value,
1484        calibration_value,
1485        report,
1486    );
1487}
1488
1489fn compare_optional_encoder_identity(
1490    code: &str,
1491    subject: &str,
1492    field: &str,
1493    embedding_value: Option<&str>,
1494    observed_value: Option<&str>,
1495    report: &mut VerificationReport,
1496) {
1497    match (embedding_value, observed_value) {
1498        (Some(expected), Some(observed)) if expected == observed => {}
1499        (None, None) => {}
1500        _ => report.error(
1501            code,
1502            format!("{subject} {field} does not match embedding.{field}"),
1503        ),
1504    }
1505}
1506
1507fn validate_calibration_ordinalization(
1508    calibration: &CalibrationProfileRef,
1509    artifact: &Artifact,
1510    report: &mut VerificationReport,
1511) {
1512    if calibration.ordinalization.dim() != artifact.dim {
1513        report.error(
1514            "calibration_ordinalization_dim_mismatch",
1515            format!(
1516                "calibration ordinalization dim {} does not match artifact.dim {}",
1517                calibration.ordinalization.dim(),
1518                artifact.dim
1519            ),
1520        );
1521    }
1522
1523    let compatible = match (artifact.kind, &artifact.params, &calibration.ordinalization) {
1524        (
1525            ManifestIndexKind::Bitmap,
1526            ManifestIndexParams::Bitmap { n_top },
1527            CalibrationOrdinalization::TopK { k, .. },
1528        ) => k == n_top,
1529        (
1530            ManifestIndexKind::RankQuant,
1531            ManifestIndexParams::RankQuant { bits },
1532            CalibrationOrdinalization::Bucket {
1533                bits: calibrated_bits,
1534                ..
1535            },
1536        ) => calibrated_bits == bits,
1537        (
1538            ManifestIndexKind::SignBitmap,
1539            ManifestIndexParams::SignBitmap,
1540            CalibrationOrdinalization::Sign { .. },
1541        ) => true,
1542        (
1543            ManifestIndexKind::Rank,
1544            ManifestIndexParams::Rank,
1545            CalibrationOrdinalization::RankPosition { .. }
1546            | CalibrationOrdinalization::CallerDefined { .. },
1547        ) => true,
1548        _ => false,
1549    };
1550
1551    if !compatible {
1552        report.error(
1553            "calibration_ordinalization_artifact_mismatch",
1554            "calibration.ordinalization is incompatible with artifact.kind/artifact.params",
1555        );
1556    }
1557}
1558
1559fn validate_calibration_null_model_ordinalization(
1560    calibration: &CalibrationProfileRef,
1561    report: &mut VerificationReport,
1562) {
1563    if matches!(
1564        (&calibration.null_model, &calibration.ordinalization),
1565        (
1566            NullModelSpec::UniformHypergeometric,
1567            CalibrationOrdinalization::TopK { .. }
1568        )
1569    ) {
1570        return;
1571    }
1572    if matches!(
1573        &calibration.null_model,
1574        NullModelSpec::UniformHypergeometric
1575    ) {
1576        report.error(
1577            "calibration_null_model_ordinalization_mismatch",
1578            "uniform_hypergeometric calibration requires top_k ordinalization",
1579        );
1580    }
1581}
1582
1583fn validate_calibration_profile(
1584    calibration: &CalibrationProfileRef,
1585    artifact: &Artifact,
1586    base_dir: &Path,
1587    options: &VerifyOptions,
1588    report: &mut VerificationReport,
1589) {
1590    if matches!(
1591        &calibration.null_model,
1592        NullModelSpec::UniformHypergeometric
1593    ) {
1594        if calibration.profile.is_some() {
1595            report.error(
1596                "calibration_profile_unexpected",
1597                "uniform_hypergeometric calibration must not include a profile artifact",
1598            );
1599        }
1600        return;
1601    }
1602
1603    let Some(profile) = &calibration.profile else {
1604        report.error(
1605            "calibration_profile_required",
1606            "non-uniform calibration requires a profile artifact",
1607        );
1608        return;
1609    };
1610
1611    report.calibration.profile_manifest_path = Some(profile.path.clone());
1612    if profile.path.trim().is_empty() {
1613        report.error(
1614            "calibration_profile_path_empty",
1615            "calibration.profile.path must be non-empty",
1616        );
1617    }
1618    if !is_sha256_hex(&profile.sha256) {
1619        report.error(
1620            "calibration_profile_sha256_invalid",
1621            "calibration.profile.sha256 must be a lowercase 64-character hex SHA-256 digest",
1622        );
1623    }
1624    if profile.file_size_bytes == 0 {
1625        report.error(
1626            "calibration_profile_file_size_zero",
1627            "calibration.profile.file_size_bytes must be greater than zero",
1628        );
1629    }
1630    if profile.dim != artifact.dim {
1631        report.error(
1632            "calibration_profile_dim_mismatch",
1633            format!(
1634                "calibration profile dim {} does not match artifact.dim {}",
1635                profile.dim, artifact.dim
1636            ),
1637        );
1638    }
1639    if profile.sample_count == 0 {
1640        report.error(
1641            "calibration_profile_sample_count_zero",
1642            "calibration.profile.sample_count must be greater than zero",
1643        );
1644    }
1645    validate_optional_source_digest(profile.source_digest.as_deref(), report);
1646    validate_calibration_parameterization(calibration, profile, report);
1647    validate_calibration_profile_shape(profile, &calibration.ordinalization, report);
1648
1649    if !profile.path.trim().is_empty() {
1650        let path = PathBuf::from(&profile.path);
1651        if let Some(resolved) = resolve_existing_path(
1652            &path,
1653            base_dir,
1654            options,
1655            "calibration_profile",
1656            &mut report.errors,
1657        ) {
1658            report.calibration.profile_canonical_path =
1659                Some(path_to_display(&resolved.canonical_path));
1660            match sha256_file(&resolved.resolved_path) {
1661                Ok(hash) => {
1662                    report.calibration.profile_sha256 = Some(hash.sha256.clone());
1663                    report.calibration.profile_size_bytes = Some(hash.size_bytes);
1664                    if !hex_digest_eq(&hash.sha256, &profile.sha256) {
1665                        report.error(
1666                            "calibration_profile_sha256_mismatch",
1667                            format!(
1668                                "calibration profile SHA-256 was {}, manifest declares {}",
1669                                hash.sha256, profile.sha256
1670                            ),
1671                        );
1672                    }
1673                    if hash.size_bytes != profile.file_size_bytes {
1674                        report.error(
1675                            "calibration_profile_file_size_mismatch",
1676                            format!(
1677                                "calibration profile size was {}, manifest declares {}",
1678                                hash.size_bytes, profile.file_size_bytes
1679                            ),
1680                        );
1681                    }
1682                }
1683                Err(err) => report.error(
1684                    "calibration_profile_hash_failed",
1685                    format!("failed to hash calibration profile: {err}"),
1686                ),
1687            }
1688        }
1689    }
1690}
1691
1692fn validate_optional_source_digest(value: Option<&str>, report: &mut VerificationReport) {
1693    let Some(value) = value else {
1694        return;
1695    };
1696    let Some(digest) = value.strip_prefix("sha256:") else {
1697        report.error(
1698            "calibration_profile_source_digest_invalid",
1699            "calibration.profile.source_digest must be sha256:<lowercase-hex>",
1700        );
1701        return;
1702    };
1703    if !is_sha256_hex(digest) {
1704        report.error(
1705            "calibration_profile_source_digest_invalid",
1706            "calibration.profile.source_digest must be sha256:<lowercase-hex>",
1707        );
1708    }
1709}
1710
1711fn validate_calibration_parameterization(
1712    calibration: &CalibrationProfileRef,
1713    profile: &ProfileArtifactRef,
1714    report: &mut VerificationReport,
1715) {
1716    match &calibration.null_model {
1717        NullModelSpec::WeightedMarginalProfile { parameterization }
1718            if *parameterization != profile.parameterization =>
1719        {
1720            report.error(
1721                "calibration_null_parameterization_mismatch",
1722                format!(
1723                    "null_model parameterization {:?} does not match profile parameterization {:?}",
1724                    parameterization, profile.parameterization
1725                ),
1726            );
1727        }
1728        NullModelSpec::EmpiricalTailTable { .. }
1729            if profile.parameterization != ProfileParameterization::EmpiricalTailTable =>
1730        {
1731            report.error(
1732                "calibration_null_parameterization_mismatch",
1733                "empirical_tail_table null_model requires empirical_tail_table profile parameterization",
1734            );
1735        }
1736        _ => {}
1737    }
1738    if !profile_parameterization_matches_ordinalization(
1739        profile.parameterization,
1740        &calibration.ordinalization,
1741    ) {
1742        report.error(
1743            "calibration_profile_parameterization_ordinalization_mismatch",
1744            "calibration profile parameterization is incompatible with calibration ordinalization",
1745        );
1746    }
1747}
1748
1749fn profile_parameterization_matches_ordinalization(
1750    parameterization: ProfileParameterization,
1751    ordinalization: &CalibrationOrdinalization,
1752) -> bool {
1753    match ordinalization {
1754        CalibrationOrdinalization::TopK { .. } => matches!(
1755            parameterization,
1756            ProfileParameterization::MarginalTopKFrequency
1757                | ProfileParameterization::EmpiricalTailTable
1758        ),
1759        CalibrationOrdinalization::Bucket { .. } => matches!(
1760            parameterization,
1761            ProfileParameterization::BucketFrequency | ProfileParameterization::EmpiricalTailTable
1762        ),
1763        CalibrationOrdinalization::Sign { .. } => matches!(
1764            parameterization,
1765            ProfileParameterization::SignFrequency | ProfileParameterization::EmpiricalTailTable
1766        ),
1767        CalibrationOrdinalization::RankPosition { .. } => matches!(
1768            parameterization,
1769            ProfileParameterization::RankPositionFrequency
1770                | ProfileParameterization::EmpiricalTailTable
1771        ),
1772        CalibrationOrdinalization::CallerDefined { .. } => true,
1773    }
1774}
1775
1776fn validate_calibration_profile_shape(
1777    profile: &ProfileArtifactRef,
1778    ordinalization: &CalibrationOrdinalization,
1779    report: &mut VerificationReport,
1780) {
1781    if profile.format.trim().is_empty() {
1782        report.error(
1783            "calibration_profile_format_empty",
1784            "calibration.profile.format must be non-empty",
1785        );
1786    }
1787
1788    if profile.shape.is_empty() {
1789        return;
1790    }
1791
1792    if let Some(expected) = expected_profile_shape(profile.parameterization, ordinalization) {
1793        if profile.shape != expected {
1794            report.error(
1795                "calibration_profile_shape_mismatch",
1796                format!(
1797                    "calibration profile shape {:?} does not match expected {:?}",
1798                    profile.shape, expected
1799                ),
1800            );
1801        }
1802    }
1803
1804    let bytes_per_value = match profile.format.as_str() {
1805        "raw_f64_le" => Some(8u64),
1806        "raw_f32_le" => Some(4u64),
1807        _ => None,
1808    };
1809    let Some(bytes_per_value) = bytes_per_value else {
1810        return;
1811    };
1812    let Some(values) = profile
1813        .shape
1814        .iter()
1815        .try_fold(1u64, |acc, value| acc.checked_mul(*value as u64))
1816    else {
1817        report.error(
1818            "calibration_profile_shape_mismatch",
1819            "calibration.profile.shape product overflows u64",
1820        );
1821        return;
1822    };
1823    let Some(expected_bytes) = values.checked_mul(bytes_per_value) else {
1824        report.error(
1825            "calibration_profile_shape_mismatch",
1826            "calibration.profile.shape byte size overflows u64",
1827        );
1828        return;
1829    };
1830    if profile.file_size_bytes != expected_bytes {
1831        report.error(
1832            "calibration_profile_file_size_mismatch",
1833            format!(
1834                "calibration profile size {} does not match shape/format size {}",
1835                profile.file_size_bytes, expected_bytes
1836            ),
1837        );
1838    }
1839}
1840
1841fn expected_profile_shape(
1842    parameterization: ProfileParameterization,
1843    ordinalization: &CalibrationOrdinalization,
1844) -> Option<Vec<usize>> {
1845    match parameterization {
1846        ProfileParameterization::MarginalTopKFrequency => Some(vec![ordinalization.dim()]),
1847        ProfileParameterization::SignFrequency => Some(vec![ordinalization.dim()]),
1848        ProfileParameterization::BucketFrequency => match ordinalization {
1849            CalibrationOrdinalization::Bucket { dim, bits } if matches!(*bits, 1 | 2 | 4) => {
1850                Some(vec![*dim, 1usize << *bits])
1851            }
1852            _ => None,
1853        },
1854        ProfileParameterization::RankPositionFrequency => {
1855            Some(vec![ordinalization.dim(), ordinalization.dim()])
1856        }
1857        ProfileParameterization::EmpiricalTailTable => None,
1858    }
1859}
1860
1861fn verify_auxiliary_artifacts(
1862    document: &ManifestDocument,
1863    options: &VerifyOptions,
1864    report: &mut VerificationReport,
1865    paths: &mut VerificationPathCapture,
1866) {
1867    if !check_auxiliary_artifact_count(&document.manifest, &options.limits, report) {
1868        return;
1869    }
1870    let artifacts = auxiliary_artifacts_in_report_order(&document.manifest);
1871    let base_canonical = if options.allow_path_escape {
1872        None
1873    } else {
1874        match fs::canonicalize(&document.base_dir) {
1875            Ok(path) => Some(path),
1876            Err(err) => {
1877                for artifact in artifacts {
1878                    let mut entry = auxiliary_artifact_report_entry(artifact, &document.base_dir);
1879                    if artifact.path.trim().is_empty() {
1880                        mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty");
1881                    } else {
1882                        report.error(
1883                            "auxiliary_artifact_base_dir_unavailable",
1884                            format!(
1885                                "failed to canonicalize base_dir {} for auxiliary artifact {:?}: {err}",
1886                                document.base_dir.display(),
1887                                artifact.name
1888                            ),
1889                        );
1890                        mark_auxiliary_artifact_failed(
1891                            &mut entry,
1892                            "auxiliary_artifact_base_dir_unavailable",
1893                        );
1894                    }
1895                    report.auxiliary_artifacts.push(entry);
1896                }
1897                return;
1898            }
1899        }
1900    };
1901
1902    for artifact in artifacts {
1903        let mut entry = auxiliary_artifact_report_entry(artifact, &document.base_dir);
1904        let mut captured_path = None;
1905
1906        if artifact.path.trim().is_empty() {
1907            mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty");
1908            report.auxiliary_artifacts.push(entry);
1909            paths.auxiliary_artifact_paths.push(None);
1910            continue;
1911        }
1912
1913        match resolve_auxiliary_artifact_path(
1914            artifact,
1915            &document.base_dir,
1916            base_canonical.as_deref(),
1917            options,
1918            report,
1919        ) {
1920            AuxiliaryPathResolution::Resolved(resolved) => {
1921                captured_path = Some(resolved.canonical_path.clone());
1922                entry.canonical_path = Some(path_to_display(&resolved.canonical_path));
1923                match sha256_file_bounded(
1924                    &resolved.resolved_path,
1925                    options.limits.max_auxiliary_artifact_bytes,
1926                    "auxiliary_artifact_file_too_large",
1927                    "auxiliary artifact",
1928                ) {
1929                    Ok(hash) => {
1930                        entry.sha256 = Some(hash.sha256.clone());
1931                        entry.size_bytes = Some(hash.size_bytes);
1932                        if !hex_digest_eq(&hash.sha256, &artifact.sha256) {
1933                            mark_auxiliary_artifact_failed(
1934                                &mut entry,
1935                                "auxiliary_artifact_sha256_mismatch",
1936                            );
1937                            report.error(
1938                                "auxiliary_artifact_sha256_mismatch",
1939                                format!(
1940                                    "auxiliary artifact {:?} SHA-256 was {}, manifest declares {}",
1941                                    artifact.name, hash.sha256, artifact.sha256
1942                                ),
1943                            );
1944                        }
1945                        if hash.size_bytes != artifact.file_size_bytes {
1946                            mark_auxiliary_artifact_failed(
1947                                &mut entry,
1948                                "auxiliary_artifact_file_size_mismatch",
1949                            );
1950                            report.error(
1951                                "auxiliary_artifact_file_size_mismatch",
1952                                format!(
1953                                    "auxiliary artifact {:?} size was {}, manifest declares {}",
1954                                    artifact.name, hash.size_bytes, artifact.file_size_bytes
1955                                ),
1956                            );
1957                        }
1958                        if entry.reason_code.is_none() {
1959                            entry.state = AuxiliaryArtifactState::Verified;
1960                        }
1961                    }
1962                    Err(err) => {
1963                        let code = err.code().unwrap_or("auxiliary_artifact_hash_failed");
1964                        mark_auxiliary_artifact_failed(&mut entry, code);
1965                        let message = if err.code().is_some() {
1966                            err.to_string()
1967                        } else {
1968                            format!(
1969                                "failed to hash auxiliary artifact {:?}: {err}",
1970                                artifact.name
1971                            )
1972                        };
1973                        report.error(code, message);
1974                    }
1975                }
1976            }
1977            AuxiliaryPathResolution::OptionalAbsent => {
1978                entry.state = AuxiliaryArtifactState::OptionalAbsent;
1979                entry.reason_code = Some("auxiliary_artifact_optional_absent".to_string());
1980            }
1981            AuxiliaryPathResolution::MissingRequired => {
1982                entry.state = AuxiliaryArtifactState::MissingRequired;
1983                entry.reason_code = Some("auxiliary_artifact_missing_required".to_string());
1984            }
1985            AuxiliaryPathResolution::Failed(code) => {
1986                entry.state = AuxiliaryArtifactState::Failed;
1987                entry.reason_code = Some(code);
1988            }
1989        }
1990
1991        report.auxiliary_artifacts.push(entry);
1992        paths.auxiliary_artifact_paths.push(captured_path);
1993    }
1994}
1995
1996fn auxiliary_artifact_report_entry(
1997    artifact: &AuxiliaryArtifact,
1998    base_dir: &Path,
1999) -> AuxiliaryArtifactReport {
2000    let resolved_path = if artifact.path.trim().is_empty() {
2001        None
2002    } else {
2003        Some(path_to_display(&auxiliary_artifact_resolved_path(
2004            artifact, base_dir,
2005        )))
2006    };
2007    AuxiliaryArtifactReport {
2008        name: artifact.name.clone(),
2009        manifest_path: artifact.path.clone(),
2010        resolved_path,
2011        canonical_path: None,
2012        expected_sha256: Some(artifact.sha256.clone()),
2013        expected_size_bytes: Some(artifact.file_size_bytes),
2014        required: artifact.required,
2015        state: AuxiliaryArtifactState::Failed,
2016        reason_code: None,
2017        sha256: None,
2018        size_bytes: None,
2019    }
2020}
2021
2022fn check_auxiliary_artifact_count(
2023    manifest: &IndexManifest,
2024    limits: &ResourceLimits,
2025    report: &mut VerificationReport,
2026) -> bool {
2027    let count = manifest.auxiliary_artifacts.len();
2028    if count <= limits.max_auxiliary_artifacts {
2029        return true;
2030    }
2031    if !report
2032        .errors
2033        .iter()
2034        .any(|issue| issue.code == "auxiliary_artifact_count_limit_exceeded")
2035    {
2036        push_report_issue_bounded(
2037            &mut report.errors,
2038            limits,
2039            "auxiliary_artifact_count_limit_exceeded",
2040            format!(
2041                "auxiliary_artifacts has {count} entries, exceeding max_auxiliary_artifacts={}",
2042                limits.max_auxiliary_artifacts
2043            ),
2044        );
2045    }
2046    false
2047}
2048
2049fn auxiliary_artifacts_in_report_order(manifest: &IndexManifest) -> Vec<&AuxiliaryArtifact> {
2050    let mut artifacts: Vec<_> = manifest.auxiliary_artifacts.iter().collect();
2051    artifacts.sort_by(|left, right| {
2052        left.name
2053            .cmp(&right.name)
2054            .then_with(|| left.path.cmp(&right.path))
2055            .then_with(|| left.required.cmp(&right.required))
2056    });
2057    artifacts
2058}
2059
/// Outcome of resolving one auxiliary artifact path on disk.
enum AuxiliaryPathResolution {
    /// The path canonicalized and passed the enabled path checks.
    Resolved(ResolvedPath),
    /// The file was not found and the artifact is not required.
    OptionalAbsent,
    /// The file was not found and the artifact is required.
    MissingRequired,
    /// Resolution was rejected or failed; carries the reason code.
    Failed(String),
}
2066
2067fn resolve_auxiliary_artifact_path(
2068    artifact: &AuxiliaryArtifact,
2069    base_dir: &Path,
2070    base_canonical: Option<&Path>,
2071    options: &VerifyOptions,
2072    report: &mut VerificationReport,
2073) -> AuxiliaryPathResolution {
2074    let path = Path::new(&artifact.path);
2075    if path.is_absolute() && !options.allow_absolute_paths {
2076        report.error(
2077            "auxiliary_artifact_absolute_path_rejected",
2078            format!(
2079                "absolute auxiliary artifact path {} for {:?} is rejected by default",
2080                path.display(),
2081                artifact.name
2082            ),
2083        );
2084        return AuxiliaryPathResolution::Failed(
2085            "auxiliary_artifact_absolute_path_rejected".to_string(),
2086        );
2087    }
2088
2089    if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) {
2090        report.error(
2091            "auxiliary_artifact_path_escape_rejected",
2092            format!(
2093                "relative auxiliary artifact path {} for {:?} escapes the manifest base",
2094                path.display(),
2095                artifact.name
2096            ),
2097        );
2098        return AuxiliaryPathResolution::Failed(
2099            "auxiliary_artifact_path_escape_rejected".to_string(),
2100        );
2101    }
2102
2103    let resolved_path = auxiliary_artifact_resolved_path(artifact, base_dir);
2104    let canonical_path = match fs::canonicalize(&resolved_path) {
2105        Ok(path) => path,
2106        Err(err) if err.kind() == io::ErrorKind::NotFound && !artifact.required => {
2107            return AuxiliaryPathResolution::OptionalAbsent;
2108        }
2109        Err(err) if err.kind() == io::ErrorKind::NotFound => {
2110            report.error(
2111                "auxiliary_artifact_missing_required",
2112                format!(
2113                    "required auxiliary artifact {:?} is missing at {}",
2114                    artifact.name,
2115                    resolved_path.display()
2116                ),
2117            );
2118            return AuxiliaryPathResolution::MissingRequired;
2119        }
2120        Err(err) => {
2121            report.error(
2122                "auxiliary_artifact_path_unavailable",
2123                format!(
2124                    "failed to canonicalize auxiliary artifact {:?} at {}: {err}",
2125                    artifact.name,
2126                    resolved_path.display()
2127                ),
2128            );
2129            return AuxiliaryPathResolution::Failed(
2130                "auxiliary_artifact_path_unavailable".to_string(),
2131            );
2132        }
2133    };
2134
2135    if let Some(base_canonical) = base_canonical {
2136        if !canonical_path.starts_with(base_canonical) {
2137            report.error(
2138                "auxiliary_artifact_path_escape_rejected",
2139                format!(
2140                    "canonical auxiliary artifact path {} for {:?} is outside manifest base {}",
2141                    canonical_path.display(),
2142                    artifact.name,
2143                    base_canonical.display()
2144                ),
2145            );
2146            return AuxiliaryPathResolution::Failed(
2147                "auxiliary_artifact_path_escape_rejected".to_string(),
2148            );
2149        }
2150    }
2151
2152    AuxiliaryPathResolution::Resolved(ResolvedPath {
2153        resolved_path,
2154        canonical_path,
2155    })
2156}
2157
2158fn auxiliary_artifact_resolved_path(artifact: &AuxiliaryArtifact, base_dir: &Path) -> PathBuf {
2159    let path = Path::new(&artifact.path);
2160    if path.is_absolute() {
2161        path.to_path_buf()
2162    } else {
2163        base_dir.join(path)
2164    }
2165}
2166
2167fn mark_auxiliary_artifact_failed(entry: &mut AuxiliaryArtifactReport, code: &str) {
2168    entry.state = AuxiliaryArtifactState::Failed;
2169    if entry.reason_code.is_none() {
2170        entry.reason_code = Some(code.to_string());
2171    }
2172}
2173
2174fn verify_attestations(manifest: &IndexManifest, report: &mut VerificationReport) {
2175    if manifest.attestations.is_empty() {
2176        report
2177            .skipped_checks
2178            .push("attestations_absent".to_string());
2179        return;
2180    }
2181
2182    let artifact_sha = report
2183        .artifact
2184        .sha256
2185        .clone()
2186        .unwrap_or_else(|| manifest.artifact.sha256.clone());
2187    let mut any_subject_match = false;
2188    for (idx, attestation) in manifest.attestations.iter().enumerate() {
2189        let predicate_type = attestation
2190            .get("predicateType")
2191            .or_else(|| attestation.get("predicate_type"))
2192            .and_then(serde_json::Value::as_str)
2193            .map(ToOwned::to_owned);
2194        if predicate_type.is_none() {
2195            report.error(
2196                "attestation_predicate_type_missing",
2197                format!("attestation {idx} has no predicateType"),
2198            );
2199        }
2200
2201        let builder_id = attestation
2202            .pointer("/predicate/builder/id")
2203            .or_else(|| attestation.pointer("/predicate/runDetails/builder/id"))
2204            .and_then(serde_json::Value::as_str)
2205            .map(ToOwned::to_owned);
2206
2207        let subject_sha256_matched = attestation
2208            .get("subject")
2209            .and_then(serde_json::Value::as_array)
2210            .is_some_and(|subjects| {
2211                subjects.iter().any(|subject| {
2212                    subject
2213                        .pointer("/digest/sha256")
2214                        .and_then(serde_json::Value::as_str)
2215                        .is_some_and(|digest| hex_digest_eq(digest, &artifact_sha))
2216                })
2217            });
2218        any_subject_match |= subject_sha256_matched;
2219        report.attestation_shape_checks.push(AttestationShapeCheck {
2220            predicate_type,
2221            builder_id,
2222            subject_sha256_matched,
2223        });
2224    }
2225
2226    if !any_subject_match {
2227        report.error(
2228            "attestation_subject_sha256_mismatch",
2229            "no supplied attestation subject digest matches the artifact SHA-256",
2230        );
2231    }
2232}
2233
/// Caller-selected switches and limits for manifest verification.
///
/// The derived `Default` leaves every `allow_*` flag `false` and
/// `index_override` unset.
#[derive(Clone, Debug, Default)]
pub struct VerifyOptions {
    /// When `false`, absolute manifest paths are rejected during path resolution.
    pub allow_absolute_paths: bool,
    /// When `false`, paths that lexically or canonically leave the manifest
    /// base directory are rejected.
    pub allow_path_escape: bool,
    // NOTE(review): presumably tolerates repeated db ids in row identity data;
    // the consuming check is outside this section — confirm.
    pub allow_duplicate_db_ids: bool,
    // NOTE(review): looks like an alternate location for the index file; its
    // use is outside this section — confirm.
    pub index_override: Option<PathBuf>,
    /// Resource ceilings applied while verifying.
    pub limits: ResourceLimits,
}
2242
/// Upper bounds on the resources verification may consume.
///
/// `Default` fills each field from the matching `DEFAULT_MAX_*` constant.
// NOTE(review): only `max_auxiliary_artifacts` and
// `max_auxiliary_artifact_bytes` are seen being enforced near this
// definition; the other fields are described from their names — confirm
// against their call sites.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResourceLimits {
    /// Largest accepted manifest file, in bytes.
    pub max_manifest_bytes: u64,
    /// Longest accepted row-identity JSONL line, in bytes.
    pub max_row_identity_jsonl_line_bytes: usize,
    /// Largest accepted number of row-identity rows.
    pub max_row_identity_rows: usize,
    /// Ceiling on bytes of db ids tracked for row identity.
    pub max_row_identity_tracked_db_id_bytes: usize,
    /// Largest accepted number of `auxiliary_artifacts` entries.
    pub max_auxiliary_artifacts: usize,
    /// Byte ceiling passed to the bounded hasher for each auxiliary artifact.
    pub max_auxiliary_artifact_bytes: u64,
    /// Largest accepted encoder-distortion profile file, in bytes.
    pub max_encoder_distortion_profile_bytes: u64,
    /// Ceiling on the number of issues kept in a report.
    pub max_report_issues: usize,
    /// Ceiling on the size of a cached report, in bytes.
    pub max_cached_report_bytes: u64,
}
2255
impl Default for ResourceLimits {
    /// Returns limits with every field set to its matching `DEFAULT_MAX_*`
    /// crate constant.
    fn default() -> Self {
        Self {
            max_manifest_bytes: DEFAULT_MAX_MANIFEST_BYTES,
            max_row_identity_jsonl_line_bytes: DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES,
            max_row_identity_rows: DEFAULT_MAX_ROW_IDENTITY_ROWS,
            max_row_identity_tracked_db_id_bytes: DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES,
            max_auxiliary_artifacts: DEFAULT_MAX_AUXILIARY_ARTIFACTS,
            max_auxiliary_artifact_bytes: DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES,
            max_encoder_distortion_profile_bytes: DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES,
            max_report_issues: DEFAULT_MAX_REPORT_ISSUES,
            max_cached_report_bytes: DEFAULT_MAX_CACHED_REPORT_BYTES,
        }
    }
}
2271
/// A manifest path after resolution against the base directory.
#[derive(Clone, Debug)]
struct ResolvedPath {
    /// The manifest path itself when absolute, otherwise joined onto the base
    /// directory; not canonicalized.
    resolved_path: PathBuf,
    /// Result of `fs::canonicalize` on `resolved_path`.
    canonical_path: PathBuf,
}
2277
/// Paths collected while verifying, kept separately from the report.
#[derive(Clone, Debug, Default)]
struct VerificationPathCapture {
    // NOTE(review): presumably the canonical index artifact path; it is set
    // outside this section — confirm.
    artifact_path: Option<PathBuf>,
    // NOTE(review): presumably the canonical row-identity path; it is set
    // outside this section — confirm.
    row_identity_path: Option<PathBuf>,
    /// One slot per auxiliary artifact processed by the main verification
    /// loop: the canonical path when the artifact resolved, `None` otherwise.
    auxiliary_artifact_paths: Vec<Option<PathBuf>>,
}
2284
2285fn resolve_existing_path(
2286    path: &Path,
2287    base_dir: &Path,
2288    options: &VerifyOptions,
2289    context: &str,
2290    errors: &mut Vec<ReportIssue>,
2291) -> Option<ResolvedPath> {
2292    if path.is_absolute() && !options.allow_absolute_paths {
2293        errors.push(ReportIssue::new(
2294            format!("{context}_absolute_path_rejected"),
2295            format!("absolute path {} is rejected by default", path.display()),
2296        ));
2297        return None;
2298    }
2299
2300    let base_canonical = match fs::canonicalize(base_dir) {
2301        Ok(path) => path,
2302        Err(err) => {
2303            errors.push(ReportIssue::new(
2304                format!("{context}_base_dir_unavailable"),
2305                format!(
2306                    "failed to canonicalize base_dir {}: {err}",
2307                    base_dir.display()
2308                ),
2309            ));
2310            return None;
2311        }
2312    };
2313
2314    if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) {
2315        errors.push(ReportIssue::new(
2316            format!("{context}_path_escape_rejected"),
2317            format!("relative path {} escapes the manifest base", path.display()),
2318        ));
2319        return None;
2320    }
2321
2322    let resolved_path = if path.is_absolute() {
2323        path.to_path_buf()
2324    } else {
2325        base_dir.join(path)
2326    };
2327    let canonical_path = match fs::canonicalize(&resolved_path) {
2328        Ok(path) => path,
2329        Err(err) => {
2330            errors.push(ReportIssue::new(
2331                format!("{context}_path_unavailable"),
2332                format!("failed to canonicalize {}: {err}", resolved_path.display()),
2333            ));
2334            return None;
2335        }
2336    };
2337
2338    if !options.allow_path_escape && !canonical_path.starts_with(&base_canonical) {
2339        errors.push(ReportIssue::new(
2340            format!("{context}_path_escape_rejected"),
2341            format!(
2342                "canonical path {} is outside manifest base {}",
2343                canonical_path.display(),
2344                base_canonical.display()
2345            ),
2346        ));
2347        return None;
2348    }
2349
2350    Some(ResolvedPath {
2351        resolved_path,
2352        canonical_path,
2353    })
2354}
2355
/// Reports whether `path`, read purely as text, would leave its starting
/// directory.
///
/// The walk keeps a depth counter: a normal component goes one level down and
/// `..` goes one level up. The path escapes if `..` appears at depth zero, or
/// if it contains a root or prefix component. No filesystem access is
/// performed.
fn has_lexical_escape(path: &Path) -> bool {
    path.components()
        .try_fold(0usize, |depth, component| match component {
            Component::CurDir => Some(depth),
            Component::Normal(_) => Some(depth + 1),
            // `checked_sub` yields `None` exactly when `..` climbs above the start.
            Component::ParentDir => depth.checked_sub(1),
            Component::Prefix(_) | Component::RootDir => None,
        })
        .is_none()
}
2373
/// Serde default for `AuxiliaryArtifact::required`: artifacts are required
/// unless the manifest says otherwise.
fn default_required() -> bool {
    true
}
2377
/// Serde `skip_serializing_if` predicate: returns the flag itself, so a `true`
/// value is left out of serialized output. Takes `&bool` because serde passes
/// the field by reference.
fn is_true(value: &bool) -> bool {
    *value
}
2381
/// Top-level manifest document.
///
/// Unknown fields are rejected on deserialization. Optional sections and empty
/// collections are omitted when serializing.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct IndexManifest {
    /// Schema identifier string declared by the manifest.
    pub schema_version: String,
    /// Identifier string for this manifest.
    pub manifest_id: String,
    /// Creation timestamp, stored as text.
    pub created_at: String,
    /// The index artifact this manifest describes.
    pub artifact: Artifact,
    /// Additional artifacts bound by path, digest, and size; defaults to empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub auxiliary_artifacts: Vec<AuxiliaryArtifact>,
    /// Embedding metadata.
    pub embedding: Embedding,
    /// Optional encoder-distortion profile reference.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub encoder_distortion: Option<EncoderDistortionProfileRef>,
    /// Optional calibration profile reference.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub calibration: Option<CalibrationProfileRef>,
    /// Row identity section.
    pub row_identity: RowIdentity,
    /// Optional build information.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub build: Option<BuildInfo>,
    /// Attestation documents kept as raw JSON values; defaults to empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub attestations: Vec<serde_json::Value>,
    /// Free-form extension values keyed by name; defaults to empty.
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub extensions: BTreeMap<String, serde_json::Value>,
}
2404
/// Declared properties of the index artifact. Unknown fields are rejected on
/// deserialization.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Artifact {
    /// Path to the index file as written in the manifest.
    pub path: String,
    /// Declared SHA-256 digest of the index file, as hex text.
    pub sha256: String,
    /// Declared index kind.
    pub kind: ManifestIndexKind,
    /// Declared format version.
    pub format_version: u8,
    /// Declared vector dimension; the calibration profile's `dim` is checked
    /// against this value.
    pub dim: usize,
    /// Declared number of vectors.
    pub vector_count: usize,
    /// Declared bytes per stored vector.
    pub bytes_per_vec: usize,
    /// Declared index parameters.
    pub params: ManifestIndexParams,
    /// Declared size of the index file, in bytes.
    pub file_size_bytes: u64,
}
2418
/// One caller-owned file bound to the manifest by path, digest, and size.
/// Unknown fields are rejected on deserialization.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct AuxiliaryArtifact {
    /// Name used to identify the artifact in reports and error messages.
    pub name: String,
    /// Path as written in the manifest; relative paths are joined onto the
    /// manifest base directory.
    pub path: String,
    /// Declared SHA-256 digest, as hex text.
    pub sha256: String,
    /// Declared file size, in bytes.
    pub file_size_bytes: u64,
    /// Whether a missing file is an error. Defaults to `true` when absent and
    /// is omitted from serialized output when `true`.
    #[serde(default = "default_required", skip_serializing_if = "is_true")]
    pub required: bool,
}
2429
/// Embedding metadata declared by the manifest. Unknown fields are rejected on
/// deserialization; optional fields are omitted from serialized output when
/// unset.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Embedding {
    /// Embedding model name.
    pub model: String,
    /// Embedding dimension.
    pub dim: usize,
    /// Optional model revision string.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_revision: Option<String>,
    /// Optional tokenizer revision string.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokenizer_revision: Option<String>,
    /// Optional pooling description.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pooling: Option<String>,
    /// Optional corpus digest string.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub corpus_digest: Option<String>,
    /// Optional embedding-matrix digest string.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub embedding_matrix_digest: Option<String>,
    /// Optional normalization description.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalization: Option<String>,
}
2448
/// Calibration section of a manifest. Unknown fields are rejected on
/// deserialization; optional fields are omitted from serialized output when
/// unset.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CalibrationProfileRef {
    /// Schema identifier string for the calibration section.
    pub schema_version: String,
    /// Identifier string for the calibration profile.
    pub profile_id: String,
    /// Optional creation timestamp, stored as text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub created_at: Option<String>,
    /// Encoder the calibration was produced for.
    pub calibrated_for: EncoderSpec,
    /// Ordinalization scheme; checked for compatibility with the profile's
    /// parameterization and used to derive the expected profile shape.
    pub ordinalization: CalibrationOrdinalization,
    /// Optional reference to the profile artifact on disk.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub profile: Option<ProfileArtifactRef>,
    /// Null model; checked for agreement with the profile's parameterization.
    pub null_model: NullModelSpec,
}
2462
/// Identifies an encoder by model name and dimension. Unknown fields are
/// rejected on deserialization; optional fields are omitted from serialized
/// output when unset.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct EncoderSpec {
    /// Encoder model name.
    pub model: String,
    /// Encoder output dimension.
    pub dim: usize,
    /// Optional model revision string.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_revision: Option<String>,
    /// Optional normalization description.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalization: Option<String>,
}
2473
/// Encoder-distortion section of a manifest. Unknown fields are rejected on
/// deserialization; optional fields are omitted from serialized output when
/// unset.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct EncoderDistortionProfileRef {
    /// Schema identifier string for the distortion section.
    pub schema_version: String,
    /// Identifier string for the distortion profile.
    pub profile_id: String,
    /// Optional creation timestamp, stored as text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub created_at: Option<String>,
    /// Encoder the profile describes.
    pub encoder: EncoderSpec,
    /// Optional tokenizer revision string.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokenizer_revision: Option<String>,
    /// Optional pooling description.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pooling: Option<String>,
    /// Metric declared for the source space.
    pub source_metric: MetricSpec,
    /// Metric declared for the embedding space.
    pub embedding_metric: MetricSpec,
    /// Declared and observed distortion figures.
    pub bounds: DistortionBounds,
    /// Scope the distortion claims apply to.
    pub scope: DistortionScope,
    /// Evidence backing the distortion claims.
    pub evidence: DistortionEvidence,
    /// Optional reference to a profile artifact on disk.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub profile: Option<DistortionProfileArtifactRef>,
    /// Optional identifier of a related calibration profile.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub calibration_profile_id: Option<String>,
}
2496
2497#[derive(Clone, Debug, Serialize, Deserialize)]
2498#[serde(deny_unknown_fields)]
2499pub struct MetricSpec {
2500    pub name: String,
2501    #[serde(default, skip_serializing_if = "Option::is_none")]
2502    pub version: Option<String>,
2503    #[serde(default, skip_serializing_if = "Option::is_none")]
2504    pub digest: Option<String>,
2505}
2506
2507#[derive(Clone, Debug, Serialize, Deserialize)]
2508#[serde(deny_unknown_fields)]
2509pub struct DistortionBounds {
2510    #[serde(default, skip_serializing_if = "Option::is_none")]
2511    pub declared_lower_bound: Option<f64>,
2512    #[serde(default, skip_serializing_if = "Option::is_none")]
2513    pub declared_upper_bound: Option<f64>,
2514    #[serde(default, skip_serializing_if = "Option::is_none")]
2515    pub estimated_distortion: Option<f64>,
2516    #[serde(default, skip_serializing_if = "Option::is_none")]
2517    pub violation_rate: Option<f64>,
2518    #[serde(default, skip_serializing_if = "Option::is_none")]
2519    pub max_observed_violation: Option<f64>,
2520    #[serde(default, skip_serializing_if = "Option::is_none")]
2521    pub quantile_observed_violation: Option<f64>,
2522}
2523
2524#[derive(Clone, Debug, Serialize, Deserialize)]
2525#[serde(deny_unknown_fields)]
2526pub struct DistortionScope {
2527    #[serde(default, skip_serializing_if = "Option::is_none")]
2528    pub corpus_digest: Option<String>,
2529    #[serde(default, skip_serializing_if = "Option::is_none")]
2530    pub query_set_digest: Option<String>,
2531    #[serde(default, skip_serializing_if = "Option::is_none")]
2532    pub pair_sample_digest: Option<String>,
2533    #[serde(default, skip_serializing_if = "Option::is_none")]
2534    pub domain: Option<String>,
2535    #[serde(default, skip_serializing_if = "Option::is_none")]
2536    pub sample_size: Option<u64>,
2537    #[serde(default, skip_serializing_if = "Option::is_none")]
2538    pub confidence: Option<f64>,
2539    #[serde(default, skip_serializing_if = "Option::is_none")]
2540    pub coverage: Option<f64>,
2541    #[serde(default, skip_serializing_if = "Option::is_none")]
2542    pub estimator_version: Option<String>,
2543}
2544
2545#[derive(Clone, Debug, Serialize, Deserialize)]
2546#[serde(deny_unknown_fields)]
2547pub struct DistortionEvidence {
2548    pub kind: DistortionEvidenceKind,
2549    #[serde(default, skip_serializing_if = "Option::is_none")]
2550    pub estimator_id: Option<String>,
2551    #[serde(default, skip_serializing_if = "Option::is_none")]
2552    pub estimator_hash: Option<String>,
2553}
2554
2555#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
2556#[serde(rename_all = "snake_case")]
2557pub enum DistortionEvidenceKind {
2558    Certified,
2559    EmpiricalSample,
2560    BenchmarkEstimate,
2561    TeacherEstimate,
2562    CallerAsserted,
2563}
2564
2565impl DistortionEvidenceKind {
2566    pub fn label(&self) -> &'static str {
2567        match self {
2568            Self::Certified => "certified",
2569            Self::EmpiricalSample => "empirical_sample",
2570            Self::BenchmarkEstimate => "benchmark_estimate",
2571            Self::TeacherEstimate => "teacher_estimate",
2572            Self::CallerAsserted => "caller_asserted",
2573        }
2574    }
2575}
2576
2577#[derive(Clone, Debug, Serialize, Deserialize)]
2578#[serde(deny_unknown_fields)]
2579pub struct DistortionProfileArtifactRef {
2580    pub path: String,
2581    pub sha256: String,
2582    pub file_size_bytes: u64,
2583    pub format: String,
2584    #[serde(default, skip_serializing_if = "Option::is_none")]
2585    pub source_digest: Option<String>,
2586}
2587
2588#[derive(Clone, Debug, Serialize, Deserialize)]
2589#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
2590pub enum CalibrationOrdinalization {
2591    TopK { dim: usize, k: usize },
2592    Bucket { dim: usize, bits: u8 },
2593    Sign { dim: usize },
2594    RankPosition { dim: usize },
2595    CallerDefined { dim: usize, name: String },
2596}
2597
2598impl CalibrationOrdinalization {
2599    pub fn dim(&self) -> usize {
2600        match self {
2601            Self::TopK { dim, .. }
2602            | Self::Bucket { dim, .. }
2603            | Self::Sign { dim }
2604            | Self::RankPosition { dim }
2605            | Self::CallerDefined { dim, .. } => *dim,
2606        }
2607    }
2608
2609    pub fn label(&self) -> &'static str {
2610        match self {
2611            Self::TopK { .. } => "top_k",
2612            Self::Bucket { .. } => "bucket",
2613            Self::Sign { .. } => "sign",
2614            Self::RankPosition { .. } => "rank_position",
2615            Self::CallerDefined { .. } => "caller_defined",
2616        }
2617    }
2618}
2619
2620#[derive(Clone, Debug, Serialize, Deserialize)]
2621#[serde(deny_unknown_fields)]
2622pub struct ProfileArtifactRef {
2623    pub path: String,
2624    pub sha256: String,
2625    pub file_size_bytes: u64,
2626    pub dim: usize,
2627    pub sample_count: usize,
2628    pub parameterization: ProfileParameterization,
2629    pub format: String,
2630    #[serde(default, skip_serializing_if = "Vec::is_empty")]
2631    pub shape: Vec<usize>,
2632    #[serde(default, skip_serializing_if = "Option::is_none")]
2633    pub source_digest: Option<String>,
2634}
2635
2636#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
2637#[serde(rename_all = "snake_case")]
2638pub enum ProfileParameterization {
2639    #[serde(rename = "marginal_topk_frequency")]
2640    MarginalTopKFrequency,
2641    BucketFrequency,
2642    SignFrequency,
2643    RankPositionFrequency,
2644    EmpiricalTailTable,
2645}
2646
2647#[derive(Clone, Debug, Serialize, Deserialize)]
2648#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
2649pub enum NullModelSpec {
2650    UniformHypergeometric,
2651    WeightedMarginalProfile {
2652        parameterization: ProfileParameterization,
2653    },
2654    EmpiricalTailTable {
2655        statistic: String,
2656    },
2657    CallerDefined {
2658        name: String,
2659        #[serde(default, skip_serializing_if = "Option::is_none")]
2660        parameterization: Option<String>,
2661    },
2662}
2663
2664impl NullModelSpec {
2665    pub fn label(&self) -> &'static str {
2666        match self {
2667            Self::UniformHypergeometric => "uniform_hypergeometric",
2668            Self::WeightedMarginalProfile { .. } => "weighted_marginal_profile",
2669            Self::EmpiricalTailTable { .. } => "empirical_tail_table",
2670            Self::CallerDefined { .. } => "caller_defined",
2671        }
2672    }
2673}
2674
2675#[derive(Clone, Debug, Serialize, Deserialize)]
2676#[serde(deny_unknown_fields)]
2677pub struct BuildInfo {
2678    pub invocation_id: String,
2679    #[serde(default, skip_serializing_if = "Option::is_none")]
2680    pub builder_id: Option<String>,
2681    #[serde(default, skip_serializing_if = "Option::is_none")]
2682    pub source_repo: Option<String>,
2683    #[serde(default, skip_serializing_if = "Option::is_none")]
2684    pub source_commit: Option<String>,
2685    #[serde(default, skip_serializing_if = "Option::is_none")]
2686    pub ci_provider: Option<String>,
2687    #[serde(default, skip_serializing_if = "Option::is_none")]
2688    pub ci_run_id: Option<String>,
2689}
2690
2691#[derive(Clone, Debug, Serialize, Deserialize)]
2692#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
2693pub enum RowIdentity {
2694    RowIdIdentity {
2695        row_count: usize,
2696    },
2697    Jsonl {
2698        path: String,
2699        sha256: String,
2700        row_count: usize,
2701        id_kind: String,
2702        #[serde(default, skip_serializing_if = "Option::is_none")]
2703        db: Option<RowIdentityDb>,
2704    },
2705}
2706
2707impl RowIdentity {
2708    pub fn row_count(&self) -> usize {
2709        match self {
2710            Self::RowIdIdentity { row_count } | Self::Jsonl { row_count, .. } => *row_count,
2711        }
2712    }
2713}
2714
2715#[derive(Clone, Debug, Serialize, Deserialize)]
2716#[serde(deny_unknown_fields)]
2717pub struct RowIdentityDb {
2718    #[serde(default, skip_serializing_if = "Option::is_none")]
2719    pub path: Option<String>,
2720    #[serde(default, skip_serializing_if = "Option::is_none")]
2721    pub table: Option<String>,
2722    #[serde(default, skip_serializing_if = "Option::is_none")]
2723    pub id_column: Option<String>,
2724}
2725
2726#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
2727#[serde(rename_all = "snake_case")]
2728pub enum ManifestIndexKind {
2729    Rank,
2730    RankQuant,
2731    Bitmap,
2732    SignBitmap,
2733}
2734
2735impl ManifestIndexKind {
2736    fn from_core(kind: CoreIndexKind) -> Self {
2737        match kind {
2738            CoreIndexKind::Rank => Self::Rank,
2739            CoreIndexKind::RankQuant => Self::RankQuant,
2740            CoreIndexKind::Bitmap => Self::Bitmap,
2741            CoreIndexKind::SignBitmap => Self::SignBitmap,
2742        }
2743    }
2744}
2745
2746#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
2747#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)]
2748pub enum ManifestIndexParams {
2749    Rank,
2750    RankQuant { bits: u8 },
2751    Bitmap { n_top: usize },
2752    SignBitmap,
2753}
2754
2755impl ManifestIndexParams {
2756    fn from_core(params: CoreIndexParams) -> Self {
2757        match params {
2758            CoreIndexParams::Rank => Self::Rank,
2759            CoreIndexParams::RankQuant { bits } => Self::RankQuant { bits },
2760            CoreIndexParams::Bitmap { n_top } => Self::Bitmap { n_top },
2761            CoreIndexParams::SignBitmap => Self::SignBitmap,
2762        }
2763    }
2764}
2765
2766/// Verified paths and metadata for a caller-managed load.
2767///
2768/// A `VerifiedLoadPlan` means the manifest, primary artifact, row-identity
2769/// file, and declared auxiliary artifacts verified at the time verification
2770/// ran. It is not a durable capability over mutable storage: the plan does not
2771/// pin file descriptors, hold locks, buffer bytes, or guarantee that bytes at
2772/// the returned paths remain unchanged after verification. Treat it as proof of
2773/// the verification just performed, then load from controlled storage
2774/// immediately or re-verify if another actor may have changed the files.
2775#[derive(Clone, Debug)]
2776pub struct VerifiedLoadPlan {
2777    manifest_path: Option<PathBuf>,
2778    artifact_path: PathBuf,
2779    metadata: MetadataReport,
2780    row_identity: VerifiedRowIdentityPlan,
2781    auxiliary_artifacts: Vec<VerifiedAuxiliaryArtifactPlan>,
2782    report: VerificationReport,
2783}
2784
2785impl VerifiedLoadPlan {
2786    fn from_report(
2787        document: &ManifestDocument,
2788        report: VerificationReport,
2789        paths: VerificationPathCapture,
2790    ) -> Result<Self, VerifiedLoadPlanError> {
2791        if !report.ok {
2792            return Err(VerifiedLoadPlanError::VerificationFailed(Box::new(report)));
2793        }
2794
2795        let artifact_path =
2796            paths
2797                .artifact_path
2798                .clone()
2799                .ok_or_else(|| VerifiedLoadPlanError::IncompletePlan {
2800                    report: Box::new(report.clone()),
2801                    message: "verified report is missing the captured artifact path".to_string(),
2802                })?;
2803        let metadata = report.artifact.metadata.clone().ok_or_else(|| {
2804            VerifiedLoadPlanError::IncompletePlan {
2805                report: Box::new(report.clone()),
2806                message: "verified report is missing probed artifact metadata".to_string(),
2807            }
2808        })?;
2809        let row_identity =
2810            VerifiedRowIdentityPlan::from_report(paths.row_identity_path.as_ref(), &report)?;
2811        let auxiliary_artifacts = report
2812            .auxiliary_artifacts
2813            .iter()
2814            .enumerate()
2815            .map(|(idx, entry)| {
2816                VerifiedAuxiliaryArtifactPlan::from_report(
2817                    entry,
2818                    paths
2819                        .auxiliary_artifact_paths
2820                        .get(idx)
2821                        .and_then(|path| path.as_ref()),
2822                    &report,
2823                )
2824            })
2825            .collect::<Result<Vec<_>, _>>()?;
2826
2827        Ok(Self {
2828            manifest_path: document.source_path.clone(),
2829            artifact_path,
2830            metadata,
2831            row_identity,
2832            auxiliary_artifacts,
2833            report,
2834        })
2835    }
2836
2837    pub fn manifest_path(&self) -> Option<&Path> {
2838        self.manifest_path.as_deref()
2839    }
2840
2841    /// Canonical path of the primary index artifact observed during verification.
2842    ///
2843    /// This path is not a byte pin. Loading later from mutable/shared storage can
2844    /// still observe different bytes, so callers that cannot control mutation
2845    /// must re-verify immediately before loading.
2846    pub fn artifact_path(&self) -> &Path {
2847        &self.artifact_path
2848    }
2849
2850    pub fn metadata(&self) -> &MetadataReport {
2851        &self.metadata
2852    }
2853
2854    pub fn row_identity(&self) -> &VerifiedRowIdentityPlan {
2855        &self.row_identity
2856    }
2857
2858    pub fn auxiliary_artifacts(&self) -> &[VerifiedAuxiliaryArtifactPlan] {
2859        &self.auxiliary_artifacts
2860    }
2861
2862    pub fn auxiliary_by_name(&self, name: &str) -> Option<&VerifiedAuxiliaryArtifactPlan> {
2863        let name = name.trim();
2864        self.auxiliary_artifacts
2865            .iter()
2866            .find(|artifact| artifact.name().trim() == name)
2867    }
2868
2869    pub fn require_auxiliary(&self, name: &str) -> Result<&Path, RequireAuxiliaryError> {
2870        let artifact = self.auxiliary_by_name(name).ok_or_else(|| {
2871            RequireAuxiliaryError::MissingDeclaration {
2872                name: name.to_string(),
2873            }
2874        })?;
2875        artifact
2876            .path()
2877            .ok_or_else(|| RequireAuxiliaryError::NotLoadable {
2878                name: name.to_string(),
2879                state: artifact.state(),
2880                reason_code: artifact.reason_code().map(ToOwned::to_owned),
2881            })
2882    }
2883
2884    pub fn report(&self) -> &VerificationReport {
2885        &self.report
2886    }
2887
2888    pub fn into_report(self) -> VerificationReport {
2889        self.report
2890    }
2891}
2892
2893#[derive(Clone, Debug, PartialEq, Eq)]
2894pub enum RequireAuxiliaryError {
2895    MissingDeclaration {
2896        name: String,
2897    },
2898    NotLoadable {
2899        name: String,
2900        state: AuxiliaryArtifactState,
2901        reason_code: Option<String>,
2902    },
2903}
2904
2905impl fmt::Display for RequireAuxiliaryError {
2906    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2907        match self {
2908            Self::MissingDeclaration { name } => {
2909                write!(f, "required auxiliary artifact {name:?} is not declared")
2910            }
2911            Self::NotLoadable {
2912                name,
2913                state,
2914                reason_code,
2915            } => {
2916                write!(
2917                    f,
2918                    "required auxiliary artifact {name:?} is not loadable: state={state:?}"
2919                )?;
2920                if let Some(reason_code) = reason_code {
2921                    write!(f, ", reason_code={reason_code}")?;
2922                }
2923                Ok(())
2924            }
2925        }
2926    }
2927}
2928
2929impl std::error::Error for RequireAuxiliaryError {}
2930
/// Row-identity portion of a [`VerifiedLoadPlan`].
///
/// Holds the row identity kind string, the captured file path (present only
/// for the `"jsonl"` kind), the row count, and the validated-row count and
/// SHA-256 string copied from the verification report.
#[derive(Debug, Clone)]
pub struct VerifiedRowIdentityPlan {
    kind: String,
    path: Option<PathBuf>,
    row_count: usize,
    validated_rows: Option<usize>,
    sha256: Option<String>,
}
2939
2940impl VerifiedRowIdentityPlan {
2941    fn from_report(
2942        captured_path: Option<&PathBuf>,
2943        report: &VerificationReport,
2944    ) -> Result<Self, VerifiedLoadPlanError> {
2945        let kind = report.row_identity.kind.clone().ok_or_else(|| {
2946            VerifiedLoadPlanError::IncompletePlan {
2947                report: Box::new(report.clone()),
2948                message: "verified report is missing row identity kind".to_string(),
2949            }
2950        })?;
2951        let row_count =
2952            report
2953                .row_identity
2954                .row_count
2955                .ok_or_else(|| VerifiedLoadPlanError::IncompletePlan {
2956                    report: Box::new(report.clone()),
2957                    message: "verified report is missing row identity row count".to_string(),
2958                })?;
2959        let path = match kind.as_str() {
2960            "row_id_identity" => None,
2961            "jsonl" => Some(captured_path.cloned().ok_or_else(|| {
2962                VerifiedLoadPlanError::IncompletePlan {
2963                    report: Box::new(report.clone()),
2964                    message: "verified report is missing the captured row identity path"
2965                        .to_string(),
2966                }
2967            })?),
2968            _ => {
2969                return Err(VerifiedLoadPlanError::IncompletePlan {
2970                    report: Box::new(report.clone()),
2971                    message: format!("verified report has unsupported row identity kind {kind:?}"),
2972                });
2973            }
2974        };
2975
2976        Ok(Self {
2977            kind,
2978            path,
2979            row_count,
2980            validated_rows: report.row_identity.validated_rows,
2981            sha256: report.row_identity.sha256.clone(),
2982        })
2983    }
2984
2985    pub fn kind(&self) -> &str {
2986        &self.kind
2987    }
2988
2989    pub fn path(&self) -> Option<&Path> {
2990        self.path.as_deref()
2991    }
2992
2993    pub fn row_count(&self) -> usize {
2994        self.row_count
2995    }
2996
2997    pub fn validated_rows(&self) -> Option<usize> {
2998        self.validated_rows
2999    }
3000
3001    pub fn sha256(&self) -> Option<&str> {
3002        self.sha256.as_deref()
3003    }
3004}
3005
3006#[derive(Clone, Debug)]
3007pub struct VerifiedAuxiliaryArtifactPlan {
3008    name: String,
3009    path: Option<PathBuf>,
3010    required: bool,
3011    state: AuxiliaryArtifactState,
3012    reason_code: Option<String>,
3013    sha256: Option<String>,
3014    size_bytes: Option<u64>,
3015}
3016
3017impl VerifiedAuxiliaryArtifactPlan {
3018    fn from_report(
3019        entry: &AuxiliaryArtifactReport,
3020        captured_path: Option<&PathBuf>,
3021        report: &VerificationReport,
3022    ) -> Result<Self, VerifiedLoadPlanError> {
3023        let path = match entry.state {
3024            AuxiliaryArtifactState::Verified => Some(captured_path.cloned().ok_or_else(|| {
3025                VerifiedLoadPlanError::IncompletePlan {
3026                    report: Box::new(report.clone()),
3027                    message: format!(
3028                        "verified auxiliary artifact {:?} is missing its captured path",
3029                        entry.name
3030                    ),
3031                }
3032            })?),
3033            AuxiliaryArtifactState::OptionalAbsent => None,
3034            AuxiliaryArtifactState::MissingRequired | AuxiliaryArtifactState::Failed => {
3035                return Err(VerifiedLoadPlanError::IncompletePlan {
3036                    report: Box::new(report.clone()),
3037                    message: format!(
3038                        "verified report contains non-loadable auxiliary artifact {:?}",
3039                        entry.name
3040                    ),
3041                });
3042            }
3043        };
3044
3045        Ok(Self {
3046            name: entry.name.clone(),
3047            path,
3048            required: entry.required,
3049            state: entry.state,
3050            reason_code: entry.reason_code.clone(),
3051            sha256: entry.sha256.clone(),
3052            size_bytes: entry.size_bytes,
3053        })
3054    }
3055
3056    pub fn name(&self) -> &str {
3057        &self.name
3058    }
3059
3060    pub fn path(&self) -> Option<&Path> {
3061        self.path.as_deref()
3062    }
3063
3064    pub fn required(&self) -> bool {
3065        self.required
3066    }
3067
3068    pub fn state(&self) -> AuxiliaryArtifactState {
3069        self.state
3070    }
3071
3072    pub fn reason_code(&self) -> Option<&str> {
3073        self.reason_code.as_deref()
3074    }
3075
3076    pub fn sha256(&self) -> Option<&str> {
3077        self.sha256.as_deref()
3078    }
3079
3080    pub fn size_bytes(&self) -> Option<u64> {
3081        self.size_bytes
3082    }
3083}
3084
3085#[derive(Debug)]
3086pub enum VerifiedLoadPlanError {
3087    Manifest(ManifestError),
3088    VerificationFailed(Box<VerificationReport>),
3089    IncompletePlan {
3090        report: Box<VerificationReport>,
3091        message: String,
3092    },
3093}
3094
3095impl fmt::Display for VerifiedLoadPlanError {
3096    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
3097        match self {
3098            Self::Manifest(err) => write!(f, "{err}"),
3099            Self::VerificationFailed(report) => {
3100                write!(
3101                    f,
3102                    "manifest verification failed{}",
3103                    report_issue_summary(&report.errors)
3104                )
3105            }
3106            Self::IncompletePlan { message, .. } => f.write_str(message),
3107        }
3108    }
3109}
3110
3111impl std::error::Error for VerifiedLoadPlanError {
3112    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
3113        match self {
3114            Self::Manifest(err) => Some(err),
3115            Self::VerificationFailed(_) | Self::IncompletePlan { .. } => None,
3116        }
3117    }
3118}
3119
3120impl From<ManifestError> for VerifiedLoadPlanError {
3121    fn from(value: ManifestError) -> Self {
3122        Self::Manifest(value)
3123    }
3124}
3125
3126fn report_issue_summary(errors: &[ReportIssue]) -> String {
3127    if errors.is_empty() {
3128        return String::new();
3129    }
3130    let codes = errors
3131        .iter()
3132        .take(3)
3133        .map(|issue| issue.code.as_str())
3134        .collect::<Vec<_>>()
3135        .join(", ");
3136    if errors.len() > 3 {
3137        format!(": {codes}, ...")
3138    } else {
3139        format!(": {codes}")
3140    }
3141}
3142
3143#[derive(Clone, Debug, Serialize, Deserialize)]
3144pub struct VerificationReport {
3145    pub ok: bool,
3146    pub checked_at: String,
3147    pub manifest_id: Option<String>,
3148    pub artifact: ArtifactReport,
3149    #[serde(default)]
3150    pub auxiliary_artifacts: Vec<AuxiliaryArtifactReport>,
3151    pub row_identity: RowIdentityReport,
3152    #[serde(default)]
3153    pub encoder_distortion: EncoderDistortionReport,
3154    pub calibration: CalibrationReport,
3155    pub attestation_shape_checks: Vec<AttestationShapeCheck>,
3156    pub errors: Vec<ReportIssue>,
3157    pub warnings: Vec<ReportIssue>,
3158    pub skipped_checks: Vec<String>,
3159}
3160
3161impl VerificationReport {
3162    fn new(manifest_id: Option<String>) -> Self {
3163        Self {
3164            ok: false,
3165            checked_at: Utc::now().to_rfc3339_opts(SecondsFormat::Nanos, true),
3166            manifest_id,
3167            artifact: ArtifactReport::default(),
3168            auxiliary_artifacts: Vec::new(),
3169            row_identity: RowIdentityReport::default(),
3170            encoder_distortion: EncoderDistortionReport::default(),
3171            calibration: CalibrationReport::default(),
3172            attestation_shape_checks: Vec::new(),
3173            errors: Vec::new(),
3174            warnings: Vec::new(),
3175            skipped_checks: Vec::new(),
3176        }
3177    }
3178
3179    fn error(&mut self, code: impl Into<String>, message: impl Into<String>) {
3180        self.errors.push(ReportIssue::new(code, message));
3181    }
3182}
3183
3184#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3185pub struct ArtifactReport {
3186    pub manifest_path: Option<String>,
3187    pub observed_path: Option<String>,
3188    pub canonical_path: Option<String>,
3189    pub sha256: Option<String>,
3190    pub size_bytes: Option<u64>,
3191    pub metadata: Option<MetadataReport>,
3192}
3193
3194#[derive(Clone, Debug, Serialize, Deserialize)]
3195pub struct AuxiliaryArtifactReport {
3196    pub name: String,
3197    pub manifest_path: String,
3198    #[serde(default)]
3199    pub resolved_path: Option<String>,
3200    #[serde(default)]
3201    pub canonical_path: Option<String>,
3202    #[serde(default)]
3203    pub expected_sha256: Option<String>,
3204    #[serde(default)]
3205    pub expected_size_bytes: Option<u64>,
3206    pub required: bool,
3207    pub state: AuxiliaryArtifactState,
3208    pub reason_code: Option<String>,
3209    pub sha256: Option<String>,
3210    pub size_bytes: Option<u64>,
3211}
3212
3213#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
3214#[serde(rename_all = "snake_case")]
3215pub enum AuxiliaryArtifactState {
3216    Verified,
3217    OptionalAbsent,
3218    MissingRequired,
3219    Failed,
3220}
3221
3222#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3223pub struct RowIdentityReport {
3224    pub kind: Option<String>,
3225    pub manifest_path: Option<String>,
3226    pub canonical_path: Option<String>,
3227    pub sha256: Option<String>,
3228    pub row_count: Option<usize>,
3229    pub validated_rows: Option<usize>,
3230}
3231
3232#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3233pub struct EncoderDistortionReport {
3234    pub present: bool,
3235    pub schema_version: Option<String>,
3236    pub profile_id: Option<String>,
3237    pub evidence_kind: Option<String>,
3238    pub source_metric: Option<String>,
3239    pub embedding_metric: Option<String>,
3240    pub profile_manifest_path: Option<String>,
3241    pub profile_canonical_path: Option<String>,
3242    pub profile_sha256: Option<String>,
3243    pub profile_size_bytes: Option<u64>,
3244}
3245
3246#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3247pub struct CalibrationReport {
3248    pub present: bool,
3249    pub schema_version: Option<String>,
3250    pub profile_id: Option<String>,
3251    pub calibrated_for_model: Option<String>,
3252    pub ordinalization: Option<String>,
3253    pub null_model: Option<String>,
3254    pub profile_manifest_path: Option<String>,
3255    pub profile_canonical_path: Option<String>,
3256    pub profile_sha256: Option<String>,
3257    pub profile_size_bytes: Option<u64>,
3258}
3259
3260#[derive(Clone, Debug, Serialize, Deserialize)]
3261pub struct MetadataReport {
3262    pub kind: ManifestIndexKind,
3263    pub format_version: u8,
3264    pub dim: usize,
3265    pub vector_count: usize,
3266    pub bytes_per_vec: usize,
3267    pub params: ManifestIndexParams,
3268    pub file_size_bytes: u64,
3269}
3270
3271impl MetadataReport {
3272    fn from_core(metadata: &CoreIndexMetadata) -> Self {
3273        Self {
3274            kind: ManifestIndexKind::from_core(metadata.kind),
3275            format_version: metadata.format_version,
3276            dim: metadata.dim,
3277            vector_count: metadata.vector_count,
3278            bytes_per_vec: metadata.bytes_per_vec,
3279            params: ManifestIndexParams::from_core(metadata.params),
3280            file_size_bytes: metadata.file_size_bytes,
3281        }
3282    }
3283}
3284
3285#[derive(Clone, Debug, Serialize, Deserialize)]
3286pub struct AttestationShapeCheck {
3287    pub predicate_type: Option<String>,
3288    pub builder_id: Option<String>,
3289    pub subject_sha256_matched: bool,
3290}
3291
3292#[derive(Clone, Debug, Serialize, Deserialize)]
3293pub struct ReportIssue {
3294    pub code: String,
3295    pub message: String,
3296}
3297
3298impl ReportIssue {
3299    pub fn new(code: impl Into<String>, message: impl Into<String>) -> Self {
3300        Self {
3301            code: code.into(),
3302            message: message.into(),
3303        }
3304    }
3305}
3306
3307fn push_report_issue_bounded(
3308    errors: &mut Vec<ReportIssue>,
3309    limits: &ResourceLimits,
3310    code: impl Into<String>,
3311    message: impl Into<String>,
3312) {
3313    let limit = limits.max_report_issues;
3314    if errors.len() < limit {
3315        errors.push(ReportIssue::new(code, message));
3316        return;
3317    }
3318    if errors
3319        .iter()
3320        .any(|issue| issue.code == "verification_report_issue_limit_exceeded")
3321    {
3322        return;
3323    }
3324    let detail_limit = limit.saturating_sub(1);
3325    errors.truncate(detail_limit);
3326    errors.push(ReportIssue::new(
3327        "verification_report_issue_limit_exceeded",
3328        format!("verification report issue count exceeded max_report_issues={limit}"),
3329    ));
3330}
3331
3332fn enforce_report_issue_limit(errors: &mut Vec<ReportIssue>, limits: &ResourceLimits) {
3333    let limit = limits.max_report_issues;
3334    if errors.len() <= limit {
3335        return;
3336    }
3337    errors.retain(|issue| issue.code != "verification_report_issue_limit_exceeded");
3338    let detail_limit = limit.saturating_sub(1);
3339    errors.truncate(detail_limit);
3340    errors.push(ReportIssue::new(
3341        "verification_report_issue_limit_exceeded",
3342        format!("verification report issue count exceeded max_report_issues={limit}"),
3343    ));
3344}
3345
3346#[derive(Clone, Debug, Serialize, Deserialize)]
3347pub struct FileHash {
3348    pub sha256: String,
3349    pub size_bytes: u64,
3350}
3351
3352pub fn sha256_file(path: impl AsRef<Path>) -> io::Result<FileHash> {
3353    let mut file = File::open(path)?;
3354    let mut hasher = Sha256::new();
3355    let mut size_bytes = 0u64;
3356    let mut buf = [0u8; 64 * 1024];
3357    loop {
3358        let n = file.read(&mut buf)?;
3359        if n == 0 {
3360            break;
3361        }
3362        size_bytes += n as u64;
3363        hasher.update(&buf[..n]);
3364    }
3365    Ok(FileHash {
3366        sha256: hex::encode(hasher.finalize()),
3367        size_bytes,
3368    })
3369}
3370
3371pub fn sha256_file_bounded(
3372    path: impl AsRef<Path>,
3373    max_bytes: u64,
3374    code: &'static str,
3375    context: &'static str,
3376) -> Result<FileHash, ManifestError> {
3377    let path = path.as_ref();
3378    let bytes = read_bounded_file(path, max_bytes, code, context)?;
3379    let mut hasher = Sha256::new();
3380    hasher.update(&bytes);
3381    Ok(FileHash {
3382        sha256: hex::encode(hasher.finalize()),
3383        size_bytes: bytes.len() as u64,
3384    })
3385}
3386
/// Row-identity source chosen by the caller when creating a manifest.
#[derive(Clone, Debug)]
pub enum CreateRowIdentity {
    /// Record a `RowIdentity::RowIdIdentity` whose `row_count` is the vector
    /// count probed from the index; no row map file is read.
    RowIdIdentity,
    /// Validate the JSONL row map at this path and record it as
    /// `RowIdentity::Jsonl` with its digest and row count.
    Jsonl(PathBuf),
}
3392
/// Caller-supplied description of one auxiliary artifact to bind into a new
/// manifest.
#[derive(Clone, Debug)]
pub struct CreateAuxiliaryArtifact {
    /// Artifact name; it is trimmed before use and must be non-empty and
    /// unique among the artifacts of one manifest.
    pub name: String,
    /// Location of the artifact file; it is hashed and recorded relative to
    /// the manifest directory when possible.
    pub path: PathBuf,
    /// Copied unchanged into the manifest entry's `required` field.
    pub required: bool,
}
3399
/// Options controlling manifest creation.
#[derive(Clone, Debug, Default)]
pub struct CreateManifestOptions {
    /// Permit recording an absolute path when a referenced file cannot be
    /// expressed relative to the manifest directory. Only consulted when
    /// `allow_path_escape` is also set.
    pub allow_absolute_paths: bool,
    /// Permit referenced files that live outside the manifest directory.
    pub allow_path_escape: bool,
    /// Resource limits applied while validating the row map and hashing
    /// auxiliary artifacts.
    pub limits: ResourceLimits,
    /// Auxiliary artifacts to hash and list in the manifest.
    pub auxiliary_artifacts: Vec<CreateAuxiliaryArtifact>,
}
3407
3408pub fn create_manifest_for_index(
3409    index_path: impl AsRef<Path>,
3410    row_identity: CreateRowIdentity,
3411    embedding_model: impl Into<String>,
3412    out_path: impl AsRef<Path>,
3413) -> Result<IndexManifest, ManifestError> {
3414    create_manifest_for_index_with_options(
3415        index_path,
3416        row_identity,
3417        embedding_model,
3418        out_path,
3419        CreateManifestOptions::default(),
3420    )
3421}
3422
3423pub fn create_manifest_for_index_with_options(
3424    index_path: impl AsRef<Path>,
3425    row_identity: CreateRowIdentity,
3426    embedding_model: impl Into<String>,
3427    out_path: impl AsRef<Path>,
3428    options: CreateManifestOptions,
3429) -> Result<IndexManifest, ManifestError> {
3430    let index_path = index_path.as_ref();
3431    let out_path = out_path.as_ref();
3432    let out_base = out_path
3433        .parent()
3434        .filter(|p| !p.as_os_str().is_empty())
3435        .unwrap_or_else(|| Path::new("."));
3436    if !out_base.exists() {
3437        fs::create_dir_all(out_base)?;
3438    }
3439    let metadata = probe_index_metadata(index_path)?;
3440    let index_hash = sha256_file(index_path)?;
3441    let artifact = Artifact {
3442        path: manifest_path_for_create(index_path, out_base, &options, "artifact")?,
3443        sha256: index_hash.sha256,
3444        kind: ManifestIndexKind::from_core(metadata.kind),
3445        format_version: metadata.format_version,
3446        dim: metadata.dim,
3447        vector_count: metadata.vector_count,
3448        bytes_per_vec: metadata.bytes_per_vec,
3449        params: ManifestIndexParams::from_core(metadata.params),
3450        file_size_bytes: metadata.file_size_bytes,
3451    };
3452
3453    let row_identity = match row_identity {
3454        CreateRowIdentity::RowIdIdentity => RowIdentity::RowIdIdentity {
3455            row_count: metadata.vector_count,
3456        },
3457        CreateRowIdentity::Jsonl(path) => {
3458            let mut row_errors = Vec::new();
3459            let stats = validate_jsonl_rows(
3460                &path,
3461                false,
3462                &options.limits,
3463                Some(metadata.vector_count),
3464                &mut row_errors,
3465            )?;
3466            if !row_errors.is_empty() {
3467                if let Some(issue) = row_errors
3468                    .iter()
3469                    .find(|issue| is_limit_issue_code(&issue.code))
3470                {
3471                    return Err(ManifestError::limit_exceeded(
3472                        issue.code.clone(),
3473                        issue.message.clone(),
3474                    ));
3475                }
3476                let codes = row_errors
3477                    .iter()
3478                    .map(|issue| issue.code.as_str())
3479                    .collect::<Vec<_>>()
3480                    .join(", ");
3481                return Err(ManifestError::invalid(format!(
3482                    "row map is invalid: {codes}"
3483                )));
3484            }
3485            if stats.row_count != metadata.vector_count {
3486                return Err(ManifestError::invalid(format!(
3487                    "row map has {} rows but index has {} vectors",
3488                    stats.row_count, metadata.vector_count
3489                )));
3490            }
3491            let row_sha256 = stats.sha256.ok_or_else(|| {
3492                ManifestError::invalid("row map hash unavailable after bounded validation")
3493            })?;
3494            RowIdentity::Jsonl {
3495                path: manifest_path_for_create(&path, out_base, &options, "row identity")?,
3496                sha256: row_sha256,
3497                row_count: stats.row_count,
3498                id_kind: "uuid".to_string(),
3499                db: None,
3500            }
3501        }
3502    };
3503
3504    let auxiliary_artifacts =
3505        create_auxiliary_artifacts(&options.auxiliary_artifacts, out_base, &options)?;
3506
3507    let invocation_id = format!("urn:uuid:{}", Uuid::new_v4());
3508    Ok(IndexManifest {
3509        schema_version: SCHEMA_VERSION.to_string(),
3510        manifest_id: format!("urn:uuid:{}", Uuid::new_v4()),
3511        created_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true),
3512        artifact,
3513        auxiliary_artifacts,
3514        embedding: Embedding {
3515            model: embedding_model.into(),
3516            dim: metadata.dim,
3517            model_revision: None,
3518            tokenizer_revision: None,
3519            pooling: None,
3520            corpus_digest: None,
3521            embedding_matrix_digest: None,
3522            normalization: None,
3523        },
3524        encoder_distortion: None,
3525        calibration: None,
3526        row_identity,
3527        build: Some(BuildInfo {
3528            invocation_id,
3529            builder_id: Some("ordvec-manifest".to_string()),
3530            source_repo: None,
3531            source_commit: None,
3532            ci_provider: None,
3533            ci_run_id: None,
3534        }),
3535        attestations: Vec::new(),
3536        extensions: BTreeMap::new(),
3537    })
3538}
3539
3540fn create_auxiliary_artifacts(
3541    artifacts: &[CreateAuxiliaryArtifact],
3542    out_base: &Path,
3543    options: &CreateManifestOptions,
3544) -> Result<Vec<AuxiliaryArtifact>, ManifestError> {
3545    let count = artifacts.len();
3546    if count > options.limits.max_auxiliary_artifacts {
3547        return Err(ManifestError::limit_exceeded(
3548            "auxiliary_artifact_count_limit_exceeded",
3549            format!(
3550                "auxiliary_artifacts has {count} entries, exceeding max_auxiliary_artifacts={}",
3551                options.limits.max_auxiliary_artifacts
3552            ),
3553        ));
3554    }
3555
3556    let mut names = HashSet::new();
3557    let mut manifest_artifacts = Vec::with_capacity(artifacts.len());
3558    for artifact in artifacts {
3559        let name = artifact.name.trim();
3560        if name.is_empty() {
3561            return Err(ManifestError::invalid(
3562                "auxiliary artifact name must be non-empty",
3563            ));
3564        }
3565        if !names.insert(name.to_string()) {
3566            return Err(ManifestError::invalid(format!(
3567                "auxiliary artifact name {name:?} is duplicated"
3568            )));
3569        }
3570        let hash = sha256_file_bounded(
3571            &artifact.path,
3572            options.limits.max_auxiliary_artifact_bytes,
3573            "auxiliary_artifact_file_too_large",
3574            "auxiliary artifact",
3575        )?;
3576        manifest_artifacts.push(AuxiliaryArtifact {
3577            name: name.to_string(),
3578            path: manifest_path_for_create(
3579                &artifact.path,
3580                out_base,
3581                options,
3582                "auxiliary artifact",
3583            )?,
3584            sha256: hash.sha256,
3585            file_size_bytes: hash.size_bytes,
3586            required: artifact.required,
3587        });
3588    }
3589    Ok(manifest_artifacts)
3590}
3591
3592pub fn write_manifest_file(
3593    manifest: &IndexManifest,
3594    path: impl AsRef<Path>,
3595) -> Result<(), ManifestError> {
3596    let file = File::create(path)?;
3597    serde_json::to_writer_pretty(file, manifest)?;
3598    Ok(())
3599}
3600
/// Summary produced by `validate_jsonl_rows` for one row map file.
#[derive(Clone, Debug)]
struct JsonlStats {
    /// Number of lines read, including lines that failed to parse and the
    /// line on which validation stopped early.
    row_count: usize,
    /// Number of lines that deserialized into a row object; field-level
    /// issues may still have been reported for them.
    validated_rows: usize,
    /// Hex SHA-256 of the file bytes, present only when the file was read to
    /// the end without an early stop.
    sha256: Option<String>,
}
3607
/// One line of a JSONL row map. Unknown fields are rejected at
/// deserialization time.
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct JsonlRow {
    /// Row index; `validate_jsonl_rows` reports a mismatch unless it equals
    /// the zero-based line index.
    row_id: usize,
    /// Database identifier of the row; validated as a UUID.
    db_id: String,
    /// Optional parent identifier; validated as a UUID when present.
    #[serde(default)]
    parent_id: Option<String>,
}
3616
/// Streams the JSONL row map at `path`, validating each line and hashing the
/// bytes as they are read.
///
/// Row-level problems are appended to `errors` through
/// `push_report_issue_bounded`; only I/O failures are returned as `Err`.
///
/// * `allow_duplicate_db_ids` - when `false`, repeated `db_id` values are
///   reported and the bytes of distinct ids kept for that check are bounded by
///   `limits.max_row_identity_tracked_db_id_bytes`.
/// * `expected_row_count` - when set, reading stops with a mismatch issue as
///   soon as one more line than this is seen.
///
/// Validation stops early (and the returned `sha256` is `None`) when the row
/// limit, the expected row count, the per-line byte limit, or the duplicate
/// tracking limit is exceeded. A line that is not a valid row object is
/// reported and skipped without stopping.
fn validate_jsonl_rows(
    path: &Path,
    allow_duplicate_db_ids: bool,
    limits: &ResourceLimits,
    expected_row_count: Option<usize>,
    errors: &mut Vec<ReportIssue>,
) -> io::Result<JsonlStats> {
    let file = File::open(path)?;
    let mut reader = BufReader::new(file);
    let mut hasher = Sha256::new();
    let mut seen = HashSet::new();
    let mut seen_db_id_bytes = 0usize;
    let mut row_count = 0usize;
    let mut validated_rows = 0usize;
    let mut line = Vec::new();
    // Cleared on every early `break` so a digest of a partially read file is
    // never reported.
    let mut reached_eof = true;

    while let Some(too_long) = read_bounded_line(
        &mut reader,
        limits.max_row_identity_jsonl_line_bytes,
        &mut line,
        &mut hasher,
    )? {
        // Zero-based index of the line just read; also the expected `row_id`.
        let line_idx = row_count;
        row_count += 1;
        if row_count > limits.max_row_identity_rows {
            reached_eof = false;
            push_report_issue_bounded(
                errors,
                limits,
                "row_identity_row_count_limit_exceeded",
                format!(
                    "row identity file has more than max_row_identity_rows={} rows",
                    limits.max_row_identity_rows
                ),
            );
            break;
        }
        if let Some(expected_row_count) = expected_row_count {
            if row_count > expected_row_count {
                reached_eof = false;
                push_report_issue_bounded(
                    errors,
                    limits,
                    "row_identity_row_count_mismatch",
                    format!(
                        "row identity file has more than declared row_count={expected_row_count}"
                    ),
                );
                break;
            }
        }
        if too_long {
            // The rest of the oversized line was not consumed, so reading
            // cannot resume at a line boundary; stop here.
            reached_eof = false;
            push_report_issue_bounded(
                errors,
                limits,
                "row_identity_line_too_large",
                format!(
                    "line {line_idx} exceeds max_row_identity_jsonl_line_bytes={}",
                    limits.max_row_identity_jsonl_line_bytes
                ),
            );
            break;
        }
        trim_jsonl_terminator(&mut line);
        let row: JsonlRow = match serde_json::from_slice(&line) {
            Ok(row) => row,
            Err(err) => {
                push_report_issue_bounded(
                    errors,
                    limits,
                    "row_identity_jsonl_invalid_json",
                    format!("line {line_idx} is not a strict row object: {err}"),
                );
                // The line still counts towards `row_count`, but not towards
                // `validated_rows`.
                continue;
            }
        };
        if row.row_id != line_idx {
            push_report_issue_bounded(
                errors,
                limits,
                "row_identity_row_id_mismatch",
                format!("line {line_idx} has row_id {}", row.row_id),
            );
        }
        validate_row_id_string("db_id", &row.db_id, line_idx, limits, errors);
        if let Some(parent_id) = &row.parent_id {
            validate_row_id_string("parent_id", parent_id, line_idx, limits, errors);
        }
        validated_rows += 1;
        if !allow_duplicate_db_ids {
            if seen.contains(&row.db_id) {
                push_report_issue_bounded(
                    errors,
                    limits,
                    "row_identity_duplicate_db_id",
                    format!("line {line_idx} repeats db_id"),
                );
            } else {
                // Check the byte budget before storing the id so the set never
                // grows past the configured bound.
                let next_seen_db_id_bytes = seen_db_id_bytes.saturating_add(row.db_id.len());
                if next_seen_db_id_bytes > limits.max_row_identity_tracked_db_id_bytes {
                    reached_eof = false;
                    push_report_issue_bounded(
                        errors,
                        limits,
                        "row_identity_duplicate_tracking_limit_exceeded",
                        format!(
                            "tracked db_id bytes exceed max_row_identity_tracked_db_id_bytes={}",
                            limits.max_row_identity_tracked_db_id_bytes
                        ),
                    );
                    break;
                }
                seen_db_id_bytes = next_seen_db_id_bytes;
                seen.insert(row.db_id);
            }
        }
    }

    Ok(JsonlStats {
        row_count,
        validated_rows,
        sha256: reached_eof.then(|| hex::encode(hasher.finalize())),
    })
}
3743
3744fn read_bounded_line<R: BufRead>(
3745    reader: &mut R,
3746    max_bytes: usize,
3747    out: &mut Vec<u8>,
3748    hasher: &mut Sha256,
3749) -> io::Result<Option<bool>> {
3750    out.clear();
3751    let max_bytes = max_bytes.max(1);
3752
3753    loop {
3754        let available = reader.fill_buf()?;
3755        if available.is_empty() {
3756            return if out.is_empty() {
3757                Ok(None)
3758            } else {
3759                Ok(Some(false))
3760            };
3761        }
3762
3763        let newline = available.iter().position(|byte| *byte == b'\n');
3764        let take_len = newline.map_or(available.len(), |pos| pos + 1);
3765
3766        let remaining = max_bytes.saturating_sub(out.len());
3767        if take_len > remaining {
3768            let consume_len = remaining.saturating_add(1).min(take_len);
3769            if remaining > 0 {
3770                out.extend_from_slice(&available[..remaining]);
3771            }
3772            hasher.update(&available[..consume_len]);
3773            reader.consume(consume_len);
3774            return Ok(Some(true));
3775        }
3776
3777        out.extend_from_slice(&available[..take_len]);
3778        hasher.update(&available[..take_len]);
3779        reader.consume(take_len);
3780        if newline.is_some() {
3781            return Ok(Some(false));
3782        }
3783    }
3784}
3785
/// Strips the line terminator from `line` in place.
///
/// At most one trailing `\n` is removed, and then at most one trailing `\r`,
/// so both `\n` and `\r\n` endings are handled. A lone trailing `\r` is
/// removed as well.
fn trim_jsonl_terminator(line: &mut Vec<u8>) {
    for terminator in [b'\n', b'\r'] {
        if line.ends_with(&[terminator]) {
            line.pop();
        }
    }
}
3794
3795fn validate_row_id_string(
3796    field: &str,
3797    value: &str,
3798    line_idx: usize,
3799    limits: &ResourceLimits,
3800    errors: &mut Vec<ReportIssue>,
3801) {
3802    let mut structurally_invalid = false;
3803    if value.is_empty() {
3804        structurally_invalid = true;
3805        push_report_issue_bounded(
3806            errors,
3807            limits,
3808            format!("row_identity_{field}_empty"),
3809            format!("line {line_idx} has empty {field}"),
3810        );
3811    }
3812    if value.contains('\0') {
3813        structurally_invalid = true;
3814        push_report_issue_bounded(
3815            errors,
3816            limits,
3817            format!("row_identity_{field}_contains_nul"),
3818            format!("line {line_idx} {field} contains NUL"),
3819        );
3820    }
3821    if !structurally_invalid && Uuid::parse_str(value).is_err() {
3822        push_report_issue_bounded(
3823            errors,
3824            limits,
3825            format!("row_identity_{field}_invalid_uuid"),
3826            format!("line {line_idx} {field} must be a UUID in v1"),
3827        );
3828    }
3829}
3830
/// Returns `true` when `code` is one of the issue codes that signal an
/// exceeded resource limit rather than invalid row content.
fn is_limit_issue_code(code: &str) -> bool {
    const LIMIT_ISSUE_CODES: [&str; 4] = [
        "row_identity_line_too_large",
        "row_identity_row_count_limit_exceeded",
        "row_identity_duplicate_tracking_limit_exceeded",
        "verification_report_issue_limit_exceeded",
    ];
    LIMIT_ISSUE_CODES.iter().any(|known| *known == code)
}
3840
3841fn manifest_path_for_create(
3842    path: &Path,
3843    base_dir: &Path,
3844    options: &CreateManifestOptions,
3845    context: &str,
3846) -> Result<String, ManifestError> {
3847    let canonical_path = fs::canonicalize(path)?;
3848    let canonical_base = fs::canonicalize(base_dir)?;
3849    if let Ok(relative) = canonical_path.strip_prefix(&canonical_base) {
3850        if !relative.as_os_str().is_empty() {
3851            return Ok(path_to_manifest_string(relative));
3852        }
3853        return Ok(".".to_string());
3854    }
3855
3856    if !options.allow_path_escape {
3857        return Err(ManifestError::invalid(format!(
3858            "{context} path {} is outside manifest directory {}; use --allow-path-escape to create a manifest that requires non-default verification policy",
3859            canonical_path.display(),
3860            canonical_base.display()
3861        )));
3862    }
3863
3864    if let Some(relative) = relative_path_between(&canonical_base, &canonical_path) {
3865        return Ok(path_to_manifest_string(&relative));
3866    }
3867
3868    if options.allow_absolute_paths {
3869        return Ok(path_to_manifest_string(&canonical_path));
3870    }
3871
3872    Err(ManifestError::invalid(format!(
3873        "{context} path {} cannot be expressed relative to manifest directory {}; use --allow-absolute-paths with --allow-path-escape",
3874        canonical_path.display(),
3875        canonical_base.display()
3876    )))
3877}
3878
/// Builds a relative path that leads from directory `base` to `target`.
///
/// The shared leading components are dropped, one `..` is emitted for every
/// remaining normal component of `base`, and the remaining components of
/// `target` are appended. Returns `None` when the paths share no leading
/// component, or when the unshared part of `target` contains a prefix or root
/// component.
fn relative_path_between(base: &Path, target: &Path) -> Option<PathBuf> {
    let base_parts: Vec<Component> = base.components().collect();
    let target_parts: Vec<Component> = target.components().collect();

    let shared = base_parts
        .iter()
        .zip(&target_parts)
        .take_while(|(from, to)| from == to)
        .count();
    if shared == 0 {
        return None;
    }

    let mut relative = PathBuf::new();

    // Climb out of every directory of `base` that `target` does not share.
    let climbs = base_parts[shared..]
        .iter()
        .filter(|part| matches!(part, Component::Normal(_)))
        .count();
    for _ in 0..climbs {
        relative.push("..");
    }

    for part in &target_parts[shared..] {
        match part {
            Component::Normal(name) => relative.push(name),
            Component::ParentDir => relative.push(".."),
            Component::CurDir => {}
            Component::Prefix(_) | Component::RootDir => return None,
        }
    }
    Some(relative)
}
3910
/// Renders `path` as the string stored in a manifest, using `/` between
/// components.
///
/// Absolute paths are rendered through `Path::display`. Backslashes are
/// converted to `/` only on platforms where `\` is a path separator; elsewhere
/// a backslash is an ordinary filename character, and rewriting it would make
/// the manifest point at a different path. This matches the relative branch,
/// which keeps component text as-is.
///
/// Relative paths are rebuilt from their components joined by `/`; prefix and
/// root components are dropped, non-UTF-8 names are converted lossily, and an
/// empty result is rendered as `.`.
fn path_to_manifest_string(path: &Path) -> String {
    if path.is_absolute() {
        let rendered = path.display().to_string();
        return if std::path::is_separator('\\') {
            rendered.replace('\\', "/")
        } else {
            rendered
        };
    }

    let parts: Vec<String> = path
        .components()
        .filter_map(|component| match component {
            Component::Normal(part) => Some(part.to_string_lossy().into_owned()),
            Component::CurDir => Some(".".to_string()),
            Component::ParentDir => Some("..".to_string()),
            Component::Prefix(_) | Component::RootDir => None,
        })
        .collect();

    if parts.is_empty() {
        ".".to_string()
    } else {
        parts.join("/")
    }
}
3930
/// Renders `path` for human-readable output using `Path::display`, which is
/// lossy for names that are not valid Unicode.
fn path_to_display(path: &Path) -> String {
    let shown = path.display();
    shown.to_string()
}
3934
3935fn extension_key_is_namespaced(key: &str) -> bool {
3936    if key.contains("://") || key.starts_with("urn:") {
3937        return true;
3938    }
3939    let mut parts = key.split('.');
3940    let Some(first) = parts.next() else {
3941        return false;
3942    };
3943    if !valid_extension_part(first) {
3944        return false;
3945    }
3946    let mut saw_second = false;
3947    for part in parts {
3948        saw_second = true;
3949        if !valid_extension_part(part) {
3950            return false;
3951        }
3952    }
3953    saw_second
3954}
3955
/// Checks one dot-separated part of an extension key.
///
/// A part is valid when it is non-empty, consists only of ASCII letters,
/// digits, `-` and `_`, and contains at least one ASCII letter or digit.
fn valid_extension_part(part: &str) -> bool {
    let mut has_alphanumeric = false;
    for byte in part.bytes() {
        if byte.is_ascii_alphanumeric() {
            has_alphanumeric = true;
        } else if byte != b'-' && byte != b'_' {
            return false;
        }
    }
    // An empty part never sets the flag, so it is rejected here too.
    has_alphanumeric
}
3963
/// Returns `true` when `value` is exactly 64 lowercase hexadecimal characters,
/// the textual form of a SHA-256 digest accepted here.
fn is_sha256_hex(value: &str) -> bool {
    const HEX_LEN: usize = 64;
    if value.len() != HEX_LEN {
        return false;
    }
    value
        .bytes()
        .all(|byte| matches!(byte, b'0'..=b'9' | b'a'..=b'f'))
}
3970
/// Compares two hex digest strings for exact, case-sensitive equality.
fn hex_digest_eq(a: &str, b: &str) -> bool {
    a.as_bytes() == b.as_bytes()
}
3974
3975#[cfg(feature = "sqlite")]
3976pub mod sqlite;