Skip to main content

vernier_core/
error.rs

//! Typed errors for the evaluator.
//!
//! Per the workspace clippy lints, we forbid `panic!`, `unwrap`, and
//! `expect` in non-test code. Every fallible operation in `vernier-core`
//! returns `Result<_, EvalError>`, including `Similarity::compute`
//! (per ADR-0005).
7
8use thiserror::Error;
9use vernier_mask::MaskError;
10use vernier_partial::PartialError;
11
// Re-export the shared sub-discriminator under its existing path so
// callers (FFI, tests) keep using `EvalError::PartialFormatMismatch
// { kind: PartialFormatErrorKind }` unchanged after ADR-0032's move
// of the framing logic into the leaf crate.
16pub use vernier_partial::PartialFormatErrorKind;
17
18/// Unified error type for evaluation paths.
19///
20/// Variants are kept coarse on purpose: each one corresponds to a class
21/// of failure a caller can plausibly recover from or report distinctly.
22/// We add new variants as they're needed, rather than enumerating every
23/// possible cause up front.
24#[derive(Debug, Error)]
25pub enum EvalError {
26    /// Two annotations or two RLEs disagree on dimensions in a way that
27    /// makes the operation undefined. Replaces the `-1` sentinel
28    /// pycocotools' `rleIou` returns on dimension mismatch (quirk
29    /// **I2**, dispositioned `corrected` per ADR-0002).
30    #[error("dimension mismatch: {detail}")]
31    DimensionMismatch {
32        /// Free-form detail string for the operator that detected the
33        /// mismatch; carries the offending dimensions.
34        detail: String,
35    },
36
37    /// Annotation could not be parsed from JSON, or referenced an
38    /// `image_id` / `category_id` that the dataset does not contain.
39    /// Quirk **J5** in pycocotools is the matching enforcement on
40    /// `loadRes`.
41    #[error("invalid annotation: {detail}")]
42    InvalidAnnotation {
43        /// Free-form detail string identifying the offending field.
44        detail: String,
45    },
46
47    /// JSON deserialization failed before any vernier-side validation.
48    #[error("json: {0}")]
49    Json(#[from] serde_json::Error),
50
51    /// Mask-side operation failed (codec decode, polygon rasterization,
52    /// merge dimension mismatch). Propagated from `vernier-mask` per
53    /// ADR-0009's one-way dependency.
54    #[error("mask: {0}")]
55    Mask(#[from] MaskError),
56
57    /// Numeric input was not finite (NaN or infinity reached an
58    /// arithmetic that cannot tolerate it). Used at boundaries where
59    /// we receive scores or coordinates from external code.
60    #[error("non-finite value in {context}")]
61    NonFinite {
62        /// Where the non-finite value was encountered.
63        context: &'static str,
64    },
65
66    /// Caller-supplied evaluation parameters are inconsistent with the
67    /// data they're being applied to (e.g., a maxDet value that the
68    /// accumulator never saw, an IoU threshold absent from the
69    /// ladder). Distinct from `InvalidAnnotation`, which is for
70    /// dataset-side data errors.
71    #[error("invalid config: {detail}")]
72    InvalidConfig {
73        /// Free-form detail string identifying the offending parameter.
74        detail: String,
75    },
76
77    /// Streaming evaluator memory budget exceeded. Carries a breakdown of
78    /// where bytes are spent so the user can pick a remediation (shard,
79    /// shrink iou_thresholds, raise budget).
80    #[error("memory budget exceeded: used {used_bytes} / budget {budget_bytes} bytes")]
81    OutOfBudget {
82        /// Total bytes the evaluator was holding when it tripped the budget.
83        used_bytes: usize,
84        /// Configured budget cap.
85        budget_bytes: usize,
86        /// Stable keys: `"cells_store"`, `"scores"`, `"match_flags"`. The
87        /// schema is future-additive — consumers must tolerate extra keys.
88        breakdown: std::collections::HashMap<&'static str, usize>,
89    },
90
91    /// Feature wired but not yet implemented in v0. Used by the streaming
92    /// evaluator's `checkpoint`/`restore` pair, deferred per the user's
93    /// scope decision; future ADR re-introduces the implementation.
94    #[error("not implemented: {feature}")]
95    NotImplemented {
96        /// Stable identifier of the unimplemented feature, e.g.
97        /// `"StreamingEvaluator::checkpoint"`.
98        feature: &'static str,
99    },
100
101    /// `per_pair` row count exceeded the configured cap (ADR-0019
102    /// `TablesConfig::per_pair_max_rows`). Carries the observed count
103    /// at the moment the cap was tripped and the cap value, so callers
104    /// can decide whether to raise the cap or constrain the workload.
105    #[error("per_pair table exceeded cap: would emit at least {observed} rows, cap {cap}")]
106    PerPairOverflow {
107        /// Best-effort lower bound on the row count at the moment the
108        /// cap was tripped. The check is per-cell so the actual final
109        /// count may be larger; this is the value that triggered the
110        /// abort.
111        observed: usize,
112        /// `TablesConfig::per_pair_max_rows` value the caller (or
113        /// default) configured.
114        cap: usize,
115    },
116
117    /// LVIS federated metadata violates the disjointness invariant
118    /// for one `(image, category)` cell: the category appears in both
119    /// `not_exhaustive_category_ids` and `neg_category_ids` (or is
120    /// listed in `neg_category_ids` while a GT of that category exists,
121    /// which would put it implicitly in `pos`). Quirk **AA7** of
122    /// ADR-0026, dispositioned `corrected`: lvis-api silently picks
123    /// `not_exhaustive` on overlap; vernier rejects at load.
124    #[error("lvis federated conflict on image_id={image_id}, category_id={category_id}: {detail}")]
125    LvisFederatedConflict {
126        /// Offending image id.
127        image_id: i64,
128        /// Offending category id.
129        category_id: i64,
130        /// Free-form detail string identifying which constraint failed
131        /// (e.g., `"category in both not_exhaustive and neg"`).
132        detail: &'static str,
133    },
134
135    /// LVIS dataset is missing the `frequency` field on one or more
136    /// categories. Quirk **AB6** of ADR-0026, dispositioned `corrected`:
137    /// lvis-api raises `KeyError` mid-eval on the first miss; vernier
138    /// raises at load with the full list of offending categories so
139    /// the failure is debuggable in one shot.
140    ///
141    /// The `category_ids` list is sorted ascending for stable error
142    /// messages.
143    #[error(
144        "lvis dataset is missing `frequency` on {} categories: {category_ids:?}",
145        category_ids.len()
146    )]
147    MissingFrequency {
148        /// Sorted list of category ids that lacked a `frequency` value.
149        category_ids: Vec<i64>,
150    },
151
152    /// Partial wire-format header / framing rejected by
153    /// [`vernier_partial::with_validated_envelope`] (ADR-0031). The `kind`
154    /// names which structural check tripped — magic, version, CRC,
155    /// kernel discriminator, grid dims, or rkyv archive validation.
156    #[error("partial wire format rejected: {kind}")]
157    PartialFormatMismatch {
158        /// Which framing or structural check failed. See
159        /// [`PartialFormatErrorKind`].
160        kind: PartialFormatErrorKind,
161    },
162
163    /// One or more partials carry a `dataset_hash` that doesn't match
164    /// the live dataset's. Means the partial was computed against a
165    /// different GT than the receiving rank loaded — almost always a
166    /// sampler / config bug; refusing protects the merge result from
167    /// the head-rank's perspective. ADR-0031 §"Validation order" #6.
168    #[error("partial dataset_hash mismatch: expected {expected:02x?}, got {actual:02x?}")]
169    PartialDatasetMismatch {
170        /// Receiving rank's `dataset_hash` (what the partial was
171        /// expected to be computed against).
172        expected: [u8; 32],
173        /// Partial's declared `dataset_hash` (what was actually used).
174        actual: [u8; 32],
175    },
176
177    /// One or more partials carry a `params_hash` that doesn't match
178    /// the receiving rank's. Means the partial was produced with
179    /// different `iou_thresholds` / `max_dets` / `use_cats` / etc. and
180    /// the merged result would not equal a batch run. ADR-0031
181    /// §"Validation order" #7.
182    #[error("partial params_hash mismatch: expected {expected:02x?}, got {actual:02x?}")]
183    PartialParamsMismatch {
184        /// Receiving rank's `params_hash`.
185        expected: [u8; 32],
186        /// Partial's declared `params_hash`.
187        actual: [u8; 32],
188    },
189
190    /// Two partials cover the same `image_id` — the disjoint-partition
191    /// rule (ADR-0031 §"Axis D" D1) is violated. Almost always a
192    /// `DistributedSampler` misconfiguration where two ranks evaluated
193    /// the same image. The error names both rank ids and the colliding
194    /// image so the user can fix their sampler.
195    #[error("partials cover image_id={image_id} on both rank {rank_a} and rank {rank_b}")]
196    PartialPartitionOverlap {
197        /// Lower rank id involved in the collision (sorted for
198        /// determinism — `min(a, b)`).
199        rank_a: u32,
200        /// Higher rank id involved in the collision.
201        rank_b: u32,
202        /// Image id that appeared in both partials' `seen_images`.
203        image_id: i64,
204    },
205
206    /// Two strict-mode partials declare the same `rank_id`. ADR-0031
207    /// §"Axis C" C2: strict-mode merge requires distinct rank ids so
208    /// the future `(score, rank_id, local_position)` tiebreak gives a
209    /// total order. Corrected mode tolerates collisions.
210    #[error("partials share rank_id={rank_id} in strict mode")]
211    PartialRankCollision {
212        /// The duplicated rank id.
213        rank_id: u32,
214    },
215}
216
217/// Translate a leaf-crate [`PartialError`] into the equivalent
218/// [`EvalError`] variant. Centralizes the variant-name mapping
219/// (`Format` ↔ `PartialFormatMismatch` etc.) so call sites use `?` to
220/// propagate.
221impl From<PartialError> for EvalError {
222    fn from(err: PartialError) -> Self {
223        match err {
224            PartialError::Format { kind } => EvalError::PartialFormatMismatch { kind },
225            PartialError::DatasetMismatch { expected, actual } => {
226                EvalError::PartialDatasetMismatch { expected, actual }
227            }
228            PartialError::ParamsMismatch { expected, actual } => {
229                EvalError::PartialParamsMismatch { expected, actual }
230            }
231            PartialError::PartitionOverlap {
232                rank_a,
233                rank_b,
234                image_id,
235            } => EvalError::PartialPartitionOverlap {
236                rank_a,
237                rank_b,
238                image_id,
239            },
240            PartialError::RankCollision { rank_id } => EvalError::PartialRankCollision { rank_id },
241        }
242    }
243}