Skip to main content

osproxy_core/
error.rs

1//! The request-path error taxonomy.
2//!
3//! Every failure reachable from the request path is typed and carries an
4//! [`ErrorContext`]: a stable code, the decision chain that led there, whether
5//! it is retryable, and an actionable remediation hint. This is what lets an
6//! LLM diagnose a failure from telemetry alone, with no source reading
7//! (`docs/02` §4, `docs/05`, NFR-T5).
8//!
9//! The context carries **ids and shapes only, never tenant values or secrets**
10//! (`docs/05` §7).
11
12use std::fmt;
13
14use crate::ids::{ClusterId, IndexName, PartitionId, PrincipalId};
15
/// Stable, machine-matchable identifier for a request-path failure.
///
/// These codes belong to the public contract: operators and LLMs match on them
/// and look them up in the generated error reference. The enum is marked
/// `#[non_exhaustive]` so that introducing a new code is an additive change
/// (`docs/08` §7).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum ErrorCode {
    /// No partition could be resolved from the request.
    PartitionUnresolved,
    /// The partition resolved, but no placement exists for it.
    PlacementMissing,
    /// The backend used for placement lookups was unavailable.
    PlacementBackendUnavailable,
    /// Tenancy rewriting does not support this endpoint in the current mode.
    UnsupportedEndpoint,
    /// The write carried a stamped epoch that is stale for a partition under
    /// migration, so it was rejected (`docs/06` §2). Retryable: the client
    /// re-resolves.
    StaleEpoch,
    /// The client failed authentication.
    AuthFailed,
    /// The principal authenticated, but is not authorized for the action.
    Unauthorized,
    /// The upstream cluster failed (timeout, reset, 5xx).
    UpstreamFailed,
    /// Load is being shed by the proxy.
    Overloaded,
    /// A scroll/PIT cursor could not be resolved to its pinned cluster: its
    /// affinity envelope is missing, malformed, or unverifiable. The client
    /// must re-issue the originating search (`docs/03` §6).
    CursorUnresolvable,
    /// A size cap was exceeded by the request body (e.g. a single `_bulk` line
    /// over the per-op limit). This is a client error: the client must split
    /// or shrink the body.
    PayloadTooLarge,
}
51
52impl ErrorCode {
53    /// A short, stable, machine-readable slug for logs and trace attributes.
54    #[must_use]
55    pub fn as_slug(self) -> &'static str {
56        match self {
57            Self::PartitionUnresolved => "partition_unresolved",
58            Self::PlacementMissing => "placement_missing",
59            Self::PlacementBackendUnavailable => "placement_backend_unavailable",
60            Self::UnsupportedEndpoint => "unsupported_endpoint",
61            Self::StaleEpoch => "stale_epoch",
62            Self::AuthFailed => "auth_failed",
63            Self::Unauthorized => "unauthorized",
64            Self::UpstreamFailed => "upstream_failed",
65            Self::Overloaded => "overloaded",
66            Self::CursorUnresolvable => "cursor_unresolvable",
67            Self::PayloadTooLarge => "payload_too_large",
68        }
69    }
70}
71
72impl fmt::Display for ErrorCode {
73    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
74        f.write_str(self.as_slug())
75    }
76}
77
78/// The ordered chain of routing decisions made before a failure occurred.
79///
80/// Each field is populated as the request advances through the pipeline so a
81/// failure at any stage carries the full upstream context. Every field is an
82/// id (never a value), keeping the chain safe to emit in telemetry.
83#[derive(Clone, Default, PartialEq, Eq, Debug)]
84pub struct DecisionChain {
85    /// The authenticated principal, if authentication succeeded.
86    pub principal: Option<PrincipalId>,
87    /// The resolved partition, if resolution succeeded.
88    pub partition: Option<PartitionId>,
89    /// The target cluster, if placement resolved.
90    pub cluster: Option<ClusterId>,
91    /// The target index, if placement resolved.
92    pub index: Option<IndexName>,
93}
94
95impl DecisionChain {
96    /// An empty chain (nothing decided yet).
97    #[must_use]
98    pub fn new() -> Self {
99        Self::default()
100    }
101}
102
103/// The structured context attached to every request-path error.
104#[derive(Clone, PartialEq, Eq, Debug)]
105pub struct ErrorContext {
106    /// The stable error code.
107    pub code: ErrorCode,
108    /// The decision chain leading to the failure (ids/shapes only).
109    pub decision_chain: DecisionChain,
110    /// Whether the caller may retry (possibly after re-resolving).
111    pub retryable: bool,
112    /// A short, actionable hint for an operator or LLM.
113    pub remediation: &'static str,
114}
115
116impl ErrorContext {
117    /// Builds a context for `code` with an empty decision chain. Stages enrich
118    /// the chain via [`ErrorContext::with_chain`] as context becomes available.
119    #[must_use]
120    pub fn new(code: ErrorCode, retryable: bool, remediation: &'static str) -> Self {
121        Self {
122            code,
123            decision_chain: DecisionChain::new(),
124            retryable,
125            remediation,
126        }
127    }
128
129    /// Attaches a decision chain (builder style).
130    #[must_use]
131    pub fn with_chain(mut self, chain: DecisionChain) -> Self {
132        self.decision_chain = chain;
133        self
134    }
135}
136
137impl fmt::Display for ErrorContext {
138    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
139        write!(
140            f,
141            "{} (retryable={}): {}",
142            self.code, self.retryable, self.remediation
143        )
144    }
145}
146
147impl std::error::Error for ErrorContext {}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Walks every variant and checks that its slug equals its `Display`
    /// output, is lowercase `snake_case`, and is not shared with any other
    /// variant. This also exercises each `as_slug` arm. Slugs are part of the
    /// public contract: operators and LLMs match on them.
    #[test]
    fn every_error_code_has_a_stable_distinct_slug() {
        let codes = [
            ErrorCode::PartitionUnresolved,
            ErrorCode::PlacementMissing,
            ErrorCode::PlacementBackendUnavailable,
            ErrorCode::UnsupportedEndpoint,
            ErrorCode::StaleEpoch,
            ErrorCode::AuthFailed,
            ErrorCode::Unauthorized,
            ErrorCode::UpstreamFailed,
            ErrorCode::Overloaded,
            ErrorCode::CursorUnresolvable,
            ErrorCode::PayloadTooLarge,
        ];
        let mut unique = std::collections::HashSet::new();
        for code in codes {
            let slug = code.as_slug();
            let rendered = code.to_string();
            assert_eq!(slug, rendered, "Display must equal as_slug");
            assert!(
                slug.chars().all(|ch| matches!(ch, 'a'..='z' | '_')),
                "{slug} must be lowercase snake_case"
            );
            assert!(unique.insert(slug), "duplicate slug {slug}");
        }
        assert_eq!(unique.len(), codes.len());
    }

    /// A context built with `with_chain` keeps the supplied chain, and its
    /// rendered text contains both the slug and the remediation hint.
    #[test]
    fn context_carries_chain_and_displays_actionably() {
        let mut chain = DecisionChain::new();
        chain.partition = Some(PartitionId::from("t-1"));

        let ctx = ErrorContext::new(
            ErrorCode::PlacementMissing,
            false,
            "register a placement for the partition",
        )
        .with_chain(chain.clone());

        assert_eq!(ctx.decision_chain, chain);
        assert!(!ctx.retryable);

        let rendered = ctx.to_string();
        assert!(rendered.contains("placement_missing"));
        assert!(rendered.contains("register a placement"));
    }

    /// Compile-time check that `ErrorContext` satisfies `std::error::Error`.
    #[test]
    fn context_is_a_std_error() {
        fn requires_error(_: &impl std::error::Error) {}
        let ctx = ErrorContext::new(ErrorCode::Overloaded, true, "retry with backoff");
        requires_error(&ctx);
    }
}