Skip to main content

cellos_server/
error.rs

1//! RFC 9457 Problem Details for HTTP APIs.
2//!
3//! Every error path in the server returns `application/problem+json` so
4//! that `cellctl` (and the web UI) can render structured diagnostics
5//! without parsing free-form strings. The `type` field is a stable
6//! identifier — clients may switch on it; the `title`/`detail` fields are
7//! human-readable and may change.
8
9use axum::body::{to_bytes, Body};
10use axum::http::{header, HeaderValue, StatusCode};
11use axum::response::{IntoResponse, Response};
12use axum::Json;
13use serde::Serialize;
14
/// Stable error identifier. Adding a variant is a non-breaking change;
/// renaming one IS breaking (clients pin on `type`).
///
/// The enum is a plain `Copy` discriminant. `PartialEq`/`Eq`/`Hash` are
/// derived so kinds can be compared directly (`==`, `assert_eq!`) and
/// used as set or map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AppErrorKind {
    /// The request was not authorised (reported as HTTP 401).
    Unauthorized,
    /// Generic malformed or invalid request (reported as HTTP 400).
    BadRequest,
    /// The requested resource or route does not exist (HTTP 404).
    NotFound,
    /// The request conflicts with current state (HTTP 409).
    Conflict,
    /// Unexpected server-side failure (HTTP 500).
    Internal,
    /// FUZZ-WAVE-1 MED-1: axum's built-in extractors (Json, Path, Query)
    /// reject malformed input with `text/plain`. We catch those rejections
    /// in the response-mapping middleware and re-emit them as
    /// problem+json under stable `type` URIs. This variant, together with
    /// [`AppErrorKind::UnsupportedMediaType`] and
    /// [`AppErrorKind::MethodNotAllowed`], exists for that purpose.
    ///
    /// Reported as HTTP 413.
    PayloadTooLarge,
    /// FUZZ-WAVE-1 MED-1 (see [`AppErrorKind::PayloadTooLarge`]):
    /// reported as HTTP 415.
    UnsupportedMediaType,
    /// FUZZ-WAVE-1 MED-1 (see [`AppErrorKind::PayloadTooLarge`]):
    /// reported as HTTP 405.
    MethodNotAllowed,
    /// FUZZ-CRIT-1: the upstream event store (JetStream/NATS) is
    /// unreachable or timing out. Distinct from `Internal` because the
    /// HTTP control plane itself is healthy — only the data tier behind
    /// `/v1/events` is degraded.
    ServiceUnavailable,
    /// Discriminants from ADR-0010 §Enforcement: cellos-server admission
    /// gate rejection reasons. Surfaced via `application/problem+json`
    /// so cellctl can switch on `type` without parsing `detail`. This
    /// note applies to every `Formation*` variant; all of them are
    /// reported as HTTP 400.
    ///
    /// This one: the formation was rejected because of an authority cycle.
    FormationCycle,
    /// Admission gate rejection: the formation declares multiple
    /// coordinators.
    FormationMultipleCoordinators,
    /// Admission gate rejection: the formation declares no coordinator.
    FormationNoCoordinator,
    /// Admission gate rejection: authority does not narrow.
    FormationAuthorityNotNarrowing,
    /// FUZZ-MED-4: distinct discriminant for duplicate `members[*].id`.
    /// Historically the duplicate-id case rode on `multiple-coordinators`
    /// because the ADR-0010 §Consequences narrative ("two members both
    /// named `coord`") used the same scenario. Operators reading the
    /// problem-type couldn't distinguish "your manifest declares two
    /// coordinators" (a structural design error) from "your manifest
    /// declares two members with the same id" (a typo). Both still
    /// 400; clients pinning on the old type continue to see 400 + the
    /// authority-not-narrowing/cycle chain when applicable. New clients
    /// can switch on this discriminant to render a typo-specific hint.
    FormationDuplicateMemberId,
}
55
56impl AppErrorKind {
57    pub fn status(self) -> StatusCode {
58        match self {
59            AppErrorKind::Unauthorized => StatusCode::UNAUTHORIZED,
60            AppErrorKind::BadRequest
61            | AppErrorKind::FormationCycle
62            | AppErrorKind::FormationMultipleCoordinators
63            | AppErrorKind::FormationNoCoordinator
64            | AppErrorKind::FormationAuthorityNotNarrowing
65            | AppErrorKind::FormationDuplicateMemberId => StatusCode::BAD_REQUEST,
66            AppErrorKind::NotFound => StatusCode::NOT_FOUND,
67            AppErrorKind::MethodNotAllowed => StatusCode::METHOD_NOT_ALLOWED,
68            AppErrorKind::Conflict => StatusCode::CONFLICT,
69            AppErrorKind::PayloadTooLarge => StatusCode::PAYLOAD_TOO_LARGE,
70            AppErrorKind::UnsupportedMediaType => StatusCode::UNSUPPORTED_MEDIA_TYPE,
71            AppErrorKind::Internal => StatusCode::INTERNAL_SERVER_ERROR,
72            AppErrorKind::ServiceUnavailable => StatusCode::SERVICE_UNAVAILABLE,
73        }
74    }
75
76    /// `type` URI identifier per RFC 9457 §3.1. We use relative URI
77    /// references rooted at `/problems/` so the server's deployment URL
78    /// does not affect the stable identifier.
79    pub fn type_uri(self) -> &'static str {
80        match self {
81            AppErrorKind::Unauthorized => "/problems/unauthorized",
82            AppErrorKind::BadRequest => "/problems/bad-request",
83            AppErrorKind::NotFound => "/problems/not-found",
84            AppErrorKind::Conflict => "/problems/conflict",
85            AppErrorKind::Internal => "/problems/internal",
86            AppErrorKind::PayloadTooLarge => "/problems/payload-too-large",
87            AppErrorKind::UnsupportedMediaType => "/problems/unsupported-media-type",
88            AppErrorKind::MethodNotAllowed => "/problems/method-not-allowed",
89            AppErrorKind::ServiceUnavailable => "/problems/service-unavailable",
90            AppErrorKind::FormationCycle => "/problems/formation/cycle",
91            AppErrorKind::FormationMultipleCoordinators => {
92                "/problems/formation/multiple-coordinators"
93            }
94            AppErrorKind::FormationNoCoordinator => "/problems/formation/no-coordinator",
95            AppErrorKind::FormationAuthorityNotNarrowing => {
96                "/problems/formation/authority-not-narrowing"
97            }
98            AppErrorKind::FormationDuplicateMemberId => "/problems/formation/duplicate-member-id",
99        }
100    }
101
102    pub fn title(self) -> &'static str {
103        match self {
104            AppErrorKind::Unauthorized => "Unauthorized",
105            AppErrorKind::BadRequest => "Bad Request",
106            AppErrorKind::NotFound => "Not Found",
107            AppErrorKind::Conflict => "Conflict",
108            AppErrorKind::Internal => "Internal Server Error",
109            AppErrorKind::PayloadTooLarge => "Payload Too Large",
110            AppErrorKind::UnsupportedMediaType => "Unsupported Media Type",
111            AppErrorKind::MethodNotAllowed => "Method Not Allowed",
112            AppErrorKind::ServiceUnavailable => "Event store unavailable",
113            AppErrorKind::FormationCycle => "Formation rejected: authority cycle",
114            AppErrorKind::FormationMultipleCoordinators => {
115                "Formation rejected: multiple coordinators"
116            }
117            AppErrorKind::FormationNoCoordinator => "Formation rejected: no coordinator",
118            AppErrorKind::FormationAuthorityNotNarrowing => {
119                "Formation rejected: authority does not narrow"
120            }
121            AppErrorKind::FormationDuplicateMemberId => "Formation rejected: duplicate member id",
122        }
123    }
124}
125
126#[derive(Debug, Clone)]
127pub struct AppError {
128    pub kind: AppErrorKind,
129    pub detail: String,
130}
131
132impl AppError {
133    pub fn new(kind: AppErrorKind, detail: impl Into<String>) -> Self {
134        Self {
135            kind,
136            detail: detail.into(),
137        }
138    }
139
140    pub fn bad_request(detail: impl Into<String>) -> Self {
141        Self::new(AppErrorKind::BadRequest, detail)
142    }
143
144    pub fn unauthorized(detail: impl Into<String>) -> Self {
145        Self::new(AppErrorKind::Unauthorized, detail)
146    }
147
148    pub fn not_found(detail: impl Into<String>) -> Self {
149        Self::new(AppErrorKind::NotFound, detail)
150    }
151
152    pub fn internal(detail: impl Into<String>) -> Self {
153        Self::new(AppErrorKind::Internal, detail)
154    }
155
156    pub fn payload_too_large(detail: impl Into<String>) -> Self {
157        Self::new(AppErrorKind::PayloadTooLarge, detail)
158    }
159
160    pub fn unsupported_media_type(detail: impl Into<String>) -> Self {
161        Self::new(AppErrorKind::UnsupportedMediaType, detail)
162    }
163
164    pub fn method_not_allowed(detail: impl Into<String>) -> Self {
165        Self::new(AppErrorKind::MethodNotAllowed, detail)
166    }
167
168    /// Redacted 503 for upstream-data-tier failures. The `detail` text is
169    /// fixed at the type level so callers cannot accidentally splice
170    /// internal stream/subject names into the response body — the
171    /// FUZZ-CRIT-1 leak. Operators get the underlying cause via the WARN
172    /// log emitted at the call site, not via this user-visible body.
173    pub fn service_unavailable() -> Self {
174        Self::new(
175            AppErrorKind::ServiceUnavailable,
176            "Event store is temporarily unreachable; retry later",
177        )
178    }
179}
180
181/// Media type identifier per RFC 9457 §3.
182pub const PROBLEM_JSON_CT: &str = "application/problem+json";
183
184/// Build a problem+json response from a kind + detail string, bypassing
185/// the full `AppError` construction path. Used by fallbacks and the
186/// rejection-normalising middleware where we already know the status.
187pub fn problem_response(kind: AppErrorKind, detail: impl Into<String>) -> Response {
188    AppError::new(kind, detail).into_response()
189}
190
191/// Wire shape of the problem document (RFC 9457 §3.1).
192#[derive(Debug, Serialize)]
193struct ProblemDetails<'a> {
194    #[serde(rename = "type")]
195    type_uri: &'a str,
196    title: &'a str,
197    status: u16,
198    detail: &'a str,
199}
200
201impl IntoResponse for AppError {
202    fn into_response(self) -> Response {
203        let status = self.kind.status();
204        let body = ProblemDetails {
205            type_uri: self.kind.type_uri(),
206            title: self.kind.title(),
207            status: status.as_u16(),
208            detail: &self.detail,
209        };
210        let mut resp = (status, Json(body)).into_response();
211        // RFC 9457 §3 — the media type is `application/problem+json`.
212        resp.headers_mut().insert(
213            axum::http::header::CONTENT_TYPE,
214            axum::http::HeaderValue::from_static("application/problem+json"),
215        );
216        resp
217    }
218}
219
220impl From<anyhow::Error> for AppError {
221    fn from(e: anyhow::Error) -> Self {
222        AppError::internal(format!("{e:#}"))
223    }
224}
225
226impl From<serde_json::Error> for AppError {
227    fn from(e: serde_json::Error) -> Self {
228        AppError::bad_request(format!("invalid json: {e}"))
229    }
230}
231
232/// FUZZ-WAVE-1 MED-1 / MED-2: response-mapping middleware that
233/// guarantees every 4xx leaving the server carries
234/// `Content-Type: application/problem+json` (RFC 9457 §3).
235///
236/// axum's built-in extractors (`Json`, `Path`, `Query`,
237/// `DefaultBodyLimit`) reject malformed input by returning a bare
238/// `text/plain` body with the error string. The application-level
239/// `AppError` path is already problem+json; this layer brings axum's
240/// built-in rejections — plus the 404/405 fallbacks below — into the
241/// same wire shape.
242///
243/// Strategy: inspect the outgoing response. If status is 4xx **and**
244/// the existing Content-Type is **not** `application/problem+json`,
245/// drain the body, pick a kind from the status, and re-emit. Headers
246/// other than Content-Type/Content-Length are preserved verbatim — this
247/// matters for 405 where axum already set `Allow:`.
248///
249/// 2xx, 3xx, and 5xx responses pass through unchanged. The Critical
250/// finding in the wave-1 report (5xx leak) is out of scope for this
251/// fix; this middleware only normalises 4xx content-type.
252pub async fn normalize_problem_response(resp: Response) -> Response {
253    let status = resp.status();
254
255    if !status.is_client_error() {
256        return resp;
257    }
258
259    let is_problem_json = resp
260        .headers()
261        .get(header::CONTENT_TYPE)
262        .and_then(|v| v.to_str().ok())
263        .map(|ct| ct.starts_with(PROBLEM_JSON_CT))
264        .unwrap_or(false);
265
266    if is_problem_json {
267        return resp;
268    }
269
270    // Preserve headers we want to carry across the body rewrite. The
271    // `Allow` header on a 405 is the most important — RFC 9110 §15.5.6
272    // requires it and operators rely on it to discover the valid verbs.
273    let allow_header = resp.headers().get(header::ALLOW).cloned();
274
275    let (parts, body) = resp.into_parts();
276    // 64 KiB is more than enough for an axum rejection string. If a
277    // hostile upstream layer ever attaches a giant body to a 4xx we
278    // drop it on the floor and fall back to a generic detail.
279    let detail_bytes = to_bytes(body, 64 * 1024).await.unwrap_or_default();
280    let detail = std::str::from_utf8(&detail_bytes)
281        .unwrap_or("")
282        .trim()
283        .to_string();
284
285    let kind = match status {
286        StatusCode::BAD_REQUEST => AppErrorKind::BadRequest,
287        StatusCode::UNAUTHORIZED => AppErrorKind::Unauthorized,
288        StatusCode::NOT_FOUND => AppErrorKind::NotFound,
289        StatusCode::METHOD_NOT_ALLOWED => AppErrorKind::MethodNotAllowed,
290        StatusCode::CONFLICT => AppErrorKind::Conflict,
291        StatusCode::PAYLOAD_TOO_LARGE => AppErrorKind::PayloadTooLarge,
292        StatusCode::UNSUPPORTED_MEDIA_TYPE => AppErrorKind::UnsupportedMediaType,
293        // Other 4xx (422, 415, 429, ...) — fall back to a generic
294        // bad-request shape but keep the original status code below.
295        _ => AppErrorKind::BadRequest,
296    };
297
298    // Empty body (e.g. axum 0.7's built-in 404/405) produces a useless
299    // detail. Synthesize a sensible one so adopters see *something*
300    // structured.
301    let detail = if detail.is_empty() {
302        match status {
303            StatusCode::NOT_FOUND => "no route matched the request path".to_string(),
304            StatusCode::METHOD_NOT_ALLOWED => "HTTP method not allowed for this path".to_string(),
305            StatusCode::PAYLOAD_TOO_LARGE => "request body exceeds the per-route cap".to_string(),
306            _ => parts
307                .status
308                .canonical_reason()
309                .unwrap_or("client error")
310                .to_string(),
311        }
312    } else {
313        detail
314    };
315
316    let body = ProblemDetails {
317        type_uri: kind.type_uri(),
318        title: kind.title(),
319        status: status.as_u16(),
320        detail: &detail,
321    };
322    let body_bytes = serde_json::to_vec(&body)
323        .unwrap_or_else(|_| br#"{"type":"/problems/internal","title":"Internal Server Error","status":500,"detail":"failed to serialise problem document"}"#.to_vec());
324
325    let mut new = Response::builder()
326        .status(status)
327        .body(Body::from(body_bytes))
328        .expect("problem+json response build");
329
330    // Copy through every original header except those that no longer
331    // describe the rewritten body.
332    for (name, value) in parts.headers.iter() {
333        if name == header::CONTENT_TYPE || name == header::CONTENT_LENGTH {
334            continue;
335        }
336        new.headers_mut().append(name.clone(), value.clone());
337    }
338    new.headers_mut().insert(
339        header::CONTENT_TYPE,
340        HeaderValue::from_static(PROBLEM_JSON_CT),
341    );
342    // Preserve Allow if axum's method router set it and we didn't catch
343    // it in the loop above (header iteration is the canonical source,
344    // but this is a belt-and-braces guarantee for the 405 contract).
345    if let Some(v) = allow_header {
346        new.headers_mut().insert(header::ALLOW, v);
347    }
348
349    new
350}