Skip to main content

vane_core/
error.rs

1use std::borrow::Cow;
2
/// Byte cap applied to `SerializedError::message` when converting from
/// an `Error` (4 KiB).
pub const SERIALIZED_MESSAGE_CAP: usize = 4 * 1024;
/// Byte cap applied to `SerializedError::ctx` when converting from an
/// `Error`.
pub const SERIALIZED_CTX_CAP: usize = 1024;
/// Maximum number of entries kept in `SerializedError::source_chain`.
/// A longer chain keeps the first `MAX - 1` entries and uses the last
/// slot for a "… [N more]" marker.
pub const SERIALIZED_CHAIN_MAX_ENTRIES: usize = 16;
/// Byte cap applied to each retained `SerializedError::source_chain`
/// entry.
pub const SERIALIZED_CHAIN_ENTRY_CAP: usize = 1024;
7
/// Error carrier: a classified [`ErrorKind`], an optional context
/// string, and an optional boxed source error.
///
/// `Display` renders `<kind>` on its own, or `<kind>: <ctx>` when a
/// context is attached. The source is not printed by `Display`; it is
/// exposed through `std::error::Error::source` (see the `#[source]`
/// attribute) and can be flattened with `Error::source_chain`.
#[derive(thiserror::Error, Debug)]
#[error("{kind}{}", .ctx.as_deref().map(|c| format!(": {c}")).unwrap_or_default())]
pub struct Error {
	/// Classification that drives `kind_label`, `reason_label`,
	/// `http_status` and the retry predicates.
	pub kind: ErrorKind,
	/// Optional detail text, appended to the `Display` output after `": "`.
	pub ctx: Option<Cow<'static, str>>,
	/// Underlying cause, if any.
	#[source]
	pub source: Option<Box<dyn std::error::Error + Send + Sync>>,
}
16
/// Top-level error classification.
///
/// The `Display` text of each variant is the short label in its
/// `#[error]` attribute; `Upstream`, `Timeout` and `Resource` append
/// their sub-reason after `": "`. The HTTP status quoted on each
/// variant is the one returned by `Error::http_status`.
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum ErrorKind {
	/// I/O failure (also the kind produced by `From<std::io::Error>`).
	/// HTTP 500.
	#[error("i/o")]
	Io,
	/// Protocol failure. HTTP 400.
	#[error("protocol")]
	Protocol,
	/// Upstream failure, refined by an [`UpstreamReason`]. HTTP 502.
	#[error("upstream: {0}")]
	Upstream(UpstreamReason),
	/// Middleware failure. HTTP 500.
	#[error("middleware")]
	Middleware,
	/// Compile-pipeline failure (also the kind produced when collapsing
	/// several `Diagnostics` entries into one error). HTTP 500.
	#[error("compile")]
	Compile,
	/// A timeout expired, refined by a [`TimeoutKind`]. HTTP 504.
	#[error("timeout: {0}")]
	Timeout(TimeoutKind),
	/// The operation was canceled. HTTP 499.
	#[error("canceled")]
	Canceled,
	/// A resource limit was hit, refined by a [`ResourceKind`]. HTTP 503.
	#[error("resource: {0}")]
	Resource(ResourceKind),
	/// Invariant violation — see `Error::internal`. HTTP 500.
	#[error("internal")]
	Internal,
}
38
/// Sub-classification carried by [`ErrorKind::Upstream`].
///
/// The retry notes on each variant describe what `Error::is_retryable`
/// (method-agnostic) and `Error::is_retryable_in` (method-aware)
/// return for it.
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum UpstreamReason {
	/// Retryable under both predicates, regardless of method.
	#[error("unreachable")]
	Unreachable,
	/// Never retryable via `is_retryable`; `is_retryable_in` retries it
	/// only for idempotent methods.
	#[error("reset mid-request")]
	ResetMidRequest,
	/// Retryable under both predicates, regardless of method.
	#[error("reset on idle pickup")]
	ResetOnIdlePickup,
	/// Not retryable via `is_retryable`, but `is_retryable_in` retries
	/// it regardless of method.
	// NOTE(review): the two predicates disagree on this variant — confirm
	// which classification is intended.
	#[error("tls handshake failed")]
	TlsHandshake,
	/// Retryable under both predicates, regardless of method.
	#[error("dns resolution failed")]
	DnsFailure,
	/// Retryable under both predicates, regardless of method.
	#[error("refused by upstream")]
	Refused,
	/// Retryable via `is_retryable`; `is_retryable_in` retries it only
	/// for idempotent methods.
	// NOTE(review): the two predicates disagree on this variant — confirm
	// which classification is intended.
	#[error("gone")]
	Gone,
	/// Not retryable under either predicate.
	#[error("malformed response")]
	Malformed,
}
58
/// Names the stage whose timeout expired; carried by
/// [`ErrorKind::Timeout`].
///
/// Only `Connect` and `Handshake` are classified as retryable by
/// `Error::is_retryable` / `Error::is_retryable_in`; the other stages
/// are not retried by either predicate.
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum TimeoutKind {
	/// Connect-stage timeout (retryable).
	#[error("connect")]
	Connect,
	/// Read-stage timeout (not retryable).
	#[error("read")]
	Read,
	/// Total-budget timeout (not retryable).
	#[error("total")]
	Total,
	/// Idle timeout (not retryable).
	#[error("idle")]
	Idle,
	/// Handshake-stage timeout (retryable).
	#[error("handshake")]
	Handshake,
}
72
/// Names the exhausted resource; carried by [`ErrorKind::Resource`].
///
/// Only `ConnectionPool` is classified as retryable by
/// `Error::is_retryable` / `Error::is_retryable_in`.
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum ResourceKind {
	/// Connection pool exhausted (retryable).
	#[error("connection pool exhausted")]
	ConnectionPool,
	/// WASM pool exhausted (not retryable).
	#[error("wasm pool exhausted")]
	WasmPool,
	/// Memory budget exceeded (not retryable).
	#[error("memory budget exceeded")]
	Memory,
	/// File descriptors exhausted (not retryable).
	#[error("file descriptors exhausted")]
	FdExhausted,
}
84
85impl Error {
86	#[must_use]
87	pub const fn new(kind: ErrorKind) -> Self {
88		Self { kind, ctx: None, source: None }
89	}
90
91	#[must_use]
92	pub fn with_ctx(mut self, ctx: impl Into<Cow<'static, str>>) -> Self {
93		self.ctx = Some(ctx.into());
94		self
95	}
96
97	#[must_use]
98	pub fn with_source<E: Into<Box<dyn std::error::Error + Send + Sync>>>(mut self, e: E) -> Self {
99		self.source = Some(e.into());
100		self
101	}
102
103	#[must_use]
104	pub fn io(msg: impl Into<Cow<'static, str>>) -> Self {
105		Self::new(ErrorKind::Io).with_ctx(msg)
106	}
107
108	#[must_use]
109	pub fn protocol(msg: impl Into<Cow<'static, str>>) -> Self {
110		Self::new(ErrorKind::Protocol).with_ctx(msg)
111	}
112
113	#[must_use]
114	pub const fn upstream(reason: UpstreamReason) -> Self {
115		Self::new(ErrorKind::Upstream(reason))
116	}
117
118	#[must_use]
119	pub fn middleware(msg: impl Into<Cow<'static, str>>) -> Self {
120		Self::new(ErrorKind::Middleware).with_ctx(msg)
121	}
122
123	#[must_use]
124	pub fn compile(msg: impl Into<Cow<'static, str>>) -> Self {
125		Self::new(ErrorKind::Compile).with_ctx(msg)
126	}
127
128	#[must_use]
129	pub const fn timeout(kind: TimeoutKind) -> Self {
130		Self::new(ErrorKind::Timeout(kind))
131	}
132
133	#[must_use]
134	pub const fn canceled() -> Self {
135		Self::new(ErrorKind::Canceled)
136	}
137
138	#[must_use]
139	pub const fn resource(kind: ResourceKind) -> Self {
140		Self::new(ErrorKind::Resource(kind))
141	}
142
143	/// Build an `ErrorKind::Internal` carrier for a detected invariant
144	/// violation.
145	///
146	/// **Reserved for invariant breaks.** The error class signals that
147	/// the code has reached a state the type system or the lower-pass
148	/// invariants were supposed to make unreachable — examples in this
149	/// codebase are `l4_forward` receiving an unexpected `L4Conn`
150	/// variant, the executor finding the dispatch table missing from
151	/// `ConnContext.user`, or a response builder rejecting bytes we
152	/// validated upstream. Runtime user-data failures
153	/// (`std::io::Error`, WASM trap, hyper-build mismatch on operator-
154	/// controlled bytes) belong on `Error::middleware` / `Error::io` /
155	/// `Error::protocol` instead.
156	///
157	/// In debug / test builds the constructor `debug_assert!`s false so
158	/// the panic surfaces locally with the message context — invariant
159	/// breaks are bugs that deserve to be found at dev time, not
160	/// silently 500ed in production. Release builds keep the cheap
161	/// `Error` construction path.
162	#[must_use]
163	#[track_caller]
164	pub fn internal(msg: impl Into<Cow<'static, str>>) -> Self {
165		let ctx = msg.into();
166		// Allow tests to construct `Error::internal(...)` as a fixture
167		// (e.g. asserting downstream code surfaces it correctly).
168		// Non-test debug builds panic so dev iterations catch the
169		// invariant break immediately.
170		#[cfg(all(debug_assertions, not(test)))]
171		debug_assert!(false, "Error::internal invariant violation: {ctx}");
172		Self::new(ErrorKind::Internal).with_ctx(ctx)
173	}
174
175	#[must_use]
176	pub const fn kind(&self) -> &ErrorKind {
177		&self.kind
178	}
179
180	#[must_use]
181	pub fn ctx(&self) -> Option<&str> {
182		self.ctx.as_deref()
183	}
184
185	#[must_use]
186	pub const fn kind_label(&self) -> &'static str {
187		match &self.kind {
188			ErrorKind::Io => "io",
189			ErrorKind::Protocol => "protocol",
190			ErrorKind::Upstream(_) => "upstream",
191			ErrorKind::Middleware => "middleware",
192			ErrorKind::Compile => "compile",
193			ErrorKind::Timeout(_) => "timeout",
194			ErrorKind::Canceled => "canceled",
195			ErrorKind::Resource(_) => "resource",
196			ErrorKind::Internal => "internal",
197		}
198	}
199
200	#[must_use]
201	pub const fn reason_label(&self) -> Option<&'static str> {
202		match &self.kind {
203			ErrorKind::Upstream(r) => Some(match r {
204				UpstreamReason::Unreachable => "unreachable",
205				UpstreamReason::ResetMidRequest => "reset_mid_request",
206				UpstreamReason::ResetOnIdlePickup => "reset_idle_pickup",
207				UpstreamReason::TlsHandshake => "tls_handshake",
208				UpstreamReason::DnsFailure => "dns_failure",
209				UpstreamReason::Refused => "refused",
210				UpstreamReason::Gone => "gone",
211				UpstreamReason::Malformed => "malformed",
212			}),
213			ErrorKind::Timeout(t) => Some(match t {
214				TimeoutKind::Connect => "connect",
215				TimeoutKind::Read => "read",
216				TimeoutKind::Total => "total",
217				TimeoutKind::Idle => "idle",
218				TimeoutKind::Handshake => "handshake",
219			}),
220			ErrorKind::Resource(r) => Some(match r {
221				ResourceKind::ConnectionPool => "connection_pool",
222				ResourceKind::WasmPool => "wasm_pool",
223				ResourceKind::Memory => "memory",
224				ResourceKind::FdExhausted => "fd_exhausted",
225			}),
226			_ => None,
227		}
228	}
229
230	/// Method-agnostic retry eligibility. Returns true for the
231	/// pre-connect failures (request never left the wire), plus the
232	/// hyper-pool race cases (`ResetOnIdlePickup`, `Refused`,
233	/// `Gone`) and DNS / unreachable / connect-timeout — all of
234	/// which are safe to retry regardless of HTTP method idempotency.
235	///
236	/// Mid-request failures (`ResetMidRequest`) need a method check
237	/// before retrying so we don't double-deliver a POST body. Use
238	/// [`Self::is_retryable_in`] for that path; this method exists
239	/// for back-compat with callers that already pre-gate on method
240	/// idempotency.
241	#[must_use]
242	pub const fn is_retryable(&self) -> bool {
243		match &self.kind {
244			ErrorKind::Upstream(r) => matches!(
245				r,
246				UpstreamReason::Unreachable
247					| UpstreamReason::ResetOnIdlePickup
248					| UpstreamReason::DnsFailure
249					| UpstreamReason::Refused
250					| UpstreamReason::Gone
251			),
252			ErrorKind::Timeout(TimeoutKind::Connect | TimeoutKind::Handshake)
253			| ErrorKind::Resource(ResourceKind::ConnectionPool) => true,
254			_ => false,
255		}
256	}
257
258	/// Method-aware retry eligibility, per `spec/crates/engine.md`
259	/// § _Error classification_:
260	///
261	/// - Pre-connect failures (TCP connect, TLS handshake, DNS,
262	///   connection-pool exhaustion, hyper-pool idle-pickup race)
263	///   return `true` regardless of method — the request never left
264	///   the wire, so retrying a POST is safe.
265	/// - Mid-request failures (`ResetMidRequest`) return `true`
266	///   ONLY for idempotent methods (GET / HEAD / PUT / DELETE /
267	///   OPTIONS, per RFC 9110 § 9.2.2). Retrying a non-idempotent
268	///   POST mid-request risks double-delivery.
269	/// - All other error kinds return `false`.
270	///
271	/// `Method::TRACE` is treated as non-idempotent in this table
272	/// — RFC 9110 lists it as idempotent but middleboxes routinely
273	/// rewrite it, so retrying is rarely the right move at proxy
274	/// scope.
275	#[must_use]
276	pub fn is_retryable_in(&self, method: &http::Method) -> bool {
277		use http::Method;
278		match &self.kind {
279			// Pre-connect failures: request body never reached the
280			// upstream wire, retry is always safe.
281			ErrorKind::Timeout(TimeoutKind::Connect | TimeoutKind::Handshake)
282			| ErrorKind::Resource(ResourceKind::ConnectionPool)
283			| ErrorKind::Upstream(
284				UpstreamReason::TlsHandshake
285				| UpstreamReason::DnsFailure
286				| UpstreamReason::Unreachable
287				| UpstreamReason::Refused
288				| UpstreamReason::ResetOnIdlePickup,
289			) => true,
290			// Mid-request failures: only idempotent methods retry.
291			ErrorKind::Upstream(UpstreamReason::ResetMidRequest | UpstreamReason::Gone) => matches!(
292				*method,
293				Method::GET | Method::HEAD | Method::PUT | Method::DELETE | Method::OPTIONS
294			),
295			_ => false,
296		}
297	}
298
299	#[must_use]
300	pub const fn http_status(&self) -> u16 {
301		match &self.kind {
302			ErrorKind::Protocol => 400,
303			ErrorKind::Upstream(_) => 502,
304			ErrorKind::Timeout(_) => 504,
305			ErrorKind::Resource(_) => 503,
306			ErrorKind::Canceled => 499,
307			ErrorKind::Middleware | ErrorKind::Compile | ErrorKind::Internal | ErrorKind::Io => 500,
308		}
309	}
310
311	#[must_use]
312	pub fn source_chain(&self) -> Vec<String> {
313		let mut out = Vec::new();
314		let mut cur: &dyn std::error::Error = self;
315		while let Some(src) = cur.source() {
316			out.push(src.to_string());
317			cur = src;
318		}
319		out
320	}
321
322	/// Display adapter that renders the error in a richer one-line
323	/// form suitable for `tracing::error!(error = %e.tracing(), …)`.
324	///
325	/// Layout:
326	/// ```text
327	/// <Display> reason=<reason?> chain=[<src> / <src> / …]
328	/// ```
329	///
330	/// Drop-in replacement for `error = %e`. `kind` is already
331	/// embedded in the `Display` impl (`<kind>{ctx}`); `reason` and
332	/// `chain` add the structured fields that operator post-mortems
333	/// otherwise lose. Released by `to_string()`; no extra allocations
334	/// at construction time.
335	#[must_use]
336	pub fn tracing(&self) -> ErrorTracing<'_> {
337		ErrorTracing(self)
338	}
339}
340
341/// Display adapter — see [`Error::tracing`].
342pub struct ErrorTracing<'a>(&'a Error);
343
344impl std::fmt::Display for ErrorTracing<'_> {
345	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
346		write!(f, "{}", self.0)?;
347		if let Some(reason) = self.0.reason_label() {
348			write!(f, " reason={reason}")?;
349		}
350		let chain = self.0.source_chain();
351		if !chain.is_empty() {
352			write!(f, " chain=[{}]", chain.join(" / "))?;
353		}
354		Ok(())
355	}
356}
357
/// Accumulator for compile-pipeline diagnostics.
///
/// Each stage in `merge → expand → analyze → lower → validate` runs
/// per-rule / per-node "leaf checks" against the input. Historically
/// every leaf check used `?` to early-return, which meant an operator
/// running `vane compile <dir>` only ever saw the first error: fix
/// that, re-run, see the next, fix that, re-run, etc. With
/// `Diagnostics`, leaf checks `push` instead of `?`-returning and the
/// stage boundary decides whether to bail with the full accumulator
/// or continue into the next stage.
///
/// Every entry currently has the same severity (compile error). The
/// `has_fatal` helper exists so callers can express the "any error
/// stops the next stage" gate clearly at stage boundaries; future
/// warning-level diagnostics would slot in without changing call
/// sites.
#[derive(Debug, Default)]
pub struct Diagnostics {
	/// Collected errors, in the order they were pushed.
	entries: Vec<Error>,
}
378
379impl Diagnostics {
380	#[must_use]
381	pub const fn new() -> Self {
382		Self { entries: Vec::new() }
383	}
384
385	pub fn push(&mut self, e: Error) {
386		self.entries.push(e);
387	}
388
389	pub fn extend<I: IntoIterator<Item = Error>>(&mut self, iter: I) {
390		self.entries.extend(iter);
391	}
392
393	#[must_use]
394	pub fn is_empty(&self) -> bool {
395		self.entries.is_empty()
396	}
397
398	#[must_use]
399	pub fn len(&self) -> usize {
400		self.entries.len()
401	}
402
403	/// True when the accumulator carries at least one error that
404	/// should stop the pipeline at the next stage boundary. Equivalent
405	/// to `!is_empty()` today; reserved as a hook for warning-level
406	/// diagnostics that might land here in the future.
407	#[must_use]
408	pub fn has_fatal(&self) -> bool {
409		!self.entries.is_empty()
410	}
411
412	#[must_use]
413	pub fn entries(&self) -> &[Error] {
414		&self.entries
415	}
416
417	#[must_use]
418	pub fn into_errors(self) -> Vec<Error> {
419		self.entries
420	}
421
422	/// Stage-boundary gate. Returns `Ok(value)` when no diagnostics
423	/// have been pushed; otherwise returns `Err(Self)` so the caller
424	/// can either bubble or merge it into another accumulator.
425	///
426	/// # Errors
427	/// Returns `self` when `has_fatal()` is true.
428	pub fn into_result<T>(self, value: T) -> Result<T, Self> {
429		if self.has_fatal() { Err(self) } else { Ok(value) }
430	}
431}
432
433impl std::fmt::Display for Diagnostics {
434	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
435		match self.entries.len() {
436			0 => write!(f, "no diagnostics"),
437			1 => write!(f, "{}", self.entries[0]),
438			n => {
439				writeln!(f, "{n} compile errors:")?;
440				for (i, e) in self.entries.iter().enumerate() {
441					writeln!(f, "  [{}/{n}] {e}", i + 1)?;
442				}
443				Ok(())
444			}
445		}
446	}
447}
448
449impl From<Error> for Diagnostics {
450	fn from(e: Error) -> Self {
451		Self { entries: vec![e] }
452	}
453}
454
455/// Collapse the accumulated diagnostics into a single
456/// [`ErrorKind::Compile`] `Error` whose context carries every entry's
457/// `to_string()`, separated by `\n`. Used at the boundary into APIs
458/// whose error channel is a single `Error` (e.g. the existing
459/// `compile()` facade, the management-RPC wire payload).
460impl From<Diagnostics> for Error {
461	fn from(d: Diagnostics) -> Self {
462		match d.entries.len() {
463			0 => Error::compile("no diagnostics"),
464			1 => d.entries.into_iter().next().expect("len == 1"),
465			n => {
466				use std::fmt::Write as _;
467				let mut joined = format!("{n} compile errors:");
468				for (i, e) in d.entries.iter().enumerate() {
469					let _ = write!(joined, "\n  [{}/{n}] {e}", i + 1);
470				}
471				Error::compile(joined)
472			}
473		}
474	}
475}
476
477fn from_source<E>(kind: ErrorKind, e: E) -> Error
478where
479	E: std::error::Error + Send + Sync + 'static,
480{
481	Error { kind, ctx: None, source: Some(Box::new(e)) }
482}
483
484impl From<std::io::Error> for Error {
485	fn from(e: std::io::Error) -> Self {
486		from_source(ErrorKind::Io, e)
487	}
488}
489
490impl From<serde_json::Error> for Error {
491	fn from(e: serde_json::Error) -> Self {
492		from_source(ErrorKind::Compile, e)
493	}
494}
495
496impl From<fancy_regex::Error> for Error {
497	fn from(e: fancy_regex::Error) -> Self {
498		from_source(ErrorKind::Compile, e)
499	}
500}
501
502impl From<ipnet::AddrParseError> for Error {
503	fn from(e: ipnet::AddrParseError) -> Self {
504		from_source(ErrorKind::Compile, e)
505	}
506}
507
508// `From<tokio::time::error::Elapsed>` is intentionally not provided.
509// `Elapsed` carries no discriminator for which timeout tripped, and a
510// blanket conversion swept every timeout site into `TimeoutKind::Total`
511// regardless of the actual stage (connect, read, header, etc.) —
512// observers and retry classifiers then lost the distinction. Use
513// [`timeout_with`] at the call site instead so the stage is named
514// explicitly.
515
516/// Run `fut` under a tokio timeout and translate the elapsed case into
517/// a named [`TimeoutKind`]. Replaces the previous `From<Elapsed>` impl
518/// so every call site spells out which stage owns the timeout.
519///
520/// # Errors
521/// On expiry returns [`Error::timeout`]; otherwise propagates
522/// `fut`'s own `Result`.
523pub async fn timeout_with<T, E, F>(
524	kind: TimeoutKind,
525	duration: std::time::Duration,
526	fut: F,
527) -> Result<T, Error>
528where
529	F: std::future::Future<Output = Result<T, E>>,
530	Error: From<E>,
531{
532	match tokio::time::timeout(duration, fut).await {
533		Ok(Ok(v)) => Ok(v),
534		Ok(Err(e)) => Err(Error::from(e)),
535		Err(_) => Err(Error::timeout(kind)),
536	}
537}
538
539// `From<hyper::Error>` / `h3::Error` / `rustls::Error` /
540// `hickory_resolver::ResolveError` deliberately not implemented here:
541// vane-core is backend-agnostic, and adding those impls (orphan rules
542// require they live next to the local type) would force every transport
543// crate into core's dep graph. Engine code constructs upstream errors
544// explicitly via `Error::upstream(...).with_source(e)` so the
545// `ErrorKind` / `UpstreamReason` is chosen at the call site rather
546// than baked into a blanket `From` impl.
547
/// Owned, serde-serializable snapshot of an `Error`.
///
/// The per-field notes describe how `From<&Error>` fills each field; a
/// deserialized value carries whatever the payload contained.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
pub struct SerializedError {
	/// `Error::kind_label()`.
	pub kind: String,
	/// `Error::reason_label()`, when the kind has a sub-reason.
	pub reason: Option<String>,
	/// The error's `Display` text, capped at `SERIALIZED_MESSAGE_CAP` bytes.
	pub message: String,
	/// The error's context, capped at `SERIALIZED_CTX_CAP` bytes.
	pub ctx: Option<String>,
	/// `Error::source_chain()`, capped in entry count and per-entry size.
	pub source_chain: Vec<String>,
	/// `Error::http_status()`.
	pub http_status: u16,
	/// `Error::is_retryable()` — the method-agnostic predicate.
	pub retryable: bool,
}
558
559impl From<&Error> for SerializedError {
560	fn from(e: &Error) -> Self {
561		Self {
562			kind: e.kind_label().to_owned(),
563			reason: e.reason_label().map(ToOwned::to_owned),
564			message: cap_bytes(e.to_string(), SERIALIZED_MESSAGE_CAP),
565			ctx: e.ctx.as_deref().map(|c| cap_bytes(c.to_owned(), SERIALIZED_CTX_CAP)),
566			source_chain: cap_chain(e.source_chain()),
567			http_status: e.http_status(),
568			retryable: e.is_retryable(),
569		}
570	}
571}
572
/// Marker appended to a string that `cap_bytes` had to shorten.
const TRUNC_SUFFIX: &str = "… [truncated]";

/// Limit `s` to at most `cap` bytes.
///
/// - `s` is returned untouched when it already fits.
/// - Otherwise it is cut on a UTF-8 character boundary and
///   [`TRUNC_SUFFIX`] is appended, with the cut chosen so that prefix
///   plus suffix stay within `cap`.
/// - When `cap` is smaller than the suffix itself, the suffix is left
///   out and `s` is simply cut to `cap` bytes (rounded down to a
///   character boundary), so the result never exceeds `cap`.
fn cap_bytes(s: String, cap: usize) -> String {
	if s.len() <= cap {
		return s;
	}
	// Reserve room for the marker only when the marker fits in `cap`;
	// otherwise emitting it would by itself overshoot the limit.
	let (budget, suffix) = match cap.checked_sub(TRUNC_SUFFIX.len()) {
		Some(room) => (room, TRUNC_SUFFIX),
		None => (cap, ""),
	};
	// `budget <= cap < s.len()`, so `budget` is a valid byte offset.
	// Walk back to the nearest boundary so no multi-byte char is split.
	let mut end = budget;
	while end > 0 && !s.is_char_boundary(end) {
		end -= 1;
	}
	let mut out = String::with_capacity(end + suffix.len());
	out.push_str(&s[..end]);
	out.push_str(suffix);
	out
}
589
590fn cap_chain(chain: Vec<String>) -> Vec<String> {
591	if chain.len() <= SERIALIZED_CHAIN_MAX_ENTRIES {
592		return chain.into_iter().map(|s| cap_bytes(s, SERIALIZED_CHAIN_ENTRY_CAP)).collect();
593	}
594	let keep = SERIALIZED_CHAIN_MAX_ENTRIES - 1;
595	let dropped = chain.len() - keep;
596	let mut out: Vec<String> =
597		chain.into_iter().take(keep).map(|s| cap_bytes(s, SERIALIZED_CHAIN_ENTRY_CAP)).collect();
598	out.push(format!("… [{dropped} more]"));
599	out
600}
601
#[cfg(test)]
mod diagnostics_tests {
	use super::{Diagnostics, Error, ErrorKind};

	/// Accumulator pre-loaded, via `push`, with one compile error per
	/// message.
	fn compile_diags(msgs: &[&'static str]) -> Diagnostics {
		let mut acc = Diagnostics::new();
		for msg in msgs {
			acc.push(Error::compile(*msg));
		}
		acc
	}

	/// An empty accumulator lets the stage value through.
	#[test]
	fn empty_diagnostics_into_result_returns_ok_value() {
		let empty = Diagnostics::new();
		assert!(empty.is_empty());
		assert!(!empty.has_fatal());
		let outcome: Result<u32, Diagnostics> = empty.into_result(42);
		assert_eq!(outcome.unwrap(), 42);
	}

	/// A non-empty accumulator is handed back intact as the `Err`.
	#[test]
	fn non_empty_diagnostics_into_result_surfaces_self() {
		let filled = compile_diags(&["first", "second"]);
		assert_eq!(filled.len(), 2);
		assert!(filled.has_fatal());
		let outcome: Result<(), Diagnostics> = filled.into_result(());
		let returned = outcome.expect_err("non-empty must be Err");
		assert_eq!(returned.len(), 2);
	}

	/// The multi-entry rendering has a count header and numbered lines.
	#[test]
	fn diagnostics_display_lists_every_entry_with_numbered_prefix() {
		let rendered = compile_diags(&["alpha", "beta"]).to_string();
		assert!(rendered.contains("2 compile errors"), "{rendered}");
		assert!(rendered.contains("[1/2]") && rendered.contains("alpha"), "{rendered}");
		assert!(rendered.contains("[2/2]") && rendered.contains("beta"), "{rendered}");
	}

	/// Collapsing several entries yields one compile error naming them all.
	#[test]
	fn diagnostics_to_single_error_joins_messages_under_compile_kind() {
		let collapsed = Error::from(compile_diags(&["alpha", "beta"]));
		let text = collapsed.to_string();
		assert!(text.contains("alpha"));
		assert!(text.contains("beta"));
		assert!(matches!(collapsed.kind, ErrorKind::Compile));
	}

	/// Collapsing a single entry returns that entry unchanged.
	#[test]
	fn single_error_diagnostics_collapses_to_that_error_verbatim() {
		let collapsed = Error::from(compile_diags(&["solo"]));
		assert_eq!(collapsed.to_string(), Error::compile("solo").to_string());
	}
}