Skip to main content

zeph_orchestration/
error.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use zeph_subagent::SubAgentError;
5
6use super::lineage::LineageEntry;
7
8/// All error variants produced by the orchestration subsystem.
9///
10/// Variants are exhaustive — callers that match on this type should use a
11/// `_ => …` arm to stay robust against future additions.
12///
13/// # Fail-open policy
14///
15/// LLM-backed steps (verification, replan) are always fail-open: on failure
16/// they log a warning and continue rather than returning an error. Only
17/// structural invariant violations and hard configuration errors propagate as
18/// `Err`.
19///
20/// # Examples
21///
22/// ```rust
23/// use zeph_orchestration::OrchestrationError;
24///
25/// fn describe(err: &OrchestrationError) -> &'static str {
26///     match err {
27///         OrchestrationError::CycleDetected => "graph has a cycle",
28///         OrchestrationError::Disabled => "orchestration is off",
29///         _ => "other orchestration error",
30///     }
31/// }
32///
33/// let err = OrchestrationError::CycleDetected;
34/// assert_eq!(describe(&err), "graph has a cycle");
35/// ```
36#[derive(Debug, thiserror::Error)]
37pub enum OrchestrationError {
38    /// Orchestration is disabled in configuration.
39    #[error("orchestration is disabled")]
40    Disabled,
41
42    /// The LLM planner failed to produce a valid task graph.
43    #[error("planning failed: {0}")]
44    PlanningFailed(String),
45
46    /// The task graph structure is invalid (e.g. wrong task-id invariant, bad reference).
47    #[error("invalid graph: {0}")]
48    InvalidGraph(String),
49
50    /// A cycle was detected during topological sort of the task graph.
51    #[error("cycle detected in task graph")]
52    CycleDetected,
53
54    /// A `TaskId` or task title lookup yielded no result.
55    #[error("task not found: {0}")]
56    TaskNotFound(String),
57
58    /// No agent in the available pool can be routed to a task.
59    #[error("no agent available for task: {0}")]
60    NoAgentAvailable(String),
61
62    /// A `GraphId` could not be found in persistence.
63    #[error("graph not found: {0}")]
64    GraphNotFound(String),
65
66    /// An internal scheduler invariant was violated.
67    #[error("scheduler error: {0}")]
68    Scheduler(String),
69
70    /// Result aggregation failed and the fallback path also failed.
71    #[error("aggregation failed: {0}")]
72    AggregationFailed(String),
73
74    /// A database read/write or serialization error in graph persistence.
75    #[error("persistence error: {0}")]
76    Persistence(String),
77
78    /// A task exceeded its per-task wall-clock timeout.
79    #[error("task timed out: {0}")]
80    TaskTimeout(String),
81
82    /// The scheduler or a task was canceled by the caller.
83    #[error("canceled")]
84    Canceled,
85
86    /// A `/plan` CLI command could not be parsed.
87    #[error("invalid command: {0}")]
88    InvalidCommand(String),
89
90    /// Hard invariant violation during verification (e.g. cycle detected after `inject_tasks`).
91    ///
92    /// Never used for LLM call failures — those are fail-open and only log a warning.
93    #[error("verification failed: {0}")]
94    VerificationFailed(String),
95
96    /// A required configuration value is missing or out of range.
97    #[error("invalid configuration: {0}")]
98    InvalidConfig(String),
99
100    /// Propagated error from a sub-agent execution.
101    #[error(transparent)]
102    SubAgent(#[from] SubAgentError),
103
104    /// A `VerifyPredicate::Expression` was encountered; only `Natural` is
105    /// supported in v1.
106    #[error("predicate type not supported: {0}")]
107    PredicateNotSupported(String),
108
109    /// Predicate remediation could not be injected because the replan budget is exhausted.
110    #[error("replan budget exhausted for task {task_id}: {reason}")]
111    ReplanBudgetExhausted {
112        /// Task that triggered remediation.
113        task_id: String,
114        /// Human-readable reason (e.g. "predicate remediation").
115        reason: String,
116    },
117
118    /// The DAG was aborted because a consecutive error chain in a `depends_on` path
119    /// (or a region fan-out failure rate) exceeded the configured threshold.
120    ///
121    /// `chain_depth` in the display is `chain.len()` for quick log scanning; the full
122    /// [`LineageEntry`] list is emitted to the structured audit log.
123    #[error("cascade abort: root={root:?}, chain_depth={}", chain.len())]
124    CascadeAborted {
125        /// Root task ID where the failure chain began.
126        root: super::graph::TaskId,
127        /// Full lineage chain at the time of abort; earliest entry first.
128        chain: Vec<LineageEntry>,
129    },
130}