Skip to main content

zeph_orchestration/
error.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use zeph_llm::LlmError;
5use zeph_subagent::SubAgentError;
6
7use super::lineage::LineageEntry;
8
9/// All error variants produced by the orchestration subsystem.
10///
11/// The enum is `#[non_exhaustive]` — callers in other crates that match on it must include a
12/// `_ => …` arm, which also keeps them robust against future additions.
13///
14/// # Fail-open policy
15///
16/// LLM-backed steps (verification, replan) are always fail-open: on failure
17/// they log a warning and continue rather than returning an error. Only
18/// structural invariant violations and hard configuration errors propagate as
19/// `Err`.
20///
21/// # Examples
22///
23/// ```rust
24/// use zeph_orchestration::OrchestrationError;
25///
26/// fn describe(err: &OrchestrationError) -> &'static str {
27///     match err {
28///         OrchestrationError::CycleDetected => "graph has a cycle",
29///         OrchestrationError::Disabled => "orchestration is off",
30///         _ => "other orchestration error",
31///     }
32/// }
33///
34/// let err = OrchestrationError::CycleDetected;
35/// assert_eq!(describe(&err), "graph has a cycle");
36/// ```
37#[derive(Debug, thiserror::Error)]
38#[non_exhaustive]
39pub enum OrchestrationError {
40    /// Orchestration is disabled in configuration.
41    #[error("orchestration is disabled")]
42    Disabled,
43
44    /// The LLM planner failed to produce a valid task graph.
45    #[error("planning failed: {0}")]
46    PlanningFailed(String),
47
48    /// The task graph structure is invalid (e.g. wrong task-id invariant, bad reference).
49    #[error("invalid graph: {0}")]
50    InvalidGraph(String),
51
52    /// A cycle was detected during topological sort of the task graph.
53    #[error("cycle detected in task graph")]
54    CycleDetected,
55
56    /// A `TaskId` or task title lookup yielded no result.
57    #[error("task not found: {0}")]
58    TaskNotFound(String),
59
60    /// No agent in the available pool can be routed to a task.
61    #[error("no agent available for task: {0}")]
62    NoAgentAvailable(String),
63
64    /// A `GraphId` could not be found in persistence.
65    #[error("graph not found: {0}")]
66    GraphNotFound(String),
67
68    /// An internal scheduler invariant was violated.
69    #[error("scheduler error: {0}")]
70    Scheduler(String),
71
72    /// Result aggregation failed and the fallback path also failed.
73    #[error("aggregation failed: {0}")]
74    AggregationFailed(String),
75
76    /// A database read/write or serialization error in graph persistence.
77    #[error("persistence error: {0}")]
78    Persistence(String),
79
80    /// A task exceeded its per-task wall-clock timeout.
81    #[error("task timed out: {0}")]
82    TaskTimeout(String),
83
84    /// The scheduler or a task was canceled by the caller.
85    #[error("canceled")]
86    Canceled,
87
88    /// A `/plan` CLI command could not be parsed.
89    #[error("invalid command: {0}")]
90    InvalidCommand(String),
91
92    /// Hard invariant violation during verification (e.g. cycle detected after `inject_tasks`).
93    ///
94    /// Never used for LLM call failures — those are fail-open and only log a warning.
95    #[error("verification failed: {0}")]
96    VerificationFailed(String),
97
98    /// A required configuration value is missing or out of range.
99    #[error("invalid configuration: {0}")]
100    InvalidConfig(String),
101
102    /// Propagated error from a sub-agent execution; `#[from]` provides `From<SubAgentError>`.
103    #[error(transparent)]
104    SubAgent(#[from] SubAgentError),
105
106    /// An LLM provider call failed in a context with no more specific variant.
107    ///
108    /// Existing call sites that map `LlmError` to `PlanningFailed` or other semantic
109    /// variants are intentional and should not be changed. This variant is for new code
110    /// and for callers in `zeph-core` that propagate raw LLM errors.
111    #[error("orchestration LLM call failed: {0}")]
112    Llm(#[from] LlmError),
113
114    /// A `VerifyPredicate::Expression` was encountered; only `Natural` is
115    /// supported in v1.
116    #[error("predicate type not supported: {0}")]
117    PredicateNotSupported(String),
118
119    /// Predicate remediation could not be injected because the replan budget is exhausted.
120    #[error("replan budget exhausted for task {task_id}: {reason}")]
121    ReplanBudgetExhausted {
122        /// Task that triggered remediation.
123        task_id: String,
124        /// Human-readable reason (e.g. "predicate remediation").
125        reason: String,
126    },
127
128    /// The DAG was aborted because a consecutive error chain in a `depends_on` path
129    /// (or a region fan-out failure rate) exceeded the configured threshold.
130    ///
131    /// `chain_depth` in the display is `chain.len()` for quick log scanning; the full
132    /// [`LineageEntry`] list is emitted to the structured audit log.
133    #[error("cascade abort: root={root:?}, chain_depth={}", chain.len())]
134    CascadeAborted {
135        /// Root task ID where the failure chain began.
136        root: super::graph::TaskId,
137        /// Full lineage chain at the time of abort; earliest entry first.
138        chain: Vec<LineageEntry>,
139    },
140}