Skip to main content

awaken_server/
mailbox.rs

1//! Mailbox service: unified persistent run queue.
2//!
3//! Every run request (streaming, background, A2A, internal) enters as a
4//! [`RunDispatch`] keyed by `thread_id`. The Mailbox orchestrates persistent
5//! enqueue, lease-based claim, execution via [`RunDispatchExecutor`], and lifecycle
6//! management (lease renewal, sweep, GC).
7
8use std::collections::HashMap;
9use std::sync::Arc;
10use std::sync::Mutex as StdMutex;
11use std::sync::atomic::{AtomicBool, Ordering};
12use std::time::{Duration, Instant};
13
14use async_trait::async_trait;
15use parking_lot::Mutex as SyncMutex;
16use thiserror::Error;
17use tokio::sync::{Mutex, RwLock, Semaphore, mpsc};
18use tokio::task::{JoinHandle, JoinSet};
19
20use awaken_contract::contract::event::AgentEvent;
21use awaken_contract::contract::event_sink::EventSink;
22use awaken_contract::contract::lifecycle::{RunStatus, TerminationReason};
23use awaken_contract::contract::mailbox::{
24    DispatchSignalEntry, LiveDeliveryOutcome, LiveRunCommand, LiveRunTarget, MailboxInterrupt,
25    MailboxInterruptDetails, MailboxStore, RunDispatch, RunDispatchResult, RunDispatchStatus,
26};
27use awaken_contract::contract::message::Message;
28use awaken_contract::contract::storage::{
29    MessageSeqRange, RunMessageInput, RunRecord, RunRequestSnapshot, RunResumeDecision,
30    StorageError, ThreadRunStore,
31};
32use awaken_contract::contract::suspension::{ToolCallOutcome, ToolCallResume};
33use awaken_contract::contract::tool_intercept::{AdapterKind, RunMode};
34use awaken_contract::now_ms;
35use awaken_runtime::loop_runner::{AgentLoopError, AgentRunResult};
36use awaken_runtime::{AgentRuntime, RunRequest, ThreadContextSnapshot};
37
38use crate::transport::channel_sink::ReconnectableEventSink;
39
/// Guard window for inline-claimed dispatches: if the process crashes between
/// enqueue and claim, the sweep will reclaim the dispatch after this period.
const INLINE_CLAIM_GUARD_MS: u64 = 60_000;

// Remote-cancel timing, in milliseconds. Test builds use a much shorter wait
// budget (250 ms instead of 5 s).
#[cfg(test)]
const REMOTE_CANCEL_WAIT_MS: u64 = 250;
#[cfg(not(test))]
const REMOTE_CANCEL_WAIT_MS: u64 = 5_000;
const REMOTE_CANCEL_POLL_MS: u64 = 25;

// Dispatch-signal tuning: each default is listed next to the environment
// variable name that shares its prefix.
// NOTE(review): the code that reads these values is outside this excerpt, so
// the default/env pairing below is inferred from naming only — confirm there.
const DISPATCH_SIGNAL_BATCH_DEFAULT: usize = 32;
const DISPATCH_SIGNAL_BATCH_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_BATCH_SIZE";

const DISPATCH_SIGNAL_EXPIRES_DEFAULT: Duration = Duration::from_millis(500);
const DISPATCH_SIGNAL_EXPIRES_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_FETCH_EXPIRES_MS";

const DISPATCH_SIGNAL_BLOCKED_NACK_BASE_DELAY_DEFAULT: Duration = Duration::from_millis(500);
const DISPATCH_SIGNAL_NACK_BASE_DELAY_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_NACK_BASE_DELAY_MS";

const DISPATCH_SIGNAL_BLOCKED_NACK_MAX_DELAY_DEFAULT: Duration = Duration::from_secs(30);
const DISPATCH_SIGNAL_NACK_MAX_DELAY_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_NACK_MAX_DELAY_MS";

const DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_DEFAULT: usize = 32;
const DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_ENV: &str =
    "AWAKEN_DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS";

const DISPATCH_SIGNAL_ERROR_DELAY: Duration = Duration::from_millis(250);

/// Batch size constant for terminal reconciliation (consumer not shown here).
const TERMINAL_RECONCILE_BATCH: usize = 100;
/// Dispatch statuses whose store counts are exported as depth gauges.
///
/// `Mailbox::refresh_dispatch_depth_metrics` walks this array in order and
/// stops at the first status whose count cannot be read, so the order decides
/// which gauges are refreshed before such a failure.
const MAILBOX_DEPTH_STATUSES: [RunDispatchStatus; 6] = [
    RunDispatchStatus::Queued,
    RunDispatchStatus::Claimed,
    RunDispatchStatus::Acked,
    RunDispatchStatus::Cancelled,
    RunDispatchStatus::Superseded,
    RunDispatchStatus::DeadLetter,
];

/// Validation message returned when an inline submit loses the active-run race.
///
/// `Mailbox::submit` wraps it in `MailboxError::Validation` when the active
/// run on the thread could not be cancelled.
pub(crate) const ACTIVE_RUN_CONFLICT_MESSAGE: &str =
    "thread has an active run; cannot claim inline";
73
74// ── RunRequest ↔ RunDispatch conversion ───────────────────────────────
75
76/// Typed envelope for RunRequest fields that Mailbox stores opaquely.
77/// Centralizes the RunRequest → RunDispatch → RunRequest round-trip.
78#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
79struct RunRequestExtras {
80    #[serde(default, skip_serializing_if = "Option::is_none")]
81    overrides: Option<awaken_contract::contract::inference::InferenceOverride>,
82    #[serde(default, skip_serializing_if = "Vec::is_empty")]
83    decisions: Vec<(
84        String,
85        awaken_contract::contract::suspension::ToolCallResume,
86    )>,
87    #[serde(default, skip_serializing_if = "Vec::is_empty")]
88    frontend_tools: Vec<awaken_contract::contract::tool::ToolDescriptor>,
89    #[serde(default, skip_serializing_if = "Option::is_none")]
90    continue_run_id: Option<String>,
91    #[serde(default, skip_serializing_if = "Option::is_none")]
92    run_id_hint: Option<String>,
93    #[serde(default, skip_serializing_if = "Option::is_none")]
94    dispatch_id_hint: Option<String>,
95    #[serde(default, skip_serializing_if = "Option::is_none")]
96    parent_thread_id: Option<String>,
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    transport_request_id: Option<String>,
99    #[serde(default)]
100    run_mode: RunMode,
101    #[serde(default)]
102    adapter: AdapterKind,
103}
104
105impl RunRequestExtras {
106    fn from_request(request: &awaken_runtime::RunRequest) -> Self {
107        Self {
108            overrides: request.overrides.clone(),
109            decisions: request.decisions.clone(),
110            frontend_tools: request.frontend_tools.clone(),
111            continue_run_id: request.continue_run_id.clone(),
112            run_id_hint: request.run_id_hint.clone(),
113            dispatch_id_hint: request.dispatch_id_hint.clone(),
114            parent_thread_id: request.parent_thread_id.clone(),
115            transport_request_id: request.transport_request_id.clone(),
116            run_mode: request.run_mode,
117            adapter: request.adapter,
118        }
119    }
120
121    fn to_value(&self) -> Result<Option<serde_json::Value>, serde_json::Error> {
122        if self.overrides.is_none()
123            && self.decisions.is_empty()
124            && self.frontend_tools.is_empty()
125            && self.continue_run_id.is_none()
126            && self.run_id_hint.is_none()
127            && self.dispatch_id_hint.is_none()
128            && self.parent_thread_id.is_none()
129            && self.transport_request_id.is_none()
130            && self.run_mode == RunMode::Foreground
131            && self.adapter == AdapterKind::Internal
132        {
133            Ok(None)
134        } else {
135            serde_json::to_value(self).map(Some)
136        }
137    }
138
139    fn from_value(value: &serde_json::Value) -> Result<Self, serde_json::Error> {
140        serde_json::from_value(value.clone())
141    }
142
143    fn apply_to(self, mut request: awaken_runtime::RunRequest) -> awaken_runtime::RunRequest {
144        if let Some(ov) = self.overrides {
145            request = request.with_overrides(ov);
146        }
147        if !self.decisions.is_empty() {
148            request = request.with_decisions(self.decisions);
149        }
150        if !self.frontend_tools.is_empty() {
151            request = request.with_frontend_tools(self.frontend_tools);
152        }
153        if let Some(crid) = self.continue_run_id {
154            request = request.with_continue_run_id(crid);
155        }
156        if let Some(run_id_hint) = self.run_id_hint {
157            request = request.with_run_id_hint(run_id_hint);
158        }
159        if let Some(dispatch_id_hint) = self.dispatch_id_hint {
160            request = request.with_dispatch_id_hint(dispatch_id_hint);
161        }
162        if let Some(parent_thread_id) = self.parent_thread_id {
163            request = request.with_parent_thread_id(parent_thread_id);
164        }
165        if let Some(transport_request_id) = self.transport_request_id {
166            request = request.with_transport_request_id(transport_request_id);
167        }
168        request
169            .with_run_mode(self.run_mode)
170            .with_adapter(self.adapter)
171    }
172}
173
174// ── TaskDoneMailboxNotify ────────────────────────────────────────────
175
176/// Fallback for inbox delivery when the agent's run has ended.
177///
178/// Implements [`OnInboxClosed`](awaken_runtime::inbox::OnInboxClosed) — when an `InboxSender::send()` fails
179/// because the receiver was dropped (agent run returned with AwaitingTasks),
180/// this enqueues a mailbox wake dispatch so the thread gets a continuation run.
181pub struct TaskDoneMailboxNotify {
182    mailbox: Arc<Mailbox>,
183    thread_id: String,
184    continue_run_id: Option<String>,
185}
186
187impl TaskDoneMailboxNotify {
188    pub fn new(mailbox: Arc<Mailbox>, thread_id: String, continue_run_id: Option<String>) -> Self {
189        Self {
190            mailbox,
191            thread_id,
192            continue_run_id,
193        }
194    }
195}
196
197impl awaken_runtime::inbox::OnInboxClosed for TaskDoneMailboxNotify {
198    fn closed(&self, message: &serde_json::Value) {
199        let mailbox = self.mailbox.clone();
200        let thread_id = self.thread_id.clone();
201        let continue_run_id = self.continue_run_id.clone();
202        let wake_message = awaken_runtime::inbox::inbox_event_message(message);
203
204        // Spawn because OnInboxClosed::closed is sync but enqueue+dispatch is async
205        tokio::spawn(async move {
206            let mut request = RunRequest::new(thread_id.clone(), vec![wake_message])
207                .with_origin(awaken_contract::contract::storage::RunRequestOrigin::Internal)
208                .with_run_mode(RunMode::InternalWake)
209                .with_adapter(AdapterKind::Internal);
210            if let Some(run_id) = continue_run_id {
211                request = request.with_continue_run_id(run_id);
212            }
213            if let Err(e) = mailbox.submit_background(request).await {
214                tracing::warn!(thread_id, error = %e, "failed to enqueue background task wake dispatch");
215            }
216        });
217    }
218}
219
220// ── Public types ─────────────────────────────────────────────────────
221
/// Result returned by submit/submit_background.
///
/// Identifies the persisted dispatch and the run it activates, plus whether
/// that run started immediately or is waiting in the queue.
#[derive(Debug, Clone)]
pub struct MailboxSubmitResult {
    /// Id of the dispatch record.
    pub dispatch_id: String,
    /// Id of the run associated with the dispatch.
    pub run_id: String,
    /// Thread the run belongs to.
    pub thread_id: String,
    /// Whether the dispatch is executing now or queued.
    pub status: MailboxDispatchStatus,
}

/// Dispatch status for a submitted run activation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MailboxDispatchStatus {
    /// Job was claimed and is executing now.
    Running,
    /// Job is queued, waiting for the current run to finish.
    Queued,
}
239
/// Mailbox service errors.
#[derive(Debug, Error)]
pub enum MailboxError {
    /// The request was rejected; the payload is the human-readable reason
    /// (for example `ACTIVE_RUN_CONFLICT_MESSAGE`).
    #[error("validation error: {0}")]
    Validation(String),
    /// A storage operation failed. Converted automatically from
    /// `StorageError`, so `?` works on store calls.
    #[error("store error: {0}")]
    Store(#[from] StorageError),
    /// An internal failure, e.g. a lifecycle task that panicked.
    #[error("internal error: {0}")]
    Internal(String),
}
250
/// How a finished runtime run is classified by the mailbox.
#[derive(Debug)]
pub enum MailboxRunOutcome {
    /// Run completed successfully.
    Completed,
    /// Transient infrastructure failure -- retry. Carries the error text.
    TransientError(String),
    /// Permanent failure -- do not retry. Carries the error text.
    PermanentError(String),
}

impl MailboxRunOutcome {
    /// Stable label for this outcome, suitable as a metric dimension; the
    /// error text carried by the failure variants is ignored.
    fn metric_label(&self) -> &'static str {
        match *self {
            MailboxRunOutcome::Completed => "completed",
            MailboxRunOutcome::TransientError(..) => "transient_error",
            MailboxRunOutcome::PermanentError(..) => "permanent_error",
        }
    }
}
271
/// Execution boundary used by mailbox dispatch.
///
/// Mailbox owns delivery, leasing, retry, and recovery. The executor behind
/// this trait owns actual run execution and live-run control. It intentionally
/// does not expose storage so mailbox scheduling stays orthogonal to the main
/// runtime implementation.
///
/// Implementors must provide `run`, `cancel`, `cancel_and_wait_by_thread` and
/// `send_decision`; `run_with_thread_context` and `send_messages` have
/// conservative defaults.
#[async_trait]
pub trait RunDispatchExecutor: Send + Sync {
    /// Execute a run request and stream events into the provided sink.
    async fn run(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
    ) -> Result<AgentRunResult, AgentLoopError>;

    /// Execute a run with an optional mailbox-provided thread cache.
    ///
    /// The default implementation discards `thread_ctx` and delegates to
    /// [`run`](Self::run); override it to make use of the cached context.
    async fn run_with_thread_context(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
        thread_ctx: Option<ThreadContextSnapshot>,
    ) -> Result<AgentRunResult, AgentLoopError> {
        // Explicitly mark the cache as unused by the default implementation.
        let _ = thread_ctx;
        self.run(request, sink).await
    }

    /// Cancel an active run by run id or thread id.
    // NOTE(review): the bool presumably reports whether a run was found and
    // signalled — confirm against the implementor.
    fn cancel(&self, id: &str) -> bool;

    /// Cancel an active run by thread id and wait for it to unregister.
    ///
    /// `Mailbox::submit` treats a `false` return as "the thread still has an
    /// active run" and rejects the new submission.
    async fn cancel_and_wait_by_thread(&self, thread_id: &str) -> bool;

    /// Forward one human/tool decision to an active run.
    fn send_decision(&self, id: &str, tool_call_id: String, resume: ToolCallResume) -> bool;

    /// Forward direct input messages to an active run.
    ///
    /// The default implementation drops the messages and returns `false`.
    fn send_messages(&self, id: &str, messages: Vec<Message>) -> bool {
        let _ = (id, messages);
        false
    }
}
313
/// Adapter that lets the concrete [`AgentRuntime`] serve as the mailbox's
/// execution boundary.
///
/// Every method forwards to the same-named `AgentRuntime` function through an
/// explicit `AgentRuntime::…(self, …)` path.
// NOTE(review): the explicit paths presumably select the runtime's inherent
// methods rather than re-entering this trait impl — keep them fully
// qualified when editing.
#[async_trait]
impl RunDispatchExecutor for AgentRuntime {
    /// Forwards to `AgentRuntime::run`.
    async fn run(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
    ) -> Result<AgentRunResult, AgentLoopError> {
        AgentRuntime::run(self, request, sink).await
    }

    /// Forwards to `AgentRuntime::run_with_thread_context`, passing the
    /// mailbox-provided thread cache through instead of discarding it.
    async fn run_with_thread_context(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
        thread_ctx: Option<ThreadContextSnapshot>,
    ) -> Result<AgentRunResult, AgentLoopError> {
        AgentRuntime::run_with_thread_context(self, request, sink, thread_ctx).await
    }

    /// Forwards to `AgentRuntime::cancel`.
    fn cancel(&self, id: &str) -> bool {
        AgentRuntime::cancel(self, id)
    }

    /// Forwards to `AgentRuntime::cancel_and_wait_by_thread`.
    async fn cancel_and_wait_by_thread(&self, thread_id: &str) -> bool {
        AgentRuntime::cancel_and_wait_by_thread(self, thread_id).await
    }

    /// Forwards to `AgentRuntime::send_decision`.
    fn send_decision(&self, id: &str, tool_call_id: String, resume: ToolCallResume) -> bool {
        AgentRuntime::send_decision(self, id, tool_call_id, resume)
    }

    /// Forwards to `AgentRuntime::send_messages`, replacing the trait's
    /// drop-and-return-`false` default.
    fn send_messages(&self, id: &str, messages: Vec<Message>) -> bool {
        AgentRuntime::send_messages(self, id, messages)
    }
}
349
/// Tunables for the Mailbox service: lease timing, background maintenance
/// cadence, and retry policy.
#[derive(Debug, Clone)]
pub struct MailboxConfig {
    /// Lease duration in milliseconds (default 30_000).
    pub lease_ms: u64,
    /// Lease duration in milliseconds when the run is suspended/waiting
    /// for human input (default 600_000 = 10 minutes).
    pub suspended_lease_ms: u64,
    /// How often to renew leases (default 10s).
    pub lease_renewal_interval: Duration,
    /// How often to sweep for expired leases (default 30s).
    pub sweep_interval: Duration,
    /// How often to run GC for terminal dispatches (default 60s).
    pub gc_interval: Duration,
    /// How long to keep terminal dispatches before purging (default 24h).
    pub gc_ttl: Duration,
    /// Default max attempts before dead-lettering (default 5).
    pub default_max_attempts: u32,
    /// Default retry delay in milliseconds (default 250).
    pub default_retry_delay_ms: u64,
    /// Maximum retry delay in milliseconds for exponential backoff (default 30_000).
    pub max_retry_delay_ms: u64,
}

impl Default for MailboxConfig {
    /// Defaults as documented on each field.
    fn default() -> Self {
        const SECS_PER_DAY: u64 = 24 * 60 * 60;
        const TEN_MINUTES_MS: u64 = 10 * 60 * 1_000;

        MailboxConfig {
            // Leases.
            lease_ms: 30_000,
            suspended_lease_ms: TEN_MINUTES_MS,
            lease_renewal_interval: Duration::from_secs(10),
            // Background maintenance.
            sweep_interval: Duration::from_secs(30),
            gc_interval: Duration::from_secs(60),
            gc_ttl: Duration::from_secs(SECS_PER_DAY),
            // Retry policy.
            default_max_attempts: 5,
            default_retry_delay_ms: 250,
            max_retry_delay_ms: 30_000,
        }
    }
}
389
/// Callback invoked during mailbox maintenance GC ticks.
///
/// A shared, thread-safe closure taking no arguments and returning nothing;
/// supplied through `MailboxLifecycleConfig::maintenance_callback`.
pub type MailboxMaintenanceCallback = Arc<dyn Fn() + Send + Sync + 'static>;
392
/// Startup recovery retry settings used by lifecycle startup.
///
/// Derives `Debug` so the settings can be logged and inspected like the
/// sibling `MailboxConfig`.
#[derive(Debug, Clone)]
pub struct MailboxStartupRecoveryConfig {
    /// Maximum recovery attempts before giving up. Values below 1 are treated
    /// as one attempt.
    pub max_attempts: u32,
    /// Delay between failed recovery attempts.
    pub retry_delay: Duration,
}

impl Default for MailboxStartupRecoveryConfig {
    /// One attempt, with a 250 ms delay between failed attempts.
    fn default() -> Self {
        Self {
            max_attempts: 1,
            retry_delay: Duration::from_millis(250),
        }
    }
}
411
412/// Configuration for framework-managed mailbox lifecycle tasks.
413#[derive(Clone)]
414pub struct MailboxLifecycleConfig {
415    /// Delay before startup recovery and maintenance begin.
416    pub startup_delay: Duration,
417    /// Retry policy for startup recovery.
418    pub startup_recovery: MailboxStartupRecoveryConfig,
419    /// Optional cleanup hook for application-owned resources.
420    pub maintenance_callback: Option<MailboxMaintenanceCallback>,
421}
422
423impl Default for MailboxLifecycleConfig {
424    fn default() -> Self {
425        Self {
426            startup_delay: Duration::ZERO,
427            startup_recovery: MailboxStartupRecoveryConfig::default(),
428            maintenance_callback: None,
429        }
430    }
431}
432
/// Handle for framework-managed mailbox lifecycle tasks.
///
/// Dropping the handle does not stop lifecycle tasks. Call [`shutdown`](Self::shutdown)
/// for quiescent shutdown or [`abort`](Self::abort) for fire-and-forget stop.
///
/// The handle is cheap to clone: clones share the same task slot and lock.
#[derive(Clone)]
pub struct MailboxLifecycleHandle {
    /// Shared slot holding the running lifecycle tasks; `None` once they have
    /// been taken by `abort`/`shutdown`.
    tasks: Arc<StdMutex<Option<MailboxLifecycleTasks>>>,
    /// Async lock held for the whole duration of `shutdown`.
    // NOTE(review): presumably the same lock a lifecycle start acquires, so
    // start and shutdown cannot interleave — confirm at the construction site.
    transition_lock: Arc<Mutex<()>>,
}
442
443impl MailboxLifecycleHandle {
444    /// Abort lifecycle tasks. This is idempotent.
445    pub fn abort(&self) {
446        if let Some(tasks) = self.tasks.lock().expect("lifecycle lock poisoned").take() {
447            tasks.abort();
448        }
449    }
450
451    /// Abort lifecycle tasks and wait until they have fully exited.
452    ///
453    /// This is the quiescent shutdown path. Use it when a caller needs a hard
454    /// guarantee that a subsequent lifecycle start cannot overlap old recovery
455    /// or maintenance tasks.
456    pub async fn shutdown(&self) -> Result<(), MailboxError> {
457        let _transition_guard = self.transition_lock.lock().await;
458        let tasks = self.tasks.lock().expect("lifecycle lock poisoned").take();
459        if let Some(tasks) = tasks {
460            tasks.shutdown().await?;
461        }
462        Ok(())
463    }
464
465    /// Returns true while lifecycle tasks are registered for this mailbox.
466    pub fn is_running(&self) -> bool {
467        self.tasks
468            .lock()
469            .expect("lifecycle lock poisoned")
470            .is_some()
471    }
472}
473
/// Join handles of the background tasks that make up one mailbox lifecycle.
struct MailboxLifecycleTasks {
    /// Startup recovery task, when one was spawned.
    recover_task: Option<JoinHandle<()>>,
    /// Dispatch signal loop task, when one was spawned.
    dispatch_signal_task: Option<JoinHandle<()>>,
    /// Maintenance task; always present.
    maintenance_task: JoinHandle<()>,
}
479
480impl MailboxLifecycleTasks {
481    fn abort(self) {
482        if let Some(task) = self.recover_task {
483            task.abort();
484        }
485        if let Some(task) = self.dispatch_signal_task {
486            task.abort();
487        }
488        self.maintenance_task.abort();
489    }
490
491    async fn shutdown(self) -> Result<(), MailboxError> {
492        if let Some(task) = self.recover_task {
493            task.abort();
494            await_lifecycle_task("mailbox startup recovery", task).await?;
495        }
496        if let Some(task) = self.dispatch_signal_task {
497            task.abort();
498            await_lifecycle_task("mailbox dispatch signal loop", task).await?;
499        }
500        self.maintenance_task.abort();
501        await_lifecycle_task("mailbox maintenance", self.maintenance_task).await
502    }
503}
504
505async fn await_lifecycle_task(name: &str, task: JoinHandle<()>) -> Result<(), MailboxError> {
506    match task.await {
507        Ok(()) => Ok(()),
508        Err(error) if error.is_cancelled() => Ok(()),
509        Err(error) if error.is_panic() => Err(MailboxError::Internal(format!("{name} panicked"))),
510        Err(error) => Err(MailboxError::Internal(format!("{name} failed: {error}"))),
511    }
512}
513
514// ── Internal types ───────────────────────────────────────────────────
515
/// Per-thread worker status.
enum MailboxWorkerStatus {
    /// No dispatch is being claimed or executed for the thread.
    Idle,
    /// Transitional: claim in progress. Prevents TOCTOU race where two
    /// concurrent dispatches both see Idle and both try to claim.
    Claiming,
    /// A dispatch has been claimed and its run is executing.
    Running {
        /// Id of the claimed dispatch.
        dispatch_id: String,
        /// Id of the run being executed.
        run_id: String,
        /// Handle of a background task tied to the lease.
        // NOTE(review): presumably the task returned by
        // `spawn_lease_renewal` — confirm where this variant is built.
        lease_handle: JoinHandle<()>,
        /// Reconnectable event sink attached to the run.
        sink: Arc<ReconnectableEventSink>,
    },
}
529
/// Result of one attempt to claim and start a dispatch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DispatchAttempt {
    Claimed,
    Busy,
    NoEligible,
    TransientError,
}

impl DispatchAttempt {
    /// `true` only for [`DispatchAttempt::Claimed`]; every other variant
    /// means no execution was started by this attempt.
    fn started_execution(self) -> bool {
        self == Self::Claimed
    }
}
543
/// Cached thread state, valid for the duration of a lease.
struct ThreadContext {
    /// Messages of the thread as last loaded or checkpointed.
    messages: Vec<Message>,
    /// Most recent run of the thread, if any.
    latest_run: Option<RunRecord>,
    /// Run records seen so far, keyed by run id.
    run_cache: HashMap<String, RunRecord>,
}
550
551impl ThreadContext {
552    async fn load(run_store: &dyn ThreadRunStore, thread_id: &str) -> Result<Self, MailboxError> {
553        let messages = run_store
554            .load_messages(thread_id)
555            .await?
556            .unwrap_or_default();
557        let latest_run = run_store.latest_run(thread_id).await?;
558        let mut run_cache = HashMap::new();
559        if let Some(ref run) = latest_run {
560            run_cache.insert(run.run_id.clone(), run.clone());
561        }
562        Ok(Self {
563            messages,
564            latest_run,
565            run_cache,
566        })
567    }
568
569    fn get_run(&self, run_id: &str) -> Option<&RunRecord> {
570        self.run_cache.get(run_id)
571    }
572
573    fn apply_checkpoint(&mut self, messages: &[Message], run: &RunRecord) {
574        self.messages = messages.to_vec();
575        self.latest_run = Some(run.clone());
576        self.run_cache.insert(run.run_id.clone(), run.clone());
577    }
578}
579
/// Per-thread worker. Store is the sole queue authority.
///
/// Holds only in-process state: the worker's current status and an optional
/// cached thread context.
struct MailboxWorker {
    /// What the worker is currently doing.
    status: MailboxWorkerStatus,
    /// Cached thread state; `None` when nothing has been loaded.
    thread_ctx: Option<ThreadContext>,
}

impl Default for MailboxWorker {
    /// A fresh worker is idle and has no cached thread context.
    fn default() -> Self {
        Self {
            status: MailboxWorkerStatus::Idle,
            thread_ctx: None,
        }
    }
}
594
595// ── Suspension-aware event sink ──────────────────────────────────────
596
/// Wraps an inner `EventSink` and sets a shared flag when the run
/// enters a suspended (waiting) state, detected by a `ToolCallDone`
/// event with `ToolCallOutcome::Suspended`.
///
/// The flag is cleared again on a `ToolCallResumed` event. Every event is
/// forwarded to the inner sink unchanged.
struct SuspensionAwareSink {
    /// Sink that receives every event after the flag has been updated.
    inner: Arc<dyn EventSink>,
    /// Shared "run is currently suspended" flag.
    suspended: Arc<AtomicBool>,
}
604
605#[async_trait]
606impl EventSink for SuspensionAwareSink {
607    async fn emit(&self, event: AgentEvent) {
608        if matches!(
609            &event,
610            AgentEvent::ToolCallDone {
611                outcome: ToolCallOutcome::Suspended,
612                ..
613            }
614        ) {
615            self.suspended.store(true, Ordering::Release);
616        }
617        // Reset the flag when the run resumes from suspension.
618        if matches!(&event, AgentEvent::ToolCallResumed { .. }) {
619            self.suspended.store(false, Ordering::Release);
620        }
621        self.inner.emit(event).await;
622    }
623
624    async fn close(&self) {
625        self.inner.close().await;
626    }
627}
628
/// RAII guard that decrements the active-runs gauge on drop.
///
/// Creating the guard does not touch the gauge.
// NOTE(review): the matching increment is expected at the creation site,
// outside this excerpt — confirm there.
struct ActiveRunGuard;

impl Drop for ActiveRunGuard {
    /// Decrements the gauge exactly once, however the owning scope exits.
    fn drop(&mut self) {
        crate::metrics::dec_active_runs();
    }
}
637
638// ── Mailbox service ──────────────────────────────────────────────────
639
/// Unified persistent run queue.
///
/// Orchestrates `MailboxStore` (dispatch persistence) + `ThreadRunStore`
/// (run/message truth) + `RunDispatchExecutor` (execution)
/// with lease-based distributed claim, per-thread serialization, sweep,
/// and garbage collection.
pub struct Mailbox {
    /// Execution boundary that runs and controls agent runs.
    executor: Arc<dyn RunDispatchExecutor>,
    /// Dispatch persistence.
    store: Arc<dyn MailboxStore>,
    /// Run and message persistence.
    run_store: Arc<dyn ThreadRunStore>,
    /// Identity passed to the store when this process claims a dispatch.
    consumer_id: String,
    /// In-process worker state.
    // NOTE(review): presumably keyed by thread id — confirm in
    // `get_or_create_worker`.
    workers: RwLock<HashMap<String, Arc<SyncMutex<MailboxWorker>>>>,
    /// Lease, maintenance and retry settings.
    config: MailboxConfig,
    /// Slot for the currently running lifecycle tasks, if any.
    lifecycle_tasks: Arc<StdMutex<Option<MailboxLifecycleTasks>>>,
    /// Async lock related to lifecycle start.
    // NOTE(review): presumably serializes lifecycle start/shutdown
    // transitions — confirm at its use sites.
    lifecycle_start_lock: Arc<Mutex<()>>,
}
656
657impl Mailbox {
658    /// Create a new Mailbox service.
659    pub fn new<R>(
660        executor: Arc<R>,
661        store: Arc<dyn MailboxStore>,
662        run_store: Arc<dyn ThreadRunStore>,
663        consumer_id: String,
664        config: MailboxConfig,
665    ) -> Self
666    where
667        R: RunDispatchExecutor + 'static,
668    {
669        Self::new_with_executor(executor, store, run_store, consumer_id, config)
670    }
671
672    /// Create a Mailbox service from an already-erased execution boundary.
673    pub fn new_with_executor(
674        executor: Arc<dyn RunDispatchExecutor>,
675        store: Arc<dyn MailboxStore>,
676        run_store: Arc<dyn ThreadRunStore>,
677        consumer_id: String,
678        config: MailboxConfig,
679    ) -> Self {
680        Self {
681            executor,
682            store,
683            run_store,
684            consumer_id,
685            workers: RwLock::new(HashMap::new()),
686            config,
687            lifecycle_tasks: Arc::new(StdMutex::new(None)),
688            lifecycle_start_lock: Arc::new(Mutex::new(())),
689        }
690    }
691
692    async fn refresh_dispatch_depth_metrics(&self) {
693        for status in MAILBOX_DEPTH_STATUSES {
694            match self.store.count_dispatches_by_status(status).await {
695                Ok(count) => {
696                    let depth = count as f64;
697                    crate::metrics::set_mailbox_dispatch_depth(
698                        dispatch_status_label(status),
699                        depth,
700                    );
701                    if status == RunDispatchStatus::Queued {
702                        crate::metrics::set_mailbox_queue_depth(depth);
703                    }
704                }
705                Err(error) => {
706                    tracing::debug!(
707                        status = dispatch_status_label(status),
708                        error = %error,
709                        "mailbox dispatch depth metric unavailable"
710                    );
711                    return;
712                }
713            }
714        }
715    }
716
717    async fn enqueue_dispatch_with_metrics(
718        &self,
719        dispatch: &RunDispatch,
720    ) -> Result<(), StorageError> {
721        let start = Instant::now();
722        let result = self.store.enqueue(dispatch).await;
723        record_mailbox_operation_result("enqueue", result_label(&result), start);
724        if result.is_ok() {
725            self.refresh_dispatch_depth_metrics().await;
726        }
727        result
728    }
729
730    // ── Submission ───────────────────────────────────────────────────
731
    /// Default bounded channel capacity for the runtime→SSE relay.
    ///
    /// `submit` creates its event channel with this capacity.
    const EVENT_CHANNEL_CAPACITY: usize = 256;
734
735    /// Submit a run for streaming. Returns event receiver immediately.
736    ///
737    /// The dispatch is persisted (WAL), then claimed inline by this process.
738    /// The caller wires `event_rx` to their transport (SSE, WebSocket, etc).
739    #[tracing::instrument(skip(self, request), fields(thread_id = %request.thread_id))]
740    pub async fn submit(
741        self: &Arc<Self>,
742        mut request: RunRequest,
743    ) -> Result<(MailboxSubmitResult, mpsc::Receiver<AgentEvent>), MailboxError> {
744        normalize_mailbox_run_mode(&mut request, false);
745        let (thread_id, messages) = validate_run_inputs(
746            request.thread_id.clone(),
747            request.messages.clone(),
748            !request.decisions.is_empty(),
749        )?;
750
751        // Step 1: Interrupt — bump dispatch epoch, supersede stale queued dispatches.
752        let now = now_ms();
753        let interrupt_start = Instant::now();
754        match self.store.interrupt_detailed(&thread_id, now).await {
755            Ok(interrupt) => {
756                record_mailbox_operation_result("interrupt", "ok", interrupt_start);
757                crate::metrics::inc_mailbox_operation_by(
758                    "supersede",
759                    "ok",
760                    interrupt.superseded_count as u64,
761                );
762                self.refresh_dispatch_depth_metrics().await;
763                for superseded_dispatch in &interrupt.superseded_dispatches {
764                    self.mark_superseded_dispatch_run_cancelled(
765                        superseded_dispatch,
766                        "queued dispatch superseded by foreground submit",
767                    )
768                    .await;
769                }
770                // Step 2: Cancel active runtime run if the interrupt found one.
771                if let Some(active_dispatch) = interrupt.active_dispatch.as_ref() {
772                    let cancelled = self
773                        .cancel_active_dispatch(&thread_id, active_dispatch, true)
774                        .await?;
775                    if !cancelled {
776                        return Err(MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE.into()));
777                    }
778                    tracing::info!(
779                        thread_id = %thread_id,
780                        superseded = interrupt.superseded_count,
781                        "interrupted thread for new submission"
782                    );
783                }
784            }
785            Err(e) => {
786                record_mailbox_operation_result("interrupt", "error", interrupt_start);
787                tracing::warn!(thread_id = %thread_id, error = %e, "interrupt failed, falling back to cancel");
788                if !self.executor.cancel_and_wait_by_thread(&thread_id).await {
789                    return Err(MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE.into()));
790                }
791            }
792        }
793
794        let run_id = self
795            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
796            .await?;
797        let dispatch = self.build_dispatch(&request, &thread_id)?;
798        let dispatch_id = dispatch.dispatch_id.clone();
799        let thread_id = dispatch.thread_id.clone();
800
801        // WAL: persist before anything else.
802        // Set available_at slightly in the future to prevent sweep from grabbing
803        // the dispatch during the inline claim window. If the process crashes before
804        // the claim completes, sweep will reclaim the dispatch after the guard period.
805        let mut wal_dispatch = dispatch;
806        wal_dispatch.available_at = now_ms() + INLINE_CLAIM_GUARD_MS;
807        self.enqueue_dispatch_with_metrics(&wal_dispatch).await?;
808
809        // Inline claim.
810        let now = now_ms();
811        let claim_start = Instant::now();
812        let claimed_result = self
813            .store
814            .claim_dispatch(&dispatch_id, &self.consumer_id, self.config.lease_ms, now)
815            .await;
816        let claim_result_label = match &claimed_result {
817            Ok(Some(_)) => "ok",
818            Ok(None) => "empty",
819            Err(_) => "error",
820        };
821        record_mailbox_operation_result("claim_dispatch", claim_result_label, claim_start);
822        let claimed = claimed_result?;
823        self.refresh_dispatch_depth_metrics().await;
824
825        let (event_tx, event_rx) = mpsc::channel(Self::EVENT_CHANNEL_CAPACITY);
826
827        if let Some(claimed_dispatch) = claimed {
828            let claim_token = claimed_dispatch.claim_token.clone().unwrap_or_default();
829
830            // Shared flag: set by the event sink when a tool call is suspended.
831            let suspended = Arc::new(AtomicBool::new(false));
832
833            // Start lease renewal.
834            let lease_handle = self.spawn_lease_renewal(
835                dispatch_id.clone(),
836                claim_token.clone(),
837                thread_id.clone(),
838                Arc::clone(&suspended),
839            );
840
841            // Create reconnectable sink for SSE reconnection on resume.
842            let reconnectable_sink = Arc::new(ReconnectableEventSink::new(event_tx.clone()));
843
844            // Pre-warm thread context cache.
845            let thread_ctx = match ThreadContext::load(self.run_store.as_ref(), &thread_id).await {
846                Ok(ctx) => Some(ctx),
847                Err(e) => {
848                    tracing::warn!(thread_id, error = %e, "failed to pre-warm thread context");
849                    None
850                }
851            };
852
853            // Update worker state.
854            let worker = self.get_or_create_worker(&thread_id).await;
855            {
856                let mut w = worker.lock();
857                w.thread_ctx = thread_ctx;
858                w.status = MailboxWorkerStatus::Running {
859                    dispatch_id: dispatch_id.clone(),
860                    run_id: run_id.clone(),
861                    lease_handle,
862                    sink: Arc::clone(&reconnectable_sink),
863                };
864            }
865
866            // Spawn execution.
867            self.spawn_execution(
868                claimed_dispatch,
869                event_tx.clone(),
870                reconnectable_sink,
871                claim_token,
872                thread_id.clone(),
873                suspended,
874            );
875
876            Ok((
877                MailboxSubmitResult {
878                    dispatch_id,
879                    run_id,
880                    thread_id,
881                    status: MailboxDispatchStatus::Running,
882                },
883                event_rx,
884            ))
885        } else {
886            // Inline claim failed (another claimed dispatch exists for this
887            // thread). Cancel the orphaned dispatch to prevent it from
888            // lingering with the guard available_at.
889            let now_fix = now_ms();
890            let cancel_start = Instant::now();
891            let cancel_result = self.store.cancel(&dispatch_id, now_fix).await;
892            record_mailbox_operation_result("cancel", result_label(&cancel_result), cancel_start);
893            match cancel_result {
894                Ok(Some(cancelled_dispatch)) => {
895                    self.mark_cancelled_dispatch_run_cancelled(
896                        &cancelled_dispatch,
897                        "inline dispatch cancelled after claim race",
898                    )
899                    .await;
900                    self.refresh_dispatch_depth_metrics().await;
901                }
902                Ok(None) => {
903                    if let Ok(Some(dispatch)) = self.store.load_dispatch(&dispatch_id).await {
904                        self.reconcile_terminal_dispatch(&dispatch).await;
905                    }
906                    self.refresh_dispatch_depth_metrics().await;
907                }
908                Err(e) => {
909                    tracing::warn!(dispatch_id, error = %e, "failed to cancel unclaimed inline dispatch");
910                }
911            }
912            Err(MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE.into()))
913        }
914    }
915
916    /// Submit a run in the background (fire-and-forget).
917    ///
918    /// Dispatch is persisted with `available_at = now`, then execution is event-driven.
919    /// Returns dispatch_id + thread_id for status polling.
920    #[tracing::instrument(skip(self, request), fields(thread_id = %request.thread_id))]
921    pub async fn submit_background(
922        self: &Arc<Self>,
923        mut request: RunRequest,
924    ) -> Result<MailboxSubmitResult, MailboxError> {
925        normalize_mailbox_run_mode(&mut request, true);
926        let (thread_id, messages) = validate_run_inputs(
927            request.thread_id.clone(),
928            request.messages.clone(),
929            !request.decisions.is_empty(),
930        )?;
931
932        let run_id = self
933            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
934            .await?;
935        let dispatch = self.build_dispatch(&request, &thread_id)?;
936        let dispatch_id = dispatch.dispatch_id.clone();
937        let thread_id = dispatch.thread_id.clone();
938
939        // WAL: persist with available_at = now.
940        self.enqueue_dispatch_with_metrics(&dispatch).await?;
941
942        // Dispatch via try_dispatch_next which handles Idle → Claiming atomically.
943        self.get_or_create_worker(&thread_id).await;
944        let claimed = self.try_dispatch_next(&thread_id).await;
945        let status = if claimed.started_execution() {
946            MailboxDispatchStatus::Running
947        } else {
948            MailboxDispatchStatus::Queued
949        };
950
951        Ok(MailboxSubmitResult {
952            dispatch_id,
953            run_id,
954            thread_id,
955            status,
956        })
957    }
958
959    /// Try to steer the currently active run first, then fall back to the
960    /// durable mailbox queue when live delivery is unavailable.
961    ///
962    /// # Delivery semantics
963    ///
964    /// **At-least-once** across the live + durable paths. The owning
965    /// node's forwarder acks a live command only after `InboxSender::
966    /// try_send` has returned success, so `Delivered` means the run has
967    /// the message. However — and this is the distributed edge case —
968    /// there is still a window where:
969    ///
970    /// 1. The forwarder hands the message to the run (`try_send` ok).
971    /// 2. The ack publish to the producer's reply subject drops or
972    ///    times out (network blip, broker partition).
973    /// 3. Producer observes `NoSubscriber` and falls back to
974    ///    [`Mailbox::submit_background`], which enqueues a fresh
975    ///    durable dispatch carrying the same messages.
976    /// 4. When the current run ends, the queued dispatch executes and
977    ///    the user-visible message history contains duplicates.
978    ///
979    /// `RunRequest` does not expose dispatch-level dedupe. Callers that need
980    /// exactly-once effects must drive idempotency at the application layer
981    /// (e.g., unique
982    ///   message IDs normalized via `normalize_message_ids`; agent
983    ///   state that rejects redundant inputs).
984    #[tracing::instrument(skip(self, request), fields(thread_id = %request.thread_id))]
985    pub async fn submit_live_then_queue(
986        self: &Arc<Self>,
987        mut request: RunRequest,
988        expected_run_id: Option<&str>,
989    ) -> Result<MailboxSubmitResult, MailboxError> {
990        let (thread_id, messages) = validate_run_inputs(
991            request.thread_id.clone(),
992            request.messages.clone(),
993            !request.decisions.is_empty(),
994        )?;
995        let messages = normalize_message_ids(&messages);
996
997        if let Some(result) = self
998            .try_deliver_live_messages(&thread_id, expected_run_id, messages.clone())
999            .await?
1000        {
1001            return Ok(result);
1002        }
1003
1004        request.thread_id = thread_id;
1005        request.messages = messages;
1006        self.submit_background(request).await
1007    }
1008
1009    // ── Control ──────────────────────────────────────────────────────
1010
1011    /// Cancel a run by dispatch_id or thread_id.
1012    ///
1013    /// If Queued: transitions to Cancelled via store.
1014    /// If Claimed/Running: cancels runtime run via dual-index lookup.
1015    pub async fn cancel(&self, id: &str) -> Result<bool, MailboxError> {
1016        // Try store cancel first (works for Queued dispatches).
1017        let now = now_ms();
1018        let cancel_start = Instant::now();
1019        let cancel_result = self.store.cancel(id, now).await;
1020        record_mailbox_operation_result("cancel", result_label(&cancel_result), cancel_start);
1021        let cancelled = cancel_result?;
1022        if let Some(cancelled_dispatch) = cancelled {
1023            self.mark_cancelled_dispatch_run_cancelled(
1024                &cancelled_dispatch,
1025                "queued dispatch cancelled",
1026            )
1027            .await;
1028            self.refresh_dispatch_depth_metrics().await;
1029            return Ok(true);
1030        }
1031
1032        // Try runtime cancel (for Claimed/Running dispatches).
1033        if self.executor.cancel(id) {
1034            return Ok(true);
1035        }
1036
1037        if let Some(dispatch) = self.store.load_dispatch(id).await?
1038            && dispatch.status == RunDispatchStatus::Claimed
1039        {
1040            return self
1041                .deliver_live_cancel(&live_target_for_dispatch(&dispatch))
1042                .await;
1043        }
1044
1045        let run = if let Some(run) = self.run_store.load_run(id).await? {
1046            Some(run)
1047        } else {
1048            self.run_store.latest_run(id).await?
1049        };
1050        if let Some(run) = run
1051            && matches!(run.status, RunStatus::Running | RunStatus::Waiting)
1052        {
1053            return self.deliver_live_cancel(&live_target_for_run(&run)).await;
1054        }
1055
1056        Ok(false)
1057    }
1058
1059    /// Interrupt a thread: bump dispatch epoch, supersede all pending,
1060    /// cancel active run. Clean slate for the thread.
1061    pub async fn interrupt(&self, thread_id: &str) -> Result<MailboxInterrupt, MailboxError> {
1062        self.interrupt_detailed(thread_id).await.map(Into::into)
1063    }
1064
1065    /// Interrupt a thread and return the exact queued dispatches superseded by
1066    /// the operation.
1067    pub async fn interrupt_detailed(
1068        &self,
1069        thread_id: &str,
1070    ) -> Result<MailboxInterruptDetails, MailboxError> {
1071        let now = now_ms();
1072        let interrupt_start = Instant::now();
1073        let interrupt_result = self.store.interrupt_detailed(thread_id, now).await;
1074        record_mailbox_operation_result(
1075            "interrupt",
1076            result_label(&interrupt_result),
1077            interrupt_start,
1078        );
1079        let result = interrupt_result?;
1080        crate::metrics::inc_mailbox_operation_by("supersede", "ok", result.superseded_count as u64);
1081        self.refresh_dispatch_depth_metrics().await;
1082        for superseded_dispatch in &result.superseded_dispatches {
1083            self.mark_superseded_dispatch_run_cancelled(
1084                superseded_dispatch,
1085                "queued dispatch superseded by interrupt",
1086            )
1087            .await;
1088        }
1089
1090        // Cancel active runtime run if any.
1091        if let Some(active_dispatch) = result.active_dispatch.as_ref() {
1092            self.cancel_active_dispatch(thread_id, active_dispatch, false)
1093                .await?;
1094        }
1095
1096        Ok(result)
1097    }
1098
1099    /// Forward a tool-call decision to an active run in this process only.
1100    ///
1101    /// Distributed callers should use [`Self::send_decision_live`] so remote
1102    /// active runs can receive the decision through targeted live delivery.
1103    pub fn send_decision(&self, id: &str, tool_call_id: String, resume: ToolCallResume) -> bool {
1104        self.executor.send_decision(id, tool_call_id, resume)
1105    }
1106
1107    /// Forward a tool-call decision locally or through targeted live delivery.
1108    ///
1109    /// Live delivery is at-least-once when the remote run accepts the decision
1110    /// but the ack is lost before the durable fallback is enqueued. Consumers
1111    /// must treat `(tool_call_id, decision_id)` as idempotent.
1112    pub async fn send_decision_live(
1113        &self,
1114        id: &str,
1115        tool_call_id: String,
1116        resume: ToolCallResume,
1117    ) -> Result<bool, MailboxError> {
1118        if self
1119            .executor
1120            .send_decision(id, tool_call_id.clone(), resume.clone())
1121        {
1122            return Ok(true);
1123        }
1124
1125        if let Some(dispatch) = self.store.load_dispatch(id).await?
1126            && dispatch.status == RunDispatchStatus::Claimed
1127        {
1128            return self
1129                .deliver_live_decision(
1130                    &live_target_for_dispatch(&dispatch),
1131                    vec![(tool_call_id, resume)],
1132                )
1133                .await;
1134        }
1135
1136        let run = if let Some(run) = self.run_store.load_run(id).await? {
1137            Some(run)
1138        } else {
1139            self.run_store.latest_run(id).await?
1140        };
1141        if let Some(run) = run
1142            && matches!(run.status, RunStatus::Running | RunStatus::Waiting)
1143        {
1144            return self
1145                .deliver_live_decision(&live_target_for_run(&run), vec![(tool_call_id, resume)])
1146                .await;
1147        }
1148
1149        Ok(false)
1150    }
1151
1152    async fn cancel_active_dispatch(
1153        &self,
1154        thread_id: &str,
1155        active_dispatch: &RunDispatch,
1156        wait_for_release: bool,
1157    ) -> Result<bool, MailboxError> {
1158        if wait_for_release {
1159            if self.executor.cancel_and_wait_by_thread(thread_id).await {
1160                if self
1161                    .wait_for_dispatch_not_claimed(&active_dispatch.dispatch_id)
1162                    .await?
1163                {
1164                    return Ok(true);
1165                }
1166                tracing::warn!(
1167                    thread_id,
1168                    dispatch_id = %active_dispatch.dispatch_id,
1169                    "local cancel completed but active dispatch did not release before foreground submit"
1170                );
1171                return Ok(false);
1172            }
1173        } else if self.executor.cancel(thread_id) {
1174            return Ok(true);
1175        }
1176
1177        if !self
1178            .deliver_live_cancel(&live_target_for_dispatch(active_dispatch))
1179            .await?
1180        {
1181            return Ok(false);
1182        }
1183
1184        if wait_for_release
1185            && !self
1186                .wait_for_dispatch_not_claimed(&active_dispatch.dispatch_id)
1187                .await?
1188        {
1189            tracing::warn!(
1190                thread_id,
1191                dispatch_id = %active_dispatch.dispatch_id,
1192                "remote cancel delivered but active dispatch did not release before foreground submit"
1193            );
1194            return Ok(false);
1195        }
1196        Ok(true)
1197    }
1198
1199    async fn deliver_live_cancel(&self, target: &LiveRunTarget) -> Result<bool, MailboxError> {
1200        match self
1201            .store
1202            .deliver_live_to(target, LiveRunCommand::Cancel)
1203            .await?
1204        {
1205            LiveDeliveryOutcome::Delivered => Ok(true),
1206            LiveDeliveryOutcome::NoSubscriber => Ok(false),
1207        }
1208    }
1209
1210    async fn deliver_live_decision(
1211        &self,
1212        target: &LiveRunTarget,
1213        decisions: Vec<(String, ToolCallResume)>,
1214    ) -> Result<bool, MailboxError> {
1215        match self
1216            .store
1217            .deliver_live_to(target, LiveRunCommand::Decision(decisions))
1218            .await?
1219        {
1220            LiveDeliveryOutcome::Delivered => Ok(true),
1221            LiveDeliveryOutcome::NoSubscriber => Ok(false),
1222        }
1223    }
1224
1225    async fn wait_for_dispatch_not_claimed(&self, dispatch_id: &str) -> Result<bool, MailboxError> {
1226        let deadline = tokio::time::Instant::now() + Duration::from_millis(REMOTE_CANCEL_WAIT_MS);
1227        loop {
1228            match self.store.load_dispatch(dispatch_id).await? {
1229                Some(dispatch) if dispatch.status == RunDispatchStatus::Claimed => {}
1230                _ => return Ok(true),
1231            }
1232            if tokio::time::Instant::now() >= deadline {
1233                return Ok(false);
1234            }
1235            tokio::time::sleep(Duration::from_millis(REMOTE_CANCEL_POLL_MS)).await;
1236        }
1237    }
1238
1239    async fn mark_superseded_dispatch_run_cancelled(&self, dispatch: &RunDispatch, reason: &str) {
1240        self.mark_dispatch_run_cancelled("mark_run_superseded", "superseded", dispatch, reason)
1241            .await;
1242    }
1243
1244    async fn mark_cancelled_dispatch_run_cancelled(&self, dispatch: &RunDispatch, reason: &str) {
1245        self.mark_dispatch_run_cancelled("mark_run_cancelled", "cancelled", dispatch, reason)
1246            .await;
1247    }
1248
1249    async fn mark_dispatch_run_cancelled(
1250        &self,
1251        operation: &str,
1252        outcome: &str,
1253        dispatch: &RunDispatch,
1254        reason: &str,
1255    ) {
1256        let start = Instant::now();
1257        let result = self
1258            .mark_dispatch_run_cancelled_inner(dispatch, reason)
1259            .await;
1260        record_mailbox_operation_result(operation, result_label(&result), start);
1261        if matches!(result, Ok(true)) {
1262            record_mailbox_dispatch_terminal_metrics(dispatch, outcome);
1263        }
1264        if let Err(error) = result {
1265            tracing::warn!(
1266                dispatch_id = %dispatch.dispatch_id,
1267                run_id = %dispatch.run_id,
1268                thread_id = %dispatch.thread_id,
1269                reason,
1270                error = %error,
1271                "failed to mark terminal mailbox run as cancelled"
1272            );
1273        }
1274    }
1275
1276    async fn mark_dispatch_run_cancelled_inner(
1277        &self,
1278        dispatch: &RunDispatch,
1279        _reason: &str,
1280    ) -> Result<bool, MailboxError> {
1281        let Some(mut run) = self.run_store.load_run(&dispatch.run_id).await? else {
1282            return Ok(false);
1283        };
1284        if run.thread_id != dispatch.thread_id || run.status == RunStatus::Done {
1285            return Ok(false);
1286        }
1287
1288        let now = now_ms() / 1000;
1289        run.status = RunStatus::Done;
1290        run.termination_reason = Some(TerminationReason::Cancelled);
1291        run.error_payload = None;
1292        run.dispatch_id = Some(dispatch.dispatch_id.clone());
1293        run.session_id = dispatch.dispatch_instance_id.clone();
1294        run.waiting = None;
1295        run.finished_at = Some(now);
1296        run.updated_at = now;
1297
1298        self.checkpoint_terminal_dispatch_run(dispatch, &run)
1299            .await?;
1300        Ok(true)
1301    }
1302
1303    async fn mark_dead_letter_dispatch_run_error(&self, dispatch: &RunDispatch) {
1304        let start = Instant::now();
1305        let result = self
1306            .mark_dead_letter_dispatch_run_error_inner(dispatch)
1307            .await;
1308        record_mailbox_operation_result("mark_run_dead_letter", result_label(&result), start);
1309        if matches!(result, Ok(true)) {
1310            record_mailbox_dispatch_terminal_metrics(dispatch, "dead_letter");
1311        }
1312        if let Err(error) = result {
1313            tracing::warn!(
1314                dispatch_id = %dispatch.dispatch_id,
1315                run_id = %dispatch.run_id,
1316                thread_id = %dispatch.thread_id,
1317                error = %error,
1318                "failed to mark dead-lettered mailbox run as errored"
1319            );
1320        }
1321    }
1322
1323    async fn reconcile_terminal_dispatch(&self, dispatch: &RunDispatch) {
1324        match dispatch.status {
1325            RunDispatchStatus::DeadLetter => {
1326                self.mark_dead_letter_dispatch_run_error(dispatch).await;
1327            }
1328            RunDispatchStatus::Cancelled => {
1329                self.mark_cancelled_dispatch_run_cancelled(
1330                    dispatch,
1331                    "cancelled dispatch reclaimed during mailbox maintenance",
1332                )
1333                .await;
1334            }
1335            RunDispatchStatus::Superseded => {
1336                self.mark_superseded_dispatch_run_cancelled(
1337                    dispatch,
1338                    "superseded dispatch reclaimed during mailbox maintenance",
1339                )
1340                .await;
1341            }
1342            RunDispatchStatus::Queued | RunDispatchStatus::Claimed | RunDispatchStatus::Acked => {}
1343        }
1344    }
1345
1346    async fn reconcile_terminal_dispatches(&self) {
1347        let mut offset = 0;
1348        loop {
1349            let list_start = Instant::now();
1350            let result = self
1351                .store
1352                .list_terminal_dispatches(TERMINAL_RECONCILE_BATCH, offset)
1353                .await;
1354            record_mailbox_operation_result(
1355                "list_terminal_dispatches",
1356                result_label(&result),
1357                list_start,
1358            );
1359            let dispatches = match result {
1360                Ok(dispatches) => dispatches,
1361                Err(error) => {
1362                    tracing::warn!(
1363                        error = %error,
1364                        "failed to list terminal mailbox dispatches for run reconciliation"
1365                    );
1366                    return;
1367                }
1368            };
1369            if dispatches.is_empty() {
1370                return;
1371            }
1372            crate::metrics::inc_mailbox_operation_by(
1373                "reconcile_terminal_dispatch",
1374                "ok",
1375                dispatches.len() as u64,
1376            );
1377            let page_len = dispatches.len();
1378            for dispatch in &dispatches {
1379                self.reconcile_terminal_dispatch(dispatch).await;
1380            }
1381            if page_len < TERMINAL_RECONCILE_BATCH {
1382                return;
1383            }
1384            offset += page_len;
1385        }
1386    }
1387
1388    async fn mark_dead_letter_dispatch_run_error_inner(
1389        &self,
1390        dispatch: &RunDispatch,
1391    ) -> Result<bool, MailboxError> {
1392        let Some(mut run) = self.run_store.load_run(&dispatch.run_id).await? else {
1393            return Ok(false);
1394        };
1395        if run.thread_id != dispatch.thread_id || run.status == RunStatus::Done {
1396            return Ok(false);
1397        }
1398
1399        let reason = dispatch
1400            .run_error
1401            .clone()
1402            .or_else(|| dispatch.last_error.clone())
1403            .unwrap_or_else(|| "mailbox dispatch dead-lettered".to_string());
1404        let now = now_ms() / 1000;
1405        run.status = RunStatus::Done;
1406        run.termination_reason = Some(TerminationReason::Error(reason.clone()));
1407        run.error_payload = Some(serde_json::json!({ "message": reason }));
1408        run.dispatch_id = Some(dispatch.dispatch_id.clone());
1409        run.session_id = dispatch.dispatch_instance_id.clone();
1410        run.waiting = None;
1411        run.finished_at = Some(now);
1412        run.updated_at = now;
1413
1414        self.checkpoint_terminal_dispatch_run(dispatch, &run)
1415            .await?;
1416        Ok(true)
1417    }
1418
1419    async fn checkpoint_terminal_dispatch_run(
1420        &self,
1421        dispatch: &RunDispatch,
1422        run: &RunRecord,
1423    ) -> Result<(), MailboxError> {
1424        let messages = self
1425            .run_store
1426            .load_messages(&dispatch.thread_id)
1427            .await?
1428            .unwrap_or_default();
1429        self.run_store
1430            .checkpoint(&dispatch.thread_id, &messages, run)
1431            .await?;
1432        {
1433            let workers = self.workers.read().await;
1434            if let Some(worker) = workers.get(&dispatch.thread_id) {
1435                let mut worker = worker.lock();
1436                if let Some(ref mut ctx) = worker.thread_ctx {
1437                    ctx.apply_checkpoint(&messages, run);
1438                }
1439            }
1440        }
1441        Ok(())
1442    }
1443
1444    async fn try_deliver_live_messages(
1445        &self,
1446        thread_id: &str,
1447        expected_run_id: Option<&str>,
1448        messages: Vec<Message>,
1449    ) -> Result<Option<MailboxSubmitResult>, MailboxError> {
1450        if messages.is_empty() {
1451            return Ok(None);
1452        }
1453
1454        let local_active = {
1455            let workers = self.workers.read().await;
1456            workers.get(thread_id).and_then(|worker| {
1457                let worker = worker.lock();
1458                match &worker.status {
1459                    MailboxWorkerStatus::Running {
1460                        dispatch_id,
1461                        run_id,
1462                        ..
1463                    } => Some((dispatch_id.clone(), run_id.clone())),
1464                    MailboxWorkerStatus::Idle | MailboxWorkerStatus::Claiming => None,
1465                }
1466            })
1467        };
1468
1469        if let Some((active_dispatch_id, active_run_id)) = local_active {
1470            // Race guard against a run that just rolled over.
1471            if expected_run_id.is_some_and(|expected| expected != active_run_id) {
1472                return Ok(None);
1473            }
1474            // Local fast path: executor has a direct handle to the run's
1475            // inbox and returns a boolean indicating whether the channel
1476            // accepted the payload. A `false` here means the local run
1477            // just ended — fall back to durable dispatch.
1478            if !self.executor.send_messages(&active_run_id, messages) {
1479                return Ok(None);
1480            }
1481            return Ok(Some(MailboxSubmitResult {
1482                dispatch_id: active_dispatch_id,
1483                run_id: active_run_id,
1484                thread_id: thread_id.to_string(),
1485                status: MailboxDispatchStatus::Running,
1486            }));
1487        }
1488
1489        // No local worker: check whether another node is running this
1490        // thread. `ThreadRunStore::latest_run` is the global truth (every
1491        // node checkpoints to the same store).
1492        let Some(remote_run) = self.run_store.latest_run(thread_id).await? else {
1493            return Ok(None);
1494        };
1495        if remote_run.status != RunStatus::Running {
1496            return Ok(None);
1497        }
1498        if expected_run_id.is_some_and(|expected| expected != remote_run.run_id) {
1499            return Ok(None);
1500        }
1501
1502        // Cross-node: ask the store to deliver. If the owning node's
1503        // forwarder isn't subscribed yet, `deliver_live` reports
1504        // `NoSubscriber` and we fall through so `submit_live_then_queue`
1505        // enqueues a durable dispatch instead of silently dropping.
1506        let outcome = self
1507            .store
1508            .deliver_live_to(
1509                &live_target_for_run(&remote_run),
1510                awaken_contract::contract::mailbox::LiveRunCommand::Messages(messages),
1511            )
1512            .await?;
1513        match outcome {
1514            awaken_contract::contract::mailbox::LiveDeliveryOutcome::Delivered => {}
1515            awaken_contract::contract::mailbox::LiveDeliveryOutcome::NoSubscriber => {
1516                return Ok(None);
1517            }
1518        }
1519
1520        let dispatch_id = remote_run
1521            .dispatch_id
1522            .clone()
1523            .unwrap_or_else(|| remote_run.run_id.clone());
1524        Ok(Some(MailboxSubmitResult {
1525            dispatch_id,
1526            run_id: remote_run.run_id,
1527            thread_id: thread_id.to_string(),
1528            status: MailboxDispatchStatus::Running,
1529        }))
1530    }
1531
1532    /// Reconnect the event sink for an active (suspended) run.
1533    ///
1534    /// Replaces the underlying channel sender so subsequent events flow to
1535    /// `new_tx`. Returns `true` if the thread has an active worker.
1536    pub async fn reconnect_sink(&self, thread_id: &str, new_tx: mpsc::Sender<AgentEvent>) -> bool {
1537        let workers = self.workers.read().await;
1538        let Some(worker) = workers.get(thread_id) else {
1539            return false;
1540        };
1541        let w = worker.lock();
1542        match &w.status {
1543            MailboxWorkerStatus::Running { sink, .. } => {
1544                sink.reconnect(new_tx);
1545                true
1546            }
1547            MailboxWorkerStatus::Idle | MailboxWorkerStatus::Claiming => false,
1548        }
1549    }
1550
1551    async fn reusable_waiting_run_id(&self, thread_id: &str) -> Option<String> {
1552        if let Some(thread) = self.run_store.load_thread(thread_id).await.ok().flatten()
1553            && let Some(open_run_id) = thread.open_run_id.as_deref()
1554            && let Some(run) = self.run_store.load_run(open_run_id).await.ok().flatten()
1555            && run.thread_id == thread_id
1556            && run.is_resumable_waiting()
1557        {
1558            return Some(run.run_id);
1559        }
1560        let run = self.run_store.latest_run(thread_id).await.ok().flatten()?;
1561        run.is_resumable_waiting().then_some(run.run_id)
1562    }
1563
1564    // ── Query ────────────────────────────────────────────────────────
1565
1566    /// List mailbox dispatches for a thread (with optional status filter).
1567    pub async fn list_dispatches(
1568        &self,
1569        thread_id: &str,
1570        status_filter: Option<&[RunDispatchStatus]>,
1571        limit: usize,
1572        offset: usize,
1573    ) -> Result<Vec<RunDispatch>, MailboxError> {
1574        Ok(self
1575            .store
1576            .list_dispatches(thread_id, status_filter, limit, offset)
1577            .await?)
1578    }
1579
1580    /// List thread IDs that currently have queued dispatches.
1581    pub async fn queued_thread_ids(&self) -> Result<Vec<String>, MailboxError> {
1582        Ok(self.store.queued_thread_ids().await?)
1583    }
1584
1585    pub async fn load_dispatch(
1586        &self,
1587        dispatch_id: &str,
1588    ) -> Result<Option<RunDispatch>, MailboxError> {
1589        Ok(self.store.load_dispatch(dispatch_id).await?)
1590    }
1591
1592    // ── Lifecycle ────────────────────────────────────────────────────
1593
1594    /// Start framework-managed startup recovery plus sweep/GC maintenance.
1595    ///
1596    /// This method is idempotent: repeated calls return a handle to the
1597    /// already-running lifecycle instead of spawning duplicate recovery or
1598    /// maintenance loops. Dropping the returned handle does not stop the
1599    /// lifecycle; call `MailboxLifecycleHandle::shutdown().await` for
1600    /// quiescent shutdown or `MailboxLifecycleHandle::abort()` for
1601    /// fire-and-forget stop.
1602    ///
1603    /// If an async lifecycle transition is already in progress, this method
1604    /// returns an error instead of racing that transition. Use
1605    /// [`start_lifecycle_ready`](Self::start_lifecycle_ready) when the caller
1606    /// needs to wait for startup readiness.
1607    pub fn start_lifecycle(
1608        self: &Arc<Self>,
1609        config: MailboxLifecycleConfig,
1610    ) -> Result<MailboxLifecycleHandle, MailboxError> {
1611        let handle = MailboxLifecycleHandle {
1612            tasks: Arc::clone(&self.lifecycle_tasks),
1613            transition_lock: Arc::clone(&self.lifecycle_start_lock),
1614        };
1615        for _ in 0..16 {
1616            match self.lifecycle_start_lock.try_lock() {
1617                Ok(_transition_guard) => return self.start_lifecycle_internal(config, true),
1618                Err(_) if self.lifecycle_is_running()? => return Ok(handle),
1619                Err(_) => std::thread::yield_now(),
1620            }
1621        }
1622        Err(MailboxError::Internal(
1623            "mailbox lifecycle transition is already running".to_string(),
1624        ))
1625    }
1626
1627    /// Run startup recovery to readiness, then start framework-managed
1628    /// maintenance.
1629    ///
1630    /// Unlike [`start_lifecycle`](Self::start_lifecycle), this method waits for
1631    /// startup recovery and returns an error when recovery exhausts its retry
1632    /// policy. Repeated calls remain idempotent: if lifecycle tasks are already
1633    /// running, the existing handle is returned.
1634    pub async fn start_lifecycle_ready(
1635        self: &Arc<Self>,
1636        mut config: MailboxLifecycleConfig,
1637    ) -> Result<MailboxLifecycleHandle, MailboxError> {
1638        let _start_guard = self.lifecycle_start_lock.lock().await;
1639        let handle = MailboxLifecycleHandle {
1640            tasks: Arc::clone(&self.lifecycle_tasks),
1641            transition_lock: Arc::clone(&self.lifecycle_start_lock),
1642        };
1643        if self.lifecycle_is_running()? {
1644            return Ok(handle);
1645        }
1646
1647        if !config.startup_delay.is_zero() {
1648            tokio::time::sleep(config.startup_delay).await;
1649            config.startup_delay = Duration::ZERO;
1650        }
1651
1652        self.run_startup_recovery_with_retry(config.startup_recovery.clone())
1653            .await?;
1654        self.start_lifecycle_internal(config, false)
1655    }
1656
1657    fn lifecycle_is_running(&self) -> Result<bool, MailboxError> {
1658        Ok(self
1659            .lifecycle_tasks
1660            .lock()
1661            .map_err(|_| MailboxError::Internal("mailbox lifecycle lock poisoned".to_string()))?
1662            .is_some())
1663    }
1664
1665    fn start_lifecycle_internal(
1666        self: &Arc<Self>,
1667        config: MailboxLifecycleConfig,
1668        run_startup_recovery: bool,
1669    ) -> Result<MailboxLifecycleHandle, MailboxError> {
1670        let handle = MailboxLifecycleHandle {
1671            tasks: Arc::clone(&self.lifecycle_tasks),
1672            transition_lock: Arc::clone(&self.lifecycle_start_lock),
1673        };
1674        let mut lifecycle = self
1675            .lifecycle_tasks
1676            .lock()
1677            .map_err(|_| MailboxError::Internal("mailbox lifecycle lock poisoned".to_string()))?;
1678
1679        if lifecycle.is_some() {
1680            return Ok(handle);
1681        }
1682
1683        let startup_delay = config.startup_delay;
1684        let startup_recovery = config.startup_recovery.clone();
1685        let recover_mailbox = Arc::clone(self);
1686        let recover_task = run_startup_recovery.then(|| {
1687            tokio::spawn(async move {
1688                if !startup_delay.is_zero() {
1689                    tokio::time::sleep(startup_delay).await;
1690                }
1691                match recover_mailbox
1692                    .run_startup_recovery_with_retry(startup_recovery)
1693                    .await
1694                {
1695                    Ok(recovered) => {
1696                        tracing::info!(recovered, "mailbox startup recovery completed");
1697                    }
1698                    Err(error) => {
1699                        tracing::error!(error = %error, "mailbox startup recovery failed");
1700                    }
1701                }
1702            })
1703        });
1704
1705        let maintenance_mailbox = Arc::clone(self);
1706        let maintenance_callback = config.maintenance_callback;
1707        let maintenance_task = tokio::spawn(async move {
1708            if !startup_delay.is_zero() {
1709                tokio::time::sleep(startup_delay).await;
1710            }
1711            maintenance_mailbox
1712                .run_maintenance_loop(maintenance_callback)
1713                .await;
1714        });
1715
1716        let dispatch_signal_task = self.store.supports_dispatch_signals().then(|| {
1717            let signal_mailbox = Arc::clone(self);
1718            tokio::spawn(async move {
1719                if !startup_delay.is_zero() {
1720                    tokio::time::sleep(startup_delay).await;
1721                }
1722                signal_mailbox.run_dispatch_signal_loop().await;
1723            })
1724        });
1725
1726        *lifecycle = Some(MailboxLifecycleTasks {
1727            recover_task,
1728            dispatch_signal_task,
1729            maintenance_task,
1730        });
1731        Ok(handle)
1732    }
1733
1734    async fn run_startup_recovery_with_retry(
1735        self: &Arc<Self>,
1736        config: MailboxStartupRecoveryConfig,
1737    ) -> Result<usize, MailboxError> {
1738        let max_attempts = config.max_attempts.max(1);
1739        for attempt in 1..=max_attempts {
1740            match self.recover().await {
1741                Ok(recovered) => return Ok(recovered),
1742                Err(error) if attempt < max_attempts => {
1743                    tracing::warn!(
1744                        attempt,
1745                        max_attempts,
1746                        retry_delay_ms = config.retry_delay.as_millis(),
1747                        error = %error,
1748                        "mailbox startup recovery failed; retrying"
1749                    );
1750                    if !config.retry_delay.is_zero() {
1751                        tokio::time::sleep(config.retry_delay).await;
1752                    }
1753                }
1754                Err(error) => return Err(error),
1755            }
1756        }
1757        unreachable!("max_attempts is normalized to at least one")
1758    }
1759
1760    /// Recover on startup: reload queued dispatches and dispatch idle threads.
1761    #[tracing::instrument(skip(self))]
1762    pub async fn recover(self: &Arc<Self>) -> Result<usize, MailboxError> {
1763        let now = now_ms();
1764        let mut total = 0;
1765
1766        // Reclaim expired leases from previous process crash.
1767        let reclaim_start = Instant::now();
1768        let reclaimed_result = self.store.reclaim_expired_leases(now, 100).await;
1769        record_mailbox_operation_result("reclaim", result_label(&reclaimed_result), reclaim_start);
1770        let reclaimed = reclaimed_result?;
1771        crate::metrics::inc_mailbox_operation_by("reclaim_dispatch", "ok", reclaimed.len() as u64);
1772        if !reclaimed.is_empty() {
1773            self.refresh_dispatch_depth_metrics().await;
1774        }
1775        for dispatch in &reclaimed {
1776            self.reconcile_terminal_dispatch(dispatch).await;
1777        }
1778        self.reconcile_terminal_dispatches().await;
1779        total += reclaimed.len();
1780
1781        // Reload all queued mailbox IDs and try to dispatch.
1782        let thread_ids = self.store.queued_thread_ids().await?;
1783        for thread_id in &thread_ids {
1784            // Ensure worker exists for each thread with queued dispatches.
1785            self.get_or_create_worker(thread_id).await;
1786            self.try_dispatch_next(thread_id).await;
1787        }
1788
1789        // Recover orphaned background-task waits with no queued wake dispatch.
1790        {
1791            let query = awaken_contract::contract::storage::RunQuery {
1792                status: Some(awaken_contract::contract::lifecycle::RunStatus::Waiting),
1793                limit: 200,
1794                ..Default::default()
1795            };
1796            if let Ok(page) = self.run_store.list_runs(&query).await {
1797                let queued_set: std::collections::HashSet<String> =
1798                    thread_ids.iter().cloned().collect();
1799                for run in &page.items {
1800                    if !run.is_background_task_waiting() {
1801                        continue;
1802                    }
1803                    // Skip if this thread already has a queued dispatch.
1804                    if queued_set.contains(&run.thread_id) {
1805                        continue;
1806                    }
1807                    let request = RunRequest::new(
1808                        run.thread_id.clone(),
1809                        vec![Message::internal_user("<background-tasks-updated />")],
1810                    )
1811                    .with_agent_id(run.agent_id.clone())
1812                    .with_continue_run_id(run.run_id.clone())
1813                    .with_origin(awaken_contract::contract::storage::RunRequestOrigin::Internal)
1814                    .with_run_mode(RunMode::InternalWake)
1815                    .with_adapter(AdapterKind::Internal);
1816                    if self.submit_background(request).await.is_ok() {
1817                        total += 1;
1818                        tracing::info!(
1819                            thread_id = %run.thread_id,
1820                            run_id = %run.run_id,
1821                            "recover: enqueued wake dispatch for orphaned background-task thread"
1822                        );
1823                    }
1824                }
1825            }
1826        }
1827
1828        Ok(total)
1829    }
1830
1831    /// Run sweep + GC loop forever. Call from `tokio::spawn`.
1832    ///
1833    /// When `maintenance_callback` is provided, it runs on each GC tick so
1834    /// applications can clean up resources they own.
1835    pub async fn run_maintenance_loop(
1836        self: Arc<Self>,
1837        maintenance_callback: Option<MailboxMaintenanceCallback>,
1838    ) {
1839        let mut sweep_interval = tokio::time::interval(self.config.sweep_interval);
1840        let mut gc_interval = tokio::time::interval(self.config.gc_interval);
1841
1842        // Skip the initial immediate tick.
1843        sweep_interval.tick().await;
1844        gc_interval.tick().await;
1845
1846        loop {
1847            tokio::select! {
1848                _ = sweep_interval.tick() => {
1849                    self.run_sweep().await;
1850                }
1851                _ = gc_interval.tick() => {
1852                    self.run_gc().await;
1853                    if let Some(cleanup) = &maintenance_callback {
1854                        cleanup();
1855                    }
1856                }
1857            }
1858        }
1859    }
1860
1861    /// Drain backend work-queue delivery signals and wake local workers.
1862    pub async fn run_dispatch_signal_loop(self: Arc<Self>) {
1863        loop {
1864            let pull_start = Instant::now();
1865            let pull_result = self
1866                .store
1867                .pull_dispatch_signals(
1868                    dispatch_signal_batch_size(),
1869                    dispatch_signal_fetch_expires(),
1870                )
1871                .await;
1872            record_mailbox_operation_result("signal_pull", result_label(&pull_result), pull_start);
1873            match pull_result {
1874                Ok(entries) => {
1875                    crate::metrics::inc_mailbox_dispatch_signal_pulled_by(entries.len() as u64);
1876                    self.handle_dispatch_signal_entries(entries).await;
1877                }
1878                Err(error) => {
1879                    tracing::warn!(error = %error, "dispatch signal pull failed");
1880                    tokio::time::sleep(DISPATCH_SIGNAL_ERROR_DELAY).await;
1881                }
1882            }
1883        }
1884    }
1885
1886    async fn handle_dispatch_signal_entries(self: &Arc<Self>, entries: Vec<DispatchSignalEntry>) {
1887        if entries.is_empty() {
1888            return;
1889        }
1890        let max_concurrent = dispatch_signal_max_concurrent_handlers()
1891            .min(entries.len())
1892            .max(1);
1893        let semaphore = Arc::new(Semaphore::new(max_concurrent));
1894        let mut tasks = JoinSet::new();
1895        for entry in entries {
1896            let Ok(permit) = Arc::clone(&semaphore).acquire_owned().await else {
1897                tracing::warn!("dispatch signal concurrency limiter closed");
1898                break;
1899            };
1900            let mailbox = Arc::clone(self);
1901            tasks.spawn(async move {
1902                let _permit = permit;
1903                mailbox.handle_dispatch_signal_entry(entry).await;
1904            });
1905        }
1906        while let Some(result) = tasks.join_next().await {
1907            if let Err(error) = result {
1908                tracing::warn!(error = %error, "dispatch signal handler task failed");
1909            }
1910        }
1911    }
1912
1913    async fn handle_dispatch_signal_entry(self: Arc<Self>, entry: DispatchSignalEntry) {
1914        let redelivery_attempts = entry.receipt.redelivery_attempts();
1915        if redelivery_attempts.is_some_and(|attempts| attempts > 1) {
1916            crate::metrics::inc_mailbox_dispatch_signal_redelivery();
1917        }
1918        self.get_or_create_worker(&entry.thread_id).await;
1919        let attempt = self.try_dispatch_next(&entry.thread_id).await;
1920        let nack_delay = match attempt {
1921            DispatchAttempt::TransientError => Some(None),
1922            DispatchAttempt::NoEligible => {
1923                match self.dispatch_signal_still_available(&entry).await {
1924                    Ok(true) => Some(Some(dispatch_signal_blocked_nack_delay(
1925                        redelivery_attempts,
1926                    ))),
1927                    Ok(false) => None,
1928                    Err(error) => {
1929                        tracing::warn!(
1930                            thread_id = %entry.thread_id,
1931                            dispatch_id = %entry.dispatch_id,
1932                            error = %error,
1933                            "failed to verify unclaimed dispatch signal"
1934                        );
1935                        Some(None)
1936                    }
1937                }
1938            }
1939            DispatchAttempt::Claimed | DispatchAttempt::Busy => None,
1940        };
1941        if let Some(delay) = nack_delay {
1942            let nack_start = Instant::now();
1943            let result = if let Some(delay) = delay {
1944                entry.receipt.nack_with_delay(delay).await
1945            } else {
1946                entry.receipt.nack().await
1947            };
1948            record_mailbox_operation_result("signal_nack", result_label(&result), nack_start);
1949            if result.is_ok() {
1950                crate::metrics::inc_mailbox_dispatch_signal_nack(delay.is_some());
1951            }
1952            if let Err(error) = result {
1953                tracing::warn!(
1954                    thread_id = %entry.thread_id,
1955                    dispatch_id = %entry.dispatch_id,
1956                    error = %error,
1957                    "failed to nack dispatch signal after claim error"
1958                );
1959            }
1960            return;
1961        }
1962        let ack_start = Instant::now();
1963        let ack_result = entry.receipt.ack().await;
1964        record_mailbox_operation_result("signal_ack", result_label(&ack_result), ack_start);
1965        if ack_result.is_ok() {
1966            crate::metrics::inc_mailbox_dispatch_signal_ack();
1967        }
1968        if let Err(error) = ack_result {
1969            tracing::warn!(
1970                thread_id = %entry.thread_id,
1971                dispatch_id = %entry.dispatch_id,
1972                error = %error,
1973                "failed to ack dispatch signal"
1974            );
1975        }
1976    }
1977
1978    async fn dispatch_signal_still_available(
1979        &self,
1980        entry: &awaken_contract::contract::mailbox::DispatchSignalEntry,
1981    ) -> Result<bool, StorageError> {
1982        let now = now_ms();
1983        let Some(dispatch) = self.store.load_dispatch(&entry.dispatch_id).await? else {
1984            return Ok(false);
1985        };
1986        Ok(dispatch.status == RunDispatchStatus::Queued && dispatch.available_at <= now)
1987    }
1988
1989    // ── Internal: dispatch ───────────────────────────────────────────
1990
1991    /// Claim a dispatch from the store and start execution.
1992    #[tracing::instrument(skip(self), fields(thread_id = %thread_id))]
1993    async fn dispatch_next_claim(self: &Arc<Self>, thread_id: &str) -> DispatchAttempt {
1994        let now = now_ms();
1995        let claim_start = Instant::now();
1996        let claim_result = self
1997            .store
1998            .claim(thread_id, &self.consumer_id, self.config.lease_ms, now, 1)
1999            .await;
2000        let claim_result_label = match &claim_result {
2001            Ok(claimed) if claimed.is_empty() => "empty",
2002            Ok(_) => "ok",
2003            Err(_) => "error",
2004        };
2005        record_mailbox_operation_result("claim", claim_result_label, claim_start);
2006        let claimed = match claim_result {
2007            Ok(c) => {
2008                self.refresh_dispatch_depth_metrics().await;
2009                c
2010            }
2011            Err(e) => {
2012                tracing::warn!(error = %e, thread_id, "failed to claim dispatch");
2013                revert_claiming_to_idle(&self.workers, thread_id).await;
2014                return DispatchAttempt::TransientError;
2015            }
2016        };
2017
2018        let Some(dispatch) = claimed.into_iter().next() else {
2019            // No dispatches to claim.
2020            revert_claiming_to_idle(&self.workers, thread_id).await;
2021            return DispatchAttempt::NoEligible;
2022        };
2023
2024        let dispatch_id = dispatch.dispatch_id.clone();
2025        let claim_token = dispatch.claim_token.clone().unwrap_or_default();
2026
2027        // Shared flag: set by the event sink when a tool call is suspended.
2028        let suspended = Arc::new(AtomicBool::new(false));
2029
2030        // Start lease renewal.
2031        let lease_handle = self.spawn_lease_renewal(
2032            dispatch_id.clone(),
2033            claim_token.clone(),
2034            thread_id.to_string(),
2035            Arc::clone(&suspended),
2036        );
2037
2038        // Pre-warm thread context cache.
2039        let thread_ctx = match ThreadContext::load(self.run_store.as_ref(), thread_id).await {
2040            Ok(ctx) => Some(ctx),
2041            Err(e) => {
2042                tracing::warn!(thread_id, error = %e, "failed to pre-warm thread context");
2043                None
2044            }
2045        };
2046
2047        // Create channel for background dispatch (events go nowhere unless observed).
2048        let (event_tx, _event_rx) = mpsc::channel(Self::EVENT_CHANNEL_CAPACITY);
2049        let reconnectable_sink = Arc::new(ReconnectableEventSink::new(event_tx.clone()));
2050
2051        // Update worker state.
2052        let worker = self.get_or_create_worker(thread_id).await;
2053        {
2054            let mut w = worker.lock();
2055            w.thread_ctx = thread_ctx;
2056            w.status = MailboxWorkerStatus::Running {
2057                dispatch_id: dispatch_id.clone(),
2058                run_id: dispatch.run_id.clone(),
2059                lease_handle,
2060                sink: Arc::clone(&reconnectable_sink),
2061            };
2062        }
2063
2064        self.spawn_execution(
2065            dispatch,
2066            event_tx,
2067            reconnectable_sink,
2068            claim_token,
2069            thread_id.to_string(),
2070            suspended,
2071        );
2072        DispatchAttempt::Claimed
2073    }
2074
2075    /// Claim from store and execute the next dispatch for this thread.
2076    #[tracing::instrument(skip(self), fields(thread_id = %thread_id))]
2077    async fn try_dispatch_next(self: &Arc<Self>, thread_id: &str) -> DispatchAttempt {
2078        let worker = {
2079            let workers = self.workers.read().await;
2080            match workers.get(thread_id) {
2081                Some(w) => Arc::clone(w),
2082                None => return DispatchAttempt::NoEligible,
2083            }
2084        };
2085
2086        // Atomically transition Idle → Claiming to prevent TOCTOU race.
2087        {
2088            let mut w = worker.lock();
2089            if !matches!(w.status, MailboxWorkerStatus::Idle) {
2090                return DispatchAttempt::Busy;
2091            }
2092            w.status = MailboxWorkerStatus::Claiming;
2093        }
2094
2095        self.dispatch_next_claim(thread_id).await
2096    }
2097
2098    /// Spawn a lease renewal task that periodically extends the lease.
2099    ///
2100    /// When the `suspended` flag is set (run is waiting for human input),
2101    /// the renewal uses `suspended_lease_ms` instead of the default `lease_ms`
2102    /// to prevent premature lease expiration during HITL scenarios.
2103    fn spawn_lease_renewal(
2104        &self,
2105        dispatch_id: String,
2106        claim_token: String,
2107        thread_id: String,
2108        suspended: Arc<AtomicBool>,
2109    ) -> JoinHandle<()> {
2110        let store = Arc::clone(&self.store);
2111        let runtime = Arc::clone(&self.executor);
2112        let lease_ms = self.config.lease_ms;
2113        let suspended_lease_ms = self.config.suspended_lease_ms;
2114        let interval = self.config.lease_renewal_interval;
2115
2116        tokio::spawn(async move {
2117            let mut tick = tokio::time::interval(interval);
2118            tick.tick().await; // skip initial
2119
2120            loop {
2121                tick.tick().await;
2122                let now = now_ms();
2123                let effective_lease_ms = if suspended.load(Ordering::Acquire) {
2124                    suspended_lease_ms
2125                } else {
2126                    lease_ms
2127                };
2128                let renew_start = Instant::now();
2129                match store
2130                    .extend_lease(&dispatch_id, &claim_token, effective_lease_ms, now)
2131                    .await
2132                {
2133                    Ok(true) => {
2134                        record_mailbox_operation_result("lease_renewal", "ok", renew_start);
2135                    }
2136                    Ok(false) => {
2137                        record_mailbox_operation_result("lease_renewal", "lost", renew_start);
2138                        // Lease lost -- another process reclaimed.
2139                        tracing::warn!(dispatch_id, thread_id, "lease lost, cancelling run");
2140                        runtime.cancel(&thread_id);
2141                        break;
2142                    }
2143                    Err(e) => {
2144                        record_mailbox_operation_result("lease_renewal", "error", renew_start);
2145                        tracing::warn!(dispatch_id, error = %e, "lease extension failed");
2146                        break;
2147                    }
2148                }
2149            }
2150        })
2151    }
2152
2153    /// Spawn the actual execution task for a claimed dispatch.
2154    #[tracing::instrument(skip(self, event_tx, reconnectable_sink, suspended), fields(dispatch_id = %dispatch.dispatch_id, thread_id = %thread_id))]
2155    fn spawn_execution(
2156        self: &Arc<Self>,
2157        dispatch: RunDispatch,
2158        event_tx: mpsc::Sender<AgentEvent>,
2159        reconnectable_sink: Arc<ReconnectableEventSink>,
2160        claim_token: String,
2161        thread_id: String,
2162        suspended: Arc<AtomicBool>,
2163    ) {
2164        let this = Arc::clone(self);
2165        let dispatch_id = dispatch.dispatch_id.clone();
2166
2167        tokio::spawn(async move {
2168            crate::metrics::inc_active_runs();
2169            let _guard = ActiveRunGuard;
2170
2171            let sink = SuspensionAwareSink {
2172                inner: reconnectable_sink as Arc<dyn EventSink>,
2173                suspended,
2174            };
2175
2176            // Dispatch epoch check: if this dispatch was superseded between claim and
2177            // execution start, terminalize it and abort without entering the runtime.
2178            let load_start = Instant::now();
2179            let current_dispatch_result = this.store.load_dispatch(&dispatch_id).await;
2180            record_mailbox_operation_result(
2181                "load_dispatch",
2182                result_label(&current_dispatch_result),
2183                load_start,
2184            );
2185            let current_dispatch = match current_dispatch_result {
2186                Ok(Some(current_dispatch)) => current_dispatch,
2187                Ok(None) => {
2188                    tracing::info!(dispatch_id, "dispatch disappeared before execution");
2189                    this.finish_execution(&thread_id, &dispatch_id).await;
2190                    return;
2191                }
2192                Err(error) => {
2193                    tracing::warn!(dispatch_id, error = %error, "failed to verify dispatch before execution");
2194                    this.finish_execution(&thread_id, &dispatch_id).await;
2195                    return;
2196                }
2197            };
2198            if current_dispatch.status != RunDispatchStatus::Claimed
2199                || current_dispatch.claim_token.as_deref() != Some(claim_token.as_str())
2200            {
2201                tracing::info!(dispatch_id, status = ?current_dispatch.status, "dispatch no longer owned by this worker, skipping execution");
2202                if current_dispatch.status == RunDispatchStatus::Superseded {
2203                    this.mark_superseded_dispatch_run_cancelled(
2204                        &current_dispatch,
2205                        "dispatch superseded before execution start",
2206                    )
2207                    .await;
2208                }
2209                this.finish_execution(&thread_id, &dispatch_id).await;
2210                return;
2211            }
2212            let epoch_start = Instant::now();
2213            let current_epoch_result = this.store.current_dispatch_epoch(&thread_id).await;
2214            record_mailbox_operation_result(
2215                "current_dispatch_epoch",
2216                result_label(&current_epoch_result),
2217                epoch_start,
2218            );
2219            match current_epoch_result {
2220                Ok(current_epoch) if current_dispatch.dispatch_epoch < current_epoch => {
2221                    tracing::info!(
2222                        dispatch_id,
2223                        thread_id,
2224                        dispatch_epoch = current_dispatch.dispatch_epoch,
2225                        current_epoch,
2226                        "dispatch superseded before execution start"
2227                    );
2228                    let supersede_reason = "claimed dispatch superseded before execution start";
2229                    let supersede_start = Instant::now();
2230                    let supersede_result = this
2231                        .store
2232                        .supersede_claimed(&dispatch_id, &claim_token, now_ms(), supersede_reason)
2233                        .await;
2234                    record_mailbox_operation_result(
2235                        "supersede_claimed",
2236                        result_label(&supersede_result),
2237                        supersede_start,
2238                    );
2239                    if supersede_result.is_ok() {
2240                        this.refresh_dispatch_depth_metrics().await;
2241                        this.mark_superseded_dispatch_run_cancelled(
2242                            &current_dispatch,
2243                            supersede_reason,
2244                        )
2245                        .await;
2246                    }
2247                    this.finish_execution(&thread_id, &dispatch_id).await;
2248                    return;
2249                }
2250                Ok(_) => {}
2251                Err(error) => {
2252                    tracing::warn!(dispatch_id, thread_id, error = %error, "failed to read dispatch epoch before execution");
2253                    this.finish_execution(&thread_id, &dispatch_id).await;
2254                    return;
2255                }
2256            }
2257
2258            let dispatch_instance_id = uuid::Uuid::now_v7().to_string();
2259            let start_now = now_ms();
2260            record_mailbox_dispatch_start_metrics(&dispatch, start_now);
2261            let mut request = match this.reconstruct_run_request(&dispatch).await {
2262                Ok(request) => request,
2263                Err(error) => {
2264                    tracing::error!(dispatch_id, error = %error, "failed to reconstruct run request from durable run record");
2265                    let now = now_ms();
2266                    record_mailbox_dispatch_completion_metrics(
2267                        &dispatch,
2268                        start_now,
2269                        now,
2270                        "permanent_error",
2271                    );
2272                    let msg = error.to_string();
2273                    let run_result = RunDispatchResult {
2274                        run_id: dispatch.run_id.clone(),
2275                        dispatch_instance_id: dispatch_instance_id.clone(),
2276                        status: awaken_contract::contract::lifecycle::RunStatus::Done,
2277                        termination: Some(
2278                            awaken_contract::contract::lifecycle::TerminationReason::Error(
2279                                msg.clone(),
2280                            ),
2281                        ),
2282                        response: None,
2283                        error: Some(msg.clone()),
2284                    };
2285                    let record_start = Instant::now();
2286                    let record_result = this
2287                        .store
2288                        .record_dispatch_start(
2289                            &dispatch_id,
2290                            &claim_token,
2291                            &dispatch_instance_id,
2292                            start_now,
2293                        )
2294                        .await;
2295                    record_mailbox_operation_result(
2296                        "record_dispatch_start",
2297                        result_label(&record_result),
2298                        record_start,
2299                    );
2300                    if let Err(error) = record_result {
2301                        tracing::warn!(dispatch_id, error = %error, "failed to record dispatch start for reconstruction failure");
2302                        if let Ok(Some(latest_dispatch)) =
2303                            this.store.load_dispatch(&dispatch_id).await
2304                            && latest_dispatch.status == RunDispatchStatus::Superseded
2305                        {
2306                            this.mark_superseded_dispatch_run_cancelled(
2307                                &latest_dispatch,
2308                                "dispatch superseded before reconstruction failure was recorded",
2309                            )
2310                            .await;
2311                        }
2312                        this.finish_execution(&thread_id, &dispatch_id).await;
2313                        return;
2314                    }
2315                    let record_result_start = Instant::now();
2316                    let record_run_result = this
2317                        .store
2318                        .record_run_result(&dispatch_id, &claim_token, &run_result, now)
2319                        .await;
2320                    record_mailbox_operation_result(
2321                        "record_run_result",
2322                        result_label(&record_run_result),
2323                        record_result_start,
2324                    );
2325                    let dead_letter_start = Instant::now();
2326                    let dead_letter_result = this
2327                        .store
2328                        .dead_letter(&dispatch_id, &claim_token, &msg, now)
2329                        .await;
2330                    record_mailbox_operation_result(
2331                        "dead_letter",
2332                        result_label(&dead_letter_result),
2333                        dead_letter_start,
2334                    );
2335                    if dead_letter_result.is_ok() {
2336                        this.refresh_dispatch_depth_metrics().await;
2337                        if let Ok(Some(dead_letter_dispatch)) =
2338                            this.store.load_dispatch(&dispatch_id).await
2339                            && dead_letter_dispatch.status == RunDispatchStatus::DeadLetter
2340                        {
2341                            this.mark_dead_letter_dispatch_run_error(&dead_letter_dispatch)
2342                                .await;
2343                        }
2344                    }
2345                    this.finish_execution(&thread_id, &dispatch_id).await;
2346                    return;
2347                }
2348            };
2349            normalize_mailbox_run_mode(&mut request, false);
2350            let run_id = dispatch.run_id.clone();
2351            request = request
2352                .with_dispatch_id(dispatch_id.clone())
2353                .with_session_id(dispatch_instance_id.clone());
2354            let record_start = Instant::now();
2355            let record_start_result = this
2356                .store
2357                .record_dispatch_start(&dispatch_id, &claim_token, &dispatch_instance_id, start_now)
2358                .await;
2359            record_mailbox_operation_result(
2360                "record_dispatch_start",
2361                result_label(&record_start_result),
2362                record_start,
2363            );
2364            if let Err(e) = record_start_result {
2365                tracing::warn!(dispatch_id, run_id, error = %e, "failed to record mailbox dispatch start; skipping execution");
2366                if let Ok(Some(latest_dispatch)) = this.store.load_dispatch(&dispatch_id).await
2367                    && latest_dispatch.status == RunDispatchStatus::Superseded
2368                {
2369                    this.mark_superseded_dispatch_run_cancelled(
2370                        &latest_dispatch,
2371                        "dispatch superseded before runtime start was recorded",
2372                    )
2373                    .await;
2374                }
2375                this.finish_execution(&thread_id, &dispatch_id).await;
2376                return;
2377            }
2378            let thread_ctx = {
2379                let workers = this.workers.read().await;
2380                workers.get(&thread_id).and_then(|worker| {
2381                    let w = worker.lock();
2382                    w.thread_ctx.as_ref().map(|ctx| {
2383                        ThreadContextSnapshot::new(
2384                            ctx.messages.clone(),
2385                            ctx.latest_run.clone(),
2386                            ctx.run_cache.clone(),
2387                        )
2388                    })
2389                })
2390            };
2391            let continue_run_id = request.continue_run_id.clone();
2392            let (inbox_sender, inbox_receiver) = awaken_runtime::inbox::inbox_channel_with_fallback(
2393                Arc::new(TaskDoneMailboxNotify::new(
2394                    this.clone(),
2395                    dispatch.thread_id.clone(),
2396                    continue_run_id,
2397                )),
2398            );
2399            request = request.with_inbox(inbox_sender, inbox_receiver);
2400
2401            let result = this
2402                .executor
2403                .run_with_thread_context(request, Arc::new(sink), thread_ctx)
2404                .await;
2405            let now = now_ms();
2406            let run_result = mailbox_run_result(&run_id, &dispatch_instance_id, &result);
2407            let record_result_start = Instant::now();
2408            let record_run_result = this
2409                .store
2410                .record_run_result(&dispatch_id, &claim_token, &run_result, now)
2411                .await;
2412            record_mailbox_operation_result(
2413                "record_run_result",
2414                result_label(&record_run_result),
2415                record_result_start,
2416            );
2417            if let Err(e) = record_run_result {
2418                tracing::warn!(dispatch_id, run_id, error = %e, "failed to record mailbox run result");
2419            }
2420
2421            let outcome = classify_error(&result);
2422            record_mailbox_dispatch_completion_metrics(
2423                &dispatch,
2424                start_now,
2425                now,
2426                outcome.metric_label(),
2427            );
2428
2429            match outcome {
2430                MailboxRunOutcome::Completed => {
2431                    let ack_start = Instant::now();
2432                    let ack_result = this.store.ack(&dispatch_id, &claim_token, now).await;
2433                    record_mailbox_operation_result("ack", result_label(&ack_result), ack_start);
2434                    if let Err(e) = ack_result {
2435                        tracing::warn!(dispatch_id, error = %e, "ack failed");
2436                    } else {
2437                        this.refresh_dispatch_depth_metrics().await;
2438                    }
2439                }
2440                MailboxRunOutcome::TransientError(msg) => {
2441                    tracing::warn!(dispatch_id, error = %msg, "run failed (transient), nacking");
2442                    // Emit error event so the SSE stream terminates with a
2443                    // proper RUN_ERROR instead of silently closing.
2444                    let _ = event_tx
2445                        .send(AgentEvent::RunFinish {
2446                            thread_id: dispatch.thread_id.clone(),
2447                            run_id: run_id.clone(),
2448                            identity: Some(mailbox_run_identity(
2449                                &dispatch,
2450                                &run_id,
2451                                &dispatch_instance_id,
2452                            )),
2453                            result: None,
2454                            termination:
2455                                awaken_contract::contract::lifecycle::TerminationReason::Error(
2456                                    msg.clone(),
2457                                ),
2458                        })
2459                        .await;
2460                    let backoff_factor = 2u64.pow(dispatch.attempt_count.saturating_sub(1).min(6));
2461                    let retry_at = now
2462                        + (this.config.default_retry_delay_ms * backoff_factor)
2463                            .min(this.config.max_retry_delay_ms);
2464                    let nack_start = Instant::now();
2465                    let nack_result = this
2466                        .store
2467                        .nack(&dispatch_id, &claim_token, retry_at, &msg, now)
2468                        .await;
2469                    record_mailbox_operation_result("nack", result_label(&nack_result), nack_start);
2470                    if let Err(e) = nack_result {
2471                        tracing::warn!(dispatch_id, error = %e, "nack failed");
2472                    } else {
2473                        this.refresh_dispatch_depth_metrics().await;
2474                    }
2475                }
2476                MailboxRunOutcome::PermanentError(msg) => {
2477                    tracing::warn!(dispatch_id, error = %msg, "run failed (permanent), dead-lettering");
2478                    // Emit error event so the SSE stream terminates with a
2479                    // proper RUN_ERROR. The runtime did not reach the loop,
2480                    // so no RunFinish was emitted — we must do it here.
2481                    let _ = event_tx
2482                        .send(AgentEvent::RunFinish {
2483                            thread_id: dispatch.thread_id.clone(),
2484                            run_id: run_id.clone(),
2485                            identity: Some(mailbox_run_identity(
2486                                &dispatch,
2487                                &run_id,
2488                                &dispatch_instance_id,
2489                            )),
2490                            result: None,
2491                            termination:
2492                                awaken_contract::contract::lifecycle::TerminationReason::Error(
2493                                    msg.clone(),
2494                                ),
2495                        })
2496                        .await;
2497                    let dead_letter_start = Instant::now();
2498                    let dead_letter_result = this
2499                        .store
2500                        .dead_letter(&dispatch_id, &claim_token, &msg, now)
2501                        .await;
2502                    record_mailbox_operation_result(
2503                        "dead_letter",
2504                        result_label(&dead_letter_result),
2505                        dead_letter_start,
2506                    );
2507                    if let Err(e) = dead_letter_result {
2508                        tracing::warn!(dispatch_id, error = %e, "dead_letter failed");
2509                    } else {
2510                        this.refresh_dispatch_depth_metrics().await;
2511                        if let Ok(Some(dead_letter_dispatch)) =
2512                            this.store.load_dispatch(&dispatch_id).await
2513                            && dead_letter_dispatch.status == RunDispatchStatus::DeadLetter
2514                        {
2515                            this.mark_dead_letter_dispatch_run_error(&dead_letter_dispatch)
2516                                .await;
2517                        }
2518                    }
2519                }
2520            }
2521
2522            this.finish_execution(&thread_id, &dispatch_id).await;
2523        });
2524    }
2525
2526    async fn finish_execution(self: &Arc<Self>, thread_id: &str, dispatch_id: &str) {
2527        // Abort lease renewal and return the worker to Idle.
2528        let worker = self.get_or_create_worker(thread_id).await;
2529        {
2530            let mut w = worker.lock();
2531            let should_transition = matches!(
2532                &w.status,
2533                MailboxWorkerStatus::Running { dispatch_id: cid, .. } if cid == dispatch_id
2534            );
2535            if should_transition {
2536                // Take ownership of the old status to abort the lease handle.
2537                let old = std::mem::replace(&mut w.status, MailboxWorkerStatus::Idle);
2538                w.thread_ctx = None;
2539                if let MailboxWorkerStatus::Running { lease_handle, .. } = old {
2540                    lease_handle.abort();
2541                }
2542            }
2543        }
2544
2545        // Try to execute the next queued dispatch for this thread.
2546        self.try_dispatch_next(thread_id).await;
2547    }
2548
2549    /// Get or create a per-thread worker.
2550    async fn get_or_create_worker(&self, thread_id: &str) -> Arc<SyncMutex<MailboxWorker>> {
2551        // Fast path: read lock.
2552        {
2553            let workers = self.workers.read().await;
2554            if let Some(w) = workers.get(thread_id) {
2555                return Arc::clone(w);
2556            }
2557        }
2558        // Slow path: write lock.
2559        let mut workers = self.workers.write().await;
2560        Arc::clone(
2561            workers
2562                .entry(thread_id.to_string())
2563                .or_insert_with(|| Arc::new(SyncMutex::new(MailboxWorker::default()))),
2564        )
2565    }
2566
2567    /// Create or update the durable run truth before enqueuing a dispatch.
2568    async fn prepare_run_for_dispatch(
2569        &self,
2570        request: &mut RunRequest,
2571        thread_id: &str,
2572        messages: &[Message],
2573    ) -> Result<String, MailboxError> {
2574        if request.continue_run_id.is_none()
2575            && request.run_id_hint.is_none()
2576            && let Some(waiting_run_id) = self.reusable_waiting_run_id(thread_id).await
2577        {
2578            request.continue_run_id = Some(waiting_run_id);
2579        }
2580
2581        let run_id = request
2582            .continue_run_id
2583            .clone()
2584            .or_else(|| request.run_id_hint.clone())
2585            .filter(|id| !id.trim().is_empty())
2586            .unwrap_or_else(|| uuid::Uuid::now_v7().to_string());
2587        if request.continue_run_id.is_none() {
2588            request.run_id_hint = Some(run_id.clone());
2589        }
2590
2591        let normalized_messages = normalize_message_ids(messages);
2592        let existing_messages = self
2593            .run_store
2594            .load_messages(thread_id)
2595            .await?
2596            .unwrap_or_default();
2597        let previous_run = self.run_store.latest_run(thread_id).await?;
2598        let mut appended_messages = existing_messages;
2599        appended_messages.extend(normalized_messages.iter().cloned());
2600        let input_message_ids = normalized_messages
2601            .iter()
2602            .filter_map(|message| message.id.clone())
2603            .collect::<Vec<_>>();
2604        let request_extras = RunRequestExtras::from_request(request)
2605            .to_value()
2606            .map_err(|e| {
2607                MailboxError::Internal(format!("failed to serialize request extras: {e}"))
2608            })?;
2609        let request_snapshot = RunRequestSnapshot {
2610            origin: request.origin,
2611            sender_id: None,
2612            input_message_ids: input_message_ids.clone(),
2613            input_message_count: normalized_messages.len() as u64,
2614            request_extras,
2615            decisions: request
2616                .decisions
2617                .iter()
2618                .map(|(call_id, resume)| RunResumeDecision {
2619                    call_id: call_id.clone(),
2620                    resume: resume.clone(),
2621                })
2622                .collect(),
2623            frontend_tools: request.frontend_tools.clone(),
2624            parent_thread_id: request.parent_thread_id.clone(),
2625            transport_request_id: request.transport_request_id.clone(),
2626        };
2627        let input = Some(RunMessageInput {
2628            thread_id: thread_id.to_string(),
2629            range: MessageSeqRange::new(1, appended_messages.len() as u64),
2630            trigger_message_ids: input_message_ids,
2631            selected_message_ids: Vec::new(),
2632            context_policy: None,
2633            compacted_snapshot_id: None,
2634        });
2635
2636        let existing_run = self.run_store.load_run(&run_id).await?;
2637        if let Some(mut existing) = existing_run {
2638            if existing.thread_id != thread_id {
2639                return Err(MailboxError::Validation(format!(
2640                    "run_id '{run_id}' belongs to thread '{}', not '{thread_id}'",
2641                    existing.thread_id
2642                )));
2643            }
2644            if existing.status != RunStatus::Created && !existing.is_resumable_waiting() {
2645                return Err(MailboxError::Validation(format!(
2646                    "run_id '{run_id}' is not open for dispatch"
2647                )));
2648            }
2649            existing.request = Some(request_snapshot);
2650            existing.input = input;
2651            existing.updated_at = now_ms() / 1000;
2652            self.run_store
2653                .checkpoint(thread_id, &appended_messages, &existing)
2654                .await?;
2655            {
2656                let workers = self.workers.read().await;
2657                if let Some(worker) = workers.get(thread_id) {
2658                    let mut w = worker.lock();
2659                    if let Some(ref mut ctx) = w.thread_ctx {
2660                        ctx.apply_checkpoint(&appended_messages, &existing);
2661                    }
2662                }
2663            }
2664            return Ok(run_id);
2665        }
2666
2667        let inferred_agent_id = request
2668            .agent_id
2669            .clone()
2670            .or_else(|| {
2671                previous_run.as_ref().and_then(|run| {
2672                    (run.status != RunStatus::Created && !run.agent_id.trim().is_empty())
2673                        .then(|| run.agent_id.clone())
2674                })
2675            })
2676            .unwrap_or_else(|| "default".to_string());
2677        let inherited_state = previous_run
2678            .as_ref()
2679            .filter(|run| run.status != RunStatus::Created)
2680            .and_then(|run| run.state.clone());
2681        let now = now_ms() / 1000;
2682        let record = RunRecord {
2683            run_id: run_id.clone(),
2684            thread_id: thread_id.to_string(),
2685            agent_id: inferred_agent_id,
2686            parent_run_id: request.parent_run_id.clone(),
2687            request: Some(request_snapshot),
2688            input,
2689            output: None,
2690            status: RunStatus::Created,
2691            termination_reason: None,
2692            final_output: None,
2693            error_payload: None,
2694            dispatch_id: None,
2695            session_id: None,
2696            transport_request_id: request.transport_request_id.clone(),
2697            waiting: None,
2698            outcome: None,
2699            created_at: now,
2700            started_at: None,
2701            finished_at: None,
2702            updated_at: now,
2703            steps: 0,
2704            input_tokens: 0,
2705            output_tokens: 0,
2706            state: inherited_state,
2707        };
2708        self.run_store
2709            .checkpoint(thread_id, &appended_messages, &record)
2710            .await?;
2711        {
2712            let workers = self.workers.read().await;
2713            if let Some(worker) = workers.get(thread_id) {
2714                let mut w = worker.lock();
2715                if let Some(ref mut ctx) = w.thread_ctx {
2716                    ctx.apply_checkpoint(&appended_messages, &record);
2717                }
2718            }
2719        }
2720        Ok(run_id)
2721    }
2722
2723    /// Build a RunDispatch from the durable run prepared above.
2724    fn build_dispatch(
2725        &self,
2726        request: &RunRequest,
2727        thread_id: &str,
2728    ) -> Result<RunDispatch, MailboxError> {
2729        let run_id = request
2730            .continue_run_id
2731            .clone()
2732            .or_else(|| request.run_id_hint.clone())
2733            .ok_or_else(|| MailboxError::Internal("run_id missing after preparation".into()))?;
2734        let now = now_ms();
2735        Ok(RunDispatch {
2736            dispatch_id: request
2737                .dispatch_id_hint
2738                .clone()
2739                .unwrap_or_else(|| uuid::Uuid::now_v7().to_string()),
2740            thread_id: thread_id.to_string(),
2741            run_id,
2742            priority: 128,
2743            dedupe_key: None,
2744            dispatch_epoch: 0,
2745            status: RunDispatchStatus::Queued,
2746            available_at: now,
2747            attempt_count: 0,
2748            max_attempts: self.config.default_max_attempts,
2749            last_error: None,
2750            claim_token: None,
2751            claimed_by: None,
2752            lease_until: None,
2753            dispatch_instance_id: None,
2754            run_status: None,
2755            termination: None,
2756            run_response: None,
2757            run_error: None,
2758            completed_at: None,
2759            created_at: now,
2760            updated_at: now,
2761        })
2762    }
2763
2764    async fn reconstruct_run_request(
2765        &self,
2766        dispatch: &RunDispatch,
2767    ) -> Result<RunRequest, MailboxError> {
2768        let run = {
2769            let cached = {
2770                let workers = self.workers.read().await;
2771                workers.get(&dispatch.thread_id).and_then(|w| {
2772                    let w = w.lock();
2773                    w.thread_ctx
2774                        .as_ref()
2775                        .and_then(|ctx| ctx.get_run(&dispatch.run_id).cloned())
2776                })
2777            };
2778            if let Some(run) = cached {
2779                run
2780            } else {
2781                self.run_store
2782                    .load_run(&dispatch.run_id)
2783                    .await?
2784                    .ok_or_else(|| {
2785                        MailboxError::Validation(format!(
2786                            "run '{}' not found for dispatch '{}'",
2787                            dispatch.run_id, dispatch.dispatch_id
2788                        ))
2789                    })?
2790            }
2791        };
2792        if run.thread_id != dispatch.thread_id {
2793            return Err(MailboxError::Validation(format!(
2794                "run '{}' belongs to thread '{}', not dispatch thread '{}'",
2795                run.run_id, run.thread_id, dispatch.thread_id
2796            )));
2797        }
2798        let snapshot = run.request.clone().ok_or_else(|| {
2799            MailboxError::Validation(format!("run '{}' has no request snapshot", run.run_id))
2800        })?;
2801        let activation_messages = self.activation_messages_for_run(&run, &snapshot).await?;
2802        let mut request = RunRequest::new(run.thread_id.clone(), activation_messages)
2803            .with_messages_already_persisted(true)
2804            .with_origin(snapshot.origin)
2805            .with_run_mode(RunMode::Resume)
2806            .with_adapter(AdapterKind::Internal);
2807        if !run.agent_id.trim().is_empty() {
2808            request = request.with_agent_id(run.agent_id.clone());
2809        }
2810        if let Some(parent_run_id) = run.parent_run_id.clone() {
2811            request = request.with_parent_run_id(parent_run_id);
2812        }
2813        if let Some(parent_thread_id) = snapshot.parent_thread_id.clone() {
2814            request = request.with_parent_thread_id(parent_thread_id);
2815        }
2816        if let Some(transport_request_id) = snapshot.transport_request_id.clone() {
2817            request = request.with_transport_request_id(transport_request_id);
2818        }
2819        if !snapshot.decisions.is_empty() {
2820            request = request.with_decisions(
2821                snapshot
2822                    .decisions
2823                    .iter()
2824                    .map(|decision| (decision.call_id.clone(), decision.resume.clone()))
2825                    .collect(),
2826            );
2827        }
2828        if !snapshot.frontend_tools.is_empty() {
2829            request = request.with_frontend_tools(snapshot.frontend_tools.clone());
2830        }
2831        if let Some(extras_value) = snapshot.request_extras.as_ref() {
2832            let extras = RunRequestExtras::from_value(extras_value).map_err(|error| {
2833                MailboxError::Validation(format!("corrupt request_extras: {error}"))
2834            })?;
2835            request = extras.apply_to(request);
2836        }
2837        request = if run.is_resumable_waiting() {
2838            request.with_continue_run_id(run.run_id.clone())
2839        } else {
2840            request.with_run_id_hint(run.run_id.clone())
2841        };
2842        Ok(request.with_trace_dispatch_id(dispatch.dispatch_id.clone()))
2843    }
2844
2845    async fn activation_messages_for_run(
2846        &self,
2847        run: &RunRecord,
2848        snapshot: &RunRequestSnapshot,
2849    ) -> Result<Vec<Message>, MailboxError> {
2850        if snapshot.input_message_ids.is_empty() {
2851            return self.activation_messages_from_range(run, snapshot).await;
2852        }
2853        // Try cache first for message lookups.
2854        let cached_messages: Option<Vec<Message>> = {
2855            let workers = self.workers.read().await;
2856            workers.get(&run.thread_id).and_then(|w| {
2857                let w = w.lock();
2858                w.thread_ctx.as_ref().and_then(|ctx| {
2859                    let mut msgs = Vec::with_capacity(snapshot.input_message_ids.len());
2860                    for msg_id in &snapshot.input_message_ids {
2861                        let found = ctx
2862                            .messages
2863                            .iter()
2864                            .find(|m| m.id.as_deref() == Some(msg_id.as_str()));
2865                        msgs.push(found?.clone());
2866                    }
2867                    Some(msgs)
2868                })
2869            })
2870        };
2871        if let Some(msgs) = cached_messages {
2872            return Ok(msgs);
2873        }
2874        let mut messages = Vec::with_capacity(snapshot.input_message_ids.len());
2875        for message_id in &snapshot.input_message_ids {
2876            let record = self
2877                .run_store
2878                .load_message_record(&run.thread_id, message_id)
2879                .await?
2880                .ok_or_else(|| {
2881                    MailboxError::Validation(format!(
2882                        "message '{message_id}' not found for run '{}'",
2883                        run.run_id
2884                    ))
2885                })?;
2886            messages.push(record.message);
2887        }
2888        Ok(messages)
2889    }
2890
2891    async fn activation_messages_from_range(
2892        &self,
2893        run: &RunRecord,
2894        snapshot: &RunRequestSnapshot,
2895    ) -> Result<Vec<Message>, MailboxError> {
2896        let Some(input) = run.input.as_ref() else {
2897            return Ok(Vec::new());
2898        };
2899        let Some(range) = input.range else {
2900            return Ok(Vec::new());
2901        };
2902        let count = snapshot.input_message_count;
2903        if count == 0 {
2904            return Ok(Vec::new());
2905        }
2906        let from_seq = range.to_seq.saturating_sub(count).saturating_add(1);
2907        let Some(range) = MessageSeqRange::new(from_seq.max(range.from_seq), range.to_seq) else {
2908            return Ok(Vec::new());
2909        };
2910        let records = self
2911            .run_store
2912            .load_message_records_range(&run.thread_id, range)
2913            .await?;
2914        Ok(records.into_iter().map(|record| record.message).collect())
2915    }
2916
2917    // ── Maintenance ──────────────────────────────────────────────────
2918
2919    async fn run_sweep(self: &Arc<Self>) {
2920        let now = now_ms();
2921        let reclaim_start = Instant::now();
2922        let reclaim_result = self.store.reclaim_expired_leases(now, 100).await;
2923        record_mailbox_operation_result("reclaim", result_label(&reclaim_result), reclaim_start);
2924        match reclaim_result {
2925            Ok(reclaimed) => {
2926                crate::metrics::inc_mailbox_operation_by(
2927                    "reclaim_dispatch",
2928                    "ok",
2929                    reclaimed.len() as u64,
2930                );
2931                if !reclaimed.is_empty() {
2932                    tracing::info!(count = reclaimed.len(), "sweep reclaimed expired leases");
2933                    self.refresh_dispatch_depth_metrics().await;
2934                    for dispatch in reclaimed {
2935                        self.reconcile_terminal_dispatch(&dispatch).await;
2936                        if dispatch.status == RunDispatchStatus::Queued {
2937                            let thread_id = dispatch.thread_id.clone();
2938                            self.get_or_create_worker(&thread_id).await;
2939                            self.try_dispatch_next(&thread_id).await;
2940                        }
2941                    }
2942                }
2943                self.reconcile_terminal_dispatches().await;
2944            }
2945            Err(e) => {
2946                tracing::warn!(error = %e, "sweep failed");
2947            }
2948        }
2949    }
2950
2951    async fn run_gc(&self) {
2952        let now = now_ms();
2953        let gc_ttl_ms = self.config.gc_ttl.as_millis() as u64;
2954        let older_than = now.saturating_sub(gc_ttl_ms);
2955        let purge_start = Instant::now();
2956        let purge_result = self.store.purge_terminal(older_than).await;
2957        record_mailbox_operation_result("purge_terminal", result_label(&purge_result), purge_start);
2958        match purge_result {
2959            Ok(purged) => {
2960                crate::metrics::inc_mailbox_operation_by("purged", "ok", purged as u64);
2961                if purged > 0 {
2962                    tracing::info!(purged, "GC purged terminal dispatches");
2963                    self.refresh_dispatch_depth_metrics().await;
2964                }
2965            }
2966            Err(e) => {
2967                tracing::warn!(error = %e, "GC failed");
2968            }
2969        }
2970
2971        // Clean up idle workers with no queued dispatches.
2972        self.gc_idle_workers().await;
2973    }
2974
2975    /// Remove workers in `Idle` state that have no queued dispatches in the store.
2976    ///
2977    /// This prevents the `workers` HashMap from growing unbounded as new
2978    /// threads are created and their runs complete.
2979    async fn gc_idle_workers(&self) {
2980        let idle_keys: Vec<String> = {
2981            let workers = self.workers.read().await;
2982            let mut keys = Vec::new();
2983            for (thread_id, worker) in workers.iter() {
2984                let w = worker.lock();
2985                if matches!(w.status, MailboxWorkerStatus::Idle) {
2986                    keys.push(thread_id.clone());
2987                }
2988            }
2989            keys
2990        };
2991
2992        if idle_keys.is_empty() {
2993            return;
2994        }
2995
2996        // Check the store without holding the workers write lock. Remote stores
2997        // may block on network or disk I/O; keeping the lock during those awaits
2998        // would stall submissions, reconnects, and dispatch transitions.
2999        let mut removable = Vec::new();
3000        for thread_id in &idle_keys {
3001            let has_queued = self
3002                .store
3003                .list_dispatches(
3004                    thread_id,
3005                    Some(&[RunDispatchStatus::Queued, RunDispatchStatus::Claimed]),
3006                    1,
3007                    0,
3008                )
3009                .await
3010                .map(|dispatches| !dispatches.is_empty())
3011                .unwrap_or(true); // Err → keep worker to be safe
3012
3013            if !has_queued {
3014                removable.push(thread_id.clone());
3015            }
3016        }
3017
3018        if removable.is_empty() {
3019            return;
3020        }
3021
3022        let mut removed = 0usize;
3023        let mut workers = self.workers.write().await;
3024        for thread_id in removable {
3025            // Re-check under write lock: status might have changed while the
3026            // store query was in flight.
3027            let still_idle = if let Some(worker) = workers.get(&thread_id) {
3028                let w = worker.lock();
3029                matches!(w.status, MailboxWorkerStatus::Idle)
3030            } else {
3031                false
3032            };
3033            if still_idle {
3034                workers.remove(&thread_id);
3035                removed += 1;
3036            }
3037        }
3038
3039        if removed > 0 {
3040            tracing::debug!(removed, "GC removed idle workers");
3041        }
3042    }
3043}
3044
3045/// Revert worker from Claiming → Idle, but only if still in Claiming state.
3046/// Prevents overwriting a Running state set by a concurrent dispatch.
3047async fn revert_claiming_to_idle(
3048    workers: &RwLock<HashMap<String, Arc<SyncMutex<MailboxWorker>>>>,
3049    thread_id: &str,
3050) {
3051    let workers = workers.read().await;
3052    if let Some(worker) = workers.get(thread_id) {
3053        let mut w = worker.lock();
3054        if matches!(w.status, MailboxWorkerStatus::Claiming) {
3055            w.status = MailboxWorkerStatus::Idle;
3056        }
3057    }
3058}
3059
3060// ── Free functions ───────────────────────────────────────────────────
3061
3062fn normalize_mailbox_run_mode(request: &mut RunRequest, background: bool) {
3063    if request.run_mode != RunMode::Foreground {
3064        return;
3065    }
3066
3067    request.run_mode = if !request.decisions.is_empty() || request.continue_run_id.is_some() {
3068        RunMode::Resume
3069    } else if matches!(
3070        request.origin,
3071        awaken_contract::contract::storage::RunRequestOrigin::Internal
3072    ) {
3073        RunMode::InternalWake
3074    } else if background {
3075        RunMode::Scheduled
3076    } else {
3077        RunMode::Foreground
3078    };
3079}
3080
3081/// Validate and normalize run request inputs.
3082///
3083/// Checks that messages are non-empty, trims/generates thread_id.
3084/// Returns `(thread_id, messages)`.
3085/// Internal validation for mailbox submit paths.
3086fn validate_run_inputs(
3087    thread_id: String,
3088    messages: Vec<Message>,
3089    allow_empty_messages: bool,
3090) -> Result<(String, Vec<Message>), MailboxError> {
3091    if messages.is_empty() && !allow_empty_messages {
3092        return Err(MailboxError::Validation(
3093            "at least one message is required".to_string(),
3094        ));
3095    }
3096    let thread_id = {
3097        let trimmed = thread_id.trim().to_string();
3098        if trimmed.is_empty() {
3099            uuid::Uuid::now_v7().to_string()
3100        } else {
3101            trimmed
3102        }
3103    };
3104    Ok((thread_id, messages))
3105}
3106
3107fn normalize_message_ids(messages: &[Message]) -> Vec<Message> {
3108    messages
3109        .iter()
3110        .cloned()
3111        .map(|mut message| {
3112            if message.id.as_deref().map(str::is_empty).unwrap_or(true) {
3113                message.id = Some(awaken_contract::contract::message::gen_message_id());
3114            }
3115            message
3116        })
3117        .collect()
3118}
3119
3120fn live_target_for_dispatch(dispatch: &RunDispatch) -> LiveRunTarget {
3121    LiveRunTarget::new(dispatch.thread_id.clone(), dispatch.run_id.clone())
3122        .with_dispatch_id(dispatch.dispatch_id.clone())
3123}
3124
3125fn live_target_for_run(run: &RunRecord) -> LiveRunTarget {
3126    let mut target = LiveRunTarget::new(run.thread_id.clone(), run.run_id.clone());
3127    if let Some(dispatch_id) = run.dispatch_id.clone() {
3128        target = target.with_dispatch_id(dispatch_id);
3129    }
3130    target
3131}
3132
3133fn mailbox_run_result(
3134    run_id: &str,
3135    dispatch_instance_id: &str,
3136    result: &Result<
3137        awaken_runtime::loop_runner::AgentRunResult,
3138        awaken_runtime::loop_runner::AgentLoopError,
3139    >,
3140) -> RunDispatchResult {
3141    use awaken_contract::contract::lifecycle::{RunStatus, TerminationReason};
3142
3143    match result {
3144        Ok(run) => {
3145            let (status, _) = run.termination.to_run_status();
3146            RunDispatchResult {
3147                run_id: run.run_id.clone(),
3148                dispatch_instance_id: dispatch_instance_id.to_string(),
3149                status,
3150                termination: Some(run.termination.clone()),
3151                response: (!run.response.is_empty()).then(|| run.response.clone()),
3152                error: match &run.termination {
3153                    TerminationReason::Error(message) => Some(message.clone()),
3154                    _ => None,
3155                },
3156            }
3157        }
3158        Err(error) => RunDispatchResult {
3159            run_id: run_id.to_string(),
3160            dispatch_instance_id: dispatch_instance_id.to_string(),
3161            status: RunStatus::Done,
3162            termination: Some(TerminationReason::Error(error.to_string())),
3163            response: None,
3164            error: Some(error.to_string()),
3165        },
3166    }
3167}
3168
3169fn mailbox_run_identity(
3170    dispatch: &RunDispatch,
3171    run_id: &str,
3172    dispatch_instance_id: &str,
3173) -> awaken_contract::contract::identity::RunIdentity {
3174    awaken_contract::contract::identity::RunIdentity::new(
3175        dispatch.thread_id.clone(),
3176        None,
3177        run_id.to_string(),
3178        None,
3179        String::new(),
3180        awaken_contract::contract::identity::RunOrigin::Internal,
3181    )
3182    .with_dispatch_id(dispatch.dispatch_id.clone())
3183    .with_session_id(dispatch_instance_id.to_string())
3184}
3185
/// Convert a millisecond count into fractional seconds.
fn millis_to_seconds(ms: u64) -> f64 {
    const MILLIS_PER_SECOND: f64 = 1_000.0;
    (ms as f64) / MILLIS_PER_SECOND
}
3189
3190fn record_mailbox_dispatch_start_metrics(dispatch: &RunDispatch, start_now: u64) {
3191    let enqueue_to_start_ms = start_now.saturating_sub(dispatch.created_at);
3192    let eligible_to_start_ms = start_now.saturating_sub(dispatch.available_at);
3193    let claim_to_start_ms = start_now.saturating_sub(dispatch.updated_at);
3194
3195    crate::metrics::record_mailbox_dispatch_enqueue_to_start(millis_to_seconds(
3196        enqueue_to_start_ms,
3197    ));
3198    crate::metrics::record_mailbox_dispatch_eligible_to_start(millis_to_seconds(
3199        eligible_to_start_ms,
3200    ));
3201    crate::metrics::record_mailbox_dispatch_claim_to_start(millis_to_seconds(claim_to_start_ms));
3202
3203    tracing::info!(
3204        dispatch_id = %dispatch.dispatch_id,
3205        run_id = %dispatch.run_id,
3206        thread_id = %dispatch.thread_id,
3207        enqueue_to_start_ms,
3208        eligible_to_start_ms,
3209        claim_to_start_ms,
3210        "mailbox dispatch processing started"
3211    );
3212}
3213
3214fn record_mailbox_dispatch_completion_metrics(
3215    dispatch: &RunDispatch,
3216    start_now: u64,
3217    completed_now: u64,
3218    outcome: &str,
3219) {
3220    let runtime_ms = completed_now.saturating_sub(start_now);
3221    let enqueue_to_complete_ms = completed_now.saturating_sub(dispatch.created_at);
3222
3223    crate::metrics::record_mailbox_dispatch_runtime(millis_to_seconds(runtime_ms), outcome);
3224    crate::metrics::record_mailbox_dispatch_enqueue_to_complete(
3225        millis_to_seconds(enqueue_to_complete_ms),
3226        outcome,
3227    );
3228    crate::metrics::record_run_completion(millis_to_seconds(runtime_ms), outcome);
3229
3230    tracing::info!(
3231        dispatch_id = %dispatch.dispatch_id,
3232        run_id = %dispatch.run_id,
3233        thread_id = %dispatch.thread_id,
3234        outcome,
3235        runtime_ms,
3236        enqueue_to_complete_ms,
3237        "mailbox dispatch processing completed"
3238    );
3239}
3240
3241fn record_mailbox_dispatch_terminal_metrics(dispatch: &RunDispatch, outcome: &str) {
3242    let completed_now = dispatch.completed_at.unwrap_or_else(now_ms);
3243    record_mailbox_dispatch_completion_metrics(dispatch, completed_now, completed_now, outcome);
3244}
3245
3246fn record_mailbox_operation_result(operation: &str, result: &str, start: Instant) {
3247    crate::metrics::record_mailbox_operation(operation, result, start.elapsed().as_secs_f64());
3248}
3249
3250fn dispatch_signal_blocked_nack_delay(redelivery_attempts: Option<u64>) -> Duration {
3251    let exponent = redelivery_attempts.unwrap_or(1).saturating_sub(1).min(16);
3252    let multiplier = 1u32.checked_shl(exponent as u32).unwrap_or(u32::MAX);
3253    dispatch_signal_nack_base_delay()
3254        .saturating_mul(multiplier)
3255        .min(dispatch_signal_nack_max_delay())
3256}
3257
3258fn dispatch_signal_batch_size() -> usize {
3259    env_usize(DISPATCH_SIGNAL_BATCH_ENV, DISPATCH_SIGNAL_BATCH_DEFAULT)
3260}
3261
3262fn dispatch_signal_fetch_expires() -> Duration {
3263    env_duration_ms(DISPATCH_SIGNAL_EXPIRES_ENV, DISPATCH_SIGNAL_EXPIRES_DEFAULT)
3264}
3265
3266fn dispatch_signal_nack_base_delay() -> Duration {
3267    env_duration_ms(
3268        DISPATCH_SIGNAL_NACK_BASE_DELAY_ENV,
3269        DISPATCH_SIGNAL_BLOCKED_NACK_BASE_DELAY_DEFAULT,
3270    )
3271}
3272
3273fn dispatch_signal_nack_max_delay() -> Duration {
3274    env_duration_ms(
3275        DISPATCH_SIGNAL_NACK_MAX_DELAY_ENV,
3276        DISPATCH_SIGNAL_BLOCKED_NACK_MAX_DELAY_DEFAULT,
3277    )
3278}
3279
3280fn dispatch_signal_max_concurrent_handlers() -> usize {
3281    env_usize(
3282        DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_ENV,
3283        DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_DEFAULT,
3284    )
3285}
3286
/// Read a positive integer from the environment variable `name`.
///
/// Surrounding whitespace in the value is ignored, so a value such as
/// `"42\n"` (common when the variable is populated from a file) still parses.
/// Returns `default` when the variable is unset, is not valid Unicode, does
/// not parse as a `usize`, or is zero.
fn env_usize(name: &str, default: usize) -> usize {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        // Zero is treated as "not configured" and falls back to the default.
        .filter(|parsed| *parsed > 0)
        .unwrap_or(default)
}
3294
/// Read a positive millisecond count from the environment variable `name`
/// and return it as a [`Duration`].
///
/// Surrounding whitespace in the value is ignored, so a value such as
/// `"250\n"` (common when the variable is populated from a file) still parses.
/// Returns `default` when the variable is unset, is not valid Unicode, does
/// not parse as a `u64`, or is zero.
fn env_duration_ms(name: &str, default: Duration) -> Duration {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        // Zero is treated as "not configured" and falls back to the default.
        .filter(|millis| *millis > 0)
        .map(Duration::from_millis)
        .unwrap_or(default)
}
3303
/// Metric label for a result: `"ok"` for `Ok`, `"error"` for `Err`.
fn result_label<T, E>(result: &Result<T, E>) -> &'static str {
    match result {
        Ok(_) => "ok",
        Err(_) => "error",
    }
}
3307
3308fn dispatch_status_label(status: RunDispatchStatus) -> &'static str {
3309    match status {
3310        RunDispatchStatus::Queued => "queued",
3311        RunDispatchStatus::Claimed => "claimed",
3312        RunDispatchStatus::Acked => "acked",
3313        RunDispatchStatus::Cancelled => "cancelled",
3314        RunDispatchStatus::Superseded => "superseded",
3315        RunDispatchStatus::DeadLetter => "dead_letter",
3316    }
3317}
3318
3319/// Classify a runtime run result for ack/nack/dead_letter.
3320fn classify_error(
3321    result: &Result<
3322        awaken_runtime::loop_runner::AgentRunResult,
3323        awaken_runtime::loop_runner::AgentLoopError,
3324    >,
3325) -> MailboxRunOutcome {
3326    match result {
3327        Ok(_) => MailboxRunOutcome::Completed,
3328        Err(e) => {
3329            use awaken_runtime::loop_runner::AgentLoopError;
3330            match e {
3331                AgentLoopError::RuntimeError(re) => {
3332                    use awaken_runtime::RuntimeError;
3333                    match re {
3334                        RuntimeError::ThreadAlreadyRunning { .. } => {
3335                            // After the cancel-on-submit change, this error
3336                            // indicates a race that retrying won't fix.
3337                            MailboxRunOutcome::PermanentError(e.to_string())
3338                        }
3339                        RuntimeError::AgentNotFound { .. } | RuntimeError::ResolveFailed { .. } => {
3340                            MailboxRunOutcome::PermanentError(e.to_string())
3341                        }
3342                        _ => MailboxRunOutcome::TransientError(e.to_string()),
3343                    }
3344                }
3345                AgentLoopError::StorageError(_) => MailboxRunOutcome::TransientError(e.to_string()),
3346                AgentLoopError::InferenceFailed(_) => {
3347                    MailboxRunOutcome::TransientError(e.to_string())
3348                }
3349                // Agent-level failures (phase error, invalid resume) are not infra errors.
3350                _ => MailboxRunOutcome::Completed,
3351            }
3352        }
3353    }
3354}
3355
3356// ── Tests ────────────────────────────────────────────────────────────
3357
3358#[cfg(test)]
3359mod tests {
3360    use super::*;
3361    use async_trait::async_trait;
3362    use awaken_contract::contract::content::ContentBlock;
3363    use awaken_contract::contract::executor::{
3364        InferenceExecutionError, InferenceRequest, LlmExecutor,
3365    };
3366    use awaken_contract::contract::inference::{StopReason, StreamResult};
3367    use awaken_contract::contract::lifecycle::{RunStatus, TerminationReason};
3368    use awaken_contract::contract::message::{Message, ToolCall};
3369    use awaken_contract::contract::storage::RunRequestOrigin;
3370    use awaken_contract::contract::storage::{
3371        RunRecord, RunStore, RunWaitingState, ThreadRunStore, ThreadStore, WaitingReason,
3372    };
3373    use awaken_contract::contract::tool::{
3374        Tool, ToolCallContext, ToolDescriptor, ToolError, ToolOutput, ToolResult,
3375    };
3376    use awaken_contract::thread::Thread;
3377    use awaken_runtime::extensions::background::{
3378        BackgroundTaskManager, BackgroundTaskPlugin, TaskParentContext,
3379        TaskResult as BackgroundTaskResult,
3380    };
3381    use awaken_runtime::loop_runner::build_agent_env;
3382    use awaken_runtime::{Plugin, ResolvedAgent};
3383    use awaken_stores::{InMemoryMailboxStore, InMemoryStore};
3384    use serde_json::{Value, json};
3385    use std::collections::VecDeque;
3386    use std::sync::Mutex as StdMutex;
3387    use std::sync::atomic::AtomicUsize;
3388    use tokio::time::{Duration, Instant, sleep};
3389
3390    // ── Helper ───────────────────────────────────────────────────────
3391
3392    /// Stub resolver that always returns an error (no agents registered).
3393    struct StubResolver;
3394    impl awaken_runtime::AgentResolver for StubResolver {
3395        fn resolve(
3396            &self,
3397            agent_id: &str,
3398        ) -> Result<awaken_runtime::ResolvedAgent, awaken_runtime::RuntimeError> {
3399            Err(awaken_runtime::RuntimeError::AgentNotFound {
3400                agent_id: agent_id.to_string(),
3401            })
3402        }
3403    }
3404
3405    fn make_store() -> Arc<InMemoryMailboxStore> {
3406        Arc::new(InMemoryMailboxStore::new())
3407    }
3408
3409    fn make_resume() -> ToolCallResume {
3410        ToolCallResume {
3411            decision_id: "d1".into(),
3412            action: awaken_contract::contract::suspension::ResumeDecisionAction::Resume,
3413            result: serde_json::json!({"approved": true}),
3414            reason: None,
3415            updated_at: 0,
3416        }
3417    }
3418
3419    struct RecoverFlakyMailboxStore {
3420        inner: InMemoryMailboxStore,
3421        reclaim_failures_remaining: AtomicUsize,
3422        reclaim_calls: AtomicUsize,
3423    }
3424
3425    impl RecoverFlakyMailboxStore {
3426        fn new(reclaim_failures: usize) -> Self {
3427            Self {
3428                inner: InMemoryMailboxStore::new(),
3429                reclaim_failures_remaining: AtomicUsize::new(reclaim_failures),
3430                reclaim_calls: AtomicUsize::new(0),
3431            }
3432        }
3433
3434        fn reclaim_calls(&self) -> usize {
3435            self.reclaim_calls.load(Ordering::SeqCst)
3436        }
3437    }
3438
3439    #[async_trait::async_trait]
3440    impl MailboxStore for RecoverFlakyMailboxStore {
3441        async fn enqueue(&self, dispatch: &RunDispatch) -> Result<(), StorageError> {
3442            self.inner.enqueue(dispatch).await
3443        }
3444
3445        async fn claim(
3446            &self,
3447            thread_id: &str,
3448            consumer_id: &str,
3449            lease_ms: u64,
3450            now: u64,
3451            limit: usize,
3452        ) -> Result<Vec<RunDispatch>, StorageError> {
3453            self.inner
3454                .claim(thread_id, consumer_id, lease_ms, now, limit)
3455                .await
3456        }
3457
3458        async fn claim_dispatch(
3459            &self,
3460            dispatch_id: &str,
3461            consumer_id: &str,
3462            lease_ms: u64,
3463            now: u64,
3464        ) -> Result<Option<RunDispatch>, StorageError> {
3465            self.inner
3466                .claim_dispatch(dispatch_id, consumer_id, lease_ms, now)
3467                .await
3468        }
3469
3470        async fn ack(
3471            &self,
3472            dispatch_id: &str,
3473            claim_token: &str,
3474            now: u64,
3475        ) -> Result<(), StorageError> {
3476            self.inner.ack(dispatch_id, claim_token, now).await
3477        }
3478
3479        async fn record_dispatch_start(
3480            &self,
3481            dispatch_id: &str,
3482            claim_token: &str,
3483            dispatch_instance_id: &str,
3484            now: u64,
3485        ) -> Result<(), StorageError> {
3486            self.inner
3487                .record_dispatch_start(dispatch_id, claim_token, dispatch_instance_id, now)
3488                .await
3489        }
3490
3491        async fn record_run_result(
3492            &self,
3493            dispatch_id: &str,
3494            claim_token: &str,
3495            result: &RunDispatchResult,
3496            now: u64,
3497        ) -> Result<(), StorageError> {
3498            self.inner
3499                .record_run_result(dispatch_id, claim_token, result, now)
3500                .await
3501        }
3502
3503        async fn nack(
3504            &self,
3505            dispatch_id: &str,
3506            claim_token: &str,
3507            retry_at: u64,
3508            error: &str,
3509            now: u64,
3510        ) -> Result<(), StorageError> {
3511            self.inner
3512                .nack(dispatch_id, claim_token, retry_at, error, now)
3513                .await
3514        }
3515
3516        async fn dead_letter(
3517            &self,
3518            dispatch_id: &str,
3519            claim_token: &str,
3520            error: &str,
3521            now: u64,
3522        ) -> Result<(), StorageError> {
3523            self.inner
3524                .dead_letter(dispatch_id, claim_token, error, now)
3525                .await
3526        }
3527
3528        async fn cancel(
3529            &self,
3530            dispatch_id: &str,
3531            now: u64,
3532        ) -> Result<Option<RunDispatch>, StorageError> {
3533            self.inner.cancel(dispatch_id, now).await
3534        }
3535
3536        async fn extend_lease(
3537            &self,
3538            dispatch_id: &str,
3539            claim_token: &str,
3540            extension_ms: u64,
3541            now: u64,
3542        ) -> Result<bool, StorageError> {
3543            self.inner
3544                .extend_lease(dispatch_id, claim_token, extension_ms, now)
3545                .await
3546        }
3547
3548        async fn interrupt(
3549            &self,
3550            thread_id: &str,
3551            now: u64,
3552        ) -> Result<MailboxInterrupt, StorageError> {
3553            self.inner.interrupt(thread_id, now).await
3554        }
3555
3556        async fn interrupt_detailed(
3557            &self,
3558            thread_id: &str,
3559            now: u64,
3560        ) -> Result<MailboxInterruptDetails, StorageError> {
3561            self.inner.interrupt_detailed(thread_id, now).await
3562        }
3563
3564        async fn current_dispatch_epoch(&self, thread_id: &str) -> Result<u64, StorageError> {
3565            self.inner.current_dispatch_epoch(thread_id).await
3566        }
3567
3568        async fn supersede_claimed(
3569            &self,
3570            dispatch_id: &str,
3571            claim_token: &str,
3572            now: u64,
3573            reason: &str,
3574        ) -> Result<Option<RunDispatch>, StorageError> {
3575            self.inner
3576                .supersede_claimed(dispatch_id, claim_token, now, reason)
3577                .await
3578        }
3579
3580        async fn load_dispatch(
3581            &self,
3582            dispatch_id: &str,
3583        ) -> Result<Option<RunDispatch>, StorageError> {
3584            self.inner.load_dispatch(dispatch_id).await
3585        }
3586
3587        async fn list_dispatches(
3588            &self,
3589            thread_id: &str,
3590            status_filter: Option<&[RunDispatchStatus]>,
3591            limit: usize,
3592            offset: usize,
3593        ) -> Result<Vec<RunDispatch>, StorageError> {
3594            self.inner
3595                .list_dispatches(thread_id, status_filter, limit, offset)
3596                .await
3597        }
3598
3599        async fn list_terminal_dispatches(
3600            &self,
3601            limit: usize,
3602            offset: usize,
3603        ) -> Result<Vec<RunDispatch>, StorageError> {
3604            self.inner.list_terminal_dispatches(limit, offset).await
3605        }
3606
3607        async fn reclaim_expired_leases(
3608            &self,
3609            now: u64,
3610            limit: usize,
3611        ) -> Result<Vec<RunDispatch>, StorageError> {
3612            self.reclaim_calls.fetch_add(1, Ordering::SeqCst);
3613            let remaining = self.reclaim_failures_remaining.load(Ordering::SeqCst);
3614            if remaining > 0
3615                && self
3616                    .reclaim_failures_remaining
3617                    .compare_exchange(remaining, remaining - 1, Ordering::SeqCst, Ordering::SeqCst)
3618                    .is_ok()
3619            {
3620                return Err(StorageError::Io("injected startup recovery failure".into()));
3621            }
3622            self.inner.reclaim_expired_leases(now, limit).await
3623        }
3624
3625        async fn purge_terminal(&self, older_than: u64) -> Result<usize, StorageError> {
3626            self.inner.purge_terminal(older_than).await
3627        }
3628
3629        async fn queued_thread_ids(&self) -> Result<Vec<String>, StorageError> {
3630            self.inner.queued_thread_ids().await
3631        }
3632    }
3633
3634    #[derive(Clone)]
3635    struct TestDispatchSignal {
3636        thread_id: String,
3637        dispatch_id: String,
3638    }
3639
3640    struct TestDispatchSignalReceipt {
3641        signal: TestDispatchSignal,
3642        queue: Arc<tokio::sync::Mutex<VecDeque<TestDispatchSignal>>>,
3643        acked_count: Arc<AtomicUsize>,
3644        nacked_count: Arc<AtomicUsize>,
3645    }
3646
3647    #[async_trait::async_trait]
3648    impl awaken_contract::contract::mailbox::DispatchSignalReceipt for TestDispatchSignalReceipt {
3649        async fn ack(self: Box<Self>) -> Result<(), StorageError> {
3650            self.acked_count.fetch_add(1, Ordering::SeqCst);
3651            Ok(())
3652        }
3653
3654        async fn nack(self: Box<Self>) -> Result<(), StorageError> {
3655            self.nacked_count.fetch_add(1, Ordering::SeqCst);
3656            self.queue.lock().await.push_back(self.signal.clone());
3657            Ok(())
3658        }
3659    }
3660
3661    struct SignalMailboxStore {
3662        inner: InMemoryMailboxStore,
3663        signals: Arc<tokio::sync::Mutex<VecDeque<TestDispatchSignal>>>,
3664        acked_count: Arc<AtomicUsize>,
3665        nacked_count: Arc<AtomicUsize>,
3666        claim_failures_remaining: AtomicUsize,
3667        claim_dispatch_empty_once: AtomicBool,
3668    }
3669
3670    impl SignalMailboxStore {
3671        fn new() -> Self {
3672            Self::with_claim_failures(0)
3673        }
3674
3675        fn with_claim_failures(claim_failures: usize) -> Self {
3676            Self::with_failures_and_empty_claim_dispatch(claim_failures, false)
3677        }
3678
3679        fn with_empty_claim_dispatch_once() -> Self {
3680            Self::with_failures_and_empty_claim_dispatch(0, true)
3681        }
3682
3683        fn with_failures_and_empty_claim_dispatch(
3684            claim_failures: usize,
3685            claim_dispatch_empty_once: bool,
3686        ) -> Self {
3687            Self {
3688                inner: InMemoryMailboxStore::new(),
3689                signals: Arc::new(tokio::sync::Mutex::new(VecDeque::new())),
3690                acked_count: Arc::new(AtomicUsize::new(0)),
3691                nacked_count: Arc::new(AtomicUsize::new(0)),
3692                claim_failures_remaining: AtomicUsize::new(claim_failures),
3693                claim_dispatch_empty_once: AtomicBool::new(claim_dispatch_empty_once),
3694            }
3695        }
3696
3697        fn acked_signal_count(&self) -> usize {
3698            self.acked_count.load(Ordering::SeqCst)
3699        }
3700
3701        fn nacked_signal_count(&self) -> usize {
3702            self.nacked_count.load(Ordering::SeqCst)
3703        }
3704    }
3705
3706    #[async_trait::async_trait]
3707    impl MailboxStore for SignalMailboxStore {
3708        async fn enqueue(&self, dispatch: &RunDispatch) -> Result<(), StorageError> {
3709            self.inner.enqueue(dispatch).await?;
3710            self.signals.lock().await.push_back(TestDispatchSignal {
3711                thread_id: dispatch.thread_id.clone(),
3712                dispatch_id: dispatch.dispatch_id.clone(),
3713            });
3714            Ok(())
3715        }
3716
3717        async fn claim(
3718            &self,
3719            thread_id: &str,
3720            consumer_id: &str,
3721            lease_ms: u64,
3722            now: u64,
3723            limit: usize,
3724        ) -> Result<Vec<RunDispatch>, StorageError> {
3725            let remaining = self.claim_failures_remaining.load(Ordering::SeqCst);
3726            if remaining > 0
3727                && self
3728                    .claim_failures_remaining
3729                    .compare_exchange(remaining, remaining - 1, Ordering::SeqCst, Ordering::SeqCst)
3730                    .is_ok()
3731            {
3732                return Err(StorageError::Io("injected claim failure".into()));
3733            }
3734            self.inner
3735                .claim(thread_id, consumer_id, lease_ms, now, limit)
3736                .await
3737        }
3738
3739        async fn claim_dispatch(
3740            &self,
3741            dispatch_id: &str,
3742            consumer_id: &str,
3743            lease_ms: u64,
3744            now: u64,
3745        ) -> Result<Option<RunDispatch>, StorageError> {
3746            if self.claim_dispatch_empty_once.swap(false, Ordering::SeqCst) {
3747                return Ok(None);
3748            }
3749            self.inner
3750                .claim_dispatch(dispatch_id, consumer_id, lease_ms, now)
3751                .await
3752        }
3753
3754        async fn ack(
3755            &self,
3756            dispatch_id: &str,
3757            claim_token: &str,
3758            now: u64,
3759        ) -> Result<(), StorageError> {
3760            self.inner.ack(dispatch_id, claim_token, now).await
3761        }
3762
3763        async fn record_dispatch_start(
3764            &self,
3765            dispatch_id: &str,
3766            claim_token: &str,
3767            dispatch_instance_id: &str,
3768            now: u64,
3769        ) -> Result<(), StorageError> {
3770            self.inner
3771                .record_dispatch_start(dispatch_id, claim_token, dispatch_instance_id, now)
3772                .await
3773        }
3774
3775        async fn record_run_result(
3776            &self,
3777            dispatch_id: &str,
3778            claim_token: &str,
3779            result: &RunDispatchResult,
3780            now: u64,
3781        ) -> Result<(), StorageError> {
3782            self.inner
3783                .record_run_result(dispatch_id, claim_token, result, now)
3784                .await
3785        }
3786
3787        async fn nack(
3788            &self,
3789            dispatch_id: &str,
3790            claim_token: &str,
3791            retry_at: u64,
3792            error: &str,
3793            now: u64,
3794        ) -> Result<(), StorageError> {
3795            self.inner
3796                .nack(dispatch_id, claim_token, retry_at, error, now)
3797                .await
3798        }
3799
3800        async fn dead_letter(
3801            &self,
3802            dispatch_id: &str,
3803            claim_token: &str,
3804            error: &str,
3805            now: u64,
3806        ) -> Result<(), StorageError> {
3807            self.inner
3808                .dead_letter(dispatch_id, claim_token, error, now)
3809                .await
3810        }
3811
3812        async fn cancel(
3813            &self,
3814            dispatch_id: &str,
3815            now: u64,
3816        ) -> Result<Option<RunDispatch>, StorageError> {
3817            self.inner.cancel(dispatch_id, now).await
3818        }
3819
3820        async fn extend_lease(
3821            &self,
3822            dispatch_id: &str,
3823            claim_token: &str,
3824            extension_ms: u64,
3825            now: u64,
3826        ) -> Result<bool, StorageError> {
3827            self.inner
3828                .extend_lease(dispatch_id, claim_token, extension_ms, now)
3829                .await
3830        }
3831
3832        async fn interrupt(
3833            &self,
3834            thread_id: &str,
3835            now: u64,
3836        ) -> Result<MailboxInterrupt, StorageError> {
3837            self.inner.interrupt(thread_id, now).await
3838        }
3839
3840        async fn interrupt_detailed(
3841            &self,
3842            thread_id: &str,
3843            now: u64,
3844        ) -> Result<MailboxInterruptDetails, StorageError> {
3845            self.inner.interrupt_detailed(thread_id, now).await
3846        }
3847
3848        async fn current_dispatch_epoch(&self, thread_id: &str) -> Result<u64, StorageError> {
3849            self.inner.current_dispatch_epoch(thread_id).await
3850        }
3851
3852        async fn supersede_claimed(
3853            &self,
3854            dispatch_id: &str,
3855            claim_token: &str,
3856            now: u64,
3857            reason: &str,
3858        ) -> Result<Option<RunDispatch>, StorageError> {
3859            self.inner
3860                .supersede_claimed(dispatch_id, claim_token, now, reason)
3861                .await
3862        }
3863
3864        async fn load_dispatch(
3865            &self,
3866            dispatch_id: &str,
3867        ) -> Result<Option<RunDispatch>, StorageError> {
3868            self.inner.load_dispatch(dispatch_id).await
3869        }
3870
3871        async fn list_dispatches(
3872            &self,
3873            thread_id: &str,
3874            status_filter: Option<&[RunDispatchStatus]>,
3875            limit: usize,
3876            offset: usize,
3877        ) -> Result<Vec<RunDispatch>, StorageError> {
3878            self.inner
3879                .list_dispatches(thread_id, status_filter, limit, offset)
3880                .await
3881        }
3882
3883        async fn list_terminal_dispatches(
3884            &self,
3885            limit: usize,
3886            offset: usize,
3887        ) -> Result<Vec<RunDispatch>, StorageError> {
3888            self.inner.list_terminal_dispatches(limit, offset).await
3889        }
3890
3891        async fn reclaim_expired_leases(
3892            &self,
3893            now: u64,
3894            limit: usize,
3895        ) -> Result<Vec<RunDispatch>, StorageError> {
3896            self.inner.reclaim_expired_leases(now, limit).await
3897        }
3898
3899        async fn purge_terminal(&self, older_than: u64) -> Result<usize, StorageError> {
3900            self.inner.purge_terminal(older_than).await
3901        }
3902
3903        async fn queued_thread_ids(&self) -> Result<Vec<String>, StorageError> {
3904            self.inner.queued_thread_ids().await
3905        }
3906
3907        fn supports_dispatch_signals(&self) -> bool {
3908            true
3909        }
3910
3911        async fn pull_dispatch_signals(
3912            &self,
3913            max: usize,
3914            _expires: Duration,
3915        ) -> Result<Vec<awaken_contract::contract::mailbox::DispatchSignalEntry>, StorageError>
3916        {
3917            let mut signals = self.signals.lock().await;
3918            let mut entries = Vec::new();
3919            for _ in 0..max {
3920                let Some(signal) = signals.pop_front() else {
3921                    break;
3922                };
3923                entries.push(awaken_contract::contract::mailbox::DispatchSignalEntry {
3924                    thread_id: signal.thread_id.clone(),
3925                    dispatch_id: signal.dispatch_id.clone(),
3926                    receipt: Box::new(TestDispatchSignalReceipt {
3927                        signal,
3928                        queue: Arc::clone(&self.signals),
3929                        acked_count: Arc::clone(&self.acked_count),
3930                        nacked_count: Arc::clone(&self.nacked_count),
3931                    }),
3932                });
3933            }
3934            Ok(entries)
3935        }
3936    }
3937
3938    struct InterruptOnLoadMailboxStore {
3939        inner: InMemoryMailboxStore,
3940        interrupt_once: AtomicBool,
3941    }
3942
3943    impl InterruptOnLoadMailboxStore {
3944        fn new() -> Self {
3945            Self {
3946                inner: InMemoryMailboxStore::new(),
3947                interrupt_once: AtomicBool::new(true),
3948            }
3949        }
3950    }
3951
3952    #[async_trait::async_trait]
3953    impl MailboxStore for InterruptOnLoadMailboxStore {
3954        async fn enqueue(&self, dispatch: &RunDispatch) -> Result<(), StorageError> {
3955            self.inner.enqueue(dispatch).await
3956        }
3957
3958        async fn claim(
3959            &self,
3960            thread_id: &str,
3961            consumer_id: &str,
3962            lease_ms: u64,
3963            now: u64,
3964            limit: usize,
3965        ) -> Result<Vec<RunDispatch>, StorageError> {
3966            self.inner
3967                .claim(thread_id, consumer_id, lease_ms, now, limit)
3968                .await
3969        }
3970
3971        async fn claim_dispatch(
3972            &self,
3973            dispatch_id: &str,
3974            consumer_id: &str,
3975            lease_ms: u64,
3976            now: u64,
3977        ) -> Result<Option<RunDispatch>, StorageError> {
3978            self.inner
3979                .claim_dispatch(dispatch_id, consumer_id, lease_ms, now)
3980                .await
3981        }
3982
3983        async fn ack(
3984            &self,
3985            dispatch_id: &str,
3986            claim_token: &str,
3987            now: u64,
3988        ) -> Result<(), StorageError> {
3989            self.inner.ack(dispatch_id, claim_token, now).await
3990        }
3991
3992        async fn record_dispatch_start(
3993            &self,
3994            dispatch_id: &str,
3995            claim_token: &str,
3996            dispatch_instance_id: &str,
3997            now: u64,
3998        ) -> Result<(), StorageError> {
3999            self.inner
4000                .record_dispatch_start(dispatch_id, claim_token, dispatch_instance_id, now)
4001                .await
4002        }
4003
4004        async fn record_run_result(
4005            &self,
4006            dispatch_id: &str,
4007            claim_token: &str,
4008            result: &RunDispatchResult,
4009            now: u64,
4010        ) -> Result<(), StorageError> {
4011            self.inner
4012                .record_run_result(dispatch_id, claim_token, result, now)
4013                .await
4014        }
4015
4016        async fn nack(
4017            &self,
4018            dispatch_id: &str,
4019            claim_token: &str,
4020            retry_at: u64,
4021            error: &str,
4022            now: u64,
4023        ) -> Result<(), StorageError> {
4024            self.inner
4025                .nack(dispatch_id, claim_token, retry_at, error, now)
4026                .await
4027        }
4028
4029        async fn dead_letter(
4030            &self,
4031            dispatch_id: &str,
4032            claim_token: &str,
4033            error: &str,
4034            now: u64,
4035        ) -> Result<(), StorageError> {
4036            self.inner
4037                .dead_letter(dispatch_id, claim_token, error, now)
4038                .await
4039        }
4040
4041        async fn cancel(
4042            &self,
4043            dispatch_id: &str,
4044            now: u64,
4045        ) -> Result<Option<RunDispatch>, StorageError> {
4046            self.inner.cancel(dispatch_id, now).await
4047        }
4048
4049        async fn extend_lease(
4050            &self,
4051            dispatch_id: &str,
4052            claim_token: &str,
4053            extension_ms: u64,
4054            now: u64,
4055        ) -> Result<bool, StorageError> {
4056            self.inner
4057                .extend_lease(dispatch_id, claim_token, extension_ms, now)
4058                .await
4059        }
4060
4061        async fn interrupt(
4062            &self,
4063            thread_id: &str,
4064            now: u64,
4065        ) -> Result<MailboxInterrupt, StorageError> {
4066            self.inner.interrupt(thread_id, now).await
4067        }
4068
4069        async fn interrupt_detailed(
4070            &self,
4071            thread_id: &str,
4072            now: u64,
4073        ) -> Result<MailboxInterruptDetails, StorageError> {
4074            self.inner.interrupt_detailed(thread_id, now).await
4075        }
4076
4077        async fn current_dispatch_epoch(&self, thread_id: &str) -> Result<u64, StorageError> {
4078            self.inner.current_dispatch_epoch(thread_id).await
4079        }
4080
4081        async fn supersede_claimed(
4082            &self,
4083            dispatch_id: &str,
4084            claim_token: &str,
4085            now: u64,
4086            reason: &str,
4087        ) -> Result<Option<RunDispatch>, StorageError> {
4088            self.inner
4089                .supersede_claimed(dispatch_id, claim_token, now, reason)
4090                .await
4091        }
4092
4093        async fn load_dispatch(
4094            &self,
4095            dispatch_id: &str,
4096        ) -> Result<Option<RunDispatch>, StorageError> {
4097            let loaded = self.inner.load_dispatch(dispatch_id).await?;
4098            if let Some(dispatch) = loaded.as_ref()
4099                && dispatch.status == RunDispatchStatus::Claimed
4100                && self.interrupt_once.swap(false, Ordering::SeqCst)
4101            {
4102                self.inner.interrupt(&dispatch.thread_id, now_ms()).await?;
4103            }
4104            Ok(loaded)
4105        }
4106
4107        async fn list_dispatches(
4108            &self,
4109            thread_id: &str,
4110            status_filter: Option<&[RunDispatchStatus]>,
4111            limit: usize,
4112            offset: usize,
4113        ) -> Result<Vec<RunDispatch>, StorageError> {
4114            self.inner
4115                .list_dispatches(thread_id, status_filter, limit, offset)
4116                .await
4117        }
4118
4119        async fn count_dispatches_by_status(
4120            &self,
4121            status: RunDispatchStatus,
4122        ) -> Result<usize, StorageError> {
4123            self.inner.count_dispatches_by_status(status).await
4124        }
4125
4126        async fn list_terminal_dispatches(
4127            &self,
4128            limit: usize,
4129            offset: usize,
4130        ) -> Result<Vec<RunDispatch>, StorageError> {
4131            self.inner.list_terminal_dispatches(limit, offset).await
4132        }
4133
4134        async fn reclaim_expired_leases(
4135            &self,
4136            now: u64,
4137            limit: usize,
4138        ) -> Result<Vec<RunDispatch>, StorageError> {
4139            self.inner.reclaim_expired_leases(now, limit).await
4140        }
4141
4142        async fn purge_terminal(&self, older_than: u64) -> Result<usize, StorageError> {
4143            self.inner.purge_terminal(older_than).await
4144        }
4145
4146        async fn queued_thread_ids(&self) -> Result<Vec<String>, StorageError> {
4147            self.inner.queued_thread_ids().await
4148        }
4149    }
4150
4151    fn make_runtime() -> Arc<AgentRuntime> {
4152        Arc::new(AgentRuntime::new(Arc::new(StubResolver)))
4153    }
4154
4155    fn make_mailbox(runtime: Arc<AgentRuntime>, store: Arc<InMemoryMailboxStore>) -> Arc<Mailbox> {
4156        Arc::new(Mailbox::new(
4157            runtime,
4158            store,
4159            Arc::new(InMemoryStore::new()),
4160            "test-consumer".to_string(),
4161            MailboxConfig::default(),
4162        ))
4163    }
4164
4165    fn make_mailbox_with_run_store(
4166        runtime: Arc<AgentRuntime>,
4167        store: Arc<InMemoryMailboxStore>,
4168        run_store: Arc<dyn ThreadRunStore>,
4169    ) -> Arc<Mailbox> {
4170        Arc::new(Mailbox::new(
4171            runtime,
4172            store,
4173            run_store,
4174            "test-consumer".to_string(),
4175            MailboxConfig::default(),
4176        ))
4177    }
4178
4179    struct NoopMailboxRuntime;
4180
4181    #[async_trait::async_trait]
4182    impl RunDispatchExecutor for NoopMailboxRuntime {
4183        async fn run(
4184            &self,
4185            _request: RunRequest,
4186            _sink: Arc<dyn EventSink>,
4187        ) -> Result<AgentRunResult, AgentLoopError> {
4188            panic!("decoupling test must not execute runs")
4189        }
4190
4191        fn cancel(&self, _id: &str) -> bool {
4192            false
4193        }
4194
4195        async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4196            false
4197        }
4198
4199        fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4200            false
4201        }
4202
4203        fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4204            false
4205        }
4206    }
4207
4208    struct ImmediateLocalCancelRuntime;
4209
4210    #[async_trait::async_trait]
4211    impl RunDispatchExecutor for ImmediateLocalCancelRuntime {
4212        async fn run(
4213            &self,
4214            _request: RunRequest,
4215            _sink: Arc<dyn EventSink>,
4216        ) -> Result<AgentRunResult, AgentLoopError> {
4217            panic!("local cancel test must not execute runs")
4218        }
4219
4220        fn cancel(&self, _id: &str) -> bool {
4221            true
4222        }
4223
4224        async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4225            true
4226        }
4227
4228        fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4229            false
4230        }
4231
4232        fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4233            false
4234        }
4235    }
4236
4237    #[derive(Default)]
4238    struct CountingMailboxRuntime {
4239        run_count: AtomicUsize,
4240    }
4241
4242    impl CountingMailboxRuntime {
4243        fn run_count(&self) -> usize {
4244            self.run_count.load(Ordering::SeqCst)
4245        }
4246    }
4247
4248    #[async_trait::async_trait]
4249    impl RunDispatchExecutor for CountingMailboxRuntime {
4250        async fn run(
4251            &self,
4252            request: RunRequest,
4253            _sink: Arc<dyn EventSink>,
4254        ) -> Result<AgentRunResult, AgentLoopError> {
4255            self.run_count.fetch_add(1, Ordering::SeqCst);
4256            Ok(AgentRunResult {
4257                run_id: request
4258                    .continue_run_id
4259                    .clone()
4260                    .or(request.run_id_hint.clone())
4261                    .or(request.dispatch_id.clone())
4262                    .unwrap_or_else(|| "counted-run".to_string()),
4263                response: "ok".to_string(),
4264                termination: TerminationReason::NaturalEnd,
4265                steps: 1,
4266            })
4267        }
4268
4269        fn cancel(&self, _id: &str) -> bool {
4270            false
4271        }
4272
4273        async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4274            false
4275        }
4276
4277        fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4278            false
4279        }
4280
4281        fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4282            false
4283        }
4284    }
4285
4286    struct RecordedMailboxRequest {
4287        run_mode: RunMode,
4288        adapter: AdapterKind,
4289        dispatch_id: Option<String>,
4290        session_id: Option<String>,
4291    }
4292
4293    #[derive(Default)]
4294    struct RecordingMailboxRuntime {
4295        requests: StdMutex<Vec<RecordedMailboxRequest>>,
4296    }
4297
4298    struct BlockingMailboxRuntime {
4299        run_count: AtomicUsize,
4300        started_tx: tokio::sync::mpsc::UnboundedSender<(usize, Option<String>)>,
4301        release_first: Arc<tokio::sync::Notify>,
4302    }
4303
4304    impl BlockingMailboxRuntime {
4305        fn new(
4306            started_tx: tokio::sync::mpsc::UnboundedSender<(usize, Option<String>)>,
4307            release_first: Arc<tokio::sync::Notify>,
4308        ) -> Self {
4309            Self {
4310                run_count: AtomicUsize::new(0),
4311                started_tx,
4312                release_first,
4313            }
4314        }
4315    }
4316
4317    #[async_trait::async_trait]
4318    impl RunDispatchExecutor for BlockingMailboxRuntime {
4319        async fn run(
4320            &self,
4321            request: RunRequest,
4322            _sink: Arc<dyn EventSink>,
4323        ) -> Result<AgentRunResult, AgentLoopError> {
4324            let ordinal = self.run_count.fetch_add(1, Ordering::SeqCst) + 1;
4325            let _ = self.started_tx.send((ordinal, request.dispatch_id.clone()));
4326            if ordinal == 1 {
4327                self.release_first.notified().await;
4328            }
4329            let run_id = request
4330                .continue_run_id
4331                .clone()
4332                .or(request.run_id_hint.clone())
4333                .or(request.dispatch_id.clone())
4334                .unwrap_or_else(|| format!("blocking-run-{ordinal}"));
4335            Ok(AgentRunResult {
4336                run_id,
4337                response: "ok".to_string(),
4338                termination: TerminationReason::NaturalEnd,
4339                steps: 1,
4340            })
4341        }
4342
4343        fn cancel(&self, _id: &str) -> bool {
4344            false
4345        }
4346
4347        async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4348            false
4349        }
4350
4351        fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4352            false
4353        }
4354
4355        fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4356            false
4357        }
4358    }
4359
4360    #[async_trait::async_trait]
4361    impl RunDispatchExecutor for RecordingMailboxRuntime {
4362        async fn run(
4363            &self,
4364            request: RunRequest,
4365            _sink: Arc<dyn EventSink>,
4366        ) -> Result<AgentRunResult, AgentLoopError> {
4367            let run_id = request
4368                .continue_run_id
4369                .clone()
4370                .or(request.run_id_hint.clone())
4371                .unwrap_or_else(|| "recorded-run".to_string());
4372            self.requests
4373                .lock()
4374                .expect("lock poisoned")
4375                .push(RecordedMailboxRequest {
4376                    run_mode: request.run_mode,
4377                    adapter: request.adapter,
4378                    dispatch_id: request.dispatch_id.clone(),
4379                    session_id: request.session_id.clone(),
4380                });
4381            Ok(AgentRunResult {
4382                run_id,
4383                response: "ok".to_string(),
4384                termination: TerminationReason::NaturalEnd,
4385                steps: 1,
4386            })
4387        }
4388
4389        fn cancel(&self, _id: &str) -> bool {
4390            false
4391        }
4392
4393        async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4394            false
4395        }
4396
4397        fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4398            false
4399        }
4400
4401        fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4402            false
4403        }
4404    }
4405
4406    struct RecordedStoreMailboxRequest {
4407        thread_id: String,
4408        continue_run_id: Option<String>,
4409        run_mode: RunMode,
4410        adapter: AdapterKind,
4411    }
4412
4413    struct RecordingStoreMailboxRuntime {
4414        requests: StdMutex<Vec<RecordedStoreMailboxRequest>>,
4415    }
4416
4417    impl RecordingStoreMailboxRuntime {
4418        fn new(_store: Arc<InMemoryStore>) -> Self {
4419            Self {
4420                requests: StdMutex::new(Vec::new()),
4421            }
4422        }
4423    }
4424
4425    #[async_trait::async_trait]
4426    impl RunDispatchExecutor for RecordingStoreMailboxRuntime {
4427        async fn run(
4428            &self,
4429            request: RunRequest,
4430            _sink: Arc<dyn EventSink>,
4431        ) -> Result<AgentRunResult, AgentLoopError> {
4432            let run_id = request
4433                .continue_run_id
4434                .clone()
4435                .or(request.run_id_hint.clone())
4436                .unwrap_or_else(|| "recorded-run".to_string());
4437            self.requests
4438                .lock()
4439                .expect("lock poisoned")
4440                .push(RecordedStoreMailboxRequest {
4441                    thread_id: request.thread_id,
4442                    continue_run_id: request.continue_run_id,
4443                    run_mode: request.run_mode,
4444                    adapter: request.adapter,
4445                });
4446            Ok(AgentRunResult {
4447                run_id,
4448                response: "ok".to_string(),
4449                termination: TerminationReason::NaturalEnd,
4450                steps: 1,
4451            })
4452        }
4453
4454        fn cancel(&self, _id: &str) -> bool {
4455            false
4456        }
4457
4458        async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4459            false
4460        }
4461
4462        fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4463            false
4464        }
4465
4466        fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4467            false
4468        }
4469    }
4470
4471    struct ScriptedLlm {
4472        responses: StdMutex<Vec<StreamResult>>,
4473    }
4474
4475    impl ScriptedLlm {
4476        fn new(responses: Vec<StreamResult>) -> Self {
4477            Self {
4478                responses: StdMutex::new(responses),
4479            }
4480        }
4481    }
4482
4483    #[async_trait]
4484    impl LlmExecutor for ScriptedLlm {
4485        async fn execute(
4486            &self,
4487            _request: InferenceRequest,
4488        ) -> Result<StreamResult, InferenceExecutionError> {
4489            let mut responses = self.responses.lock().expect("lock poisoned");
4490            if responses.is_empty() {
4491                Ok(StreamResult {
4492                    content: vec![ContentBlock::text("done")],
4493                    tool_calls: vec![],
4494                    usage: None,
4495                    stop_reason: Some(StopReason::EndTurn),
4496                    has_incomplete_tool_calls: false,
4497                })
4498            } else {
4499                Ok(responses.remove(0))
4500            }
4501        }
4502
4503        fn name(&self) -> &str {
4504            "scripted"
4505        }
4506    }
4507
4508    struct RecordingLlm {
4509        responses: StdMutex<Vec<StreamResult>>,
4510        requests: Arc<StdMutex<Vec<InferenceRequest>>>,
4511    }
4512
4513    impl RecordingLlm {
4514        fn new(
4515            responses: Vec<StreamResult>,
4516            requests: Arc<StdMutex<Vec<InferenceRequest>>>,
4517        ) -> Self {
4518            Self {
4519                responses: StdMutex::new(responses),
4520                requests,
4521            }
4522        }
4523    }
4524
4525    #[async_trait]
4526    impl LlmExecutor for RecordingLlm {
4527        async fn execute(
4528            &self,
4529            request: InferenceRequest,
4530        ) -> Result<StreamResult, InferenceExecutionError> {
4531            self.requests.lock().expect("lock poisoned").push(request);
4532            let mut responses = self.responses.lock().expect("lock poisoned");
4533            if responses.is_empty() {
4534                Ok(StreamResult {
4535                    content: vec![ContentBlock::text("done")],
4536                    tool_calls: vec![],
4537                    usage: None,
4538                    stop_reason: Some(StopReason::EndTurn),
4539                    has_incomplete_tool_calls: false,
4540                })
4541            } else {
4542                Ok(responses.remove(0))
4543            }
4544        }
4545
4546        fn name(&self) -> &str {
4547            "recording"
4548        }
4549    }
4550
4551    struct FixedResolver {
4552        agent: ResolvedAgent,
4553        plugins: Vec<Arc<dyn Plugin>>,
4554    }
4555
4556    impl awaken_runtime::AgentResolver for FixedResolver {
4557        fn resolve(&self, _agent_id: &str) -> Result<ResolvedAgent, awaken_runtime::RuntimeError> {
4558            let mut agent = self.agent.clone();
4559            agent.env = build_agent_env(&self.plugins, &agent)?;
4560            Ok(agent)
4561        }
4562    }
4563
4564    struct SpawnShortBgTaskTool {
4565        manager: Arc<BackgroundTaskManager>,
4566        delay: Duration,
4567    }
4568
4569    #[async_trait]
4570    impl Tool for SpawnShortBgTaskTool {
4571        fn descriptor(&self) -> ToolDescriptor {
4572            ToolDescriptor::new("spawn_bg", "spawn_bg", "Spawn a short background task")
4573        }
4574
4575        async fn execute(
4576            &self,
4577            _args: Value,
4578            ctx: &ToolCallContext,
4579        ) -> Result<ToolOutput, ToolError> {
4580            let delay = self.delay;
4581            self.manager
4582                .spawn(
4583                    &ctx.run_identity.thread_id,
4584                    "bg",
4585                    None,
4586                    "short task",
4587                    TaskParentContext::default(),
4588                    move |_task_ctx| async move {
4589                        sleep(delay).await;
4590                        BackgroundTaskResult::Success(json!({"done": true}))
4591                    },
4592                )
4593                .await
4594                .map_err(|e| ToolError::ExecutionFailed(e.to_string()))?;
4595            Ok(ToolResult::success("spawn_bg", json!({"spawned": true})).into())
4596        }
4597    }
4598
4599    struct BlockingTool {
4600        started: StdMutex<Option<tokio::sync::oneshot::Sender<()>>>,
4601        release: tokio::sync::Mutex<Option<tokio::sync::oneshot::Receiver<()>>>,
4602    }
4603
4604    impl BlockingTool {
4605        fn new(
4606            started: tokio::sync::oneshot::Sender<()>,
4607            release: tokio::sync::oneshot::Receiver<()>,
4608        ) -> Self {
4609            Self {
4610                started: StdMutex::new(Some(started)),
4611                release: tokio::sync::Mutex::new(Some(release)),
4612            }
4613        }
4614    }
4615
4616    #[async_trait]
4617    impl Tool for BlockingTool {
4618        fn descriptor(&self) -> ToolDescriptor {
4619            ToolDescriptor::new("block", "block", "wait until released")
4620        }
4621
4622        async fn execute(
4623            &self,
4624            _args: Value,
4625            _ctx: &ToolCallContext,
4626        ) -> Result<ToolOutput, ToolError> {
4627            if let Some(started) = self.started.lock().expect("lock poisoned").take() {
4628                let _ = started.send(());
4629            }
4630            let release = self.release.lock().await.take();
4631            if let Some(release) = release {
4632                let _ = release.await;
4633            }
4634            Ok(ToolResult::success("block", json!({"released": true})).into())
4635        }
4636    }
4637
4638    async fn wait_for_latest_run<F>(
4639        store: &InMemoryStore,
4640        thread_id: &str,
4641        predicate: F,
4642    ) -> RunRecord
4643    where
4644        F: Fn(&RunRecord) -> bool,
4645    {
4646        let deadline = Instant::now() + Duration::from_secs(2);
4647        loop {
4648            if let Some(run) = store
4649                .latest_run(thread_id)
4650                .await
4651                .expect("latest run lookup should succeed")
4652                && predicate(&run)
4653            {
4654                return run;
4655            }
4656
4657            assert!(
4658                Instant::now() < deadline,
4659                "timed out waiting for run predicate on thread {thread_id}"
4660            );
4661            sleep(Duration::from_millis(10)).await;
4662        }
4663    }
4664
4665    async fn wait_for_dispatch<F>(
4666        store: &InMemoryMailboxStore,
4667        dispatch_id: &str,
4668        predicate: F,
4669    ) -> RunDispatch
4670    where
4671        F: Fn(&RunDispatch) -> bool,
4672    {
4673        let deadline = Instant::now() + Duration::from_secs(2);
4674        loop {
4675            if let Some(dispatch) = store
4676                .load_dispatch(dispatch_id)
4677                .await
4678                .expect("mailbox dispatch lookup should succeed")
4679                && predicate(&dispatch)
4680            {
4681                return dispatch;
4682            }
4683
4684            assert!(
4685                Instant::now() < deadline,
4686                "timed out waiting for mailbox dispatch predicate on dispatch {dispatch_id}"
4687            );
4688            sleep(Duration::from_millis(10)).await;
4689        }
4690    }
4691
4692    async fn prepare_queued_dispatch(
4693        mailbox: &Arc<Mailbox>,
4694        thread_id: &str,
4695        content: &str,
4696    ) -> RunDispatch {
4697        let mut request =
4698            RunRequest::new(thread_id, vec![Message::user(content)]).with_agent_id("agent");
4699        let (validated_thread_id, messages) =
4700            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
4701                .expect("test input should validate");
4702        mailbox
4703            .prepare_run_for_dispatch(&mut request, &validated_thread_id, &messages)
4704            .await
4705            .expect("prepare queued run");
4706        mailbox
4707            .build_dispatch(&request, &validated_thread_id)
4708            .expect("build queued dispatch")
4709    }
4710
4711    async fn enqueue_prepared_dispatch(
4712        mailbox: &Arc<Mailbox>,
4713        store: &InMemoryMailboxStore,
4714        thread_id: &str,
4715        content: &str,
4716    ) -> MailboxSubmitResult {
4717        let dispatch = prepare_queued_dispatch(mailbox, thread_id, content).await;
4718        let result = MailboxSubmitResult {
4719            dispatch_id: dispatch.dispatch_id.clone(),
4720            run_id: dispatch.run_id.clone(),
4721            thread_id: dispatch.thread_id.clone(),
4722            status: MailboxDispatchStatus::Queued,
4723        };
4724        store
4725            .enqueue(&dispatch)
4726            .await
4727            .expect("enqueue queued dispatch");
4728        result
4729    }
4730
4731    fn seeded_waiting_run(run_id: &str, thread_id: &str, agent_id: &str) -> RunRecord {
4732        RunRecord {
4733            run_id: run_id.to_string(),
4734            thread_id: thread_id.to_string(),
4735            agent_id: agent_id.to_string(),
4736            parent_run_id: None,
4737            request: None,
4738            input: None,
4739            output: None,
4740            status: RunStatus::Waiting,
4741            termination_reason: None,
4742            final_output: None,
4743            error_payload: None,
4744            dispatch_id: None,
4745            session_id: None,
4746            transport_request_id: None,
4747            waiting: Some(RunWaitingState {
4748                reason: WaitingReason::BackgroundTasks,
4749                ticket_ids: Vec::new(),
4750                tickets: Vec::new(),
4751                since_dispatch_id: None,
4752                message: None,
4753            }),
4754            outcome: None,
4755            created_at: 1,
4756            started_at: None,
4757            finished_at: None,
4758            updated_at: 1,
4759            steps: 2,
4760            input_tokens: 0,
4761            output_tokens: 0,
4762            state: None,
4763        }
4764    }
4765
4766    // ── Tests ────────────────────────────────────────────────────────
4767
4768    #[test]
4769    fn mailbox_config_defaults() {
4770        let config = MailboxConfig::default();
4771        assert_eq!(config.lease_ms, 30_000);
4772        assert_eq!(config.suspended_lease_ms, 600_000);
4773        assert_eq!(config.lease_renewal_interval, Duration::from_secs(10));
4774        assert_eq!(config.sweep_interval, Duration::from_secs(30));
4775        assert_eq!(config.gc_interval, Duration::from_secs(60));
4776        assert_eq!(config.gc_ttl, Duration::from_secs(24 * 60 * 60));
4777        assert_eq!(config.default_max_attempts, 5);
4778        assert_eq!(config.default_retry_delay_ms, 250);
4779        assert_eq!(config.max_retry_delay_ms, 30_000);
4780    }
4781
4782    #[test]
4783    fn dispatch_signal_blocked_nack_delay_backs_off_and_caps() {
4784        assert_eq!(
4785            dispatch_signal_blocked_nack_delay(None),
4786            Duration::from_millis(500)
4787        );
4788        assert_eq!(
4789            dispatch_signal_blocked_nack_delay(Some(3)),
4790            Duration::from_secs(2)
4791        );
4792        assert_eq!(
4793            dispatch_signal_blocked_nack_delay(Some(100)),
4794            Duration::from_secs(30)
4795        );
4796    }
4797
4798    #[test]
4799    fn mailbox_lifecycle_config_defaults() {
4800        let config = MailboxLifecycleConfig::default();
4801        assert_eq!(config.startup_delay, Duration::ZERO);
4802        assert_eq!(config.startup_recovery.max_attempts, 1);
4803        assert_eq!(
4804            config.startup_recovery.retry_delay,
4805            Duration::from_millis(250)
4806        );
4807        assert!(config.maintenance_callback.is_none());
4808    }
4809
4810    #[tokio::test]
4811    async fn start_lifecycle_ready_fails_when_startup_recovery_fails() {
4812        let store = Arc::new(RecoverFlakyMailboxStore::new(1));
4813        let runtime = make_runtime();
4814        let mailbox = Arc::new(Mailbox::new(
4815            runtime,
4816            store,
4817            Arc::new(InMemoryStore::new()),
4818            "test-consumer".to_string(),
4819            MailboxConfig::default(),
4820        ));
4821
4822        let error = match mailbox
4823            .start_lifecycle_ready(MailboxLifecycleConfig {
4824                startup_recovery: MailboxStartupRecoveryConfig {
4825                    max_attempts: 1,
4826                    retry_delay: Duration::ZERO,
4827                },
4828                ..Default::default()
4829            })
4830            .await
4831        {
4832            Ok(_) => panic!("ready lifecycle should fail when startup recovery fails"),
4833            Err(error) => error,
4834        };
4835
4836        assert!(
4837            error
4838                .to_string()
4839                .contains("injected startup recovery failure")
4840        );
4841        assert!(
4842            !mailbox
4843                .lifecycle_is_running()
4844                .expect("lifecycle state should be readable")
4845        );
4846    }
4847
4848    #[tokio::test]
4849    async fn start_lifecycle_ready_retries_startup_recovery_until_ready() {
4850        let store = Arc::new(RecoverFlakyMailboxStore::new(1));
4851        let runtime = make_runtime();
4852        let mailbox = Arc::new(Mailbox::new(
4853            runtime,
4854            store.clone(),
4855            Arc::new(InMemoryStore::new()),
4856            "test-consumer".to_string(),
4857            MailboxConfig::default(),
4858        ));
4859
4860        let mut request = RunRequest::new("thread-retry-recover", vec![Message::user("recover")])
4861            .with_agent_id("missing-agent");
4862        let (thread_id, messages) =
4863            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
4864                .unwrap();
4865        mailbox
4866            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
4867            .await
4868            .expect("prepare queued run");
4869        let dispatch = mailbox
4870            .build_dispatch(&request, &thread_id)
4871            .expect("build queued dispatch");
4872        let dispatch_id = dispatch.dispatch_id.clone();
4873        store
4874            .enqueue(&dispatch)
4875            .await
4876            .expect("enqueue queued dispatch");
4877
4878        let handle = mailbox
4879            .start_lifecycle_ready(MailboxLifecycleConfig {
4880                startup_recovery: MailboxStartupRecoveryConfig {
4881                    max_attempts: 2,
4882                    retry_delay: Duration::ZERO,
4883                },
4884                ..Default::default()
4885            })
4886            .await
4887            .expect("ready lifecycle should retry startup recovery");
4888
4889        let recovered = wait_for_dispatch(&store.inner, &dispatch_id, |dispatch| {
4890            dispatch.status == RunDispatchStatus::DeadLetter
4891        })
4892        .await;
4893        assert_eq!(recovered.status, RunDispatchStatus::DeadLetter);
4894        handle.shutdown().await.expect("shutdown lifecycle");
4895    }
4896
4897    #[tokio::test]
4898    async fn start_lifecycle_ready_serializes_concurrent_recovery() {
4899        let store = Arc::new(RecoverFlakyMailboxStore::new(0));
4900        let runtime = make_runtime();
4901        let mailbox = Arc::new(Mailbox::new(
4902            runtime,
4903            store.clone(),
4904            Arc::new(InMemoryStore::new()),
4905            "test-consumer".to_string(),
4906            MailboxConfig::default(),
4907        ));
4908
4909        let mut starters = Vec::new();
4910        for _ in 0..32 {
4911            let mailbox = Arc::clone(&mailbox);
4912            starters.push(tokio::spawn(async move {
4913                mailbox
4914                    .start_lifecycle_ready(MailboxLifecycleConfig::default())
4915                    .await
4916            }));
4917        }
4918
4919        let mut handles = Vec::new();
4920        for starter in starters {
4921            handles.push(
4922                starter
4923                    .await
4924                    .expect("starter task should not panic")
4925                    .expect("ready lifecycle should start"),
4926            );
4927        }
4928
4929        assert_eq!(
4930            store.reclaim_calls(),
4931            1,
4932            "concurrent ready starts should run startup recovery once"
4933        );
4934        assert!(handles.iter().all(MailboxLifecycleHandle::is_running));
4935        handles[0].shutdown().await.expect("shutdown lifecycle");
4936        assert!(handles.iter().all(|handle| !handle.is_running()));
4937    }
4938
4939    #[tokio::test]
4940    async fn start_lifecycle_does_not_bypass_ready_transition() {
4941        let store = Arc::new(RecoverFlakyMailboxStore::new(0));
4942        let runtime = make_runtime();
4943        let mailbox = Arc::new(Mailbox::new(
4944            runtime,
4945            store.clone(),
4946            Arc::new(InMemoryStore::new()),
4947            "test-consumer".to_string(),
4948            MailboxConfig::default(),
4949        ));
4950
4951        let ready_mailbox = Arc::clone(&mailbox);
4952        let ready = tokio::spawn(async move {
4953            ready_mailbox
4954                .start_lifecycle_ready(MailboxLifecycleConfig {
4955                    startup_delay: Duration::from_millis(75),
4956                    startup_recovery: MailboxStartupRecoveryConfig {
4957                        max_attempts: 1,
4958                        retry_delay: Duration::ZERO,
4959                    },
4960                    ..Default::default()
4961                })
4962                .await
4963        });
4964        sleep(Duration::from_millis(10)).await;
4965
4966        let err = match mailbox.start_lifecycle(MailboxLifecycleConfig::default()) {
4967            Ok(_) => panic!("sync start must not race ready startup"),
4968            Err(error) => error,
4969        };
4970        assert!(
4971            err.to_string()
4972                .contains("lifecycle transition is already running")
4973        );
4974
4975        let handle = ready
4976            .await
4977            .expect("ready task should not panic")
4978            .expect("ready lifecycle should start");
4979        assert_eq!(
4980            store.reclaim_calls(),
4981            1,
4982            "ready recovery should not be duplicated by sync start"
4983        );
4984        handle.shutdown().await.expect("shutdown lifecycle");
4985    }
4986
4987    #[tokio::test]
4988    async fn start_lifecycle_is_idempotent_and_drop_does_not_abort_recovery() {
4989        let store = make_store();
4990        let runtime = make_runtime();
4991        let mailbox = make_mailbox(runtime, store.clone());
4992
4993        let mut request = RunRequest::new("thread-drop-recover", vec![Message::user("recover")])
4994            .with_agent_id("missing-agent");
4995        let (thread_id, messages) =
4996            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
4997                .unwrap();
4998        mailbox
4999            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5000            .await
5001            .expect("prepare queued run");
5002        let dispatch = mailbox
5003            .build_dispatch(&request, &thread_id)
5004            .expect("build queued dispatch");
5005        let dispatch_id = dispatch.dispatch_id.clone();
5006        store
5007            .enqueue(&dispatch)
5008            .await
5009            .expect("enqueue queued dispatch");
5010
5011        let handle = mailbox
5012            .start_lifecycle(MailboxLifecycleConfig {
5013                startup_delay: Duration::from_millis(10),
5014                ..Default::default()
5015            })
5016            .expect("lifecycle start should succeed");
5017        let duplicate = mailbox
5018            .start_lifecycle(MailboxLifecycleConfig::default())
5019            .expect("duplicate lifecycle start should be a no-op");
5020        assert!(handle.is_running());
5021        assert!(duplicate.is_running());
5022
5023        drop(handle);
5024        drop(duplicate);
5025
5026        wait_for_dispatch(&store, &dispatch_id, |dispatch| {
5027            dispatch.status == RunDispatchStatus::DeadLetter
5028        })
5029        .await;
5030
5031        let cleanup = mailbox
5032            .start_lifecycle(MailboxLifecycleConfig::default())
5033            .expect("should return the existing lifecycle handle");
5034        cleanup.shutdown().await.expect("shutdown lifecycle");
5035        assert!(!cleanup.is_running());
5036    }
5037
5038    #[tokio::test]
5039    async fn start_lifecycle_explicit_abort_allows_restart() {
5040        let store = make_store();
5041        let runtime = make_runtime();
5042        let mailbox = make_mailbox(runtime, store);
5043
5044        let first = mailbox
5045            .start_lifecycle(MailboxLifecycleConfig::default())
5046            .expect("first lifecycle start should succeed");
5047        assert!(first.is_running());
5048        first.shutdown().await.expect("shutdown first lifecycle");
5049        assert!(!first.is_running());
5050
5051        let second = mailbox
5052            .start_lifecycle(MailboxLifecycleConfig::default())
5053            .expect("lifecycle should restart after explicit abort");
5054        assert!(second.is_running());
5055        second.shutdown().await.expect("shutdown second lifecycle");
5056        assert!(!second.is_running());
5057    }
5058
5059    #[tokio::test]
5060    async fn maintenance_callback_runs_on_gc_tick() {
5061        let store = make_store();
5062        let runtime = make_runtime();
5063        let mailbox = Arc::new(Mailbox::new(
5064            runtime,
5065            store,
5066            Arc::new(InMemoryStore::new()),
5067            "test-consumer".to_string(),
5068            MailboxConfig {
5069                gc_interval: Duration::from_millis(10),
5070                sweep_interval: Duration::from_secs(60),
5071                ..Default::default()
5072            },
5073        ));
5074        let calls = Arc::new(AtomicUsize::new(0));
5075        let calls_for_hook = Arc::clone(&calls);
5076        let handle = mailbox
5077            .start_lifecycle(MailboxLifecycleConfig {
5078                maintenance_callback: Some(Arc::new(move || {
5079                    calls_for_hook.fetch_add(1, Ordering::SeqCst);
5080                })),
5081                ..Default::default()
5082            })
5083            .expect("lifecycle should start");
5084
5085        let deadline = Instant::now() + Duration::from_secs(1);
5086        while calls.load(Ordering::SeqCst) == 0 {
5087            assert!(
5088                Instant::now() < deadline,
5089                "maintenance callback did not run"
5090            );
5091            sleep(Duration::from_millis(5)).await;
5092        }
5093        handle.shutdown().await.expect("shutdown lifecycle");
5094    }
5095
5096    #[tokio::test]
5097    async fn start_lifecycle_handle_drop_keeps_lifecycle_running() {
5098        let store = make_store();
5099        let runtime = make_runtime();
5100        let mailbox = make_mailbox(runtime, store);
5101
5102        let handle = mailbox
5103            .start_lifecycle(MailboxLifecycleConfig::default())
5104            .expect("lifecycle should start");
5105        assert!(handle.is_running());
5106        drop(handle);
5107
5108        let handle = mailbox
5109            .start_lifecycle(MailboxLifecycleConfig::default())
5110            .expect("lifecycle should still be running after handle drop");
5111        assert!(handle.is_running());
5112        handle.shutdown().await.expect("shutdown lifecycle");
5113    }
5114
5115    #[tokio::test]
5116    async fn lifecycle_shutdown_waits_for_maintenance_to_quiesce() {
5117        let store = make_store();
5118        let runtime = make_runtime();
5119        let mailbox = Arc::new(Mailbox::new(
5120            runtime,
5121            store,
5122            Arc::new(InMemoryStore::new()),
5123            "test-consumer".to_string(),
5124            MailboxConfig {
5125                gc_interval: Duration::from_millis(10),
5126                sweep_interval: Duration::from_secs(60),
5127                ..Default::default()
5128            },
5129        ));
5130        let calls = Arc::new(AtomicUsize::new(0));
5131        let calls_for_hook = Arc::clone(&calls);
5132        let handle = mailbox
5133            .start_lifecycle(MailboxLifecycleConfig {
5134                maintenance_callback: Some(Arc::new(move || {
5135                    calls_for_hook.fetch_add(1, Ordering::SeqCst);
5136                })),
5137                ..Default::default()
5138            })
5139            .expect("lifecycle should start");
5140
5141        let deadline = Instant::now() + Duration::from_secs(1);
5142        while calls.load(Ordering::SeqCst) == 0 {
5143            assert!(
5144                Instant::now() < deadline,
5145                "maintenance callback did not run"
5146            );
5147            sleep(Duration::from_millis(5)).await;
5148        }
5149
5150        handle.shutdown().await.expect("shutdown should quiesce");
5151        assert!(!handle.is_running());
5152        let calls_after_shutdown = calls.load(Ordering::SeqCst);
5153        sleep(Duration::from_millis(40)).await;
5154        assert_eq!(
5155            calls.load(Ordering::SeqCst),
5156            calls_after_shutdown,
5157            "maintenance callback should not run after shutdown completes"
5158        );
5159    }
5160
5161    #[tokio::test]
5162    async fn concurrent_start_lifecycle_is_idempotent() {
5163        let store = make_store();
5164        let runtime = make_runtime();
5165        let mailbox = make_mailbox(runtime, store);
5166
5167        let mut joins = Vec::new();
5168        for _ in 0..32 {
5169            let mb = Arc::clone(&mailbox);
5170            joins.push(tokio::spawn(async move {
5171                mb.start_lifecycle(MailboxLifecycleConfig::default())
5172            }));
5173        }
5174
5175        let mut handles = Vec::new();
5176        for join in joins {
5177            match join.await.expect("start task should not panic") {
5178                Ok(handle) => handles.push(handle),
5179                Err(err) => panic!("idempotent lifecycle start should not fail: {err}"),
5180            }
5181        }
5182
5183        assert_eq!(handles.len(), 32, "all concurrent starters get a handle");
5184        assert!(handles.iter().all(MailboxLifecycleHandle::is_running));
5185        handles[0].shutdown().await.expect("shutdown lifecycle");
5186        assert!(handles.iter().all(|handle| !handle.is_running()));
5187    }
5188
5189    #[tokio::test]
5190    async fn start_lifecycle_runs_startup_recovery_for_existing_queued_dispatches() {
5191        let store = make_store();
5192        let runtime = make_runtime();
5193        let mailbox = make_mailbox(runtime, store.clone());
5194
5195        let mut request = RunRequest::new("thread-recover", vec![Message::user("recover me")])
5196            .with_agent_id("missing-agent");
5197        let (thread_id, messages) =
5198            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5199                .unwrap();
5200        mailbox
5201            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5202            .await
5203            .expect("prepare queued run");
5204        let dispatch = mailbox
5205            .build_dispatch(&request, &thread_id)
5206            .expect("build queued dispatch");
5207        let dispatch_id = dispatch.dispatch_id.clone();
5208        store
5209            .enqueue(&dispatch)
5210            .await
5211            .expect("enqueue queued dispatch");
5212
5213        let handle = mailbox
5214            .start_lifecycle(MailboxLifecycleConfig::default())
5215            .expect("lifecycle should start");
5216
5217        let recovered = wait_for_dispatch(&store, &dispatch_id, |dispatch| {
5218            dispatch.status == RunDispatchStatus::DeadLetter
5219        })
5220        .await;
5221
5222        assert_eq!(recovered.status, RunDispatchStatus::DeadLetter);
5223        assert!(
5224            recovered
5225                .last_error
5226                .as_deref()
5227                .is_some_and(|error| error.contains("missing-agent")),
5228            "dead-letter error should preserve the runtime failure: {recovered:?}"
5229        );
5230        handle.shutdown().await.expect("shutdown lifecycle");
5231    }
5232
5233    #[tokio::test]
5234    async fn start_lifecycle_reclaims_expired_claimed_dispatches_and_executes_them() {
5235        let store = make_store();
5236        let runtime = make_runtime();
5237        let mailbox = make_mailbox(runtime, store.clone());
5238
5239        let mut request = RunRequest::new("thread-stale", vec![Message::user("recover stale")])
5240            .with_agent_id("missing-agent");
5241        let (thread_id, messages) =
5242            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5243                .unwrap();
5244        mailbox
5245            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5246            .await
5247            .expect("prepare stale run");
5248        let dispatch = mailbox
5249            .build_dispatch(&request, &thread_id)
5250            .expect("build stale claimed dispatch");
5251        let dispatch_id = dispatch.dispatch_id.clone();
5252        let claim_now = dispatch.available_at;
5253        store
5254            .enqueue(&dispatch)
5255            .await
5256            .expect("enqueue queued dispatch");
5257        let claimed = store
5258            .claim("thread-stale", "dead-consumer", 1, claim_now, 1)
5259            .await
5260            .expect("claim dispatch before simulated crash");
5261        assert_eq!(claimed.len(), 1);
5262        assert_eq!(claimed[0].status, RunDispatchStatus::Claimed);
5263        assert_eq!(claimed[0].lease_until, Some(claim_now + 1));
5264        sleep(Duration::from_millis(2)).await;
5265
5266        let handle = mailbox
5267            .start_lifecycle(MailboxLifecycleConfig::default())
5268            .expect("lifecycle should start");
5269
5270        let recovered = wait_for_dispatch(&store, &dispatch_id, |dispatch| {
5271            dispatch.status == RunDispatchStatus::DeadLetter
5272                && dispatch.run_status == Some(RunStatus::Done)
5273        })
5274        .await;
5275
5276        assert_eq!(recovered.status, RunDispatchStatus::DeadLetter);
5277        assert_eq!(recovered.attempt_count, 1);
5278        let run_id = recovered.run_id.as_str();
5279        assert_ne!(
5280            run_id, dispatch_id,
5281            "recovered stale dispatches should also keep run id separate from mailbox dispatch id"
5282        );
5283        assert!(recovered.dispatch_instance_id.is_some());
5284        assert!(matches!(
5285            recovered.termination,
5286            Some(TerminationReason::Error(ref message)) if message.contains("missing-agent")
5287        ));
5288        assert!(
5289            recovered
5290                .run_error
5291                .as_deref()
5292                .is_some_and(|error| error.contains("missing-agent"))
5293        );
5294        handle.shutdown().await.expect("shutdown lifecycle");
5295    }
5296
5297    #[tokio::test]
5298    async fn dispatch_signal_loop_claims_and_executes_queued_dispatch() {
5299        let store = Arc::new(SignalMailboxStore::new());
5300        let run_store = Arc::new(InMemoryStore::new());
5301        let runtime = Arc::new(RecordingMailboxRuntime::default());
5302        let mailbox = Arc::new(Mailbox::new_with_executor(
5303            runtime,
5304            store.clone(),
5305            run_store.clone(),
5306            "signal-consumer".to_string(),
5307            MailboxConfig::default(),
5308        ));
5309
5310        let mut request = RunRequest::new("thread-signal-loop", vec![Message::user("wake")])
5311            .with_agent_id("agent");
5312        let (thread_id, messages) =
5313            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5314                .expect("input should validate");
5315        mailbox
5316            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5317            .await
5318            .expect("prepare run");
5319        let dispatch = mailbox
5320            .build_dispatch(&request, &thread_id)
5321            .expect("build dispatch");
5322        let dispatch_id = dispatch.dispatch_id.clone();
5323        store.enqueue(&dispatch).await.expect("enqueue dispatch");
5324
5325        let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
5326        let deadline = Instant::now() + Duration::from_secs(2);
5327        let acked = loop {
5328            if let Some(dispatch) = store
5329                .load_dispatch(&dispatch_id)
5330                .await
5331                .expect("dispatch lookup should succeed")
5332                && dispatch.status == RunDispatchStatus::Acked
5333            {
5334                break dispatch;
5335            }
5336            assert!(
5337                Instant::now() < deadline,
5338                "timed out waiting for dispatch signal loop"
5339            );
5340            sleep(Duration::from_millis(10)).await;
5341        };
5342        signal_loop.abort();
5343
5344        assert_eq!(acked.status, RunDispatchStatus::Acked);
5345        assert_eq!(store.acked_signal_count(), 1);
5346    }
5347
5348    #[tokio::test]
5349    async fn dispatch_signal_loop_nacks_and_redelivers_after_claim_error() {
5350        let store = Arc::new(SignalMailboxStore::with_claim_failures(1));
5351        let run_store = Arc::new(InMemoryStore::new());
5352        let runtime = Arc::new(RecordingMailboxRuntime::default());
5353        let mailbox = Arc::new(Mailbox::new_with_executor(
5354            runtime,
5355            store.clone(),
5356            run_store.clone(),
5357            "signal-consumer".to_string(),
5358            MailboxConfig::default(),
5359        ));
5360
5361        let mut request = RunRequest::new("thread-signal-redeliver", vec![Message::user("wake")])
5362            .with_agent_id("agent");
5363        let (thread_id, messages) =
5364            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5365                .expect("input should validate");
5366        mailbox
5367            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5368            .await
5369            .expect("prepare run");
5370        let dispatch = mailbox
5371            .build_dispatch(&request, &thread_id)
5372            .expect("build dispatch");
5373        let dispatch_id = dispatch.dispatch_id.clone();
5374        store.enqueue(&dispatch).await.expect("enqueue dispatch");
5375
5376        let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
5377        let deadline = Instant::now() + Duration::from_secs(2);
5378        loop {
5379            let dispatch = store
5380                .load_dispatch(&dispatch_id)
5381                .await
5382                .expect("dispatch lookup should succeed")
5383                .expect("dispatch should exist");
5384            if dispatch.status == RunDispatchStatus::Acked {
5385                break;
5386            }
5387            assert!(
5388                Instant::now() < deadline,
5389                "timed out waiting for redelivered dispatch signal"
5390            );
5391            sleep(Duration::from_millis(10)).await;
5392        }
5393        signal_loop.abort();
5394
5395        assert_eq!(store.nacked_signal_count(), 1);
5396        assert_eq!(store.acked_signal_count(), 1);
5397    }
5398
5399    #[tokio::test]
5400    async fn dispatch_signal_loop_nacks_when_signal_is_blocked_by_active_claim() {
5401        let store = Arc::new(SignalMailboxStore::new());
5402        let run_store = Arc::new(InMemoryStore::new());
5403        let runtime = Arc::new(RecordingMailboxRuntime::default());
5404        let mailbox = Arc::new(Mailbox::new_with_executor(
5405            runtime,
5406            store.clone(),
5407            run_store.clone(),
5408            "signal-consumer".to_string(),
5409            MailboxConfig::default(),
5410        ));
5411
5412        let mut active = RunRequest::new("thread-signal-blocked", vec![Message::user("active")])
5413            .with_agent_id("agent");
5414        let (thread_id, active_messages) =
5415            validate_run_inputs(active.thread_id.clone(), active.messages.clone(), false)
5416                .expect("active input should validate");
5417        mailbox
5418            .prepare_run_for_dispatch(&mut active, &thread_id, &active_messages)
5419            .await
5420            .expect("prepare active run");
5421        let active_dispatch = mailbox
5422            .build_dispatch(&active, &thread_id)
5423            .expect("build active dispatch");
5424        let active_dispatch_id = active_dispatch.dispatch_id.clone();
5425        store
5426            .enqueue(&active_dispatch)
5427            .await
5428            .expect("enqueue active dispatch");
5429        let claimed = store
5430            .claim(&thread_id, "remote-owner", 30_000, now_ms(), 1)
5431            .await
5432            .expect("claim active dispatch");
5433        assert_eq!(claimed.len(), 1);
5434        let active_claim_token = claimed[0].claim_token.clone().unwrap();
5435
5436        let mut queued = RunRequest::new("thread-signal-blocked", vec![Message::user("queued")])
5437            .with_agent_id("agent");
5438        let (_, queued_messages) =
5439            validate_run_inputs(queued.thread_id.clone(), queued.messages.clone(), false)
5440                .expect("queued input should validate");
5441        mailbox
5442            .prepare_run_for_dispatch(&mut queued, &thread_id, &queued_messages)
5443            .await
5444            .expect("prepare queued run");
5445        let queued_dispatch = mailbox
5446            .build_dispatch(&queued, &thread_id)
5447            .expect("build queued dispatch");
5448        let queued_dispatch_id = queued_dispatch.dispatch_id.clone();
5449        store
5450            .enqueue(&queued_dispatch)
5451            .await
5452            .expect("enqueue queued dispatch");
5453
5454        let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
5455        let deadline = Instant::now() + Duration::from_secs(2);
5456        loop {
5457            if store.nacked_signal_count() > 0 {
5458                break;
5459            }
5460            assert!(
5461                Instant::now() < deadline,
5462                "queued signal blocked by an active claim must be nacked for redelivery"
5463            );
5464            sleep(Duration::from_millis(10)).await;
5465        }
5466
5467        let queued_before_release = store
5468            .load_dispatch(&queued_dispatch_id)
5469            .await
5470            .expect("queued dispatch lookup")
5471            .expect("queued dispatch exists");
5472        assert_eq!(queued_before_release.status, RunDispatchStatus::Queued);
5473
5474        store
5475            .ack(&active_dispatch_id, &active_claim_token, now_ms())
5476            .await
5477            .expect("release active claim");
5478
5479        let deadline = Instant::now() + Duration::from_secs(2);
5480        loop {
5481            let dispatch = store
5482                .load_dispatch(&queued_dispatch_id)
5483                .await
5484                .expect("queued dispatch lookup")
5485                .expect("queued dispatch exists");
5486            if dispatch.status == RunDispatchStatus::Acked {
5487                break;
5488            }
5489            assert!(
5490                Instant::now() < deadline,
5491                "redelivered signal should claim after active claim releases"
5492            );
5493            sleep(Duration::from_millis(10)).await;
5494        }
5495        signal_loop.abort();
5496
5497        assert!(
5498            store.nacked_signal_count() >= 1,
5499            "blocked queued signal must be nacked at least once"
5500        );
5501        assert!(
5502            store.acked_signal_count() >= 2,
5503            "active signal and final queued signal should both be acked"
5504        );
5505    }
5506
5507    #[test]
5508    fn run_request_fields() {
5509        let req = RunRequest::new("t-1", vec![Message::user("hello")]).with_agent_id("agent-a");
5510        assert_eq!(req.thread_id, "t-1");
5511        assert_eq!(req.agent_id.as_deref(), Some("agent-a"));
5512        assert_eq!(req.messages.len(), 1);
5513        assert_eq!(req.run_mode, RunMode::Foreground);
5514        assert_eq!(req.adapter, AdapterKind::Internal);
5515    }
5516
5517    #[test]
5518    fn run_spec_validation_empty_messages_errors() {
5519        let result = validate_run_inputs("thread-1".into(), vec![], false);
5520        assert!(result.is_err());
5521        assert!(matches!(result.unwrap_err(), MailboxError::Validation(_)));
5522    }
5523
5524    #[test]
5525    fn run_spec_validation_allows_decision_only_resume() {
5526        let result = validate_run_inputs("thread-1".into(), vec![], true);
5527        assert!(result.is_ok());
5528        let (thread_id, messages) = result.unwrap();
5529        assert_eq!(thread_id, "thread-1");
5530        assert!(messages.is_empty());
5531    }
5532
5533    #[test]
5534    fn run_spec_validation_blank_thread_id_generates_new() {
5535        let result = validate_run_inputs("  ".into(), vec![Message::user("hi")], false);
5536        assert!(result.is_ok());
5537        let (thread_id, _) = result.unwrap();
5538        assert!(!thread_id.is_empty());
5539        assert_ne!(thread_id.trim(), "");
5540    }
5541
5542    #[test]
5543    fn run_spec_validation_trims_thread_id() {
5544        let result = validate_run_inputs("  my-thread  ".into(), vec![Message::user("hi")], false);
5545        assert!(result.is_ok());
5546        let (thread_id, _) = result.unwrap();
5547        assert_eq!(thread_id, "my-thread");
5548    }
5549
5550    #[test]
5551    fn dispatch_status_enum_variants() {
5552        let running = MailboxDispatchStatus::Running;
5553        let queued = MailboxDispatchStatus::Queued;
5554        assert!(matches!(running, MailboxDispatchStatus::Running));
5555        assert!(matches!(queued, MailboxDispatchStatus::Queued));
5556    }
5557
5558    #[test]
5559    fn mailbox_construction_depends_on_runtime_boundary_not_agent_runtime() {
5560        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
5561        let mailbox = Mailbox::new_with_executor(
5562            runtime,
5563            make_store(),
5564            Arc::new(InMemoryStore::new()),
5565            "decoupled-consumer".to_string(),
5566            MailboxConfig::default(),
5567        );
5568
5569        assert_eq!(mailbox.consumer_id, "decoupled-consumer");
5570    }
5571
5572    #[tokio::test]
5573    async fn submit_background_enqueues_dispatch() {
5574        let store = make_store();
5575        let runtime = make_runtime();
5576        let mailbox = make_mailbox(runtime, store.clone());
5577
5578        let request =
5579            RunRequest::new("thread-1", vec![Message::user("hello")]).with_agent_id("agent-1");
5580        let result = mailbox.submit_background(request).await.unwrap();
5581
5582        assert_eq!(result.thread_id, "thread-1");
5583        assert!(!result.dispatch_id.is_empty());
5584        assert!(!result.run_id.is_empty());
5585        assert_ne!(result.dispatch_id, result.run_id);
5586
5587        // Verify dispatch is in store.
5588        let dispatches = store
5589            .list_dispatches("thread-1", None, 100, 0)
5590            .await
5591            .unwrap();
5592        assert!(!dispatches.is_empty());
5593        assert_eq!(dispatches[0].run_id, result.run_id);
5594    }
5595
5596    #[tokio::test]
5597    async fn submit_background_delivers_scheduled_policy_context() {
5598        let store = make_store();
5599        let runtime = Arc::new(RecordingMailboxRuntime::default());
5600        let mailbox = Arc::new(Mailbox::new(
5601            runtime.clone(),
5602            store,
5603            Arc::new(InMemoryStore::new()),
5604            "recording-consumer".to_string(),
5605            MailboxConfig::default(),
5606        ));
5607
5608        let result = mailbox
5609            .submit_background(
5610                RunRequest::new("thread-policy-bg", vec![Message::user("hello")])
5611                    .with_agent_id("agent-1")
5612                    .with_adapter(AdapterKind::Acp),
5613            )
5614            .await
5615            .expect("background submit should enqueue");
5616
5617        let deadline = Instant::now() + Duration::from_secs(1);
5618        loop {
5619            if !runtime.requests.lock().expect("lock poisoned").is_empty() {
5620                break;
5621            }
5622            assert!(Instant::now() < deadline, "runtime did not receive request");
5623            sleep(Duration::from_millis(5)).await;
5624        }
5625
5626        let requests = runtime.requests.lock().expect("lock poisoned");
5627        assert_eq!(requests.len(), 1);
5628        assert_eq!(requests[0].run_mode, RunMode::Scheduled);
5629        assert_eq!(requests[0].adapter, AdapterKind::Acp);
5630        assert_eq!(
5631            requests[0].dispatch_id.as_deref(),
5632            Some(result.dispatch_id.as_str())
5633        );
5634        assert!(
5635            requests[0].session_id.is_some(),
5636            "dispatch session id should be set"
5637        );
5638    }
5639
5640    #[tokio::test]
5641    async fn prepare_run_for_dispatch_precreates_created_run_and_thread_projection() {
5642        let thread_store = Arc::new(InMemoryStore::new());
5643        let runtime = Arc::new(
5644            AgentRuntime::new(Arc::new(StubResolver))
5645                .with_thread_run_store(thread_store.clone() as Arc<dyn ThreadRunStore>),
5646        );
5647        let mailbox_store = make_store();
5648        let mailbox = make_mailbox_with_run_store(
5649            runtime,
5650            mailbox_store,
5651            thread_store.clone() as Arc<dyn ThreadRunStore>,
5652        );
5653        let mut request = RunRequest::new("thread-created", vec![Message::user("plan this")])
5654            .with_agent_id("agent-created")
5655            .with_transport_request_id("transport-created");
5656        let (thread_id, messages) =
5657            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5658                .unwrap();
5659
5660        let run_id = mailbox
5661            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5662            .await
5663            .expect("precreate");
5664
5665        assert_eq!(request.run_id_hint.as_deref(), Some(run_id.as_str()));
5666        let run = thread_store
5667            .load_run(&run_id)
5668            .await
5669            .expect("load run")
5670            .expect("created run");
5671        assert_eq!(run.status, RunStatus::Created);
5672        assert_eq!(run.agent_id, "agent-created");
5673        let request_snapshot = run.request.as_ref().unwrap();
5674        assert!(
5675            !request_snapshot.input_message_ids.is_empty(),
5676            "new run snapshots should reference thread messages instead of duplicating bodies"
5677        );
5678        assert_eq!(request_snapshot.input_message_count, 1);
5679        assert_eq!(
5680            request_snapshot.input_message_ids,
5681            vec![messages[0].id.clone().expect("message id")]
5682        );
5683        let input = run.input.as_ref().expect("run input message range");
5684        assert_eq!(input.thread_id, "thread-created");
5685        assert_eq!(input.range.unwrap().from_seq, 1);
5686        assert_eq!(input.range.unwrap().to_seq, 1);
5687        assert_eq!(
5688            input.trigger_message_ids,
5689            vec![messages[0].id.clone().expect("message id")]
5690        );
5691        assert_eq!(
5692            run.request
5693                .as_ref()
5694                .unwrap()
5695                .transport_request_id
5696                .as_deref(),
5697            Some("transport-created")
5698        );
5699        let thread = thread_store
5700            .load_thread("thread-created")
5701            .await
5702            .expect("load thread")
5703            .expect("thread projection");
5704        assert_eq!(thread.open_run_id.as_deref(), Some(run_id.as_str()));
5705        assert_eq!(thread.latest_run_id.as_deref(), Some(run_id.as_str()));
5706        assert!(thread.active_run_id.is_none());
5707    }
5708
5709    #[tokio::test]
5710    async fn prepare_run_for_dispatch_inherits_previous_runtime_state() {
5711        let thread_store = Arc::new(InMemoryStore::new());
5712        let mut previous = seeded_waiting_run("run-prev", "thread-state", "agent-prev");
5713        previous.status = RunStatus::Done;
5714        previous.state = Some(awaken_contract::state::PersistedState {
5715            revision: 7,
5716            extensions: std::collections::HashMap::from([(
5717                "remote".to_string(),
5718                json!({"context_id": "remote-ctx-1"}),
5719            )]),
5720        });
5721        thread_store
5722            .checkpoint("thread-state", &[Message::user("first")], &previous)
5723            .await
5724            .expect("seed previous run");
5725
5726        let runtime = Arc::new(
5727            AgentRuntime::new(Arc::new(StubResolver))
5728                .with_thread_run_store(thread_store.clone() as Arc<dyn ThreadRunStore>),
5729        );
5730        let mailbox = make_mailbox_with_run_store(
5731            runtime,
5732            make_store(),
5733            thread_store.clone() as Arc<dyn ThreadRunStore>,
5734        );
5735        let mut request = RunRequest::new("thread-state", vec![Message::user("second")]);
5736        let (thread_id, messages) =
5737            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5738                .unwrap();
5739
5740        let run_id = mailbox
5741            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5742            .await
5743            .expect("precreate");
5744
5745        let run = thread_store
5746            .load_run(&run_id)
5747            .await
5748            .expect("load run")
5749            .expect("created run");
5750        assert_eq!(run.status, RunStatus::Created);
5751        assert_eq!(run.agent_id, "agent-prev");
5752        let input = run.input.as_ref().expect("run input message range");
5753        assert_eq!(input.range.unwrap().from_seq, 1);
5754        assert_eq!(input.range.unwrap().to_seq, 2);
5755        let state = run.state.expect("inherited runtime state");
5756        assert_eq!(state.revision, 7);
5757        assert_eq!(state.extensions["remote"]["context_id"], "remote-ctx-1");
5758    }
5759
5760    #[tokio::test]
5761    async fn cancel_queued_dispatch_works() {
5762        crate::metrics::install_recorder();
5763        let store = make_store();
5764        let run_store = Arc::new(InMemoryStore::new());
5765        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
5766        let mailbox = Arc::new(Mailbox::new_with_executor(
5767            runtime,
5768            store.clone(),
5769            run_store.clone(),
5770            "test-consumer".to_string(),
5771            MailboxConfig::default(),
5772        ));
5773
5774        let result =
5775            enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-cancel", "hello").await;
5776        let dispatch_id = result.dispatch_id.clone();
5777
5778        let cancelled = mailbox.cancel(&dispatch_id).await.unwrap();
5779        assert!(cancelled);
5780
5781        let after = store.load_dispatch(&dispatch_id).await.unwrap().unwrap();
5782        assert_eq!(after.status, RunDispatchStatus::Cancelled);
5783
5784        let run = run_store
5785            .load_run(&result.run_id)
5786            .await
5787            .unwrap()
5788            .expect("queued cancel should keep run inspectable");
5789        assert_eq!(run.status, RunStatus::Done);
5790        assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
5791        assert_eq!(run.dispatch_id.as_deref(), Some(dispatch_id.as_str()));
5792
5793        let output = crate::metrics::render().unwrap_or_default();
5794        assert!(output.contains("operation=\"mark_run_cancelled\""));
5795        assert!(output.contains("outcome=\"cancelled\""));
5796    }
5797
5798    #[tokio::test]
5799    async fn list_dispatches_returns_entries() {
5800        let store = make_store();
5801        let runtime = make_runtime();
5802        let mailbox = make_mailbox(runtime, store.clone());
5803
5804        for i in 0..3 {
5805            let request = RunRequest::new("thread-list", vec![Message::user("msg")])
5806                .with_agent_id(format!("agent-{i}"));
5807            mailbox.submit_background(request).await.unwrap();
5808        }
5809
5810        let dispatches = mailbox
5811            .list_dispatches("thread-list", None, 100, 0)
5812            .await
5813            .unwrap();
5814        assert_eq!(dispatches.len(), 3);
5815    }
5816
5817    #[test]
5818    fn mailbox_error_display() {
5819        let e = MailboxError::Validation("test".to_string());
5820        assert_eq!(e.to_string(), "validation error: test");
5821
5822        let e = MailboxError::Internal("oops".to_string());
5823        assert_eq!(e.to_string(), "internal error: oops");
5824    }
5825
5826    #[test]
5827    fn mailbox_submit_result_fields() {
5828        let result = MailboxSubmitResult {
5829            dispatch_id: "dispatch-1".into(),
5830            run_id: "run-1".into(),
5831            thread_id: "thread-1".into(),
5832            status: MailboxDispatchStatus::Running,
5833        };
5834        assert_eq!(result.dispatch_id, "dispatch-1");
5835        assert_eq!(result.run_id, "run-1");
5836        assert_eq!(result.thread_id, "thread-1");
5837        assert!(matches!(result.status, MailboxDispatchStatus::Running));
5838    }
5839
5840    #[tokio::test]
5841    async fn suspension_aware_sink_sets_flag_on_suspended_tool_call() {
5842        use awaken_contract::contract::event_sink::{EventSink, VecEventSink};
5843        use awaken_contract::contract::suspension::ToolCallOutcome;
5844        use awaken_contract::contract::tool::{ToolResult, ToolStatus};
5845
5846        let inner: Arc<dyn EventSink> = Arc::new(VecEventSink::new());
5847        let suspended = Arc::new(AtomicBool::new(false));
5848        let sink = SuspensionAwareSink {
5849            inner: Arc::clone(&inner),
5850            suspended: Arc::clone(&suspended),
5851        };
5852
5853        // Non-suspended tool call should not set the flag.
5854        sink.emit(AgentEvent::ToolCallDone {
5855            id: "c1".into(),
5856            message_id: "m1".into(),
5857            result: ToolResult {
5858                tool_name: "echo".into(),
5859                status: ToolStatus::Success,
5860                data: serde_json::json!("ok"),
5861                message: None,
5862                suspension: None,
5863                metadata: Default::default(),
5864            },
5865            outcome: ToolCallOutcome::Succeeded,
5866        })
5867        .await;
5868        assert!(!suspended.load(Ordering::Acquire));
5869
5870        // Suspended tool call should set the flag.
5871        sink.emit(AgentEvent::ToolCallDone {
5872            id: "c2".into(),
5873            message_id: "m2".into(),
5874            result: ToolResult {
5875                tool_name: "approve".into(),
5876                status: ToolStatus::Pending,
5877                data: serde_json::json!("pending"),
5878                message: None,
5879                suspension: None,
5880                metadata: Default::default(),
5881            },
5882            outcome: ToolCallOutcome::Suspended,
5883        })
5884        .await;
5885        assert!(suspended.load(Ordering::Acquire));
5886
5887        // ToolCallResumed should reset the flag.
5888        sink.emit(AgentEvent::ToolCallResumed {
5889            target_id: "c2".into(),
5890            result: serde_json::json!({"approved": true}),
5891        })
5892        .await;
5893        assert!(!suspended.load(Ordering::Acquire));
5894    }
5895
5896    // ── classify_error tests ──────────────────────────────────────────
5897
5898    #[test]
5899    fn classify_error_ok_is_completed() {
5900        use awaken_contract::contract::lifecycle::TerminationReason;
5901        let result = Ok(awaken_runtime::loop_runner::AgentRunResult {
5902            run_id: "run-1".to_string(),
5903            response: "done".to_string(),
5904            termination: TerminationReason::NaturalEnd,
5905            steps: 1,
5906        });
5907        assert!(matches!(
5908            classify_error(&result),
5909            MailboxRunOutcome::Completed
5910        ));
5911    }
5912
5913    #[test]
5914    fn classify_error_thread_already_running_is_permanent() {
5915        use awaken_runtime::RuntimeError;
5916        use awaken_runtime::loop_runner::AgentLoopError;
5917        let result = Err(AgentLoopError::RuntimeError(
5918            RuntimeError::ThreadAlreadyRunning {
5919                thread_id: "t1".to_string(),
5920            },
5921        ));
5922        assert!(matches!(
5923            classify_error(&result),
5924            MailboxRunOutcome::PermanentError(_)
5925        ));
5926    }
5927
5928    #[test]
5929    fn classify_error_agent_not_found_is_permanent() {
5930        use awaken_runtime::RuntimeError;
5931        use awaken_runtime::loop_runner::AgentLoopError;
5932        let result = Err(AgentLoopError::RuntimeError(RuntimeError::AgentNotFound {
5933            agent_id: "missing".to_string(),
5934        }));
5935        assert!(matches!(
5936            classify_error(&result),
5937            MailboxRunOutcome::PermanentError(_)
5938        ));
5939    }
5940
5941    #[test]
5942    fn classify_error_resolve_failed_is_permanent() {
5943        use awaken_runtime::RuntimeError;
5944        use awaken_runtime::loop_runner::AgentLoopError;
5945        let result = Err(AgentLoopError::RuntimeError(RuntimeError::ResolveFailed {
5946            message: "not found".to_string(),
5947        }));
5948        assert!(matches!(
5949            classify_error(&result),
5950            MailboxRunOutcome::PermanentError(_)
5951        ));
5952    }
5953
5954    #[test]
5955    fn classify_error_storage_error_is_transient() {
5956        use awaken_runtime::loop_runner::AgentLoopError;
5957        let result = Err(AgentLoopError::StorageError("disk full".to_string()));
5958        assert!(matches!(
5959            classify_error(&result),
5960            MailboxRunOutcome::TransientError(_)
5961        ));
5962    }
5963
5964    #[test]
5965    fn classify_error_inference_failed_is_transient() {
5966        use awaken_runtime::loop_runner::AgentLoopError;
5967        let result = Err(AgentLoopError::InferenceFailed("timeout".to_string()));
5968        assert!(matches!(
5969            classify_error(&result),
5970            MailboxRunOutcome::TransientError(_)
5971        ));
5972    }
5973
5974    #[test]
5975    fn classify_error_phase_error_is_completed() {
5976        use awaken_runtime::loop_runner::AgentLoopError;
5977        let result = Err(AgentLoopError::PhaseError(
5978            awaken_contract::StateError::UnknownKey {
5979                key: "bad".to_string(),
5980            },
5981        ));
5982        // Phase errors are not infra failures -> Completed
5983        assert!(matches!(
5984            classify_error(&result),
5985            MailboxRunOutcome::Completed
5986        ));
5987    }
5988
5989    #[test]
5990    fn classify_error_invalid_resume_is_completed() {
5991        use awaken_runtime::loop_runner::AgentLoopError;
5992        let result = Err(AgentLoopError::InvalidResume("bad resume".to_string()));
5993        assert!(matches!(
5994            classify_error(&result),
5995            MailboxRunOutcome::Completed
5996        ));
5997    }
5998
5999    // ── validate_run_inputs additional tests ──────────────────────────
6000
6001    #[test]
6002    fn validate_run_inputs_preserves_normal_thread_id() {
6003        let (thread_id, msgs) =
6004            validate_run_inputs("my-thread".into(), vec![Message::user("hi")], false).unwrap();
6005        assert_eq!(thread_id, "my-thread");
6006        assert_eq!(msgs.len(), 1);
6007    }
6008
6009    #[test]
6010    fn validate_run_inputs_multiple_messages() {
6011        let (_, msgs) = validate_run_inputs(
6012            "t".into(),
6013            vec![Message::user("a"), Message::user("b"), Message::user("c")],
6014            false,
6015        )
6016        .unwrap();
6017        assert_eq!(msgs.len(), 3);
6018    }
6019
6020    #[test]
6021    fn validate_run_inputs_empty_string_generates_uuid() {
6022        let (thread_id, _) =
6023            validate_run_inputs("".into(), vec![Message::user("hi")], false).unwrap();
6024        assert!(!thread_id.is_empty());
6025        // UUIDv7 is 36 chars with hyphens
6026        assert_eq!(thread_id.len(), 36);
6027    }
6028
6029    // ── MailboxConfig custom values ──────────────────────────────────
6030
6031    #[test]
6032    fn mailbox_config_custom_values() {
6033        let config = MailboxConfig {
6034            lease_ms: 5_000,
6035            suspended_lease_ms: 60_000,
6036            lease_renewal_interval: Duration::from_secs(2),
6037            sweep_interval: Duration::from_secs(5),
6038            gc_interval: Duration::from_secs(10),
6039            gc_ttl: Duration::from_secs(3600),
6040            default_max_attempts: 3,
6041            default_retry_delay_ms: 500,
6042            max_retry_delay_ms: 60_000,
6043        };
6044        assert_eq!(config.lease_ms, 5_000);
6045        assert_eq!(config.default_max_attempts, 3);
6046        assert_eq!(config.default_retry_delay_ms, 500);
6047        assert_eq!(config.max_retry_delay_ms, 60_000);
6048    }
6049
6050    // ── build_dispatch field validation ──────────────────────────────────
6051
6052    #[tokio::test]
6053    async fn build_dispatch_sets_correct_fields() {
6054        let store = make_store();
6055        let runtime = make_runtime();
6056        let mailbox = make_mailbox(runtime, store);
6057
6058        let request =
6059            RunRequest::new("thread-42", vec![Message::user("test")]).with_run_id_hint("run-42");
6060        let dispatch = mailbox.build_dispatch(&request, "thread-42").unwrap();
6061
6062        assert_eq!(dispatch.thread_id, "thread-42");
6063        assert_eq!(dispatch.run_id, "run-42");
6064        assert_eq!(dispatch.status, RunDispatchStatus::Queued);
6065        assert_eq!(dispatch.attempt_count, 0);
6066        assert_eq!(dispatch.max_attempts, 5); // default
6067        assert_eq!(dispatch.priority, 128);
6068        assert_eq!(dispatch.dispatch_epoch, 0);
6069        assert!(dispatch.claim_token.is_none());
6070        assert!(dispatch.claimed_by.is_none());
6071        assert!(dispatch.lease_until.is_none());
6072        assert!(dispatch.last_error.is_none());
6073    }
6074
6075    #[test]
6076    fn build_dispatch_requires_prepared_run_id() {
6077        let store = make_store();
6078        let runtime = make_runtime();
6079        let mailbox = make_mailbox(runtime, store);
6080
6081        let request = RunRequest::new("thread-1", vec![Message::user("hi")]);
6082        assert!(mailbox.build_dispatch(&request, "thread-1").is_err());
6083    }
6084
6085    #[tokio::test]
6086    async fn prepare_run_preserves_request_extras_on_run_snapshot() {
6087        let store = make_store();
6088        let runtime = make_runtime();
6089        let thread_store = Arc::new(InMemoryStore::new());
6090        let mailbox = Arc::new(Mailbox::new(
6091            runtime,
6092            store,
6093            thread_store.clone(),
6094            "test-consumer".to_string(),
6095            MailboxConfig::default(),
6096        ));
6097
6098        let mut request = RunRequest::new("thread-ext", vec![Message::user("hi")])
6099            .with_agent_id("a1")
6100            .with_frontend_tools(vec![awaken_contract::contract::tool::ToolDescriptor::new(
6101                "ft1", "FT1", "desc",
6102            )]);
6103        let (thread_id, messages) =
6104            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6105                .unwrap();
6106        let run_id = mailbox
6107            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6108            .await
6109            .unwrap();
6110        let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6111
6112        let snapshot = run.request.expect("request snapshot");
6113        assert_eq!(snapshot.frontend_tools.len(), 1);
6114        assert!(snapshot.request_extras.is_some());
6115    }
6116
6117    #[test]
6118    fn run_request_extras_serde_roundtrip() {
6119        use awaken_contract::contract::tool::ToolDescriptor;
6120        let extras = RunRequestExtras {
6121            overrides: None,
6122            decisions: vec![],
6123            frontend_tools: vec![ToolDescriptor::new("ft1", "FT1", "desc")],
6124            continue_run_id: None,
6125            run_id_hint: None,
6126            dispatch_id_hint: None,
6127            parent_thread_id: None,
6128            transport_request_id: None,
6129            run_mode: RunMode::Scheduled,
6130            adapter: AdapterKind::Acp,
6131        };
6132        let value = extras.to_value().unwrap().unwrap();
6133        let parsed = RunRequestExtras::from_value(&value).unwrap();
6134        assert_eq!(parsed.frontend_tools.len(), 1);
6135        assert_eq!(parsed.frontend_tools[0].id, "ft1");
6136        assert!(parsed.decisions.is_empty());
6137        assert!(parsed.overrides.is_none());
6138        assert_eq!(parsed.run_mode, RunMode::Scheduled);
6139        assert_eq!(parsed.adapter, AdapterKind::Acp);
6140    }
6141
6142    #[test]
6143    fn run_request_extras_empty_returns_none() {
6144        let extras = RunRequestExtras {
6145            overrides: None,
6146            decisions: vec![],
6147            frontend_tools: vec![],
6148            continue_run_id: None,
6149            run_id_hint: None,
6150            dispatch_id_hint: None,
6151            parent_thread_id: None,
6152            transport_request_id: None,
6153            run_mode: RunMode::Foreground,
6154            adapter: AdapterKind::Internal,
6155        };
6156        assert!(extras.to_value().unwrap().is_none());
6157    }
6158
6159    #[test]
6160    fn run_request_extras_apply_to_request() {
6161        use awaken_contract::contract::tool::ToolDescriptor;
6162        let extras = RunRequestExtras {
6163            overrides: None,
6164            decisions: vec![],
6165            frontend_tools: vec![ToolDescriptor::new("ft1", "FT1", "desc")],
6166            continue_run_id: None,
6167            run_id_hint: Some("run-1".into()),
6168            dispatch_id_hint: Some("dispatch-1".into()),
6169            parent_thread_id: Some("parent-thread".into()),
6170            transport_request_id: Some("transport-1".into()),
6171            run_mode: RunMode::Resume,
6172            adapter: AdapterKind::AgUi,
6173        };
6174        let request = RunRequest::new("t1", vec![Message::user("hi")]);
6175        let applied = extras.apply_to(request);
6176        assert_eq!(applied.frontend_tools.len(), 1);
6177        assert_eq!(applied.run_id_hint.as_deref(), Some("run-1"));
6178        assert_eq!(applied.dispatch_id_hint.as_deref(), Some("dispatch-1"));
6179        assert_eq!(applied.parent_thread_id.as_deref(), Some("parent-thread"));
6180        assert_eq!(applied.transport_request_id.as_deref(), Some("transport-1"));
6181        assert_eq!(applied.run_mode, RunMode::Resume);
6182        assert_eq!(applied.adapter, AdapterKind::AgUi);
6183    }
6184
6185    #[tokio::test]
6186    async fn prepare_run_round_trips_parent_thread_id() {
6187        let store = make_store();
6188        let runtime = make_runtime();
6189        let thread_store = Arc::new(InMemoryStore::new());
6190        let mailbox = Arc::new(Mailbox::new(
6191            runtime,
6192            store,
6193            thread_store.clone(),
6194            "test-consumer".to_string(),
6195            MailboxConfig::default(),
6196        ));
6197        thread_store
6198            .save_thread(&Thread::with_id("thread-parent"))
6199            .await
6200            .unwrap();
6201
6202        let mut request = RunRequest::new("thread-child", vec![Message::user("hi")])
6203            .with_agent_id("agent")
6204            .with_parent_thread_id("thread-parent");
6205        let (thread_id, messages) =
6206            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6207                .unwrap();
6208        let run_id = mailbox
6209            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6210            .await
6211            .unwrap();
6212        let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6213
6214        assert_eq!(
6215            run.request
6216                .as_ref()
6217                .and_then(|snapshot| snapshot.parent_thread_id.as_deref()),
6218            Some("thread-parent")
6219        );
6220    }
6221
6222    #[tokio::test]
6223    async fn prepare_run_preserves_origin_metadata() {
6224        let store = make_store();
6225        let runtime = make_runtime();
6226        let thread_store = Arc::new(InMemoryStore::new());
6227        let mailbox = Arc::new(Mailbox::new(
6228            runtime,
6229            store,
6230            thread_store.clone(),
6231            "test-consumer".to_string(),
6232            MailboxConfig::default(),
6233        ));
6234
6235        let mut request = RunRequest::new("thread-meta", vec![Message::user("hi")])
6236            .with_agent_id("a1")
6237            .with_origin(RunRequestOrigin::A2A)
6238            .with_parent_run_id("parent-run-1");
6239        let (thread_id, messages) =
6240            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6241                .unwrap();
6242        let run_id = mailbox
6243            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6244            .await
6245            .unwrap();
6246        let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6247        let snapshot = run.request.as_ref().unwrap();
6248
6249        assert!(matches!(snapshot.origin, RunRequestOrigin::A2A));
6250        assert_eq!(run.parent_run_id.as_deref(), Some("parent-run-1"));
6251    }
6252
6253    #[tokio::test]
6254    async fn prepare_run_defaults_origin_to_user() {
6255        let store = make_store();
6256        let runtime = make_runtime();
6257        let thread_store = Arc::new(InMemoryStore::new());
6258        let mailbox = Arc::new(Mailbox::new(
6259            runtime,
6260            store,
6261            thread_store.clone(),
6262            "test-consumer".to_string(),
6263            MailboxConfig::default(),
6264        ));
6265
6266        let mut request = RunRequest::new("thread-default", vec![Message::user("hi")]);
6267        let (thread_id, messages) =
6268            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6269                .unwrap();
6270        let run_id = mailbox
6271            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6272            .await
6273            .unwrap();
6274        let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6275
6276        assert!(matches!(
6277            run.request.as_ref().unwrap().origin,
6278            RunRequestOrigin::User
6279        ));
6280        assert!(run.parent_run_id.is_none());
6281    }
6282
6283    // ── MailboxError variants ──────────────────────────────────────
6284
6285    #[test]
6286    fn mailbox_error_store_variant() {
6287        use awaken_contract::contract::storage::StorageError;
6288        let err: MailboxError = StorageError::NotFound("x".to_string()).into();
6289        let msg = err.to_string();
6290        assert!(msg.contains("store error"));
6291    }
6292
6293    // ── MailboxRunOutcome debug ──────────────────────────────────────
6294
6295    #[test]
6296    fn mailbox_run_outcome_debug() {
6297        let completed = MailboxRunOutcome::Completed;
6298        let transient = MailboxRunOutcome::TransientError("oops".to_string());
6299        let permanent = MailboxRunOutcome::PermanentError("fatal".to_string());
6300        assert!(format!("{:?}", completed).contains("Completed"));
6301        assert!(format!("{:?}", transient).contains("oops"));
6302        assert!(format!("{:?}", permanent).contains("fatal"));
6303    }
6304
6305    #[test]
6306    fn mailbox_run_outcome_metric_labels_are_stable() {
6307        assert_eq!(MailboxRunOutcome::Completed.metric_label(), "completed");
6308        assert_eq!(
6309            MailboxRunOutcome::TransientError("retry".into()).metric_label(),
6310            "transient_error"
6311        );
6312        assert_eq!(
6313            MailboxRunOutcome::PermanentError("fatal".into()).metric_label(),
6314            "permanent_error"
6315        );
6316    }
6317
6318    #[tokio::test]
6319    async fn mailbox_execution_records_dispatch_latency_metrics() {
6320        crate::metrics::install_recorder();
6321        let mailbox_store = make_store();
6322        let run_store = Arc::new(InMemoryStore::new());
6323        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(RecordingMailboxRuntime::default());
6324        let mailbox = Arc::new(Mailbox::new_with_executor(
6325            runtime,
6326            mailbox_store.clone(),
6327            run_store,
6328            "test-consumer".to_string(),
6329            MailboxConfig::default(),
6330        ));
6331
6332        let submitted = mailbox
6333            .submit_background(RunRequest::new("thread-metrics", vec![Message::user("go")]))
6334            .await
6335            .expect("submit should succeed");
6336
6337        wait_for_dispatch(&mailbox_store, &submitted.dispatch_id, |dispatch| {
6338            dispatch.status == RunDispatchStatus::Acked
6339        })
6340        .await;
6341
6342        let output = crate::metrics::render().unwrap_or_default();
6343        assert!(output.contains("awaken_mailbox_dispatch_enqueue_to_start_seconds"));
6344        assert!(output.contains("awaken_mailbox_dispatch_eligible_to_start_seconds"));
6345        assert!(output.contains("awaken_mailbox_dispatch_claim_to_start_seconds"));
6346        assert!(output.contains("awaken_mailbox_dispatch_enqueue_to_complete_seconds"));
6347        assert!(output.contains("awaken_mailbox_dispatch_runtime_seconds"));
6348        assert!(output.contains("awaken_runs_total"));
6349        assert!(output.contains("awaken_run_duration_seconds"));
6350        assert!(output.contains("awaken_mailbox_operations_total"));
6351        assert!(output.contains("awaken_mailbox_operation_duration_seconds"));
6352        assert!(output.contains("awaken_mailbox_dispatch_depth"));
6353        assert!(output.contains("status=\"queued\""));
6354        assert!(output.contains("operation=\"enqueue\""));
6355        assert!(output.contains("operation=\"claim\""));
6356        assert!(output.contains("operation=\"ack\""));
6357    }
6358
    /// Checks that a claimed dispatch gets its lease extended while the
    /// executor is still running, and that a manual reclaim therefore returns
    /// nothing.
    ///
    /// The lease (80 ms) and renewal interval (20 ms) are configured short so
    /// an extension becomes observable well inside the two-second timeouts
    /// used below.
    #[tokio::test]
    async fn mailbox_lease_renewal_is_wired_and_prevents_reclaim() {
        crate::metrics::install_recorder();
        let mailbox_store = make_store();
        let run_store = Arc::new(InMemoryStore::new());
        let (started_tx, mut started_rx) = tokio::sync::mpsc::unbounded_channel();
        let release_first = Arc::new(tokio::sync::Notify::new());
        // NOTE(review): presumably `BlockingMailboxRuntime` reports on `started_tx`
        // when a run begins and then waits for `release_first` — confirm against
        // its definition.
        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(BlockingMailboxRuntime::new(
            started_tx,
            Arc::clone(&release_first),
        ));
        let mailbox = Arc::new(Mailbox::new_with_executor(
            runtime,
            mailbox_store.clone(),
            run_store,
            "lease-metrics-consumer".to_string(),
            MailboxConfig {
                lease_ms: 80,
                lease_renewal_interval: Duration::from_millis(20),
                ..MailboxConfig::default()
            },
        ));

        let submitted = mailbox
            .submit_background(RunRequest::new(
                "thread-lease-renewal",
                vec![Message::user("go")],
            ))
            .await
            .expect("submit should succeed");
        // Wait until the executor reports that the run has started.
        tokio::time::timeout(Duration::from_secs(2), started_rx.recv())
            .await
            .expect("runtime should start")
            .expect("runtime should report start");

        // Snapshot the lease deadline stored for the now-claimed dispatch.
        let initial_lease = mailbox_store
            .load_dispatch(&submitted.dispatch_id)
            .await
            .expect("load dispatch")
            .expect("dispatch should exist")
            .lease_until
            .expect("claimed dispatch should have a lease");

        // Poll every 10 ms until the stored deadline moves past the snapshot.
        tokio::time::timeout(Duration::from_secs(2), async {
            loop {
                let dispatch = mailbox_store
                    .load_dispatch(&submitted.dispatch_id)
                    .await
                    .expect("load dispatch")
                    .expect("dispatch should exist");
                if dispatch
                    .lease_until
                    .is_some_and(|lease| lease > initial_lease)
                {
                    break;
                }
                sleep(Duration::from_millis(10)).await;
            }
        })
        .await
        .expect("lease renewal should extend the claimed dispatch");

        // Reclaim directly against the store as of "now"; the renewed lease
        // must not be among the results.
        let reclaimed = mailbox_store
            .reclaim_expired_leases(now_ms(), 10)
            .await
            .expect("manual reclaim should succeed");
        assert!(
            reclaimed.is_empty(),
            "renewed dispatch must not be reclaimed while runtime is active"
        );

        // Unblock the executor and wait for the dispatch to reach `Acked`.
        release_first.notify_one();
        wait_for_dispatch(&mailbox_store, &submitted.dispatch_id, |dispatch| {
            dispatch.status == RunDispatchStatus::Acked
        })
        .await;

        // The renewal should also show up in the rendered metrics.
        let output = crate::metrics::render().unwrap_or_default();
        assert!(output.contains("operation=\"lease_renewal\""));
        assert!(output.contains("result=\"ok\""));
    }
6440
6441    #[tokio::test]
6442    async fn background_success_records_run_result_and_keeps_dispatch_id_separate_from_run_id() {
6443        let mailbox_store = make_store();
6444        let run_store = Arc::new(InMemoryStore::new());
6445        let llm = Arc::new(ScriptedLlm::new(vec![StreamResult {
6446            content: vec![ContentBlock::text("finished")],
6447            tool_calls: vec![],
6448            usage: None,
6449            stop_reason: Some(StopReason::EndTurn),
6450            has_incomplete_tool_calls: false,
6451        }]));
6452        let resolver = Arc::new(FixedResolver {
6453            agent: ResolvedAgent::new("agent", "m", "sys", llm),
6454            plugins: vec![],
6455        });
6456        let runtime = Arc::new(AgentRuntime::new(resolver));
6457        let mailbox = Arc::new(Mailbox::new(
6458            runtime,
6459            mailbox_store.clone(),
6460            run_store,
6461            "test-consumer".to_string(),
6462            MailboxConfig::default(),
6463        ));
6464
6465        let submitted = mailbox
6466            .submit_background(
6467                RunRequest::new("thread-run-result", vec![Message::user("go")])
6468                    .with_agent_id("agent"),
6469            )
6470            .await
6471            .expect("submit should succeed");
6472
6473        let acked = wait_for_dispatch(&mailbox_store, &submitted.dispatch_id, |dispatch| {
6474            dispatch.status == RunDispatchStatus::Acked
6475                && dispatch.run_status == Some(RunStatus::Done)
6476        })
6477        .await;
6478
6479        let run_id = acked.run_id.as_str();
6480        assert_ne!(
6481            run_id, submitted.dispatch_id,
6482            "default mailbox dispatch IDs must not be used as canonical run IDs"
6483        );
6484        assert!(acked.dispatch_instance_id.is_some());
6485        assert_eq!(acked.termination, Some(TerminationReason::NaturalEnd));
6486        assert_eq!(acked.run_response.as_deref(), Some("finished"));
6487        assert!(acked.run_error.is_none());
6488        assert!(acked.completed_at.is_some());
6489    }
6490
6491    #[tokio::test]
6492    async fn background_permanent_error_records_run_result_before_dead_letter() {
6493        let store = make_store();
6494        let runtime = make_runtime();
6495        let mailbox = make_mailbox(runtime, store.clone());
6496
6497        let submitted = mailbox
6498            .submit_background(
6499                RunRequest::new("thread-missing-agent", vec![Message::user("go")])
6500                    .with_agent_id("missing-agent"),
6501            )
6502            .await
6503            .expect("submit should succeed");
6504
6505        let dead = wait_for_dispatch(&store, &submitted.dispatch_id, |dispatch| {
6506            dispatch.status == RunDispatchStatus::DeadLetter
6507                && dispatch.run_status == Some(RunStatus::Done)
6508                && dispatch.run_error.is_some()
6509        })
6510        .await;
6511
6512        let run_id = dead.run_id.as_str();
6513        assert_ne!(
6514            run_id, submitted.dispatch_id,
6515            "synthetic terminal events must preserve canonical run id instead of reusing dispatch id"
6516        );
6517        assert!(dead.dispatch_instance_id.is_some());
6518        assert!(matches!(
6519            dead.termination,
6520            Some(TerminationReason::Error(ref message)) if message.contains("missing-agent")
6521        ));
6522        assert!(
6523            dead.last_error
6524                .as_deref()
6525                .is_some_and(|error| error.contains("missing-agent"))
6526        );
6527        assert!(
6528            dead.run_error
6529                .as_deref()
6530                .is_some_and(|error| error.contains("missing-agent"))
6531        );
6532        assert!(dead.completed_at.is_some());
6533    }
6534
    /// Enqueues a dispatch whose `run_id` was never prepared through the
    /// mailbox, followed by a properly prepared dispatch on the same thread,
    /// and checks that the first ends up dead-lettered while the second is
    /// still acked.
    #[tokio::test]
    async fn reconstruct_failure_cleans_worker_and_dispatches_next_queued() {
        let store = make_store();
        let thread_store = Arc::new(InMemoryStore::new());
        let runtime = Arc::new(RecordingMailboxRuntime::default());
        let mailbox = Arc::new(Mailbox::new_with_executor(
            runtime,
            store.clone(),
            thread_store.clone(),
            "test-consumer".to_string(),
            MailboxConfig::default(),
        ));
        let thread_id = "thread-reconstruct-next";
        let now = now_ms();

        // Hand-built dispatch: `prepare_run_for_dispatch` is never called for
        // "missing-run", so this dispatch is enqueued without that preparation.
        let missing = RunDispatch {
            dispatch_id: "dispatch-missing-run".to_string(),
            thread_id: thread_id.to_string(),
            run_id: "missing-run".to_string(),
            priority: 10,
            dedupe_key: None,
            dispatch_epoch: 0,
            status: RunDispatchStatus::Queued,
            available_at: now,
            attempt_count: 0,
            max_attempts: 3,
            last_error: None,
            claim_token: None,
            claimed_by: None,
            lease_until: None,
            dispatch_instance_id: None,
            run_status: None,
            termination: None,
            run_response: None,
            run_error: None,
            completed_at: None,
            created_at: now,
            updated_at: now,
        };
        store.enqueue(&missing).await.expect("enqueue missing run");

        // Second dispatch goes through the regular validate/prepare/build path.
        let mut next_request =
            RunRequest::new(thread_id, vec![Message::user("next")]).with_agent_id("agent");
        let (_, next_messages) = validate_run_inputs(
            next_request.thread_id.clone(),
            next_request.messages.clone(),
            false,
        )
        .expect("next input should validate");
        mailbox
            .prepare_run_for_dispatch(&mut next_request, thread_id, &next_messages)
            .await
            .expect("prepare next run");
        let mut next = mailbox
            .build_dispatch(&next_request, thread_id)
            .expect("build next dispatch");
        // NOTE(review): presumably the priority value (20 vs 10) and the later
        // `created_at` make the store hand out `missing` first — confirm against
        // the store's claim ordering.
        next.priority = 20;
        next.created_at = now.saturating_add(1);
        next.updated_at = next.created_at;
        let next_dispatch_id = next.dispatch_id.clone();
        store.enqueue(&next).await.expect("enqueue next");

        // Create the thread worker and trigger one dispatch attempt by hand.
        mailbox.get_or_create_worker(thread_id).await;
        assert_eq!(
            mailbox.try_dispatch_next(thread_id).await,
            DispatchAttempt::Claimed
        );

        let dead = wait_for_dispatch(&store, "dispatch-missing-run", |dispatch| {
            dispatch.status == RunDispatchStatus::DeadLetter
        })
        .await;
        assert_eq!(dead.status, RunDispatchStatus::DeadLetter);

        // No further manual dispatch attempt is made; the prepared dispatch must
        // still reach `Acked`.
        let acked = wait_for_dispatch(&store, &next_dispatch_id, |dispatch| {
            dispatch.status == RunDispatchStatus::Acked
        })
        .await;
        assert_eq!(acked.status, RunDispatchStatus::Acked);
    }
6615
6616    // ── MailboxDispatchStatus ────────────────────────────────────────
6617
6618    #[test]
6619    fn dispatch_status_queued_zero() {
6620        let running = MailboxDispatchStatus::Running;
6621        let status = MailboxDispatchStatus::Queued;
6622        assert!(matches!(running, MailboxDispatchStatus::Running));
6623        assert!(matches!(status, MailboxDispatchStatus::Queued));
6624    }
6625
6626    // ── Interrupt test ──────────────────────────────────────────────
6627
6628    #[tokio::test]
6629    async fn interrupt_bumps_dispatch_epoch() {
6630        let store = make_store();
6631        let runtime = make_runtime();
6632        let mailbox = make_mailbox(runtime, store.clone());
6633
6634        // Submit some dispatches
6635        let request =
6636            RunRequest::new("thread-int", vec![Message::user("a")]).with_agent_id("agent-1");
6637        mailbox.submit_background(request).await.unwrap();
6638
6639        let result = mailbox.interrupt("thread-int").await.unwrap();
6640        // After interrupt, the dispatch epoch should be bumped
6641        assert!(result.new_dispatch_epoch > 0);
6642    }
6643
6644    #[tokio::test]
6645    async fn interrupt_marks_superseded_queued_runs_cancelled() {
6646        crate::metrics::install_recorder();
6647        let store = make_store();
6648        let run_store = Arc::new(InMemoryStore::new());
6649        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6650        let mailbox = Arc::new(Mailbox::new_with_executor(
6651            runtime,
6652            store.clone(),
6653            run_store.clone(),
6654            "test-consumer".to_string(),
6655            MailboxConfig::default(),
6656        ));
6657
6658        let first =
6659            enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-int-runs", "first").await;
6660        let second =
6661            enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-int-runs", "second").await;
6662
6663        let result = mailbox.interrupt_detailed("thread-int-runs").await.unwrap();
6664        assert_eq!(result.superseded_count, 2);
6665        assert_eq!(result.superseded_dispatches.len(), 2);
6666
6667        for submitted in [&first, &second] {
6668            let dispatch = store
6669                .load_dispatch(&submitted.dispatch_id)
6670                .await
6671                .unwrap()
6672                .expect("superseded dispatch should remain inspectable");
6673            assert_eq!(dispatch.status, RunDispatchStatus::Superseded);
6674
6675            let run = run_store
6676                .load_run(&submitted.run_id)
6677                .await
6678                .unwrap()
6679                .expect("superseded run should remain inspectable");
6680            assert_eq!(run.status, RunStatus::Done);
6681            assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
6682            assert_eq!(
6683                run.dispatch_id.as_deref(),
6684                Some(submitted.dispatch_id.as_str())
6685            );
6686        }
6687
6688        let output = crate::metrics::render().unwrap_or_default();
6689        assert!(output.contains("operation=\"mark_run_superseded\""));
6690        assert!(output.contains("outcome=\"superseded\""));
6691    }
6692
6693    #[tokio::test]
6694    async fn foreground_submit_marks_prior_queued_run_cancelled() {
6695        let store = make_store();
6696        let run_store = Arc::new(InMemoryStore::new());
6697        let runtime = Arc::new(CountingMailboxRuntime::default());
6698        let mailbox = Arc::new(Mailbox::new_with_executor(
6699            runtime,
6700            store.clone(),
6701            run_store.clone(),
6702            "foreground-consumer".to_string(),
6703            MailboxConfig::default(),
6704        ));
6705
6706        let old =
6707            enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-submit-supersede", "old")
6708                .await;
6709
6710        let (_new_result, _events) = mailbox
6711            .submit(
6712                RunRequest::new("thread-submit-supersede", vec![Message::user("new")])
6713                    .with_agent_id("agent"),
6714            )
6715            .await
6716            .expect("foreground submit should claim replacement dispatch");
6717
6718        let old_dispatch = store
6719            .load_dispatch(&old.dispatch_id)
6720            .await
6721            .unwrap()
6722            .expect("old dispatch should remain inspectable");
6723        assert_eq!(old_dispatch.status, RunDispatchStatus::Superseded);
6724
6725        let old_run = run_store
6726            .load_run(&old.run_id)
6727            .await
6728            .unwrap()
6729            .expect("old run should remain inspectable");
6730        assert_eq!(old_run.status, RunStatus::Done);
6731        assert_eq!(
6732            old_run.termination_reason,
6733            Some(TerminationReason::Cancelled)
6734        );
6735        assert_eq!(
6736            old_run.dispatch_id.as_deref(),
6737            Some(old.dispatch_id.as_str())
6738        );
6739    }
6740
6741    #[tokio::test]
6742    async fn submit_inline_claim_empty_cancels_precreated_run() {
6743        crate::metrics::install_recorder();
6744        let store = Arc::new(SignalMailboxStore::with_empty_claim_dispatch_once());
6745        let run_store = Arc::new(InMemoryStore::new());
6746        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6747        let mailbox = Arc::new(Mailbox::new_with_executor(
6748            runtime,
6749            store.clone(),
6750            run_store.clone(),
6751            "inline-empty-consumer".to_string(),
6752            MailboxConfig::default(),
6753        ));
6754
6755        let error = match mailbox
6756            .submit(
6757                RunRequest::new("thread-inline-empty", vec![Message::user("go")])
6758                    .with_agent_id("agent"),
6759            )
6760            .await
6761        {
6762            Ok(_) => panic!("inline submit should fail when claim_dispatch returns empty"),
6763            Err(error) => error,
6764        };
6765        assert!(error.to_string().contains(ACTIVE_RUN_CONFLICT_MESSAGE));
6766
6767        let dispatches = store
6768            .inner
6769            .list_dispatches("thread-inline-empty", None, 10, 0)
6770            .await
6771            .expect("list inline cleanup dispatches");
6772        assert_eq!(dispatches.len(), 1);
6773        let dispatch = &dispatches[0];
6774        assert_eq!(dispatch.status, RunDispatchStatus::Cancelled);
6775
6776        let run = run_store
6777            .load_run(&dispatch.run_id)
6778            .await
6779            .unwrap()
6780            .expect("inline cleanup should keep run inspectable");
6781        assert_eq!(run.status, RunStatus::Done);
6782        assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
6783        assert_eq!(
6784            run.dispatch_id.as_deref(),
6785            Some(dispatch.dispatch_id.as_str())
6786        );
6787
6788        let output = crate::metrics::render().unwrap_or_default();
6789        assert!(output.contains("operation=\"mark_run_cancelled\""));
6790        assert!(output.contains("outcome=\"cancelled\""));
6791    }
6792
6793    #[tokio::test]
6794    async fn recover_reconciles_terminal_cancelled_and_superseded_dispatches_after_crash() {
6795        crate::metrics::install_recorder();
6796        let store = make_store();
6797        let run_store = Arc::new(InMemoryStore::new());
6798        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6799        let mailbox = Arc::new(Mailbox::new_with_executor(
6800            runtime,
6801            store.clone(),
6802            run_store.clone(),
6803            "reconcile-consumer".to_string(),
6804            MailboxConfig::default(),
6805        ));
6806
6807        let cancelled = enqueue_prepared_dispatch(
6808            &mailbox,
6809            store.as_ref(),
6810            "thread-reconcile-cancel",
6811            "cancel",
6812        )
6813        .await;
6814        let superseded = enqueue_prepared_dispatch(
6815            &mailbox,
6816            store.as_ref(),
6817            "thread-reconcile-supersede",
6818            "supersede",
6819        )
6820        .await;
6821
6822        store
6823            .cancel(&cancelled.dispatch_id, now_ms())
6824            .await
6825            .expect("direct cancel should terminalize dispatch");
6826        store
6827            .interrupt("thread-reconcile-supersede", now_ms())
6828            .await
6829            .expect("direct interrupt should supersede dispatch");
6830
6831        for submitted in [&cancelled, &superseded] {
6832            let before = run_store
6833                .load_run(&submitted.run_id)
6834                .await
6835                .unwrap()
6836                .expect("prepared run should exist before reconciliation");
6837            assert_eq!(before.status, RunStatus::Created);
6838            assert!(before.dispatch_id.is_none());
6839        }
6840
6841        let recovered = mailbox.recover().await.expect("recover should reconcile");
6842        assert_eq!(recovered, 0);
6843
6844        for submitted in [&cancelled, &superseded] {
6845            let run = run_store
6846                .load_run(&submitted.run_id)
6847                .await
6848                .unwrap()
6849                .expect("reconciled run should remain inspectable");
6850            assert_eq!(run.status, RunStatus::Done);
6851            assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
6852            assert_eq!(
6853                run.dispatch_id.as_deref(),
6854                Some(submitted.dispatch_id.as_str())
6855            );
6856        }
6857
6858        let output = crate::metrics::render().unwrap_or_default();
6859        assert!(output.contains("operation=\"list_terminal_dispatches\""));
6860        assert!(output.contains("operation=\"reconcile_terminal_dispatch\""));
6861    }
6862
6863    #[tokio::test]
6864    async fn reclaim_dead_letter_marks_run_error() {
6865        crate::metrics::install_recorder();
6866        let store = make_store();
6867        let run_store = Arc::new(InMemoryStore::new());
6868        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6869        let mailbox = Arc::new(Mailbox::new_with_executor(
6870            runtime,
6871            store.clone(),
6872            run_store.clone(),
6873            "test-consumer".to_string(),
6874            MailboxConfig::default(),
6875        ));
6876
6877        let mut dispatch = prepare_queued_dispatch(&mailbox, "thread-reclaim-dead", "expire").await;
6878        dispatch.max_attempts = 1;
6879        dispatch.available_at = 1000;
6880        let dispatch_id = dispatch.dispatch_id.clone();
6881        let run_id = dispatch.run_id.clone();
6882        store.enqueue(&dispatch).await.expect("enqueue dispatch");
6883        let claimed = store
6884            .claim("thread-reclaim-dead", "stale-consumer", 100, 1000, 1)
6885            .await
6886            .expect("claim dispatch");
6887        assert_eq!(claimed.len(), 1);
6888
6889        mailbox
6890            .recover()
6891            .await
6892            .expect("recover should reclaim expired lease");
6893
6894        let dead_letter = store
6895            .load_dispatch(&dispatch_id)
6896            .await
6897            .unwrap()
6898            .expect("dead-lettered dispatch should remain inspectable");
6899        assert_eq!(dead_letter.status, RunDispatchStatus::DeadLetter);
6900
6901        let run = run_store
6902            .load_run(&run_id)
6903            .await
6904            .unwrap()
6905            .expect("dead-lettered run should remain inspectable");
6906        assert_eq!(run.status, RunStatus::Done);
6907        assert!(
6908            matches!(run.termination_reason, Some(TerminationReason::Error(_))),
6909            "dead-lettered dispatch should mark the run as errored"
6910        );
6911        assert_eq!(run.dispatch_id.as_deref(), Some(dispatch_id.as_str()));
6912
6913        let output = crate::metrics::render().unwrap_or_default();
6914        assert!(output.contains("operation=\"mark_run_dead_letter\""));
6915        assert!(output.contains("outcome=\"dead_letter\""));
6916    }
6917
6918    #[tokio::test]
6919    async fn sweep_reconciles_dead_letter_dispatch_after_crash() {
6920        let store = make_store();
6921        let run_store = Arc::new(InMemoryStore::new());
6922        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6923        let mailbox = Arc::new(Mailbox::new_with_executor(
6924            runtime,
6925            store.clone(),
6926            run_store.clone(),
6927            "sweep-reconcile-consumer".to_string(),
6928            MailboxConfig::default(),
6929        ));
6930
6931        let mut dispatch =
6932            prepare_queued_dispatch(&mailbox, "thread-sweep-reconcile-dead", "dead").await;
6933        dispatch.available_at = 1;
6934        let dispatch_id = dispatch.dispatch_id.clone();
6935        let run_id = dispatch.run_id.clone();
6936        store.enqueue(&dispatch).await.expect("enqueue dispatch");
6937        let claimed = store
6938            .claim(
6939                "thread-sweep-reconcile-dead",
6940                "stale-consumer",
6941                100,
6942                now_ms(),
6943                1,
6944            )
6945            .await
6946            .expect("claim dispatch");
6947        let claim_token = claimed[0]
6948            .claim_token
6949            .as_deref()
6950            .expect("claimed dispatch should have a claim token")
6951            .to_string();
6952        store
6953            .dead_letter(
6954                &dispatch_id,
6955                &claim_token,
6956                "crashed after dead_letter",
6957                now_ms(),
6958            )
6959            .await
6960            .expect("direct dead_letter should terminalize dispatch");
6961
6962        let before = run_store
6963            .load_run(&run_id)
6964            .await
6965            .unwrap()
6966            .expect("prepared run should exist before sweep reconciliation");
6967        assert_eq!(before.status, RunStatus::Created);
6968
6969        mailbox.run_sweep().await;
6970
6971        let run = run_store
6972            .load_run(&run_id)
6973            .await
6974            .unwrap()
6975            .expect("reconciled run should remain inspectable");
6976        assert_eq!(run.status, RunStatus::Done);
6977        assert!(
6978            matches!(run.termination_reason, Some(TerminationReason::Error(_))),
6979            "dead-lettered dispatch should reconcile the run as errored"
6980        );
6981        assert_eq!(run.dispatch_id.as_deref(), Some(dispatch_id.as_str()));
6982    }
6983
6984    // ── submit streaming returns event channel ──────────────────────
6985
6986    #[tokio::test]
6987    async fn submit_returns_event_channel() {
6988        let store = make_store();
6989        let runtime = make_runtime();
6990        let mailbox = make_mailbox(runtime, store.clone());
6991
6992        let request =
6993            RunRequest::new("thread-stream", vec![Message::user("hi")]).with_agent_id("agent-1");
6994        let (result, _event_rx) = mailbox.submit(request).await.unwrap();
6995
6996        assert_eq!(result.thread_id, "thread-stream");
6997        assert!(!result.dispatch_id.is_empty());
6998        assert!(matches!(
6999            result.status,
7000            MailboxDispatchStatus::Running | MailboxDispatchStatus::Queued
7001        ));
7002    }
7003
    /// While a run is parked inside a tool call, `submit_live_then_queue` on
    /// the same thread must be answered with the active run's ids, the live
    /// message must reach the second LLM turn, be persisted exactly once, and
    /// no additional dispatch may be created for the run.
    #[tokio::test]
    async fn live_then_queue_steers_active_run_without_new_dispatch() {
        let store = Arc::new(InMemoryStore::new());
        let mailbox_store = make_store();
        let requests = Arc::new(StdMutex::new(Vec::new()));
        let (started_tx, started_rx) = tokio::sync::oneshot::channel();
        let (release_tx, release_rx) = tokio::sync::oneshot::channel();
        // Two scripted turns: the first asks for the `block` tool, the second
        // ends the turn with plain text. `requests` is shared with the LLM so
        // the test can inspect it afterwards.
        let llm = Arc::new(RecordingLlm::new(
            vec![
                StreamResult {
                    content: vec![ContentBlock::text("start tool")],
                    tool_calls: vec![ToolCall::new("block-1", "block", json!({}))],
                    usage: None,
                    stop_reason: Some(StopReason::ToolUse),
                    has_incomplete_tool_calls: false,
                },
                StreamResult {
                    content: vec![ContentBlock::text("saw live input")],
                    tool_calls: vec![],
                    usage: None,
                    stop_reason: Some(StopReason::EndTurn),
                    has_incomplete_tool_calls: false,
                },
            ],
            requests.clone(),
        ));
        // NOTE(review): presumably `BlockingTool` signals `started_tx` when
        // invoked and then waits on `release_rx` — confirm against its definition.
        let resolver = Arc::new(FixedResolver {
            agent: ResolvedAgent::new("agent", "m", "sys", llm)
                .with_tool(Arc::new(BlockingTool::new(started_tx, release_rx))),
            plugins: vec![],
        });
        let runtime = Arc::new(
            AgentRuntime::new(resolver)
                .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>)
                .with_mailbox_store(mailbox_store.clone()),
        );
        let mailbox = make_mailbox_with_run_store(
            runtime,
            mailbox_store.clone(),
            store.clone() as Arc<dyn ThreadRunStore>,
        );

        let first = mailbox
            .submit_background(
                RunRequest::new("thread-live-steer", vec![Message::user("start")])
                    .with_agent_id("agent"),
            )
            .await
            .expect("initial submit should start");

        // Wait until the tool reports it has started before steering.
        tokio::time::timeout(Duration::from_secs(1), started_rx)
            .await
            .expect("tool should start")
            .expect("started signal should send");

        let steered = mailbox
            .submit_live_then_queue(
                RunRequest::new("thread-live-steer", vec![Message::user("live steer")])
                    .with_agent_id("agent"),
                None,
            )
            .await
            .expect("live steer should be accepted");
        // The steer must be reported against the already running run and dispatch.
        assert_eq!(steered.status, MailboxDispatchStatus::Running);
        assert_eq!(steered.run_id, first.run_id);
        assert_eq!(steered.dispatch_id, first.dispatch_id);

        // Release the tool (send errors are ignored) and wait for the run to finish.
        let _ = release_tx.send(());
        let latest = wait_for_latest_run(&store, "thread-live-steer", |run| {
            run.status == RunStatus::Done
        })
        .await;
        assert_eq!(latest.run_id, first.run_id);

        // Scope the lock so the guard is dropped before the awaits below.
        let live_message_seen = {
            let recorded = requests.lock().expect("lock poisoned");
            assert_eq!(recorded.len(), 2);
            recorded[1].messages.iter().any(|message| {
                message.text() == "live steer"
                    && message.role == awaken_contract::contract::message::Role::User
                    && message.visibility == awaken_contract::contract::message::Visibility::All
            })
        };
        assert!(
            live_message_seen,
            "second LLM turn should receive the live user message"
        );

        let messages = store
            .load_messages("thread-live-steer")
            .await
            .expect("load messages")
            .expect("messages should be persisted");
        assert_eq!(
            messages
                .iter()
                .filter(|message| message.text() == "live steer")
                .count(),
            1,
            "live message should be persisted exactly once"
        );

        // Only the original dispatch may reference the run.
        let dispatches = mailbox_store
            .list_dispatches("thread-live-steer", None, 10, 0)
            .await
            .expect("list dispatches");
        assert_eq!(
            dispatches
                .iter()
                .filter(|dispatch| dispatch.run_id == first.run_id)
                .count(),
            1,
            "live steering must not create an extra dispatch"
        );
    }
7119
7120    #[tokio::test]
7121    async fn live_then_queue_falls_back_to_durable_dispatch_when_receiver_unavailable() {
7122        let mailbox_store = make_store();
7123        let thread_store = Arc::new(InMemoryStore::new());
7124        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7125        let mailbox = Arc::new(Mailbox::new_with_executor(
7126            runtime,
7127            mailbox_store.clone(),
7128            thread_store.clone(),
7129            "test-consumer".to_string(),
7130            MailboxConfig::default(),
7131        ));
7132        let thread_id = "thread-live-fallback";
7133        let mut active_request =
7134            RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7135        let (_, active_messages) = validate_run_inputs(
7136            active_request.thread_id.clone(),
7137            active_request.messages.clone(),
7138            false,
7139        )
7140        .expect("active input should validate");
7141        mailbox
7142            .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7143            .await
7144            .expect("prepare active run");
7145        let active_dispatch = mailbox
7146            .build_dispatch(&active_request, thread_id)
7147            .expect("build active dispatch");
7148        let active_dispatch_id = active_dispatch.dispatch_id.clone();
7149        mailbox_store
7150            .enqueue(&active_dispatch)
7151            .await
7152            .expect("enqueue active dispatch");
7153        mailbox_store
7154            .claim_dispatch(&active_dispatch_id, "test-consumer", 30_000, now_ms())
7155            .await
7156            .expect("claim active dispatch")
7157            .expect("active dispatch should be claimed");
7158        let worker = mailbox.get_or_create_worker(thread_id).await;
7159        {
7160            let mut worker = worker.lock();
7161            worker.status = MailboxWorkerStatus::Running {
7162                dispatch_id: active_dispatch_id.clone(),
7163                run_id: active_dispatch.run_id.clone(),
7164                lease_handle: tokio::spawn(async {}),
7165                sink: Arc::new(ReconnectableEventSink::new(mpsc::channel(16).0)),
7166            };
7167        }
7168
7169        let result = mailbox
7170            .submit_live_then_queue(
7171                RunRequest::new(thread_id, vec![Message::user("fallback")]).with_agent_id("agent"),
7172                None,
7173            )
7174            .await
7175            .expect("fallback submit should succeed");
7176
7177        assert_eq!(result.status, MailboxDispatchStatus::Queued);
7178        assert_ne!(result.dispatch_id, active_dispatch_id);
7179        let messages = thread_store
7180            .load_messages(thread_id)
7181            .await
7182            .expect("load messages")
7183            .expect("messages should exist");
7184        assert_eq!(
7185            messages
7186                .iter()
7187                .filter(|message| message.text() == "fallback")
7188                .count(),
7189            1,
7190            "fallback message should be persisted once"
7191        );
7192        let queued = mailbox_store
7193            .list_dispatches(thread_id, Some(&[RunDispatchStatus::Queued]), 10, 0)
7194            .await
7195            .expect("list queued");
7196        assert_eq!(queued.len(), 1);
7197        assert_eq!(queued[0].dispatch_id, result.dispatch_id);
7198    }
7199
7200    #[tokio::test]
7201    async fn foreground_submit_sends_live_cancel_for_remote_active_dispatch() {
7202        use awaken_contract::contract::mailbox::LiveRunCommand;
7203        use futures::StreamExt;
7204
7205        let mailbox_store = make_store();
7206        let thread_store = Arc::new(InMemoryStore::new());
7207        let runtime = Arc::new(RecordingMailboxRuntime::default());
7208        let mailbox = Arc::new(Mailbox::new_with_executor(
7209            runtime,
7210            mailbox_store.clone(),
7211            thread_store.clone(),
7212            "foreground-consumer".to_string(),
7213            MailboxConfig::default(),
7214        ));
7215        let thread_id = "thread-remote-foreground";
7216
7217        let mut active_request =
7218            RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7219        let (_, active_messages) = validate_run_inputs(
7220            active_request.thread_id.clone(),
7221            active_request.messages.clone(),
7222            false,
7223        )
7224        .expect("active input should validate");
7225        mailbox
7226            .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7227            .await
7228            .expect("prepare active run");
7229        let active_dispatch = mailbox
7230            .build_dispatch(&active_request, thread_id)
7231            .expect("build active dispatch");
7232        let active_dispatch_id = active_dispatch.dispatch_id.clone();
7233        mailbox_store
7234            .enqueue(&active_dispatch)
7235            .await
7236            .expect("enqueue active dispatch");
7237        let claimed = mailbox_store
7238            .claim_dispatch(&active_dispatch_id, "remote-consumer", 30_000, now_ms())
7239            .await
7240            .expect("claim active dispatch")
7241            .expect("active dispatch should be claimed");
7242        let active_claim_token = claimed.claim_token.clone().expect("claim token");
7243
7244        let subscriber = mailbox_store
7245            .open_live_channel_for(&live_target_for_dispatch(&active_dispatch))
7246            .await
7247            .expect("open live channel");
7248        let captured = Arc::new(tokio::sync::Mutex::new(Vec::<LiveRunCommand>::new()));
7249        let captured_clone = captured.clone();
7250        let store_clone = mailbox_store.clone();
7251        let active_dispatch_id_clone = active_dispatch_id.clone();
7252        let active_claim_token_clone = active_claim_token.clone();
7253        let _forwarder = tokio::spawn(async move {
7254            let mut subscriber = subscriber;
7255            while let Some(entry) = subscriber.next().await {
7256                captured_clone.lock().await.push(entry.command.clone());
7257                if matches!(entry.command, LiveRunCommand::Cancel) {
7258                    let release_result = store_clone
7259                        .ack(
7260                            &active_dispatch_id_clone,
7261                            &active_claim_token_clone,
7262                            now_ms(),
7263                        )
7264                        .await;
7265                    if let Err(error) = release_result {
7266                        assert!(
7267                            matches!(error, StorageError::VersionConflict { .. }),
7268                            "remote run release should either ack or be superseded, got {error:?}"
7269                        );
7270                    }
7271                    entry.receipt.ack();
7272                    break;
7273                }
7274                drop(entry.receipt);
7275            }
7276        });
7277
7278        let (submitted, _events) = mailbox
7279            .submit(
7280                RunRequest::new(thread_id, vec![Message::user("replacement")])
7281                    .with_agent_id("agent"),
7282            )
7283            .await
7284            .expect("foreground submit should cancel remote active run and claim replacement");
7285
7286        assert_eq!(submitted.status, MailboxDispatchStatus::Running);
7287        assert_ne!(submitted.dispatch_id, active_dispatch_id);
7288        let commands = captured.lock().await;
7289        assert!(
7290            commands
7291                .iter()
7292                .any(|command| matches!(command, LiveRunCommand::Cancel)),
7293            "foreground submit must deliver live Cancel to the remote active run"
7294        );
7295    }
7296
7297    #[tokio::test]
7298    async fn foreground_submit_does_not_prepare_replacement_when_remote_cancel_times_out() {
7299        use awaken_contract::contract::mailbox::LiveRunCommand;
7300        use futures::StreamExt;
7301
7302        let mailbox_store = make_store();
7303        let thread_store = Arc::new(InMemoryStore::new());
7304        let runtime = Arc::new(RecordingMailboxRuntime::default());
7305        let mailbox = Arc::new(Mailbox::new_with_executor(
7306            runtime,
7307            mailbox_store.clone(),
7308            thread_store.clone(),
7309            "foreground-consumer".to_string(),
7310            MailboxConfig::default(),
7311        ));
7312        let thread_id = "thread-remote-cancel-timeout";
7313
7314        let mut active_request =
7315            RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7316        let (_, active_messages) = validate_run_inputs(
7317            active_request.thread_id.clone(),
7318            active_request.messages.clone(),
7319            false,
7320        )
7321        .expect("active input should validate");
7322        mailbox
7323            .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7324            .await
7325            .expect("prepare active run");
7326        let active_dispatch = mailbox
7327            .build_dispatch(&active_request, thread_id)
7328            .expect("build active dispatch");
7329        let active_dispatch_id = active_dispatch.dispatch_id.clone();
7330        mailbox_store.enqueue(&active_dispatch).await.unwrap();
7331        mailbox_store
7332            .claim_dispatch(&active_dispatch_id, "remote-consumer", 30_000, now_ms())
7333            .await
7334            .unwrap()
7335            .expect("active dispatch should be claimed");
7336
7337        let subscriber = mailbox_store
7338            .open_live_channel_for(&live_target_for_dispatch(&active_dispatch))
7339            .await
7340            .expect("open live channel");
7341        let _forwarder = tokio::spawn(async move {
7342            let mut subscriber = subscriber;
7343            while let Some(entry) = subscriber.next().await {
7344                if matches!(entry.command, LiveRunCommand::Cancel) {
7345                    // Ack the cancel but intentionally keep the dispatch Claimed.
7346                    entry.receipt.ack();
7347                    break;
7348                }
7349                drop(entry.receipt);
7350            }
7351        });
7352
7353        let result = mailbox
7354            .submit(
7355                RunRequest::new(thread_id, vec![Message::user("replacement")])
7356                    .with_agent_id("agent"),
7357            )
7358            .await;
7359        assert!(
7360            matches!(result, Err(MailboxError::Validation(ref message)) if message == ACTIVE_RUN_CONFLICT_MESSAGE),
7361            "foreground submit must fail before writing replacement state when old claim remains active"
7362        );
7363
7364        let messages = thread_store
7365            .load_messages(thread_id)
7366            .await
7367            .expect("load messages")
7368            .expect("active messages should remain");
7369        assert_eq!(messages.len(), 1);
7370        assert_eq!(messages[0].text(), "active");
7371
7372        let all = mailbox_store
7373            .list_dispatches(thread_id, None, 10, 0)
7374            .await
7375            .expect("list dispatches");
7376        assert_eq!(all.len(), 1);
7377        assert_eq!(all[0].dispatch_id, active_dispatch_id);
7378        assert_eq!(all[0].status, RunDispatchStatus::Claimed);
7379    }
7380
7381    #[tokio::test]
7382    async fn foreground_submit_does_not_prepare_replacement_when_local_cancel_times_out() {
7383        let mailbox_store = make_store();
7384        let thread_store = Arc::new(InMemoryStore::new());
7385        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7386        let mailbox = Arc::new(Mailbox::new_with_executor(
7387            runtime,
7388            mailbox_store.clone(),
7389            thread_store.clone(),
7390            "foreground-consumer".to_string(),
7391            MailboxConfig::default(),
7392        ));
7393        let thread_id = "thread-local-cancel-timeout";
7394
7395        let mut active_request =
7396            RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7397        let (_, active_messages) = validate_run_inputs(
7398            active_request.thread_id.clone(),
7399            active_request.messages.clone(),
7400            false,
7401        )
7402        .expect("active input should validate");
7403        mailbox
7404            .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7405            .await
7406            .expect("prepare active run");
7407        let active_dispatch = mailbox
7408            .build_dispatch(&active_request, thread_id)
7409            .expect("build active dispatch");
7410        let active_dispatch_id = active_dispatch.dispatch_id.clone();
7411        mailbox_store.enqueue(&active_dispatch).await.unwrap();
7412        mailbox_store
7413            .claim_dispatch(&active_dispatch_id, "foreground-consumer", 30_000, now_ms())
7414            .await
7415            .unwrap()
7416            .expect("active dispatch should be claimed");
7417
7418        let result = mailbox
7419            .submit(
7420                RunRequest::new(thread_id, vec![Message::user("replacement")])
7421                    .with_agent_id("agent"),
7422            )
7423            .await;
7424        assert!(
7425            matches!(result, Err(MailboxError::Validation(ref message)) if message == ACTIVE_RUN_CONFLICT_MESSAGE),
7426            "foreground submit must fail before writing replacement state when local cancel does not release"
7427        );
7428
7429        let messages = thread_store
7430            .load_messages(thread_id)
7431            .await
7432            .expect("load messages")
7433            .expect("active messages should remain");
7434        assert_eq!(messages.len(), 1);
7435        assert_eq!(messages[0].text(), "active");
7436
7437        let all = mailbox_store
7438            .list_dispatches(thread_id, None, 10, 0)
7439            .await
7440            .expect("list dispatches");
7441        assert_eq!(all.len(), 1);
7442        assert_eq!(all[0].dispatch_id, active_dispatch_id);
7443        assert_eq!(all[0].status, RunDispatchStatus::Claimed);
7444    }
7445
7446    #[tokio::test]
7447    async fn foreground_submit_waits_for_local_cancelled_dispatch_to_release_claim() {
7448        let mailbox_store = make_store();
7449        let thread_store = Arc::new(InMemoryStore::new());
7450        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(ImmediateLocalCancelRuntime);
7451        let mailbox = Arc::new(Mailbox::new_with_executor(
7452            runtime,
7453            mailbox_store.clone(),
7454            thread_store.clone(),
7455            "foreground-consumer".to_string(),
7456            MailboxConfig::default(),
7457        ));
7458        let thread_id = "thread-local-cancel-claim-window";
7459
7460        let mut active_request =
7461            RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7462        let (_, active_messages) = validate_run_inputs(
7463            active_request.thread_id.clone(),
7464            active_request.messages.clone(),
7465            false,
7466        )
7467        .expect("active input should validate");
7468        mailbox
7469            .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7470            .await
7471            .expect("prepare active run");
7472        let active_dispatch = mailbox
7473            .build_dispatch(&active_request, thread_id)
7474            .expect("build active dispatch");
7475        let active_dispatch_id = active_dispatch.dispatch_id.clone();
7476        mailbox_store.enqueue(&active_dispatch).await.unwrap();
7477        mailbox_store
7478            .claim_dispatch(&active_dispatch_id, "foreground-consumer", 30_000, now_ms())
7479            .await
7480            .unwrap()
7481            .expect("active dispatch should be claimed");
7482
7483        let result = mailbox
7484            .submit(
7485                RunRequest::new(thread_id, vec![Message::user("replacement")])
7486                    .with_agent_id("agent"),
7487            )
7488            .await;
7489        assert!(
7490            matches!(result, Err(MailboxError::Validation(ref message)) if message == ACTIVE_RUN_CONFLICT_MESSAGE),
7491            "foreground submit must fail before writing replacement state when local runtime slot released but mailbox claim remains"
7492        );
7493
7494        let messages = thread_store
7495            .load_messages(thread_id)
7496            .await
7497            .expect("load messages")
7498            .expect("active messages should remain");
7499        assert_eq!(messages.len(), 1);
7500        assert_eq!(messages[0].text(), "active");
7501
7502        let all = mailbox_store
7503            .list_dispatches(thread_id, None, 10, 0)
7504            .await
7505            .expect("list dispatches");
7506        assert_eq!(all.len(), 1);
7507        assert_eq!(all[0].dispatch_id, active_dispatch_id);
7508        assert_eq!(all[0].status, RunDispatchStatus::Claimed);
7509    }
7510
7511    /// Cross-node live delivery: no local worker, but the thread has an
7512    /// active Running run recorded globally in ThreadRunStore. Mailbox must
7513    /// publish on the live channel (for the owning node's forwarder to
7514    /// receive) and return Running rather than falling back.
7515    #[tokio::test]
7516    async fn live_then_queue_publishes_for_remote_active_run() {
7517        use awaken_contract::contract::mailbox::LiveRunCommand;
7518        use futures::StreamExt;
7519
7520        let mailbox_store = make_store();
7521        let thread_store = Arc::new(InMemoryStore::new());
7522        let thread_id = "thread-remote";
7523        let remote_run_id = "run-remote";
7524
7525        // Seed a Running run — simulates another node owning this run.
7526        let mut run = seeded_waiting_run(remote_run_id, thread_id, "agent");
7527        run.status = RunStatus::Running;
7528        thread_store
7529            .create_run(&run)
7530            .await
7531            .expect("seed remote run");
7532
7533        // Simulate the remote forwarder: drain the stream and ack each
7534        // entry so the producer's `deliver_live` resolves as Delivered.
7535        let subscriber = mailbox_store
7536            .open_live_channel_for(&live_target_for_run(&run))
7537            .await
7538            .expect("open live channel");
7539        let captured = std::sync::Arc::new(tokio::sync::Mutex::new(Vec::<LiveRunCommand>::new()));
7540        let captured_clone = captured.clone();
7541        let _forwarder = tokio::spawn(async move {
7542            let mut subscriber = subscriber;
7543            while let Some(entry) = subscriber.next().await {
7544                captured_clone.lock().await.push(entry.command.clone());
7545                entry.receipt.ack();
7546            }
7547        });
7548
7549        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7550        let mailbox = Arc::new(Mailbox::new_with_executor(
7551            runtime,
7552            mailbox_store.clone(),
7553            thread_store.clone(),
7554            "test-consumer".to_string(),
7555            MailboxConfig::default(),
7556        ));
7557
7558        let result = mailbox
7559            .submit_live_then_queue(
7560                RunRequest::new(thread_id, vec![Message::user("steer-remote")])
7561                    .with_agent_id("agent"),
7562                None,
7563            )
7564            .await
7565            .expect("submit should succeed");
7566
7567        assert_eq!(result.status, MailboxDispatchStatus::Running);
7568        assert_eq!(result.run_id, remote_run_id);
7569
7570        // The acked subscriber must have captured the message.
7571        let commands = captured.lock().await;
7572        assert_eq!(commands.len(), 1);
7573        match &commands[0] {
7574            LiveRunCommand::Messages(msgs) => assert_eq!(msgs[0].text(), "steer-remote"),
7575            other => panic!("expected Messages, got {other:?}"),
7576        }
7577        drop(commands);
7578
7579        // No new dispatch should have been enqueued.
7580        let queued = mailbox_store
7581            .list_dispatches(thread_id, Some(&[RunDispatchStatus::Queued]), 10, 0)
7582            .await
7583            .expect("list queued");
7584        assert!(
7585            queued.is_empty(),
7586            "cross-node live delivery must not create a dispatch"
7587        );
7588    }
7589
7590    /// Regression for issue #2: cross-node delivery where the subscriber
7591    /// drops the receipt (simulating inbox full / forwarder failure) must
7592    /// fall back to durable queue, not report `Running`.
7593    #[tokio::test]
7594    async fn live_then_queue_falls_back_when_subscriber_drops_receipt() {
7595        use futures::StreamExt;
7596
7597        let mailbox_store = make_store();
7598        let thread_store = Arc::new(InMemoryStore::new());
7599        let thread_id = "thread-dropped-receipt";
7600
7601        let mut run = seeded_waiting_run("run-dropped", thread_id, "agent");
7602        run.status = RunStatus::Running;
7603        thread_store.create_run(&run).await.expect("seed run");
7604
7605        let subscriber = mailbox_store
7606            .open_live_channel_for(&live_target_for_run(&run))
7607            .await
7608            .expect("open live channel");
7609        // Drop every receipt — simulates forwarder that can't hand off.
7610        let _rogue = tokio::spawn(async move {
7611            let mut subscriber = subscriber;
7612            while let Some(entry) = subscriber.next().await {
7613                drop(entry.receipt);
7614            }
7615        });
7616
7617        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7618        let mailbox = Arc::new(Mailbox::new_with_executor(
7619            runtime,
7620            mailbox_store.clone(),
7621            thread_store.clone(),
7622            "test-consumer".to_string(),
7623            MailboxConfig::default(),
7624        ));
7625
7626        let result = mailbox
7627            .submit_live_then_queue(
7628                RunRequest::new(thread_id, vec![Message::user("hello?")]).with_agent_id("agent"),
7629                None,
7630            )
7631            .await
7632            .expect("submit should succeed via queue fallback");
7633
7634        let dispatches = mailbox_store
7635            .list_dispatches(thread_id, None, 10, 0)
7636            .await
7637            .expect("list dispatches");
7638        assert_eq!(
7639            dispatches.len(),
7640            1,
7641            "unacked receipt must force a durable dispatch"
7642        );
7643        assert_eq!(result.dispatch_id, dispatches[0].dispatch_id);
7644    }
7645
7646    /// Contract test documenting the `submit_live_then_queue` at-least-once
7647    /// guarantee: a forwarder that accepts the live command but whose ack
7648    /// is lost (ack publish failure / network timeout) causes the producer
7649    /// to observe `NoSubscriber` and fall back to durable dispatch, even
7650    /// though the run has already received the payload. Callers needing
7651    /// exactly-once effects must use agent-level idempotency.
7652    #[tokio::test]
7653    async fn live_then_queue_is_at_least_once_when_ack_lost() {
7654        use futures::StreamExt;
7655
7656        let mailbox_store = make_store();
7657        let thread_store = Arc::new(InMemoryStore::new());
7658        let thread_id = "thread-ack-lost";
7659
7660        let mut run = seeded_waiting_run("run-ack-lost", thread_id, "agent");
7661        run.status = RunStatus::Running;
7662        thread_store.create_run(&run).await.expect("seed run");
7663
7664        let subscriber = mailbox_store
7665            .open_live_channel_for(&live_target_for_run(&run))
7666            .await
7667            .expect("open live channel");
7668        // Consumer captures the command (simulates `try_send` success)
7669        // but drops the receipt (simulates the ack publish failing).
7670        let accepted = std::sync::Arc::new(tokio::sync::Mutex::new(Vec::<String>::new()));
7671        let accepted_c = accepted.clone();
7672        let _consumer = tokio::spawn(async move {
7673            let mut subscriber = subscriber;
7674            while let Some(entry) = subscriber.next().await {
7675                if let awaken_contract::contract::mailbox::LiveRunCommand::Messages(ref msgs) =
7676                    entry.command
7677                {
7678                    for m in msgs {
7679                        accepted_c.lock().await.push(m.text());
7680                    }
7681                }
7682                // Forwarder accepted, but ack "publish" never happens.
7683                drop(entry.receipt);
7684            }
7685        });
7686
7687        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7688        let mailbox = Arc::new(Mailbox::new_with_executor(
7689            runtime,
7690            mailbox_store.clone(),
7691            thread_store.clone(),
7692            "test-consumer".to_string(),
7693            MailboxConfig::default(),
7694        ));
7695
7696        let result = mailbox
7697            .submit_live_then_queue(
7698                RunRequest::new(thread_id, vec![Message::user("steer-payload")])
7699                    .with_agent_id("agent"),
7700                None,
7701            )
7702            .await
7703            .expect("submit should succeed via queue fallback");
7704
7705        // Contract part 1: the forwarder DID receive the payload.
7706        let received = accepted.lock().await.clone();
7707        assert_eq!(
7708            received.as_slice(),
7709            &["steer-payload".to_string()],
7710            "forwarder must have observed the live command before dropping receipt"
7711        );
7712
7713        // Contract part 2: because the ack was "lost", submit fell back
7714        // to durable dispatch — the SAME payload is now queued for a
7715        // future run.  This is the at-least-once window.
7716        let dispatches = mailbox_store
7717            .list_dispatches(thread_id, None, 10, 0)
7718            .await
7719            .expect("list dispatches");
7720        assert_eq!(dispatches.len(), 1);
7721        assert_eq!(result.dispatch_id, dispatches[0].dispatch_id);
7722    }
7723
7724    /// expected_run_id mismatch against a remote Running run must abort
7725    /// live delivery and fall back to dispatch (preventing steering the
7726    /// wrong run after a rollover).
7727    #[tokio::test]
7728    async fn live_then_queue_rejects_remote_mismatched_expected_run_id() {
7729        let mailbox_store = make_store();
7730        let thread_store = Arc::new(InMemoryStore::new());
7731        let thread_id = "thread-mismatch";
7732
7733        let mut run = seeded_waiting_run("run-actual", thread_id, "agent");
7734        run.status = RunStatus::Running;
7735        thread_store.create_run(&run).await.expect("seed run");
7736
7737        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7738        let mailbox = Arc::new(Mailbox::new_with_executor(
7739            runtime,
7740            mailbox_store.clone(),
7741            thread_store.clone(),
7742            "test-consumer".to_string(),
7743            MailboxConfig::default(),
7744        ));
7745
7746        let result = mailbox
7747            .submit_live_then_queue(
7748                RunRequest::new(thread_id, vec![Message::user("wrong-run")]).with_agent_id("agent"),
7749                Some("run-stale"),
7750            )
7751            .await
7752            .expect("submit should succeed");
7753
7754        assert_ne!(
7755            result.run_id, "run-actual",
7756            "mismatched expected_run_id must not steer the stale remote run"
7757        );
7758    }
7759
7760    #[tokio::test]
7761    async fn send_decision_live_delivers_to_remote_waiting_run() {
7762        use awaken_contract::contract::mailbox::LiveRunCommand;
7763        use futures::StreamExt;
7764
7765        let mailbox_store = make_store();
7766        let thread_store = Arc::new(InMemoryStore::new());
7767        let thread_id = "thread-remote-decision";
7768        let run = seeded_waiting_run("run-remote-decision", thread_id, "agent");
7769        thread_store.create_run(&run).await.expect("seed run");
7770
7771        let subscriber = mailbox_store
7772            .open_live_channel_for(&live_target_for_run(&run))
7773            .await
7774            .expect("open targeted live channel");
7775        let captured = Arc::new(tokio::sync::Mutex::new(Vec::new()));
7776        let captured_c = captured.clone();
7777        let _forwarder = tokio::spawn(async move {
7778            let mut subscriber = subscriber;
7779            while let Some(entry) = subscriber.next().await {
7780                if let LiveRunCommand::Decision(decisions) = entry.command {
7781                    captured_c.lock().await.push(decisions);
7782                    entry.receipt.ack();
7783                    break;
7784                }
7785                drop(entry.receipt);
7786            }
7787        });
7788
7789        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7790        let mailbox = Arc::new(Mailbox::new_with_executor(
7791            runtime,
7792            mailbox_store,
7793            thread_store,
7794            "test-consumer".to_string(),
7795            MailboxConfig::default(),
7796        ));
7797
7798        let delivered = mailbox
7799            .send_decision_live(thread_id, "tool-1".to_string(), make_resume())
7800            .await
7801            .expect("live decision should not error");
7802        assert!(delivered);
7803        let captured = captured.lock().await;
7804        assert_eq!(captured.len(), 1);
7805        assert_eq!(captured[0][0].0, "tool-1");
7806    }
7807
7808    /// Cross-node live delivery when **no subscriber** is attached to the
7809    /// live channel must fall back to `submit_background` — never report
7810    /// `Running` based on a publish the owning node will never observe.
7811    #[tokio::test]
7812    async fn live_then_queue_falls_back_to_queue_when_no_remote_subscriber() {
7813        let mailbox_store = make_store();
7814        let thread_store = Arc::new(InMemoryStore::new());
7815        let thread_id = "thread-no-subscriber";
7816
7817        // Seed a Running run on some other (imaginary) node. Crucially,
7818        // we do NOT call `open_live_channel` — no one is listening.
7819        let mut run = seeded_waiting_run("run-no-listener", thread_id, "agent");
7820        run.status = RunStatus::Running;
7821        thread_store.create_run(&run).await.expect("seed run");
7822
7823        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7824        let mailbox = Arc::new(Mailbox::new_with_executor(
7825            runtime,
7826            mailbox_store.clone(),
7827            thread_store.clone(),
7828            "test-consumer".to_string(),
7829            MailboxConfig::default(),
7830        ));
7831
7832        let result = mailbox
7833            .submit_live_then_queue(
7834                RunRequest::new(thread_id, vec![Message::user("hello?")]).with_agent_id("agent"),
7835                None,
7836            )
7837            .await
7838            .expect("submit should succeed via queue fallback");
7839
7840        // The submit must have entered the durable queue — a new dispatch
7841        // appears whether Queued or Claimed (depending on whether a worker
7842        // picks it up). The key property is: a dispatch was enqueued rather
7843        // than silently declared `Running` on the remote.
7844        let all_dispatches = mailbox_store
7845            .list_dispatches(thread_id, None, 10, 0)
7846            .await
7847            .expect("list dispatches");
7848        assert_eq!(
7849            all_dispatches.len(),
7850            1,
7851            "no-subscriber cross-node must fall back to durable queue"
7852        );
7853        assert_eq!(result.dispatch_id, all_dispatches[0].dispatch_id);
7854    }
7855
7856    #[tokio::test]
7857    async fn waiting_thread_is_reactivated_by_incoming_message() {
7858        let store = Arc::new(InMemoryStore::new());
7859        store
7860            .create_run(&seeded_waiting_run(
7861                "run-waiting",
7862                "thread-waiting",
7863                "agent",
7864            ))
7865            .await
7866            .expect("seed waiting run");
7867
7868        let llm = Arc::new(ScriptedLlm::new(vec![StreamResult {
7869            content: vec![ContentBlock::text("reactivated")],
7870            tool_calls: vec![],
7871            usage: None,
7872            stop_reason: Some(StopReason::EndTurn),
7873            has_incomplete_tool_calls: false,
7874        }]));
7875        let resolver = Arc::new(FixedResolver {
7876            agent: ResolvedAgent::new("agent", "m", "sys", llm),
7877            plugins: vec![],
7878        });
7879        let runtime = Arc::new(
7880            AgentRuntime::new(resolver)
7881                .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>),
7882        );
7883        let mailbox_store = make_store();
7884        let mailbox = make_mailbox_with_run_store(
7885            runtime,
7886            mailbox_store,
7887            store.clone() as Arc<dyn ThreadRunStore>,
7888        );
7889
7890        let submitted = mailbox
7891            .submit_background(
7892                RunRequest::new("thread-waiting", vec![Message::user("poke")])
7893                    .with_agent_id("agent"),
7894            )
7895            .await
7896            .expect("submit should succeed");
7897        assert_eq!(submitted.run_id, "run-waiting");
7898
7899        let latest = wait_for_latest_run(&store, "thread-waiting", |run| {
7900            run.status == RunStatus::Done && run.updated_at > 1
7901        })
7902        .await;
7903
7904        assert_eq!(
7905            latest.run_id, "run-waiting",
7906            "incoming messages should continue the existing waiting run"
7907        );
7908        assert_eq!(latest.status, RunStatus::Done);
7909    }
7910
7911    #[tokio::test]
7912    async fn structured_user_input_waiting_thread_is_reused_by_incoming_message() {
7913        let store = Arc::new(InMemoryStore::new());
7914        let mut waiting = seeded_waiting_run("run-user-input", "thread-user-input", "agent");
7915        waiting.waiting = Some(RunWaitingState {
7916            reason: WaitingReason::UserInput,
7917            ticket_ids: Vec::new(),
7918            tickets: Vec::new(),
7919            since_dispatch_id: None,
7920            message: Some("waiting for user input".to_string()),
7921        });
7922        store.create_run(&waiting).await.expect("seed waiting run");
7923
7924        let llm = Arc::new(ScriptedLlm::new(vec![StreamResult {
7925            content: vec![ContentBlock::text("continued")],
7926            tool_calls: vec![],
7927            usage: None,
7928            stop_reason: Some(StopReason::EndTurn),
7929            has_incomplete_tool_calls: false,
7930        }]));
7931        let resolver = Arc::new(FixedResolver {
7932            agent: ResolvedAgent::new("agent", "m", "sys", llm),
7933            plugins: vec![],
7934        });
7935        let runtime = Arc::new(
7936            AgentRuntime::new(resolver)
7937                .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>),
7938        );
7939        let mailbox = Arc::new(Mailbox::new(
7940            runtime,
7941            make_store(),
7942            store.clone(),
7943            "test-consumer".to_string(),
7944            MailboxConfig::default(),
7945        ));
7946
7947        let submitted = mailbox
7948            .submit_background(
7949                RunRequest::new("thread-user-input", vec![Message::user("continue")])
7950                    .with_agent_id("agent"),
7951            )
7952            .await
7953            .expect("submit should succeed");
7954
7955        assert_eq!(
7956            submitted.run_id, "run-user-input",
7957            "structured user-input waiting should keep the same user-intent run"
7958        );
7959    }
7960
7961    #[tokio::test]
7962    async fn reusable_waiting_run_prefers_thread_open_run_projection_over_latest_run() {
7963        let store = Arc::new(InMemoryStore::new());
7964        let thread_id = "thread-open-projection";
7965        let mut open = seeded_waiting_run("run-open", thread_id, "agent");
7966        open.waiting = Some(RunWaitingState {
7967            reason: WaitingReason::UserInput,
7968            ticket_ids: Vec::new(),
7969            tickets: Vec::new(),
7970            since_dispatch_id: None,
7971            message: Some("waiting for explicit input".to_string()),
7972        });
7973        open.updated_at = 10;
7974        let mut newer = seeded_waiting_run("run-newer-latest", thread_id, "agent");
7975        newer.updated_at = 20;
7976
7977        store.create_run(&open).await.expect("seed open run");
7978        store.create_run(&newer).await.expect("seed newer run");
7979        let mut thread = Thread::with_id(thread_id);
7980        thread.open_run_id = Some(open.run_id.clone());
7981        store
7982            .save_thread(&thread)
7983            .await
7984            .expect("save thread projection");
7985
7986        let runtime = Arc::new(RecordingStoreMailboxRuntime::new(store.clone()));
7987        let mailbox = Arc::new(Mailbox::new(
7988            runtime.clone(),
7989            make_store(),
7990            store.clone(),
7991            "test-consumer".to_string(),
7992            MailboxConfig::default(),
7993        ));
7994
7995        let submitted = mailbox
7996            .submit_background(
7997                RunRequest::new(thread_id, vec![Message::user("continue open")])
7998                    .with_agent_id("agent"),
7999            )
8000            .await
8001            .expect("submit should succeed");
8002
8003        assert_eq!(
8004            submitted.run_id, "run-open",
8005            "thread.open_run_id must win over latest_run() when resuming same user intent"
8006        );
8007        let deadline = Instant::now() + Duration::from_secs(1);
8008        loop {
8009            if !runtime.requests.lock().expect("lock poisoned").is_empty() {
8010                break;
8011            }
8012            assert!(Instant::now() < deadline, "request was not dispatched");
8013            sleep(Duration::from_millis(5)).await;
8014        }
8015        let requests = runtime.requests.lock().expect("lock poisoned");
8016        assert_eq!(requests[0].continue_run_id.as_deref(), Some("run-open"));
8017    }
8018
8019    #[tokio::test]
8020    async fn recover_only_enqueues_orphaned_background_task_waiting_runs() {
8021        let store = Arc::new(InMemoryStore::new());
8022        let mut background = seeded_waiting_run("run-bg", "thread-bg-recover", "agent");
8023        background.waiting = Some(RunWaitingState {
8024            reason: WaitingReason::BackgroundTasks,
8025            ticket_ids: Vec::new(),
8026            tickets: Vec::new(),
8027            since_dispatch_id: None,
8028            message: None,
8029        });
8030        store.create_run(&background).await.expect("seed bg run");
8031
8032        let mut user_input = seeded_waiting_run("run-user", "thread-user-recover", "agent");
8033        user_input.waiting = Some(RunWaitingState {
8034            reason: WaitingReason::UserInput,
8035            ticket_ids: Vec::new(),
8036            tickets: Vec::new(),
8037            since_dispatch_id: None,
8038            message: Some("waiting for user".to_string()),
8039        });
8040        store
8041            .create_run(&user_input)
8042            .await
8043            .expect("seed user-input run");
8044
8045        let mailbox_store = make_store();
8046        let runtime = Arc::new(RecordingStoreMailboxRuntime::new(store.clone()));
8047        let mailbox = Arc::new(Mailbox::new(
8048            runtime.clone(),
8049            mailbox_store.clone(),
8050            store.clone(),
8051            "test-consumer".to_string(),
8052            MailboxConfig::default(),
8053        ));
8054
8055        let recovered = mailbox.recover().await.expect("recover should succeed");
8056        assert_eq!(recovered, 1);
8057
8058        let deadline = Instant::now() + Duration::from_secs(1);
8059        loop {
8060            if runtime.requests.lock().expect("lock poisoned").len() == 1 {
8061                break;
8062            }
8063            assert!(Instant::now() < deadline, "recover did not dispatch wake");
8064            sleep(Duration::from_millis(5)).await;
8065        }
8066
8067        {
8068            let requests = runtime.requests.lock().expect("lock poisoned");
8069            assert_eq!(requests.len(), 1);
8070            assert_eq!(requests[0].thread_id, "thread-bg-recover");
8071            assert_eq!(requests[0].continue_run_id.as_deref(), Some("run-bg"));
8072            assert_eq!(requests[0].run_mode, RunMode::InternalWake);
8073            assert_eq!(requests[0].adapter, AdapterKind::Internal);
8074        }
8075
8076        let user_dispatches = mailbox_store
8077            .list_dispatches("thread-user-recover", None, 10, 0)
8078            .await
8079            .expect("list user dispatches");
8080        assert!(
8081            user_dispatches.is_empty(),
8082            "user-input waiting runs must stay suspended until explicit input"
8083        );
8084    }
8085
8086    #[tokio::test]
8087    async fn background_task_completion_should_enqueue_internal_wake_message() {
8088        let store = Arc::new(InMemoryStore::new());
8089        let mailbox_store = make_store();
8090        let manager = Arc::new(BackgroundTaskManager::new());
8091
8092        let llm = Arc::new(ScriptedLlm::new(vec![
8093            StreamResult {
8094                content: vec![ContentBlock::text("spawning task")],
8095                tool_calls: vec![ToolCall::new("c1", "spawn_bg", json!({}))],
8096                usage: None,
8097                stop_reason: Some(StopReason::ToolUse),
8098                has_incomplete_tool_calls: false,
8099            },
8100            StreamResult {
8101                content: vec![ContentBlock::text("waiting for background task")],
8102                tool_calls: vec![],
8103                usage: None,
8104                stop_reason: Some(StopReason::EndTurn),
8105                has_incomplete_tool_calls: false,
8106            },
8107        ]));
8108        let agent = ResolvedAgent::new("agent", "m", "sys", llm).with_tool(Arc::new(
8109            SpawnShortBgTaskTool {
8110                manager: manager.clone(),
8111                delay: Duration::from_millis(25),
8112            },
8113        ));
8114        let resolver = Arc::new(FixedResolver {
8115            agent,
8116            plugins: vec![Arc::new(BackgroundTaskPlugin::new(manager))],
8117        });
8118        let runtime = Arc::new(
8119            AgentRuntime::new(resolver)
8120                .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>),
8121        );
8122        let mailbox = make_mailbox_with_run_store(
8123            runtime,
8124            mailbox_store.clone(),
8125            store.clone() as Arc<dyn ThreadRunStore>,
8126        );
8127
8128        mailbox
8129            .submit_background(
8130                RunRequest::new("thread-bg", vec![Message::user("start")]).with_agent_id("agent"),
8131            )
8132            .await
8133            .expect("submit should succeed");
8134
8135        let waiting =
8136            wait_for_latest_run(&store, "thread-bg", |run| run.is_background_task_waiting()).await;
8137        sleep(Duration::from_millis(100)).await;
8138
8139        let dispatches = mailbox_store
8140            .list_dispatches("thread-bg", None, 10, 0)
8141            .await
8142            .expect("list dispatches should succeed");
8143
8144        assert!(
8145            dispatches.len() >= 2,
8146            "background completion should enqueue an internal wake message; waiting run was {:?}, dispatches were {:?}",
8147            waiting,
8148            dispatches
8149        );
8150        let messages = store
8151            .load_messages("thread-bg")
8152            .await
8153            .expect("load messages")
8154            .unwrap_or_default();
8155        assert!(
8156            messages.iter().any(|msg| {
8157                msg.role == awaken_contract::contract::message::Role::User
8158                    && msg.visibility == awaken_contract::contract::message::Visibility::Internal
8159                    && msg.text().contains("<background-task-event")
8160                    && msg.text().contains("\"done\":true")
8161            }),
8162            "expected a synthetic background wake message after task completion"
8163        );
8164    }
8165
8166    // ── send_decision returns false for unknown id ──────────────────
8167
8168    #[test]
8169    fn send_decision_unknown_id_returns_false() {
8170        let store = make_store();
8171        let runtime = make_runtime();
8172        let mailbox = make_mailbox(runtime, store);
8173
8174        let result = mailbox.send_decision(
8175            "nonexistent",
8176            "tc-1".to_string(),
8177            ToolCallResume {
8178                decision_id: "d1".into(),
8179                action: awaken_contract::contract::suspension::ResumeDecisionAction::Resume,
8180                result: serde_json::json!({"approved": true}),
8181                reason: None,
8182                updated_at: 0,
8183            },
8184        );
8185        assert!(!result);
8186    }
8187
8188    // ── Concurrency tests ───────────────────────────────────────────
8189
8190    #[tokio::test]
8191    async fn concurrent_submit_background_same_thread_only_one_runs() {
8192        let store = make_store();
8193        let runtime = make_runtime();
8194        let mailbox = make_mailbox(runtime, store.clone());
8195
8196        // Submit 5 background dispatches to the same thread concurrently.
8197        let mut handles = Vec::new();
8198        for i in 0..5 {
8199            let mb = Arc::clone(&mailbox);
8200            handles.push(tokio::spawn(async move {
8201                let req = RunRequest::new("thread-conc", vec![Message::user(format!("msg-{i}"))])
8202                    .with_agent_id("agent-1");
8203                mb.submit_background(req).await
8204            }));
8205        }
8206        let results: Vec<_> = futures::future::join_all(handles)
8207            .await
8208            .into_iter()
8209            .map(|r| r.unwrap())
8210            .collect();
8211
8212        // All should succeed (enqueue always works).
8213        assert!(results.iter().all(|r| r.is_ok()));
8214
8215        // At most one should be Running (the rest are Queued).
8216        let running_count = results
8217            .iter()
8218            .filter_map(|r| r.as_ref().ok())
8219            .filter(|r| matches!(r.status, MailboxDispatchStatus::Running))
8220            .count();
8221        assert!(
8222            running_count <= 1,
8223            "at most 1 should be Running, got {running_count}"
8224        );
8225
8226        // Store should have at most 1 Claimed dispatch for this thread.
8227        let dispatches = store
8228            .list_dispatches("thread-conc", Some(&[RunDispatchStatus::Claimed]), 10, 0)
8229            .await
8230            .unwrap();
8231        assert!(
8232            dispatches.len() <= 1,
8233            "store should have at most 1 Claimed dispatch, got {}",
8234            dispatches.len()
8235        );
8236    }
8237
8238    #[tokio::test]
8239    async fn concurrent_submit_same_thread_only_one_claims() {
8240        let store = make_store();
8241        let runtime = make_runtime();
8242        let mailbox = make_mailbox(runtime, store.clone());
8243
8244        // Submit 3 streaming requests to the same thread concurrently.
8245        let mut handles = Vec::new();
8246        for i in 0..3 {
8247            let mb = Arc::clone(&mailbox);
8248            handles.push(tokio::spawn(async move {
8249                let req = RunRequest::new(
8250                    "thread-stream-conc",
8251                    vec![Message::user(format!("msg-{i}"))],
8252                )
8253                .with_agent_id("agent-1");
8254                mb.submit(req).await
8255            }));
8256        }
8257        let results: Vec<_> = futures::future::join_all(handles)
8258            .await
8259            .into_iter()
8260            .map(|r| r.unwrap())
8261            .collect();
8262
8263        // Some may fail (inline-claim rejected), some succeed.
8264        let ok_count = results.iter().filter(|r| r.is_ok()).count();
8265        assert!(ok_count >= 1, "at least 1 should succeed");
8266
8267        // Store should have at most 1 Claimed dispatch.
8268        let dispatches = store
8269            .list_dispatches(
8270                "thread-stream-conc",
8271                Some(&[RunDispatchStatus::Claimed]),
8272                10,
8273                0,
8274            )
8275            .await
8276            .unwrap();
8277        assert!(
8278            dispatches.len() <= 1,
8279            "at most 1 Claimed, got {}",
8280            dispatches.len()
8281        );
8282    }
8283
8284    #[tokio::test]
8285    async fn interrupt_between_claim_and_execution_supersedes_without_runtime_start() {
8286        crate::metrics::install_recorder();
8287        let store = Arc::new(InterruptOnLoadMailboxStore::new());
8288        let run_store = Arc::new(InMemoryStore::new());
8289        let runtime = Arc::new(CountingMailboxRuntime::default());
8290        let mailbox = Arc::new(Mailbox::new_with_executor(
8291            runtime.clone(),
8292            store.clone(),
8293            run_store.clone(),
8294            "epoch-race-consumer".to_string(),
8295            MailboxConfig {
8296                lease_ms: 100,
8297                lease_renewal_interval: Duration::from_millis(20),
8298                ..MailboxConfig::default()
8299            },
8300        ));
8301
8302        let result = mailbox
8303            .submit_background(
8304                RunRequest::new("thread-epoch-race", vec![Message::user("go")])
8305                    .with_agent_id("agent"),
8306            )
8307            .await
8308            .expect("submit should succeed");
8309
8310        tokio::time::timeout(Duration::from_secs(2), async {
8311            loop {
8312                if let Some(dispatch) = store.load_dispatch(&result.dispatch_id).await.unwrap()
8313                    && dispatch.status == RunDispatchStatus::Superseded
8314                {
8315                    break;
8316                }
8317                sleep(Duration::from_millis(10)).await;
8318            }
8319        })
8320        .await
8321        .expect("dispatch should be superseded promptly");
8322
8323        assert_eq!(
8324            runtime.run_count(),
8325            0,
8326            "stale dispatch must not enter runtime"
8327        );
8328        let loaded = store
8329            .load_dispatch(&result.dispatch_id)
8330            .await
8331            .unwrap()
8332            .expect("dispatch should remain inspectable");
8333        assert_eq!(loaded.status, RunDispatchStatus::Superseded);
8334        assert!(loaded.claim_token.is_none());
8335        assert!(loaded.lease_until.is_none());
8336
8337        let run = run_store
8338            .load_run(&result.run_id)
8339            .await
8340            .unwrap()
8341            .expect("prepared run should remain inspectable");
8342        assert_eq!(run.status, RunStatus::Done);
8343        assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
8344        assert_eq!(
8345            run.dispatch_id.as_deref(),
8346            Some(result.dispatch_id.as_str())
8347        );
8348
8349        let output = crate::metrics::render().unwrap_or_default();
8350        assert!(output.contains("operation=\"load_dispatch\""));
8351        assert!(output.contains("operation=\"current_dispatch_epoch\""));
8352        assert!(output.contains("operation=\"supersede_claimed\""));
8353        assert!(output.contains("operation=\"mark_run_superseded\""));
8354    }
8355
8356    #[tokio::test]
8357    async fn dispatch_signal_busy_ack_still_runs_queued_dispatch_after_current_finishes() {
8358        let store = Arc::new(SignalMailboxStore::new());
8359        let run_store = Arc::new(InMemoryStore::new());
8360        let (started_tx, mut started_rx) = tokio::sync::mpsc::unbounded_channel();
8361        let release_first = Arc::new(tokio::sync::Notify::new());
8362        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(BlockingMailboxRuntime::new(
8363            started_tx,
8364            Arc::clone(&release_first),
8365        ));
8366        let mailbox = Arc::new(Mailbox::new_with_executor(
8367            runtime,
8368            store.clone(),
8369            run_store,
8370            "signal-consumer".to_string(),
8371            MailboxConfig::default(),
8372        ));
8373
8374        let mut first = RunRequest::new("thread-signal-busy", vec![Message::user("first")])
8375            .with_agent_id("agent");
8376        let (thread_id, first_messages) =
8377            validate_run_inputs(first.thread_id.clone(), first.messages.clone(), false)
8378                .expect("first input should validate");
8379        mailbox
8380            .prepare_run_for_dispatch(&mut first, &thread_id, &first_messages)
8381            .await
8382            .expect("prepare first run");
8383        let first_dispatch = mailbox
8384            .build_dispatch(&first, &thread_id)
8385            .expect("build first dispatch");
8386        let first_dispatch_id = first_dispatch.dispatch_id.clone();
8387        store.enqueue(&first_dispatch).await.expect("enqueue first");
8388
8389        let mut second = RunRequest::new("thread-signal-busy", vec![Message::user("second")])
8390            .with_agent_id("agent");
8391        let (_, second_messages) =
8392            validate_run_inputs(second.thread_id.clone(), second.messages.clone(), false)
8393                .expect("second input should validate");
8394        mailbox
8395            .prepare_run_for_dispatch(&mut second, &thread_id, &second_messages)
8396            .await
8397            .expect("prepare second run");
8398        let second_dispatch = mailbox
8399            .build_dispatch(&second, &thread_id)
8400            .expect("build second dispatch");
8401        let second_dispatch_id = second_dispatch.dispatch_id.clone();
8402        store
8403            .enqueue(&second_dispatch)
8404            .await
8405            .expect("enqueue second");
8406
8407        let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
8408        let (ordinal, dispatch_id) =
8409            tokio::time::timeout(Duration::from_secs(2), started_rx.recv())
8410                .await
8411                .expect("first dispatch should start")
8412                .expect("runtime should report first start");
8413        assert_eq!(ordinal, 1);
8414        let blocked_dispatch_id = dispatch_id.expect("started dispatch should have an id");
8415        assert!(
8416            blocked_dispatch_id == first_dispatch_id || blocked_dispatch_id == second_dispatch_id,
8417            "started dispatch must be one of the two queued dispatches"
8418        );
8419        let queued_dispatch_id = if blocked_dispatch_id == first_dispatch_id {
8420            second_dispatch_id.as_str()
8421        } else {
8422            first_dispatch_id.as_str()
8423        };
8424
8425        let deadline = Instant::now() + Duration::from_secs(2);
8426        while store.acked_signal_count() < 2 {
8427            assert!(
8428                Instant::now() < deadline,
8429                "signal loop must ack the busy second signal instead of blocking"
8430            );
8431            sleep(Duration::from_millis(10)).await;
8432        }
8433        assert_eq!(store.nacked_signal_count(), 0);
8434        let queued_before_release = store
8435            .load_dispatch(queued_dispatch_id)
8436            .await
8437            .expect("load queued dispatch")
8438            .expect("queued dispatch exists");
8439        assert_eq!(
8440            queued_before_release.status,
8441            RunDispatchStatus::Queued,
8442            "busy signal ack must not claim the other dispatch before the first finishes"
8443        );
8444
8445        release_first.notify_waiters();
8446        let (ordinal, dispatch_id) =
8447            tokio::time::timeout(Duration::from_secs(2), started_rx.recv())
8448                .await
8449                .expect("queued dispatch should start after first finishes")
8450                .expect("runtime should report second start");
8451        assert_eq!(ordinal, 2);
8452        assert_eq!(dispatch_id.as_deref(), Some(queued_dispatch_id));
8453
8454        let first_done = wait_for_dispatch(&store.inner, &first_dispatch_id, |dispatch| {
8455            dispatch.status == RunDispatchStatus::Acked
8456        })
8457        .await;
8458        let second_done = wait_for_dispatch(&store.inner, &second_dispatch_id, |dispatch| {
8459            dispatch.status == RunDispatchStatus::Acked
8460        })
8461        .await;
8462        signal_loop.abort();
8463
8464        assert_eq!(first_done.status, RunDispatchStatus::Acked);
8465        assert_eq!(second_done.status, RunDispatchStatus::Acked);
8466        assert_eq!(store.acked_signal_count(), 2);
8467        assert_eq!(store.nacked_signal_count(), 0);
8468    }
8469
8470    #[tokio::test]
8471    async fn submit_background_returns_correct_status() {
8472        let store = make_store();
8473        let runtime = make_runtime();
8474        let mailbox = make_mailbox(runtime, store.clone());
8475
8476        // First submit should dispatch (Running or Queued depending on timing).
8477        let req1 =
8478            RunRequest::new("thread-status", vec![Message::user("a")]).with_agent_id("agent-1");
8479        let result1 = mailbox.submit_background(req1).await.unwrap();
8480        // First dispatch should be claimed/running since thread is idle.
8481        assert!(
8482            matches!(
8483                result1.status,
8484                MailboxDispatchStatus::Running | MailboxDispatchStatus::Queued
8485            ),
8486            "first dispatch should be Running or Queued"
8487        );
8488
8489        // Second submit while first is running should be Queued.
8490        let req2 =
8491            RunRequest::new("thread-status", vec![Message::user("b")]).with_agent_id("agent-1");
8492        let result2 = mailbox.submit_background(req2).await.unwrap();
8493        assert!(
8494            matches!(result2.status, MailboxDispatchStatus::Queued),
8495            "second dispatch should be Queued while first is running"
8496        );
8497    }
8498
8499    #[tokio::test]
8500    async fn worker_status_not_corrupted_after_empty_claim() {
8501        let store = make_store();
8502        let runtime = make_runtime();
8503        let mailbox = make_mailbox(runtime, store.clone());
8504
8505        // Submit and dispatch a dispatch to get worker into Running state.
8506        let req =
8507            RunRequest::new("thread-guard", vec![Message::user("a")]).with_agent_id("agent-1");
8508        mailbox.submit_background(req).await.unwrap();
8509
8510        // Worker should be Running (or Claiming).
8511        let workers = mailbox.workers.read().await;
8512        if let Some(worker) = workers.get("thread-guard") {
8513            let w = worker.lock();
8514            assert!(
8515                !matches!(w.status, MailboxWorkerStatus::Idle),
8516                "worker should not be Idle after dispatch"
8517            );
8518        }
8519        drop(workers);
8520
8521        // Call try_dispatch_next while Running — should be a no-op.
8522        mailbox.try_dispatch_next("thread-guard").await;
8523
8524        // Worker should still be Running, not reverted to Idle.
8525        let workers = mailbox.workers.read().await;
8526        if let Some(worker) = workers.get("thread-guard") {
8527            let w = worker.lock();
8528            assert!(
8529                !matches!(w.status, MailboxWorkerStatus::Idle),
8530                "worker should still not be Idle"
8531            );
8532        }
8533    }
8534
8535    // ── Coverage gap tests ──────────────────────────────────────────
8536
8537    #[test]
8538    fn run_request_extras_corrupt_json_returns_error() {
8539        let corrupt = serde_json::json!({"overrides": "not-an-object", "decisions": 42});
8540        let result = RunRequestExtras::from_value(&corrupt);
8541        assert!(result.is_err(), "corrupt JSON should fail deserialization");
8542    }
8543
8544    #[tokio::test]
8545    async fn submit_inline_claim_fails_when_thread_already_claimed() {
8546        let store = make_store();
8547        let runtime = make_runtime();
8548        let mailbox = make_mailbox(runtime, store.clone());
8549
8550        // First submit claims successfully.
8551        let req1 =
8552            RunRequest::new("thread-clash", vec![Message::user("first")]).with_agent_id("agent-1");
8553        let result1 = mailbox.submit(req1).await;
8554        assert!(result1.is_ok(), "first submit should succeed");
8555
8556        // Second submit to same thread: interrupt will cancel the first,
8557        // but timing may allow the second to also succeed or fail gracefully.
8558        let req2 =
8559            RunRequest::new("thread-clash", vec![Message::user("second")]).with_agent_id("agent-1");
8560        let result2 = mailbox.submit(req2).await;
8561        // Either succeeds (interrupt cancelled old) or fails with validation error.
8562        // Crucially: no panic, no double-claimed state.
8563        match &result2 {
8564            Ok((r, _)) => assert!(!r.dispatch_id.is_empty()),
8565            Err(MailboxError::Validation(_)) => {} // acceptable
8566            Err(e) => panic!("unexpected error: {e}"),
8567        }
8568
8569        // Store invariant: at most 1 Claimed dispatch for this thread.
8570        let claimed = store
8571            .list_dispatches("thread-clash", Some(&[RunDispatchStatus::Claimed]), 10, 0)
8572            .await
8573            .unwrap();
8574        assert!(
8575            claimed.len() <= 1,
8576            "at most 1 Claimed, got {}",
8577            claimed.len()
8578        );
8579    }
8580
8581    #[tokio::test]
8582    async fn reconnect_sink_returns_false_for_idle_worker() {
8583        let store = make_store();
8584        let runtime = make_runtime();
8585        let mailbox = make_mailbox(runtime, store);
8586
8587        // Create a worker but don't start a run.
8588        mailbox.get_or_create_worker("thread-idle").await;
8589
8590        let (tx, _rx) = tokio::sync::mpsc::channel(16);
8591        let result = mailbox.reconnect_sink("thread-idle", tx).await;
8592        assert!(!result, "reconnect should fail for idle worker");
8593    }
8594
8595    #[tokio::test]
8596    async fn reconnect_sink_returns_false_for_unknown_thread() {
8597        let store = make_store();
8598        let runtime = make_runtime();
8599        let mailbox = make_mailbox(runtime, store);
8600
8601        let (tx, _rx) = tokio::sync::mpsc::channel(16);
8602        let result = mailbox.reconnect_sink("nonexistent", tx).await;
8603        assert!(!result, "reconnect should fail for unknown thread");
8604    }
8605
8606    #[tokio::test]
8607    async fn reconnect_sink_succeeds_for_running_worker() {
8608        let store = make_store();
8609        let runtime = make_runtime();
8610        let mailbox = make_mailbox(runtime, store);
8611
8612        // Directly set the worker to Running status (avoids race with
8613        // spawn_execution resetting to Idle when StubResolver fails).
8614        let worker = mailbox.get_or_create_worker("thread-reconnect").await;
8615        {
8616            let reconnectable = Arc::new(ReconnectableEventSink::new(mpsc::channel(16).0));
8617            let mut w = worker.lock();
8618            w.status = MailboxWorkerStatus::Running {
8619                dispatch_id: "dispatch-fake".into(),
8620                run_id: "run-fake".into(),
8621                lease_handle: tokio::spawn(futures::future::pending::<()>()),
8622                sink: reconnectable,
8623            };
8624        }
8625
8626        let (tx, _rx) = mpsc::channel(16);
8627        let result = mailbox.reconnect_sink("thread-reconnect", tx).await;
8628        assert!(result, "reconnect should succeed for running worker");
8629    }
8630
8631    #[tokio::test]
8632    async fn build_dispatch_extras_roundtrip_with_decisions() {
8633        use awaken_contract::contract::suspension::{ResumeDecisionAction, ToolCallResume};
8634
8635        let decisions = vec![(
8636            "call-1".to_string(),
8637            ToolCallResume {
8638                decision_id: "d-1".into(),
8639                action: ResumeDecisionAction::Resume,
8640                result: serde_json::json!({"approved": true}),
8641                reason: None,
8642                updated_at: 0,
8643            },
8644        )];
8645
8646        let request = RunRequest::new("thread-dec", vec![Message::user("hi")])
8647            .with_agent_id("a1")
8648            .with_decisions(decisions.clone());
8649        let extras = RunRequestExtras::from_request(&request);
8650        assert_eq!(extras.decisions.len(), 1);
8651        assert_eq!(extras.decisions[0].0, "call-1");
8652    }
8653
8654    #[tokio::test]
8655    async fn prepare_run_origin_a2a_roundtrip() {
8656        let store = make_store();
8657        let runtime = make_runtime();
8658        let thread_store = Arc::new(InMemoryStore::new());
8659        let mailbox = Arc::new(Mailbox::new(
8660            runtime,
8661            store,
8662            thread_store.clone(),
8663            "test-consumer".to_string(),
8664            MailboxConfig::default(),
8665        ));
8666
8667        let mut request = RunRequest::new("thread-a2a", vec![Message::user("hi")])
8668            .with_origin(RunRequestOrigin::A2A)
8669            .with_parent_run_id("parent-123");
8670        let (thread_id, messages) =
8671            validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
8672                .unwrap();
8673        let run_id = mailbox
8674            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
8675            .await
8676            .unwrap();
8677        let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
8678
8679        assert!(matches!(
8680            run.request.as_ref().unwrap().origin,
8681            RunRequestOrigin::A2A
8682        ));
8683        assert_eq!(run.parent_run_id.as_deref(), Some("parent-123"));
8684    }
8685
8686    // ── INLINE_CLAIM_GUARD_MS ───────────────────────────────────────
8687
8688    #[test]
8689    fn inline_claim_guard_is_reasonable() {
8690        assert_eq!(INLINE_CLAIM_GUARD_MS, 60_000);
8691    }
8692
8693    // ── Nack exponential backoff ────────────────────────────────────
8694
8695    #[test]
8696    fn nack_backoff_progression() {
8697        let config = MailboxConfig::default();
8698        // Formula from execute_dispatch: 2^(attempt_count.saturating_sub(1).min(6))
8699        // attempt_count is 0-based on the dispatch at nack time, but incremented
8700        // by the store before re-queue. The backoff in execute_dispatch uses
8701        // dispatch.attempt_count which is the pre-nack value.
8702        for (attempt_count, expected_ms) in [
8703            (1, 250),   // 2^0 * 250 = 250
8704            (2, 500),   // 2^1 * 250 = 500
8705            (3, 1000),  // 2^2 * 250 = 1000
8706            (4, 2000),  // 2^3 * 250 = 2000
8707            (5, 4000),  // 2^4 * 250 = 4000
8708            (6, 8000),  // 2^5 * 250 = 8000
8709            (7, 16000), // 2^6 * 250 = 16000
8710        ] {
8711            let backoff_factor = 2u64.pow((attempt_count as u32).saturating_sub(1).min(6));
8712            let delay =
8713                (config.default_retry_delay_ms * backoff_factor).min(config.max_retry_delay_ms);
8714            assert_eq!(delay, expected_ms, "attempt_count={attempt_count}");
8715        }
8716    }
8717
8718    #[test]
8719    fn nack_backoff_caps_at_max() {
8720        let config = MailboxConfig {
8721            max_retry_delay_ms: 5000,
8722            default_retry_delay_ms: 1000,
8723            ..Default::default()
8724        };
8725        // attempt_count=4 → 2^3 = 8 → 1000*8 = 8000, capped at 5000
8726        let backoff_factor = 2u64.pow(3);
8727        let delay = (config.default_retry_delay_ms * backoff_factor).min(config.max_retry_delay_ms);
8728        assert_eq!(delay, 5000);
8729    }
8730
8731    #[test]
8732    fn nack_backoff_zero_attempt_is_base_delay() {
8733        let config = MailboxConfig::default();
8734        // attempt_count=0 → saturating_sub(1)=0, but min(6)=0 → 2^0=1 → 250*1=250
8735        // However in practice attempt_count starts at 1 after first claim.
8736        let backoff_factor = 2u64.pow(0u32.saturating_sub(1).min(6));
8737        let delay = (config.default_retry_delay_ms * backoff_factor).min(config.max_retry_delay_ms);
8738        assert_eq!(delay, 250);
8739    }
8740
8741    #[test]
8742    fn nack_backoff_high_attempt_stays_capped() {
8743        let config = MailboxConfig::default();
8744        // attempt_count=100 → min(6)=6 → 2^6=64 → 250*64=16000 < 30000
8745        let backoff_factor = 2u64.pow(100u32.saturating_sub(1).min(6));
8746        let delay = (config.default_retry_delay_ms * backoff_factor).min(config.max_retry_delay_ms);
8747        assert_eq!(delay, 16000);
8748
8749        // With smaller max: attempt_count=100 → 250*64=16000, capped at 10000
8750        let config2 = MailboxConfig {
8751            max_retry_delay_ms: 10_000,
8752            ..Default::default()
8753        };
8754        let delay2 =
8755            (config2.default_retry_delay_ms * backoff_factor).min(config2.max_retry_delay_ms);
8756        assert_eq!(delay2, 10_000);
8757    }
8758
8759    // ── GC idle workers ─────────────────────────────────────────────
8760
8761    #[tokio::test]
8762    async fn gc_idle_workers_removes_idle_with_no_dispatches() {
8763        let store = make_store();
8764        let runtime = make_runtime();
8765        let mailbox = make_mailbox(runtime, store.clone());
8766
8767        // Manually insert an Idle worker (no dispatches in store for this thread).
8768        {
8769            let mut workers = mailbox.workers.write().await;
8770            workers.insert(
8771                "thread-gc".to_string(),
8772                Arc::new(SyncMutex::new(MailboxWorker::default())),
8773            );
8774        }
8775
8776        // Verify the worker is present.
8777        assert!(mailbox.workers.read().await.contains_key("thread-gc"));
8778
8779        // Run GC — idle worker with no queued dispatches should be removed.
8780        mailbox.gc_idle_workers().await;
8781
8782        assert!(
8783            !mailbox.workers.read().await.contains_key("thread-gc"),
8784            "idle worker with no queued dispatches should be removed"
8785        );
8786    }
8787
8788    #[tokio::test]
8789    async fn gc_idle_workers_keeps_worker_with_queued_dispatches() {
8790        let store = make_store();
8791        let runtime = make_runtime();
8792        let mailbox = make_mailbox(runtime, store.clone());
8793
8794        // Enqueue a dispatch for the thread (background so it goes to store).
8795        let request =
8796            RunRequest::new("thread-gc-keep", vec![Message::user("hi")]).with_agent_id("agent-1");
8797        mailbox.submit_background(request).await.unwrap();
8798
8799        // Force the worker to Idle status (simulating it finished one dispatch
8800        // but another is queued).
8801        {
8802            let mut workers = mailbox.workers.write().await;
8803            workers.insert(
8804                "thread-gc-keep".to_string(),
8805                Arc::new(SyncMutex::new(MailboxWorker::default())),
8806            );
8807        }
8808
8809        // Run GC — worker has queued/claimed dispatches, so it should be kept.
8810        mailbox.gc_idle_workers().await;
8811
8812        // The worker should still exist because there are dispatches in the store.
8813        let has_dispatches = !store
8814            .list_dispatches(
8815                "thread-gc-keep",
8816                Some(&[RunDispatchStatus::Queued, RunDispatchStatus::Claimed]),
8817                1,
8818                0,
8819            )
8820            .await
8821            .unwrap()
8822            .is_empty();
8823        if has_dispatches {
8824            assert!(
8825                mailbox.workers.read().await.contains_key("thread-gc-keep"),
8826                "idle worker with queued dispatches should NOT be removed"
8827            );
8828        }
8829    }
8830
8831    #[tokio::test]
8832    async fn gc_idle_workers_noop_when_empty() {
8833        let store = make_store();
8834        let runtime = make_runtime();
8835        let mailbox = make_mailbox(runtime, store);
8836
8837        // No workers exist — GC should not panic.
8838        mailbox.gc_idle_workers().await;
8839        let workers = mailbox.workers.read().await;
8840        assert!(workers.is_empty());
8841    }
8842
8843    // ── ThreadContext cache tests ───────────────────────────────────
8844
8845    fn make_run_record(run_id: &str, thread_id: &str, status: RunStatus) -> RunRecord {
8846        RunRecord {
8847            run_id: run_id.to_string(),
8848            thread_id: thread_id.to_string(),
8849            agent_id: "agent".to_string(),
8850            parent_run_id: None,
8851            request: None,
8852            input: None,
8853            output: None,
8854            status,
8855            termination_reason: None,
8856            final_output: None,
8857            error_payload: None,
8858            dispatch_id: None,
8859            session_id: None,
8860            transport_request_id: None,
8861            waiting: None,
8862            outcome: None,
8863            created_at: 1,
8864            started_at: None,
8865            finished_at: None,
8866            updated_at: 1,
8867            steps: 0,
8868            input_tokens: 0,
8869            output_tokens: 0,
8870            state: None,
8871        }
8872    }
8873
8874    fn make_waiting_run_record(run_id: &str, thread_id: &str) -> RunRecord {
8875        let mut run = make_run_record(run_id, thread_id, RunStatus::Waiting);
8876        run.waiting = Some(RunWaitingState {
8877            reason: WaitingReason::BackgroundTasks,
8878            ticket_ids: Vec::new(),
8879            tickets: Vec::new(),
8880            since_dispatch_id: None,
8881            message: None,
8882        });
8883        run
8884    }
8885
8886    fn make_noop_mailbox(thread_store: Arc<InMemoryStore>) -> Arc<Mailbox> {
8887        let mailbox_store = make_store();
8888        let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
8889        Arc::new(Mailbox::new_with_executor(
8890            runtime,
8891            mailbox_store,
8892            thread_store,
8893            "test-consumer".into(),
8894            MailboxConfig::default(),
8895        ))
8896    }
8897
8898    #[tokio::test]
8899    async fn thread_context_cache_used_by_reusable_waiting_run_id() {
8900        let thread_store = Arc::new(InMemoryStore::new());
8901        let mailbox = make_noop_mailbox(thread_store.clone());
8902        let thread_id = "thread-ctx-reuse";
8903
8904        // Create a waiting run in the store.
8905        let run = make_waiting_run_record("run-waiting", thread_id);
8906        thread_store
8907            .checkpoint(thread_id, &[Message::user("hi")], &run)
8908            .await
8909            .unwrap();
8910
8911        // Pre-warm the cache on the worker.
8912        let worker = mailbox.get_or_create_worker(thread_id).await;
8913        let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
8914            .await
8915            .unwrap();
8916        {
8917            let mut w = worker.lock();
8918            w.thread_ctx = Some(ctx);
8919        }
8920
8921        let result = mailbox.reusable_waiting_run_id(thread_id).await;
8922        assert_eq!(result, Some("run-waiting".to_string()));
8923    }
8924
8925    #[tokio::test]
8926    async fn thread_context_cache_updated_after_prepare_checkpoint() {
8927        let thread_store = Arc::new(InMemoryStore::new());
8928        let mailbox = make_noop_mailbox(thread_store.clone());
8929        let thread_id = "thread-ctx-checkpoint";
8930
8931        // Persist initial state with a Done run.
8932        let run = make_run_record("run-prev", thread_id, RunStatus::Done);
8933        thread_store
8934            .checkpoint(thread_id, &[Message::user("first")], &run)
8935            .await
8936            .unwrap();
8937
8938        // Pre-warm the cache.
8939        let worker = mailbox.get_or_create_worker(thread_id).await;
8940        let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
8941            .await
8942            .unwrap();
8943        {
8944            let mut w = worker.lock();
8945            w.thread_ctx = Some(ctx);
8946        }
8947
8948        // Prepare a new dispatch — this should update the cache.
8949        let mut request =
8950            RunRequest::new(thread_id, vec![Message::user("second")]).with_agent_id("agent");
8951        let msgs = request.messages.clone();
8952        mailbox
8953            .prepare_run_for_dispatch(&mut request, thread_id, &msgs)
8954            .await
8955            .expect("prepare should succeed");
8956
8957        // Verify cache was updated with both messages and the new run.
8958        let w = worker.lock();
8959        let ctx = w.thread_ctx.as_ref().expect("cache should exist");
8960        assert_eq!(ctx.messages.len(), 2, "cache should have 2 messages");
8961        assert!(ctx.latest_run.is_some(), "cache should have latest run");
8962    }
8963
8964    #[tokio::test]
8965    async fn prepare_run_falls_back_to_store_without_cache() {
8966        let thread_store = Arc::new(InMemoryStore::new());
8967        let mailbox = make_noop_mailbox(thread_store.clone());
8968        let thread_id = "thread-no-cache";
8969
8970        // Persist initial state but do NOT pre-warm the cache.
8971        let run = make_run_record("run-prev", thread_id, RunStatus::Done);
8972        thread_store
8973            .checkpoint(thread_id, &[Message::user("first")], &run)
8974            .await
8975            .unwrap();
8976
8977        // No cache — should fall back to store.
8978        let mut request =
8979            RunRequest::new(thread_id, vec![Message::user("second")]).with_agent_id("agent");
8980        let msgs = request.messages.clone();
8981        let run_id = mailbox
8982            .prepare_run_for_dispatch(&mut request, thread_id, &msgs)
8983            .await
8984            .expect("should succeed from store fallback");
8985        assert!(!run_id.is_empty());
8986
8987        // Store should have both messages.
8988        let stored = thread_store
8989            .load_messages(thread_id)
8990            .await
8991            .unwrap()
8992            .unwrap();
8993        assert_eq!(stored.len(), 2);
8994    }
8995
8996    #[tokio::test]
8997    async fn prepare_run_uses_durable_messages_when_active_cache_is_stale() {
8998        let thread_store = Arc::new(InMemoryStore::new());
8999        let mailbox = make_noop_mailbox(thread_store.clone());
9000        let thread_id = "thread-stale-cache";
9001
9002        let active = make_run_record("run-active", thread_id, RunStatus::Running);
9003        thread_store
9004            .checkpoint(thread_id, &[Message::user("first")], &active)
9005            .await
9006            .unwrap();
9007
9008        // Simulate the cache snapshot captured when the active run started.
9009        let worker = mailbox.get_or_create_worker(thread_id).await;
9010        let stale_ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9011            .await
9012            .unwrap();
9013        {
9014            let mut w = worker.lock();
9015            w.thread_ctx = Some(stale_ctx);
9016        }
9017
9018        // Runtime checkpoints a new assistant message while the worker cache is
9019        // still the older snapshot.
9020        thread_store
9021            .checkpoint(
9022                thread_id,
9023                &[Message::user("first"), Message::assistant("active output")],
9024                &active,
9025            )
9026            .await
9027            .unwrap();
9028
9029        let mut request =
9030            RunRequest::new(thread_id, vec![Message::user("second")]).with_agent_id("agent");
9031        let msgs = request.messages.clone();
9032        mailbox
9033            .prepare_run_for_dispatch(&mut request, thread_id, &msgs)
9034            .await
9035            .expect("prepare should preserve active-run checkpoint");
9036
9037        let stored = thread_store
9038            .load_messages(thread_id)
9039            .await
9040            .unwrap()
9041            .unwrap();
9042        assert_eq!(stored.len(), 3);
9043        assert_eq!(stored[1].text(), "active output");
9044        assert_eq!(stored[2].text(), "second");
9045    }
9046
9047    #[tokio::test]
9048    async fn reusable_waiting_run_id_ignores_stale_worker_cache() {
9049        let thread_store = Arc::new(InMemoryStore::new());
9050        let mailbox = make_noop_mailbox(thread_store.clone());
9051        let thread_id = "thread-stale-waiting-cache";
9052
9053        let waiting = make_waiting_run_record("run-waiting", thread_id);
9054        thread_store
9055            .checkpoint(thread_id, &[Message::user("hi")], &waiting)
9056            .await
9057            .unwrap();
9058
9059        let worker = mailbox.get_or_create_worker(thread_id).await;
9060        let stale_ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9061            .await
9062            .unwrap();
9063        {
9064            let mut w = worker.lock();
9065            w.thread_ctx = Some(stale_ctx);
9066        }
9067
9068        let done = make_run_record("run-waiting", thread_id, RunStatus::Done);
9069        thread_store
9070            .checkpoint(
9071                thread_id,
9072                &[Message::user("hi"), Message::assistant("done")],
9073                &done,
9074            )
9075            .await
9076            .unwrap();
9077
9078        assert_eq!(mailbox.reusable_waiting_run_id(thread_id).await, None);
9079    }
9080
9081    #[tokio::test]
9082    async fn thread_context_cache_cleared_on_idle_transition() {
9083        let thread_store = Arc::new(InMemoryStore::new());
9084        let mailbox = make_noop_mailbox(thread_store.clone());
9085        let thread_id = "thread-ctx-clear";
9086
9087        let run = make_run_record("r1", thread_id, RunStatus::Done);
9088        thread_store
9089            .checkpoint(thread_id, &[Message::user("hi")], &run)
9090            .await
9091            .unwrap();
9092
9093        // Set up worker as Running with a populated cache.
9094        let worker = mailbox.get_or_create_worker(thread_id).await;
9095        let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9096            .await
9097            .unwrap();
9098        {
9099            let mut w = worker.lock();
9100            w.thread_ctx = Some(ctx);
9101            w.status = MailboxWorkerStatus::Running {
9102                dispatch_id: "d1".into(),
9103                run_id: "r1".into(),
9104                lease_handle: tokio::spawn(async {}),
9105                sink: Arc::new(ReconnectableEventSink::new(mpsc::channel(16).0)),
9106            };
9107        }
9108
9109        // Verify cache exists before transition.
9110        assert!(worker.lock().thread_ctx.is_some());
9111
9112        // Simulate completion: transition to Idle and clear cache.
9113        {
9114            let mut w = worker.lock();
9115            let old = std::mem::replace(&mut w.status, MailboxWorkerStatus::Idle);
9116            w.thread_ctx = None;
9117            if let MailboxWorkerStatus::Running { lease_handle, .. } = old {
9118                lease_handle.abort();
9119            }
9120        }
9121
9122        assert!(
9123            worker.lock().thread_ctx.is_none(),
9124            "cache should be cleared on idle transition"
9125        );
9126    }
9127
9128    #[tokio::test]
9129    async fn thread_context_load_populates_run_cache() {
9130        let store = Arc::new(InMemoryStore::new());
9131        let thread_id = "thread-load-test";
9132
9133        let run = make_run_record("r1", thread_id, RunStatus::Done);
9134        store
9135            .checkpoint(thread_id, &[Message::user("msg")], &run)
9136            .await
9137            .unwrap();
9138
9139        let ctx = ThreadContext::load(store.as_ref(), thread_id)
9140            .await
9141            .expect("load should succeed");
9142
9143        assert_eq!(ctx.messages.len(), 1);
9144        assert!(ctx.latest_run.is_some());
9145        assert_eq!(ctx.latest_run.as_ref().unwrap().run_id, "r1");
9146        assert!(ctx.get_run("r1").is_some());
9147        assert!(ctx.get_run("unknown").is_none());
9148    }
9149
9150    #[tokio::test]
9151    async fn reusable_waiting_run_id_returns_none_for_done_cached_run() {
9152        let thread_store = Arc::new(InMemoryStore::new());
9153        let mailbox = make_noop_mailbox(thread_store.clone());
9154        let thread_id = "thread-done-run";
9155
9156        let run = make_run_record("run-done", thread_id, RunStatus::Done);
9157        thread_store
9158            .checkpoint(thread_id, &[Message::user("hi")], &run)
9159            .await
9160            .unwrap();
9161
9162        let worker = mailbox.get_or_create_worker(thread_id).await;
9163        let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9164            .await
9165            .unwrap();
9166        {
9167            let mut w = worker.lock();
9168            w.thread_ctx = Some(ctx);
9169        }
9170
9171        let result = mailbox.reusable_waiting_run_id(thread_id).await;
9172        assert_eq!(result, None, "Done run should not be reusable");
9173    }
9174}